def cluster_song_in_playlist(self, playlist_id, cluster_n=5, is_detailed=False):
    """
    Cluster the songs inside a single playlist.
    Args:
        playlist_id: playlist id
        cluster_n: number of clusters
        is_detailed: whether the result should include song details
    Returns:
        the clustered lists
    """
    playlist_obj = playlist_detail(playlist_id)
    song_list = []
    vec_list = []
    song_info_dict = {}
    ap_cluster = AffinityPropagation()
    data_process_logger.info('clustering playlist: %s' % playlist_obj['name'])
    for item in playlist_obj['tracks']:
        song = item['name'].lower()
        song_info_dict[song] = {
            'name': song,
            'artist': item['artists'][0]['name'],
            'id': item['id'],
            'album_img_url': item['album']['picUrl'],
            'site_url': 'http://music.163.com/#/song?id=%s' % item['id']
        }
        if song not in song_list:
            song_list.append(song)
            if self.song2vec_model.vocab.get(song) and len(self.song2vec_model.syn0norm):
                song_vec = self.song2vec_model.syn0norm[self.song2vec_model.vocab[song].index]
            else:
                data_process_logger.warn(
                    'The song %s of playlist-%s is not in dataset' % (song, playlist_obj['name']))
                # fall back to a zero vector for out-of-vocabulary songs
                song_vec = [0 for i in range(self.song2vec_model.vector_size)]
            vec_list.append(song_vec)
    if len(vec_list) > 1:
        cluster_result = ap_cluster.fit(vec_list)
        cluster_array = [[] for i in range(len(cluster_result.cluster_centers_indices_))]
        for i in range(len(cluster_result.labels_)):
            label = cluster_result.labels_[i]
            cluster_array[label].append(song_list[i])
        return cluster_array, playlist_obj['name'], song_info_dict
    else:
        return [song_list], playlist_obj['name'], song_info_dict
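# A minimal usage sketch, assuming this method lives on the Song2VecOperator
# class used by the Flask bootstrap below; the playlist id is made up for
# illustration.
def demo_cluster_playlist(s2v_operator, playlist_id='123456'):
    clusters, playlist_name, song_info_dict = s2v_operator.cluster_song_in_playlist(playlist_id, cluster_n=5)
    print 'playlist: %s, %s clusters' % (playlist_name, len(clusters))
    for group_index, group in enumerate(clusters):
        print 'cluster %s: %s' % (group_index, ', '.join(group))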
def load_csv_data(csv_path, normalize=True, is_combine=False):
    """
    Load one csv data file.
    Args:
        csv_path: path of the input csv file
        normalize: whether to standardize the features
        is_combine: whether to concatenate the normalized features with the raw features
    Returns:
        a list of (stock_id, trade_date, score, feature_vector) tuples
    """
    from sklearn import preprocessing
    with open(csv_path, 'rb') as fin:
        data_process_logger.info('loading file: %s' % csv_path)
        datas = []
        temp_list = []
        score_list = []
        date_list = []
        id_list = []
        vec_list = []
        for line in fin:
            line = line.strip()
            tmp = line.split(',')
            stock_id = tmp[0]
            trade_date = tmp[1]
            score = float(tmp[2])  # float() is safer than eval() for csv fields
            score_list.append(score)
            vec_value = [float(a) for a in tmp[3:]]
            vec_list.append(vec_value)
            date_list.append(trade_date)
            id_list.append(stock_id)
            temp_list.append((stock_id, trade_date, score, vec_value))
        if not normalize:
            # only z-score the label; leave the features untouched
            avg = np.mean(score_list)
            std = np.std(score_list)
            for item in temp_list:
                normalize_score = (item[2] - avg) / std
                datas.append((item[0], item[1], normalize_score, item[3]))
            return datas
        else:
            score_scale_list = list(preprocessing.scale(score_list))
            vec_scale_list = preprocessing.scale(vec_list)
            for i in range(len(id_list)):
                if is_combine:
                    datas.append((id_list[i], date_list[i], score_scale_list[i],
                                  list(vec_scale_list[i]) + vec_list[i]))
                else:
                    datas.append((id_list[i], date_list[i], score_scale_list[i],
                                  list(vec_scale_list[i])))
            return datas
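# A hedged example of the row layout load_csv_data expects: column 0 is the
# stock id, column 1 the trade date, column 2 the score, and the rest are
# features. The file name and values are made up for illustration.
def demo_load_csv():
    with open('demo_quant.csv', 'wb') as fout:
        fout.write('600000,20160104,0.87,1.2,0.3,-0.5\n')
        fout.write('600001,20160104,-0.42,0.8,1.1,0.0\n')
    rows = load_csv_data('demo_quant.csv', normalize=True, is_combine=False)
    for stock_id, trade_date, score, vec in rows:
        print stock_id, trade_date, score, vec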
def test_quant_data_wrapper(input_file_numbers, model, normalize=True, predict_iteration=None):
    """
    input: (file_names, model)
    output: mean rank rate
    """
    mean_rank_rates = []
    file_number_list = []
    if predict_iteration:
        model.save_model('tmp_model.txt', num_iteration=predict_iteration)
        model = Booster(model_file='tmp_model.txt')
    for i in input_file_numbers:
        data_root_path = '%s/datas/Quant_Datas_v3.0' % (DATA_ROOT)
        if normalize:
            fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        else:
            fin_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
        try:
            with open(fin_path, 'rb') as fin_data_file:
                stock_ids, stock_scores, vec_values = cPickle.load(fin_data_file)
                data_process_logger.info('testing file: %s' % fin_path)
                input_datas = np.column_stack((stock_ids, stock_scores, vec_values))
                mean_rank_rate = test_datas(input_datas, model)
                if mean_rank_rate >= 0.4:
                    data_analysis_logger.info('the file number is %s, obs = %s' % (i, len(input_datas)))
                mean_rank_rates.append(mean_rank_rate)
                file_number_list.append(i)
        except Exception as e:
            data_process_logger.info('test file failed: file path=%s, details=%s' % (fin_path, e))
    mean_rank_rate = np.mean(mean_rank_rates)
    std_rank_rate = np.std(mean_rank_rates)
    var_rank = np.var(mean_rank_rates)
    data_process_logger.info(
        'Tested %s files, all input files mean rank rate is %s, all input files std is %s, var is %s' % (
            len(input_file_numbers), mean_rank_rate, std_rank_rate, var_rank))
    return file_number_list, mean_rank_rates
def test_predict():
    """
    Main entry point for testing.
    Returns:
    """
    model_tag = 'lambdarank_3.0_127leaves_full_eval_earlystop'
    lightgbm_mod = pickle.load(
        open('%s/models/lightgbm_%s.model' % (PROJECT_PATH, model_tag), 'rb'))
    # data_root_path = '%s/datas/Quant-Datas-2.0' % (DATA_ROOT)
    # fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, 1)
    # test_single_lambdarank_file(fin_path, lightgbm_mod)
    f_numbers, f_rank_rates = pipeline_test_lambdarank_wrapper(
        # range(1, 3) + [11],
        # range(1, 300) + range(401, 840) + range(941, 1042) + range(1145, 1200) + range(1301, 1400) + range(1511, 1521),
        range(540, 640) + range(800, 845) + range(920, 945) + range(1020, 1045) + range(1200, 1214),
        model=lightgbm_mod)
    result_tag = 'haha'
    result_csv_path = '%s/pipelines/test_%s_%s_result_%s.csv' % (
        PROJECT_PATH, model_tag, result_tag, len(f_numbers))
    with open(result_csv_path, 'wb') as fout:
        for i in range(len(f_numbers)):
            fout.write('%s,%s\n' % (f_numbers[i], f_rank_rates[i]))
    data_process_logger.info('result csv: %s' % result_csv_path)
def batch_process_real_data(model_path, file_numbers=None, workspace_root='./', model_tag='real', predict_iter=None):
    """
    Batch-process real-world data; results are written into the <model_tag>
    folder under the corresponding directory.
    Args:
        model_tag: model tag
        predict_iter: model iteration to use for prediction
        model_path: path of the model used for prediction (a lightGBM model)
        workspace_root: directory containing the csv files
        file_numbers: list of file numbers to process
    Returns:
        None
    """
    if file_numbers is None:  # avoid a mutable default argument
        file_numbers = []
    output_path = os.path.join(workspace_root, '%s_results' % model_tag)
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    predict_mod = pickle.load(open(model_path, 'rb'))
    for file_n in file_numbers:
        fin_path = (workspace_root + '/%s.csv') % file_n
        fout_path = (output_path + '/%s_result.csv') % file_n
        turn_csv_into_result(fin_path, fout_path, predict_mod, predict_iter, True, False)
    data_process_logger.info('Done with %s files' % len(file_numbers))
def test_single_lambdarank_file(fin_path, model_file):
    try:
        data_analysis_logger.info('testing %s' % fin_path)
        stock_ids, stock_scores, vec_values, stock_rank_labels, query_count = process_single_pickle_data(fin_path)
        ylist = model_file.predict(vec_values)
        origin_score_list = stock_scores
        combined_score_list = np.column_stack((ylist, origin_score_list)).tolist()
        # sort ascending by the original score
        origin_ranked_list = sorted(combined_score_list, cmp=lambda x, y: 1 if x[1] - y[1] > 0 else -1)
        # record each row's rank index under the original score
        index_ylist = [(i, origin_ranked_list[i][0], origin_ranked_list[i][1])
                       for i in range(len(origin_ranked_list))]
        # sort descending by the predicted score, carrying the original rank index along
        predict_ranked_index_ylist = sorted(index_ylist, cmp=lambda x, y: 1 if x[1] - y[1] < 0 else -1)
        mean_rank_rate = result_validation(predict_ranked_index_ylist, N=50)
        if mean_rank_rate >= 0.4:
            data_analysis_logger.info('the file path is %s, obs = %s' % (fin_path, len(stock_scores)))
        return mean_rank_rate
    except Exception as e:
        data_process_logger.info('test file failed: file path=%s, details=%s' % (fin_path, e))
        return None
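# A tiny worked example of the double sort above, with made-up numbers. Rows
# are (predicted_score, original_score): the ascending sort on the original
# score turns positions into "true" ranks, and the descending sort on the
# predicted score shows where the model places each true rank.
def demo_double_sort():
    combined = [(0.9, -1.2), (0.1, 0.8), (0.5, -0.3)]
    origin_ranked = sorted(combined, cmp=lambda x, y: 1 if x[1] - y[1] > 0 else -1)
    index_ylist = [(i, origin_ranked[i][0], origin_ranked[i][1]) for i in range(len(origin_ranked))]
    predict_ranked = sorted(index_ylist, cmp=lambda x, y: 1 if x[1] - y[1] < 0 else -1)
    # -> [(0, 0.9, -1.2), (1, 0.5, -0.3), (2, 0.1, 0.8)]: the model's top pick
    #    has true rank 0, i.e. the prediction order matches the true order here.
    print predict_ranked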
def train_artistsong2vec_model(fout_path, input_datas=None, data_path=None,
                               min_count=5, sorted_vocab=1, window=10, size=250, iter_n=50):
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    # flatten each playlist into an alternating artist/song token sequence
    full_data = []
    for i in input_datas:
        tmp = []
        for j in i:
            tmp.append(j[0])
            tmp.append(j[1])
        full_data.append(tmp)
    data_process_logger.info('start training')
    wv_model = gensim.models.Word2Vec(full_data, min_count=min_count, sorted_vocab=sorted_vocab,
                                      window=window, size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved'
def result_validation(ranked_index_ylist, N=50, threshold=0.35):
    # take the top-N rows of the prediction-ranked list and average their true scores
    buyer_list = ranked_index_ylist[:N]
    true_roa = [i[2] for i in buyer_list]
    mean_score = np.mean(true_roa)
    data_process_logger.info('mean_score = %s' % mean_score)
    return mean_score
def cluster_artist_in_playlist(self, playlist_id, cluster_n=5, is_detailed=False):
    """
    Cluster the artists inside a single playlist.
    Args:
        playlist_id: playlist id
        cluster_n: number of clusters
        is_detailed: whether to include detailed info
    Returns:
        the clustered lists
    """
    playlist_obj = playlist_detail(playlist_id)
    artist_list = []
    vec_list = []
    ap_cluster = AffinityPropagation()
    data_process_logger.info('clustering playlist: %s' % playlist_obj['name'])
    for item in playlist_obj['tracks']:
        artist = item['artists'][0]['name'].lower()
        if artist not in artist_list:
            artist_list.append(artist)
            if self.artist2vec_model.vocab.get(artist) and len(self.artist2vec_model.syn0norm):
                artist_vec = self.artist2vec_model.syn0norm[self.artist2vec_model.vocab[artist].index]
            else:
                data_process_logger.warn(
                    'The artist %s of playlist-%s is not in dataset' % (artist, playlist_obj['name']))
                # fall back to a zero vector for out-of-vocabulary artists
                artist_vec = [0 for i in range(self.artist2vec_model.vector_size)]
            vec_list.append(artist_vec)
    if len(vec_list) > 1:
        cluster_result = ap_cluster.fit(vec_list)
        cluster_array = [[] for i in range(len(cluster_result.cluster_centers_indices_))]
        for i in range(len(cluster_result.labels_)):
            label = cluster_result.labels_[i]
            cluster_array[label].append(artist_list[i])
        return cluster_array, playlist_obj['name'], {}
    else:
        return [artist_list], playlist_obj['name'], {}
def infer_missing_datas(fin_csv_path, fout_pickle_path, is_norm=False, is_norm_score=True):
    """
    Impute NaN values and store the processed data as a pickle file.
    Args:
        is_norm: whether to standardize the features
        is_norm_score: whether to standardize the score
        fin_csv_path: input csv path
        fout_pickle_path: output pickle path
    Returns:
        the transformed data
    """
    with open(fin_csv_path, 'rb') as fin_csv, \
            open(fout_pickle_path, 'wb') as fout_pickle:
        origin_datas = []
        reader = csv.reader(fin_csv)
        count = 1
        n_feature = 4563
        data_process_logger.info('start reading %s' % fin_csv_path)
        for line in reader:
            if len(line) == n_feature:
                single_vec_value = [float(i) if i != 'NaN' else np.nan for i in line]
                # drop the column at index 453: it leaks future information
                single_vec_value = single_vec_value[:453] + single_vec_value[454:]
                origin_datas.append(single_vec_value)
            else:
                data_process_logger.info(
                    'casting line: %s in file %s, it has %s features while the first line has %s'
                    % (count, fin_csv_path, len(line), n_feature))
            count += 1
        # impute missing values with the column mean
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(origin_datas)
        transformed_datas = imp.transform(origin_datas)
        if is_norm:
            # standardize the feature columns (and optionally the score)
            stock_ids = transformed_datas[:, 0]
            stock_scores = transformed_datas[:, 1]
            vec_values = transformed_datas[:, 2:]
            scaled_vec_values = preprocessing.scale(vec_values)
            if is_norm_score:
                stock_scores = preprocessing.scale(stock_scores)
            transformed_datas = (stock_ids.tolist(), stock_scores.tolist(),
                                 scaled_vec_values.tolist())  # stored as a tuple
        data_process_logger.info('start dumping %s' % fout_pickle_path)
        cPickle.dump(transformed_datas, fout_pickle, protocol=2)
        data_process_logger.info('%s done' % fin_csv_path)
        return transformed_datas
# ------ For parallel process -----
def test_single_file(fin_path):
    try:
        global g_model
        # recover the file number from the '<n>_trans*.pickle' naming convention
        file_n = os.path.basename(fin_path).split('_')[0]
        with open(fin_path, 'rb') as fin_data_file:
            stock_ids, stock_scores, vec_values = cPickle.load(fin_data_file)
            data_process_logger.info('testing file: %s' % fin_path)
            input_datas = np.column_stack((stock_ids, stock_scores, vec_values))
            mean_rank_rate = test_datas(input_datas, g_model)
            if mean_rank_rate >= 0.4:
                data_analysis_logger.info('the file number is %s, obs = %s' % (file_n, len(input_datas)))
            return mean_rank_rate, file_n
    except Exception as e:
        data_process_logger.info('test file failed: file path=%s, details=%s' % (fin_path, e))
        return None, None
def update_userinfo():
    """
    One-off script to update the user infos in the database.
    Returns:
    """
    DAO_inst = CloudMusicDAO('MusicTaster', 'UserInfos')
    uids = DAO_inst.db_inst.distinct('userId')
    count = 0
    for uid in uids:
        userinfo = DAO_inst.db_inst.find_one({'userId': uid})
        userinfo['follow_count'] = len(userinfo['follow_ids'])
        userinfo['fan_count'] = len(userinfo['fan_ids'])
        DAO_inst.save_unique_item(userinfo, primary_key='userId', is_overwrite=True)
        data_process_logger.info('No.%s %s-%s' % (count, userinfo['userId'], userinfo['nickname']))
        count += 1
    print 'done'
def parallel_test_quant_data_wrapper(input_file_numbers, model, normalize=True,
                                     predict_iteration=None, process_count=2):
    """
    input: (file_names, model)
    output: mean rank rate
    """
    mean_rank_rates = []
    file_number_list = []
    if predict_iteration:
        model.save_model('tmp_model.txt', num_iteration=predict_iteration)
    else:
        model.save_model('tmp_model.txt')
    # share the reloaded Booster with the worker processes via a module global
    global g_model
    g_model = Booster(model_file='tmp_model.txt')
    proc_pool = multiprocessing.Pool(process_count)
    multi_result = []
    for i in input_file_numbers:
        data_root_path = '%s/datas/Quant-Datas-2.0' % (DATA_ROOT)
        if normalize:
            fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        else:
            fin_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
        data_res = proc_pool.apply_async(test_single_file, args=(fin_path,))
        multi_result.append(data_res)
    proc_pool.close()
    proc_pool.join()
    # merge the results, skipping files that failed
    for i in range(len(multi_result)):
        tmp_mean_rank_rate, file_n = multi_result[i].get()
        if tmp_mean_rank_rate is not None:
            mean_rank_rates.append(tmp_mean_rank_rate)
            file_number_list.append(file_n)
    mean_rank_rate = np.mean(mean_rank_rates)
    std_rank_rate = np.std(mean_rank_rates)
    var_rank = np.var(mean_rank_rates)
    data_process_logger.info(
        'Tested %s files, all input files mean rank rate is %s, all input files std is %s, var is %s' % (
            len(input_file_numbers), mean_rank_rate, std_rank_rate, var_rank))
    return file_number_list, mean_rank_rates
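# The pool workers above read g_model instead of receiving the Booster as an
# argument: when the pool forks, child processes inherit the module global, so
# the model is not pickled once per task. A minimal self-contained sketch of
# the same pattern (names are made up for illustration; relies on fork-style
# process start, as on Linux):
import multiprocessing

g_shared = None

def _demo_worker(x):
    # reads the global inherited from the parent at fork time
    return g_shared * x

def demo_global_pool():
    global g_shared
    g_shared = 10
    pool = multiprocessing.Pool(2)
    results = [pool.apply_async(_demo_worker, args=(i,)) for i in range(5)]
    pool.close()
    pool.join()
    print [r.get() for r in results]  # -> [0, 10, 20, 30, 40]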
def parallel_inferring(file_number_list, process_count=12, is_norm=True, is_norm_score=True, data_root_path=None):
    """
    Run the data-cleaning step in parallel.
    Returns:
    """
    data_process_logger.info('Start parallel inferring, process count = %s' % process_count)
    proc_pool = multiprocessing.Pool(process_count)
    for i in file_number_list:
        if not data_root_path:
            data_root_path = '%s/datas/Quant-Datas-2.0' % (DATA_ROOT)
        fin_csv_path = '%s/%s.csv' % (data_root_path, i)
        if is_norm:
            fout_gzip_path = '%s/gzip_datas_norm/%s_trans_norm.gz' % (data_root_path, i)
        else:
            fout_gzip_path = '%s/gzip_datas/%s_trans.gz' % (data_root_path, i)
        proc_pool.apply_async(infer_missing_datas_to_gzip,
                              args=(fin_csv_path, fout_gzip_path, is_norm, is_norm_score))
    proc_pool.close()
    proc_pool.join()
    data_process_logger.info('Done with %s files' % len(file_number_list))
def train_song2vec_model(fout_path, input_datas=None, data_path=None,
                         min_count=5, sorted_vocab=1, window=10, size=250, iter_n=50):
    """
    Train the song2vec model.
    Args:
        fout_path: output path for the pickled model
        input_datas: training sequences; if None, they are loaded from data_path
        data_path: path of the pickled training sequences
        min_count: ignore songs with fewer occurrences
        sorted_vocab: whether to sort the vocabulary by frequency
        window: context window size
        size: embedding dimension
        iter_n: number of training iterations
    Returns:
    """
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    data_process_logger.info('start training')
    # sample a fixed-size subset of the playlists to bound training time
    random.shuffle(input_datas)
    input_datas = input_datas[:45000]
    wv_model = gensim.models.Word2Vec(input_datas, min_count=min_count, sorted_vocab=sorted_vocab,
                                      window=window, size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved'
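# A hedged usage sketch: train on the song sequences dumped by
# prepare_song_dict below. The file names here are illustrative only.
def demo_train_song2vec():
    song_seqs = pickle.load(open('../datas/songs_seq_demo.dat', 'rb'))
    train_song2vec_model('../datas/demo_song2vec.model',
                         input_datas=song_seqs,
                         size=50, window=10, min_count=5, iter_n=20)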
def fill_song_comments():
    """
    Fill in the comment details for each song.
    Returns:
    """
    dao_inst = CloudMusicDAO('MusicTaster', 'SongInfos')
    find_result = dao_inst.db_inst.find({'commentInfo': {'$exists': False}})
    count = 0
    for song_item in find_result:
        comm_data = song_comments(song_item['commentThreadId'], limit=10)
        if comm_data:  # make sure the comment details were fetched correctly
            del comm_data['code']
            song_item['commentInfo'] = comm_data
            song_item['commentCount'] = comm_data['total']
            dao_inst.db_inst.save(song_item)
            data_process_logger.info(
                'No.%s %s, comments: %s done' % (count, song_item['name'], song_item['commentCount']))
            count += 1
        # sleep a random 1-3 seconds between requests to avoid rate limiting
        slp = random.random() * 2 + 1
        data_process_logger.info('sleep %s sec' % slp)
        time.sleep(slp)
def prepare_artist_dict(tag=''):
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    find_result = playlist_dao_inst.db_inst.find(
        {'trackCount': {'$gte': 10, '$lte': 600}, 'playCount': {'$gte': 5}},
        {'tracks': 1, 'name': 1}).limit(100000)
    # combine the artists of each playlist into an artist-name sequence
    total_artists_set = []
    count = 0
    for item in find_result:
        data_process_logger.info('No.%s %s' % (count, item['name']))
        # save the artist sequence of this playlist
        artists_seq = []
        for song in item['tracks']:
            aname = song['artists'][0]['name']
            artists_seq.append(aname.lower())
        total_artists_set.append(artists_seq)
        count += 1
    data_process_logger.info('start building dictionary')
    artist_dictionary = corpora.Dictionary(total_artists_set)
    print 'playlist count:', artist_dictionary.num_docs
    try:
        print 'artist count:', len(artist_dictionary.token2id)
    except Exception as e:
        print 'error = %s' % e
def prepare_song_dict(tag=''):
    """
    Iterate over the playlists in the database and prepare the song2vec training data.
    Args:
        tag: tag appended to the output file names
    Returns:
        the song dictionary
    """
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    query = {'trackCount': {'$gte': 3, '$lte': 1000}, 'playCount': {'$gte': 1}}
    print playlist_dao_inst.db_inst.find(query, {'tracks': 1, 'name': 1}).limit(100000).count()
    find_result = playlist_dao_inst.db_inst.find(query, {'tracks': 1, 'name': 1}).limit(100000)
    # combine the songs of each playlist into a song-name sequence
    total_song_set = []
    count = 0
    for item in find_result:
        data_process_logger.info('No.%s %s' % (count, item['name']))
        # save the song sequence of this playlist
        song_seq = []
        for song in item['tracks']:
            sname = song['name']
            song_seq.append(sname.lower())
        total_song_set.append(song_seq)
        count += 1
    data_process_logger.info('start building dictionary')
    song_dictionary = corpora.Dictionary(total_song_set)
    print 'playlist count:', song_dictionary.num_docs
    print 'song count:', song_dictionary.num_pos
    data_process_logger.info('start saving datas')
    song_dictionary.save('../datas/song_dictionary_%s.dict' % tag)
    pickle.dump(total_song_set, open('../datas/songs_seq_%s.dat' % tag, 'wb'))
    return song_dictionary
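# A small illustration of what corpora.Dictionary builds from the playlist
# sequences (toy data): gensim's Dictionary maps each distinct token to an
# integer id and tracks corpus statistics such as num_docs.
def demo_song_dictionary():
    from gensim import corpora
    playlists = [['song a', 'song b', 'song c'],
                 ['song b', 'song c', 'song d']]
    dictionary = corpora.Dictionary(playlists)
    print dictionary.num_docs       # -> 2 playlists
    print len(dictionary.token2id)  # -> 4 distinct songs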
def process_single_pickle_data(pickle_file_path, query_label=1):
    """
    Process a single pickled data file: sort it and generate the query data.
    Returns:
        tuple of ranked-features and query data
    """

    def rank_value(rank, max_rank):
        """
        Map a rank position to a rank weight.
        Returns:
        """
        rank_steps = [0.01, 0.05, 0.1, 0.15, 0.3, 0.4, 0.5, 0.7, 0.9, 1]
        rank_weights = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
        # float division: int / int would truncate to 0 under Python 2 and give
        # almost every rank the top weight
        basic_rank = float(rank) / max_rank
        for rank_index in range(len(rank_steps)):
            if basic_rank <= rank_steps[rank_index]:
                return rank_weights[rank_index]
        return 0

    with open(pickle_file_path, 'rb') as fin:
        data_process_logger.info('processing file: %s' % pickle_file_path)
        pickle_obj = pickle.load(fin)
        combined_obj = [(pickle_obj[0][i], pickle_obj[1][i], pickle_obj[2][i])
                        for i in range(len(pickle_obj[0]))]
        # sort ascending by score
        ranked_datas = sorted(combined_obj, cmp=lambda x, y: 1 if x[1] - y[1] > 0 else -1)
        stock_ids = [a[0] for a in ranked_datas]
        stock_scores = [a[1] for a in ranked_datas]
        stock_rank_labels = [rank_value(a, len(ranked_datas))
                             for a in range(len(ranked_datas))]  # labels for ranking
        vec_values = [a[2] for a in ranked_datas]
        return stock_ids, stock_scores, vec_values, stock_rank_labels, len(ranked_datas)
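# A quick worked example of the rank-weight bucketing above: with
# max_rank = 1000, ranks in the top 1% get weight 10, the next bucket 9, and
# so on down to 1. The bucketing is inlined here so the demo is self-contained.
def demo_rank_value():
    rank_steps = [0.01, 0.05, 0.1, 0.15, 0.3, 0.4, 0.5, 0.7, 0.9, 1]
    rank_weights = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]

    def rank_value(rank, max_rank):
        basic_rank = float(rank) / max_rank
        for i in range(len(rank_steps)):
            if basic_rank <= rank_steps[i]:
                return rank_weights[i]
        return 0

    print rank_value(5, 1000)    # 5/1000 = 0.005 <= 0.01 -> weight 10
    print rank_value(120, 1000)  # 0.12 falls in the (0.1, 0.15] bucket -> 7
    print rank_value(999, 1000)  # 0.999 falls in the (0.9, 1] bucket -> 1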
def pipeline_test_lambdarank_wrapper(input_file_numbers, model, normalize=True, predict_iteration=None):
    """
    Run the result tests.
    Args:
        input_file_numbers:
        model:
        normalize:
        predict_iteration:
    Returns:
    """
    mean_rank_rates = []
    file_number_list = []
    if predict_iteration:
        model.save_model('tmp_lambdarank_model.txt', num_iteration=predict_iteration)
        model = Booster(model_file='tmp_lambdarank_model.txt')
    for i in input_file_numbers:
        data_root_path = '%s/datas/Quant_Datas_v3.0' % (DATA_ROOT)
        if normalize:
            fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        else:
            fin_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
        try:
            mean_rank_rate = test_single_lambdarank_file(fin_path, model)
            if mean_rank_rate:
                mean_rank_rates.append(mean_rank_rate)
                file_number_list.append(i)
        except Exception as e:
            data_process_logger.info('test file failed: file path=%s, details=%s' % (fin_path, e))
    mean_rank_rate = np.mean(mean_rank_rates)
    std_rank_rate = np.std(mean_rank_rates)
    var_rank = np.var(mean_rank_rates)
    data_process_logger.info(
        'Tested %s files, all input files mean rank rate is %s, all input files std is %s, var is %s'
        % (len(input_file_numbers), mean_rank_rate, std_rank_rate, var_rank))
    return file_number_list, mean_rank_rates
def prepare_gbdt_datas(file_number_list, DATA_ROOT, process_count=2):
    proc_pool = multiprocessing.Pool(process_count)
    multi_results = []
    for i in file_number_list:
        data_root_path = '%s/datas/Quant_Datas_v4.0' % (DATA_ROOT)
        pickle_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        data_process_logger.info('add file: %s' % pickle_path)
        data_res = proc_pool.apply_async(load_pickle_datas, args=(pickle_path,))
        multi_results.append(data_res)
    proc_pool.close()
    proc_pool.join()
    # fetch datas from pool
    stock_ids, stock_scores, vec_values = multi_results[0].get()
    label_list = stock_scores
    vec_list = vec_values
    data_process_logger.info('combining datas...')
    for i in xrange(1, len(multi_results)):
        data_process_logger.info('combining No.%s data' % i)
        try:
            stock_ids, stock_scores, vec_values = multi_results[i].get()
            for index in range(len(vec_values)):
                vec = vec_values[index]
                label = stock_scores[index]
                # only keep rows whose feature count matches the rows collected so far
                if len(vec) == len(vec_list[-1]):
                    vec_list.append(vec)
                    label_list.append(label)
                else:
                    print 'not equaling n_feature: %s' % len(vec)
        except Exception as e:
            data_process_logger.error('No.%s data failed, details=%s' % (i, str(e.message)))
            continue
    return vec_list, label_list
def train_regression_age_model(input_xlist, input_ylist, model_label):
    """
    train age regression model, with grid search
    Args:
        input_xlist:
        input_ylist:
        model_label:
    Returns:
    """
    from sklearn import svm
    from sklearn.ensemble import GradientBoostingRegressor
    data_process_logger.info('loading model')
    input_xlist = np.float64(input_xlist)
    # SVR
    data_process_logger.info('training svr')
    clf = svm.SVR()
    parameters = {'C': [1e3, 5e3, 1e2, 1e1, 1e-1],
                  'kernel': ['rbf', 'sigmoid'],
                  'gamma': [0.0001, 0.001, 0.01, 0.1, 0.05]}
    svr_mod = grid_search.GridSearchCV(clf, parameters, n_jobs=12, scoring='mean_absolute_error')
    svr_mod.fit(input_xlist, input_ylist)
    print svr_mod.best_estimator_
    fout = open('%s/models/svr_%s.model' % (PROJECT_PATH, model_label), 'wb')
    cPickle.dump(svr_mod, fout)
    for item in svr_mod.grid_scores_:
        print item
    # GBRT
    data_process_logger.info('training gbrt')
    gbrt_mod = GradientBoostingRegressor()
    gbrt_parameters = {'n_estimators': [300, 350],
                       'max_depth': [2, 3, 4],
                       'max_leaf_nodes': [10, 20],
                       'loss': ['huber', 'lad'],
                       'subsample': [0.2, 0.5, 0.7]}
    gbrt_mod = grid_search.GridSearchCV(gbrt_mod, gbrt_parameters, n_jobs=12, scoring='mean_absolute_error')
    gbrt_mod.fit(input_xlist, input_ylist)
    gbrt_out = open('%s/models/gbrt_%s.model' % (PROJECT_PATH, model_label), 'wb')
    cPickle.dump(gbrt_mod, gbrt_out)
    print gbrt_mod.best_estimator_
    for item in gbrt_mod.grid_scores_:
        print item
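# A hedged sketch of consuming the dumped grid-search model. The model_label
# and the feature vector are illustrative; GridSearchCV objects expose
# predict() through the refitted best estimator.
def demo_predict_age(model_label='demo'):
    fin = open('%s/models/svr_%s.model' % (PROJECT_PATH, model_label), 'rb')
    svr_mod = cPickle.load(fin)
    fin.close()
    sample_features = np.float64([[0.1, 0.5, 0.3]])  # made-up feature vector
    print svr_mod.predict(sample_features)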
def prepare_datas(file_number_list, DATA_ROOT, process_count=2):
    """
    Prepare the training data.
    Args:
        process_count:
        DATA_ROOT:
        file_number_list:
    Returns:
    """
    # load with multi-processor
    proc_pool = multiprocessing.Pool(process_count)
    multi_results = []
    for i in file_number_list:
        data_root_path = '%s/datas/Quant_Datas_v3.0' % (DATA_ROOT)
        pickle_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        data_process_logger.info('add file: %s' % pickle_path)
        data_res = proc_pool.apply_async(process_single_pickle_data, args=(pickle_path, i))
        multi_results.append(data_res)
    proc_pool.close()
    proc_pool.join()
    # fetch datas from pool
    label_list = []
    vec_list = []
    query_datas = []
    data_process_logger.info('combining datas...')
    for i in xrange(0, len(multi_results)):
        data_process_logger.info('combining No.%s data' % i)
        try:
            stock_ids, stock_scores, vec_values, stock_rank_labels, query_count = multi_results[i].get()
            tmp_vec_list = []
            tmp_label_list = []
            for index in range(len(vec_values)):
                vec = vec_values[index]
                label = stock_rank_labels[index]
                if len(vec) >= 4561:
                    tmp_vec_list.append(vec[:4561])
                    tmp_label_list.append(label)
                else:
                    raise IndexError('not equaling n_feature: %s' % len(vec))
            if query_count == len(tmp_vec_list):
                vec_list.extend(tmp_vec_list)
                label_list.extend(tmp_label_list)
                query_datas.append(query_count)
            else:
                raise IndexError('query count %s not equal to len(tmp_vec_list): %s'
                                 % (query_count, len(tmp_vec_list)))
        except Exception as e:
            data_process_logger.error('No.%s data failed, details=%s' % (i, str(e.message)))
            continue
    return vec_list, label_list, query_datas
import os
import pickle
import sys

from flask import Flask

abs_path = os.path.dirname(os.path.abspath(__file__))
abs_father_path = os.path.dirname(abs_path)
PROJECT_PATH = abs_father_path
print 'Used file: %s\nProject path=%s' % (__file__, PROJECT_PATH)
sys.path.append(PROJECT_PATH)  # add flask path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from pipelines.process_real_datas import turn_csv_into_result
from utils.lightgbm_operator import LightgbmOperator
from utils.logger_utils import data_process_logger

app = Flask(__name__)
data_process_logger.info('initing lightGBM operator')
# init: dump each Booster to a temp file and wrap it in an operator
oldbest_mod = pickle.load(
    open('%s/models/best_models/lightgbm_New_Quant_Data_rebalanced_norm_gbdt_7leaves_iter30000_best.model'
         % PROJECT_PATH))
oldbest_mod.save_model('flask_model.txt', num_iteration=27000)
oldbest_operator = LightgbmOperator(
    'flask_model.txt', 'New_Quant_Data_rebalanced_norm_gbdt_7leaves_iter30000_best')
full_mod = pickle.load(
    open('%s/models/best_models/lightgbm_Full_gbdt_15leaves.model' % PROJECT_PATH))
full_mod.save_model('flask_model.txt', num_iteration=50000)
import os
import sys

from flask import Flask, render_template

abs_path = os.path.dirname(os.path.abspath(__file__))
abs_father_path = os.path.dirname(abs_path)
PROJECT_PATH = abs_father_path
print 'Used file: %s\nProject path=%s' % (__file__, PROJECT_PATH)
sys.path.append(PROJECT_PATH)  # add flask path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from song2vec.song2vec_operator import Song2VecOperator
from utils.logger_utils import data_process_logger

app = Flask(__name__)
data_process_logger.info('initing song2vec operator')
s2v_operator = Song2VecOperator(
    song2vec_model_path='%s/datas/full_50d_20iter_10win_5min_song2vec.model' % PROJECT_PATH,
    artist2vec_model_path='%s/datas/full_50d_20iter_10win_5min_artist2vec.model' % PROJECT_PATH)
data_process_logger.info('complete init song2vec')


@app.route('/musictaster')
def hello_world():
    return render_template("demo.html")


@app.route('/musictaster/similar/song', methods=['POST'])
@app.route('/musictaster/similar/song/<song_name>', methods=['GET'])
def turn_csv_into_result(origin_csv_path, output_csv_path, predict_model, predict_iteration,
                         is_norm=True, is_norm_score=True):
    """
    Turn a raw feature csv into a ranked result csv.
    Args:
        predict_model: model used for prediction
        origin_csv_path: input csv path
        output_csv_path: output csv path
        predict_iteration: model iteration to use for prediction
        is_norm: whether to standardize the features
        is_norm_score: whether to standardize the score
    Returns:
        the sorted result rows
    """
    data_process_logger.info('handling %s' % origin_csv_path)
    with open(origin_csv_path, 'rb') as fin_csv, open(output_csv_path, 'wb') as fout_csv:
        reader = csv.reader(fin_csv)
        writer = csv.writer(fout_csv)
        origin_datas = []
        data_process_logger.info('start reading %s' % origin_csv_path)
        # first impute the missing values and standardize
        count = 1
        n_feature = 4563
        for line in reader:
            if len(line) == n_feature:
                single_vec_value = [float(i) if i != 'NaN' else np.nan for i in line]
                # drop the column at index 453: it leaks future information
                single_vec_value = single_vec_value[:453] + single_vec_value[454:]
                origin_datas.append(single_vec_value)
            else:
                data_process_logger.info(
                    'casting line: %s in file %s, it has %s features while the first line has %s' % (
                        count, origin_csv_path, len(line), n_feature))
            count += 1
        # inferring missing data
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(origin_datas)
        transformed_datas = imp.transform(origin_datas)
        if is_norm:
            # standardising datas
            stock_ids = transformed_datas[:, 0]
            stock_scores = transformed_datas[:, 1]
            vec_values = transformed_datas[:, 2:]
            scaled_vec_values = preprocessing.scale(vec_values)
            if is_norm_score:
                stock_scores = preprocessing.scale(stock_scores)
            transformed_datas = np.column_stack((stock_ids, stock_scores, scaled_vec_values))
        # run the prediction
        xlist = [a[2:] for a in transformed_datas]  # feature values
        origin_score_list = [a[1] for a in transformed_datas]
        stock_ids = [a[0] for a in transformed_datas]
        score_list = predict_model.predict(xlist, num_iteration=predict_iteration)
        line_numbers = range(1, len(xlist) + 1)
        # sort the predictions and write out the result csv
        result = np.column_stack((line_numbers, stock_ids, score_list, origin_score_list))
        sorted_result = sorted(result, cmp=lambda x, y: 1 if x[2] - y[2] > 0 else -1)
        writer.writerow(['origin_line', 'stock_id', 'predict_score', 'origin_score'])
        for row in sorted_result:
            writer.writerow([int(row[0]), str(row[1]), row[2], row[3]])
        data_process_logger.info('complete writing %s' % output_csv_path)
        return sorted_result
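# A hedged end-to-end sketch: load a pickled lightGBM model and rank one raw
# csv with it. The paths and iteration count are illustrative only.
def demo_turn_csv():
    model = pickle.load(open('%s/models/lightgbm_demo.model' % PROJECT_PATH, 'rb'))
    rows = turn_csv_into_result('raw_1.csv', 'ranked_1.csv', model,
                                predict_iteration=27000, is_norm=True, is_norm_score=False)
    # rows are (origin_line, stock_id, predict_score, origin_score),
    # sorted ascending by the predicted score
    print rows[:5]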
def infer_missing_datas_to_gzip(fin_csv_path, fout_gzip_path, is_norm=True, is_norm_score=True):
    """
    Impute NaN values and store the processed data as a gzipped csv file.
    Args:
        is_norm: whether to standardize the features
        is_norm_score: whether to standardize the score
        fin_csv_path: input csv path
        fout_gzip_path: output gzip path
    Returns:
        the transformed data
    """
    with open(fin_csv_path, 'rb') as fin_csv, \
            gzip.open(fout_gzip_path, 'wb') as fout_gzip:
        origin_datas = []
        reader = csv.reader(fin_csv)
        count = 1
        n_feature = 4563
        data_process_logger.info('start reading %s' % fin_csv_path)
        for line in reader:
            if len(line) == n_feature:
                single_vec_value = [float(i) if i != 'NaN' else np.nan for i in line]
                # NOTE: unlike infer_missing_datas, the future-feature column at
                # index 453 is kept here
                # single_vec_value = single_vec_value[:453] + single_vec_value[454:]
                origin_datas.append(single_vec_value)
            else:
                data_process_logger.info(
                    'casting line: %s in file %s, it has %s features while the first line has %s'
                    % (count, fin_csv_path, len(line), n_feature))
            count += 1
        # impute missing values with the column mean
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(origin_datas)
        transformed_datas = imp.transform(origin_datas)
        stock_ids = transformed_datas[:, 0]
        stock_scores = transformed_datas[:, 1]
        scaled_vec_values = transformed_datas[:, 2:]
        if is_norm:
            # standardising datas
            scaled_vec_values = preprocessing.scale(scaled_vec_values)
            if is_norm_score:
                stock_scores = preprocessing.scale(stock_scores)
        data_process_logger.info('start saving %s' % fout_gzip_path)
        # write one comma-joined line per stock: id, score, then the features
        for line_index in xrange(len(stock_ids)):
            stock_id = stock_ids[line_index]
            stock_score = stock_scores[line_index]
            scaled_vec_value = scaled_vec_values[line_index]
            tmp_vec = [int(stock_id)] + [float(stock_score)] + scaled_vec_value.tolist()
            tmp_vec_str = [str(a) for a in tmp_vec]
            try:
                tmp_line = ','.join(tmp_vec_str)
                fout_gzip.write(tmp_line + '\n')
                if line_index % 100 == 0:
                    print 'line %s join success' % line_index
            except Exception as e:
                print 'line %s join failed, details=%s' % (line_index, e)
        data_process_logger.info('%s done' % fin_csv_path)
        return transformed_datas
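# A hedged sketch of reading back one of the gzipped files written above (the
# path is illustrative); each line is 'stock_id,score,feature,feature,...'.
def demo_read_gzip(fin_gzip_path='1_trans_norm.gz'):
    with gzip.open(fin_gzip_path, 'rb') as fin:
        for line in fin:
            fields = line.strip().split(',')
            stock_id = int(fields[0])
            stock_score = float(fields[1])
            vec_values = [float(a) for a in fields[2:]]
            print stock_id, stock_score, len(vec_values)
            break  # just show the first row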
def load_pickle_datas(tmp_pickle_path):
    with open(tmp_pickle_path, 'rb') as fin:
        data_process_logger.info('processing %s' % tmp_pickle_path)
        pickle_data = cPickle.load(fin)
        return pickle_data