Example #1
    def cluster_song_in_playlist(self,
                                 playlist_id,
                                 cluster_n=5,
                                 is_detailed=False):
        """
        获取单个歌单内的歌曲聚类信息
        Args:
            playlist_id: 歌单id
            cluster_n:聚类数
            is_detailed: 返回的结果是否包含详情

        Returns:
            聚类后的列表
        """
        playlist_obj = playlist_detail(playlist_id)
        song_list = []
        vec_list = []
        song_info_dict = {}
        ap_cluster = AffinityPropagation()
        data_process_logger.info('clustering playlist: %s' %
                                 playlist_obj['name'])
        for item in playlist_obj['tracks']:
            song = item['name'].lower()
            song_info_dict[song] = {
                'name': song,
                'artist': item['artists'][0]['name'],
                'id': item['id'],
                'album_img_url': item['album']['picUrl'],
                'site_url': 'http://music.163.com/#/song?id=%s' % item['id']
            }
            # print song
            if song not in song_list:
                song_list.append(song)
                # print self.song2vec_model.vocab.get(song)
                # print self.song2vec_model.syn0norm == None
                if self.song2vec_model.vocab.get(song) and len(
                        self.song2vec_model.syn0norm):
                    song_vec = self.song2vec_model.syn0norm[
                        self.song2vec_model.vocab[song].index]
                else:
                    data_process_logger.warn(
                        'The song %s of playlist-%s is not in dataset' %
                        (song, playlist_obj['name']))
                    song_vec = [
                        0 for i in range(self.song2vec_model.vector_size)
                    ]
                vec_list.append(song_vec)
        # song_list = list(song_list)
        if len(vec_list) > 1:
            cluster_result = ap_cluster.fit(vec_list, song_list)
            cluster_array = [
                [] for i in range(len(cluster_result.cluster_centers_indices_))
            ]
            for i in range(len(cluster_result.labels_)):
                label = cluster_result.labels_[i]
                index = i
                cluster_array[label].append(song_list[i])
            return cluster_array, playlist_obj['name'], song_info_dict
        else:
            return [song_list], playlist_obj['name'], song_info_dict
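A minimal usage sketch for the method above, assuming it belongs to an operator class such as the Song2VecOperator initialized in Example #25; the model paths and playlist id below are placeholders:

s2v_operator = Song2VecOperator(
    song2vec_model_path='datas/song2vec_demo.model',      # placeholder path
    artist2vec_model_path='datas/artist2vec_demo.model')  # placeholder path
clusters, playlist_name, song_infos = s2v_operator.cluster_song_in_playlist('123456', cluster_n=5)
for group in clusters:
    data_process_logger.info('%s cluster: %s' % (playlist_name, ', '.join(group)))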
Example #2
def load_csv_data(csv_path, normalize=True, is_combine=False):
    """

    Args:
        csv_path:
        normalize: 是否进行标准化
        is_combine: 是否进行norm特征和的拼接

    Returns:

    """
    from sklearn import preprocessing
    with open(csv_path, 'rb') as fin:
        data_process_logger.info('loading file: %s' % csv_path)
        datas = []
        temp_list = []
        score_list = []
        date_list = []
        id_list = []
        vec_list = []
        for line in fin:
            line = line.strip()
            tmp = line.split(',')
            stock_id = tmp[0]
            trade_date = tmp[1]
            score = eval(tmp[2])
            score_list.append(score)
            vec_value = [eval(a) for a in tmp[3:]]
            vec_list.append(vec_value)
            date_list.append(trade_date)
            id_list.append(stock_id)
            temp_list.append((stock_id, trade_date, score, vec_value))
        # when normalize is False, only the score column is z-scored; feature vectors are kept raw
        if not normalize:
            avg = np.mean(score_list)
            std = np.std(score_list)
            for item in temp_list:
                normalize_score = (item[2] - avg) / std
                datas.append((item[0], item[1], normalize_score, item[3]))
            return datas
        else:
            score_scale = preprocessing.scale(score_list)
            score_scale_list = list(score_scale)
            vec_scale = preprocessing.scale(vec_list)
            vec_scale_list = vec_scale
            for i in range(len(id_list)):
                if is_combine:
                    datas.append(
                        (id_list[i], date_list[i], score_scale_list[i],
                         list(vec_scale_list[i]) + vec_list[i]))
                else:
                    datas.append(
                        (id_list[i], date_list[i], score_scale_list[i],
                         list(vec_scale_list[i])))
            # avg = np.mean(score_list)
            #            std = np.std(score_list)
            #            for item in temp_list:
            #                normalize_score = (item[2] - avg) / std
            #                datas.append((item[0], item[1], normalize_score, item[3]))
            return datas
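A short sketch of consuming the tuples returned above, assuming the function and data_process_logger are importable; the csv path is a placeholder:

datas = load_csv_data('datas/1.csv', normalize=True, is_combine=False)
for stock_id, trade_date, score, vec_value in datas[:5]:
    data_process_logger.info('%s %s score=%s n_features=%s' %
                             (stock_id, trade_date, score, len(vec_value)))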
Example #3
def test_quant_data_wrapper(input_file_numbers, model, normalize=True, predict_iteration=None):
    """
    input:(file_names,model)
    output: mean rank rate
    """
    mean_rank_rates = []
    file_number_list = []
    if predict_iteration:
        model.save_model('tmp_model.txt', num_iteration=predict_iteration)
        model = Booster(model_file='tmp_model.txt')
    for i in input_file_numbers:
        data_root_path = '%s/datas/Quant_Datas_v3.0' % (DATA_ROOT)
        if normalize:
            fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        else:
            fin_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
        try:
            with open(fin_path, 'rb') as fin_data_file:
                stock_ids, stock_scores, vec_values = cPickle.load(fin_data_file)
                data_process_logger.info('testing file: %s' % fin_path)
                input_datas = np.column_stack((stock_ids, stock_scores, vec_values))
                mean_rank_rate = test_datas(input_datas, model)
                if mean_rank_rate >= 0.4:
                    data_analysis_logger.info('the file number is %s, obs = %s' % (i, len(input_datas)))
                mean_rank_rates.append(mean_rank_rate)
                file_number_list.append(i)
        except Exception, e:
            data_process_logger.info('test file failed: file path=%s, details=%s' % (fin_path, e))
Example #4
def test_predict():
    """
    测试的主入口
    Returns:

    """
    model_tag = 'lambdarank_3.0_127leaves_full_eval_earlystop'
    lightgbm_mod = pickle.load(
        open('%s/models/lightgbm_%s.model' % (PROJECT_PATH, model_tag), 'rb'))
    # data_root_path = '%s/datas/Quant-Datas-2.0' % (DATA_ROOT)
    # fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, 1)
    # test_single_lambdarank_file(fin_path, lightgbm_mod)
    f_numbers, f_rank_rates = pipeline_test_lambdarank_wrapper(
        # range(1, 3) + [11],
        # range(1, 300) + range(401, 840) + range(941, 1042) + range(1145, 1200) + range(1301, 1400) + range(1511, 1521),
        range(540, 640) + range(800, 845) + range(920, 945) +
        range(1020, 1045) + range(1200, 1214),
        model=lightgbm_mod)
    result_tag = 'haha'
    with open(
            '%s/pipelines/test_%s_%s_result_%s.csv' %
        (PROJECT_PATH, model_tag, result_tag, len(f_numbers)), 'wb') as fout:
        for i in range(len(f_numbers)):
            fout.write('%s,%s\n' % (f_numbers[i], f_rank_rates[i]))
        data_process_logger.info(
            'result csv: %s/pipelines/test_%s_%s_result_%s.csv' %
            (PROJECT_PATH, model_tag, result_tag, len(f_numbers)))
Example #5
def batch_process_real_data(model_path, file_numbers=[], workspace_root='./', model_tag='real', predict_iter=None):
    """
    进行批量处理实测数据,将结果存在相应文件下的<model_tag>文件夹下
    Args:
        model_tag: 模型tag
        predict_iter:使用的模型迭代次数
        model_path: 预测使用的模型路径 (lightGBM模型)
        workspace_root: csv文件所在的目录
        file_numbers: 处理的文件编号列表
    Returns:
        None
    """
    output_path = os.path.join(workspace_root, '%s_results' % model_tag)
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    predict_mod = pickle.load(open(model_path, 'rb'))
    # if predict_iter:
    #     predict_mod.save_model('tmp_model.txt', num_iteration=predict_iter)
    #     predict_mod = Booster(model_file='tmp_model.txt')
    for file_n in file_numbers:
        fin_path = (workspace_root + '/%s.csv') % file_n
        fout_path = (output_path + '/%s_result.csv') % file_n
        turn_csv_into_result(fin_path, fout_path, predict_mod, predict_iter, True,
                             False)
    data_process_logger.info('Done with %s files' % len(file_numbers))
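A usage sketch for the batch entry point above; the model path, workspace and file numbers are placeholders:

batch_process_real_data('models/lightgbm_demo.model',
                        file_numbers=range(1, 11),
                        workspace_root='datas/real_csvs',
                        model_tag='demo',
                        predict_iter=None)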
Example #6
def test_single_lambdarank_file(fin_path, model_file):
    try:
        # global g_model
        data_analysis_logger.info('testing %s' % fin_path)
        stock_ids, stock_scores, vec_values, stock_rank_labels, query_count = process_single_pickle_data(
            fin_path)
        ylist = model_file.predict(vec_values)
        origin_score_list = stock_scores
        combined_score_list = np.column_stack((ylist, origin_score_list))
        # input_datas = input_datas.tolist()
        # origin_ranked_list = sorted(input_datas, cmp=lambda x, y: 1 if x[1] - y[1] > 0 else -1)
        combined_score_list = combined_score_list.tolist()
        origin_ranked_list = sorted(combined_score_list,
                                    cmp=lambda x, y: 1
                                    if x[1] - y[1] > 0 else -1)  # sort ascending by the original score
        # get each entry's rank index under the original score
        index_ylist = [(i, origin_ranked_list[i][0], origin_ranked_list[i][1])
                       for i in range(len(origin_ranked_list))]
        predict_ranked_index_ylist = sorted(
            index_ylist, cmp=lambda x, y: 1
            if x[1] - y[1] < 0 else -1)  # sort descending by the predicted score, keeping the original rank index
        mean_rank_rate = result_validation(predict_ranked_index_ylist, N=50)

        if mean_rank_rate >= 0.4:
            data_analysis_logger.info('the file path is %s, obs = %s' %
                                      (fin_path, len(stock_scores)))
            # mean_rank_rates.append(mean_rank_rate)
            # file_number_list.append(i)
        return mean_rank_rate
    except Exception, e:
        data_process_logger.info('test file failed: file path=%s, details=%s' %
                                 (fin_path, e))
        return None
Example #7
def train_artistsong2vec_model(fout_path,
                               input_datas=None,
                               data_path=None,
                               min_count=5,
                               sorted_vocab=1,
                               window=10,
                               size=250,
                               iter_n=50):
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    full_data = []
    for i in input_datas:
        tmp = []
        for j in i:
            tmp.append(j[0])
            tmp.append(j[1])
        full_data.append(tmp)
    data_process_logger.info('start training')
    wv_model = gensim.models.Word2Vec(full_data,
                                      min_count=min_count,
                                      sorted_vocab=sorted_vocab,
                                      window=window,
                                      size=size,
                                      iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved'
Example #8
def result_validation(ranked_index_ylist, N=50, threshold=0.35):
    buyer_list = ranked_index_ylist[:N]
    total_error = 0
    origin_rank_list = []
    true_roa = [i[2] for i in buyer_list]
    mean_score = np.mean(true_roa)
    data_process_logger.info('mean_score = %s' % mean_score)
    return mean_score
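The function above simply averages the true ROA (the third field) of the top-N entries of a list already ranked by predicted score, as the toy sketch below illustrates; the tuples are fabricated purely for illustration and numpy plus the logger are assumed to be imported as in the surrounding examples:

toy_ranked = [(3, 0.9, 0.12), (7, 0.8, 0.07), (1, 0.7, -0.03)]  # (origin_rank, predicted, true_roa)
mean_score = result_validation(toy_ranked, N=2)  # mean of 0.12 and 0.07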
Example #9
    def cluster_artist_in_playlist(self,
                                   playlist_id,
                                   cluster_n=5,
                                   is_detailed=False):
        """
        获取单个歌单内的歌手聚类信息
        Args:
            playlist_id: 歌单id
            cluster_n:聚类数
            is_detailed: 是否包含详情信息

        Returns:
            聚类后的列表
        """
        playlist_obj = playlist_detail(playlist_id)
        artist_list = []
        vec_list = []
        ap_cluster = AffinityPropagation()
        data_process_logger.info('clustering playlist: %s' %
                                 playlist_obj['name'])
        for item in playlist_obj['tracks']:
            artist = item['artists'][0]['name'].lower()
            # print artist
            if artist not in artist_list:
                artist_list.append(artist)
                # print self.song2vec_model.vocab.get(artist)
                # print self.song2vec_model.syn0norm == None
                if self.artist2vec_model.vocab.get(artist) and len(
                        self.artist2vec_model.syn0norm):
                    artist_vec = self.artist2vec_model.syn0norm[
                        self.artist2vec_model.vocab[artist].index]
                else:
                    data_process_logger.warn(
                        'The artist %s of playlist-%s is not in dataset' %
                        (artist, playlist_obj['name']))
                    artist_vec = [
                        0 for i in range(self.artist2vec_model.vector_size)
                    ]
                vec_list.append(artist_vec)
        # artist_list = list(artist_list)
        # vec_list = list(vec_list)
        if len(vec_list) > 1:
            cluster_result = ap_cluster.fit(vec_list, artist_list)
            cluster_array = [
                [] for i in range(len(cluster_result.cluster_centers_indices_))
            ]
            for i in range(len(cluster_result.labels_)):
                label = cluster_result.labels_[i]
                index = i
                cluster_array[label].append(artist_list[i])
            return cluster_array, playlist_obj['name'], {}
        else:
            return [artist_list], playlist_obj['name'], {}
Example #10
def infer_missing_datas(fin_csv_path,
                        fout_pickle_path,
                        is_norm=False,
                        is_norm_score=True):
    """
    处理NaN数据,并将处理后的数据分别存储为csv与pickle文件
    Args:
        is_norm: 是否进行标准化
        is_norm_score: 是否对score进行标准化
        fin_csv_path:
        fout_pickle_path:

    Returns:

    """
    with open(fin_csv_path, 'rb') as fin_csv, \
            open(fout_pickle_path, 'wb') as fout_pickle:
        origin_datas = []
        reader = csv.reader(fin_csv)
        # writer = csv.writer(fout_csv)
        count = 1
        n_feature = 4563
        data_process_logger.info('start reading %s' % fin_csv_path)
        for line in reader:
            if len(line) == n_feature:
                single_vec_value = [
                    float(i) if i != 'NaN' else np.nan for i in line
                ]
                # process the 453rd col: remove the future-looking feature.
                single_vec_value = single_vec_value[:453] + single_vec_value[
                    454:]
                origin_datas.append(single_vec_value)
                # data_process_logger.info('handled line %s' % count)

            else:
                data_process_logger.info(
                    'discarding line %s in file %s: it has %s features while the first line has %s'
                    % (count, fin_csv_path, len(line), n_feature))
            count += 1
        # inferring missing data
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(origin_datas)
        transformed_datas = imp.transform(origin_datas)
        if is_norm:
            # standardising datas
            stock_ids = transformed_datas[:, 0]
            stock_scores = transformed_datas[:, 1]
            vec_values = transformed_datas[:, 2:]
            scaled_vec_values = preprocessing.scale(vec_values)
            if is_norm_score:
                stock_scores = preprocessing.scale(stock_scores)
            transformed_datas = (stock_ids.tolist(), stock_scores.tolist(),
                                 scaled_vec_values.tolist())  # stored as a tuple
        # writing transformed datas
        # data_process_logger.info('start writing %s' % fout_csv_path)
        data_process_logger.info('start dumping %s' % fout_pickle_path)
        # transformed_datas = transformed_datas.tolist()  # convert to list for storage
        cPickle.dump(transformed_datas, fout_pickle, protocol=2)
        data_process_logger.info('%s done' % fin_csv_path)
        return transformed_datas
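A sketch of running the imputation step above and loading its pickle back; the paths are placeholders and cPickle is the Python 2 module used throughout these examples:

import cPickle

infer_missing_datas('datas/1.csv', 'pickle_datas/1_trans_norm.pickle',
                    is_norm=True, is_norm_score=True)
with open('pickle_datas/1_trans_norm.pickle', 'rb') as fin:
    stock_ids, stock_scores, scaled_vec_values = cPickle.load(fin)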
Example #11
def test_single_file(fin_path):
    try:
        global g_model
        with open(fin_path, 'rb') as fin_data_file:
            stock_ids, stock_scores, vec_values = cPickle.load(fin_data_file)
            data_process_logger.info('testing file: %s' % fin_path)
            input_datas = np.column_stack((stock_ids, stock_scores, vec_values))
            mean_rank_rate = test_datas(input_datas, g_model)
            if mean_rank_rate >= 0.4:
                data_analysis_logger.info('the file is %s, obs = %s' % (fin_path, len(input_datas)))
            # mean_rank_rates.append(mean_rank_rate)
            # file_number_list.append(i)
            return mean_rank_rate, fin_path
    except Exception, e:
        data_process_logger.info('test file failed: file path=%s, details=%s' % (fin_path, e))
Example #12
def update_userinfo():
    """
    临时更新数据库的脚本
    Returns:

    """
    DAO_inst = CloudMusicDAO('MusicTaster', 'UserInfos')
    uids = DAO_inst.db_inst.distinct('userId')
    count = 0
    for uid in uids:
        userinfo = DAO_inst.db_inst.find_one({'userId': uid})
        userinfo['follow_count'] = len(userinfo['follow_ids'])
        userinfo['fan_count'] = len(userinfo['fan_ids'])
        DAO_inst.save_unique_item(userinfo, primary_key='userId', is_overwrite=True)
        data_process_logger.info('No.%s %s-%s' % (count, userinfo['userId'], userinfo['nickname']))
        count += 1
    print 'done'
Example #13
def parallel_test_quant_data_wrapper(input_file_numbers, model, normalize=True, predict_iteration=None,
                                     process_count=2):
    """
    input:(file_names,model)
    output: mean rank rate
    """
    mean_rank_rates = []
    file_number_list = []
    if predict_iteration:
        model.save_model('tmp_model.txt', num_iteration=predict_iteration)
    else:
        model.save_model('tmp_model.txt')
    global g_model
    g_model = Booster(model_file='tmp_model.txt')
    proc_pool = multiprocessing.Pool(process_count)
    multi_result = []
    for i in input_file_numbers:
        data_root_path = '%s/datas/Quant-Datas-2.0' % (DATA_ROOT)
        if normalize:
            fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        else:
            fin_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
        data_res = proc_pool.apply_async(test_single_file, args=(fin_path,))
        multi_result.append(data_res)
    proc_pool.close()
    proc_pool.join()
    # merge the results
    for i in range(len(multi_result)):
        tmp_mean_rank_rate, file_n = multi_result[i].get()
        mean_rank_rates.append(tmp_mean_rank_rate)
        file_number_list.append(file_n)
    mean_rank_rate = np.mean(mean_rank_rates)
    std_rank_rate = np.std(mean_rank_rates)
    var_rank = np.var(mean_rank_rates)
    data_process_logger.info(
        'Tested %s files, all input files mean rank rate is %s, all input files std is %s, var is %s' % (
            len(input_file_numbers), mean_rank_rate, std_rank_rate, var_rank))
    return file_number_list, mean_rank_rates
Example #14
def parallel_inferring(file_number_list,
                       process_count=12,
                       is_norm=True,
                       is_norm_score=True,
                       data_root_path=None):
    """
    并行化进行数据清理
    Returns:

    """
    data_process_logger.info('Start parallel inferring, process count = %s' %
                             process_count)
    proc_pool = multiprocessing.Pool(process_count)
    # multi_results = []
    for i in file_number_list:
        # data_process_logger.info('loading %s file' % i)
        # csv_path = '%s/datas/%s.csv' % (PROJECT_PATH, i)
        if not data_root_path:
            data_root_path = '%s/datas/Quant-Datas-2.0' % (DATA_ROOT)
        fin_csv_path = '%s/%s.csv' % (data_root_path, i)
        if is_norm:
            # fout_csv_path = '%s/transformed_datas/%s_trans_norm.csv' % (data_root_path, i)
            # fout_pickle_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
            fout_gzip_path = '%s/gzip_datas_norm/%s_trans_norm.gz' % (
                data_root_path, i)
        else:
            # fout_csv_path = '%s/transformed_datas/%s_trans.csv' % (data_root_path, i)
            # fout_pickle_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
            fout_gzip_path = '%s/gzip_datas/%s_trans.gz' % (data_root_path, i)
        data_res = proc_pool.apply_async(infer_missing_datas_to_gzip,
                                         args=(fin_csv_path, fout_gzip_path,
                                               is_norm, is_norm_score))
        # multi_results.append(data_res)
        # datas = load_csv_data(csv_path, normalize=True, is_combine=True)
        # train_datas += datas
    proc_pool.close()
    proc_pool.join()
    data_process_logger.info('Done with %s files' % len(file_number_list))
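A usage sketch for the parallel cleaning wrapper above; the file-number range and data root are placeholders:

parallel_inferring(range(1, 101),
                   process_count=12,
                   is_norm=True,
                   is_norm_score=True,
                   data_root_path='datas/Quant-Datas-2.0')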
Example #15
def train_song2vec_model(fout_path,
                         input_datas=None,
                         data_path=None,
                         min_count=5,
                         sorted_vocab=1,
                         window=10,
                         size=250,
                         iter_n=50):
    """
    训练song2vec模型
    Args:
        fout_path:
        input_datas:
        data_path:
        min_count:
        sorted_vocab:
        window:
        size:
        iter_n:

    Returns:

    """
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    data_process_logger.info('start training')
    random.shuffle(input_datas)
    input_datas = input_datas[:45000]
    wv_model = gensim.models.Word2Vec(input_datas,
                                      min_count=min_count,
                                      sorted_vocab=sorted_vocab,
                                      window=window,
                                      size=size,
                                      iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
        print 'model saved'
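A training sketch, assuming a pickled list of song-name sequences such as the songs_seq_*.dat file produced by prepare_song_dict in Example #18; the paths and tag are placeholders:

train_song2vec_model('datas/song2vec_demo.model',
                     data_path='../datas/songs_seq_demo.dat',
                     min_count=5, window=10, size=250, iter_n=50)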
Example #16
def fill_song_comments():
    """
    填充歌曲的评论详情
    Returns:

    """
    dao_inst = CloudMusicDAO('MusicTaster', 'SongInfos')
    find_result = dao_inst.db_inst.find({'commentInfo': {'$exists': False}})
    count = 0
    for song_item in find_result:
        comm_data = song_comments(song_item['commentThreadId'], limit=10)
        if comm_data:  # make sure the comment details were fetched correctly
            del comm_data['code']
            # del comm_data['userId']
            song_item['commentInfo'] = comm_data
            song_item['commentCount'] = comm_data['total']
        dao_inst.db_inst.save(song_item)
        data_process_logger.info(
            'No.%s %s, comments: %s done' % (count, song_item['name'], song_item['commentCount']))
        count += 1
        slp = random.random() * 2 + 1
        data_process_logger.info('sleep %s sec' % slp)
        time.sleep(slp)
Example #17
def prepare_artist_dict(tag=''):
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    # print playlist_dao_inst.db_inst.find(
    #     {'trackCount': {'$gte': 10, '$lte': 600}, 'playCount': {'$gte': 10}},
    #     {'name': 1}).limit(100000).count()
    find_result = playlist_dao_inst.db_inst.find(
        {
            'trackCount': {
                '$gte': 10,
                '$lte': 600
            },
            'playCount': {
                '$gte': 5
            }
        }, {
            'tracks': 1,
            'name': 1
        }).limit(100000)
    # combine the artist names in each playlist into an artist-name sequence
    total_artists_set = []
    count = 0
    for item in find_result:
        data_process_logger.info('No.%s %s' % (count, item['name']))
        # save the playlist's artist sequence
        artists_seq = []
        for song in item['tracks']:
            sname = song['artists'][0]['name']
            artists_seq.append(sname.lower())
        total_artists_set.append(artists_seq)
        count += 1
    data_process_logger.info('start building dictionary')
    artist_dictionary = corpora.Dictionary(total_artists_set)
    print u'playlist count', artist_dictionary.num_docs
    try:
        print u'artist count', len(artist_dictionary.token2id)
    except Exception, e:
        print 'error = %s' % e
Example #18
def prepare_song_dict(tag=''):
    """
    从数据库中遍历歌单,准备song2vec的训练数据
    Args:
        tag: 备注tag信息

    Returns:

    """
    playlist_dao_inst = CloudMusicDAO('MusicTaster', 'Playlists')
    print playlist_dao_inst.db_inst.find(
        {
            'trackCount': {
                '$gte': 3,
                '$lte': 1000
            },
            'playCount': {
                '$gte': 1
            }
        }, {
            'tracks': 1,
            'name': 1
        }).limit(100000).count()
    find_result = playlist_dao_inst.db_inst.find(
        {
            'trackCount': {
                '$gte': 3,
                '$lte': 1000
            },
            'playCount': {
                '$gte': 1
            }
        }, {
            'tracks': 1,
            'name': 1
        }).limit(100000)
    # combine the song names in each playlist into a song-name sequence
    total_song_set = []
    count = 0
    for item in find_result:
        data_process_logger.info('No.%s %s' % (count, item['name']))
        # save the playlist's song sequence
        song_seq = []
        for song in item['tracks']:
            sname = song['name']
            song_seq.append(sname.lower())
        total_song_set.append(song_seq)
        count += 1
    data_process_logger.info('start building dictionary')
    song_dictionary = corpora.Dictionary(total_song_set)
    print u'playlist count', song_dictionary.num_docs
    print u'song count', song_dictionary.num_pos
    data_process_logger.info('start saving datas')
    song_dictionary.save('../datas/song_dictionary_%s.dict' % tag)
    pickle.dump(total_song_set, open('../datas/songs_seq_%s.dat' % tag, 'wb'))
    return song_dictionary
Example #19
def process_single_pickle_data(pickle_file_path, query_label=1):
    """
    处理单个pickle后的data文件,进行排序、query data的生成等
    Returns: tuple of ranked-features and query data

    """
    def rank_value(rank, max_rank):
        """
        获取排名权重
        Returns:

        """
        rank_steps = [0.01, 0.05, 0.1, 0.15, 0.3, 0.4, 0.5, 0.7, 0.9, 1]
        rank_weights = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
        basic_rank = float(rank) / max_rank  # fractional position in the ranking
        for rank_index in range(len(rank_steps)):
            if basic_rank <= rank_steps[rank_index]:
                return rank_weights[rank_index]
        return 0

    with open(pickle_file_path, 'rb') as fin:
        data_process_logger.info('processing file: %s' % pickle_file_path)
        # stock_ids, stock_scores, vec_values = pickle.load(fin)
        pickle_obj = pickle.load(fin)
        combined_obj = [(pickle_obj[0][i], pickle_obj[1][i], pickle_obj[2][i])
                        for i in range(len(pickle_obj[0]))]
        ranked_datas = sorted(combined_obj,
                              cmp=lambda x, y: 1 if x[1] - y[1] > 0 else -1)
        stock_ids = [a[0] for a in ranked_datas]
        stock_scores = [a[1] for a in ranked_datas]
        stock_rank_labels = [
            rank_value(a, len(ranked_datas)) for a in range(len(ranked_datas))
        ]  # label for rank
        vec_values = [a[2] for a in ranked_datas]
        return stock_ids, stock_scores, vec_values, stock_rank_labels, len(
            ranked_datas)
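A minimal sketch of feeding the ranked labels and per-file query counts returned above into a LambdaRank training set; it assumes the lightgbm package is installed, and the file list and parameters are placeholders:

import lightgbm as lgb
import numpy as np

vec_list, label_list, query_datas = [], [], []
for path in ['pickle_datas/1_trans_norm.pickle']:  # placeholder file list
    _, _, vec_values, stock_rank_labels, query_count = process_single_pickle_data(path)
    vec_list.extend(vec_values)
    label_list.extend(stock_rank_labels)
    query_datas.append(query_count)  # one query group per input file

train_set = lgb.Dataset(np.array(vec_list), label=label_list, group=query_datas)
booster = lgb.train({'objective': 'lambdarank', 'num_leaves': 127}, train_set)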
Example #20
def pipeline_test_lambdarank_wrapper(input_file_numbers,
                                     model,
                                     normalize=True,
                                     predict_iteration=None):
    """
    进行结果测试
    Args:
        input_file_numbers:
        model:
        normalize:
        predict_iteration:

    Returns:

    """
    mean_rank_rates = []
    file_number_list = []
    if predict_iteration:
        model.save_model('tmp_lambdarank_model.txt',
                         num_iteration=predict_iteration)
        model = Booster(model_file='tmp_lambdarank_model.txt')
    for i in input_file_numbers:
        data_root_path = '%s/datas/Quant_Datas_v3.0' % (DATA_ROOT)
        if normalize:
            fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (
                data_root_path, i)
        else:
            fin_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
        try:
            mean_rank_rate = test_single_lambdarank_file(fin_path, model)
            if mean_rank_rate:
                mean_rank_rates.append(mean_rank_rate)
                file_number_list.append(i)
        except Exception, e:
            data_process_logger.info(
                'test file failed: file path=%s, details=%s' % (fin_path, e))
Example #21
def prepare_gbdt_datas(file_number_list, DATA_ROOT, process_count=2):
    proc_pool = multiprocessing.Pool(process_count)
    multi_results = []
    for i in file_number_list:
        # data_process_logger.info('loading %s file' % i)
        # csv_path = '%s/datas/Quant-Datas/pickle_datas/%s.csv' % (PROJECT_PATH, i)
        data_root_path = '%s/datas/Quant_Datas_v4.0' % (DATA_ROOT)
        pickle_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        data_process_logger.info('add file: %s' % pickle_path)
        data_res = proc_pool.apply_async(load_pickle_datas, args=(pickle_path,))
        multi_results.append(data_res)
        # datas = load_csv_data(csv_path, normalize=True, is_combine=True)
        # train_datas += datas
    proc_pool.close()
    proc_pool.join()
    # fetch datas from pool
    stock_ids, stock_scores, vec_values = multi_results[0].get()
    # train_datas = tmp_data
    label_list = stock_scores
    vec_list = vec_values
    data_process_logger.info('combining datas...')
    for i in xrange(1, len(multi_results)):
        data_process_logger.info('combining No.%s data' % i)
        try:
            stock_ids, stock_scores, vec_values = multi_results[i].get()
            # train_datas = np.row_stack((train_datas, datas)) # np.2darray
            # train_datas = np.vstack((train_datas, datas))
            # train_datas.extend(datas)
            # label_list.extend(stock_scores)
            for index in range(len(vec_values)):
                vec = vec_values[index]
                label = stock_scores[index]
                if len(vec) == len(vec_list[-1]):
                    vec_list.append(vec)
                    label_list.append(label)
                else:
                    print 'not equaling n_feature: %s' % len(vec)
        except Exception, e:
            data_process_logger.error('No.%s data failed, details=%s' % (i, str(e.message)))
            continue
Example #22
def train_regression_age_model(input_xlist, input_ylist, model_label):
    """
    train age regression model, with grid search
    Args:
        input_xlist:
        input_ylist:
        model_label:

    Returns:

    """
    from sklearn import grid_search, svm
    from sklearn.ensemble import GradientBoostingRegressor
    data_process_logger.info('loading model')
    input_xlist = np.float64(input_xlist)
    # SVR
    data_process_logger.info('training svr')
    clf = svm.SVR()
    parameters = {'C': [1e3, 5e3, 1e2, 1e1, 1e-1], 'kernel': ['rbf', 'sigmoid'],
                  'gamma': [0.0001, 0.001, 0.01, 0.1, 0.05]}
    svr_mod = grid_search.GridSearchCV(clf, parameters, n_jobs=12, scoring='mean_absolute_error')
    svr_mod.fit(input_xlist, input_ylist)
    print svr_mod.best_estimator_
    fout = open('%s/models/svr_%s.model' % (PROJECT_PATH, model_label), 'wb')
    cPickle.dump(svr_mod, fout)
    for item in svr_mod.grid_scores_:
        print item
    # GBRT
    data_process_logger.info('training gbrt')
    gbrt_mod = GradientBoostingRegressor()
    gbrt_parameters = {'n_estimators': [300, 350], 'max_depth': [2, 3, 4],
                       'max_leaf_nodes': [10, 20], 'loss': ['huber', 'lad'], 'subsample': [0.2, 0.5, 0.7]}
    gbrt_mod = grid_search.GridSearchCV(gbrt_mod, gbrt_parameters, n_jobs=12, scoring='mean_absolute_error')
    gbrt_mod.fit(input_xlist, input_ylist)
    gbrt_out = open('%s/models/gbrt_%s.model' % (PROJECT_PATH, model_label), 'wb')
    cPickle.dump(gbrt_mod, gbrt_out)
    print gbrt_mod.best_estimator_
    for item in gbrt_mod.grid_scores_:
        print item
Example #23
def prepare_datas(file_number_list, DATA_ROOT, process_count=2):
    """
    准备训练数据
    Args:
        process_count:
        DATA_ROOT:
        file_number_list:

    Returns:

    """
    # train_file_number_list = range(1, 300)
    # load with multi-processor
    proc_pool = multiprocessing.Pool(process_count)
    multi_results = []
    for i in file_number_list:
        # data_process_logger.info('loading %s file' % i)
        # csv_path = '%s/datas/Quant-Datas/pickle_datas/%s.csv' % (PROJECT_PATH, i)
        data_root_path = '%s/datas/Quant_Datas_v3.0' % (DATA_ROOT)
        pickle_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        data_process_logger.info('add file: %s' % pickle_path)
        data_res = proc_pool.apply_async(process_single_pickle_data, args=(pickle_path, i))
        multi_results.append(data_res)
        # datas = load_csv_data(csv_path, normalize=True, is_combine=True)
        # train_datas += datas
    proc_pool.close()
    proc_pool.join()
    # fetch datas from pool
    # stock_ids, stock_scores, vec_values, stock_rank_labels, query_count = multi_results[0].get()
    # train_datas = tmp_data
    # label_list = stock_rank_labels
    # vec_list = vec_values
    # query_datas = [query_count]
    label_list = []
    vec_list = []
    query_datas = []
    data_process_logger.info('combining datas...')
    for i in xrange(0, len(multi_results)):
        data_process_logger.info('combining No.%s data' % i)
        try:
            stock_ids, stock_scores, vec_values, stock_rank_labels, query_count = multi_results[i].get()
            # train_datas = np.row_stack((train_datas, datas)) # np.2darray
            # train_datas = np.vstack((train_datas, datas))
            # train_datas.extend(datas)
            # label_list.extend(stock_scores)
            tmp_vec_list = []
            tmp_label_list = []
            for index in range(len(vec_values)):
                vec = vec_values[index]
                label = stock_rank_labels[index]
                # if len(vec) == len(vec_list[-1]):
                if len(vec) >= 4561:
                    tmp_vec_list.append(vec[:4561])
                    tmp_label_list.append(label)
                else:
                    raise IndexError('not equaling n_feature: %s' % len(vec))
                    # query_count -= 1
            if query_count == len(tmp_vec_list):
                vec_list.extend(tmp_vec_list)
                label_list.extend(tmp_label_list)
                query_datas.append(query_count)
            else:
                raise IndexError('query count %s not equal to len(vec_list): %s' % (query_count, len(vec_list)))
        except Exception, e:
            data_process_logger.error('No.%s data failed, details=%s' % (i, str(e.message)))
            continue
Example #24
import os
import pickle
import sys

from flask import Flask
abs_path = os.path.dirname(os.path.abspath(__file__))
abs_father_path = os.path.dirname(abs_path)
PROJECT_PATH = abs_father_path
print 'Used file: %s\nProject path=%s' % (__file__, PROJECT_PATH)
sys.path.append(PROJECT_PATH)
# add flask path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from pipelines.process_real_datas import turn_csv_into_result
from utils.lightgbm_operator import LightgbmOperator
from utils.logger_utils import data_process_logger

app = Flask(__name__)

data_process_logger.info('initing lightGBM operator')

# init
oldbest_mod = pickle.load(
    open(
        '%s/models/best_models/lightgbm_New_Quant_Data_rebalanced_norm_gbdt_7leaves_iter30000_best.model'
        % PROJECT_PATH))
oldbest_mod.save_model('flask_model.txt', num_iteration=27000)
oldbest_operator = LightgbmOperator(
    'flask_model.txt',
    'New_Quant_Data_rebalanced_norm_gbdt_7leaves_iter30000_best')

full_mod = pickle.load(
    open('%s/models/best_models/lightgbm_Full_gbdt_15leaves.model' %
         PROJECT_PATH))
full_mod.save_model('flask_model.txt', num_iteration=50000)
Example #25
import os
import sys

from flask import Flask, render_template

abs_path = os.path.dirname(os.path.abspath(__file__))
abs_father_path = os.path.dirname(abs_path)
PROJECT_PATH = abs_father_path
print 'Used file: %s\nProject path=%s' % (__file__, PROJECT_PATH)
sys.path.append(PROJECT_PATH)
# add flask path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from song2vec.song2vec_operator import Song2VecOperator
from utils.logger_utils import data_process_logger

app = Flask(__name__)

data_process_logger.info('initing song2vec operator')
s2v_operator = Song2VecOperator(
    song2vec_model_path='%s/datas/full_50d_20iter_10win_5min_song2vec.model' %
    PROJECT_PATH,
    artist2vec_model_path='%s/datas/full_50d_20iter_10win_5min_artist2vec.model'
    % PROJECT_PATH)
data_process_logger.info('complete init song2vec')


@app.route('/musictaster')
def hello_world():
    return render_template("demo.html")


@app.route('/musictaster/similar/song', methods=['POST'])
@app.route('/musictaster/similar/song/<song_name>', methods=['GET'])
Example #26
def turn_csv_into_result(origin_csv_path, output_csv_path, predict_model, predict_iteration, is_norm=True,
                         is_norm_score=True):
    """
    把原始的feature csv转为排名后的结果csv
    Args:
        predict_model:
        origin_csv_path:
        output_csv_path:
        predict_iteration: 预测用的
        is_norm: 是否进行标准化
        is_norm_score: 是否对分数进行标准化

    Returns:

    """
    data_process_logger.info('handling %s' % origin_csv_path)
    with open(origin_csv_path, 'rb') as fin_csv, open(output_csv_path, 'wb') as fout_csv:
        reader = csv.reader(fin_csv)
        writer = csv.writer(fout_csv)
        # count = 0
        origin_datas = []
        data_process_logger.info('start reading %s' % origin_csv_path)
        # first, impute missing values and standardize
        count = 1
        n_feature = 4563
        for line in reader:
            if len(line) == n_feature:
                single_vec_value = [float(i) if i != 'NaN' else np.nan for i in line]
                # process the 453rd col: remove the future-looking feature.
                single_vec_value = single_vec_value[:453] + single_vec_value[454:]
                origin_datas.append(single_vec_value)
                # data_process_logger.info('handled line %s' % count)

            else:
                data_process_logger.info(
                    'discarding line %s in file %s: it has %s features while the first line has %s' % (
                        count, origin_csv_path, len(line), n_feature))
            count += 1
        # inferring missing data
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(origin_datas)
        transformed_datas = imp.transform(origin_datas)
        if is_norm:
            # standardising datas
            stock_ids = transformed_datas[:, 0]
            stock_scores = transformed_datas[:, 1]
            vec_values = transformed_datas[:, 2:]
            scaled_vec_values = preprocessing.scale(vec_values)
            if is_norm_score:
                stock_scores = preprocessing.scale(stock_scores)
            transformed_datas = np.column_stack((stock_ids, stock_scores, scaled_vec_values))
        # run the prediction
        xlist = [a[2:] for a in transformed_datas]  # vec values
        origin_score_list = [a[1] for a in transformed_datas]
        stock_ids = [a[0] for a in transformed_datas]
        score_list = predict_model.predict(xlist, num_iteration=predict_iteration)
        line_numbers = range(1, len(xlist) + 1)
        # sort the prediction results and write them to csv
        result = np.column_stack((line_numbers, stock_ids, score_list, origin_score_list))
        sorted_result = sorted(result, cmp=lambda x, y: 1 if x[2] - y[2] > 0 else -1)
        writer.writerow(['origin_line', 'stock_id', 'predict_score', 'origin_score'])
        for row in sorted_result:
            writer.writerow([int(row[0]), str(row[1]), row[2], row[3]])
        # writing transformed datas
        data_process_logger.info('complete writing %s' % output_csv_path)
        return sorted_result
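A usage sketch for the conversion above, assuming a pickled LightGBM model like the ones loaded elsewhere in these examples; the paths are placeholders:

import pickle

predict_mod = pickle.load(open('models/lightgbm_demo.model', 'rb'))
sorted_result = turn_csv_into_result('datas/1.csv', 'results/1_result.csv',
                                     predict_mod, predict_iteration=None,
                                     is_norm=True, is_norm_score=False)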
Example #27
            with open(fin_path, 'rb') as fin_data_file:
                stock_ids, stock_scores, vec_values = cPickle.load(fin_data_file)
                data_process_logger.info('testing file: %s' % fin_path)
                input_datas = np.column_stack((stock_ids, stock_scores, vec_values))
                mean_rank_rate = test_datas(input_datas, model)
                if mean_rank_rate >= 0.4:
                    data_analysis_logger.info('the file number is %s, obs = %s' % (i, len(input_datas)))
                mean_rank_rates.append(mean_rank_rate)
                file_number_list.append(i)
        except Exception, e:
            data_process_logger.info('test file failed: file path=%s, details=%s' % (fin_path, e))
    mean_rank_rate = np.mean(mean_rank_rates)
    std_rank_rate = np.std(mean_rank_rates)
    var_rank = np.var(mean_rank_rates)
    data_process_logger.info(
        'Tested %s files, all input files mean rank rate is %s, all input files std is %s, var is %s' % (
            len(input_file_numbers), mean_rank_rate, std_rank_rate, var_rank))
    return file_number_list, mean_rank_rates


# ------ For parallel process -----
def test_single_file(fin_path):
    try:
        global g_model
        with open(fin_path, 'rb') as fin_data_file:
            stock_ids, stock_scores, vec_values = cPickle.load(fin_data_file)
            data_process_logger.info('testing file: %s' % fin_path)
            input_datas = np.column_stack((stock_ids, stock_scores, vec_values))
            mean_rank_rate = test_datas(input_datas, g_model)
            if mean_rank_rate >= 0.4:
                data_analysis_logger.info('the file is %s, obs = %s' % (fin_path, len(input_datas)))
Example #28
def infer_missing_datas_to_gzip(fin_csv_path,
                                fout_gzip_path,
                                is_norm=True,
                                is_norm_score=True):
    """
    处理NaN数据,并将处理后的数据分别存储为csv与pickle文件
    Args:
        is_norm: 是否进行标准化
        is_norm_score: 是否对score进行标准化
        fin_csv_path:
        fout_gzip_path:

    Returns:

    """
    with open(fin_csv_path, 'rb') as fin_csv, \
            gzip.open(fout_gzip_path, 'wb') as fout_gzip:
        origin_datas = []
        reader = csv.reader(fin_csv)
        # writer = csv.writer(fout_csv)
        count = 1
        n_feature = 4563
        data_process_logger.info('start reading %s' % fin_csv_path)
        for line in reader:
            if len(line) == n_feature:
                single_vec_value = [
                    float(i) if i != 'NaN' else np.nan for i in line
                ]
                # process the 453rd col: remove the future-looking feature.
                # single_vec_value = single_vec_value[:453] + single_vec_value[454:]
                origin_datas.append(single_vec_value)
                # data_process_logger.info('handled line %s' % count)

            else:
                data_process_logger.info(
                    'discarding line %s in file %s: it has %s features while the first line has %s'
                    % (count, fin_csv_path, len(line), n_feature))
            count += 1
        # inferring missing data
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(origin_datas)
        transformed_datas = imp.transform(origin_datas)
        stock_ids = []
        stock_scores = []
        scaled_vec_values = []
        stock_ids = transformed_datas[:, 0]
        stock_scores = transformed_datas[:, 1]
        scaled_vec_values = transformed_datas[:, 2:]
        if is_norm:
            # standardising datas
            scaled_vec_values = preprocessing.scale(scaled_vec_values)
            if is_norm_score:
                stock_scores = preprocessing.scale(stock_scores)
        # transformed_datas = (stock_ids.tolist(), stock_scores.tolist(), scaled_vec_values.tolist())  # stored as a tuple
        # writing transformed datas
        # data_process_logger.info('start writing %s' % fout_csv_path)
        data_process_logger.info('start saving %s' % fout_gzip_path)
        # transformed_datas = transformed_datas.tolist()  # convert to list for storage
        # cPickle.dump(transformed_datas, fout_gzip, protocol=2)
        for line_index in xrange(len(stock_ids)):
            stock_id = stock_ids[line_index]
            stock_score = stock_scores[line_index]
            scaled_vec_value = scaled_vec_values[line_index]
            tmp_vec = [int(stock_id)] + [float(stock_score)
                                         ] + scaled_vec_value.tolist()
            tmp_vec_str = [str(a) for a in tmp_vec]
            try:
                # tmp_list_vec = tmp_vec.tolist()
                tmp_line = ','.join(tmp_vec_str)
                fout_gzip.write(tmp_line + '\n')
                if line_index % 100 == 0:
                    print 'line %s join success' % line_index
            except Exception, e:
                print 'line %s join failed, details=%s' % (line_index, e)
        data_process_logger.info('%s done' % fin_csv_path)
        return transformed_datas
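A sketch of reading the gzipped csv written above back into ids, scores and feature vectors; the path is a placeholder, and under Python 2 gzip yields plain str lines:

import gzip

with gzip.open('gzip_datas_norm/1_trans_norm.gz', 'rb') as fin:
    rows = [line.strip().split(',') for line in fin]
stock_ids = [int(r[0]) for r in rows]
stock_scores = [float(r[1]) for r in rows]
vec_values = [[float(v) for v in r[2:]] for r in rows]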
Example #29
                data_root_path, i)
        else:
            fin_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
        try:
            mean_rank_rate = test_single_lambdarank_file(fin_path, model)
            if mean_rank_rate:
                mean_rank_rates.append(mean_rank_rate)
                file_number_list.append(i)
        except Exception, e:
            data_process_logger.info(
                'test file failed: file path=%s, details=%s' % (fin_path, e))
    mean_rank_rate = np.mean(mean_rank_rates)
    std_rank_rate = np.std(mean_rank_rates)
    var_rank = np.var(mean_rank_rates)
    data_process_logger.info(
        'Tested %s files, all input files mean rank rate is %s, all input files std is %s, var is %s'
        % (len(input_file_numbers), mean_rank_rate, std_rank_rate, var_rank))
    return file_number_list, mean_rank_rates


def test_single_lambdarank_file(fin_path, model_file):
    try:
        # global g_model
        data_analysis_logger.info('testing %s' % fin_path)
        stock_ids, stock_scores, vec_values, stock_rank_labels, query_count = process_single_pickle_data(
            fin_path)
        ylist = model_file.predict(vec_values)
        origin_score_list = stock_scores
        combined_score_list = np.column_stack((ylist, origin_score_list))
        # input_datas = input_datas.tolist()
        # origin_ranked_list = sorted(input_datas, cmp=lambda x, y: 1 if x[1] - y[1] > 0 else -1)
Example #30
def load_pickle_datas(tmp_pickle_path):
    with open(tmp_pickle_path, 'rb') as fin:
        data_process_logger.info('processing %s' % tmp_pickle_path)
        pickle_data = cPickle.load(fin)
        return pickle_data