Example #1
0
        # Smoothed CTR: (clicks + alpha) / (impressions + alpha + beta).
        ctr.append((C[i] + alpha_item) / (I[i] + alpha_item + beta_item))
    items['clicked_ratio'] = ctr
    # Raw counts are no longer needed once the smoothed ratio exists.
    items.drop(['exposure_num', 'clicked_num'], axis=1, inplace=True)

    # Per-photo exposure count derived from the user-item interaction log.
    photo_data = pd.DataFrame()
    photo_data['photo_id'] = user_item_data['photo_id']
    photo_data['exposure_num'] = user_item_data['photo_id'].groupby(
        user_item_data['photo_id']).transform('count')
    photo_data.drop_duplicates(inplace=True)

    photo_data = pd.merge(photo_data, items, how='left', on=['photo_id'])

    # Photos with no click history fall back to the smoothing prior mean.
    photo_data.clicked_ratio.fillna(alpha_item / (alpha_item + beta_item),
                                    inplace=True)

    photo_data = pd.merge(photo_data, face_data, how="left", on=['photo_id'])

    photo_data = pd.merge(photo_data, text_data, how="left", on=['photo_id'])

    photo_data.fillna(0, inplace=True)
    # Boolean flag: photo has at least one detected face.
    photo_data['have_face_cate'] = photo_data['face_num'].apply(
        lambda x: x >= 1)
    #     photo_data.drop(['face_num'], axis=1, inplace=True)

    print(photo_data.info())

    # File name gains a '_sample' suffix when running on the sample subset.
    PHOTO_FEATURE_FILE = 'photo_feature'
    PHOTO_FEATURE_FILE = PHOTO_FEATURE_FILE + '_sample' + '.' + fmt if USE_SAMPLE else PHOTO_FEATURE_FILE + '.' + fmt

    store_data(photo_data, os.path.join(feature_store_path,
                                        PHOTO_FEATURE_FILE), fmt)
Example #2
0
    print(len(count))
    # top-k keywords: keep only the word (drop the count) for membership tests.
    top_key_words = count.most_common(2000)
    top_key_words = [i[0] for i in top_key_words]


    def key_words_num(words):
        """Count how many entries of *words* appear in the global top_key_words."""
        return sum(1 for word in words if word in top_key_words)


    text_data['key_words_num'] = text_data['cover_words'].apply(key_words_num)
    from sklearn.feature_extraction.text import TfidfVectorizer

    # TF-IDF over cover words; terms appearing in >70% of documents are dropped.
    vectorizer = TfidfVectorizer(max_df=0.7)
    corpus = text_data['cover_words'].apply(lambda words: ' '.join(words))
    tfidf = vectorizer.fit_transform(corpus)
    # Row-wise mean TF-IDF as a single scalar feature per photo.
    # NOTE(review): tfidf is a scipy sparse matrix, so np.mean(..., axis=1)
    # returns an (n, 1) matrix rather than a flat array — confirm the column
    # assignment below produces the intended dtype.
    avg_tfidf = np.mean(tfidf, axis=1)
    text_data['avg_tfidf'] = avg_tfidf

    text_data.drop(['cover_words'], axis=1, inplace=True)
    # File name gains a '_sample' suffix when running on the sample subset.
    TEXT_FEATURE_FILE = 'text_feature'
    TEXT_FEATURE_FILE = TEXT_FEATURE_FILE + '_sample' + '.' + fmt if USE_SAMPLE else TEXT_FEATURE_FILE + '.' + fmt
    feature_store_path = '../sample/features' if USE_SAMPLE else '../data/features'
    if not os.path.exists(feature_store_path):
        os.mkdir(feature_store_path)
    store_data(text_data, os.path.join(feature_store_path, TEXT_FEATURE_FILE), fmt)
    ]
    # Downcast each column group to its smallest sufficient dtype so the
    # persisted ensemble feature matrix stays compact in memory and on disk.
    ensemble_train[uint64_cols] = ensemble_train[uint64_cols].astype('uint64')
    ensemble_train[uint32_cols] = ensemble_train[uint32_cols].astype('uint32')
    ensemble_train[uint16_cols] = ensemble_train[uint16_cols].astype('uint16')
    ensemble_train[uint8_cols] = ensemble_train[uint8_cols].astype('uint8')
    ensemble_train[bool_cols] = ensemble_train[bool_cols].astype('bool')
    ensemble_train[float64_cols] = ensemble_train[float64_cols].astype(
        'float32')
    ensemble_train[y_label] = ensemble_train[y_label].astype('bool')
    print(ensemble_train.info())

    # Apply the same dtype downcasting to the test-side feature matrix
    # (no y_label column on the test side).
    ensemble_test = user_item_test[input_features]
    ensemble_test[uint64_cols] = ensemble_test[uint64_cols].astype('uint64')
    ensemble_test[uint32_cols] = ensemble_test[uint32_cols].astype('uint32')
    ensemble_test[uint16_cols] = ensemble_test[uint16_cols].astype('uint16')
    ensemble_test[uint8_cols] = ensemble_test[uint8_cols].astype('uint8')
    ensemble_test[bool_cols] = ensemble_test[bool_cols].astype('bool')
    ensemble_test[float64_cols] = ensemble_test[float64_cols].astype('float32')
    print(ensemble_test.info())

    # File names gain a '_sample' suffix when running on the sample subset.
    ALL_FEATURE_TRAIN_FILE = 'ensemble_feature_train'
    ALL_FEATURE_TRAIN_FILE = ALL_FEATURE_TRAIN_FILE + '_sample' + '.' + fmt if USE_SAMPLE else ALL_FEATURE_TRAIN_FILE + '.' + fmt

    ALL_FEATURE_TEST_FILE = 'ensemble_feature_test'
    ALL_FEATURE_TEST_FILE = ALL_FEATURE_TEST_FILE + '_sample' + '.' + fmt if USE_SAMPLE else ALL_FEATURE_TEST_FILE + '.' + fmt

    store_data(ensemble_train,
               os.path.join(feature_store_path, ALL_FEATURE_TRAIN_FILE), fmt)
    store_data(ensemble_test,
               os.path.join(feature_store_path, ALL_FEATURE_TEST_FILE), fmt)
        # Fit a Beta(alpha, beta) prior for per-item CTR via Bayesian
        # smoothing (I = impressions, C = clicks), starting from Beta(1, 1).
        bs = BayesianSmoothing(1, 1)
        bs.update(I, C, 10000, 0.0000000001)
        print(bs.alpha, bs.beta)
        alpha_item, beta_item = bs.alpha, bs.beta
    ctr = []
    for i in range(len(I)):
        # Smoothed CTR: (clicks + alpha) / (impressions + alpha + beta).
        ctr.append((C[i] + alpha_item) / (I[i] + alpha_item + beta_item))
    items['clicked_ratio'] = ctr
    items.drop(['exposure_num', 'clicked_num'], axis=1, inplace=True)
    # Attach the smoothed per-photo CTR to every (user, photo) training row.
    clicked_ratio_col_train = pd.merge(photo_train[['user_id', 'photo_id']],
                                       items[['photo_id', 'clicked_ratio']],
                                       how='left',
                                       on=['photo_id'])
    print(clicked_ratio_col_train.head())
    store_data(clicked_ratio_col_train,
               os.path.join(col_feature_store_path, 'clicked_ratio_train.csv'),
               fmt)

    #     Gaussian noise (disabled experiment): derive test-side clicked_ratio
    #     from the prior mean plus exposure-scaled noise.
    #     sigma = items.clicked_ratio.std()
    #     u = alpha_item/(alpha_item+beta_item)
    #     items1 = photo_test[['photo_id', 'exposure_num']]
    #     print(photo_test.head())
    #     items1 = items1.drop_duplicates(['photo_id'])
    #     items1['exposure_num_sigma'] = items1['exposure_num'].apply(lambda x: np.exp(-x) * sigma)
    #     print(items1.head(20))
    #     items1['noise'] = items1['exposure_num_sigma'].apply(lambda x: normal(0, x))
    #     print(items1.head(20))
    #     items1['clicked_ratio'] = u + items1['noise']
    #     print(items1.head(20))
    #     items1.loc[items1['clicked_ratio']<0, ['clicked_ratio']] = 0
Example #5
0
 def feature_saver(args):
     """Unpack an (df, path, fmt) tuple and persist df via store_data."""
     frame, target_path, file_fmt = args
     return store_data(frame, target_path, file_fmt)
Example #6
0
                              fmt=fmt,
                              data_type='test',
                              pool_type=pool_type,
                              num_workers=n)

    # Merge the per-worker partial results into full train/test frames.
    user_item_train = fm_trainer.merge()
    print(user_item_train.info())
    user_item_test = fm_tester.merge()
    print(user_item_test.info())

    # Per (user_id, time_cate) mean click rate, broadcast back onto each row.
    user_item_train['hour_click_ratio'] = user_item_train.set_index(
        ['user_id', 'time_cate']).groupby(
            level=['user_id', 'time_cate'])['click'].transform('mean').values

    # Look up the train-side ratio for test rows by (user_id, time_cate).
    user_item_test = pd.merge(
        user_item_test,
        user_item_train[['user_id', 'time_cate',
                         'hour_click_ratio']].drop_duplicates(),
        how='left',
        on=['user_id', 'time_cate'])

    # (user, time_cate) pairs unseen in training default to a ratio of 0.
    user_item_test['hour_click_ratio'].fillna(0, inplace=True)
    store_data(
        user_item_train[['user_id', 'photo_id', 'hour_click_ratio']],
        os.path.join(col_feature_store_path, 'hour_click_ratio_train.csv'),
        fmt)

    store_data(
        user_item_test[['user_id', 'photo_id', 'hour_click_ratio']],
        os.path.join(col_feature_store_path, 'hour_click_ratio_test.csv'), fmt)
    fmt = args.format if args.format else 'csv'

    # Tab-separated face files: photo_id -> serialized face attribute blob.
    TRAIN_FACE = '../sample/train_face.txt' if USE_SAMPLE else '../data/train_face.txt'
    TEST_FACE = '../sample/test_face.txt' if USE_SAMPLE else '../data/test_face.txt'
    face_train = pd.read_csv(TRAIN_FACE,
                             sep='\t',
                             header=None,
                             names=['photo_id', 'faces'])

    print(face_train.info())

    face_test = pd.read_csv(TEST_FACE,
                            sep='\t',
                            header=None,
                            names=['photo_id', 'faces'])

    print(face_test.info())

    # Train and test photos share one derived face-feature table; the raw
    # 'faces' blob is dropped once add_face_feature has expanded it.
    face_data = pd.concat([face_train, face_test])
    face_data = add_face_feature(face_data)
    face_data.drop(['faces'], axis=1, inplace=True)
    face_data.fillna(0, inplace=True)

    # File name gains a '_sample' suffix when running on the sample subset.
    FACE_FEATURE_FILE = 'face_feature'
    FACE_FEATURE_FILE = FACE_FEATURE_FILE + '_sample' + '.' + fmt if USE_SAMPLE else FACE_FEATURE_FILE + '.' + fmt
    feature_store_path = '../sample/features' if USE_SAMPLE else '../data/features'
    if not os.path.exists(feature_store_path):
        os.mkdir(feature_store_path)
    store_data(face_data, os.path.join(feature_store_path, FACE_FEATURE_FILE),
               fmt)
Example #8
0
    favors['non_face_click_favor'] = 1 - favors['face_click_favor']

    # Mean cover-text length per user as a text-length preference feature.
    favors['cover_length_favor'] = favors['cover_length'].groupby(
        favors['user_id']).transform('mean')

    # Collapse to one row per user, then keep only the derived favor columns.
    favors.drop_duplicates(['user_id'], inplace=True)
    favors.drop(favor_cols, axis=1, inplace=True)
    favors.reset_index(drop=True, inplace=True)

    users = pd.merge(users, favors, how='left', on=['user_id'])

    users.fillna(0, inplace=True)

    # Bayesian smoothing of the per-user click ratio
    # (I = browse counts, C = click counts).
    # NOTE(review): alpha/beta are defined outside this fragment — the local
    # fit below is commented out; confirm they carry the intended prior.
    I, C = users['browse_num'].values, users['click_num'].values
    #bs.update(I, C, 10000, 0.0000000001)
    #print(bs.alpha, bs.beta)
    #alpha, beta = bs.alpha, bs.beta
    ctr = []
    for i in range(len(I)):
        # Smoothed CTR: (clicks + alpha) / (browses + alpha + beta).
        ctr.append((C[i] + alpha) / (I[i] + alpha + beta))
    users['click_ratio'] = ctr

    print(users.info())

    # File name gains a '_sample' suffix when running on the sample subset.
    USER_FEATURE_FILE = 'user_feature'
    USER_FEATURE_FILE = USER_FEATURE_FILE + '_sample' + '.' + fmt if USE_SAMPLE else USER_FEATURE_FILE + '.' + fmt

    store_data(users, os.path.join(feature_store_path, USER_FEATURE_FILE), fmt)