Example #1
def merge(hdf_out, hdf_in):
    # Start the timer and print progress info
    start = util.print_start(hdf_out)

    # Load the data
    user = pd.read_hdf(path_intermediate_dataset + hdf_user_fg)
    context = pd.read_hdf(path_intermediate_dataset + hdf_in)

    # Merge into the dataset
    dataset = pd.merge(context, user, on='userID')
    del user
    del context
    gc.collect()

    ad = pd.read_hdf(path_intermediate_dataset + hdf_ad_fg)
    dataset = dataset.merge(ad, on='creativeID')
    del ad
    gc.collect()

    # Build the is_pref_cat feature
    dataset['is_pref_cat'] = dataset['appCategory'] == dataset['cat_pref']

    # Drop the creativeID and userID columns
    del dataset['creativeID']
    del dataset['userID']

    # Save
    util.safe_save(path_intermediate_dataset, hdf_out, dataset)

    # Stop the timer and print progress info
    util.print_stop(start)
Example #2
def f_user_activity():
    out_file = path_intermediate_dataset + hdf_user_activity
    if util.is_exist(out_file):
        return

    # Start the timer and print progress info
    start = util.print_start(hdf_user_activity)

    # Extract the user-activity feature
    user_app = pd.read_hdf(path_intermediate_dataset + hdf_user_app)
    user_activity = user_app.groupby('userID').count()
    user_activity.rename(columns={'appID': 'user_activity'}, inplace=True)

    # Manually free memory
    del user_app
    gc.collect()

    # Discretize into log-spaced bins
    interval = np.ceil(np.logspace(0, 3, 6))
    user_activity['user_activity'] = \
        pd.cut(user_activity['user_activity'], interval, include_lowest=True, labels=False)
    user_activity.reset_index(inplace=True)

    # Save
    util.safe_save(path_intermediate_dataset, hdf_user_activity, user_activity)

    # Stop the timer and print progress info
    util.print_stop(start)
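For reference, the bins produced by np.ceil(np.logspace(0, 3, 6)) and the labels pd.cut assigns can be checked standalone (exact values, not part of the pipeline):

import numpy as np
import pandas as pd

interval = np.ceil(np.logspace(0, 3, 6))
print(interval)  # [   1.    4.   16.   64.  252. 1000.]

# pd.cut assigns labels 0-4 to the five bins; counts above 1000 fall
# outside the bins and become NaN (Example #9's fg_user later fills
# that NaN with 5).
counts = pd.Series([1, 5, 20, 100, 500, 2000])
print(pd.cut(counts, interval, include_lowest=True, labels=False).tolist())
# [0.0, 1.0, 2.0, 3.0, 4.0, nan]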
Example #3
def f_app_popularity():
    out_file = path_intermediate_dataset + hdf_app_popularity
    if util.is_exist(out_file):
        return

    # Start the timer and print progress info
    start = util.print_start(hdf_app_popularity)

    # Load user_app
    user_app = pd.read_hdf(path_intermediate_dataset + hdf_user_app)

    # Extract the app-popularity feature
    app_popularity = user_app.groupby('appID').count()
    app_popularity.rename(columns={'userID': 'app_popularity'}, inplace=True)

    # Manually free memory
    del user_app
    gc.collect()

    # Save
    util.safe_save(path_intermediate_dataset, hdf_app_popularity,
                   app_popularity)

    # Stop the timer and print progress info
    util.print_stop(start)
Example #4
def tuning_hyper_parameters_sim():
    # Start the timer and print progress info
    start = time()
    print('\nStart tuning hyper parameters')

    # Load the training set
    X_train = load_npz(path_modeling_dataset + npz_X)
    y_train = np.load(path_modeling_dataset + npy_y)

    # Train the model
    from sklearn.linear_model import SGDClassifier
    clf = SGDClassifier(loss='log', alpha=0.01, n_jobs=-1)
    clf.fit(X_train, y_train)

    # Print the logloss on the training set
    from sklearn.metrics import log_loss
    print('logloss in trainset: ', log_loss(y_train, clf.predict_proba(X_train)))

    # Manually free memory
    del X_train
    del y_train
    gc.collect()

    # Save the model
    util.safe_save(path_model, 'sgd_lr.pkl', clf)

    # Stop the timer and print progress info
    util.print_stop(start)
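A note on loss='log': it makes SGDClassifier fit logistic regression via SGD, which is what provides the predict_proba call used above. In scikit-learn 1.1 the loss was renamed to 'log_loss' (the old spelling was later removed), so under a newer release the equivalent call would be:

# 'log' was renamed to 'log_loss' in scikit-learn 1.1
clf = SGDClassifier(loss='log_loss', alpha=0.01, n_jobs=-1)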
Example #5
def one_hot():
    # Start the timer and print progress info
    start = time()
    print('\nStart one hot')

    dataset = pd.read_hdf(path_intermediate_dataset + hdf_dataset)

    # y
    y = dataset['label']
    del dataset['label']
    util.safe_save(path_modeling_dataset, npy_y, y)

    # X
    from sklearn.preprocessing import OneHotEncoder
    enc = OneHotEncoder()
    X = enc.fit_transform(dataset.values)
    del dataset
    gc.collect()
    util.safe_save(path_modeling_dataset, npz_X, X)

    testset_ol = pd.read_hdf(path_intermediate_dataset + hdf_testset_ol)

    # X_test_ol
    X_test_ol = enc.transform(testset_ol.values)
    del testset_ol
    gc.collect()
    util.safe_save(path_modeling_dataset, npz_X_test_ol, X_test_ol)

    # Stop the timer and print progress info
    util.print_stop(start)
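One caveat on the encoder: it is fit on dataset only, so enc.transform raises a ValueError if testset_ol contains a category value never seen during fitting. If that can occur here, a defensive variant (standard scikit-learn API) is:

from sklearn.preprocessing import OneHotEncoder

# Unseen categories in the test set are encoded as all-zero rows
# instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore')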
Example #6
def f_hour_weight():
    out_file = path_intermediate_dataset + hdf_hour_weight
    if util.is_exist(out_file):
        return

    # Start the timer and print progress info
    start = util.print_start(hdf_hour_weight)

    # Load action.h5
    action = pd.read_hdf(path_intermediate_dataset + 'action.h5')

    # Extract the per-hour action counts
    hour_weight = (action['installTime'] / 100 %
                   100).astype(int).value_counts()
    hour_weight_df = DataFrame(hour_weight).reset_index()
    hour_weight_df.rename(columns={'index': 'hour',
                                   'installTime': 'hour_weight'},
                          inplace=True)

    # Manually free memory
    del action
    gc.collect()

    # Save
    util.safe_save(path_intermediate_dataset, hdf_hour_weight, hour_weight_df)

    # Stop the timer and print progress info
    util.print_stop(start)
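The / 100 % 100 arithmetic assumes installTime (like clickTime) is encoded as DDHHMM digits; dropping the last two digits and taking the remainder modulo 100 leaves the hour:

t = 170932                 # assuming DDHHMM encoding: day 17, 09:32
print(int(t / 100 % 100))  # 9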
Example #7
def f_user_pref_cat():
    out_file = path_intermediate_dataset + hdf_user_pref_cat
    if util.is_exist(out_file):
        return

    # Start the timer and print progress info
    start = util.print_start(hdf_user_pref_cat)

    # Load the data
    user_app_cat = pd.read_hdf(path_intermediate_dataset + hdf_user_app_cat)

    # Count how many apps of the same category each user has installed
    user_cat_count = user_app_cat.groupby(['userID', 'appCategory'],
                                          as_index=False).count()
    user_cat_count.rename(columns={'appID': 'count'}, inplace=True)

    # Manually free memory
    del user_app_cat
    gc.collect()

    # Take the most-installed known category (appCategory != 0) as the
    # user's preferred category
    group_idxmax = \
        user_cat_count.loc[user_cat_count['appCategory'] != 0, ['userID', 'count']].groupby('userID').idxmax()
    user_pref_cat = user_cat_count.loc[group_idxmax['count'],
                                       ['userID', 'appCategory']]
    user_pref_cat.rename(columns={'appCategory': 'cat_pref'}, inplace=True)

    # Save
    util.safe_save(path_intermediate_dataset, hdf_user_pref_cat, user_pref_cat)

    # Stop the timer and print progress info
    util.print_stop(start)
Example #8
def tuning_hyper_parameters():
    # Start the timer and print progress info
    start = time()
    print('\nStart tuning hyper parameters')

    # Load the training set
    X_train = load_npz(path_modeling_dataset + npz_X_train)
    y_train = np.load(path_modeling_dataset + npy_y_train)

    from sklearn.metrics import make_scorer, log_loss
    loss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

    from sklearn.model_selection import TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    # GridSearch
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import SGDClassifier
    alphas = np.logspace(-4, -1, 4)
    param_grid = {'alpha': alphas}
    generator = tscv.split(X_train)
    clf = GridSearchCV(SGDClassifier(loss='log', n_jobs=-1), param_grid, cv=generator, scoring=loss, n_jobs=-1)

    # Train the model
    clf.fit(X_train, y_train)

    # Print cv_results
    cv_results_df = \
        DataFrame(clf.cv_results_)[['rank_test_score', 'param_alpha', 'mean_train_score', 'mean_test_score']]
    cv_results_df.rename(
        columns={'mean_train_score': 'mean_train_loss',
                 'mean_test_score': 'mean_val_loss',
                 'rank_test_score': 'rank_val_loss'},
        inplace=True)
    cv_results_df[['mean_val_loss', 'mean_train_loss']] = -cv_results_df[['mean_val_loss', 'mean_train_loss']]
    print('cv results: ')
    print(cv_results_df)

    # Manually free memory
    del X_train
    del y_train
    gc.collect()

    # Load the test set
    X_test = load_npz(path_modeling_dataset + npz_X_test)
    y_test = np.load(path_modeling_dataset + npy_y_test)
    # Print the logloss on the test set (the scorer is negated, hence the minus sign)
    print('logloss in testset: ', -clf.score(X=X_test, y=y_test))

    # Manually free memory
    del X_test
    del y_test
    gc.collect()

    # Save the model
    util.safe_save(path_model, 'sgd_lr.pkl', clf.best_estimator_)

    # Stop the timer and print progress info
    util.print_stop(start)
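TimeSeriesSplit is what keeps this grid search leakage-free: in every fold the validation indices come strictly after the training indices. A quick standalone illustration:

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.arange(12).reshape(-1, 1)
for train_idx, val_idx in TimeSeriesSplit(n_splits=3).split(X):
    print(train_idx, val_idx)
# [0 1 2] [3 4 5]
# [0 1 2 3 4 5] [6 7 8]
# [0 1 2 3 4 5 6 7 8] [9 10 11]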
Example #9
def fg_user():
    out_file = path_intermediate_dataset + hdf_user_fg
    if util.is_exist(out_file):
        return

    # Start the timer and print progress info
    start = util.print_start(hdf_user_fg)

    # Load the user data
    user = pd.read_hdf(path_intermediate_dataset + hdf_user)

    # Bin age into segments
    age_interval = [0, 1, 4, 14, 29, 44, 59, 74, 84]
    user['age'] = pd.cut(user['age'],
                         age_interval,
                         right=False,
                         include_lowest=True,
                         labels=False)

    # Load and join the user-activity feature
    in_file = path_intermediate_dataset + hdf_user_activity
    if not os.path.exists(in_file):
        f_user_activity()
    user_activity = pd.read_hdf(in_file)
    user = user.merge(user_activity, how='left', on='userID')

    # Manually free memory
    del user_activity
    gc.collect()

    # Fill NaN in user_activity with 5
    user['user_activity'].fillna(5, inplace=True)

    # Load and join the user's app-category preference feature
    in_file = path_intermediate_dataset + hdf_user_pref_cat
    if not os.path.exists(in_file):
        f_user_pref_cat()
    user_pref_cat = pd.read_hdf(in_file)
    user = user.merge(user_pref_cat, how='left', on='userID')

    # Manually free memory
    del user_pref_cat
    gc.collect()

    # Fill NaN in cat_pref with 0
    user['cat_pref'].fillna(0, inplace=True)

    # Save
    util.safe_save(path_intermediate_dataset, hdf_user_fg, user)

    # Stop the timer and print progress info
    util.print_stop(start)
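With right=False the age bins are left-closed: [0, 1), [1, 4), ..., [74, 84), giving labels 0-7; any age of 84 or above falls outside the bins and becomes NaN. A quick check:

import pandas as pd

ages = pd.Series([0, 3, 20, 45, 90])
labels = pd.cut(ages, [0, 1, 4, 14, 29, 44, 59, 74, 84],
                right=False, include_lowest=True, labels=False)
print(labels.tolist())  # [0.0, 1.0, 3.0, 5.0, nan]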
Example #10
def f_hour():
    """
    构造与 hour 相关的特征。
    """
    out_file = path_intermediate_dataset + hdf_hour
    if util.is_exist(out_file):
        return

    # 开始计时,并打印相关信息
    start = util.print_start(hdf_hour)

    # 加载 train.h5
    train_df = pd.read_hdf(path_intermediate_dataset + hdf_train)

    # 从`clickTime`中提取`hour`特征
    train_df['hour'] = (train_df['clickTime'] / 100 % 100).astype(int)

    # 提取`hour`的`weight`特征
    hour_count_positive = train_df.loc[train_df['label'] == 1,
                                       'hour'].value_counts()
    hour_count_positive.sort_index(inplace=True)

    hour_count_positive_df = DataFrame(hour_count_positive)
    hour_count_positive_df.reset_index(inplace=True)
    hour_count_positive_df.columns = ['hour', 'hour_weight']

    # 提取`hour`的`conversion_ratio`特征
    hour_count = train_df['hour'].value_counts()
    hour_count.sort_index(inplace=True)

    hour_count_df = DataFrame(hour_count)
    hour_count_df.reset_index(inplace=True)
    hour_count_df.columns = ['hour', 'hour_weight']

    hour_count_positive_df['hour_conversion_ratio'] = \
        hour_count_positive_df['hour_weight'] / hour_count_df['hour_weight']

    # hour_conversion_ratio 归一化
    mx = hour_count_positive_df['hour_conversion_ratio'].max()
    mn = hour_count_positive_df['hour_conversion_ratio'].min()
    hour_count_positive_df['hour_conversion_ratio'] = \
        (hour_count_positive_df['hour_conversion_ratio'] - mn) / (mx - mn)

    # 存储
    util.safe_save(path_intermediate_dataset, hdf_hour, hour_count_positive_df)

    # 停止计时,并打印相关信息
    util.print_stop(start)
Example #11
def f_conversion_ratio_telecomsOperator():
    out_file = path_intermediate_dataset + hdf_conversion_ratio_telecomsOperator
    if util.is_exist(out_file):
        return

    # Start the timer and print progress info
    start = util.print_start(hdf_conversion_ratio_telecomsOperator)

    # Load the data
    train_df = pd.read_hdf(path_intermediate_dataset + hdf_train)
    # Extract the conversion-rate feature for telecomsOperator
    distribution_telecomsOperator = train_df['telecomsOperator'].value_counts(
        dropna=False)
    distribution_telecomsOperator.sort_index(inplace=True)

    distribution_telecomsOperator_positive = train_df.loc[
        train_df['label'] == 1, 'telecomsOperator'].value_counts(dropna=False)
    distribution_telecomsOperator_positive.sort_index(inplace=True)

    # Manually free memory
    del train_df
    gc.collect()

    distribution_telecomsOperator_ratio = distribution_telecomsOperator_positive / distribution_telecomsOperator

    # Min-max normalize the ratio
    mx = distribution_telecomsOperator_ratio.max()
    mn = distribution_telecomsOperator_ratio.min()
    distribution_telecomsOperator_ratio = (
        distribution_telecomsOperator_ratio - mn) / (mx - mn)

    conversion_ratio_telecomsOperator = DataFrame(
        distribution_telecomsOperator_ratio)
    conversion_ratio_telecomsOperator.reset_index(inplace=True)
    conversion_ratio_telecomsOperator.columns = [
        'telecomsOperator', 'conversion_ratio_telecomsOperator'
    ]

    # Manually free memory
    del distribution_telecomsOperator_ratio
    gc.collect()

    # Save
    util.safe_save(path_intermediate_dataset,
                   hdf_conversion_ratio_telecomsOperator,
                   conversion_ratio_telecomsOperator)

    # Stop the timer and print progress info
    util.print_stop(start)
Example #12
def train():
    out_file = path_intermediate_dataset + hdf_train
    if util.is_exist(out_file):
        return

    # Start the timer and print progress info
    start = util.print_start(hdf_train)

    # Load train.csv
    train_df = pd.read_csv(path_original_dataset + csv_train)

    # ===== Fill the missing values in telecomsOperator =====
    userID_telecomsOperator = train_df.groupby(['userID', 'telecomsOperator'],
                                               as_index=False).count()
    userID_count = userID_telecomsOperator['userID'].value_counts()
    del userID_telecomsOperator
    gc.collect()

    userID_count_set_2 = set(userID_count.loc[userID_count == 2].index.values)
    del userID_count
    gc.collect()
    userID_missing_value_set = set(
        train_df.loc[train_df['telecomsOperator'] == 0, 'userID'])

    # Set the missing values to NaN
    train_df.loc[train_df['telecomsOperator'] == 0,
                 'telecomsOperator'] = np.nan
    # Sort so each user's known operator precedes the NaN rows
    train_df.sort_values(by=['userID', 'telecomsOperator'], inplace=True)
    indexer = train_df['userID'].isin(userID_count_set_2
                                      & userID_missing_value_set)
    del userID_count_set_2
    del userID_missing_value_set
    gc.collect()
    # Fill the missing values
    train_df.loc[indexer, 'telecomsOperator'] = train_df.loc[
        indexer, 'telecomsOperator'].ffill()

    # Set the remaining missing values back to 0
    train_df['telecomsOperator'].fillna(value=0, inplace=True)

    # Save
    util.safe_save(path_intermediate_dataset, hdf_train, train_df)

    # Stop the timer and print progress info
    util.print_stop(start)

    gc.collect()
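The sort-then-ffill trick is easiest to see on a toy frame. NaN sorts last within each userID, so a user's known operator ends up directly above that user's missing rows; the indexer restricts the fill to users with exactly one known operator, which keeps ffill from copying a value across users:

import numpy as np
import pandas as pd

toy = pd.DataFrame({'userID': [1, 1, 2, 2],
                    'telecomsOperator': [np.nan, 3.0, 2.0, np.nan]})
# After sorting, each user's known value precedes the NaN row
toy.sort_values(by=['userID', 'telecomsOperator'], inplace=True)
toy['telecomsOperator'] = toy['telecomsOperator'].ffill()
print(toy['telecomsOperator'].tolist())  # [3.0, 3.0, 2.0, 2.0]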
Example #13
def user():
    out_file = path_intermediate_dataset + hdf_user
    if util.is_exist(out_file):
        return

    # Start the timer and print progress info
    start = util.print_start(hdf_user)

    in_file = path_original_dataset + csv_user
    user_df = pd.read_csv(in_file)

    # Coarsen the locations to province level
    user_df['hometown'] = (user_df['hometown'] / 100).astype(int)
    user_df['residence'] = (user_df['residence'] / 100).astype(int)

    # Save
    util.safe_save(path_intermediate_dataset, hdf_user, user_df)

    # Stop the timer and print progress info
    util.print_stop(start)
Example #14
def predict_test_ol():
    # Start the timer and print progress info
    start = time()
    print('\nStart predicting test_ol')

    # Load test_ol
    test_ol = pd.read_hdf(path_intermediate_dataset + hdf_test_ol)
    # # Load ad
    # ad = pd.read_hdf(path_intermediate_dataset + hdf_ad)
    # # Merge the tables
    # test_ol = test_ol.merge(ad[['creativeID', 'appID']], how='left', on='creativeID')
    # # Build the 'userID-appID' column
    # test_ol['userID-appID'] = test_ol['userID'].astype(str) + '-' + test_ol['appID'].astype(str)
    # # Load the 'userID-appID' pairs that already have install actions
    # userID_appID_test = pd.read_hdf(path_intermediate_dataset + 'userID_appID_for_test.h5')

    # Load X_test_ol and the model
    X_test_ol = load_npz(path_modeling_dataset + npz_X_test_ol)
    clf = joblib.load(path_model + 'sgd_lr.pkl')

    # Predict
    y_test_ol = clf.predict_proba(X_test_ol)

    # Build the submission dataset
    # submission = test_ol[['instanceID', 'label', 'userID-appID']].copy()
    submission = test_ol[['instanceID', 'label']].copy()
    submission.rename(columns={'label': 'prob'}, inplace=True)
    submission['prob'] = y_test_ol[:, 1]
    submission.set_index('instanceID', inplace=True)
    submission.sort_index(inplace=True)

    # # 'userID-appID' pairs that already have install actions should all be predicted as 0
    # submission.loc[submission['userID-appID'].isin(userID_appID_test), 'prob'] = 0
    # # Drop the userID-appID column
    # del submission['userID-appID']

    # Generate the compressed submission file
    util.safe_save(path_submission_dataset, csv_submission, submission)

    # Stop the timer and print progress info
    util.print_stop(start)
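predict_proba returns one column per class, ordered by clf.classes_; for labels {0, 1} the second column is P(label == 1), which is what y_test_ol[:, 1] picks out. A defensive variant looks the column up explicitly:

import numpy as np

# Find the column that corresponds to the positive class
positive_col = int(np.flatnonzero(clf.classes_ == 1)[0])
submission['prob'] = y_test_ol[:, positive_col]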
Example #15
def user_app_cat():
    out_file = path_intermediate_dataset + hdf_user_app_cat
    if util.is_exist(out_file):
        return

    # Start the timer and print progress info
    start = util.print_start(hdf_user_app_cat)

    # Load the data
    user_app_df = pd.read_hdf(path_intermediate_dataset + hdf_user_app)
    app_cat_df = pd.read_hdf(path_intermediate_dataset + hdf_app_cat)

    # Merge the tables
    user_app_cat_df = user_app_df.merge(app_cat_df, on='appID', how='left')

    # Save
    util.safe_save(path_intermediate_dataset, hdf_user_app_cat,
                   user_app_cat_df)

    # Stop the timer and print progress info
    util.print_stop(start)
Example #16
def split_train_test(train_proportion=0.8):
    """
    直接取后面 20% 的数据作为测试集
    
    Notes
    -----
    由于样本具有时序性,故不能使用 train_test_split 来随机划分,否则会导致数据泄露。
    """

    # 开始计时,并打印相关信息
    start = time()
    print('\nStart spliting train and test')

    # ===== X =====
    X = load_npz(path_modeling_dataset + npz_X)
    # 划分出训练集、测试集(注意不能随机划分)
    train_size = int(np.shape(X)[0] * train_proportion)
    # X_train
    X_train = X[:train_size, :]
    util.safe_save(path_modeling_dataset, npz_X_train, X_train)
    # X_test
    X_test = X[train_size:, :]
    util.safe_save(path_modeling_dataset, npz_X_test, X_test)
    # 手动释放内存
    del X

    # ===== y =====
    y = np.load(path_modeling_dataset + npy_y)
    # y_train
    y_train = y[:train_size]
    util.safe_save(path_modeling_dataset, npy_y_train, y_train)
    # y_test
    y_test = y[train_size:]
    util.safe_save(path_modeling_dataset, npy_y_test, y_test)
    # 手动释放内存
    del y
    gc.collect()

    # 停止计时,并打印相关信息
    util.print_stop(start)
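The docstring's warning applies to train_test_split's default shuffling. With shuffle=False, scikit-learn performs the same sequential head/tail split and could replace the manual slicing above:

from sklearn.model_selection import train_test_split

# shuffle=False keeps the time order: first 80% train, last 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, shuffle=False)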
Example #17
def transform_csv_to_hdf(csv, hdf):
    """

    :param csv: 
    :param hdf: 
    :return: 
    """
    out_file = path_intermediate_dataset + hdf
    if util.is_exist(out_file):
        return

    # 开始计时,并打印相关信息
    start = util.print_start(hdf)

    in_file = path_original_dataset + csv
    df = pd.read_csv(in_file)

    # 存储
    util.safe_save(path_intermediate_dataset, hdf, df)

    # 停止计时,并打印相关信息
    util.print_stop(start)
Example #18
def f_userID():
    out_file = path_intermediate_dataset + hdf_userID
    if util.is_exist(out_file):
        return

    # Start the timer and print progress info
    start = util.print_start(hdf_userID)

    # Load train.h5
    train_df = pd.read_hdf(path_intermediate_dataset + 'train.h5')

    # Extract the `conversion_count` feature from `userID`
    userID_count_positive = train_df.loc[train_df['label'] == 1,
                                         'userID'].value_counts()
    userID_count_positive.sort_index(inplace=True)

    userID_count_positive_df = DataFrame(userID_count_positive)
    userID_count_positive_df.reset_index(inplace=True)
    userID_count_positive_df.columns = ['userID', 'conversion_count']

    del userID_count_positive
    gc.collect()

    # Extract the `click_count_group` feature for `userID`
    userID_count = train_df['userID'].value_counts()
    userID_count.sort_index(inplace=True)

    userID_count_df = DataFrame(userID_count)
    userID_count_df.reset_index(inplace=True)
    userID_count_df.columns = ['userID', 'click_count']

    del userID_count
    gc.collect()

    # Bin click_count into groups
    bins = [1, 28, 44, 120]
    userID_count_df['click_count_group'] = \
        pd.cut(userID_count_df['click_count'], bins=bins, include_lowest=True, labels=False)

    # Merge
    f_userID_df = userID_count_df.merge(userID_count_positive_df,
                                        how='left',
                                        on='userID')
    del userID_count_df
    del userID_count_positive_df
    gc.collect()

    # Fill the missing values with 0
    f_userID_df['conversion_count'].fillna(value=0, inplace=True)

    # Extract the `conversion_ratio` feature for `userID`
    f_userID_df['conversion_ratio_click'] = f_userID_df[
        'conversion_count'] / f_userID_df['click_count']

    # Save
    del f_userID_df['click_count']
    util.safe_save(path_intermediate_dataset, hdf_userID, f_userID_df)

    # Manually free memory
    del train_df
    gc.collect()

    # Stop the timer and print progress info
    util.print_stop(start)
Example #19
def userID_appID_pair_installed():
    """ 为训练集准备已存在安装行为的 'userID-appID'

    Notes
    -----
    该函数所生成的数据是给处理训练集时使用的,对于已存在安装行为的 'userID-appID',其所
    对应的训练集中的样本应当直接舍弃。
    这样单独计算也是为了节省内存。因为train和test中的appID只占hdf_user_app中很少一部分
    """
    out_file = path_intermediate_dataset + hdf_userID_appID_pair_installed
    if util.is_exist(out_file):
        return

    # 开始计时,并打印相关信息
    start = util.print_start(hdf_userID_appID_pair_installed)

    # ===== train =====
    train_df = pd.read_hdf(path_intermediate_dataset + hdf_train)
    ad_df = pd.read_hdf(path_intermediate_dataset + hdf_ad)
    # 合并
    train_df = train_df.merge(ad_df, on='creativeID')
    # 单独提取出 userID, appID
    userID_set_train = set(train_df['userID'])
    appID_set_train = set(train_df['appID'])
    # 手动释放内存
    del train_df
    gc.collect()

    # ===== test_ol =====
    test_df = pd.read_hdf(path_intermediate_dataset + hdf_test_ol)
    # 合并
    test_df = test_df.merge(ad_df, on='creativeID')
    # 单独提取出 userID, appID
    userID_set_test_ol = set(test_df['userID'])
    appID_set_test_ol = set(test_df['appID'])
    # 手动释放内存
    del test_df
    del ad_df
    gc.collect()

    userID_set = userID_set_train | userID_set_test_ol
    appID_set = appID_set_train | appID_set_test_ol

    # 手动释放内存
    del userID_set_train
    del userID_set_test_ol
    del appID_set_train
    del appID_set_test_ol
    gc.collect()

    # 从 user_app 中提取出已经发生安装行为的 'userID_appID' 对
    user_app_df = pd.read_hdf(path_intermediate_dataset + hdf_user_app)
    indexer = user_app_df['userID'].isin(
        userID_set) & user_app_df['appID'].isin(appID_set)
    userID_appID_set = set(
        util.elegant_pairing(user_app_df.loc[indexer, 'userID'],
                             user_app_df.loc[indexer, 'appID']))
    del user_app_df
    gc.collect()

    # 从 action 中提取出已经发生安装行为的 'userID_appID' 对
    action_df = pd.read_hdf(path_intermediate_dataset + hdf_action)
    indexer = action_df['userID'].isin(userID_set) & action_df['appID'].isin(
        appID_set)
    userID_appID_set |= set(
        util.elegant_pairing(action_df.loc[indexer, 'userID'],
                             action_df.loc[indexer, 'appID']))
    del action_df
    gc.collect()

    # 通过 list 转换为 Series 以存为 hdf5 格式
    util.safe_save(path_intermediate_dataset, hdf_userID_appID_pair_installed,
                   Series(list(userID_appID_set)))

    # 停止计时,并打印相关信息
    util.print_stop(start)

    gc.collect()
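util.elegant_pairing itself is not shown in these examples. The name matches Szudzik's "elegant pairing" function, a bijection from pairs of non-negative integers to a single integer, which lets a (userID, appID) pair be stored as one number instead of a memory-hungry string. A sketch under that assumption (the real implementation may differ):

import numpy as np

def elegant_pairing(x, y):
    # Szudzik's pairing: a unique integer for every pair (x, y), x, y >= 0.
    # int64 avoids overflow for IDs up to roughly 3e9.
    x = np.asarray(x, dtype=np.int64)
    y = np.asarray(y, dtype=np.int64)
    return np.where(x >= y, x * x + x + y, y * y + x)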
Example #20
def fg_ad():
    out_file = path_intermediate_dataset + hdf_ad_fg
    if util.is_exist(out_file):
        return

    # Start the timer and print progress info
    start = util.print_start(hdf_ad_fg)

    # Load ad.h5 and app_cat.h5
    ad = pd.read_hdf(path_intermediate_dataset + hdf_ad)
    app_cat = pd.read_hdf(path_intermediate_dataset + hdf_app_cat)

    # Merge ad and app_cat
    ad = ad.merge(app_cat, on='appID')

    # Manually free memory
    del app_cat
    gc.collect()

    # Load app_popularity.h5
    in_file = path_intermediate_dataset + hdf_app_popularity
    if not os.path.exists(in_file):
        f_app_popularity()
    app_popularity = pd.read_hdf(in_file)

    # Merge the tables.
    # Because this feature was built with a groupby, its index is 'appID';
    # restore it to a column before merging.
    app_popularity.reset_index(inplace=True)
    ad = ad.merge(app_popularity, how='left', on='appID')

    # Manually free memory
    del app_popularity
    gc.collect()

    # Discretize app_popularity into log-spaced bins
    ad['app_popularity'] = pd.cut(ad['app_popularity'],
                                  np.logspace(0, 7, num=8),
                                  include_lowest=True,
                                  labels=False)

    # Fill NaN in app_popularity with 6
    ad['app_popularity'].fillna(6, inplace=True)

    # Select a subset of the features
    selected_feature = [
        'creativeID',
        # 'adID', \
        # 'camgaignID', \
        'advertiserID',
        # 'appID',
        'appPlatform',
        'appCategory',
        'app_popularity'
    ]
    ad_selected = ad[selected_feature]

    # Save
    util.safe_save(path_intermediate_dataset, hdf_ad_fg, ad_selected)

    del ad
    gc.collect()

    # Stop the timer and print progress info
    util.print_stop(start)
Example #21
def fg_context(hdf_out, hdf_in):
    out_file = path_intermediate_dataset + hdf_out
    if util.is_exist(out_file):
        return

    # Start the timer and print progress info
    start = util.print_start(hdf_out)

    # Load hdf_in
    df = pd.read_hdf(path_intermediate_dataset + hdf_in)

    # Add the conversion-rate feature for connectionType
    in_file = path_intermediate_dataset + hdf_conversion_ratio_connectionType
    if not os.path.exists(in_file):
        f_conversion_ratio_connectionType()
    conversion_ratio_connectionType = pd.read_hdf(in_file)
    df = df.merge(conversion_ratio_connectionType,
                  how='left',
                  on='connectionType')
    print('added connectionType conversion-rate feature', df.shape)
    del conversion_ratio_connectionType

    # Add the conversion-rate feature for telecomsOperator
    in_file = path_intermediate_dataset + hdf_conversion_ratio_telecomsOperator
    if not os.path.exists(in_file):
        f_conversion_ratio_telecomsOperator()
    conversion_ratio_telecomsOperator = pd.read_hdf(in_file)
    df = df.merge(conversion_ratio_telecomsOperator,
                  how='left',
                  on='telecomsOperator')
    print('added telecomsOperator conversion-rate feature', df.shape)
    del conversion_ratio_telecomsOperator

    # Extract the hour feature
    df['hour'] = (df['clickTime'] / 100 % 100).astype(int)

    # Add the `hour`-related features
    in_file = path_intermediate_dataset + hdf_hour
    if not os.path.exists(in_file):
        f_hour()
    f_hour_df = pd.read_hdf(in_file)
    df = df.merge(f_hour_df, how='left', on='hour')
    print('added hour-related features', df.shape)
    del f_hour_df
    gc.collect()

    # Add the `userID`-related features
    in_file = path_intermediate_dataset + hdf_userID
    if not os.path.exists(in_file):
        f_userID()
    f_userID_df = pd.read_hdf(in_file)
    df = df.merge(f_userID_df, on='userID', how='left')
    print('added userID-related features', df.shape)
    del f_userID_df
    gc.collect()
    # For the online test set, missing values appear at this point
    if 'test' in hdf_in:
        # Fill missing conversion_count with 0
        df['conversion_count'].fillna(value=0, inplace=True)
        # Fill missing click_count_group with 0
        df['click_count_group'].fillna(value=0, inplace=True)
        # Fill missing conversion_ratio_click with 0
        df['conversion_ratio_click'].fillna(value=0, inplace=True)

    # ===== Add the "does this userID_appID already have an install action" feature =====
    # Load ad.h5
    ad_df = pd.read_hdf(path_intermediate_dataset + hdf_ad)

    # Merge in ad
    df = df.merge(ad_df[['creativeID', 'appID']], how='left', on='creativeID')
    print('merged ad', df.shape)
    del ad_df
    gc.collect()

    # # Build the 'userID-appID' column. Is there a better encoding? str uses
    # # far too much memory. There is: behold the pairing function below!
    # df['userID-appID'] = df['userID'].astype(str) + '-' + df['appID'].astype(str)

    df['userID-appID'] = util.elegant_pairing(df['userID'], df['appID'])
    del df['appID']
    gc.collect()

    # Load userID_appID.h5
    userID_appID = pd.read_hdf(path_intermediate_dataset +
                               hdf_userID_appID_pair_installed)

    # Keeping only the userID_appID pairs without install actions (an operation
    # that arguably belongs in data cleaning) was one option; extracting it as
    # a feature is better
    # df = df.loc[~df['userID-appID'].isin(userID_appID)]
    df['is_installed'] = df['userID-appID'].isin(userID_appID)

    # Free memory
    del df['userID-appID']
    del userID_appID
    gc.collect()

    # Load pos.h5
    pos_df = pd.read_hdf(path_intermediate_dataset + hdf_pos)
    # Merge the tables
    df = df.merge(pos_df, on='positionID', how='left')
    print('merged pos', df.shape)
    del pos_df
    gc.collect()

    # Prepare for saving
    if 'train' in hdf_in:
        # Discard the negative samples from the last 5 days (this probably
        # belongs in data cleaning). clickTime is used here, so it must not
        # be dropped before this point.
        df = df.loc[(df['clickTime'] < 260000) | (df['label'] != 0)]
        del df['clickTime']
        del df['conversionTime']
        del df['positionID']
    elif 'test' in hdf_in:
        del df['instanceID']
        del df['label']
        del df['clickTime']
        del df['positionID']

    # Save
    util.safe_save(path_intermediate_dataset, hdf_out, df)

    # Stop the timer and print progress info
    util.print_stop(start)