Example #1
def f_hour_weight():
    out_file = path_intermediate_dataset + hdf_hour_weight
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_hour_weight)

    # Load action.h5
    action = pd.read_hdf(path_intermediate_dataset + 'action.h5')

    # Extract the per-hour action counts (hour taken from installTime)
    hour_weight = (action['installTime'] / 100 %
                   100).astype(int).value_counts()
    hour_weight_df = DataFrame(hour_weight).reset_index()
    hour_weight_df.rename(columns={
        'index': 'hour',
        'installTime': 'hour_weight'
    },
                          inplace=True)

    # Manually release memory
    del action
    gc.collect()

    # Save
    util.safe_save(path_intermediate_dataset, hdf_hour_weight, hour_weight_df)

    # Stop the timer and print related info
    util.print_stop(start)
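Every example on this page relies on a small `util` module that is not shown here. As orientation only, here is a minimal sketch of what the helpers being called might look like; these are hypothetical implementations, and the real module may differ. (A possible form of `util.elegant_pairing` is sketched after Example #14.)

import os
import time


# Hypothetical stand-ins for the project's util helpers; the names match the
# calls in the examples, but the bodies are assumptions.
def is_exist(path):
    # Skip regeneration when the intermediate file already exists on disk.
    return os.path.exists(path)


def print_start(name):
    # Announce the task and return a start timestamp for elapsed-time reporting.
    print('start building {}'.format(name))
    return time.time()


def print_stop(start):
    # Report how long the task took.
    print('done in {:.1f} s'.format(time.time() - start))


def safe_save(path, hdf_name, data):
    # Persist a DataFrame/Series as HDF5, creating the directory if needed.
    os.makedirs(path, exist_ok=True)
    data.to_hdf(path + hdf_name, key='data')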
Example #2
def merge(hdf_out, hdf_in):
    # Start the timer and print related info
    start = util.print_start(hdf_out)

    # Load data
    user = pd.read_hdf(path_intermediate_dataset + hdf_user_fg)
    context = pd.read_hdf(path_intermediate_dataset + hdf_in)

    # Merge into the dataset
    dataset = pd.merge(context, user, on='userID')
    del user
    del context
    gc.collect()

    ad = pd.read_hdf(path_intermediate_dataset + hdf_ad_fg)
    dataset = dataset.merge(ad, on='creativeID')
    del ad
    gc.collect()

    # Construct the is_pref_cat feature
    dataset['is_pref_cat'] = dataset['appCategory'] == dataset['cat_pref']

    # Drop the creativeID and userID columns
    del dataset['creativeID']
    del dataset['userID']

    # Save
    util.safe_save(path_intermediate_dataset, hdf_out, dataset)

    # Stop the timer and print related info
    util.print_stop(start)
Example #3
def f_app_popularity():
    out_file = path_intermediate_dataset + hdf_app_popularity
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_app_popularity)

    # Load user_app
    user_app = pd.read_hdf(path_intermediate_dataset + hdf_user_app)

    # Extract the popularity feature of each app
    app_popularity = user_app.groupby('appID').count()
    app_popularity.rename(columns={'userID': 'app_popularity'}, inplace=True)

    # Manually release memory
    del user_app
    gc.collect()

    # Save
    util.safe_save(path_intermediate_dataset, hdf_app_popularity,
                   app_popularity)

    # Stop the timer and print related info
    util.print_stop(start)
Example #4
def f_user_pref_cat():
    out_file = path_intermediate_dataset + hdf_user_pref_cat
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_user_pref_cat)

    # Load data
    user_app_cat = pd.read_hdf(path_intermediate_dataset + hdf_user_app_cat)

    # Count how many apps of the same category each user has installed
    user_cat_count = user_app_cat.groupby(['userID', 'appCategory'],
                                          as_index=False).count()
    user_cat_count.rename(columns={'appID': 'count'}, inplace=True)

    # Manually release memory
    del user_app_cat
    gc.collect()

    # Take the non-unknown (non-zero) category with the highest count as the user's preferred category
    group_idxmax = \
        user_cat_count.loc[user_cat_count['appCategory'] != 0, ['userID', 'count']].groupby('userID').idxmax()
    user_pref_cat = user_cat_count.loc[group_idxmax['count'],
                                       ['userID', 'appCategory']]
    user_pref_cat.rename(columns={'appCategory': 'cat_pref'}, inplace=True)

    # Save
    util.safe_save(path_intermediate_dataset, hdf_user_pref_cat, user_pref_cat)

    # Stop the timer and print related info
    util.print_stop(start)
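The `groupby('userID').idxmax()` call above returns, for each userID, the row label of its highest count, and `.loc` then pulls the matching appCategory rows. A small self-contained illustration of the same pattern, with made-up data:

import pandas as pd

user_cat_count = pd.DataFrame({
    'userID':      [1, 1, 2, 2],
    'appCategory': [103, 201, 103, 301],
    'count':       [2, 5, 7, 1],
})

# Row label of the largest 'count' per user ...
group_idxmax = user_cat_count[['userID', 'count']].groupby('userID').idxmax()
# ... then look those rows up to get each user's preferred category.
user_pref_cat = user_cat_count.loc[group_idxmax['count'], ['userID', 'appCategory']]
print(user_pref_cat)
#    userID  appCategory
# 1       1          201
# 2       2          103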
Example #5
def f_user_activity():
    out_file = path_intermediate_dataset + hdf_user_activity
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_user_activity)

    # Extract the user-activity feature
    user_app = pd.read_hdf(path_intermediate_dataset + hdf_user_app)
    user_activity = user_app.groupby('userID').count()
    user_activity.rename(columns={'appID': 'user_activity'}, inplace=True)

    # Manually release memory
    del user_app
    gc.collect()

    # Discretize
    interval = np.ceil(np.logspace(0, 3, 6))
    user_activity['user_activity'] = \
        pd.cut(user_activity['user_activity'], interval, include_lowest=True, labels=False)
    user_activity.reset_index(inplace=True)

    # Save
    util.safe_save(path_intermediate_dataset, hdf_user_activity, user_activity)

    # Stop the timer and print related info
    util.print_stop(start)
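For reference, the bin edges used above evaluate to [1, 4, 16, 64, 252, 1000], so `pd.cut(..., labels=False)` maps each install count into one of five logarithmic buckets labelled 0 through 4; users absent from user_app are later assigned 5 in `fg_user`.

import numpy as np

# Six logarithmically spaced points between 10**0 and 10**3, rounded up.
print(np.ceil(np.logspace(0, 3, 6)))
# [   1.    4.   16.   64.  252. 1000.]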
Example #6
def fg_user():
    out_file = path_intermediate_dataset + hdf_user_fg
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_user_fg)

    # Load the user data
    user = pd.read_hdf(path_intermediate_dataset + hdf_user)

    # Bin age into segments
    age_interval = [0, 1, 4, 14, 29, 44, 59, 74, 84]
    user['age'] = pd.cut(user['age'],
                         age_interval,
                         right=False,
                         include_lowest=True,
                         labels=False)

    # Load and add the user-activity feature
    in_file = path_intermediate_dataset + hdf_user_activity
    if not os.path.exists(in_file):
        f_user_activity()
    user_activity = pd.read_hdf(in_file)
    user = user.merge(user_activity, how='left', on='userID')

    # Manually release memory
    del user_activity
    gc.collect()

    # Fill NaN in user_activity with 5
    user['user_activity'].fillna(5, inplace=True)

    # Load and add the user's app-category preference feature
    in_file = path_intermediate_dataset + hdf_user_pref_cat
    if not os.path.exists(in_file):
        f_user_pref_cat()
    user_pref_cat = pd.read_hdf(in_file)
    user = user.merge(user_pref_cat, how='left', on='userID')

    # Manually release memory
    del user_pref_cat
    gc.collect()

    # Fill NaN in cat_pref with 0
    user['cat_pref'].fillna(0, inplace=True)

    # Save
    util.safe_save(path_intermediate_dataset, hdf_user_fg, user)

    # Stop the timer and print related info
    util.print_stop(start)
Example #7
def train():
    out_file = path_intermediate_dataset + hdf_train
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_train)

    # Load train.csv
    train_df = pd.read_csv(path_original_dataset + csv_train)

    # ===== Fill the missing values in telecomsOperator =====
    userID_telecomsOperator = train_df.groupby(['userID', 'telecomsOperator'],
                                               as_index=False).count()
    userID_count = userID_telecomsOperator['userID'].value_counts()
    del userID_telecomsOperator
    gc.collect()

    userID_count_set_2 = set(userID_count.loc[userID_count == 2].index.values)
    del userID_count
    gc.collect()
    userID_missing_value_set = set(
        train_df.loc[train_df['telecomsOperator'] == 0, 'userID'])

    # Set the missing values to NaN
    train_df.loc[train_df['telecomsOperator'] == 0,
                 'telecomsOperator'] = np.nan
    # Sort
    train_df.sort_values(by=['userID', 'telecomsOperator'], inplace=True)
    indexer = train_df['userID'].isin(userID_count_set_2
                                      & userID_missing_value_set)
    del userID_count_set_2
    del userID_missing_value_set
    gc.collect()
    # Fill the missing values
    train_df.loc[indexer, 'telecomsOperator'] = train_df.loc[
        indexer, 'telecomsOperator'].ffill()

    # Set the remaining missing values to 0
    train_df['telecomsOperator'].fillna(value=0, inplace=True)

    # Save
    util.safe_save(path_intermediate_dataset, hdf_train, train_df)

    # Stop the timer and print related info
    util.print_stop(start)

    gc.collect()
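The fill strategy above only targets users that occur with exactly two distinct telecomsOperator values, one of which is the missing marker 0: after sorting by (userID, telecomsOperator), each NaN row lands directly after that user's known value, so a forward fill copies it. A toy illustration of the idea (made-up data, and the fillable users are hard-coded here instead of being derived from the counts):

import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'userID':           [7, 7, 8, 8],
    'telecomsOperator': [2.0, 0.0, 0.0, 0.0],  # user 7 has one known operator, user 8 has none
})

# Mark the missing operator values and sort them behind the known ones.
toy.loc[toy['telecomsOperator'] == 0, 'telecomsOperator'] = np.nan
toy.sort_values(by=['userID', 'telecomsOperator'], inplace=True)

# Forward-fill only the users with exactly one known operator (here: user 7).
fillable = toy['userID'].isin({7})
toy.loc[fillable, 'telecomsOperator'] = toy.loc[fillable, 'telecomsOperator'].ffill()

# Everything still missing (user 8) falls back to 0, as in train().
toy['telecomsOperator'] = toy['telecomsOperator'].fillna(0)
print(toy)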
Example #8
def f_conversion_ratio_telecomsOperator():
    out_file = path_intermediate_dataset + hdf_conversion_ratio_telecomsOperator
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_conversion_ratio_telecomsOperator)

    # Load data
    train_df = pd.read_hdf(path_intermediate_dataset + hdf_train)
    # Extract the conversion-rate feature of telecomsOperator
    distribution_telecomsOperator = train_df['telecomsOperator'].value_counts(
        dropna=False)
    distribution_telecomsOperator.sort_index(inplace=True)

    distribution_telecomsOperator_positive = train_df.loc[
        train_df['label'] == 1, 'telecomsOperator'].value_counts(dropna=False)
    distribution_telecomsOperator_positive.sort_index(inplace=True)

    # Manually release memory
    del train_df
    gc.collect()

    distribution_telecomsOperator_ratio = distribution_telecomsOperator_positive / distribution_telecomsOperator

    mx = distribution_telecomsOperator_ratio.max()
    mn = distribution_telecomsOperator_ratio.min()
    distribution_telecomsOperator_ratio = (
        distribution_telecomsOperator_ratio - mn) / (mx - mn)

    conversion_ratio_telecomsOperator = DataFrame(
        distribution_telecomsOperator_ratio)
    conversion_ratio_telecomsOperator.reset_index(inplace=True)
    conversion_ratio_telecomsOperator.columns = [
        'telecomsOperator', 'conversion_ratio_telecomsOperator'
    ]

    # Manually release memory
    del distribution_telecomsOperator_ratio
    gc.collect()

    # Save
    util.safe_save(path_intermediate_dataset,
                   hdf_conversion_ratio_telecomsOperator,
                   conversion_ratio_telecomsOperator)

    # Stop the timer and print related info
    util.print_stop(start)
Example #9
def f_hour():
    """
    Construct the features related to hour.
    """
    out_file = path_intermediate_dataset + hdf_hour
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_hour)

    # Load train.h5
    train_df = pd.read_hdf(path_intermediate_dataset + hdf_train)

    # Extract the `hour` feature from `clickTime`
    train_df['hour'] = (train_df['clickTime'] / 100 % 100).astype(int)

    # Extract the `weight` feature of `hour`
    hour_count_positive = train_df.loc[train_df['label'] == 1,
                                       'hour'].value_counts()
    hour_count_positive.sort_index(inplace=True)

    hour_count_positive_df = DataFrame(hour_count_positive)
    hour_count_positive_df.reset_index(inplace=True)
    hour_count_positive_df.columns = ['hour', 'hour_weight']

    # Extract the `conversion_ratio` feature of `hour`
    hour_count = train_df['hour'].value_counts()
    hour_count.sort_index(inplace=True)

    hour_count_df = DataFrame(hour_count)
    hour_count_df.reset_index(inplace=True)
    hour_count_df.columns = ['hour', 'hour_weight']

    hour_count_positive_df['hour_conversion_ratio'] = \
        hour_count_positive_df['hour_weight'] / hour_count_df['hour_weight']

    # Min-max normalize hour_conversion_ratio
    mx = hour_count_positive_df['hour_conversion_ratio'].max()
    mn = hour_count_positive_df['hour_conversion_ratio'].min()
    hour_count_positive_df['hour_conversion_ratio'] = \
        (hour_count_positive_df['hour_conversion_ratio'] - mn) / (mx - mn)

    # Save
    util.safe_save(path_intermediate_dataset, hdf_hour, hour_count_positive_df)

    # Stop the timer and print related info
    util.print_stop(start)
Example #10
def user():
    out_file = path_intermediate_dataset + hdf_user
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_user)

    in_file = path_original_dataset + csv_user
    user_df = pd.read_csv(in_file)

    # Adjust the geographic locations to province level
    user_df['hometown'] = (user_df['hometown'] / 100).astype(int)
    user_df['residence'] = (user_df['residence'] / 100).astype(int)

    # Save
    util.safe_save(path_intermediate_dataset, hdf_user, user_df)

    # Stop the timer and print related info
    util.print_stop(start)
Example #11
def user_app_cat():
    out_file = path_intermediate_dataset + hdf_user_app_cat
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_user_app_cat)

    # Load data
    user_app_df = pd.read_hdf(path_intermediate_dataset + hdf_user_app)
    app_cat_df = pd.read_hdf(path_intermediate_dataset + hdf_app_cat)

    # Merge the tables
    user_app_cat_df = user_app_df.merge(app_cat_df, on='appID', how='left')

    # Save
    util.safe_save(path_intermediate_dataset, hdf_user_app_cat,
                   user_app_cat_df)

    # Stop the timer and print related info
    util.print_stop(start)
Example #12
def transform_csv_to_hdf(csv, hdf):
    """Convert a CSV file from the original dataset into HDF5 format.

    :param csv: file name of the source CSV under path_original_dataset
    :param hdf: file name of the target HDF5 file under path_intermediate_dataset
    :return: None
    """
    out_file = path_intermediate_dataset + hdf
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf)

    in_file = path_original_dataset + csv
    df = pd.read_csv(in_file)

    # Save
    util.safe_save(path_intermediate_dataset, hdf, df)

    # Stop the timer and print related info
    util.print_stop(start)
Example #13
def userID_appID_pair_installed():
    """ Prepare the 'userID-appID' pairs that already have an install action, for the training set.

    Notes
    -----
    The data produced by this function is used when processing the training set:
    samples whose 'userID-appID' pair already has an install action should simply
    be discarded.
    Computing this separately also saves memory, because the appIDs in train and
    test make up only a small fraction of hdf_user_app.
    """
    out_file = path_intermediate_dataset + hdf_userID_appID_pair_installed
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_userID_appID_pair_installed)

    # ===== train =====
    train_df = pd.read_hdf(path_intermediate_dataset + hdf_train)
    ad_df = pd.read_hdf(path_intermediate_dataset + hdf_ad)
    # Merge
    train_df = train_df.merge(ad_df, on='creativeID')
    # Extract userID and appID separately
    userID_set_train = set(train_df['userID'])
    appID_set_train = set(train_df['appID'])
    # Manually release memory
    del train_df
    gc.collect()

    # ===== test_ol =====
    test_df = pd.read_hdf(path_intermediate_dataset + hdf_test_ol)
    # Merge
    test_df = test_df.merge(ad_df, on='creativeID')
    # Extract userID and appID separately
    userID_set_test_ol = set(test_df['userID'])
    appID_set_test_ol = set(test_df['appID'])
    # Manually release memory
    del test_df
    del ad_df
    gc.collect()

    userID_set = userID_set_train | userID_set_test_ol
    appID_set = appID_set_train | appID_set_test_ol

    # Manually release memory
    del userID_set_train
    del userID_set_test_ol
    del appID_set_train
    del appID_set_test_ol
    gc.collect()

    # Extract from user_app the 'userID_appID' pairs that already have an install action
    user_app_df = pd.read_hdf(path_intermediate_dataset + hdf_user_app)
    indexer = user_app_df['userID'].isin(
        userID_set) & user_app_df['appID'].isin(appID_set)
    userID_appID_set = set(
        util.elegant_pairing(user_app_df.loc[indexer, 'userID'],
                             user_app_df.loc[indexer, 'appID']))
    del user_app_df
    gc.collect()

    # Extract from action the 'userID_appID' pairs that already have an install action
    action_df = pd.read_hdf(path_intermediate_dataset + hdf_action)
    indexer = action_df['userID'].isin(userID_set) & action_df['appID'].isin(
        appID_set)
    userID_appID_set |= set(
        util.elegant_pairing(action_df.loc[indexer, 'userID'],
                             action_df.loc[indexer, 'appID']))
    del action_df
    gc.collect()

    # Convert to a Series (via list) so it can be saved in HDF5 format
    util.safe_save(path_intermediate_dataset, hdf_userID_appID_pair_installed,
                   Series(list(userID_appID_set)))

    # Stop the timer and print related info
    util.print_stop(start)

    gc.collect()
Example #14
def fg_context(hdf_out, hdf_in):
    out_file = path_intermediate_dataset + hdf_out
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_out)

    # Load hdf_in
    df = pd.read_hdf(path_intermediate_dataset + hdf_in)

    # Add the conversion-rate feature of connectionType
    in_file = path_intermediate_dataset + hdf_conversion_ratio_connectionType
    if not os.path.exists(in_file):
        f_conversion_ratio_connectionType()
    conversion_ratio_connectionType = pd.read_hdf(in_file)
    df = df.merge(conversion_ratio_connectionType,
                  how='left',
                  on='connectionType')
    print('added the conversion-rate feature of connectionType', df.shape)
    del conversion_ratio_connectionType

    # Add the conversion-rate feature of telecomsOperator
    in_file = path_intermediate_dataset + hdf_conversion_ratio_telecomsOperator
    if not os.path.exists(in_file):
        f_conversion_ratio_telecomsOperator()
    conversion_ratio_telecomsOperator = pd.read_hdf(in_file)
    df = df.merge(conversion_ratio_telecomsOperator,
                  how='left',
                  on='telecomsOperator')
    print('added the conversion-rate feature of telecomsOperator', df.shape)
    del conversion_ratio_telecomsOperator

    # Extract the hour feature
    df['hour'] = (df['clickTime'] / 100 % 100).astype(int)

    # Add the features related to `hour`
    in_file = path_intermediate_dataset + hdf_hour
    if not os.path.exists(in_file):
        f_hour()
    f_hour_df = pd.read_hdf(in_file)
    df = df.merge(f_hour_df, how='left', on='hour')
    print('added the hour-related features', df.shape)
    del f_hour_df
    gc.collect()

    # Add the features related to `userID`
    in_file = path_intermediate_dataset + hdf_userID
    if not os.path.exists(in_file):
        f_userID()
    f_userID_df = pd.read_hdf(in_file)
    df = df.merge(f_userID_df, on='userID', how='left')
    print('added the userID-related features', df.shape)
    del f_userID_df
    gc.collect()
    # For the online test set, missing values will appear at this point
    if 'test' in hdf_in:
        # Fill the missing conversion_count with 0
        df['conversion_count'].fillna(value=0, inplace=True)
        # Fill the missing click_count_group with 0
        df['click_count_group'].fillna(value=0, inplace=True)
        # Fill the missing conversion_ratio_click with 0
        df['conversion_ratio_click'].fillna(value=0, inplace=True)

    # ===== Add the feature "does this userID_appID already have an install action" =====
    # Load ad.h5
    ad_df = pd.read_hdf(path_intermediate_dataset + hdf_ad)

    # Merge train and ad
    df = df.merge(ad_df[['creativeID', 'appID']], how='left', on='creativeID')
    print('merged ad', df.shape)
    del ad_df
    gc.collect()

    # # Build the 'userID-appID' column. Is there a better encoding? str eats far too much memory. Yes -- behold my awesome pairing function below!
    # df['userID-appID'] = df['userID'].astype(str) + '-' + df['appID'].astype(str)

    df['userID-appID'] = util.elegant_pairing(df['userID'], df['appID'])
    del df['appID']
    gc.collect()

    # Load userID_appID.h5
    userID_appID = pd.read_hdf(path_intermediate_dataset +
                               hdf_userID_appID_pair_installed)

    # Keep only the userID_appID pairs without an install action (this step should also go into data cleaning); on second thought, extracting it as a feature is better
    # df = df.loc[~df['userID-appID'].isin(userID_appID)]
    df['is_installed'] = df['userID-appID'].isin(userID_appID)

    # Release memory
    del df['userID-appID']
    del userID_appID
    gc.collect()

    # Load pos.h5
    pos_df = pd.read_hdf(path_intermediate_dataset + hdf_pos)
    # Merge the tables
    df = df.merge(pos_df, on='positionID', how='left')
    print('merged pos', df.shape)
    del pos_df
    gc.collect()

    # Prepare for saving
    if 'train' in hdf_in:
        # Discard the negative samples of the last 5 days (this probably belongs in data cleaning)
        # clickTime is needed here, so it must not be dropped before this point
        df = df.loc[(df['clickTime'] < 260000) | (df['label'] != 0)]
        del df['clickTime']
        del df['conversionTime']
        del df['positionID']
    elif 'test' in hdf_in:
        del df['instanceID']
        del df['label']
        del df['clickTime']
        del df['positionID']

    # Save
    util.safe_save(path_intermediate_dataset, hdf_out, df)

    # Stop the timer and print related info
    util.print_stop(start)
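`util.elegant_pairing`, used above to replace the memory-hungry string key that is commented out, is not shown on this page. Judging by its name, it is presumably a Szudzik-style ("elegant") pairing function that folds two non-negative integers into one unique integer; a plausible vectorized sketch, as an assumption rather than the project's exact implementation:

import numpy as np


def elegant_pairing(a, b):
    # Szudzik's pairing function, vectorized: maps each pair of non-negative
    # integers (a, b) to a unique integer, which is much cheaper to store and
    # compare than the string 'a-b'. Assumed implementation, not the original.
    a = np.asarray(a, dtype=np.int64)
    b = np.asarray(b, dtype=np.int64)
    return np.where(a >= b, a * a + a + b, b * b + a)


# e.g. elegant_pairing([1, 2], [2, 2]) -> array([5, 8])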
Example #15
def fg_ad():
    out_file = path_intermediate_dataset + hdf_ad_fg
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_ad_fg)

    # Load ad.h5 and app_cat.h5
    ad = pd.read_hdf(path_intermediate_dataset + hdf_ad)
    app_cat = pd.read_hdf(path_intermediate_dataset + hdf_app_cat)

    # Merge ad and app_cat
    ad = ad.merge(app_cat, on='appID')

    # Manually release memory
    del app_cat
    gc.collect()

    # Load app_popularity.h5
    in_file = path_intermediate_dataset + hdf_app_popularity
    if not os.path.exists(in_file):
        f_app_popularity()
    app_popularity = pd.read_hdf(in_file)

    # Merge the tables.
    # Because groupby was used to build this feature, its index is currently 'appID';
    # restore it to a column before merging.
    app_popularity.reset_index(inplace=True)
    ad = ad.merge(app_popularity, how='left', on='appID')

    # Manually release memory
    del app_popularity
    gc.collect()

    # Discretize app_popularity
    ad['app_popularity'] = pd.cut(ad['app_popularity'],
                                  np.logspace(0, 7, num=8),
                                  include_lowest=True,
                                  labels=False)

    # Fill NaN in app_popularity with 6
    ad['app_popularity'].fillna(6, inplace=True)

    # Select a subset of the features
    selected_feature = [
        'creativeID',
        # 'adID', \
        # 'camgaignID', \
        'advertiserID',
        # 'appID',
        'appPlatform',
        'appCategory',
        'app_popularity'
    ]
    ad_selected = ad[selected_feature]

    # Save
    util.safe_save(path_intermediate_dataset, hdf_ad_fg, ad_selected)

    del ad
    gc.collect()

    # Stop the timer and print related info
    util.print_stop(start)
Example #16
def f_userID():
    out_file = path_intermediate_dataset + hdf_userID
    if util.is_exist(out_file):
        return

    # Start the timer and print related info
    start = util.print_start(hdf_userID)

    # Load train.h5
    train_df = pd.read_hdf(path_intermediate_dataset + 'train.h5')

    # Extract the `conversion_count` feature from `userID`
    userID_count_positive = train_df.loc[train_df['label'] == 1,
                                         'userID'].value_counts()
    userID_count_positive.sort_index(inplace=True)

    userID_count_positive_df = DataFrame(userID_count_positive)
    userID_count_positive_df.reset_index(inplace=True)
    userID_count_positive_df.columns = ['userID', 'conversion_count']

    del userID_count_positive
    gc.collect()

    # Extract the `click_count_group` feature for `userID`
    userID_count = train_df['userID'].value_counts()
    userID_count.sort_index(inplace=True)

    userID_count_df = DataFrame(userID_count)
    userID_count_df.reset_index(inplace=True)
    userID_count_df.columns = ['userID', 'click_count']

    del userID_count
    gc.collect()

    # Group click_count into bins
    bins = [1, 28, 44, 120]
    userID_count_df['click_count_group'] = \
        pd.cut(userID_count_df['click_count'], bins=bins, include_lowest=True, labels=False)

    # Merge
    f_userID_df = userID_count_df.merge(userID_count_positive_df,
                                        how='left',
                                        on='userID')
    del userID_count_df
    del userID_count_positive_df
    gc.collect()

    # Fill the missing values with 0
    f_userID_df['conversion_count'].fillna(value=0, inplace=True)

    # Extract the `conversion_ratio` feature for `userID`
    f_userID_df['conversion_ratio_click'] = f_userID_df[
        'conversion_count'] / f_userID_df['click_count']

    # Save
    del f_userID_df['click_count']
    util.safe_save(path_intermediate_dataset, hdf_userID, f_userID_df)

    # Manually release memory
    del train_df
    gc.collect()

    # Stop the timer and print related info
    util.print_stop(start)