def load_data(path):
    """
    Load the raw data and merge the user and item files into the behavior log.
    :param path: directory containing the CSV files
    :return: (user, item, data) DataFrames
    """
    user = reduce_mem_usage(pd.read_csv(path + 'user.csv', header=None))
    item = reduce_mem_usage(pd.read_csv(path + 'df_item.csv'))
    user.columns = ['userID', 'sex', 'age', 'ability']
    # Load the user behavior log for days 1 through 16
    data = pd.read_csv(path + 'df_behavior.csv')
    # Cast itemID to int64
    data['itemID'] = data['itemID'].astype('int64')
    data = reduce_mem_usage(data)
    # Merge the item and user attributes into the behavior log
    data = pd.merge(left=data, right=item, on='itemID', how='left')
    data = pd.merge(left=data, right=user, on='userID', how='left')
    return user, item, data
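# A minimal usage sketch; the '../data/' path is an assumption inferred from
# the other functions in this module, not confirmed by the original code.
user, item, data = load_data('../data/')
print(data.shape)
print(data.columns.tolist())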
def get_underline_labels():
    """
    Build the labels for the offline training set: day 15 of the window,
    which is day 21 in the raw log.
    Returns a DataFrame with columns ['userID', 'itemID', 'behavior'].
    :return: train_labels
    """
    df = pd.read_csv("../data/df_behavior_train.csv")
    df = df[df['day'] == 21]
    train_labels = df[['userID', 'itemID', 'behavior']]
    train_labels = reduce_mem_usage(train_labels)
    return train_labels
def get_users_log_function():
    """
    Load the day 1-16 user behavior log and index it by user.
    :return: dict mapping userID -> list of (itemID, behavior, day) tuples
    """
    filename = "../data/df_behavior.csv"
    df_behavior = pd.read_csv(filename)
    df_behavior = reduce_mem_usage(df_behavior)
    # .values converts the frame to a float ndarray, hence the int() casts below
    matrix = df_behavior[['userID', 'itemID', 'behavior', 'day']].values
    users_log = dict()
    for row in matrix:
        users_log.setdefault(int(row[0]), [])
        users_log[int(row[0])].append((int(row[1]), int(row[2]), int(row[3])))
    return users_log
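# A minimal equivalent sketch for the loop above using groupby, assuming the
# four columns are numeric in the raw file (as the int() casts imply); it
# avoids the float round-trip that .values introduces.
cols = df_behavior[['userID', 'itemID', 'behavior', 'day']].astype('int64')
users_log = {uid: list(zip(g['itemID'], g['behavior'], g['day']))
             for uid, g in cols.groupby('userID')}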
def generate_online_train_user_sets():
    """
    Build the user set for the online training split, i.e. every user from
    days 1 through 15 who should receive recommendations.
    Format: {userID: [(i1, i2, i3), ...]} where i1 is the item ID, i2 the
    behavior strength, and i3 the day the behavior happened.
    :return: user_logs
    """
    filename = "../data/df_behavior_train.csv"
    train_data_df = pd.read_csv(filename)
    train_data_df = reduce_mem_usage(train_data_df)
    # .values returns a 2-D float ndarray, hence the int() casts below
    matrix = train_data_df[['userID', 'itemID', 'behavior', 'day']].values
    user_logs = dict()
    for row in matrix:
        user_logs.setdefault(int(row[0]), [])
        user_logs[int(row[0])].append((int(row[1]), int(row[2]), int(row[3])))
    return user_logs
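# A minimal usage sketch, assuming downstream recall code wants each user's
# events in chronological order: sort the per-user tuples by day (index 2).
user_logs = generate_online_train_user_sets()
for uid in user_logs:
    user_logs[uid].sort(key=lambda t: t[2])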
def get_label_function():
    """
    Load the day-16 user behavior log; it is joined with the day 1-15 recall
    lists to produce the labels.
    Returns a DataFrame with columns (userID, itemID, behavior).
    :return: right_datas
    """
    filename = "../data/df_behavior_test.csv"
    right_data = pd.read_csv(filename)
    right_datas = right_data[['userID', 'itemID', 'behavior']]
    # Cast itemID to int64
    right_datas = right_datas.astype({'itemID': 'int64'})
    # Map behavior strings to ordinal strengths; .loc[mask, col] selects the
    # masked rows of that column
    right_datas.loc[right_datas['behavior'] == 'pv', 'behavior'] = 1
    right_datas.loc[right_datas['behavior'] == 'fav', 'behavior'] = 2
    right_datas.loc[right_datas['behavior'] == 'cart', 'behavior'] = 3
    right_datas.loc[right_datas['behavior'] == 'buy', 'behavior'] = 4
    right_datas = reduce_mem_usage(right_datas)
    return right_datas
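# A minimal equivalent sketch using Series.map, assuming 'pv', 'fav', 'cart'
# and 'buy' are the only behavior values in the file.
behavior_strength = {'pv': 1, 'fav': 2, 'cart': 3, 'buy': 4}
right_datas['behavior'] = right_datas['behavior'].map(behavior_strength)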
I don't think the split should be "dynamic" vs. "static" features; it should
be offline vs. online, because on the whole the feature values computed from
different behavior windows should differ!
"""
from code_file.utils import reduce_mem_usage
import numpy as np
import pandas as pd

# Load the raw behavior log, i.e. days 7 through 21
# Columns: userID, behavior, timestap, itemID, date, day
data = pd.read_csv("../data/df_behavior_train.csv")
"""To save time, keep only days 19 through 21."""
data = data[data['day'] >= 19]
data = reduce_mem_usage(data)
# Load the user data
user = pd.read_csv("../data/user.csv", header=None)
user.columns = ['userID', 'sex', 'age', 'ability']
user = reduce_mem_usage(user)
# Load the item data; columns: categoryID, shopID, brandID, itemID
item = pd.read_csv("../data/df_item.csv")
item = reduce_mem_usage(item)
# Merge the user and item attributes into the log for the statistics below
data = pd.merge(left=data, right=user, on=['userID'], how='left')
data = pd.merge(left=data, right=item, on=['itemID'], how='left')
# First come the user-item cross features, e.g. behavior strength grouped by
# (userID, shopID); a minimal sketch follows
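# A minimal sketch of one such cross feature under stated assumptions: the
# column name 'user_shop_count' is illustrative, and count() is used so the
# sketch works whether 'behavior' is still a string or already an ordinal.
user_shop = (data.groupby(['userID', 'shopID'])['behavior']
                 .count()
                 .reset_index()
                 .rename(columns={'behavior': 'user_shop_count'}))
data = pd.merge(left=data, right=user_shop, on=['userID', 'shopID'], how='left')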
import numpy as np
from sklearn import preprocessing
from code_file.utils import reduce_mem_usage

# Hyperparameters
FEATURE_SIZE = 601221   # total number of features after encoding
FIELD_SIZE = 9          # number of feature fields
EMBEDDING_SIZE = 8      # dimensionality of each embedding vector
BATCH_SIZE = 1024
"""Build the training set."""
"""
The full feature list is ['userID', 'itemID', 'sim', 'label', 'behavior',
'day', 'sex', 'age', 'ability', 'categoryID', 'shopID', 'brandID'].
"""
data_index, data_value = deal_underline_train_data()
data_index = reduce_mem_usage(data_index)
data_value = reduce_mem_usage(data_value)
"""A step that was missing earlier: normalize the continuous features."""
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0.1, 1))
# The continuous features are 'sim' and 'age'
tmp1 = np.reshape(np.array(data_value['sim']), (-1, 1))
tmp2 = np.reshape(np.array(data_value['age']), (-1, 1))
data_value['sim'] = min_max_scaler.fit_transform(tmp1)
data_value['age'] = min_max_scaler.fit_transform(tmp2)
# Extract the labels
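# A minimal sketch, under the assumption that prediction-time scaling should
# match training: fit one scaler per continuous column on the training data
# and reuse it at predict time instead of calling fit_transform again (as
# predict_function below currently does). sim_scaler/age_scaler are
# illustrative names, not from the original code.
sim_scaler = preprocessing.MinMaxScaler(feature_range=(0.1, 1)).fit(tmp1)
age_scaler = preprocessing.MinMaxScaler(feature_range=(0.1, 1)).fit(tmp2)
data_value['sim'] = sim_scaler.transform(tmp1)
data_value['age'] = age_scaler.transform(tmp2)
# At predict time: test_value['sim'] = sim_scaler.transform(test_sim_column)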
""" df_item = pd.read_csv(filename_item, header=None) df_item.columns = ['itemID', 'categoryID', 'shopID', 'brandID'] le = preprocessing.LabelEncoder() df_item['itemID_Encoding'] = le.fit_transform(df_item['itemID']) # print(df_item.head(10)) print(df_item.shape) # (4318202, 5) print(df_item['itemID_Encoding'].min()) # 0 print(df_item['itemID_Encoding'].max()) # 4318202 df_behavior = pd.read_csv(filename_behavior, header=None) # 在此处优化存储空间 df_behavior = reduce_mem_usage(df_behavior) df_behavior = df_behavior.iloc[:, 1:5] df_behavior.columns = ['userID', 'itemID', 'behavior', 'timestap'] df_behavior = df_behavior.merge(df_item, on='itemID', how='left') df_behavior.drop(df_behavior[np.isnan(df_behavior['itemID_Encoding'])].index, inplace=True) # 删除指定列含有NaN值 # 删除多余的列 df_behavior = df_behavior.drop(['itemID', 'categoryID', 'shopID', 'brandID'], axis=1) df_behavior['itemID'] = df_behavior['itemID_Encoding'] df_behavior = df_behavior.drop(['itemID_Encoding'], axis=1) print(df_behavior.head()) # print(df_behavior['itemID'].min()) # 11 print(df_behavior['itemID'].max()) # 4318196 print(df_behavior.shape) # (8047545, 4) # 在对于df_item进行处理
def generate_all_feature():
    """
    Build the full feature set for the online training split, i.e. the
    features for days 1 through 15.
    :return: recall_train
    """
    # Load the day 1-15 recall results
    train_data = pd.read_csv(filename_recall)
    train_data = reduce_mem_usage(train_data)
    train_data = train_data.fillna(0)  # fill NaN labels with 0
    """
    train_data has the format [userID, itemID, sim, label]:
    userID is the user, itemID a candidate item, sim the user's affinity for
    that item, and label whether the user actually interacted with it on
    day 16 (1-3 means interacted, 0 means not).
    """
    # Negative down-sampling
    recall_train = down_sample(train_data, 10)
    recall_train['label'] = recall_train['label'].apply(transfer_label)
    # Left-join the raw item and user files
    user_data = pd.read_csv("../data/user.csv", header=None)
    user_data.columns = ['userID', 'sex', 'age', 'ability']
    # df_item.csv columns: categoryID, shopID, brandID, itemID
    item_data = pd.read_csv("../data/df_item.csv")
    recall_train = pd.merge(left=recall_train, right=user_data, on=['userID'], how='left', sort=False)
    recall_train = pd.merge(left=recall_train, right=item_data, on=['itemID'], how='left', sort=False)
    """
    After these merges the features are:
    userID, itemID, sim, label, sex, age, ability, categoryID, shopID, brandID
    """
    """
    Join the previously generated statistical features:
    the category_higher and item.higher statistics.
    """
    # Columns: ['categoryID', 'category_median', 'category_std']
    category_feature = pd.read_csv('../statistics_feature/category_higher.csv')
    # Columns: ['itemID', 'item_median', 'item_std']
    item_feature = pd.read_csv('../statistics_feature/item.higher.csv')
    recall_train = pd.merge(left=recall_train, right=category_feature, on=['categoryID'], how='left')
    recall_train = pd.merge(left=recall_train, right=item_feature, on=['itemID'], how='left')
    # Join the four count features
    item_ID_feature = pd.read_csv('../statistics_feature/itemID_count.csv')
    category_ID_feature = pd.read_csv("../statistics_feature/categoryID_count.csv")
    shop_ID_feature = pd.read_csv("../statistics_feature/shopID_count.csv")
    brand_ID_feature = pd.read_csv("../statistics_feature/brandID_count.csv")
    recall_train = pd.merge(left=recall_train, right=item_ID_feature, on=["itemID"], how="left")
    recall_train = pd.merge(left=recall_train, right=category_ID_feature, on=["categoryID"], how="left")
    recall_train = pd.merge(left=recall_train, right=shop_ID_feature, on=["shopID"], how="left")
    recall_train = pd.merge(left=recall_train, right=brand_ID_feature, on=["brandID"], how="left")
    """
    The final feature set is:
    userID, itemID, sim, label, sex, age, ability, categoryID, shopID, brandID,
    category_median, category_std, item_median, item_std,
    itemID_sum, categoryID_sum, shopID_sum, brandID_sum
    """
    return recall_train
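# A minimal sanity-check sketch, assuming the statistics files may not cover
# every ID so the left joins can introduce NaNs; fillna(0) is a simple
# illustrative choice, not from the original code.
recall_train = generate_all_feature()
print(recall_train.isnull().sum())
recall_train = recall_train.fillna(0)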
def predict_function(data_index, data_value):
    """
    Run the saved DeepFM model on the preprocessed test set.
    :param data_index: feature-index DataFrame
    :param data_value: feature-value DataFrame
    :return: predictions of shape (-1, 1)
    """
    STEPS = len(data_index) // BATCH_SIZE
    """Preprocess the test set."""
    data_index = reduce_mem_usage(data_index)
    data_value = reduce_mem_usage(data_value)
    """Normalize the continuous features ('sim' and 'age'), as in training."""
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0.1, 1))
    tmp1 = np.reshape(np.array(data_value['sim']), (-1, 1))
    tmp2 = np.reshape(np.array(data_value['age']), (-1, 1))
    data_value['sim'] = min_max_scaler.fit_transform(tmp1)
    data_value['age'] = min_max_scaler.fit_transform(tmp2)
    feature = [x for x in data_index.columns if x not in ['userID', 'itemID', 'label']]
    # Keep only the model's input columns
    data_index = np.array(data_index[feature])
    data_value = np.array(data_value[feature])
    # Path to the saved DeepFM checkpoint
    deepfm_model = '../deepfm_save_model/deepfm_model_saver.ckpt'
    # deepfm_model = '../deepfm_real_embeddings_save/deepfm_model_saver.ckpt'
    y_pred = None
    with tf.Session() as sess:
        saver = tf.train.Saver()
        saver.restore(sess, deepfm_model)
        # Predict batch by batch
        for step in range(STEPS):
            pred_temp = sess.run(out, feed_dict={
                feature_index: data_index[step * BATCH_SIZE:(step + 1) * BATCH_SIZE],
                feature_value: data_value[step * BATCH_SIZE:(step + 1) * BATCH_SIZE],
                iS_training: False})
            """Append this batch's predictions."""
            pred_temp = np.reshape(pred_temp, (-1, 1))
            if y_pred is None:
                y_pred = pred_temp
            else:
                y_pred = np.concatenate((y_pred, pred_temp), axis=0)
        """Handle the remainder that does not fill a whole batch."""
        print(step)  # 80221, while STEPS totals 80222
        pred_last = sess.run(out, feed_dict={
            feature_index: data_index[(step + 1) * BATCH_SIZE:],
            feature_value: data_value[(step + 1) * BATCH_SIZE:],
            iS_training: False})
        pred_last = np.reshape(pred_last, (-1, 1))
        y_pred = np.concatenate((y_pred, pred_last), axis=0)
    return y_pred  # shape (-1, 1)
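# A minimal alternative batching sketch, assuming the same graph objects
# (out, feature_index, feature_value, iS_training) and an open session sess:
# ceil division folds the remainder into the main loop, so the special-case
# tail pass above becomes unnecessary.
import math
steps = math.ceil(len(data_index) / BATCH_SIZE)
preds = []
for step in range(steps):
    lo, hi = step * BATCH_SIZE, (step + 1) * BATCH_SIZE  # slicing past the end is safe
    preds.append(sess.run(out, feed_dict={
        feature_index: data_index[lo:hi],
        feature_value: data_value[lo:hi],
        iS_training: False}))
y_pred = np.concatenate([np.reshape(p, (-1, 1)) for p in preds], axis=0)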