def plot_nfm():
    """Build an NFM model on a small slice of the Criteo sample and save its architecture diagram."""
    # Read the Criteo sample; trim to a handful of features so the diagram stays readable.
    data, dense_features, sparse_features = read_criteo_data()
    dense_features = dense_features[:3]
    sparse_features = sparse_features[:2]

    def build_columns():
        # Mark categorical columns with SparseFeat and numeric ones with DenseFeat.
        cols = [
            SparseFeat(name, vocabulary_size=data[name].nunique(), embedding_dim=4)
            for name in sparse_features
        ]
        cols.extend(DenseFeat(name, 1) for name in dense_features)
        return cols

    # The linear part and the DNN part use the same feature grouping here
    # (in a real scenario they may be chosen differently).
    linear_feature_columns = build_columns()
    dnn_feature_columns = build_columns()

    # Build the NFM model and dump its topology as an image.
    history = NFM(linear_feature_columns, dnn_feature_columns)
    keras.utils.plot_model(history, to_file="./imgs/NFM.png", show_shapes=True)
def plot_dien():
    """Build a DIEN model from the movie sample and save its architecture diagram."""
    # Load the raw movie interaction sample (tab-separated, no header).
    samples_data = pd.read_csv("data/movie_sample.txt", sep="\t", header=None)
    samples_data.columns = [
        "user_id", "gender", "age", "hist_movie_id", "hist_len",
        "movie_id", "movie_type_id", "label",
    ]

    # Split into inputs and target (kept for parity with the training script;
    # X and y are not used further in this plotting helper).
    X = samples_data[[
        "user_id", "gender", "age", "hist_movie_id", "hist_len",
        "movie_id", "movie_type_id",
    ]]
    y = samples_data["label"]

    def vocab_size(col):
        # Vocabulary size = max id + 1 (ids presumably 0-based -- TODO confirm).
        return max(samples_data[col]) + 1

    # Feature wrapping: one SparseFeat per id column, plus the dense history length.
    feature_columns = [
        SparseFeat(name, vocab_size(name), embedding_dim=8)
        for name in ("user_id", "gender", "age", "movie_id", "movie_type_id")
    ]
    feature_columns.append(DenseFeat('hist_len', 1))

    # Positive and negatively-sampled behaviour sequences share the movie vocabulary.
    for seq_name in ('hist_movie_id', 'neg_hist_movie_id'):
        feature_columns.append(
            VarLenSparseFeat(seq_name,
                             vocabulary_size=vocab_size("movie_id"),
                             embedding_dim=8,
                             maxlen=50))

    behavior_feature_list = ['movie_id']           # base behaviour feature
    behavior_seq_feature_list = ['hist_movie_id']  # behaviour sequence feature
    neg_seq_feature_list = ['neg_hist_movie_id']   # negative-sample sequence feature

    # Build DIEN (the original comment mislabelled this as DIN) and plot it.
    history = DIEN(feature_columns, behavior_feature_list,
                   behavior_seq_feature_list, neg_seq_feature_list,
                   use_neg_sample=True)
    keras.utils.plot_model(history, to_file="./imgs/DIEN.png", show_shapes=True)
def plot_pnn():
    """Build a PNN model on a small slice of the Criteo sample and save its architecture diagram."""
    # Read the Criteo sample; keep only three dense and three sparse features.
    data, dense_features, sparse_features = read_criteo_data()
    dense_features = dense_features[:3]
    sparse_features = sparse_features[:3]

    # PNN only needs the DNN feature group: SparseFeat for categorical columns,
    # DenseFeat for numeric ones.
    dnn_feature_columns = []
    for feat in sparse_features:
        dnn_feature_columns.append(
            SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4))
    for feat in dense_features:
        dnn_feature_columns.append(DenseFeat(feat, 1))

    # Build PNN (the original comment mislabelled this as DeepCrossing) and plot it.
    history = PNN(dnn_feature_columns)
    keras.utils.plot_model(history, to_file="./imgs/PNN.png", show_shapes=True)
def plot_din():
    """Build a DIN model from the movie sample and save its architecture diagram."""
    # Load the raw movie interaction sample (tab-separated, no header).
    samples_data = pd.read_csv("./data/movie_sample.txt", sep="\t", header=None)
    samples_data.columns = [
        "user_id", "gender", "age", "hist_movie_id", "hist_len",
        "movie_id", "movie_type_id", "label",
    ]

    def vocab_size(col):
        # Vocabulary size = max id + 1 (ids presumably 0-based -- TODO confirm).
        return max(samples_data[col]) + 1

    # One SparseFeat per id column, plus the dense history length.
    feature_columns = [
        SparseFeat(name, vocab_size(name), embedding_dim=8)
        for name in ("user_id", "gender", "age", "movie_id", "movie_type_id")
    ]
    feature_columns.append(DenseFeat('hist_len', 1))

    # Variable-length behaviour sequence over the movie vocabulary.
    feature_columns.append(
        VarLenSparseFeat('hist_movie_id',
                         vocabulary_size=vocab_size("movie_id"),
                         embedding_dim=8,
                         maxlen=50))

    behavior_feature_list = ['movie_id']           # base behaviour feature
    behavior_seq_feature_list = ['hist_movie_id']  # behaviour sequence feature

    # Build the DIN model and plot it.
    history = DIN(feature_columns, behavior_feature_list, behavior_seq_feature_list)
    keras.utils.plot_model(history, to_file="./imgs/DIN.png", show_shapes=True)
# 划分dense和sparse特征 columns = data.columns.values dense_features = [feat for feat in columns if 'I' in feat] sparse_features = [feat for feat in columns if 'C' in feat] # 简单的数据预处理 train_data = data_process(data, dense_features, sparse_features) train_data['label'] = data['label'] # 将特征分组,分成linear部分和dnn部分(根据实际场景进行选择),并将分组之后的特征做标记(使用DenseFeat, SparseFeat) linear_feature_columns = [ SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4) for i, feat in enumerate(sparse_features) ] + [DenseFeat( feat, 1, ) for feat in dense_features] dnn_feature_columns = [ SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4) for i, feat in enumerate(sparse_features) ] + [DenseFeat( feat, 1, ) for feat in dense_features] # 构建xDeepFM模型 model = xDeepFM(linear_feature_columns, dnn_feature_columns) model.summary() model.compile( optimizer="adam",
"age": np.array(X["age"]), \ "hist_movie_id": np.array([[int(i) for i in l.split(',')] for l in X["hist_movie_id"]]), \ "neg_hist_movie_id": np.array([[int(i) for i in l.split(',')] for l in X["neg_hist_movie_id"]]), \ "hist_len": np.array(X["hist_len"]), \ "movie_id": np.array(X["movie_id"]), \ "movie_type_id": np.array(X["movie_type_id"])} y_train = np.array(y) """特征封装""" feature_columns = [SparseFeat('user_id', max(samples_data["user_id"])+1, embedding_dim=8), SparseFeat('gender', max(samples_data["gender"])+1, embedding_dim=8), SparseFeat('age', max(samples_data["age"])+1, embedding_dim=8), SparseFeat('movie_id', max(samples_data["movie_id"])+1, embedding_dim=8), SparseFeat('movie_type_id', max(samples_data["movie_type_id"])+1, embedding_dim=8), DenseFeat('hist_len', 1)] feature_columns += [VarLenSparseFeat('hist_movie_id', vocabulary_size=max(samples_data["movie_id"])+1, embedding_dim=8, maxlen=50)] feature_columns += [VarLenSparseFeat('neg_hist_movie_id', vocabulary_size=max(samples_data["movie_id"])+1, embedding_dim=8, maxlen=50)] # 行为特征列表,表示的是基础特征 behavior_feature_list = ['movie_id'] # 行为序列特征 behavior_seq_feature_list = ['hist_movie_id'] # 负采样序列特征 neg_seq_feature_list = ['neg_hist_movie_id'] """构建DIN模型""" history = DIEN(feature_columns, behavior_feature_list, behavior_seq_feature_list, neg_seq_feature_list, use_neg_sample=True) history.compile('adam', 'binary_crossentropy')
if __name__ == "__main__":
    # Load the raw Criteo sample.
    data = pd.read_csv('./data/criteo_sample.txt')

    # Partition columns into dense ('I...') and sparse ('C...') features.
    columns = data.columns.values
    dense_features = [feat for feat in columns if 'I' in feat]
    sparse_features = [feat for feat in columns if 'C' in feat]

    # Light preprocessing, then re-attach the label column.
    train_data = data_process(data, dense_features, sparse_features)
    train_data['label'] = data['label']

    def make_columns():
        # Wrap features as SparseFeat/DenseFeat (linear and DNN branches use
        # the same grouping here; a real scenario may choose differently).
        cols = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                for feat in sparse_features]
        cols += [DenseFeat(feat, 1) for feat in dense_features]
        return cols

    linear_feature_columns = make_columns()
    dnn_feature_columns = make_columns()

    # Build and compile the DCN model.
    history = DCN(linear_feature_columns, dnn_feature_columns)
    history.summary()
    history.compile(optimizer="adam",
                    loss="binary_crossentropy",
                    metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])

    # Model inputs as a name -> column dictionary.
    train_model_input = {name: data[name] for name in dense_features + sparse_features}
# Load the raw Criteo sample.
data = pd.read_csv('./data/criteo_sample.txt')

# Partition columns into dense ("I...") and sparse ("C...") feature groups.
columns = data.columns.values
dense_features = [feat for feat in columns if "I" in feat]
sparse_features = [feat for feat in columns if "C" in feat]

# Light preprocessing, then re-attach the label column.
train_data = data_process(data, dense_features, sparse_features)
train_data['label'] = data['label']

# Group the features into a linear part and a DNN part (chosen per scenario)
# and tag them with SparseFeat / DenseFeat.
linear_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                          for feat in sparse_features] + \
                         [DenseFeat(feat, 1) for feat in dense_features]

# BUG FIX: the DNN branch previously passed `data[feat]` (a whole column) as
# the vocabulary size and iterated `enumerate(sparse_features)` (yielding
# (index, name) tuples). Use the distinct-value count per column, mirroring
# the linear branch above.
dnn_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                       for feat in sparse_features] + \
                      [DenseFeat(feat, 1) for feat in dense_features]

# Build and compile the NFM model.
history = NFM(linear_feature_columns, dnn_feature_columns)
history.summary()
# BUG FIX: the metric name was misspelled "bianry_crossentropy", which Keras
# rejects as an unknown metric.
history.compile(optimizer="adam",
                loss="binary_crossentropy",
                metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])

# Model inputs as a name -> column dictionary.
train_model_input = {name: data[name] for name in dense_features + sparse_features}
# Model training (continues below).