def find_l2_reg_embedding(linear_feature_columns, dnn_feature_columns,
                          train_model_input, train, test_model_input, test):
    """Train and score one WDL model per candidate ``l2_reg_embedding`` value.

    The candidates are read from ``config.param_rand['l2_reg_embedding']``.
    For every candidate a new WDL model is built, fitted on the training
    inputs and evaluated on the test inputs.

    Args:
        linear_feature_columns: Passed to ``WDL`` as its first positional
            argument.
        dnn_feature_columns: Passed to ``WDL`` as its second positional
            argument.
        train_model_input: Model input handed to ``model.fit``.
        train: Training data; labels are taken as ``train[target].values``
            (``target`` is a module-level name).
        test_model_input: Model input handed to ``model.predict``.
        test: Test data; labels are taken as ``test[target].values``.

    Returns:
        A ``pandas.DataFrame`` with one row per candidate (index
        ``0..n-1``) and the columns ``l2_reg_embedding``, ``RMSE``, ``MAE``,
        ``MSE`` and ``AUC``; every metric is rounded to 3 decimals.
    """
    cols = ['l2_reg_embedding', 'RMSE', 'MAE', 'MSE', 'AUC']
    candidates = config.param_rand['l2_reg_embedding']
    df_result = pd.DataFrame(columns=cols, index=range(len(candidates)))

    # The labels do not depend on the candidate, so extract them once.
    train_labels = train[target].values
    test_labels = test[target].values

    for i, x in enumerate(candidates):
        # TODO: add dnn_hidden_units as a second tuned parameter later.
        # NOTE(review): the model is built with task='binary' but compiled
        # with an 'mse' loss -- confirm this combination is intended.
        model = WDL(
            linear_feature_columns,
            dnn_feature_columns,
            dnn_hidden_units=(2, 2),
            l2_reg_linear=0.1,
            l2_reg_embedding=x,
            l2_reg_dnn=0,
            init_std=0.0001,
            seed=1024,
            task='binary')
        model.compile("adam", "mse", metrics=['mse'])
        model.fit(
            train_model_input,
            train_labels,
            batch_size=256,
            epochs=config.model_epoch['epoch'],
            verbose=2,
            validation_split=0.2,
        )
        pred_ans = model.predict(test_model_input, batch_size=256)

        auc = roc_auc_score(test_labels, pred_ans)
        mse = mean_squared_error(test_labels, pred_ans)
        mae = mean_absolute_error(test_labels, pred_ans)

        # Write with a single .loc[row, column] call. The previous
        # `df_result.loc[i].COL = value` form is chained assignment: it sets
        # the value on a temporary Series and is not guaranteed to reach
        # df_result (it never does under pandas Copy-on-Write).
        df_result.loc[i, 'l2_reg_embedding'] = x
        df_result.loc[i, 'RMSE'] = np.round(math.sqrt(mse), 3)
        df_result.loc[i, 'MAE'] = np.round(mae, 3)
        df_result.loc[i, 'MSE'] = np.round(mse, 3)
        df_result.loc[i, 'AUC'] = np.round(auc, 3)
    return df_result
def widendeep_model(linear_feature_columns, dnn_feature_columns,
                    train_model_input, train, test_model_input, test):
    """Train the configured Wide & Deep model, save it and report test metrics.

    Hyper-parameters are read from ``config.widendeep_att`` and the epoch
    count from ``config.model_epoch['epoch']``. After prediction the model is
    written to ``saved_widendeep.h5`` via ``save_model``.

    Args:
        linear_feature_columns: Passed to ``WDL`` as its first positional
            argument.
        dnn_feature_columns: Passed to ``WDL`` as its second positional
            argument.
        train_model_input: Model input handed to ``model.fit``.
        train: Training data; labels are taken as ``train[target].values``
            (``target`` is a module-level name).
        test_model_input: Model input handed to ``model.predict``.
        test: Test data; labels are taken as ``test[target].values``.

    Returns:
        A single-row ``pandas.DataFrame`` (index ``0``) with the columns
        ``model``, ``RMSE``, ``MAE``, ``MSE``, ``AUC`` and ``score``. Metrics
        are rounded to 3 decimals; ``score`` is not filled in by this
        function and keeps its initial missing value.
    """
    cols = ['model', 'RMSE', 'MAE', 'MSE', 'AUC', 'score']
    df_result = pd.DataFrame(columns=cols, index=range(1))

    model = WDL(
        linear_feature_columns,
        dnn_feature_columns,
        dnn_hidden_units=config.widendeep_att["dnn_hidden_units"],
        # The following settings are currently disabled, so WDL is called
        # without them:
        # l2_reg_linear=config.widendeep_att["l2_reg_linear"],
        # l2_reg_embedding=config.widendeep_att["l2_reg_embedding"],
        # l2_reg_dnn=config.widendeep_att["l2_reg_dnn"],
        # init_std=config.widendeep_att["init_std"],
        dnn_dropout=config.widendeep_att['dnn_dropout'],
        dnn_activation=config.widendeep_att['dnn_activation'],
        seed=config.widendeep_att["seed"],
        task=config.widendeep_att["task"])
    model.compile("adam", "mse", metrics=['mse'])
    model.fit(train_model_input,
              train[target].values,
              batch_size=256,
              epochs=config.model_epoch['epoch'],
              verbose=2,
              validation_split=0.2)
    pred_ans = model.predict(test_model_input, batch_size=256)
    save_model(model, 'saved_widendeep.h5')

    test_labels = test[target].values
    auc = roc_auc_score(test_labels, pred_ans)
    mse = mean_squared_error(test_labels, pred_ans)
    mae = mean_absolute_error(test_labels, pred_ans)

    # Write with a single .loc[row, column] call. The previous
    # `df_result.loc[0].COL = value` form is chained assignment: it sets the
    # value on a temporary Series and is not guaranteed to reach df_result
    # (it never does under pandas Copy-on-Write).
    df_result.loc[0, 'model'] = "Wide and Deep"
    df_result.loc[0, 'RMSE'] = np.round(math.sqrt(mse), 3)
    df_result.loc[0, 'MAE'] = np.round(mae, 3)
    df_result.loc[0, 'MSE'] = np.round(mse, 3)
    df_result.loc[0, 'AUC'] = np.round(auc, 3)
    return df_result
# Script: label-encode the sparse features, fit a WDL regression model on an
# 80/20 split of `data`, plot the training loss and print the test RMSE.
target = ['rating']

# Label-encode every sparse feature column in place.
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])

# One SparseFeat per sparse feature, sized by its number of distinct values.
fixlen_feature_columns = [
    SparseFeat(col, data[col].nunique()) for col in sparse_features
]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Split the dataset into a training set and a test set.
train, test = train_test_split(data, test_size=0.2)
train_model_input = {col: train[col].values for col in feature_names}
test_model_input = {col: test[col].values for col in feature_names}

# Train with WDL.
model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'])
train_labels = train[target].values
history = model.fit(
    train_model_input,
    train_labels,
    batch_size=256,
    epochs=100,
    verbose=True,
    validation_split=0.2,
)

# Plot the recorded loss, one point per entry in the training history.
plt.figure()
loss_curve = history.history['loss']
x = range(len(loss_curve))
plt.plot(x, loss_curve)
plt.title('loss')

# Predict with WDL.
pred_ans = model.predict(test_model_input, batch_size=256)

# Print the test RMSE, taken as the square root of the MSE rounded to
# 4 decimals.
test_labels = test[target].values
mse = round(mean_squared_error(test_labels, pred_ans), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)
# NOTE(review): this fragment starts inside a list comprehension; its opening
# (presumably the assignment to `fix_len_feature_columns`) is outside the
# visible source -- confirm against the full file.
    SparseFeat(feature, data[feature].nunique()) for feature in sparse_features
]
# The same feature columns feed both the linear and the DNN part.
linear_feature_columns = fix_len_feature_columns
dnn_feature_columns = fix_len_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Split the dataset into a training set and a test set.
train, test = train_test_split(data, test_size=0.2)
train_set = {name: train[name].values for name in feature_names}
test_set = {name: test[name].values for name in feature_names}

# Train with WDL.
model = WDL(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile('adam', 'mse', metrics=['mse'])
history = model.fit(train_set, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2)

# Predict with WDL.
pred_ans = model.predict(test_set, batch_size=256)

# Print the test RMSE, taken as the square root of the MSE rounded to
# 4 decimals.
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse**0.5
print(f'test rmse: {rmse}')
# Script: score the validation sample with `model`, write one
# "row_number,label,prediction" line per row to data/pctr and print the AUC.
data = pd.read_csv("./data/sample/validation.txt")

# 1. Label-encode the sparse features. (No dense-feature transformation is
#    performed in this fragment.)
# NOTE(review): the encoders are fitted on the validation data itself;
# presumably the resulting ids must match the encoding used when `model` was
# trained -- confirm against the training code.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# 2. Count the unique values of each sparse field.
sparse_feature_dim = {
    feat: data[feat].nunique()
    for feat in sparse_features
}

# 3. Build the model input: one array per sparse field, in dict order.
model_input = [data[feat].values for feat in sparse_feature_dim]

pred = model.predict(model_input, batch_size, 1)
label = data[target].values.flatten().tolist()
pred = pred.flatten().tolist()

# The `with` block closes the file on exit, so no explicit close() is needed.
with open('data/pctr', 'w') as fw:
    for i, pred_value in enumerate(pred):
        label_value = label[i]
        # Progress sample: echo every 10000th row to stdout.
        if i % 10000 == 0:
            print('label: %f, pred: %f' % (label_value, pred_value))
        # Row numbers in the output file are 1-based.
        to_write = str(i + 1) + ',' + str(label_value) + ',' + str(
            pred_value) + '\n'
        fw.write(to_write)

AUC = auc.auc(label, pred)
print('auc: %f' % AUC)
print("demo done")