def train(data_dir):
    """Train a DeepFM model on the Criteo dataset stored under *data_dir*.

    Loads per-field feature sizes and the train/test DataLoaders, builds the
    model from a hard-coded hyper-parameter DotMap, initialises all parameters
    with N(0, 0.01), and hands off to ``train_model``.

    Args:
        data_dir: directory containing ``sample_feature_sizes.npy`` and the
            preprocessed Criteo data expected by ``load_criteo_data``.
    """
    # Load the dataset: per-field vocabulary sizes saved during preprocessing.
    feature_sizes = np.load(os.path.join(data_dir, 'sample_feature_sizes.npy'))
    batch_size = 1024
    start = time.time()
    train_loader, test_loader = load_criteo_data(data_dir, batch_size)
    print('finish loading data, time={}'.format(time.time() - start))

    args = DotMap({
        'field_size': 39,                     # Criteo: 13 numeric + 26 categorical fields
        'feature_sizes': sum(feature_sizes),  # total vocabulary size over all fields
        'embedding_size': 64,
        'use_lr': False,
        'use_fm': True,
        'is_shallow_dropout': False,
        'dropout_shallow': [0.5, 0.5],
        'use_deep': True,
        'deep_layers': [512, 128, 32],
        'is_deep_dropout': True,
        'dropout_deep': [0, 0.5, 0.5, 0.5],
        'is_batch_norm': True,
        'batch_size': batch_size,
        'wd': 0,                              # weight decay
        'device': 'cuda',                     # cpu / cuda
        'epochs': 20,
        'lr': 0.0001,                         # learning rate
        'log_interval': 100,                  # log interval (in batches)
        'save_model': False,
        'eval_metric': roc_auc_score
    })
    print(args)

    net = DeepFM(args)

    # Initialise every parameter with mean 0, std 0.01.
    # NOTE(review): this also re-initialises BatchNorm scale/shift parameters
    # (normally 1 and 0), which can hurt convergence — confirm intentional.
    for params in net.parameters():
        init.normal_(params, mean=0, std=0.01)

    print("training on ", args.device)
    model = net.to(args.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.wd)
    # Binary classification on a single raw logit output.
    loss = F.binary_cross_entropy_with_logits
    train_model(args, model, loss, train_loader, test_loader, optimizer)
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Run K-fold cross-validation for a DeepFM / FM / DNN model.

    A fresh model is fitted per fold; out-of-fold predictions fill
    ``y_train_meta`` and per-fold test predictions are averaged into
    ``y_test_meta``. A submission file, learning-curve plot and the learned
    feature embeddings are written out as side effects.

    Args:
        dfTrain, dfTest: raw train/test DataFrames.
        folds: iterable of (train_idx, valid_idx) index pairs.
        dfm_params: model hyper-parameters; mutated in place with
            ``feature_size`` and ``field_size``.

    Returns:
        (y_train_meta, y_test_meta): (n, 1) arrays of predictions.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, y_test = data_parser.parse(df=dfTest, has_label=True)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    eval_metric = dfm_params["eval_metric"]

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]

    eval_metric_results_cv = np.zeros(len(folds), dtype=float)
    eval_metric_results_epoch_train = np.zeros(
        (len(folds), dfm_params["epoch"]), dtype=float)
    eval_metric_results_epoch_valid = np.zeros(
        (len(folds), dfm_params["epoch"]), dtype=float)

    dfm = None  # holds the last fold's model for the embedding export below
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_ = _get(Xi_train, train_idx)
        Xv_train_ = _get(Xv_train, train_idx)
        y_train_ = _get(y_train, train_idx)
        Xi_valid_ = _get(Xi_train, valid_idx)
        Xv_valid_ = _get(Xv_train, valid_idx)
        y_valid_ = _get(y_train, valid_idx)

        # Build a fresh model for every fold. (The original also built one
        # model before the loop that was immediately discarded — removed.)
        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        eval_metric_results_cv[i] = eval_metric(y_valid_,
                                                y_train_meta[valid_idx])
        eval_metric_results_epoch_train[i] = dfm.train_result
        eval_metric_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # Save results.
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # Original code left clf_str unbound here and crashed later with a
        # NameError; fail fast with a clear message instead.
        raise ValueError(
            "dfm_params must enable at least one of use_fm / use_deep")

    print("%s: %.5f (%.5f)" % (clf_str, eval_metric_results_cv.mean(),
                               eval_metric_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str,
                                            eval_metric_results_cv.mean(),
                                            eval_metric_results_cv.std())
    # NOTE(review): ids_test is not defined in this function — presumably a
    # module-level global; verify against the rest of the file.
    _make_submission(ids_test, y_test_meta, y_test, filename)
    _plot_fig(eval_metric_results_epoch_train, eval_metric_results_epoch_valid,
              clf_str)
    _export_embedding(dfm.sess, fd.feat_dict,
                      dfm.weights["feature_embeddings"],
                      config.CATEGORICAL_COLS, config.SUB_DIR)
    return y_train_meta, y_test_meta
def main(unused_argv):
    """Entry point: preprocess data, build the selected CTR model, train it,
    then evaluate on the held-out test set."""
    # Data load / preprocessing.
    FLAGS.dummy_cols = [
        'banner_pos', 'device_conn_type', 'C1', 'C15', 'C16', 'C18'
    ]
    dp = DataPreprocess(FLAGS.dummy_cols, FLAGS.numerical_cols,
                        FLAGS.target_colname, FLAGS.train_file,
                        FLAGS.test_file)
    train_features, train_labels = dp.parse_data(FLAGS.train_file)
    test_features, test_labels = dp.parse_data(FLAGS.test_file)
    # Peek at the parsed feature indices/values and labels.
    print(train_features['dfi'][:10])
    print(train_features['dfv'][:10])
    print(train_labels[:10])
    print('----------------------------------')
    feature_nums = dp.feature_nums
    field_nums = len(dp.all_cols)

    # Model definition: dispatch on the requested model type.
    model_classes = {
        'DCN': DCN,
        'DeepFM': DeepFM,
        'FM': FM,
        'FFM': FFM,
        'LR': LR,
    }
    try:
        model_cls = model_classes[FLAGS.model_type]
    except KeyError:
        # Original if/elif chain fell through silently on an unknown type and
        # crashed later with a NameError on `model`; fail fast instead.
        raise ValueError('unknown model_type: %r' % FLAGS.model_type)
    model = model_cls(feature_nums, field_nums, args=FLAGS)

    # Train.
    # NOTE(review): `config` is not defined in this function — presumably a
    # module-level tf.ConfigProto; verify.
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        train(sess, model, train_features, train_labels,
              batch_size=FLAGS.batch_size, epochs=FLAGS.epochs,
              checkpoint_dir=FLAGS.checkpoint_dir, log_dir=FLAGS.log_dirs)

    # Evaluate.
    evaluate(test_features, test_labels, checkpoint_dir=FLAGS.checkpoint_dir)
def train_DeepFM():
    """Train DeepFM with 5-fold stratified CV and return OOF predictions.

    Each fold checkpoints its best weights (by validation loss) to disk and
    reloads them before predicting, so a rerun resumes from saved weights.

    Returns:
        The out-of-fold prediction column (pd.Series) over the training set.
    """
    X, y, sparse_list, dense_list = feature_engineering.get_NN_data(
        use_over_sampler=True)
    data = pd.DataFrame(y)
    dnn_feature_columns = linear_feature_columns = sparse_list + dense_list
    feature_names = inputs.get_feature_names(linear_feature_columns +
                                             dnn_feature_columns)

    # K-fold CV.
    models = []
    scores = []
    # FIX: random_state only takes effect (and recent scikit-learn only
    # accepts it) when shuffle=True.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val = X.loc[tdx], X.loc[vdx]
        y_train, y_val = y.loc[tdx], y.loc[vdx]
        y_true = y_val
        # DeepCTR expects a dict of {feature_name: column values}.
        X_train = {name: X_train[name] for name in feature_names}
        X_val = {name: X_val[name] for name in feature_names}

        model = DeepFM(linear_feature_columns, dnn_feature_columns,
                       dnn_hidden_units=(128, 64), dnn_use_bn=True,
                       task='binary', dnn_dropout=0.5)
        # NOTE(review): hard-coded absolute path — consider making this
        # configurable.
        best_param_path = '/Users/a_piao/PycharmProjects/BankMarketing/workspace/DeepFM/best_param_DeepFM.py_%d.h5' % i
        if os.path.exists(best_param_path):
            model.load_weights(best_param_path)
        else:
            model.compile("adam", "binary_crossentropy",
                          metrics=['binary_crossentropy'])
            es = EarlyStopping(monitor='val_binary_crossentropy', mode='min',
                               patience=20)
            # FIX: save_best_only=True so the checkpoint holds the *best*
            # epoch. With False the file held the last epoch's weights, so
            # the load_weights() below did not restore the best model.
            mc = ModelCheckpoint(best_param_path,
                                 monitor='val_binary_crossentropy',
                                 mode='min', save_best_only=True,
                                 verbose=False, save_weights_only=True)
            model.fit(X_train, y_train, validation_data=(X_val, y_val),
                      batch_size=1024, epochs=1000, verbose=2,
                      callbacks=[es, mc])
            model.load_weights(best_param_path)

        y_pred = model.predict(X_val, batch_size=64).flatten()
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at %d fold: %f" % (i, auc))
        scores.append(auc)
        models.append(model)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-fold total mean_score:", mean_score)
    print("5-fold oof auc score:", oof)
    print("----train %s finish!----" % 'DeepFM')
    cal_roc_curve(data['y'], data['y_pred'], 'DeepFM')
    return data['y_pred']
# Balance the classes: keep all positives and down-sample the negatives to
# the same count (first len(train_1) negative rows).
train_0 = train[train.click == 0]
train_1 = train[train.click == 1]
train = pd.concat([train_1, train_0[0:len(train_1)]])

# Model inputs: one array per sparse feature, keyed by feature name.
train_model_input = {name: train[name].values for name in sparse_features}
test_model_input = {name: test[name].values for name in sparse_features}

# Register each sparse feature's vocabulary size and embedding width.
# (A "dense gradient" warning may appear — see
# https://stackoverflow.com/questions/35892412/tensorflow-dense-gradient-explanation)
feature_metas = FeatureMetas()
for col in sparse_features:
    feature_metas.add_sparse_feature(name=col,
                                     one_hot_dim=data[col].nunique(),
                                     embedding_dim=32)

# Build and compile the DeepFM model: all sparse features feed the linear,
# FM, and DNN components alike.
model = DeepFM(feature_metas=feature_metas,
               linear_slots=sparse_features,
               fm_slots=sparse_features,
               dnn_slots=sparse_features)
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=['binary_crossentropy'])

# Train with a 20% validation split.
history = model.fit(x=train_model_input,
                    y=train[target].values,
                    batch_size=128,
                    epochs=3,
                    verbose=2,
                    validation_split=0.2)

# Predict on the test set.
pred_ans = model.predict(test_model_input, batch_size=256)
feature_size = FM_train_data.feature_size
print("feature_size is " + str(feature_size))
# One DataLoader shared by both models: samples the first
# FM_TRAIN_DATA_NUMBER rows in random order, 50 per batch.
loader_train = DataLoader(FM_train_data, batch_size=50,
                          sampler=sampler.SubsetRandomSampler(
                              range(FM_TRAIN_DATA_NUMBER)))

""" FM """
FM_start_time = time.time()
FM_model = FM(feature_sizes=feature_size)
print("Now, lets train the model")
FM_model.fit(loader_train, epochs=50)
FM_end_time = time.time()
print("the end of training FM, time consume: %d" %
      (FM_end_time - FM_start_time))

""" DeepFM """
deepFM_start_time = time.time()
deepFM_model = DeepFM(feature_sizes=feature_size)
print("Now, lets train the model")
deepFM_model.fit(loader_train, epochs=50)
deepFM_end_time = time.time()
# FIX: message previously misspelled the model name as "deefFM".
print("the end of training DeepFM, and the time consumption: %d" %
      (deepFM_end_time - deepFM_start_time))