Example #1
# Assumed context: DeepFM, load_criteo_data and train_model are defined in the
# surrounding repo; the imports below cover everything else this snippet uses.
import os
import time

import numpy as np
import torch
import torch.nn.functional as F
from dotmap import DotMap
from sklearn.metrics import roc_auc_score
from torch.nn import init


def train(data_dir):
    # Read in the dataset's per-field feature-vocabulary sizes
    feature_sizes = np.load(os.path.join(data_dir, 'sample_feature_sizes.npy'))

    batch_size = 1024
    start = time.time()
    train_loader, test_loader = load_criteo_data(data_dir, batch_size)
    print('finished loading data, time={}'.format(time.time() - start))

    args = DotMap({
        'field_size': 39,
        'feature_sizes': sum(feature_sizes),
        'embedding_size': 64,
        'use_lr': False,
        'use_fm': True,
        'is_shallow_dropout': False,
        'dropout_shallow': [0.5, 0.5],
        'use_deep': True,
        'deep_layers': [512, 128, 32],
        'is_deep_dropout': True,
        'dropout_deep': [0, 0.5, 0.5, 0.5],
        'is_batch_norm': True,

        # 'random_seed': 666,
        'batch_size': batch_size,
        'wd': 0,
        'device': 'cuda',  # cpu / cuda
        'epochs': 20,
        'lr': 0.0001,  # learning_rate
        'log_interval': 100,  # log interval
        'save_model': False,
        'eval_metric': roc_auc_score
    })
    print(args)
    net = DeepFM(args)
    # print(net)

    # for blk in net.children():
    #     X = blk(X)
    #     print('output shape: ', X.shape)

    # Initialize parameters with mean 0 and std 0.01
    for params in net.parameters():
        init.normal_(params, mean=0, std=0.01)
    # for name, param in net.named_parameters():
    #     print(name, param)

    print("training on ", args.device)
    model = net.to(args.device)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.wd)
    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wd)
    # loss = torch.nn.CrossEntropyLoss()
    loss = F.binary_cross_entropy_with_logits

    train_model(args, model, loss, train_loader, test_loader, optimizer)
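train_model is a helper from the same repo. A minimal sketch of such a loop, assuming each batch from the loaders is a (features, labels) pair and that the model returns raw logits (both assumptions; the real helper may differ):

def train_model(args, model, loss, train_loader, test_loader, optimizer):
    # Hypothetical sketch, not the repo's implementation.
    for epoch in range(args.epochs):
        model.train()
        for batch_idx, (x, y) in enumerate(train_loader):
            x, y = x.to(args.device), y.float().to(args.device)
            optimizer.zero_grad()
            logits = model(x)                      # raw scores; no sigmoid here
            l = loss(logits.view(-1), y.view(-1))  # binary_cross_entropy_with_logits
            l.backward()
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                print('epoch {} batch {} loss {:.4f}'.format(epoch, batch_idx, l.item()))
        # Score the held-out set with args.eval_metric (roc_auc_score)
        model.eval()
        with torch.no_grad():
            scores, labels = [], []
            for x, y in test_loader:
                scores.append(torch.sigmoid(model(x.to(args.device))).cpu().view(-1))
                labels.append(y.view(-1))
            print('epoch {} test AUC {:.4f}'.format(
                epoch, args.eval_metric(torch.cat(labels).numpy(),
                                        torch.cat(scores).numpy())))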
Example #2
# Assumed context: numpy as np is imported, and FeatureDictionary, DataParser,
# DeepFM, config and the module-level ids_test used below are defined in the
# surrounding repo.
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, y_test = data_parser.parse(df=dfTest, has_label=True)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    eval_metric = dfm_params["eval_metric"]

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

    _get = lambda x, l: [x[i] for i in l]
    eval_metric_results_cv = np.zeros(len(folds), dtype=float)
    eval_metric_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    eval_metric_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        eval_metric_results_cv[i] = eval_metric(y_valid_, y_train_meta[valid_idx])
        eval_metric_results_epoch_train[i] = dfm.train_result
        eval_metric_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    print("%s: %.5f (%.5f)" % (clf_str, eval_metric_results_cv.mean(), eval_metric_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, eval_metric_results_cv.mean(), eval_metric_results_cv.std())
    _make_submission(ids_test, y_test_meta, y_test, filename)

    _plot_fig(eval_metric_results_epoch_train, eval_metric_results_epoch_valid, clf_str)

    _export_embedding(dfm.sess, fd.feat_dict, dfm.weights["feature_embeddings"],
                      config.CATEGORICAL_COLS, config.SUB_DIR)

    return y_train_meta, y_test_meta
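_make_submission, _plot_fig and _export_embedding are local helpers. A minimal sketch of the first, assuming pandas as pd and os are imported and config.SUB_DIR is the output directory:

def _make_submission(ids, y_pred, y_true, filename="submission.csv"):
    # Hypothetical sketch: write id/prediction pairs for the fold-averaged
    # test predictions. y_true is accepted for API parity but unused here.
    pd.DataFrame({"id": ids, "target": y_pred.flatten()}).to_csv(
        os.path.join(config.SUB_DIR, filename), index=False, float_format="%.5f")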
Example #3
# Assumed context: tensorflow as tf, FLAGS, DataPreprocess, the model classes
# (DCN, DeepFM, FM, FFM, LR) and the train/evaluate helpers come from the
# surrounding repo.
def main(unused_argv):
    # data load
    FLAGS.dummy_cols = [
        'banner_pos', 'device_conn_type', 'C1', 'C15', 'C16', 'C18'
    ]
    dp = DataPreprocess(FLAGS.dummy_cols, FLAGS.numerical_cols,
                        FLAGS.target_colname, FLAGS.train_file,
                        FLAGS.test_file)
    train_features, train_labels = dp.parse_data(FLAGS.train_file)
    test_features, test_labels = dp.parse_data(FLAGS.test_file)
    print(train_features['dfi'][:10])
    print(train_features['dfv'][:10])
    print(train_labels[:10])
    print('----------------------------------')

    feature_nums = dp.feature_nums
    field_nums = len(dp.all_cols)

    # model define
    if FLAGS.model_type == 'DCN':
        model = DCN(feature_nums, field_nums, args=FLAGS)
    elif FLAGS.model_type == 'DeepFM':
        model = DeepFM(feature_nums, field_nums, args=FLAGS)
    elif FLAGS.model_type == 'FM':
        model = FM(feature_nums, field_nums, args=FLAGS)
    elif FLAGS.model_type == 'FFM':
        model = FFM(feature_nums, field_nums, args=FLAGS)
    elif FLAGS.model_type == 'LR':
        model = LR(feature_nums, field_nums, args=FLAGS)
    else:
        raise ValueError('unknown model_type: %s' % FLAGS.model_type)

    # train (`config` here is a tf.ConfigProto assumed to be defined at module scope)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        train(sess,
              model,
              train_features,
              train_labels,
              batch_size=FLAGS.batch_size,
              epochs=FLAGS.epochs,
              checkpoint_dir=FLAGS.checkpoint_dir,
              log_dir=FLAGS.log_dirs)
    # evaluate
    evaluate(test_features, test_labels, checkpoint_dir=FLAGS.checkpoint_dir)
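The train helper referenced above is repo-specific. A minimal sketch, assuming os is imported and that the model object exposes dfi/dfv/label placeholders plus train_op and loss tensors (hypothetical names):

def train(sess, model, features, labels, batch_size, epochs,
          checkpoint_dir, log_dir):
    # Hypothetical sketch: plain mini-batch training with a final checkpoint.
    saver = tf.train.Saver()
    n = len(labels)
    for epoch in range(epochs):
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            feed = {model.dfi: features['dfi'][start:end],
                    model.dfv: features['dfv'][start:end],
                    model.label: labels[start:end]}
            _, batch_loss = sess.run([model.train_op, model.loss], feed_dict=feed)
        print('epoch %d, last batch loss %.5f' % (epoch, batch_loss))
    saver.save(sess, os.path.join(checkpoint_dir, 'model.ckpt'))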
Example #4
# Assumed context: DeepFM and get_feature_names (via the inputs module) come
# from the deepctr package; feature_engineering and cal_roc_curve are local
# helpers.
def train_DeepFM():
    X, y, sparse_list, dense_list = feature_engineering.get_NN_data(
        use_over_sampler=True)

    data = pd.DataFrame(y)
    dnn_feature_columns = linear_feature_columns = sparse_list + dense_list
    feature_names = inputs.get_feature_names(linear_feature_columns +
                                             dnn_feature_columns)

    # kFold cv
    models = []
    scores = []

    # random_state only takes effect with shuffle=True (newer sklearn raises otherwise)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val, y_train, y_val = X.loc[tdx], X.loc[vdx], y.loc[
            tdx], y.loc[vdx]
        y_true = y_val

        X_train = {name: X_train[name] for name in feature_names}
        X_val = {name: X_val[name] for name in feature_names}

        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       dnn_hidden_units=(128, 64),
                       dnn_use_bn=True,
                       task='binary',
                       dnn_dropout=0.5)

        best_param_path = '/Users/a_piao/PycharmProjects/BankMarketing/workspace/DeepFM/best_param_DeepFM.py_%d.h5' % i
        # best_param_path = 'best_param_%s_%d.h5' % (os.path.basename(__file__), i)

        if os.path.exists(best_param_path):
            model.load_weights(best_param_path)
        else:
            model.compile("adam",
                          "binary_crossentropy",
                          metrics=['binary_crossentropy'])
            # tb = TensorBoard(log_dir="/Users/a_piao/PycharmProjects/NightLife_recommend/workspace/DeepFM/log", write_images=1, histogram_freq=1)
            es = EarlyStopping(monitor='val_binary_crossentropy',
                               mode='min',
                               patience=20)
            mc = ModelCheckpoint(best_param_path,
                                 monitor='val_binary_crossentropy',
                                 mode='min',
                                 # keep only the best epoch, since it is reloaded below
                                 save_best_only=True,
                                 verbose=False,
                                 save_weights_only=True)
            # model.fit(X_train, y_train, batch_size=512, epochs=1000, verbose=2, validation_split=0.2, callbacks=[es, mc])
            model.fit(X_train,
                      y_train,
                      validation_data=(X_val, y_val),
                      batch_size=1024,
                      epochs=1000,
                      verbose=2,
                      callbacks=[es, mc])
            # model.fit(X_train, y_train, batch_size=1024, epochs=100, verbose=2, callbacks=[es, mc])
            model.load_weights(best_param_path)

        y_pred = model.predict(X_val, batch_size=64).flatten()
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at %d floder: %f" % (i, auc))
        scores.append(auc)
        models.append(model)
        data.loc[vdx, 'y_pred'] = y_pred
        # print(data['y_pred'].value_counts())
        # exit()

    mean_score = np.mean(scores)
    oof = roc_auc_score(data['y'], data['y_pred'])
    print("5-floder total mean_score:", mean_score)
    print("5-floder oof auc score:", oof)
    print("----train %s finish!----" % 'DeepFM')
    cal_roc_curve(data['y'], data['y_pred'], 'DeepFM')

    return data['y_pred']
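cal_roc_curve is a local helper. A minimal sketch using sklearn and matplotlib:

from sklearn.metrics import auc, roc_curve
import matplotlib.pyplot as plt

def cal_roc_curve(y_true, y_pred, model_name):
    # Hypothetical sketch: plot the ROC curve of the out-of-fold predictions.
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    plt.plot(fpr, tpr, label='%s (AUC = %.4f)' % (model_name, auc(fpr, tpr)))
    plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')
    plt.show()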
Example #5
    # Excerpt from a larger script: the train/test DataFrames, sparse_features,
    # target, data and the DeepFM / FeatureMetas classes are defined above.
    # Balance the classes by downsampling non-clicks to the number of clicks.
    train_0 = train[train.click == 0]
    train_1 = train[train.click == 1]
    train = pd.concat([train_1, train_0[0:len(train_1)]])
    train_model_input = {name: train[name].values for name in sparse_features}
    test_model_input = {name: test[name].values for name in sparse_features}

    # Instantiate a FeatureMetas object, add your features' meta information to it
    feature_metas = FeatureMetas()
    for feat in sparse_features:
        feature_metas.add_sparse_feature(name=feat,
                                         one_hot_dim=data[feat].nunique(),
                                         embedding_dim=32)
    # A TensorFlow sparse-to-dense gradient warning may appear here; see
    # https://stackoverflow.com/questions/35892412/tensorflow-dense-gradient-explanation
    # Instantiate a model and compile it
    model = DeepFM(feature_metas=feature_metas,
                   linear_slots=sparse_features,
                   fm_slots=sparse_features,
                   dnn_slots=sparse_features)
    model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=['binary_crossentropy'])

    # Train the model
    history = model.fit(x=train_model_input,
                        y=train[target].values,
                        batch_size=128,
                        epochs=3,
                        verbose=2,
                        validation_split=0.2)

    # Testing
    pred_ans = model.predict(test_model_input, batch_size=256)
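If the test frame keeps its label column, scoring pred_ans takes one more step. A hypothetical follow-up, not part of the original snippet:

from sklearn.metrics import log_loss, roc_auc_score

print("test LogLoss: %.4f" % log_loss(test[target].values.ravel(), pred_ans.ravel()))
print("test AUC:     %.4f" % roc_auc_score(test[target].values.ravel(), pred_ans.ravel()))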
Example #6
# val_data = PreProcessData("./data/ratings_small.csv", train=True)
# loader_val = DataLoader(train_data, batch_size=50, sampler=sampler.SubsetRandomSampler(range(TRAIN_ROUND, 100000)))

# Assumed context: time is imported, DataLoader and sampler come from
# torch.utils.data, and FM, DeepFM, FM_train_data and FM_TRAIN_DATA_NUMBER
# are defined earlier in the script.
feature_size = FM_train_data.feature_size
print("feature_size is " + str(feature_size))
loader_train = DataLoader(FM_train_data,
                          batch_size=50,
                          sampler=sampler.SubsetRandomSampler(
                              range(FM_TRAIN_DATA_NUMBER)))
"""
FM
"""

FM_start_time = time.time()
FM_model = FM(feature_sizes=feature_size)
print("Now, lets train the model")
FM_model.fit(loader_train, epochs=50)
FM_end_time = time.time()
print("the end of training FM, time consume: %d" %
      (FM_end_time - FM_start_time))
"""
DeepFM
"""
deepFM_start_time = time.time()
deepFM_model = DeepFM(feature_sizes=feature_size)
print("Now, lets train the model")
deepFM_model.fit(loader_train, epochs=50)
deepFM_end_time = time.time()
print("the end of training deefFM, and the time consumption: %d" %
      (deepFM_end_time - deepFM_start_time))
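The commented-out loader at the top of this example builds val_data but then passes train_data to the DataLoader, which is presumably a slip. A corrected, hypothetical version of those two lines (TRAIN_ROUND and the 100000 row count come from the original comment):

val_data = PreProcessData("./data/ratings_small.csv", train=True)
loader_val = DataLoader(val_data,
                        batch_size=50,
                        sampler=sampler.SubsetRandomSampler(
                            range(TRAIN_ROUND, 100000)))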