def run_base_model_dcn(dfTrain, dfTest, folds, dcn_params):
    """Train one DCN per cross-validation fold.

    Builds the feature dictionary over train+test, parses both frames into
    (categorical indices, categorical values, numeric values, label) parts,
    fills the data-dependent entries of ``dcn_params``, then fits a fresh
    DCN on each fold's train split, validating on its held-out split.
    """
    # Map categorical features to contiguous indices over train + test.
    feat_dict = FeatureDictionary(dfTrain, dfTest,
                                  numeric_cols=config.NUMERIC_COLS,
                                  ignore_cols=config.IGNORE_COLS,
                                  category_cols=config.CATEGORICAL_COLS)
    print(feat_dict.feat_dim)
    print(feat_dict.feat_dict)

    data_parser = DataParser(feat_dict=feat_dict)
    cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train = data_parser.parse(
        df=dfTrain, has_label=True)
    cate_Xi_test, cate_Xv_test, numeric_Xv_test, ids_test = data_parser.parse(
        df=dfTest)

    # Data-dependent hyper-parameters.
    dcn_params["cate_feature_size"] = feat_dict.feat_dim
    dcn_params["field_size"] = len(cate_Xi_train[0])
    dcn_params['numeric_feature_size'] = len(config.NUMERIC_COLS)

    def select(rows, positions):
        # Pick the entries of `rows` at the given positions.
        return [rows[p] for p in positions]

    train_parts = (cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train)
    for train_idx, valid_idx in folds:
        fold_train = [select(part, train_idx) for part in train_parts]
        fold_valid = [select(part, valid_idx) for part in train_parts]
        dcn = DCN(**dcn_params)
        dcn.fit(*fold_train, *fold_valid)
def run_base_model_dcn(dfTrain, dfTest, folds, dcn_params):
    """Cross-validated DCN training: parse features once, fit one model per fold."""
    # Categorical-feature -> index mapping built over both frames.
    fd = FeatureDictionary(dfTrain, dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS,
                           cate_cols=config.CATEGORICAL_COLS)
    print(fd.feat_dim)
    print(fd.feat_dict)

    data_parser = DataParser(feat_dict=fd)
    # Parsed parts: categorical indices, categorical values, numeric values, labels.
    cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train = data_parser.parse(
        df=dfTrain, has_label=True)
    cate_Xi_test, cate_Xv_test, numeric_Xv_test, ids_test = data_parser.parse(
        df=dfTest)

    dcn_params["cate_feature_size"] = fd.feat_dim
    dcn_params["field_size"] = len(cate_Xi_train[0])
    dcn_params['numeric_feature_size'] = len(config.NUMERIC_COLS)

    def take(seq, positions):
        """Return the elements of seq at the given positions."""
        return [seq[p] for p in positions]

    for train_idx, valid_idx in folds:
        dcn = DCN(**dcn_params)
        dcn.fit(take(cate_Xi_train, train_idx),
                take(cate_Xv_train, train_idx),
                take(numeric_Xv_train, train_idx),
                take(y_train, train_idx),
                take(cate_Xi_train, valid_idx),
                take(cate_Xv_train, valid_idx),
                take(numeric_Xv_train, valid_idx),
                take(y_train, valid_idx))
def run_base_model_dcn(dfTrain, dfTest, folds, dcn_params,
                       model_path='D:/code/tensorflow_practice/recommendation/Basic-DCN-Demo/model/model'):
    """Train a DCN per cross-validation fold and checkpoint each fold's session.

    Parameters
    ----------
    dfTrain, dfTest : training / test DataFrames.
    folds : iterable of (train_idx, valid_idx) index pairs.
    dcn_params : dict of DCN hyper-parameters; the data-dependent sizes
        ("cate_feature_size", "field_size", "numeric_feature_size") are
        filled in here before each model is built.
    model_path : str, optional
        Checkpoint path prefix passed to ``saver.save``. FIX: previously a
        hard-coded absolute Windows path; it is now a parameter whose
        default preserves the old behavior.
    """
    # Categorical-feature -> index mapping built over both frames.
    fd = FeatureDictionary(dfTrain, dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS,
                           cate_cols=config.CATEGORICAL_COLS)
    print(fd.feat_dim)
    print(fd.feat_dict)

    data_parser = DataParser(feat_dict=fd)
    cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train = data_parser.parse(
        df=dfTrain, has_label=True)
    cate_Xi_test, cate_Xv_test, numeric_Xv_test, ids_test = data_parser.parse(
        df=dfTest)

    dcn_params["cate_feature_size"] = fd.feat_dim
    dcn_params["field_size"] = len(cate_Xi_train[0])
    dcn_params['numeric_feature_size'] = len(config.NUMERIC_COLS)

    _get = lambda x, l: [x[i] for i in l]
    for i, (train_idx, valid_idx) in enumerate(folds):
        print("i", i)
        cate_Xi_train_, cate_Xv_train_, numeric_Xv_train_, y_train_ = _get(
            cate_Xi_train, train_idx), _get(cate_Xv_train, train_idx), _get(
                numeric_Xv_train, train_idx), _get(y_train, train_idx)
        cate_Xi_valid_, cate_Xv_valid_, numeric_Xv_valid_, y_valid_ = _get(
            cate_Xi_train, valid_idx), _get(cate_Xv_train, valid_idx), _get(
                numeric_Xv_train, valid_idx), _get(y_train, valid_idx)

        dcn = DCN(**dcn_params)
        # fit() returns the live session, which the saver checkpoints per fold.
        s = dcn.fit(cate_Xi_train_, cate_Xv_train_, numeric_Xv_train_, y_train_,
                    cate_Xi_valid_, cate_Xv_valid_, numeric_Xv_valid_, y_valid_, i)
        dcn.saver.save(s, model_path, global_step=i + 1)
def plot_dcn():
    """Build a small DCN on a slice of the Criteo data and render its
    architecture diagram to ./imgs/DCN.png."""
    # Read the data.
    data, dense_features, sparse_features = read_criteo_data()
    # Keep the example tiny: 3 dense and 2 sparse features.
    dense_features = dense_features[:3]
    sparse_features = sparse_features[:2]

    def build_columns():
        # Group the features (linear part vs. DNN part, chosen per use case)
        # and tag them: one SparseFeat per categorical column, one DenseFeat
        # per numeric column.
        sparse_cols = [
            SparseFeat(feat,
                       vocabulary_size=data[feat].nunique(),
                       embedding_dim=4) for feat in sparse_features
        ]
        dense_cols = [DenseFeat(feat, 1, ) for feat in dense_features]
        return sparse_cols + dense_cols

    # Both branches receive the same feature groups here (built separately
    # so each branch owns its own list objects).
    linear_feature_columns = build_columns()
    dnn_feature_columns = build_columns()

    # Build the DCN model (the original comment said "AFM"; this is a DCN).
    history = DCN(linear_feature_columns, dnn_feature_columns)
    keras.utils.plot_model(history, to_file="./imgs/DCN.png", show_shapes=True)
def main(unused_argv):
    """Entry point: preprocess the CTR data, train a DCN, then evaluate it."""
    # Columns to one-hot ("dummy") encode.
    FLAGS.dummy_cols = [
        'banner_pos', 'device_conn_type', 'C1', 'C15', 'C16', 'C18'
    ]
    preprocessor = DataPreprocess(FLAGS.dummy_cols, FLAGS.numerical_cols,
                                  FLAGS.target_colname, FLAGS.train_file,
                                  FLAGS.test_file)
    train_features, train_labels = preprocessor.parse_data(FLAGS.train_file)
    test_features, test_labels = preprocessor.parse_data(FLAGS.test_file)

    # Sanity-check the first few parsed rows.
    for preview in (train_features['dfi'][:10], train_features['dfv'][:10],
                    train_labels[:10]):
        print(preview)
    print('----------------------------------')

    model = DCN(preprocessor.feature_nums, len(preprocessor.all_cols),
                args=FLAGS)

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        train(sess, model, train_features, train_labels,
              batch_size=FLAGS.batch_size,
              epochs=FLAGS.epochs,
              checkpoint_dir=FLAGS.checkpoint_dir)
        evaluate(test_features, test_labels,
                 checkpoint_dir=FLAGS.checkpoint_dir)
def run_base_model_dcn(dfTrain, dfTest, folds, dcn_params):
    """Cross-validated DCN training over parsed categorical/numeric features."""
    # Mapping from categorical features to indices.
    fd = FeatureDictionary(dfTrain, dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS,
                           cate_cols=config.CATEGORICAL_COLS)
    print(fd.feat_dim)
    print(fd.feat_dict)

    # Parse into categorical indices, categorical values, numeric values, labels.
    data_parser = DataParser(feat_dict=fd)
    cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train = data_parser.parse(
        df=dfTrain, has_label=True)
    cate_Xi_test, cate_Xv_test, numeric_Xv_test, _ = data_parser.parse(
        df=dfTest)

    # Number of categorical features after one-hot encoding.
    dcn_params["n_cate_feature"] = fd.feat_dim
    # Number of categorical fields.
    dcn_params["n_field"] = len(cate_Xi_train[0])
    print('values', str(fd.feat_dim), 'values', str(len(cate_Xi_train[0])))

    def subset(seq, idx):
        # Elements of `seq` at positions `idx`.
        return [seq[j] for j in idx]

    parts = (cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train)
    for train_idx, valid_idx in folds:
        # Training fold.
        fold_train = tuple(subset(p, train_idx) for p in parts)
        # Validation fold.
        fold_valid = tuple(subset(p, valid_idx) for p in parts)
        dcn = DCN(**dcn_params)
        dcn.fit(*fold_train, *fold_valid)
def eval(eval_parameters, device):
    """Evaluate a trained DCN's ITE predictions on treated and control sets.

    NOTE(review): the function name shadows the builtin ``eval``; kept
    unchanged because callers depend on it.

    Parameters
    ----------
    eval_parameters : dict with keys "treated_set", "control_set",
        "model_save_path".
    device : torch device the network and covariates are moved to.

    Returns
    -------
    dict with "treated_err" and "control_err": per-sample differences
    between true and predicted ITE for each set.
    """
    print(".. Evaluation started ..")
    treated_set = eval_parameters["treated_set"]
    control_set = eval_parameters["control_set"]
    model_path = eval_parameters["model_save_path"]

    network = DCN(training_flag=False).to(device)
    network.load_state_dict(torch.load(model_path, map_location=device))
    network.eval()

    def _ite_errors(dataset, flip_sign):
        # FIX: the original duplicated this loop verbatim for both sets,
        # differing only in the sign of the true ITE; factored out here.
        # flip_sign=True -> true ITE is y_cf - y_f (control samples).
        loader = torch.utils.data.DataLoader(dataset,
                                             shuffle=False,
                                             num_workers=1)
        errors = []
        for batch in loader:
            covariates_X, ps_score, y_f, y_cf = batch
            covariates_X = covariates_X.to(device)
            ps_score = ps_score.squeeze().to(device)

            treatment_pred = network(covariates_X, ps_score)
            # treatment_pred[0] -> y1, treatment_pred[1] -> y0
            predicted_ITE = treatment_pred[0] - treatment_pred[1]
            true_ITE = y_cf - y_f if flip_sign else y_f - y_cf

            if torch.cuda.is_available():
                diff = true_ITE.float().cuda() - predicted_ITE.float().cuda()
            else:
                diff = true_ITE.float() - predicted_ITE.float()
            errors.append(diff.item())
        return errors

    return {
        "treated_err": _ite_errors(treated_set, flip_sign=False),
        "control_err": _ite_errors(control_set, flip_sign=True),
    }
def _run_base_model_dfm(Xi_train, Xv_train, y_train, Xi_test, Xv_test,
                        ids_test, cate_cnt, folds, dfm_params):
    """Train the Deep&Cross model with cross validation.

    Fills the data-dependent sizes of ``dfm_params``, fits one model per
    fold, collects out-of-fold predictions and normalized-Gini scores,
    writes a submission CSV and a train/valid learning-curve plot.

    Returns
    -------
    (y_train_meta, y_test_meta) : out-of-fold predictions for the training
    set and the fold-averaged predictions for the test set.
    """
    dfm_params["cate_feature_size"] = cate_cnt
    dfm_params["cate_field_size"] = len(Xi_train[0])
    dfm_params["num_field_size"] = len(Xv_train[0])

    y_train_meta = np.zeros((Xi_train.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((Xi_test.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]

    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]),
                                        dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]),
                                        dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DCN(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        # Out-of-fold predictions; test predictions are averaged over folds.
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_cross"] and dfm_params["use_deep"]:
        clf_str = "DeepAndCross"
    elif dfm_params["use_cross"]:
        clf_str = "CROSS"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # BUG FIX: clf_str was unbound (NameError) when both flags were off;
        # fail with an explicit message instead.
        raise ValueError(
            "dfm_params must enable at least one of 'use_cross'/'use_deep'")

    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(),
                               gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(),
                                            gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta
                               transform=transformer)
# NOTE(review): this chunk is cut mid-call at both ends; the opening
# datasets.MNIST(...) call and the closing autoencoder call continue
# outside this view.
test_set = datasets.MNIST(args.dir, train=False, transform=transformer)
# --test_run caps both splits at the first 500 samples for a quick smoke test.
train_limit = list(range(
    0, len(train_set))) if not args.test_run else list(range(0, 500))
test_limit = list(range(0, len(test_set))) if not args.test_run else list(
    range(0, 500))
train_loader = torch.utils.data.DataLoader(Subset(train_set, train_limit),
                                           batch_size=args.batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(Subset(test_set, test_limit),
                                          batch_size=args.batch_size,
                                          shuffle=False)
model = DCN(args)
# Pretrain the autoencoder, keeping a deep copy of its pretrained weights.
rec_loss_list = model.pretrain(train_loader, epoch=args.pre_epoch)
pre_trained_AE = copy.deepcopy(model.autoencoder)
# model.autoencoder = pre_trained_AE
# initial_clustering = model.clustering
# model.pre_cluster(train_loader)
nmi_list = []
ari_list = []
model.args = args
# model.clustering = initial_clustering
reducer = umap.UMAP()
for e in range(args.epoch):
    # Print training set
    # Runs every epoch (e % 1 == 0 is always true).
    if e % 1 == 0:
        out = model.autoencoder(torch.FloatTensor(np.array(X_train)).to(
args = parser.parse_args() # Load data transformer = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, ))]) train_set = datasets.MNIST(args.dir, train=True, download=True, transform=transformer) test_set = datasets.MNIST(args.dir, train=False, transform=transformer) train_limit = list(range( 0, len(train_set))) if not args.test_run else list(range(0, 500)) test_limit = list(range(0, len(test_set))) if not args.test_run else list( range(0, 500)) train_loader = torch.utils.data.DataLoader(Subset(train_set, train_limit), batch_size=args.batch_size, shuffle=True) test_loader = torch.utils.data.DataLoader(Subset(test_set, test_limit), batch_size=args.batch_size, shuffle=False) # Main body model = DCN(args) rec_loss_list, nmi_list, ari_list = solver(args, model, train_loader, test_loader)
# Experiment hyper-parameters.
test_size = 0.2    # fraction of rows held out for testing
k = 8              # embedding dimension per sparse feature
layer_num = 6      # number of cross layers
output_dim = 1
reg = 1e-4         # L2 weight decay passed to Adam

# =============== Prepare data ===============
# Criteo-style column names: I1..I13 dense, C1..C26 sparse.
dense_feature = ['I' + str(i) for i in range(1, 14)]
sparse_feature = ['C' + str(i) for i in range(1, 27)]
embed_dict, train_df, test_df = preprocess(args.file_path, sample_num,
                                           test_size)
embed_num = list(embed_dict.values())
# Model input width: all dense features plus a k-dim embedding per sparse one.
input_dim = len(dense_feature) + len(sparse_feature) * k
hidden_units = [input_dim, 256, 128, 64]
train_dataset = DCNDataset(train_df, dense_feature, sparse_feature)
train_loader = DataLoader(train_dataset,
                          batch_size=args.batch_size,
                          shuffle=True)

# =============== Build model ===============
DCN_model = DCN(embed_num, k, input_dim, layer_num, hidden_units, output_dim)
loss_func = nn.BCELoss()
optimizer = optim.Adam(DCN_model.parameters(),
                       lr=args.learning_rate,
                       weight_decay=reg)

# =============== Train and test ===============
train(DCN_model, args.epochs, train_loader, loss_func, optimizer)
test(DCN_model, test_df, dense_feature, sparse_feature)
def train(self, train_parameters, device):
    """Train the two-headed DCN by alternating epochs.

    Even epochs update only the treated head (Y1) on the treated set;
    odd epochs update only the control head (Y0) on the control set.
    A checkpoint is written to ``model_save_path`` at the end of every
    epoch, and the combined treated+control loss is printed every 10th
    epoch.

    Parameters
    ----------
    train_parameters : dict with keys "epochs", "treated_batch_size",
        "control_batch_size", "lr", "shuffle", "model_save_path",
        "treated_set_train", "control_set_train", "input_nodes".
    device : torch device to train on.
    """
    epochs = train_parameters["epochs"]
    treated_batch_size = train_parameters["treated_batch_size"]
    control_batch_size = train_parameters["control_batch_size"]
    lr = train_parameters["lr"]
    shuffle = train_parameters["shuffle"]
    model_save_path = train_parameters["model_save_path"].format(epochs, lr)
    treated_set_train = train_parameters["treated_set_train"]
    control_set_train = train_parameters["control_set_train"]
    input_nodes = train_parameters["input_nodes"]

    print("Saved model path: {0}".format(model_save_path))

    treated_data_loader_train = torch.utils.data.DataLoader(
        treated_set_train,
        batch_size=treated_batch_size,
        shuffle=shuffle,
        num_workers=1)
    control_data_loader_train = torch.utils.data.DataLoader(
        control_set_train,
        batch_size=control_batch_size,
        shuffle=shuffle,
        num_workers=1)

    network = DCN(training_flag=True, input_nodes=input_nodes).to(device)
    optimizer = optim.Adam(network.parameters(), lr=lr)
    lossF = nn.MSELoss()
    dataset_loss = 0.0
    print(".. Training started ..")
    print(device)

    # The two outcome heads; exactly one is trainable in any given epoch.
    y1_head = (network.hidden1_Y1, network.hidden2_Y1, network.out_Y1)
    y0_head = (network.hidden1_Y0, network.hidden2_Y0, network.out_Y0)

    def _set_trainable(layers, flag):
        # Freeze/unfreeze every weight and bias of one head.
        # (Factored out of two 12-line copy-pasted blocks.)
        for layer in layers:
            layer.weight.requires_grad = flag
            layer.bias.requires_grad = flag

    def _run_epoch(loader, flip_sign):
        # One optimization pass over `loader`; returns the summed batch loss.
        # flip_sign=True -> true ITE is y_cf - y_f (control samples),
        # otherwise y_f - y_cf (treated samples).
        total_loss = 0.0
        for batch in loader:
            covariates_X, ps_score, y_f, y_cf = batch
            covariates_X = covariates_X.to(device)
            ps_score = ps_score.squeeze().to(device)

            treatment_pred = network(covariates_X, ps_score)
            # treatment_pred[0] -> y1, treatment_pred[1] -> y0
            predicted_ITE = treatment_pred[0] - treatment_pred[1]
            true_ITE = y_cf - y_f if flip_sign else y_f - y_cf

            if torch.cuda.is_available():
                loss = lossF(predicted_ITE.float().cuda(),
                             true_ITE.float().cuda()).to(device)
            else:
                loss = lossF(predicted_ITE.float(),
                             true_ITE.float()).to(device)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        return total_loss

    for epoch in range(epochs):
        network.train()
        if epoch % 2 == 0:
            # Train the treated head only.
            _set_trainable(y1_head, True)
            _set_trainable(y0_head, False)
            dataset_loss = _run_epoch(treated_data_loader_train,
                                      flip_sign=False)
        else:
            # Train the control head only; accumulate onto the treated loss.
            _set_trainable(y1_head, False)
            _set_trainable(y0_head, True)
            dataset_loss = dataset_loss + _run_epoch(
                control_data_loader_train, flip_sign=True)

        if epoch % 10 == 9:
            print("epoch: {0}, Treated + Control loss: {1}".format(
                epoch, dataset_loss))

        # Checkpoint every epoch (unconditional, matching original behavior).
        torch.save(network.state_dict(), model_save_path)
                    help='decay rate',
                    type=float,
                    default=0.99)
# NOTE(review): this chunk is cut at both ends; the parser.add_argument(...)
# call opens, and the epoch loop body continues, outside this view.
# args=[] ignores sys.argv (notebook-style invocation with defaults only).
args = parser.parse_args(args=[])

# load data set
X_train_cate, X_train_cont, y_train, X_test_cate, X_test_cont, y_test, cate_list = load_dataset(
    args.input_dir)
cate_num = X_train_cate.shape[1]    # number of categorical columns
cont_num = X_train_cont.shape[1]    # number of continuous columns

tf.reset_default_graph()
with tf.Session() as sess:
    # define model
    model = DCN.DCN(args, cate_num, cont_num, cate_list)
    model.build()
    # Resume from the latest checkpoint when one exists, else start fresh.
    ckpt = tf.train.get_checkpoint_state(
        os.path.join(args.input_dir, args.model_name))
    if ckpt:
        print('Loading model parameters from %s' % ckpt.model_checkpoint_path)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        print('Creating model with inital parameters')
        sess.run(tf.global_variables_initializer())
    step = 0
    for epoch in range(args.epoch):
        start_time = time.time()
from DCN import DCN
import sys

# Optional first CLI argument caps the number of rows read from the CSV.
nrows = None
if len(sys.argv) > 1:
    nrows = sys.argv[1]
    nrows = int(nrows)

if __name__ == '__main__':
    path = '../data/data.csv'
    feature_size, data = data_loader.data_load('../data/data.csv', nrows=nrows)
    features = ['userId', 'movieId', 'tag']
    # 80/20 train/validation split by row position.
    num = data.shape[0] * 4 // 5
    model = DCN(features, feature_size, embedding_size=8, verbose=False)
    X = data[features].values
    y = data.label.values.reshape(-1, 1)
    '''
    model.fit(
        X[:num],y[:num],
        epoch=10,
        X_valid=X[num:],y_valid=y[num:],
        early_stopping=True,
        refit=True
    )
    '''
    # Time a single training epoch.
    import time
    start = time.time()
    model.fit(X[:num], y[:num], epoch=1)
    print('train a epoch cost %.2f' % (time.time() - start))
# --test_run caps both splits at the first 500 samples for a quick smoke test.
train_limit = list(range(
    0, len(train_set))) if not args.test_run else list(range(0, 500))
test_limit = list(range(
    0, len(test_set))) if not args.test_run else list(range(0, 500))
train_loader = torch.utils.data.DataLoader(Subset(
    train_set, train_limit),
                                           batch_size=args.batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(Subset(test_set, test_limit),
                                          batch_size=args.batch_size,
                                          shuffle=False)

# Main body
model = DCN(args)
rec_loss_list, nmi_list, ari_list = solver(args, model, train_loader,
                                           test_loader)
# X_train = X_train.to(self.device)
# print(y_train[0])
# Embed the training data with the trained autoencoder and project the latent
# codes to 2-D with UMAP, colored by label.
# NOTE(review): X_train, y_train and `color` are defined elsewhere in the file.
out = model.autoencoder(torch.FloatTensor(np.array(X_train)), latent=True)
reducer = umap.UMAP()
# print(help(umap))
X2 = reducer.fit_transform(out.detach().numpy())
c = [color[int(y_train.iloc[i])] for i in range(len(y_train))]
plt.scatter(X2[:, 0], X2[:, 1], color=c)
plt.show()
# For comparison: UMAP directly on the raw inputs.
X4 = reducer.fit_transform(X_train)
plt.scatter(X4[:, 0], X4[:, 1], color=c)