def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Train a DeepFM-family / LR / WideDeep model with K-fold CV.

    Parses the raw DataFrames into (index, value) lists, trains one model
    per fold, collects out-of-fold predictions and per-epoch gini curves,
    writes a submission CSV and a training plot.

    Args:
        dfTrain: training DataFrame (contains label column).
        dfTest: test DataFrame (contains id column).
        folds: list of (train_idx, valid_idx) pairs.
        dfm_params: model hyper-parameter dict; mutated in place with
            "feature_size" and "field_size". Must contain "module_name".

    Returns:
        (y_train_meta, y_test_meta): out-of-fold train predictions and
        fold-averaged test predictions, each of shape (n_samples, 1).
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        # Accumulate test predictions; averaged over folds below.
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    # BUG FIX: clf_str was previously left unbound (UnboundLocalError) when
    # module_name == "DeepFM" with both use_fm and use_deep False, or when
    # module_name was anything other than DeepFM/LR/WideDeep. Default to the
    # module name itself, which also covers the LR and WideDeep branches.
    clf_str = dfm_params["module_name"]
    if dfm_params["module_name"] == "DeepFM":
        if dfm_params["use_fm"] and dfm_params["use_deep"]:
            clf_str = "DeepFM"
        elif dfm_params["use_fm"]:
            clf_str = "FM"
        elif dfm_params["use_deep"]:
            clf_str = "DNN"
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def eval_on_dev(split_vector_data):
    """Evaluate the current session's model on the dev set.

    Runs batched inference over ``dev_data`` and scores the predictions.

    Args:
        split_vector_data: extra feature data forwarded to get_fed_dict.

    Returns:
        (auc, gini): ROC-AUC and normalized gini over the full dev set.
    """
    batch_size = graph_hyper_params['batch_size']
    # BUG FIX: was `len(dev_data) / batch_size` — a float under Python 3
    # (TypeError in range()), and as floor division it silently skipped the
    # final partial batch. Ceil division evaluates every dev sample.
    e_b_s = (len(dev_data) + batch_size - 1) // batch_size
    auc_true, auc_pre = [], []
    for index in tqdm(range(e_b_s)):
        start = index * batch_size
        # Clamp the last batch to the dataset size.
        end = min((index + 1) * batch_size, len(dev_data))
        b_dev_data = dev_data[start:end]
        fed_dict = get_fed_dict(b_dev_data, split_vector_data)
        pred_value = sess.run([pred_val], feed_dict=fed_dict)
        pre_real_val = np.array(pred_value).reshape((-1))
        auc_true = auc_true + list(b_dev_data['label'].values)
        auc_pre = auc_pre + pre_real_val.tolist()
    fpr, tpr, thresholds = metrics.roc_curve(auc_true, auc_pre, pos_label=1)
    return metrics.auc(fpr, tpr), gini_norm(auc_true, auc_pre)
def eval_on_dev(split_vector_data):
    """Evaluate on the dev set with extra NaN diagnostics.

    Besides AUC/gini, fetches intermediate network tensors so that a NaN in
    the predictions can be traced, and prints the two largest / two smallest
    predicted scores.

    Args:
        split_vector_data: extra feature data forwarded to get_fed_dict.

    Returns:
        (auc, gini): ROC-AUC and normalized gini over the full dev set.
    """
    batch_size = graph_hyper_params['batch_size']
    # BUG FIX: was `len(dev_data) / batch_size` — float under Python 3 and,
    # as floor division, dropped the final partial batch.
    e_b_s = (len(dev_data) + batch_size - 1) // batch_size
    auc_true, auc_pre = [], []
    for index in tqdm(range(e_b_s)):
        start = index * batch_size
        end = min((index + 1) * batch_size, len(dev_data))
        b_dev_data = dev_data[start:end]
        fed_dict = get_fed_dict(b_dev_data, split_vector_data, feature_conf_dict)
        pred_value, pre_pred_value, final_vec, uu, vv = sess.run(
            [pred_val, network_params[0], network_params[1],
             network_params[2], network_params[3]],
            feed_dict=fed_dict)
        pre_real_val = np.array(pred_value).reshape((-1))
        auc_true = auc_true + list(b_dev_data['label'].values)
        auc_pre = auc_pre + pre_real_val.tolist()
        if True in np.isnan(pre_real_val):
            # BUG FIX: converted Python-2 print statements to the print()
            # function used everywhere else in this file.
            print('contain nan: ', np.array(pre_pred_value).reshape((-1)))
            print(np.array(final_vec).reshape((-1)))
            print(np.array(uu).reshape((-1)))
            print(np.array(vv).reshape((-1)))
    fpr, tpr, thresholds = metrics.roc_curve(auc_true, auc_pre, pos_label=1)
    auc_v, gni = metrics.auc(fpr, tpr), gini_norm(auc_true, auc_pre)
    auc_pre_2 = np.array(auc_pre)
    auc_pre_2.sort()
    print('dev_pre_top2=%.4f %.4f min2=%.4f %.4f' % (auc_pre_2.tolist()[-1], auc_pre_2.tolist()[-2], auc_pre_2.tolist()[0], auc_pre_2.tolist()[1]))
    return auc_v, gni
def val(model, dataloader):
    """Compute the model's loss and normalized gini on a validation set.

    Args:
        model: the PyTorch model (takes Xi, Xv batches).
        dataloader: yields (Xi_batch, Xv_batch, y_batch) tuples.

    Returns:
        (loss_val, gini_val): mean per-batch loss and normalized gini.

    Raises:
        ValueError: if the dataloader yields no batches.
    """
    model.eval()  # freeze dropout / batch-norm statistics
    loss_val = 0.0
    val_iteration = 0
    y_true = []
    y_pre = []
    # No gradients are needed for evaluation; avoids autograd bookkeeping.
    with torch.no_grad():
        for Xi_batch, Xv_batch, y_batch in dataloader:
            Xi_batch = Xi_batch.to(device)
            Xv_batch = Xv_batch.to(device)
            y_batch = y_batch.to(device)
            outputs = model(Xi_batch, Xv_batch)
            loss = criterion(outputs, y_batch)
            y_true.append(y_batch.data.cpu().numpy())
            # BUG FIX: F.sigmoid is deprecated (removed in newer torch);
            # torch.sigmoid is the supported equivalent.
            prob = torch.sigmoid(outputs)
            y_pre.append(prob.data.cpu().numpy())
            loss_val += loss.item()
            val_iteration += 1
    # BUG FIX: guard against an empty dataloader, which previously raised
    # ZeroDivisionError on the division below.
    if val_iteration == 0:
        raise ValueError("validation dataloader produced no batches")
    loss_val /= val_iteration
    y_true = np.concatenate(y_true, axis=0)
    y_pre = np.concatenate(y_pre, axis=0)
    gini_val = gini_norm(y_true, y_pre)
    model.train()  # restore training mode
    return loss_val, gini_val
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params, save_path: str, past_epoch: int = 0):
    """K-fold DeepFM runner with checkpoint save/restore.

    Unlike the plain runner, a single DeepFM instance is reused across all
    folds, optionally restored from a previous checkpoint and re-saved after
    each fold.

    Args:
        dfTrain: training DataFrame (contains label column).
        dfTest: test DataFrame (contains id column).
        folds: list of (train_idx, valid_idx) pairs.
        dfm_params: hyper-parameter dict; mutated in place with
            "feature_size" and "field_size".
        save_path: TF Saver checkpoint path prefix.
        past_epoch: if non-zero, restore from ``save_path-<past_epoch>``.

    Returns:
        (y_train_meta, y_test_meta, dfm): out-of-fold train predictions,
        fold-averaged test predictions, and the trained model.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)

    # One shared model across folds so training resumes from the checkpoint.
    dfm = DeepFM(**dfm_params)
    if past_epoch != 0:
        dfm.saver.restore(dfm.sess, save_path + '-' + str(past_epoch))

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)
        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])

        # Checkpoint after each fold; global_step encodes total epochs seen.
        dfm.saver.save(dfm.sess, save_path,
                       global_step=past_epoch + dfm_params["epoch"] * (i + 1))

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # BUG FIX: clf_str was previously left unbound here, crashing with
        # UnboundLocalError on the print below.
        raise ValueError("at least one of use_fm / use_deep must be True")
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)

    # With a single shared model the epoch curves are one long row spanning
    # all folds, taken from the final model's accumulated history.
    gini_results_epoch_train = np.zeros((1, dfm_params["epoch"] * len(folds)), dtype=float)
    gini_results_epoch_valid = np.zeros((1, dfm_params["epoch"] * len(folds)), dtype=float)
    gini_results_epoch_train[0] = dfm.train_result
    gini_results_epoch_valid[0] = dfm.valid_result
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta, dfm
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """K-fold DeepFM/FM/DNN runner: train, score, and save results.

    Args:
        dfTrain: training DataFrame (contains label column).
        dfTest: test DataFrame (contains id column).
        folds: list of (train_idx, valid_idx) pairs.
        dfm_params: hyper-parameter dict; mutated in place with
            "feature_size" and "field_size".

    Returns:
        (y_train_meta, y_test_meta): out-of-fold train predictions and
        fold-averaged test predictions, each shaped (n_samples, 1).
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        # train_result / valid_result are the per-epoch gini curves.
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    # Test predictions were summed over the K folds above; take the mean.
    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # BUG FIX: clf_str was previously left unbound here, crashing with
        # UnboundLocalError on the print below.
        raise ValueError("at least one of use_fm / use_deep must be True")
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """K-fold DeepFM/FM/DNN runner: train, score, and save results.

    Args:
        dfTrain: training DataFrame (contains label column).
        dfTest: test DataFrame (contains id column).
        folds: list of (train_idx, valid_idx) pairs.
        dfm_params: hyper-parameter dict; mutated in place with
            "feature_size" and "field_size".

    Returns:
        (y_train_meta, y_test_meta): out-of-fold train predictions and
        fold-averaged test predictions, each shaped (n_samples, 1).
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    # Xi_train: per-row feature indices; Xv_train: the corresponding values.
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    print(dfTrain.dtypes)

    dfm_params['feature_size'] = fd.feat_dim
    dfm_params['field_size'] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params['epoch']), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params['epoch']), dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    # Average the per-fold accumulated test predictions.
    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # BUG FIX: clf_str was previously left unbound here, crashing with
        # UnboundLocalError on the print below.
        raise ValueError("at least one of use_fm / use_deep must be True")
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def _run_base_model_dfm(Xi_train, Xv_train, y_train, Xi_test, Xv_test, ids_test, cate_cnt, folds, dfm_params):
    """K-fold runner for a DCN (Deep & Cross Network) model.

    Args:
        Xi_train / Xi_test: categorical feature index arrays.
        Xv_train / Xv_test: numeric feature value arrays.
        y_train: training labels.
        ids_test: test-row ids used for the submission file.
        cate_cnt: total number of distinct categorical feature values.
        folds: list of (train_idx, valid_idx) pairs.
        dfm_params: hyper-parameter dict; mutated in place with
            "cate_feature_size", "cate_field_size", "num_field_size".

    Returns:
        (y_train_meta, y_test_meta): out-of-fold train predictions and
        fold-averaged test predictions, each shaped (n_samples, 1).
    """
    dfm_params["cate_feature_size"] = cate_cnt
    dfm_params["cate_field_size"] = len(Xi_train[0])
    dfm_params["num_field_size"] = len(Xv_train[0])

    y_train_meta = np.zeros((Xi_train.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((Xi_test.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DCN(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_cross"] and dfm_params["use_deep"]:
        clf_str = "DeepAndCross"
    elif dfm_params["use_cross"]:
        clf_str = "CROSS"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # BUG FIX: clf_str was previously left unbound here, crashing with
        # UnboundLocalError on the print below.
        raise ValueError("at least one of use_cross / use_deep must be True")
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """K-fold DeepFM/FM/DNN runner: train, score, and save results.

    Args:
        dfTrain: training DataFrame (contains label column).
        dfTest: test DataFrame (contains id column).
        folds: list of (train_idx, valid_idx) pairs.
        dfm_params: hyper-parameter dict; mutated in place with
            "feature_size" and "field_size".

    Returns:
        (y_train_meta, y_test_meta): out-of-fold predictions for the train
        set and fold-averaged predictions for the test set.
    """
    # Build the feature dictionary from train + test; IGNORE_COLS are not
    # filtered out of the DataFrames themselves, only from the dictionary.
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    # BUG FIX: the trailing annotation "调用parse方法获取处理后的数据" was pasted
    # after the closing paren without a '#', making this line a SyntaxError.
    # Call parse() to obtain the processed data.
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim   # feature count after one-hot
    dfm_params["field_size"] = len(Xi_train[0])  # number of fields

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    # K-fold split: one model per fold.
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        # Out-of-fold validation predictions.
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        # Each fold predicts the test set once; predictions are accumulated.
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    # Average the accumulated test predictions over the folds.
    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:  # DeepFM
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:  # FM only
        clf_str = "FM"
    elif dfm_params["use_deep"]:  # DNN only
        clf_str = "DNN"
    else:
        # BUG FIX: clf_str was previously left unbound here, crashing with
        # UnboundLocalError on the print below.
        raise ValueError("at least one of use_fm / use_deep must be True")
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    # Return out-of-fold train predictions and averaged test predictions.
    return y_train_meta, y_test_meta
def run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """K-fold runner covering FM, Deep, and DeepFM (verbose variant).

    Depending on dfm_params["use_fm"] / ["use_deep"] this runs FM, DNN, or
    the combined DeepFM model. Prints the parsed data for inspection.

    Args:
        dfTrain: training DataFrame (contains label column).
        dfTest: test DataFrame (contains id column).
        folds: list of (train_idx, valid_idx) pairs.
        dfm_params: hyper-parameter dict; mutated in place with
            "feature_size" and "field_size".

    Returns:
        (y_train_meta, y_test_meta): out-of-fold train predictions and
        fold-averaged test predictions, each shaped (n_samples, 1).
    """
    # FeatureDictionary maps every feature to an index range: numeric
    # features get one fixed index, categorical features one index per level.
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    # DataParser produces, per row, Xi (feature indices — the one-hot
    # positions) and Xv (feature values — raw value for numeric, 1 for
    # categorical). Both are lists of per-row lists.
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    # Debug output: parsed indices/values and their sizes.
    print('Xi_train:', Xi_train)
    print('Xv_train:', Xv_train)
    print('y_train:', y_train)
    print('Xi_test:', Xi_test)
    print('Xv_test:', Xv_test)
    print('Xi_train shape:', len(Xi_train))
    print('Xv_train shape:', len(Xv_train))
    print('y_train shape:', len(y_train))
    print('Xi_test shape:', len(Xi_test))
    print('Xv_test shape:', len(Xv_test))
    print(dfTrain.dtypes)

    # field_size = number of raw fields; feature_size = total index count
    # after one-hot expansion of the categorical fields.
    dfm_params['feature_size'] = fd.feat_dim
    dfm_params['field_size'] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params['epoch']), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params['epoch']), dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        print('before fit Xi_train_:', Xi_train_[0:3])
        print('before fit Xv_train_:', Xv_train_[0:3])
        print('before fit y_train_:', y_train_[0:3])
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # BUG FIX: clf_str was previously left unbound here, crashing with
        # UnboundLocalError on the print below.
        raise ValueError("at least one of use_fm / use_deep must be True")
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def fit(self, Xi_train, Xv_train, y_train, Xi_valid=None, Xv_valid=None, y_valid=None, early_stopping=False, refit=False):
    """Train for self.epoch epochs; track per-epoch gini on train/valid.

    Each epoch does (1) a shuffled mini-batch training pass, then (2) a
    second full inference pass over the training data to score it, plus a
    single full-batch inference over the validation data if provided.

    Args:
        Xi_train, Xv_train, y_train: feature indices, values, labels.
        Xi_valid, Xv_valid, y_valid: optional validation split.
        early_stopping: stop when self.training_termination fires.
        refit: accepted for interface compatibility; unused here.

    Side effects: appends to self.gini_train / self.gini_valid /
    self.train_result / self.valid_result and prints progress.
    """
    has_valid = Xv_valid is not None
    self.gini_train = []
    self.gini_valid = []
    for epoch in range(self.epoch):
        pre_train = []
        pre_valid = []
        t1 = time()
        # Shuffle all three lists with the same permutation.
        self.shuffle_in_unison_scary(Xi_train, Xv_train, y_train)
        # Ceil division: the last partial batch is included.
        total_batch = int((len(y_train) - 1) / self.batch_size) + 1
        # Pass 1: training (dropout active, train_phase=True).
        for i in range(total_batch):
            Xi_batch, Xv_batch, y_batch = self.get_batch(
                Xi_train, Xv_train, y_train, self.batch_size, i)
            feed_dict = {
                self.feat_index: np.array(Xi_batch),
                self.feat_value: np.array(Xv_batch),
                self.label: np.array(y_batch).reshape((-1, 1)),
                self.dropout_keep_fm: self.dropout_fm,
                self.dropout_keep_deep: self.dropout_dep,
                self.train_phase: True
            }
            # loss, opt = self.sess.run([self.loss, self.optimizer], feed_dict=feed_dict)
            loss, opt, train_out = self.sess.run(
                (self.loss, self.optimizer, self.out), feed_dict=feed_dict)
            # pre_train.append(train_out)
            # dfm.fit_on_batch(Xi_batch, Xv_batch, y_batch)
        # Pass 2: re-score the whole (shuffled) training set in inference
        # mode (dropout off, train_phase=False); dummy labels feed the
        # loss op, only self.out is used.
        for i in range(total_batch):
            dummy = [1] * len(Xi_train)
            Xi_batch, Xv_batch, y_batch = self.get_batch(
                Xi_train, Xv_train, dummy, self.batch_size, i)
            num_batch = len(y_batch)
            feed_dict = {
                self.feat_index: np.array(Xi_batch),
                self.feat_value: np.array(Xv_batch),
                self.label: np.array(y_batch).reshape((-1, 1)),
                self.dropout_keep_fm: [1.0] * len(self.dropout_fm),
                self.dropout_keep_deep: [1.0] * len(self.dropout_dep),
                self.train_phase: False
            }
            loss, train_out = self.sess.run((self.loss, self.out),
                                            feed_dict=feed_dict)
            if i == 0:
                pre_train = np.reshape(train_out, (num_batch, ))
            else:
                pre_train = np.concatenate(
                    (pre_train, np.reshape(train_out, (num_batch, ))))
        sig_gini_train = gini_norm(y_train, pre_train)
        # sig_gini_train = self.evaluate(Xi_train, Xv_train, y_train)
        self.gini_train.append(sig_gini_train)
        # evaluate training and validation datasets
        train_result = self.evaluate(Xi_train, Xv_train, y_train)
        self.train_result.append(train_result)
        if has_valid:
            valid_result = self.evaluate(Xi_valid, Xv_valid, y_valid)
            self.valid_result.append(valid_result)
            # Single full-batch validation inference (dropout off).
            feed_dict = {
                self.feat_index: np.array(Xi_valid),
                self.feat_value: np.array(Xv_valid),
                self.label: np.array(y_valid).reshape((-1, 1)),
                self.dropout_keep_fm: [1.0] * len(self.dropout_fm),
                self.dropout_keep_deep: [1.0] * len(self.dropout_dep),
                self.train_phase: False
            }
            loss_test, valid_out = self.sess.run((self.loss, self.out),
                                                 feed_dict=feed_dict)
            pre_valid.append(valid_out)
            # Flatten the list of per-batch outputs into one flat list.
            pre_valid = [y for x in pre_valid for y in x]
            sig_gini_valid = gini_norm(y_valid, pre_valid)
            self.gini_valid.append(sig_gini_valid)
        if self.verbose > 0 and epoch % self.verbose == 0:
            if has_valid:
                print(
                    "[%d] train-result=%.4f, valid-result=%.4f,my-train=%.4f, my_valid=%.4f [%.1f s],"
                    % (epoch + 1, train_result, valid_result, sig_gini_train,
                       sig_gini_valid, time() - t1))
            else:
                print("[%d] train-result=%.4f [%.1f s]" %
                      (epoch + 1, train_result, time() - t1))
        if has_valid and early_stopping and self.training_termination(
                self.valid_result):
            break
def my_fit(self, Xi_train_, Xv_train_, y_train_, Xi_valid_=None, Xv_valid_=None, y_valid_=None):
    """Train with early stopping on validation gini, then restore the best
    weights seen during training.

    Args:
        Xi_train_, Xv_train_, y_train_: training indices, values, labels.
        Xi_valid_, Xv_valid_, y_valid_: validation split (required — used
            every epoch for the early-stopping criterion).

    Returns:
        self, with the best-scoring weights re-assigned into the session.
    """
    max_checks_without_progress = 10
    checks_without_progress = 0
    best_gini = 0
    # BUG FIX: best_params was previously unbound until the first epoch
    # whose validation gini exceeded 0; the post-loop `if best_params:`
    # then raised NameError. Initialize explicitly.
    best_params = None
    self.gini_train = []
    self.gini_valid = []
    for epoch in range(self.epoch):
        t1 = time()
        pre_train = []
        pre_valid = []
        # BUG FIX: loss_train is now reset each epoch; previously the
        # previous epoch's *average* was carried into the next epoch's sum.
        loss_train = 0
        self.shuffle_in_unison_scary(Xi_train_, Xv_train_, y_train_)
        # Ceil division so the final partial batch is trained on.
        total_batch = int((len(y_train_) - 1) / self.batch_size) + 1
        for i in range(total_batch):
            Xi_batch, Xv_batch, y_batch = self.get_batch(
                Xi_train_, Xv_train_, y_train_, self.batch_size, i)
            feed_dict = {
                self.feat_index: np.array(Xi_batch),
                self.feat_value: np.array(Xv_batch),
                self.label: np.array(y_batch).reshape((-1, 1)),
                self.dropout_keep_fm: self.dropout_fm,
                self.dropout_keep_deep: self.dropout_dep,
                self.train_phase: True
            }
            loss, opt, train_out = self.sess.run(
                (self.loss, self.optimizer, self.out), feed_dict=feed_dict)
            loss_train += loss
            pre_train.append(train_out)
        loss_train /= total_batch
        # Flatten per-batch outputs and score the training epoch.
        pre_train = [y for x in pre_train for y in x]
        sig_gini_train = gini_norm(y_train_, pre_train)
        self.gini_train.append(sig_gini_train)

        # Full-batch validation inference with dropout disabled.
        feed_dict = {
            self.feat_index: np.array(Xi_valid_),
            self.feat_value: np.array(Xv_valid_),
            self.label: np.array(y_valid_).reshape((-1, 1)),
            self.dropout_keep_fm: [1.0] * len(self.dropout_fm),
            self.dropout_keep_deep: [1.0] * len(self.dropout_dep),
            self.train_phase: False
        }
        loss_test, valid_out = self.sess.run((self.loss, self.out),
                                             feed_dict=feed_dict)
        pre_valid.append(valid_out)
        pre_valid = [y for x in pre_valid for y in x]
        sig_gini_valid = gini_norm(y_valid_, pre_valid)
        self.gini_valid.append(sig_gini_valid)

        if sig_gini_valid > best_gini:
            # Snapshot every global variable's value as the current best.
            gvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            best_params = {
                gvar.op.name: value
                for gvar, value in zip(gvars, self.sess.run(gvars))
            }
            best_gini = sig_gini_valid
            checks_without_progress = 0
        else:
            checks_without_progress += 1
        print("[%d] train-result=%.4f, valid-result=%.4f [%.1f s]" %
              (epoch + 1, sig_gini_train, sig_gini_valid, time() - t1))
        if checks_without_progress > max_checks_without_progress:
            print('early stopping!')
            break

    # Restore the best snapshot into the live session so the returned model
    # carries the best (not the last) weights.
    if best_params:
        gvars_names = list(best_params.keys())
        # Reuse each variable's auto-generated Assign op and feed its value
        # input directly, avoiding new graph nodes per restore.
        assign_ops = {
            gvar_name:
            tf.get_default_graph().get_operation_by_name(gvar_name + '/Assign')
            for gvar_name in gvars_names
        }
        init_values = {
            gvar_name: assign_op.inputs[1]
            for gvar_name, assign_op in assign_ops.items()
        }
        feed_dict = {
            init_values[gvar_name]: best_params[gvar_name]
            for gvar_name in gvars_names
        }
        self.sess.run(assign_ops, feed_dict=feed_dict)
    return self
def run_base_model_dfm(train_df, test_df, data_folds, params):
    """Train one model per fold and aggregate out-of-fold / test predictions.

    Args:
        train_df: training DataFrame (with labels).
        test_df: test DataFrame (with ids).
        data_folds: list of (train_idx, valid_idx) pairs.
        params: configuration object (numeric_cols, ignore_cols, epochs,
            use_fm, use_deep, sub_dir, ...).

    Returns:
        (train_meta_y, test_meta_y): out-of-fold train predictions and
        fold-averaged test predictions.
    """
    # Parse the raw frames into feature-index / feature-value lists.
    fd = FeatureDictionay(df_train=train_df, df_test=test_df,
                          numeric_cols=params.numeric_cols,
                          ignore_cols=params.ignore_cols)
    data_parser = DataParser(feat_dict=fd)
    train_Xi, train_Xv, train_y = data_parser.parse(df=train_df, has_label=True)
    test_Xi, test_Xv, test_ids = data_parser.parse(df=test_df)

    # get feature size and field size
    feature_size = fd.feat_size
    field_size = len(train_Xi[0])

    n_folds = len(data_folds)
    train_meta_y = np.zeros((train_df.shape[0], 1), dtype=float)
    test_meta_y = np.zeros((test_df.shape[0], 1), dtype=float)

    def _take(seq, indices):
        return [seq[j] for j in indices]

    # metric holders: one CV score per fold, one gini curve row per fold
    gini_results_cv = np.zeros(n_folds, dtype=float)
    gini_results_epoch_train = np.zeros((n_folds, params.epochs), dtype=float)
    gini_results_epoch_valid = np.zeros((n_folds, params.epochs), dtype=float)

    for idx, (train_idx, valid_idx) in enumerate(data_folds):
        fold_Xi, fold_Xv, fold_y = (_take(train_Xi, train_idx),
                                    _take(train_Xv, train_idx),
                                    _take(train_y, train_idx))
        hold_Xi, hold_Xv, hold_y = (_take(train_Xi, valid_idx),
                                    _take(train_Xv, valid_idx),
                                    _take(train_y, valid_idx))

        # construct model, for folds
        dfm = Train(params, feature_size, field_size)
        dfm.training(fold_Xi, fold_Xv, fold_y,
                     hold_Xi, hold_Xv, hold_y,
                     early_stopping=False, refit=False)

        train_meta_y[valid_idx, 0] = dfm.predict(hold_Xi, hold_Xv)
        test_meta_y[:, 0] += dfm.predict(test_Xi, test_Xv)
        gini_results_cv[idx] = gini_norm(hold_y, train_meta_y[valid_idx])

        # Curves may be shorter than params.epochs if training stopped early.
        n_train_pts = len(dfm.train_results)
        n_valid_pts = len(dfm.valid_results)
        gini_results_epoch_train[idx, :n_train_pts] = dfm.train_results
        gini_results_epoch_valid[idx, :n_valid_pts] = dfm.valid_results

    test_meta_y = test_meta_y / float(n_folds)

    # save result
    if params.use_fm and params.use_deep:
        clf_str = "DeepFM"
    elif params.use_fm:
        clf_str = "FM"
    else:
        clf_str = "DNN"
    mean_gini = gini_results_cv.mean()
    std_gini = gini_results_cv.std()
    print("%s: %.5f (%.5f)" % (clf_str, mean_gini, std_gini))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, mean_gini, std_gini)
    _make_submission(test_ids, test_meta_y, params.sub_dir, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return train_meta_y, test_meta_y
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """K-fold DeepFM/FM/DNN runner (logging via _print).

    Args:
        dfTrain: training DataFrame (contains label column).
        dfTest: test DataFrame (contains id column).
        folds: list of (train_idx, valid_idx) pairs.
        dfm_params: hyper-parameter dict; mutated in place with
            "feature_size" and "field_size".

    Returns:
        (y_train_meta, y_test_meta): out-of-fold train predictions and
        fold-averaged test predictions.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    # Xi_x: n_samples x n_features index list — each numeric feature maps to
    # one fixed index, each categorical feature to one index per level.
    # Xv_x: n_samples x n_features value list.
    _print("parse data begin")
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    _print("parse data finish")

    dfm_params["feature_size"] = fd.feat_dim  # max index after one-hot
    dfm_params["field_size"] = len(Xi_train[0])  # raw field count

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    # train_idx / valid_idx index into the full sample set (k-fold split).
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        _print("fit, fold=%d" % i)
        dfm = DeepFM(**dfm_params, n_samples=len(Xi_train_))
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # BUG FIX: clf_str was previously left unbound here, crashing with
        # UnboundLocalError when building the summary line below.
        raise ValueError("at least one of use_fm / use_deep must be True")
    line = "%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _print(line)
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    # Plotting disabled for now.
    #_plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Run k-fold CV for a DeepFM-family model and save a submission file.

    Args:
        dfTrain, dfTest: raw train / test DataFrames.
        folds: iterable of (train_idx, valid_idx) pairs splitting dfTrain.
        dfm_params: DeepFM hyper-parameter dict; "feature_size" and
            "field_size" are filled in here.

    Returns:
        (y_train_meta, y_test_meta): out-of-fold train predictions and
        fold-averaged test predictions.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    # Returns sample feature ids, sample feature values, and labels.
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    # Returns sample feature ids, sample feature values, and sample ids.
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    dfm_params["feature_size"] = fd.feat_dim  # total number of features (indices)
    # Xi_train[0] is the first training sample; its length is the number of fields.
    dfm_params["field_size"] = len(Xi_train[0])

    # Intermediate buffers, one row per sample.
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    # Small helper: select the elements of x at the positions listed in l.
    _get = lambda x, l: [x[i] for i in l]
    # len(folds) is the number of train/valid splits (k in k-fold).
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    # Train the model k (= len(folds)) times; mainly useful when training
    # data is scarce — with abundant data the loop is unnecessary.
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)
        dfm = DeepFM(**dfm_params)  # build the network
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)  # fit
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)  # predict on validation fold
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)  # predict on test set
        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result
    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # FIX: previously this fell through and crashed with NameError on
        # the print below when both flags were false.
        raise ValueError("dfm_params must enable at least one of use_fm/use_deep")
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
# NOTE(review): fragment of a per-epoch evaluation loop — the enclosing
# function and loop headers are outside this chunk, so the grouping of the
# post-loop statements below is reconstructed; confirm against the full file.

# Forward pass on one training batch with dropout disabled (keep-prob 1.0)
# and the training flag off, to score the training data.
feed_dict = {
    dfm.feat_index: np.array(Xi_batch),
    dfm.feat_value: np.array(Xv_batch),
    dfm.label: np.array(y_batch).reshape((-1, 1)),
    dfm.dropout_keep_fm: [1.0] * len(dfm.dropout_fm),
    dfm.dropout_keep_deep: [1.0] * len(dfm.dropout_deep),
    dfm._training: False
}
loss, train_out = sess.run((dfm.loss, dfm.out), feed_dict=feed_dict)
# Accumulate per-batch predictions into one flat array.
if i == 0:
    pre_train = np.reshape(train_out, (num_batch, ))
else:
    pre_train = np.concatenate(
        (pre_train, np.reshape(train_out, (num_batch, ))))
sig_gini_train = gini_norm(y_train_, pre_train)
gini_train.append(sig_gini_train)
# Score the whole validation set in a single session run (no batching).
feed_dict = {
    dfm.feat_index: np.array(Xi_valid_),
    dfm.feat_value: np.array(Xv_valid_),
    dfm.label: np.array(y_valid_).reshape((-1, 1)),
    dfm.dropout_keep_fm: [1.0] * len(dfm.dropout_fm),
    dfm.dropout_keep_deep: [1.0] * len(dfm.dropout_deep),
    dfm._training: False
}
loss_test, valid_out = sess.run((dfm.loss, dfm.out), feed_dict=feed_dict)
pre_valid.append(valid_out)
# Flatten the list of per-row outputs into a single flat list.
pre_valid = [y for x in pre_valid for y in x]
sig_gini_valid = gini_norm(y_valid_, pre_valid)
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params,
                        NUMERIC_COLS, IGNORE_COLS,
                        application='classification'):
    """Run k-fold CV for a DeepFM-family model, score by task type, and
    save a submission file.

    Args:
        dfTrain, dfTest: raw train / test DataFrames.
        folds: iterable of (train_idx, valid_idx) pairs splitting dfTrain.
        dfm_params: DeepFM hyper-parameter dict; "feature_size" and
            "field_size" are filled in here.
        NUMERIC_COLS, IGNORE_COLS: column lists for the feature dictionary.
        application: 'classification' (AUC), 'regression' (RMSE), or
            anything else (normalized Gini).

    Returns:
        (y_train_meta, y_test_meta): out-of-fold train predictions and
        fold-averaged test predictions.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=NUMERIC_COLS,
                           ignore_cols=IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    # FIX: the test set must be parsed without labels so that the third
    # return value is the sample ids used for the submission file
    # (previously has_label=True made ids_test receive labels instead).
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    results_cv = np.zeros(len(folds), dtype=float)
    results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)
        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)
        # Score the held-out fold with a task-appropriate metric.
        if application == 'classification':
            results_cv[i] = roc_auc_score(y_valid_, y_train_meta[valid_idx])
        elif application == 'regression':
            results_cv[i] = np.sqrt(
                mean_squared_error(y_valid_, y_train_meta[valid_idx]))
        else:
            results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        results_epoch_train[i] = dfm.train_result
        results_epoch_valid[i] = dfm.valid_result
    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # FIX: previously this fell through and crashed with NameError on
        # the print below when both flags were false.
        raise ValueError("dfm_params must enable at least one of use_fm/use_deep")
    print("%s: rmse/accuracy/gini is %.4f (std is %.4f)" % (clf_str, results_cv.mean(), results_cv.std()))
    filename = "%s_Mean%.5f.csv" % (clf_str, results_cv.mean())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(results_epoch_train, results_epoch_valid, clf_str, application)
    return y_train_meta, y_test_meta
def my_fit(self, Xi_train_, Xv_train_, y_train_, Xi_valid_=None, Xv_valid_=None, y_valid_=None):
    """Train the network, track normalized Gini per epoch, and restore the
    best-on-validation weights into the session before returning.

    Args:
        Xi_train_, Xv_train_, y_train_: training feature indices / values / labels.
        Xi_valid_, Xv_valid_, y_valid_: validation data used for model selection.

    Returns:
        self, with the session holding the parameters of the epoch with the
        best validation Gini.
    """
    self.close_session()
    self._init_graph()
    self._session = tf.Session()
    self._init = tf.global_variables_initializer()
    self._session.run(self._init)

    max_checks_without_progress = 10
    checks_without_progress = 0
    best_gini = 0
    # FIX: initialize so the restore step at the bottom cannot raise a
    # NameError when no epoch ever improves on best_gini.
    best_params = None
    loss_train = 0
    loss_test = 0
    self.gini_train = []
    self.gini_valid = []
    for epoch in range(self.epoch):
        t1 = time()
        pre_train = []
        pre_valid = []
        self.shuffle_in_unison_scary(Xi_train_, Xv_train_, y_train_)
        total_batch = int((len(y_train_) - 1) / self.batch_size) + 1
        for i in range(total_batch):
            Xi_batch, Xv_batch, y_batch = self.get_batch(
                Xi_train_, Xv_train_, y_train_, self.batch_size, i)
            feed_dict = {
                self.feat_index: np.array(Xi_batch),
                self.feat_value: np.array(Xv_batch),
                self.label: np.array(y_batch).reshape((-1, 1)),
                self.dropout_keep_fm: self.dropout_fm,
                self.dropout_keep_deep: self.dropout_deep,
                self._training: True
            }
            loss, opt, train_out = self._session.run(
                (self.loss, self.train_step, self.out), feed_dict=feed_dict)
        # NOTE(review): loss_train is never accumulated in the batch loop,
        # so this division leaves it at 0 — kept for compatibility.
        loss_train /= total_batch
        # Epoch-level Gini on the full training and validation sets.
        pre_train = self.my_predict_prob(Xi_train_, Xv_train_)
        sig_gini_train = gini_norm(y_train_, pre_train)
        self.gini_train.append(sig_gini_train)
        pre_valid = self.my_predict_prob(Xi_valid_, Xv_valid_)
        sig_gini_valid = gini_norm(y_valid_, pre_valid)
        self.gini_valid.append(sig_gini_valid)
        if sig_gini_valid > best_gini:
            # Snapshot all graph variables at the new best epoch.
            gvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            best_params = {
                gvar.op.name: value
                for gvar, value in zip(gvars, self._session.run(gvars))
            }
            best_gini = sig_gini_valid
            checks_without_progress = 0
        else:
            checks_without_progress += 1
        print("[%d] train-result=%.4f, valid-result=%.4f [%.1f s]"
              % (epoch + 1, sig_gini_train, sig_gini_valid, time() - t1))
        # Uncomment to enable early stopping:
        # if checks_without_progress > max_checks_without_progress:
        #     print('early stopping!')
        #     break
    # Write the best snapshot back into the model variables so the session
    # ends with the best model rather than the last epoch's.
    if best_params:
        gvars_names = list(best_params.keys())
        assign_ops = {
            gvar_name:
            tf.get_default_graph().get_operation_by_name(gvar_name + '/Assign')
            for gvar_name in gvars_names
        }
        init_values = {
            gvar_name: assign_op.inputs[1]
            for gvar_name, assign_op in assign_ops.items()
        }
        feed_dict = {
            init_values[gvar_name]: best_params[gvar_name]
            for gvar_name in gvars_names
        }
        self._session.run(assign_ops, feed_dict=feed_dict)
    return self
# NOTE(review): fragment of a PyTorch epoch/batch training loop — the
# enclosing function, the batch loop header, and the body of the final
# `if` are outside this chunk, so the statement grouping below (inside vs.
# after the batch loop) is reconstructed; confirm against the full file.

# Per-batch step: collect probabilities, compute loss, backprop.
prob = F.sigmoid(outputs)
y_pre.append(prob.data.cpu().numpy())
loss = criterion(outputs, y_batch)
# _, predicted = torch.max(outputs.data, 1)
loss_train += loss.item()
###########backward and optimize
torch_optim.zero_grad()
loss.backward()
torch_optim.step()
# if (i + 1) % 10 == 0:
#     print('steps:[%d],train_loss:[%.3f]' % (i + 1, loss.item()))
# End-of-epoch bookkeeping: concatenate per-batch arrays, compute Gini,
# average loss over steps, and evaluate on the validation loader.
y_true = np.concatenate(y_true, axis=0)
y_pre = np.concatenate(y_pre, axis=0)
gini_train = gini_norm(y_true, y_pre)
loss_train /= steps
loss_val, gini_val = val(model, valid_dataloader)
history['loss_train'].append(loss_train)
history['loss_val'].append(loss_val)
history['gini_train'].append(gini_train)
history['gini_val'].append(gini_val)
# Checkpoint on best validation loss; count stagnant epochs otherwise.
if loss_val < best_loss:
    torch.save(model.state_dict(), check_file)
    best_loss = loss_val
    checks_without_progress = 0
else:
    checks_without_progress += 1
# Early-stopping trigger; the body continues beyond this chunk.
if checks_without_progress > max_checks_without_progress:
def training(self, train_Xi, train_Xv, train_y,
             valid_Xi=None, valid_Xv=None, valid_y=None,
             early_stopping=False, refit=False):
    """Train the PyTorch DeepFM model.

    Runs self.params.epochs epochs of mini-batch training, records
    normalized Gini on train (and validation, if given) after each epoch,
    optionally early-stops, and optionally refits on train+valid until the
    train score reaches the best score seen during CV.

    Args:
        train_Xi, train_Xv, train_y: training feature indices / values / labels.
        valid_Xi, valid_Xv, valid_y: optional validation data.
        early_stopping: stop when self.training_termination() fires on the
            validation results.
        refit: after training, merge train+valid and keep training (up to 99
            extra epochs) until the train score matches best_train_score.
    """
    need_valid = False
    if valid_Xi is not None:
        need_valid = True
    ## construct optimizer
    optimizer_type = self.params.optimizer
    if optimizer_type == "sgd":
        optimizer = optim.SGD(self.deepfm.parameters(), lr=self.params.learning_rate)
    else:
        # Any value other than "sgd" falls through to Adam.
        optimizer = optim.Adam(self.deepfm.parameters(), lr=self.params.learning_rate,
                               betas=(0.9, 0.99), eps=1e-8, amsgrad=True)
    ##
    loss_type = self.params.loss_type
    for epoch in range(1, self.params.epochs + 1):
        self.deepfm.train()
        t1 = time.time()
        self.shuffle_in_unison_scary(train_Xi, train_Xv, train_y)
        # NOTE(review): when len(train_y) is an exact multiple of batch_size
        # this produces one extra (empty) batch — confirm get_batch tolerates it.
        total_batch = int(len(train_y) / self.params.batch_size) + 1
        for i in range(total_batch):
            batch_Xi, batch_Xv, batch_y = self.get_batch(train_Xi, train_Xv, train_y,
                                                         self.params.batch_size, i)
            batch_Xi = torch.tensor(batch_Xi, dtype=torch.long)
            batch_Xv = torch.tensor(batch_Xv, dtype=torch.float)
            batch_y = torch.tensor(batch_y, dtype=torch.long)
            optimizer.zero_grad()
            output, _, _, _ = self.deepfm(batch_Xi, batch_Xv)
            if loss_type == "logloss":
                # for classification — manual binary cross-entropy.
                # NOTE(review): F.sigmoid is deprecated in modern torch
                # (use torch.sigmoid); kept as-is here.
                output = F.sigmoid(output)
                loss = -torch.mul(batch_y, torch.log(output)) \
                    - torch.mul((1-batch_y), torch.log(1-output))
                loss = torch.mean(loss)
            elif loss_type == "mse":
                # for regression
                loss = F.mse_loss(input=output, target=batch_y)
            else:
                raise ValueError("Unknown loss type, should be one of 'logloss/mes'")
            # l2 regularization on weights for preventing over-fitting
            if self.params.l2_reg > 0:
                loss += self.params.l2_reg * torch.norm(self.deepfm.final_W, 2)
                if self.params.use_deep:
                    for weight in self.deepfm.deep_layers:
                        loss += self.params.l2_reg * torch.norm(weight.W, 2)
            print("epoch: %d, loss: %.4f" % (epoch, loss.item()))
            # backward
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.deepfm.parameters(), self.params.grad_clip)
            optimizer.step()
        # each epoch with evaluate training and validation datasets
        train_pred_y = self.predict(train_Xi, train_Xv)
        self.train_results.append(gini_norm(train_y, train_pred_y))
        if need_valid:
            valid_pred_y = self.predict(valid_Xi, valid_Xv)
            self.valid_results.append(gini_norm(valid_y, valid_pred_y))
            print("epoch: %d, train-result: %.4f, valid-result: %.4f, "
                  "cost-time: %.2f s" % (epoch, self.train_results[-1],
                                         self.valid_results[-1], time.time() - t1))
        else:
            print("epoch: %d, train-result: %.4f, cost-time: %.2f s"
                  % (epoch, self.train_results[-1], time.time() - t1))
        if need_valid and early_stopping and self.training_termination(self.valid_results):
            print("Early Stopping!!")
            break
    # fit a few more epochs on train+valid until result reaches the best_train_score
    if need_valid and refit:
        greater_is_better = self.params.greater_is_better
        if greater_is_better:
            best_valid_score = max(self.valid_results)
        else:
            best_valid_score = min(self.valid_results)
        best_epoch = self.valid_results.index(best_valid_score)
        best_train_score = self.train_results[best_epoch]
        # Merge validation data into the training set for the refit phase.
        train_Xi = train_Xi + valid_Xi
        train_Xv = train_Xv + valid_Xv
        train_y = train_y + valid_y
        for epoch in range(1, 100):
            self.shuffle_in_unison_scary(train_Xi, train_Xv, train_y)
            total_batch = int(len(train_y) / self.params.batch_size) + 1
            for i in range(total_batch):
                batch_Xi, batch_Xv, batch_y = self.get_batch(train_Xi, train_Xv, train_y,
                                                             self.params.batch_size, i)
                batch_Xi = torch.tensor(batch_Xi, dtype=torch.long)
                batch_Xv = torch.tensor(batch_Xv, dtype=torch.float)
                batch_y = torch.tensor(batch_y, dtype=torch.long)
                optimizer.zero_grad()
                output, _, _, _ = self.deepfm(batch_Xi, batch_Xv)
                if loss_type == "logloss":
                    # for classification
                    output = F.sigmoid(output)
                    loss = -torch.mul(batch_y, torch.log(output)) \
                        - torch.mul((1 - batch_y), torch.log(1 - output))
                    loss = torch.mean(loss)
                elif loss_type == "mse":
                    # for regression
                    loss = F.mse_loss(input=output, target=batch_y)
                else:
                    raise ValueError("Unknown loss type, should be one of 'logloss/mes'")
                # l2 regularization on weights for preventing over-fitting
                if self.params.l2_reg > 0:
                    loss += self.params.l2_reg * torch.norm(self.deepfm.final_W, 2)
                    if self.params.use_deep:
                        for weight in self.deepfm.deep_layers:
                            loss += self.params.l2_reg * torch.norm(weight.W, 2)
                print("epoch: %d, loss: %.4f" % (epoch, loss.item()))
                # backward
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.deepfm.parameters(), self.params.grad_clip)
                optimizer.step()
            # check: stop refitting once the train score is close to, or
            # better than, the best CV-epoch train score.
            train_pred_y = self.predict(train_Xi, train_Xv)
            train_result = gini_norm(train_y, train_pred_y)
            if abs(train_result - best_train_score) < 0.001 \
                    or (greater_is_better and train_result > best_train_score) \
                    or ((not greater_is_better) and train_result < best_train_score):
                print("Find best train score!!")
                break