def run_base_model_nfm(dfTrain, dfTest, folds, pnn_params):
    """Train an AFM model on every CV fold of the parsed training data.

    NOTE(review): despite the *_nfm* name this variant instantiates AFM —
    confirm against the caller which model is intended.
    """
    feat_dict = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                                  numeric_cols=config.NUMERIC_COLS,
                                  ignore_cols=config.IGNORE_COLS)
    parser = DataParser(feat_dict=feat_dict)
    # Xi_*: per-sample feature indices; Xv_*: the matching feature values.
    Xi_train, Xv_train, y_train = parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = parser.parse(df=dfTest)
    print(dfTrain.dtypes)
    pnn_params['feature_size'] = feat_dict.feat_dim
    pnn_params['field_size'] = len(Xi_train[0])

    def _take(seq, indices):
        # Select the rows of `seq` addressed by `indices`.
        return [seq[j] for j in indices]

    for train_idx, valid_idx in folds:
        Xi_tr = _take(Xi_train, train_idx)
        Xv_tr = _take(Xv_train, train_idx)
        y_tr = _take(y_train, train_idx)
        Xi_va = _take(Xi_train, valid_idx)
        Xv_va = _take(Xv_train, valid_idx)
        y_va = _take(y_train, valid_idx)
        afm = AFM(**pnn_params)
        afm.fit(Xi_tr, Xv_tr, y_tr, Xi_va, Xv_va, y_va)
def run_base_model_nfm(dfTrain, dfTest, folds, pnn_params):
    """Train an NFM model on every CV fold of the parsed training data."""
    feat_dict = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                                  numeric_cols=config.NUMERIC_COLS,
                                  ignore_cols=config.IGNORE_COLS)
    parser = DataParser(feat_dict=feat_dict)
    # Xi_*: per-sample feature indices; Xv_*: the matching feature values.
    Xi_train, Xv_train, y_train = parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = parser.parse(df=dfTest)
    #print(dfTrain.dtypes)
    pnn_params['feature_size'] = feat_dict.feat_dim
    pnn_params['field_size'] = len(Xi_train[0])

    def _take(seq, indices):
        # Select the rows of `seq` addressed by `indices`.
        return [seq[j] for j in indices]

    for train_idx, valid_idx in folds:
        Xi_tr = _take(Xi_train, train_idx)
        Xv_tr = _take(Xv_train, train_idx)
        y_tr = _take(y_train, train_idx)
        Xi_va = _take(Xi_train, valid_idx)
        Xv_va = _take(Xv_train, valid_idx)
        y_va = _take(y_train, valid_idx)
        nfm = NFM(**pnn_params)
        nfm.fit(Xi_tr, Xv_tr, y_tr, Xi_va, Xv_va, y_va)
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Cross-validate the model selected by dfm_params["module_name"],
    write a submission CSV and plot the per-epoch gini curves.

    Returns:
        (y_train_meta, y_test_meta): out-of-fold predictions on the train
        set and fold-averaged predictions on the test set.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)
        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)
        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result
    # Fold-average the accumulated test predictions.
    y_test_meta /= float(len(folds))
    # save result
    # BUGFIX: clf_str used to be assigned only inside the branches below, so
    # an unrecognised module_name (or "DeepFM" with both use_fm and use_deep
    # False) raised NameError at the print; default to the raw module name.
    clf_str = dfm_params["module_name"]
    if dfm_params["module_name"] == "DeepFM":
        if dfm_params["use_fm"] and dfm_params["use_deep"]:
            clf_str = "DeepFM"
        elif dfm_params["use_fm"]:
            clf_str = "FM"
        elif dfm_params["use_deep"]:
            clf_str = "DNN"
    elif dfm_params["module_name"] == "LR":
        clf_str = "LR"
    elif dfm_params["module_name"] == "WideDeep":
        clf_str = "WideDeep"
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(),
                                            gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Cross-validate a 4-class DeepFM, score folds with label-ranking
    average precision, and write the fold-averaged test predictions.

    Returns:
        (y_train_meta, y_test_meta): out-of-fold class scores on the train
        set and fold-averaged class scores on the test set, each (n, 4).
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest)
    data_parser = DataParser(feat_dict=fd)
    # Converts rows into Xi: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ...]
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    # One column per class (4 classes).
    y_train_meta = np.zeros((dfTrain.shape[0], 4), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 4), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)
        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
        y_train_meta[valid_idx] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0:4] += dfm.predict(Xi_test, Xv_test)
        # CLEANUP: a dead per-fold one-hot conversion of y_train_meta
        # (argmax -> indicator matrix `b`) was computed here and never used;
        # it has been removed.
        gini_results_cv[i] = label_ranking_average_precision_score(
            y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result
    # Fold-average the accumulated test predictions.
    y_test_meta /= float(len(folds))
    # save result
    # BUGFIX: clf_str was unbound (NameError) when both use_fm and use_deep
    # were False; fall back to a generic label.
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        clf_str = "Model"
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    _final_result(ids_test, y_test_meta, filename="result.csv")
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params, save_path: str, past_epoch: int = 0):
    """Cross-validate a single shared DeepFM instance with checkpointing.

    Unlike the usual per-fold retrain, ONE model is reused across all folds;
    it is optionally restored from `save_path` at `past_epoch` and saved
    again after every fold.

    Returns:
        (y_train_meta, y_test_meta, dfm): out-of-fold train predictions,
        fold-averaged test predictions, and the trained model.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    dfm = DeepFM(**dfm_params)
    if past_epoch != 0:
        # Resume training from the previously saved checkpoint.
        dfm.saver.restore(dfm.sess, save_path + '-' + str(past_epoch))
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)
        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        # Checkpoint after each fold; global_step counts total epochs trained.
        dfm.saver.save(dfm.sess, save_path,
                       global_step=past_epoch + dfm_params["epoch"] * (i + 1))
    y_test_meta /= float(len(folds))
    # save result
    # BUGFIX: clf_str was unbound (NameError) when both use_fm and use_deep
    # were False; fall back to a generic label.
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        clf_str = "Model"
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    # The shared model accumulates per-epoch results across folds, hence a
    # single row of length epoch * n_folds.
    gini_results_epoch_train = np.zeros((1, dfm_params["epoch"] * len(folds)), dtype=float)
    gini_results_epoch_valid = np.zeros((1, dfm_params["epoch"] * len(folds)), dtype=float)
    gini_results_epoch_train[0] = dfm.train_result
    gini_results_epoch_valid[0] = dfm.valid_result
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta, dfm
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Cross-validate a DeepFM/FM/DNN, write a submission CSV and plot the
    per-epoch gini curves.

    Returns:
        (y_train_meta, y_test_meta): out-of-fold predictions on the train
        set and fold-averaged predictions on the test set.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)
        # dfm_params at this point also carries the derived feature_size /
        # field_size keys set above (see commented dump in the repo history).
        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)
        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        # train_result / valid_result hold the per-epoch gini on the
        # train / validation split.
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result
    # y_test_meta is the sum over the k folds; divide to get the average.
    y_test_meta /= float(len(folds))
    # save result
    # BUGFIX: clf_str was unbound (NameError) when both use_fm and use_deep
    # were False; fall back to a generic label.
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        clf_str = "Model"
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Cross-validate a DeepFM/FM/DNN, write a submission CSV and plot the
    per-epoch gini curves.

    Returns:
        (y_train_meta, y_test_meta): out-of-fold predictions on the train
        set and fold-averaged predictions on the test set.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    # Xi_train: per-row feature indices; Xv_train: the corresponding values.
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    print(dfTrain.dtypes)
    dfm_params['feature_size'] = fd.feat_dim
    dfm_params['field_size'] = len(Xi_train[0])
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params['epoch']), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params['epoch']), dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)
        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)
        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result
    # Fold-average the accumulated test predictions.
    y_test_meta /= float(len(folds))
    # save result
    # BUGFIX: clf_str was unbound (NameError) when both use_fm and use_deep
    # were False; fall back to a generic label.
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        clf_str = "Model"
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Train a DeepCFM (with multi-value features) on the FIRST fold only
    and write its test predictions to submission1.csv.

    Returns:
        y_test_meta: test-set predictions from the single trained fold.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS,
                           multi_value_cols=config.MULTI_VALUE_COLS)
    parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, Xmv_train, y_train = parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, Xmv_test, ids_test = parser.parse(df=dfTest)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    dfm_params["vocab_size"] = fd.vocab_size
    dfm_params["num_multiVal_feat"] = len(fd.multi_value_cols)
    dfm_params["sequence_length"] = config.MAXLEN
    print(dfm_params)
    # Release the parsing helpers before training to lower peak memory.
    del fd
    del parser
    gc.collect()
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

    def _rows(seq, idx):
        return [seq[j] for j in idx]

    for train_idx, valid_idx in folds:
        model = DeepCFM(**dfm_params)
        model.fit(_rows(Xi_train, train_idx), _rows(Xv_train, train_idx),
                  _rows(Xmv_train, train_idx), _rows(y_train, train_idx),
                  _rows(Xi_train, valid_idx), _rows(Xv_train, valid_idx),
                  _rows(Xmv_train, valid_idx), _rows(y_train, valid_idx))
        y_test_meta[:, 0] += model.predict(Xi_test, Xv_test, Xmv_test)
        break  # deliberately train on the first fold only
    # save result
    _make_submission(ids_test, y_test_meta, "submission1.csv")
    return y_test_meta
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params, label2current_service):
    """K-fold DeepFM for multi-class service prediction.

    Scores each fold with macro F1 on its validation split, majority-votes
    the per-fold test predictions, maps labels back through
    `label2current_service` and writes result.csv.

    Returns:
        (y_train_meta, y_test_meta): zero-filled placeholders (kept for
        signature parity with the sibling runners).
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = parser.parse(df=dfTest)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    xx_score = []
    cv_pred = []
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

    def _rows(seq, idx):
        return [seq[j] for j in idx]

    for fold_no, (train_idx, valid_idx) in enumerate(folds):
        Xi_va = _rows(Xi_train, valid_idx)
        Xv_va = _rows(Xv_train, valid_idx)
        y_va = _rows(y_train, valid_idx)
        model = DeepFM(**dfm_params)
        model.fit(_rows(Xi_train, train_idx), _rows(Xv_train, train_idx),
                  _rows(y_train, train_idx), Xi_va, Xv_va, y_va)
        valid_pred = model.predict(Xi_va, Xv_va)
        xx_score.append(f1_score(y_va, valid_pred, average='macro'))
        test_pred = np.asarray(model.predict(Xi_test, Xv_test)).reshape(-1, 1)
        # Stack per-fold test predictions column-wise for the vote below.
        cv_pred = test_pred if fold_no == 0 else np.hstack((cv_pred, test_pred))
    # Majority vote across folds for every test row.
    submit = [np.argmax(np.bincount(row)) for row in cv_pred]
    # 保存结果
    df_test = pd.DataFrame()
    df_test['id'] = list(ids_test)
    df_test['predict'] = submit
    df_test['predict'] = df_test['predict'].map(label2current_service)
    df_test.to_csv('result.csv', index=False)
    print(xx_score, np.mean(xx_score))
    return y_train_meta, y_test_meta
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """K-fold DeepFM for binary classification, thresholding the predicted
    probabilities at 0.5 and printing per-fold accuracy.

    Returns:
        (y_train_meta, y_test_meta): out-of-fold 0/1 predictions on the
        train set and fold-averaged raw probabilities on the test set.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = parser.parse(df=dfTest)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    # Allocated like the sibling runners; not read in this variant.
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    def _rows(seq, idx):
        return [seq[j] for j in idx]

    # k folds; each fit() internally runs `epoch` passes fed batch by batch.
    for train_idx, valid_idx in folds:
        Xi_va = _rows(Xi_train, valid_idx)
        Xv_va = _rows(Xv_train, valid_idx)
        y_va = _rows(y_train, valid_idx)
        model = DeepFM(**dfm_params)
        # fit() also evaluates train and valid internally.
        model.fit(_rows(Xi_train, train_idx), _rows(Xv_train, train_idx),
                  _rows(y_train, train_idx), Xi_va, Xv_va, y_va)
        preds = model.predict(Xi_va, Xv_va)
        # predict() returns probabilities; binarise at 0.5 in place.
        for k in range(len(preds)):
            preds[k] = 0 if preds[k] <= 0.5 else 1
        print("accuracy_score(y_valid_, yy):", accuracy_score(y_va, preds))
        y_train_meta[valid_idx, 0] = preds
        y_test_meta[:, 0] += model.predict(Xi_test, Xv_test)
    y_test_meta /= float(len(folds))
    return y_train_meta, y_test_meta
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Cross-validate a DeepFM/FM/DNN, write a submission CSV and plot the
    per-epoch gini curves.

    Returns:
        (y_train_meta, y_test_meta): out-of-fold predictions on the train
        set and fold-averaged predictions on the test set.
    """
    # Build the feature dictionary over train + test (IGNORE_COLS are not
    # filtered out of the frames themselves, only out of the dictionary).
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    # BUGFIX: the trailing note on the next call was plain (un-commented)
    # text and made this function a SyntaxError; it is now a comment.
    # parse() returns the processed data: indices, values and labels.
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    dfm_params["feature_size"] = fd.feat_dim  # feature count after one-hot
    dfm_params["field_size"] = len(Xi_train[0])  # number of fields
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):  # k-fold split
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)
        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)  # k-fold OOF
        # Each fold predicts the test set once; accumulate the predictions.
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)
        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result
    # Average the accumulated test predictions over the folds.
    y_test_meta /= float(len(folds))
    # save result
    # BUGFIX: clf_str was unbound (NameError) when both use_fm and use_deep
    # were False; fall back to a generic label.
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        clf_str = "Model"
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(),
                                            gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    # Return the OOF train predictions and the averaged test predictions.
    return y_train_meta, y_test_meta
def run_base_model_dfm(dfTrain, dfTest, folds, prefix, dfm_params):
    """Train a Keras DeepFM per CV fold with checkpointing, early stopping
    and TensorBoard logging; checkpoint/log dirs are namespaced by `prefix`
    and the fold index."""
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = parser.parse(df=dfTrain, has_label=True)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    def _rows(seq, idx):
        return [seq[j] for j in idx]

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_tr = np.array(_rows(Xi_train, train_idx), dtype='int32')
        Xv_tr = np.array(_rows(Xv_train, train_idx), dtype='float32')
        y_tr = np.array(_rows(y_train, train_idx), dtype=np.int8)
        Xi_va = np.array(_rows(Xi_train, valid_idx), dtype='int32')
        Xv_va = np.array(_rows(Xv_train, valid_idx), dtype='float32')
        y_va = np.array(_rows(y_train, valid_idx), dtype=np.int8)

        model = DeepFM(**dfm_params).build_model()
        model.compile(optimizer='adam', loss='binary_crossentropy',
                      metrics=[keras.metrics.AUC(name='auc')])
        print(model.summary())

        checkpoint_dir = "../checkpoints/{}_cpt_".format(prefix) + str(i)
        log_dir = "../logs/{}_train_logs_".format(prefix) + str(i)
        # Start each fold from a clean slate.
        for directory in (checkpoint_dir, log_dir):
            shutil.rmtree(directory, ignore_errors=True)
            os.makedirs(directory, exist_ok=True)
        checkpoint_path = os.path.join(checkpoint_dir, "weights.hdf5")
        callbacks = [
            ModelCheckpoint(checkpoint_path, monitor="val_loss", save_best_only=True),
            EarlyStopping(patience=5, monitor="val_loss"),
            TensorBoard(log_dir=log_dir),
        ]
        model.fit((Xi_tr, Xv_tr), y_tr,
                  epochs=50,
                  # epochs=1,
                  batch_size=64,
                  validation_data=((Xi_va, Xv_va), y_va),
                  verbose=2,
                  callbacks=callbacks)
def k_fold_cross_valid(dfTrain, X_submission, folds, pnn_params, train_params):
    """K-fold training of a PNN; numeric columns are inferred from dtypes.

    Columns typed float32 — or int64 except the 'pref_month*' group — are
    treated as numeric; no columns are ignored.
    """
    numeric_cols = [
        col for col in dfTrain.columns
        if str(dfTrain[col].dtype) == 'float32'
        or (str(dfTrain[col].dtype) == 'int64' and col[:10] != 'pref_month')
    ]
    ignore_cols = []
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=X_submission,
                           numeric_cols=numeric_cols, ignore_cols=ignore_cols)
    parser = DataParser(feat_dict=fd)
    # Xi: per-row feature indices; Xv: the corresponding values. Indices and
    # values are kept separate (for tf embedding lookup) rather than as a
    # one-hot matrix, which is why imblearn-style oversampling cannot be
    # applied directly here.
    Xi_train, Xv_train, y_train = parser.parse(df=dfTrain, has_label=True)
    Xi_submission, Xv_submission, ids_submission = parser.parse(df=X_submission)
    # print(y_train)
    pnn_params['feature_size'] = fd.feat_dim  # total width incl. one-hot expansion
    pnn_params['field_size'] = len(Xi_train[0])  # fields, one-hot groups counted once

    def _rows(seq, idx):
        return [seq[j] for j in idx]

    for train_idx, valid_idx in folds:
        pnn = PNN(**pnn_params)
        train_iter = tf.data.Dataset.from_tensor_slices(
            (_rows(Xi_train, train_idx), _rows(Xv_train, train_idx),
             _rows(y_train, train_idx))).batch(train_params['batch_size'])
        test_iter = tf.data.Dataset.from_tensor_slices(
            (_rows(Xi_train, valid_idx), _rows(Xv_train, valid_idx),
             _rows(y_train, valid_idx))).batch(train_params['batch_size'])
        train(pnn, train_iter, test_iter, **train_params)
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params, numerical_cols=None):
    """Fit a DeepFM on the full training set and evaluate on the test set.

    BUGFIX: the last parameter previously read ``numerical_cols = )`` — an
    empty default that is a SyntaxError; it now defaults to None.  (`folds`
    and `numerical_cols` are currently unused by this variant; a dangling
    ``'''`` at the end of the original text was also removed.)
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    # Note: the test frame is parsed WITH labels so it can be evaluated below.
    Xi_test, Xv_test, y_test = data_parser.parse(df=dfTest, has_label=True)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    dfm = DeepFM(**dfm_params)
    dfm.fit(Xi_train, Xv_train, y_train, early_stopping=True)
    pred = dfm.predict(Xi_test, Xv_test)
    print(pred)
    dfm.evaluate(Xi_test, Xv_test, y_test)
def __init__(self, TRAINVALTEST_DENSE_X, TRAINVALTEST_DENSE_X_NAMES,
             TRAINVAL_SPARSE_X, TRAINVAL_SPARSE_X_NAMES,
             TEST_SPARSE_X, TEST_SPARSE_X_NAMES,
             UF_VW, ADF, TRAINVAL, UF_CSV, TRAINVAL_MERGE,
             TEST_MERGE, TEST, name='deepffm', USE_TINY=False, RANDOMSTATE=2018):
    """Build and train the DeepFM-based "deepffm" model.

    Delegates the data-set wiring to the parent class, then: configures the
    DeepFM hyper-parameters, splits train/val 90/10 by position, parses all
    three frames, fits with early stopping + refit, predicts the test set,
    writes submission_dffm.csv and persists the fitted model.
    NOTE(review): this constructor does heavy training and file I/O as a
    side effect — confirm that is intended at construction time.
    """
    super(DFFM, self).__init__(
        TRAINVALTEST_DENSE_X, TRAINVALTEST_DENSE_X_NAMES,
        TRAINVAL_SPARSE_X, TRAINVAL_SPARSE_X_NAMES,
        TEST_SPARSE_X, TEST_SPARSE_X_NAMES,
        UF_VW, ADF, TRAINVAL, UF_CSV, TRAINVAL_MERGE,
        TEST_MERGE, TEST, name, USE_TINY, RANDOMSTATE=2018)
    # Stray note kept from the original source (a no-op string statement):
    '''In Ridge, only 'sag' solver can currently fit the intercept when X is sparse.'''
    # DeepFM hyper-parameters; feature_size/field_size are filled in below
    # after parsing.
    dfm_params = {
        "use_fm": True,
        "use_deep": True,
        "embedding_size": 8,
        "dropout_fm": [1.0, 1.0],
        "deep_layers": [32, 32],
        "dropout_deep": [0.5, 0.5, 0.5],
        "deep_layers_activation": tf.nn.relu,
        "epoch": 72,
        "batch_size": 1024,
        "learning_rate": 0.001,
        "optimizer_type": "adam",
        "batch_norm": 1,
        "batch_norm_decay": 0.995,
        "l2_reg": 0.01,
        "verbose": True,
        "eval_metric": roc_auc_score,
        "random_seed": 2018
    }
    dfTrainVal, dfTest = self.ds.load_TrainVal_Test()
    fd = FeatureDictionary(dfTrain=dfTrainVal, dfTest=dfTest,
                           numeric_cols=[], ignore_cols=[])
    data_parser = DataParser(feat_dict=fd)
    # Alternative random split kept for reference:
    #dfTrain_x, dfVal_x, dfTrain_y, dfVal_y =train_test_split(dfTrainVal.drop(['label'],axis=1)\
    #    ,dfTrainVal['label'],test_size=0.1, random_state=self.randomstate)
    #dfTrain=pd.DataFrame([dfTrain_x,dfTrain_y])
    #dfVal=pd.DataFrame([dfVal_x,dfVal_y])
    #print dfTrain.shape
    # Positional (not shuffled) 90/10 train/validation split.
    devideline = int(0.9 * len(dfTrainVal))
    dfTrain = dfTrainVal[:devideline]
    dfVal = dfTrainVal[devideline:]
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_valid, Xv_valid, y_valid = data_parser.parse(df=dfVal, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    self.clf = DeepFM(**dfm_params)
    # fit a DeepFM model
    self.clf.fit(Xi_train, Xv_train, y_train, Xi_valid, Xv_valid, y_valid,
                 early_stopping=True, refit=True)
    y_pred = self.clf.predict(Xi_test, Xv_test)
    # Attach predictions to the id frame and write the submission file.
    ids_test["label"] = y_pred
    ids_test.to_csv('submission_dffm.csv', index=False, float_format="%.5f")
    # Persist the fitted model to disk.
    joblib.dump(self.clf, 'saved_model.model')
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Cross-validate a DeepFM/FM/DNN, write a submission CSV and plot the
    per-epoch gini curves.

    Returns:
        (y_train_meta, y_test_meta): out-of-fold predictions on the train
        set and fold-averaged predictions on the test set.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    # Returns per-sample feature ids, feature values, and the label.
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    # Returns per-sample feature ids, feature values, and the sample id.
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    dfm_params["feature_size"] = fd.feat_dim  # total number of features
    # Xi_train[0] is the first training sample; its length is the field count.
    dfm_params["field_size"] = len(Xi_train[0])
    # Out-of-fold / test accumulators, one row per sample.
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    # Row-selection helper: pick the elements of x addressed by l.
    _get = lambda x, l: [x[i] for i in l]
    # len(folds) is the number of CV splits (k-fold).
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    # Retrain the model k times — mainly useful when training data is scarce.
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)
        dfm = DeepFM(**dfm_params)  # build the network
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)  # predict on the validation split
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)  # predict on the test set
        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result
    # Average the accumulated test predictions over the folds.
    y_test_meta /= float(len(folds))
    # save result
    # BUGFIX: clf_str was unbound (NameError) when both use_fm and use_deep
    # were False; fall back to a generic label.
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        clf_str = "Model"
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(),
                                            gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params, NUMERIC_COLS,
                        IGNORE_COLS, application='classification'):
    """Cross-validate a DeepFM/FM/DNN with a task-dependent fold metric.

    The per-fold score is ROC AUC for 'classification', RMSE for
    'regression', and normalized gini otherwise.

    Returns:
        (y_train_meta, y_test_meta): out-of-fold predictions on the train
        set and fold-averaged predictions on the test set.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=NUMERIC_COLS, ignore_cols=IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    # NOTE(review): this variant parses the test frame WITH labels — the
    # third value is still bound to ids_test; confirm dfTest carries labels.
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest, has_label=True)
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    results_cv = np.zeros(len(folds), dtype=float)
    results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)
        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)
        # Score this fold with the metric matching the application.
        if application == 'classification':
            results_cv[i] = roc_auc_score(y_valid_, y_train_meta[valid_idx])
        elif application == 'regression':
            results_cv[i] = np.sqrt(
                mean_squared_error(y_valid_, y_train_meta[valid_idx]))
        else:
            results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        results_epoch_train[i] = dfm.train_result
        results_epoch_valid[i] = dfm.valid_result
    # Fold-average the accumulated test predictions.
    y_test_meta /= float(len(folds))
    # save result
    # BUGFIX: clf_str was unbound (NameError) when both use_fm and use_deep
    # were False; fall back to a generic label.
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        clf_str = "Model"
    print("%s: rmse/accuracy/gini is %.4f (std is %.4f)" % (clf_str,
          results_cv.mean(), results_cv.std()))
    filename = "%s_Mean%.5f.csv" % (clf_str, results_cv.mean())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(results_epoch_train, results_epoch_valid, clf_str, application)
    return y_train_meta, y_test_meta
def run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Run FM / DNN / DeepFM with K-fold cross-validation and Gini scoring.

    The model variant is selected through dfm_params['use_fm'] and
    dfm_params['use_deep'] (both True -> DeepFM, one True -> FM or DNN).

    Parameters
    ----------
    dfTrain, dfTest : pandas.DataFrame raw train/test frames.
    folds : iterable of (train_idx, valid_idx) index pairs.
    dfm_params : dict of DeepFM hyper-parameters; mutated in place with
        'feature_size' and 'field_size'.

    Returns
    -------
    (y_train_meta, y_test_meta) : out-of-fold train predictions and the
        fold-averaged test predictions, both shaped (n_rows, 1).
    """
    # FeatureDictionary maps every feature to a global index: a numeric
    # column gets one fixed index; a categorical column gets one index per
    # level (a one-hot expansion). DataParser then turns each row into
    # Xi (index list) and Xv (value list: raw value for numeric features,
    # 1 for categorical ones). The network input length is
    # (numeric count + total one-hot length of the categorical columns).
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    # Xi_train: per-row feature indices; Xv_train: the matching values.
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    # Fix: the previous version printed the *entire* parsed datasets
    # (Xi/Xv/y in full), which is O(n) log spam on real data.
    # Only summary shapes are logged now.
    print('Xi_train shape:', len(Xi_train))
    print('Xv_train shape:', len(Xv_train))
    print('y_train shape:', len(y_train))
    print('Xi_test shape:', len(Xi_test))
    print('Xv_test shape:', len(Xv_test))
    print(dfTrain.dtypes)

    # field_size is the raw feature count; feature_size is the size after
    # one-hot expanding the categorical features.
    dfm_params['feature_size'] = fd.feat_dim
    dfm_params['field_size'] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]

    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params['epoch']), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params['epoch']), dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        # Slice this fold's train/valid rows out of the full parsed lists.
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        # Train the model for this fold and collect predictions.
        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    # Average the accumulated per-fold test predictions.
    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # Fix: clf_str was previously unbound (NameError) when both
        # use_fm and use_deep were False.
        clf_str = "Model"
    print("%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """K-fold train/evaluate a DeepFM variant and write a submission file.

    Parameters
    ----------
    dfTrain, dfTest : pandas.DataFrame raw train/test frames.
    folds : list of (train_idx, valid_idx) index pairs from a K-fold split.
    dfm_params : dict of DeepFM hyper-parameters; mutated in place with
        'feature_size' and 'field_size'.

    Returns
    -------
    (y_train_meta, y_test_meta) : out-of-fold train predictions and the
        fold-averaged test predictions, both shaped (n_rows, 1).
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    # Xi_* is an n_samples x n_fields list of feature indices: each numeric
    # feature maps to one fixed index, each categorical feature to one index
    # per level. Xv_* holds the matching values.
    _print("parse data begin")
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    _print("parse data finish")
    dfm_params["feature_size"] = fd.feat_dim      # max index (one-hot expanded size)
    dfm_params["field_size"] = len(Xi_train[0])   # raw field count

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]

    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    # train_idx / valid_idx index into the full sample lists; extract each
    # fold's subsets below.
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        _print("fit, fold=%d" % i)
        dfm = DeepFM(**dfm_params, n_samples=len(Xi_train_))
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # Fix: clf_str was previously unbound (NameError) when both
        # use_fm and use_deep were False.
        clf_str = "Model"
    line = "%s: %.5f (%.5f)" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _print(line)
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    # Plotting temporarily disabled.
    #_plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
#cols = [c for c in features if (not c in config.IGNORE_COLS)] ''' X_train = dfTrain[cols].values y_train = dfTrain["is_trade"].values X_test = dfTest[cols].values ids_test = dfTest["instance_id"].values cat_features_indices = [i for i,c in enumerate(cols) if c in config.CATEGORICAL_COLS] ''' #convert data to (index: value: lable) fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest, numeric_cols=config.NUMERIC_COLS, ignore_cols=config.IGNORE_COLS) data_parser = DataParser(feat_dict=fd) Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True) Xi_test, Xv_test, y_test = data_parser.parse(df=dfTest, has_label=True) #feature_size = fd.feat_dim feature_sizes = fd.feature_sizes field_size = len(Xi_train[0]) num_epoch = 30 #没有装cuda,不要调cuda #Xi_train和result_dict['index']不同,一个是统计全局,一个是统计某一个 def run_deepFM(): deepfm = DeepFM.DeepFM(field_size,feature_sizes,verbose=True,use_cuda=True, weight_decay=0.00005,embedding_size=12,use_fm=False,batch_size=128,use_ffm=True,use_deep=True, n_epochs=num_epoch) if online == False: deepfm.fit(Xi_train, Xv_train, y_train, Xi_test, Xv_test, y_test, ealry_stopping=True,refit=True)
# fm_params["use_deep"] = False # y_train_fm, y_test_fm,dfm_fm = _run_base_model_dfm(dfTrain, dfTest, folds, fm_params,"save/fm/temp") # dfm_fm.saver.save(dfm_fm.sess,'save/fm',global_step=dfm_params["epoch"]) # ------------------ DNN Model ------------------ dnn_params = dfm_params.copy() dnn_params["use_fm"] = False dfm_dnn = get_deep_fm_model(dfTrain, dfTest, dnn_params,dfTest2) ###Convert Predict Data from datetime import datetime start_time=datetime.now() fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest, numeric_cols=config.NUMERIC_COLS, ignore_cols=config.IGNORE_COLS) data_parser = DataParser(feat_dict=fd) Xi_test, Xv_test, _ = data_parser.parse(df=dfTest2) print('Consumed Time for Convert Data: {} second(s)'.format(str((datetime.now()-start_time).seconds))) ###Convert Predict Data past_epoch=800 dfm_dnn.saver.restore(dfm_dnn.sess, "save/FixedHashing/temp" + '-' + str(past_epoch)) predict_result=dfm_dnn.predict(Xi_test,Xv_test) print(predict_result[0]) past_epoch=1600 dfm_dnn.saver.restore(dfm_dnn.sess, "save/FixedHashing/temp" + '-' + str(past_epoch)) predict_result=dfm_dnn.predict(Xi_test,Xv_test) print(predict_result[0]) past_epoch=2400 dfm_dnn.saver.restore(dfm_dnn.sess, "save/FixedHashing/temp" + '-' + str(past_epoch)) predict_result=dfm_dnn.predict(Xi_test,Xv_test)
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """K-fold train/evaluate DeepFM with a disk-cached FeatureDictionary.

    The test set is labelled here, so a test AUC is also tracked per fold.

    Parameters
    ----------
    dfTrain, dfTest : pandas.DataFrame raw train/test frames (both labelled).
    folds : iterable of (train_idx, valid_idx) index pairs.
    dfm_params : dict of DeepFM hyper-parameters; mutated in place with
        'feature_size' and 'field_size'.

    Returns
    -------
    (y_train_meta, y_test_meta) : out-of-fold train predictions and the
        fold-averaged test predictions, both shaped (n_rows, 1).
    """
    # Cache the FeatureDictionary on disk so repeated runs skip the rebuild.
    if os.path.exists(config.DF_FILE):
        print("FD EXISTED")
        with open(config.DF_FILE, 'rb') as fd_f:
            fd = pickle.load(fd_f)
    else:
        print("FD NO EXISTED")
        fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                               numeric_cols=config.NUMERIC_COLS,
                               ignore_cols=config.IGNORE_COLS)
        with open(config.DF_FILE, 'wb') as fd_f:
            pickle.dump(fd, fd_f)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    # The test set carries labels too, enabling per-fold test AUC below.
    Xi_test, Xv_test, y_test = data_parser.parse(df=dfTest, has_label=True)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    print(dfm_params)

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]

    auc_results_cv = np.zeros(len(folds), dtype=float)
    test_auc_results_cv = np.zeros(len(folds), dtype=float)
    auc_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    auc_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        print(f"Fold {i}:")
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_, i)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        # Fix: test predictions must be *accumulated* (+=) so the division by
        # len(folds) after the loop yields a fold average. The old code
        # assigned with '=', keeping only the last fold and then wrongly
        # scaling it down by the fold count.
        y_test_pred = dfm.predict(Xi_test, Xv_test)
        y_test_meta[:, 0] += y_test_pred

        auc_results_cv[i] = auc(y_valid_, y_train_meta[valid_idx])
        # Per-fold test AUC on this fold's own predictions.
        test_auc_results_cv[i] = auc(y_test, y_test_pred)
        auc_results_epoch_train[i] = dfm.train_result
        auc_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # Fix: clf_str was previously unbound (NameError) when both
        # use_fm and use_deep were False.
        clf_str = "Model"
    print("%s: %.5f (%.5f)"%(clf_str, auc_results_cv.mean(), auc_results_cv.std()))
    print("test auc: ", test_auc_results_cv)
    filename = "%s_Mean%.5f_Std%.5f.csv"%(clf_str, auc_results_cv.mean(), auc_results_cv.std())
    # _make_submission(ids_test, y_test_meta, filename)
    # _plot_fig(auc_results_epoch_train, auc_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
def _run_base_model_dfm(dfTrain=None, dfTest=None, trainfile=None, testfile=None, dfm_params=None):
    """K-fold train/evaluate DeepFM, building the folds internally.

    Unlike the sibling runners, this variant creates its own StratifiedKFold
    split and can load the frames from files via FeatureDictionary.

    Parameters
    ----------
    dfTrain, dfTest : optional pre-loaded pandas.DataFrames.
    trainfile, testfile : optional paths used by FeatureDictionary when the
        frames are not supplied directly.
    dfm_params : dict of DeepFM hyper-parameters; mutated in place with
        'feature_size' and 'field_size'.

    Returns
    -------
    (y_train_meta, y_test_meta) : out-of-fold train predictions and the
        fold-averaged test predictions, both shaped (n_rows, 1).
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           trainfile=trainfile, testfile=testfile,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=fd.dfTrain, has_label=True, target=config.LABEL)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=fd.dfTest, uid=config.UID)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((fd.dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((fd.dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]

    # Build stratified folds on the labels so class balance is preserved.
    folds = list(
        StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True,
                        random_state=config.RANDOM_SEED).split(np.array(Xv_train), y_train))

    err_results_cv = np.zeros(len(folds), dtype=float)
    err_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    err_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        # need change to mae
        err_results_cv[i] = err_norm(y_valid_, y_train_meta[valid_idx])
        err_results_epoch_train[i] = dfm.train_result
        err_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # Fix: clf_str was previously unbound (NameError) when both
        # use_fm and use_deep were False.
        clf_str = "Model"
    print("%s: %.5f (%.5f)" % (clf_str, err_results_cv.mean(), err_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, err_results_cv.mean(), err_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    #_plot_fig(err_results_epoch_train, err_results_epoch_valid, clf_str)
    return y_train_meta, y_test_meta
class dfm_predict(object):
    """Load a trained DeepFM model plus feature tables and serve predictions."""

    def __init__(self):
        # Restore the pickled FeatureDictionary built at training time so
        # serving-side parsing matches training exactly.
        with open(cur_dir + config.DF_FILE, 'rb') as fd_f:
            fd = pickle.load(fd_f)
        dfm_params["feature_size"] = fd.feat_dim
        # NOTE(review): field_size is hard-coded to 18 — it must match the
        # field count used at training; revisit if the feature set changes.
        dfm_params["field_size"] = 18
        self.userfeats, self.insfeats = self._load_dict()
        self.data_parser = DataParser(feat_dict=fd)
        self.dfm = DeepFM(**dfm_params)
        self.dfm.load_model(cur_dir + config.MODEL_DIR)

    def _load_dict(self):
        # Tab-separated feature tables for users and insurance products.
        userfeats = pd.read_csv(cur_dir + config.USER_FILE, sep="\t", header=0)
        insfeats = pd.read_csv(cur_dir + config.INS_FILE, sep="\t", header=0)
        return userfeats, insfeats

    def predict(self, users_feats, ins_feats):
        """Score one (user, insurance) pair given as two feature dicts.

        Note: mutates users_feats in place by merging ins_feats into it.
        """
        users_feats.update(ins_feats)
        dfTest = pd.DataFrame([users_feats])
        Xi_test, Xv_test, y_test = self.data_parser.parse(df=dfTest, has_label=False)
        res = self.dfm.predict(Xi_test, Xv_test)
        return res

    def predict_plus(self, uID, insID):
        """Score a (uID, insID) pair by cross-joining the two feature tables.

        Returns [1] for an unknown user and [0] for an unknown insurance ID
        or a prediction failure (by existing convention).
        """
        ins_feats = self.insfeats.loc[self.insfeats['InsID'].isin([insID])]
        users_feats = self.userfeats.loc[self.userfeats['UID'].isin([uID])]
        # Unknown user: return similarity 1 by convention.
        if users_feats.empty:
            return [1]
        # Unknown insurance product: return similarity 0.
        if ins_feats.empty:
            return [0]
        # Fix: copy before adding a column — assigning on a .loc slice
        # triggers pandas' SettingWithCopyWarning and may not write through.
        ins_feats = ins_feats.copy()
        users_feats = users_feats.copy()
        # Cross join via a constant key, then drop it.
        ins_feats['tmp'] = 1
        users_feats['tmp'] = 1
        dfTest = pd.merge(users_feats, ins_feats, on=['tmp'])
        dfTest = dfTest.drop("tmp", axis=1)
        Xi_test, Xv_test, y_test = self.data_parser.parse(df=dfTest, has_label=False)
        try:
            res = self.dfm.predict(Xi_test, Xv_test)
        except Exception:
            # Fix: narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.
            print(f"数据中有nan:{Xi_test}")
            return [0]
        return res

    def predict_plus_plus(self, uID, insIDs):
        """Score one user against a list of insurance IDs in a single batch."""
        ins_feats = []
        users_feats = []
        # Repeat the user's row once per insurance ID so the two frames align.
        for insID in insIDs:
            ins_feat = self.insfeats.loc[self.insfeats['InsID'].isin([insID])]
            users_feat = self.userfeats.loc[self.userfeats['UID'].isin([uID])]
            ins_feats.append(ins_feat)
            users_feats.append(users_feat)
        ins_feats = pd.concat(ins_feats)
        users_feats = pd.concat(users_feats)
        # Unknown user: return similarity 1 by convention.
        if users_feats.empty:
            return [1]
        # Unknown insurance products: return similarity 0.
        if ins_feats.empty:
            return [0]
        # Reset both indexes so the column-wise concat pairs rows positionally.
        ins_feats.index = range(len(ins_feats))
        users_feats.index = range(len(users_feats))
        dfTest = pd.concat([users_feats, ins_feats], axis=1)
        Xi_test, Xv_test, y_test = self.data_parser.parse(df=dfTest, has_label=False)
        try:
            res = self.dfm.predict(Xi_test, Xv_test)
        except Exception:
            # Fix: narrowed from a bare `except:`.
            print(f"数据中有nan:{Xi_test}")
            return [0]
        return res
def run_base_model_nfm(dfTrain, dfTest, folds, kdfm_params):
    """K-fold train/evaluate a DeepAFM/KDFM model that combines tabular and
    text features, reporting MSE/MAE curves.

    Parameters
    ----------
    dfTrain, dfTest : pandas.DataFrame raw train/test frames.
    folds : iterable of (train_idx, valid_idx) index pairs.
    kdfm_params : dict of DeepAFM hyper-parameters; mutated in place with
        'feature_size_one_hot' and 'word_embeddings'.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS,
                           xm_cols=config.XM_COLS)
    data_parser = DataParser(feat_dict=fd)
    # Vocabulary for the text branch.
    word2idx, idx2word = build_vocab(config.word_file)
    # Xi_train: per-row feature indices; Xv_train: the matching values.
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain)
    # Xt_*: token-id sequences, Xm_*: their masks/lengths (from read_text_data).
    Xt_train, Xm_train = read_text_data(config.TRAIN_FILE, word2idx, config.num_unroll_steps)  # TODO: config vs pnn_params
    Xi_test, Xv_test, y_test = data_parser.parse(df=dfTest)
    Xt_test, Xm_test = read_text_data(config.TEST_FILE, word2idx, config.num_unroll_steps)

    kdfm_params['feature_size_one_hot'] = fd.feat_dim
    kdfm_params['word_embeddings'] = load_embedding(config.embedding_size, filename=config.embedding_file)

    # TODO: change
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

    results_cv = np.zeros(len(folds), dtype=float)
    results_epoch_train = np.zeros((len(folds), kdfm_params['epoch']), dtype=float)
    results_epoch_valid = np.zeros((len(folds), kdfm_params['epoch']), dtype=float)
    results_epoch_train_mae = np.zeros((len(folds), kdfm_params['epoch']), dtype=float)
    results_epoch_valid_mae = np.zeros((len(folds), kdfm_params['epoch']), dtype=float)

    def _get(x, l):
        return [x[i] for i in l]

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_, Xt_train_, Xm_train_ = \
            _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx), \
            _get(Xt_train, train_idx), _get(Xm_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_, Xt_valid_, Xm_valid_ = \
            _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx), \
            _get(Xt_train, valid_idx), _get(Xm_train, valid_idx)

        kdfm = DeepAFM(**kdfm_params)
        # Multi-value (Xim/Xvm) inputs are unused in this run — passed empty.
        Xim_train_ = []
        Xvm_train_ = []
        Xim_valid_ = []
        Xvm_valid_ = []
        Xim_test = []
        Xvm_test = []
        kdfm.fit(Xi_train_, Xv_train_, Xim_train_, Xvm_train_, Xt_train_, y_train_,
                 Xi_valid_, Xv_valid_, Xim_valid_, Xvm_valid_, Xt_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = kdfm.predict(Xi_valid_, Xv_valid_, Xim_valid_, Xvm_valid_, Xt_valid_)
        y_test_meta[:, 0] += kdfm.predict(Xi_test, Xv_test, Xim_test, Xvm_test, Xt_test)

        results_cv[i] = mse_norm(y_valid_, y_train_meta[valid_idx])
        results_epoch_train[i] = kdfm.train_result
        results_epoch_valid[i] = kdfm.valid_result
        results_epoch_train_mae[i] = kdfm.mae_train_result
        results_epoch_valid_mae[i] = kdfm.mae_valid_result

    # Average the accumulated per-fold test predictions, then score them.
    y_test_meta /= float(len(folds))
    mse_test = mse(y_test, y_test_meta)

    # save result
    if kdfm_params["use_afm"] and kdfm_params["use_deep"]:
        clf_str = "KDFM"
    elif kdfm_params["use_afm"]:
        clf_str = "AFM"
    elif kdfm_params["use_deep"]:
        clf_str = "DNN"
    else:
        # Fix: clf_str was previously unbound (NameError) when both
        # use_afm and use_deep were False.
        clf_str = "Model"
    print("%s: %.5f (%.5f)" % (clf_str, results_cv.mean(), results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, results_cv.mean(), results_cv.std())
    _make_submission(y_test, y_test_meta, mse_test, filename)
    _plot_fig(results_epoch_train, results_epoch_valid, clf_str + 'mse', "mse")
    _plot_fig(results_epoch_train_mae, results_epoch_valid_mae, clf_str + 'mae', "mae")