def get_standardscaler(samples_path, channelid):
    data_manager = DataManager(channelid=channelid)
    data_manager.loadLabeledPoint(samples_path)
    X_res, Y_res = resamples(data_manager.data)
    scaler = preprocessing.StandardScaler().fit(X_res)
    # Persist the fitted scaler for reuse at prediction time.
    pickle.dump(scaler,
                open(workdir + "scaler_{}.pkl".format(channelid), "wb"))
    return scaler
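# Usage sketch (values are illustrative, mirroring conventions used elsewhere
# in this repo; `x_new` stands in for freshly extracted feature rows):
#
#   scaler = get_standardscaler(
#       workdir + "samples-optimization_4.labeledpoint", 4)
#   x_std = scaler.transform(x_new)
#
# At prediction time the pickled copy can be reloaded instead:
#
#   scaler = pickle.load(open(workdir + "scaler_4.pkl", "rb"))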
def run(self, samples_path, model_save_path, algo="ffm", params=None,
        df_path=None, channelid=4, phase="train"):
    if df_path is None and not os.path.exists(samples_path + ".pkl") \
            and not os.path.exists(samples_path):
        print(samples_path + " does not exist")
        return
    pipelineModel = None
    if df_path is None:
        # NOTE: this branch is effectively no longer supported.
        data_manager = DataManager(channelid=channelid,
                                   config_param=config_param)
        data_manager.loadLabeledPoint(samples_path)
        x, y = CtrExperiment.resamples(data_manager.data)
        pipelineModel = CtrExperiment.experiment(x, y, algo, params=params)
    else:
        config_param[algo]["phase"] = phase
        CtrExperiment.experiment(None, None, algo, params=params,
                                 df_path=df_path)
    if pipelineModel is not None:
        CtrExperiment.saveModel2PMMLFormat(pipelineModel, model_save_path)
        pickle.dump(
            pipelineModel,
            open(config_param["workdir"][platform.system()] + algo +
                 "_{}.pkl".format(channelid), "wb"))
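# Usage sketch (hypothetical paths; the calls inside the body suggest `run`
# is a CtrExperiment method): train the FFM pipeline from a pre-built
# dataframe, bypassing the deprecated labeled-point branch.
#
#   exp = CtrExperiment()
#   exp.run(samples_path="",
#           model_save_path=workdir + "ffm_4.pmml",
#           algo="ffm",
#           df_path=workdir + "samples_df_4.pkl",
#           channelid=4,
#           phase="train")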
pos_samples = data_manager_instance.data["pos_samples"]["X"]
neg_samples = data_manager_instance.data["neg_samples"]["X"]
pos_columns = (pos_samples != 0).sum(0) / pos_samples.shape[0]
neg_columns = (neg_samples != 0).sum(0) / neg_samples.shape[0]
x = linspace(0, pos_columns.shape[0], pos_columns.shape[0])
plt.plot(x, pos_columns, 'r', linewidth=2)
# plt.plot(x, neg_columns, 'b', linewidth=2)
plt.xlabel(r'feature', fontsize=16)
plt.ylabel(r'nonzero ratio', fontsize=16)
plt.savefig(config_param["workdir"][platform.system()] +
            "explore_nonz-pos.png")
plt.figure()
plt.plot(x, neg_columns, 'b', linewidth=2)
plt.xlabel(r'feature', fontsize=16)
plt.ylabel(r'nonzero ratio', fontsize=16)
plt.savefig(config_param["workdir"][platform.system()] +
            "explore_nonz-neg.png")


if __name__ == "__main__":
    channelid = 4
    data_manager_instance = DataManager(
        channelid=channelid,
        workdir=config_param["workdir"][platform.system()])
    explore_nonz(config_param["workdir"][platform.system()] +
                 "samples-optimization_{}.labeledpoint".format(channelid))
with open(workdir + "ffm_train.txt", "w", encoding="utf-8") as file_write: for idx in pos_list[:int(pos_cnt * 0.7)]: file_write.write(ffm_data["pos_samples"][idx] + "\n") for idx in samples_index[:int(neg_cnt * 0.7)]: file_write.write(ffm_data["neg_samples"][idx] + "\n") with open(workdir + "ffm_vaild.txt", "w", encoding="utf-8") as file_write: for idx in pos_list[int(pos_cnt * 0.7):int(pos_cnt * 0.8)]: file_write.write(ffm_data["pos_samples"][idx] + "\n") for idx in samples_index[int(neg_cnt * 0.7):int(neg_cnt * 0.8)]: file_write.write(ffm_data["neg_samples"][idx] + "\n") with open(workdir + "ffm_test.txt", "w", encoding="utf-8") as file_write: for idx in pos_list[int(pos_cnt * 0.8):]: file_write.write(ffm_data["pos_samples"][idx] + "\n") for idx in samples_index[int(neg_cnt * 0.8):]: file_write.write(ffm_data["neg_samples"][idx] + "\n") if __name__ == "__main__": config_param = yaml.load(open("config.yml", "r", encoding="utf-8").read()) workdir = config_param["workdir"][platform.system()] dm_instance = DataManager(4, workdir) ffm_field_idx_path = workdir + "field_idx_4.ffm" generate_ffm_data( workdir + "samples-optimization_{}.labeledpoint".format(4), ffm_format_path=workdir + "samples-optimization_{}.ffm".format(4)) make_ffm_train_vaild_test_data(ffm_format_path=workdir + "samples-optimization_{}.ffm".format(4))
def experiment(x, y, algo, params=None, df_path=None):
    x_train, x_test, y_train, y_test = None, None, None, None
    if df_path is None:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.3, random_state=42)
        print("train samples dim", x_train.shape,
              "test samples dim", x_test.shape)
        # Normalize features to [0, 1].
        min_max_scaler = preprocessing.MinMaxScaler().fit(x)
        x_train = np.around(min_max_scaler.transform(x_train), 4)
        x_test = np.around(min_max_scaler.transform(x_test), 4)
    print("running algorithm: " + algo)
    if algo == "gbdt":
        return GbdtExperiment.train(x_train, y_train, params)
    elif algo == "lr":
        clf_l2_LR = LogisticRegression(penalty='l2', tol=0.01)
        clf_l2_LR.fit(x_train, y_train)
        y_pre_train = clf_l2_LR.predict(x_train)
        # print(classification_report(y_train, y_pre_train,
        #                             target_names=["exposure", "click"]))
    elif algo == "gbdt_plus_lr":
        return GbdtPlusLrExperiment.experiment(x_train, x_test, y_train,
                                               y_test, params)
        # # TODO: handle categorical and continuous variables separately.
        # X_train, X_train_lr, y_train, y_train_lr = train_test_split(
        #     X_train, y_train, test_size=0.5)
        # params_ = params.copy()
        # params_["n_estimators"] = 40
        # pipelineModel = GbdtExperiment.train(X_train, y_train, params_)
        # gbt_enc = OneHotEncoder()
        # gbt_enc.fit(
        #     pipelineModel.named_steps['gbtclassifier'].apply(X_train)[:, :, 0])
        #
        # grd_lm = LogisticRegression(max_iter=300)
        # grd_lm.fit(
        #     gbt_enc.transform(
        #         pipelineModel.named_steps['gbtclassifier'].apply(X_train_lr)[:, :, 0]),
        #     y_train_lr)
        #
        # y_pred_grd_lm = grd_lm.predict(
        #     gbt_enc.transform(
        #         pipelineModel.named_steps['gbtclassifier'].apply(X_test)[:, :, 0]))  # [:, 1]
        # print(classification_report(y_test, y_pred_grd_lm,
        #                             target_names=["exposure", "click"]))
    elif algo == "xgboost":
        pipelineModel = XgboostExperiment.experiment(x_train, y_train, params)
        evaluateOnTrainAndTest(y_train, y_test,
                               pipelineModel.predict(x_train),
                               pipelineModel.predict(x_test))
        return pipelineModel
    elif algo == "xgboost_plus_fm":
        pass
    elif algo == "xgboost_plus_ffm":
        xgboost_plust_ffm = XgboostPlusFFMExperiment(
            config_params=config_param)
        phase = config_param[algo]["phase"]
        if phase.startswith("train"):
            xgboost_plust_ffm.experiment()
        else:
            xgboost_plust_ffm.experiment(
                DataManager.load_dataframe(df_path, 10000000))
    elif algo == "lightgbm_plus_ffm":
        lgbm_plust_ffm = LightGbmPlusFFMExperiment(
            config_params=config_param)
        phase = config_param[algo]["phase"]
        if phase.startswith("train"):
            lgbm_plust_ffm.experiment()
        else:
            lgbm_plust_ffm.experiment(
                DataManager.load_dataframe(df_path, 10000000))
    elif algo == "ffm":
        ffm = FFMExperiment(config_params=config_param)
        phase = config_param[algo]["phase"]
        if phase.startswith("train") or phase == "emsemble":
            ffm.experiment()
        else:
            ffm.experiment(DataManager.load_dataframe(df_path, 10000000))
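# A minimal, self-contained sketch of the GBDT-leaf-plus-LR idea that the
# commented-out block in the "gbdt_plus_lr" branch above follows: one-hot
# encode the leaf index each tree assigns to a sample, then fit a logistic
# regression on those binary features. Data and parameter values below are
# illustrative only, not taken from this pipeline.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=2000, n_features=20, random_state=42)
# Disjoint halves: one to grow the trees, one to fit the LR on leaf features.
X_gbt, X_lr, y_gbt, y_lr = train_test_split(X, y, test_size=0.5,
                                            random_state=42)

gbt = GradientBoostingClassifier(n_estimators=40, random_state=42)
gbt.fit(X_gbt, y_gbt)

# apply() returns the leaf each sample falls into for every tree
# (shape (n_samples, n_estimators, 1) for binary classification).
enc = OneHotEncoder(handle_unknown="ignore")
enc.fit(gbt.apply(X_gbt)[:, :, 0])

lr = LogisticRegression(max_iter=300)
lr.fit(enc.transform(gbt.apply(X_lr)[:, :, 0]), y_lr)
p_click = lr.predict_proba(enc.transform(gbt.apply(X_lr)[:, :, 0]))[:, 1]
print("avg predicted CTR on LR half:", p_click.mean())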
samples["neg_samples"]["Y"].shape[0])) scaler = preprocessing.StandardScaler().fit(X) x_scaler = scaler.transform(X) pipelineModel = pickle.load(open(workdir + "xgboost_4.pkl", "rb")) y_pre_train = pipelineModel.named_steps['xgbclassifier'].predict_proba( x_scaler) pred_click_num = sum([ 1 if y_pre_train[i][1] >= 0.5 else 0 for i in range(y_pre_train.shape[0]) ]) print("ctr_pre_avg = ", pred_click_num / y_pre_train.shape[0]) if __name__ == "__main__": workdir = "E:/Work/jobs/data/DSP/CTR预估/samples/" if platform.system() == "Linux": workdir = "/data/kongyy/ctr/" config_param = yaml.load(open("config.yml", "r", encoding="utf-8").read()) channelid = '4' data_manager = DataManager(channelid=channelid, workdir=workdir) data_manager.loadLabeledPoint( workdir + "samples-optimization_labeledpoint_{}".format(channelid)) get_avg_ctr()
import os
import platform

import yaml

from dsp.ctr.data_manager import DataManager
from dsp.utils.data_utils import *

# Test whether the feature vectors generated by Python match those generated
# by Java.
if __name__ == "__main__":
    config_param = yaml.safe_load(
        open("config.yml", "r", encoding="utf-8").read())
    if platform.system() == "Linux":
        workdir = config_param["work_dir"]["Linux"]
    else:
        workdir = config_param["work_dir"]["Windows"]
    channelid = 4
    predict_day = "2018-08-09"
    data_manager = DataManager(channelid=channelid, workdir=workdir)
    if not os.path.exists(workdir + "/samples-optimization_test_std"):
        std_raw_samples(workdir + "/samples-optimization_test")
    data_manager.load_raw_fields(
        workdir + "samples-optimization_test_std",
        workdir +
        "samples-optimization_test_labeledpoint_{}".format(channelid),
        {
            "creativeid": workdir + "ctr_dsp_creativeid_statistics.csv",
            "adid": workdir + "ctr_dsp_adid_statistics.csv",
            "advertiserid": workdir + "ctr_dsp_advertiserid_statistics.csv",
        },
        predict_day,
    )
    print("end")
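# A possible follow-up comparison (hypothetical; the Java-side output path is
# an assumed name, not one produced by this repo): diff the two labeledpoint
# files line by line.
#
#   py_path = workdir + "samples-optimization_test_labeledpoint_4"
#   java_path = workdir + "samples-optimization_test_labeledpoint_4.java"
#   with open(py_path, encoding="utf-8") as f1, \
#           open(java_path, encoding="utf-8") as f2:
#       diffs = [i for i, (a, b) in enumerate(zip(f1, f2))
#                if a.strip() != b.strip()]
#   print("mismatched lines:", diffs[:10])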