def data_split(data, threads, split_flag):
    """Shuffle *data* and split it into ``threads`` roughly equal chunks.

    :param data: input rows (anything ``pd.DataFrame`` accepts)
    :param threads: number of chunks to produce
    :param split_flag: column name that will carry the chunk index
    :return: list of non-empty DataFrames, each tagged with its chunk id
    """
    data = shuffle(pd.DataFrame(data))
    total = len(data)
    # BUG FIX: the original rounded total/threads to 2 decimals and then
    # truncated with int(i * sep); for e.g. total=7, threads=3 that produced
    # cut points [0, 2, 4, 6] and silently dropped the last row.  Use the
    # exact quotient and force the final cut point to `total`.
    sep = total / threads
    split_point = [int(i * sep) for i in range(threads + 1)]
    split_point[-1] = total  # guarantee the last chunk reaches the end
    logger.info("CutPoint: " + str(split_point))

    # Pair consecutive cut points, drop empty slices, and tag each surviving
    # chunk with a consecutive id (same renumbering as the original code).
    chunks = [data[a:b] for a, b in zip(split_point, split_point[1:])]
    res_final = []
    for ss, chunk in enumerate(c for c in chunks if len(c) != 0):
        tmp = pd.DataFrame(chunk)
        tmp[split_flag] = ss
        res_final.append(tmp)
    return res_final
def main():
    """Date-split the hl_test data, feature-select, then run AutoSklearn."""
    df = pd.read_csv("data/hl_test_clean.csv", encoding="utf8")
    df['book_date'] = pd.to_datetime(df['book_date'])
    # NOTE(review): '2017-07-20' satisfies both masks, so boundary rows land
    # in both train and test sets -- kept as-is, confirm if intentional.
    trainSet = df[(df['book_date'] >= '2017-04-01')
                  & (df['book_date'] <= '2017-07-20')].reset_index(drop=True)
    testSet = df[(df['book_date'] >= '2017-07-20')
                 & (df['book_date'] <= '2017-08-31')].reset_index(drop=True)
    logger.info(
        "============================Data is ready!============================"
    )
    clf = RandomForestClassifier(
        n_estimators=10,
        max_features=10,
        max_depth=4,
        min_samples_split=0.05,
    )
    myexe = MyExecutor(df, "fpd", clf)
    leftVaris = myexe.get_result()
    leftVaris = leftVaris[leftVaris.values > 7].keys()
    X_train = trainSet[leftVaris].copy()
    y_train = trainSet['fpd'].copy()
    X_test = testSet[leftVaris].copy()
    # BUG FIX: y_test previously copied the feature columns
    # (testSet[leftVaris]) instead of the target column, which would break
    # any downstream AUC/KS evaluation in getReport.
    y_test = testSet['fpd'].copy()
    # AutoSklearn stage:
    cls = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=62,
        per_run_time_limit=60,
        include_estimators=['adaboost'],
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67})
    getReport(cls, trainSet, X_train, y_train, testSet, X_test, y_test)
def drop_by_iv(df, y, p=10):
    """Select variables by information value (IV).

    Runs ``calc_iv`` for every column in parallel worker processes and keeps
    the columns whose IV is at least 0.02; the target column ``"fpd"`` is
    always appended to the result.

    :param df: input DataFrame
    :param y: target column name, passed through to ``calc_iv``
    :param p: number of worker processes (keep below the CPU core count) --
        the original docstring called this a thread count, but it sizes a
        ``multiprocessing.Pool``
    :return: list of selected column names, always ending with "fpd"
    """
    # (Removed a stale commented-out serial implementation of this loop.)
    columns = list(df.columns)
    logger.info("Start filter variables by IV, Current thread: {}".format(p))
    pool = Pool(processes=p)
    async_results = [
        pool.apply_async(calc_iv, args=(df, y, col)) for col in columns
    ]
    pool.close()
    pool.join()
    result = pd.DataFrame([r.get() for r in async_results])
    selected_vars = list(result[result.IV >= 0.02]["columns"])
    selected_vars.append("fpd")
    return selected_vars
def build_model(self, mymodel, train_x, train_y, test_x, test_y):
    """Fit *mymodel* on the train split and evaluate it on the test split.

    :param mymodel: sklearn-style estimator object
    :param train_x: training features
    :param train_y: training target
    :param test_x: test features
    :param test_y: test target
    :return: ``(test_auc, used_feature_importance)`` where the second element
        is a DataFrame of the variables whose importance is > 0
    """
    # TODO: train on nine folds and validate on the tenth to obtain the
    # corresponding AUC/KS; 10 training rounds in total.
    estimator = mymodel
    estimator.fit(train_x, train_y)
    probabilities = pd.DataFrame(estimator.predict_proba(test_x),
                                 columns=["fpd0", "fpd1"])
    test_auc = metrics.roc_auc_score(test_y, probabilities["fpd1"])

    importances = estimator.feature_importances_
    left_variables = train_x.columns[np.where(importances > 0)].tolist()
    logger.info(left_variables)
    logger.info(len(left_variables))

    importance_table = pd.DataFrame(
        list(zip(train_x.columns, importances)),
        columns=["variable", "importance"])
    used_feature_importance = importance_table[importance_table.importance > 0]
    return test_auc, used_feature_importance
def main():
    """Month-based train/test split on the mixed data, then AutoSklearn."""
    df = pd.read_csv("data/model_mix/clean_data.csv", encoding="utf8")
    train_months = ['2017-05', '2017-06', '2017-07']
    trainSet = df[df["book_mon"].isin(train_months)].reset_index(drop=True)
    testSet = df[df["book_mon"] == "2017-08"].reset_index(drop=True)
    logger.info("============================Data is ready!============================")
    # Coarse model used only for the executor's variable-selection pass.
    clf = XGBClassifier(
        n_estimators=10,
        max_features=10,
        max_depth=4,
        min_samples_split=0.05,
    )
    myexe = MyExecutor(df, "fpd", clf)
    leftVaris = myexe.get_result()
    # Keep only the variables whose executor score exceeds 7.
    leftVaris = leftVaris[leftVaris.values > 7].keys()
    X_train = trainSet[leftVaris].copy()
    y_train = trainSet['fpd'].copy()
    X_test = testSet[leftVaris].copy()
    y_test = testSet['fpd'].copy()
    # AutoSklearn stage:
    cls = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=62,
        per_run_time_limit=60,
        include_estimators=['adaboost'],
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67})
    getReport(cls, trainSet, X_train, y_train, testSet, X_test, y_test)
def feature_select(self, sub_train_set, y):
    """Filter variables: drop bookkeeping columns, then select by IV.

    :param sub_train_set: candidate training DataFrame
    :param y: target column name (not used directly; drop_by_iv is invoked
        with the literal "fpd" -- TODO confirm this is intended)
    :return: list of selected column names
    """
    # Typo fix in the log message: "filer" -> "filter".
    logger.info("Start filter Variables total: {}".format(
        len(sub_train_set.columns)))
    tmp1 = drop_useless(sub_train_set, 'pre_apply_no', 'book_date', 'book_mon')
    sub_train_set = sub_train_set[tmp1]
    # Run the IV filter with 2 worker processes.
    tmp2 = drop_by_iv(sub_train_set, "fpd", 2)
    logger.info("Stop filter Variables total: {}".format(len(tmp2)))
    return tmp2
def main():
    """Load the cleaned hl_test data and run the full training routine."""
    df = pd.read_csv("data/hl_test_clean.csv", encoding="utf8")
    logger.info("============================Data is ready!============================")
    model = DecisionTreeClassifier(max_features=1,
                                   min_weight_fraction_leaf=0.05,
                                   min_samples_split=0.05,
                                   criterion="entropy",
                                   max_leaf_nodes=5)
    # NOTE(review): lowercase `myExecutor` differs from `MyExecutor` used in
    # the sibling entry points -- presumably a distinct class; confirm it
    # exists at import time.
    executor = myExecutor(df, "fpd", model)
    executor.train_all()
def main():
    """Date-split the hl_test data and run a TPOT pipeline search."""
    df = pd.read_csv("data/hl_test_clean.csv", encoding="utf8")
    df['book_date'] = pd.to_datetime(df['book_date'])
    train_mask = ((df['book_date'] >= '2017-04-01')
                  & (df['book_date'] <= '2017-07-20'))
    test_mask = ((df['book_date'] >= '2017-07-20')
                 & (df['book_date'] <= '2017-08-31'))
    trainSet = df[train_mask].reset_index(drop=True)
    testSet = df[test_mask].reset_index(drop=True)
    logger.info(
        "============================Data is ready!============================"
    )
    clf = XGBClassifier(learning_rate=0.01,
                        max_depth=7,
                        min_child_weight=15,
                        n_estimators=100,
                        nthread=1,
                        subsample=0.6500000000000001)
    myexe = MyExecutor(df, "fpd", clf)
    # Frozen variable list; the live selection step (myexe.get_result) is
    # intentionally skipped here.
    leftVaris = [
        'hl_call_domesitc_cnt_2m', 'hl_contact_early_morning_cnt_5m',
        'hl_phone_silent_frequentcy', 'hl_contact_night_pct',
        'hl_transactions_total_amt_5m', 'hl_region_call_cnt_max_uniq_num_cnt',
        'hl_region_call_out_cnt_max_avg_call_in_time',
        'hl_contact_morning_cnt_5m',
        'hl_region_call_in_time_max_avg_call_in_time',
        'hl_transactions_total_amt_2m', 'hl_contact_night_cnt_5m',
        'hl_phone_num_used_time_months',
        'hl_region_call_cnt_max_avg_callin_time',
        'hl_region_call_in_time_max_uniq_num_cnt',
        'hl_region_call_in_cnt_max_avg_call_out_time',
        'hl_transactions_min_5m',
        'hl_region_call_out_time_max_avg_call_out_time'
    ]
    X_train = trainSet[leftVaris].copy()
    y_train = trainSet['fpd'].copy()
    X_test = testSet[leftVaris].copy()
    y_test = testSet['fpd'].copy()
    # TPOT search stage:
    pipeline_optimizer = TPOTClassifier(generations=5,
                                        population_size=20,
                                        cv=4,
                                        random_state=42,
                                        verbosity=2)
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export('tpot_exported_pipeline.py')
    getReport(pipeline_optimizer, trainSet, X_train, y_train, testSet, X_test,
              y_test)
def judge_function_v1(result):
    """Aggregate per-model variable importances, weighted by each test AUC.

    1. Inspect AUC / KS performance.
    2. Count how often each variable is selected across B1~B10.
    3. Variables still require a business-logic sanity check.

    :param result: return value of ``train_all`` -- iterable of
        ``(test_auc, importance DataFrame)`` pairs
    :return: concatenated DataFrame with an added ``importance_plus`` column
    """
    # TODO: judge the 10 test-AUC values (mean and variance) per variable set.
    weighted_tables = []
    for model_res in result:
        current_auc = model_res[0]
        logger.info(current_auc)
        importance_table = model_res[1]
        importance_table["importance_plus"] = (
            importance_table["importance"] * current_auc)
        weighted_tables.append(importance_table)
    return pd.concat(weighted_tables)
def main():
    """Month-split the mixed data, feature-select, then run a TPOT search."""
    df = pd.read_csv("data/model_mix/clean_data.csv", encoding="utf8")
    train_months = ['2017-05', '2017-06', '2017-07']
    trainSet = df[df["book_mon"].isin(train_months)].reset_index(drop=True)
    testSet = df[df["book_mon"] == "2017-08"].reset_index(drop=True)
    logger.info(
        "============================Data is ready!============================"
    )
    clf = RandomForestClassifier(
        n_estimators=10,
        max_features=10,
        max_depth=4,
        min_samples_split=0.05,
    )
    myexe = MyExecutor(trainSet, "fpd", clf)
    leftVaris = myexe.get_result(20)
    banner = (
        "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    print(banner)
    print(leftVaris)
    print(banner)
    X_train = trainSet[leftVaris].copy()
    y_train = trainSet['fpd'].copy()
    X_test = testSet[leftVaris].copy()
    y_test = testSet['fpd'].copy()
    # TPOT search stage:
    pipeline_optimizer = TPOTClassifier(generations=5,
                                        population_size=20,
                                        cv=4,
                                        random_state=42,
                                        n_jobs=1,
                                        verbosity=2)
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export('tpot_exported_pipeline.py')
    getReport(pipeline_optimizer, trainSet, X_train, y_train, testSet, X_test,
              y_test)
def main(kwargs):
    """Run a TPOT search configured by *kwargs* and persist the results.

    Expected kwargs keys: ``feature_model``, ``feature_num``, ``generations``,
    ``population_size``, ``scoring``, ``cv``, ``subsample``, ``n_jobs``,
    ``max_eval_time_mins``, ``uid``.  Results are written under
    ``tpot_result/<uid>/``.
    """
    df = pd.read_csv("data/model_mix/clean_data.csv", encoding="utf8")
    trainSet = df[df["book_mon"].isin(['2017-05', '2017-06',
                                       '2017-07'])].reset_index(drop=True)
    testSet = df[df["book_mon"] == "2017-08"].reset_index(drop=True)
    logger.info(
        "============================Data is ready!============================"
    )
    clf = kwargs["feature_model"]
    myexe = MyExecutor(df, "fpd", clf)
    leftVaris = myexe.get_result(kwargs["feature_num"])
    print(
        "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    print(leftVaris)
    print(
        "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    X_train = trainSet[leftVaris].copy()
    y_train = trainSet['fpd'].copy()
    X_test = testSet[leftVaris].copy()
    y_test = testSet['fpd'].copy()
    # TPOT search stage (removed stale commented-out parameters):
    pipeline_optimizer = TPOTClassifier(
        generations=int(kwargs["generations"]),
        population_size=int(kwargs["population_size"]),
        scoring=kwargs["scoring"],
        cv=int(kwargs["cv"]),
        subsample=float(kwargs["subsample"]),
        n_jobs=int(kwargs["n_jobs"]),
        # TPOT internally uses max_eval_time_seconds = max(int(mins * 60), 1)
        max_eval_time_mins=int(kwargs["max_eval_time_mins"]),
        random_state=random.randint(1, 100))
    pipeline_optimizer.fit(X_train, y_train)
    trainKS, testKS, abs_trainKS_testKS, trainAUC, testAUC, abs_trainAUC_testAUC = \
        getReport(pipeline_optimizer, trainSet, X_train, y_train, testSet,
                  X_test, y_test)
    # Persist results.
    # BUG FIX: the original tested `kwargs["uid"] is os.listdir(...)`, an
    # identity comparison against a fresh list that is always False, so stale
    # result directories were never detected; and os.removedirs() only removes
    # *empty* directories.  Use `in` plus shutil.rmtree instead.
    import shutil
    out_dir = "tpot_result/{}".format(kwargs["uid"])
    if kwargs["uid"] in os.listdir("tpot_result/"):
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)
    pipeline_optimizer.export(
        'tpot_result/{}/tpot_exported_pipeline.py'.format(kwargs["uid"]))
    with open('tpot_result/{}/vars'.format(kwargs["uid"]), "w+") as f1:
        f1.write(str(leftVaris))
    report = pd.DataFrame([
        {
            "trainKS": trainKS,
            "testKS": testKS,
            "abs_trainKS_testKS": abs_trainKS_testKS,
            "trainAUC": trainAUC,
            "testAUC": testAUC,
            "abs_trainAUC_testAUC": abs_trainAUC_testAUC
        },
    ])
    report.to_csv('tpot_result/{}/report.csv'.format(kwargs['uid']),
                  index=False,
                  encoding="utf8")
def feature_select(self, sub_train_set, y):
    """Filter variables by dropping bookkeeping/id columns only.

    :param sub_train_set: candidate training DataFrame
    :param y: target column name (currently unused in this variant)
    :return: list of column names retained by ``drop_useless``
    """
    # Typo fix in the log message: "filer" -> "filter".
    logger.info("Start filter Variables total: {}".format(
        len(sub_train_set.columns)))
    tmp1 = drop_useless(sub_train_set, 'pre_apply_no', 'book_date', 'book_mon')
    return tmp1