def _model_predict(all_feature, predict_feature, predict_col, num_boost_round=1000):
    """Fit a LightGBM model for ``predict_col`` and predict the missing rows.

    Rows of ``all_feature`` whose ``"<g>_test"`` marker is 0 are used for
    training and rows whose marker is 1 for validation / early stopping,
    where ``<g>`` is the position in ``tool.types`` of the column group that
    contains ``predict_col``.  The objective depends on the module-level
    column lists: binary for ``bool_col``, multiclass for ``enum_col`` /
    ``ext_enum_col``, otherwise the module-level ``lgb_obj`` together with
    ``tool.lgb_metric``.

    Args:
        all_feature: DataFrame holding ``predict_col``, the feature columns
            and the marker columns ``"0_test"`` .. ``"3_test"``.
        predict_feature: DataFrame of the rows to predict; every column that
            is dropped from ``all_feature`` must also be present here.
        predict_col: Name of the column to predict.
        num_boost_round: Maximum number of boosting rounds.

    Returns:
        Tuple ``(predict_target, test_s)``: the predictions for
        ``predict_feature`` as a numpy array and the validation score.  When
        an enum column has a single remaining class, that class is returned
        for every row together with the score ``1``.

    Raises:
        ValueError: If ``predict_col`` is not part of any group in
            ``tool.types``.
    """
    # Columns that must not be used as features: the whole group that
    # contains the target, plus every test-marker column.
    del_cols = None
    index = None
    for index, group in enumerate(tool.types):
        if predict_col in group:
            del_cols = group.copy()
            break
    if del_cols is None:
        # The original code failed here with an AttributeError on None.
        raise ValueError(
            "predict_col {!r} is not in any group of tool.types".format(predict_col))
    test_label_col = str(index) + "_test"
    del_cols.extend(["0_test", "1_test", "2_test", "3_test"])

    is_bool = predict_col in bool_col
    is_enum = predict_col in enum_col or predict_col in ext_enum_col

    k_v = {}
    if is_enum:
        # Drop classes that occur fewer than `number_limit` times.
        def func_count(df):
            df['value_count'] = df[predict_col].count()
            return df

        number_limit = large_limit_col.get(predict_col, 10)
        all_feature = all_feature.groupby(predict_col).apply(func_count)
        is_test = all_feature[test_label_col] == 1
        is_rare = all_feature["value_count"] < number_limit
        del_test_size = int((is_test & is_rare).sum())
        print(predict_col, "del_test_size:", del_test_size)

        # Keep the complete validation set (rare classes included) so the
        # score can still be computed on it after training.
        test_feature_org = all_feature[is_test].drop(["value_count"], axis=1)
        test_y_org = np.array(test_feature_org[predict_col])
        test_x_org = np.array(test_feature_org.drop(del_cols, axis=1))
        print("test_x_org", test_x_org.shape)

        # Non-inplace drop returns a fresh frame, so the column assignment
        # below does not write into a slice of the grouped frame.
        all_feature = all_feature[all_feature["value_count"] >= number_limit]
        all_feature = all_feature.drop(["value_count"], axis=1)

        # Map the raw values to consecutive class ids (and back via k_v).
        label = all_feature[predict_col]
        all_y = sorted(set(label))
        if len(all_y) == 1:
            # Only one class left: return it directly as the prediction.
            print("only one value!")
            return np.array([all_y[0]] * len(predict_feature)), 1
        v_k = {value: class_id for class_id, value in enumerate(all_y)}
        k_v = dict(enumerate(all_y))
        label = np.array([v_k[value] for value in label])
        all_feature[predict_col] = label

    train_feature = all_feature[all_feature[test_label_col] == 0]
    train_y = np.array(train_feature[predict_col])
    train_x = np.array(train_feature.drop(del_cols, axis=1))
    test_feature = all_feature[all_feature[test_label_col] == 1]
    test_y = np.array(test_feature[predict_col])
    test_x = np.array(test_feature.drop(del_cols, axis=1))
    predict_x = np.array(predict_feature.drop(del_cols, axis=1))
    print("train_x:", train_x.shape, "test_x:", test_x.shape,
          "predict_x", predict_x.shape)

    lgb_params = {
        'boosting_type': 'gbdt',
        'learning_rate': 0.02,
        'num_leaves': 256,
        'subsample': 0.8,
        'colsample_bytree': 0.9,
        'min_data_in_leaf': 40,
        'num_threads': num_threads,
        'verbosity': 0
    }
    if is_bool:
        lgb_params["objective"] = "binary"
        lgb_params["metric"] = "binary_error"
        lgb_params["is_unbalance"] = True
        eval_metric = None
    elif is_enum:
        lgb_params["objective"] = "multiclass"
        lgb_params["metric"] = "multi_error"
        lgb_params["num_class"] = max(label) + 1
        eval_metric = None
    else:
        lgb_params["objective"] = lgb_obj
        eval_metric = tool.lgb_metric

    train_set = lgb.Dataset(train_x, label=train_y)
    valid_set = lgb.Dataset(test_x, label=test_y)
    temp_model = lgb.train(lgb_params, train_set,
                           num_boost_round=num_boost_round,
                           valid_sets=[valid_set], feval=eval_metric,
                           early_stopping_rounds=50, verbose_eval=False)

    test_pred = temp_model.predict(test_x)
    # Turn probabilities into labels.
    if is_bool:
        test_pred = np.where(test_pred > 0.5, 1, 0)
    elif is_enum:
        # Score on the complete validation set when rare classes were removed.
        if del_test_size > 0:
            test_pred = temp_model.predict(test_x_org)
            test_y = test_y_org
        # First index of the row maximum, then back to the raw values.
        class_ids = np.argmax(test_pred, axis=1)
        test_pred = np.array([k_v[class_id] for class_id in class_ids])

    if predict_col in category_col:
        test_s = tool.label_score(test_y, test_pred)
    else:
        test_s = tool.regression_score(test_y, test_pred)

    # Rounding to two, one or zero decimals may score better.  `None` means
    # "do not round"; 0 is a valid choice and must not be mistaken for it.
    # NOTE(review): as in the original, this search is not restricted to
    # non-category columns -- confirm that this is intended.
    if_round = None
    for decimals in (2, 1, 0):
        rounded_score = tool.regression_score(test_y, np.round(test_pred, decimals))
        if test_s < rounded_score - threshold:
            if_round = decimals
            test_s = rounded_score
    print("best iteration: ", temp_model.best_iteration)
    print("test score: ", test_s)

    predict_target = np.array(temp_model.predict(predict_x))
    if is_enum:
        class_ids = np.argmax(predict_target, axis=1)
        predict_target = np.array([k_v[class_id] for class_id in class_ids])
    elif is_bool:
        predict_target = np.where(predict_target > 0.5, 1, 0)
    if if_round is not None:
        predict_target = np.round(predict_target, if_round)
    return predict_target, test_s
def _model_predict(all_feature, predict_feature, predict_col, num_boost_round=1000):
    """Fit an XGBoost model for ``predict_col`` and predict the missing rows.

    Rows of ``all_feature`` whose ``"<g>_test"`` marker is 0 are used for
    training and rows whose marker is 1 for validation / early stopping,
    where ``<g>`` is the position in ``tool.types`` of the column group that
    contains ``predict_col``.  The objective depends on the module-level
    column lists: ``binary:logistic`` for ``bool_col``, ``multi:softmax`` for
    ``enum_col`` / ``ext_enum_col``, otherwise ``reg:linear`` evaluated with
    ``tool.xgb_metric``.

    Args:
        all_feature: DataFrame holding ``predict_col``, the feature columns
            and the marker columns ``"0_test"`` .. ``"3_test"``.
        predict_feature: DataFrame of the rows to predict; every column that
            is dropped from ``all_feature`` must also be present here.
        predict_col: Name of the column to predict.
        num_boost_round: Maximum number of boosting rounds.

    Returns:
        Tuple ``(predict_target, test_s)``: the predictions for
        ``predict_feature`` as a numpy array and the validation score.  When
        an enum column has a single remaining class, that class is returned
        for every row together with the score ``1``.

    Raises:
        ValueError: If ``predict_col`` is not part of any group in
            ``tool.types``.
    """
    # Columns that must not be used as features: the whole group that
    # contains the target, plus every test-marker column.
    del_cols = None
    index = None
    for index, group in enumerate(tool.types):
        if predict_col in group:
            del_cols = group.copy()
            break
    if del_cols is None:
        # The original code failed here with an AttributeError on None.
        raise ValueError(
            "predict_col {!r} is not in any group of tool.types".format(predict_col))
    test_label_col = str(index) + "_test"
    del_cols.extend(["0_test", "1_test", "2_test", "3_test"])

    is_bool = predict_col in bool_col
    is_enum = predict_col in enum_col or predict_col in ext_enum_col

    k_v = {}
    if is_enum:
        # Drop classes that occur fewer than `number_limit` times.
        def func_count(df):
            df['value_count'] = df[predict_col].count()
            return df

        number_limit = large_limit_col.get(predict_col, 10)
        all_feature = all_feature.groupby(predict_col).apply(func_count)
        is_test = all_feature[test_label_col] == 1
        is_rare = all_feature["value_count"] < number_limit
        del_test_size = int((is_test & is_rare).sum())
        print(predict_col, "del_test_size:", del_test_size)

        # Keep the complete validation set (rare classes included) so the
        # score can still be computed on it after training.
        test_feature_org = all_feature[is_test].drop(["value_count"], axis=1)
        test_y_org = np.array(test_feature_org[predict_col])
        test_x_org = np.array(test_feature_org.drop(del_cols, axis=1))
        print("test_x_org", test_x_org.shape)

        # Non-inplace drop returns a fresh frame, so the column assignment
        # below does not write into a slice of the grouped frame.
        all_feature = all_feature[all_feature["value_count"] >= number_limit]
        all_feature = all_feature.drop(["value_count"], axis=1)

        # Map the raw values to consecutive class ids (and back via k_v).
        label = all_feature[predict_col]
        all_y = sorted(set(label))
        if len(all_y) == 1:
            # Only one class left: return it directly as the prediction.
            print("only one value!")
            return np.array([all_y[0]] * len(predict_feature)), 1
        v_k = {value: class_id for class_id, value in enumerate(all_y)}
        k_v = dict(enumerate(all_y))
        label = np.array([v_k[value] for value in label])
        all_feature[predict_col] = label

    train_feature = all_feature[all_feature[test_label_col] == 0]
    train_y = np.array(train_feature[predict_col])
    train_x = np.array(train_feature.drop(del_cols, axis=1))
    test_feature = all_feature[all_feature[test_label_col] == 1]
    test_y = np.array(test_feature[predict_col])
    test_x = np.array(test_feature.drop(del_cols, axis=1))
    predict_x = np.array(predict_feature.drop(del_cols, axis=1))
    print("train_x:", train_x.shape, "test_x:", test_x.shape,
          "predict_x", predict_x.shape)

    params = {
        'booster': 'gbtree',
        'eta': 0.02,
        'max_depth': 8,  # 5 4 3
        'colsample_bytree': 0.9,  # 0.8 0.7
        'subsample': 0.8,
        'min_child_weight': 40,  # 2 3
        'silent': 1,
        'nthread': 4,
        'tree_method': 'gpu_hist',
        "gpu_id": 0,
        "seed": 0
    }
    if is_bool:
        params["objective"] = "binary:logistic"
        params["eval_metric"] = "error"
        # NOTE(review): `is_unbalance` is a LightGBM option name; XGBoost's
        # documented counterpart is `scale_pos_weight` -- confirm whether
        # this entry has any effect.
        params["is_unbalance"] = True
        eval_metric = None
    elif is_enum:
        params["objective"] = "multi:softmax"
        params["eval_metric"] = "merror"
        params["num_class"] = max(label) + 1
        eval_metric = None
    else:
        params["objective"] = "reg:linear"
        eval_metric = tool.xgb_metric

    train_set = xgb.DMatrix(train_x, label=train_y)
    valid_set = xgb.DMatrix(test_x, label=test_y)
    # "error" / "merror" are error rates and must be minimised.  Only the
    # custom feval was maximised by the original call, so maximise exactly
    # when it is in use; otherwise early stopping would keep the worst round.
    temp_model = xgb.train(params, train_set,
                           num_boost_round=num_boost_round,
                           evals=[(valid_set, "validate")],
                           feval=eval_metric,
                           maximize=eval_metric is not None,
                           early_stopping_rounds=200, verbose_eval=False)

    test_pred = temp_model.predict(valid_set)
    # Turn probabilities into labels.
    if is_bool:
        test_pred = np.where(test_pred > 0.5, 1, 0)
    elif is_enum:
        # Score on the complete validation set when rare classes were removed.
        if del_test_size > 0:
            valid_set = xgb.DMatrix(test_x_org)
            test_pred = temp_model.predict(valid_set)
            test_y = test_y_org
        # The model output is used directly as the class id; map it back to
        # the raw value.
        test_pred = np.array([k_v[class_id] for class_id in test_pred])

    if predict_col in category_col:
        test_s = tool.label_score(test_y, test_pred)
    else:
        test_s = tool.regression_score(test_y, test_pred)

    # Rounding to two, one or zero decimals may score better.  `None` means
    # "do not round"; 0 is a valid choice and must not be mistaken for it.
    # NOTE(review): as in the original, this search is not restricted to
    # non-category columns -- confirm that this is intended.
    if_round = None
    for decimals in (2, 1, 0):
        rounded_score = tool.regression_score(test_y, np.round(test_pred, decimals))
        if test_s < rounded_score - threshold:
            if_round = decimals
            test_s = rounded_score
    print("best iteration: ", temp_model.best_iteration)
    print("test score: ", test_s)

    predict_set = xgb.DMatrix(predict_x)
    predict_target = np.array(temp_model.predict(predict_set))
    if is_enum:
        predict_target = np.array([k_v[class_id] for class_id in predict_target])
    elif is_bool:
        predict_target = np.where(predict_target > 0.5, 1, 0)
    if if_round is not None:
        predict_target = np.round(predict_target, if_round)
    return predict_target, test_s
def interpolate_predict(method="index"):
    """Fill missing values by pandas interpolation and write result/score CSVs.

    For every ``wtid`` in 1..33 and every column in ``var_col`` the function
    first scores the interpolation on the rows marked as test rows (their
    known value is blanked, interpolated and compared with the truth), then
    interpolates the real gaps of that column.  For non-category columns it
    also checks whether rounding to two or one decimals scores better by more
    than the module-level ``threshold`` and, if so, rounds the filled column.

    Side effects:
        Writes ``./result/<method>_score.csv`` (one row per ``wtid``, one
        column per variable) and ``./result/<method>_result.csv`` (rows with
        ``count_miss > 0``, columns ``head_col``, sorted by ``wtid``/``ts``).

    Args:
        method: Interpolation method forwarded to ``Series.interpolate``.
    """
    start = datetime.datetime.now()
    data = pd.read_hdf(data_path)
    score_df = pd.DataFrame()
    score_df["var"] = var_col

    # The test-marker column of a variable does not depend on the turbine,
    # so resolve it once.  As in the original lookup, a variable found in no
    # group falls back to the last group index (0 when there are no groups).
    marker_cols = {}
    for var in var_col:
        index = 0
        for index, group in enumerate(tool.types):
            if var in group:
                break
        marker_cols[var] = str(index) + "_test"

    turbine_frames = []
    for i in tqdm(range(1, 34)):
        # Work on a copy: columns are overwritten below and must not be
        # written into a filtered slice of `data`.
        sub = data[data["wtid"] == i].copy()
        score_temp = []
        for var in var_col:
            col_name = marker_cols[var]
            known = sub[pd.notna(sub[var])].reset_index(drop=True)
            truth = known[[var]].copy()
            is_test = known[col_name] == 1
            # Blank the test rows, interpolate them, compare with the truth.
            known.loc[is_test, var] = np.nan
            known[var] = known[var].interpolate(method=method)
            true_value = truth[is_test][var]
            predict_value = known[is_test][var]

            if_round = None
            if var in category_col:
                predict_value = np.array(predict_value).astype(int)
                true_value = np.array(true_value).astype(int)
                score = tool.label_score(true_value, predict_value)
            else:
                score = tool.regression_score(true_value, predict_value)
                for decimals in (2, 1):
                    rounded_score = tool.regression_score(
                        true_value, np.round(predict_value, decimals))
                    if score < rounded_score - threshold:
                        score = rounded_score
                        if_round = decimals
            score_temp.append(score)

            # Actual prediction: interpolate the real gaps of this column.
            sub[var] = sub[var].interpolate(method=method)
            if if_round is not None:
                sub[var] = np.round(sub[var], if_round)
        turbine_frames.append(sub)
        score_df[str(i)] = score_temp

    # One concat at the end instead of re-copying the result every iteration.
    final_result = pd.concat(turbine_frames, axis=0, ignore_index=True)

    score_df.set_index("var", inplace=True)
    score_df = score_df.T
    score_df.reset_index(inplace=True)
    score_df.rename(columns={"index": "wtid"}, inplace=True)
    score_df.to_csv("./result/{}_score.csv".format(method), encoding="utf8",
                    index=False, float_format='%.4f')

    final_result = final_result[final_result["count_miss"] > 0]
    # Non-inplace sort returns a fresh frame, so the dtype conversion below
    # does not write into a slice.
    final_result = final_result[head_col].sort_values(["wtid", "ts"])
    for var in category_col:
        final_result[var] = final_result[var].astype(int)
    final_result.to_csv("./result/{}_result.csv".format(method),
                        encoding="utf8", index=False, float_format='%.2f')
    end = datetime.datetime.now()
    print("finish", method, "interpolate_predict time: ", end - start)
def _model_predict(all_feature, predict_feature, predict_col):
    """Fit a random forest for ``predict_col`` and predict the missing rows.

    Rows of ``all_feature`` whose ``"<g>_test"`` marker is 0 are used for
    training and rows whose marker is 1 for scoring, where ``<g>`` is the
    position in ``tool.types`` of the column group that contains
    ``predict_col``.  A ``RandomForestClassifier`` is used for columns in
    ``category_col`` / ``ext_enum_col``, a ``RandomForestRegressor``
    otherwise.

    Note:
        Missing feature values are replaced by ``-1000`` directly in the
        passed ``all_feature`` and ``predict_feature`` frames.

    Args:
        all_feature: DataFrame holding ``predict_col``, the feature columns
            and the marker column of the target's group.
        predict_feature: DataFrame of the rows to predict; it must contain
            every non-target column of ``all_feature``.
        predict_col: Name of the column to predict.

    Returns:
        Tuple ``(predict_target, test_s)``: the predictions for
        ``predict_feature`` as a numpy array and the validation score.  When
        an enum column has a single remaining class, that class is returned
        for every row together with the score ``1``.

    Raises:
        ValueError: If ``predict_col`` is not part of any group in
            ``tool.types``.
    """
    # Columns that must not be used as features: the whole group that
    # contains the target, plus that group's test-marker column.
    del_cols = None
    index = None
    for index, group in enumerate(tool.types):
        if predict_col in group:
            del_cols = group.copy()
            break
    if del_cols is None:
        # The original code failed here with an AttributeError on None.
        raise ValueError(
            "predict_col {!r} is not in any group of tool.types".format(predict_col))
    test_label_col = str(index) + "_test"
    del_cols.append(test_label_col)

    # Fill missing values in every column except the target and the marker.
    all_col = list(all_feature.columns)
    all_col.remove(predict_col)
    all_col.remove(test_label_col)
    for c in all_col:
        all_feature[c] = all_feature[c].fillna(value=-1000)
        predict_feature[c] = predict_feature[c].fillna(value=-1000)

    is_enum = predict_col in enum_col or predict_col in ext_enum_col

    k_v = {}
    if is_enum:
        # Drop classes that occur fewer than `number_limit` times.
        def func_count(df):
            df['value_count'] = df[predict_col].count()
            return df

        number_limit = large_limit_col.get(predict_col, 10)
        all_feature = all_feature.groupby(predict_col).apply(func_count)
        is_test = all_feature[test_label_col] == 1
        is_rare = all_feature["value_count"] < number_limit
        del_test_size = int((is_test & is_rare).sum())
        print(predict_col, "del_test_size:", del_test_size)

        # Keep the complete validation set (rare classes included) so the
        # score can still be computed on it after training.
        test_feature_org = all_feature[is_test].drop(["value_count"], axis=1)
        test_y_org = np.array(test_feature_org[predict_col])
        test_x_org = np.array(test_feature_org.drop(del_cols, axis=1))
        print("test_x_org", test_x_org.shape)

        # Non-inplace drop returns a fresh frame, so the column assignment
        # below does not write into a slice of the grouped frame.
        all_feature = all_feature[all_feature["value_count"] >= number_limit]
        all_feature = all_feature.drop(["value_count"], axis=1)

        # Map the raw values to consecutive class ids (and back via k_v).
        label = all_feature[predict_col]
        all_y = sorted(set(label))
        if len(all_y) == 1:
            # Only one class left: return it directly as the prediction.
            print("only one value!")
            return np.array([all_y[0]] * len(predict_feature)), 1
        v_k = {value: class_id for class_id, value in enumerate(all_y)}
        k_v = dict(enumerate(all_y))
        label = np.array([v_k[value] for value in label])
        all_feature[predict_col] = label

    train_feature = all_feature[all_feature[test_label_col] == 0]
    train_y = np.array(train_feature[predict_col])
    train_x = np.array(train_feature.drop(del_cols, axis=1))
    test_feature = all_feature[all_feature[test_label_col] == 1]
    test_y = np.array(test_feature[predict_col])
    test_x = np.array(test_feature.drop(del_cols, axis=1))
    predict_x = np.array(predict_feature.drop(del_cols, axis=1))
    print("train_x:", train_x.shape, "test_x:", test_x.shape,
          "predict_x", predict_x.shape)

    if predict_col in category_col or predict_col in ext_enum_col:
        temp_model = RandomForestClassifier(n_jobs=num_threads)
    else:
        temp_model = RandomForestRegressor(n_jobs=num_threads)
    temp_model.fit(train_x, train_y)

    test_pred = temp_model.predict(test_x)
    if is_enum:
        # Score on the complete validation set when rare classes were removed.
        if del_test_size > 0:
            test_pred = temp_model.predict(test_x_org)
            test_y = test_y_org
        # Map the predicted class ids back to the raw values.
        test_pred = np.array([k_v[class_id] for class_id in test_pred])

    if predict_col in category_col:
        test_s = tool.label_score(test_y, test_pred)
    else:
        test_s = tool.regression_score(test_y, test_pred)

    # Rounding to two, one or zero decimals may score better.  `None` means
    # "do not round"; 0 is a valid choice and must not be mistaken for it.
    # NOTE(review): as in the original, this search is not restricted to
    # non-category columns -- confirm that this is intended.
    if_round = None
    for decimals in (2, 1, 0):
        rounded_score = tool.regression_score(test_y, np.round(test_pred, decimals))
        if test_s < rounded_score - threshold:
            if_round = decimals
            test_s = rounded_score
    print("test score: ", test_s)

    predict_target = np.array(temp_model.predict(predict_x))
    if is_enum:
        predict_target = np.array([k_v[class_id] for class_id in predict_target])
    if if_round is not None:
        predict_target = np.round(predict_target, if_round)
    return predict_target, test_s
def top_predict():
    """Fill missing values with the most frequent value and write CSVs.

    For every ``wtid`` in 1..33 and every column in ``var_col`` the most
    frequent value among the training rows (test marker 0) is used as the
    prediction and scored on the test rows (test marker 1).  When that score
    exceeds 0.1 and a second value exists, the second most frequent value is
    scored too and used instead if it scores strictly better.

    Side effects:
        Writes ``./result/top_result.csv`` (rows with ``count_miss > 0``,
        columns ``head_col``, sorted by ``wtid``/``ts``) and
        ``./result/top_score.csv`` (one row per ``wtid``, one column per
        variable).
    """
    data = pd.read_hdf(data_path)
    score_df = pd.DataFrame()
    score_df["var"] = list(var_col)
    start = datetime.datetime.now()

    turbine_frames = []
    for wtid in tqdm(range(1, 34)):
        # Work on a copy: the gaps are filled in place below and must not be
        # written into a filtered slice of `data`.
        use_data = data[data["wtid"] == wtid].copy()
        test_scores = []
        for var in var_col:
            train_data = use_data[pd.notna(use_data[var])]
            predict_data = use_data[pd.isna(use_data[var])]

            # Marker column of the group containing `var`; falls back to the
            # last group index (0 when there are no groups) if none matches.
            index = 0
            for index, group in enumerate(tool.types):
                if var in group:
                    break
            test_label_col = str(index) + "_test"

            train_feature = train_data[train_data[test_label_col] == 0]
            test_feature = train_data[train_data[test_label_col] == 1]
            test_y = np.array(test_feature[var])
            # Values ordered from most to least frequent.
            top_values = train_feature[var].value_counts().index
            scorer = tool.label_score if var in category_col else tool.regression_score

            # Use the most frequent value.
            test_score = scorer(test_y, np.array([top_values[0]] * len(test_y)))
            predict_y = np.array([top_values[0]] * len(predict_data))

            # Check the second most frequent value.
            # NOTE(review): the runner-up is only tried when the first score
            # is above 0.1 -- confirm that this direction is intended.
            if test_score > 0.1 and len(top_values) > 1:
                # Passed as an array, like the first candidate.
                test_score2 = scorer(test_y, np.array([top_values[1]] * len(test_y)))
                if test_score2 > test_score:
                    test_score = test_score2
                    predict_y = np.array([top_values[1]] * len(predict_data))

            test_scores.append(test_score)
            use_data.loc[predict_data.index, var] = predict_y
        score_df[str(wtid)] = test_scores
        turbine_frames.append(use_data[use_data["count_miss"] > 0])

    # One concat at the end instead of re-copying the result every iteration;
    # the non-inplace sort avoids sorting a slice in place.
    final_result = pd.concat(turbine_frames, axis=0, ignore_index=True)
    final_result = final_result[head_col].sort_values(["wtid", "ts"])
    final_result.to_csv("./result/top_result.csv", encoding="utf8",
                        index=False, float_format='%.2f')

    score_df.set_index("var", inplace=True)
    score_df = score_df.T
    score_df.reset_index(inplace=True)
    score_df.rename(columns={"index": "wtid"}, inplace=True)
    score_df.to_csv("./result/top_score.csv", encoding="utf8", index=False,
                    float_format='%.4f')
    end = datetime.datetime.now()
    print("finish top_predict time: ", end - start, "\n")