def main():
    """Train a LightGBM fraud classifier on transactions merged with an
    external model score, report KS/AUC on a 70/30 split of the training
    partition, and score the holdout partition to submit6.csv.

    Side effects: reads 'transaction_new.csv' and 'submit_disv1.csv' from
    the working directory and writes 'submit6.csv'.  Relies on module-level
    pd, train_test_split, LGBMClassifier, metrics and cal_ks_scipy.
    """
    transaction = pd.read_csv('transaction_new.csv')
    dis = pd.read_csv('submit_disv1.csv')
    # Attach the external model score to every transaction (join on id).
    transaction_new = pd.merge(transaction, dis[['TransactionID', 'score']],
                               on='TransactionID')
    feature = [
        f for f in transaction_new.columns
        if f != 'TransactionID' and f != 'split' and f != 'isFraud'
    ]
    # LightGBM rejects feature names containing spaces.
    fmap = {f: f.replace(' ', '_') for f in feature}
    transaction_new = transaction_new.rename(columns=fmap)
    data = transaction_new[transaction_new['split'] == 1]
    # BUG FIX: take an explicit copy so the prediction column assigned at
    # the end is not written through a view of transaction_new
    # (SettingWithCopyWarning, potentially a silent no-op).
    valid = transaction_new[transaction_new['split'] == 2].copy()
    train, test = train_test_split(data, test_size=0.3, random_state=42)
    train_x = train[list(fmap.values())]
    test_x = test[list(fmap.values())]
    train_y = train['isFraud'].astype('int')
    test_y = test['isFraud'].astype('int')
    clf = LGBMClassifier(
        boosting_type='gbdt',
        colsample_bytree=0.2,
        drop_rate=0.1,
        importance_type='split',
        learning_rate=0.04,
        max_bin=500,
        max_depth=4,
        min_child_samples=50,
        min_split_gain=0.1,
        n_estimators=500,
        n_jobs=-1,
        num_leaves=9,
        objective=None,
        random_state=24,
        reg_alpha=40,
        reg_lambda=10,
        sigmoid=0.4,
        silent=True,
        subsample_for_bin=24000,
        is_unbalance=True,
        subsample_freq=1)
    clf.fit(train_x, train_y)
    # KS and AUC on both splits, printed side by side to spot over-fitting.
    train_y_pred = clf.predict_proba(train_x)[:, 1]
    train_ks = cal_ks_scipy(train_y_pred, train_y)
    y_pred = clf.predict_proba(test_x)[:, 1]
    test_ks = cal_ks_scipy(y_pred, test_y)
    print(train_ks, test_ks)
    tr_auc = metrics.roc_auc_score(train_y, train_y_pred)
    te_auc = metrics.roc_auc_score(test_y, y_pred)
    print(tr_auc, te_auc)
    # Score the holdout with exactly the feature order the booster was fit on.
    valid['isFraud'] = clf.predict_proba(valid[clf._Booster.feature_name()])[:, 1]
    valid[['TransactionID', 'isFraud']].to_csv('submit6.csv', index=False)
def fit_lgb_model(model_spec, early_stopping_rounds=10):
    """Fit an LGBMClassifier from a declarative model spec and save its booster.

    model_spec is a dict with keys: "filename" (save name), "dataset"
    (zero-arg loader returning a DataFrame), "features" (mapping whose
    values are the feature column names), "monotone_constraints"
    (per-feature -1/0/+1 map) and "validation_size" (rows held out from
    the tail for early stopping).  Rows are weighted up linearly toward
    the most recent game via config.GAME_WEIGHTING_FACTOR.  The model is
    first fit with early stopping to find the best iteration count, then
    refit on the full dataset with that count, and the booster is saved
    under FILEPATH/models/<filename>.
    """
    ms = model_spec
    print("loading data using", ms["filename"])
    df = ms["dataset"]()
    print("converting all feature columns to float32")
    for col in ms["features"].values():
        df[col] = df[col].astype("float32")
    print(df.describe().T)
    n_targets_with_null = df["target"].isna().sum()
    print("dropping", n_targets_with_null, "rows with null in target")
    df = df[df["target"].notna()]
    monotone_constraints = [
        ms["monotone_constraints"].get(col, 0) for col in ms["features"].keys()
    ]
    print("monotone constraints:", monotone_constraints)
    model = LGBMClassifier(
        n_estimators=5_000,
        num_leaves=11,
        learning_rate=0.01,
        monotone_constraints=monotone_constraints,
        monotone_constraints_method="advanced",
    )
    # FIX: materialise the dict view as a list — pandas column selection
    # expects a list-like key, and a dict_values view is not reliably
    # accepted across pandas versions.
    X = df[list(ms["features"].values())]
    y = df["target"]
    # Weight games linearly from GAME_WEIGHTING_FACTOR (oldest) up to 1.0
    # (most recent) so recent games dominate the fit.
    n_games = df["game_id"].max() + 1
    game_pct = (df["game_id"] + 1) / n_games
    w = config.GAME_WEIGHTING_FACTOR + (1 - config.GAME_WEIGHTING_FACTOR) * game_pct
    eval_size = ms["validation_size"]
    X_tr, X_te = X.iloc[:-eval_size], X.iloc[-eval_size:]
    y_tr, y_te = y.iloc[:-eval_size], y.iloc[-eval_size:]
    w_tr, w_te = w.iloc[:-eval_size], w.iloc[-eval_size:]
    eval_set = [(X_te.values, y_te.values)]
    model.fit(
        X_tr,
        y_tr,
        sample_weight=w_tr,
        eval_set=eval_set,
        eval_sample_weight=[w_te],
        early_stopping_rounds=early_stopping_rounds,
        verbose=early_stopping_rounds,
    )
    # Early stopping found the best round count; refit on everything.
    print("refitting model with full dataset")
    model.set_params(n_estimators=model.best_iteration_)
    model.fit(X, y, sample_weight=w)
    pred = pd.Series(model.predict_proba(X)[:, 1])
    print("distribution of predictions:")
    print(pred.describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]))
    feature_importances = list(
        zip(
            model.booster_.feature_name(),
            model.booster_.feature_importance(importance_type="gain"),
        )
    )
    print("feature importance (by gain):")
    for feature, importance in sorted(feature_importances, key=lambda row: -row[1]):
        print(f"  {feature}: {importance}")
    filepath = os.path.join(FILEPATH, "models", ms["filename"])
    print("saving to", filepath)
    model.booster_.save_model(filepath)
def score(params, skf=skf, sample_weight=sample_weight):
    """Hyperopt objective: K-fold CV of an LGBMClassifier over self.X / self.y.

    Coerces the tunable parameters to their required types, evaluates
    sample-weighted accuracy and log-loss on each fold, and returns the
    mean log-loss as the hyperopt loss together with an accuracy summary
    (mean, min, max) under 'localCV_acc'.
    """
    params = {
        "max_depth": int(params["max_depth"]),
        "subsample": params["subsample"],
        "colsample_bytree": params['colsample_bytree'],
        "num_leaves": int(params['num_leaves']),
        "n_jobs": -2,
    }
    clf = LGBMClassifier(n_estimators=500, learning_rate=0.05, **params)
    acc_per_fold = []
    logloss_per_fold = []
    for train, val in skf.split(self.X, self.y):
        X_train, X_val = self.X[train], self.X[val]
        y_train, y_val = self.y[train], self.y[val]
        weight_train = sample_weight[train]
        weight_val = sample_weight[val]
        clf.fit(X_train, y_train,
                sample_weight=weight_train,
                eval_sample_weight=[weight_val],
                eval_set=[(X_val, y_val)],
                eval_metric="logloss",
                early_stopping_rounds=0,
                verbose=False)
        acc_per_fold.append(
            accuracy_score(y_val, clf.predict(X_val), sample_weight=weight_val))
        logloss_per_fold.append(
            log_loss(y_val, clf.predict_proba(X_val), sample_weight=weight_val))
    score_acc = (np.mean(acc_per_fold), np.min(acc_per_fold), np.max(acc_per_fold))
    logloss = np.mean(logloss_per_fold)
    return {'loss': logloss, 'status': STATUS_OK, 'localCV_acc': score_acc}
def fit():
    """Train the is_close LightGBM model and report validation metrics.

    Builds the preprocessing pipeline on the train split, applies it to
    the validation split, fits a default LGBMClassifier on four features,
    and prints accuracy, log-loss, a classification report, and
    per-customer top-1/5/10 hit rates.
    """
    train, validation, _ = train_validation_holdout_split(read('./data/train_set.csv'))
    steps = [
        preprocess,
        russia_only,
        rouble_only,
        with_transaction_location,
        with_job,
        (partial(fit_categories, ['mcc', 'city', 'terminal_id']), transform_categories),
        partial(calc_is_close, ['transaction_lat', 'transaction_lon'],
                ['work_add_lat', 'work_add_lon'])
    ]
    pipeline, train = fit_pipeline(steps, train)
    validation = pipeline(validation)
    feature_columns = ['mcc', 'city', 'amount', 'terminal_id']
    print(f'Train size: {len(train)}, Validation size: {len(validation)}')
    print(f'Features: {feature_columns}')
    model = LGBMClassifier()
    model.fit(train[feature_columns], train['is_close'])
    predictions = model.predict_proba(validation[feature_columns])
    accuracy_value = accuracy_score(validation['is_close'], np.argmax(predictions, axis=1))
    logloss_value = log_loss(validation['is_close'], predictions)
    print(f'Accuracy: {accuracy_value:.5f}, Logloss: {logloss_value:.5f}')
    print(classification_report(validation['is_close'], np.argmax(predictions, axis=1)))
    validation['probs'] = predictions[:, 1]
    # Top-k hit rate: is any of a customer's k highest-probability
    # transactions actually close to the target location?
    top1_accuracy = validation.groupby('customer_id').apply(
        lambda group: group.sort_values('probs').tail(1).is_close.max()).mean()
    top5_accuracy = validation.groupby('customer_id').apply(
        lambda group: group.sort_values('probs').tail(5).is_close.max()).mean()
    top10_accuracy = validation.groupby('customer_id').apply(
        lambda group: group.sort_values('probs').tail(10).is_close.max()).mean()
    print(f'Top1: {top1_accuracy:.5f}')
    print(f'Top5: {top5_accuracy:.5f}')
    print(f'Top10: {top10_accuracy:.5f}')
    # BUG FIX: removed a leftover `import pdb; pdb.set_trace()` debug
    # breakpoint that would hang any non-interactive run.
def single_model(df_final, train_y, weight=None, metric=None):
    """5-fold LightGBM over 33 classes; return the averaged test probabilities.

    Splits df_final into train/test using the module-level train_num /
    test_num boundaries, runs stratified 5-fold CV with early stopping,
    and averages the per-fold test predictions into a (n_test, 33) array.

    weight : optional full-length per-row sample weights for the training
        partition; metric : optional LightGBM eval_metric.
    """
    # NOTE(review): the test slice starts at test_num, not train_num — if
    # the two differ this drops or overlaps rows; confirm against caller.
    train_values, test_values = df_final[:train_num], df_final[test_num:]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    clf = LGBMClassifier(learning_rate=0.05, n_estimators=10000,
                         subsample=0.8, subsample_freq=1,
                         colsample_bytree=0.8, random_state=2019)
    test_pred_prob = np.zeros((test_values.shape[0], 33))
    for i, (trn_idx, val_idx) in enumerate(skf.split(train_values, train_y['label'])):
        print(i, 'fold...')
        t = time.time()
        trn_x, trn_y = train_values[trn_idx], train_y['label'][trn_idx]
        val_x, val_y = train_values[val_idx], train_y['label'][val_idx]
        train_amt, val_amt = train_y['due_amt'][trn_idx].values, train_y['due_amt'][val_idx].values
        # BUG FIX: sample_weight must be aligned with the training fold;
        # the original passed the full-length vector, which raises a
        # length-mismatch error whenever weight is supplied.
        fold_weight = None if weight is None else weight[trn_idx]
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                sample_weight=fold_weight,
                eval_metric=metric,
                early_stopping_rounds=100,
                verbose=5)
        # Average each fold model's test prediction into the ensemble.
        test_pred_prob += clf.predict_proba(
            test_values, num_iteration=clf.best_iteration_) / skf.n_splits
        print('runtime: {}\n'.format(time.time() - t))
    print('单模型拟合已完成')
    return test_pred_prob
def single_train(label_list, var_list):
    """Score one feature by K-fold cross-fitting a univariate LGBM model.

    Trains the classifier on the single feature and fills each fold's
    held-out rows with the predicted positive-class probability, so every
    sample receives an out-of-fold pvalue usable for AUC/KS evaluation.

    Returns the list of out-of-fold probabilities aligned with the input.
    """
    print("用该特征单独训练一个模型,得到pvalue,用来计算AUC/KS。K折交叉赋分,比较准确!")
    frame = pd.DataFrame({'label': label_list, 'var': var_list})
    frame['label'] = frame['label'].astype('int64')
    clf = LGBMClassifier(
        metric='binary_logloss', is_unbalance=True, random_state=11,
        silent=True, n_jobs=10, reg_alpha=0.3, reg_lambda=0.3,
        learning_rate=0.01, n_estimators=2000, subsample=0.6,
        colsample_bytree=0.3, num_leaves=7, max_depth=3,
        min_child_samples=2000, min_split_gain=0.1, min_child_weight=0.1,
        objective='binary', importance_type='gain',
    )
    # Shuffling keeps the good/bad sample ratio similar across folds;
    # without it the per-fold test AUC is unstable.
    splitter = KFold(n_splits=3, shuffle=True)
    # The index must start at 0, otherwise the positional fold indices
    # used below go wrong.
    frame = frame.reset_index(drop=True)
    for fit_idx, score_idx in splitter.split(frame):
        clf.fit(frame.loc[fit_idx, 'var'].values.reshape(-1, 1),
                frame.loc[fit_idx, 'label'])
        # Positive-class probability for the held-out fold.
        fold_probs = clf.predict_proba(
            frame.loc[score_idx, 'var'].values.reshape(-1, 1))[:, 1]
        frame.loc[score_idx, "pvalue"] = fold_probs
    return frame['pvalue'].tolist()
def main(mode, params, model_type):
    """Train the disk-failure LGBM model for one evaluation month.

    In "test" mode the model is fit on the full training set (early
    stopping against the training set itself), persisted, and the
    function returns.  Otherwise it early-stops against the held-out
    month with the custom F1 metric, persists the model, ranks and
    evaluates the predictions, and writes a submission plus feature
    importances under ../offline/.
    """
    n_estimators = params["n_estimators"]
    early_stopping_rounds = params["early_stopping_rounds"]
    eval_date = params["eval_date"]
    print("******************** eval_month: %s ********************" % eval_date)
    t2 = time.time()
    X_train, y_train, X_test, y_test = get_dataset(eval_date, mode, model_type)
    print("used time:{}m".format((time.time() - t2) // 60))
    gc.collect()
    print(X_train.shape, X_test.shape)
    print(y_train.value_counts())
    print(y_test.value_counts())
    print('************** training **************')
    clf = LGBMClassifier(learning_rate=0.01,
                         n_estimators=n_estimators,
                         num_leaves=127,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         random_state=2019,
                         metric=None)
    F1.clear()
    if mode == "test":
        clf.fit(X_train, y_train,
                eval_set=[(X_train, y_train)],
                early_stopping_rounds=early_stopping_rounds,
                verbose=10)
        now_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        joblib.dump(clf, 'pakdd_model{}.pkl'.format(model_type))
        return
    clf.fit(X_train, y_train,
            eval_set=[(X_test, y_test)],
            eval_metric=lambda y_true, y_pred: [custom_f1_eval(y_true, y_pred)],
            early_stopping_rounds=early_stopping_rounds,
            verbose=10)
    joblib.dump(clf, 'pakdd_model_valid.pkl')
    y_pred = clf.predict_proba(X_test)[:, 1]
    test_sub = TEST_SUB.copy()
    y_ranked = rank_result(y_pred, test_sub, verbose=True)
    evaluate_classification_new(y_ranked, verbose=True)
    file_path = "../offline/"
    now_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    submit = y_ranked[['manufacturer', 'model', 'serial_number', 'dt']]
    print("submit shape", submit.shape)
    submit.to_csv(file_path + "submit_%s.csv" % now_time, index=False, header=None)
    importances = pd.Series(clf.feature_importances_,
                            index=X_train.columns).sort_values(ascending=False)
    importances.to_frame().to_csv(file_path + 'lgb_feat_imp.csv', header=None)
class PHSICAdasynLGBM(BaseEstimator):
    """Upsample minority classes, select a small stable biomarker set with
    Block HSIC Lasso, and fit a gradient-boosting model over it.

    Parameters
    ----------
    n_features : int, optional (default=30)
        Max. number of biomarkers (important features) to be selected
    adasyn_neighbors : int, optional (default=10)
        K neighbors for ADASYN upsampling algorithm
    B : int, optional (default=20)
        Block size for Block HSIC Lasso
    M : int, optional (default=10)
        Max allowed permutations of samples for Block HSIC Lasso
    hsic_splits : int, optional (default=5)
        number of folds for verifying feature stability
    feature_neighbor_threshold : float, optional (default=0.4)
        threshold for considering neighbors of important features in
        stability check
    """

    def __init__(self, n_features=30, adasyn_neighbors=10, B=20, M=10,
                 hsic_splits=5, feature_neighbor_threshold=0.4):
        self.n_features = n_features
        self.adasyn_neighbors = adasyn_neighbors
        self.M = M
        self.B = B
        self.hsic_splits = hsic_splits
        self.neighbor_threshold = feature_neighbor_threshold

    def fit(self, X, y):
        """Select features stable across stratified splits, upsample, fit LGBM."""
        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits, random_state=42)
        idxs = []
        hsics = []
        for train_index, test_index in list(sss.split(X, y)):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X[train_index], y[train_index])
            hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)
            hsics.append(hsic_lasso2)
            # Not just the best features — pull in their similar neighbors
            # too, so the stability intersection below is less brittle.
            all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            for i in range(len(all_ft_idx)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10), dtype=int)
                # BUG FIX: neighbor scores are fractional; the original
                # cast them to int, truncating everything below 1.0 to 0
                # and emptying the > threshold filter below.
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10), dtype=float)
                idx = idx[np.where(score > self.neighbor_threshold)[0]]
                all_ft_idx = np.concatenate((all_ft_idx, idx))
            all_ft_idx = np.unique(all_ft_idx)
            idxs.append(all_ft_idx)
            # Keep only features that survive every split so far.
            if len(idxs) == 1:
                self.hsic_idx_ = idxs[0]
            else:
                self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
        print("HSIC done.", len(self.hsic_idx_))
        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors, n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
                # Resample once per remaining minority class.
                # BUG FIX: the original computed len(np.unique(y) - 1),
                # which subtracts 1 elementwise (leaving the length
                # unchanged) and loops one time too many; the intent is
                # (number of classes - 1).
                for i in range(len(np.unique(y)) - 1):
                    sX, sy = sm.fit_resample(sX, sy)
            except Exception:
                # Best effort: ADASYN can fail on tiny minority classes;
                # fall back to the unresampled data.
                pass
        print("ADASYN done. Starting clf")
        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self

    def predict_proba(self, X):
        """Class probabilities over the selected feature subset."""
        return self.clf_.predict_proba(X[:, self.hsic_idx_])

    def predict(self, X):
        """Class labels over the selected feature subset."""
        return self.clf_.predict(X[:, self.hsic_idx_])
# Free the raw test frame before training to keep memory down.
del test
gc.collect()

clf = LGBMClassifier(learning_rate=0.001,
                     n_estimators=100,
                     num_leaves=127,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     random_state=2019,
                     metric=None)
print('************** training **************')
print(train_df.shape, test_x.shape)
clf.fit(train_df, labels,
        eval_set=[(train_df, labels)],
        eval_metric='auc',
        early_stopping_rounds=10,
        verbose=10)

# Rank predicted failure probabilities and flag the top ~0.4% as faults.
sub['p'] = clf.predict_proba(test_x)[:, 1]
sub['label'] = sub['p'].rank()
sub['label'] = (sub['label'] >= sub.shape[0] * 0.996).astype(int)

# Keep only flagged rows, highest probability first, one row per disk.
# (Original note: submit the day with the highest failure probability;
# ~100 rows scored best among the thresholds tried online.)
submit = sub.loc[sub.label == 1]
submit = submit.sort_values('p', ascending=False)
submit = submit.drop_duplicates(['serial_number', 'model'])
submit[['manufacturer', 'model', 'serial_number', 'dt']].to_csv(
    "../sub.csv", index=False, header=None)
submit.shape  # notebook-style residue; no effect as a script statement
# One CV fold of the 33-class repayment-date model: fit on the training
# indices, fill the out-of-fold probability matrix, report fold metrics,
# and accumulate the fold-averaged test prediction.
trn_x, trn_y = train_values[trn_idx], clf_labels[trn_idx]
val_x, val_y = train_values[val_idx], clf_labels[val_idx]
val_repay_amt = amt_labels[val_idx]
val_due_amt = train_due_amt_df[val_idx]
val_df = train_df[[
    'listing_id', 'auditing_date', 'due_date', 'due_amt', 'repay_date'
]].iloc[val_idx]
clf.fit(trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        early_stopping_rounds=50,
        verbose=5,
        feature_name=list(feature_name))
# Per-sample probabilities over all 33 classes, shape (-1, 33).
val_pred_prob_everyday = clf.predict_proba(
    val_x, num_iteration=clf.best_iteration_)
prob_oof[val_idx] = val_pred_prob_everyday
# Probability the model assigned to each row's *true* class.
val_pred_prob_today = [
    val_pred_prob_everyday[i][val_y[i]]
    for i in range(val_pred_prob_everyday.shape[0])
]
# Predicted repayment amount = P(repay on the true day) * amount due.
val_pred_repay_amt = val_due_amt * val_pred_prob_today
print('val rmse:',
      np.sqrt(mean_squared_error(val_repay_amt, val_pred_repay_amt)))
print('val mae:', mean_absolute_error(val_repay_amt, val_pred_repay_amt))
print('val new rmse:', new_rmse(val_df, val_pred_prob_everyday))
amt_oof[val_idx] = val_pred_repay_amt
# Average each fold model's test prediction into the ensemble.
test_pred_prob += clf.predict_proba(
    test_values, num_iteration=clf.best_iteration_) / skf.n_splits
print('runtime: {}\n'.format(time.time() - t))
# Out-of-fold prediction buffer; only cv[:1] (the first fold) is run here.
all_pred = np.zeros(y_train.shape[0])
for train, test in cv[:1]:
    trn_x = x_train[train]
    val_x = x_train[test]
    trn_y = y_train[train]
    val_y = y_train[test]
    clf = LGBMClassifier(**params)
    clf.fit(
        trn_x,
        trn_y,
        eval_set=[(val_x, val_y)],
        verbose=True,
        # eval_metric='logloss',
        early_stopping_rounds=30)
    pred = clf.predict_proba(val_x)[:, 1]
    all_pred[test] = pred
    _score = log_loss(val_y, pred)
    # AUC negated so "lower is better" holds for both scores.
    _score2 = -roc_auc_score(val_y, pred)
    list_score.append(_score)
    list_score2.append(_score2)
    # Record the early-stopped tree count, falling back to the configured
    # n_estimators when early stopping never triggered.
    if clf.best_iteration != -1:
        list_best_iter.append(clf.best_iteration)
    else:
        list_best_iter.append(params['n_estimators'])
    # break
# NOTE(review): this pickles `pred` (the last fold's validation slice),
# not the full `all_pred` out-of-fold vector — confirm that is intended.
with open('train_cv_pred_base.pkl', 'wb') as f:
    pickle.dump(pred, f, -1)
# One weighted CV fold: fit with per-row sample weights, score weighted
# log-loss / AUC, then summarise across folds.
trn_y = y_train[train]
val_y = y_train[test]
trn_w = sample_weight[train]
val_w = sample_weight[test]
clf = LGBMClassifier(**params)
clf.fit(trn_x, trn_y,
        sample_weight=trn_w,
        eval_sample_weight=[val_w],
        eval_set=[(val_x, val_y)],
        verbose=False,
        # eval_metric='logloss',
        early_stopping_rounds=100
        )
pred = clf.predict_proba(val_x)[:, 1]
_score = log_loss(val_y, pred, sample_weight=val_w)
# AUC negated so both metrics are "lower is better".
_score2 = - roc_auc_score(val_y, pred, sample_weight=val_w)
list_score.append(_score)
list_score2.append(_score2)
# Fall back to the configured n_estimators when early stopping never fired.
if clf.best_iteration != -1:
    list_best_iter.append(clf.best_iteration)
else:
    list_best_iter.append(params['n_estimators'])
logger.info('trees: {}'.format(list_best_iter))
# Use the mean early-stopped tree count for the final refit.
params['n_estimators'] = np.mean(list_best_iter, dtype=int)
score = (np.mean(list_score), np.min(list_score), np.max(list_score))
score2 = (np.mean(list_score2), np.min(list_score2), np.max(list_score2))
logger.info('param: %s' % (params))
logger.info('loss: {} (avg min max {})'.format(score[use_score], score))
# One CV fold using a custom callback and a dummy eval metric; scores
# log-loss, AUC, and the custom F1 on the validation slice.
clf = LGBMClassifier(**params)
clf.fit(
    trn_x,
    trn_y,
    callbacks=[callback],
    # init_score=trn_sc,
    # eval_init_score=[val_sc],
    # sample_weight=trn_w,
    # eval_sample_weight=[val_w],
    eval_set=[(val_x, val_y)],
    verbose=False,
    eval_metric=dummy,  # f1_metric,
    # early_stopping_rounds=50
    # categorical_feature=['o_product_id', 'o_user_id', 'p_aisle_id', 'p_department_id']
)
pred = clf.predict_proba(val_x)[:, 1]
all_pred[test] = pred
_score = log_loss(val_y, pred)
# AUC and F1 negated so "lower is better" holds throughout.
_score2 = -roc_auc_score(val_y, pred)
_, _score3, _ = f1_metric(val_y.astype(int), pred.astype(float))
logger.debug('   _score: %s' % _score3)
list_score.append(_score)
list_score2.append(_score2)
list_score3.append(-1 * _score3)
# Fall back to the configured n_estimators when early stopping never fired.
if clf.best_iteration != -1:
    list_best_iter.append(clf.best_iteration)
else:
    list_best_iter.append(params['n_estimators'])
# The per-fold prediction dump opens here; the `with` body continues
# outside this chunk.
with open(DIR + 'train_cv_pred_%s.pkl' % cnt, 'wb') as f:
# Out-of-fold / test prediction buffers for the 6-class model plus a
# per-fold feature-importance table.
oof = np.zeros((len(train), 6))
predictions = np.zeros((len(test), 6))
feature_importance_df = pd.DataFrame()
for i, (trn_idx, val_idx) in enumerate(skf.split(train_values, clf_labels)):
    print(i, 'fold...')
    trn_x, trn_y = train_values[trn_idx], clf_labels[trn_idx]
    val_x, val_y = train_values[val_idx], clf_labels[val_idx]
    clf.fit(trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            early_stopping_rounds=100,
            verbose=50)
    oof[val_idx] = clf.predict_proba(train_values[val_idx],
                                     num_iteration=clf.best_iteration_)
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importances_
    # BUG FIX: the original wrote `+1` (the constant 1) for every fold;
    # the intent is to record the 1-based fold number.
    fold_importance_df["fold"] = i + 1
    feature_importance_df = pd.concat(
        [feature_importance_df, fold_importance_df], axis=0)
    # Average each fold model's test prediction into the ensemble.
    predictions += clf.predict_proba(
        test_values, num_iteration=clf.best_iteration_) / skf.n_splits

# Evaluation metric: weighted one-vs-rest AUC over out-of-fold predictions.
clf_one_hot = pd.Series(clf_labels)
clf_one_hot = pd.get_dummies(clf_one_hot)
auc = roc_auc_score(clf_one_hot, oof, average='weighted')
def ks_score(kwargs: dict) -> dict:
    """Hyperopt objective: K-fold KS score for an LGBM model.

    Flattens the nested boosting_type choice, coerces integer-valued
    hyperparameters, and for each fold uses lgb.cv to pick n_estimators
    by AUC before fitting an LGBMClassifier and computing KS on the
    held-out fold.  Returns -(mean KS - 1.96*SEM) as the loss and appends
    a result row to out_file.  Relies on module-level raw_features,
    X_raw, y, kf, params, model, logger, out_file and i.
    """
    starttime = time.time()
    # Retrieve the subsample if present otherwise set to 1.0
    subsample = kwargs["boosting_type"].get("subsample", 1.0)
    # Extract the boosting type
    kwargs["boosting_type"] = kwargs["boosting_type"]["boosting_type"]
    kwargs["subsample"] = subsample
    # Make sure parameters that need to be integers are integers
    for k in (
            "feature_num",
            "max_depth",
            "num_leaves",
            "n_estimators",
            "min_child_samples",
            "subsample_for_bin",
    ):
        if isinstance(kwargs.get(k), float):
            kwargs[k] = int(kwargs[k])
    ks_list = []
    chosen_feature = raw_features[:kwargs["feature_num"]]
    X = X_raw[chosen_feature]
    logger.info("开始训练1111111111111111")
    logger.info("{}".format(kwargs))
    # FIX: loop counter renamed from `iter`, which shadowed the builtin.
    fold = 0
    estr = {}
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        params.update(kwargs)
        # n_estimators is chosen by CV below; drop any incoming value.
        # (pop with a default replaces the original try/except KeyError.)
        params.pop("n_estimators", None)
        if model == "lgb":
            train_data = lgb.Dataset(X_train, y_train, silent=True)
            cv_result = lgb.cv(
                params,
                train_data,
                num_boost_round=10000,  # n_estimators
                early_stopping_rounds=100,
                nfold=5,
                seed=555,
                metrics="auc",  # Evaluation metric monitored during CV.
                verbose_eval=False,
            )
            # Boosting rounds that returned the highest cv score
            print('222222222222222')
            n_estimators = int(np.argmax(cv_result["auc-mean"]) + 1)
            logger.info("n_estimators:{}".format(n_estimators))
            params["n_estimators"] = n_estimators
            clf = LGBMClassifier(**params)
            clf.fit(X_train, y_train)
            y_pred_test = clf.predict_proba(X_test)[:, 1]
            fold += 1
            # FIX: store the scalar round count (the original wrapped it
            # in a one-element set literal).
            estr[fold] = n_estimators
        else:
            raise NotImplementedError("Not implemented!")
        ks_list.append(calc_ks(y_pred_test, y_test, method="crosstab"))
    # Lower bound of the 95% confidence interval of the mean KS.
    ks_arr = np.asarray(ks_list)
    score = np.mean(ks_arr) - 1.96 * np.std(ks_arr) / np.sqrt(len(ks_arr))
    loss = -score
    run_time = time.time() - starttime
    # FIX: context manager so the results file is always closed (the
    # original leaked the 'a'-mode handle).
    with open(out_file, "a") as of_connection:
        csv.writer(of_connection).writerow([loss, params, estr, run_time, i])
    return {
        "loss": loss,
        "params": params,
        "estimators": i,
        "train_time": run_time,
        "status": STATUS_OK,
    }
# K-fold training of the binary model; each fold's classifier is saved
# to disk and the 33-class test probabilities are averaged across folds.
for i, (trn_idx, val_idx) in enumerate(skf.split(train_values, clf_labels_2)):  # features, labels
    print(i, 'fold...')
    t = time.time()
    trn_x, trn_y = train_values.values[trn_idx], clf_labels_2[trn_idx]  # training fold
    val_x, val_y = train_values.values[val_idx], clf_labels_2[val_idx]  # validation fold
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        early_stopping_rounds=100,
        verbose=5
    )  # train with early stopping on the held-out fold
    # shape = (-1, 33): average each fold's class probabilities per test row.
    test_pred_prob += clf.predict_proba(test_values.values, num_iteration=clf.best_iteration_) / skf.n_splits
    joblib.dump(clf, '../data/paipaidai_binary_%d.pkl' % i)
    print('runtime: {}\n'.format(time.time() - t))
# Per-target loop body: fit with AUC early stopping, compute the per-user
# uAUC on the validation split, and remember the best round count for the
# later full-data refit.
print('=========', y, '=========')
t = time.time()
clf = LGBMClassifier(learning_rate=0.05,
                     n_estimators=5000,
                     num_leaves=63,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     random_state=2021,
                     metric='None')
clf.fit(trn_x[cols], trn_x[y],
        eval_set=[(val_x[cols], val_x[y])],
        eval_metric='auc',
        early_stopping_rounds=100,
        verbose=50)
val_x[y + '_score'] = clf.predict_proba(val_x[cols])[:, 1]
val_uauc = uAUC(val_x[y], val_x[y + '_score'], val_x['userid'])
uauc_list.append(val_uauc)
print(val_uauc)
r_list.append(clf.best_iteration_)
print('runtime: {}\n'.format(time.time() - t))
# Competition metric: fixed weights over the four targets' uAUCs.
weighted_uauc = 0.4 * uauc_list[0] + 0.3 * uauc_list[1] + 0.2 * uauc_list[
    2] + 0.1 * uauc_list[3]
print(uauc_list)
print(weighted_uauc)
##################### 全量训练 (full-data training) #####################
# Refit each target on all data using the early-stopped round counts
# found above; the constructor call continues beyond this chunk.
r_dict = dict(zip(y_list[:4], r_list))
for y in y_list[:4]:
    print('=========', y, '=========')
    t = time.time()
    clf = LGBMClassifier(learning_rate=0.05,
# Visualise the fitted model's decision surface over the first two
# features, with train and (faded) test points overlaid.
margin = .5
x_lo, x_hi = x_train[:, 0].min() - margin, x_train[:, 0].max() + margin
y_lo, y_hi = x_train[:, 1].min() - margin, x_train[:, 1].max() + margin
step = .02
xx, yy = np.meshgrid(np.arange(x_lo, x_hi, step), np.arange(y_lo, y_hi, step))
print(xx)

cm = plt.cm.RdBu
cm_bright = plt.cm.RdBu
ax = plt.subplot(1, 1, 1)
ax.set_xlim(xx.min(), xx.max())
ax.set_ylim(yy.min(), yy.max())
ax.set_xticks(())
ax.set_yticks(())

ax.scatter(x_train[:, 0], x_train[:, 1], c=y_train, cmap=cm_bright,
           edgecolors='k')
ax.scatter(x_test[:, 0], x_test[:, 1], c=y_test, cmap=cm_bright,
           alpha=.6, edgecolors='k')

# Positive-class probability over the whole mesh, reshaped onto the grid.
z = lgb.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
z = z.reshape(xx.shape)
ax.contourf(xx, yy, z, cmap=cm, alpha=.8)
plt.show()
# NOTE: this chunk opens mid-way through an LGBMClassifier(...) call whose
# beginning lies outside the visible region; the first line closes it.
metric=None)
print('************** training **************')
clf.fit(train_x, train_y,
        eval_set=[(val_x, val_y)],
        eval_metric='auc',
        categorical_feature=cate_cols,
        early_stopping_rounds=200,
        verbose=50)
print('runtime:', time.time() - t)
print('************** validate predict **************')
# Early-stopped round count and its validation AUC, kept for the refit.
best_rounds = clf.best_iteration_
best_auc = clf.best_score_['valid_0']['auc']
val_pred = clf.predict_proba(val_x)[:, 1]
fea_imp_list.append(clf.feature_importances_)
print('runtime:', time.time() - t)
print(
    '=============================================== training predict ==============================================='
)
# Refit on the full training frame with the early-stopped round count;
# the fit call continues past this chunk.
clf = LGBMClassifier(learning_rate=0.01,
                     n_estimators=best_rounds,
                     num_leaves=255,
                     subsample=0.9,
                     colsample_bytree=0.8,
                     random_state=2019)
print('************** training **************')
clf.fit(train_df,
# NOTE: this chunk opens with the tail of a call (presumably a data split
# or vectoriser) begun outside the visible region.
random_state=42)
print('Train classifier...')
clf = LGBMClassifier(boosting_type='gbdt',
                     objective='binary',
                     max_depth=-1,
                     num_leaves=2**7 - 1,
                     learning_rate=0.01,
                     n_estimators=2000,
                     min_split_gain=0.0,
                     min_child_weight=0.001,
                     subsample=0.8,
                     colsample_bytree=0.7,
                     random_state=888)
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric='logloss',
    early_stopping_rounds=100,
    verbose=50,
)
########################################
print('Predict...')
# Submit the positive-class probability on the TF-IDF test matrix.
pred = clf.predict_proba(tfidf_test)
make_submission(pred[:, 1])
print('Complete')
# NOTE: this chunk opens with the tail of a train_test_split(...) call
# begun outside the visible region.
y, random_state=0, test_size=0.2)
#%% time
clf = LGBMClassifier(num_leaves=80,
                     objective='binary',
                     max_depth=30,
                     learning_rate=0.01,
                     min_child_samples=20,
                     random_state=2021,
                     n_estimators=1000,
                     subsample=0.9,
                     colsample_bytree=0.9)
clf.fit(X_train, y_train)
# %%
# In-sample vs. held-out AUC.
pred_y = clf.predict_proba(X_train)[:, -1]
print("train:", roc_auc_score(y_train, pred_y))
pred_y = clf.predict_proba(X_test)[:, -1]
print("test:", roc_auc_score(y_test, pred_y))
# %%
import tools
#%%
# Preprocess the raw test set the same way as the training data, then
# score it and write the submission.
test = pd.read_csv(f"{dirPath}rawData/testB.csv")
test = tools.preprocess(test, savePath=f"{dirPath}data/test_v2.pkl")
#%%
test_sub = pd.read_csv(f"{dirPath}rawData/sample_submit.csv")
pred_y = clf.predict_proba(test)
test_sub['isDefault'] = pred_y[:, -1]
test_sub.to_csv(f"{dirPath}submits/subMay5-12.csv", index=False)
val_x, val_y = train_values.values[val_idx], clf_labels[val_idx] #测试集 val_repay_amt = amt_labels[val_idx] #训练集对应的实际还款金额 val_due_amt = train_due_amt_df.iloc[val_idx] #训练集对应的应还款金额 clf.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y), (val_x, val_y)], early_stopping_rounds=100, verbose=5) #交叉验证进行训练 # shepe = (-1, 33) val_pred_prob_everyday = clf.predict_proba( val_x, num_iteration=clf.best_iteration_ ) #预测,是一个shepe = (-1, 33)的结果,每一行的33个数代表每一类别的概率 prob_oof[val_idx] = val_pred_prob_everyday #一折训练集的预测结果,将预测结果填到相对应的验证集的位置上 val_pred_prob_today = [ val_pred_prob_everyday[i][val_y[i]] for i in range(val_pred_prob_everyday.shape[0]) ] #i表示第i条验证集的预测结果,该结果包含33个概率,val_y[i]表示验证集真实的label,所以val_pred_prob_today保存的就是第i条验证集样本的33个预测类别中,真实类别的概率 val_pred_repay_amt = val_due_amt[ 'due_amt'].values * val_pred_prob_today #实际还款那一天的预测还款金额=实际还款那一天对应的概率*应还款金额 print( 'val rmse:', np.sqrt(mean_squared_error(
# NOTE: this chunk opens with the tail of an early-stopped clf.fit(...)
# call begun outside the visible region.
early_stopping_rounds=200,
verbose=200,
)
best_iter = clf.best_iteration_
# Refit on the full labelled data with the early-stopped tree count,
# then score the test set and dump submission + feature importances.
X_trn, Y_trn = df_train[feats], df_train.label
X_sub = df_test[feats]
clf = LGBMClassifier(
    objective='binary',
    learning_rate=0.05,
    n_estimators=best_iter,
    num_leaves=63,
    subsample=0.6,
    colsample_bytree=0.6,
    random_state=2020,
    n_jobs=32,
)
clf.fit(X_trn, Y_trn, verbose=200)
sub = clf.predict_proba(X_sub)[:, 1]
imp = clf.feature_importances_
pd.DataFrame({
    'id': df_test.id,
    'probability': sub.astype('float32'),
}).to_csv('submission_1.csv', index=None)
pd.DataFrame({
    'feat': feats,
    'imp': imp,
}).to_csv('feat_imp_1.csv', index=None)
def train_lightgbm(verbose=True, seed=0):
    """Compare 5-fold CV log-loss of one LGBM setup on two feature sets.

    Uses module-level feature matrices x and x2 with the shared target y,
    logs seed, both mean scores, their difference and the difference of
    the worst-fold scores, and returns the last fitted classifier.
    (Removed the unused min_score / min_params grid-search leftovers.)
    """
    all_params = {'max_depth': [3],
                  'learning_rate': [0.06],
                  'n_estimators': [1500],
                  'min_child_weight': [0],
                  'subsample': [1],
                  'colsample_bytree': [0.5],
                  'boosting_type': ['gbdt'],
                  'seed': [2261]
                  }
    # Every grid axis has exactly one value, so this "search" just
    # materialises the single parameter combination (loop keeps the last).
    for params in ParameterGrid(all_params):
        pass
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    def _cv_logloss(features):
        # Mean and worst-fold validation log-loss for one feature matrix.
        # Folds are split on x (as in the original) so both feature sets
        # see identical train/test indices.
        fold_scores = []
        for train, test in cv.split(x, y):
            clf = LGBMClassifier(**params)
            clf.fit(features[train], y[train],
                    eval_set=[(features[test], y[test])],
                    verbose=verbose,
                    early_stopping_rounds=300)
            fold_scores.append(
                log_loss(y[test], clf.predict_proba(features[test])[:, 1]))
        return np.mean(fold_scores), np.max(fold_scores), clf

    score, max_score, _ = _cv_logloss(x)
    score2, max_score2, clf = _cv_logloss(x2)
    logger.info('seed: %s, score: %s %s %s %s'
                % (seed, score, score2, score - score2, max_score - max_score2))
    return clf
def train_lightgbm(verbose=True, idx=0):
    """Train LightGBM on the ``idx``-th per-patient feature row.

    Loads one ``.npy`` feature array per patient id from ``FEATURE_FOLDER``
    (skipping patients with fewer than ``idx + 1`` rows), runs one 5-fold
    stratified CV with a fixed parameter set, logs the mean log-loss, and
    returns the classifier trained on the last fold.
    """
    logger.info("Training with LightGBM")
    df = pd.read_csv(STAGE1_LABELS)

    # Collect the idx-th feature row for every patient that has one.
    data = []
    use_idx = []
    for id in df['id'].tolist():
        nd = np.load(os.path.join(FEATURE_FOLDER, '%s.npy' % str(id)))
        if nd.shape[0] <= idx:
            continue
        data.append(nd[idx])
        use_idx.append(id)
    x = np.array(data)
    # .values replaces DataFrame.as_matrix(), which was removed from pandas.
    y = df[df['id'].isin(use_idx)]['cancer'].values

    # The original built a 4x2x1x2x2x3x1x3x1 ParameterGrid, immediately
    # overwrote every candidate with this fixed dict, and broke after the
    # first pass — the grid was dead code, so use the fixed params directly.
    params = {
        'min_child_weight': 0,
        'num_leaves': 10,
        'learning_rate': 0.06,
        'subsample': 0.99,
        'max_depth': 5,
        'boosting_type': 'gbdt',
        'seed': 2261,
        'colsample_bytree': 0.5,
        'n_estimators': 1500,
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    list_score = []
    for train, test in cv.split(x, y):
        trn_x, val_x = x[train], x[test]
        trn_y, val_y = y[train], y[test]
        clf = LGBMClassifier(**params)
        clf.fit(trn_x, trn_y,
                eval_set=[(val_x, val_y)],
                verbose=verbose,
                early_stopping_rounds=300)
        list_score.append(log_loss(val_y, clf.predict_proba(val_x)[:, 1]))
    score = np.mean(list_score)
    # The sklearn wrapper exposes the early-stopped round count as
    # best_iteration_ (the original's `best_iteration` is not an attribute
    # of LGBMClassifier and would raise AttributeError).
    params['n_estimators'] = clf.best_iteration_
    logger.info('idx:%s, score: %s' % (idx, score))
    return clf
metric=None) print('************** training **************') clf.fit(train_x, train_y, eval_set=[(val_x, val_y)], eval_metric='auc', categorical_feature=cate_cols, early_stopping_rounds=200, verbose=50) print('runtime:', time.time() - t) print('************** validate predict **************') best_rounds = clf.best_iteration_ best_auc = clf.best_score_['valid_0']['auc'] val_pred = clf.predict_proba(val_x)[:, 1] fea_imp_list.append(clf.feature_importances_) print('runtime:', time.time() - t) print( '=============================================== training predict ===============================================' ) clf = LGBMClassifier(learning_rate=0.01, n_estimators=best_rounds, num_leaves=255, subsample=0.9, colsample_bytree=0.8, random_state=2019) print('************** training **************') clf.fit(train_df,
'ps_car_06_cat','ps_car_07_cat','ps_car_08_cat','ps_car_09_cat','ps_ind_02_cat','ps_ind_04_cat','ps_ind_05_cat'] for f in encode_feature: tempDfTrain,tempDfTest = cat_feature_encoder(tempDfTrain,tempDfTest,f) tempDfTrain=tempDfTrain.drop(encode_feature,axis=1) tempDfTest=tempDfTest.drop(encode_feature,axis=1) X_train = tempDfTrain.drop(['id','target'],axis=1) y_train = y[train_index] X_valid = tempDfTest.drop(['id','target'],axis=1) y_valid = y[test_index] LGB = LGBMClassifier(n_estimators=2000,max_depth=8,learning_rate=0.01,subsample=0.7,colsample_bytree=0.7,min_child_weight=50) eval_set=[(X_valid, y_valid)] LGB.fit(X_train, y_train,early_stopping_rounds=100,eval_metric=gini_lgb_used,eval_set=eval_set,verbose=100) pred = LGB.predict_proba(X_valid,num_iteration=LGB.best_iteration)[:,1] cv_vaild_.iloc[test_index] =pred score = gini_c(pred,y_valid) print('valid-gini:'+ str(score)) score_list.append(score) p_test = LGB.predict_proba(test,num_iteration=LGB.best_iteration)[:,1] p_test = np.log(p_test) sub['model_lgb_submit'] += p_test/kfold print('avg-valid-gini:'+ str(sum(score_list)/kfold)) sub['target'] = np.e**sub['model_lgb_submit'] sub.to_csv('../output/model_lgb_submit.csv', index=False, float_format='%.5f')
# Train a single tuned LightGBM model, write its submission, then train a
# bagged ensemble of the same base model and write its submission.
lgb1 = LGBMClassifier(boosting_type='gbdt', learning_rate=0.1, n_estimators=1000, max_depth=5, min_child_weight=1, subsample=0.8, colsample_bytree=0.8, objective='binary', n_jobs=4, random_state=66)
# lgbfit is a project helper; useTrainCV presumably tunes n_estimators via CV
# before the final fit — TODO confirm against its definition.
lgbfit(lgb1, X_train, y_train, useTrainCV=True)
test_target = lgb1.predict_proba(X_test)[:, 1]
# NOTE(review): exp(p) - 1 re-maps probabilities back onto [0, 1]; looks like
# a deliberate score transform (inverse of a log1p applied upstream?) rather
# than calibration — verify intent.
test_target = (np.exp(test_target) - 1.0).clip(0, 1).tolist()
sub = pd.DataFrame([test_id, test_target]).transpose()
sub.columns = ['id', 'target']
sub['id'] = sub['id'].astype('int32')
sub.to_csv('submission.csv', index=False, float_format='%.5f')
###############################Bagging#####################################
bag = BaggingClassifier(lgb1, max_samples=0.8, max_features=0.8)
bag.fit(X_train, y_train)
test_target = bag.predict_proba(X_test)[:, 1].tolist()
sub = pd.DataFrame([test_id, test_target]).transpose()
sub.columns = ['id', 'target']
sub['id'] = sub['id'].astype('int32')
# NOTE(review): the bagged predictions are NOT exp-transformed like the single
# model's, and this write targets the same 'submission.csv' as above, so the
# bagging result overwrites the single-model submission — confirm intended.
sub.to_csv('submission.csv', index=False, float_format='%.5f')
def train_lightgbm(verbose=True):
    """Grid-search LightGBM hyper-parameters with 5-fold CV, refit the best.

    Builds one mean-pooled feature vector per patient from ``FEATURE_FOLDER``,
    scores every ``ParameterGrid`` combination by mean 5-fold log-loss, logs
    each result, then fits the best parameter set on all data.

    Returns the refitted LGBMClassifier.
    """
    logger.info("Training with LightGBM")
    df = pd.read_csv(STAGE1_LABELS)
    x = np.array([
        np.mean(np.load(os.path.join(FEATURE_FOLDER, '%s.npy' % str(id))),
                axis=0).flatten() for id in df['id'].tolist()
    ])
    # .values replaces DataFrame.as_matrix(), which was removed from pandas.
    y = df['cancer'].values

    all_params = {
        'max_depth': [5, 10, 20, 50],
        'learning_rate': [0.06, 0.1],
        'n_estimators': [1500],
        'min_child_weight': [0, 0.1],
        'subsample': [0.99, 0.8],
        'colsample_bytree': [0.8, 0.5, 1],
        'boosting_type': ['gbdt'],
        'num_leaves': [10, 21, 50],
        'seed': [2261]
    }

    min_score = 100
    min_params = None
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for params in ParameterGrid(all_params):
        list_score = []
        for train, test in cv.split(x, y):
            trn_x, val_x = x[train], x[test]
            trn_y, val_y = y[train], y[test]
            clf = LGBMClassifier(**params)
            clf.fit(trn_x, trn_y,
                    eval_set=[(val_x, val_y)],
                    verbose=verbose,
                    early_stopping_rounds=300)
            list_score.append(log_loss(val_y, clf.predict_proba(val_x)[:, 1]))
        # BUG FIX: the original overwrote this mean with the last fold's
        # log-loss right after computing it, so model selection effectively
        # compared single-fold scores. Select on the mean CV score.
        score = np.mean(list_score)
        # Record the early-stopped round count (last fold) so the final refit,
        # which has no eval set, does not run the full 1500 rounds. The sklearn
        # wrapper's attribute is best_iteration_, not best_iteration.
        params['n_estimators'] = clf.best_iteration_
        logger.info('param: %s' % (params))
        logger.info('score: %s' % score)
        if min_score > score:
            min_score = score
            min_params = params
    logger.info('best score: %s' % min_score)
    logger.info('best_param: %s' % (min_params))

    clf = LGBMClassifier(**min_params)
    clf.fit(x, y)
    return clf
def fit(objective):
    """Fit an is-close classifier for a customer's work or home location.

    ``objective`` selects the target coordinate columns (``'work'`` -> work
    address, anything else -> home address). Fits the preprocessing pipeline
    on the train split, trains an LGBMClassifier on the engineered cluster
    features, prints validation metrics (accuracy, log-loss, classification
    report, top-1/5/10 per-customer hit rates), and returns
    ``(model, pipeline, feature_columns)``.
    """
    if objective == 'work':
        target_columns = ['work_add_lat', 'work_add_lon']
    else:
        target_columns = ['home_add_lat', 'home_add_lon']

    train, validation, _ = train_validation_holdout_split(
        read('./data/train_set.csv'))

    steps = [
        preprocess, russia_only, rouble_only, with_transaction_location,
        partial(with_columns, target_columns), cluster,
        merge_cluster_features,
        (partial(fit_categories, ['mcc']), transform_categories),
        partial(calc_is_close, ['transaction_lat', 'transaction_lon'],
                target_columns)
    ]
    pipeline, train = fit_pipeline(steps, train)
    validation = pipeline(validation)

    feature_columns = [
        # Original transaction features
        'amount', 'mcc',
        # Cluster features
        'amount_hist_-1.0', 'amount_hist_-2.0', 'amount_hist_0.0',
        'amount_hist_1.0', 'amount_hist_2.0', 'amount_hist_3.0',
        'amount_hist_4.0', 'amount_hist_5.0', 'amount_hist_6.0',
        'amount_ratio', 'area', 'cluster_id', 'date_ratio', 'day_hist_0',
        'day_hist_1', 'day_hist_2', 'day_hist_3', 'day_hist_4', 'day_hist_5',
        'day_hist_6', 'mcc_hist_4111.0', 'mcc_hist_5261.0', 'mcc_hist_5331.0',
        'mcc_hist_5411.0', 'mcc_hist_5499.0', 'mcc_hist_5541.0',
        'mcc_hist_5691.0', 'mcc_hist_5812.0', 'mcc_hist_5814.0',
        'mcc_hist_5912.0', 'mcc_hist_5921.0', 'mcc_hist_5977.0',
        'mcc_hist_6011.0', 'mcc_hist_nan', 'transaction_ratio'
    ]

    print(f'Train size: {len(train)}, Validation size: {len(validation)}')
    print(f'Features: {feature_columns}')

    model = LGBMClassifier()
    model.fit(train[feature_columns], train['is_close'])

    predictions = model.predict_proba(validation[feature_columns])
    accuracy_value = accuracy_score(validation['is_close'],
                                    np.argmax(predictions, axis=1))
    logloss_value = log_loss(validation['is_close'], predictions)
    print(f'Accuracy: {accuracy_value:.5f}, Logloss: {logloss_value:.5f}')
    print(
        classification_report(validation['is_close'],
                              np.argmax(predictions, axis=1)))

    validation['probs'] = predictions[:, 1]

    def _top_k_accuracy(k):
        # Share of customers whose k highest-probability transactions contain
        # at least one truly-close transaction.
        return validation.groupby('customer_id').apply(
            lambda group: group.sort_values('probs').tail(k).is_close.max()
        ).mean()

    print(f'Top1: {_top_k_accuracy(1):.5f}')
    print(f'Top5: {_top_k_accuracy(5):.5f}')
    print(f'Top10: {_top_k_accuracy(10):.5f}')

    # BUG FIX: removed a leftover `import pdb; pdb.set_trace()` debugger
    # breakpoint that froze every run at this point.

    return model, pipeline, feature_columns