gbm = LGBMClassifier(
    objective='binary',
    num_leaves=24,
    max_depth=3,
    learning_rate=0.1,
    seed=2018,
    colsample_bytree=0.3,
    subsample=0.8,
    n_jobs=-1,
    n_estimators=2000
)
print('fitting...')
gbm.fit(ip_train, train.loc[train_index, 'is_trade'],
        eval_set=[(ip_test, train.loc[test_index, 'is_trade'])],
        early_stopping_rounds=10)

property_df = pd.DataFrame(columns=['instance_id', 'item_property_prob'])
property_df['instance_id'] = data['instance_id']
property_df['item_property_prob'] = gbm.predict_proba(data_ip)[:, 1]


def NatureLP(data, columns):
    pass


print('saving...')
property_df.to_csv(wd + out_put[0], index=False, sep=' ')
def lgb_model(apptype_train, app_desc, apptype_train_term_doc,
              app_desc_term_doc, **params):
    """
    LightGBM model.
    :param apptype_train: training dataframe with a 'label1' column
    :param app_desc: test dataframe
    :param apptype_train_term_doc: term-document matrix for the training set
    :param app_desc_term_doc: term-document matrix for the test set
    :param params: extra model parameters
    :return: out-of-fold predictions and test predictions
    """
    import numpy as np
    from lightgbm import LGBMClassifier
    from sklearn.model_selection import StratifiedKFold
    from sklearn import metrics

    # number of classes (122)
    num_class = apptype_train['label1'].max() + 1
    # class labels
    label = apptype_train['label1']
    n_splits = 5
    params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'nthread': -1,
        'silent': True,  # whether to suppress output; defaults to False
        'learning_rate': 0.01,
        'num_leaves': 1000,
        'max_depth': 7,  # parameter found in a second round of cross-validation
        'max_bin': 127,
        'subsample_for_bin': 1000,
        'subsample': 0.8,
        'subsample_freq': 1,
        'colsample_bytree': 0.8,
    }
    oof_lgb = np.zeros((apptype_train.shape[0], num_class))
    prediction_lgb = np.zeros((app_desc.shape[0], num_class))

    for i, (tr, va) in enumerate(
            StratifiedKFold(n_splits=n_splits, shuffle=True,
                            random_state=2019).split(apptype_train_term_doc, label)):
        print('fold:', i + 1, 'training')
        # train:
        bst = LGBMClassifier(**params).fit(X=apptype_train_term_doc[tr], y=label[tr])
        # predict on the validation fold:
        oof_lgb[va] += bst.predict_proba(apptype_train_term_doc[va],
                                         num_iteration=bst.best_iteration_)
        # predict on the test set (accumulated over folds):
        prediction_lgb += bst.predict_proba(app_desc_term_doc,
                                            num_iteration=bst.best_iteration_)

    print("model acc_score:",
          metrics.accuracy_score(label, np.argmax(oof_lgb, axis=1),
                                 normalize=True, sample_weight=None))
    return oof_lgb, prediction_lgb
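# The term-document matrices consumed by lgb_model are assumed to come from a
# text vectorizer. A minimal sketch, assuming the apps carry a text column
# named 'description' (the column name here is hypothetical):
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=200000)
apptype_train_term_doc = vectorizer.fit_transform(apptype_train['description'])
app_desc_term_doc = vectorizer.transform(app_desc['description'])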
def train():
    store = pd.HDFStore(filename)
    # run-length stop indices
    df_idx = pd.read_hdf(store, 'idx')
    idx = np.array(df_idx[0], dtype=np.int32)
    idx = np.insert(idx, 0, 0)
    begin_offset = idx[:-1]
    lengths = np.diff(idx)
    del df_idx
    del idx
    gc.collect()

    data = process_train_set(store, begin_offset, lengths)
    meta = pd.read_csv(DATA_DIR + "training_set_metadata.csv")
    y = np.array(meta.target, dtype=np.int32)
    # binarize the target: class 90 vs. everything else
    m = y == 90
    y[m] = 1
    y[~m] = 0

    num_splits = 8
    folds = KFold(n_splits=num_splits, shuffle=True, random_state=11)
    oof_preds = np.zeros(data.shape[0])
    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data)):
        trn_x, trn_y = data[trn_idx], y[trn_idx]
        val_x, val_y = data[val_idx], y[val_idx]
        print(n_fold)
        clf = LGBMClassifier(n_estimators=20000, learning_rate=0.01,
                             num_leaves=255, silent=-1, verbose=-1)
        clf.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric='auc', verbose=25, early_stopping_rounds=400)
        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
        print('Fold %2d AUC : %.3f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()
    print('Full AUC score %.3f' % roc_auc_score(y, oof_preds))

    clf = LGBMClassifier(n_estimators=200, learning_rate=0.01, max_depth=5,
                         num_leaves=31, silent=-1, verbose=-1)
    clf.fit(data, y, eval_set=[(data, y)], eval_metric='auc',
            verbose=25, early_stopping_rounds=400)
    return clf
        colsample_bytree=.8,
        subsample=.9,
        max_depth=7,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01,
        min_child_weight=2)
    clf.fit(trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric='auc', verbose=250, early_stopping_rounds=150)

    oof_preds[val_idx] = clf.predict_proba(
        val_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(
        test[features], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del clf, trn_x, trn_y, val_x, val_y
    gc.collect()

print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))
test['TARGET'] = sub_preds
test[['SK_ID_CURR', 'TARGET']].to_csv('first_submission.csv', index=False,
def fit(self, X: pd.DataFrame, y: np.array) -> tuple:
    # process categorical columns
    if self.cat_validation == "None":
        encoder = MultipleEncoder(
            cols=self.cat_cols, encoders_names_tuple=self.encoders_names
        )
        X = encoder.fit_transform(X, y)

    for n_fold, (train_idx, val_idx) in enumerate(
        self.model_validation.split(X, y)
    ):
        X_train, X_val = (
            X.loc[train_idx].reset_index(drop=True),
            X.loc[val_idx].reset_index(drop=True),
        )
        y_train, y_val = y[train_idx], y[val_idx]
        print("shapes before encoder:", X_train.shape, X_val.shape)

        if self.cat_validation == "Single":
            encoder = MultipleEncoder(
                cols=self.cat_cols, encoders_names_tuple=self.encoders_names
            )
            X_train = encoder.fit_transform(X_train, y_train)
            X_val = encoder.transform(X_val)
        if self.cat_validation == "Double":
            encoder = DoubleValidationEncoderNumerical(
                cols=self.cat_cols, encoders_names_tuple=self.encoders_names
            )
            X_train = encoder.fit_transform(X_train, y_train)
            X_val = encoder.transform(X_val)
        self.encoders_list.append(encoder)

        # cast OrdinalEncoder outputs to the categorical dtype LightGBM expects
        for col in [col for col in X_train.columns if "OrdinalEncoder" in col]:
            X_train[col] = X_train[col].astype("category")
            X_val[col] = X_val[col].astype("category")

        # fit model
        print("shapes before model:", X_train.shape, X_val.shape)
        model = LGBMClassifier(**self.model_params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            verbose=100,
            early_stopping_rounds=100,
        )
        self.models_trees.append(model.best_iteration_)
        self.models_list.append(model)

        y_hat = model.predict_proba(X_train)[:, 1]
        score_train = roc_auc_score(y_train, y_hat)
        self.scores_list_train.append(score_train)
        y_hat = model.predict_proba(X_val)[:, 1]
        score_val = roc_auc_score(y_val, y_hat)
        self.scores_list_val.append(score_val)
        print(f"AUC on {n_fold} fold train : {np.round(score_train, 4)}\n")
        print(f"AUC on {n_fold} fold val : {np.round(score_val, 4)}\n")

    mean_score_train = np.mean(self.scores_list_train)
    mean_score_val = np.mean(self.scores_list_val)
    avg_num_trees = int(np.mean(self.models_trees))
    print(f"\nMean score train : {np.round(mean_score_train, 4)}\n")
    print(f"\nMean score val : {np.round(mean_score_val, 4)}\n")
    return mean_score_train, mean_score_val, avg_num_trees
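# MultipleEncoder and DoubleValidationEncoderNumerical are not defined in this
# snippet. A minimal sketch of the interface fit() relies on, built on the
# category_encoders package; the column-suffix scheme (matching the
# "OrdinalEncoder" check above) is an assumption:
import pandas as pd
import category_encoders as ce


class MultipleEncoder:
    """Apply several encoders to the same columns; append suffixed columns."""

    def __init__(self, cols, encoders_names_tuple=("CatBoostEncoder",)):
        self.cols = cols
        self.encoders_names_tuple = encoders_names_tuple
        self.encoders = {}

    def fit_transform(self, X: pd.DataFrame, y) -> pd.DataFrame:
        for name in self.encoders_names_tuple:
            enc = getattr(ce, name)(cols=self.cols)
            encoded = enc.fit_transform(X[self.cols], y)
            encoded.columns = [f"{c}_{name}" for c in encoded.columns]
            X = pd.concat([X, encoded], axis=1)
            self.encoders[name] = enc
        return X

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        for name, enc in self.encoders.items():
            encoded = enc.transform(X[self.cols])
            encoded.columns = [f"{c}_{name}" for c in encoded.columns]
            X = pd.concat([X, encoded], axis=1)
        return X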
lgb_params = dict()
lgb_params['learning_rate'] = 0.01
lgb_params['n_estimators'] = 1000
# lgb_params['max_depth'] = 10
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8
lgb_params['min_child_samples'] = 500

lgb = LGBMClassifier(**lgb_params)
skf = StratifiedKFold(n_splits=3, shuffle=True)
predictions = np.zeros((test_pca.shape[0], 3))
# the fold counter must be initialized outside the loop; resetting it inside
# (as the original did) overwrites column 0 on every fold
i = 0
for train_index, test_index in skf.split(train_pca, train_target):
    lgb_train = train_pca[train_index]
    lgb_target = train_target[train_index]
    lgb.fit(lgb_train, lgb_target)
    y_pred = lgb.predict_proba(test_pca)[:, 1]
    predictions[:, i] = y_pred
    i += 1

# write the result to a csv
res = pd.DataFrame()
res['id'] = test_id
res['target'] = predictions.mean(axis=1)
res.to_csv('smooth_pred.csv', index=False)
def main():
    topicdata = pd.read_csv(r'/data/work/wk/tb/20170906/user_out.csv')
    tbjxldata = pd.read_csv(
        r'/data/work/wk/tb/data/tbjxl_r360dtl_data20170713_uft8.csv')
    tbjxldata = tbjxldata[(tbjxldata['target'] != 2)
                          & (tbjxldata['flg_sample'] == 1)]
    # merge the two sources on the user id
    topicdata.rename(columns={"ugid": "user_gid"}, inplace=True)
    rawdata = pd.merge(topicdata, tbjxldata, on='user_gid')
    exclude = [
        'cust_nm', 'register_mobile', 'flg_jxl', 'flg_tb', 'flg_sample',
        'user_gid', 'IDCardNO', 'decision_tm', 'usertype', 'ugid', 'weight',
        'phone', 'id_card_1', 'mobile_auth', 'first_decision_tm',
        'register_time', 'credit_history', 'cust_nm_sha', 'id_card_sha',
        'mobile_sha', 'cust_nm_1', 'target1', 'cust_perf', 'source'
    ]
    features = [f for f in rawdata.columns if f not in exclude]
    data = rawdata[features]
    # replace the various missing-value sentinels with NaN
    data = data.replace('@', np.nan)
    data = data.replace(-9999976, np.nan)
    data = data.replace(-99999976, np.nan)
    data = data.replace(-9999977, np.nan)
    data = data.replace(-9999978, np.nan)
    data = data.replace(-99999980, np.nan)
    data = data.replace(-99998.0, np.nan)
    print("data shape %s" % str(data.shape))

    # count missing data in each column
    invest = data.isnull().sum()
    for i in invest.index:
        if invest[i] > 0:
            print("feature %s has %s missing values" % (i, str(invest[i])))

    # feature engineering
    standard_feature_obj = standard_feature_tree(data, 'target')
    standard_feature_obj.categ_continue_auto()
    standard_feature_obj.miss_inf_trans()
    standard_feature_obj.categ_label_trans()
    standard_feature_obj.format_train_test()
    #standard_feature_obj.apply_standardscale_classification()
    X_train = standard_feature_obj.sample_x
    y_train = standard_feature_obj.sample_y

    # model ops
    bayesopsObj = bayes_ops(X=X_train, Y=y_train, estimator=LGBMClassifier)
    parms = {
        #'x_train': X_train,
        #'y_train': y_train,
        'num_leaves': (15, 500),
        'colsample_bytree': (0.1, 1),
        'drop_rate': (0.1, 1),
        'learning_rate': (0.001, 0.05),
        'max_bin': (10, 100),
        'max_depth': (2, 20),
        'min_split_gain': (0.2, 0.9),
        'min_child_samples': (10, 200),
        'n_estimators': (100, 3000),
        'reg_alpha': (0.1, 100),
        'reg_lambda': (0.1, 100),
        'sigmoid': (0.5, 1),
        'subsample': (0.1, 1),
        'subsample_for_bin': (10000, 50000),
        'subsample_freq': (1, 5)
    }
    # how each parameter should be cast; only the parameters actually present
    # in parms need to be listed
    intdeal = [
        'max_bin', 'max_depth', 'max_drop', 'min_child_samples',
        'min_child_weight', 'n_estimators', 'num_leaves', 'scale_pos_weight',
        'subsample_for_bin', 'subsample_freq'
    ]  # integer-valued parameters
    middledeal = [
        'colsample_bytree', 'drop_rate', 'learning_rate', 'min_split_gain',
        'skip_drop', 'subsample'
    ]  # floats restricted to (0, 1)
    maxdeal = ['reg_alpha', 'reg_lambda', 'sigmoid']  # floats that may exceed 1
    bayesopsObj.run(
        parms=parms,
        cv=10,
        intdeal=intdeal,
        middledeal=middledeal,
        maxdeal=maxdeal,
        score_func=make_scorer(score_func=accuracy_score,
                               greater_is_better=True),
    )
    parms = bayesopsObj.baseparms
    model = LGBMClassifier(**parms)
    print(model)
    model.fit(X_train, y_train)

    # training-set evaluation
    print('trainingset evaluation')
    y_pred = model.predict(X_train)
    # column 1 of predict_proba is the positive-class probability; the
    # original took column 0, which inverts ranking metrics such as KS
    y_pred_prob = model.predict_proba(X_train)[:, 1]
    acc = accuracy_score(y_train, y_pred, normalize=True)
    print('acc=%s' % str(acc))
    auc = roc_auc_score(y_true=y_train.values, y_score=y_pred_prob)
    print('auc=%s' % str(auc))
    #evl.ks_curve(Y_true=y_train.values, Y_predprob=y_pred_prob, fig_path='lgr_train.png')
    ksobj = ks_statistic(yprob=y_pred_prob, ytrue=y_train.values)
    ksobj.cal_ks()
    print('ks=%s' % str(ksobj.ks))

    # test-set evaluation
    print('testset evaluation')
    X_test = standard_feature_obj.test_x
    y_test = standard_feature_obj.test_y
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred, normalize=True)
    print('acc=%s' % str(acc))
    auc = roc_auc_score(y_true=y_test.values, y_score=y_pred_prob)
    print('auc=%s' % str(auc))
    #evl.ks_curve(Y_true=y_test.values, Y_predprob=y_pred_prob, fig_path='lgr_test.png')
    ksobj = ks_statistic(yprob=y_pred_prob, ytrue=y_test.values)
    ksobj.cal_ks()
    print('ks=%s' % str(ksobj.ks))
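# ks_statistic is not defined in this snippet. A minimal sketch of the
# interface used above: the two-sample Kolmogorov-Smirnov statistic between
# the score distributions of the two classes, assuming scipy is available.
import numpy as np
from scipy.stats import ks_2samp


class ks_statistic:
    def __init__(self, yprob, ytrue):
        self.yprob = np.asarray(yprob)
        self.ytrue = np.asarray(ytrue)
        self.ks = None

    def cal_ks(self):
        # KS = max distance between the score CDFs of positives and negatives
        pos_scores = self.yprob[self.ytrue == 1]
        neg_scores = self.yprob[self.ytrue == 0]
        self.ks = ks_2samp(pos_scores, neg_scores).statistic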
def predict(self, F, datainfo, timeinfo):
    '''
    This function should provide predictions of labels on (test) data.
    Make sure that the predicted values are in the correct format for the
    scoring metric. For example, binary classification problems often expect
    predictions in the form of a discriminant value (if the area under the
    ROC curve is the metric) rather than predictions of the class labels
    themselves. The predict function can eventually return probabilities or
    continuous values.
    '''
    overall_spenttime = time.time() - timeinfo[0]
    dataset_spenttime = time.time() - timeinfo[1]
    logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
    logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

    date_cols = datainfo['loaded_feat_types'][0]
    numeric_cols = datainfo['loaded_feat_types'][1]
    categorical_cols = datainfo['loaded_feat_types'][2]
    multicategorical_cols = datainfo['loaded_feat_types'][3]

    # Get numerical variables and replace NaNs with 0s
    X = np.nan_to_num(F['numerical'])

    # Frequency-encode categorical variables and concatenate them with
    # numerical variables
    if categorical_cols > 0:
        X_cat = self.cat_encs.transform(F['CAT']).values
        X = np.concatenate((X, X_cat), axis=1)
        del X_cat

    # Adversarial validation: train a classifier to separate train from test
    logging.info('AV: starting adversarial validation...')
    np.random.seed(SEED)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    n_trn = self.X.shape[0]
    n_tst = X.shape[0]
    X_all = np.vstack((self.X, X))
    y_all = np.concatenate((np.zeros(n_trn,), np.ones(n_tst,)))
    logging.info(f'AV: {X_all.shape}, {y_all.shape}')
    ps_all = np.zeros_like(y_all, dtype=float)
    for i, (i_trn, i_val) in enumerate(cv.split(X_all, y_all)):
        model_av = LGBMClassifier(**params)
        model_av.fit(X_all[i_trn], y_all[i_trn],
                     eval_set=(X_all[i_val], y_all[i_val]),
                     early_stopping_rounds=10, verbose=10)
        ps_all[i_val] = model_av.predict_proba(X_all[i_val])[:, 1]
    av_score = roc_auc_score(y_all, ps_all)
    logging.info(f'AV: AUC={av_score * 100: 3.2f}')
    # np.percentile expects percentiles on a 0-100 scale
    logging.info(
        f'AV: propensity scores deciles: {np.percentile(ps_all, np.linspace(0, 100, 11))}'
    )

    # Training: validate on the training rows that look most like the test set
    idx = np.argsort(ps_all[:n_trn])
    trn_idx = idx[:int(n_trn * .75)]
    val_idx = idx[int(n_trn * .75):]
    np.random.shuffle(trn_idx)
    X_trn = self.X[trn_idx]
    y_trn = self.y[trn_idx]
    X_val = self.X[val_idx]
    y_val = self.y[val_idx]
    self.clf.fit(X_trn, y_trn, eval_set=(X_val, y_val),
                 early_stopping_rounds=10, verbose=10)

    num_test_samples = X.shape[0]
    if X.ndim > 1:
        num_feat = X.shape[1]
    logging.info("PREDICT: dim(X)= [{:d}, {:d}]".format(num_test_samples, num_feat))
    if self.num_feat != num_feat:
        logging.info("ARRGH: number of features in X does not match training data!")
    logging.info("PREDICT: dim(y)= [{:d}, {:d}]".format(num_test_samples, self.num_labels))
    y = self.clf.predict_proba(X)[:, 1]
    y = np.transpose(y)
    return y
class Model:
    def __init__(self, datainfo, timeinfo):
        '''
        This constructor is supposed to initialize data members.
        Use triple quotes for function documentation.
        '''
        # Log some info from the datainfo variable
        logging.info("The Budget for this data set is: %d seconds" %
                     datainfo['time_budget'])
        logging.info(
            "Loaded %d time features, %d numerical features, %d categorical "
            "features and %d multi-valued categorical variables" %
            (datainfo['loaded_feat_types'][0], datainfo['loaded_feat_types'][1],
             datainfo['loaded_feat_types'][2], datainfo['loaded_feat_types'][3]))
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)
        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        # Here you may have parameters and hyper-parameters
        self.clf = LGBMClassifier(**params)

    def fit(self, F, y, datainfo, timeinfo):
        '''
        This function should train the model parameters.
        Args:
            X: Training data matrix of dim num_train_samples * num_feat.
            y: Training label matrix of dim num_train_samples * num_labels.
        Both inputs are numpy arrays.
        If fit is called multiple times on incremental data (train, test1,
        test2, etc.) you should warm-start your training from the pre-trained
        model. Past data will NOT be available for re-training.
        '''
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        self.X = np.nan_to_num(F['numerical'])
        self.y = y

        # Frequency-encode categorical variables and concatenate them with
        # numerical variables
        if categorical_cols > 0:
            self.cat_encs = FrequencyEncoder()
            X_cat = self.cat_encs.fit_transform(F['CAT']).values
            self.X = np.concatenate((self.X, X_cat), axis=1)
            del X_cat

        self.num_train_samples = self.X.shape[0]
        self.num_feat = self.X.shape[1]
        num_train_samples = y.shape[0]
        logging.info("The whole available data is: ")
        logging.info("Real-FIT: dim(X)= [{:d}, {:d}]".format(
            self.X.shape[0], self.X.shape[1]))
        logging.info("Real-FIT: dim(y)= [{:d}, {:d}]".format(
            self.y.shape[0], self.num_labels))
        self.is_trained = True

    def predict(self, F, datainfo, timeinfo):
        '''
        This function should provide predictions of labels on (test) data.
        Make sure that the predicted values are in the correct format for the
        scoring metric. For example, binary classification problems often
        expect predictions in the form of a discriminant value (if the area
        under the ROC curve is the metric) rather than predictions of the
        class labels themselves. The predict function can eventually return
        probabilities or continuous values.
        '''
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Frequency-encode categorical variables and concatenate them with
        # numerical variables
        if categorical_cols > 0:
            X_cat = self.cat_encs.transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        # Adversarial validation: train a classifier to separate train from test
        logging.info('AV: starting adversarial validation...')
        np.random.seed(SEED)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
        n_trn = self.X.shape[0]
        n_tst = X.shape[0]
        X_all = np.vstack((self.X, X))
        y_all = np.concatenate((np.zeros(n_trn,), np.ones(n_tst,)))
        logging.info(f'AV: {X_all.shape}, {y_all.shape}')
        ps_all = np.zeros_like(y_all, dtype=float)
        for i, (i_trn, i_val) in enumerate(cv.split(X_all, y_all)):
            model_av = LGBMClassifier(**params)
            model_av.fit(X_all[i_trn], y_all[i_trn],
                         eval_set=(X_all[i_val], y_all[i_val]),
                         early_stopping_rounds=10, verbose=10)
            ps_all[i_val] = model_av.predict_proba(X_all[i_val])[:, 1]
        av_score = roc_auc_score(y_all, ps_all)
        logging.info(f'AV: AUC={av_score * 100: 3.2f}')
        # np.percentile expects percentiles on a 0-100 scale
        logging.info(
            f'AV: propensity scores deciles: {np.percentile(ps_all, np.linspace(0, 100, 11))}'
        )

        # Training: validate on the training rows that look most like the test set
        idx = np.argsort(ps_all[:n_trn])
        trn_idx = idx[:int(n_trn * .75)]
        val_idx = idx[int(n_trn * .75):]
        np.random.shuffle(trn_idx)
        X_trn = self.X[trn_idx]
        y_trn = self.y[trn_idx]
        X_val = self.X[val_idx]
        y_val = self.y[val_idx]
        self.clf.fit(X_trn, y_trn, eval_set=(X_val, y_val),
                     early_stopping_rounds=10, verbose=10)

        num_test_samples = X.shape[0]
        if X.ndim > 1:
            num_feat = X.shape[1]
        logging.info("PREDICT: dim(X)= [{:d}, {:d}]".format(num_test_samples, num_feat))
        if self.num_feat != num_feat:
            logging.info("ARRGH: number of features in X does not match training data!")
        logging.info("PREDICT: dim(y)= [{:d}, {:d}]".format(num_test_samples, self.num_labels))
        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y

    def save(self, path="./"):
        # pickle requires binary mode
        pickle.dump(self, open(path + '_model.pickle', "wb"))

    def load(self, path="./"):
        modelfile = path + '_model.pickle'
        if isfile(modelfile):
            with open(modelfile, "rb") as f:
                self = pickle.load(f)
            logging.info("Model reloaded from: " + modelfile)
        return self
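# FrequencyEncoder is not defined in these snippets. A minimal sketch of the
# interface used above (fit_transform/transform returning a DataFrame whose
# .values are taken), mapping each category to its training-set frequency:
import pandas as pd


class FrequencyEncoder:
    def __init__(self):
        self.freqs = {}

    def fit_transform(self, df) -> pd.DataFrame:
        df = pd.DataFrame(df)
        for col in df.columns:
            self.freqs[col] = df[col].value_counts(normalize=True)
        return self.transform(df)

    def transform(self, df) -> pd.DataFrame:
        df = pd.DataFrame(df)
        out = pd.DataFrame(index=df.index)
        for col in df.columns:
            # unseen categories get frequency 0
            out[col] = df[col].map(self.freqs[col]).fillna(0.0)
        return out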
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    """
    LightGBM GBDT with KFold or Stratified KFold.
    Parameters from Tilii kernel:
    https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
    Separates train and test sets, trains the model with tuned hyperparameters
    (found by Bayesian optimization), and creates a feature importance dataframe.
    Returns a dataframe that shows the highest 40 feature importances.

    :param df: dataframe
        dataframe to be trained
    :param num_folds: int
        number of splits for cross-validation
    :param stratified: bool
        whether cross-validation is stratified
    :param debug: bool
        whether the model runs in debug mode
    :return: dataframe
    """
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns
             if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1,
        )
        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=200, early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    display_importances(feature_importance_df)
    return feature_importance_df
def lgbm_modeling_cross_validation(params, full_train, y, classes=CLASSES,
                                   class_weights=CLASS_WEIGHTS, nr_fold=5,
                                   random_state=1, sweights=BEST_SWEIGHTS,
                                   smote=False, standard_scaler=False):
    full_train = full_train.drop(ILLEGAL_FNAMES, axis=1, errors='ignore')
    # assert 'distmod' in full_train.columns
    if sweights is None:
        # Compute weights inversely proportional to class frequency
        w = y.value_counts()
        sweights = {i: np.sum(w) / w[i] for i in w.index}
    elif smote and sweights == BEST_SWEIGHTS:
        print('WARNING: got BEST_SWEIGHTS and smote=True')

    clfs = []
    importances = pd.DataFrame()
    folds = StratifiedKFold(n_splits=nr_fold, shuffle=True, random_state=random_state)
    oof_preds = np.zeros((len(full_train), np.unique(y).shape[0]))

    if standard_scaler:
        scl = StandardScaler()
        full_train = pd.DataFrame(scl.fit_transform(full_train.fillna(0)),
                                  index=full_train.index,
                                  columns=full_train.columns)

    for fold_, (trn_, val_) in tqdm_notebook(enumerate(folds.split(y, y)), total=nr_fold):
        trn_x, trn_y = full_train.iloc[trn_], y.iloc[trn_]
        val_x, val_y = full_train.iloc[val_], y.iloc[val_]
        if smote:
            trn_xa, trn_y, val_xa, val_y = smoteAdataset(
                trn_x.values, trn_y.values, val_x.values, val_y.values)
            trn_x = pd.DataFrame(data=trn_xa, columns=trn_x.columns)
            val_x = pd.DataFrame(data=val_xa, columns=val_x.columns)

        clf = LGBMClassifier(**params)
        loss_fn = lambda y, ypred: lgbm_multi_weighted_logloss(
            y, ypred, classes=classes, class_weights=class_weights)
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric=loss_fn,
                verbose=-1,
                early_stopping_rounds=50,
                sample_weight=trn_y.map(sweights))
        clfs.append(clf)

        oof_preds[val_, :] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
        imp_df = pd.DataFrame({
            'feature': full_train.columns,
            'gain': clf.feature_importances_,
            'fold': [fold_ + 1] * len(full_train.columns),
        })
        importances = pd.concat([importances, imp_df], axis=0, sort=False)

    score = multi_weighted_logloss(y_true=y, y_preds=oof_preds,
                                   classes=classes, class_weights=class_weights)
    print(f'OOF:{score:.4f} n_folds={nr_fold}, nfeat={full_train.shape[1]}')
    normal_weight_score = multi_weighted_logloss(y, oof_preds)
    if class_weights != CLASS_WEIGHTS:
        print(f'OOF Default weights:{normal_weight_score:.4f} n_folds={nr_fold}, '
              f'nfeat={full_train.shape[1]}')
    df_importances = agg_importances(importances)
    oof_df = make_oof_pred_df(oof_preds, columns=clf.classes_)
    return clfs, score, df_importances, oof_df, normal_weight_score
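# multi_weighted_logloss is not defined in these snippets. A plausible sketch
# of the signature used above (per-class mean log loss combined with the
# supplied class weights); CLASSES and CLASS_WEIGHTS are the same module-level
# globals the function above already assumes:
import numpy as np


def multi_weighted_logloss(y_true, y_preds, classes=CLASSES, class_weights=CLASS_WEIGHTS):
    y_true = np.asarray(y_true)
    y_p = np.clip(np.asarray(y_preds), 1e-15, 1 - 1e-15)
    loss, wsum = 0.0, 0.0
    for i, c in enumerate(classes):
        mask = y_true == c
        if not mask.any():
            continue
        # mean negative log-probability assigned to the true class c
        loss += class_weights[c] * (-np.log(y_p[mask, i]).mean())
        wsum += class_weights[c]
    return loss / wsum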
class ensemble:
    def __init__(self, df):
        '''
        Initialize with dataframe to train models on
        Ex. e = ensemble(df)
        '''
        self.df = df
        self.X = df.drop(columns='is_scammer')
        self.y = df['is_scammer']
        # baseline vectorizer parameters
        tfidf = TfidfVectorizer(
            stop_words='english',
            min_df=3,                      # min count for relevant vocabulary
            max_features=5000,             # maximum number of features
            strip_accents='unicode',       # replace accented unicode chars with ASCII equivalents
            analyzer='word',               # features made of words
            token_pattern=r'[a-zA-Z]{3,}', # tokenize only words of 3+ chars
            ngram_range=(1, 1),            # features made of single tokens
            use_idf=True,                  # enable inverse-document-frequency reweighting
            smooth_idf=True,               # prevents zero division for unseen words
            sublinear_tf=False)
        # instantiate classifiers for ensemble
        self.rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True)
        self.xgb = XGBClassifier()
        self.lgb = LGBMClassifier()
        # creating ensemble classifier
        self.eclf = VotingClassifier(estimators=[('xgb', self.xgb),
                                                 ('lgb', self.lgb),
                                                 ('rf', self.rf)],
                                     voting='soft')
        # create pipeline for vectorizing user's comments for Naive Bayes
        self.model = make_pipeline(tfidf, MultinomialNB())
        # numerical columns to use for rf/gb models
        self.num_cols = ['link_karma', 'comment_karma', 'verified', 'mod',
                         'gold', 'days_old', 'total_comments', 'positive',
                         'neutral', 'negative', 'mean_comment_length',
                         'mode_comment_length', 'median_comment_length',
                         'duplicate_comments', 'avg_grammar', 'total_grammar',
                         'cap_freq_mean']

    def split(self, random_state=None):
        '''
        Split imported dataframe into a train and test set.
        Use with train_fit and test_predict to tune parameters
        '''
        # forward the random_state argument (the original hard-coded None)
        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(self.X, self.y, random_state=random_state)
        self.X_train_MNB = self.X_train['comments_new']
        self.X_train = self.X_train[self.num_cols]
        self.X_test_MNB = self.X_test['comments_new']
        self.X_test = self.X_test[self.num_cols]

    def train_fit(self):
        '''Fits on training data'''
        self.eclf.fit(self.X_train, self.y_train)
        self.model.fit(self.X_train_MNB, self.y_train)

    def fit(self):
        '''Fits on full dataset for predicting unlabeled data'''
        self.eclf.fit(self.X[self.num_cols], self.y)
        self.model.fit(self.X['comments_new'], self.y)

    def test_predict(self):
        '''Returns test data prediction probability'''
        y_pred = self.eclf.predict_proba(self.X_test)[:, 1]
        y_pred_MNB = self.model.predict_proba(self.X_test_MNB)[:, 1]
        # average the two probabilities; the original (y_pred+y_pred_MNB/2)
        # divided only the second term
        y_final_pred = (y_pred + y_pred_MNB) / 2
        return y_final_pred

    def predict(self, username):
        '''
        Input Reddit username
        Returns prediction probability for new data
        '''
        X = get_user_profile(str(username))
        X_MNB = X['comments_new']
        X = X[self.num_cols]
        y_pred = self.eclf.predict_proba(X)[:, 1]
        y_pred_MNB = self.model.predict_proba(X_MNB)[:, 1]
        y_final_pred = (y_pred + y_pred_MNB) / 2
        return y_final_pred

    def rf_predict(self, X):
        '''Fit and only return prediction probability for Random Forest classifier'''
        self.rf.fit(self.X_train, self.y_train)
        # X = X[self.num_cols]
        return self.rf.predict_proba(X)[:, 1]

    def xgb_predict(self, X):
        '''Fit and only return prediction probability for XGBoost classifier'''
        self.xgb.fit(self.X_train, self.y_train)
        # X = X[self.num_cols]
        return self.xgb.predict_proba(X)[:, 1]

    def lgb_predict(self, X):
        '''Fit and only return prediction probability for LightGBM classifier'''
        self.lgb.fit(self.X_train, self.y_train)
        # X = X[self.num_cols]
        return self.lgb.predict_proba(X)[:, 1]

    def MNB_predict(self, X):
        '''Fit and only return prediction probability for Multinomial Naive Bayes classifier'''
        self.model.fit(self.X_train_MNB, self.y_train)
        # X = X['comments_new']
        return self.model.predict_proba(X)[:, 1]

    def score(self):
        '''Returns area under the ROC curve for the ensemble method'''
        print(f"ROC AUC score: {roc_auc_score(self.y_test, self.test_predict())}")
def modeling(all_data):
    # LightGBM rejects special characters in feature names, so strip them
    all_data = all_data.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
    train_df = all_data[all_data['TARGET'].notnull()]
    test_df = all_data[all_data['TARGET'].isnull()]
    folds = KFold(n_splits=10, shuffle=True, random_state=1001)
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR']]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
        clf = LGBMClassifier(
            n_jobs=-1,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1,
        )
        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=200, early_stopping_rounds=200)

        # y_pred_valid
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # y_pred_valid
    test_df['TARGET'] = sub_preds
    test_df[['SK_ID_CURR', 'TARGET']].to_csv("dsmlbc1_submission.csv", index=False)
    display_importances(feature_importance_df)
    return feature_importance_df
class LightGBM(AutoSklearnClassificationAlgorithm):
    def __init__(self, n_estimators, learning_rate, num_leaves, max_depth,
                 min_child_samples, subsample, colsample_bytree,
                 random_state=None):
        self.n_estimators = int(n_estimators)
        self.learning_rate = learning_rate
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.subsample = subsample
        self.min_child_samples = min_child_samples
        self.colsample_bytree = colsample_bytree
        self.n_jobs = 1
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, y):
        self.estimator = LGBMClassifier(
            num_leaves=self.num_leaves,
            max_depth=self.max_depth,
            learning_rate=self.learning_rate,
            n_estimators=self.n_estimators,
            min_child_samples=self.min_child_samples,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            n_jobs=self.n_jobs)
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'LightGBM Classifier',
            'name': 'LightGBM Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'is_deterministic': False,
            'input': (SPARSE, DENSE, UNSIGNED_DATA),
            'output': (PREDICTIONS,)
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None, optimizer='smac'):
        cs = ConfigurationSpace()
        n_estimators = UniformFloatHyperparameter(
            "n_estimators", 100, 1000, default_value=500, q=50)
        num_leaves = UniformIntegerHyperparameter(
            "num_leaves", 31, 2047, default_value=128)
        max_depth = Constant('max_depth', 15)
        learning_rate = UniformFloatHyperparameter(
            "learning_rate", 1e-3, 0.3, default_value=0.1, log=True)
        min_child_samples = UniformIntegerHyperparameter(
            "min_child_samples", 5, 30, default_value=20)
        subsample = UniformFloatHyperparameter(
            "subsample", 0.7, 1, default_value=1, q=0.1)
        colsample_bytree = UniformFloatHyperparameter(
            "colsample_bytree", 0.7, 1, default_value=1, q=0.1)
        cs.add_hyperparameters([
            n_estimators, num_leaves, max_depth, learning_rate,
            min_child_samples, subsample, colsample_bytree
        ])
        return cs
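# A hedged usage sketch: auto-sklearn components like the one above are
# typically registered before constructing the estimator. Details vary by
# auto-sklearn version (older releases spell the include argument
# include_estimators=['LightGBM']):
import autosklearn.pipeline.components.classification
from autosklearn.classification import AutoSklearnClassifier

autosklearn.pipeline.components.classification.add_classifier(LightGBM)
automl = AutoSklearnClassifier(
    time_left_for_this_task=300,
    include={'classifier': ['LightGBM']},  # restrict the search to this component
)
# automl.fit(X, y)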
def cv_scores(df, num_folds, params, stratified=False, verbose=-1,
              save_train_prediction=False,
              train_prediction_file_name='train_prediction.csv',
              save_test_prediction=True,
              test_prediction_file_name='test_prediction.csv'):
    warnings.simplefilter('ignore')
    clf = LGBMClassifier(**params)

    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)

    # Create arrays and dataframes to store results;
    # "train" arrays hold in-fold predictions, "test" arrays hold
    # out-of-fold predictions on the validation folds
    train_pred = np.zeros(train_df.shape[0])
    train_pred_proba = np.zeros(train_df.shape[0])
    test_pred = np.zeros(train_df.shape[0])
    test_pred_proba = np.zeros(train_df.shape[0])
    prediction = np.zeros(test_df.shape[0])

    feats = [f for f in train_df.columns
             if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]
    df_feature_importance = pd.DataFrame(index=feats)

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        print('Fold', n_fold, 'started at', time.ctime())
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=verbose, early_stopping_rounds=200)

        train_pred[train_idx] = clf.predict(train_x, num_iteration=clf.best_iteration_)
        train_pred_proba[train_idx] = clf.predict_proba(train_x, num_iteration=clf.best_iteration_)[:, 1]
        test_pred[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration_)
        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

        prediction += \
            clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        df_feature_importance[n_fold] = pd.Series(clf.feature_importances_, index=feats)

        print('Fold %2d AUC : %.6f' % (n_fold, roc_auc_score(valid_y, test_pred_proba[valid_idx])))
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    roc_auc_train = roc_auc_score(train_df['TARGET'], train_pred_proba)
    precision_train = precision_score(train_df['TARGET'], train_pred, average=None)
    recall_train = recall_score(train_df['TARGET'], train_pred, average=None)

    roc_auc_test = roc_auc_score(train_df['TARGET'], test_pred_proba)
    precision_test = precision_score(train_df['TARGET'], test_pred, average=None)
    recall_test = recall_score(train_df['TARGET'], test_pred, average=None)

    print('Full AUC score %.6f' % roc_auc_test)

    df_feature_importance.fillna(0, inplace=True)
    df_feature_importance['mean'] = df_feature_importance.mean(axis=1)

    # Write prediction files
    if save_train_prediction:
        df_prediction = train_df[['SK_ID_CURR', 'TARGET']]
        df_prediction['Prediction'] = test_pred_proba
        df_prediction.to_csv(train_prediction_file_name, index=False)
        del df_prediction
        gc.collect()

    if save_test_prediction:
        df_prediction = test_df[['SK_ID_CURR']]
        df_prediction['TARGET'] = prediction
        df_prediction.to_csv(test_prediction_file_name, index=False)
        del df_prediction
        gc.collect()

    return df_feature_importance, \
        [roc_auc_train, roc_auc_test,
         precision_train[0], precision_test[0],
         precision_train[1], precision_test[1],
         recall_train[0], recall_test[0],
         recall_train[1], recall_test[1], 0]
from lightgbm import LGBMClassifier, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, chi2

train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)

x = train.drop(columns='class', axis=1)  # new frame with the 'class' column removed
y = train['class']  # target labels ('class')
TEST = test

# hold out 20% of the data as a test split
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)

evals = [(test_x, test_y)]
lgbm = LGBMClassifier(n_estimators=1000, learning_rate=0.03, max_depth=12,
                      num_leaves=4000, random_state=42, boosting_type="goss")
lgbm.fit(train_x, train_y, early_stopping_rounds=20, eval_set=evals)

print("acc: {}".format(lgbm.score(train_x, train_y)))  # accuracy on the training split
print("acc: {}".format(lgbm.score(test_x, test_y)))    # accuracy on the held-out split

y_pred = np.argmax(lgbm.predict_proba(TEST), axis=1)  # index of the most probable class
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns,
                          index=sample_submission.index)
submission.to_csv('submission5.csv', index=True)
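# Note: np.argmax over predict_proba yields positional column indices, which
# match the submitted labels only when the classes are 0..n-1. A safer mapping
# goes through the fitted classifier's classes_ attribute:
y_pred = lgbm.classes_[np.argmax(lgbm.predict_proba(TEST), axis=1)]
# equivalently, lgbm.predict(TEST) returns the labels directly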
def cv_lgbm_scores(df_, num_folds, params, target_name='TARGET',
                   index_name='SK_ID_CURR', stratified=False, rs=1001,
                   verbose=-1):
    warnings.simplefilter('ignore')

    # Cleaning and defining parameters for LGBM
    params = int_lgbm_params(params)
    clf = LGBMClassifier(**params, n_estimators=20000, nthread=4, n_jobs=-1)

    # Divide in training/validation and test data
    df_train_ = df_[df_[target_name].notnull()]
    df_test_ = df_[df_[target_name].isnull()]
    print("Starting LightGBM cross-validation at {}".format(time.ctime()))
    print("Train shape: {}, test shape: {}".format(df_train_.shape, df_test_.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=rs)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=rs)

    # Create arrays to store results
    train_pred = np.zeros(df_train_.shape[0])
    train_pred_proba = np.zeros(df_train_.shape[0])
    test_pred = np.zeros(df_train_.shape[0])
    test_pred_proba = np.zeros(df_train_.shape[0])
    prediction = np.zeros(df_test_.shape[0])  # prediction for test set

    feats = df_train_.columns.drop([target_name, index_name])
    df_feat_imp_ = pd.DataFrame(index=feats)

    # Cross-validation cycle
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(df_train_[feats], df_train_[target_name])):
        print('--- Fold {} started at {}'.format(n_fold, time.ctime()))
        train_x, train_y = df_train_[feats].iloc[train_idx], df_train_[target_name].iloc[train_idx]
        valid_x, valid_y = df_train_[feats].iloc[valid_idx], df_train_[target_name].iloc[valid_idx]

        clf.fit(train_x, train_y,
                eval_set=[(valid_x, valid_y)],
                eval_metric='auc', verbose=verbose, early_stopping_rounds=100)

        train_pred[train_idx] = clf.predict(train_x, num_iteration=clf.best_iteration_)
        train_pred_proba[train_idx] = clf.predict_proba(train_x, num_iteration=clf.best_iteration_)[:, 1]
        test_pred[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration_)
        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

        prediction += clf.predict_proba(df_test_[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        df_feat_imp_[n_fold] = pd.Series(clf.feature_importances_, index=feats)

        del train_x, train_y, valid_x, valid_y
        gc.collect()

    # Computation of metrics
    roc_auc_train = roc_auc_score(df_train_[target_name], train_pred_proba)
    precision_train = precision_score(df_train_[target_name], train_pred, average=None)
    recall_train = recall_score(df_train_[target_name], train_pred, average=None)

    roc_auc_test = roc_auc_score(df_train_[target_name], test_pred_proba)
    precision_test = precision_score(df_train_[target_name], test_pred, average=None)
    recall_test = recall_score(df_train_[target_name], test_pred, average=None)

    print('Full AUC score {:.6f}'.format(roc_auc_test))

    # Filling the feature importance table
    df_feat_imp_.fillna(0, inplace=True)
    df_feat_imp_['mean'] = df_feat_imp_.mean(axis=1)

    # Preparing results of prediction for saving
    prediction_train = df_train_[[index_name]]
    prediction_train[target_name] = test_pred_proba
    prediction_test = df_test_[[index_name]]
    prediction_test[target_name] = prediction

    del df_train_, df_test_
    gc.collect()

    # Returning the results and metrics in the format for the scores table
    return df_feat_imp_, prediction_train, prediction_test, \
        [roc_auc_train, roc_auc_test,
         precision_train[0], precision_test[0],
         precision_train[1], precision_test[1],
         recall_train[0], recall_test[0],
         recall_train[1], recall_test[1], 0]
    num_leaves=85, max_depth=15, learning_rate=0.003, n_estimators=3677,
    subsample_for_bin=400000, objective="binary", min_split_gain=0.0,
    min_child_weight=0.01, min_child_samples=50, subsample=0.8,
    subsample_freq=1, colsample_bytree=0.7, reg_alpha=5.0, reg_lambda=0.0,
    silent=True)

kf = KFold(n_splits=5)
for n_fold, (train_index, test_index) in enumerate(kf.split(train_X)):
    print(n_fold)
    X_train, X_test = train_X.iloc[train_index], train_X.iloc[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    model_1.fit(X_train, y_train)
    #prediction = model_1.predict_proba(X_test)
    #train_score.append(prediction[:,1])
    oof_train[test_index] = model_1.predict_proba(X_test)[:, 1]
    oof_test_skf[n_fold, :] = model_1.predict_proba(test_X)[:, 1]

oof_test[:] = oof_test_skf.mean(axis=0)
te['buy'] = oof_test
tr['buy'] = oof_train
def tr_managerskill(train, test, y, folds, cache_file):
    print("\n\n############# Manager skill step ################")
    cache_key_train = 'managerskill_train'
    cache_key_test = 'managerskill_test'

    # Check if cache file exists and if data for this step is cached
    dict_train, dict_test = load_from_cache(cache_file, cache_key_train, cache_key_test)
    if dict_train is not None and dict_test is not None:
        train_out = train.assign(**dict_train)
        test_out = test.assign(**dict_test)
        return train_out, test_out, y, folds, cache_file

    print('# No cache detected, computing from scratch #')
    lb = LabelBinarizer(sparse_output=True)
    lb.fit(list(train['manager_id'].values) + list(test['manager_id'].values))
    X_train_mngr = lb.transform(train['manager_id']).astype(np.float32)
    X_test_mngr = lb.transform(test['manager_id']).astype(np.float32)

    le = LabelEncoder()
    y_encode = le.fit_transform(y)

    # Separate train into train + validation data
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_mngr, y_encode, test_size=0.2, random_state=42)

    # train (colsample_bytree and subsample must be floats; the original
    # passed them as strings)
    gbm = LGBMClassifier(n_estimators=2048, seed=42, objective='multiclass',
                         colsample_bytree=0.8, subsample=0.8)

    # Predict out-of-folds train data
    print('Start training - Number of folds: ', len(folds))
    train_predictions = out_of_fold_predict(gbm, X_train_mngr, y_encode, folds)
    mngr_train_names = {
        'mngr_' + le.classes_[0]: [row[0] for row in train_predictions],
        'mngr_' + le.classes_[1]: [row[1] for row in train_predictions],
        'mngr_' + le.classes_[2]: [row[2] for row in train_predictions],
    }
    mngr_train_names['mngr_skill'] = [
        2 * h + m for (h, m) in zip(mngr_train_names['mngr_high'],
                                    mngr_train_names['mngr_medium'])
    ]

    gbm.fit(X_train, y_train, eval_set=[(X_val, y_val)],
            eval_metric='multi_logloss', early_stopping_rounds=50, verbose=False)

    # Now validate the predicted value using the previously split validation set
    print('Start validating Manager skill...')
    # predict (best_iteration_ is the sklearn-wrapper attribute)
    y_pred = gbm.predict_proba(X_val, num_iteration=gbm.best_iteration_)
    # eval
    print('We stopped at boosting round: ', gbm.best_iteration_)
    print('The mlogloss of prediction is:', mlogloss(y_val, y_pred))

    # Now compute the value for the actual test data using out-of-folds predictions
    print('Start predicting Manager skill...')
    test_predictions = gbm.predict_proba(X_test_mngr, num_iteration=gbm.best_iteration_)
    mngr_test_names = {
        'mngr_' + le.classes_[0]: [row[0] for row in test_predictions],
        'mngr_' + le.classes_[1]: [row[1] for row in test_predictions],
        'mngr_' + le.classes_[2]: [row[2] for row in test_predictions]
    }
    mngr_test_names['mngr_skill'] = [
        2 * h + m for (h, m) in zip(mngr_test_names['mngr_high'],
                                    mngr_test_names['mngr_medium'])
    ]

    print('Caching features in ' + cache_file)
    save_to_cache(cache_file, cache_key_train, cache_key_test,
                  mngr_train_names, mngr_test_names)

    print('Adding features to dataframe')
    train_out = train.assign(**mngr_train_names)
    test_out = test.assign(**mngr_test_names)
    return train_out, test_out, y, folds, cache_file
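# out_of_fold_predict is not defined in this snippet. A minimal sketch of the
# assumed interface: out-of-fold class probabilities computed over the
# provided (train_idx, valid_idx) pairs.
import numpy as np
from sklearn.base import clone


def out_of_fold_predict(estimator, X, y, folds):
    """folds is assumed to be an iterable of (train_idx, valid_idx) pairs."""
    preds = np.zeros((X.shape[0], len(np.unique(y))))
    for trn_idx, val_idx in folds:
        est = clone(estimator)          # fresh copy per fold
        est.fit(X[trn_idx], y[trn_idx])
        preds[val_idx] = est.predict_proba(X[val_idx])
    return preds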
def lgbm_modeling_cross_validation(params, full_train, y, classes,
                                   class_weights, nr_fold=10, random_state=7):
    # map labels to contiguous integer ids
    unique_y = np.unique(y)
    class_map = dict()
    for i, val in enumerate(unique_y):
        class_map[val] = i
    # y = np.array([class_map[val] for val in y])
    y = y.apply(lambda x: class_map[x])

    # Compute weights inversely proportional to class frequency
    w = y.value_counts()
    weights = {i: np.sum(w) / w[i] for i in w.index}

    clfs = []
    importances = pd.DataFrame()
    folds = StratifiedKFold(n_splits=nr_fold, shuffle=True, random_state=random_state)
    oof_preds = np.zeros((len(full_train), np.unique(y).shape[0]))

    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = full_train.iloc[trn_], y.iloc[trn_]
        val_x, val_y = full_train.iloc[val_], y.iloc[val_]
        trn_xa, trn_y, val_xa, val_y = smoteAdataset(trn_x.values, trn_y.values,
                                                     val_x.values, val_y.values)
        trn_x = pd.DataFrame(data=trn_xa, columns=trn_x.columns)
        val_x = pd.DataFrame(data=val_xa, columns=val_x.columns)

        clf = LGBMClassifier(**params)
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric=lgbm_multi_weighted_logloss,
                verbose=100,
                early_stopping_rounds=50,
                sample_weight=trn_y.map(weights))
        clf.my_name = "lgbm"
        clfs.append(clf)

        oof_preds[val_, :] = clf.predict_proba(val_x)  # , num_iteration=clf.best_iteration_
        print('no {}-fold loss: {}'.format(
            fold_ + 1,
            multi_weighted_logloss(val_y, oof_preds[val_, :], classes, class_weights)))

        imp_df = pd.DataFrame({
            'feature': full_train.columns,
            'gain': clf.feature_importances_,
            'fold': [fold_ + 1] * len(full_train.columns),
        })
        importances = pd.concat([importances, imp_df], axis=0, sort=False)

    score = multi_weighted_logloss(y_true=y, y_preds=oof_preds,
                                   classes=classes, class_weights=class_weights)
    print('MULTI WEIGHTED LOG LOSS: {:.5f}'.format(score))
    df_importances = save_importances(importances_=importances)
    df_importances.to_csv('lgbm_importances.csv', index=False)

    cnf = confusion_matrix(y, np.argmax(oof_preds, axis=1))
    plot_confusion_matrix(cnf, classes=classes, normalize=True, filename="lgbm")
    return clfs, score, oof_preds
def q1(x):
    return x.quantile(0.25)


def q2(x):
    return x.quantile(0.75)


grouped = train[features].groupby('id')
X_train = grouped.agg(['max', 'min', 'mean', q1, q2])
X_test = test[features].groupby('id').agg(['max', 'min', 'mean', q1, q2])
y_train = train_label['label']

from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

lgbm_wrapper = LGBMClassifier(n_estimators=400)
lgbm_wrapper.fit(X_train.values, y_train)

preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]
y_pred = lgbm_wrapper.predict_proba(X_test)

submission.iloc[:, 1:] = y_pred
submission.to_csv('lightgbm_q1q2.csv', index=False)
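# The groupby-agg above produces a two-level column index (feature, statistic);
# that is why X_train.values is passed to fit. If the dataframes were passed
# directly, flattening the columns first would keep feature names readable,
# e.g. with a small sketch like this:
X_train.columns = ['_'.join(col) for col in X_train.columns]
X_test.columns = ['_'.join(col) for col in X_test.columns]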
rfc_predict = rfc.predict(X_test_std)
rfc_predict_proba = rfc.predict_proba(X_test_std)[:, 1]
get_scores(y_test, rfc_predict, rfc_predict_proba)
print('')

# GBDT
print('GBDT:')
gdbt = GradientBoostingClassifier(random_state=2018)
gdbt.fit(X_train_std, y_train)
gdbt_predict = gdbt.predict(X_test_std)
gdbt_predict_proba = gdbt.predict_proba(X_test_std)[:, 1]
get_scores(y_test, gdbt_predict, gdbt_predict_proba)
print('')

# XGBoost
print('XGBoost:')
xgbs = XGBClassifier(random_state=2018)
xgbs.fit(X_train_std, y_train)
xgbs_predict = xgbs.predict(X_test_std)
xgbs_predict_proba = xgbs.predict_proba(X_test_std)[:, 1]
get_scores(y_test, xgbs_predict, xgbs_predict_proba)
print('')

# LightGBM
print('LightGBM:')
lgbm = LGBMClassifier(random_state=2018)
lgbm.fit(X_train_std, y_train)
lgbm_predict = lgbm.predict(X_test_std)
lgbm_predict_proba = lgbm.predict_proba(X_test_std)[:, 1]
# the original passed lr_predict_pro here, scoring LightGBM with another
# model's probabilities
get_scores(y_test, lgbm_predict, lgbm_predict_proba)
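# get_scores is not defined in this snippet. A minimal sketch of the assumed
# helper, printing the usual binary-classification metrics in the same
# (y_true, y_pred, y_pred_proba) order used above:
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)


def get_scores(y_true, y_pred, y_pred_proba):
    print('accuracy : %.4f' % accuracy_score(y_true, y_pred))
    print('precision: %.4f' % precision_score(y_true, y_pred))
    print('recall   : %.4f' % recall_score(y_true, y_pred))
    print('f1       : %.4f' % f1_score(y_true, y_pred))
    print('auc      : %.4f' % roc_auc_score(y_true, y_pred_proba))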
Y = Y.reshape(len(Y))

import xgboost as xgb
from lightgbm import LGBMClassifier

model = LGBMClassifier(random_state=1, n_estimators=40, reg_lambda=1, reg_alpha=1)
model.fit(X, Y)

test_data = pd.read_csv("data/test.csv", index_col=0)
test_data = test_data.fillna(0)
features = test_data
# express prices relative to the last traded price
features["mid"] = (features["mid"] - features["last_price"]) / features["last_price"]
features["bid1"] = (features["bid1"] - features["last_price"]) / features["last_price"]
features["ask1"] = (features["ask1"] - features["last_price"]) / features["last_price"]
features["bid2"] = (features["bid2"] - features["last_price"]) / features["last_price"]
features["ask2"] = (features["ask2"] - features["last_price"]) / features["last_price"]
features = features[["transacted_qty", "d_open_interest", "mid",
                     "bid1", "bid2", "ask1", "ask2",
                     "bid1vol", "bid2vol", "bid3vol", "bid4vol", "bid5vol",
                     "ask1vol", "ask2vol", "ask3vol", "ask4vol", "ask5vol"]]
# price/volume interaction features
features["bidcross1"] = features["bid1"] * features["bid1vol"]
features["bidcross2"] = features["bid2"] * features["bid2vol"]
features["askcross1"] = features["ask1"] * features["ask1vol"]
features["askcross2"] = features["ask2"] * features["ask2vol"]

X = scalar.transform(features.values)
df_test = pd.read_csv('data/test.csv', index_col=0)
df_test['Predicted'] = model.predict_proba(X)[:, 1]
df_test[['Predicted']].to_csv('submission.csv')
X_test_scaled = scaler.transform(X_test)
test_x_scaled = scaler.transform(test_x)
print(X_train_scaled)

# visualization
import matplotlib.pyplot as plt

plt.hist(X_train_scaled)
plt.title('StandardScaler')
plt.show()

# measure accuracy
acc = LGBM.score(X_test, y_test)
print('acc: ', acc)  # 0.8454961374034351

# predict
y_pred = LGBM.predict_proba(test_x)
print(y_pred)

# plot feature importances
import numpy as np
import matplotlib.pyplot as plt


def plot_feature_importances_orb(model):
    n_features = train_x.shape[1]
    plt.barh(np.arange(n_features), LGBM.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), feat_labels)
    plt.xlabel("feature importance")
    plt.ylabel("feature")
    plt.ylim(-1, n_features)
def kfold_lightgbm(df, num_folds, lgb_param, stratified=False, debug=False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=50)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=50)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns
             if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(**lgb_param)
        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=100, early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        logging.info('Fold %2d AUC : %.6f' %
                     (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    full_auc = roc_auc_score(train_df['TARGET'], oof_preds)
    print('Full AUC score %.6f' % full_auc)
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    return full_auc
    time2_1 = time.time()
    print('XGBoost run time', time2_1 - time2_0)

# model with LightGBM
if flag == 3 or flag == 0:
    print("starting LightGBM training")
    time3_0 = time.time()
    # the original passed depth=12, which is not a LightGBM parameter name;
    # max_depth is the intended spelling
    lgb = LGBMClassifier(objective='binary', learning_rate=0.02,
                         n_estimators=100, num_leaves=45, max_depth=12,
                         colsample_bytree=0.8, min_child_samples=14,
                         subsample=0.9)
    lgb.fit(x_train, y_train)
    test_lgb_prob = lgb.predict_proba(x_test)
    train_lgb_prob = lgb.predict_proba(x_train)
    print('LightGBM train log loss', log_loss(y_train, train_lgb_prob))
    print('LightGBM test log loss', log_loss(y_test, test_lgb_prob))
    time3_1 = time.time()
    print('LightGBM run time', time3_1 - time3_0)

'''
# predict on the hold-out set for the online leaderboard
import getFearures01
path_test = '../data/round1_ijcai_18_test_b_20180418.txt'
test_df = getFearures01.cpfeature(path_test)
test_pre = lgb.predict_proba(test_df)
result = pd.DataFrame({'instance_id': test_df['instance_id'],
                       'predicted_score': test_pre[:, 1]})
result.to_csv('./result.csv', sep=' ', header=True, index=None)
'''
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns
             if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=8,
            #is_unbalance=True,
            n_estimators=10000,
            learning_rate=0.1,
            num_leaves=32,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            silent=-1,
            verbose=-1,
            #scale_pos_weight=11
        )
        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=100, early_stopping_rounds=200)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        with open(filename, 'a') as f:
            f.write(f"Fold {n_fold+1} AUC: {roc_auc_score(valid_y, oof_preds[valid_idx]):.6f}\n")
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    with open(filename, 'a') as f:
        f.write(f"Full AUC: {roc_auc_score(train_df['TARGET'], oof_preds):.6f}\n")

    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    display_importances(feature_importance_df)
    return feature_importance_df
eval_ps = [1 / i for i in range(1, 20)]
res = pd.DataFrame([], index=mds, columns=eval_ps)
for md in mds:
    learner = LGBMClassifier(n_estimators=10000, max_depth=md)
    learner.fit(trainset.drop("reordered", axis=1), trainset.reordered,
                eval_metric="auc", early_stopping_rounds=10,
                eval_set=[(trainset.drop("reordered", axis=1), trainset.reordered),
                          (evalset.drop("reordered", axis=1), evalset.reordered)])
    preds = learner.predict_proba(evalset.drop("reordered", axis=1))[:, -1]
    for p in eval_ps:
        ppreds = evalset[preds > p]
        ppreds = ppreds.groupby("user_id").product_id.apply(set)
        ppreds.name = "preds"
        real.name = "real"
        comp = pd.concat([real, ppreds], axis=1)
        temp = pd.Series([set([0])] * comp.shape[0], index=comp.index)
        comp.real.fillna(temp, inplace=True)
        comp.preds.fillna(temp, inplace=True)
        comp["tp"] = comp.apply(lambda x: len(x["real"].intersection(x.preds)), axis=1)
        comp["acc"] = comp.tp / comp["preds"].apply(len)
        comp["recall"] = comp.tp / comp["real"].apply(len)
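# As excerpted, the loop above allocates `res` but never fills it. A plausible
# completion (an assumption, not the original code) records the mean per-user
# F1 for each (max_depth, threshold) cell inside the `for p in eval_ps:` loop
# and then reads off the best pair:
#
#     comp["f1"] = 2 * comp["acc"] * comp["recall"] / (comp["acc"] + comp["recall"])
#     res.loc[md, p] = comp["f1"].fillna(0).mean()
#
# best_md, best_p = res.stack().astype(float).idxmax()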
# (stratified) Cross validation
for train_index, validation_index in kf.split(X, y):
    print("Cross-validation, Fold %d" % (len(log_loss_val) + 1))

    # Split data into training and testing set
    X_train = X.iloc[train_index, :].copy()
    X_validate = X.iloc[validation_index, :].copy()
    y_train = y[train_index]
    y_validate = y[validation_index]

    # Train the model
    model = model.fit(X_train, y_train)

    # Test the model
    log_loss_val.append(log_loss(y_validate, model.predict_proba(X_validate)))
    print("Log loss: %f" % log_loss_val[-1])

    # Make predictions
    y_pred.append(model.predict_proba(test[X.columns])[:, 1])

    # delete temporary dataframes
    del X_train, X_validate, y_train, y_validate

# Evaluate results from CV
print("Log loss %f +/- %f" % (np.mean(log_loss_val), 2 * np.std(log_loss_val)))

## =========================== 4. Output results =========================== ##
# Create output dataframes
submission = pd.DataFrame({
    'msno': test.msno,
    # the excerpt breaks off here; averaging the per-fold test predictions is
    # the natural completion (the KKBox-style column name is an assumption)
    'is_churn': np.mean(y_pred, axis=0),
})
def kfold_lightgbm(df, debug=False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    folds = KFold(n_splits=10, shuffle=True, random_state=1001)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])  # predicted valid_y
    sub_preds = np.zeros(test_df.shape[0])  # submission preds
    feature_importance_df = pd.DataFrame()  # feature importance
    fold_auc_best_df = pd.DataFrame(columns=["FOLD", "AUC", "BEST_ITER"])  # per-fold best iteration, used to refit the final model

    feats = [f for f in train_df.columns
             if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index',
                          "APP_index", "BURO_index", "PREV_index", "INSTAL_index",
                          "CC_index", "POS_index"]]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            n_jobs=-1,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1,
        )

        clf.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=200, early_stopping_rounds=200)

        # predicted valid_y
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # submission preds: each fold predicts the test set and the fold predictions are averaged
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        # fold, auc and best iteration
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

        # best auc & iteration (pd.concat replaces the deprecated DataFrame.append)
        fold_auc_best_df = pd.concat([fold_auc_best_df,
                                      pd.DataFrame([{'FOLD': int(n_fold + 1),
                                                     'AUC': roc_auc_score(valid_y, oof_preds[valid_idx]),
                                                     'BEST_ITER': clf.best_iteration_}])],
                                     ignore_index=True)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # OUTPUTS
    print(fold_auc_best_df)
    print(feature_importance_df)

    # save the feature importances and per-fold scores as dataframes
    feature_importance_df.to_pickle("outputs/features/feature_importance_df.pkl")
    fold_auc_best_df.to_pickle("outputs/features/fold_auc_best_df.pkl")

    # Final Model: refit on the full training set with the best iteration
    # count taken from the highest-scoring fold
    best_iter_1 = int(fold_auc_best_df.sort_values(by="AUC", ascending=False)[:1]["BEST_ITER"].values)
    y_train = train_df["TARGET"]
    x_train = train_df[feats]
    final_model = LGBMClassifier(
        n_jobs=-1,
        n_estimators=best_iter_1,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        silent=-1,
        verbose=-1).fit(x_train, y_train)

    cur_dir = os.getcwd()
    os.chdir('models/reference/')
    pickle.dump(final_model, open("lightgbm_final_model.pkl", 'wb'))  # persist the model
    os.chdir(cur_dir)

    # the valid_y values predicted in each fold are, taken together,
    # out-of-fold predictions covering the whole training set
    cowsay.cow('Full Train(Validation) AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))

    # Write submission file and plot feature importance
    if not debug:
        cur_dir = os.getcwd()
        os.chdir('outputs/predictions/')
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv("reference_submission.csv", index=False)
        os.chdir(cur_dir)
        display_importances(feature_importance_df)

    del x_train, y_train
    return feature_importance_df
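# Reloading the persisted model later (illustrative sketch; the path mirrors
# the pickle.dump call above, the rest is an assumption):
import pickle

with open('models/reference/lightgbm_final_model.pkl', 'rb') as f:
    reloaded_model = pickle.load(f)
# scores = reloaded_model.predict_proba(test_df[feats])[:, 1]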
def OOFPreds(X, y, test_X, params, n_splits=5, random_state=23, clf='lgb'):
    """
    Inputs are expected to be DataFrames; returns Series.
    """
    # kept for later feature-importance analysis
    feature_importance = pd.DataFrame(columns=['feature', 'importance', 'fold'])
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # oof holds the cross-validation predictions, sub the test-set predictions
    oof_preds, sub_preds = np.zeros(X.shape[0]), np.zeros(test_X.shape[0])
    oof_train = np.zeros(X.shape[0])
    print(X.shape, test_X.shape)

    valid_scores = []
    train_scores = []
    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        trn_x, trn_y = X.iloc[trn_idx], y.iloc[trn_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

        # # constant init scores (disabled)
        # trn_init_score = pd.Series([0.95] * len(trn_x), index=trn_x.index)
        # val_init_score = pd.Series([0.95] * len(val_x), index=val_x.index)

        # model fitting and prediction
        if clf == 'lgb':
            with timer('fold {} training time:'.format(n_fold)) as time:
                gbm = LGBMClassifier(**params)
                # init_score/eval_init_score referred to the commented-out
                # Series above and raised a NameError, so they are dropped here
                gbm.fit(trn_x, trn_y,
                        eval_set=[(trn_x, trn_y), (val_x, val_y)],
                        eval_metric='auc', verbose=30, early_stopping_rounds=100)
                print('best iteration: {}'.format(gbm.best_iteration_))
                # assumes the custom `timer` context manager yields elapsed seconds
                print('time per 100 iterations: {:.3f}'.format(time * 100 / gbm.best_iteration_))

            pred_val = gbm.predict_proba(val_x, num_iteration=gbm.best_iteration_)[:, 1]
            pred_test = gbm.predict_proba(test_X, num_iteration=gbm.best_iteration_)[:, 1]

            # record prediction scores and results
            oof_preds[val_idx] = pred_val
            sub_preds += pred_test / folds.n_splits

            print(gbm.best_score_)
            valid_score = gbm.best_score_['valid_1']['auc']
            train_score = gbm.best_score_['training']['auc']
            valid_scores.append(valid_score)
            train_scores.append(train_score)

            feature_importance = pd.concat([feature_importance, pd.DataFrame({
                'importance': gbm.feature_importances_,
                'fold': [n_fold + 1] * X.shape[1],
                'feature': X.columns.tolist()})])
        else:
            # your own model goes here
            # task 1: build the model and produce predictions
            # task 2: record prediction scores and results
            # task 3: record feature importances
            clf = LogisticRegression(**params)
            clf.fit(trn_x, trn_y)
            pred_train = clf.predict_proba(trn_x)[:, 1]
            pred_val = clf.predict_proba(val_x)[:, 1]
            pred_test = clf.predict_proba(test_X)[:, 1]

            oof_preds[val_idx] = pred_val
            sub_preds += pred_test / folds.n_splits

            valid_score = roc_auc_score(val_y, pred_val)
            train_score = roc_auc_score(trn_y, pred_train)
            valid_scores.append(valid_score)
            train_scores.append(train_score)

            feature_importance = pd.concat([feature_importance, pd.DataFrame({
                'importance': clf.coef_[0],
                'fold': [n_fold + 1] * X.shape[1],
                'feature': X.columns.tolist()})])

        print('Fold {:02d} train AUC: {:.6f} valid AUC: {:.6f}'.format(n_fold + 1, train_score, valid_score))
        del trn_x, trn_y, val_x, val_y
        gc.collect()

    feature_importance['importance'] = feature_importance['importance'].astype(float)
    fold_names = list(range(folds.n_splits))
    fold_names.append('overall')
    valid_auc = roc_auc_score(y, oof_preds)
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # DataFrame recording the per-fold and overall scores
    metrics = pd.DataFrame({'fold': fold_names, 'train': train_scores, 'valid': valid_scores})

    oof_preds = pd.Series(oof_preds.flatten(), index=X.index).rename('TARGET')
    sub_preds = pd.Series(sub_preds.flatten(), index=test_X.index).rename('TARGET')
    return oof_preds, sub_preds, feature_importance, metrics
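# Usage sketch for OOFPreds (illustrative; the dataframes and the parameter
# dict below are assumptions, not from the original source):
lgb_params = {'n_estimators': 10000, 'learning_rate': 0.02,
              'num_leaves': 34, 'n_jobs': -1}
oof, sub, imp, metrics = OOFPreds(train_X, train_y, test_X, lgb_params,
                                  n_splits=5, random_state=23, clf='lgb')
print(metrics)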
def clean_data(data):
    warnings.simplefilter(action='ignore')

    # Removing empty features
    nun = data.nunique()
    empty = list(nun[nun <= 1].index)
    data.drop(empty, axis=1, inplace=True)
    print('After removing empty features there are {0:d} features'.format(data.shape[1]))

    # Removing features with the same distribution on 0 and 1 classes
    corr = pd.DataFrame(index=['diff', 'p'])
    ind = data[data['TARGET'].notnull()].index
    for c in data.columns.drop('TARGET'):
        corr[c] = corr_feature_with_target(data.loc[ind, c], data.loc[ind, 'TARGET'])
    corr = corr.T
    corr['diff_norm'] = abs(corr['diff'] / data.mean(axis=0))

    to_del_1 = corr[((corr['diff'] == 0) & (corr['p'] > .05))].index
    to_del_2 = corr[((corr['diff_norm'] < .5) & (corr['p'] > .05))].drop(to_del_1).index
    to_del = list(to_del_1) + list(to_del_2)
    if 'SK_ID_CURR' in to_del:
        to_del.remove('SK_ID_CURR')
    data.drop(to_del, axis=1, inplace=True)
    print('After removing features with the same distribution on 0 and 1 classes there are {0:d} features'.format(data.shape[1]))

    # Removing features with not the same distribution on train and test datasets
    corr_test = pd.DataFrame(index=['diff', 'p'])
    target = data['TARGET'].notnull().astype(int)
    for c in data.columns.drop('TARGET'):
        corr_test[c] = corr_feature_with_target(data[c], target)
    corr_test = corr_test.T
    corr_test['diff_norm'] = abs(corr_test['diff'] / data.mean(axis=0))

    bad_features = corr_test[((corr_test['p'] < .05) & (corr_test['diff_norm'] > 1))].index
    bad_features = corr.loc[bad_features][corr['diff_norm'] == 0].index
    data.drop(bad_features, axis=1, inplace=True)
    print('After removing features with not the same distribution on train and test datasets there are {0:d} features'.format(data.shape[1]))

    del corr, corr_test
    gc.collect()

    # Removing features not interesting for classifier
    clf = LGBMClassifier(random_state=0)
    train_index = data[data['TARGET'].notnull()].index
    train_columns = data.drop('TARGET', axis=1).columns

    score = 1
    new_columns = []
    while score > .7:
        train_columns = train_columns.drop(new_columns)
        clf.fit(data.loc[train_index, train_columns], data.loc[train_index, 'TARGET'])
        f_imp = pd.Series(clf.feature_importances_, index=train_columns)
        score = roc_auc_score(data.loc[train_index, 'TARGET'],
                              clf.predict_proba(data.loc[train_index, train_columns])[:, 1])
        new_columns = f_imp[f_imp > 0].index

    data.drop(train_columns, axis=1, inplace=True)
    print('After removing features not interesting for classifier there are {0:d} features'.format(data.shape[1]))

    return data
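# clean_data depends on a corr_feature_with_target helper that is not shown in
# this excerpt. Judging from how its result is used (rows named 'diff' and
# 'p'), a plausible sketch is the difference of class means plus a two-sample
# test p-value. This is an assumption, not the original implementation.
from scipy.stats import mannwhitneyu

def corr_feature_with_target(feature, target):
    # mean difference of the feature between the two target classes
    diff = feature[target == 1].mean() - feature[target == 0].mean()
    # p-value of a Mann-Whitney U test between the two class distributions
    p = mannwhitneyu(feature[target == 1].dropna(),
                     feature[target == 0].dropna()).pvalue
    return [diff, p]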