Example #1
0
# Train a binary LightGBM model on the item-property features, then score
# every row of `data` with its positive-class probability.
gbm = LGBMClassifier(objective='binary',
                     n_estimators=2000,
                     learning_rate=0.1,
                     num_leaves=24,
                     max_depth=3,
                     subsample=0.8,
                     colsample_bytree=0.3,
                     seed=2018,
                     n_jobs=-1)

print('fitting...')
gbm.fit(ip_train,
        train.loc[train_index, 'is_trade'],
        eval_set=[(ip_test, train.loc[test_index, 'is_trade'])],
        early_stopping_rounds=10)

# Collect instance ids next to their predicted property probability.
property_df = pd.DataFrame({
    'instance_id': data['instance_id'],
    'item_property_prob': gbm.predict_proba(data_ip)[:, 1],
})

def NatureLP(data, columns):
	"""Placeholder for a natural-language-processing step over *columns* of *data*.

	NOTE(review): not implemented — currently a no-op returning None.
	"""
	
	pass

# Persist the per-instance property probabilities as a space-separated file.
print('saving...')
property_df.to_csv(wd + out_put[0], sep=' ', index=False)




def lgb_model(apptype_train, app_desc, apptype_train_term_doc,
              app_desc_term_doc, **params):
    """Cross-validated LightGBM multiclass model.

    Trains one LGBMClassifier per StratifiedKFold fold on the training
    term-document matrix and accumulates class-probability predictions.

    :param apptype_train: training dataframe; must contain a 'label1' column
        with integer class labels assumed to span 0..max.
    :param app_desc: test dataframe (only its row count is used here).
    :param apptype_train_term_doc: training feature matrix, row-indexable.
    :param app_desc_term_doc: test feature matrix.
    :param params: optional LGBMClassifier keyword overrides; merged over the
        tuned defaults below.
    :return: (oof_lgb, prediction_lgb) — out-of-fold probabilities of shape
        (n_train, num_class) and test probabilities summed (not averaged)
        over folds, shape (n_test, num_class).
    """
    import numpy as np
    from lightgbm import LGBMClassifier
    from sklearn.model_selection import StratifiedKFold
    from sklearn import metrics

    # Number of classes (e.g. 122); labels assumed to be 0..max.
    num_class = apptype_train['label1'].max() + 1
    label = apptype_train['label1']

    n_splits = 5

    # Tuned defaults.  BUG FIX: the original rebound ``params`` to this dict,
    # silently discarding every caller-supplied keyword; now caller overrides
    # take precedence over the defaults.
    defaults = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'nthread': -1,
        'silent': True,  # suppress per-iteration logging (default is False)
        'learning_rate': 0.01,
        'num_leaves': 1000,
        'max_depth': 7,  # from the second round of cross-validation tuning
        'max_bin': 127,
        'subsample_for_bin': 1000,
        'subsample': 0.8,
        'subsample_freq': 1,
        'colsample_bytree': 0.8,
    }
    params = {**defaults, **params}

    oof_lgb = np.zeros((apptype_train.shape[0], num_class))
    prediction_lgb = np.zeros((app_desc.shape[0], num_class))
    for i, (tr, va) in enumerate(
            StratifiedKFold(n_splits=n_splits, shuffle=True,
                            random_state=2019).split(apptype_train_term_doc,
                                                     label)):
        print('fold:', i + 1, 'training')
        # Train on this fold's training split.
        bst = LGBMClassifier(**params).fit(X=apptype_train_term_doc[tr],
                                           y=label[tr])
        # Out-of-fold predictions for the held-out split.
        oof_lgb[va] += bst.predict_proba(apptype_train_term_doc[va],
                                         num_iteration=bst.best_iteration_)
        # Accumulate test-set predictions (summed over folds; callers that
        # need probabilities should divide by n_splits or take argmax).
        prediction_lgb += bst.predict_proba(app_desc_term_doc,
                                            num_iteration=bst.best_iteration_)

    print(
        "model acc_score:",
        metrics.accuracy_score(label,
                               np.argmax(oof_lgb, axis=1),
                               normalize=True,
                               sample_weight=None))

    return oof_lgb, prediction_lgb
Example #3
0
def train():
    """Train a binary (target==90 vs rest) LightGBM model on the PLAsTiCC-style
    training set stored in an HDF5 file.

    Relies on module-level names: ``filename``, ``DATA_DIR``,
    ``process_train_set``, ``pd``, ``np``, ``gc``, ``KFold``,
    ``LGBMClassifier`` and ``roc_auc_score``.

    Returns the final classifier refit on the full training data.
    """
    store = pd.HDFStore(filename)

    # run length stop indices
    df_idx = pd.read_hdf(store, 'idx')

    idx = np.array(df_idx[0], dtype = np.int32)

    # Prepend 0 so consecutive differences give per-object run lengths.
    idx = np.insert(idx, 0, 0)

    begin_offset = idx[:-1]
    lengths = np.diff(idx)

    # Free the index frames before loading the bulk data.
    del df_idx
    del idx

    gc.collect()


    data = process_train_set(store, begin_offset, lengths)

    meta = pd.read_csv(DATA_DIR + "training_set_metadata.csv")

    y = np.array (meta.target, dtype = np.int32)

    # Binarise: class 90 -> 1, everything else -> 0 (mask computed first,
    # so the relabelling cannot clobber itself).
    m = y == 90

    y[m] = 1
    y[~m] = 0

    num_splits = 8

    folds = KFold(n_splits=num_splits, shuffle=True, random_state=11)

    # Out-of-fold positive-class probabilities.
    oof_preds = np.zeros(data.shape[0])

    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(data)):

        trn_x, trn_y = data[trn_idx], y[trn_idx]
        val_x, val_y = data[val_idx], y[val_idx]

        print (n_fold)

        clf = LGBMClassifier(n_estimators=20000, learning_rate=0.01, num_leaves = 255, silent=-1, verbose=-1)


        clf.fit(trn_x, trn_y,  eval_set= [(trn_x, trn_y), (val_x, val_y)], eval_metric='auc', verbose=25, early_stopping_rounds=400)

        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]

        print('Fold %2d AUC : %.3f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        # Release fold data before the next allocation.
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()


    print('Full AUC score %.3f' % roc_auc_score(y, oof_preds))


    # Refit a small final model on all data.  NOTE(review): the eval_set is
    # the training data itself, so early stopping can hardly trigger here —
    # presumably intentional for this smaller model, but worth confirming.
    clf = LGBMClassifier(n_estimators=200, learning_rate=0.01, max_depth=5, num_leaves = 31, silent=-1, verbose=-1)
    clf.fit(data, y,  eval_set= [(data, y)], eval_metric='auc', verbose=25, early_stopping_rounds=400)

    return clf
                         colsample_bytree=.8,
                         subsample=.9,
                         max_depth=7,
                         reg_alpha=.1,
                         reg_lambda=.1,
                         min_split_gain=.01,
                         min_child_weight=2)

    clf.fit(trn_x,
            trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_metric='auc',
            verbose=250,
            early_stopping_rounds=150)

    oof_preds[val_idx] = clf.predict_proba(
        val_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(
        test[features], num_iteration=clf.best_iteration_)[:,
                                                           1] / folds.n_splits

    print('Fold %2d AUC : %.6f' %
          (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del clf, trn_x, trn_y, val_x, val_y
    gc.collect()

print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))

test['TARGET'] = sub_preds

test[['SK_ID_CURR', 'TARGET']].to_csv('first_submission.csv',
                                      index=False,
    def fit(self, X: pd.DataFrame, y: np.array) -> tuple:
        """Cross-validated LightGBM training with categorical encoding.

        Depending on ``self.cat_validation`` the categorical columns are
        encoded once on the full data ("None"), per fold ("Single"), or with
        double validation ("Double").  One model is trained per fold; fitted
        encoders, models and per-fold tree counts / AUC scores are appended
        to the corresponding ``self.*`` lists.

        :param X: feature dataframe (contains ``self.cat_cols``).
        :param y: binary target array, indexable by fold positions.
        :return: (mean train AUC, mean validation AUC, mean best-iteration
            tree count as int).
        """
        # process cat cols
        # Encode once, outside the CV loop, when no per-fold validation of
        # the encoding is requested.
        if self.cat_validation == "None":
            encoder = MultipleEncoder(
                cols=self.cat_cols, encoders_names_tuple=self.encoders_names
            )
            X = encoder.fit_transform(X, y)

        for n_fold, (train_idx, val_idx) in enumerate(
            self.model_validation.split(X, y)
        ):
            X_train, X_val = (
                X.loc[train_idx].reset_index(drop=True),
                X.loc[val_idx].reset_index(drop=True),
            )
            y_train, y_val = y[train_idx], y[val_idx]
            print(f"shapes before encoder : ", X_train.shape, X_val.shape)

            # NOTE(review): if cat_validation is not one of "None"/"Single"/
            # "Double", ``encoder`` is unbound here on the first fold and the
            # append below raises NameError — confirm the allowed values are
            # validated upstream.
            if self.cat_validation == "Single":
                encoder = MultipleEncoder(
                    cols=self.cat_cols, encoders_names_tuple=self.encoders_names
                )
                X_train = encoder.fit_transform(X_train, y_train)
                X_val = encoder.transform(X_val)
            if self.cat_validation == "Double":
                encoder = DoubleValidationEncoderNumerical(
                    cols=self.cat_cols, encoders_names_tuple=self.encoders_names
                )
                X_train = encoder.fit_transform(X_train, y_train)
                X_val = encoder.transform(X_val)
                pass
            self.encoders_list.append(encoder)

            # check for OrdinalEncoder encoding
            # LightGBM treats pandas "category" dtype columns natively.
            for col in [col for col in X_train.columns if "OrdinalEncoder" in col]:
                X_train[col] = X_train[col].astype("category")
                X_val[col] = X_val[col].astype("category")

            # fit model
            print(f"shapes before model : ", X_train.shape, X_val.shape)
            model = LGBMClassifier(**self.model_params)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                verbose=100,
                early_stopping_rounds=100,
            )
            self.models_trees.append(model.best_iteration_)
            self.models_list.append(model)

            # Per-fold AUC on train and validation splits.
            y_hat = model.predict_proba(X_train)[:, 1]
            score_train = roc_auc_score(y_train, y_hat)
            self.scores_list_train.append(score_train)
            y_hat = model.predict_proba(X_val)[:, 1]
            score_val = roc_auc_score(y_val, y_hat)
            self.scores_list_val.append(score_val)

            print(f"AUC on {n_fold} fold train : {np.round(score_train, 4)}\n\n ")
            print(f"AUC on {n_fold} fold val : {np.round(score_val, 4)}\n\n ")

        mean_score_train = np.mean(self.scores_list_train)
        mean_score_val = np.mean(self.scores_list_val)
        avg_num_trees = int(np.mean(self.models_trees))
        print(f"\n\n Mean score train : {np.round(mean_score_train, 4)}\n\n ")
        print(f"\n\n Mean score val : {np.round(mean_score_val, 4)}\n\n ")
        return mean_score_train, mean_score_val, avg_num_trees
Example #6
0
# LightGBM hyper-parameters for the PCA-reduced features.
lgb_params = dict()
lgb_params['learning_rate'] = 0.01
lgb_params['n_estimators'] = 1000
# lgb_params['max_depth'] = 10
lgb_params['max_bin'] = 10
lgb_params['subsample'] = 0.8
lgb_params['subsample_freq'] = 10
lgb_params['colsample_bytree'] = 0.8
lgb_params['min_child_samples'] = 500

lgb = LGBMClassifier(**lgb_params)

skf = StratifiedKFold(n_splits=3, shuffle=True)

# One column of test predictions per fold model.
predictions = np.zeros((test_pca.shape[0], 3))
# BUG FIX: the original re-initialised ``i = 0`` INSIDE the loop, so every
# fold overwrote column 0 and columns 1-2 stayed zero, biasing the mean
# below toward 0.  enumerate() gives each fold its own column.
for i, (train_index, test_index) in enumerate(skf.split(train_pca, train_target)):
    lgb_train = train_pca[train_index]
    lgb_target = train_target[train_index]
    lgb.fit(lgb_train, lgb_target)
    y_pred = lgb.predict_proba(test_pca)[:, 1]
    predictions[:, i] = y_pred

# write the result to a csv (average of the three fold models)

res = pd.DataFrame()
res['id'] = test_id
res['target'] = predictions.mean(axis=1)
res.to_csv('smooth_pred.csv', index=False)
Example #7
0
def main():
    """End-to-end train/evaluate pipeline for the TB/JXL credit model.

    Loads topic features and raw performance data, merges them, cleans
    sentinel missing values, runs feature engineering, tunes an
    LGBMClassifier via Bayesian optimisation, then reports ACC / AUC / KS
    on both the train and test splits.
    """
    topicdata = pd.read_csv(r'/data/work/wk/tb/20170906/user_out.csv')
    tbjxldata = pd.read_csv(
        r'/data/work/wk/tb/data/tbjxl_r360dtl_data20170713_uft8.csv')
    # Keep usable samples only.  BUG FIX: the original wrote
    # ``(a != 2) & b == 1`` which parses as ``((a != 2) & b) == 1`` because
    # comparisons bind looser than ``&``; both comparisons must be
    # parenthesised.
    tbjxldata = tbjxldata[(tbjxldata['target'] != 2)
                          & (tbjxldata['flg_sample'] == 1)]

    topicdata.rename(columns={"ugid": "user_gid"}, inplace=True)
    rawdata = pd.merge(topicdata, tbjxldata, on='user_gid')

    # Identifier / leakage columns excluded from modelling.
    exclude = [
        'cust_nm', 'register_mobile', 'flg_jxl', 'flg_tb', 'flg_sample',
        'user_gid', 'IDCardNO', 'decision_tm', 'usertype', 'ugid', 'weight',
        'phone', 'id_card_1', 'mobile_auth', 'first_decision_tm',
        'register_time', 'credit_history', 'cust_nm_sha', 'id_card_sha',
        'mobile_sha', 'cust_nm_1', 'target1', 'cust_perf', 'source'
    ]

    features = [f for f in rawdata.columns if f not in exclude]
    data = rawdata[features]
    # '@' and the numeric sentinel codes below all encode "missing".
    data = data.replace(
        ['@', -9999976, -99999976, -9999977, -9999978, -99999980, -99998.0],
        np.nan)
    print("data shape %s" % str(data.shape))

    # count missing data in each column.  BUG FIX: the original placed a
    # ``break`` before the print, which made the report unreachable and
    # stopped scanning at the first column with missing values.
    invest = data.isnull().sum()
    for i in invest.index:
        if invest[i] > 0:
            print("feature %s have missing %s data" % (i, str(invest[i])))

    # feature engineer
    standard_feature_obj = standard_feature_tree(data, 'target')
    standard_feature_obj.categ_continue_auto()
    standard_feature_obj.miss_inf_trans()
    standard_feature_obj.categ_label_trans()
    standard_feature_obj.format_train_test()
    #standard_feature_obj.apply_standardscale_classification()
    X_train = standard_feature_obj.sample_x
    y_train = standard_feature_obj.sample_y
    # model ops: Bayesian optimisation over the search ranges below.
    bayesopsObj = bayes_ops(X=X_train, Y=y_train, estimator=LGBMClassifier)
    parms = {
        #'x_train':X_train,
        #'y_train':y_train,
        'num_leaves': (15, 500),
        'colsample_bytree': (0.1, 1),
        'drop_rate': (0.1, 1),
        'learning_rate': (0.001, 0.05),
        'max_bin': (10, 100),
        'max_depth': (2, 20),
        'min_split_gain': (0.2, 0.9),
        'min_child_samples': (10, 200),
        'n_estimators': (100, 3000),
        'reg_alpha': (0.1, 100),
        'reg_lambda': (0.1, 100),
        'sigmoid': (0.5, 1),
        'subsample': (0.1, 1),
        'subsample_for_bin': (10000, 50000),
        'subsample_freq': (1, 5)
    }
    # Parameter type hints for the optimiser; only keys present in ``parms``
    # actually matter.
    intdeal = [
        'max_bin', 'max_depth', 'max_drop', 'min_child_samples',
        'min_child_weight', 'n_estimators', 'num_leaves', 'scale_pos_weight',
        'subsample_for_bin', 'subsample_freq'
    ]  # integer-valued parameters
    middledeal = [
        'colsample_bytree', 'drop_rate', 'learning_rate', 'min_split_gain',
        'skip_drop', 'subsample', ''
    ]  # floats constrained to (0, 1)
    maxdeal = ['reg_alpha', 'reg_lambda', 'sigmoid']  # floats, may exceed 1
    bayesopsObj.run(
        parms=parms,
        cv=10,
        intdeal=intdeal,
        middledeal=middledeal,
        maxdeal=maxdeal,
        score_func=make_scorer(score_func=accuracy_score,
                               greater_is_better=True),
    )

    # Refit a single model with the best parameters found.
    parms = bayesopsObj.baseparms
    model = LGBMClassifier(**parms)
    print(model)
    model.fit(X_train, y_train)

    # trainingset evaluation
    print('trainingset evaluation')
    y_pred = model.predict(X_train)
    # NOTE(review): column 0 is the probability of the NEGATIVE class —
    # confirm ks_statistic expects that orientation.
    y_pred_prob = model.predict_proba(X_train)[:, 0]
    acc = accuracy_score(y_pred, y_train, normalize=True)
    print('acc=%s' % str(acc))
    # NOTE(review): AUC is computed from hard labels, not probabilities —
    # presumably intentional, but probabilities would be the usual choice.
    auc = roc_auc_score(y_score=y_pred, y_true=y_train.values)
    print('auc=%s' % str(auc))
    #evl.ks_curve(Y_true = y_train.values, Y_predprob = y_pred_prob, fig_path = 'lgr_train.png')
    ksobj = ks_statistic(yprob=y_pred_prob, ytrue=y_train.values)
    ksobj.cal_ks()
    print('ks=%s' % str(ksobj.ks))

    # testset evaluation
    print('testset evaluation')
    X_test = standard_feature_obj.test_x
    y_test = standard_feature_obj.test_y
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 0]
    acc = accuracy_score(y_pred, y_test, normalize=True)
    print('acc=%s' % str(acc))
    auc = roc_auc_score(y_score=y_pred, y_true=y_test.values)
    print('auc=%s' % str(auc))
    #evl.ks_curve(Y_true = y_train.values, Y_predprob = y_pred_prob, fig_path = 'lgr_train.png')
    ksobj = ks_statistic(yprob=y_pred_prob, ytrue=y_test.values)
    ksobj.cal_ks()
    print('ks=%s' % str(ksobj.ks))
Example #8
0
    def predict(self, F, datainfo, timeinfo):
        '''
        Predict positive-class probabilities for the (test) data in F.

        Steps:
          1. Rebuild the test feature matrix: numeric features with NaNs
             zeroed, plus frequency-encoded categoricals when present.
          2. Adversarial validation: 5-fold CV models learn to separate
             training rows (label 0) from test rows (label 1); every row
             gets a propensity score.
          3. Re-fit self.clf on the 75% of training rows that look LEAST
             like the test set, validating on the most test-like 25%.
          4. Return self.clf's positive-class probability for each test row.

        NOTE(review): re-training inside predict() is unusual but appears
        deliberate for this time-budgeted AutoML-style pipeline.
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Frequency encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            X_cat = self.cat_encs.transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        # Adversarial validation
        logging.info('AV: starting adversarial validation...')

        np.random.seed(SEED)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

        n_trn = self.X.shape[0]
        n_tst = X.shape[0]

        # Stack train on top of test; label 0 = train row, 1 = test row.
        X_all = np.vstack((self.X, X))
        y_all = np.concatenate((np.zeros(n_trn, ), np.ones(n_tst, )))
        logging.info(f'AV: {X_all.shape}, {y_all.shape}')

        ps_all = np.zeros_like(y_all, dtype=float)
        for i, (i_trn, i_val) in enumerate(cv.split(X_all, y_all)):

            model_av = LGBMClassifier(**params)
            model_av.fit(X_all[i_trn],
                         y_all[i_trn],
                         eval_set=(X_all[i_val], y_all[i_val]),
                         early_stopping_rounds=10,
                         verbose=10)

            # Propensity = probability of being a test row.
            ps_all[i_val] = model_av.predict_proba(X_all[i_val])[:, 1]

        av_score = roc_auc_score(y_all, ps_all)
        logging.info(f'AV: AUC={av_score * 100: 3.2f}')
        logging.info(
            f'AV: propensity scores deciles: {np.percentile(ps_all, np.linspace(0, 1, 11))}'
        )

        # Training
        # Sort training rows by propensity: head = least test-like (train on
        # these), tail = most test-like (validate on these).
        idx = np.argsort(ps_all[:n_trn])
        trn_idx = idx[:int(n_trn * .75)]
        val_idx = idx[int(n_trn * .75):]

        np.random.shuffle(trn_idx)
        X_trn = self.X[trn_idx]
        y_trn = self.y[trn_idx]
        X_val = self.X[val_idx]
        y_val = self.y[val_idx]

        self.clf.fit(X_trn,
                     y_trn,
                     eval_set=(X_val, y_val),
                     early_stopping_rounds=10,
                     verbose=10)

        num_test_samples = X.shape[0]
        # NOTE(review): num_feat is unbound if X is 1-D — presumably X is
        # always 2-D here; confirm upstream.
        if X.ndim > 1: num_feat = X.shape[1]
        logging.info(
            ("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                     num_feat))
        if (self.num_feat != num_feat):
            logging.info(
                "ARRGH: number of features in X does not match training data!")
        logging.info(
            ("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                     self.num_labels))
        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y
Example #9
0
class Model:
    """Time-budgeted AutoML-style LightGBM model with adversarial-validation
    based train/validation splitting at predict time."""

    def __init__(self, datainfo, timeinfo):
        '''
        This constructor is supposed to initialize data members.
        Use triple quotes for function documentation.
        '''
        # Just logging.info some info from the datainfo variable
        logging.info("The Budget for this data set is: %d seconds" %
                     datainfo['time_budget'])

        logging.info(
            "Loaded %d time features, %d numerical Features, %d categorical features and %d multi valued categorical variables"
            % (datainfo['loaded_feat_types'][0],
               datainfo['loaded_feat_types'][1],
               datainfo['loaded_feat_types'][2],
               datainfo['loaded_feat_types'][3]))
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)
        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        # Classifier configured from the module-level ``params`` dict.
        self.clf = LGBMClassifier(**params)
        # Here you may have parameters and hyper-parameters

    def fit(self, F, y, datainfo, timeinfo):
        '''
        Store the training data (the classifier itself is fit in predict()).

        Builds self.X from numeric features (NaNs zeroed) plus
        frequency-encoded categoricals when present, and keeps y in self.y.
        If fit is called multiple times on incremental data (train, test1,
        test2, etc.) training warm-starts from the stored state; past data
        will NOT be available for re-training.
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        self.X = np.nan_to_num(F['numerical'])
        self.y = y

        # Frequency encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            self.cat_encs = FrequencyEncoder()
            X_cat = self.cat_encs.fit_transform(F['CAT']).values
            self.X = np.concatenate((self.X, X_cat), axis=1)
            del X_cat

        self.num_train_samples = self.X.shape[0]
        self.num_feat = self.X.shape[1]
        num_train_samples = y.shape[0]

        logging.info("The whole available data is: ")
        logging.info(
            ("Real-FIT: dim(X)= [{:d}, {:d}]").format(self.X.shape[0],
                                                      self.X.shape[1]))
        logging.info(
            ("Real-FIT: dim(y)= [{:d}, {:d}]").format(self.y.shape[0],
                                                      self.num_labels))

        self.is_trained = True

    def predict(self, F, datainfo, timeinfo):
        '''
        Predict positive-class probabilities for the (test) data in F.

        Runs adversarial validation (train rows vs test rows), re-fits
        self.clf on the 75% of training rows least similar to the test set
        (validating on the most test-like 25%), then returns the positive
        class probability for each test row.
        '''

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        logging.info("[***] Overall time spent %5.2f sec" % overall_spenttime)
        logging.info("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        date_cols = datainfo['loaded_feat_types'][0]
        numeric_cols = datainfo['loaded_feat_types'][1]
        categorical_cols = datainfo['loaded_feat_types'][2]
        multicategorical_cols = datainfo['loaded_feat_types'][3]

        # Get numerical variables and replace NaNs with 0s
        X = np.nan_to_num(F['numerical'])

        # Frequency encode categorical variables and concatenate them with numerical variables
        if categorical_cols > 0:
            X_cat = self.cat_encs.transform(F['CAT']).values
            X = np.concatenate((X, X_cat), axis=1)
            del X_cat

        # Adversarial validation
        logging.info('AV: starting adversarial validation...')

        np.random.seed(SEED)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

        n_trn = self.X.shape[0]
        n_tst = X.shape[0]

        # Label 0 = training row, 1 = test row.
        X_all = np.vstack((self.X, X))
        y_all = np.concatenate((np.zeros(n_trn, ), np.ones(n_tst, )))
        logging.info(f'AV: {X_all.shape}, {y_all.shape}')

        ps_all = np.zeros_like(y_all, dtype=float)
        for i, (i_trn, i_val) in enumerate(cv.split(X_all, y_all)):

            model_av = LGBMClassifier(**params)
            model_av.fit(X_all[i_trn],
                         y_all[i_trn],
                         eval_set=(X_all[i_val], y_all[i_val]),
                         early_stopping_rounds=10,
                         verbose=10)

            ps_all[i_val] = model_av.predict_proba(X_all[i_val])[:, 1]

        av_score = roc_auc_score(y_all, ps_all)
        logging.info(f'AV: AUC={av_score * 100: 3.2f}')
        logging.info(
            f'AV: propensity scores deciles: {np.percentile(ps_all, np.linspace(0, 1, 11))}'
        )

        # Training: head of the sort = least test-like rows.
        idx = np.argsort(ps_all[:n_trn])
        trn_idx = idx[:int(n_trn * .75)]
        val_idx = idx[int(n_trn * .75):]

        np.random.shuffle(trn_idx)
        X_trn = self.X[trn_idx]
        y_trn = self.y[trn_idx]
        X_val = self.X[val_idx]
        y_val = self.y[val_idx]

        self.clf.fit(X_trn,
                     y_trn,
                     eval_set=(X_val, y_val),
                     early_stopping_rounds=10,
                     verbose=10)

        num_test_samples = X.shape[0]
        if X.ndim > 1: num_feat = X.shape[1]
        logging.info(
            ("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                     num_feat))
        if (self.num_feat != num_feat):
            logging.info(
                "ARRGH: number of features in X does not match training data!")
        logging.info(
            ("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                     self.num_labels))
        y = self.clf.predict_proba(X)[:, 1]
        y = np.transpose(y)
        return y

    def save(self, path="./"):
        """Pickle this model to ``path + '_model.pickle'``."""
        # BUG FIX: pickle needs a binary file handle; the original opened the
        # file in text mode ("w"), which raises TypeError on write.  Also
        # close the handle deterministically.
        with open(path + '_model.pickle', "wb") as f:
            pickle.dump(self, f)

    def load(self, path="./"):
        """Return the model unpickled from ``path + '_model.pickle'`` if it
        exists, otherwise return self unchanged."""
        modelfile = path + '_model.pickle'
        if isfile(modelfile):
            # BUG FIX: open in binary mode ("rb") for unpickling; the
            # original text-mode open fails on the binary payload.
            with open(modelfile, "rb") as f:
                self = pickle.load(f)
            logging.info("Model reloaded from: " + modelfile)
        return self
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    """
    LightGBM GBDT with KFold or Stratified KFold.
    Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
    Separates train and test sets. Trains the model with tuned hyperparameters(found by Bayesian optimization) and
    creates feature importance dataframe.

    Returns a dataframe that shows highest 40 feature importances.

    Relies on module-level ``submission_file_name`` and ``display_importances``.

    :param df: dataframe
        dataframe to be trained; rows with null TARGET are the test set

    :param num_folds: int
        int that shows the number of splits for cross validation.

    :param stratified: bool
        boolean that indicates, if cross validation will be applied stratified or not.

    :param debug: bool
        boolean that indicates, if the model will be run debug mode or not.

    :return: dataframe

    """
    # Divide in training/validation and test data.
    # BUG FIX: take explicit copies — the originals were views of ``df``,
    # which is deleted below, and ``test_df['TARGET'] = ...`` later assigns
    # into that view (SettingWithCopyWarning / potentially lost write).
    train_df = df[df['TARGET'].notnull()].copy()
    test_df = df[df['TARGET'].isnull()].copy()
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc', verbose=200, early_stopping_rounds=200)

        # Out-of-fold predictions; test predictions averaged over folds.
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    display_importances(feature_importance_df)
    return feature_importance_df
def lgbm_modeling_cross_validation(params,
                                   full_train,
                                   y,
                                   classes=CLASSES,
                                   class_weights=CLASS_WEIGHTS,
                                   nr_fold=5,
                                   random_state=1,
                                   sweights=BEST_SWEIGHTS,
                                   smote=False,
                                   standard_scaler=False):
    """Stratified-KFold LightGBM multiclass training with a weighted
    multiclass log-loss objective.

    :param params: keyword arguments for LGBMClassifier.
    :param full_train: feature dataframe (ILLEGAL_FNAMES dropped below).
    :param y: target Series, positionally aligned with full_train.
    :param classes / class_weights: passed to the weighted log-loss metric.
    :param nr_fold: number of CV folds.
    :param random_state: fold shuffling seed.
    :param sweights: per-class sample weights; None => inverse-frequency.
    :param smote: oversample each fold's training split via smoteAdataset.
    :param standard_scaler: z-scale features before CV (NaNs filled with 0).
    :return: (fitted clfs, weighted OOF score, aggregated importances,
        OOF prediction dataframe, default-weight OOF score).
    """
    full_train = full_train.drop(ILLEGAL_FNAMES, axis=1, errors='ignore')
    # assert 'distmod' in full_train.columns
    if sweights is None:
        # Compute weights: inverse class frequency.
        w = y.value_counts()
        sweights = {i: np.sum(w) / w[i] for i in w.index}
    elif smote and sweights == BEST_SWEIGHTS:
        print(f'WARNING: got BEST_SWEIGHTS and smote=True')
    clfs = []
    importances = pd.DataFrame()
    folds = StratifiedKFold(n_splits=nr_fold,
                            shuffle=True,
                            random_state=random_state)
    oof_preds = np.zeros((len(full_train), np.unique(y).shape[0]))
    if standard_scaler:
        scl = StandardScaler()
        full_train = pd.DataFrame(scl.fit_transform(full_train.fillna(0)),
                                  index=full_train.index,
                                  columns=full_train.columns)
    # NOTE(review): split(y, y) passes y as X too — StratifiedKFold only uses
    # X for its length, so this is equivalent to split(full_train, y).
    for fold_, (trn_, val_) in tqdm_notebook(enumerate(folds.split(y, y)),
                                             total=nr_fold):
        trn_x, trn_y = full_train.iloc[trn_], y.iloc[trn_]
        val_x, val_y = full_train.iloc[val_], y.iloc[val_]
        if smote:

            trn_xa, trn_y, val_xa, val_y = smoteAdataset(
                trn_x.values, trn_y.values, val_x.values, val_y.values)
            trn_x = pd.DataFrame(data=trn_xa, columns=trn_x.columns)
            val_x = pd.DataFrame(data=val_xa, columns=val_x.columns)

        clf = LGBMClassifier(**params)
        # Custom eval metric bound to the requested classes/weights
        # (the lambda's ``y`` shadows the outer target on purpose).
        loss_fn = lambda y, ypred: lgbm_multi_weighted_logloss(
            y, ypred, classes=classes, class_weights=class_weights)
        clf.fit(trn_x,
                trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric=loss_fn,
                verbose=-1,
                early_stopping_rounds=50,
                sample_weight=trn_y.map(sweights))
        clfs.append(clf)

        # Out-of-fold class probabilities for this fold's validation rows.
        oof_preds[val_, :] = clf.predict_proba(
            val_x, num_iteration=clf.best_iteration_)
        imp_df = pd.DataFrame({
            'feature': full_train.columns,
            'gain': clf.feature_importances_,
            'fold': [fold_ + 1] * len(full_train.columns),
        })
        importances = pd.concat([importances, imp_df], axis=0, sort=False)

    score = multi_weighted_logloss(y_true=y,
                                   y_preds=oof_preds,
                                   classes=classes,
                                   class_weights=class_weights)
    print(f'OOF:{score:.4f} n_folds={nr_fold}, nfeat={full_train.shape[1]}')
    normal_weight_score = multi_weighted_logloss(y, oof_preds)
    if class_weights != CLASS_WEIGHTS:
        print(
            f'OOF Default weights:{normal_weight_score:.4f} n_folds={nr_fold}, '
            f'nfeat={full_train.shape[1]}')
    df_importances = agg_importances(importances)
    oof_df = make_oof_pred_df(oof_preds, columns=clf.classes_)
    return clfs, score, df_importances, oof_df, normal_weight_score
class ensemble:
    '''
    Soft-voting ensemble (XGBoost + LightGBM + RandomForest) over numeric
    profile features, paired with a TF-IDF + MultinomialNB model over the
    user's comment text. Final predictions average the two probability
    streams.
    '''

    def __init__(self, df):
        '''
        Initialize with dataframe to train models on

        Ex. e = ensemble(df)
        '''
        self.df = df
        self.X = df.drop(columns='is_scammer')
        self.y = df['is_scammer']
        # baseline vectorizer parameters
        tfidf = TfidfVectorizer(
            stop_words='english',
            min_df=3,  # min count for relevant vocabulary
            max_features=5000,  # maximum number of features
            strip_accents='unicode',  # replace all accented unicode char by their corresponding ASCII char
            analyzer='word',  # features made of words
            token_pattern=r'[a-zA-Z]{3,}',  # tokenize only words of 3+ chars
            ngram_range=(1, 1),  # features made of a single tokens
            use_idf=True,  # enable inverse-document-frequency reweighting
            smooth_idf=True,  # prevents zero division for unseen words
            sublinear_tf=False)

        # instantiate classifiers for ensemble
        self.rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, oob_score=True)
        self.xgb = XGBClassifier()
        self.lgb = LGBMClassifier()

        # creating ensemble classifier (soft voting averages probabilities)
        self.eclf = VotingClassifier(
            estimators=[('xgb', self.xgb), ('lgb', self.lgb), ('rf', self.rf)],
            voting='soft')

        # create pipeline for vectorizing user's comments for Naive Bayes
        self.model = make_pipeline(tfidf, MultinomialNB())

        # numerical columns to use for rf/gb models
        self.num_cols = ['link_karma', 'comment_karma', 'verified', 'mod', 'gold',
            'days_old', 'total_comments', 'positive', 'neutral',
            'negative', 'mean_comment_length', 'mode_comment_length',
            'median_comment_length', 'duplicate_comments',
            'avg_grammar', 'total_grammar',
            'cap_freq_mean']

    def split(self, random_state=None):
        '''
        Split imported dataframe into a train and test set. Use with train_fit
        and test_predict to tune parameters.
        '''
        # BUG FIX: the random_state argument was previously ignored
        # (train_test_split was always called with random_state=None),
        # making splits non-reproducible; forward it to the splitter.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, random_state=random_state)
        self.X_train_MNB = self.X_train['comments_new']
        self.X_train = self.X_train[self.num_cols]
        self.X_test_MNB = self.X_test['comments_new']
        self.X_test = self.X_test[self.num_cols]

    def train_fit(self):
        '''
        Fits on training data
        '''
        self.eclf.fit(self.X_train, self.y_train)
        self.model.fit(self.X_train_MNB, self.y_train)

    def fit(self):
        '''
        Fits on full dataset for predicting unlabeled data
        '''
        self.eclf.fit(self.X[self.num_cols], self.y)
        self.model.fit(self.X['comments_new'], self.y)

    def test_predict(self):
        '''
        Returns test data prediction probability (mean of ensemble and MNB)
        '''
        y_pred = self.eclf.predict_proba(self.X_test)[:, 1]
        y_pred_MNB = self.model.predict_proba(self.X_test_MNB)[:, 1]
        # BUG FIX: was (y_pred + y_pred_MNB / 2) — operator precedence halved
        # only the MNB term instead of averaging the two probabilities,
        # producing scores that could exceed 1.
        y_final_pred = (y_pred + y_pred_MNB) / 2
        return y_final_pred

    def predict(self, username):
        '''
        Input Reddit username
        Returns prediction probability for new data
        '''
        X = get_user_profile(str(username))
        X_MNB = X['comments_new']
        X = X[self.num_cols]
        y_pred = self.eclf.predict_proba(X)[:, 1]
        y_pred_MNB = self.model.predict_proba(X_MNB)[:, 1]
        # BUG FIX: same precedence error as test_predict — average both streams.
        y_final_pred = (y_pred + y_pred_MNB) / 2
        return y_final_pred

    def rf_predict(self, X):
        '''
        Fit and only return prediction probability for Random Forest Classifier
        '''
        self.rf.fit(self.X_train, self.y_train)
        return self.rf.predict_proba(X)[:, 1]

    def xgb_predict(self, X):
        '''
        Fit and only return prediction probability for XGBoost classifier
        '''
        self.xgb.fit(self.X_train, self.y_train)
        return self.xgb.predict_proba(X)[:, 1]

    def lgb_predict(self, X):
        '''
        Fit and only return prediction probability for LightGBM classifier
        '''
        self.lgb.fit(self.X_train, self.y_train)
        return self.lgb.predict_proba(X)[:, 1]

    def MNB_predict(self, X):
        '''
        Fit and only return prediction probability for Multinomial Naive Bayes classifier
        '''
        self.model.fit(self.X_train_MNB, self.y_train)
        return self.model.predict_proba(X)[:, 1]

    def score(self):
        '''
        Returns Area Under Receiver Operator Characteristic Curve for ensemble method
        '''
        print(f'''ROC AUC score: {roc_auc_score(self.y_test, self.test_predict())}''')
Exemple #13
0
def modeling(all_data):
    """Train a 10-fold LightGBM classifier and write a submission file.

    :param all_data: DataFrame with features plus 'SK_ID_CURR' and 'TARGET'
        ('TARGET' is NaN for the rows to predict).
    :return: DataFrame of per-fold feature importances.
    """
    # LightGBM is strict about feature-name characters; keep alphanumerics only.
    all_data = all_data.rename(
        columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))

    train_df = all_data[all_data['TARGET'].notnull()]
    # FIX: .copy() so the 'TARGET' assignment below does not hit pandas'
    # SettingWithCopy behavior (assigning into a view of all_data).
    test_df = all_data[all_data['TARGET'].isnull()].copy()

    folds = KFold(n_splits=10, shuffle=True, random_state=1001)

    oof_preds = np.zeros(train_df.shape[0])  # out-of-fold train predictions
    sub_preds = np.zeros(test_df.shape[0])   # fold-averaged test predictions
    feature_importance_df = pd.DataFrame()

    feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR']]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'TARGET'].iloc[train_idx]

        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'TARGET'].iloc[valid_idx]

        clf = LGBMClassifier(
            n_jobs=-1,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1,
        )

        clf.fit(train_x,
                train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc',
                verbose=200,
                early_stopping_rounds=200)

        # Out-of-fold validation probabilities at the best boosting round.
        oof_preds[valid_idx] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)[:, 1]
        # Accumulate this fold's test prediction (simple average over folds).
        sub_preds += clf.predict_proba(
            test_df[feats],
            num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

    print('Full AUC score %.6f' %
          roc_auc_score(train_df['TARGET'], oof_preds))

    test_df['TARGET'] = sub_preds
    test_df[['SK_ID_CURR', 'TARGET']].to_csv("dsmlbc1_submission.csv",
                                             index=False)

    display_importances(feature_importance_df)

    return feature_importance_df
Exemple #14
0
class LightGBM(AutoSklearnClassificationAlgorithm):
    """Auto-sklearn wrapper around lightgbm.LGBMClassifier."""

    def __init__(self,
                 n_estimators,
                 learning_rate,
                 num_leaves,
                 max_depth,
                 min_child_samples,
                 subsample,
                 colsample_bytree,
                 random_state=None):
        # n_estimators arrives as a float hyperparameter (see the search
        # space below); cast once here.
        self.n_estimators = int(n_estimators)
        self.learning_rate = learning_rate
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.subsample = subsample
        self.min_child_samples = min_child_samples
        self.colsample_bytree = colsample_bytree

        self.n_jobs = 1
        self.random_state = random_state
        self.estimator = None  # set by fit()

    def fit(self, X, y):
        """Fit the underlying LGBMClassifier and return self."""
        self.estimator = LGBMClassifier(
            num_leaves=self.num_leaves,
            max_depth=self.max_depth,
            learning_rate=self.learning_rate,
            n_estimators=self.n_estimators,
            min_child_samples=self.min_child_samples,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            n_jobs=self.n_jobs,
            # BUG FIX: random_state was stored in __init__ but never
            # forwarded, so runs were not reproducible.
            random_state=self.random_state)
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        """Predict class labels; raises if called before fit()."""
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Predict class probabilities; raises if called before fit()."""
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict_proba(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'LightGBM Classifier',
            'name': 'LightGBM Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'is_deterministic': False,
            'input': (SPARSE, DENSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        cs = ConfigurationSpace()
        n_estimators = UniformFloatHyperparameter("n_estimators",
                                                  100,
                                                  1000,
                                                  default_value=500,
                                                  q=50)
        num_leaves = UniformIntegerHyperparameter("num_leaves",
                                                  31,
                                                  2047,
                                                  default_value=128)
        max_depth = Constant('max_depth', 15)
        learning_rate = UniformFloatHyperparameter("learning_rate",
                                                   1e-3,
                                                   0.3,
                                                   default_value=0.1,
                                                   log=True)
        min_child_samples = UniformIntegerHyperparameter("min_child_samples",
                                                         5,
                                                         30,
                                                         default_value=20)
        subsample = UniformFloatHyperparameter("subsample",
                                               0.7,
                                               1,
                                               default_value=1,
                                               q=0.1)
        colsample_bytree = UniformFloatHyperparameter("colsample_bytree",
                                                      0.7,
                                                      1,
                                                      default_value=1,
                                                      q=0.1)
        cs.add_hyperparameters([
            n_estimators, num_leaves, max_depth, learning_rate,
            min_child_samples, subsample, colsample_bytree
        ])
        return cs
Exemple #15
0
def cv_scores(df, num_folds, params, stratified = False, verbose = -1, 
              save_train_prediction = False, train_prediction_file_name = 'train_prediction.csv',
              save_test_prediction = True, test_prediction_file_name = 'test_prediction.csv'):
    """Cross-validate an LGBMClassifier and optionally save predictions.

    :param df: DataFrame with features + 'TARGET' (NaN marks the test rows).
    :param num_folds: number of CV folds.
    :param params: keyword parameters passed to LGBMClassifier.
    :param stratified: use StratifiedKFold instead of KFold.
    :param verbose: verbosity forwarded to clf.fit.
    :param save_train_prediction / save_test_prediction: write CSVs when True.
    :return: (feature-importance DataFrame,
              [auc_train, auc_test, per-class precision/recall pairs, 0]).
    """
    warnings.simplefilter('ignore')

    clf = LGBMClassifier(**params)

    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = 1001)
    else:
        folds = KFold(n_splits = num_folds, shuffle = True, random_state = 1001)

    # In-fold (train_*) and out-of-fold (test_*) predictions on the train set.
    train_pred = np.zeros(train_df.shape[0])
    train_pred_proba = np.zeros(train_df.shape[0])

    test_pred = np.zeros(train_df.shape[0])
    test_pred_proba = np.zeros(train_df.shape[0])

    prediction = np.zeros(test_df.shape[0])  # averaged test-set prediction

    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]

    df_feature_importance = pd.DataFrame(index = feats)

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        print('Fold', n_fold, 'started at', time.ctime())
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)], eval_metric = 'auc', 
                verbose = verbose, early_stopping_rounds = 200)

        train_pred[train_idx] = clf.predict(train_x, num_iteration = clf.best_iteration_)
        train_pred_proba[train_idx] = clf.predict_proba(train_x, num_iteration = clf.best_iteration_)[:, 1]
        test_pred[valid_idx] = clf.predict(valid_x, num_iteration = clf.best_iteration_)
        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]

        # Average the per-fold models' predictions on the real test rows.
        prediction += \
                clf.predict_proba(test_df[feats], num_iteration = clf.best_iteration_)[:, 1] / folds.n_splits

        df_feature_importance[n_fold] = pd.Series(clf.feature_importances_, index = feats)

        print('Fold %2d AUC : %.6f' % (n_fold, roc_auc_score(valid_y, test_pred_proba[valid_idx])))
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    roc_auc_train = roc_auc_score(train_df['TARGET'], train_pred_proba)
    precision_train = precision_score(train_df['TARGET'], train_pred, average = None)
    recall_train = recall_score(train_df['TARGET'], train_pred, average = None)

    roc_auc_test = roc_auc_score(train_df['TARGET'], test_pred_proba)
    precision_test = precision_score(train_df['TARGET'], test_pred, average = None)
    recall_test = recall_score(train_df['TARGET'], test_pred, average = None)

    print('Full AUC score %.6f' % roc_auc_test)

    df_feature_importance.fillna(0, inplace = True)
    df_feature_importance['mean'] = df_feature_importance.mean(axis = 1)

    # Write prediction files
    if save_train_prediction:
        # FIX: .copy() — assigning into a column slice of train_df raised
        # SettingWithCopyWarning and could silently fail to write.
        df_prediction = train_df[['SK_ID_CURR', 'TARGET']].copy()
        df_prediction['Prediction'] = test_pred_proba
        df_prediction.to_csv(train_prediction_file_name, index = False)
        del df_prediction
        gc.collect()

    if save_test_prediction:
        df_prediction = test_df[['SK_ID_CURR']].copy()  # FIX: same as above
        df_prediction['TARGET'] = prediction
        df_prediction.to_csv(test_prediction_file_name, index = False)
        del df_prediction
        gc.collect()

    return df_feature_importance, \
           [roc_auc_train, roc_auc_test,
            precision_train[0], precision_test[0], precision_train[1], precision_test[1],
            recall_train[0], recall_test[0], recall_train[1], recall_test[1], 0]
Exemple #16
0
from lightgbm import LGBMClassifier, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, chi2
train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)
x = train.drop(columns='class', axis=1)  # new frame with the 'class' column dropped
y = train['class']  # target labels ('class')
TEST = test
train_x, test_x, train_y, test_y = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42)
# hold out 20% of the data as a stratified validation split
evals = [(test_x, test_y)]
lgbm = LGBMClassifier(n_estimators=1000,
                      learning_rate=0.03,
                      max_depth=12,
                      num_leaves=4000,
                      random_state=42,
                      boosting_type="goss")
# early stopping monitors the held-out split; training stops after 20 rounds
# without improvement
lgbm.fit(train_x, train_y, early_stopping_rounds=20, eval_set=evals)
print("acc: {}".format(lgbm.score(train_x, train_y)))  # accuracy on the training split
print("acc: {}".format(lgbm.score(test_x, test_y)))  # accuracy on the held-out split
y_pred = np.argmax(lgbm.predict_proba(TEST), axis=1)  # most-probable class per test row
submission = pd.DataFrame(data=y_pred,
                          columns=sample_submission.columns,
                          index=sample_submission.index)
submission.to_csv('submission5.csv', index=True)
Exemple #17
0
def cv_lgbm_scores(df_, num_folds, params, 
                   target_name = 'TARGET', index_name = 'SK_ID_CURR',
                   stratified = False, rs = 1001, verbose = -1):
    """Cross-validate an LGBMClassifier and return importances, predictions
    and a metrics row for the scores table.

    :param df_: DataFrame with features; rows with NaN target are the test set.
    :param num_folds: number of CV folds.
    :param params: LGBM parameters (cleaned via int_lgbm_params).
    :param target_name / index_name: label and id column names.
    :param stratified: use StratifiedKFold instead of KFold.
    :param rs: random seed for the fold splitter.
    :param verbose: verbosity forwarded to clf.fit.
    :return: (df_feat_imp_, prediction_train, prediction_test, metrics list).
    """
    warnings.simplefilter('ignore')

    # Cleaning and defining parameters for LGBM
    params = int_lgbm_params(params)
    clf = LGBMClassifier(**params, n_estimators = 20000, nthread = 4, n_jobs = -1)

    # Divide in training/validation and test data
    df_train_ = df_[df_[target_name].notnull()]
    df_test_ = df_[df_[target_name].isnull()]
    print("Starting LightGBM cross-validation at {}".format(time.ctime()))
    print("Train shape: {}, test shape: {}".format(df_train_.shape, df_test_.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits = num_folds, shuffle = True, random_state = rs)
    else:
        folds = KFold(n_splits = num_folds, shuffle = True, random_state = rs)

    # In-fold (train_*) and out-of-fold (test_*) predictions on the train set.
    train_pred = np.zeros(df_train_.shape[0])
    train_pred_proba = np.zeros(df_train_.shape[0])

    test_pred = np.zeros(df_train_.shape[0])
    test_pred_proba = np.zeros(df_train_.shape[0])

    prediction = np.zeros(df_test_.shape[0]) # prediction for test set

    feats = df_train_.columns.drop([target_name, index_name])

    df_feat_imp_ = pd.DataFrame(index = feats)

    # Cross-validation cycle
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train_[feats], df_train_[target_name])):
        print('--- Fold {} started at {}'.format(n_fold, time.ctime()))

        train_x, train_y = df_train_[feats].iloc[train_idx], df_train_[target_name].iloc[train_idx]
        valid_x, valid_y = df_train_[feats].iloc[valid_idx], df_train_[target_name].iloc[valid_idx]

        clf.fit(train_x, train_y, 
                eval_set = [(valid_x, valid_y)], eval_metric = 'auc', 
                verbose = verbose, early_stopping_rounds = 100)

        train_pred[train_idx] = clf.predict(train_x, num_iteration = clf.best_iteration_)
        train_pred_proba[train_idx] = clf.predict_proba(train_x, num_iteration = clf.best_iteration_)[:, 1]
        test_pred[valid_idx] = clf.predict(valid_x, num_iteration = clf.best_iteration_)
        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]

        # Average the per-fold models' predictions on the real test rows.
        prediction += clf.predict_proba(df_test_[feats], 
                                        num_iteration = clf.best_iteration_)[:, 1] / folds.n_splits

        df_feat_imp_[n_fold] = pd.Series(clf.feature_importances_, index = feats)

        del train_x, train_y, valid_x, valid_y
        gc.collect()

    # Computation of metrics
    roc_auc_train = roc_auc_score(df_train_[target_name], train_pred_proba)
    precision_train = precision_score(df_train_[target_name], train_pred, average = None)
    recall_train = recall_score(df_train_[target_name], train_pred, average = None)

    roc_auc_test = roc_auc_score(df_train_[target_name], test_pred_proba)
    precision_test = precision_score(df_train_[target_name], test_pred, average = None)
    recall_test = recall_score(df_train_[target_name], test_pred, average = None)

    print('Full AUC score {:.6f}'.format(roc_auc_test))

    # Filling the feature_importance table
    df_feat_imp_.fillna(0, inplace = True)
    df_feat_imp_['mean'] = df_feat_imp_.mean(axis = 1)

    # Preparing results of prediction for saving
    # FIX: .copy() — assigning a new column into a slice of df_train_/df_test_
    # raised SettingWithCopyWarning and could silently fail to write.
    prediction_train = df_train_[[index_name]].copy()
    prediction_train[target_name] = test_pred_proba
    prediction_test = df_test_[[index_name]].copy()
    prediction_test[target_name] = prediction

    del df_train_, df_test_
    gc.collect()

    # Returning the results and metrics in format for scores' table
    return df_feat_imp_, prediction_train, prediction_test, \
           [roc_auc_train, roc_auc_test,
            precision_train[0], precision_test[0], precision_train[1], precision_test[1],
            recall_train[0], recall_test[0], recall_train[1], recall_test[1], 0]
Exemple #18
0
                         num_leaves=85,
                         max_depth=15,
                         learning_rate=0.003,
                         n_estimators=3677,
                         subsample_for_bin=400000,
                         objective="binary",
                         min_split_gain=0.0,
                         min_child_weight=0.01,
                         min_child_samples=50,
                         subsample=0.8,
                         subsample_freq=1,
                         colsample_bytree=0.7,
                         reg_alpha=5.0,
                         reg_lambda=0.0,
                         silent=True)

# 5-fold out-of-fold (OOF) stacking predictions with the model_1 classifier
# whose definition is truncated just above this fragment.
kf = KFold(n_splits=5)
for n_fold, (train_index, test_index) in enumerate(kf.split(train_X)):
    print n_fold  # NOTE(review): Python 2 print statement — this fragment predates Python 3
    X_train, X_test = train_X.iloc[train_index], train_X.iloc[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    model_1.fit(X_train, y_train)
    #prediction = model_1.predict_proba(X_test)
    #train_score.append(prediction[:,1])
    # out-of-fold positive-class probabilities for this fold's holdout rows
    oof_train[test_index] = model_1.predict_proba(X_test)[:, 1]
    # this fold's prediction on the full test set, averaged below
    oof_test_skf[n_fold, :] = model_1.predict_proba(test_X)[:, 1]

# average the five per-fold test predictions into the final test score
oof_test[:] = oof_test_skf.mean(axis=0)
te['buy'] = oof_test
tr['buy'] = oof_train
def tr_managerskill(train, test, y, folds, cache_file):
    """Compute per-manager 'skill' features via out-of-fold LightGBM
    predictions on a one-hot manager_id matrix, with caching.

    Returns (train_out, test_out, y, folds, cache_file) so it can be chained
    with the other feature-engineering steps.
    """
    print("\n\n############# Manager skill step ################")
    cache_key_train = 'managerskill_train'
    cache_key_test = 'managerskill_test'

    #Check if cache file exist and if data for this step is cached
    dict_train, dict_test = load_from_cache(cache_file, cache_key_train,
                                            cache_key_test)
    if dict_train is not None and dict_test is not None:
        train_out = train.assign(**dict_train)
        test_out = test.assign(**dict_test)
        return train_out, test_out, y, folds, cache_file

    print('# No cache detected, computing from scratch #')
    lb = LabelBinarizer(sparse_output=True)
    lb.fit(list(train['manager_id'].values) + list(test['manager_id'].values))

    X_train_mngr = lb.transform(train['manager_id']).astype(np.float32)
    X_test_mngr = lb.transform(test['manager_id']).astype(np.float32)

    le = LabelEncoder()
    y_encode = le.fit_transform(y)

    # Separate train in train + validation data
    X_train, X_val, y_train, y_val = train_test_split(X_train_mngr,
                                                      y_encode,
                                                      test_size=0.2,
                                                      random_state=42)

    # train
    # BUG FIX: colsample_bytree and subsample were passed as the strings
    # '0.8' rather than the numeric fractions LightGBM expects.
    gbm = LGBMClassifier(n_estimators=2048,
                         seed=42,
                         objective='multiclass',
                         colsample_bytree=0.8,
                         subsample=0.8)

    # Predict out-of-folds train data
    print('Start training - Number of folds: ', len(folds))
    train_predictions = out_of_fold_predict(gbm, X_train_mngr, y_encode, folds)

    mngr_train_names = {
        'mngr_' + le.classes_[0]: [row[0] for row in train_predictions],
        'mngr_' + le.classes_[1]: [row[1] for row in train_predictions],
        'mngr_' + le.classes_[2]: [row[2] for row in train_predictions],
    }
    # skill = 2 * P(high) + P(medium)
    mngr_train_names['mngr_skill'] = [
        2 * h + m for (h, m) in zip(mngr_train_names['mngr_high'],
                                    mngr_train_names['mngr_medium'])
    ]

    gbm.fit(X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='multi_logloss',
            early_stopping_rounds=50,
            verbose=False)

    # Now validate the predict value using the previously split validation set
    print('Start validating Manager skill...')
    # predict
    y_pred = gbm.predict_proba(X_val, num_iteration=gbm.best_iteration)
    # eval
    print('We stopped at boosting round: ', gbm.best_iteration)
    print('The mlogloss of prediction is:', mlogloss(y_val, y_pred))

    # Now compute the value for the actual test data using out-of-folds predictions
    print('Start predicting Manager skill...')
    test_predictions = gbm.predict_proba(X_test_mngr,
                                         num_iteration=gbm.best_iteration)

    mngr_test_names = {
        'mngr_' + le.classes_[0]: [row[0] for row in test_predictions],
        'mngr_' + le.classes_[1]: [row[1] for row in test_predictions],
        'mngr_' + le.classes_[2]: [row[2] for row in test_predictions]
    }
    mngr_test_names['mngr_skill'] = [
        2 * h + m for (h, m) in zip(mngr_test_names['mngr_high'],
                                    mngr_test_names['mngr_medium'])
    ]

    print('Caching features in ' + cache_file)
    save_to_cache(cache_file, cache_key_train, cache_key_test,
                  mngr_train_names, mngr_test_names)

    print('Adding features to dataframe')
    train_out = train.assign(**mngr_train_names)
    test_out = test.assign(**mngr_test_names)

    return train_out, test_out, y, folds, cache_file
Exemple #20
0
def lgbm_modeling_cross_validation(params,
                                   full_train,
                                   y,
                                   classes,
                                   class_weights,
                                   nr_fold=10,
                                   random_state=7):
    """Stratified K-fold LightGBM training with SMOTE resampling.

    :param params: keyword arguments for LGBMClassifier.
    :param full_train: feature DataFrame, one row per sample.
    :param y: Series of raw class labels (remapped to 0..K-1 below).
    :param classes: class labels for the weighted-logloss metric.
    :param class_weights: per-class weights for the metric.
    :param nr_fold: number of stratified folds.
    :param random_state: seed for the fold splitter.
    :return: (list of fitted classifiers, OOF weighted logloss, OOF predictions).
    """
    # Remap arbitrary labels to contiguous integers 0..K-1.
    unique_y = np.unique(y)
    class_map = dict()
    for i, val in enumerate(unique_y):
        class_map[val] = i

    # y = np.array([class_map[val] for val in y])
    y = y.apply(lambda x: class_map[x])

    # Compute weights: inverse class frequency for sample weighting.
    w = y.value_counts()
    weights = {i: np.sum(w) / w[i] for i in w.index}

    clfs = []
    importances = pd.DataFrame()
    folds = StratifiedKFold(n_splits=nr_fold,
                            shuffle=True,
                            random_state=random_state)

    oof_preds = np.zeros((len(full_train), np.unique(y).shape[0]))
    # Splitting on (y, y): StratifiedKFold only uses the labels for
    # stratification, so passing y as X too is sufficient here.
    for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
        trn_x, trn_y = full_train.iloc[trn_], y.iloc[trn_]
        val_x, val_y = full_train.iloc[val_], y.iloc[val_]

        # NOTE(review): SMOTE is applied to the validation fold as well. If
        # smoteAdataset resamples or reorders val_x/val_y, the assignment
        # oof_preds[val_, :] below no longer aligns row-for-row with the
        # original validation indices — confirm smoteAdataset returns the
        # validation set unchanged.
        trn_xa, trn_y, val_xa, val_y = smoteAdataset(trn_x.values,
                                                     trn_y.values,
                                                     val_x.values,
                                                     val_y.values)
        trn_x = pd.DataFrame(data=trn_xa, columns=trn_x.columns)

        val_x = pd.DataFrame(data=val_xa, columns=val_x.columns)

        clf = LGBMClassifier(**params)
        clf.fit(trn_x,
                trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric=lgbm_multi_weighted_logloss,
                verbose=100,
                early_stopping_rounds=50,
                sample_weight=trn_y.map(weights))

        # Tag the model so downstream ensembling code can identify it.
        clf.my_name = "lgbm"

        clfs.append(clf)

        oof_preds[val_, :] = clf.predict_proba(
            val_x)  #, num_iteration=clf.best_iteration_)
        print('no {}-fold loss: {}'.format(
            fold_ + 1,
            multi_weighted_logloss(val_y, oof_preds[val_, :], classes,
                                   class_weights)))

        imp_df = pd.DataFrame({
            'feature': full_train.columns,
            'gain': clf.feature_importances_,
            'fold': [fold_ + 1] * len(full_train.columns),
        })
        importances = pd.concat([importances, imp_df], axis=0, sort=False)

    # Overall out-of-fold score with the competition weighting.
    score = multi_weighted_logloss(y_true=y,
                                   y_preds=oof_preds,
                                   classes=classes,
                                   class_weights=class_weights)
    print('MULTI WEIGHTED LOG LOSS: {:.5f}'.format(score))
    df_importances = save_importances(importances_=importances)
    df_importances.to_csv('lgbm_importances.csv', index=False)

    cnf = confusion_matrix(y, np.argmax(oof_preds, axis=1))
    plot_confusion_matrix(cnf,
                          classes=classes,
                          normalize=True,
                          filename="lgbm")

    return clfs, score, oof_preds
Exemple #21
0

def q1(x):
    """Aggregation helper: the lower quartile (25th percentile) of *x*."""
    lower_quartile = 0.25
    return x.quantile(lower_quartile)


def q2(x):
    """Aggregation helper: the upper quartile (75th percentile) of *x*."""
    upper_quartile = 0.75
    return x.quantile(upper_quartile)


# Aggregate per-id statistics (max/min/mean plus the 25th and 75th
# percentiles via the q1/q2 helpers above) as model features.
grouped = train[features].groupby('id')
X_train = grouped.agg(['max', 'min', 'mean', q1, q2])
X_test = test[features].groupby('id').agg(['max', 'min', 'mean', q1, q2])
y_train = train_label['label']

from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

# Train LightGBM on the aggregated features.
lgbm_wrapper = LGBMClassifier(n_estimators=400)
lgbm_wrapper.fit(X_train.values, y_train)

preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]

# Full class-probability matrix used to fill the submission columns.
y_pred = lgbm_wrapper.predict_proba(X_test)

submission.iloc[:, 1:] = y_pred
submission

submission.to_csv('lightgbm_q1q2.csv', index=False)
Exemple #22
0
# Evaluate several baseline classifiers on the standardized features and
# report metrics via get_scores(y_true, hard_preds, positive_probs).
# (rfc was fitted earlier; its definition is truncated in this chunk.)
rfc_predict = rfc.predict(X_test_std)
rfc_predict_proba = rfc.predict_proba(X_test_std)[:, 1]
get_scores(y_test, rfc_predict, rfc_predict_proba)
print('')

#GBDT
print('GBDT:')
gdbt = GradientBoostingClassifier(random_state=2018)
gdbt.fit(X_train_std, y_train)
gdbt_predict = gdbt.predict(X_test_std)
gdbt_predict_proba = gdbt.predict_proba(X_test_std)[:, 1]
get_scores(y_test, gdbt_predict, gdbt_predict_proba)
print('')

#XGBoost
print('XGBoost:')
xgbs = XGBClassifier(random_state=2018)
xgbs.fit(X_train_std, y_train)
xgbs_predict = xgbs.predict(X_test_std)
xgbs_predict_proba = xgbs.predict_proba(X_test_std)[:, 1]
get_scores(y_test, xgbs_predict, xgbs_predict_proba)
print('')

#LightGBM
print('LightGBM:')
lgbm = LGBMClassifier(random_state=2018)
lgbm.fit(X_train_std, y_train)
lgbm_predict = lgbm.predict(X_test_std)
lgbm_predict_proba = lgbm.predict_proba(X_test_std)[:, 1]
# BUG FIX: was get_scores(y_test, lgbm_predict, lr_predict_pro) — a copy/paste
# slip that scored LightGBM using the logistic-regression probabilities.
get_scores(y_test, lgbm_predict, lgbm_predict_proba)
# Flatten the label array for the classifier.
Y = Y.reshape(len(Y))

import xgboost as xgb
from lightgbm import LGBMClassifier

model = LGBMClassifier(random_state=1, n_estimators=40, reg_lambda=1, reg_alpha=1)
model.fit(X, Y)

# Rebuild the engineered features for the test file.
test_data = pd.read_csv("data/test.csv", index_col=0)
test_data = test_data.fillna(0)
features = test_data
# Express each quote level relative to the last traded price.
features["mid"] = (features["mid"] - features["last_price"]) / features["last_price"]
features["bid1"] = (features["bid1"] - features["last_price"]) / features["last_price"]
features["ask1"] = (features["ask1"] - features["last_price"]) / features["last_price"]
features["bid2"] = (features["bid2"] - features["last_price"]) / features["last_price"]
features["ask2"] = (features["ask2"] - features["last_price"]) / features["last_price"]

features = features[["transacted_qty", "d_open_interest", "mid", "bid1", "bid2", "ask1", "ask2",
                     "bid1vol", "bid2vol", "bid3vol", "bid4vol", "bid5vol", "ask1vol", "ask2vol", "ask3vol", "ask4vol",
                     "ask5vol"]]

# Price x volume interaction terms.
features["bidcross1"] = features["bid1"] * features["bid1vol"]
features["bidcross2"] = features["bid2"] * features["bid2vol"]
features["askcross1"] = features["ask1"] * features["ask1vol"]
features["askcross2"] = features["ask2"] * features["ask2vol"]

# NOTE(review): assumes `scalar` (defined in a truncated section above) was
# fitted on identically engineered and ordered training columns — confirm.
X = scalar.transform(features.values)
df_test = pd.read_csv('data/test.csv', index_col=0)
df_test['Predicted'] = model.predict_proba(X)[:, 1]
df_test[['Predicted']].to_csv('submission.csv')
# NOTE(review): relies on `scaler`, `X_test`, `test_x`, `X_train_scaled`,
# `y_test`, and a fitted `LGBM` model defined earlier in the script.
X_test_scaled = scaler.transform(X_test)
test_x_scaled = scaler.transform(test_x)
print(X_train_scaled)

# Visualization of the standardized training features
import matplotlib.pyplot as plt
plt.hist(X_train_scaled)
plt.title('StandardScaler')
plt.show()

# Measure accuracy on the hold-out set
acc = LGBM.score(X_test, y_test)
print('acc: ', acc)  # 0.8454961374034351

# Predict class probabilities for the submission features
# NOTE(review): predicts on the unscaled `test_x` even though
# `test_x_scaled` was computed above — confirm which was intended.
y_pred = LGBM.predict_proba(test_x)
print(y_pred)

# Plot feature importances
import numpy as np
import matplotlib.pyplot as plt


def plot_feature_importances_orb(model):
    """Plot per-feature importances of *model* as a horizontal bar chart.

    Fix: the original ignored its ``model`` argument and always plotted the
    global ``LGBM`` estimator's importances; it now uses the model it is given.

    Relies on the module-level ``train_x`` (feature matrix, for the feature
    count) and ``feat_labels`` (feature names, for the y-axis ticks).
    """
    n_features = train_x.shape[1]
    plt.barh(np.arange(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), feat_labels)
    plt.xlabel("feature importance")
    plt.ylabel("feature")
    plt.ylim(-1, n_features)
Exemple #25
0
def kfold_lightgbm(df, num_folds, lgb_param, stratified=False, debug=False):
    """Train LightGBM with K-fold CV and collect out-of-fold predictions.

    Rows of ``df`` with a non-null TARGET are the train set, null-TARGET
    rows the test set.  Each fold trains with early stopping, fills the
    out-of-fold probabilities, and accumulates averaged test predictions.
    Unless ``debug`` is set, the averaged test predictions are written to
    the global ``submission_file_name``.  Returns the full out-of-fold AUC.
    """
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    # Choose the splitter; the fixed seed keeps folds reproducible.
    if stratified:
        splitter = StratifiedKFold(n_splits=num_folds,
                                   shuffle=True,
                                   random_state=50)
    else:
        splitter = KFold(n_splits=num_folds, shuffle=True, random_state=50)

    # Accumulators for out-of-fold / test predictions and importances.
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    excluded = {'TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index'}
    feats = [col for col in train_df.columns if col not in excluded]

    for fold_no, (trn_idx, val_idx) in enumerate(
            splitter.split(train_df[feats], train_df['TARGET'])):
        trn_x = train_df[feats].iloc[trn_idx]
        trn_y = train_df['TARGET'].iloc[trn_idx]
        val_x = train_df[feats].iloc[val_idx]
        val_y = train_df['TARGET'].iloc[val_idx]

        # Hyperparameters are supplied by the caller.
        clf = LGBMClassifier(**lgb_param)
        clf.fit(trn_x,
                trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric='auc',
                verbose=100,
                early_stopping_rounds=200)

        # Score at the early-stopped iteration.
        oof_preds[val_idx] = clf.predict_proba(
            val_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(
            test_df[feats],
            num_iteration=clf.best_iteration_)[:, 1] / splitter.n_splits

        fold_imp = pd.DataFrame({"feature": feats,
                                 "importance": clf.feature_importances_,
                                 "fold": fold_no + 1})
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_imp], axis=0)
        logging.info(
            'Fold %2d AUC : %.6f' %
            (fold_no + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()

    full_auc = roc_auc_score(train_df['TARGET'], oof_preds)
    print('Full AUC score %.6f' % full_auc)
    # Write submission file unless running in debug mode.
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name,
                                                 index=False)
    return full_auc
Exemple #26
0
    time2_1 = time.time()
    print('xgboost计算时间', time2_1 - time2_0)

# Model with LightGBM (flag 3 selects LightGBM only; flag 0 runs all models)
if flag == 3 or flag == 0:
    print("开始lgbm训练")
    time3_0 = time.time()
    # NOTE(review): 'depth' is not an LGBMClassifier parameter (the sklearn
    # API uses max_depth); it is swallowed by **kwargs and has no
    # depth-limiting effect — confirm max_depth=12 was intended.
    lgb = LGBMClassifier(objective='binary',
                         learning_rate=0.02,
                         n_estimators=100,
                         num_leaves=45,
                         depth=12,
                         colsample_bytree=0.8,
                         min_child_samples=14,
                         subsample=0.9)
    lgb.fit(x_train, y_train)
    # Log-loss on both splits to gauge over-/under-fitting.
    test_lgb_prob = lgb.predict_proba(x_test)
    train_lgb_prob = lgb.predict_proba(x_train)
    print('lightgbm的训练集log损失', log_loss(y_train, train_lgb_prob))
    print('lightgbm的测试集集log损失', log_loss(y_test, test_lgb_prob))
    time3_1 = time.time()
    print('lightgbm计算时间', time3_1 - time3_0)
# Disabled block below (kept as a bare string literal): scores the online
# test set and writes result.csv.
'''
#验证集输出结果,线上测试
import getFearures01
path_test = '../data/round1_ijcai_18_test_b_20180418.txt'
test_df = getFearures01.cpfeature(path_test)
test_pre = lgb.predict_proba(test_df)
result = pd.DataFrame({'instance_id':test_df['instance_id'],'predicted_score':test_pre[:,1]})
result.to_csv('./result.csv',sep=' ',header=True,index=None)'''
Exemple #27
0
def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    """Run K-fold (optionally stratified) LightGBM training on ``df``.

    Rows with a non-null TARGET form the train set, null-TARGET rows the
    test set.  Out-of-fold probabilities are scored with AUC, per-fold and
    overall AUC are appended to the file named by the global ``filename``,
    and (unless ``debug``) averaged test predictions are written to the
    global ``submission_file_name``.  Returns the per-fold
    feature-importance DataFrame.
    """
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model (fixed seed for reproducible folds)
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=47)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    # Exclude the label and identifier columns from the feature set.
    feats = [
        f for f in train_df.columns if f not in
        ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    ]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=8,
            #is_unbalance=True,
            n_estimators=10000,
            learning_rate=0.1,
            num_leaves=32,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.04,
            reg_lambda=0.073,
            min_split_gain=0.0222415,
            min_child_weight=40,
            silent=-1,
            verbose=-1,
            #scale_pos_weight=11
        )

        # Early stopping keeps the large n_estimators cap effectively adaptive.
        clf.fit(train_x,
                train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_metric='auc',
                verbose=100,
                early_stopping_rounds=200)

        # Predict at the early-stopped iteration; test predictions are
        # averaged over the folds.
        oof_preds[valid_idx] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(
            test_df[feats],
            num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        # Append the fold score to the run log (global `filename`).
        with open(filename, 'a') as f:
            f.write(
                f"Fold {n_fold+1} AUC: {roc_auc_score(valid_y, oof_preds[valid_idx]):.6f}\n"
            )
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    with open(filename, 'a') as f:
        f.write(
            f"Full AUC: {roc_auc_score(train_df['TARGET'], oof_preds):.6f}\n")
    # Write submission file and plot feature importance
    if not debug:
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name,
                                                 index=False)
    display_importances(feature_importance_df)
    return feature_importance_df
Exemple #28
0
# Candidate probability thresholds: 1, 1/2, 1/3, ..., 1/19.
eval_ps = [1 / i for i in range(1, 20)]
# Result grid indexed by max_depth (rows) and threshold (columns).
# NOTE(review): `res` is never filled within this excerpt — confirm the
# assignment happens further down in the original script.
res = pd.DataFrame([], index=mds, columns=eval_ps)

# Sweep over candidate max_depth values (`mds` is defined earlier).
for md in mds:
    learner = LGBMClassifier(n_estimators=10000, max_depth=md)

    # Early stopping on the eval split keeps the large n_estimators cap
    # effectively adaptive.
    learner.fit(trainset.drop("reordered", axis=1),
                trainset.reordered,
                eval_metric="auc",
                early_stopping_rounds=10,
                eval_set=[(trainset.drop("reordered",
                                         axis=1), trainset.reordered),
                          (evalset.drop("reordered",
                                        axis=1), evalset.reordered)])

    # Positive-class probability for every eval row.
    preds = learner.predict_proba(evalset.drop("reordered", axis=1))[:, -1]

    # For each threshold p, build per-user predicted product sets and
    # compare against the true sets in the global `real`.
    for p in eval_ps:
        ppreds = evalset[preds > p]
        ppreds = ppreds.groupby("user_id").product_id.apply(set)
        ppreds.name = "preds"
        real.name = "real"

        comp = pd.concat([real, ppreds], axis=1)
        # Users with no real/predicted products get a sentinel set {0} so
        # that set operations below do not hit NaN.
        temp = pd.Series([set([0])] * comp.shape[0], index=comp.index)
        comp.real.fillna(temp, inplace=True)
        comp.preds.fillna(temp, inplace=True)
        # True positives, precision ("acc") and recall per user.
        comp["tp"] = comp.apply(lambda x: len(x["real"].intersection(x.preds)),
                                axis=1)
        comp["acc"] = comp.tp / comp["preds"].apply(len)
        comp["recall"] = comp.tp / comp["real"].apply(len)
Exemple #29
0
# (stratified) Cross validation
# NOTE(review): `kf`, `model`, `log_loss_val`, `y_pred`, and `test` are
# defined earlier in the script.
for train_index, validation_index in kf.split(X, y):
    print("Cross-validation, Fold %d" % (len(log_loss_val) + 1))

    # Split data into training and testing set
    X_train = X.iloc[train_index, :].copy()
    X_validate = X.iloc[validation_index, :].copy()
    y_train = y[train_index]
    y_validate = y[validation_index]

    # Train the model (refit on this fold's training portion)
    model = model.fit(X_train, y_train)

    # Test the model on the held-out fold
    log_loss_val.append(log_loss(y_validate, model.predict_proba(X_validate)))
    print("Log loss: %f" % log_loss_val[-1])

    # Make predictions: positive-class probability on the test set per fold
    y_pred.append(model.predict_proba(test[X.columns])[:, 1])

    # delete temporal dataframes
    del X_train, X_validate, y_train, y_validate

# Evaluate results from CV (mean +/- 2 std across folds)
print("Log loss %f +/- %f" % (np.mean(log_loss_val), 2 * np.std(log_loss_val)))

## =========================== 4. Output results =========================== ##
# Create output dataframes
submission = pd.DataFrame({
    'msno': test.msno,
Exemple #30
0
def kfold_lightgbm(df, debug=False):
    """10-fold LightGBM training with fixed, Bayesian-optimized parameters.

    Splits ``df`` into train (TARGET not null) / test (TARGET null) rows,
    records per-fold AUC and best iteration, pickles the feature
    importances and a final model refit on all training rows, and (unless
    ``debug``) writes the averaged test predictions to
    outputs/predictions/reference_submission.csv.  Returns the per-fold
    feature-importance DataFrame.
    """
    # Divide in training/validation and test data

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    folds = KFold(n_splits=10, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])  # predicted valid_y
    sub_preds = np.zeros(test_df.shape[0])  # submission preds
    feature_importance_df = pd.DataFrame()  # feature importance

    fold_auc_best_df = pd.DataFrame(columns=["FOLD", "AUC", "BEST_ITER"])  # holding best iter to save model
    # Exclude the label and all identifier/index bookkeeping columns.
    feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index',
                                                      "APP_index", "BURO_index", "PREV_index", "INSTAL_index",
                                                      "CC_index", "POS_index"]]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            n_jobs=-1,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        # Early stopping keeps the large n_estimators cap adaptive per fold.
        clf.fit(train_x,
                train_y,
                eval_set=[(train_x, train_y),
                          (valid_x, valid_y)],
                eval_metric='auc',
                verbose=200,
                early_stopping_rounds=200)

        # predicted valid_y (out-of-fold, at the early-stopped iteration)
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

        # submission preds: predict the test set for every fold and average
        # over all the folds.
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        # fold, auc and best iteration
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

        # best auc & iteration
        # NOTE(review): DataFrame.append was removed in pandas 2.x — this
        # requires pandas < 2 (use pd.concat on newer versions).
        fold_auc_best_df = fold_auc_best_df.append({'FOLD': int(n_fold + 1),
                                                    'AUC': roc_auc_score(valid_y, oof_preds[valid_idx]),
                                                    "BEST_ITER": clf.best_iteration_}, ignore_index=True)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # OUTPUTS
    print(fold_auc_best_df)
    print(feature_importance_df)

    # persist the feature importances and fold results as pickled DataFrames
    feature_importance_df.to_pickle("outputs/features/feature_importance_df.pkl")
    fold_auc_best_df.to_pickle("outputs/features/fold_auc_best_df.pkl")

    # Final Model: refit on all training rows for as many iterations as the
    # best-AUC fold's early-stopped iteration count.
    best_iter_1 = int(fold_auc_best_df.sort_values(by="AUC", ascending=False)[:1]["BEST_ITER"].values)

    y_train = train_df["TARGET"]
    x_train = train_df[feats]

    final_model = LGBMClassifier(
        n_jobs=-1,
        n_estimators=best_iter_1,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        silent=-1,
        verbose=-1).fit(x_train, y_train)

    # Pickle the refit model, restoring the working directory afterwards.
    cur_dir = os.getcwd()
    os.chdir('models/reference/')
    pickle.dump(final_model, open("lightgbm_final_model.pkl", 'wb'))  # model
    os.chdir(cur_dir)

    # The per-fold predicted valid_y values are out-of-fold predictions of
    # different parts of the train set's targets, so scoring them together
    # gives the full train (validation) AUC.
    cowsay.cow('Full Train(Validation) AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        cur_dir = os.getcwd()
        os.chdir('outputs/predictions/')
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv("reference_submission.csv", index=False)
        os.chdir(cur_dir)
    display_importances(feature_importance_df)
    del x_train, y_train

    return feature_importance_df
def OOFPreds(X, y, test_X, params, n_splits=5, random_state=23, clf='lgb'):
    """Stratified K-fold out-of-fold training.

    Expects pandas DataFrames/Series (X, y, test_X); returns
    ``(oof_preds, sub_preds, feature_importance, metrics)`` where the two
    prediction objects are Series named 'TARGET' aligned to X / test_X.

    Fixes vs. the original:
      * ``gbm.fit`` referenced ``trn_init_score`` / ``val_init_score``
        whose construction had been commented out, raising NameError at
        runtime — the init-score arguments are dropped.
      * ``DataFrame.append`` (removed in pandas 2.0) replaced by
        ``pd.concat``.
      * stray line-continuation backslash and the unused ``oof_train``
        array removed; the LogisticRegression instance no longer shadows
        the ``clf`` selector argument.
    """
    # Collected per-fold feature importances for later analysis.
    feature_importance = pd.DataFrame(columns=['feature', 'importance', 'fold'])

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # oof_preds: cross-validated predictions on X; sub_preds: test-set
    # predictions averaged over the folds.
    oof_preds, sub_preds = np.zeros(X.shape[0]), np.zeros(test_X.shape[0])

    print(X.shape, test_X.shape)

    valid_scores = []
    train_scores = []

    for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        trn_x, trn_y = X.iloc[trn_idx], y.iloc[trn_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

        if clf == 'lgb':
            with timer('{} fold 训练时间:'.format(n_fold)) as time:
                gbm = LGBMClassifier(**params)
                gbm.fit(trn_x, trn_y,
                        eval_set=[(trn_x, trn_y), (val_x, val_y)],
                        eval_metric='auc', verbose=30, early_stopping_rounds=100)

                print('best iteration: {}'.format(gbm.best_iteration_))
                # NOTE(review): assumes the project `timer` context yields a
                # numeric elapsed time — confirm against its definition.
                print('100单次训练时间: {:.3f}'.format(time*100/gbm.best_iteration_))

                # Predict at the early-stopped iteration.
                pred_val = gbm.predict_proba(val_x, num_iteration=gbm.best_iteration_)[:, 1]
                pred_test = gbm.predict_proba(test_X, num_iteration=gbm.best_iteration_)[:, 1]

            oof_preds[val_idx] = pred_val
            sub_preds += pred_test / folds.n_splits

            print(gbm.best_score_)

            valid_score = gbm.best_score_['valid_1']['auc']
            train_score = gbm.best_score_['training']['auc']

            valid_scores.append(valid_score)
            train_scores.append(train_score)

            feature_importance = pd.concat([feature_importance, pd.DataFrame({
                'importance': gbm.feature_importances_,
                'fold': [n_fold + 1] * X.shape[1],
                'feature': X.columns.tolist()})])

        else:
            # Fallback model: logistic regression with the same bookkeeping
            # (predictions, scores, and coefficient-based "importances").
            model = LogisticRegression(**params)
            model.fit(trn_x, trn_y)

            pred_train = model.predict_proba(trn_x)[:, 1]
            pred_val = model.predict_proba(val_x)[:, 1]
            pred_test = model.predict_proba(test_X)[:, 1]

            oof_preds[val_idx] = pred_val
            sub_preds += pred_test / folds.n_splits

            valid_score = roc_auc_score(val_y, pred_val)
            train_score = roc_auc_score(trn_y, pred_train)

            valid_scores.append(valid_score)
            train_scores.append(train_score)

            feature_importance = pd.concat([feature_importance, pd.DataFrame({
                'importance': model.coef_[0],
                'fold': [n_fold + 1] * X.shape[1],
                'feature': X.columns.tolist()})])

        print('Fold {:02d} 训练集 AUC: {:.6f} 验证集 AUC: {:.6f} '.format(n_fold + 1, train_score, valid_score))
        del trn_x, trn_y, val_x, val_y
        gc.collect()

    feature_importance['importance'] = feature_importance['importance'].astype(float)

    # One row per fold plus an 'overall' summary row.
    fold_names = list(range(folds.n_splits))
    fold_names.append('overall')

    valid_auc = roc_auc_score(y, oof_preds)

    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})

    oof_preds = pd.Series(oof_preds.flatten(), index=X.index).rename('TARGET')
    sub_preds = pd.Series(sub_preds.flatten(), index=test_X.index).rename('TARGET')

    return oof_preds, sub_preds, feature_importance, metrics
Exemple #32
0
def clean_data(data):
    """Prune uninformative columns from *data* in place and return it.

    Four passes: (1) constant columns, (2) columns whose distribution does
    not differ between the two TARGET classes, (3) columns distributed
    differently between train (TARGET not null) and test rows, and
    (4) columns a LightGBM classifier never uses.  Relies on the
    module-level helper ``corr_feature_with_target``.
    """
    warnings.simplefilter(action = 'ignore')

    # Pass 1: columns with at most one distinct value carry no signal.
    uniques = data.nunique()
    constant_cols = list(uniques[uniques <= 1].index)

    data.drop(constant_cols, axis = 1, inplace = True)
    print('After removing empty features there are {0:d} features'.format(data.shape[1]))

    # Pass 2: per-column difference / p-value against TARGET on labeled rows.
    corr = pd.DataFrame(index = ['diff', 'p'])
    labeled_idx = data[data['TARGET'].notnull()].index

    for col in data.columns.drop('TARGET'):
        corr[col] = corr_feature_with_target(data.loc[labeled_idx, col], data.loc[labeled_idx, 'TARGET'])

    corr = corr.T
    corr['diff_norm'] = abs(corr['diff'] / data.mean(axis = 0))

    # Drop columns with no class difference, then weakly-different ones;
    # SK_ID_CURR is always kept as the row identifier.
    same_dist = corr[((corr['diff'] == 0) & (corr['p'] > .05))].index
    weak_dist = corr[((corr['diff_norm'] < .5) & (corr['p'] > .05))].drop(same_dist).index
    to_del = list(same_dist) + list(weak_dist)
    if 'SK_ID_CURR' in to_del:
        to_del.remove('SK_ID_CURR')

    data.drop(to_del, axis = 1, inplace = True)
    print('After removing features with the same distribution on 0 and 1 classes there are {0:d} features'.format(data.shape[1]))

    # Pass 3: columns that separate train rows from test rows (covariate
    # shift) while showing no class difference in Pass 2.
    corr_test = pd.DataFrame(index = ['diff', 'p'])
    is_train = data['TARGET'].notnull().astype(int)

    for col in data.columns.drop('TARGET'):
        corr_test[col] = corr_feature_with_target(data[col], is_train)

    corr_test = corr_test.T
    corr_test['diff_norm'] = abs(corr_test['diff'] / data.mean(axis = 0))

    shifted = corr_test[((corr_test['p'] < .05) & (corr_test['diff_norm'] > 1))].index
    shifted = corr.loc[shifted][corr['diff_norm'] == 0].index

    data.drop(shifted, axis = 1, inplace = True)
    print('After removing features with not the same distribution on train and test datasets there are {0:d} features'.format(data.shape[1]))

    del corr, corr_test
    gc.collect()

    # Pass 4: iteratively refit LightGBM on the columns it assigned zero
    # importance; stop once those leftovers alone score AUC <= 0.7, then
    # drop them.
    clf = LGBMClassifier(random_state = 0)
    train_index = data[data['TARGET'].notnull()].index
    train_columns = data.drop('TARGET', axis = 1).columns

    score = 1
    new_columns = []
    while score > .7:
        train_columns = train_columns.drop(new_columns)
        clf.fit(data.loc[train_index, train_columns], data.loc[train_index, 'TARGET'])
        importances = pd.Series(clf.feature_importances_, index = train_columns)
        score = roc_auc_score(data.loc[train_index, 'TARGET'],
                              clf.predict_proba(data.loc[train_index, train_columns])[:, 1])
        new_columns = importances[importances > 0].index

    data.drop(train_columns, axis = 1, inplace = True)
    print('After removing features not interesting for classifier there are {0:d} features'.format(data.shape[1]))

    return data