# Relies on sklearn's f1_score plus the repo's base_data_process and log
# helpers, imported elsewhere; meant to live on a Keras callback that
# exposes self.model and self.validation_data.
def pre(self, s):
    """Log and return the macro-F1 of the model on the validation data."""
    if self.input is None:
        # First call: cache the validation inputs and true label indices.
        # Keras lays validation_data out as [inputs..., labels, weights],
        # so everything except the last two entries is model input.
        for i, a in enumerate(self.validation_data):
            print(i, a.shape)  # debug: shapes of the validation arrays
        self.input = self.validation_data[:-2]
        self.label = base_data_process.one_hot2label_index(
            self.validation_data[-2])
    y_pre = self.model.predict(self.input)

    # Predicted probabilities -> class indices, then score.
    y_index = base_data_process.one_hot2label_index(y_pre)
    f1 = f1_score(self.label, y_index, average='macro')
    log.info('{} f1: {}'.format(s, f1))
    return f1
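# A minimal sketch of how pre() could be wired into a training run. The
# F1Logger class and the epoch hook are illustrative assumptions, and it
# assumes an older Keras where Callback.validation_data is populated
# during fit(); none of this is part of the original example.
from keras.callbacks import Callback

class F1Logger(Callback):
    def __init__(self):
        super(F1Logger, self).__init__()
        self.input = None   # filled lazily on the first pre() call
        self.label = None

    pre = pre  # attach the function above as a method

    def on_epoch_end(self, epoch, logs=None):
        self.pre('epoch {}'.format(epoch))

# model.fit(x, y, validation_data=(x_val, y_val), callbacks=[F1Logger()])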
Example #2
# Relies on pandas as pd plus the repo's timer/one_hot2label_index/
# index2label helpers and the ID constant, imported elsewhere.
def write2file(col_id, pre_label, name=None):
    """Write predictions to result<name>.csv with columns [ID, 'predict']."""
    with timer('write result {}'.format(name)):
        # Probability/one-hot rows -> class indices -> readable labels.
        y_pre = one_hot2label_index(pre_label)
        df = pd.DataFrame()
        df[ID] = col_id
        df['predict'] = index2label(y_pre)
        df.to_csv('result{}.csv'.format(name), index=False)
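# A minimal usage sketch; the ids and one-hot scores below are made up
# for illustration and are not part of the original example.
import numpy as np

demo_ids = [101, 102, 103]
demo_scores = np.array([[0.9, 0.1],   # predicted class 0
                        [0.2, 0.8],   # predicted class 1
                        [0.7, 0.3]])  # predicted class 0
write2file(demo_ids, demo_scores, name='_demo')  # writes result_demo.csv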
Example #3
# Relies on lightgbm as lgb, sklearn's StratifiedKFold and f1_score, and
# the repo's timer/log/one_hot2label_index helpers, imported elsewhere.
def cross_validation(train,
                     params,
                     ID_COLUMN_NAME,
                     LABEL_COLUMN_NAME,
                     N_FOLD=5):
    '''
    :return: mean macro-F1 across the folds
    '''
    # Very small round counts; increase them for a real run.
    NUM_BOOST_ROUND = 5
    EARLY_STOPPING_ROUNDS = 2

    # Cross validation model
    folds = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=1001)
    feats = [
        f for f in train.columns
        if f not in [LABEL_COLUMN_NAME, ID_COLUMN_NAME]
    ]
    f1_scores = []
    for i_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train[feats], train[LABEL_COLUMN_NAME])):
        dtrain = lgb.Dataset(data=train[feats].iloc[train_idx],
                             label=train[LABEL_COLUMN_NAME].iloc[train_idx],
                             free_raw_data=False,
                             silent=True)
        dvalid = lgb.Dataset(data=train[feats].iloc[valid_idx],
                             label=train[LABEL_COLUMN_NAME].iloc[valid_idx],
                             free_raw_data=False,
                             silent=True)
        with timer('cross validation-fold {} train model'.format(i_fold)):
            log.info('params is {}'.format(params))
            clf = lgb.train(num_boost_round=NUM_BOOST_ROUND,
                            params=params,
                            verbose_eval=10,
                            train_set=dtrain,
                            valid_sets=[dvalid],
                            early_stopping_rounds=EARLY_STOPPING_ROUNDS)
        with timer('cross validation-fold {} predict'.format(i_fold)):
            # Per-class probabilities -> class indices for scoring.
            v_data = clf.predict(dvalid.data)
            y_pre = one_hot2label_index(v_data)
        f1_scores.append(f1_score(dvalid.label, y_pre, average='macro'))
    # Returning inside the loop would evaluate only the first fold;
    # average the per-fold scores instead.
    return sum(f1_scores) / len(f1_scores)
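# A minimal usage sketch; train_df and the column names are illustrative
# assumptions, not values from the original example.
demo_params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
mean_f1 = cross_validation(train_df,
                           demo_params,
                           ID_COLUMN_NAME='id',
                           LABEL_COLUMN_NAME='label',
                           N_FOLD=5)
log.info('CV macro-F1: {}'.format(mean_f1))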
Example #4
# Relies on lightgbm as lgb, numpy/pandas, sklearn's KFold and
# StratifiedKFold, and the repo's LABEL/ID constants and helpers.
def model(train,
          test,
          num_folds=5,
          stratified=True,
          num_boost_round=1000,
          save_path='origin_data_save'):
    # Number of distinct classes in the label column.
    LABEL_SIZE = train[LABEL].value_counts().count()

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train.shape, test.shape))

    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    sub_preds = np.zeros(shape=(test.shape[0], LABEL_SIZE))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train.columns if f not in [LABEL, ID]]
    for i_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train[feats], train[LABEL])):
        dtrain = lgb.Dataset(data=train[feats].iloc[train_idx],
                             label=train[LABEL].iloc[train_idx],
                             free_raw_data=False,
                             silent=True)
        dvalid = lgb.Dataset(data=train[feats].iloc[valid_idx],
                             label=train[LABEL].iloc[valid_idx],
                             free_raw_data=False,
                             silent=True)

        # Tuned LightGBM hyperparameters; num_class is derived from the
        # data instead of being hard-coded so it always matches the
        # width of sub_preds above.
        params = {
            'bagging_fraction': 0.94795171020152,
            'bagging_freq': 6,
            'bin_construct_sample_cnt': 200000,
            'boosting_type': 'gbdt',
            'feature_fraction': 0.9953235660931046,
            'is_unbalance': False,
            'learning_rate': 0.005,
            'min_data_in_leaf': 30,
            'num_class': LABEL_SIZE,
            'num_leaves': 80,
            'num_threads': 40,
            'objective': 'multiclass',
            'reg_alpha': 0.001,
            'reg_lambda': 0.1,
            'verbose': -1
        }
        with timer('fold {} train model'.format(i_fold)):
            clf = lgb.train(num_boost_round=num_boost_round,
                            params=params,
                            train_set=dtrain,
                            valid_sets=[dvalid],
                            early_stopping_rounds=50)
            clf.save_model(
                (save_path + '/model{}_{}.txt').format(i_fold,
                                                       int(time.time())))
        with timer('fold {} predict'.format(i_fold)):
            # Validation predictions for this fold's F1 score below.
            v_data = clf.predict(dvalid.data)
            y_pre = one_hot2label_index(v_data)
            # Average test probabilities across folds; the argmax inside
            # write2file is unaffected by the 1/num_folds scaling.
            sub_preds += clf.predict(test[feats]) / num_folds
            write2file(test[ID], sub_preds, i_fold)
        # Record per-fold gain importances for the summary plot below.
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance(
            importance_type='gain')
        fold_importance_df["fold"] = i_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        f1 = f1_score(dvalid.label, y_pre, average='macro')
        log.warning('Fold {} f1 : {} score {}'.format(i_fold + 1, f1, f1**2))
        del clf, dtrain, dvalid
        gc.collect()
    display_importances(feature_importance_df)
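# A minimal usage sketch; train_df/test_df are illustrative assumptions.
# save_model() writes into save_path, so the directory must exist first.
import os

os.makedirs('origin_data_save', exist_ok=True)
model(train_df, test_df,
      num_folds=5,
      stratified=True,
      num_boost_round=1000,
      save_path='origin_data_save')
# Each fold saves its booster under origin_data_save/ and writes a
# result<fold>.csv; feature importances are plotted at the end.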