def pre(self, s):
    """Predict on the cached validation set and report macro-F1.

    :param s: tag prepended to the log message (e.g. a stage name).
    :return: macro-averaged F1 score on the validation data.
    """
    # Lazily split and cache validation_data on the first call.
    # Assumes the last two entries are the one-hot labels and sample
    # weights (Keras callback convention) -- TODO confirm with caller.
    # FIX: removed leftover debug loop that print()ed the shape of every
    # validation array to stdout on the first invocation.
    if self.input is None:
        self.input = self.validation_data[:-2]
        self.label = base_data_process.one_hot2label_index(
            self.validation_data[-2])
    y_pre = self.model.predict(self.input)
    y_index = base_data_process.one_hot2label_index(y_pre)
    f1 = f1_score(self.label, y_index, average='macro')
    log.info('{} f1: {}'.format(s, f1))
    return f1
def write2file(col_id, pre_label, name=None):
    """Dump predictions to ``result{name}.csv`` with an id and a label column.

    :param col_id: sequence of ids written to the ID column.
    :param pre_label: per-row class scores; converted to label indices via
        ``one_hot2label_index`` (presumably arg-max -- verify in helper).
    :param name: suffix embedded in the output filename.
    """
    with timer('write result {}'.format(name)):
        label_indices = one_hot2label_index(pre_label)
        result = pd.DataFrame()
        result[ID] = col_id
        result['predict'] = index2label(label_indices)
        result.to_csv('result{}.csv'.format(name), index=False)
def cross_validation(train, params, ID_COLUMN_NAME, LABEL_COLUMN_NAME,
                     N_FOLD=5):
    """Run stratified K-fold cross validation with LightGBM.

    :param train: DataFrame holding the features plus id and label columns.
    :param params: LightGBM training parameters.
    :param ID_COLUMN_NAME: id column name (excluded from the feature set).
    :param LABEL_COLUMN_NAME: label column name.
    :param N_FOLD: number of stratified folds.
    :return: macro-F1 averaged over all N_FOLD folds.
    """
    NUM_BOOST_ROUND = 5
    EARLY_STOPPING_ROUNDS = 2
    # Cross validation model
    folds = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=1001)
    feats = [
        f for f in train.columns
        if f not in [LABEL_COLUMN_NAME, ID_COLUMN_NAME]
    ]
    fold_scores = []
    for i_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train[feats], train[LABEL_COLUMN_NAME])):
        dtrain = lgb.Dataset(data=train[feats].iloc[train_idx],
                             label=train[LABEL_COLUMN_NAME].iloc[train_idx],
                             free_raw_data=False,
                             silent=True)
        dvalid = lgb.Dataset(data=train[feats].iloc[valid_idx],
                             label=train[LABEL_COLUMN_NAME].iloc[valid_idx],
                             free_raw_data=False,
                             silent=True)
        with timer('cross validation-fold {} train model'.format(i_fold)):
            log.info('params is {}'.format(params))
            clf = lgb.train(num_boost_round=NUM_BOOST_ROUND,
                            params=params,
                            verbose_eval=10,
                            train_set=dtrain,
                            valid_sets=[dvalid],
                            early_stopping_rounds=EARLY_STOPPING_ROUNDS)
        with timer('cross validation-fold {} predict'.format(i_fold)):
            v_data = clf.predict(dvalid.data)
            y_pre = one_hot2label_index(v_data)
            fold_scores.append(
                f1_score(dvalid.label, y_pre, average='macro'))
    # BUG FIX: the original `return f1` sat inside the fold loop, so only
    # the first fold was ever trained and evaluated. Average over all folds
    # so the function actually cross-validates.
    return sum(fold_scores) / len(fold_scores)
def model(train, test, num_folds=5, stratified=True, num_boost_round=1000,
          save_path='origin_data_save'):
    """Train LightGBM with K-fold CV over `train`, accumulate class-score
    predictions on `test` across folds, write a running submission file after
    each fold, and finally plot aggregated feature importances.

    :param train: DataFrame with features plus the global ID and LABEL columns.
    :param test: DataFrame with the same feature columns (plus ID).
    :param num_folds: number of CV folds.
    :param stratified: use StratifiedKFold when True, plain KFold otherwise.
    :param num_boost_round: maximum boosting rounds per fold.
    :param save_path: directory where per-fold model files are saved.
    """
    LABEL_SIZE = train[LABEL].value_counts().count()
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train.shape, test.shape))
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    sub_preds = np.zeros(shape=(test.shape[0], LABEL_SIZE))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train.columns if f not in [LABEL, ID]]
    # Hoisted out of the fold loop -- identical for every fold.
    # BUG FIX: 'num_class' was hard-coded to 11 even though LABEL_SIZE is
    # derived from the data above (and sizes sub_preds); use the computed
    # class count so the booster and the prediction buffer always agree.
    params = {
        'bagging_fraction': 0.94795171020152,
        'bagging_freq': 6,
        'bin_construct_sample_cnt': 200000,
        'boosting_type': 'gbdt',
        'feature_fraction': 0.9953235660931046,
        'is_unbalance': False,
        'learning_rate': 0.005,
        'min_data_in_leaf': 30,
        'num_class': LABEL_SIZE,
        'num_leaves': 80,
        'num_threads': 40,
        'objective': 'multiclass',
        'reg_alpha': 0.001,
        'reg_lambda': 0.1,
        'verbose': -1
    }
    for i_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train[feats], train[LABEL])):
        dtrain = lgb.Dataset(data=train[feats].iloc[train_idx],
                             label=train[LABEL].iloc[train_idx],
                             free_raw_data=False,
                             silent=True)
        dvalid = lgb.Dataset(data=train[feats].iloc[valid_idx],
                             label=train[LABEL].iloc[valid_idx],
                             free_raw_data=False,
                             silent=True)
        with timer('fold {} train model'.format(i_fold)):
            clf = lgb.train(num_boost_round=num_boost_round,
                            params=params,
                            train_set=dtrain,
                            valid_sets=[dvalid],
                            early_stopping_rounds=50)
            clf.save_model(
                (save_path + '/model{}_{}.txt').format(i_fold,
                                                       int(time.time())))
        with timer('fold {} predict'.format(i_fold)):
            v_data = clf.predict(dvalid.data)
            y_pre = one_hot2label_index(v_data)
            # Running (un-normalized) sum of per-fold test predictions;
            # write2file presumably takes the arg-max, which the raw sum
            # preserves -- TODO confirm in one_hot2label_index.
            sub_preds += clf.predict(test[feats])
            write2file(test[ID], sub_preds, i_fold)
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance(
            importance_type='gain')
        fold_importance_df["fold"] = i_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        f1 = f1_score(dvalid.label, y_pre, average='macro')
        log.warn('Fold {} f1 : {} score {}'.format(i_fold + 1, f1, f1**2))
        # Free the fold's booster and datasets before the next allocation.
        del clf, dtrain, dvalid
        gc.collect()
    display_importances(feature_importance_df)