def make_folds(train_features, train_targets_scored, n_folds, seed):
    """Attach a cross-validation ``fold`` column to ``train_features``.

    Rows whose ``cp_type`` is not ``'ctl_vehicle'`` are distributed over
    ``n_folds`` folds (numbered from 0) by a shuffled
    ``MultilabelStratifiedKFold`` seeded with ``seed`` and stratified on every
    column of ``train_targets_scored`` except ``sig_id``.  Rows whose
    ``cp_type`` is ``'ctl_vehicle'`` all get the fixed fold id 100.

    Args:
        train_features: DataFrame containing at least ``sig_id`` and
            ``cp_type`` columns.
        train_targets_scored: DataFrame containing ``sig_id`` plus the target
            columns used for stratification.
        n_folds: Number of folds passed to the splitter as ``n_splits``.
        seed: ``random_state`` passed to the splitter.

    Returns:
        ``train_features`` merged on ``sig_id`` with the computed ``fold``
        column.
    """
    train = train_features.merge(train_targets_scored, on='sig_id')
    target_cols = [c for c in train_targets_scored.columns if c != 'sig_id']

    # Only the non-control rows take part in the stratified split.
    train_cp = train[train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
    mskf = MultilabelStratifiedKFold(n_splits=n_folds,
                                     shuffle=True,
                                     random_state=seed)
    train_cp['fold'] = 0
    # The index was reset above, so the positional indices yielded by the
    # splitter are also valid ``.loc`` labels.
    for n, (_, val_index) in enumerate(
            mskf.split(train_cp, train_cp[target_cols])):
        train_cp.loc[val_index, 'fold'] = int(n)
    train_cp['fold'] = train_cp['fold'].astype(int)

    # Control rows are kept out of the regular folds via a fixed fold id.
    train_ctl = train[train['cp_type'] == 'ctl_vehicle'].reset_index(drop=True)
    train_ctl['fold'] = 100

    train = pd.concat([train_cp, train_ctl])
    train_features_with_fold = train_features.merge(train[['sig_id', 'fold']],
                                                    on='sig_id')
    return train_features_with_fold
def run_train(features, targets, seed):
    """Train one model per fold and export the out-of-fold predictions.

    Loads the cached training frame from
    ``./cached/preprocessing_train.pkl``, splits it with a shuffled
    ``MultilabelStratifiedKFold`` (``NUM_FOLD`` splits, seeded by ``seed``),
    fits a fresh ``MoaModel`` wrapped in ``MoaModelTrainer`` on each fold and
    stores what ``fit`` returns into the validation rows of the out-of-fold
    array.  Afterwards the ``moa_loss`` of the out-of-fold array against the
    targets is logged and the predictions are written to
    ``{VERSION}_oof_seed{seed}.csv``.

    Args:
        features: Column names used as model inputs.
        targets: Column names used as model targets.
        seed: Seed for the splitter; also forwarded to ``fit``.
    """
    data = pd.read_pickle('./cached/preprocessing_train.pkl')
    oof = np.zeros(data[targets].shape)

    splitter = MultilabelStratifiedKFold(n_splits=NUM_FOLD,
                                         shuffle=True,
                                         random_state=seed)
    fold_iter = splitter.split(data[features], data[targets])

    for n_fold, (train_idx, valid_idx) in enumerate(fold_iter):
        logger.info('>' * 5 + f" Fold {n_fold+1}")
        logger.info(
            f"train_size: {len(train_idx)} valid_size: {len(valid_idx)}")

        # Carve out the rows of this fold, then pick inputs and targets.
        train_rows = data.iloc[train_idx]
        valid_rows = data.iloc[valid_idx]
        X_train, y_train = train_rows[features], train_rows[targets]
        X_valid, y_valid = valid_rows[features], valid_rows[targets]

        # Fit a fresh model and keep its predictions for the held-out rows.
        with timer(f'fold {n_fold+1} train time'):
            network = MoaModel(len(features), len(targets)).to(device)
            trainer = MoaModelTrainer(network)
            oof[valid_idx] = trainer.fit(X_train, y_train, X_valid, y_valid,
                                         n_fold, seed)

    # Score the complete out-of-fold prediction matrix.
    oof_metric = moa_loss(oof, data[targets].to_numpy())
    logger.info(f'\nMetric of oof: {oof_metric}\n')

    # Write ids next to the prefixed prediction columns.
    predictions = pd.DataFrame(oof, columns=targets).add_prefix('pred_')
    oof_df = data[['sig_id']].reset_index(drop=True).join(predictions)
    oof_df.to_csv(f'{VERSION}_oof_seed{seed}.csv', index=False)
def main():
    """Build a per-image label table with stratified fold ids and save it.

    Reads the CSV given by ``args.train_df``, which must provide the
    ``Image_Label`` and ``EncodedPixels`` columns.  ``Image_Label`` is split
    on ``'_'`` into an image id (first piece) and a label (second piece).
    The result has one row per image with a ``kfold`` column (1-based fold
    id) and one column per label holding that label's ``EncodedPixels``.
    It is written as ``train_{args.kfold}kfold.csv`` next to the input file.
    """
    args = make_parse()
    df = pd.read_csv(args.train_df)

    # Add columns
    df['Label'] = df.Image_Label.map(lambda x: x.split('_')[1])
    df['ImageId'] = df.Image_Label.map(lambda x: x.split('_')[0])

    # Extract unique image ids
    labels = ['Fish', 'Flower', 'Gravel', 'Sugar']
    df_new = pd.DataFrame({'ImageId': df.ImageId.unique()})
    df_new['kfold'] = -1
    for label in labels:
        df_tmp = (df[df.Label == label]
                  .drop(columns=['Image_Label', 'Label'])
                  .rename(columns={'EncodedPixels': label})
                  .reset_index(drop=True))
        df_new = df_new.merge(df_tmp, on='ImageId')

    # Make one-hot matrix: 1 where the label has a value, 0 where it is NaN.
    # The label columns are selected by name rather than by position so the
    # matrix cannot silently pick up unrelated columns.
    y = df_new[labels].notna().astype('int').values

    # Make kfolds
    indxs = list(range(len(df_new)))
    # shuffle=True is required for random_state to be accepted: recent
    # scikit-learn raises ValueError when a seed is given with shuffle=False.
    mskf = MultilabelStratifiedKFold(n_splits=args.kfold,
                                     shuffle=True,
                                     random_state=42)
    for i, (_, test_index) in enumerate(mskf.split(indxs, y)):
        df_new.loc[test_index, 'kfold'] = i + 1

    new_path = Path(args.train_df).parent / f'train_{args.kfold}kfold.csv'
    df_new.to_csv(new_path, index=False)
def kfold(
    n_splits: int,
    annotations: Annotations,
) -> t.List[t.Tuple[Annotations, Annotations]]:
    """Split ``annotations`` into multilabel-stratified train/test pairs.

    The annotations are converted with ``to_multi_hot(annotations, size=3474)``
    and that matrix drives a seeded, shuffled ``MultilabelStratifiedKFold``.

    Args:
        n_splits: Number of folds.
        annotations: Indexable collection of annotations; one row of the
            multi-hot matrix is expected per annotation.

    Returns:
        One ``(train_annotations, test_annotations)`` tuple per fold.
    """
    multi_hot = to_multi_hot(annotations, size=3474)
    indices = range(len(multi_hot))
    # shuffle=True is required for random_state to be accepted: recent
    # scikit-learn raises ValueError when a seed is given with shuffle=False.
    mskf = MultilabelStratifiedKFold(n_splits=n_splits,
                                     shuffle=True,
                                     random_state=0)
    return [
        ([annotations[i] for i in train], [annotations[i] for i in test])
        for train, test in mskf.split(indices, multi_hot)
    ]
def _mls_enhanced_full_kfold_dfs():
    """Yield ``(train_df, val_df)`` pairs for a 5-fold multilabel split.

    The frame comes from ``get_enhanced_full_train_df()`` and the
    stratification labels from ``multilabel_binary_representation`` (dense).
    The splitter is shuffled with a fixed ``random_state`` of 0.
    """
    full_df = get_enhanced_full_train_df()
    labels = multilabel_binary_representation(full_df, sparse=False)
    splitter = MultilabelStratifiedKFold(n_splits=5,
                                         shuffle=True,
                                         random_state=0)
    # The splitter yields positional indices, hence ``iloc``.
    for train_rows, val_rows in splitter.split(full_df.index.values, labels):
        yield full_df.iloc[train_rows], full_df.iloc[val_rows]
def multilabel_stratified_K_fold(dataset,
                                 n_folds=5,
                                 shuffle=False,
                                 random_state=42):
    """Return the multilabel-stratified K-fold splits of ``dataset``.

    Args:
        dataset: Object indexable with ``'Target'``; that column is converted
            by ``target_to_numpy`` into the label matrix used to stratify.
        n_folds: Number of folds.
        shuffle: Whether the splitter shuffles the samples before splitting.
        random_state: Seed handed to the splitter.

    Returns:
        A list with one ``(train_indices, test_indices)`` pair per fold.
    """
    y = target_to_numpy(dataset['Target'])
    try:
        mskf = MultilabelStratifiedKFold(n_splits=n_folds,
                                         shuffle=shuffle,
                                         random_state=random_state)
    except ValueError:
        # Recent scikit-learn refuses a seed combined with shuffle=False,
        # which is exactly what this function's defaults request.  Only in
        # that situation retry without the seed; any other ValueError (for
        # example an invalid n_folds) is re-raised unchanged.
        if shuffle or random_state is None:
            raise
        mskf = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=False)
    return list(mskf.split(X=dataset, y=y))
def indices_for_fold(fold, labels):
    """Return ``(train_indices, eval_indices)`` for one fold.

    All ``len(FOLDS)`` splits are generated with a shuffled
    ``MultilabelStratifiedKFold`` seeded by ``config.seed`` and the split at
    position ``fold - 1`` is returned.

    Args:
        fold: Fold selector; ``fold - 1`` is used as the list index.
        labels: Label matrix; ``labels.shape[0]`` is the dataset size.
    """
    dataset_size = labels.shape[0]
    splitter = MultilabelStratifiedKFold(len(FOLDS),
                                         shuffle=True,
                                         random_state=config.seed)
    # Only the labels matter for stratification; X is a zero placeholder.
    placeholder = np.zeros(dataset_size)
    all_splits = list(splitter.split(placeholder, labels))

    chosen = all_splits[fold - 1]
    train_indices, eval_indices = chosen
    assert len(train_indices) + len(eval_indices) == dataset_size
    return train_indices, eval_indices
def split_df(df):
    """Pivot annotations per image and flag each image's validation folds.

    ``df`` is pivoted to one row per ``image_id`` with one column per
    ``category_id`` holding the count of ``id`` values (0 where absent).
    The columns at positions 1 to 7 of that table are used as stratification
    labels for a shuffled ``MultilabelStratifiedKFold`` (``FOLD_NUM`` splits,
    seeded by ``SEED``).  For every fold a ``fold_<k>`` column is added that
    is 1 for the rows in that fold's validation set and 0 otherwise.

    Returns:
        The pivot table including the ``fold_<k>`` indicator columns.
    """
    counts = pd.pivot_table(df,
                            index=['image_id'],
                            columns=['category_id'],
                            values='id',
                            fill_value=0,
                            aggfunc='count')
    annot_pivot = counts.reset_index().rename_axis(None, axis=1)

    splitter = MultilabelStratifiedKFold(n_splits=FOLD_NUM,
                                         shuffle=True,
                                         random_state=SEED)
    # Column 0 is ``image_id``; the following seven columns are the labels.
    strat_labels = annot_pivot.iloc[:, 1:8]
    for fold, (_, val_idx) in enumerate(
            splitter.split(annot_pivot, strat_labels)):
        annot_pivot[f'fold_{fold}'] = 0
        annot_pivot.loc[val_idx, f'fold_{fold}'] = 1
    return annot_pivot
def stratified_train_test_split(x,
                                test_size: float = 0.1,
                                random_state=None,
                                shuffle=False):
    """Split ``x`` into train and test frames using the first stratified fold.

    A ``MultilabelStratifiedKFold`` with ``int(1 / test_size)`` splits is
    built, stratifying on the ``grapheme_root``, ``vowel_diacritic`` and
    ``consonant_diacritic`` columns; only its first split is used.

    Args:
        x: DataFrame with ``image_id`` and the three label columns.
        test_size: Desired test fraction; determines the number of splits.
        random_state: Forwarded to the splitter.
        shuffle: Forwarded to the splitter.

    Returns:
        ``(df_train, df_test)`` taken from ``x`` by position.
    """
    ids = x[['image_id']].values
    labels = x[['grapheme_root', 'vowel_diacritic',
                'consonant_diacritic']].values
    fold_count = int(1 / test_size)
    splitter = MultilabelStratifiedKFold(n_splits=fold_count,
                                         random_state=random_state,
                                         shuffle=shuffle)
    # Only the first fold is needed, so the remaining ones are never drawn.
    train_index, test_index = next(iter(splitter.split(ids, labels)))
    return x.iloc[train_index], x.iloc[test_index]
def stratCV(model, nfolds, train_X, train_Y, **params):
    """Cross-validate a multi-output regressor on multilabel-stratified folds.

    For each fold of a shuffled ``MultilabelStratifiedKFold`` a new
    ``MultiOutputRegressor`` wrapping ``model(**params)`` is fitted on the
    training part and scored on the validation part; fold indices and scores
    are printed.

    Args:
        model: Callable (typically an estimator class) that builds the base
            estimator from keyword arguments.
        nfolds: Number of folds.
        train_X: Feature array, indexed with integer index arrays
            (presumably a NumPy array — confirm against callers).
        train_Y: Target array, indexed the same way as ``train_X``.
        **params: Keyword arguments forwarded to ``model``.

    Returns:
        List with the validation score of every fold, in fold order.
    """
    mskf = MultilabelStratifiedKFold(n_splits=nfolds, shuffle=True)
    scores = []
    for train_index, valid_index in mskf.split(train_X, train_Y):
        print("TRAIN:", train_index, "VALID:", valid_index)
        X_train, X_valid = train_X[train_index], train_X[valid_index]
        Y_train, Y_valid = train_Y[train_index], train_Y[valid_index]
        # Unpack the collected keyword arguments; passing the dict itself
        # would hand it to the estimator as a single positional argument.
        m = MultiOutputRegressor(model(**params))
        m.fit(X_train, Y_train)
        y_score = m.score(X_valid, Y_valid)
        print(y_score)
        scores.append(y_score)
    return scores