Ejemplo n.º 1
0
def make_folds(train_features, train_targets_scored, n_folds, seed):
    """Attach a ``fold`` column to ``train_features``.

    Rows whose ``cp_type`` is not ``'ctl_vehicle'`` are distributed over
    ``n_folds`` folds (numbered from 0) by a shuffled
    ``MultilabelStratifiedKFold`` stratified on the target columns. Rows
    with ``cp_type == 'ctl_vehicle'`` are kept out of the split and all
    receive the sentinel fold ``100``.

    Args:
        train_features: DataFrame with at least ``sig_id`` and ``cp_type``.
        train_targets_scored: DataFrame with ``sig_id`` plus the target
            columns; every column except ``sig_id`` is used to stratify.
        n_folds: Number of folds passed to the splitter.
        seed: ``random_state`` passed to the splitter.

    Returns:
        ``train_features`` merged on ``sig_id`` with the computed ``fold``
        column.
    """
    train = train_features.merge(train_targets_scored, on='sig_id')
    target_cols = [c for c in train_targets_scored.columns if c != 'sig_id']

    # Only non-control rows take part in the stratified split.
    train_cp = train[train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
    train_cp['fold'] = 0

    mskf = MultilabelStratifiedKFold(n_splits=n_folds,
                                     shuffle=True,
                                     random_state=seed)
    # The index was reset above, so the positional indices returned by
    # split() are also valid labels for .loc.
    for fold, (_, val_index) in enumerate(
            mskf.split(train_cp, train_cp[target_cols])):
        train_cp.loc[val_index, 'fold'] = fold
    train_cp['fold'] = train_cp['fold'].astype(int)

    # Control rows get a sentinel fold id outside the 0..n_folds-1 range.
    train_ctl = train[train['cp_type'] == 'ctl_vehicle'].reset_index(drop=True)
    train_ctl['fold'] = 100

    folds = pd.concat([train_cp, train_ctl])[['sig_id', 'fold']]
    return train_features.merge(folds, on='sig_id')
Ejemplo n.º 2
0
def run_train(features, targets, seed):
    """Train one model per fold and export the out-of-fold predictions.

    Loads the cached training frame, splits it with a shuffled
    ``MultilabelStratifiedKFold`` (``NUM_FOLD`` splits, seeded with
    ``seed``), fits a fresh ``MoaModel`` on every fold and stores whatever
    ``MoaModelTrainer.fit`` returns in the validation rows of the
    out-of-fold array. The overall ``moa_loss`` is logged and the
    predictions are written to ``{VERSION}_oof_seed{seed}.csv``.

    Args:
        features: List of feature column names.
        targets: List of target column names.
        seed: Seed for the splitter; also forwarded to ``fit``.
    """
    data = pd.read_pickle('./cached/preprocessing_train.pkl')
    feature_frame = data[features]
    target_frame = data[targets]

    oof = np.zeros(target_frame.shape)
    splitter = MultilabelStratifiedKFold(n_splits=NUM_FOLD,
                                         shuffle=True,
                                         random_state=seed)
    fold_indices = splitter.split(feature_frame, target_frame)
    for n_fold, (train_idx, valid_idx) in enumerate(fold_indices):
        logger.info('>' * 5 + f" Fold {n_fold+1}")
        logger.info(
            f"train_size: {len(train_idx)}  valid_size: {len(valid_idx)}")

        # Positional row selection for this fold's train / valid parts.
        X_train = feature_frame.iloc[train_idx]
        y_train = target_frame.iloc[train_idx]
        X_valid = feature_frame.iloc[valid_idx]
        y_valid = target_frame.iloc[valid_idx]

        # A new network is built for every fold.
        with timer(f'fold {n_fold+1} train time'):
            network = MoaModel(len(features), len(targets)).to(device)
            trainer = MoaModelTrainer(network)
            oof[valid_idx] = trainer.fit(X_train, y_train, X_valid, y_valid,
                                         n_fold, seed)

    # Score the stitched-together out-of-fold predictions.
    oof_metric = moa_loss(oof, target_frame.to_numpy())
    logger.info(f'\nMetric of oof: {oof_metric}\n')

    # Export sig_id alongside the 'pred_'-prefixed prediction columns.
    predictions = pd.DataFrame(oof, columns=targets).add_prefix('pred_')
    oof_df = data[['sig_id']].reset_index(drop=True).join(predictions)
    oof_df.to_csv(f'{VERSION}_oof_seed{seed}.csv', index=False)
Ejemplo n.º 3
0
def main():
    """Assign multilabel-stratified fold ids to images and save them as CSV.

    Reads the CSV given by ``args.train_df`` (one row per image/label pair
    with ``Image_Label`` and ``EncodedPixels`` columns), pivots it to one
    row per image with one column per label, and stratifies on which labels
    have a non-null ``EncodedPixels``. Fold ids are written to a ``kfold``
    column, numbered from 1, and the result is saved as
    ``train_{args.kfold}kfold.csv`` next to the input file.
    """
    args = make_parse()
    df = pd.read_csv(args.train_df)

    # Image_Label looks like '<image id>_<label>'. Split on the LAST
    # underscore so an image id that itself contains '_' stays intact.
    df['Label'] = df.Image_Label.map(lambda x: x.rsplit('_', 1)[1])
    df['ImageId'] = df.Image_Label.map(lambda x: x.rsplit('_', 1)[0])

    # One row per unique image id; -1 marks "no fold assigned yet".
    labels = ['Fish', 'Flower', 'Gravel', 'Sugar']
    df_new = pd.DataFrame({'ImageId': df.ImageId.unique()})
    df_new['kfold'] = -1

    # Add one column per label holding that label's EncodedPixels.
    for label in labels:
        df_tmp = (df[df.Label == label]
                  .drop(columns=['Image_Label', 'Label'])
                  .rename(columns={'EncodedPixels': label})
                  .reset_index(drop=True))
        df_new = df_new.merge(df_tmp, on='ImageId')

    # Make one-hot vector: 1 where the label has a mask, 0 otherwise.
    # Columns are selected by name rather than by position.
    y = df_new[labels].notna().astype('int').values

    # Make kfolds. shuffle=True is needed for the seed to be accepted:
    # scikit-learn >= 0.24 raises ValueError when random_state is set while
    # shuffle is False.
    indxs = list(range(len(df_new)))
    mskf = MultilabelStratifiedKFold(n_splits=args.kfold,
                                     shuffle=True,
                                     random_state=42)
    for i, (_, test_index) in enumerate(mskf.split(indxs, y)):
        df_new.loc[test_index, 'kfold'] = i + 1

    new_path = Path(args.train_df).parent / f'train_{args.kfold}kfold.csv'
    df_new.to_csv(new_path, index=False)
Ejemplo n.º 4
0
def kfold(
    n_splits: int,
    annotations: Annotations,
) -> t.List[t.Tuple[Annotations, Annotations]]:
    """Split ``annotations`` into multilabel-stratified train/test pairs.

    The annotations are encoded with ``to_multi_hot(annotations,
    size=3474)`` and that matrix is used as the stratification target.

    Args:
        n_splits: Number of folds.
        annotations: Indexable collection of annotations.

    Returns:
        One ``(train_annotations, test_annotations)`` tuple per fold.
    """
    multi_hot = to_multi_hot(annotations, size=3474)
    indices = range(len(multi_hot))
    # shuffle=True is needed for the fixed seed to be accepted:
    # scikit-learn >= 0.24 raises ValueError when random_state is set while
    # shuffle is False.
    mskf = MultilabelStratifiedKFold(n_splits=n_splits,
                                     shuffle=True,
                                     random_state=0)
    return [([annotations[i] for i in train], [annotations[i] for i in test])
            for train, test in mskf.split(indices, multi_hot)]
def _mls_enhanced_full_kfold_dfs():
    """Yield ``(train_df, val_df)`` for each of 5 multilabel-stratified folds.

    The frame comes from ``get_enhanced_full_train_df()`` and is stratified
    on the dense output of ``multilabel_binary_representation``. The
    splitter shuffles with a fixed seed of 0.
    """
    full_df = get_enhanced_full_train_df()
    labels = multilabel_binary_representation(full_df, sparse=False)
    splitter = MultilabelStratifiedKFold(n_splits=5,
                                         shuffle=True,
                                         random_state=0)

    # split() returns positional indices, hence iloc.
    for train_rows, val_rows in splitter.split(full_df.index.values, labels):
        yield full_df.iloc[train_rows], full_df.iloc[val_rows]
Ejemplo n.º 6
0
def multilabel_stratified_K_fold(dataset,
                                 n_folds=5,
                                 shuffle=False,
                                 random_state=42):
    """Return the multilabel-stratified K-fold splits of ``dataset``.

    Args:
        dataset: Data to split; ``dataset['Target']`` is converted with
            ``target_to_numpy`` and used as the stratification target.
        n_folds: Number of folds.
        shuffle: Whether the splitter shuffles the samples.
        random_state: Seed for the splitter. Forwarded only when
            ``shuffle`` is true.

    Returns:
        A list of ``(train_indices, test_indices)`` pairs, one per fold.
    """
    y = target_to_numpy(dataset['Target'])
    # scikit-learn >= 0.24 raises ValueError when random_state is set while
    # shuffle is False, which made the default arguments of this function
    # unusable. Forward the seed only when shuffling is requested.
    # NOTE(review): with shuffle disabled no seed reaches the splitter, so
    # any internal randomness it has is unseeded; pass shuffle=True when
    # reproducible folds are required.
    effective_seed = random_state if shuffle else None
    mskf = MultilabelStratifiedKFold(n_splits=n_folds,
                                     shuffle=shuffle,
                                     random_state=effective_seed)
    return list(mskf.split(X=dataset, y=y))
Ejemplo n.º 7
0
def indices_for_fold(fold, labels):
    """Return ``(train_indices, eval_indices)`` for a 1-based fold number.

    All ``len(FOLDS)`` splits are produced by a shuffled
    ``MultilabelStratifiedKFold`` seeded with ``config.seed``; the split at
    position ``fold - 1`` is returned.

    Args:
        fold: Fold number; ``fold - 1`` is used as the list index.
        labels: Label array; its first dimension is the dataset size.
    """
    n_samples = labels.shape[0]
    splitter = MultilabelStratifiedKFold(len(FOLDS),
                                         shuffle=True,
                                         random_state=config.seed)

    # A zero-filled placeholder with one entry per sample is passed as X.
    placeholder = np.zeros(n_samples)
    all_splits = list(splitter.split(placeholder, labels))

    train_indices, eval_indices = all_splits[fold - 1]
    # Sanity check: the two parts together cover the whole dataset.
    assert len(train_indices) + len(eval_indices) == n_samples

    return train_indices, eval_indices
Ejemplo n.º 8
0
def split_df(df):
    """Pivot annotations per image and flag each fold's validation rows.

    ``df`` is pivoted to one row per ``image_id`` with one column per
    ``category_id`` holding the count of ``id`` values (0 where absent).
    The columns at positions 1..7 of the pivot are the stratification
    target. For every fold a ``fold_<n>`` column is added that is 1 for the
    rows in that fold's validation part and 0 otherwise.

    Returns:
        The pivoted DataFrame including the ``fold_<n>`` indicator columns.
    """
    counts = pd.pivot_table(df,
                            index=['image_id'],
                            columns=['category_id'],
                            values='id',
                            fill_value=0,
                            aggfunc='count')
    # Turn image_id back into a column and drop the columns-axis name.
    annot_pivot = counts.reset_index().rename_axis(None, axis=1)

    splitter = MultilabelStratifiedKFold(n_splits=FOLD_NUM,
                                         shuffle=True,
                                         random_state=SEED)
    stratify_on = annot_pivot.iloc[:, 1:8]
    for fold, (_, val_idx) in enumerate(
            splitter.split(annot_pivot, stratify_on)):
        annot_pivot[f'fold_{fold}'] = 0
        annot_pivot.loc[val_idx, f'fold_{fold}'] = 1
    return annot_pivot
Ejemplo n.º 9
0
def stratified_train_test_split(x, test_size: float = 0.1, random_state=None, shuffle=False):
    """Split ``x`` into train and test frames using one stratified fold.

    A ``MultilabelStratifiedKFold`` with ``int(1 / test_size)`` splits is
    built and only its first split is used. The stratification target is
    the ``grapheme_root``, ``vowel_diacritic`` and ``consonant_diacritic``
    columns.

    Args:
        x: DataFrame with ``image_id`` and the three label columns.
        test_size: Determines the number of splits as ``int(1 / test_size)``.
        random_state: Forwarded to the splitter.
        shuffle: Forwarded to the splitter.

    Returns:
        ``(df_train, df_test)`` selected positionally from ``x``.
    """
    image_ids = x[['image_id']].values
    labels = x[['grapheme_root', 'vowel_diacritic', 'consonant_diacritic']].values

    n_splits = int(1 / test_size)
    splitter = MultilabelStratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    # Only the first fold is needed; the remaining folds are never produced.
    train_index, test_index = next(iter(splitter.split(image_ids, labels)))
    return x.iloc[train_index], x.iloc[test_index]
Ejemplo n.º 10
0
def stratCV(model, nfolds, train_X, train_Y, **params):
    """Cross-validate ``model`` with multilabel-stratified folds.

    For every fold a new ``MultiOutputRegressor`` wrapping
    ``model(**params)`` is fitted on the training part and scored on the
    validation part. The fold indices and each score are printed.

    Args:
        model: Estimator class (or factory) called with ``**params``.
        nfolds: Number of folds.
        train_X: Feature array, indexed with integer index arrays.
        train_Y: Target array, indexed with integer index arrays.
        **params: Keyword arguments used to construct each estimator.

    Returns:
        List with one validation score per fold.
    """
    mskf = MultilabelStratifiedKFold(n_splits=nfolds, shuffle=True)

    scores = []
    for train_index, valid_index in mskf.split(train_X, train_Y):
        print("TRAIN:", train_index, "VALID:", valid_index)
        X_train, X_valid = train_X[train_index], train_X[valid_index]
        Y_train, Y_valid = train_Y[train_index], train_Y[valid_index]

        # Fit and score inside the loop so every fold is evaluated, not
        # only the last one. The hyper-parameters are unpacked as keyword
        # arguments instead of passing the dict as one positional argument.
        m = MultiOutputRegressor(model(**params))
        m.fit(X_train, Y_train)
        y_score = m.score(X_valid, Y_valid)
        print(y_score)
        scores.append(y_score)

    return scores