Ejemplo n.º 1
0
def make_folds(targets, scored, seed, K):
    """Attach a drug-aware 'fold' column (0..K-1) to `scored`.

    Drugs appearing 18 times or fewer are stratified at the drug level
    (mean target profile per drug); more frequent drugs are stratified
    at the sample (sig_id) level. Returns `scored` with an int8 'fold'
    column added.
    """
    counts = scored["drug_id"].value_counts()
    rare_drugs = counts.loc[counts <= 18].index.sort_values()
    frequent_drugs = counts.loc[counts > 18].index.sort_values()

    fold_by_drug = {}
    fold_by_sig = {}

    # Rare drugs: one multilabel-stratified split over per-drug target means.
    splitter = MultilabelStratifiedKFold(n_splits=K,
                                         shuffle=True,
                                         random_state=seed)
    profile = scored.groupby('drug_id')[targets].mean().loc[rare_drugs]
    for fold_id, (_, val_idx) in enumerate(
            splitter.split(profile, profile[targets])):
        # Record which fold each validation drug belongs to.
        fold_by_drug.update(
            {drug: fold_id for drug in profile.index[val_idx].values})

    # Frequent drugs: split the individual samples instead.
    splitter = MultilabelStratifiedKFold(n_splits=K,
                                         shuffle=True,
                                         random_state=seed)
    samples = scored.loc[scored["drug_id"].isin(frequent_drugs)].reset_index(drop=True)
    for fold_id, (_, val_idx) in enumerate(
            splitter.split(samples, samples[targets])):
        fold_by_sig.update(
            {sig: fold_id for sig in samples["sig_id"][val_idx].values})

    # Combine: drug-level assignment first, sample-level fills the rest.
    scored['fold'] = scored.drug_id.map(fold_by_drug)
    unassigned = scored["fold"].isna()
    scored.loc[unassigned, 'fold'] = scored.loc[unassigned, 'sig_id'].map(fold_by_sig)
    scored["fold"] = scored["fold"].astype('int8')
    return scored
def create_random_split(train_meta,
                        external_meta=None,
                        n_splits=4,
                        alias='random'):
    """Create and save multilabel-stratified CV split files.

    Splits `train_meta` (and, if given, `external_meta` independently)
    into `n_splits` folds and writes one train/valid CSV pair per fold
    under DATA_DIR/split/<alias>_folds<n_splits>/.

    Args:
        train_meta: dataframe containing the LABEL_NAME_LIST columns.
        external_meta: optional extra dataframe; its per-fold parts are
            concatenated onto the corresponding main-fold parts.
        n_splits: number of CV folds.
        alias: prefix of the output directory name.
    """
    split_dir = opj(DATA_DIR, 'split', '%s_folds%d' % (alias, n_splits))
    os.makedirs(split_dir, exist_ok=True)

    # Fixed seed (100) keeps the split reproducible across runs.
    kf = MultilabelStratifiedKFold(n_splits=n_splits,
                                   shuffle=True,
                                   random_state=100)
    train_indices_list, valid_indices_list = [], []
    for train_indices, valid_indices in kf.split(
            train_meta, train_meta[LABEL_NAME_LIST].values):
        train_indices_list.append(train_indices)
        valid_indices_list.append(valid_indices)

    # External data gets its own independent split with the same seed.
    ext_train_indices_list, ext_valid_indices_list = [], []
    if external_meta is not None:
        ext_kf = MultilabelStratifiedKFold(n_splits=n_splits,
                                           shuffle=True,
                                           random_state=100)
        for ext_train_indices, ext_valid_indices in ext_kf.split(
                external_meta, external_meta[LABEL_NAME_LIST].values):
            ext_train_indices_list.append(ext_train_indices)
            ext_valid_indices_list.append(ext_valid_indices)

    for idx in range(n_splits):
        # NOTE(review): .loc with positional index arrays assumes a default
        # RangeIndex on train_meta -- confirm against the caller.
        train_split_df = train_meta.loc[train_indices_list[idx]]
        valid_split_df = train_meta.loc[valid_indices_list[idx]]

        if external_meta is not None:
            train_split_df = pd.concat(
                (train_split_df,
                 external_meta.loc[ext_train_indices_list[idx]]),
                ignore_index=True)
            valid_split_df = pd.concat(
                (valid_split_df,
                 external_meta.loc[ext_valid_indices_list[idx]]),
                ignore_index=True)
            # Keep only the metadata and label columns, in a fixed order.
            train_split_df = train_split_df[
                [ID, TARGET, EXTERNAL, ANTIBODY, ANTIBODY_CODE] +
                LABEL_NAME_LIST]
            valid_split_df = valid_split_df[
                [ID, TARGET, EXTERNAL, ANTIBODY, ANTIBODY_CODE] +
                LABEL_NAME_LIST]

        # Print per-class positive counts for the first fold as a sanity check.
        if idx == 0:
            for name in LABEL_NAMES.values():
                print(name, (train_split_df[name] == 1).sum(),
                      (valid_split_df[name] == 1).sum())

        fname = opj(split_dir, 'random_train_cv%d.csv' % (idx))
        print("create split file: %s, shape: %s" %
              (fname, str(train_split_df.shape)))
        train_split_df.to_csv(fname, index=False)

        fname = opj(split_dir, 'random_valid_cv%d.csv' % (idx))
        print("create split file: %s, shape: %s" %
              (fname, str(valid_split_df.shape)))
        valid_split_df.to_csv(fname, index=False)
Ejemplo n.º 3
0
def get_best_epoch(cfg):
    """Run cross-validated training over paired (main x external) folds.

    The main dataset is split 2-fold and the external HPA data 3-fold;
    each (i, j) combination maps to fold index i*3 + j and is trained
    only if that index appears in cfg['fold'].
    """
    print(cfg)
    dataset = pd.read_csv(data_dir + 'train.csv')
    dataset['Id'] = data_dir + 'img_data/' + dataset['Id']
    dataset['suffix'] = '.png'

    # External HPA data; gray-scale variant selected by cfg['gray'].
    ex_data = pd.read_csv(data_dir + 'HPAv18RBGY_wodpl.csv')
    if cfg['gray']:
        ex_data['Id'] = data_dir + 'HPAv18_gray/' + ex_data['Id']
    else:
        ex_data['Id'] = data_dir + 'HPAv18/' + ex_data['Id']
    ex_data['suffix'] = '.jpg'

    target = get_label(dataset)
    extarget = get_label(ex_data)
    folds = MultilabelStratifiedKFold(2, shuffle=True, random_state=66666)
    exfolds = MultilabelStratifiedKFold(3, shuffle=True, random_state=66666)
    for fold_i, (tr_idx,
                 val_idx) in enumerate(folds.split(dataset['Id'], target)):
        for fold_j, (ex_tr_idx,
                     _) in enumerate(exfolds.split(ex_data['Id'], extarget)):
            if fold_i * 3 + fold_j not in cfg['fold']:
                continue
            print(fold_i, fold_j, '-----------------')

            # BUG FIX: DataFrame.append was removed in pandas 2.0; pd.concat
            # with ignore_index=True matches append(...).reset_index(drop=True).
            tr_data = pd.concat(
                [ex_data.iloc[ex_tr_idx], dataset.iloc[tr_idx]],
                ignore_index=True)
            val_data = dataset.iloc[val_idx]

            train(fold_i * 3 + fold_j, tr_data, val_data, cfg)
Ejemplo n.º 4
0
def add_fold(train, targets):
    """Add a drug-aware 'fold' column (0..NUM_FOLD-1) to `train`.

    Implements the scheme from
    https://www.kaggle.com/c/lish-moa/discussion/195195: drugs seen 18
    times or fewer are stratified at the drug level, the rest at the
    sample (sig_id) level.

    Returns:
        (train, targets): `train` gains an int8 'fold' column; `targets`
        is returned with the temporary 'drug_id' merge column removed.
    """
    # Load libraries
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
    # LOAD FILES
    drug = pd.read_csv(os.path.join(DATA_DIR, 'train_drug.csv'))
    target_cols = targets.columns[1:]
    targets = targets.merge(drug, on='sig_id', how='left')
    # LOCATE DRUGS. 18 or 1000
    vc = targets.drug_id.value_counts()
    vc1 = vc.loc[vc <= 18].index
    vc2 = vc.loc[vc > 18].index
    # STRATIFY DRUGS 18X OR LESS
    dct1 = {}
    dct2 = {}
    # NOTE(review): no random_state is passed, so fold assignment differs
    # between runs -- confirm this is intended.
    cv = MultilabelStratifiedKFold(n_splits=NUM_FOLD, shuffle=True)
    tmp = targets.groupby('drug_id')[target_cols].mean().loc[vc1]
    for fold, (idxT, idxV) in enumerate(cv.split(tmp, tmp[target_cols])):
        # Map every validation drug id to its fold number.
        dd = {k: fold for k in tmp.index[idxV].values}
        dct1.update(dd)
    # STRATIFY DRUGS MORE THAN 18X
    cv = MultilabelStratifiedKFold(n_splits=NUM_FOLD, shuffle=True)
    tmp = targets.loc[targets.drug_id.isin(vc2)].reset_index(drop=True)
    for fold, (idxT, idxV) in enumerate(cv.split(tmp, tmp[target_cols])):
        # Map every validation sig_id to its fold number.
        dd = {k: fold for k in tmp.sig_id[idxV].values}
        dct2.update(dd)
    # ASSIGN FOLDS
    # NOTE(review): a Series derived from `targets` is assigned onto `train`;
    # pandas aligns on index, so both frames must share the same index --
    # verify against the caller.
    train['fold'] = targets.drug_id.map(dct1)
    train.loc[train.fold.isna(), 'fold'] = train.loc[train.fold.isna(),
                                                     'sig_id'].map(dct2)
    train.fold = train.fold.astype('int8')
    targets.drop(['drug_id'], axis=1, inplace=True)
    return train, targets
Ejemplo n.º 5
0
def process_score(scored, targets, seed=42, folds=7):
    """Assign a drug-aware 'kfold' column (0..folds-1) to `scored`.

    Drugs with 18 occurrences or fewer are stratified as whole drugs on
    their mean target profile; more frequent drugs are stratified per
    sample. Returns `scored` with an int8 'kfold' column.
    """
    counts = scored.drug_id.value_counts()
    rare_drugs = counts.loc[counts <= 18].index.sort_values()
    common_drugs = counts.loc[counts > 18].index.sort_values()

    drug_fold = {}
    sig_fold = {}

    # Rare drugs: stratify at the drug level.
    splitter = MultilabelStratifiedKFold(n_splits=folds,
                                         shuffle=True,
                                         random_state=seed)
    profile = scored.groupby('drug_id')[targets].mean().loc[rare_drugs]
    for fold_id, (_, val_idx) in enumerate(
            splitter.split(profile, profile[targets])):
        drug_fold.update(
            {drug: fold_id for drug in profile.index[val_idx].values})

    # Common drugs: stratify at the sample (sig_id) level.
    splitter = MultilabelStratifiedKFold(n_splits=folds,
                                         shuffle=True,
                                         random_state=seed)
    subset = scored.loc[scored.drug_id.isin(common_drugs)].reset_index(drop=True)
    for fold_id, (_, val_idx) in enumerate(
            splitter.split(subset, subset[targets])):
        sig_fold.update(
            {sig: fold_id for sig in subset.sig_id[val_idx].values})

    # Merge both assignments into a single column.
    scored['kfold'] = scored.drug_id.map(drug_fold)
    missing = scored.kfold.isna()
    scored.loc[missing, 'kfold'] = scored.loc[missing, 'sig_id'].map(sig_fold)
    scored.kfold = scored.kfold.astype('int8')
    return scored
def make_folds(train, num_starts, num_splits):
    """Add one fold-assignment column per seed ('seed0'..'seed{num_starts-1}')
    to a copy of `train` and return it.

    Folds are drug-aware: drugs occurring <=6, exactly 12 or exactly 18
    times are stratified at the drug level, the rest per sample.

    NOTE(review): fold arrays are computed on the trt_cp subset of the
    scored targets; this assumes `train` has the same rows in the same
    order -- confirm against the caller.
    """
    train_ = train.copy()
    folds = []

    # LOAD FILES
    train_feats = pd.read_csv('../input/lish-moa/train_features.csv')
    scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
    drug = pd.read_csv('/kaggle/input/lish-moa/train_drug.csv')
    scored = scored.loc[train_feats['cp_type'] == 'trt_cp', :]
    drug = drug.loc[train_feats['cp_type'] == 'trt_cp', :]
    targets = scored.columns[1:]
    scored = scored.merge(drug, on='sig_id', how='left')

    # LOCATE DRUGS
    vc = scored.drug_id.value_counts()
    vc1 = vc.loc[(vc <= 6) | (vc == 12) | (vc == 18)].index.sort_values()
    vc2 = vc.loc[(vc > 6) & (vc != 12) & (vc != 18)].index.sort_values()

    for seed in range(num_starts):

        # STRATIFY RARE DRUGS (drug level)
        dct1 = {}
        dct2 = {}
        skf = MultilabelStratifiedKFold(n_splits=num_splits,
                                        shuffle=True,
                                        random_state=seed)
        tmp = scored.groupby('drug_id')[targets].mean().loc[vc1]
        for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
            dd = {k: fold for k in tmp.index[idxV].values}
            dct1.update(dd)

        # STRATIFY COMMON DRUGS (sample level)
        skf = MultilabelStratifiedKFold(n_splits=num_splits,
                                        shuffle=True,
                                        random_state=seed)
        tmp = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop=True)
        for fold, (idxT, idxV) in enumerate(skf.split(tmp, tmp[targets])):
            dd = {k: fold for k in tmp.sig_id[idxV].values}
            dct2.update(dd)

        # ASSIGN FOLDS
        scored['fold'] = scored.drug_id.map(dct1)
        scored.loc[scored.fold.isna(), 'fold'] = scored.loc[scored.fold.isna(),
                                                            'sig_id'].map(dct2)
        scored.fold = scored.fold.astype('int8')
        folds.append(scored.fold.values)

        del scored['fold']

    # BUG FIX: this loop previously ran inside the seed loop, rewriting every
    # previously-assigned column on each iteration (quadratic work for the
    # same final result). Assign each seed's folds exactly once.
    for i in range(len(folds)):
        train_[f'seed{i}'] = folds[i]

    return train_
Ejemplo n.º 7
0
def create_cv(X, y, drugs, sig_ids, threshold=1000, folds=5, seed=42):
    """Build drug-aware CV (train_indices, val_indices) pairs.

    Drugs occurring at most `threshold` times are stratified at the drug
    level (mean label profile); more frequent drugs are stratified per
    row. Returns a list of `folds` positional index pairs.
    """
    seed_everything(seed)

    label_cols = y.columns.tolist()
    X = X.copy()
    y = y.copy()

    X = pd.concat([X, y], axis=1)
    X['drug_id'] = drugs
    X['sig_id'] = sig_ids

    # Partition drugs by how often they occur.
    counts = X['drug_id'].value_counts()
    rare = counts.loc[counts <= threshold].index.sort_values()
    frequent = counts.loc[counts > threshold].index.sort_values()

    fold_of_drug = {}
    fold_of_sig = {}

    # Rare drugs: stratify the per-drug mean label profile.
    splitter = MultilabelStratifiedKFold(n_splits=folds,
                                         shuffle=True,
                                         random_state=seed)
    profile = X.groupby('drug_id')[label_cols].mean().loc[rare]
    for fold_id, (_, val_idx) in enumerate(
            splitter.split(profile, profile[label_cols])):
        fold_of_drug.update(
            {d: fold_id for d in profile.index[val_idx].values})

    # Frequent drugs: stratify individual rows.
    splitter = MultilabelStratifiedKFold(n_splits=folds,
                                         shuffle=True,
                                         random_state=seed)
    rows = X.loc[X['drug_id'].isin(frequent)].reset_index(drop=True)
    for fold_id, (_, val_idx) in enumerate(
            splitter.split(rows, rows[label_cols])):
        fold_of_sig.update({s: fold_id for s in rows.sig_id[val_idx].values})

    # Merge both assignments into one fold column.
    X['fold'] = X['drug_id'].map(fold_of_drug)
    missing = X['fold'].isna()
    X.loc[missing, 'fold'] = X.loc[missing, 'sig_id'].map(fold_of_sig)
    X['fold'] = X['fold'].astype('int8')

    assignment = X['fold'].values

    # Convert fold labels into (train_indices, val_indices) pairs.
    pairs = []
    for fold_id in np.arange(folds):
        tr = np.where(assignment != fold_id)[0]
        va = np.where(assignment == fold_id)[0]
        pairs.append((tr, va))
    return pairs
Ejemplo n.º 8
0
def make_folds(drug,
               scored,
               folds,
               random_state,
               stratify=True,
               drug_thresh=18):
    """Return a dataframe mapping each sig_id to a CV fold.

    Drugs occurring at most `drug_thresh` times are assigned folds as
    whole drugs (split on their mean target profile); the rest are
    assigned per sample. `stratify` selects MultilabelStratifiedKFold
    vs. plain KFold.
    """
    target_cols = scored.columns[1:]
    scored = scored.merge(drug, on='sig_id', how='left')

    # Split drug ids by occurrence count.
    counts = scored.drug_id.value_counts()
    rare = counts.loc[counts <= drug_thresh].index.sort_values()
    frequent = counts.loc[counts > drug_thresh].index.sort_values()

    def _new_splitter():
        # Fresh splitter per pass so both passes start from the same seed.
        if stratify:
            return MultilabelStratifiedKFold(n_splits=folds,
                                             shuffle=True,
                                             random_state=random_state)
        return KFold(n_splits=folds, shuffle=True, random_state=random_state)

    fold_of_drug = {}
    fold_of_sig = {}

    # Rare drugs: split per-drug mean target profiles.
    profile = scored.groupby('drug_id')[target_cols].mean().loc[rare]
    for fold_id, (_, val_idx) in enumerate(
            _new_splitter().split(profile, profile[target_cols])):
        fold_of_drug.update(
            {d: fold_id for d in profile.index[val_idx].values})

    # Frequent drugs: split individual samples.
    rows = scored.loc[scored.drug_id.isin(frequent)].reset_index(drop=True)
    for fold_id, (_, val_idx) in enumerate(
            _new_splitter().split(rows, rows[target_cols])):
        fold_of_sig.update({s: fold_id for s in rows.sig_id[val_idx].values})

    # Combine: drug-level assignment first, then fill per sample.
    scored['fold'] = scored.drug_id.map(fold_of_drug)
    missing = scored.fold.isna()
    scored.loc[missing, 'fold'] = scored.loc[missing, 'sig_id'].map(fold_of_sig)
    scored.fold = scored.fold.astype('int8')

    return scored[['sig_id', 'fold']].copy()
Ejemplo n.º 9
0
def split_data(x, y, n_splits):
    """Yield ((x_train, y_train), (x_valid, y_valid)) for each fold of a
    multilabel-stratified split (seeded with the module-level SEED)."""
    splitter = MultilabelStratifiedKFold(n_splits=n_splits,
                                         random_state=SEED,
                                         shuffle=True)
    for tr_idx, va_idx in splitter.split(x, y):
        train_pair = (x.iloc[tr_idx], y.iloc[tr_idx])
        valid_pair = (x.iloc[va_idx], y.iloc[va_idx])
        yield train_pair, valid_pair
Ejemplo n.º 10
0
def split_data(config, df):
    """Split data into training and validation folds at the patient level.

    Patients (not rows) are stratified with MultilabelStratifiedKFold on
    three patient-level attributes: whether any of the patient's rows is
    positive, the sex of the patient's first row, and the decile of the
    patient's row count. All rows of a patient land in the same fold.

    Args:
        config: CFG with `n_folds` and `seed` attributes.
        df: dataframe with 'patient_id', 'target' and 'sex' columns.

    Returns: df: dataframe object with an added integer 'fold' column
    """

    df['fold'] = -1

    # MultilabelStratifiedKFold
    mskf = MultilabelStratifiedKFold(n_splits=config.n_folds,
                                     random_state=config.seed,
                                     shuffle=True)

    # patient level - target, sex, size
    # Build one row per patient: any-positive flag, then sex, then row count.
    patient = df.groupby("patient_id")['target'].apply(lambda v:
                                                       (v == 1).any())
    patient = pd.concat(
        [patient,
         df.groupby('patient_id')['sex'].apply(lambda v: v.iloc[0])],
        axis=1)
    # groupby().size() yields a column named 0; rename it to "size".
    patient = pd.concat([patient, df.groupby("patient_id").size()],
                        axis=1).rename({0: "size"}, axis=1)
    # Bucket row counts into deciles so size acts like a categorical label.
    patient['size'] = pd.qcut(patient['size'], 10, labels=range(10))

    for fold, (tr_idx,
               vl_idx) in enumerate(mskf.split(X=patient, y=patient.values)):
        # Translate patient-level fold membership back to row indices.
        vl_idx = df[df['patient_id'].isin(patient.iloc[vl_idx].index)].index
        df.loc[vl_idx, "fold"] = fold

    return df
Ejemplo n.º 11
0
def main(cfg):
    """Train the configured folds of a multilabel-stratified split of
    train.csv, optionally augmented with the external HPAv18 data."""
    data_dir = './input/'
    dataset = pd.read_csv(data_dir+'train.csv')
    dataset['Id'] = data_dir + 'train/' + dataset['Id']
    dataset['suffix'] = '.png'
    if cfg['use_external_data']:
        ex_data = pd.read_csv(data_dir+'HPAv18RBGY_wodpl.csv')
        ex_data['Id'] = data_dir + 'HPAv18_512/' + ex_data['Id']
        ex_data['suffix'] = '.jpg'
        # BUG FIX: DataFrame.append was removed in pandas 2.0; pd.concat with
        # ignore_index=True matches append(...).reset_index(drop=True).
        dataset = pd.concat([dataset, ex_data], ignore_index=True)

    print(cfg)

    # Multi-hot target matrix over the 28 protein classes.
    target = np.zeros((len(dataset), 28))
    for i, labels in enumerate(dataset['Target']):
        labels = [int(t) for t in labels.split() if t != '']
        for l in labels:
            target[i, l] = 1
    folds = MultilabelStratifiedKFold(cfg['nfold'], shuffle=True, random_state=66666)

    for n_fold, (tr_idx, val_idx) in enumerate(folds.split(dataset['Id'], target)):
        print(val_idx)

        if n_fold not in cfg['fold']:
            continue

        print(n_fold, '-----------------')
        tr_data = dataset.iloc[tr_idx]
        val_data = dataset.iloc[val_idx]

        train(n_fold, tr_data, val_data, cfg)
Ejemplo n.º 12
0
def train(model, device, X, Y, n_splits=10, batch_size=4096, epochs=50):
    """Train `model` across multilabel-stratified folds of (X, Y).

    Returns the concatenated per-epoch (train_losses, val_losses) across
    all folds; also saves the model under the name "latest_model".
    """
    kfold = MultilabelStratifiedKFold(n_splits=n_splits,
                                      random_state=42,
                                      shuffle=True)

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters())

    train_losses = np.array([])
    val_losses = np.array([])

    for n, (tr, te) in enumerate(kfold.split(X, Y)):
        X_train, X_val = X[tr], X[te]
        y_train, y_val = Y[tr], Y[te]

        train_dataset = MoaDataset(X_train, y_train)
        val_dataset = MoaDataset(X_val, y_val)
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True)
        val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=False)
        split_train_losses, split_val_losses = batch_gd(
            model, device, criterion, optimizer, train_loader, val_loader,
            epochs)
        # BUG FIX: the second value is the validation loss; the message
        # previously said "final train loss" twice.
        print(
            f"Fold {n+1}, final train loss: {split_train_losses[epochs-1]:5.5f}, final val loss: {split_val_losses[epochs-1]:5.5f}"
        )
        train_losses = np.concatenate((train_losses, split_train_losses))
        val_losses = np.concatenate((val_losses, split_val_losses))

    model.save("latest_model")
    return train_losses, val_losses
Ejemplo n.º 13
0
def main(cfg):
    """Train the configured folds of a multilabel-stratified split of the
    training set, optionally augmented with the external HPAv18 data."""
    dataset = pd.read_csv(data_dir + 'train.csv')
    dataset['Id'] = data_dir + 'img_data/' + dataset['Id']
    dataset['suffix'] = '.png'
    if cfg['use_external_data']:
        ex_data = pd.read_csv(data_dir + 'HPAv18RBGY_wodpl.csv')
        if cfg['gray']:
            ex_data['Id'] = data_dir + 'HPAv18_gray/' + ex_data['Id']
        else:
            ex_data['Id'] = data_dir + 'HPAv18/' + ex_data['Id']
        ex_data['suffix'] = '.jpg'
        # BUG FIX: DataFrame.append was removed in pandas 2.0; pd.concat with
        # ignore_index=True matches append(...).reset_index(drop=True).
        dataset = pd.concat([dataset, ex_data], ignore_index=True)

    print(cfg)
    target = get_label(dataset)

    folds = MultilabelStratifiedKFold(cfg['nfold'],
                                      shuffle=True,
                                      random_state=66666)

    for n_fold, (tr_idx,
                 val_idx) in enumerate(folds.split(dataset['Id'], target)):
        print(val_idx)

        if n_fold not in cfg['fold']:
            continue

        print(n_fold, '-----------------')
        tr_data = dataset.iloc[tr_idx]
        val_data = dataset.iloc[val_idx]

        train(n_fold, tr_data, val_data, cfg)
Ejemplo n.º 14
0
def run_train(features, targets, seed):
    """Run NUM_FOLD-fold multilabel-stratified CV training of MoaModel.

    Args:
        features: feature column names in the cached training dataframe.
        targets: target column names.
        seed: random seed for the fold split, also forwarded to fit.

    Side effects:
        Logs fold progress and the out-of-fold metric, and writes the
        out-of-fold predictions to '{VERSION}_oof_seed{seed}.csv'.
    """
    data = pd.read_pickle('./cached/preprocessing_train.pkl')

    # Out-of-fold prediction matrix, filled one validation split at a time.
    oof = np.zeros(data[targets].shape)
    mskf = MultilabelStratifiedKFold(n_splits=NUM_FOLD,
                                     shuffle=True,
                                     random_state=seed)
    for n_fold, (train_idx, valid_idx) in enumerate(
            mskf.split(data[features], data[targets])):
        logger.info('>' * 5 + f" Fold {n_fold+1}")
        logger.info(
            f"train_size: {len(train_idx)}  valid_size: {len(valid_idx)}")
        # Split train and valid.
        X_train, y_train = data.iloc[train_idx][features], data.iloc[
            train_idx][targets]
        X_valid, y_valid = data.iloc[valid_idx][features], data.iloc[
            valid_idx][targets]
        # Fit model and cv pred.
        with timer(f'fold {n_fold+1} train time'):
            estimater = MoaModel(len(features), len(targets)).to(device)
            model = MoaModelTrainer(estimater)
            oof[valid_idx] = model.fit(X_train, y_train, X_valid, y_valid,
                                       n_fold, seed)
    # Evaluation.
    oof_metric = moa_loss(oof, data[targets].to_numpy())
    logger.info(f'\nMetric of oof: {oof_metric}\n')
    # Export oof dataframe.
    oof_df = data[['sig_id']].reset_index(drop=True)
    oof_df = oof_df.join(
        pd.DataFrame(oof, columns=targets).add_prefix('pred_'))
    oof_df.to_csv(f'{VERSION}_oof_seed{seed}.csv', index=False)
Ejemplo n.º 15
0
def make_folds(train_features, train_targets_scored, n_folds, seed):
    """Return train_features with an added 'fold' column.

    Treated rows (cp_type != 'ctl_vehicle') receive folds 0..n_folds-1
    from a multilabel-stratified split over the scored targets; control
    rows receive the sentinel fold 100.
    """
    train = train_features.merge(train_targets_scored, on='sig_id')
    target_cols = [
        c for c in train_targets_scored.columns if c not in ['sig_id']
    ]
    # (dropped unused local `cols = target_cols + ['cp_type']`)

    train_cp = train[train['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)

    mskf = MultilabelStratifiedKFold(n_splits=n_folds,
                                     shuffle=True,
                                     random_state=seed)
    train_cp.loc[:, 'fold'] = 0
    for n, (train_index,
            val_index) in enumerate(mskf.split(train_cp,
                                               train_cp[target_cols])):
        train_cp.loc[val_index, 'fold'] = int(n)
    train_cp['fold'] = train_cp['fold'].astype(int)

    # Control rows are never used for CV; tag them with a sentinel fold.
    train_ctl = train[train['cp_type'] == 'ctl_vehicle'].reset_index(drop=True)
    train_ctl.loc[:, 'fold'] = 100

    train = pd.concat([train_cp, train_ctl])
    train_features_with_fold = train_features.merge(train[['sig_id', 'fold']],
                                                    on='sig_id')
    return train_features_with_fold
Ejemplo n.º 16
0
def main():
    """Build a multilabel-stratified k-fold assignment for the cloud
    segmentation training data and save it as a CSV next to the input."""
    args = make_parse()
    df = pd.read_csv(args.train_df)

    # 'Image_Label' is "<image>_<label>"; split it into its two parts.
    df['Label'] = df.Image_Label.map(lambda x: x.split('_')[1])
    df['ImageId'] = df.Image_Label.map(lambda x: x.split('_')[0])

    # One row per image, one column per label holding its EncodedPixels.
    labels = ['Fish', 'Flower', 'Gravel', 'Sugar']
    df_new = pd.DataFrame({'ImageId': df.ImageId.unique()})
    df_new['kfold'] = -1

    for l in labels:
        df_tmp = (df[df.Label == l].drop(
            columns=['Image_Label', 'Label']).rename(columns={
                'EncodedPixels': l
            }).reset_index(drop=True))
        df_new = df_new.merge(df_tmp, on='ImageId')

    # One-hot presence matrix used for stratification.
    df_new2 = df_new.copy()
    for l in labels:
        df_new2[l] = (~pd.isna(df_new2[l])).astype('int')
    y = df_new2.iloc[:, 2:].values

    # Make kfolds
    # BUG FIX: sklearn-style splitters raise ValueError when random_state is
    # set without shuffle=True; enable shuffling so the seed takes effect.
    indxs = list(range(len(df_new2)))
    mskf = MultilabelStratifiedKFold(n_splits=args.kfold,
                                     shuffle=True,
                                     random_state=42)
    for i, (train_index, test_index) in enumerate(mskf.split(indxs, y)):
        df_new.loc[test_index, 'kfold'] = i + 1

    new_path = Path(args.train_df).parent / f'train_{args.kfold}kfold.csv'
    df_new.to_csv(new_path, index=False)
Ejemplo n.º 17
0
def _k_fold(df: pd.DataFrame, n_splits: int, random_state: Any = 42):
    """Return a MultilabelStratifiedKFold split iterator over `df`.

    `df` must have 'Id' and 'Target' columns; targets are parsed into a
    multilabel matrix via HumanProteinDataset.parse_target.
    """
    X = np.array(df.Id)
    y = np.array(
        [HumanProteinDataset.parse_target(target) for target in df.Target])
    # BUG FIX: sklearn-style splitters raise ValueError when random_state is
    # set without shuffle=True; enable shuffling so the seed takes effect.
    mskf = MultilabelStratifiedKFold(n_splits=n_splits,
                                     shuffle=True,
                                     random_state=random_state)
    return mskf.split(X, y)
Ejemplo n.º 18
0
    def get_pids(self):
        """Return the patient ids for this dataset's split (train or val).

        Builds a 15-class multi-hot label vector per patient from the
        bbox class column, then selects fold `self.cfgs["fold"]` of a
        5-fold multilabel-stratified split.
        """
        # FIXME: This is based on label distribution
        # MultilabelStratified KFold
        xs = np.array(list(self.meta_dict.keys()))
        ys = []
        for v in self.meta_dict.values():
            temp_lbl = np.array(v["bbox"])[:, 2]  # third bbox field: class id
            temp = np.zeros((15))
            for i in temp_lbl:
                temp[int(i)] = 1
            ys.append(temp)
        ys = np.array(ys)

        mskf = MultilabelStratifiedKFold(n_splits=5,
                                         shuffle=True,
                                         random_state=30)
        kfold_generator = mskf.split(xs, ys)

        # Advance the generator to the configured fold.
        for _ in range(self.cfgs["fold"] + 1):
            train_index, val_index = next(kfold_generator)

        if self.mode == "train":
            pids = xs[train_index]
        elif self.mode == "val":
            pids = xs[val_index]
        else:
            # BUG FIX: previously fell through with `pids` unbound and
            # raised a confusing UnboundLocalError.
            raise ValueError(f"unknown mode: {self.mode!r}")

        return pids
Ejemplo n.º 19
0
def split_stratified(all_examples_dict):
    """Split examples into 11 multilabel-stratified folds.

    Args:
        all_examples_dict: mapping of example key -> iterable of label
            indices in 0..27.

    Returns:
        (examples, folds): `examples` is a list of (key, labels) pairs
        in dict order; `folds` is a list of pairwise-disjoint index
        arrays into that list.
    """
    examples = []
    y_list = []

    # Build a 28-way multi-hot vector per example.
    for key, labels in all_examples_dict.items():
        labels = list(labels)
        np_labels = np.zeros((28,), dtype=int)
        np_labels[np.array(labels)] = 1
        examples.append((key, labels))
        y_list.append(np_labels)

    X = np.arange(len(y_list))
    y = np.array(y_list)

    # test_val
    # BUG FIX: sklearn-style splitters raise ValueError when random_state is
    # set without shuffle=True; enable shuffling so the seed takes effect.
    mskf = MultilabelStratifiedKFold(n_splits=11, shuffle=True,
                                     random_state=1234)
    folds = [test_index for _, test_index in mskf.split(X, y)]

    # Sanity check: validation folds must be pairwise disjoint.
    for a, b in combinations(folds, 2):
        assert len(set(a) & set(b)) == 0
    return examples, folds
 def split_df(self, df):
     """Add one indicator column per fold: fold_<i> is 1 on that fold's
     validation rows, 0 elsewhere.

     Assumes columns 1..n_classes of `df` are the one-hot class labels
     used for stratification.
     """
     kf = MultilabelStratifiedKFold(n_splits=self.config.fold_num, shuffle=True, random_state=self.config.seed)
     # BUG FIX: the label slice previously referenced a global `train_df`
     # (NameError unless such a global exists); use the dataframe being
     # split instead.
     for fold, (train_idx, val_idx) in enumerate(kf.split(df,
                                                          df.iloc[:, 1:df['class_id'].nunique() + 1])):
         df[f'fold_{fold}'] = 0
         df.loc[val_idx, f'fold_{fold}'] = 1
     return df
def ensemble_training(k_folds=5):
    """Train one model per fold of a k-fold multilabel-stratified split.

    Reads the training CSV (Id / Target columns), builds a multi-hot
    label vector per image, and calls `training(...)` once per fold.
    """
    # Setup: append to the per-model training log.
    log = Logger()
    log.open(os.path.join(config.logs_dir, "%s_log_train.txt" % config.model_name), mode="a")

    # load dataset
    all_files = pd.read_csv(config.train_csv)

    image_names = all_files['Id']
    labels_strs = all_files['Target']
    image_labels = []
    for cur_label_str in labels_strs:
        # BUG FIX: the np.float alias was removed in NumPy 1.24; use the
        # builtin float (same dtype, float64).
        cur_label = np.eye(config.num_classes, dtype=float)[np.array(list(map(int, cur_label_str.split(' '))))].sum(
            axis=0)
        image_labels.append(cur_label)
    image_labels = np.stack(image_labels, axis=0)

    msss = MultilabelStratifiedKFold(n_splits=k_folds)
    # enumerate() replaces the manual fold counter.
    for i, (train_index, val_index) in enumerate(msss.split(image_names, image_labels)):
        model = get_net()
        model.cuda()
        train_image_names = image_names[train_index]
        train_image_labels = image_labels[train_index]
        val_image_names = image_names[val_index]
        val_image_labels = image_labels[val_index]
        training(model, i, log, train_image_names, train_image_labels, val_image_names, val_image_labels)
def main():
    """Write per-fold train/val annotation splits for mosaic and tile data.

    Mosaics are stratified on (source index, median bbox area, bbox
    count); validation-only sources are excluded before splitting. Each
    fold's splits are saved under <output_root>/<fold index>/.
    """
    args = parse_args()
    os.makedirs(args.output_root, exist_ok=True)
    tile_annotations = pd.read_csv(args.annotation_path)
    mosaic_annotations = pd.read_csv(args.mosaic_path, converters={"bbox": ast.literal_eval})

    # Per-mosaic statistics used as stratification targets.
    id_counts = mosaic_annotations["image_id"].value_counts()
    mosaic_annotations["num_of_bboxes"] = mosaic_annotations["image_id"].map(id_counts)
    mosaic_annotations["median_area"] = mosaic_annotations["bbox"].apply(lambda x: np.sqrt(x[-1] * x[-2]))
    mosaic_annotations["source_index"] = mosaic_annotations["source"].apply(lambda x: SOURCES.index(x))

    images = (
        mosaic_annotations[["image_id", "source_index", "median_area", "num_of_bboxes", "source"]]
        .copy()
        .drop_duplicates("image_id")
    )
    # Hold back the validation-only sources entirely.
    images = images[~images["source"].isin(VAL_SOURCES)]

    splitter = MultilabelStratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=3)
    strat_targets = images[["source_index", "median_area", "num_of_bboxes"]]
    for fold_idx, (_, test_index) in enumerate(splitter.split(images, strat_targets)):
        mosaic_val_ids = images.iloc[test_index, images.columns.get_loc("image_id")]
        # Mosaic ids are underscore-joined tile ids; flatten them back out.
        tile_val_ids = [tid for mid in mosaic_val_ids for tid in mid.split("_")]

        fold_root = osp.join(args.output_root, str(fold_idx))

        save_split(mosaic_annotations, mosaic_val_ids, fold_root, prefix="mosaic")
        save_split(tile_annotations, tile_val_ids, fold_root, prefix="tile")
Ejemplo n.º 23
0
def make_folds(folds=5, random_state=0, stratify=True, scored=None):
    """Return a dataframe mapping each sig_id to a CV fold.

    Drug-aware scheme: drugs occurring 18 times or fewer are assigned
    folds as whole drugs (split on their mean target profile); more
    frequent drugs are assigned folds per sample. `stratify` selects
    MultilabelStratifiedKFold vs. plain KFold.
    """
    drug = pd.read_csv("../input/lish-moa/train_drug.csv")
    if scored is None:
        scored = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
    label_cols = scored.columns[1:]
    scored = scored.merge(drug, on="sig_id", how="left")

    # Partition drug ids by frequency.
    counts = scored.drug_id.value_counts()
    rare_ids = counts.loc[counts <= 18].index.sort_values()
    common_ids = counts.loc[counts > 18].index.sort_values()

    fold_by_drug = {}
    fold_by_sig = {}

    # Pass 1: rare drugs, split on per-drug mean target profiles.
    if stratify:
        splitter = MultilabelStratifiedKFold(n_splits=folds,
                                             shuffle=True,
                                             random_state=random_state)
    else:
        splitter = KFold(n_splits=folds, shuffle=True,
                         random_state=random_state)
    profile = scored.groupby("drug_id")[label_cols].mean().loc[rare_ids]
    for fold_id, (_, val_idx) in enumerate(
            splitter.split(profile, profile[label_cols])):
        fold_by_drug.update(
            {d: fold_id for d in profile.index[val_idx].values})

    # Pass 2: common drugs, split per sample.
    if stratify:
        splitter = MultilabelStratifiedKFold(n_splits=folds,
                                             shuffle=True,
                                             random_state=random_state)
    else:
        splitter = KFold(n_splits=folds, shuffle=True,
                         random_state=random_state)
    samples = scored.loc[scored.drug_id.isin(common_ids)].reset_index(drop=True)
    for fold_id, (_, val_idx) in enumerate(
            splitter.split(samples, samples[label_cols])):
        fold_by_sig.update(
            {s: fold_id for s in samples.sig_id[val_idx].values})

    # Combine: drug-level assignment first, sample-level fills the rest.
    scored["fold"] = scored.drug_id.map(fold_by_drug)
    unassigned = scored.fold.isna()
    scored.loc[unassigned, "fold"] = scored.loc[unassigned, "sig_id"].map(fold_by_sig)
    scored.fold = scored.fold.astype("int8")

    return scored[["sig_id", "fold"]].copy()
Ejemplo n.º 24
0
def split_data():
    """Create input/folds/train_folds.csv with a drug-aware 'kfold'
    column (5 folds, seed 42); skip if the file already exists."""
    SEED, FOLDS = 42, 5

    print("Split data")
    path_fold = "input/folds/train_folds.csv"

    # Guard clause: nothing to do if the fold file is already there.
    if exists(path_fold):
        print("Skipped: already exists")
        return

    scored = pd.read_csv("input/lish-moa/train_targets_scored.csv")
    drug = pd.read_csv("input/lish-moa/train_drug.csv")
    label_cols = scored.columns[1:]
    scored = scored.merge(drug, on="sig_id", how="left")

    # Drugs seen 18 times or fewer vs. more than 18 times.
    counts = scored.drug_id.value_counts()
    rare = counts.loc[counts <= 18].index.sort_values()
    common = counts.loc[counts > 18].index.sort_values()

    by_drug, by_sig = {}, {}

    # Rare drugs: stratify their mean target profiles (drug level).
    skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    profile = scored.groupby("drug_id")[label_cols].mean().loc[rare]
    for fold_id, (_, val_idx) in enumerate(skf.split(profile, profile[label_cols])):
        by_drug.update({d: fold_id for d in profile.index[val_idx].values})

    # Common drugs: stratify individual samples.
    skf = MultilabelStratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    rows = scored.loc[scored.drug_id.isin(common)].reset_index(drop=True)
    for fold_id, (_, val_idx) in enumerate(skf.split(rows, rows[label_cols])):
        by_sig.update({s: fold_id for s in rows.sig_id[val_idx].values})

    # Merge assignments, drop the helper column, and save.
    scored["kfold"] = scored.drug_id.map(by_drug)
    missing = scored.kfold.isna()
    scored.loc[missing, "kfold"] = scored.loc[missing, "sig_id"].map(by_sig)
    scored.kfold = scored.kfold.astype("int8")

    scored.drop("drug_id", axis=1, inplace=True)
    scored.to_csv(path_fold, index=False)
    print(f"Created: {path_fold}")
Ejemplo n.º 25
0
def generate_k_folds(k_folds, labels):
    """Split *labels* into *k_folds* multilabel-stratified folds and pickle
    each ``(train_index, val_index)`` pair to ``<fold>.pkl`` in the CWD.

    Only the labels drive the stratification, so a zero vector stands in for
    the feature matrix expected by ``split``.
    """
    placeholder_x = np.zeros(labels.shape[0])
    msss = MultilabelStratifiedKFold(n_splits=k_folds)
    # enumerate replaces the original manual `i = 0; i += 1` counter.
    for i, (train_index, val_index) in enumerate(msss.split(placeholder_x, labels)):
        with open('%d.pkl' % i, 'wb') as f:
            pickle.dump((train_index, val_index), f)
Ejemplo n.º 26
0
def multilabel_stratified_K_fold(dataset,
                                 n_folds=5,
                                 shuffle=False,
                                 random_state=42):
    """Return multilabel-stratified K-fold splits of *dataset*.

    Returns a list of ``(train_indices, valid_indices)`` pairs.  The labels
    come from ``dataset['Target']`` via ``target_to_numpy``.

    NOTE: ``random_state`` is only forwarded when ``shuffle`` is True —
    recent sklearn-based splitters raise ``ValueError`` when a random_state
    is set while shuffle is False, which the original defaults triggered.
    """
    y = target_to_numpy(dataset['Target'])
    mskf = MultilabelStratifiedKFold(
        n_splits=n_folds,
        shuffle=shuffle,
        random_state=random_state if shuffle else None)
    return list(mskf.split(X=dataset, y=y))
def _mls_enhanced_full_kfold_dfs():
    """Yield ``(train_df, val_df)`` pairs for a 5-fold multilabel-stratified
    split of the enhanced full training dataframe (fixed seed 0)."""
    full_df = get_enhanced_full_train_df()
    labels = multilabel_binary_representation(full_df, sparse=False)

    splitter = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for tr_idx, va_idx in splitter.split(full_df.index.values, labels):
        yield full_df.iloc[tr_idx], full_df.iloc[va_idx]
Ejemplo n.º 28
0
    def train_test_split(self, train_portion, batch_size=32):
        """Split the labelled data into train and validation generators.

        The fold count is derived from *train_portion* so that one held-out
        fold holds roughly ``1 - train_portion`` of the samples; only the
        first fold of the splitter is consumed.
        """
        mskf = MultilabelStratifiedKFold(n_splits=int(1 / (1 - train_portion)))
        train_set, val_set = mskf.split(X=self.labels,
                                        y=self.labels).__next__()

        train_generator = TrainGenerator(self, train_set, batch_size)
        # BUG FIX: the validation generator was previously built from
        # `train_set`, so validation metrics were computed on training data.
        val_generator = TrainGenerator(self, val_set, batch_size)

        return train_generator, val_generator
Ejemplo n.º 29
0
def kfold(
    n_splits: int,
    annotations: Annotations,
) -> t.List[t.Tuple[Annotations, Annotations]]:
    """Return *n_splits* multilabel-stratified ``(train, test)`` annotation
    pairs, stratified on a 3474-class multi-hot encoding of *annotations*."""
    multi_hot = to_multi_hot(annotations, size=3474)
    indices = range(len(multi_hot))
    # shuffle=True is required for random_state to take effect; recent
    # sklearn-based splitters raise ValueError when random_state is set
    # while shuffle is False (the original call hit exactly that).
    mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    return [([annotations[i] for i in train], [annotations[i] for i in test])
            for train, test in mskf.split(indices, multi_hot)]
Ejemplo n.º 30
0
def run_train(seed):
    '''Train one CV model per fold and dump the out-of-fold predictions.

    seed: random seed for the MultilabelStratifiedKFold shuffle and part of
        the per-fold checkpoint filename.

    Side effects: writes one model checkpoint per fold into CACHE_DIR, logs
    the OOF metric, and saves the OOF array to CACHE_DIR/oof_seed{seed}.npy.
    '''
    # X is already cut `ctl_vehicle` data.
    train = pd.read_pickle(f'{CACHE_DIR}/train_features.pkl')
    targets = pd.read_pickle(f'{CACHE_DIR}/train_targets.pkl')
    features = train.columns.difference(drop_cols).tolist()
    # Count of gene-expression columns (name prefix "g-"); fed to the network.
    len_g_features = train.columns.str.contains('^g-').sum()

    # Remember the original train size so pseudo-labelled rows appended below
    # can be excluded from evaluation at the end.
    y_train_size = targets.shape[0]

    if USE_PSEUDO:
        # Append pseudo-labelled test rows: binarize soft predictions at 0.5
        # and drop control-vehicle rows.
        X_test = pd.read_pickle(f'{EXTEND_DIR}/v04002_test_features.pkl')
        y_test = pd.read_csv(f'{EXTEND_DIR}/v04002_test_targets.csv')
        y_test.iloc[:, 1:] = (y_test.iloc[:, 1:] > 0.5).astype(int)

        is_ctl_vehicle = (X_test['cp_type'] == 'ctl_vehicle')
        X_test = X_test[~is_ctl_vehicle]
        y_test = y_test[~is_ctl_vehicle]

        train = pd.concat([train, X_test], axis=0).reset_index(drop=True)
        targets = pd.concat([targets, y_test], axis=0).reset_index(drop=True)

    # First targets column is dropped — presumably an id column (sig_id);
    # TODO confirm against the pickle's schema.
    X, y = train[features], targets.iloc[:, 1:]
    oof = np.zeros(y.shape)

    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
    mskf = MultilabelStratifiedKFold(n_splits=NUM_FOLD,
                                     shuffle=True,
                                     random_state=seed)
    for n_fold, (train_idx, valid_idx) in enumerate(mskf.split(X, y)):
        logger.info('>' * 5 + f" Fold {n_fold+1} / {NUM_FOLD}")
        logger.info(
            f"train_size: {len(train_idx)}  valid_size: {len(valid_idx)}")
        # Split train and valid.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
        # Fit model and cv pred.
        with timer(f'fold {n_fold+1} train time'):
            # Full training (50 epochs) only on Kaggle; 5 epochs for quick
            # local smoke runs.
            num_epoch = 50 if os.path.exists('/kaggle/input') else 5
            modelpath = f"{CACHE_DIR}/{VERSION}_fold{n_fold}_seed{seed}.pth"
            model = MoaModel(
                CustomNet1D(X.shape[1], len_g_features, y.shape[1]).to(device))
            oof[valid_idx] = model.fit(X_train,
                                       y_train,
                                       X_valid,
                                       y_valid,
                                       modelpath,
                                       num_epoch=num_epoch)
    oof = post_processing(train, oof)
    # Truncate back to the original rows so pseudo labels don't affect the score.
    oof = oof[:y_train_size]
    y_np = y.iloc[:y_train_size].to_numpy()
    # Evaluation only original train targets, I mean exclude pesudo label.
    logger.info(f'\nMetric of oof: { moa_loss(oof, y_np) }\n')
    # Export oof dataframe.
    np.save(f'{CACHE_DIR}/oof_seed{seed}.npy', oof)