def run(train_x, train_y, test_x, n_seed, n_fold, epoch, debug, verbose, output_bias, activation, plot):
    """Train MLPs with repeated multilabel-stratified K-fold CV and
    accumulate averaged test predictions.

    For each of ``n_seed`` seeds (starting at the arbitrary base 1068) the
    training data is split into ``n_fold`` folds; one ``MLP`` is trained per
    fold and its test predictions are summed into a copy of the module-level
    ``sample_sub`` frame, then averaged over all ``n_fold * n_seed`` models.

    Args:
        train_x, train_y: training features / multilabel targets, indexable
            by integer position (e.g. numpy arrays) — assumed, confirm.
        test_x: test features forwarded to ``model.predict``.
        n_seed: number of seed repetitions of the CV loop.
        n_fold: folds per seed.
        epoch: max epochs per fold.
        debug: if True, shrink data/epochs/folds for a quick smoke run.
        verbose: verbosity forwarded to ``model.fit``.
        output_bias, activation: forwarded to the ``MLP`` constructor.
        plot: if True, call ``plot_loss`` after each fold.

    Returns:
        The predictions frame (copy of ``sample_sub`` with ``target_col``
        columns filled with the model average).

    Relies on module-level names: ``sample_sub``, ``target_col``, ``MLP``,
    ``plot_loss``, ``show_loss``, ``mean``, ``MultilabelStratifiedKFold``.
    """
    start_run = time.time()
    preds = sample_sub[:]  # copy of the sample submission to accumulate into
    val_losses = []  # last-epoch validation loss of every trained fold
    if debug:
        print('run debug mode...')
        # Tiny slice of the data and minimal folds/seeds/epochs for speed.
        train_x = train_x[:10]
        train_y = train_y[:10]
        # test_x = test_x[:10]   
        epoch = 2
        n_fold = 2
        n_seed = 2
        plot = True
        verbose = 1
    # Each repetition gets a distinct, reproducible random_state.
    for seed in range(1068, n_seed + 1068):
        print('###############')
        print('run_seed:', seed)
        print('###############')
        start = time.time()
        mskf = MultilabelStratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)         
        for (i, (train_idx, val_idx)) in enumerate(mskf.split(train_x, train_y), 1):
            # Fresh model per fold; fit on the train split, monitor the val split.
            model = MLP(train_x.shape[1], activation, output_bias)
            history = model.fit(train_x[train_idx],
                                train_y[train_idx], 
                                train_x[val_idx], 
                                train_y[val_idx],
                                epochs=epoch, 
                                batch_size=128,
                                verbose=verbose
                                )
            
            if plot:
                plot_loss(history, i, seed)
            
            # len(history...) may be < epoch if the model early-stops.
            print('finish run fold', i, 'with', len(history.history['loss']), 'epoch')
            print('train_loss of last 5 epoch:', end='')
            show_loss(history.history['loss'][-5:])
            print('val_loss of last 5 epoch  :', end='')
            show_loss(history.history['val_loss'][-5:])
            val_losses.append(history.history['val_loss'][-1])
            
            
            # Sum fold predictions; averaged once after all loops finish.
            pred = model.predict(test_x)
            preds.loc[:, target_col] += pred
        
        elapsed_time = time.time() - start
        print ("time:{0}".format(elapsed_time) + "[sec]")
    
    # Average over every trained model (n_fold folds x n_seed seeds).
    preds.loc[:,target_col] /= n_fold * n_seed
    
    print('-------------------------')
    print('finish train and predict!')
    print('-------------------------')
    elapsed_time = time.time() - start_run
    print ('time: {0} '.format(elapsed_time) + '[sec]')
    print('loss:', end=' ')
    # NOTE: despite the name, this is the midrange (max+min)/2 and pm_loss
    # is the half-range -- not a median or a standard deviation.
    med_loss = (max(val_losses) + min(val_losses)) / 2
    pm_loss = max(val_losses) - med_loss
    print('{:.6f} ± {:.6f} (mean: {:.6f})'.format(med_loss, pm_loss, mean(val_losses)))
    return preds
def split(n_splits=5, seed=42):
    """Create and persist multilabel-stratified train/val splits of cloud images.

    Reads the image/class csv, encodes the class column via the module-level
    ``get_label`` helper, splits image ids with ``MultilabelStratifiedKFold``,
    and saves one ``.npy`` pair (train ids, val ids) per fold under
    ``PATH + '/split/'``.

    Args:
        n_splits: number of folds.
        seed: random_state for the stratified splitter.
    """
    image_class_df = pd.read_csv(
        "/media/jionie/my_disk/Kaggle/Cloud/input/understanding_cloud_organization/image_class.csv"
    )

    image_class_df['class'] = image_class_df['class'].apply(get_label)

    labels_encoded = list(image_class_df['class'].values)

    # BUG FIX: random_state is only honoured (and, under modern
    # scikit-learn, only legal) when shuffle=True; previously the splitter
    # either ignored `seed` or raised ValueError.  shuffle=True also makes
    # this consistent with every other MultilabelStratifiedKFold use here.
    mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True,
                                     random_state=seed)
    splits = mskf.split(image_class_df['id'], labels_encoded)

    for fold_, (tr, val) in enumerate(splits):
        # NOTE(review): both lists tag entries with 'train' -- presumably
        # the source directory of the images; confirm the val tag is
        # intentional before changing it.
        image_train = [[i, 'train'] for i in image_class_df.iloc[tr]['id'].values]
        image_val = [[i, 'train'] for i in image_class_df.iloc[val]['id'].values]

        np.save(PATH + '/split/train_fold_%s_seed_%s.npy' % (fold_, seed),
                image_train)
        np.save(PATH + '/split/val_fold_%s_seed_%s.npy' % (fold_, seed),
                image_val)
# Beispiel #3  (scraped example separator; commented out so the file parses)
# 0
def multilabel_split_by_drugs(scored, n_folds, seed):
    """Drug-aware multilabel-stratified CV split.

    Rows of drugs occurring 18 times or fewer are assigned to folds at the
    drug level (so all rows of one rare drug share a fold); rows of more
    frequent drugs are stratified individually by ``sig_id``.

    Relies on module-level ``drug`` (sig_id -> drug_id frame) and
    ``label_cols``.  Returns ``zip(train_idx, test_idx)`` where each element
    is a list of positional row indices per fold.
    """
    scored = scored.copy().merge(drug, on='sig_id', how='left')

    # Partition drugs by frequency: <= 18 occurrences vs more.
    counts = scored['drug_id'].value_counts()
    rare_drugs = counts.loc[counts <= 18].index
    common_drugs = counts.loc[counts > 18].index

    # Rare drugs: stratify at the drug level on the mean label vector.
    drug_fold = {}
    splitter = MultilabelStratifiedKFold(n_splits=n_folds,
                                         shuffle=True,
                                         random_state=seed)
    per_drug = scored.groupby('drug_id')[label_cols].mean().loc[rare_drugs]
    for fold, (_, held_out) in enumerate(
            splitter.split(per_drug, per_drug[label_cols])):
        drug_fold.update(
            {d: fold for d in per_drug.index[held_out].values})

    # Common drugs: stratify per row, keyed by sig_id.
    sig_fold = {}
    splitter = MultilabelStratifiedKFold(n_splits=n_folds,
                                         shuffle=True,
                                         random_state=seed)
    common_rows = scored.loc[scored['drug_id'].isin(common_drugs)].reset_index(drop=True)
    for fold, (_, held_out) in enumerate(
            splitter.split(common_rows, common_rows[label_cols])):
        sig_fold.update(
            {s: fold for s in common_rows.sig_id[held_out].values})

    # Merge the two assignments into one fold column.
    scored['fold'] = scored['drug_id'].map(drug_fold)
    missing = scored['fold'].isna()
    scored.loc[missing, 'fold'] = scored.loc[missing, 'sig_id'].map(sig_fold)
    scored['fold'] = scored['fold'].astype('int8')

    test_idx = [np.where(scored['fold'] == fold)[0].tolist()
                for fold in range(n_folds)]
    train_idx = [np.where(scored['fold'] != fold)[0].tolist()
                 for fold in range(n_folds)]
    return zip(train_idx, test_idx)
def createfolds(train,number):
    """Return a copy of *train* carrying a 'kfold' column.

    Folds are drawn with MultilabelStratifiedKFold against the
    module-level ``target`` frame; each row's 'kfold' value is the index
    of the fold in which that row is held out for validation.
    """
    folds = train.copy()
    splitter = MultilabelStratifiedKFold(n_splits=number)
    for fold_number, (_, valid_idx) in enumerate(splitter.split(X=train, y=target)):
        folds.loc[valid_idx, 'kfold'] = int(fold_number)
    folds['kfold'] = folds['kfold'].astype(int)
    return folds
# Beispiel #5  (scraped example separator; commented out so the file parses)
# 0
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau, TensorBoard
from keras_callbacks import F1Metric, F1MetricNumber
from keras_metrics import f1, f1_02
from keras_losses import f1_loss
# Training configuration for the image pipeline below.
epochs = [2, 150]  # presumably [warm-up epochs, main epochs] -- confirm against the trainer
batch_size = 16

# split data into train, valid

mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=18)

# Build an (n_samples, 28) multi-hot label matrix from the per-sample
# 'labels' index lists carried in train_dataset_info.
y = np.zeros((len(train_dataset_info), 28))
for i in range(len(train_dataset_info)):
    y[i][train_dataset_info[i]['labels']] = 1
mskf.get_n_splits(train_dataset_info, y)
kf = mskf.split(train_dataset_info, y)
fold_id = 0
# Only the first of the five folds is consumed here.
train_indexes, valid_indexes = next(kf)

# Generators come from the project-local data_generator module; training
# uses augmentation, validation does not (and runs with batch size 1).
train_generator = data_generator.create_train(
    train_dataset_info[train_indexes],
    batch_size, (SIZE, SIZE, 3),
    augument=True,
    heavy_augment_rares=False,
    oversample_factor=0)
validation_generator = data_generator.create_train(
    train_dataset_info[valid_indexes],
    1, (SIZE, SIZE, 3),
    augument=False,
    heavy_augment_rares=False,
    oversample_factor=0)
# NOTE(review): the indented `return` below is an orphaned fragment from a
# scraped example -- its enclosing function is not part of this excerpt.
        return np.mean(-aux)

scores_auc_all= []
test_cv_preds = []

NB_SPLITS = 10
mskf = MultilabelStratifiedKFold(n_splits=NB_SPLITS, random_state=0, shuffle=True)
oof_preds = []
oof_targets = []
scores = []
scores_auc = []

# NOTE(review): `mskf` is built once with random_state=0 and reused for both
# iterations of the seed loop, so both "seeds" see identical fold boundaries;
# only the model (TabNetRegressor) varies per iteration via tabnet_params.
for seed in [0,1]:
  print('Seed',seed)

  for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(train, train_targets_scored)):
      print("FOLDS : ", fold_nb)

      ## model
      # Materialize the fold's train/val feature and target matrices.
      X_train, y_train = train.values[train_idx, :], train_targets_scored.values[train_idx, :]
      X_val, y_val = train.values[val_idx, :], train_targets_scored.values[val_idx, :]
      model = TabNetRegressor(**tabnet_params)

      # NOTE(review): this fit() call is truncated in the excerpt -- the
      # argument list never closes before the next definition begins.
      model.fit(X_train=X_train,
                y_train=y_train,
                eval_set=[(X_val, y_val)],
                eval_name = ["val"],
                eval_metric = ["logits_ll"],
                max_epochs=MAX_EPOCH,
                patience=20, batch_size=1024, virtual_batch_size=128,
                num_workers=1, drop_last=False,
def get_output_model(input_size, output_size, activation, dropout):
    """Build a minimal Keras head: Input -> Dropout -> Dense.

    Args:
        input_size: width of the float32 input vector.
        output_size: number of output units.
        activation: activation of the output Dense layer.
        dropout: dropout rate applied to the raw input.

    Returns:
        An uncompiled ``tf.keras.models.Model``.
    """
    inputs = tf.keras.layers.Input((input_size, ),
                                   dtype=tf.float32,
                                   name='input')
    dropped = tf.keras.layers.Dropout(dropout)(inputs)
    outputs = tf.keras.layers.Dense(output_size,
                                    activation=activation,
                                    name="output")(dropped)
    return tf.keras.models.Model(inputs=inputs, outputs=outputs)


# Cross-validated training of the small output head over BERT-encoded
# features; NUM_FOLDS, SEED, train and target_columns come from earlier in
# the (scraped) source.
kf = MultilabelStratifiedKFold(n_splits=NUM_FOLDS,
                               random_state=SEED,
                               shuffle=True)
kf_split = kf.split(train, train.loc[:, target_columns])
output_models = list()
kfold_scores = list()
for fold, (train_idx, valid_idx) in enumerate(kf_split):
    print(f" fold {fold} ".center(120, "#"))
    # One fresh head per fold: 768-d BERT features -> 30 targets.
    model = get_output_model(input_size=768,
                             output_size=30,
                             activation=ACTIVATION,
                             dropout=DROPOUT)

    train_inputs = train_tqa_bert_encoded.loc[train_idx, bert_columns].values
    _train_targets = train_targets.loc[train_idx, :].values

    valid_inputs = train_tqa_bert_encoded.loc[valid_idx, bert_columns].values
    _valid_targets = train_targets.loc[valid_idx, :].values
    # NOTE(review): the loop body is truncated here in this excerpt.
# Beispiel #8  (scraped example separator; commented out so the file parses)
# 0
# Repeated multilabel-stratified CV over two ResNet feature sets.
val_losses = []
histories = []
# Prediction accumulators: one row per sample, one column per label.
Y_train_preds_resnet = pd.DataFrame(np.zeros((X_train.shape[0], num_labels)),
                                    columns=label_cols)
Y_test_preds_resnet = pd.DataFrame(np.zeros(
    (X_test_full.shape[0], num_labels)),
                                   columns=label_cols)

set_random_seeds(123)

# One full n_folds CV pass per entry in kfold_seeds (n_repeats passes).
for repeat, kf_seed in enumerate(kfold_seeds):
    kf = MultilabelStratifiedKFold(n_splits=n_folds,
                                   random_state=kf_seed,
                                   shuffle=True)

    for fold, (train_idx, test_idx) in enumerate(kf.split(X_train, Y_train)):
        print('** Repeat {}/{}. Fold {}/{}'.format(repeat + 1, n_repeats,
                                                   fold + 1, n_folds))
        K.clear_session()  # drop the previous fold's Keras graph/state
        # complete_train_labels is project-local; presumably it adjusts the
        # index sets so every label is represented in training -- confirm.
        train_idx, test_idx = complete_train_labels(Y_train, train_idx,
                                                    test_idx)
        train_idx = tf.random.shuffle(train_idx)
        # Slice both ResNet feature tensors and targets for this fold.
        X_train_1_fold = tf.gather(X_train_resnet_1, train_idx, axis=0)
        X_train_2_fold = tf.gather(X_train_resnet_2, train_idx, axis=0)
        Y_train_fold = tf.gather(Y_train, train_idx, axis=0)
        X_valid_1_fold = tf.gather(X_train_resnet_1, test_idx, axis=0)
        X_valid_2_fold = tf.gather(X_train_resnet_2, test_idx, axis=0)
        Y_valid_fold = tf.gather(Y_train, test_idx, axis=0)
        # NOTE(review): the loop continues beyond this excerpt.
        Y_nonscored_train_fold = tf.gather(Y_train_nonscored,
                                           train_idx,
                                           axis=0)