def fit_predict(self, x_train, y_train, x_predict):
        """Use local regression to predict values for unknown data.

        Arguments:
            x_train = The training data spectra.
            y_train = The values of the quantity being predicted for the training data.
            x_predict = The unknown spectra for which y needs to be predicted.
        """
        self.neighbors.fit(x_train)
        predictions = []
        coeffs = []
        intercepts = []
        for i in range(x_predict.shape[0]):
            print('Predicting spectrum ' + str(i + 1))
            x_temp = np.array(x_predict[i])
            # find the k nearest training spectra to the unknown spectrum
            dist, ind = self.neighbors.kneighbors([x_temp])
            x_train_local = np.squeeze(x_train[ind])
            y_train_local = np.squeeze(y_train[ind])

            # NOTE: these CV splits are built but never consumed; as written,
            # self.model is simply fit on all local neighbors.
            cv = GroupKFold(n_splits=3)
            cv = cv.split(x_train_local, y_train_local,
                          groups=y_train_local)
            self.model.fit(x_train_local, y_train_local)
            predictions.append(self.model.predict([x_temp])[0])
            coeffs.append(self.model.coef_)
            intercepts.append(self.model.intercept_)
        return predictions, coeffs, intercepts
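
# A minimal, self-contained sketch of the local-regression pattern used above,
# assuming `self.neighbors` is a fitted NearestNeighbors and `self.model` a
# linear estimator exposing coef_/intercept_; everything below is a synthetic
# stand-in, not the author's class.
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
x_train = rng.randn(100, 6)                          # stand-in "spectra"
y_train = x_train @ rng.randn(6) + 0.1 * rng.randn(100)
x_predict = rng.randn(5, 6)

neighbors = NearestNeighbors(n_neighbors=20).fit(x_train)
predictions = []
for x_temp in x_predict:
    _, ind = neighbors.kneighbors([x_temp])          # local neighborhood indices
    local_model = Ridge(alpha=1.0).fit(x_train[ind[0]], y_train[ind[0]])
    predictions.append(local_model.predict([x_temp])[0])
print(predictions)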
Example #2
    def _get_mse_profiling(self, x, y, alphas=None):
        """Calculate prediction RMSE.

        Use GroupKFold where a group is a combination of input size and number
        of workers. The prediction of a group is done when it is out of the
        training set.
        """
        # Training set is 2/3 of the data
        groups = self._groups.loc[x.index]
        cv = GroupKFold(n_splits=3)
        preds = None

        for train_ix, test_ix in cv.split(x, groups=groups):
            # train_ix and test_ix are positional (start from 0), so use iloc
            x_train, y_train = x.iloc[train_ix], y.iloc[train_ix]
            x_test = x.iloc[test_ix]

            # Choose the best alpha value for regularization on the training set
            lm = self._choose_alpha(x_train, y_train, alphas)
            lm.fit(x_train, y_train)
            # indexing by positions assumes x carries a default RangeIndex
            pred = pd.DataFrame(lm.predict(x_test), index=test_ix)
            # DataFrame.append was removed in pandas 2.0; concat is equivalent
            preds = pred if preds is None else pd.concat(
                [preds, pred], verify_integrity=True)

        return self._calc_mse(y, preds.sort_index())
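
# A hedged sketch of how `self._groups` (one label per combination of input
# size and worker count, per the docstring) could be built; the column names
# 'input_size' and 'workers' are assumptions for illustration.
import pandas as pd

runs = pd.DataFrame({'input_size': [1, 1, 2, 2],
                     'workers': [4, 8, 4, 8],
                     'time': [10.0, 6.0, 19.0, 11.0]})
groups = runs['input_size'].astype(str) + '_' + runs['workers'].astype(str)
print(groups.tolist())  # ['1_4', '1_8', '2_4', '2_8'] -- aligned on runs.index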
Example #3
    def _split(self, x, y):
        cv = GroupKFold(n_splits=3)
        groups = self._groups.loc[x.index]
        for train_ix, test_ix in cv.split(x, groups=groups):
            # train_ix and test_ix are positional (start from 0), so use iloc
            x_train = x.iloc[train_ix]
            y_train = y.iloc[train_ix]
            x_test = x.iloc[test_ix]
            yield x_train, y_train, x_test, test_ix
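
# A hedged, standalone sketch of how a generator like `_split` is consumed:
# out-of-fold predictions are stitched back together via the positional
# `test_ix` (synthetic stand-ins below, not the author's class).
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GroupKFold

rng = np.random.RandomState(0)
x = pd.DataFrame(rng.randn(12, 3))
y = pd.Series(rng.randn(12))
groups = pd.Series([0, 0, 1, 1, 2, 2, 0, 1, 2, 0, 1, 2])

oof = pd.Series(index=x.index, dtype=float)
for train_ix, test_ix in GroupKFold(n_splits=3).split(x, groups=groups):
    model = LinearRegression().fit(x.iloc[train_ix], y.iloc[train_ix])
    oof.iloc[test_ix] = model.predict(x.iloc[test_ix])
print(((oof - y) ** 2).mean())  # out-of-fold MSE, as in _get_mse_profiling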
def plot_group_kfold():
    from sklearn.model_selection import GroupKFold
    groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]

    plt.figure(figsize=(10, 2))
    plt.title("GroupKFold")

    axes = plt.gca()
    axes.set_frame_on(False)

    n_folds = 12
    n_samples = 12
    n_iter = 3
    n_samples_per_fold = 1

    cv = GroupKFold(n_splits=3)
    mask = np.zeros((n_iter, n_samples))
    for i, (train, test) in enumerate(cv.split(range(12), groups=groups)):
        mask[i, train] = 1
        mask[i, test] = 2

    for i in range(n_folds):
        # test is grey
        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
        # not selected has no hatch

        boxes = axes.barh(y=range(n_iter), width=[1 - 0.1] * n_iter,
                          left=i * n_samples_per_fold, height=.6, color=colors,
                          hatch="//", edgecolor="k", align='edge')
        for j in np.where(mask[:, i] == 0)[0]:
            boxes[j].set_hatch("")

    axes.barh(y=[n_iter] * n_folds, width=[1 - 0.1] * n_folds,
              left=np.arange(n_folds) * n_samples_per_fold, height=.6,
              color="w", edgecolor='k', align="edge")

    for i in range(12):
        axes.text((i + .5) * n_samples_per_fold, 3.5, "%d" %
                  groups[i], horizontalalignment="center")

    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter + 1) + .3)
    axes.set_yticklabels(
        ["Split %d" % x for x in range(1, n_iter + 1)] + ["Group"])
    plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"], loc=(1, .3))
    plt.tight_layout()
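
# The figure above can be sanity-checked textually: with the same `groups`,
# every group falls in the test set of exactly one split (a standalone check,
# assuming nothing beyond scikit-learn).
import numpy as np
from sklearn.model_selection import GroupKFold

groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
for i, (train, test) in enumerate(
        GroupKFold(n_splits=3).split(np.zeros(len(groups)), groups=groups)):
    print("Split %d: test groups %s, test indices %s"
          % (i + 1, sorted({groups[j] for j in test}), test))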
def test_ridge_gcv_sample_weights(
        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)

    sample_weight = 3 * rng.randn(len(X))
    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
    sample_weight = sample_weight.astype(float)
    X_tiled, y_tiled = X[indices], y[indices]

    cv = GroupKFold(n_splits=X.shape[0])
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(
        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
        fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
    kfold_errors = (y_tiled - predictions)**2
    kfold_errors = [
        np.sum(kfold_errors[indices == i], axis=0) for
        i in np.arange(X.shape[0])]
    kfold_errors = np.asarray(kfold_errors)

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(
        alphas=alphas, store_cv_values=True,
        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
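
# The test rests on the identity that integer sample weights and row
# repetition are equivalent for ridge regression: the penalty term is
# unchanged, while the weighted squared residuals equal the tiled sum.
# A standalone illustration with synthetic data, not the test's fixtures:
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X = rng.randn(20, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(20)
w = rng.randint(1, 4, size=20)

weighted = Ridge(alpha=1.0).fit(X, y, sample_weight=w.astype(float))
idx = np.repeat(np.arange(len(X)), w)   # repeat row i exactly w[i] times
tiled = Ridge(alpha=1.0).fit(X[idx], y[idx])
np.testing.assert_allclose(weighted.coef_, tiled.coef_, rtol=1e-6)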
def test_knn_rbf_groupkfold():
    nan_roc_auc_scorer = make_scorer(nan_roc_auc_score)
    rng = np.random.RandomState(123)
    iris = load_iris()
    X = iris.data
    # knn = KNeighborsClassifier(n_neighbors=4)
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    bool_01 = [item == 0 for item in iris['target']]
    bool_02 = [item in (1, 2) for item in iris['target']]
    groups = []
    y_new = []
    for ind, _ in enumerate(bool_01):
        if bool_01[ind]:
            groups.append('attribute_A')
            y_new.append(0)
        if bool_02[ind]:
            throw = rng.rand()
            if throw < 0.5:
                groups.append('attribute_B')
            else:
                groups.append('attribute_C')
            throw2 = rng.rand()
            if throw2 < 0.5:
                y_new.append(0)
            else:
                y_new.append(1)
    y_new_bool = [item == 1 for item in y_new]
    cv_obj = GroupKFold(n_splits=3)
    cv_obj_list = list(cv_obj.split(X, np.array(y_new_bool), groups))
    sfs1 = SFS(forest,
               k_features=3,
               forward=True,
               floating=False,
               cv=cv_obj_list,
               scoring=nan_roc_auc_scorer,
               verbose=0
               )
    sfs1 = sfs1.fit(X, y_new)
    expect = {
        1: {'cv_scores': np.array([0.52, nan, 0.72]), 'avg_score': 0.62,
            'feature_idx': (1,)},
        2: {'cv_scores': np.array([0.42, nan, 0.65]), 'avg_score': 0.53,
            'feature_idx': (1, 2)},
        3: {'cv_scores': np.array([0.47, nan, 0.63]),
            'avg_score': 0.55,
            'feature_idx': (1, 2, 3)}}

    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect, decimal=1)
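
# `nan_roc_auc_score` (imported by the test from elsewhere) is why cv_scores
# can contain nan: the all-'attribute_A' fold is single-class, so AUC is
# undefined on it. A plausible definition, offered as an assumption rather
# than the library's actual code:
import numpy as np
from sklearn.metrics import roc_auc_score

def nan_roc_auc_score(y_true, y_score):
    """Return NaN instead of raising when only one class is present."""
    if len(np.unique(y_true)) < 2:
        return np.nan
    return roc_auc_score(y_true, y_score)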
Example #7
    def fit_and_predict(self, X_train, X_test, y_train, groups):
        if self.cv == "mcs":
            folds = MCSKFold(n_splits=5, shuffle_mc=True, max_iter=100)
        elif self.cv == "group":
            folds = GroupKFold(n_splits=10)
        elif self.cv == "stratified":
            folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
            y_to_stratify = pd.cut(y_train["Global_Sales_log1p"],
                                   bins=7,
                                   labels=False)

        oof = np.zeros(len(X_train))
        predictions = np.zeros(len(X_test))
        feature_importance_df = pd.DataFrame()
        fold_scores = []

        # NOTE: the "stratified" branch would need folds.split(X_train, y_to_stratify)
        # (as the commented call below shows); StratifiedKFold.split requires y.
        for fold, (train_idx,
                   val_idx) in enumerate(folds.split(X_train, groups=groups)):
            # for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, y_to_stratify)):
            print("-" * 100)
            print(f"Fold {fold+1}")
            train_data = lgb.Dataset(X_train.iloc[train_idx],
                                     label=y_train.iloc[train_idx])
            val_data = lgb.Dataset(X_train.iloc[val_idx],
                                   label=y_train.iloc[val_idx])
            # callbacks = [log_evaluation(self.logger, period=100)]
            clf = lgb.train(self.params,
                            train_data,
                            valid_sets=[train_data, val_data],
                            verbose_eval=100,
                            early_stopping_rounds=100)  #, feval=eval_func)
            oof_pred = clf.predict(X_train.iloc[val_idx].values,
                                   num_iteration=clf.best_iteration)
            oof_pred[oof_pred < 0] = 0
            oof[val_idx] = oof_pred
            fold_score = mean_squared_log_error(
                np.expm1(y_train.iloc[val_idx].values),
                np.expm1(oof[val_idx]))**.5
            fold_scores.append(fold_score)

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = X_train.columns.values
            fold_importance_df["importance"] = clf.feature_importance(
                importance_type="gain")
            fold_importance_df["fold"] = fold + 1
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)

            predictions += np.expm1(
                clf.predict(X_test,
                            num_iteration=clf.best_iteration)) / folds.n_splits

        feature_importance_df = feature_importance_df[[
            "feature", "importance"
        ]].groupby("feature").mean().sort_values(by="importance",
                                                 ascending=False).head(50)
        print("##### feature importance #####")
        print(feature_importance_df)
        cv_score_fold_mean = sum(fold_scores) / len(fold_scores)
        print(f"cv_score_fold_mean: {cv_score_fold_mean}")
        return oof, predictions, cv_score_fold_mean
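
# A note on the fold metric above: the target is log1p-transformed, and since
# log1p(expm1(x)) == x, taking expm1 before mean_squared_log_error makes the
# fold score exactly the RMSE on the log scale. A quick standalone check:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_squared_log_error

y_log = np.array([0.5, 1.2, 2.0])   # targets already on the log1p scale
p_log = np.array([0.4, 1.5, 1.8])
rmsle = mean_squared_log_error(np.expm1(y_log), np.expm1(p_log)) ** 0.5
rmse_on_log = mean_squared_error(y_log, p_log) ** 0.5
assert np.isclose(rmsle, rmse_on_log)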
if X_train.columns.duplicated().sum() > 0:
    raise Exception(
        f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
print('no dup :) ')
print(f'X_train.shape {X_train.shape}')

gc.collect()

sub_train = utils.read_pickles(
    '../data/prev_train',
    ['SK_ID_CURR', 'SK_ID_PREV']).set_index('SK_ID_CURR').iloc[tr_ind]
sub_train['y'] = y_train.values
sub_train['cnt'] = sub_train.index.value_counts()
sub_train['w'] = 1 / sub_train.cnt.values

group_kfold = GroupKFold(n_splits=NFOLD)
sub_train['g'] = sub_train.index % NFOLD

CAT = list(set(X_train.columns) & set(utils_cat.ALL))

# =============================================================================
# load test
# =============================================================================
files = ('../feature_prev/test_' + imp.head(HEAD).feature + '.f').tolist()

X_test = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)],
                   axis=1).iloc[te_ind]

sub_test = utils.read_pickles(
    '../data/prev_test',
    ['SK_ID_CURR', 'SK_ID_PREV']).set_index('SK_ID_CURR').iloc[te_ind]
Example #9
    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="best", fontsize=15)
    ax.grid(True)


# load the data
data = sgl.load_watch()
X = data['X']
y = data['y']
g = data['subject']

# use subject id to group folds
splitter = GroupKFold(n_splits=3)
cv = splitter.split(X, y, groups=g)

# create a feature representation pipeline
est = Pipeline([('features', sgl.FeatureRep()), ('scaler', StandardScaler()),
                ('rf', RandomForestClassifier())])

pipe = sgl.SegPipe(est)

# create a parameter dictionary using the SegPipe API - which is similar to the sklearn API
#
# parameters passed to an estimator in the ``feed`` pipeline are keyed ``f$estimator__parameter``
# parameters passed to an estimator in the ``est`` pipeline are keyed ``e$estimator__parameter``
#
# when the ``feed`` or ``est`` pipeline is not a pipeline, but just a single estimator
# the parameter would be keyed f$parameter or e$parameter respectively
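
# Following the keying convention described above, a parameter grid for `pipe`
# might look like this; the exact parameter names are assumptions for
# illustration, not verified against the seglearn API.
par_grid = {
    'e$rf__n_estimators': [100, 300],     # targets the 'rf' step of ``est``
    'e$scaler__with_std': [True, False],  # targets the 'scaler' step of ``est``
}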
Example #10
        embeddings.append(embedding)
    input_numeric = Input(shape=(len(num), ))
    embedding_numeric = Dense(512, activation='relu')(input_numeric)
    inputs.append(input_numeric)
    embeddings.append(embedding_numeric)
    x = Concatenate()(embeddings)
    x = Dense(256, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(199, activation='softmax')(x)
    model = Model(inputs, output)
    return model


n_splits = 5
kf = GroupKFold(n_splits=n_splits)
score = []
for i_369, (tdx, vdx) in enumerate(kf.split(X, y, X['GameId'])):
    print(f'Fold : {i_369+1}')
    X_train, X_val, y_train, y_val = X.iloc[tdx], X.iloc[vdx], y[tdx], y[vdx]
    X_train = [np.absolute(X_train[i]) for i in cat] + [X_train[num]]
    X_val = [np.absolute(X_val[i]) for i in cat] + [X_val[num]]
    model = model_396_1()
    model.compile(optimizer='Adam',
                  loss='categorical_crossentropy',
                  metrics=[])
    es = EarlyStopping(monitor='val_CRPS',
                       mode='min',
                       restore_best_weights=True,
                       verbose=2,
                       patience=5)
    # return score  # NOTE: stray return; the enclosing function is truncated in the source


# affix = 'loocv'
# nsplits = n_subjects

# affix = '2out'
# nsplits = n_subjects // 2

affix = '3out'
nsplits = n_subjects // 3

# affix = 'split-half'
# nsplits = n_subjects // 6

gkf = GroupKFold(n_splits=nsplits)
clf = RidgeCV()
# clf = DummyClassifier(strategy="most_frequent")
# clf = GradientBoostingRegressor()
# clf = RandomForestRegressor()
scores = Parallel(n_jobs=2)(delayed(get_cv_score)(i, ward, paths, clf, gkf)
                            for i in range(1, 1 + n_parcels))

score_imgs = []
for task in task_list:
    score_map = math_img('0. * img ', img=ward.labels_img_).get_data()
    for i in range(1, 1 + n_parcels):
        score_map[ward.labels_img_.get_data() == i] = scores[i - 1][task]

    score_img = nib.Nifti1Image(score_map, ward.labels_img_.affine)
    filename = os.path.join(write_dir, 'score_' + affix + '_%s.nii.gz' % task)
Exemple #12
0
                if restore_best_state:
                    print(
                        "Restoring model state from the end of the best epoch")
                    model.load_state_dict(best_model_state)
                    optimizer.load_state_dict(best_optimizer_state)
                break
        else:
            best_rho = valid_rho
            wait = 0
            if restore_best_state:
                best_model_state = model.state_dict()
                best_optimizer_state = optimizer.state_dict()
    return model, best_rho
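
# The fragment above is the tail of a patience-based early-stopping loop. A
# minimal, self-contained sketch of the full pattern (all names and numbers
# below are illustrative assumptions, not the author's code):
best_rho, wait, patience = float('-inf'), 0, 3
for epoch, valid_rho in enumerate([0.20, 0.35, 0.34, 0.33, 0.32]):
    if valid_rho > best_rho:
        best_rho, wait = valid_rho, 0   # improvement: reset the counter
    else:
        wait += 1
        if wait >= patience:            # no improvement for `patience` epochs
            print('early stop at epoch', epoch)
            break
print('best rho:', best_rho)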


kf_split = GroupKFold(n_splits=NUM_FOLDS).split(X=train,
                                                groups=train.question_body)
kfold_rhos = list()
all_models = list()
for fold, (train_idx, valid_idx) in enumerate(kf_split):
    print(f" fold: {fold} ".center(100, "#"))
    train_inputs = train_tqa_bert_encoded.loc[train_idx, bert_columns].values
    _train_targets = train_targets.loc[train_idx, :].values

    valid_inputs = train_tqa_bert_encoded.loc[valid_idx, bert_columns].values
    _valid_targets = train_targets.loc[valid_idx, :].values

    model, best_rho = train_mlp(OutputMLP,
                                train_inputs,
                                _train_targets,
                                valid_inputs,
                                _valid_targets,
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

models = [LogisticRegression(solver='liblinear', max_iter=300),
          SVC(C=1.0, kernel='linear', degree=3, gamma='auto'), MultinomialNB(),
          KNeighborsClassifier(), RidgeClassifier(),
          SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3)]

accuracy_mean, accuracy_std, precision_mean, precision_std = [], [], [], []

for model in models:
    pipe = Pipeline([('cleaner', clean_transformer()),
                     ('vectorizer', bow_vector),
                     ('classifier', model)])
    
    accuracy = cross_val_score(estimator=pipe, X=X, y=y, groups=titles, cv=GroupKFold(), scoring='accuracy')
    precision = cross_val_score(estimator=pipe, X=X, y=y, groups=titles, cv=GroupKFold(), scoring='precision')
    
    accuracy_mean.append(np.mean(accuracy))
    accuracy_std.append(np.std(accuracy))
    precision_mean.append(np.mean(precision))
    precision_std.append(np.std(precision))
    
    
# hyperparameter tuning
classifier = KNeighborsClassifier()

pipe = Pipeline([('cleaner', clean_transformer()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier)])
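
# A hedged continuation: tuning the pipeline while preserving the group
# structure. GridSearchCV forwards `groups` to its splitter; `X`, `y`, and
# `titles` are the variables used with cross_val_score above.
from sklearn.model_selection import GridSearchCV, GroupKFold

param_grid = {'classifier__n_neighbors': [3, 5, 7, 11]}
search = GridSearchCV(pipe, param_grid, cv=GroupKFold(n_splits=5),
                      scoring='accuracy')
search.fit(X, y, groups=titles)
print(search.best_params_, search.best_score_)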
Example #14
def main(args, logger):
    trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')

    gkf = GroupKFold(n_splits=5).split(X=trn_df.question_body,
                                       groups=trn_df.question_body)

    histories = {
        'trn_loss': [],
        'val_loss': [],
        'val_metric': [],
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            continue
        fold_trn_df = trn_df.iloc[trn_idx]
        fold_val_df = trn_df.iloc[val_idx]
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        tokens = []  # NOTE: immediately overrides the computed token list

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = soft_binary_cross_entropy
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=30,
            pretrained_model_name_or_path=MODEL_PRETRAIN).to(DEVICE)
        model.resize_token_embeddings(len(trn_dataset.tokenizer))
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            model = model.to(DEVICE)
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            val_loss, val_metric, val_y_preds, val_y_trues, val_qa_ids = test(
                model, val_loader)

            scheduler.step()
            histories['trn_loss'].append(trn_loss)
            histories['val_loss'].append(val_loss)
            histories['val_metric'].append(val_metric)
            sel_log(
                f'epoch : {epoch} -- fold : {fold} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f}', logger)
            model = model.to('cpu')
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model,
                            optimizer, scheduler, histories, val_y_preds,
                            val_y_trues, val_qa_ids, fold, epoch, val_loss,
                            val_metric)
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer)
        del model
    sel_log('now saving best checkpoints...', logger)
Example #15
from sklearn.model_selection import GroupKFold, GroupShuffleSplit

from photonai.base import Hyperpipe, PipelineElement, OutputSettings
from photonai.optimization import FloatRange, Categorical

# WE USE THE BREAST CANCER SET FROM SKLEARN
X, y = load_breast_cancer(return_X_y=True)

groups = np.random.randint(0, 4, size=(len(y), ))  # random_integers (inclusive) was removed from NumPy

# DESIGN YOUR PIPELINE
my_pipe = Hyperpipe('group_split_pipe',
                    optimizer='grid_search',
                    metrics=['accuracy', 'precision', 'recall'],
                    best_config_metric='accuracy',
                    outer_cv=GroupKFold(n_splits=4),
                    inner_cv=GroupShuffleSplit(n_splits=10),
                    verbosity=1,
                    output_settings=OutputSettings(project_folder='./tmp/'))

# ADD ELEMENTS TO YOUR PIPELINE
# first normalize all features
my_pipe += PipelineElement('StandardScaler')
# then do feature selection using a PCA, specify which values to try in the hyperparameter search
my_pipe += PipelineElement('PCA',
                           hyperparameters={'n_components': [5, 10, None]},
                           test_disabled=True)
# engage and optimize the good old SVM for Classification
my_pipe += PipelineElement('SVC',
                           hyperparameters={
                               'kernel': Categorical(['rbf', 'linear']),
Example #16
def main():

    parser = get_arg_parser()

    args = parser.parse_args()

    do_eval = len(args.dev_sets) > 0 and not args.do_cross_validation
    do_train = len(args.train_sets) > 0 and not args.do_cross_validation

    device, n_gpu = get_device(args.local_rank, args.no_cuda)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not do_train and not do_eval and not args.do_cross_validation:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    processors = {"ner": NerProcessor, "wikipedia-ner": WikipediaNerProcessor}

    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    if args.do_cross_validation:

        cross_val_result_file = "cross_validation_results.pkl"

        cross_val_result_file = os.path.join(args.output_dir,
                                             cross_val_result_file)

        sets = set(args.train_sets.split(
            '|')) if args.train_sets is not None else set()

        gt = pd.read_pickle(args.gt_file)

        gt = gt.loc[gt.dataset.isin(sets)]

        k_fold = GroupKFold(n_splits=args.n_splits)

        eval_results = list()

        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)

        for ep in range(1, int(args.num_train_epochs) + 1):

            for sp, (train,
                     test) in enumerate(k_fold.split(X=gt,
                                                     groups=gt.nsentence)):

                tr = gt.iloc[train].copy()
                te = gt.iloc[test].copy()

                tr['dataset'] = 'TRAIN'
                te['dataset'] = 'TEST'

                gt_tmp = pd.concat([tr, te])

                processor = \
                    processors[task_name](train_sets='TRAIN', dev_sets='TEST', test_sets='TEST',
                                          gt=gt_tmp, max_seq_length=args.max_seq_length,
                                          tokenizer=tokenizer, data_epochs=args.num_data_epochs,
                                          epoch_size=args.epoch_size)

                model, model_config = \
                    model_train(bert_model=args.bert_model, max_seq_length=args.max_seq_length,
                                do_lower_case=args.do_lower_case, num_train_epochs=ep,
                                train_batch_size=args.train_batch_size,
                                gradient_accumulation_steps=args.gradient_accumulation_steps,
                                learning_rate=args.learning_rate, weight_decay=args.weight_decay,
                                loss_scale=args.loss_scale, warmup_proportion=args.warmup_proportion,
                                processor=processor, device=device, n_gpu=n_gpu, fp16=args.fp16,
                                cache_dir=args.cache_dir, local_rank=args.local_rank, dry_run=args.dry_run,
                                no_cuda=args.no_cuda)

                label_map = {
                    v: k
                    for k, v in model_config['label_map'].items()
                }

                eval_result =\
                    model_eval(model=model, label_map=label_map, processor=processor, device=device,
                               batch_size=args.eval_batch_size, local_rank=args.local_rank,
                               no_cuda=args.no_cuda, dry_run=args.dry_run).reset_index()

                eval_result['split'] = sp
                eval_result['epoch'] = ep
                eval_results.append(eval_result)

                del model  # release CUDA memory

            pd.concat(eval_results).to_pickle(cross_val_result_file)

    if do_train:

        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)

        processor = \
            processors[task_name](train_sets=args.train_sets, dev_sets=args.dev_sets, test_sets=args.test_sets,
                                  gt_file=args.gt_file, max_seq_length=args.max_seq_length,
                                  tokenizer=tokenizer, data_epochs=args.num_data_epochs,
                                  epoch_size=args.epoch_size)

        model_train(
            bert_model=args.bert_model,
            output_dir=args.output_dir,
            max_seq_length=args.max_seq_length,
            do_lower_case=args.do_lower_case,
            num_train_epochs=args.num_train_epochs,
            train_batch_size=args.train_batch_size,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            learning_rate=args.learning_rate,
            weight_decay=args.weight_decay,
            loss_scale=args.loss_scale,
            warmup_proportion=args.warmup_proportion,
            processor=processor,
            device=device,
            n_gpu=n_gpu,
            fp16=args.fp16,
            cache_dir=args.cache_dir,
            local_rank=args.local_rank,
            dry_run=args.dry_run,
            no_cuda=args.no_cuda)

    if do_eval and (args.local_rank == -1
                    or torch.distributed.get_rank() == 0):

        model_config = json.load(
            open(os.path.join(args.output_dir, "model_config.json"), "r"))

        label_to_id = model_config['label_map']

        label_map = {v: k for k, v in model_config['label_map'].items()}

        tokenizer = BertTokenizer.from_pretrained(
            model_config['bert_model'], do_lower_case=model_config['do_lower'])

        processor = \
            processors[task_name](train_sets=None, dev_sets=args.dev_sets, test_sets=args.test_sets,
                                  gt_file=args.gt_file, max_seq_length=model_config['max_seq_length'],
                                  tokenizer=tokenizer, data_epochs=args.num_data_epochs,
                                  epoch_size=args.epoch_size, label_map=label_to_id)

        model_eval(label_map=label_map,
                   processor=processor,
                   device=device,
                   num_train_epochs=args.num_train_epochs,
                   output_dir=args.output_dir,
                   batch_size=args.eval_batch_size,
                   local_rank=args.local_rank,
                   no_cuda=args.no_cuda,
                   dry_run=args.dry_run)
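
# A hedged sketch of consuming cross_validation_results.pkl afterwards: the
# frame carries 'split' and 'epoch' columns (set above), so averaging over
# splits per epoch suggests the best epoch count; other column names are
# assumptions.
import pandas as pd

results = pd.read_pickle('cross_validation_results.pkl')
print(results.groupby('epoch').mean(numeric_only=True))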
def Cross_validation(is_group=False):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    training_data = pd.read_csv(
        '/home1/pansj/Small_protein_prediction/training_upstream_cluster.csv',
        index_col=0).values
    print(training_data.shape)
    data = training_data[:, 0]
    label = training_data[:, 1]
    cluster = training_data[:, 3]
    print('cluster:', len(set(cluster)))
    label = label.reshape(len(label), -1)
    print(np.sum(label > 0), len(label))
    label[label == -1] = 0
    label = np.array([y[0] for y in label]).reshape(len(label), 1)
    data_one_hot = []
    print('start')
    for seq in data:
        seq_one_hot = sequence_to_onehot(seq)
        data_one_hot.append(seq_one_hot)
    # print(np.array(data_one_hot).shape)
    train_data = np.array(data_one_hot)

    accuracy = []
    recall = []
    precision = []
    f1 = []

    if not is_group:
        print('CV')

        skf = StratifiedKFold(n_splits=5, shuffle=True)
        count = 0
        for train_index, test_index in skf.split(train_data, label):
            count += 1
            X_train, Y_train = train_data[train_index], label[train_index]
            X_test, Y_test = train_data[test_index], label[test_index]
            enc = OneHotEncoder()
            Y_train = enc.fit_transform(Y_train).toarray()
            Y_test_ = enc.transform(Y_test).toarray()  # reuse the encoder fitted on Y_train

            tbCallBack = TensorBoard(
                log_dir='/home1/pansj/Small_protein_prediction/logs_{}'.format(
                    count),  # log directory
                histogram_freq=0,  # how often (in epochs) to compute histograms; 0 disables
                #                  batch_size=32,     # how much data to use for the histograms
                write_graph=True,  # whether to store the network graph
                write_grads=True,  # whether to visualize gradient histograms
                write_images=True,  # whether to visualize the parameters
                embeddings_freq=0,
                embeddings_layer_names=None,
                embeddings_metadata=None)

            cnn_model = CNN_lstm_model([3, 4, 5], 32)

            history = cnn_model.fit(X_train,
                                    Y_train,
                                    batch_size=128,
                                    epochs=10,
                                    workers=8,
                                    validation_data=(X_test, Y_test_),
                                    callbacks=[tbCallBack])
            print(history.history)

            pre_y = []
            pre = cnn_model.predict(X_test)
            for i in range(len(pre)):
                if pre[i][0] > pre[i][1]:
                    pre_y.append(0)
                else:
                    pre_y.append(1)
            pre_y = np.array(pre_y).reshape(len(pre_y), 1)
            accuracy.append(accuracy_score(Y_test, pre_y))
            recall.append(recall_score(Y_test, pre_y))
            precision.append(precision_score(Y_test, pre_y))
            f1.append(f1_score(Y_test, pre_y))
    else:
        print('group_cv')
        skf = GroupKFold(n_splits=5)
        count = 0
        for train_index, test_index in skf.split(train_data, label, cluster):
            count += 1
            print(len(train_index), len(test_index))
            X_train, Y_train = train_data[train_index], label[train_index]
            X_test, Y_test = train_data[test_index], label[test_index]
            enc = OneHotEncoder()
            Y_train = enc.fit_transform(Y_train).toarray()
            Y_test_ = enc.transform(Y_test).toarray()  # reuse the encoder fitted on Y_train

            cnn_model = CNN_model_multi_conv([3, 4, 5], 32)

            tbCallBack = TensorBoard(
                log_dir=
                '/home1/pansj/Small_protein_prediction/CNN_group/cnn_results_upstream_logs_{}'
                .format(count),  # log directory
                histogram_freq=0,  # how often (in epochs) to compute histograms; 0 disables
                #                  batch_size=32,     # how much data to use for the histograms
                write_graph=True,  # whether to store the network graph
                write_grads=True,  # whether to visualize gradient histograms
                write_images=True,  # whether to visualize the parameters
                embeddings_freq=0,
                embeddings_layer_names=None,
                embeddings_metadata=None)

            history = cnn_model.fit(X_train,
                                    Y_train,
                                    batch_size=128,
                                    epochs=20,
                                    workers=8,
                                    validation_data=(X_test, Y_test_),
                                    callbacks=[tbCallBack])
            print(history.history)

            pre_y = []
            pre = cnn_model.predict(X_test)
            for i in range(len(pre)):
                if pre[i][0] > pre[i][1]:
                    pre_y.append(0)
                else:
                    pre_y.append(1)
            pre_y = np.array(pre_y).reshape(len(pre_y), 1)
            accuracy.append(accuracy_score(Y_test, pre_y))
            recall.append(recall_score(Y_test, pre_y))
            precision.append(precision_score(Y_test, pre_y))
            f1.append(f1_score(Y_test, pre_y))

    print(
        'CNN CV_group_new: kernel size: [3,4,5]; num_kernel = 32; epochs = 20;'
    )
    print(accuracy)
    print('accuracy: ', np.average(accuracy))
    print(recall)
    print('recall:', np.average(recall))
    print(precision)
    print('precision:', np.average(precision))
    print(f1)
    print('f1:', np.average(f1))
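
# A plausible definition of the `sequence_to_onehot` helper used above, offered
# as an assumption (fixed nucleotide alphabet; unknown characters stay all-zero):
import numpy as np

ALPHABET = 'ACGT'

def sequence_to_onehot(seq):
    onehot = np.zeros((len(seq), len(ALPHABET)), dtype=np.float32)
    for i, ch in enumerate(seq.upper()):
        j = ALPHABET.find(ch)
        if j >= 0:
            onehot[i, j] = 1.0
    return onehot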
# In[ ]:

#train_imgs = np.array(train_imgs).astype(np.uint8)

# In[ ]:

#train_imgs.shape

# In[ ]:

#train_imgs.mean(), train_imgs.std()

# In[14]:

#group_kfold = GroupShuffleSplit(n_splits=5, random_state = 4321)
group_kfold = GroupKFold(n_splits=5)

# In[12]:

data_transforms = {
    'train':
    transforms.Compose([
        transforms.Resize(224),
        #transforms.Grayscale(3),
        transforms.RandomAffine(degrees=45, scale=(0.9, 1.1)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomVerticalFlip(p=0.5),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val':
del data
print('Data Done')

import lightgbm as lgb

col = list(train_set.columns)
col.remove('reordered')
col.remove('eval_set')
col.remove('user_id')
col.remove('product_id')
col.remove('order_id')
col.remove('department_id')
col.remove('aisle_id')

from sklearn.model_selection import GroupKFold
kf = GroupKFold(n_splits=5)
train_indexes = []
test_indexes = []
for i, (train_index, test_index) in enumerate(
        kf.split(train_set, groups=train_set['user_id'].values)):
    train_indexes.append(train_index)
    test_indexes.append(test_index)

index_ = train_indexes[0]
train_set = train_set.iloc[index_, :]
train_set = train_set.sample(frac=0.9, random_state=42)  # DataFrame has no .frac method

dtrain = lgb.Dataset(train_set[col], label=train_set['reordered'])
#del train_set
lgb_params = {
    'task': 'train',
Example #20
def fit_meta_feature(
    X_train,
    X_valid,
    X_test,
    Meta_train,
    train_idx,
    bond_type,
    base_fold,
    feature="fc",
    N_META_FOLDS=N_META_FOLDS,
    N_META_ESTIMATORS=N_META_ESTIMATORS,
    model_type="catboost",
):
    """
    Adds meta features to train, test and val
    """
    logger.info(f"{bond_type}: Creating meta feature {feature}")
    logger.info("{}: X_train, X_valid and X_test are shapes {} {} {}".format(
        bond_type, X_train.shape, X_valid.shape, X_test.shape))
    folds = GroupKFold(n_splits=N_META_FOLDS)
    fold_count = 1

    # Init predictions
    X_valid["meta_" + feature] = 0
    X_test["meta_" + feature] = 0
    X_train["meta_" + feature] = 0
    X_train_oof = X_train[["meta_" + feature]].copy()
    X_train = X_train.drop("meta_" + feature, axis=1)
    feature_importance = pd.DataFrame()
    for fold_n, (train_idx2, valid_idx2) in enumerate(
            folds.split(X_train,
                        groups=mol_group_type.iloc[train_idx].values)):
        logger.info("{}: Running Meta Feature Type {} - Fold {} of {}".format(
            bond_type, feature, fold_count, folds.n_splits))
        update_tracking(run_id, "{}_meta_{}_est".format(bond_type, feature),
                        N_META_ESTIMATORS)
        update_tracking(run_id,
                        "{}_meta_{}_metafolds".format(bond_type,
                                                      feature), N_META_FOLDS)
        # Load fold IDs from files for consistency
        X_train2 = X_train.loc[X_train.reset_index().index.isin(train_idx2)]
        X_valid2 = X_train.loc[X_train.reset_index().index.isin(valid_idx2)]
        X_train2 = X_train2.copy()
        X_valid2 = X_valid2.copy()
        y_train2 = Meta_train.loc[Meta_train.reset_index().index.isin(
            train_idx2)][feature]
        y_valid2 = Meta_train.loc[Meta_train.reset_index().index.isin(
            valid_idx2)][feature]
        fold_count += 1

        if model_type == "catboost":
            train_dataset = Pool(data=X_train2, label=y_train2)
            metavalid_dataset = Pool(data=X_valid2, label=y_valid2)
            valid_dataset = Pool(data=X_valid)
            test_dataset = Pool(data=X_test)
            model = CatBoostRegressor(
                iterations=N_META_ESTIMATORS,
                learning_rate=LEARNING_RATE,
                depth=META_DEPTH,
                eval_metric=EVAL_METRIC,
                verbose=VERBOSE,
                random_state=RANDOM_STATE,
                thread_count=N_THREADS,
                task_type="GPU",
            )  # Train on GPU

            model.fit(
                train_dataset,
                eval_set=metavalid_dataset,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )
            y_pred_meta_valid = model.predict(metavalid_dataset)
            y_pred_valid = model.predict(valid_dataset)
            y_pred = model.predict(test_dataset)

            X_train_oof.loc[X_train_oof.reset_index().index.isin(valid_idx2),
                            "meta_" + feature] = y_pred_meta_valid
            X_valid["meta_" + feature] += y_pred_valid
            X_test["meta_" + feature] += y_pred

            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X_train.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["type"] = bond_type
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
            update_tracking(run_id,
                            '{}_f{}-{}_meta{}_best_iter'.format(
                                bond_type, base_fold, fold_count, feature),
                            model.best_iteration_,
                            integer=True)
        elif model_type == "xgboost":
            model = xgboost.XGBRegressor(**xgb_params)
            model.fit(
                X_train2,
                y_train2,
                eval_metric=EVAL_METRIC,
                eval_set=[(X_valid2, y_valid2)],
                verbose=VERBOSE,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )

            y_pred_meta_valid = model.predict(X_valid2)
            y_pred_valid = model.predict(
                X_valid.drop("meta_" + feature, axis=1))
            y_pred = model.predict(
                X_test.drop(["meta_" + feature, 'id'], axis=1))

            X_train_oof.loc[X_train_oof.reset_index().index.isin(valid_idx2),
                            "meta_" + feature] = y_pred_meta_valid
            X_valid["meta_" + feature] += y_pred_valid
            X_test["meta_" + feature] += y_pred

            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X_train.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["type"] = bond_type
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
            update_tracking(run_id,
                            '{}_f{}-{}_meta{}_best_iter'.format(
                                bond_type, base_fold, fold_count, feature),
                            model.get_booster().best_iteration,
                            integer=True)

    oof_score = mean_absolute_error(Meta_train[feature],
                                    X_train_oof["meta_" + feature])
    log_oof_score = np.log(oof_score)
    logger.info(
        f"{bond_type} Meta feature {feature} has MAE {oof_score:0.4f} LMAE {log_oof_score:0.4f}"
    )
    update_tracking(
        run_id, "{}_meta_{}_mae_cv_f{}".format(bond_type, feature, base_fold),
        oof_score)
    update_tracking(
        run_id,
        "{}_meta_{}_lmae_cv_f{}".format(bond_type, feature, base_fold),
        log_oof_score,
    )
    X_valid["meta_" + feature] = X_valid["meta_" + feature] / N_META_FOLDS
    X_test["meta_" + feature] = X_test["meta_" + feature] / N_META_FOLDS
    X_train["meta_" + feature] = X_train_oof["meta_" + feature]
    feature_importance.to_parquet(
        "type_results/{}/meta/{}_{}_{}_fi_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))

    X_train_oof.to_parquet(
        "type_results/{}/meta/{}_{}_{}_oof_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))

    X_train.to_parquet(
        "type_results/{}/meta/{}_{}_{}_X_train_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))

    X_valid.to_parquet(
        "type_results/{}/meta/{}_{}_{}_X_valid_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))

    X_test.to_parquet(
        "type_results/{}/meta/{}_{}_{}_X_test_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))
    logger.info(f"{bond_type} Done creating meta features")
    logger.info("{} X_train, X_valid and X_test are shapes {} {} {}".format(
        bond_type, X_train.shape, X_valid.shape, X_test.shape))
    return X_train, X_valid, X_test
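
# The leakage-avoidance pattern above in miniature: out-of-fold predictions of
# an auxiliary target become a feature for the main model, so no training row
# ever carries a prediction from a model fit on itself. A standalone sketch,
# not the author's pipeline:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GroupKFold

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
meta_target = 2.0 * X[:, 0] + 0.1 * rng.randn(100)
groups = rng.randint(0, 10, size=100)

oof = np.zeros(100)
for trn, val in GroupKFold(n_splits=4).split(X, groups=groups):
    oof[val] = Ridge().fit(X[trn], meta_target[trn]).predict(X[val])
X_with_meta = np.column_stack([X, oof])   # train rows carry only OOF values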
Example #21
    def Split_group_kfolds(self):
        Train_X = self.Train_df.drop(['fraud_ind'], axis=1)
        Train_Y = self.Train_df['fraud_ind']
        folds = GroupKFold(n_splits=self.N_folds)
        split_data = folds.split(Train_X, Train_Y, groups=Train_X['Month'])
        return split_data
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1

    # clean texts
    # trn_df = clean_data(trn_df, ['question_title', 'question_body', 'answer'])

    # load additional tokens
    # with open('./mnt/inputs/nes_info/trn_over_10_vocab.pkl', 'rb') as fin:
    #     additional_tokens = pickle.load(fin)

    gkf = GroupKFold(
        n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ', logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(
            ['is_original', 'question_body_le'], axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(
            ['is_original', 'question_body_le'], axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(list(itertools.chain.from_iterable(
            fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
            fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
            fold_trn_df.answer.apply(lambda x: x.split(' '))
        ))).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]  # + additional_tokens

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(trn_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=trn_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=True,
                                pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            tokenizer_type=TOKENIZER_TYPE,
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            do_lower_case=DO_LOWER_CASE,
            LABEL_COL=LABEL_COL,
            t_max_len=T_MAX_LEN,
            q_max_len=Q_MAX_LEN,
            a_max_len=A_MAX_LEN,
            tqa_mode=TQA_MODE,
            TBSEP='[TBSEP]',
            pos_id_type='arange',
            MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
        )
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        state_dict = BertModel.from_pretrained(MODEL_PRETRAIN).state_dict()
        model = BertModelForBinaryMultiLabelClassifier(num_labels=len(LABEL_COL),
                                                       config_path=MODEL_CONFIG_PATH,
                                                       state_dict=state_dict,
                                                       token_size=len(
                                                           trn_dataset.tokenizer),
                                                       MAX_SEQUENCE_LENGTH=MAX_SEQ_LEN,
                                                       cat_last_layer_num=1,
                                                       do_ratio=0.5,
                                                       )
        optimizer = optim.Adam(model.parameters(), lr=3e-5, weight_decay=0.001)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=MAX_EPOCH, eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader, DEVICE)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader, DEVICE, mode='valid')

            scheduler.step()
            histories['trn_loss'].setdefault(fold, []).append(trn_loss)
            histories['val_loss'].setdefault(fold, []).append(val_loss)
            histories['val_metric'].setdefault(fold, []).append(val_metric)
            histories['val_metric_raws'].setdefault(fold, []).append(val_metric_raws)

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}',
                logger)
            model = model.to('cpu')
            model = model.module
            save_checkpoint(
                f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                model,
                optimizer,
                scheduler,
                histories,
                val_y_preds,
                val_y_trues,
                val_qa_ids,
                fold,
                epoch,
                val_loss,
                val_metric,
                )
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(histories["val_metric"][fold])])
        save_and_clean_for_prediction(
            f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
            trn_dataset.tokenizer,
            clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
Example #23
from sklearn.model_selection import StratifiedKFold

X = np.ones(10)
y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X, y):
    print("%s %s" % (train, test))

from sklearn.model_selection import GroupKFold

X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]

gkf = GroupKFold(n_splits=3)
for train, test in gkf.split(X, y, groups=groups):
    print("%s %s" % (train, test))

from sklearn.model_selection import LeaveOneGroupOut

X = [1, 5, 10, 50, 60, 70, 80]
y = [0, 1, 1, 2, 2, 2, 2]
groups = [1, 1, 2, 2, 3, 3, 3]
logo = LeaveOneGroupOut()
for train, test in logo.split(X, y, groups=groups):
    print("%s %s" % (train, test))

from sklearn.model_selection import LeavePGroupsOut

X = np.arange(6)
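# The Leave-P-Groups-Out example breaks off above; following the pattern of
# the preceding snippets, it plausibly continues as below (the y and groups
# values are an assumption modeled on the scikit-learn user guide).
y = [1, 1, 1, 2, 2, 2]
groups = [1, 1, 2, 2, 3, 3]
lpgo = LeavePGroupsOut(n_groups=2)
for train, test in lpgo.split(X, y, groups=groups):
    print("%s %s" % (train, test))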
    'num_leaves': 63,
    'max_bin': 255,
    'min_child_weight': 10,
    'min_data_in_leaf': 150,
    'reg_lambda': 0.5,  # L2 regularization term on weights.
    'reg_alpha': 0.5,  # L1 regularization term on weights.
    'colsample_bytree': 0.9,
    'subsample': 0.9,
    'nthread': 32,
    #         'nthread': cpu_count(),
    'bagging_freq': 1,
    'verbose': -1,
    'seed': SEED
}

group_kfold = GroupKFold(n_splits=NFOLD)
np.random.seed(SEED)

os.system(f'rm ../feature/t*_{PREF}*')

# =============================================================================
# load
# =============================================================================
train = pd.read_csv('../input/application_train.csv.zip')
test = pd.read_csv('../input/application_test.csv.zip')
prev = pd.read_csv('../input/previous_application.csv.zip')


def mk_feature(df):
    df['AMT_CREDIT-d-AMT_ANNUITY'] = df['AMT_CREDIT'] / df[
        'AMT_ANNUITY']  # roughly how many months the user needs to repay
def main(args, logger):
    # trn_df = pd.read_csv(f'{MNT_DIR}/inputs/origin/train.csv')
    trn_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/trn_df.pkl')
    trn_df['is_original'] = 1
    # aug_df = pd.read_pickle(f'{MNT_DIR}/inputs/nes_info/ContextualWordEmbsAug_sub_df.pkl')
    # aug_df['is_original'] = 0

    # trn_df = pd.concat([trn_df, aug_df], axis=0).reset_index(drop=True)

    gkf = GroupKFold(n_splits=5).split(
        X=trn_df.question_body,
        groups=trn_df.question_body_le,
    )
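    # Note: .split() returns a one-shot generator, so gkf can be iterated
    # only once -- here, by the fold loop below.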

    histories = {
        'trn_loss': {},
        'val_loss': {},
        'val_metric': {},
        'val_metric_raws': {},
    }
    loaded_fold = -1
    loaded_epoch = -1
    if args.checkpoint:
        histories, loaded_fold, loaded_epoch = load_checkpoint(args.checkpoint)

    # calc max_seq_len using quest dataset
    # max_seq_len = QUESTDataset(
    #     df=trn_df,
    #     mode='train',
    #     tokens=[],
    #     augment=[],
    #     pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
    # ).MAX_SEQUENCE_LENGTH
    # max_seq_len = 9458
    # max_seq_len = 1504
    max_seq_len = 512  # 512 is the maximum sequence length RoBERTa supports

    fold_best_metrics = []
    fold_best_metrics_raws = []
    for fold, (trn_idx, val_idx) in enumerate(gkf):
        if fold < loaded_fold:
            fold_best_metrics.append(np.max(histories["val_metric"][fold]))
            fold_best_metrics_raws.append(
                histories["val_metric_raws"][fold][np.argmax(
                    histories["val_metric"][fold])])
            continue
        sel_log(
            f' --------------------------- start fold {fold} --------------------------- ',
            logger)
        fold_trn_df = trn_df.iloc[trn_idx]  # .query('is_original == 1')
        fold_trn_df = fold_trn_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        # use only original row
        fold_val_df = trn_df.iloc[val_idx].query('is_original == 1')
        fold_val_df = fold_val_df.drop(['is_original', 'question_body_le'],
                                       axis=1)
        if args.debug:
            fold_trn_df = fold_trn_df.sample(100, random_state=71)
            fold_val_df = fold_val_df.sample(100, random_state=71)
        temp = pd.Series(
            list(
                itertools.chain.from_iterable(
                    fold_trn_df.question_title.apply(lambda x: x.split(' ')) +
                    fold_trn_df.question_body.apply(lambda x: x.split(' ')) +
                    fold_trn_df.answer.apply(lambda x: x.split(' '))))
        ).value_counts()
        tokens = temp[temp >= 10].index.tolist()
        # tokens = []
        tokens = [
            'CAT_TECHNOLOGY'.casefold(),
            'CAT_STACKOVERFLOW'.casefold(),
            'CAT_CULTURE'.casefold(),
            'CAT_SCIENCE'.casefold(),
            'CAT_LIFE_ARTS'.casefold(),
        ]
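        # NOTE: this reassignment discards the frequency-based token list built
        # above, keeping only the five casefolded category tokens.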

        trn_dataset = QUESTDataset(
            df=fold_trn_df,
            mode='train',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        # update token
        trn_sampler = RandomSampler(data_source=trn_dataset)
        trn_loader = DataLoader(
            trn_dataset,
            batch_size=BATCH_SIZE,
            sampler=trn_sampler,
            num_workers=os.cpu_count(),
            # num_workers=0,
            worker_init_fn=lambda x: np.random.seed(),
            drop_last=True,
            pin_memory=True)
        val_dataset = QUESTDataset(
            df=fold_val_df,
            mode='valid',
            tokens=tokens,
            augment=[],
            pretrained_model_name_or_path=TOKENIZER_PRETRAIN,
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        # val_sampler = SequentialSampler(data_source=val_dataset)
        val_sampler = RandomSampler(data_source=val_dataset)
        val_loader = DataLoader(val_dataset,
                                batch_size=BATCH_SIZE,
                                sampler=val_sampler,
                                num_workers=os.cpu_count(),
                                worker_init_fn=lambda x: np.random.seed(),
                                drop_last=False,
                                pin_memory=True)

        fobj = BCEWithLogitsLoss()
        # fobj = MSELoss()
        model = BertModelForBinaryMultiLabelClassifier(
            num_labels=len(LABEL_COL),
            pretrained_model_name_or_path=MODEL_PRETRAIN,
            # cat_num=5,
            token_size=len(trn_dataset.tokenizer),
            MAX_SEQUENCE_LENGTH=max_seq_len,
        )
        optimizer = optim.Adam(model.parameters(), lr=3e-5)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         T_max=MAX_EPOCH,
                                                         eta_min=1e-5)

        # load checkpoint model, optim, scheduler
        if args.checkpoint and fold == loaded_fold:
            load_checkpoint(args.checkpoint, model, optimizer, scheduler)

        for epoch in tqdm(list(range(MAX_EPOCH))):
            if fold <= loaded_fold and epoch <= loaded_epoch:
                continue
            if epoch < 1:
                model.freeze_unfreeze_bert(freeze=True, logger=logger)
            else:
                model.freeze_unfreeze_bert(freeze=False, logger=logger)
            # model = DataParallel(model)
            model = model.to(DEVICE)
            trn_loss = train_one_epoch(model, fobj, optimizer, trn_loader)
            val_loss, val_metric, val_metric_raws, val_y_preds, val_y_trues, val_qa_ids = test(
                model, fobj, val_loader)

            scheduler.step()
            if fold in histories['trn_loss']:
                histories['trn_loss'][fold].append(trn_loss)
            else:
                histories['trn_loss'][fold] = [
                    trn_loss,
                ]
            if fold in histories['val_loss']:
                histories['val_loss'][fold].append(val_loss)
            else:
                histories['val_loss'][fold] = [
                    val_loss,
                ]
            if fold in histories['val_metric']:
                histories['val_metric'][fold].append(val_metric)
            else:
                histories['val_metric'][fold] = [
                    val_metric,
                ]
            if fold in histories['val_metric_raws']:
                histories['val_metric_raws'][fold].append(val_metric_raws)
            else:
                histories['val_metric_raws'][fold] = [
                    val_metric_raws,
                ]

            logging_val_metric_raws = ''
            for val_metric_raw in val_metric_raws:
                logging_val_metric_raws += f'{float(val_metric_raw):.4f}, '

            sel_log(
                f'fold : {fold} -- epoch : {epoch} -- '
                f'trn_loss : {float(trn_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_loss : {float(val_loss.detach().to("cpu").numpy()):.4f} -- '
                f'val_metric : {float(val_metric):.4f} -- '
                f'val_metric_raws : {logging_val_metric_raws}', logger)
            model = model.to('cpu')
            # model = model.module
            save_checkpoint(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}', model,
                            optimizer, scheduler, histories, val_y_preds,
                            val_y_trues, val_qa_ids, fold, epoch, val_loss,
                            val_metric)
        fold_best_metrics.append(np.max(histories["val_metric"][fold]))
        fold_best_metrics_raws.append(
            histories["val_metric_raws"][fold][np.argmax(
                histories["val_metric"][fold])])
        save_and_clean_for_prediction(f'{MNT_DIR}/checkpoints/{EXP_ID}/{fold}',
                                      trn_dataset.tokenizer,
                                      clean=False)
        del model

    # calc training stats
    fold_best_metric_mean = np.mean(fold_best_metrics)
    fold_best_metric_std = np.std(fold_best_metrics)
    fold_stats = f'{EXP_ID} : {fold_best_metric_mean:.4f} +- {fold_best_metric_std:.4f}'
    sel_log(fold_stats, logger)
    send_line_notification(fold_stats)

    fold_best_metrics_raws_mean = np.mean(fold_best_metrics_raws, axis=0)
    fold_raw_stats = ''
    for metric_stats_raw in fold_best_metrics_raws_mean:
        fold_raw_stats += f'{float(metric_stats_raw):.4f},'
    sel_log(fold_raw_stats, logger)
    send_line_notification(fold_raw_stats)

    sel_log('now saving best checkpoints...', logger)
Exemple #26
0
    def split(
        self,
        X,
        y=None,
        group=None,
        **kwargs
    ):  # `group` is supplied by the caller (the class that invokes this split)

        if self.validation_scheme is None or isinstance(
                self.validation_scheme, KFold
        ) or self.validation_scheme == FoldScheme.KFold.name or self.validation_scheme == FoldScheme.KFold:
            folds = KFold(n_splits=self.num_folds,
                          random_state=self.random_state,
                          shuffle=self.shuffle)
            self.indices = [(train_index, test_index)
                            for (train_index, test_index) in folds.split(X)]

        elif isinstance(
                self.validation_scheme, StratifiedKFold
        ) or self.validation_scheme == FoldScheme.StratifiedKFold.name or self.validation_scheme == FoldScheme.StratifiedKFold:
            if y is None or X.shape[0] != y.shape[0]:
                raise ValueError(
                    "Y should be passed and X and Y should be of same length for StratifiedKFold"
                )
            folds = StratifiedKFold(n_splits=self.num_folds,
                                    random_state=self.random_state,
                                    shuffle=self.shuffle)
            self.indices = [(train_index, test_index)
                            for (train_index, test_index) in folds.split(X, y)]

        elif isinstance(
                self.validation_scheme, GroupKFold
        ) or self.validation_scheme == FoldScheme.GroupKFold.name or self.validation_scheme == FoldScheme.GroupKFold:
            folds = GroupKFold(n_splits=self.num_folds)
            self.indices = [(train_index, test_index)
                            for (train_index,
                                 test_index) in folds.split(X, y, groups=group)
                            ]

        elif isinstance(
                self.validation_scheme, TimeSeriesSplit
        ) or self.validation_scheme == FoldScheme.TimeSeriesSplit.name or self.validation_scheme == FoldScheme.TimeSeriesSplit:
            folds = TimeSeriesSplit(n_splits=self.num_folds)
            self.indices = [(train_index, test_index)
                            for (train_index, test_index) in folds.split(X)]

        elif self.validation_scheme == FoldScheme.train_test_split.name or self.validation_scheme == FoldScheme.train_test_split:
            # validation_scheme is a simple train/test split; test_size sets the held-out fraction
            self.indices = [
                train_test_split(list(range(X.shape[0])),
                                 test_size=self.test_size,
                                 shuffle=self.shuffle)
            ]

        elif callable(self.validation_scheme):
            # validation_scheme is a callable that takes X and y as parameters
            self.indices = self.validation_scheme(X, y, **kwargs)

        else:
            if not isinstance(self.validation_scheme, list):
                raise ValueError(
                    "validation_scheme should be a list of (train_indices, test_indices) tuples"
                )
            self.indices = self.validation_scheme
        return self.indices
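
# A minimal, self-contained illustration of the GroupKFold branch above
# (synthetic data; every name below is illustrative, not from the original source):
import numpy as np
from sklearn.model_selection import GroupKFold

X_demo = np.arange(12).reshape(6, 2)
y_demo = np.array([0, 1, 0, 1, 0, 1])
groups_demo = np.array([1, 1, 2, 2, 3, 3])
# materialize the generator into a list of (train_index, test_index) pairs,
# mirroring what split() stores in self.indices
demo_indices = [(trn, tst) for trn, tst in
                GroupKFold(n_splits=3).split(X_demo, y_demo, groups=groups_demo)]
print(demo_indices)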
Exemple #27
0

result = {}

if args.cross_validation:
    # cross validation
    modified_data_set = []
    groups = []

    for category in data_set.keys():
        for example in data_set[category]:
            modified_data_set.append([category] + example)
            groups.append(category)
    modified_data_set = np.array(modified_data_set)

    gkf = GroupKFold(n_splits=args.folds)
    all_folds = []
    for train_indices, test_indices in gkf.split(modified_data_set,
                                                 groups=groups):
        train = modified_data_set[train_indices]
        test = modified_data_set[test_indices]

        print(len(train_indices), len(test_indices),
              len(test_indices) / len(modified_data_set))
        print(train[0], test[0], '\n')

        all_folds.append(test)

    result['n_folds'] = args.folds
    result['folds'] = all_folds
Exemple #28
0
params = {  # variable name assumed; this snippet begins mid-dict
    'objective': 'regression',
    'max_depth': 6,
    'learning_rate': LEARNING_RATE,
    "boosting_type": "gbdt",
    "subsample_freq": 1,
    "subsample": 0.9,
    "bagging_seed": 11,
    "metric": 'mae',
    "verbosity": -1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.4,
    'colsample_bytree': 1.0,
    'random_state': RANDOM_STATE
}

folds = GroupKFold(n_splits=N_FOLDS)

# Setup arrays for storing results
train_df = pd.read_parquet(
    'data/FE008_train.parquet')  # loaded only for the row skeleton, not the features
oof_df = train_df[['id', 'type', 'scalar_coupling_constant']].copy()
mol_group = train_df[['molecule_name', 'type']].copy()
del train_df
gc.collect()

oof_df['oof_preds'] = 0
test_df = pd.read_parquet(
    'data/FE008_test.parquet')  # loaded only for the row skeleton, not the features
prediction = np.zeros(len(test_df))
feature_importance = pd.DataFrame()
test_pred_df = test_df[['id', 'type', 'molecule_name']].copy()
Exemple #29
0
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import pars
import roc_curve_target as rct

xml_path = 'path.xml'

# settings parsing
global_params, external_params, param_list = pars.parsconf(xml_path)

train, test = pars.parsdata(global_params)

# find best model by GridSearchCV

nfolds = external_params['kfold']
group_kfold = GroupKFold(n_splits=nfolds)
ind = list(
    group_kfold.split(train['features'], train['labels'], train['target_id']))

########################################################################################################################
import check_kfolds as ck

status = ck.test_kfold(ind, train['target_id'])
########################################################################################################################
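# check_kfolds is not shown in this snippet; the sketch below is a hypothetical
# reimplementation of the kind of group-leakage check test_kfold presumably runs.
def _test_kfold_sketch(ind, groups):
    import numpy as np  # local import, since this snippet's header omits numpy
    groups = np.asarray(groups)
    for train_ix, test_ix in ind:
        # no target_id may appear on both sides of a split
        assert len(np.intersect1d(groups[train_ix], groups[test_ix])) == 0
    return True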

dtrain = xgb.DMatrix(train['features'], label=train['labels'])
dtest = xgb.DMatrix(test['features'], label=test['labels'])

results = pd.DataFrame(
    columns=['n', 'metric_train', 'metric_val', 'num_trees'])
for n, param in enumerate(param_list):
    return df


if __name__ == "__main__":
    train = pd.read_csv(config.CLEAN_TRAIN_DATA)
    test = pd.read_csv(config.CLEAN_TEST_DATA)

    # Get numerical target
    train['target'] = train.Tag.map(config.TAG_DICT)
    y = train["target"].values

    # Replicate train/test split strategy for cross validation
    train["target_str"] = train["Domain"].astype(str) + train["Tag"].astype(
        str)
    train["target_str"] = train["target_str"].astype("category")
    cvlist = list(GroupKFold(5).split(train, groups=train["target_str"]))
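    # Grouping on the Domain+Tag string keeps all rows of a given
    # domain/tag combination in a single fold, mirroring the original split.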

    # Word and character TFIDF on URLs
    vec1 = TfidfVectorizer(analyzer='char',
                           ngram_range=(1, 5),
                           min_df=500,
                           sublinear_tf=True)
    vec2 = TfidfVectorizer(analyzer='word',
                           ngram_range=(1, 1),
                           min_df=400,
                           sublinear_tf=True)
    vec = FeatureUnion([("char", vec1), ("word", vec2)])

    train = tokenize_url(train)
    test = tokenize_url(test)
    all_url = pd.concat([train["Url"], test["Url"]])
Exemple #31
0
def test_group_kfold():
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_groups = 15
    n_samples = 1000
    n_splits = 5

    X = y = np.ones(n_samples)

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    groups = rng.randint(0, n_groups, n_samples)

    ideal_n_groups_per_fold = n_samples // n_splits

    len(np.unique(groups))
    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    lkf = GroupKFold(n_splits=n_splits)
    for i, (_, test) in enumerate(lkf.split(X, y, groups)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(groups))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_groups_per_fold))

    # Check that each group appears only in 1 fold
    for group in np.unique(groups):
        assert_equal(len(np.unique(folds[groups == group])), 1)

    # Check that no group is on both sides of the split
    groups = np.asarray(groups, dtype=object)
    for train, test in lkf.split(X, y, groups):
        assert_equal(len(np.intersect1d(groups[train], groups[test])), 0)

    # Construct the test data
    groups = np.array(['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
                       'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
                       'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
                       'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
                       'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
                       'Madmood', 'Cary', 'Mary', 'Alexandre', 'David',
                       'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'])

    n_groups = len(np.unique(groups))
    n_samples = len(groups)
    n_splits = 5
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    ideal_n_groups_per_fold = n_samples // n_splits

    X = y = np.ones(n_samples)

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    for i, (_, test) in enumerate(lkf.split(X, y, groups)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(groups))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_groups_per_fold))

    # Check that each group appears only in 1 fold
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        for group in np.unique(groups):
            assert_equal(len(np.unique(folds[groups == group])), 1)

    # Check that no group is on both sides of the split
    groups = np.asarray(groups, dtype=object)
    for train, test in lkf.split(X, y, groups):
        assert_equal(len(np.intersect1d(groups[train], groups[test])), 0)

    # groups can also be a list
    cv_iter = list(lkf.split(X, y, groups.tolist()))
    for (train1, test1), (train2, test2) in zip(lkf.split(X, y, groups),
                                                cv_iter):
        assert_array_equal(train1, train2)
        assert_array_equal(test1, test2)

    # Should fail if there are more folds than groups
    groups = np.array([1, 1, 1, 2, 2])
    X = y = np.ones(len(groups))
    assert_raises_regexp(ValueError, "Cannot have number of splits.*greater",
                         next, GroupKFold(n_splits=3).split(X, y, groups))
fulldatasetpath = '../downsampled/'


metadata = pd.read_csv('../UrbanSound8K.csv')


le = LabelEncoder()
le.fit(metadata['class'])
class_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

parameters = {
    'num_cep_coef': [25,30,35,40,45,50], 
    'num_states':[2,3,4,5,6]
}

gKFold = GroupKFold(n_splits = 10)
urban_hmm = UrbanHMMClassifier(class_map = class_mapping)
grid_search = GridSearchCV(urban_hmm, parameters, cv = gKFold, n_jobs = -1, verbose = 1)
grid_search.fit(X = list(fulldatasetpath + metadata['slice_file_name'].astype(str)), 
               y = le.transform(metadata['class']),
               groups = metadata['fold'])
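# GridSearchCV forwards the `groups` argument to the CV splitter's split(),
# which is what lets GroupKFold keep each predefined UrbanSound8K fold intact.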

best_filename = "./models/hmm_cvbest_f1_{}.pkl".format(str(grid_search.best_score_)[2:10])
pickle.dump(grid_search.best_estimator_, open(best_filename, "wb"))


cv_filename = "./models/hmm_cv_f1_{}.pkl".format(str(grid_search.best_score_)[2:10])
pickle.dump(grid_search, open(cv_filename, "wb"))


print("\n-----------GRID SEARCH RANKING----------\n")
X_all = pd.concat([pd.read_feather(f) for f in tqdm(files, mininterval=60)],
                  axis=1)
y = utils.read_pickles('../data/prev_label').TARGET

val = utils.read_pickles('../data/prev_train', ['DAYS_DECISION'])
ind = val[val['DAYS_DECISION'].between(day_start, day_end)].index
y = y[ind]

sub_train = utils.read_pickles(
    '../data/prev_train', ['SK_ID_CURR']).set_index('SK_ID_CURR').iloc[ind]
sub_train['y'] = y.values
sub_train['cnt'] = sub_train.index.value_counts()
sub_train['w'] = 1 / sub_train.cnt.values

group_kfold = GroupKFold(n_splits=NFOLD)
sub_train['g'] = sub_train.index % NFOLD
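# Note: the fold id g is SK_ID_CURR modulo NFOLD, so all rows of a given
# client share one fold; the group_kfold object above appears unused here.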

for HEAD in HEADS:

    X = X_all.iloc[ind, :HEAD]

    if X.columns.duplicated().sum() > 0:
        raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X.shape {X.shape}')

    gc.collect()

    CAT = list(set(X.columns) & set(utils_cat.ALL))
X_test: pd.DataFrame = test[use_cols_revised].copy()
print(f"X.shape: {X.shape}, X_test.shape: {X_test.shape}")


# X.to_csv("../info/X_sampled.csv")

# export colnames
pd.DataFrame({"columns": X.columns.tolist()}).to_csv(log_path / f"use_cols.csv")

####################################################################################################
# Model Fitting
print("start fitting")
n_fold = 5

if GROUP_K_FOLD:
    folds = GroupKFold(n_splits=n_fold)
else:
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)
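# Note: GroupKFold takes no shuffle/random_state arguments -- its assignment
# of groups to folds is deterministic, so only the KFold branch needs a seed.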

#########################################################################################################
# 1st layer model
#seed_base = [0, 2019, 71, 1228, 1988, 1879, 92, 3018, 1234, 185289]
seed_base = [2019]
seed_list = np.array(seed_base) + 50
#seed_list = np.array(seed_base) + 51
#seed_list = np.array(seed_base) + 52
#seed_list = np.array(seed_base) + 53
#seed_list = np.array(seed_base) + 54

current_seed = -1
Exemple #35
0
            if i == 0:
                tsdict[dict_dataset[dataset][:-1]]['test'].append(test_idx)
                tsdict[dict_dataset[dataset][:-1]]['train'].append(train_idx)

            if i == 0 and j == 0:
                cvdict[dict_dataset[dataset][:-1]]['test'] = []
                cvdict[dict_dataset[dataset][:-1]]['train'] = []

            # replace X_in[train_idx] / X_in[test_idx] with the scaled X_train / X_test
            scaler.fit(X_in[train_idx])
            X_train = scaler.transform(X_in[train_idx])
            X_test = scaler.transform(X_in[test_idx])

            # restore the saved NumPy random state before recreating gkf
            np.random.set_state(state)
            gkf = GroupKFold(n_splits=5).split(X_train, Y[train_idx],
                                               cluster_in[train_idx])
            for tr, te in gkf:
                cvdict[dict_dataset[dataset][:-1]]['train'].append(tr)
                cvdict[dict_dataset[dataset][:-1]]['test'].append(te)

            # restore the saved NumPy random state before recreating gkf
            np.random.set_state(state)
            gkf = GroupKFold(n_splits=5).split(X_train, Y[train_idx],
                                               cluster_in[train_idx])

            best_params[dict_dataset[dataset] + 'RF.' + str(i + 1) + '.' +
                        str(j + 1)] = rf_param_selection(
                            X_train, Y[train_idx], gkf)

            np.random.set_state(state)
            gkf = GroupKFold(n_splits=5).split(X_train, Y[train_idx],
                                               cluster_in[train_idx])
Exemple #37
0
        train, test, 'category')

    # Set training parameters
    device = 'cuda'
    num_workers = 10
    n_folds = 10
    lr = 0.001
    n_epochs = 10
    bs = 2
    grad_accum = 4
    weight_decay = 0.01
    loss_fn = nn.BCEWithLogitsLoss()

    # Start training
    init_seed()
    folds = GroupKFold(n_splits=n_folds).split(X=train['question_body'],
                                               groups=train['question_body'])
    oofs = np.zeros((len(train), N_TARGETS))

    main_logger.info(f'Start training model {model_name}...')

    for fold_id, (train_index, valid_index) in enumerate(folds):

        main_logger.info(f'Fold {fold_id + 1} started at {time.ctime()}')

        fold_logger = init_logger(log_dir,
                                  f'train_fold_{fold_id+1}_{model_name}.log')

        train_loader = DataLoader(
            TextDataset(cat_features_train, ids_train['question'],
                        ids_train['answer'], seg_ids_train['question'],
                        seg_ids_train['answer'], train_index, y),