def test_ridge_gcv_sample_weights(
        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
    alphas = [1e-3, .1, 1., 10., 1e3]
    rng = np.random.RandomState(0)
    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
    X, y = _make_sparse_offset_regression(
        n_samples=11, n_features=n_features, n_targets=n_targets,
        random_state=0, shuffle=False, noise=noise)
    y = y.reshape(y_shape)

    sample_weight = 3 * rng.randn(len(X))
    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
    sample_weight = sample_weight.astype(float)
    X_tiled, y_tiled = X[indices], y[indices]

    cv = GroupKFold(n_splits=X.shape[0])
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    kfold = RidgeCV(
        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
        fit_intercept=fit_intercept)
    # ignore warning from GridSearchCV: DeprecationWarning: The default of the
    # `iid` parameter will change from True to False in version 0.22 and will
    # be removed in 0.24
    with ignore_warnings(category=DeprecationWarning):
        kfold.fit(X_tiled, y_tiled)

    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
    splits = cv.split(X_tiled, y_tiled, groups=indices)
    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
    kfold_errors = (y_tiled - predictions)**2
    kfold_errors = [
        np.sum(kfold_errors[indices == i], axis=0) for
        i in np.arange(X.shape[0])]
    kfold_errors = np.asarray(kfold_errors)

    X_gcv = X_constructor(X)
    gcv_ridge = RidgeCV(
        alphas=alphas, store_cv_values=True,
        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
    if len(y_shape) == 2:
        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
    else:
        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]

    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
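
# A short aside (not part of the original test): the test above relies on the
# identity between integer sample weights and row repetition for ridge
# regression. A minimal, self-contained sketch of that identity:
def _ridge_sample_weight_repetition_sketch():
    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(0)
    X, y = rng.randn(6, 3), rng.randn(6)
    w = np.array([1, 2, 1, 3, 1, 2])
    weighted = Ridge(alpha=1.0).fit(X, y, sample_weight=w)
    tiled = Ridge(alpha=1.0).fit(np.repeat(X, w, axis=0), np.repeat(y, w))
    assert np.allclose(weighted.coef_, tiled.coef_)
    assert np.allclose(weighted.intercept_, tiled.intercept_)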
Example 2
    def _get_mse_profiling(self, x, y, alphas=None):
        """Calculate prediction RMSE.

        Use GroupKFold where a group is a combination of input size and number
        of workers. The prediction of a group is done when it is out of the
        training set.
        """
        # Training set is 2/3 of the data
        groups = self._groups.loc[x.index]
        cv = GroupKFold(n_splits=3)
        preds = None

        for train_ix, test_ix in cv.split(x, groups=groups):
            # train_ix and test_ix start from 0, so we use iloc
            x_train, y_train = x.iloc[train_ix], y.iloc[train_ix]
            x_test = x.iloc[test_ix]

            # Choose best alpha value for regularization based on training set
            lm = self._choose_alpha(x_train, y_train, alphas)
            lm.fit(x_train, y_train)
            pred = pd.DataFrame(lm.predict(x_test), index=test_ix)
            preds = pred if preds is None else pd.concat(
                [preds, pred], verify_integrity=True)

        return self._calc_mse(y, preds.sort_index())
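
    # How the grouping behaves (a sketch, not from the original class):
    # GroupKFold never puts the same group in both train and test of a split.
    #
    #     import numpy as np
    #     from sklearn.model_selection import GroupKFold
    #     groups = np.array(['a', 'a', 'b', 'b', 'c', 'c'])
    #     for tr, te in GroupKFold(n_splits=3).split(range(6), groups=groups):
    #         assert not set(groups[tr]) & set(groups[te])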
    def fit_predict(self, x_train, y_train, x_predict):
        """Use local regression to predict values for unknown data.

        Arguments:
            x_train = The training data spectra.
            y_train = The values of the quantity being predicted for the training data
            x_predict = The unknown spectra for which y needs to be predicted.
        """
        self.neighbors.fit(x_train)
        predictions = []
        coeffs = []
        intercepts = []
        for i in range(x_predict.shape[0]):
            print('Predicting spectrum ' + str(i + 1))
            x_temp = np.array(x_predict[i])
            _, ind = self.neighbors.kneighbors([x_temp])
            x_train_local = np.squeeze(x_train[ind])
            y_train_local = np.squeeze(y_train[ind])

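            # NOTE: the GroupKFold splits constructed below are never consumed;
            # as written, the local model is fit on all selected neighbors.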
            cv = GroupKFold(n_splits=3)
            cv = cv.split(x_train_local, y_train_local,
                          groups=y_train_local)
            self.model.fit(x_train_local, y_train_local)
            predictions.append(self.model.predict([x_temp])[0])
            coeffs.append(self.model.coef_)
            intercepts.append(self.model.intercept_)
        return predictions, coeffs, intercepts
Example 4
    def _split(self, x, y):
        cv = GroupKFold(n_splits=3)
        groups = self._groups.loc[x.index]
        for train_ix, test_ix in cv.split(x, groups=groups):
            # train_ix and test_ix start from 0, so we use iloc
            x_train = x.iloc[train_ix]
            y_train = y.iloc[train_ix]
            x_test = x.iloc[test_ix]
            yield x_train, y_train, x_test, test_ix
def plot_group_kfold():
    from sklearn.model_selection import GroupKFold
    groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]

    plt.figure(figsize=(10, 2))
    plt.title("GroupKFold")

    axes = plt.gca()
    axes.set_frame_on(False)

    n_samples = 12
    n_iter = 3
    n_samples_per_fold = 1

    cv = GroupKFold(n_splits=3)
    mask = np.zeros((n_iter, n_samples))
    for i, (train, test) in enumerate(cv.split(range(12), groups=groups)):
        mask[i, train] = 1
        mask[i, test] = 2

    for i in range(n_samples):
        # test is grey
        colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
        # not selected has no hatch

        boxes = axes.barh(y=range(n_iter), width=[1 - 0.1] * n_iter,
                          left=i * n_samples_per_fold, height=.6, color=colors,
                          hatch="//", edgecolor="k", align='edge')
        for j in np.where(mask[:, i] == 0)[0]:
            boxes[j].set_hatch("")

    axes.barh(y=[n_iter] * n_samples, width=[1 - 0.1] * n_samples,
              left=np.arange(n_samples) * n_samples_per_fold, height=.6,
              color="w", edgecolor='k', align="edge")

    for i in range(12):
        axes.text((i + .5) * n_samples_per_fold, 3.5, "%d" %
                  groups[i], horizontalalignment="center")

    axes.invert_yaxis()
    axes.set_xlim(0, n_samples + 1)
    axes.set_ylabel("CV iterations")
    axes.set_xlabel("Data points")
    axes.set_xticks(np.arange(n_samples) + .5)
    axes.set_xticklabels(np.arange(1, n_samples + 1))
    axes.set_yticks(np.arange(n_iter + 1) + .3)
    axes.set_yticklabels(
        ["Split %d" % x for x in range(1, n_iter + 1)] + ["Group"])
    plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"], loc=(1, .3))
    plt.tight_layout()
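

def print_group_kfold():
    # Text-only companion to the figure above (a sketch): with these groups and
    # n_splits=3, each of the four groups appears in exactly one test fold.
    from sklearn.model_selection import GroupKFold
    groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
    cv = GroupKFold(n_splits=3)
    for i, (train, test) in enumerate(cv.split(range(12), groups=groups)):
        print("Split %d: test groups = %s" %
              (i + 1, sorted({groups[j] for j in test})))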
def test_knn_rbf_groupkfold():
    nan_roc_auc_scorer = make_scorer(nan_roc_auc_score)
    rng = np.random.RandomState(123)
    iris = load_iris()
    X = iris.data
    # knn = KNeighborsClassifier(n_neighbors=4)
    forest = RandomForestClassifier(n_estimators=100, random_state=123)
    bool_01 = [item == 0 for item in iris['target']]
    bool_02 = [item in (1, 2) for item in iris['target']]
    groups = []
    y_new = []
    for ind, _ in enumerate(bool_01):
        if bool_01[ind]:
            groups.append('attribute_A')
            y_new.append(0)
        if bool_02[ind]:
            throw = rng.rand()
            if throw < 0.5:
                groups.append('attribute_B')
            else:
                groups.append('attribute_C')
            throw2 = rng.rand()
            if throw2 < 0.5:
                y_new.append(0)
            else:
                y_new.append(1)
    y_new_bool = [item == 1 for item in y_new]
    cv_obj = GroupKFold(n_splits=3)
    cv_obj_list = list(cv_obj.split(X, np.array(y_new_bool), groups))
    sfs1 = SFS(forest,
               k_features=3,
               forward=True,
               floating=False,
               cv=cv_obj_list,
               scoring=nan_roc_auc_scorer,
               verbose=0
               )
    sfs1 = sfs1.fit(X, y_new)
    expect = {
        1: {'cv_scores': np.array([0.52, nan, 0.72]), 'avg_score': 0.62,
            'feature_idx': (1,)},
        2: {'cv_scores': np.array([0.42, nan, 0.65]), 'avg_score': 0.53,
            'feature_idx': (1, 2)},
        3: {'cv_scores': np.array([0.47, nan, 0.63]),
            'avg_score': 0.55,
            'feature_idx': (1, 2, 3)}}

    dict_compare_utility(d_actual=sfs1.subsets_, d_desired=expect, decimal=1)
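
    # Why a NaN-tolerant scorer is needed (a sketch of the reasoning): every
    # label in group 'attribute_A' is 0, and GroupKFold places that whole group
    # into a single test fold, so ROC AUC is undefined (NaN) on that fold.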
Example 7
NUM_FOLDS = 4
EPOCHS = 30
BATCH_SIZE = 256
BAGS = 16

kf = GroupKFold(n_splits=NUM_FOLDS)

shape = None

for bag in range(BAGS):
    fold = 0

    val_loss = np.ones((EPOCHS, NUM_FOLDS), np.float32)

    for train, val in kf.split(x_train, y_train, G):
        model.set_weights(weights)
        model.reset_states()
        tensorboard = TensorBoard(
            log_dir='./logs/gru_fold_{}_bag_{}'.format(fold, bag))
        history = model.fit(x_train[train],
                            y_train[train],
                            batch_size=BATCH_SIZE,
                            validation_data=(x_train[val], y_train[val]),
                            epochs=EPOCHS,
                            shuffle=True,
                            verbose=1,
                            callbacks=[tensorboard])
        val_loss[:, fold] = history.history['val_loss']
        fold += 1
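
    # A common follow-up (a sketch, not in the original snippet): average the
    # per-fold validation losses and keep the epoch that minimizes the mean.
    best_epoch = int(val_loss.mean(axis=1).argmin())
    print('bag {}: best epoch = {}'.format(bag, best_epoch))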
# In[14]:

ch_arr = [0, 1, 2]

# In[15]:

path = '6ts_4space_imgs_time_aug_resnet50_models'
os.makedirs(path)

# In[21]:

models_w_arr = []
fold = 0
for train_index, val_index in group_kfold.split(train_df,
                                                train_df['wind_speed'],
                                                train_df['storm_id']):
    print(fold)
    fold += 1
    os.makedirs(path + '/fold_%d' % (fold - 1))
    image_datasets = {
        'train':
        WindDataset(train_imgs, train_df['wind_speed'].values,
                    train_df['storm_id'].values, 'train', train_index,
                    data_transforms['train']),
        'val':
        WindDataset(train_imgs, train_df['wind_speed'].values,
                    train_df['storm_id'].values, 'val', val_index,
                    data_transforms['val'])
    }
Example 9
# In[11]:

oof = np.zeros(df_train.shape[0])
sub = np.zeros(df_test.shape[0])
feat_imp_df = pd.DataFrame({'feat': feats, 'imp': 0.0})
gkf = GroupKFold(n_splits=5)

# In[12]:

print('train shape {} test shape {}'.format(df_train.shape, df_test.shape))

# In[13]:

for i, (trn_idx, val_idx) in enumerate(
        gkf.split(df_train,
                  groups=(df_train.date.map(str) + '_' +
                          df_train.link_id.map(str)))):
    print(
        '------------------------------{} fold------------------------------'.
        format(i))
    X_trn, Y_trn, W_trn = df_train.iloc[trn_idx][feats], df_train.iloc[
        trn_idx].label, df_train.iloc[trn_idx].weight
    X_val, Y_val, W_val = df_train.iloc[val_idx][feats], df_train.iloc[
        val_idx].label, df_train.iloc[val_idx].weight
    X_sub = df_test[feats]

    clf = LGBMRegressor(
        num_leaves=63,
        learning_rate=0.02,
        n_estimators=100000,
        subsample=0.6,
Example 10
    embedding_numeric = Dense(512, activation='relu')(input_numeric)
    inputs.append(input_numeric)
    embeddings.append(embedding_numeric)
    x = Concatenate()(embeddings)
    x = Dense(256, activation='relu')(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(199, activation='softmax')(x)
    model = Model(inputs, output)
    return model


n_splits = 5
kf = GroupKFold(n_splits=n_splits)
score = []
for i_, (tdx, vdx) in enumerate(kf.split(X, y, X['GameId'])):
    print(f'Fold : {i_+1}')
    X_train, X_val, y_train, y_val = X.iloc[tdx], X.iloc[vdx], y[tdx], y[vdx]

    X_train = [np.absolute(X_train[i]) for i in cat
               ] + [X_train[num]]  # + [X_train[env1]] + [X_train[env2]]
    X_val = [np.absolute(X_val[i])
             for i in cat] + [X_val[num]]  # + [X_val[env1]] + [X_val[env2]]

    model = model_NN()
    model.compile(optimizer='Adam',
                  loss='categorical_crossentropy',
                  metrics=[])
    es = EarlyStopping(monitor='val_CRPS',
                       mode='min',
                       restore_best_weights=True,
Example 11
feature_importance = {}
num_dic = {}
trans_x = np.transpose(X)
max_val = 0
for i in range(num_vars):
    feature_importance[names[i]] = ms_array[i]
    num_dic[i] = ms_array[i]
    if max_val < num_dic[i]:
        max_val = num_dic[i]
        best_feature = i
for a in range(10):
    x1 = X[:, best_feature].reshape(-1, 1)
    group_kfold = GroupKFold(n_splits=10)
    group_kfold.get_n_splits(x1, y, groups)
    acc_arr = []
    for train_index, test_index in group_kfold.split(X, y, groups):
        X_train = [x1[i] for i in train_index]
        X_test = [x1[i] for i in test_index]
        y_train = [y[i] for i in train_index]
        y_test = [y[i] for i in test_index]
        clf = RandomForestClassifier().fit(X_train, y_train)
        tmp_score = clf.score(X_test, y_test)
        acc_arr.append(tmp_score)
Example 12
def cv():

    data_x, data_y, body_ids = build_data()

    holdout_ids = {int(x.rstrip()) for x in open('hold_out_ids.txt')}
    print('len(holdout_ids): ', len(holdout_ids))
    holdout_idx = [t for (t, x) in enumerate(body_ids) if x in holdout_ids]
    test_x = data_x[holdout_idx]  # features of test set
    print('holdout_x.shape: ')
    print(test_x.shape)
    test_y = data_y[holdout_idx]
    print(Counter(test_y))
    #return 1

    # to obtain test dataframe for model averaging
    body = pd.read_csv("train_bodies.csv")
    stances = pd.read_csv("train_stances.csv")
    data = pd.merge(stances, body, how='left', on='Body ID')
    targets = ['agree', 'disagree', 'discuss', 'unrelated']
    targets_dict = dict(zip(targets, range(len(targets))))
    data['target'] = data['Stance'].map(targets_dict)
    test_df = data.iloc[holdout_idx]

    cv_ids = {int(x.rstrip()) for x in open('training_ids.txt')}
    print('len(cv_ids): ', len(cv_ids))
    cv_idx = [t for (t, x) in enumerate(body_ids) if x in cv_ids]
    cv_x = data_x[cv_idx]
    print('cv_x.shape: ')
    print(cv_x.shape)
    cv_y = data_y[cv_idx]
    groups = body_ids[cv_idx]  # GroupKFold will make sure all samples
    # having the same "Body ID" will appear in the same fold
    w = np.array([1 if y == 3 else 4 for y in cv_y])
    print('w:')
    print(w)
    print(np.mean(w))

    scores = []
    wscores = []
    pscores = []
    n_folds = 10
    best_iters = [0] * n_folds
    kf = GroupKFold(n_splits=n_folds)
    # need to create disjoint sets for training and validation
    for fold, (trainInd, validInd) in enumerate(kf.split(cv_x, cv_y, groups)):
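        # NOTE: this `continue` skips the entire CV loop as written; the number
        # of boosting rounds is hard-coded below (m_best = 500) instead.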
        continue
        print('fold %s' % fold)
        x_train = cv_x[trainInd]
        y_train = cv_y[trainInd]
        x_valid = cv_x[validInd]
        y_valid = cv_y[validInd]
        idx_valid = np.array(cv_idx)[validInd]

        print('perfect_score: ', perfect_score(y_valid))
        print(Counter(y_valid))
        #break
        dtrain = xgb.DMatrix(x_train, label=y_train, weight=w[trainInd])
        dvalid = xgb.DMatrix(x_valid, label=y_valid, weight=w[validInd])
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        bst = xgb.train(
            params_xgb,
            dtrain,
            num_round,
            watchlist,
            verbose_eval=10,
            #feval = eval_metric,
            #maximize = True,
            early_stopping_rounds=80)

        #pred_y = bst.predict(dvalid, ntree_limit=bst.best_ntree_limit)
        pred_y = bst.predict(dvalid, ntree_limit=bst.best_ntree_limit).reshape(
            y_valid.shape[0], 4)
        print('predicted probabilities: ')
        print(pred_y)
        pred_y = np.argmax(pred_y, axis=1)
        print('predicted label indices: ')
        print(pred_y)

        print('best iterations: ', bst.best_ntree_limit)
        best_iters[fold] = bst.best_ntree_limit

        #pred_y = bst.predict(dvalid)
        print(pred_y)
        #print(Counter(pred_y))
        #pred_y = np.argmax(bst.predict(dvalid, ntree_limit=bst.best_ntree_limit), axis=1)
        print('pred_y.shape')
        print(pred_y.shape)
        print('y_valid.shape')
        print(y_valid.shape)
        #s = fscore(pred_y, y_valid)
        #s_perf = perfect_score(y_valid)
        predicted = [LABELS[int(a)] for a in pred_y]
        actual = [LABELS[int(a)] for a in y_valid]
        # print out the headline & body text for incorrect predictions
        #show_incorrect_pred(actual, predicted, idx_valid)

        s, _ = score_submission(actual, predicted)
        s_perf, _ = score_submission(actual, actual)
        wscore = float(s) / s_perf
        print('fold %s, score = %f, perfect_score %f, weighted percentage %f' %
              (fold, s, s_perf, wscore))
        scores.append(s)
        pscores.append(s_perf)
        wscores.append(wscore)
        #break

    print('scores:')
    print(scores)
    print('mean score:')
    print(np.mean(scores))
    print('perfect scores:')
    print(pscores)
    print('mean perfect score:')
    print(np.mean(pscores))
    print('w scores:')
    print(wscores)
    print('mean w score:')
    print(np.mean(wscores))
    print('best iters:')
    print(best_iters)
    print('mean best_iter:')
    m_best = np.mean(best_iters)
    print(m_best)
    #m_best = best_iters[0]
    m_best = 500
    #m_best = 500
    #m_best = 600
    #return 1

    # use the same parameters to train with full cv data, test on hold-out data
    print('test on holdout set')
    dtrain = xgb.DMatrix(cv_x, label=cv_y, weight=w)
    dtest = xgb.DMatrix(test_x, label=test_y)
    watchlist = [(dtrain, 'train')]
    clf = xgb.train(
        params_xgb,
        dtrain,
        #num_round,
        int(m_best),
        watchlist,
        feval=eval_metric,
        verbose_eval=10)

    pred_prob_holdout_y = clf.predict(dtest).reshape(test_y.shape[0],
                                                     4)  # probabilities
    pred_holdout_y = np.argmax(pred_prob_holdout_y, axis=1)
    print('pred_holdout_y.shape:')
    print(pred_holdout_y.shape)
    print('test_y.shape:')
    print(test_y.shape)
    #s_test = fscore(pred_holdout_y, test_y)
    #s_test_perf = perfect_score(test_y)
    predicted = [LABELS[int(a)] for a in pred_holdout_y]
    actual = [LABELS[int(a)] for a in test_y]
    report_score(actual, predicted)
    print(Counter(predicted))

    test_df['actual'] = actual
    test_df['predicted'] = predicted
    test_df['prob_0'] = pred_prob_holdout_y[:, 0]
    test_df['prob_1'] = pred_prob_holdout_y[:, 1]
    test_df['prob_2'] = pred_prob_holdout_y[:, 2]
    test_df['prob_3'] = pred_prob_holdout_y[:, 3]

    #test_df[['Headline','Body ID', 'Stance', 'actual', 'predicted']].to_csv('predtest.csv', index=False)
    test_df[[
        'Headline', 'Body ID', 'Stance', 'actual', 'predicted', 'prob_0',
        'prob_1', 'prob_2', 'prob_3'
    ]].to_csv('predtest_cor2.csv', index=False)
Example 13
    def train_classifier_GS(self, x, y, groups):

        if self.classifier == "SVM":
            clf = svm.SVC(kernel='rbf', probability=True)
            if self.OneVsRest:
                param_grid = {
                    'estimator__C': [1, 10, 100, 1000],
                    'estimator__gamma': [0.001, 0.0001]
                }
            else:

                param_grid = {
                    'C': [1, 10, 100, 1000],
                    'gamma': [0.001, 0.0001]
                }

        elif self.classifier == "RF":
            clf = RandomForestClassifier()
            if self.OneVsRest:
                param_grid = {
                    'estimator__n_estimators': [100, 250, 500, 750, 1000],
                    'estimator__max_features': ['auto', 'log2'],
                    'estimator__max_depth': [4, 6, 8],
                    'estimator__criterion': ['gini', 'entropy']
                }

            else:
                param_grid = {
                    'n_estimators': [100, 250, 500, 750, 1000],
                    'max_features': ['auto', 'log2'],
                    'max_depth': [4, 6, 8],
                    'criterion': ['gini', 'entropy']
                }

        elif self.classifier == "XGB":
            clf = xgb.XGBClassifier()
            if self.OneVsRest:
                param_grid = {
                    'estimator__max_depth': [4, 6, 8],
                    'estimator__learning_rate': [0.01, 0.1, 0.3]
                }
            else:
                param_grid = {
                    'max_depth': [4, 6, 8],
                    'learning_rate': [0.01, 0.1, 0.3]
                }

        # Apply PCA transformation to the data
        if self.n > 0:
            x_transformed = self.pca.fit_transform(x)
        else:
            x_transformed = x

        # OneVsRest Classifier
        if self.OneVsRest:
            clf = OneVsRestClassifier(clf)

        # Define cross validation method
        #kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=50)
        group_kfold = GroupKFold(n_splits=5)
        kfolds = group_kfold.split(x_transformed, y, groups)

        #Get scoring metrics
        scoring = self.get_scoring()

        #Perform Grid Search
        grid = GridSearchCV(clf, param_grid=param_grid, cv=kfolds, n_jobs=1,
                            scoring=scoring, refit='f1_weighted', verbose=1, return_train_score=True)
        grid.fit(x_transformed, y)

        #Print and log results
        self.save_results(grid)
        self.save_best_result(grid)

        #Save best estimator
        self.clf = grid.best_estimator_

        #disp = plot_precision_recall_curve(best_clf, x_transformed, y)

        return
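
        # Alternative (a sketch): keep the splitter lazy and pass groups at fit
        # time instead; GridSearchCV forwards `groups` to the CV splitter:
        #     grid = GridSearchCV(clf, param_grid=param_grid,
        #                         cv=GroupKFold(n_splits=5), ...)
        #     grid.fit(x_transformed, y, groups=groups)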
Example 14
def main(args):
    # Load the parameters from json file
    params_dir = args.params_dir
    json_path = os.path.join(params_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    # use GPU if available
    params.cuda = torch.cuda.is_available()

    # Set the random seed for reproducible experiments
    torch.manual_seed(params.seed)
    if params.cuda:
        torch.cuda.manual_seed(params.seed)

    # Set the logger
    model_dir = args.output_dir
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    utils.set_logger(os.path.join(model_dir, 'train.log'))

    logging.info("************ Validation fold: {} ************".format(
        args.fold))

    # Create the input data pipeline
    logging.info("Loading the datasets...")

    config_dict = {
        'image_dir': os.path.join(args.input_dir, 'train'),
        'csv_path': os.path.join(args.input_dir, 'train.csv')
    }

    train_data = DataPreprocess(config_dict)
    df, target_cols, num_targets = train_data.df, train_data.target_cols, train_data.num_targets

    # check for debug mode
    if args.debug:
        params.num_epochs = 1
        df = df.sample(n=100, random_state=params.seed).reset_index(drop=True)

    # update params
    params.mode = args.mode
    params.num_targets = num_targets
    params.target_cols = target_cols

    # split data into folds and pass to the model
    Fold = GroupKFold(n_splits=params.num_folds)
    groups = df['PatientID'].values
    for n, (train_index, valid_index) in enumerate(
            Fold.split(df, df[params.target_cols], groups)):
        df.loc[valid_index, 'fold'] = int(n)
    df['fold'] = df['fold'].astype(int)

    # get training and validation data using folds
    train_df = df[df.fold != args.fold].reset_index(drop=True)
    valid_df = df[df.fold == args.fold].reset_index(drop=True)

    # get dataloaders
    train_dataloader = dataloader.fetch_dataloader(train_df,
                                                   params,
                                                   data='train')
    valid_dataloader = dataloader.fetch_dataloader(valid_df,
                                                   params,
                                                   data='valid')

    logging.info("- done.")

    # Define the model and optimizer
    model = RANZCRModel(params, pretrained=True).model
    if params.cuda:
        model = model.to(torch.device('cuda'))

    optimizer = optim.Adam(model.parameters(),
                           lr=params.learning_rate,
                           amsgrad=False)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                               mode='min',
                                               factor=0.1,
                                               patience=2,
                                               verbose=True)

    # fetch loss function and metrics
    loss_fn = nn.BCEWithLogitsLoss()
    metrics = models.metrics

    # Train the model
    logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
    train_and_evaluate(model, train_dataloader, valid_dataloader,
                       valid_df[params.target_cols].values, optimizer,
                       scheduler, loss_fn, metrics, params, model_dir)
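
    # Sanity check for the fold assignment above (a sketch): GroupKFold never
    # splits a group, so every PatientID should map to exactly one fold.
    #     assert (df.groupby('PatientID')['fold'].nunique() == 1).all()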
Example 15
logger.debug(f"Number of rows in train: {n_train}")
logger.debug(f"Number of rows in test: {n_test}")
logger.debug(f"Using features:{train_use.columns.values}")

categorical_cols = ["district", "layout", "direction", "structure"]

####################
## Train model
####################
folds = GroupKFold(n_splits=5)
oof = np.zeros(len(train_use))
predictions = np.zeros(len(test_use))
feature_importance_df = pd.DataFrame()

for fold, (train_idx,
           val_idx) in enumerate(folds.split(train_use, groups=train_group)):
    print(f"Fold {fold+1}")
    train_data = lgb.Dataset(train_use.iloc[train_idx],
                             label=target_log[train_idx],
                             categorical_feature=categorical_cols)
    val_data = lgb.Dataset(train_use.iloc[val_idx],
                           label=target_log[val_idx],
                           categorical_feature=categorical_cols)
    num_round = N_ROUNDS
    callbacks = [log_evaluation(logger, period=100)]
    clf = lgb.train(params,
                    train_data,
                    num_round,
                    valid_sets=[train_data, val_data],
                    verbose_eval=False,
                    early_stopping_rounds=100,
                    # assumed closing argument: `callbacks` is defined above
                    # but otherwise unused in this truncated snippet
                    callbacks=callbacks)

    print(f'Processing fold {fold}')
    model = get_model()
    model.to(DEVICE)
    train_idx, valid_idx = fold_info[fold]
    print(f'Proportions valid / train: {len(valid_idx) / len(train_idx)}')
    train_dl, valid_dl = generate_train_valid_dls(ds, train_idx, valid_idx)
    optimizer, scheduler = create_optimizer_scheduler(model, train_dl, EPOCHS)
    train_losses, valid_losses, accumulated_lrs, accumulated_dice_metrics = train(fold, EPOCHS, train_dl, valid_dl, optimizer, scheduler, patience = PATIENCE)
    return train_losses, valid_losses, accumulated_lrs, accumulated_dice_metrics


# In[50]:


fold_info = [(train_idx, valid_idx) for fold, (train_idx, valid_idx) in tqdm(enumerate(group_kfold.split(ds.slices, 
                                                        groups = groups)), total=FOLDS)]


# In[51]:


# from fastai.data.core import DataLoaders

# train_idx, valid_idx = fold_info[0]
# train_ds, valid_ds = create_subset(ds, train_idx, valid_idx)

# dls = DataLoaders.from_dsets(train_ds, valid_ds, bs=BATCH_SIZE, num_workers=2)
# assert(dls.bs == BATCH_SIZE)


# In[52]:
Example 17
    # Weight train samples by the inverse of how frequently the installation_id appears
    train_features["sample_weight"] = 1 / train_features.groupby(
        "installation_id")["accuracy_group"].transform("count")
    # Estimate accuracy_group proportions in the test set by using sample weights in train instead of sampling
    accuracy_groups = train_features.groupby(
        "accuracy_group")["sample_weight"].agg("sum")
    accuracy_group_proportions = list(accuracy_groups / accuracy_groups.sum())

    assessment_encoder = LabelEncoder()
    train_features["assessment"] = assessment_encoder.fit_transform(
        train_features["assessment"])

    group_kfold = GroupKFold(n_splits=5)
    models = []
    qwk_scores = []
    for train_index, val_index in group_kfold.split(
            train_features, groups=train_features["installation_id"]):
        model, qwk_score = train_and_evaluate(
            train_features,
            list(train_index),
            list(val_index),
            accuracy_group_proportions,
        )
        models.append(model)
        qwk_scores.append(qwk_score)

    print(f"QWK score: {np.mean(qwk_scores)}")

    # Predict on test set
    X_test, installation_ids = (
        test_features.drop(["installation_id"], axis=1),
        test_features["installation_id"],
    param_dist = {
        'n_estimators': stats.randint(100, 1000),
        'learning_rate': stats.uniform(0.01, 0.1),
        'subsample': stats.uniform(0.3, 0.7),
        'max_depth': [3, 4, 5, 6, 7, 8, 9],
        'colsample_bytree': stats.uniform(0.5, 0.45),
        'min_child_weight': [1, 2, 3]
    }

    n_splits = 50
    groups = data_train['Scenario']
    group_kfold = GroupKFold(n_splits=n_splits)

    rmse = list()
    for number, (train_idx, test_idx) in enumerate(
            group_kfold.split(data_train, groups=groups)):
        print(f'Fold {number + 1} of {n_splits}')
        X_train, Y_train = x_.iloc[train_idx, :], y_.iloc[train_idx]
        X_test, Y_test = x_.iloc[test_idx, :], y_.iloc[test_idx]
        groups_train = groups[train_idx]

        group_kfold_inner = GroupKFold(n_splits=3)

        xgb_regressor_model = xgb.XGBRegressor(objective='reg:squarederror')
        xgb_regressor_search = RandomizedSearchCV(
            xgb_regressor_model,
            param_distributions=param_dist,
            n_iter=10,
            scoring='neg_mean_squared_error',
            cv=group_kfold_inner.split(X_train, groups=groups_train),
            refit=True)

data = data.merge(user_ais_data, on=['user_id', 'aisle_id'], how='left')
del user_ais_data

train_set = data[data['eval_set'] == 'train']

del data
"""
Task 1: fit a single decision tree and tune its parameters.
"""

from sklearn.model_selection import GroupKFold
kf = GroupKFold(n_splits=5)
train_indexes = []
test_indexes = []
for i, (train_index, test_index) in enumerate(
        kf.split(train_set, groups=train_set['user_id'].values)):
    train_indexes.append(train_index)
    test_indexes.append(test_index)
train_index = train_indexes[0]
test_index = test_indexes[0]

training = train_set.iloc[train_index, :]
testing = train_set.iloc[test_index, :]
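
# Note: since only the first split is needed, the loop above could be replaced
# by a single call:
#     train_index, test_index = next(
#         kf.split(train_set, groups=train_set['user_id'].values))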

del train_set
"""
Decision Tree

"""
col = list(training.columns)
col.remove('reordered')
group_kfold = GroupKFold(n_splits=NFOLD)
sub_train['g'] = sub_train.index % NFOLD

CAT = list(set(X.columns) & set(utils_cat.ALL))

# =============================================================================
# cv
# =============================================================================
dtrain = lgb.Dataset(X, y, categorical_feature=CAT)
gc.collect()

ret = lgb.cv(param,
             dtrain,
             9999,
             folds=group_kfold.split(X, sub_train['y'], sub_train['g']),
             early_stopping_rounds=100,
             verbose_eval=50,
             seed=SEED)

result = f"CV auc-mean: {ret['auc-mean'][-1]}"
print(result)

utils.send_line(result)

# =============================================================================
# imp
# =============================================================================
dtrain = lgb.Dataset(X, y, categorical_feature=CAT)
model = lgb.train(param, dtrain, len(ret['auc-mean']))
imp = ex.getImp(model).sort_values(['gain', 'feature'],
Example 21
def multiclass_one_vs_rest(x,
                           y,
                           model_type='svm',
                           plot=False,
                           verbose=False,
                           run_cv=False):

    # First split the data into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, random_state=RANDOM_SEED)

    # pick the base classifier based on the model_type parameter
    if model_type == 'logistic':
        base_model = LogisticRegression(random_state=RANDOM_SEED,
                                        class_weight='balanced')

    elif model_type == 'tree':
        base_model = DecisionTreeClassifier(random_state=RANDOM_SEED,
                                            class_weight='balanced')

    elif model_type == 'adaboost':
        base_model = AdaBoostClassifier(random_state=RANDOM_SEED)

    elif model_type == 'forest':
        # base case
        base_model = RandomForestClassifier(random_state=RANDOM_SEED,
                                            class_weight='balanced')

        # this gives no improvement over base case
        #base_model = RandomForestClassifier(random_state=RANDOM_SEED, class_weight='balanced', n_estimators = 50, max_depth=20)

        # no improvement here either
        #base_model = RandomForestClassifier(random_state=RANDOM_SEED, class_weight='balanced', n_estimators = 100, max_depth=40)

    elif model_type == 'nnet':
        base_model = MLPClassifier(random_state=RANDOM_SEED)

    elif model_type == 'extra':
        base_model = ExtraTreesClassifier(random_state=RANDOM_SEED)
        base_model = ExtraTreesClassifier(random_state=RANDOM_SEED)

    else:
        #base case
        base_model = SVC(kernel='linear',
                         random_state=RANDOM_SEED,
                         class_weight='balanced',
                         probability=True)

        # no improvement here
        #base_model = SVC(kernel='linear', random_state=RANDOM_SEED, class_weight='balanced', activation="logistic", max_iter=500)

    # create the OvR model using the base classifier
    # model = OneVsRestClassifier(base_model, n_jobs=10)
    # create OvR model with base classifier and feature selection
    model = CustomBRClassifier(base_model)

    # train the model using the training data
    fit_start = time.time()
    model.fit(x_train, y_train)
    fit_end = time.time()

    if verbose:
        print('------ model info ----------')
        print('one vs all ' + model_type + ' is a multi-label classifier: ' +
              str(model.multilabel_))
        print('one vs all ' + model_type + ' number of classes: ' +
              str(model.classes_))
        print('one vs all ' + model_type + ' elapsed training time: ' +
              str(fit_end - fit_start))

    # check the accuracy on the training data
    if verbose:
        print('------ training data ----------')
    fpr_train, tpr_train, auc_train = check_predictions(
        model, (model_type + " - train"), x_train, y_train, plot, verbose)

    # check the accuracy on the test data
    if verbose:
        print('------ test data ----------')

    fpr_test, tpr_test, auc_test = check_predictions(model,
                                                     (model_type + " - test"),
                                                     x_test, y_test, plot,
                                                     verbose)
    # generate_roc_hist(y_test, model.predict_proba(x_test))

    # get the cross-validation score
    if run_cv:

        accuracy_scorer = make_scorer(calculate_overall_accuracy)
        kf = GroupKFold(5)
        cv_accuracy_scores = []
        cv_auc_scores = []
        for train_index, test_index in kf.split(x,
                                                y=y,
                                                groups=np.arange(x.shape[0])):
            X_train, X_test = x.iloc[train_index], x.iloc[test_index]
            Y_train, Y_test = y[train_index], y[test_index]
            model.fit(X_train, Y_train)
            cv_auc_scores.append(
                roc_auc_score(Y_test,
                              model.predict_proba(X_test),
                              average='micro'))
            cv_accuracy_scores.append(
                calculate_overall_accuracy(Y_test,
                                           model.predict_proba(X_test)))
        cv_accuracy_scores = np.mean(np.array(cv_accuracy_scores))
        cv_auc_scores = np.mean(np.array(cv_auc_scores))

        if verbose:
            print('------ CV scores ----------')
            print('one vs all ' + model_type + ' CV accuracy scores ' +
                  str(cv_accuracy_scores))
            print('one vs all ' + model_type + ' CV AUC scores ' +
                  str(cv_auc_scores))

    return fpr_train, tpr_train, auc_train, fpr_test, tpr_test, auc_test
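

# Note on the CV block above: with groups=np.arange(x.shape[0]) every sample is
# its own group, so GroupKFold(5) degenerates into an ordinary ungrouped 5-fold
# split; a real grouping column would be needed for group-aware validation.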
Example 22
logger.debug(f"Number of rows in train: {n_train}")
logger.debug(f"Number of rows in test: {n_test}")
logger.debug(f"Using features:{train_use.columns.values}")

categorical_cols = ["district", "layout", "direction", "structure"]

####################
## Train model
####################
folds = GroupKFold(n_splits=5)
oof = np.zeros(len(train_use))
predictions = np.zeros(len(test_use))
feature_importance_df = pd.DataFrame()

for fold, (train_idx,
           val_idx) in enumerate(folds.split(train_use,
                                             groups=group_for_kfold)):
    print(f"Fold {fold+1}")
    train_data = lgb.Dataset(train_use.iloc[train_idx],
                             label=target_log[train_idx],
                             categorical_feature=categorical_cols)
    val_data = lgb.Dataset(train_use.iloc[val_idx],
                           label=target_log[val_idx],
                           categorical_feature=categorical_cols)
    num_round = N_ROUNDS
    callbacks = [log_evaluation(logger, period=100)]
    clf = lgb.train(params,
                    train_data,
                    num_round,
                    valid_sets=[train_data, val_data],
                    verbose_eval=False,
                    early_stopping_rounds=100,
Example 23
def RandomForest(X, Y, groups, n_trees):
    lpgo = GroupKFold(n_splits=14)
    MAE = []
    ECM = []
    MAPE = []
    R2_SCORE = []
    relevant_features = []
    N = np.size(Y[0])
    for train_index, test_index in lpgo.split(X, Y, groups):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        # Normalize the data
        sc_X = StandardScaler()
        X_train = sc_X.fit_transform(X_train)
        X_test = sc_X.transform(X_test)

        # Fit the regression model
        regressor = RandomForestRegressor(n_estimators=n_trees, random_state=0)
        regressor.fit(X_train, y_train)

        relevant_features.append(regressor.feature_importances_)

        # Predict on the test set
        y_pred = regressor.predict(X_test)

        ECM.append(mean_squared_error(y_test, y_pred,
                                      multioutput='raw_values'))
        MAE.append(
            mean_absolute_error(y_test, y_pred, multioutput='raw_values'))
        R2_SCORE.append(r2_score(y_test, y_pred, multioutput='raw_values'))
        m = []
        m.append(
            np.mean(np.abs(
                (y_test[:, 0] - y_pred[:, 0]) / y_test[:, 0])) * 100)
        m.append(
            np.mean(np.abs(
                (y_test[:, 1] - y_pred[:, 1]) / y_test[:, 1])) * 100)
        MAPE.append(m)

    ECM_matrix = np.asmatrix(ECM)
    MAE_matrix = np.asmatrix(MAE)
    MAPE_matrix = np.asmatrix(MAPE)
    R2_matrix = np.asmatrix(R2_SCORE)

    for i in range(0, N):
        print("Validation mean squared error for output", (i + 1),
              "(ECM):", np.around(np.mean(ECM_matrix[:, i]), decimals=3),
              "+-", np.around(np.std(ECM_matrix[:, i]), decimals=3))
        print("Validation mean absolute error for output", (i + 1),
              "(MAE):", np.around(np.mean(MAE_matrix[:, i]), decimals=3),
              "+-", np.around(np.std(MAE_matrix[:, i]), decimals=3))
        print("Validation mean absolute percentage error for output",
              (i + 1), "(MAPE):",
              np.around(np.mean(MAPE_matrix[:, i]), decimals=3), "%", "+-",
              np.around(np.std(MAPE_matrix[:, i]), decimals=3), "%")
        print("Coefficient of determination for output", (i + 1),
              "(R2):", np.around(np.mean(R2_matrix[:, i]), decimals=3),
              "+-", np.around(np.std(R2_matrix[:, i]), decimals=3))

    relevant_features = np.asmatrix(relevant_features)
    print(np.mean(relevant_features, axis=0))
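
    # Constraint worth noting (a sketch): GroupKFold(n_splits=14) requires at
    # least 14 distinct values in `groups`; split() raises ValueError otherwise.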
    def __init__(self,
                 seed,
                 val_split=0.2,
                 shuffle=True,
                 cell_features=['expression'],
                 drug_features=['descriptors'],
                 use_landmark_genes=False,
                 use_combo_score=False,
                 feature_subsample=None,
                 scaling='std',
                 scramble=False,
                 cv_partition='overlapping',
                 cv=0):
        """Initialize data merging drug response, drug descriptors and cell line essay.
           Shuffle and split training and validation set

        Parameters
        ----------
        seed: integer
            seed for random generation
        val_split : float, optional (default 0.2)
            fraction of data to use in validation
        cell_features: list of strings from 'expression', 'expression_5platform', 'mirna', 'proteome', 'all', 'categorical' (default ['expression'])
            use one or more cell line feature sets: gene expression, microRNA, proteome
            use 'all' for ['expression', 'mirna', 'proteome']
            use 'categorical' for one-hot encoded cell lines
        drug_features: list of strings from 'descriptors', 'latent', 'all', 'categorical', 'noise' (default ['descriptors'])
            use dragon7 descriptors, latent representations from Aspuru-Guzik's SMILES autoencoder
            trained on NSC drugs, or both; use random features if set to noise
            use 'categorical' for one-hot encoded drugs
        shuffle : True or False, optional (default True)
            if True shuffles the merged data before splitting training and validation sets
        scramble: True or False, optional (default False)
            if True randomly shuffle dose response data as a control
        feature_subsample: None or integer (default None)
            number of feature columns to use from cellline expressions and drug descriptors
        use_landmark_genes: True or False
            only use LINCS1000 landmark genes
        use_combo_score: bool (default False)
            use combination score in place of percent growth (stored in 'GROWTH' column)
        scaling: None, 'std', 'minmax' or 'maxabs' (default 'std')
            type of feature scaling: 'minmax' to [0, 1], 'maxabs' to [-1, 1], 'std' for standard normalization
        """

        self.cv_partition = cv_partition

        np.random.seed(seed)

        df = NCI60.load_combo_response(use_combo_score=use_combo_score,
                                       fraction=True)
        logger.info('Loaded {} unique (CL, D1, D2) response sets.'.format(
            df.shape[0]))

        if 'all' in cell_features:
            self.cell_features = ['expression', 'mirna', 'proteome']
        else:
            self.cell_features = cell_features

        if 'all' in drug_features:
            self.drug_features = ['descriptors', 'latent']
        else:
            self.drug_features = drug_features

        for fea in self.cell_features:
            if fea == 'expression' or fea == 'rnaseq':
                self.df_cell_expr = NCI60.load_cell_expression_rnaseq(
                    ncols=feature_subsample,
                    scaling=scaling,
                    use_landmark_genes=use_landmark_genes)
                df = df.merge(self.df_cell_expr[['CELLNAME']], on='CELLNAME')
            elif fea == 'expression_u133p2':
                self.df_cell_expr = NCI60.load_cell_expression_u133p2(
                    ncols=feature_subsample,
                    scaling=scaling,
                    use_landmark_genes=use_landmark_genes)
                df = df.merge(self.df_cell_expr[['CELLNAME']], on='CELLNAME')
            elif fea == 'expression_5platform':
                self.df_cell_expr = NCI60.load_cell_expression_5platform(
                    ncols=feature_subsample,
                    scaling=scaling,
                    use_landmark_genes=use_landmark_genes)
                df = df.merge(self.df_cell_expr[['CELLNAME']], on='CELLNAME')
            elif fea == 'mirna':
                self.df_cell_mirna = NCI60.load_cell_mirna(
                    ncols=feature_subsample, scaling=scaling)
                df = df.merge(self.df_cell_mirna[['CELLNAME']], on='CELLNAME')
            elif fea == 'proteome':
                self.df_cell_prot = NCI60.load_cell_proteome(
                    ncols=feature_subsample, scaling=scaling)
                df = df.merge(self.df_cell_prot[['CELLNAME']], on='CELLNAME')
            elif fea == 'categorical':
                df_cell_ids = df[['CELLNAME']].drop_duplicates()
                cell_ids = df_cell_ids['CELLNAME'].map(
                    lambda x: x.replace(':', '.'))
                df_cell_cat = pd.get_dummies(cell_ids)
                df_cell_cat.index = df_cell_ids['CELLNAME']
                self.df_cell_cat = df_cell_cat.reset_index()

        for fea in self.drug_features:
            if fea == 'descriptors':
                self.df_drug_desc = NCI60.load_drug_descriptors(
                    ncols=feature_subsample, scaling=scaling)
                df = df[df['NSC1'].isin(self.df_drug_desc['NSC'])
                        & df['NSC2'].isin(self.df_drug_desc['NSC'])]
            elif fea == 'latent':
                self.df_drug_auen = NCI60.load_drug_autoencoded_AG(
                    ncols=feature_subsample, scaling=scaling)
                df = df[df['NSC1'].isin(self.df_drug_auen['NSC'])
                        & df['NSC2'].isin(self.df_drug_auen['NSC'])]
            elif fea == 'categorical':
                df_drug_ids = df[['NSC1']].drop_duplicates()
                df_drug_ids.columns = ['NSC']
                drug_ids = df_drug_ids['NSC']
                df_drug_cat = pd.get_dummies(drug_ids)
                df_drug_cat.index = df_drug_ids['NSC']
                self.df_drug_cat = df_drug_cat.reset_index()
            elif fea == 'noise':
                ids1 = df[['NSC1'
                           ]].drop_duplicates().rename(columns={'NSC1': 'NSC'})
                ids2 = df[['NSC2'
                           ]].drop_duplicates().rename(columns={'NSC2': 'NSC'})
                df_drug_ids = pd.concat([ids1, ids2]).drop_duplicates()
                noise = np.random.normal(size=(df_drug_ids.shape[0], 500))
                df_rand = pd.DataFrame(
                    noise,
                    index=df_drug_ids['NSC'],
                    columns=['RAND-{:03d}'.format(x) for x in range(500)])
                self.df_drug_rand = df_rand.reset_index()

        logger.info(
            'Filtered down to {} rows with matching information.'.format(
                df.shape[0]))

        ids1 = df[['NSC1']].drop_duplicates().rename(columns={'NSC1': 'NSC'})
        ids2 = df[['NSC2']].drop_duplicates().rename(columns={'NSC2': 'NSC'})
        df_drug_ids = pd.concat([ids1, ids2
                                 ]).drop_duplicates().reset_index(drop=True)

        n_drugs = df_drug_ids.shape[0]
        n_val_drugs = int(n_drugs * val_split)
        n_train_drugs = n_drugs - n_val_drugs

        logger.info('Unique cell lines: {}'.format(df['CELLNAME'].nunique()))
        logger.info('Unique drugs: {}'.format(n_drugs))
        # df.to_csv('filtered.growth.min.tsv', sep='\t', index=False, float_format='%.4g')
        # df.to_csv('filtered.score.max.tsv', sep='\t', index=False, float_format='%.4g')

        if shuffle:
            df = df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
            df_drug_ids = df_drug_ids.sample(
                frac=1.0, random_state=seed).reset_index(drop=True)

        self.df_response = df
        self.df_drug_ids = df_drug_ids

        self.train_drug_ids = df_drug_ids['NSC'][:n_train_drugs]
        self.val_drug_ids = df_drug_ids['NSC'][-n_val_drugs:]

        if scramble:
            growth = df[['GROWTH']]
            random_growth = growth.iloc[np.random.permutation(
                np.arange(growth.shape[0]))].reset_index()
            self.df_response[['GROWTH']] = random_growth['GROWTH']
            logger.warning('Randomly shuffled dose response growth values.')

        logger.info('Distribution of dose response:')
        logger.info(self.df_response[['GROWTH']].describe())

        self.total = df.shape[0]
        self.n_val = int(self.total * val_split)
        self.n_train = self.total - self.n_val
        logger.info('Rows in train: {}, val: {}'.format(
            self.n_train, self.n_val))

        self.cell_df_dict = {
            'expression': 'df_cell_expr',
            'expression_5platform': 'df_cell_expr',
            'expression_u133p2': 'df_cell_expr',
            'rnaseq': 'df_cell_expr',
            'mirna': 'df_cell_mirna',
            'proteome': 'df_cell_prot',
            'categorical': 'df_cell_cat'
        }

        self.drug_df_dict = {
            'descriptors': 'df_drug_desc',
            'latent': 'df_drug_auen',
            'categorical': 'df_drug_cat',
            'noise': 'df_drug_rand'
        }

        self.input_features = collections.OrderedDict()
        self.feature_shapes = {}
        for fea in self.cell_features:
            feature_type = 'cell.' + fea
            feature_name = 'cell.' + fea
            df_cell = getattr(self, self.cell_df_dict[fea])
            self.input_features[feature_name] = feature_type
            self.feature_shapes[feature_type] = (df_cell.shape[1] - 1, )

        for drug in ['drug1', 'drug2']:
            for fea in self.drug_features:
                feature_type = 'drug.' + fea
                feature_name = drug + '.' + fea
                df_drug = getattr(self, self.drug_df_dict[fea])
                self.input_features[feature_name] = feature_type
                self.feature_shapes[feature_type] = (df_drug.shape[1] - 1, )

        logger.info('Input features shapes:')
        for k, v in self.input_features.items():
            logger.info('  {}: {}'.format(k, self.feature_shapes[v]))

        self.input_dim = sum([
            np.prod(self.feature_shapes[x])
            for x in self.input_features.values()
        ])
        logger.info('Total input dimensions: {}'.format(self.input_dim))

        if cv > 1:
            if cv_partition == 'disjoint':
                pass
            elif cv_partition == 'disjoint_cells':
                y = self.df_response['GROWTH'].values
                groups = self.df_response['CELLNAME'].values
                gkf = GroupKFold(n_splits=cv)
                splits = gkf.split(y, groups=groups)
                self.cv_train_indexes = []
                self.cv_val_indexes = []
                for index, (train_index, val_index) in enumerate(splits):
                    print(index, train_index)
                    self.cv_train_indexes.append(train_index)
                    self.cv_val_indexes.append(val_index)
            else:
                y = self.df_response['GROWTH'].values
                # kf = KFold(n_splits=cv)
                # splits = kf.split(y)
                skf = StratifiedKFold(n_splits=cv, shuffle=True,
                                      random_state=seed)
                splits = skf.split(y, discretize(y, bins=cv))
                self.cv_train_indexes = []
                self.cv_val_indexes = []
                for index, (train_index, val_index) in enumerate(splits):
                    print(index, train_index)
                    self.cv_train_indexes.append(train_index)
                    self.cv_val_indexes.append(val_index)
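            # Note: GroupKFold.split in the 'disjoint_cells' branch above uses
            # its first argument only for its length, so passing the 1-D y
            # array in place of a feature matrix is harmless.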
Example 25
def split_data(df,
               ycol='0',
               classify=False,
               cv=5,
               bins=0,
               cutoffs=None,
               groupcols=None,
               ignore_categoricals=False,
               verbose=True):
    if groupcols is not None:
        groups = make_group_from_columns(df, groupcols)

    cat_cols = df.select_dtypes(['object']).columns
    if ignore_categoricals:
        df[cat_cols] = 0
    else:
        df[cat_cols] = df[cat_cols].apply(
            lambda x: x.astype('category').cat.codes)

    if ycol.isdigit():
        ycol = df.columns[int(ycol)]

    y = df.loc[:, ycol].to_numpy()
    x = df.drop(ycol, axis=1).to_numpy()
    features = df.drop(ycol, axis=1).columns.tolist()

    if verbose:
        print('Target column: {}'.format(ycol))
        print('  count = {}, uniq = {}, mean = {:.3g}, std = {:.3g}'.format(
            len(y), len(np.unique(y)), np.mean(y), np.std(y)))
        print(
            '  min = {:.3g}, q1 = {:.3g}, median = {:.3g}, q3 = {:.3g}, max = {:.3g}'
            .format(np.min(y), np.percentile(y, 25), np.median(y),
                    np.percentile(y, 75), np.max(y)))

    if not classify:
        y_even = discretize(y, bins=5, verbose=False)
    elif bins >= 2:
        y = discretize(y, bins=bins, min_count=cv, verbose=verbose)
    elif cutoffs:
        y = discretize(y, cutoffs=cutoffs, min_count=cv, verbose=verbose)
    elif df[ycol].dtype in [np.dtype('float64'), np.dtype('float32')]:
        warnings.warn(
            'Warning: classification target is float; consider using --bins or --cutoffs'
        )
        y = y.astype(int)

    if classify:
        mask = np.ones(len(y), dtype=bool)
        unique, counts = np.unique(y, return_counts=True)
        for v, c in zip(unique, counts):
            if c < cv:
                mask[y == v] = False
        x = x[mask]
        y = y[mask]
        removed = len(mask) - np.sum(mask)
        if removed and verbose:
            print('Removed {} rows in small classes: count < {}'.format(
                removed, cv))

    if groupcols is None:
        if classify:
            y_even = y
        skf = StratifiedKFold(n_splits=cv, shuffle=True)
        splits = skf.split(x, y_even)
    else:
        if classify:
            groups = groups[mask]
        gkf = GroupKFold(n_splits=cv)
        splits = gkf.split(x, y, groups)

    if verbose:
        print()

    return x, y, list(splits), features
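

# Hypothetical usage of split_data (the column names are illustrative only):
#     x, y, splits, features = split_data(df, ycol='target', cv=5,
#                                         groupcols=['subject_id'])
#     for fold, (train_idx, val_idx) in enumerate(splits):
#         ...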
Example 26
col = list(train.columns)
col.remove('reordered')
col.remove('eval_set')
col.remove('user_id')
col.remove('product_id')
col.remove('order_id')
col.remove('department_id')
col.remove('aisle_id')

from sklearn.model_selection import GroupKFold
kf = GroupKFold(n_splits=5)
train_indexes = []
test_indexes = []
for i, (train_index, test_index) in enumerate(
        kf.split(train, groups=train['user_id'].values)):
    train_indexes.append(train_index)
    test_indexes.append(test_index)
train_index = train_indexes[0]
test_index = test_indexes[0]

training = train.iloc[train_index, :]
testing = train.iloc[test_index, :]
del train
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

del train_index, train_indexes, test_index, test_indexes, i
del test

import lightgbm as lgb
    # Plot and save bar chart
    plt.rcParams['xtick.labelsize'] = 8
    ax = scaled_var_imp_df_sorted.plot.bar(y='scaled_importance',
                                           x='variable',
                                           rot=90,
                                           figsize=(16, 12))
    plt.tight_layout()
    plt.savefig('FS_result_h2o.pdf', format='pdf', dpi=1200)

# Perform k-fold cv
# split on train - test dataset by group - according to no_folds
gkf = GroupKFold(n_splits=no_folds)
cv_fold = 0

for train_index, test_index in gkf.split(X, y, groups=groups):
    cv_fold += 1
    print("CV fold: ", cv_fold)
    # print("Train Index: ", train_index)
    # print("Test Index: ", test_index, "\n")
    #print('Groups: ', groups,'\n')

    trainX_data = X.loc[train_index]
    trainy_data = y.loc[train_index]

    testX_data = X.loc[test_index]
    testy_data = y.loc[test_index]

    # Save original 10cv folds with all features
    train_set = pd.concat([trainX_data, trainy_data, groups],
                          axis=1,
Example 28
def group_test_2(pre_x, y, kmeans_labels, names, num_dic, groups, num_vars,
                 meta_i):
    # one slot per k-means cluster: the index and score of its best variable
    chosen_vars = np.zeros(meta_i, dtype=int)
    chosen_values = np.zeros(meta_i)
    # keep, for each cluster, the variable whose score in num_dic is highest
    for i in range(num_vars):
        new_val = num_dic[i]
        clust = kmeans_labels[i]
        if chosen_values[clust] < new_val:
            chosen_vars[clust] = i
            chosen_values[clust] = new_val
    chosen_works = [int(v) for v in chosen_vars]
    chosen_names = [names[v] for v in chosen_works]
    X = pre_x[:, chosen_works]
    group_kfold = GroupKFold(n_splits=3)
    acc_arr = []
    y = np.asarray(y)  # ensure fancy indexing works if y arrives as a list
    for train_index, test_index in group_kfold.split(X, y, groups):
        # numpy fancy indexing replaces the original element-by-element loops
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # if model=='svm':
        # clf=SVC().fit(X_train,y_train)
        # elif model=='rf_extra':
        # clf=ExtraTreesClassifier().fit(X_train,y_train)
        # elif model=='rf':
        # clf=RandomForestClassifier().fit(X_train,y_train)
        # elif model=='nb':
        # clf=GaussianNB().fit(X_train,y_train)
        # elif model=='lr':
        # clf=LogisticRegression().fit(X_train,y_train)
        clf = RandomForestClassifier().fit(X_train, y_train)
        acc_arr.append(clf.score(X_test, y_test))
    return np.mean(acc_arr), chosen_names
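
# Added note: the hand-rolled fold loop above is equivalent to a single
# cross_val_score call; a sketch using the names local to group_test_2:
#     from sklearn.model_selection import cross_val_score
#     scores = cross_val_score(RandomForestClassifier(), X, y,
#                              groups=groups, cv=group_kfold)
#     return np.mean(scores), chosen_names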
Esempio n. 29
0
    if cross_test:
        this_scores = cross_val_score(pipe, Sample, target, cv=10)
        print('10-fold cross-validation:\n')
        print('10-fold CV accuracies', this_scores)
        print('10-fold CV mean accuracy', this_scores.mean())
        print('10-fold CV accuracy std', this_scores.std())
        print('-----------------------------------------------')

        group = sio.loadmat(
            'D:\\1-embed\\4-Serial_GUI\\分类模型训练\\tmp\\group.mat')
        group = group['group_individual']
        group = group.reshape(-1)
        print(group)
        gkf = GroupKFold(n_splits=11)
        for train, test in gkf.split(Sample, target, groups=group):
            print("train-%s test-%s" % (group[train], group[test]))
        this_scores = cross_val_score(pipe,
                                      Sample,
                                      target,
                                      groups=group,
                                      cv=gkf)
        print('group k-fold cross-validation:\n')
        print('group CV accuracies', this_scores)
        print('group CV mean accuracy', this_scores.mean())
        print('group CV accuracy std', this_scores.std())

    if save:
        # use dump() to serialize the model to a file
        fw = open('D:\\1-embed\\4-Serial_GUI\\分类模型训练\\tmp\\ModelFile.txt',
                  'wb')
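
# The snippet is truncated here; a plausible continuation, added as a
# sketch (pickle and the dumped variable are assumptions), given the
# comment above about serializing with dump():
#     import pickle
#     pickle.dump(pipe, fw)
#     fw.close()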
Esempio n. 30
0
def validate_train(model,
                   X,
                   y,
                   groups,
                   oversample=True,
                   n_splits=5,
                   dump=DUMP_DEFAULT,
                   model_folder=MODEL_FOLDER_DEFAULT,
                   metric=f1_score,
                   verbose=False,
                   num_importance=20):
    kf = GroupKFold(n_splits=n_splits)
    all_y = []
    all_predicted = []
    for train_index, test_index in kf.split(X, y, groups):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        if oversample:
            X_tmp, y_tmp = oversample_data(X_train, y_train, verbose=verbose)
            model.train_model(X_tmp, y_tmp)
        else:
            model.train_model(X_train, y_train)
        pred = model.predict_batch(X_test)

        all_predicted.extend(pred)
        all_y.extend(y_test)

    print(">>> MODEL: ", model.model_name)
    print("Params:", model.get_description())
    all_y = model.encode2idx(all_y)

    if metric is f1_score:
        result = metric(all_y, all_predicted, average=None)
    else:
        result = metric(all_y, all_predicted)

    if dump:
        if oversample:
            X_tmp, y_tmp = oversample_data(X, y, verbose=verbose)
            model.train_model(X_tmp, y_tmp)
        else:
            model.train_model(X, y)

        print("FEATURE_IMPORTANCE")

        importances = model.get_feature_importance()
        labels = model.get_labels()
        print("=== labels {} ===".format(labels))
        if importances is not None:
            for imp_line, label in zip(importances, labels):
                print("\nLABEL: ", label)
                print("*" * 20)
                print("\n --- TOP {} most important --- \n".format(
                    num_importance))
                for n, val in imp_line[:num_importance]:
                    print("{}\t{}".format(n, np.round(val, 3)))

                print("\n --- TOP {} anti features --- \n".format(
                    num_importance))
                for n, val in imp_line[::-1][:num_importance]:
                    print("{}\t{}".format(n, np.round(val, 3)))

        model.dump_model(os.path.join(model_folder, model.model_name))
        print("== MODEL DUMPED ==")

    print("classif_report:\n", classification_report(all_y, all_predicted))
    log_results(all_y, all_predicted, model.model_name, model)
    return result
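
# Added reference sketch: with a plain sklearn estimator (the custom
# model wrapper used above is not one), cross_val_predict would collect
# the same style of out-of-fold predictions in a single call:
#     from sklearn.model_selection import cross_val_predict
#     oof_pred = cross_val_predict(estimator, X, y, groups=groups,
#                                  cv=GroupKFold(n_splits=5))
#     print(classification_report(y, oof_pred))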
Esempio n. 31
0
def run_cv_model_by_batch(train, test, splits, batch_col, feats, sample_submission, nn_epochs, nn_batch_size):
    
    seed_everything(SEED)
    K.clear_session()
    config = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1,inter_op_parallelism_threads=1)
    sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=config)
    tf.compat.v1.keras.backend.set_session(sess)
    oof_ = np.zeros((len(train), 11)) # build out of folds matrix with 11 columns, they represent our target variables classes (from 0 to 10)
    preds_ = np.zeros((len(test), 11))
    target = ['open_channels']
    group = train['group']
    kf = GroupKFold(n_splits=5)
    splits = [x for x in kf.split(train, train[target], group)]
    oof_pd = pd.DataFrame()
    oof_pd['open_channels'] = train['open_channels']

    new_splits = []
    for sp in splits:
        new_split = []
        new_split.append(np.unique(group[sp[0]]))
        new_split.append(np.unique(group[sp[1]]))
        new_split.append(sp[1])    
        new_splits.append(new_split)
    # pivot target columns to transform the net to a multiclass classification structure (you could also keep a single vector with a sparse categorical cross-entropy loss)
    tr = pd.concat([pd.get_dummies(train.open_channels), train[['group']]], axis=1)

    tr.columns = ['target_'+str(i) for i in range(11)] + ['group']
    target_cols = ['target_'+str(i) for i in range(11)]
    
    train_tr = np.array(list(tr.groupby('group').apply(lambda x: x[target_cols].values))).astype(np.float32)
    train = np.array(list(train.groupby('group').apply(lambda x: x[feats].values)))
    test = np.array(list(test.groupby('group').apply(lambda x: x[feats].values)))

    for n_fold, (tr_idx, val_idx, val_orig_idx) in enumerate(new_splits):
        train_x, train_y = train[tr_idx], train_tr[tr_idx]
        valid_x, valid_y = train[val_idx], train_tr[val_idx]
        print(f'Our training dataset shape is {train_x.shape}')
        print(f'Our validation dataset shape is {valid_x.shape}')

        gc.collect()
        shape_ = (None, train_x.shape[2]) # input is going to be the number of feature we are using (dimension 2 of 0, 1, 2)
        model = Classifier(shape_)
        # using our lr_schedule function
        cb_lr_schedule = LearningRateScheduler(lr_schedule)
        # Use Early-Stopping
        callback_early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=0, mode='auto')
        model.fit(train_x,train_y,
                  epochs = nn_epochs,
                  callbacks = [callback_early_stopping,cb_lr_schedule, MacroF1(model, valid_x, valid_y)], # adding custom evaluation metric for each epoch
                  batch_size = nn_batch_size,verbose = 2,
                  validation_data = (valid_x,valid_y))
        preds_f = model.predict(valid_x)
        f1_score_ = f1_score(np.argmax(valid_y, axis=2).reshape(-1),  np.argmax(preds_f, axis=2).reshape(-1), average = 'macro') # need to get the class with the biggest probability
        print(f'Training fold {n_fold + 1} completed. macro f1 score : {f1_score_ :1.5f}')
        preds_f = preds_f.reshape(-1, preds_f.shape[-1])
        oof_[val_orig_idx,:] += preds_f
        te_preds = model.predict(test)
        te_preds = te_preds.reshape(-1, te_preds.shape[-1])           
        preds_ += te_preds / SPLITS
        del train_x, train_y, valid_x, valid_y
    # calculate the oof macro f1_score
    f1_score_ = f1_score(np.argmax(train_tr, axis = 2).reshape(-1),  np.argmax(oof_, axis = 1), average = 'macro') # axis 2 for the 3-dimensional array and axis 1 for the 2-dimensional array (extracting the best class)
    print(f'Training completed. oof macro f1 score : {f1_score_:1.5f}')
    sample_submission['open_channels'] = np.argmax(preds_, axis = 1).astype(int)
    oof_pd['open_channels_pred'] = np.argmax(oof_, axis = 1).astype(int)
    sample_submission.to_csv('submission_wavenet.csv', index=False, float_format='%.4f')
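
# Added note: the out-of-fold frame oof_pd is assembled above but never
# written in this snippet; a plausible final step (an assumption) would
# mirror the submission line, e.g.
#     oof_pd.to_csv('oof_wavenet.csv', index=False)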
# ## Make Folds

# In[13]:

df_train = pd.read_csv(f'{datadir}train.csv')  # load train.csv as a dataframe
df_train['file_path'] = df_train.image.apply(
    lambda x: os.path.join(f"{datadir}train_images", x))  # add each image's file path

# In[14]:

# split the data into 5 folds and record each row's fold in df_train
gkf = GroupKFold(n_splits=5)
df_train['fold'] = -1
for fold, (train_idx, valid_idx) in enumerate(
        gkf.split(df_train, None, df_train.label_group)):
    df_train.loc[valid_idx, 'fold'] = fold

# In[15]:

df_train.head()

# In[16]:

# label-encode label_group
# further reading: https://blog.csdn.net/weixin_43172660/article/details/84886470
le = LabelEncoder()
df_train.label_group = le.fit_transform(df_train.label_group)
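
# Added sanity check (not in the original notebook): after grouping on
# label_group, every group must land in exactly one fold.
assert df_train.groupby('label_group')['fold'].nunique().max() == 1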

# ## Transforms
    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="best", fontsize=15)
    ax.grid(True)


# load the data
data = sgl.load_watch()
X = data['X']
y = data['y']
g = data['subject']

# use subject id to group folds
splitter = GroupKFold(n_splits=3)
cv = splitter.split(X, y, groups=g)

# create a feature representation pipeline
pipe = sgl.Pype([('seg', sgl.SegmentX()), ('features', sgl.FeatureRep()),
                 ('scaler', StandardScaler()),
                 ('rf', RandomForestClassifier())])

# create a parameter dictionary using the sklearn API
# note that a parameter set to a single value still needs to be given as a list

par_grid = {
    'seg__width': [50, 100, 200],
    'seg__overlap': [0., 0.5],
    'rf__n_estimators': [20]
}
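
# The snippet ends before the search itself; a likely next step, added
# as a sketch (GridSearchCV is an assumption, not shown in the original):
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(pipe, par_grid, cv=cv)
clf.fit(X, y)
print(clf.best_params_, clf.best_score_)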
Esempio n. 34
0
logger.info('{} Features after dropping null columns'.format(
    len(X_type.columns)))
# Start training for type
bond_start = timer()
fold_count = 1
# Train the model
# X_type = X.loc[X['type'] == bond_type]
# y_type = y.iloc[X_type.index]
# X_test_type = X_test.loc[X_test['type'] == bond_type]
mol_group_type = mol_group.loc[mol_group['type'] ==
                               bond_type]['molecule_name']
oof = np.zeros(len(X_type))
prediction_type = np.zeros(len(X_test_type))
bond_scores = []
for fold_n, (train_idx, valid_idx) in enumerate(
        folds.split(X_type, groups=mol_group_type)):
    if MODEL_TYPE == 'lgbm':
        fold_start = timer()
        logger.info('Running Type {} - Fold {} of {}'.format(
            bond_type, fold_count, folds.n_splits))
        X_train, X_valid = X_type.iloc[train_idx], X_type.iloc[valid_idx]
        y_train, y_valid = y_type.iloc[train_idx], y_type.iloc[valid_idx]
        model = lgb.LGBMRegressor(**lgb_params,
                                  n_estimators=N_ESTIMATORS,
                                  n_jobs=N_THREADS)
        model.fit(
            X_train.drop('type', axis=1),
            y_train,
            eval_set=[  #(X_train.drop('type', axis=1), y_train),
                (X_valid.drop('type', axis=1), y_valid)
            ],
Esempio n. 35
0
# Step 2: Collect data for running CRF classifier

true_iob_dir = join(LOCAL_DIR, 'train', 'iob')

data = collect_crf_data(true_iob_dir, base_feats_dir, word_feats_dir)

# Step 3: Create folds

# create folds from complete texts only (i.e. instances of the same text
# are never in different folds)
# Note: GroupKFold is deterministic (it does no shuffling), so there is
# no random seed to set here
group_k_fold = GroupKFold(n_splits=5)

# use same split for all three entities
splits = list(
    group_k_fold.split(data['feats'], data['Material'], data['filenames']))

# Step 4: Run CRF classifier
crf = PruneCRF(c1=0.1, c2=0.1, all_possible_transitions=True)
pred = {}

for ent in ENTITIES:
    pred[ent] = cross_val_predict(crf, data['feats'], data[ent], cv=splits)
    # Report scores directly on I and B tags,
    # disregard 'O' because it is by far the most frequent class
    print('\n' + ent + ':\n')
    print(
        flat_classification_report(data[ent],
                                   pred[ent],
                                   digits=3,
                                   labels=('B', 'I')))
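
# Added check (illustrative): confirm the grouping held, i.e. no source
# file contributes instances to both sides of any fold.
import numpy as np
filenames = np.asarray(data['filenames'])
for train_idx, test_idx in splits:
    assert len(np.intersect1d(filenames[train_idx],
                              filenames[test_idx])) == 0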
Esempio n. 36
0
def test_group_kfold():
    rng = np.random.RandomState(0)

    # Parameters of the test
    n_groups = 15
    n_samples = 1000
    n_splits = 5

    X = y = np.ones(n_samples)

    # Construct the test data
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    groups = rng.randint(0, n_groups, n_samples)

    ideal_n_groups_per_fold = n_samples // n_splits

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    lkf = GroupKFold(n_splits=n_splits)
    for i, (_, test) in enumerate(lkf.split(X, y, groups)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(groups))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_groups_per_fold))

    # Check that each group appears only in 1 fold
    for group in np.unique(groups):
        assert_equal(len(np.unique(folds[groups == group])), 1)

    # Check that no group is on both sides of the split
    groups = np.asarray(groups, dtype=object)
    for train, test in lkf.split(X, y, groups):
        assert_equal(len(np.intersect1d(groups[train], groups[test])), 0)

    # Construct the test data
    groups = np.array(['Albert', 'Jean', 'Bertrand', 'Michel', 'Jean',
                       'Francis', 'Robert', 'Michel', 'Rachel', 'Lois',
                       'Michelle', 'Bernard', 'Marion', 'Laura', 'Jean',
                       'Rachel', 'Franck', 'John', 'Gael', 'Anna', 'Alix',
                       'Robert', 'Marion', 'David', 'Tony', 'Abel', 'Becky',
                       'Madmood', 'Cary', 'Mary', 'Alexandre', 'David',
                       'Francis', 'Barack', 'Abdoul', 'Rasha', 'Xi', 'Silvia'])

    n_groups = len(np.unique(groups))
    n_samples = len(groups)
    n_splits = 5
    tolerance = 0.05 * n_samples  # 5 percent error allowed
    ideal_n_groups_per_fold = n_samples // n_splits

    X = y = np.ones(n_samples)

    # Get the test fold indices from the test set indices of each fold
    folds = np.zeros(n_samples)
    for i, (_, test) in enumerate(lkf.split(X, y, groups)):
        folds[test] = i

    # Check that folds have approximately the same size
    assert_equal(len(folds), len(groups))
    for i in np.unique(folds):
        assert_greater_equal(tolerance,
                             abs(sum(folds == i) - ideal_n_groups_per_fold))

    # Check that each group appears only in 1 fold
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        for group in np.unique(groups):
            assert_equal(len(np.unique(folds[groups == group])), 1)

    # Check that no group is on both sides of the split
    groups = np.asarray(groups, dtype=object)
    for train, test in lkf.split(X, y, groups):
        assert_equal(len(np.intersect1d(groups[train], groups[test])), 0)

    # groups can also be a list
    cv_iter = list(lkf.split(X, y, groups.tolist()))
    for (train1, test1), (train2, test2) in zip(lkf.split(X, y, groups),
                                                cv_iter):
        assert_array_equal(train1, train2)
        assert_array_equal(test1, test2)

    # Should fail if there are more folds than groups
    groups = np.array([1, 1, 1, 2, 2])
    X = y = np.ones(len(groups))
    assert_raises_regexp(ValueError, "Cannot have number of splits.*greater",
                         next, GroupKFold(n_splits=3).split(X, y, groups))