Example #1
    if prr >= 0.9:
        return 'good'
    elif prr <= 0.1:
        return 'bad'
    else:
        return 'interm.'


@memory.cache
def load_rutgers():
    return list(get_traces())


features = ['rssi', 'rssi_std', 'rssi_avg']

cv = model_selection.StratifiedKFold(n_splits=10, shuffle=True)

# Note: a resampling step like RandomOverSampler only works inside
# imblearn.pipeline.Pipeline; sklearn's own Pipeline rejects steps that
# lack a transform method.
pipe = pipeline.Pipeline([
    ('scaler', preprocessing.StandardScaler()),
    ('resample', over_sampling.RandomOverSampler()),
    #('clf', tree.DecisionTreeClassifier(max_depth=3)),
    # 'ovr' is a multi_class option, not a solver name
    ('linear', linear_model.LogisticRegression(multi_class='ovr')),
])


@memory.cache
def different_window_sizes(W_PRR, W_HISTORY):
    print(f'*** PRR={W_PRR}, HISTORY={W_HISTORY} ***')

    dataset = load_rutgers()
    print('Rutgers loaded ...')
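
A minimal sketch of how such a cached sweep might be driven; the window-size grids below are illustrative assumptions, not values from the original listing:

# Hypothetical driver; the W_PRR/W_HISTORY grids are assumed for illustration
for w_prr in (10, 50, 100):
    for w_history in (10, 50, 100):
        different_window_sizes(w_prr, w_history)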
Example #2
# file directory
root_dir = 'D:/Deep Learning/Projects/Melanoma skin cancer detection/'

# Fold the data using stratified cross-validation; stratification preserves
# the class proportions in every fold, which works well for skewed datasets
if __name__ == "__main__":
    train_csv = pd.read_csv(root_dir + 'train.csv')
    train_csv['kfold'] = -1
    # DataFrame.sample with frac=1 shuffles all rows (frac is the fraction of
    # rows to return); reset_index(drop=True) builds a fresh index and drops
    # the old one
    train_csv = train_csv.sample(frac=1).reset_index(drop=True)
    # listing target values based on which we will set up the stratified folds
    y = train_csv.target.values
    # initiating the k-fold class from model selection
    kf = model_selection.StratifiedKFold(n_splits=5)

    for count, (train_index, val_index) in enumerate(kf.split(X=train_csv,
                                                              y=y)):
        train_csv.loc[val_index, 'kfold'] = count

    train_csv.to_csv(root_dir + "train_folds.csv", index=False)
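
Once train_folds.csv is written, any fold can be held out for validation; a minimal sketch, assuming fold 0 is the validation fold:

    # Hypothetical usage: train on folds 1-4, validate on fold 0
    folds_df = pd.read_csv(root_dir + 'train_folds.csv')
    df_train = folds_df[folds_df.kfold != 0].reset_index(drop=True)
    df_valid = folds_df[folds_df.kfold == 0].reset_index(drop=True)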

# data loading and converting into tensor
# train data
train_csv = pd.read_csv(root_dir + 'train.csv')
images_id_train = train_csv.image_name.values.tolist()
train_images = [
    os.path.join(root_dir, 'train', i + '.jpg') for i in images_id_train
]
train_targets = train_csv.target.values
Example #3

# split into train/test sets using t_t_s (train_test_split)
# because the datasets were combined to apply uniform one-hot and label
# encoding, the 'shuffle' parameter is set to False
# the test set is known to contain 15060 rows
test_set_size = test_dataset_nomissing.shape[0]
print('\n test_set_size...')
print(test_set_size)
X_train, X_test, Y_train, Y_test = t_t_s(rescaledX, Y, test_size=test_set_size, random_state=seed, shuffle=False)

# instantiate XGBC class using defaults
model = XGBC()

# evaluate the model against the training dataset using stratified kfold
print('\n evaluating xgb model via skfold...')
kfold = m_s.StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)
cv_results = c_v_s(model, X_train, Y_train, cv=kfold)
print("xgb SKFOLD training accuracy standardized: %.2f%% (%.2f%%)" % (cv_results.mean()*100, cv_results.std()*100))

# fit model to training datasets
print('\n training the model...')
model.fit(X_train, Y_train)

# view trained model
print('\n model...')
print(model)

# make predictions for test data
print('\n making predictions...')
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
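
The listing is cut off before the predictions are scored; a sketch of the usual final step, assuming accuracy_score from sklearn.metrics:

from sklearn.metrics import accuracy_score

# score the rounded predictions against the held-out labels
accuracy = accuracy_score(Y_test, predictions)
print('xgb test accuracy: %.2f%%' % (accuracy * 100.0))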
Example #4

    # (the opening of this snippet was lost in extraction; the entry below is
    # reconstructed from the parallel Manhattan entries that follow)
    models.append(("KNN-Euclidean-Weighted",
                   KNeighborsClassifier(n_neighbors=neighbhors,
                                        weights='distance',
                                        p=2)))
    models.append(
        ('KNN-Manhattan', KNeighborsClassifier(n_neighbors=neighbhors, p=1)))
    models.append(("KNN-Manhattan-Weighted",
                   KNeighborsClassifier(n_neighbors=neighbhors,
                                        weights='distance',
                                        p=1)))
    models.append(("Gaussian Bayes", GaussianNB()))
    models.append(("ID3", tree.DecisionTreeClassifier()))
    #evaluate each model in turn
    results = []
    names = []

    for name, model in models:
        kfold = model_selection.StratifiedKFold(n_splits=splits,
                                                shuffle=True,
                                                random_state=seed)
        cv_results = model_selection.cross_val_score(model,
                                                     descriptiveFeats,
                                                     targetFeats,
                                                     cv=kfold,
                                                     scoring=scoring)
        results.append(cv_results)
        names.append(name)
        print("-----------------------{}-----------------------".format(name))
        y_predictions = cross_val_predict(model,
                                          descriptiveFeats,
                                          targetFeats,
                                          cv=kfold)
        print(accuracy_score(targetFeats, y_predictions))
        print(confusion_matrix(targetFeats, y_predictions))
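
Comparison loops like this are often followed by a plot of the per-fold scores; a minimal sketch of such a continuation, assuming matplotlib.pyplot is imported as plt:

    # one box per model, built from the cross-validation scores gathered above
    fig, ax = plt.subplots()
    ax.boxplot(results, labels=names)
    ax.set_title('Algorithm comparison ({})'.format(scoring))
    plt.show()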
Example #5
        [f for f in features if 'context_' in f][0])
    m = train_nn_def.BespokeNN(num_dense_features=num_dense_features,
                               verbose=0,
                               file_prefix=CACHE_DIR + '/holdout_bespoke_nn_')
    param_grid = {
        'model__num_hidden_layers': [0, 1, 2, 4],
        'model__hidden_layer_size': [2, 4, 8, 16, 32],
        'model__dropout': [0, .25, .5],
        'model__learning_rate': [.01, .001, .0001],
        'model__dense_reg_strength': [.1, .01,
                                      .001],  # Bespoke DNN specific parameters
        'model__sparse_reg_strength': [.1, .01, .001],
    }

xval = model_selection.StratifiedKFold(4,
                                       shuffle=True,
                                       random_state=RANDOM_SEED)

pipe = pipeline.Pipeline([
    ('normalize',
     preprocessing.MinMaxScaler()),  # NN classifier (doesn't hurt trees)
    ('model', m),
])
gs = model_selection.GridSearchCV(pipe,
                                  param_grid,
                                  scoring=score_metric,
                                  verbose=2,
                                  cv=xval,
                                  refit=True)

threshold = 1 / 3
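
The snippet stops right after defining threshold; a plausible continuation, assuming X and y hold the prepared features and labels and that the refit model exposes predict_proba:

# Hypothetical continuation: run the search, then apply the custom threshold
gs.fit(X, y)
print('best params:', gs.best_params_, 'best score:', gs.best_score_)
proba = gs.predict_proba(X)[:, 1]
y_pred = (proba >= threshold).astype(int)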
Example #6
    def create(self, X, y):
        return model_selection.StratifiedKFold(n_splits=self.n_splits,
                                               shuffle=self.shuffle,
                                               random_state=self.random_state)
Example #7

def crossValidationLogReg():
    # Create crossvalidation partition for evaluation
    K_o_splits = 10
    outer_it = 0
    K_i_splits = 10
    model_count = 10

    summed_eval_i = np.zeros((model_count))
    eval_i = np.zeros((model_count))
    eval_o = np.zeros((model_count))
    optimal_lambda = np.zeros((K_o_splits))

    #CV1 = model_selection.KFold(n_splits=K_o_splits,shuffle=True)
    #StratifiedKFold ensures that each split contains a reasonable percentage of each class.
    CV1 = model_selection.StratifiedKFold(n_splits=K_o_splits, shuffle=True)
    CV2 = model_selection.StratifiedKFold(n_splits=K_i_splits, shuffle=True)

    #Outer k-fold split
    for train_index_o, test_index_o in CV1.split(X, y):
        print('Outer CV1-fold {0} of {1}'.format(outer_it + 1, K_o_splits))

        X_train_o = X[train_index_o, :]
        y_train_o = y[train_index_o]
        X_test_o = X[test_index_o, :]
        y_test_o = y[test_index_o]

        #Inner validation loop
        inner_it = 0

        for train_index_i, test_index_i in CV2.split(X_train_o, y_train_o):
            print('Inner CV2-fold {0} of {1}'.format(inner_it + 1, K_i_splits))
            # CV2 indices are relative to the outer training set, not to X
            X_train_i = X_train_o[train_index_i, :]
            y_train_i = y_train_o[train_index_i]
            X_test_i = X_train_o[test_index_i, :]
            y_test_i = y_train_o[test_index_i]

            #C specifies the inverse of regularization strength. Small C means high regularization
            lowest_err = 100
            optimal_reg = 999
            for idx in range(model_count):
                reg_term = (0.01 + idx * 0.1)
                model = lm.LogisticRegression(C=reg_term, penalty='l2')
                model = model.fit(X_train_i, y_train_i)
                y_logreg = model.predict(X_test_i)
                current_err = 100 * (
                    y_logreg != y_test_i).sum().astype(float) / len(y_test_i)

                summed_eval_i[idx] += current_err

            inner_it += 1

        eval_i = summed_eval_i * (len(X_test_i) / len(X_train_o))
        idx = np.argmin(eval_i)
        reg_term = (0.01 + idx * 0.1)
        model = lm.LogisticRegression(C=reg_term, penalty='l2')
        model = model.fit(X_train_o, y_train_o)
        y_logreg = model.predict(X_test_o)
        current_err = 100 * (y_logreg !=
                             y_test_o).sum().astype(float) / len(y_test_o)

        eval_o[outer_it] = current_err
        optimal_lambda[outer_it] = reg_term

        outer_it += 1

    # pick the most frequently selected regularization term across outer folds
    reg_vals, reg_counts = np.unique(optimal_lambda, return_counts=True)
    mode_reg = reg_vals[np.argmax(reg_counts)]

    figure()
    boxplot(eval_o)
    xlabel('Logistic Regression')
    ylabel('Cross-validation error [%]')
    show()
    e_gen = np.sum(eval_o) * (len(X_test_o) / len(X))
    print("Logistic regression generalization error: %f with %s and %f" %
          ((e_gen), 'l2-norm', mode_reg[0]))
Example #8
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

print('{0:-^70}'.format('Shuffle Split'))
# Shuffle the data, then split it at random; similar to the first method above
ss = sm.ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
print('Shuffle Split class: ', ss)
print('splits of ss: ', ss.get_n_splits(X))  # equals the n_splits argument of the constructor
for train_indices, test_indices in ss.split(X):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# The K-Fold, LOO, LPO and Shuffle splitters above all assume the samples are
# independent and identically distributed (i.i.d.)
# Only under the i.i.d. assumption do these sampling methods preserve the
# statistical properties of the data without distortion
# The methods below use stratified or grouped sampling for unevenly
# distributed data
print('{0:-^70}'.format('Stratified K-Fold'))
y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
skf = sm.StratifiedKFold(n_splits=4)
print('Stratified K-Fold class: ', skf)
print('splits of skf: ', skf.get_n_splits(X, y))  # one extra argument: stratification is based on y
for train_indices, test_indices in skf.split(X, y):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)

# Group K-Fold takes, besides X and y, an extra argument giving the group each sample belongs to
# the split guarantees that the groups in the test set never overlap with those in the training set
print('{0:-^70}'.format('Group K-Fold'))
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]
gkf = sm.GroupKFold(n_splits=3)  # the number of distinct groups must be at least n_splits
print('X: \n', X)
print('y: ', y)
print('groups: ', groups)
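
The listing is cut off before the split itself; the loop below completes the demonstration in the same style as the splitters above:

for train_indices, test_indices in gkf.split(X, y, groups=groups):
    print('Train Indices: ', train_indices, 'Test Indices: ', test_indices)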
Example #9
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='scale')))
models.append(('AdaB', AdaBoostClassifier()))
models.append(('RF', RandomForestClassifier(n_estimators=1000, n_jobs=-1)))
models.append(('GBM', GradientBoostingClassifier()))

outFile = open("output.txt", "w")

#############################
# model evaluation
results = []
names = []
scoring = 'f1'
for name, model in models:
    kfold = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)  # random_state requires shuffle=True
    cv_results = model_selection.cross_val_score(model,
                                                 X,
                                                 Y,
                                                 cv=kfold,
                                                 scoring=scoring,
                                                 error_score=np.nan,
                                                 n_jobs=-1)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f) " % (name, cv_results.mean(), cv_results.std())
    outFile.write(msg)
    print(msg)
    outFile.write("- Time: %s in seconds" % (time.time() - start_time))
    print("- Time: %s in seconds" % (time.time() - start_time))
    outFile.write("\n")
Example #10
# Save TF-IDF vectors so I'm not recalculating them every time...
#print('Save the TF-IDFed data.\n')
#pickle.dump([x_train, y_train], open(os.path.join(data_path, 'meanembedding-train.pkl'), 'wb'))
#pickle.dump([x_test, y_test], open(os.path.join(data_path, 'meanembedding-test.pkl'), 'wb'))

# Load TF-IDFed train/test data with labels attached
#print('Load TF-IDFed Train/Test data.')
#[x_train, y_train] = pickle.load(open(os.path.join(data_path,'tfidfed-train.pkl'),'rb'))
#[x_test, y_test] = pickle.load(open(os.path.join(data_path,'tfidfed-test.pkl'),'rb'))

# Print data info
#print('Number of Instances: {}'.format(data.shape[0]))
print('\tTraining instances: {}'.format(x_train.shape))
print('\tTesting instances: {}\n'.format(x_test.shape))

strat_kfold = ms.StratifiedKFold(n_splits=5, shuffle=True)
print('Cross-validation: {} folds\n'.format(strat_kfold.get_n_splits()))


for estimator in classifiers:

    print('{} fitting - '.format(estimator.__class__.__name__), end='')
    # Fit model
    time_start = time.time()
    estimator.fit(x_train, y_train)
    time_stop = time.time()
    elapsed = time_stop - time_start
    print('{} minutes {} seconds'.format(elapsed // 60, elapsed % 60))

    # Predict on the training dataset
    print('{} predict training - '.format(estimator.__class__.__name__), end='')
Example #11
    def split_data(self):
        self.train_data_for_target_df = {}
        self.train_data_for_target_df['no_gradient'] = self.train_data_df.index
        self.target_data_for_target_df = {}
        self.target_data_for_target_df[
            'no_gradient'] = self.target_data_df.index

        self.target_data_index_df = self.target_data_df
        self.train_data_index_df = self.train_data_df

        if self.independent_testset == 1:
            self.test_data_index_df = self.test_data_df
        else:
            self.test_data_index_df = self.train_data_df

        if self.validation_from_testset == 1:
            self.validation_data_index_df = self.test_data_df
        else:
            self.validation_data_index_df = self.train_data_df

        if (self.independent_testset == 1) & (self.validation_from_testset
                                              == 1):
            self.validation_data_for_target_df = {}
            self.validation_data_for_target_df[
                'no_gradient'] = self.test_data_df.index
        else:
            self.validation_data_for_target_df = None

        #split test, validation, training dataset
        #case1: all test, validation, training from one original training set
        if self.independent_testset == 0:
            #Split the original training set into test_split_folds folds.  (training - test)
            #output two list train_splits_df and test_splits_df
            self.train_splits_df = [{} for i in range(self.test_split_folds)]
            self.test_splits_df = [{} for i in range(self.test_split_folds)]
            self.train_cv_splits_df = [[{} for i in range(self.cv_split_folds)]
                                       for j in range(self.test_split_folds)]
            self.validation_cv_splits_df = [[
                {} for i in range(self.cv_split_folds)
            ] for j in range(self.test_split_folds)]

            if self.test_split_method == 0:
                if self.test_split_folds == 1:
                    kf_list = []
                    if self.test_split_ratio == 0:
                        kf_list.append(
                            (range(len(self.train_data_df.index)), None))
                    else:
                        kf_folds = int(1 / self.test_split_ratio)
                        kf = ms.KFold(n_splits=kf_folds, shuffle=True)
                        kf_list.append(list(kf.split(self.train_data_df))[0])
                else:
                    kf = ms.KFold(n_splits=self.test_split_folds, shuffle=True)
                    kf_list = list(kf.split(self.train_data_df))
            # stratified split (keep prior)
            if self.test_split_method == 1:
                if self.test_split_folds == 1:
                    kf_list = []
                    if self.test_split_ratio == 0:
                        kf_list.append((self.train_data_df.index, None))
                    else:
                        kf_folds = int(1 / self.test_split_ratio)
                        kf = ms.StratifiedKFold(n_splits=kf_folds,
                                                shuffle=True)
                        kf_list.append(
                            list(
                                kf.split(
                                    self.train_data_df, self.train_data_df[
                                        self.dependent_variable]))[0])
                else:
                    kf = ms.StratifiedKFold(n_splits=self.test_split_folds,
                                            shuffle=True)
                    kf_list = list(
                        kf.split(self.train_data_df,
                                 self.train_data_df[self.dependent_variable]))
            # customized split
            if self.test_split_method == 2:
                kf_list = self.test_split(self.train_data_df)

            test_split_fold_id = 0
            for train_index_split, test_index_split in kf_list:
                train_index = self.train_data_df.index[train_index_split]
                self.train_splits_df[test_split_fold_id][
                    'no_gradient'] = train_index
                if test_index_split is None:
                    test_index = None
                else:
                    test_index = self.train_data_df.index[test_index_split]
                self.test_splits_df[test_split_fold_id][
                    'no_gradient'] = test_index
                test_split_fold_id += 1

            #Split each training set into cv_split_folds folds  (training - validation)
            for i in range(self.test_split_folds):
                cur_train_data_df = self.train_data_df.loc[
                    self.train_splits_df[i]['no_gradient'], :]
                if self.cv_split_method == 0:
                    if self.cv_split_folds == 1:
                        kf_folds = int(1 / self.cv_split_ratio)
                        kf = ms.KFold(n_splits=kf_folds, shuffle=True)
                        kf_list = []
                        kf_list.append(list(kf.split(cur_train_data_df))[0])
                    else:
                        kf = ms.KFold(n_splits=self.cv_split_folds,
                                      shuffle=True)
                        kf_list = list(kf.split(cur_train_data_df))
                # stratified split (keep prior)
                if self.cv_split_method == 1:
                    if self.cv_split_folds == 1:
                        kf_folds = int(1 / self.cv_split_ratio)
                        kf = ms.StratifiedKFold(n_splits=kf_folds,
                                                shuffle=True)
                        kf_list = []
                        kf_list.append(
                            list(
                                kf.split(
                                    cur_train_data_df, cur_train_data_df[
                                        self.dependent_variable]))[0])
                    else:
                        kf = ms.StratifiedKFold(n_splits=self.cv_split_folds,
                                                shuffle=True)
                        kf_list = list(
                            kf.split(
                                cur_train_data_df,
                                cur_train_data_df[self.dependent_variable]))
                # customized split
                if self.cv_split_method == 2:
                    kf_list = self.cv_split(cur_train_data_df)

                cv_split_fold_id = 0
                for train_index_split, validation_index_split in kf_list:
                    train_index = cur_train_data_df.index[train_index_split]
                    validation_index = cur_train_data_df.index[
                        validation_index_split]
                    self.train_cv_splits_df[i][cv_split_fold_id][
                        'no_gradient'] = train_index
                    self.validation_cv_splits_df[i][cv_split_fold_id][
                        'no_gradient'] = validation_index
                    #                     self.train_cv_splits_df[i][cv_split_fold_id]['no_gradient'] = self.train_splits_df[i].loc[train_index, :]
                    #                     self.validation_cv_splits_df[i][cv_split_fold_id]['no_gradient'] = self.train_splits_df[i].loc[validation_index, :]
                    cv_split_fold_id += 1

        #case2: training from one set, validation and test from another set (independent_testset and validation_from_testset parameters)
        if self.independent_testset == 1:
            #             if self.validation_from_testset:
            #validation from testset will force the cv_split_folds = 1
            #self.cv_split_folds = 1
            #             else:
            #validation not from testset will force the test_split_folds = 1
            #                 self.test_split_folds  = 1

            self.train_splits_df = [{} for i in range(self.test_split_folds)]
            self.test_splits_df = [{} for i in range(self.test_split_folds)]
            self.train_cv_splits_df = [[{} for i in range(self.cv_split_folds)]
                                       for j in range(self.test_split_folds)]
            self.validation_cv_splits_df = [[
                {} for i in range(self.cv_split_folds)
            ] for j in range(self.test_split_folds)]

            #split testset to test_split_folds folds (validation - test)
            if self.validation_from_testset:
                if self.validation_equal_testset:
                    self.test_split_folds = 1  #special case that validation set is the same as test set
                    kf_list = [(range(self.test_data_df.shape[0]),
                                range(self.test_data_df.shape[0]))]
                else:
                    if self.test_split_method == 0:
                        if self.test_split_folds == 1:
                            kf_folds = int(1 / self.test_split_ratio)
                            kf = ms.KFold(n_splits=kf_folds, shuffle=True)
                            kf_list = []
                            kf_list.append(
                                list(kf.split(self.test_data_df))[0])
                        else:
                            kf = ms.KFold(n_splits=self.test_split_folds,
                                          shuffle=True)
                            kf_list = list(kf.split(self.test_data_df))
                    # stratified split (keep prior)
                    if self.test_split_method == 1:
                        if self.test_split_folds == 1:
                            kf_folds = int(1 / self.test_split_ratio)
                            kf = ms.StratifiedKFold(n_splits=kf_folds,
                                                    shuffle=True)
                            kf_list = []
                            kf_list.append(
                                list(
                                    kf.split(
                                        self.test_data_df, self.test_data_df[
                                            self.dependent_variable]))[0])
                        else:
                            kf = ms.StratifiedKFold(
                                n_splits=self.test_split_folds, shuffle=True)
                            kf_list = list(
                                kf.split(
                                    self.test_data_df, self.test_data_df[
                                        self.dependent_variable]))
                    # customized split
                    if self.test_split_method == 2:
                        kf_list = self.test_split(self.test_data_df.copy())

                test_split_fold_id = 0
                for validation_index_split, test_index_split in kf_list:
                    validation_index = self.test_data_df.index[
                        validation_index_split]
                    test_index = self.test_data_df.index[test_index_split]
                    self.train_splits_df[test_split_fold_id][
                        'no_gradient'] = self.train_data_df.index
                    self.test_splits_df[test_split_fold_id][
                        'no_gradient'] = test_index

                    cv_validation_index = np.array_split(
                        validation_index, self.cv_split_folds)

                    for j in range(self.cv_split_folds):

                        self.train_cv_splits_df[test_split_fold_id][j][
                            'no_gradient'] = self.train_data_df.index
                        self.validation_cv_splits_df[test_split_fold_id][j][
                            'no_gradient'] = cv_validation_index[j]

    #                     self.train_splits_df[test_split_fold_id]['no_gradient'] = self.train_data_df
    #                     self.test_splits_df[test_split_fold_id]['no_gradient'] = self.test_data_df.loc[test_index, :]
    #                     self.train_cv_splits_df[test_split_fold_id][0]['no_gradient'] = self.train_data_df
    #                     self.validation_cv_splits_df[test_split_fold_id][0]['no_gradient'] = self.test_data_df.loc[validation_index, :]
                    test_split_fold_id += 1
                print('done')
            else:
                self.train_splits_df[0][
                    'no_gradient'] = self.train_data_df.index
                self.test_splits_df[0]['no_gradient'] = self.test_data_df.index

                #Split each training set into cv_split_folds folds  (training - validation)
                for i in range(self.test_split_folds):
                    cur_train_data_df = self.train_data_df.loc[
                        self.train_splits_df[i]['no_gradient'], :]
                    if self.cv_split_method == 0:
                        kf = ms.KFold(n_splits=self.cv_split_folds,
                                      shuffle=True)
                        kf_list = list(kf.split(cur_train_data_df))
                    # stratified split (keep prior)
                    if self.cv_split_method == 1:
                        kf = ms.StratifiedKFold(n_splits=self.cv_split_folds,
                                                shuffle=True)
                        kf_list = list(kf.split(cur_train_data_df))
                    # customized split
                    if self.cv_split_method == 2:
                        kf_list = self.cv_split(self.name, self.cv_split_folds,
                                                cur_train_data_df)

                    cv_split_fold_id = 0
                    for train_index_split, validation_index_split in kf_list:
                        train_index = cur_train_data_df.index[
                            train_index_split]
                        validation_index = cur_train_data_df.index[
                            validation_index_split]
                        self.train_cv_splits_df[i][cv_split_fold_id][
                            'no_gradient'] = train_index
                        self.validation_cv_splits_df[i][cv_split_fold_id][
                            'no_gradient'] = validation_index
                        #                         self.train_cv_splits_df[i][cv_split_fold_id]['no_gradient'] = self.train_splits_df[i].loc[train_index, :]
                        #                         self.validation_cv_splits_df[i][cv_split_fold_id]['no_gradient'] = self.train_splits_df[i].loc[validation_index, :]
                        cv_split_fold_id += 1
Example #12

def crossValidationDT():
    # Create crossvalidation partition for evaluation
    K_o_splits = 10
    outer_it = 0
    K_i_splits = 10
    model_count = 10

    summed_eval_i = np.zeros((model_count))
    eval_i = np.zeros((model_count))
    eval_o = np.zeros((model_count))
    optimal_lambda = np.zeros((K_o_splits))

    #CV1 = model_selection.KFold(n_splits=K_o_splits,shuffle=True)
    #StratifiedKFold ensures that each split contains a reasonable percentage of each class.
    CV1 = model_selection.StratifiedKFold(n_splits=K_o_splits, shuffle=True)
    CV2 = model_selection.StratifiedKFold(n_splits=K_i_splits, shuffle=True)

    #Outer k-fold split
    for train_index_o, test_index_o in CV1.split(X, y):
        print('Outer CV1-fold {0} of {1}'.format(outer_it + 1, K_o_splits))

        X_train_o = X[train_index_o, :]
        y_train_o = y[train_index_o]
        X_test_o = X[test_index_o, :]
        y_test_o = y[test_index_o]

        #Inner validation loop
        inner_it = 0

        for train_index_i, test_index_i in CV2.split(X_train_o, y_train_o):
            print('Inner CV2-fold {0} of {1}'.format(inner_it + 1, K_i_splits))
            # CV2 indices are relative to the outer training set, not to X
            X_train_i = X_train_o[train_index_i, :]
            y_train_i = y_train_o[train_index_i]
            X_test_i = X_train_o[test_index_i, :]
            y_test_i = y_train_o[test_index_i]

            #max_depth acts as the regularization term here: a smaller depth means stronger regularization
            for idx in range(model_count):
                reg_term = (1 + idx)

                model2 = tree.DecisionTreeClassifier(
                    max_depth=reg_term, criterion="entropy")  ###NEED REGU
                model2 = model2.fit(X_train_i, y_train_i)
                y_dectree = model2.predict(X_test_i)
                current_err = 100 * (
                    y_dectree != y_test_i).sum().astype(float) / len(y_test_i)

                summed_eval_i[idx] += current_err

            inner_it += 1

        eval_i = summed_eval_i * (len(X_test_i) / len(X_train_o))
        idx = np.argmin(eval_i)
        reg_term = (1 + idx)  # must match the depths tried in the inner loop

        model2 = tree.DecisionTreeClassifier(
            max_depth=reg_term, criterion="entropy")  ###NEED REGU "gini"
        model2 = model2.fit(X_train_o, y_train_o)
        y_dectree = model2.predict(X_test_o)
        current_err = 100 * (y_dectree !=
                             y_test_o).sum().astype(float) / len(y_test_o)

        eval_o[outer_it] = current_err
        optimal_lambda[outer_it] = reg_term

        outer_it += 1

    # pick the most frequently selected depth across outer folds
    reg_vals, reg_counts = np.unique(optimal_lambda, return_counts=True)
    mode_reg = reg_vals[np.argmax(reg_counts)]

    figure()
    boxplot(eval_o)
    xlabel('Decision Tree')
    ylabel('Cross-validation error [%]')
    show()
    e_gen = np.sum(eval_o) * (len(X_test_o) / len(X))
    print("Decision Tree generalization error: %f with %s and %i" %
          (e_gen, 'max depth', mode_reg[0]))
Example #13
def svc_fit(train, proj_mask, epochs, folds=5, batch_size=32):
    """ Fit SVM using SVC on data set. 

    Args:
        train (tuple of list): (X, y) train data.
        proj_mask (Namedtuple): Radar projections to use for training.
        epochs (int): Number of times to augment data.
        folds (int, optional): Number of folds for the Stratified K-Folds
            cross-validator. Default=5
        batch_size (int, optional): Augment batch size. Default=32.

    Returns:
        estimator: Estimator that was chosen by grid search.
    """
    def find_best_svm_estimator(X, y, cv, random_seed):
        """Exhaustive search over specified parameter values for svm.

        Returns:
            optimized svm estimator.

        Note:
            https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
        """
        print('\n Finding best svm estimator...')
        Cs = [0.01, 0.1, 1, 10, 100]
        gammas = [0.001, 0.01, 0.1, 1, 10]
        param_grid = [{
            'C': Cs,
            'kernel': ['linear']
        }, {
            'C': Cs,
            'gamma': gammas,
            'kernel': ['rbf']
        }]
        init_est = svm.SVC(probability=True,
                           class_weight='balanced',
                           random_state=random_seed,
                           cache_size=1000,
                           verbose=False)
        grid_search = model_selection.GridSearchCV(estimator=init_est,
                                                   param_grid=param_grid,
                                                   verbose=2,
                                                   n_jobs=4,
                                                   cv=cv)
        grid_search.fit(X, y)
        #print('\n All results:')
        #print(grid_search.cv_results_)
        logger.info('\n Best estimator:')
        logger.info(grid_search.best_estimator_)
        logger.info('\n Best score for {}-fold search:'.format(folds))
        logger.info(grid_search.best_score_)
        logger.info('\n Best hyperparameters:')
        logger.info(grid_search.best_params_)
        return grid_search.best_estimator_

    X_train, y_train = train

    # Augment training set.
    if epochs:
        data_gen = DataGenerator(rotation_range=15.0,
                                 zoom_range=0.3,
                                 noise_sd=0.2)
        logger.info('Augmenting data set.')
        logger.info(f'Original number of training samples: {y_train.shape[0]}')

        # Faster to use a list in below ops.
        y_train = y_train.tolist()

        # Do not mutate original lists.
        xc = X_train.copy()
        yc = y_train.copy()

        for e in range(epochs):
            logger.debug(f'epoch: {e}')
            batch = 0
            for X_batch, y_batch in data_gen.flow(xc,
                                                  yc,
                                                  batch_size=batch_size):
                logger.debug(f'batch: {batch}')
                X_train.extend(X_batch)
                y_train.extend(y_batch)
                batch += 1
                if batch >= len(xc) / batch_size:
                    break

        # Sanity check if augmentation introduced a scaling problem.
        # avoid shadowing the built-in max()
        max_val = np.amax([[np.concatenate(t, axis=None)] for t in X_train])
        assert abs(max_val - 1.0) < 1e-6, 'scale error'

        # Convert y_train back to np array.
        y_train = np.array(y_train, dtype=np.int8)

        logger.info(
            f'Augmented number of training samples: {y_train.shape[0]}')

    logger.info('Generating feature vectors from radar projections.')
    X_train = common.process_samples(X_train, proj_mask=proj_mask)
    logger.info(f'Feature vector length: {X_train.shape[1]}')

    # Balance classes.
    logger.info('Balancing classes.')
    y_train, X_train = balance_classes(y_train, X_train)

    skf = model_selection.StratifiedKFold(n_splits=folds)

    # Find best classifier.
    logger.info('Finding best classifier.')
    clf = find_best_svm_estimator(X_train, y_train,
                                  skf.split(X_train, y_train), RANDOM_SEED)

    return clf
Example #14
def sgd_fit(train,
            test,
            proj_mask,
            online_learn,
            svm_model,
            epochs,
            folds=5,
            batch_size=32):
    """ Fit SVM using SGD on data set. 

    Args:
        train (tuple of list): (X, y) train data.
        test (tuple of list): (X, y) test data.
        proj_mask (Namedtuple): Radar projections to use for training.
        online_learn (bool): If True perform online learning with data.
        svm_model (str): Name of existing svm model for online learning.
        epochs (int): Number of times to augment data.
        folds (int, optional): Number of folds for the Stratified K-Folds
            cross-validator. Default=5
        batch_size (int, optional): Augment batch size. Default=32.

    Returns:
        estimator: Estimator that was chosen by grid search.
    """
    def find_best_sgd_svm_estimator(X, y, cv, random_seed):
        """Exhaustive search over specified parameter values for svm using sgd.

        Returns:
            optimized svm estimator.
        """
        max_iter = int(max(np.ceil(10**6 / len(X)), 1000))
        small_alphas = [10.0e-08, 10.0e-09, 10.0e-10]
        alphas = [10.0e-04, 10.0e-05, 10.0e-06, 10.0e-07]
        l1_ratios = [0.075, 0.15, 0.30]
        param_grid = [{
            'alpha': alphas,
            'penalty': ['l1', 'l2'],
            'average': [False]
        }, {
            'alpha': alphas,
            'penalty': ['elasticnet'],
            'average': [False],
            'l1_ratio': l1_ratios
        }, {
            'alpha': small_alphas,
            'penalty': ['l1', 'l2'],
            'average': [True]
        }, {
            'alpha': small_alphas,
            'penalty': ['elasticnet'],
            'average': [True],
            'l1_ratio': l1_ratios
        }]
        init_est = linear_model.SGDClassifier(loss='log',
                                              max_iter=max_iter,
                                              random_state=random_seed,
                                              n_jobs=-1,
                                              warm_start=True)
        grid_search = model_selection.GridSearchCV(estimator=init_est,
                                                   param_grid=param_grid,
                                                   verbose=2,
                                                   n_jobs=-1,
                                                   cv=cv)
        grid_search.fit(X, y)
        #print('\n All results:')
        #print(grid_search.cv_results_)
        logger.info('\n Best estimator:')
        logger.info(grid_search.best_estimator_)
        logger.info('\n Best score for {}-fold search:'.format(folds))
        logger.info(grid_search.best_score_)
        logger.info('\n Best hyperparameters:')
        logger.info(grid_search.best_params_)
        return grid_search.best_estimator_

    X_train, y_train = train
    X_test, y_test = test

    # Make a copy of train set for later use in augmentation.
    if epochs:
        xc = X_train.copy()
        yc = y_train.copy()

    # Generate feature vectors from radar projections.
    logger.info('Generating feature vectors.')
    X_train = common.process_samples(X_train, proj_mask=proj_mask)
    X_test = common.process_samples(X_test, proj_mask=proj_mask)
    logger.info(f'Feature vector length: {X_train.shape[1]}')

    # Balance classes.
    logger.info('Balancing classes.')
    y_train, X_train = balance_classes(y_train, X_train)

    if not online_learn:
        # Find best initial classifier.
        logger.info('Running best fit with new data.')
        skf = model_selection.StratifiedKFold(n_splits=folds)
        clf = find_best_sgd_svm_estimator(X_train, y_train,
                                          skf.split(X_train, y_train),
                                          RANDOM_SEED)
    else:
        # Fit existing classifier with new data.
        logger.info('Running partial fit with new data.')
        with open(os.path.join(common.PRJ_DIR, svm_model), 'rb') as fp:
            clf = pickle.load(fp)
        max_iter = int(max(np.ceil(10**6 / len(X_train)), 1000))  # range() needs an int
        for _ in range(max_iter):
            clf.partial_fit(X_train, y_train)

    # Augment training set and use to run partial fits on classifier.
    if epochs:
        logger.info(
            f'Running partial fit with augmented data (epochs: {epochs}).')
        y_predicted = clf.predict(X_test)
        logger.debug(
            f'Un-augmented accuracy: {metrics.accuracy_score(y_test, y_predicted)}.'
        )
        data_gen = DataGenerator(rotation_range=5.0,
                                 zoom_range=0.2,
                                 noise_sd=0.1,
                                 balance=True)
        for e in range(epochs):
            logger.debug(f'Augment epoch: {e}.')
            batch = 0
            for X_batch, y_batch in data_gen.flow(xc,
                                                  yc,
                                                  batch_size=batch_size):
                logger.debug(f'Augment batch: {batch}.')
                X_batch = common.process_samples(X_batch, proj_mask=proj_mask)
                y_batch, X_batch = balance_classes(y_batch, X_batch)
                clf.partial_fit(X_batch, y_batch, classes=np.unique(y_train))
                y_predicted = clf.predict(X_test)
                acc = metrics.accuracy_score(y_test, y_predicted)
                logger.debug(f'Augmented accuracy: {acc}.')
                batch += 1
                if batch >= len(xc) / batch_size:
                    break

    return clf
Example #15
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Titanic evaluation definition.

This is one of the main _formal_ forml components (along with `source` and `evaluation`) that's being looked up by
the forml loader.
"""

from sklearn import model_selection, metrics

from forml.project import component
from forml.lib.flow.operator.folding import evaluation

# Typical method of providing a component implementation using `component.setup()`, choosing the `MergingScorer`
# operator to implement classical cross-validated metric scoring
component.setup(
    evaluation.MergingScorer(
        crossvalidator=model_selection.StratifiedKFold(n_splits=2,
                                                       shuffle=True,
                                                       random_state=42),
        metric=metrics.log_loss,
    ))
Example #16
def PCBA_CV(DB, data_type, list_ID, list_y, list_SMILES, dict_id2smile, n_folds):

    if data_type == 'kernel':
        if not os.path.isfile('data/' + DB + '/' + DB + '_K.npy'):
            K = mol_build_K(list_SMILES)
            np.save('data/' + DB + '/' + DB + '_K', K)
        else:
            K = np.load('data/' + DB + '/' + DB + '_K.npy')

        if DB == 'PCBA':
            # if Kmedoid
            # list_assignment, medoids = Kmedoid_cluster(K, n_folds)

            # if agglomerative clustering
            list_assignment = Khierarchical_cluster(K, n_folds)
        else:
            list_assignment = np.zeros(K.shape[0])
            for y in [0, 1]:
                indices = np.where(list_y == y)[0]
                K_local = K[indices, :]
                K_local = K_local[:, indices]
                local_assignment = Khierarchical_cluster(K_local, n_folds)
                list_assignment[indices] = local_assignment

    elif data_type == 'features':
        if not os.path.isfile('data/' + DB + '/' + DB + '_X.npy'):
            X = mol_build_X(list_SMILES)
            np.save('data/' + DB + '/' + DB + '_X', X)
        else:
            X = np.load('data/' + DB + '/' + DB + '_X.npy')

        if DB == 'PCBA':
            list_assignment = Xkmeans_cluster(X, n_folds)
        else:
            list_assignment = np.zeros(X.shape[0])
            for y in [0, 1]:
                indices = np.where(list_y == y)[0]
                X_local = X[indices, :]
                local_assignment = Xkmeans_cluster(X_local, n_folds)
                list_assignment[indices] = local_assignment

    elif data_type == 'standard':
        # if not os.path.isfile('data/' + DB + '/' + DB + '_X.npy'):
        #     X = mol_build_X(list_SMILES)
        #     np.save('data/' + DB + '/' + DB + '_X', X)
        # else:
        #     X = np.load('data/' + DB + '/' + DB + '_X.npy')
        list_ID = pickle.load(open('data/' + DB + '/' + DB + '_list_ID.data', 'rb'))
        list_y = np.array(pickle.load(open('data/' + DB + '/' + DB + '_list_y.data', 'rb')))
        X = np.zeros((len(list_ID), 1))
        list_assignment = np.zeros(X.shape[0])
        if DB not in ['PCBA', 'PCBA10', 'PCBA100']:
            skf = model_selection.StratifiedKFold(n_folds, shuffle=True, random_state=92)
            skf.get_n_splits(X, list_y)
            ifold = 0
            for train_index, test_index in skf.split(X, list_y):
                list_assignment[test_index] = ifold
                ifold += 1
        else:
            skf = model_selection.KFold(n_folds, shuffle=True, random_state=92)
            skf.get_n_splits(X)
            ifold = 0
            for train_index, test_index in skf.split(X):
                list_assignment[test_index] = ifold
                ifold += 1

    # import pdb; pdb.Pdb().set_trace()
    c = collections.Counter(list_assignment)
    print(c)
    folds = [np.where(list_assignment == cle)[0] for cle in list(c.keys())]

    fo = open('data/' + DB + '/' + DB + '_folds.txt', 'w')
    for ifold in range(n_folds):
        fo.write("ifold" + str(ifold) + '\n')
        if DB in ['PCBA', 'PCBA10', 'PCBA100']:
            for iclass in range(list_y.shape[1]):
                fo.write("iclass " + str(iclass) + ' ' +
                         str(collections.Counter(list_y[folds[ifold], iclass])) + '\n')
                print("iclass " + str(iclass) + ' ' +
                      str(collections.Counter(list_y[folds[ifold], iclass])))
        else:
            fo.write(str(collections.Counter(list_y[folds[ifold]])) + '\n')
            print(ifold, collections.Counter(list_y[folds[ifold]]))
        fo.write('\n')

    fo.close()

    return folds
Example #17
kNN_reg = kNN.KNeighborsClassifier(n_neighbors=k)
kNN_reg.fit(X, Y)
kNN_test = kNN.KNeighborsClassifier(n_neighbors=k)

# Linear Discriminant Analysis
lda_reg = disc.LinearDiscriminantAnalysis()
lda_reg.fit(X, Y)
lda_test = disc.LinearDiscriminantAnalysis()

# Quadratic Discriminant Analysis
qda_reg = disc.QuadraticDiscriminantAnalysis()
qda_reg.fit(X, Y)
qda_test = disc.QuadraticDiscriminantAnalysis()

# Cross-validation
cv = mod.StratifiedKFold(n_splits=5, shuffle=True)
log_result = mod.cross_validate(log_test, X, Y, cv=cv)
kNN_result = mod.cross_validate(kNN_test, X, Y, cv=cv)
lda_result = mod.cross_validate(lda_test, X, Y, cv=cv)
qda_result = mod.cross_validate(qda_test, X, Y, cv=cv)

print(log_result['test_score'].mean())
print(kNN_result['test_score'].mean())
print(lda_result['test_score'].mean())
print(qda_result['test_score'].mean())

sns.set()
fig, sub = plt.subplots(2,2)
plt.subplots_adjust(wspace=0.6, hspace=0.6)
xx, yy = make_meshgrid(X[:,0], X[:,1])
titles = ['Logistic', "kNN (k={})".format(k), 'LDA', 'QDA']
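
The snippet ends before the subplots are drawn; a sketch of the usual decision-boundary loop, assuming numpy is imported as np, the class labels in Y are numeric, and log_reg is the fitted logistic model from the truncated part of the listing:

# Hypothetical plotting loop over the four fitted classifiers
for clf, title, ax in zip((log_reg, kNN_reg, lda_reg, qda_reg), titles, sub.flatten()):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.4)  # shaded decision regions
    ax.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', s=20)
    ax.set_title(title)
plt.show()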
Example #18
def main(argv):

    topics = ["uni", "movie", "title"]
    langs = ["de", "es", "fr"]

    result_path = "/home/oyku/datasets/newexperiments/experiment_0/mtsm_baseline.csv"
    cols = ['Topic', 'Lang', 'F1', 'Recall', 'Precision']
    df = pd.DataFrame(columns=cols)

    for topic in topics:
        for lang in langs:

            # --------------------------------------
            if topic == "uni":
                path = "/home/oyku/datasets/University/"

            elif topic == "movie":
                path = "/home/oyku/datasets/Movie/"

            elif topic == "title":
                path = "/home/oyku/datasets/Article/"
            # --------------------------------------

            labeled = path + topic + "_" + lang + "_blocked_translated.csv"
            labeled = pd.read_csv(labeled)
            print(labeled.shape)

            print("Running translated test on " + topic +
                  " dataset on language " + lang)

            fts_path = path + "features/" + topic + "_" + lang + "_baseline_features.csv"
            train_features = pd.read_csv(fts_path)
            print("Training features:  " + str(len(list(train_features))))

            exclude = ["_id", "ltable_id", "rtable_id"]
            gold = pd.DataFrame(labeled["Label"])

            cols = [col for col in list(train_features) if col not in exclude]
            train_features = train_features[cols]

            imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
            scale = StandardScaler()
            imp.fit(train_features)
            imp.statistics_[pd.np.isnan(imp.statistics_)] = 0
            features = scale.fit_transform(imp.transform(train_features))

            # Cross Validation
            model = XGBClassifier(random_state=7, n_estimators=350)
            kfold = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=7)  # random_state requires shuffle=True
            scoring = ['f1', 'recall', 'precision']
            scores = model_selection.cross_validate(model,
                                                    features,
                                                    gold.values.ravel(),
                                                    cv=kfold,
                                                    scoring=scoring)
            f1 = "%.3f (%.3f)" % (scores['test_f1'].mean() * 100,
                                  scores['test_f1'].std() * 100)
            recall = "%.3f (%.3f)" % (scores['test_recall'].mean() * 100,
                                      scores['test_recall'].std() * 100)
            precision = "%.3f (%.3f)" % (scores['test_precision'].mean() * 100,
                                         scores['test_precision'].std() * 100)

            print(
                "Topic: %s ---  Lang: %s --- F1: %s     Recall: %s      Precision: %s"
                % (topic, lang, f1, recall, precision))
            version_results = [topic, lang, f1, recall, precision]
            df.loc[len(df)] = version_results

    df.to_csv(result_path, index=False)
Example #19

def run(train, y, test, v, z):
    #cname = sys._getframe().f_code.co_name
    cname = 'p'
    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)
    from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
    dtrain = xgb.DMatrix(train, y)

    def step_xgb(params):
        cv = xgb.cv(params=params,
                    dtrain=dtrain,
                    num_boost_round=10000,
                    early_stopping_rounds=50,
                    nfold=10,
                    seed=params['seed'])
        score = cv.iloc[len(cv) - 1, 0]  # .ix was removed from pandas
        print(cname, score, len(cv), params)
        return dict(loss=score, status=STATUS_OK)

    space_xgb = dict(max_depth=hp.choice('max_depth', range(2, 9)),
                     subsample=hp.quniform('subsample', 0.6, 1, 0.05),
                     colsample_bytree=hp.quniform('colsample_bytree', 0.6, 1,
                                                  0.05),
                     learning_rate=hp.quniform('learning_rate', 0.005, 0.1,
                                               0.005),
                     min_child_weight=hp.quniform('min_child_weight', 1, 6, 1),
                     gamma=hp.quniform('gamma', 0.5, 10, 0.05),
                     reg_alpha=hp.quniform('reg_alpha', 0, 1, 0.001),
                     objective='binary:logistic',
                     eval_metric='logloss',
                     seed=1,
                     silent=1)
    trs = state.load('xgb_trials')
    if trs is None or debug_mode:
        tr = Trials()
    else:
        tr, _ = trs
    if len(tr.trials) > 0:
        print('reusing %d trials, best was:' % (len(tr.trials)),
              space_eval(space_xgb, tr.argmin))
        best = tr.argmin
    while len(tr.trials) < 15:
        best = fmin(step_xgb,
                    space_xgb,
                    algo=tpe.suggest,
                    max_evals=len(tr.trials) + 1,
                    trials=tr)
        state.save('xgb_trials', (tr, space_xgb))
    xgb_params = space_eval(space_xgb, best)
    print(xgb_params)

    N_splits = 9
    N_seeds = 3

    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test)
    cv = []
    for s in range(N_seeds):
        scores = []
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train, y)):
            dtrain = xgb.DMatrix(train.iloc[itrain], y[itrain])
            dvalid = xgb.DMatrix(train.iloc[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params,
                            dtrain,
                            10000,
                            watch,
                            early_stopping_rounds=100,
                            verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname2] += p
            score = metrics.log_loss(y[ival], p)
            z[cname2] += clf.predict(dtest)
            print(
                cname, 'seed %d step %d of %d: ' %
                (xgb_params['seed'], n + 1, skf.n_splits), score, state.now())
            scores.append(score)
        z[cname2] /= N_splits
        cv.append(np.mean(scores))
        print('seed %d loss: ' % (xgb_params['seed']), scores, np.mean(scores),
              np.std(scores))
        z['y'] = z[cname2]

    print('cv:', cv, np.mean(cv), np.std(cv))
    return cv, None
Example #20
    def __init__(self,
                 ds: pd.DataFrame,
                 n_ss_folds: int = 3,
                 n_folds: int = 5,
                 target_col: str = 'target',
                 random_state: int or None = None,
                 unlabeled_target_col: str = '5means_classes',
                 test_ratio: int = 0.25,
                 labeled_train_size_per_class: int = None,
                 unlabeled_train_size_per_class: int = None,
                 labeled_train_size: int = None,
                 unlabeled_train_size: int = None,
                 group_col: str or None = None,
                 equal_target: bool = True,
                 equal_unlabeled_target: bool = True,
                 shuffle: bool = True):
        super().__init__()

        self._test_ratio = test_ratio

        if equal_target and labeled_train_size_per_class is None:
            raise ValueError(
                "labeled_train_size_per_class must be determined when \
            equal_target is True, but found None")

        if not equal_target and labeled_train_size is None:
            raise ValueError("labeled_train_size must be determined when \
            equal_target is False, but found None")

        # Master split into Label/Unlabel
        if group_col is None:
            master_splitter = model_selection.StratifiedKFold(
                n_splits=n_ss_folds, shuffle=True, random_state=random_state)
            unlabeled_idx, labeled_idx = next(
                master_splitter.split(ds, ds[target_col]))
        else:
            master_splitter = model_selection.GroupKFold(n_splits=n_ss_folds)
            unlabeled_idx, labeled_idx = next(
                master_splitter.split(ds, ds[target_col],
                                      groups=ds[group_col]))
        unlabeled_ds = ds.iloc[unlabeled_idx]
        # u_groups = ds[unlabeled_target_col].iloc[unlabeled_idx]
        labeled_ds = ds.iloc[labeled_idx]
        l_groups = ds[target_col].iloc[labeled_idx]

        if not equal_target and labeled_train_size is not None and labeled_train_size > len(
                labeled_idx):
            raise ValueError(
                'Input labeled train size {} is larger than actual labeled train size {}'
                .format(labeled_train_size, len(labeled_idx)))

        if unlabeled_train_size is not None and unlabeled_train_size > len(
                unlabeled_idx):
            unlabeled_train_size = len(unlabeled_idx)
            # raise ValueError('Input unlabeled train size {} is larger than actual unlabeled train size {}'.format(unlabeled_train_size, len(unlabeled_idx)))

        # Split labeled data using GroupKFold
        # Split unlabeled data using GroupKFold
        self.__cv_folds_idx = []
        self.__ds_chunks = []

        # split of train/val data
        if group_col is None:
            unlabeled_splitter = model_selection.StratifiedKFold(
                n_splits=n_folds, shuffle=True, random_state=random_state + 1)
            unlabeled_spl_iter = unlabeled_splitter.split(
                unlabeled_ds, unlabeled_ds[target_col])
        else:
            unlabeled_splitter = model_selection.GroupKFold(n_splits=n_folds)
            unlabeled_spl_iter = unlabeled_splitter.split(
                unlabeled_ds,
                unlabeled_ds[target_col],
                groups=unlabeled_ds[group_col])

        if group_col is None:
            labeled_splitter = model_selection.StratifiedKFold(
                n_splits=n_folds, shuffle=True, random_state=random_state + 2)
            labeled_spl_iter = labeled_splitter.split(labeled_ds,
                                                      labeled_ds[target_col])
        else:
            labeled_splitter = model_selection.GroupKFold(n_splits=n_folds)
            labeled_spl_iter = labeled_splitter.split(
                labeled_ds,
                labeled_ds[target_col],
                groups=labeled_ds[group_col])

        for i in range(n_folds):
            u_train, u_test = next(unlabeled_spl_iter)
            l_train, l_test = next(labeled_spl_iter)

            l_train_target = labeled_ds.iloc[l_train][target_col]
            l_train_data = labeled_ds.iloc[l_train]

            l_test_target = labeled_ds.iloc[l_test][target_col]
            l_test_data = labeled_ds.iloc[l_test]

            # Sample labeled_train_size of labeled data
            if equal_target:
                filtered_l_train_idx, chosen_l_train = self._sample_labeled_data(
                    l_train_data, l_train_target, target_col,
                    labeled_train_size_per_class, random_state)

                filtered_l_test_idx, chosen_l_test = self._sample_labeled_data(
                    l_test_data, l_test_target, target_col,
                    int(labeled_train_size_per_class * self._test_ratio),
                    random_state)
            else:
                if labeled_train_size is not None:
                    chosen_l_train, _ = model_selection.train_test_split(
                        l_train,
                        train_size=labeled_train_size,
                        random_state=random_state,
                        shuffle=shuffle,
                        stratify=l_train_target)
                    chosen_l_test, _ = model_selection.train_test_split(
                        l_test,
                        train_size=int(labeled_train_size * self._test_ratio),
                        random_state=random_state,
                        shuffle=shuffle,
                        stratify=l_test_target)
                else:
                    chosen_l_train = l_train
                    chosen_l_test = l_test
                filtered_l_train_idx = labeled_ds.iloc[chosen_l_train]
                filtered_l_test_idx = labeled_ds.iloc[chosen_l_test]

            # Sample unlabeled_train_size of unlabeled data
            if equal_unlabeled_target:
                u_train_target = unlabeled_ds.iloc[u_train][
                    unlabeled_target_col]
                u_test_target = unlabeled_ds.iloc[u_test][unlabeled_target_col]

                filtered_u_train_idx, chosen_u_train = self._sample_unlabeled_data(
                    unlabeled_ds, u_train, unlabeled_target_col,
                    u_train_target, unlabeled_train_size_per_class,
                    random_state)

                filtered_u_test_idx, chosen_u_test = self._sample_unlabeled_data(
                    unlabeled_ds, u_test, unlabeled_target_col, u_test_target,
                    int(unlabeled_train_size_per_class * self._test_ratio),
                    random_state)
            else:
                if unlabeled_train_size is not None:
                    # chosen_u_train, _ = model_selection.train_test_split(u_train, train_size=unlabeled_train_size,
                    #                                                      random_state=random_state, shuffle=shuffle)
                    is_replace = unlabeled_train_size > len(u_train)
                    chosen_u_train = resample(u_train,
                                              n_samples=unlabeled_train_size,
                                              replace=is_replace,
                                              random_state=random_state)
                    unlabeled_test_size = int(unlabeled_train_size *
                                              self._test_ratio)
                    is_replace = unlabeled_test_size > len(u_test)
                    chosen_u_test = resample(u_test,
                                             n_samples=unlabeled_test_size,
                                             replace=is_replace,
                                             random_state=random_state)
                else:
                    chosen_u_train = u_train
                    chosen_u_test = u_test

                filtered_u_train_idx = unlabeled_ds.iloc[chosen_u_train]
                filtered_u_test_idx = unlabeled_ds.iloc[chosen_u_test]

            self.__cv_folds_idx.append(
                (chosen_l_train, chosen_l_test, chosen_u_train, chosen_u_test))

            self.__ds_chunks.append(
                (filtered_l_train_idx, filtered_l_test_idx,
                 filtered_u_train_idx, filtered_u_test_idx))

        self.__folds_iter = iter(self.__ds_chunks)
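
A standalone, runnable sketch of the "master split" idea used above: one StratifiedKFold fold carves a small labeled pool out of a larger unlabeled pool while preserving class proportions. The toy DataFrame and all names below are illustrative, not part of the original class:

import pandas as pd
from sklearn import model_selection

ds = pd.DataFrame({'feature': range(100), 'target': [0, 1] * 50})
master_splitter = model_selection.StratifiedKFold(n_splits=5, shuffle=True,
                                                  random_state=0)
# one fold: 4/5 of the rows become the unlabeled pool, 1/5 the labeled pool
unlabeled_idx, labeled_idx = next(master_splitter.split(ds, ds['target']))
print(len(unlabeled_idx), len(labeled_idx))  # 80 20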
Exemple #21
0
                                          bootstrap=bootstrap,
                                          min_samples_leaf=min_sample_leaf,
                                          min_samples_split=min_sample_split,
                                          n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          max_features=max_features,
                                          oob_score=oob_score,
                                          random_state=531,
                                          verbose=1,
                                          class_weight=class_weight,
                                          n_jobs=1)

fileModel = fileModel.fit(xTrain_base.values, yTrain_base.values)

if base_sampling is None:
    cv = model_selection.StratifiedKFold(n_splits=5)
else:
    cv = 5

y_pred_score = model_selection.cross_val_predict(
    fileModel,
    Valid.drop([selected_label] + ['id_siniestro'], axis=1).values,
    Valid[[selected_label]].values,
    cv=cv,
    method='predict_proba')

# keep only the positive-class probability column
y_pred_score = np.delete(y_pred_score, 0, axis=1)
y_hat_test = (y_pred_score > threshold_models).astype(int)
y_hat_test = y_hat_test.tolist()
y_hat_test = [item for sublist in y_hat_test for item in sublist]
recall_base = metrics.recall_score(y_pred=y_hat_test,
Exemple #22
0
                                        solver='sag',
                                        class_weight={
                                            1: 0.46,
                                            0: 1.32
                                        },
                                        verbose=0.8)
#model = SVC(probability=True)
#model = BernoulliNB()
model.fit(X, y)

logRegAccuracy = []
logRegLogLoss = []
logRegAUC = []

print('---------------------------------------------')
stratifiedCV = model_selection.StratifiedKFold(n_splits=numCVSplits,
                                               shuffle=True,
                                               random_state=2)

for k, (trainInds, validInds) in enumerate(stratifiedCV.split(X, y)):
    foldTrainingStartTime = time.time()

    X_train_cv = X[trainInds, :]
    X_valid_cv = X[validInds, :]

    y_train_cv = y[trainInds]
    y_valid_cv = y[validInds]

    model.fit(X_train_cv, y_train_cv)

    y_train_hat = model.predict_proba(X_train_cv)[:, 1]
    y_valid_hat = model.predict_proba(X_valid_cv)[:, 1]
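
    # (sketch, not in the original snippet) fill the metric lists declared
    # above; assumes sklearn's metrics module is imported as `metrics`
    # elsewhere in this file
    logRegAccuracy.append(metrics.accuracy_score(y_valid_cv, y_valid_hat > 0.5))
    logRegLogLoss.append(metrics.log_loss(y_valid_cv, y_valid_hat))
    logRegAUC.append(metrics.roc_auc_score(y_valid_cv, y_valid_hat))
    print('fold %d done in %.1f s' % (k + 1, time.time() - foldTrainingStartTime))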
def plotOptimalK(desc_train, targ_train):
    scoring = 'accuracy'
    i_array = list()
    euclid = list()
    euclid_W = list()
    manhattan = list()
    manhattan_W = list()
    Gaussian = list()
    mink = list()
    minkW = list()
    splits = 100  # note: StratifiedKFold below needs >= 100 samples per class
    for i in range(1, 25):
        models = []
        models.append(('KNN-Euclid', KNeighborsClassifier(n_neighbors=i, p=2)))
        models.append(("KNN-Euclid-Weighted",
                       KNeighborsClassifier(n_neighbors=i,
                                            weights='distance',
                                            p=2)))
        models.append(('KNN-Manhattan', KNeighborsClassifier(n_neighbors=i,
                                                             p=1)))
        models.append(("KNN-Manhattan-Weighted",
                       KNeighborsClassifier(n_neighbors=i,
                                            weights='distance',
                                            p=1)))
        #models.append(("Gaussian Bayes", GaussianNB()))
        #models.append(("ID3"),tree.DecisionTreeClassifier())

        # evaluate each model in turn
        results = []
        names = []

        for name, model in models:
            kfold = model_selection.StratifiedKFold(n_splits=splits,
                                                    shuffle=True,
                                                    random_state=seed)
            cv_results = model_selection.cross_val_score(model,
                                                         desc_train,
                                                         targ_train,
                                                         cv=kfold,
                                                         scoring=scoring)
            results.append(cv_results)
            names.append(name)
            # msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
            # print(msg)
            # print("-")
            if name == 'KNN-Euclid':
                euclid.append(cv_results.mean())
            elif name == "KNN-Euclid-Weighted":
                euclid_W.append(cv_results.mean())
            elif name == 'KNN-Manhattan':
                manhattan.append(cv_results.mean())
            elif name == 'KNN-Manhattan-Weighted':
                manhattan_W.append(cv_results.mean())
            elif name == 'KNN-Minkowski (8)':
                mink.append(cv_results.mean())
            elif name == 'KNN-Minkowski (8)-Weighted':
                minkW.append(cv_results.mean())
            # elif name == 'Gaussian Bayes':
            #     Gaussian.append( cv_results.mean())

        i_array.append(i)

    plt.plot(i_array, euclid, label='Euclidean')
    plt.plot(i_array, euclid_W, label='Euclidean Weighted')
    plt.plot(i_array, manhattan, label='Manhattan')
    plt.plot(i_array, manhattan_W, label='Manhattan Weighted')
    plt.plot(i_array, mink, label='Minkowski (8)')
    plt.plot(i_array, minkW, label='Minkowski (8) Weighted')
    plt.legend(loc='upper left')
    plt.title("Accuracy per (K)")
    plt.xlabel("K values")
    plt.ylabel("Accuracy")
    plt.show()
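
A quick way to exercise the sweep above (illustrative only: `seed` must already be defined at module level, as in the rest of this file, and with n_splits=100 every class needs at least 100 samples):

from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=3000, n_classes=2,
                                     random_state=0)
plotOptimalK(X_demo, y_demo)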
Exemple #24
0
labelsDB = data['shot_made_flag']


# ## Build a model based on the featuresDB table, and make sure it doesn't overfit
# (i.e. the training error and the test error are comparable)
# #### Use an ExtraTreesClassifier for that

# In[ ]:


#%% build a simple model and make sure it doesn't overfit
randomSeed = 1
numFolds   = 4

stratifiedCV = model_selection.StratifiedKFold(n_splits=numFolds, shuffle=True, random_state=randomSeed)

mainLearner = ensemble.ExtraTreesClassifier(n_estimators=500, max_depth=5, 
                                            min_samples_leaf=120, max_features=120, 
                                            criterion='entropy', bootstrap=False, 
                                            n_jobs=-1, random_state=randomSeed)

startTime = time.time()
trainAccuracy = []
validAccuracy = []
trainLogLosses = []
validLogLosses = []
for trainInds, validInds in stratifiedCV.split(featuresDB, labelsDB):
    # split to train and valid sets
    X_train_CV = featuresDB.iloc[trainInds,:]
    y_train_CV = labelsDB.iloc[trainInds]
    X_valid_CV = featuresDB.iloc[validInds,:]
    y_valid_CV = labelsDB.iloc[validInds]
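    # (sketch) a plausible completion of the fold body -- the lists above
    # suggest per-fold accuracy and log loss were tracked; `metrics` from
    # sklearn is an assumed import
    mainLearner.fit(X_train_CV, y_train_CV)
    trainAccuracy.append(mainLearner.score(X_train_CV, y_train_CV))
    validAccuracy.append(mainLearner.score(X_valid_CV, y_valid_CV))
    trainLogLosses.append(metrics.log_loss(y_train_CV,
                                           mainLearner.predict_proba(X_train_CV)))
    validLogLosses.append(metrics.log_loss(y_valid_CV,
                                           mainLearner.predict_proba(X_valid_CV)))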
                                                 gamma=G_list[it])
        gram_test = metrics.pairwise.rbf_kernel(X_test,
                                                X_train,
                                                gamma=G_list[it])
        kernel_train_list.append(gram_train)
        kernel_test_list.append(gram_test)

    # weight_v = hsic_kernel_weights_norm(kernel_train_list, y_train, 1, 0.01, 0) # calculating weights using HSIC during experiment

    # combine kernels
    for i in range(n_kernels):
        gram_train += kernel_train_list[i] * weight_v[i]
        gram_test += kernel_test_list[i] * weight_v[i]

# five-fold cross validation
cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# cost parameter for SVM
C = c_list[ds]

# init SVM classifier with precomputed kernel
clf = svm.SVC(C=C, kernel='precomputed', probability=True)

scorerMCC = metrics.make_scorer(metrics.matthews_corrcoef)
scorerSP = metrics.make_scorer(specificity_score)
scorerPR = metrics.make_scorer(metrics.precision_score)
scorerSE = metrics.make_scorer(metrics.recall_score)

scorer = {
    'ACC': 'accuracy',
    'recall': scorerSE,
Exemple #26
0
    def lr_roc_curve(self, C):
        """
        This generates an ROC curve using logistic regression and 10-fold cross-validation

        :param C: The C parameter used for the logistic regression.
        """
        # sets model
        model = lm.LogisticRegression(class_weight='balanced', C=C)

        # seeds random state from time
        random_state = np.random.RandomState(int(time.time()))
        np.random.seed(int(time.time() / 100))

        # Uncomment if you want to seed the random state from an integer instead (to be able to repeat exact results)
        #random_state = np.random.RandomState(11235813)
        #np.random.seed(112358)

        # Sets up 10-fold cross validation set
        cv = ms.StratifiedKFold(n_splits=10,
                                random_state=random_state,
                                shuffle=True)

        tprs = []
        aucs = []
        f1s = []
        mean_fpr = np.linspace(0, 1, 100)

        i = 0

        # Creates a shuffled index for X and y
        shuffled_idx = np.arange(len(self.y))
        np.random.shuffle(shuffled_idx)

        # Uncomment if you want it to find and print the mean f1 score
        #test_f1_mean = np.mean(ms.cross_val_score(model, self.X[shuffled_idx], self.y[shuffled_idx], cv=10, n_jobs=-1, scoring='f1'))
        #print('using cross val score F1 = %0.4f' % (test_f1_mean))

        # Calculates and plots the roc curve for each set in 10-fold cross validation
        for train, test in cv.split(self.X, self.y):
            model_i = model.fit(self.X[train], self.y[train])
            probas_ = model_i.predict_proba(self.X[test])
            pred = model_i.predict(self.X[test])
            f1 = met.f1_score(self.y[test], pred, average='binary')
            f1s.append(f1)
            # Compute ROC curve and area the curve
            fpr, tpr, thresholds = met.roc_curve(self.y[test], probas_[:, 1])
            # np.interp: scipy.interp was removed in recent SciPy
            tprs.append(np.interp(mean_fpr, fpr, tpr))
            tprs[-1][0] = 0.0
            roc_auc = met.auc(fpr, tpr)
            aucs.append(roc_auc)
            plt.plot(fpr,
                     tpr,
                     lw=1,
                     alpha=0.3,
                     label='ROC fold %d (AUC = %0.4f, F1 = %0.4f)' %
                     (i + 1, roc_auc, f1))

            i += 1

        # Plots the 50/50 line
        plt.plot([0, 1], [0, 1],
                 linestyle='--',
                 lw=2,
                 color='r',
                 label='Coin Flip',
                 alpha=.8)

        # Finds and plots the mean roc curve and mean f1 score
        mean_tpr = np.mean(tprs, axis=0)
        mean_f1 = np.mean(f1s)
        mean_tpr[-1] = 1.0
        mean_auc = met.auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        plt.plot(mean_fpr,
                 mean_tpr,
                 color='b',
                 label=u'Mean ROC (AUC = %0.4f \u00B1 %0.4f,\n'
                       u'Mean F1 = %0.4f)' % (mean_auc, std_auc, mean_f1),
                 lw=2,
                 alpha=.8)

        # Finds and plots the +- standard deviation for roc curve
        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr,
                         tprs_lower,
                         tprs_upper,
                         color='grey',
                         alpha=.2,
                         label=r'$\pm$ 1 std. dev.')

        # Sets legend, limits, labels, and displays plot
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        outloc = self.output + '/Figure2.png'
        plt.savefig(outloc)
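
In isolation, the fold-averaging technique used above interpolates every fold's ROC onto one common FPR grid before taking the vertical mean; the toy values below are illustrative:

import numpy as np

mean_fpr = np.linspace(0, 1, 100)
fold_rocs = [([0.0, 0.2, 1.0], [0.0, 0.7, 1.0]),   # (fpr, tpr) of fold 1
             ([0.0, 0.5, 1.0], [0.0, 0.9, 1.0])]   # (fpr, tpr) of fold 2
tprs = [np.interp(mean_fpr, fpr, tpr) for fpr, tpr in fold_rocs]
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[0], mean_tpr[-1] = 0.0, 1.0  # pin the endpoints as the code above does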
Exemple #27
0
import pandas as pd
from sklearn import model_selection

if __name__ == '__main__':
    df = pd.read_csv('input/train.csv')
    df['kfold'] = -1
    print('train shape:', df.shape)

    df = df.sample(frac=1).reset_index(drop=True)

    kf = model_selection.StratifiedKFold(n_splits=5, shuffle=False)

    for fold, (train_idx,
               val_idx) in enumerate(kf.split(X=df, y=df.target.values)):
        print(len(train_idx), len(val_idx))
        df.loc[val_idx, 'kfold'] = fold

    df.to_csv('input/train_folds.csv', index=False)
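
Downstream, the kfold column written above is typically consumed like this (a sketch; holding out fold 0 is an arbitrary choice, not part of the original script):

import pandas as pd

df = pd.read_csv('input/train_folds.csv')
fold = 0
train_df = df[df.kfold != fold].reset_index(drop=True)
valid_df = df[df.kfold == fold].reset_index(drop=True)
print(train_df.shape, valid_df.shape)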
Exemple #28
0
    def classify_rf(self,
                    max_depth=64,
                    n_estimators=1000,
                    max_features="sqrt",
                    roc_flag=False,
                    rand_flag=False,
                    save="",
                    compare_flag=True,
                    group_classes=True):
        """
        This uses LogisticRegressionCV to find the maximum mean f1 score using by adjusting the C parameter

        :param C_flag: A boolian indicating what to output from the function. (if False output the max mean f1, if True output the C value used to find the maximum mean f1 score)
        """

        # seeds random state from time
        random_state = np.random.RandomState(int(time.time()))
        np.random.seed(int(time.time() / 100))
        if group_classes:
            rng_idx = np.arange(len(self.class_list))
            np.random.shuffle(rng_idx)
        # Uncomment if you want to seed the random state from an integer instead (to be able to repeat exact results)
        #random_state = np.random.RandomState(11235813)
        #np.random.seed(112358)

        # Sets and fits the Random Forest model
        model2 = ensemble.RandomForestClassifier(class_weight='balanced',
                                                 max_depth=max_depth,
                                                 max_leaf_nodes=None,
                                                 n_estimators=n_estimators,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 max_features=max_features,
                                                 n_jobs=-1)
        fitModel = model2.fit(self.X, self.y)

        # saves the model
        if len(save) > 0:
            joblib.dump(fitModel, save)

        if rand_flag:
            # Generate random drug-disease pairs
            rand_n = 10000
            self.rand_rate(rand_n, self.drugs_path, self.diseases_path)

            # Get random pairs cutoff rates
            probas_rand = fitModel.predict_proba(self.X2)

            self.data["treat_prob"] = [pr[1] for pr in probas_rand]
            rand_df_sort = self.data.sort_values(
                "treat_prob", ascending=False).reset_index(drop=True)
            rand_df_sort.to_csv(self.output + "random_pairs_names.csv",
                                index=False)
            #print(self.data.sort_values("treat_prob", ascending = False).reset_index(drop=True))

            # Get true positive cutoff rates
            probas_tp = fitModel.predict_proba(self.Xtp)

            # Get true negative cutoff rates
            probas_tn = fitModel.predict_proba(self.Xtn)

            # Plot the cutoff rates together
            self.plot_cutoff([
                pd.DataFrame({"treat_prob": [pr[1] for pr in probas_rand]}),
                pd.DataFrame({"treat_prob": [pr[1] for pr in probas_tp]}),
                pd.DataFrame({"treat_prob": [pr[1] for pr in probas_tn]})
            ], ["Random Pairs", "True Positives", "True Negatives"])

        if roc_flag:
            model = ensemble.RandomForestClassifier(class_weight='balanced',
                                                    max_depth=max_depth,
                                                    max_leaf_nodes=None,
                                                    n_estimators=n_estimators,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    max_features=max_features,
                                                    n_jobs=-1)

            # Sets up 10-fold cross validation set
            cv = ms.StratifiedKFold(n_splits=10,
                                    random_state=random_state,
                                    shuffle=True)
            if group_classes:
                cv = ms.GroupKFold(n_splits=10)
            tprs = []
            aucs = []
            f1s = []
            mean_fpr = np.linspace(0, 1, 100)

            i = 0

            # Creates a shuffled index for X and y
            shuffled_idx = np.arange(len(self.y))
            np.random.shuffle(shuffled_idx)

            # Uncomment if you want it to find and print the mean f1 score
            #test_f1_mean = np.mean(ms.cross_val_score(model, self.X[shuffled_idx], self.y[shuffled_idx], cv=10, n_jobs=-1, scoring='f1'))
            #print('using cross val score F1 = %0.4f' % (test_f1_mean))

            prob_list = []

            if group_classes:
                # split the *shuffled* arrays and index them below, so the
                # group constraint actually holds across folds
                X_cv, y_cv = self.X[rng_idx], self.y[rng_idx]
                cv_params = {
                    "X": X_cv,
                    "y": y_cv,
                    "groups": list(self.class_list[rng_idx])
                }
            else:
                X_cv, y_cv = self.X, self.y
                cv_params = {"X": X_cv, "y": y_cv}
            # Calculates and plots the roc curve for each set in 10-fold cross validation
            for train, test in cv.split(**cv_params):
                model_i = model.fit(X_cv[train], y_cv[train])
                probas_ = model_i.predict_proba(X_cv[test])
                pred = model_i.predict(X_cv[test])
                f1 = met.f1_score(y_cv[test], pred, average='binary')
                f1s.append(f1)
                # Compute ROC curve and area under the curve
                #prob_list += [pd.DataFrame({"treat_prob":[pr[1] for pr in probas_]})]
                fpr, tpr, thresholds = met.roc_curve(y_cv[test], probas_[:, 1])
                # np.interp: scipy.interp was removed in recent SciPy
                tprs.append(np.interp(mean_fpr, fpr, tpr))
                tprs[-1][0] = 0.0
                roc_auc = met.auc(fpr, tpr)
                aucs.append(roc_auc)
                plt.plot(fpr,
                         tpr,
                         lw=1,
                         alpha=0.3,
                         label='ROC fold %d (AUC = %0.4f, F1 = %0.4f)' %
                         (i, roc_auc, f1))

                i += 1

            # Plots the 50/50 line
            plt.plot([0, 1], [0, 1],
                     linestyle='--',
                     lw=2,
                     color='r',
                     label='Coin Flip',
                     alpha=.8)

            # Finds and plots the mean roc curve and mean f1 score
            mean_tpr = np.mean(tprs, axis=0)
            mean_f1 = np.mean(f1s)
            mean_tpr[-1] = 1.0
            mean_auc = met.auc(mean_fpr, mean_tpr)
            std_auc = np.std(aucs)
            plt.plot(mean_fpr,
                     mean_tpr,
                     color='b',
                     label=u'Mean ROC (AUC = %0.4f \u00B1 %0.4f,\n'
                           u'Mean F1 = %0.4f)' % (mean_auc, std_auc, mean_f1),
                     lw=2,
                     alpha=.8)

            # Finds and plots the +- standard deviation for roc curve
            std_tpr = np.std(tprs, axis=0)
            tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
            tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
            plt.fill_between(mean_fpr,
                             tprs_lower,
                             tprs_upper,
                             color='grey',
                             alpha=.2,
                             label=r'$\pm$ 1 std. dev.')

            # Sets legend, limits, labels, and displays plot
            plt.xlim([-0.05, 1.05])
            plt.ylim([-0.05, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver Operating Characteristic')
            plt.legend(loc="lower right")
            outloc = self.output + '/Figure3.png'
            plt.savefig(outloc)
            #plt.show()
            plt.close()
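
Why the GroupKFold switch above matters: it guarantees that samples sharing a group (here, entries of self.class_list) never straddle the train/test boundary, so the ROC is not inflated by leakage. A minimal demonstration with made-up data:

import numpy as np
from sklearn import model_selection as ms

X_demo = np.arange(12).reshape(6, 2)
y_demo = np.array([0, 1, 0, 1, 0, 1])
groups = np.array([0, 0, 1, 1, 2, 2])
for tr, te in ms.GroupKFold(n_splits=3).split(X_demo, y_demo, groups):
    # no group appears on both sides of a split
    assert set(groups[tr]).isdisjoint(groups[te])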
Exemple #29
0
     1))  # a list for storing generalization error after each outer cv-fold
h_optimal_list = []  # optimal hidden-unit count after each outer cv-fold
ANN_best_models = []  # best model after each outer cv-fold

# Make figure for holding summaries (errors and learning curves)
summaries, summaries_axes = plt.subplots(1, 2, figsize=(10, 5))
# Make a list for storing assigned color of learning curve for up to K=10
color_list = [
    'tab:orange', 'tab:green', 'tab:purple', 'tab:brown', 'tab:pink',
    'tab:gray', 'tab:olive', 'tab:cyan', 'tab:red', 'tab:blue'
]

for k, (train_index, test_index) in enumerate(
        model_selection.StratifiedKFold(K, shuffle=True,
                                        random_state=0).split(X, y)):
    print('\nCROSSVALIDATION OUTER FOLD: {0}/{1}'.format(k + 1, K))

    # network_validate_classification does the inner cross-validation with cvf=10.
    # NOTE: passing the full X, y lets the inner CV see the outer test fold;
    # X[train_index], y[train_index] would avoid that leakage.
    opt_val_err, opt_n_h_units = network_validate_classification(
        X, y, h_interval)

    model = lambda: torch.nn.Sequential(
        torch.nn.Linear(M, opt_n_h_units),  # M features to H hidden units
        torch.nn.Tanh(),  # first transfer function
        torch.nn.Linear(opt_n_h_units, 1),  # H hidden units to 1 output neuron
        torch.nn.Sigmoid()  # final transfer function
    )
    loss_fn = torch.nn.BCELoss()

    X_train = torch.Tensor(X[train_index, :])
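    # (sketch, not from the original file) a plausible continuation of the
    # fold: tensors for the test split and a bare-bones training loop for
    # the network defined above
    y_train = torch.Tensor(y[train_index]).reshape(-1, 1)
    X_test = torch.Tensor(X[test_index, :])
    y_test = torch.Tensor(y[test_index]).reshape(-1, 1)

    net = model()
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-2)
    for _ in range(2000):
        optimizer.zero_grad()
        loss = loss_fn(net(X_train), y_train)
        loss.backward()
        optimizer.step()

    # threshold the sigmoid output at 0.5 and record the fold error rate
    y_test_est = (net(X_test) > 0.5).float()
    print('fold {0} error rate: {1:.3f}'.format(
        k + 1, (y_test_est != y_test).float().mean().item()))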
    def run(self, decision_tree_params, knn_params, nb_params, mlp_params,
            regression_params):
        # folds for training and evaluation
        inner_skf = model_selection.StratifiedKFold(n_splits=10,
                                                    shuffle=True,
                                                    random_state=None)
        # folds for hyper-parameter tuning
        outer_skf = model_selection.StratifiedKFold(n_splits=3,
                                                    shuffle=True,
                                                    random_state=None)

        decision_tree = DecisionTreeClassifier(
            **self.decision_tree_tune_params(outer_skf, decision_tree_params))
        knn = KNeighborsClassifier(
            **self.knn_tune_params(outer_skf, knn_params))
        bayes = GaussianNB(**self.nb_tune_params(outer_skf, nb_params))
        regression = LogisticRegression(**self.logistic_regression_tune_params(
            outer_skf, regression_params))
        mlp = MLPClassifier(**self.mlp_tune_params(outer_skf, mlp_params))

        for train, test in inner_skf.split(self.data, self.target):
            data_train, target_train = self.data[train], self.target[train]
            data_test, target_test = self.data[test], self.target[test]

            decision_tree = decision_tree.fit(data_train, target_train)
            decision_tree_predicted = decision_tree.predict(data_test)
            self.predicted_classes["tree"][test] = decision_tree_predicted
            self.stats["tree"]["f1_score"].append(
                f1_score(self.target[test],
                         decision_tree_predicted,
                         average=None))
            self.stats["tree"]["accuracy_score"].append(
                accuracy_score(self.target[test],
                               decision_tree_predicted,
                               average=None))

            knn = knn.fit(data_train, target_train)
            knn_predicted = knn.predict(data_test)
            self.predicted_classes["knn"][test] = knn_predicted
            self.stats["knn"]["f1_score"].append(
                f1_score(self.target[test], knn_predicted, average=None))
            self.stats["knn"]["accuracy_score"].append(
                accuracy_score(self.target[test], knn_predicted, average=None))

            bayes = bayes.fit(data_train, target_train)
            bayes_predicted = bayes.predict(data_test)
            self.predicted_classes["bayes"][test] = bayes_predicted
            self.stats["bayes"]["f1_score"].append(
                f1_score(self.target[test], bayes_predicted, average=None))
            self.stats["bayes"]["accuracy_score"].append(
                accuracy_score(self.target[test],
                               bayes_predicted,
                               average=None))

            regression = regression.fit(data_train, target_train)
            regression_predicted = regression.predict(data_test)
            self.predicted_classes["regression"][test] = regression_predicted
            self.stats["regression"]["f1_score"].append(
                f1_score(self.target[test], regression_predicted,
                         average=None))
            self.stats["regression"]["accuracy_score"].append(
                accuracy_score(self.target[test],
                               regression_predicted,
                               average=None))

            mlp = mlp.fit(data_train, target_train)
            mlp_predicted = mlp.predict(data_test)
            self.predicted_classes["mlp"][test] = mlp_predicted
            self.stats["mlp"]["f1_score"].append(
                f1_score(self.target[test], mlp_predicted, average=None))
            self.stats["mlp"]["accuracy_score"].append(
                accuracy_score(self.target[test], mlp_predicted, average=None))
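
        # (sketch, not in the original method) average the collected fold
        # scores per model; assumes `numpy as np` is imported in this module
        for clf_name, clf_stats in self.stats.items():
            mean_acc = np.mean(clf_stats["accuracy_score"])
            mean_f1 = np.mean([np.mean(scores) for scores in clf_stats["f1_score"]])
            print('%s: accuracy=%.4f, f1=%.4f' % (clf_name, mean_acc, mean_f1))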