Example #1
def test_many_classifiers(X, y, classifiers, Kfold=5):
    ErrorClassifiers = np.zeros(len(classifiers))

    rkf = RepeatedKFold(n_splits=Kfold, n_repeats=1)

    for train_index, test_index in rkf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        for i_clas in range(len(classifiers)):
            classifiers[i_clas].fit(X_train, y_train)
            ErrorClassifiers[i_clas] += 1 - \
                classifiers[i_clas].score(X_test, y_test)

    ErrorClassifiers /= rkf.get_n_splits(X)

    return ErrorClassifiers
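A minimal usage sketch for the function above (it assumes numpy is imported as np and RepeatedKFold is imported from sklearn.model_selection); the dataset and classifier list here are purely illustrative:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# synthetic data, for illustration only
X, y = make_classification(n_samples=300, n_features=10, random_state=0)
classifiers = [LogisticRegression(max_iter=1000),
               DecisionTreeClassifier(random_state=0)]

# one mean misclassification error per classifier, averaged over the 5 splits
errors = test_many_classifiers(X, y, classifiers, Kfold=5)
print(errors)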
Example #2
def mlpKFold(X, y, k, act, solve, alph, it):
    acc = 0
    rkf = RepeatedKFold(n_splits=k)
    for train_index, test_index in rkf.split(X):
        X_train = []
        X_test = []
        y_train = []
        y_test = []
        for i in train_index:
            X_train.append(X[i])
            y_train.append(y[i])
        for i in test_index:
            X_test.append(X[i])
            y_test.append(y[i])
        mlp = MLPClassifier(activation=activationGlobal[act],
                            solver=solverGlobal[solve],
                            alpha=alphaGlobal[alph],
                            max_iter=max_iterationsGlobal[it])
        mlp = mlp.fit(X_train, y_train)
        acc += mlp.score(X_test, y_test)
    acc /= rkf.get_n_splits()
    return acc
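A minimal call sketch for mlpKFold, assuming feature and label sequences X and y are already defined; the module-level lookup lists below are hypothetical placeholders for whatever the original script defines:

# hypothetical lookup lists assumed by mlpKFold (values are illustrative only)
activationGlobal = ['relu', 'tanh']
solverGlobal = ['adam', 'sgd']
alphaGlobal = [1e-4, 1e-3]
max_iterationsGlobal = [200, 500]

# 5-fold evaluation using the first option from each list; note that
# RepeatedKFold defaults to n_repeats=10, so 5 * 10 = 50 models are fitted here
mean_acc = mlpKFold(X, y, k=5, act=0, solve=0, alph=0, it=0)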
Example #3
import pandas
import matplotlib.pyplot as plot
from sklearn import metrics
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RepeatedKFold

dataset = pandas.read_csv('salaryData.csv')

x = dataset['YearsExperience'].values
y = dataset['Salary'].values
X = x.reshape(len(x), 1)
Y = y.reshape(len(y), 1)

kf = RepeatedKFold(n_splits=2, n_repeats=1, random_state=200)
kf.get_n_splits(X)

for train_index, test_index in kf.split(X):
    xTrain, xTest = X[train_index], X[test_index]
    yTrain, yTest = Y[train_index], Y[test_index]

    regressor = DecisionTreeRegressor()
    regressor.fit(xTrain, yTrain)

    regr = AdaBoostRegressor()
    regr.fit(xTrain, yTrain.ravel())  # fit on the training fold only, not on the full dataset

    yPrediction = regressor.predict(xTest)
    yPred = regr.predict(xTest)

    df = pandas.DataFrame({
Example #4
    def Xgboost_RepeatedKFold(self):
        i = 0
        self.d_test = self.CreateDMatrix(DF=self.Test_df, is_test=True)
        self.Train_df.reset_index(inplace=True, drop=True)
        kf = RepeatedKFold(n_splits=self.nbr_fold,
                           random_state=self.random_state,
                           n_repeats=self.nbr_RepeatedKFold)
        kf.get_n_splits(self.Train_df)
        self.Pred_train = np.zeros((len(self.Train_df)))
        self.Pred_test = np.zeros((len(self.Test_df)))
        List_validation_fold = []
        List_Train_fold = []
        self.logs = []
        for train_index, val_index in kf.split(self.Train_df):
            i += 1
            self.logs.append("#" * 50 + "fold:" + str(i) + "#" * 50)
            List_train_run = []
            List_validation_run = []
            Train_fold, Val_fold = self.Train_df.loc[
                train_index, :], self.Train_df.loc[val_index, :]

            for run in range(self.nbr_run):
                clear_output()
                if self.nbr_run > 0:
                    self.params["seed"] = random.randint(1, 10000)
                self.print_log(self.logs)

                self.xgboost = self.fit(Train_fold, Val_fold)

                train_metrics, val_metrics, Val_pred, Test_pred = self.eval_xgboost(
                )
                List_train_run.append(train_metrics)
                List_validation_run.append(val_metrics)

                self.logs.append("run " + str(run) + " train metrics :" +
                                 str(train_metrics) + " val metrics : " +
                                 str(val_metrics))
                self.Pred_train[val_index] += Val_pred
                self.Pred_test += Test_pred

                clear_output()

            List_validation_fold.append(np.mean(List_validation_run))
            List_Train_fold.append(np.mean(List_train_run))
            self.logs.append("\n" + "fold-" + str(i) + " train metrics :" +
                             str(np.mean(List_train_run)) + " val metrics : " +
                             str(np.mean(List_validation_run)))

            clear_output()
            self.print_log(self.logs)

        Val_metrics = np.mean(List_validation_fold)
        Train_metrics = np.mean(List_Train_fold)
        self.Pred_test /= (self.nbr_fold * self.nbr_run *
                           self.nbr_RepeatedKFold)
        self.Pred_train /= (self.nbr_run * self.nbr_RepeatedKFold)

        print("End Training with train metrics :" + str(Train_metrics) +
              " val metrics : " + str(Val_metrics))

        return self.get_output(self.Pred_test, self.Pred_train)
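The core pattern in this method is accumulating out-of-fold and test predictions over every fold, run, and repeat, and dividing by the number of accumulations at the end. A simplified standalone sketch of that pattern (one run, a scikit-learn regressor standing in for the XGBoost model, synthetic data):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
n_splits, n_repeats = 5, 2
rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)

oof_pred = np.zeros(len(X))
for train_index, val_index in rkf.split(X):
    model = Ridge().fit(X[train_index], y[train_index])
    # every sample lands in the validation fold exactly once per repeat,
    # so the accumulated predictions are divided by n_repeats afterwards
    oof_pred[val_index] += model.predict(X[val_index])
oof_pred /= n_repeats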
Example #5
def train_and_test(classifier, csv_train_path, csv_test_path, selected_feature_names=None, swap_traintest=False, cv_fold_and_repeat=None,
                   balance=False, standardize=True, shuffle=False, categorical_feature_mapping=None, one_hot=False, prob_scores=False):
    """
    Function to train and test a new model on specified ntrain and test sets

    :param classifier: sklearn object of the model (not yet fitted) to be used
    :param csv_train_path: .csv path of train data
    :param csv_test_path: .csv path of test data
    :param selected_feature_names: List of names of the selected features
    :param swap_traintest: If True, swap the specified train and test sets (i.e. use csv_test_path as train set)
    :param cv_fold_and_repeat: Set this to a tuple (k, n) to perform cross validation. (k=k-fold CV, n=number of repetitions)
    :param balance: Set True to balance the data. (#samples same for all classes)
    :param standardize: Set to True to standardize data, or privide a path to a .pickle file containing a stored standardizer
    :param shuffle: Set to True to activate shuffling for cross validation
    :param categorical_feature_mapping: Dictionary to map categorical features to numerical values (see doc of function ml_helper.load_dataset())
    :param one_hot: If categorical features are present, set this parameter to True to enable one hot encoding
    :param prob_scores: Set true to include score predictions (e.g. probabilities) into reported performance metrics
                        --> necessary for precision-recall curves
    :return:
    """
    if cv_fold_and_repeat is not None:
        if csv_test_path is not None:
            X, Y = ml_helpers.load_dataset_seperate(csv_train_path, csv_test_path, selected_feature_names=selected_feature_names,
                                                balance=balance, standardize=standardize, merge=True, categorical_feature_mapping=categorical_feature_mapping, one_hot=one_hot)
        else:
            X, Y, _, _ = ml_helpers.load_dataset(csv_train_path, 1.0, selected_feature_names=selected_feature_names, balance=balance,
                         standardize=standardize, categorical_feature_mapping=categorical_feature_mapping, one_hot=one_hot)


        kf = RepeatedKFold(n_splits=cv_fold_and_repeat[0], n_repeats=cv_fold_and_repeat[1])

        # kf = KFold(n_splits=cv_fold_and_repeat[0], shuffle=shuffle)
        kf.get_n_splits(X)

        metrics = []

        print('Cross Validation ...\n')
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]

            clf = clone(classifier)  # yields a new estimator with the same parameters that has not been fit on any data.
            clf.fit(X_train, Y_train)

            Y_predicted = clf.predict(X_test)

            metrics.append(performance_metrics(Y_test, Y_predicted, report=True))

        return metrics
        # scores = cross_val_score(clf, X_train, Y_train, cv=6)
    else:
        X_train, Y_train, X_test, Y_test  = ml_helpers.load_dataset_seperate(csv_train_path, csv_test_path, selected_feature_names=selected_feature_names,
                                                                            balance=balance, standardize=standardize, swap_traintest=swap_traintest,
                                                                            categorical_feature_mapping=categorical_feature_mapping, one_hot=one_hot)

        print('Training the model ...')
        start = time.time()
        classifier.fit(X_train, Y_train)
        end = time.time()
        train_time = end - start
        print('... Training took {}s'.format(train_time))

        print('Performing predictions on testset ...')
        start = time.time()
        Y_predicted = classifier.predict(X_test)
        end = time.time()
        inference_time = end - start
        print('... Inference took {}s'.format(inference_time))

        if prob_scores:
            Y_scores = classifier.predict_proba(X_test)
            metrics = performance_metrics(Y_test, Y_predicted, Y_scores, report=True)
        else:
            metrics = performance_metrics(Y_test, Y_predicted, report=True)

        metrics['train_time'] = train_time
        metrics['inference_time'] = inference_time
        metrics['nr_train_samples'] = len(Y_train)
        metrics['nr_test_samples'] = len(Y_test)
        return metrics
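A sketch of how the cross-validation branch might be invoked; the classifier choice and CSV paths are placeholders, not part of the original code:

from sklearn.ensemble import RandomForestClassifier

# 10-fold cross validation repeated 3 times; returns one metrics dict per split (30 in total)
cv_metrics = train_and_test(RandomForestClassifier(),
                            csv_train_path='train.csv',  # placeholder path
                            csv_test_path='test.csv',    # placeholder path
                            cv_fold_and_repeat=(10, 3))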
Example #6
    def fit(self,
            X,
            y,
            labels=None,
            dist=None,
            importance_weights=None,
            cv_indices=None,
            dist_savename=None):
        t = time.time()

        if y.ndim < 2:
            y = y.reshape(-1, 1)

        if self.n_components is not None:
            if self.verbose > 0:
                elapsed = time.time() - t
                print('PCA [%dmin %dsec]' %
                      (int(elapsed / 60), int(elapsed % 60)))
            sys.stdout.flush()
            self.pca = PCA(n_components=self.n_components, svd_solver='arpack')
            y_ = self.pca.fit_transform(y)
            if self.verbose > 0:
                elapsed = time.time() - t
                print('Lost %.1f%% information ' % (self.pca.noise_variance_) +
                      '[%dmin %dsec]' % (int(elapsed / 60), int(elapsed % 60)))
        else:
            y_ = y

        if labels is not None:
            raise RuntimeError('Not implemented.')

        if cv_indices is None:
            cv_indices = np.arange(X.shape[0])
        if self.cv_type is None:
            kfold = RepeatedKFold(n_splits=self.cv_nfolds,
                                  n_repeats=self.cv_shuffles)
            cv_folds = kfold.split(X[cv_indices])
            n_cv_folds = kfold.get_n_splits()
        elif self.cv_type == 'iter':
            cv_folds = self.cv_groups
            n_cv_folds = len(self.cv_groups)
        elif self.cv_type == 'group':
            groups = self.cv_groups
            if self.cv_nfolds is None:
                self.cv_nfolds = len(np.unique(groups))
            kfold = GroupKFold(n_splits=self.cv_nfolds)
            cv_folds = kfold.split(X[cv_indices], y[cv_indices], groups)
            n_cv_folds = kfold.get_n_splits()
        else:
            raise Exception('Cross-validation type not supported')

        add_train_inds = np.setdiff1d(np.arange(X.shape[0]), cv_indices)
        cv_folds = list(cv_folds)
        cv_folds = [(np.concatenate((train_fold, add_train_inds)), test_fold)
                    for train_fold, test_fold in cv_folds]

        if self.verbose > 0:
            elapsed = time.time() - t
            print('Computing distance matrix [%dmin %dsec]' %
                  (int(elapsed / 60), int(elapsed % 60)))
            sys.stdout.flush()

        if dist is None:
            dist = euclidean_distances(X, None, squared=self.squared_dist)
            if dist_savename is not None:
                if self.verbose > 0:
                    print('Saving distance matrix to file:', dist_savename)
                np.save(dist_savename, dist)

        if importance_weights is None:
            self.krr_param_grid['lambda'] = [0]
            importance_weights = np.ones((X.shape[0], ))

        importance_weights = importance_weights**(0.5)

        errors = []
        if 'v' in self.krr_param_grid:
            for fold_i, (train_i, test_i) in enumerate(cv_folds):
                fold_errors = np.empty(
                    (len(self.krr_param_grid['v']),
                     len(self.krr_param_grid['gamma']), 1,
                     len(self.krr_param_grid['alpha']), y_.shape[1]))
                if self.verbose > 0:
                    elapsed = time.time() - t
                    print('CV %d of %d [%dmin %dsec]' %
                          (fold_i + 1, n_cv_folds, int(
                              elapsed / 60), int(elapsed % 60)))
                    sys.stdout.flush()
                for v_i, v in enumerate(self.krr_param_grid['v']):
                    for gamma_i, gamma in enumerate(
                            self.krr_param_grid['gamma']):
                        for lamb_i, lamb in enumerate(
                                self.krr_param_grid['lambda']):
                            iw = importance_weights**lamb
                            iw = iw[:, None]
                            K_train = self.kernel.apply_to_dist(dist[np.ix_(
                                train_i, train_i)],
                                                                gamma=gamma)
                            K_train *= np.outer(iw[train_i], iw[train_i])
                            K_test = self.kernel.apply_to_dist(dist[np.ix_(
                                test_i, train_i)],
                                                               gamma=gamma)
                            if self.verbose > 0:
                                sys.stdout.write('.')
                                sys.stdout.flush()
                            for alpha_i, alpha in enumerate(
                                    self.krr_param_grid['alpha']):
                                if self.verbose > 0:
                                    sys.stdout.write(',')
                                    sys.stdout.flush()
                                for y_i in np.arange(y_.shape[1]):
                                    K_train_ = K_train.copy()
                                    alpha_add = get_alpha_add(
                                        self.n_basis, self.n_grid, self.delta,
                                        v)
                                    K_train_.flat[::K_train_.shape[0] +
                                                  1] += alpha * alpha_add[y_i]
                                    try:
                                        L_ = cholesky(K_train_, lower=True)
                                        x = solve_triangular(L_,
                                                             y_[train_i, y_i],
                                                             lower=True)
                                        dual_coef_ = solve_triangular(L_.T, x)
                                        pred_mean = np.dot(K_test, dual_coef_)
                                        if self.mae:
                                            e = np.mean(
                                                np.abs(pred_mean -
                                                       y_[test_i, y_i]), 0)
                                        else:
                                            e = np.mean((pred_mean -
                                                         y_[test_i, y_i])**2,
                                                        0)
                                    except np.linalg.LinAlgError:
                                        e = np.inf
                                    fold_errors[v_i, gamma_i, 0, alpha_i,
                                                y_i] = e
                if self.verbose > 0:
                    sys.stdout.write('\n')
                    sys.stdout.flush()
                errors.append(fold_errors)
            errors = np.array(errors)
            errors = np.mean(errors, 0)  # average over folds
        else:
            for fold_i, (train_i, test_i) in enumerate(cv_folds):
                fold_errors = np.empty(
                    (len(self.krr_param_grid['gamma']),
                     len(self.krr_param_grid['lambda']),
                     len(self.krr_param_grid['alpha']), y_.shape[1]))
                if self.verbose > 0:
                    elapsed = time.time() - t
                    print('CV %d of %d [%dmin %dsec]' %
                          (fold_i + 1, n_cv_folds, int(
                              elapsed / 60), int(elapsed % 60)))
                    sys.stdout.flush()
                for gamma_i, gamma in enumerate(self.krr_param_grid['gamma']):
                    if self.verbose > 0:
                        sys.stdout.write('.')
                        sys.stdout.flush()
                    for lamb_i, lamb in enumerate(
                            self.krr_param_grid['lambda']):
                        iw = importance_weights**lamb
                        iw = iw[:, None]
                        K_train = self.kernel.apply_to_dist(dist[np.ix_(
                            train_i, train_i)],
                                                            gamma=gamma)
                        K_train *= np.outer(iw[train_i], iw[train_i])
                        K_test = self.kernel.apply_to_dist(dist[np.ix_(
                            test_i, train_i)],
                                                           gamma=gamma)
                        for alpha_i, alpha in enumerate(
                                self.krr_param_grid['alpha']):
                            if self.verbose > 0:
                                sys.stdout.write(',')
                                sys.stdout.flush()
                            K_train_ = K_train.copy()
                            K_train_.flat[::K_train_.shape[0] + 1] += alpha
                            try:
                                L_ = cholesky(K_train_, lower=True)
                                x = solve_triangular(L_,
                                                     iw[train_i] * y_[train_i],
                                                     lower=True)
                                dual_coef_ = iw[train_i] * solve_triangular(
                                    L_.T, x)
                                pred_mean = np.dot(K_test, dual_coef_)
                                if self.mae:
                                    e = np.mean(
                                        np.abs(pred_mean - y_[test_i]) *
                                        importance_weights[test_i, None]**2, 0)
                                else:
                                    e = np.mean(
                                        ((pred_mean - y_[test_i])**2) *
                                        importance_weights[test_i, None]**2, 0)
                            except np.linalg.LinAlgError:
                                e = np.inf
                            fold_errors[gamma_i, lamb_i, alpha_i] = e
                if self.verbose > 0:
                    sys.stdout.write('\n')
                    sys.stdout.flush()
                errors.append(fold_errors)
            errors = np.array(errors)
            errors = np.mean(errors, 0)  # average over folds

        self.dual_coefs_ = np.empty((y_.shape[1], X.shape[0]))
        self.alphas_ = np.empty(y_.shape[1])
        self.lambdas_ = np.empty(y_.shape[1])
        self.gammas_ = np.empty(y_.shape[1])
        if self.verbose > 0:
            elapsed = time.time() - t
            print('Refit [%dmin %dsec]' %
                  (int(elapsed / 60), int(elapsed % 60)))
            sys.stdout.flush()
        print_count = 0

        if not self.single_combo:
            for i in range(y_.shape[1]):
                min_params = np.argsort(errors[:, :, :, i], axis=None)
                # lin_alg_errors = 0
                gamma_i, lamb_i, alpha_i = np.unravel_index(
                    min_params[0], errors.shape[:3])
                gamma = self.krr_param_grid['gamma'][gamma_i]
                lamb = self.krr_param_grid['lambda'][lamb_i]
                alpha = self.krr_param_grid['alpha'][alpha_i]
                self.alphas_[i] = alpha
                self.gammas_[i] = gamma
                self.lambdas_[i] = lamb

                if (gamma_i in (0, len(self.krr_param_grid['gamma']) - 1) or
                        lamb_i in (0, len(self.krr_param_grid['lambda']) - 1)
                        or alpha_i
                        in (0, len(self.krr_param_grid['alpha']) - 1)):
                    if print_count <= 200:
                        fmtstr = '%d: gamma=%g\talpha=%g\tlambda=%g\terror=%g\tmean=%g'
                        print(fmtstr % (i, gamma, alpha, lamb,
                                        errors[gamma_i, lamb_i, alpha_i, i],
                                        errors[gamma_i, lamb_i, alpha_i, i] /
                                        np.mean(np.abs(y_[:, i]))))
                        print_count += 1
        else:
            errors = np.mean(errors, -1)  # average over outputs
            if self.verbose > 1:
                print('CV errors:')
                print(errors)
                print('Alpha params:')
                print(self.krr_param_grid['alpha'])
                print('Gamma params:')
                print(self.krr_param_grid['gamma'])
                print('Lambda params:')
                print(self.krr_param_grid['lambda'])
            if self.verbose > 0:
                print('Min error: ', np.min(errors))

            # print np.log(errors)
            # plt.imshow(np.log(errors))
            # plt.xticks(range(10), map('{:.1e}'.format, list(self.krr_param_grid['alpha'])))
            # plt.yticks(range(10), map('{:.1e}'.format, list(self.krr_param_grid['gamma'])))
            # plt.xlabel('alpha')
            # plt.ylabel('gamma')
            # plt.colorbar()
            # plt.show()
            min_params = np.argsort(errors, axis=None)
            if 'v' in self.krr_param_grid:
                v_i, gamma_i, lamb_i, alpha_i = np.unravel_index(
                    min_params[0], errors.shape)
            else:
                gamma_i, lamb_i, alpha_i = np.unravel_index(
                    min_params[0], errors.shape)
            if 'v' in self.krr_param_grid:
                v = self.krr_param_grid['v'][v_i]
                print('v=', v)
            gamma = self.krr_param_grid['gamma'][gamma_i]
            alpha = self.krr_param_grid['alpha'][alpha_i]
            lamb = self.krr_param_grid['lambda'][lamb_i]

            if 'v' in self.krr_param_grid:
                if v == self.krr_param_grid['v'][0]:
                    print('v at lower edge.')
                if v == self.krr_param_grid['v'][-1]:
                    print('v at upper edge.')
            if len(self.krr_param_grid['gamma']) > 1:
                if gamma == self.krr_param_grid['gamma'][0]:
                    print('Gamma at lower edge.')
                if gamma == self.krr_param_grid['gamma'][-1]:
                    print('Gamma at upper edge.')
            if len(self.krr_param_grid['alpha']) > 1:
                if alpha == self.krr_param_grid['alpha'][0]:
                    print('Alpha at lower edge.')
                if alpha == self.krr_param_grid['alpha'][-1]:
                    print('Alpha at upper edge.')
            if len(self.krr_param_grid['lambda']) > 1:
                if lamb == self.krr_param_grid['lambda'][0]:
                    print('Lambda at lower edge.')
                if lamb == self.krr_param_grid['lambda'][-1]:
                    print('Lambda at upper edge.')
            self.alphas_[:] = alpha
            self.gammas_[:] = gamma
            self.lambdas_[:] = lamb

            if 'v' in self.krr_param_grid:
                alpha_add = get_alpha_add(self.n_basis, self.n_grid,
                                          self.delta, v)
                self.alphas_ *= alpha_add

        combos = list(zip(self.alphas_, self.gammas_, self.lambdas_))
        n_unique_combos = len(set(combos))
        self.L_fit_ = [None] * n_unique_combos
        for i, (alpha, gamma, lamb) in enumerate(set(combos)):
            if self.verbose > 0:
                elapsed = time.time() - t
                print('Parameter combinations ' + '%d of %d [%dmin %dsec]' %
                      (i + 1, n_unique_combos, int(elapsed / 60),
                       int(elapsed % 60)))
                sys.stdout.flush()
            y_list = [
                i for i in range(y_.shape[1]) if self.alphas_[i] == alpha
                and self.gammas_[i] == gamma and self.lambdas_[i] == lamb
            ]

            iw = importance_weights**lamb
            iw = iw[:, None]
            K = self.kernel.apply_to_dist(dist, gamma=gamma)
            K *= np.outer(iw, iw)
            # np.exp(K, K)
            while True:
                K.flat[::K.shape[0] + 1] += alpha - (alpha / 10)
                try:
                    if self.verbose > 0:
                        print('trying cholesky decomposition, alpha', alpha)
                    L_ = cholesky(K, lower=True)
                    self.L_fit_[i] = L_
                    x = solve_triangular(L_, iw * y_[:, y_list], lower=True)
                    # x = solve_triangular(L_, y_[:, y_list], lower=True)
                    dual_coef_ = solve_triangular(L_.T, x)
                    self.dual_coefs_[y_list] = iw.T * dual_coef_.T.copy()
                    break
                except np.linalg.LinAlgError:
                    if self.verbose > 0:
                        print('LinalgError, increasing alpha')
                    alpha *= 10
                    self.alphas_[0] = alpha

        if self.copy_X:
            self.X_fit_ = X.copy()
            self.y_fit_ = y.copy()
        else:
            self.X_fit_ = X
            self.y_fit_ = y
        self.errors = errors

        if self.verbose > 0:
            elapsed = time.time() - t
            print('Done [%dmin %dsec]' %
                  (int(elapsed / 60), int(elapsed % 60)))
            sys.stdout.flush()
Example #7
def test_get_n_splits_for_repeated_kfold():
    n_splits = 3
    n_repeats = 4
    rkf = RepeatedKFold(n_splits, n_repeats)
    expected_n_splits = n_splits * n_repeats
    assert_equal(expected_n_splits, rkf.get_n_splits())
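As the test asserts, get_n_splits() on a RepeatedKFold is simply n_splits * n_repeats, and iterating over split() yields that many train/test pairs; a quick sketch confirming both counts agree:

import numpy as np
from sklearn.model_selection import RepeatedKFold

X = np.arange(12).reshape(6, 2)
rkf = RepeatedKFold(n_splits=3, n_repeats=4, random_state=0)

n_iterated = sum(1 for _ in rkf.split(X))
assert n_iterated == rkf.get_n_splits() == 3 * 4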
Example #8
del df

XToScale = pd.DataFrame(scaler.fit_transform(XToScale),
                        columns=colunasFeatures)

X = XToScale

X['destination_port'] = X['destination_port'].astype('category')
Y = Y.astype('category')
del XToScale

rkf = RepeatedKFold(n_splits=numberOfFolds,
                    n_repeats=numberOfRepeats,
                    random_state=2652124)

rkf.get_n_splits(X, Y)
print(rkf)
acc = np.zeros((1, numberOfFolds * numberOfRepeats))
f1 = np.zeros((1, numberOfFolds * numberOfRepeats))
pr = np.zeros((1, numberOfFolds * numberOfRepeats))
rc = np.zeros((1, numberOfFolds * numberOfRepeats))
tim = np.zeros((1, numberOfFolds * numberOfRepeats))
rod = 0
models = []
classReports = []

for train_index, test_index in rkf.split(X, Y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = Y[train_index], Y[test_index]
    print("classes de treinamento", set(y_train))
    print("classes de teste", set(y_test))
Example #9
    excelNome = 'Nearest-Centroid-Classifier.xlsx'
elif classif == '5':
    clf = RandomForestClassifier()
    excelNome = 'Random-Forest-Classifier.xlsx'
elif classif == '6':
    clf = svm.SVC()
    excelNome = 'Support-Vector-Machines-Classifier.xlsx'
else:
    print('Error!')

excel = pd.ExcelWriter(excelNome, engine='xlsxwriter')

# DEFINING THE SPLIT
rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=2652124)
# RETURNS THE NUMBER OF SPLIT ITERATIONS IN THE CROSS-VALIDATOR
rkf.get_n_splits(X, y)
#print(rkf)
#print(X.columns)

# SEPARATING TRAIN AND TEST VARIABLES
for train_index, test_index in rkf.split(X, y):
    print(train_index, test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # CLASSIFICATION USING NEAREST-CENTROID
    inicio = time.time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    fim = time.time()
    tempo = fim - inicio
    Tem.append(tempo)
Example #10
def compare_datasets2(fig_fn=None):
    fig, axes = plt.subplots(3, 2)

    if fig_fn is not None:
        fig.set_size_inches(12, 8)

    # for k, type in enumerate (["linear", "quadratic", "open_circle"]):
    for k, type in enumerate(["quadratic"]):
        print("Type: %s" % type)

        X, target, d_X, d_target = create_artificial_dataset2(type=type, n=150)
        keys = list(d_X.keys())

        visualize_dataset2(X[:, 0],
                           X[:, 1],
                           target,
                           axes[k, 0],
                           title="Dataset: %s" % type)

        target_pred = np.zeros(len(X))
        mean_score = 0.0

        param_grid = {"C": [1]}

        cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
        for k_cv, (train_set, test_set) in enumerate(cv.split(keys)):
            print("Fold %d / %d" % (k_cv + 1, cv.get_n_splits()))

            keys_train = [keys[idx] for idx in train_set]
            keys_test = [keys[idx] for idx in test_set]

            d_X_train, d_target_train = OrderedDict(), OrderedDict()
            for key in keys_train:
                d_X_train[key] = d_X[key]
                d_target_train[key] = d_target[key]

            # d_X_train = {key: value for key, value in d_X.items() if key in keys_train}
            # d_target_train = {key: value for key, value in d_target.items() if key in keys_train}

            if type == "linear":
                ranksvm_kernel = KernelRankSVC(
                    verbose=False,
                    kernel="linear",
                    feature_type="difference",
                    slack_type="on_pairs",
                    step_size_algorithm="diminishing_2",
                    convergence_criteria="alpha_change_norm")
            elif type == "quadratic":
                ranksvm_kernel = KernelRankSVC(verbose=False,
                                               kernel="poly",
                                               feature_type="difference",
                                               slack_type="on_pairs")
                param_grid["degree"] = [2]
            elif type == "open_circle":
                ranksvm_kernel = KernelRankSVC(verbose=False,
                                               kernel="rbf",
                                               feature_type="difference",
                                               slack_type="on_pairs")
                param_grid["gamma"] = [3]
            else:
                raise ValueError("Invalid test data type: %s" % type)

            cv_inner = GroupKFold(n_splits=3)
            best_params, param_scores, n_pairs_train, best_estimator, _, _ = find_hparan_ranksvm(
                ranksvm_kernel,
                d_X_train,
                d_target_train,
                cv=cv_inner,
                param_grid=param_grid,
                pair_params={
                    "allow_overlap": True,
                    "d_upper": 4,
                    "d_lower": 0,
                    "ireverse": True
                },
                n_jobs=1)
            print(best_params)

            X_test = np.array([d_X[key] for key in keys_test])
            target_test = np.array([d_target[key] for key in keys_test])
            pairs_test = get_pairs_single_system(target_test,
                                                 d_lower=0,
                                                 d_upper=np.inf)

            target_pred[test_set] += best_estimator.map_values(X_test)
            score = best_estimator.score(X_test, pairs_test)
            print(score)
            mean_score += score

        target_pred /= cv.get_n_splits()
        mean_score /= cv.get_n_splits()

        print(mean_score)

        visualize_ranksvm([d_target[key] for key in keys], target_pred,
                          axes[k, 1])

    if fig_fn is not None:
        plt.tight_layout()
        plt.savefig(fig_fn)
    else:
        plt.show()