Example #1
0
from numpy.testing import assert_array_equal, assert_raises
from sklearn.model_selection import RepeatedStratifiedKFold


def test_repeated_stratified_kfold_deterministic_split():
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
    y = [1, 1, 1, 0, 0]
    random_state = 1944695409
    rskf = RepeatedStratifiedKFold(
        n_splits=2,
        n_repeats=2,
        random_state=random_state)

    # split should produce same and deterministic splits on
    # each call
    for _ in range(3):
        splits = rskf.split(X, y)
        train, test = next(splits)
        assert_array_equal(train, [1, 4])
        assert_array_equal(test, [0, 2, 3])

        train, test = next(splits)
        assert_array_equal(train, [0, 2, 3])
        assert_array_equal(test, [1, 4])

        train, test = next(splits)
        assert_array_equal(train, [2, 3])
        assert_array_equal(test, [0, 1, 4])

        train, test = next(splits)
        assert_array_equal(train, [0, 1, 4])
        assert_array_equal(test, [2, 3])

        assert_raises(StopIteration, next, splits)
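A minimal companion sketch (scikit-learn and NumPy only) of the property this test pins down: with a fixed random_state, split() yields exactly n_splits * n_repeats pairs, and they are identical on every call.

import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold

X = np.arange(10).reshape(5, 2)
y = np.array([1, 1, 1, 0, 0])
rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, random_state=0)

# get_n_splits reports n_splits * n_repeats iterations in total.
assert rskf.get_n_splits(X, y) == 4

# Two full passes over split() produce identical index pairs.
first = [(tr.tolist(), te.tolist()) for tr, te in rskf.split(X, y)]
second = [(tr.tolist(), te.tolist()) for tr, te in rskf.split(X, y)]
assert first == second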
Example #2
0
import json
import os

import numpy as np
from keras import backend as K
from keras.utils import to_categorical
from sklearn.metrics import average_precision_score
from sklearn.model_selection import RepeatedStratifiedKFold

# NOTE: helpers (load_dataset, train_Keras, train_SVC, balance_accuracy,
# get_l2x_model, normalization_func, liblinearutil) and globals (k_folds,
# k_fold_reps, kernel, mu, reps, directory) are defined elsewhere in the
# source module.


def main(dataset_name):

    dataset = load_dataset(dataset_name)

    raw_data = np.asarray(dataset['raw']['data'])
    raw_label = np.asarray(dataset['raw']['label'])
    num_classes = len(np.unique(raw_label))

    rskf = RepeatedStratifiedKFold(n_splits=k_folds,
                                   n_repeats=k_fold_reps,
                                   random_state=42)

    print('L2X-Method')
    cont_seed = 0

    nfeats = []
    accuracies = []
    model_accuracies = []
    svc_accuracies = []
    fs_time = []
    BAs = []
    svc_BAs = []
    model_BAs = []
    mAPs = []
    svc_mAPs = []
    model_mAPs = []
    mus = []
    name = dataset_name + '_' + kernel + '_mu_' + str(mu)
    print(name)

    for j, (train_index,
            test_index) in enumerate(rskf.split(raw_data, raw_label)):
        print('k_fold', j, 'of', k_folds * k_fold_reps)

        train_data, train_labels = raw_data[train_index], raw_label[train_index]
        test_data, test_labels = raw_data[test_index], raw_label[test_index]

        train_labels = to_categorical(train_labels, num_classes=num_classes)
        test_labels = to_categorical(test_labels, num_classes=num_classes)

        valid_features = np.where(np.abs(train_data).sum(axis=0) > 0)[0]
        if len(valid_features) < train_data.shape[1]:
            print('Removing', train_data.shape[1] - len(valid_features),
                  'zero features')
            train_data = train_data[:, valid_features]
            test_data = test_data[:, valid_features]

        model_kwargs = {
            'mu': mu / len(train_data),
            'kernel': kernel,
            'degree': 3
        }

        svc_kwargs = {'C': 1.0, 'solver': 0.}

        for i, n_features in enumerate([10, 50, 100, 150, 200]):
            n_accuracies = []
            n_svc_accuracies = []
            n_model_accuracies = []
            n_BAs = []
            n_svc_BAs = []
            n_model_BAs = []
            n_mAPs = []
            n_svc_mAPs = []
            n_model_mAPs = []
            n_train_accuracies = []
            n_time = []
            print('n_features : ', n_features)

            heatmaps = []
            for r in range(reps):
                np.random.seed(cont_seed)
                K.tf.set_random_seed(cont_seed)
                cont_seed += 1

                model = train_Keras(
                    train_data,
                    train_labels,
                    test_data,
                    test_labels,
                    model_kwargs,
                    l2x_model_func=get_l2x_model,
                    n_features=n_features,
                )
                heatmaps.append(model.heatmap)
                n_time.append(model.fs_time)
                test_data_norm = model.normalization.transform(test_data)
                train_data_norm = model.normalization.transform(train_data)
                test_pred = model.predict(test_data_norm)
                n_model_accuracies.append(
                    model.evaluate(test_data_norm, test_labels, verbose=0)[-1])
                n_model_BAs.append(balance_accuracy(test_labels, test_pred))
                n_model_mAPs.append(
                    average_precision_score(test_labels[:, -1], test_pred))
                train_acc = model.evaluate(train_data_norm,
                                           train_labels,
                                           verbose=0)[-1]
                print('n_features : ', n_features, ', accuracy : ',
                      n_model_accuracies[-1], ', BA : ', n_model_BAs[-1],
                      ', mAP : ', n_model_mAPs[-1], ', train_accuracy : ',
                      train_acc, ', time : ', n_time[-1], 's')
                del model
                K.clear_session()

            heatmap = np.mean(heatmaps, axis=0)
            best_features = np.argsort(heatmap)[::-1][:n_features]

            svc_train_data = train_data[:, best_features]
            svc_test_data = test_data[:, best_features]

            norm = normalization_func()
            svc_train_data_norm = norm.fit_transform(svc_train_data)
            svc_test_data_norm = norm.transform(svc_test_data)

            bestcv = -1
            bestc = None
            bestSolver = None
            for s in [0, 1, 2, 3]:
                for my_c in [
                        0.001, 0.1, 0.5, 1.0, 1.4, 1.5, 1.6, 2.0, 2.5, 5.0,
                        100.0
                ]:
                    cmd = '-v 5 -s ' + str(s) + ' -c ' + str(my_c) + ' -q'
                    cv = liblinearutil.train(
                        (2 * train_labels[:, -1] - 1).tolist(),
                        svc_train_data_norm.tolist(), cmd)
                    if cv > bestcv:
                        # print('Best -> C:', my_c, ', s:', s, ', acc:', cv)
                        bestcv = cv
                        bestc = my_c
                        bestSolver = s
            svc_kwargs['C'] = bestc
            svc_kwargs['solver'] = bestSolver
            print('Best -> C:', bestc, ', s:', bestSolver, ', acc:', bestcv)

            for r in range(reps):
                np.random.seed(cont_seed)
                K.tf.set_random_seed(cont_seed)
                cont_seed += 1

                model = train_SVC(svc_train_data_norm, train_labels,
                                  svc_kwargs)
                _, accuracy, test_pred = liblinearutil.predict(
                    (2 * test_labels[:, -1] - 1).tolist(),
                    svc_test_data_norm.tolist(), model, '-q')
                test_pred = np.asarray(test_pred)
                n_svc_accuracies.append(accuracy[0])
                n_svc_BAs.append(balance_accuracy(test_labels, test_pred))
                n_svc_mAPs.append(
                    average_precision_score(test_labels[:, -1], test_pred))
                del model
                model = train_Keras(svc_train_data, train_labels,
                                    svc_test_data, test_labels, model_kwargs)
                train_data_norm = model.normalization.transform(svc_train_data)
                test_data_norm = model.normalization.transform(svc_test_data)
                test_pred = model.predict(test_data_norm)
                n_BAs.append(balance_accuracy(test_labels, test_pred))
                n_mAPs.append(
                    average_precision_score(test_labels[:, -1], test_pred))
                n_accuracies.append(
                    model.evaluate(test_data_norm, test_labels, verbose=0)[-1])
                n_train_accuracies.append(
                    model.evaluate(train_data_norm, train_labels,
                                   verbose=0)[-1])
                del model
                K.clear_session()
                print(
                    'n_features : ',
                    n_features,
                    ', acc : ',
                    n_accuracies[-1],
                    ', BA : ',
                    n_BAs[-1],
                    ', mAP : ',
                    n_mAPs[-1],
                    ', train_acc : ',
                    n_train_accuracies[-1],
                    ', svc_acc : ',
                    n_svc_accuracies[-1],
                    ', svc_BA : ',
                    n_svc_BAs[-1],
                    ', svc_mAP : ',
                    n_svc_mAPs[-1],
                )
            if i >= len(accuracies):
                accuracies.append(n_accuracies)
                svc_accuracies.append(n_svc_accuracies)
                model_accuracies.append(n_model_accuracies)
                BAs.append(n_BAs)
                mAPs.append(n_mAPs)
                fs_time.append(n_time)
                svc_BAs.append(n_svc_BAs)
                svc_mAPs.append(n_svc_mAPs)
                model_BAs.append(n_model_BAs)
                model_mAPs.append(n_model_mAPs)
                nfeats.append(n_features)
                mus.append(model_kwargs['mu'])
            else:
                accuracies[i] += n_accuracies
                svc_accuracies[i] += n_svc_accuracies
                model_accuracies[i] += n_model_accuracies
                fs_time[i] += n_time
                BAs[i] += n_BAs
                mAPs[i] += n_mAPs
                svc_BAs[i] += n_svc_BAs
                svc_mAPs[i] += n_svc_mAPs
                model_BAs[i] += n_model_BAs
                model_mAPs[i] += n_model_mAPs

    output_filename = directory + 'LinearSVC_' + kernel + '_L2X.json'

    if not os.path.isdir(directory):
        os.makedirs(directory)

    info_data = {
        'kernel': kernel,
        'reps': reps,
        'classification': {
            'mus': mus,
            'n_features': nfeats,
            'accuracy': accuracies,
            'mean_accuracy': np.array(accuracies).mean(axis=1).tolist(),
            'svc_accuracy': svc_accuracies,
            'mean_svc_accuracy':
            np.array(svc_accuracies).mean(axis=1).tolist(),
            'model_accuracy': model_accuracies,
            'mean_model_accuracy':
            np.array(model_accuracies).mean(axis=1).tolist(),
            'BA': BAs,
            'mean_BA': np.array(BAs).mean(axis=1).tolist(),
            'mAP': mAPs,
            'mean_mAP': np.array(mAPs).mean(axis=1).tolist(),
            'svc_BA': svc_BAs,
            'svc_mean_BA': np.array(svc_BAs).mean(axis=1).tolist(),
            'svc_mAP': svc_mAPs,
            'svc_mean_mAP': np.array(svc_mAPs).mean(axis=1).tolist(),
            'model_BA': model_BAs,
            'model_mean_BA': np.array(model_BAs).mean(axis=1).tolist(),
            'model_mAP': model_mAPs,
            'model_mean_mAP': np.array(model_mAPs).mean(axis=1).tolist(),
            'fs_time': fs_time
        }
    }

    for k, v in info_data['classification'].items():
        if 'mean' in k:
            print(k, v)

    with open(output_filename, 'w') as outfile:
        json.dump(info_data, outfile)
Example #3
0
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from torchvision import transforms

# The original snippet opens mid-expression; a plausible opening for this
# branch is reconstructed here (the flag name and the augmentation ops are
# assumptions, mirroring the else branch below).
if args.augment:
    training_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    test_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
else:
    training_transform, test_transform = 2 * [
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
    ]
if args.dataset == 'cars':
    from data import cars
    stratified_crossvalidation = RepeatedStratifiedKFold(
        n_splits=3, n_repeats=5, random_state=args.seed)
    data = cars.Calltech101(images_folder='images',
                            input_transform=training_transform)
    data.shuffle(seed=args.seed)
    data_size = len(data)
    indexes = np.arange(0, data_size, 1, dtype=int)
    targets = np.asarray(data.targets)
    kfolds = list(stratified_crossvalidation.split(indexes, targets))
    train_dataset = cars.Calltech101(images_folder='images',
                                     input_transform=training_transform)
    train_dataset.shuffle(seed=args.seed)
    train_dataset.prune_dataset(indexes=kfolds[args.cross_validation_split][1])
    test_dataset = cars.Calltech101(images_folder='images',
                                    input_transform=test_transform)
    test_dataset.shuffle(seed=args.seed)
    test_dataset.prune_dataset(indexes=kfolds[args.cross_validation_split][0])
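The pattern above materializes every repeated split up front and then selects one by index. A self-contained sketch of the same idea, with illustrative names not taken from the original:

import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold

targets = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
indexes = np.arange(len(targets))

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=5, random_state=0)
kfolds = list(cv.split(indexes, targets))  # 3 * 5 = 15 (train, test) pairs

split_id = 4  # e.g. chosen via a --cross_validation_split CLI argument
train_idx, test_idx = kfolds[split_id]
print(len(train_idx), len(test_idx))  # 6 3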
Example #4
0
from sklearn.model_selection import (KFold, RepeatedKFold, StratifiedKFold,
                                     RepeatedStratifiedKFold)


def generate_kfold(X,
                   y=None,
                   n_splits=5,
                   random_state=0,
                   stratified=False,
                   n_repeats=1):
    if stratified and (y is not None):
        if n_repeats > 1:
            kf = RepeatedStratifiedKFold(n_splits=n_splits,
                                         n_repeats=n_repeats,
                                         random_state=random_state)
        else:
            kf = StratifiedKFold(n_splits=n_splits,
                                 shuffle=True,
                                 random_state=random_state)

        kf.get_n_splits(X, y)
        return [[train_index, test_index]
                for train_index, test_index in kf.split(X, y)]
    else:
        if n_repeats > 1:
            kf = RepeatedKFold(n_splits=n_splits,
                               n_repeats=n_repeats,
                               random_state=random_state)
        else:
            kf = KFold(n_splits=n_splits,
                       shuffle=True,
                       random_state=random_state)

        kf.get_n_splits(X)
        return [[train_index, test_index]
                for train_index, test_index in kf.split(X)]
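A possible call with illustrative data: stratified 2-fold CV repeated 5 times returns 10 [train_index, test_index] pairs.

import numpy as np

X = np.arange(20).reshape(10, 2)
y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

folds = generate_kfold(X, y, n_splits=2, stratified=True, n_repeats=5)
print(len(folds))  # 10
train_index, test_index = folds[0]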
Example #5
0
    def rfe_feature_selection(self, input, output, dict_of_models,
                              list_number_of_features_to_select):
        """
        Performs models evaluation within the No_outer times repeated No_inner-fold cross-validation procedure for different
        number of features selected by RFE algorithm with nested 10-times cross-validation for model hyperparameters' tuning
        ----------
        :param input : array-like, shape (n_samples, n_features)
            The training input samples.
        :param output : array-like, shape (n_samples, 1)
            The target values.
        :param dict_of_models: dictionary
            Models with details for grid-search.
        :param list_number_of_features_to_select - list
            Number of features to select.

        :return df_aucs : DataFrame object, shape (No_outer x No_inner, number of models x length of list_number_of_features)
            AUC values for every step of No_outer x No_inner-times CV are provided.
        :return df_res : DataFrame object, shape ([number of models x length of list_number_of_features_to_select], 9)
            For every model and every No. of selected features best classifier's parameters and averaged classification
            metrics are provided : Accuracy, Sensitivity, Specificity, Precision, F1-Score, AUC.
        :return df_stds : DataFrame object, shape ([number of models x length of list_number_of_features_to_select], 8)
            For every model and every No. of selected features standard deviations of classification metrics
            are provided.
        """

        df_res = pd.DataFrame(columns=[
            'FS Method', 'Classifier', 'Selected features', 'Best parameters',
            'Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'F1-score',
            'ROC_AUC'
        ])

        df_stds = pd.DataFrame(columns=[
            'FS Method', 'Classifier', 'Selected features', 'Acc_std',
            'Sens_std', 'Spec_std', 'Prec_std', 'F1_std', 'ROC_AUC_std'
        ])

        df_aucs = pd.DataFrame()

        for m in dict_of_models:
            for k in list_number_of_features_to_select:
                accuracy = []
                aucs = []
                sensitivity = []
                specificity = []
                precision = []
                f1score = []
                cohen_kappa = []
                tprs = []
                params = []
                X, y = input, output
                skf = RepeatedStratifiedKFold(n_splits=self.N_inner,
                                              n_repeats=self.N_outer,
                                              random_state=88)

                clf = m['classifier']

                for train_index, test_index in skf.split(X, y):
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y[train_index], y[test_index]

                    best_params = []

                    rfe = RFE(estimator=clf, n_features_to_select=k, step=1)

                    rfe_smote_clf = Pipeline([('oversampling',
                                               SMOTE(random_state=88)),
                                              ('feature_selection', rfe),
                                              ('classifier', clf)])

                    param_grid = m['grid']

                    gridsearch_cv = GridSearchCV(rfe_smote_clf,
                                                 param_grid,
                                                 cv=10,
                                                 scoring='roc_auc')

                    gridsearch_cv.fit(X_train, y_train)
                    best_params.append(gridsearch_cv.best_params_)

                    # predicted class
                    y_predict = gridsearch_cv.predict(X_test)

                    # predicted probabilities
                    probas_ = gridsearch_cv.predict_proba(X_test)

                    # accuracy
                    acc = accuracy_score(y_predict, y_test)
                    accuracy.append(acc)

                    # sensitivity = recall
                    sens = recall_score(y_test, y_predict)
                    sensitivity.append(sens)

                    # specificity
                    spec = self.get_specificity(y_test, y_predict)
                    specificity.append(spec)

                    # precision
                    prec = precision_score(y_test, y_predict)
                    precision.append(prec)

                    # f1-score
                    f1 = f1_score(y_test, y_predict)
                    f1score.append(f1)

                    # cohen-kappa-score
                    kappa = cohen_kappa_score(y_test, y_predict)
                    cohen_kappa.append(kappa)

                    # Compute ROC curve and area the curve
                    fpr, tpr, thresholds = roc_curve(y[test_index], probas_[:, 1])
                    tprs.append(interp(self.mean_fprs, fpr, tpr))
                    tprs[-1][0] = 0.0
                    roc_auc = auc(fpr, tpr)
                    aucs.append(roc_auc)

                    # best parameters
                    params.append(best_params)

                df_aucs[m['name'] + str(k)] = aucs

                df_stds = df_stds.append(
                    {
                        'Classifier': m['name'],
                        'Selected features': k,
                        'Acc_std': np.std(accuracy),
                        'Sens_std': np.std(sensitivity),
                        'Spec_std': np.std(specificity),
                        'Prec_std': np.std(precision),
                        'F1_std': np.std(f1score),
                        'ROC_AUC_std': np.std(aucs)
                    },
                    ignore_index=True)

                df_res = df_res.append(
                    {
                        'Classifier': m['name'],
                        'Selected features': k,
                        'Best parameters': params,
                        'Accuracy': np.mean(accuracy),
                        'Sensitivity': np.mean(sensitivity),
                        'Specificity': np.mean(specificity),
                        'Precision': np.mean(precision),
                        'F1-score': np.mean(f1score),
                        'ROC_AUC': np.mean(aucs)
                    },
                    ignore_index=True)

        return df_aucs, df_res, df_stds
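The method above reads each model entry through the keys 'name', 'classifier', and 'grid', and the Pipeline step carrying the estimator is named 'classifier', so grid keys need that prefix. A plausible input under those assumptions; the estimators and grids here are illustrative:

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

dict_of_models = [
    {'name': 'LR',
     'classifier': LogisticRegression(max_iter=1000),
     'grid': {'classifier__C': [0.1, 1.0, 10.0]}},
    {'name': 'SVM',
     'classifier': SVC(kernel='linear', probability=True),
     'grid': {'classifier__C': [0.1, 1.0]}},
]
list_number_of_features_to_select = [5, 10, 20]

Note that RFE requires estimators exposing coef_ or feature_importances_, which both choices above satisfy.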
Example #6
0
    def best_models_ROC_curves(self, input, output, models_dict, show_plot):
        """
            Plot ROC curves for the best evaluated methods or returns averaged predicted probabilities for every sample
            by all selected models
            Parameters
            ----------
            :param input : array-like, shape (n_samples, n_features)
                The training input samples.
            :param output : array-like, shape (n_samples, 1)
                The target values.
            :param models_dict : dictionary
                Models with details for grid-search
            :param show_plot - boolean
                Indicator whether plot should be rendered

            :return [selected_models]_proba_[number_of_selected_features] : array-like, shape (1, n_samples)
                Averaged predicted probabilities for every sample by every selected model
            :return plot
        """

        X, y = input, output
        mean_fpr = np.linspace(0, 1, 100)

        instances = X.shape[0]
        proba = np.zeros((len(models_dict), 10, instances))

        skf = RepeatedStratifiedKFold(n_splits=self.N_inner,
                                      n_repeats=self.N_outer,
                                      random_state=88)

        plt.figure(1)
        plt.rcParams["figure.figsize"] = (10, 6)
        plt.plot([0, 1], [0, 1],
                 linestyle='--',
                 lw=2,
                 color='brown',
                 label='Chance',
                 alpha=.8)

        j = 0
        for m in models_dict:
            tprs = []
            aucs = []
            clf = m['classifier']
            fs = m['fs_method']
            color_line = m['color_line']
            color_shadow = m['color_shadow']

            k = 1
            i = 0
            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                fs_smote_clf = Pipeline([('oversampling',
                                          SMOTE(random_state=88)),
                                         ('feature_selection', fs),
                                         ('classifier', clf)])

                param_grid = m['grid']
                # for survival problem
                if m['name'] == 'MLP_mRMR' or m['name'] == 'SVM_mRMR_50':
                    gridsearch_cv = fs_smote_clf
                else:
                    gridsearch_cv = GridSearchCV(fs_smote_clf,
                                                 param_grid,
                                                 cv=10,
                                                 scoring='roc_auc')

                gridsearch_cv.fit(X_train, y_train)

                # predicted probabilities
                probas_ = gridsearch_cv.predict_proba(X_test)

                if k > self.N_outer:
                    break
                elif i >= k * self.N_inner:
                    k += 1

                proba[j, k - 1, test_index] = probas_[:, 1]

                # Compute ROC curve
                fpr, tpr, thresholds = roc_curve(y[test_index], probas_[:, 1])
                tprs.append(interp(self.mean_fprs, fpr, tpr))
                tprs[-1][0] = 0.0
                roc_auc = auc(fpr, tpr)
                aucs.append(roc_auc)
                i += 1

            mean_tpr = np.mean(tprs, axis=0)
            mean_tpr[-1] = 1.0
            mean_auc = auc(mean_fpr, mean_tpr)
            std_auc = np.std(aucs)
            plt.plot(mean_fpr,
                     mean_tpr,
                     color=color_line,
                     label=r'Mean ROC %s (AUC = %0.2f $\pm$ %0.2f)' %
                     (m['name'], mean_auc, std_auc),
                     lw=2,
                     alpha=.8)

            std_tpr = np.std(tprs, axis=0)
            tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
            tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
            plt.fill_between(mean_fpr,
                             tprs_lower,
                             tprs_upper,
                             color=color_shadow,
                             alpha=.2,
                             label=r'$\pm$ 1 std. dev. for ' + m['name'])

            j += 1

        if show_plot:
            plt.xlim([-0.05, 1.05])
            plt.ylim([-0.05, 1.05])

            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('Receiver operating characteristic')
            plt.legend(loc="lower right")
            plt.show()
            return
        else:
            mean_proba = np.mean(proba, axis=1)
            return mean_proba
Example #7
0
    def cross_val(
        self,
        X=None,
        y=None,
        X_test=None,
        model=None,
        folds=10,
        score_folds=5,
        n_repeats=2,
        print_metric=False,
        metric_round=4,
        predict=False,
        get_feature_importance=False,
    ):
        """
        Description of cross_val:
            Cross-validation function

        Args:
            X=None (undefined):
            y=None (undefined):
            X_test=None (undefined):
            model=None (undefined):
            folds=10 (undefined):
            score_folds=5 (undefined):
            n_repeats=2 (undefined):
            print_metric=False (undefined):
            metric_round=4 (undefined):
            predict=False (undefined):
            get_feature_importance=False (undefined):
        
        Returns:
            result (dict)
        """
        if model is None:
            model = self

        if X is None:
            X = model._data.X_train
        if y is None:
            y = model._data.y_train

        if X_test is None:
            X_test = model._data.X_test

        if predict and (X_test is None):
            raise Exception("No X_test for predict")

        if model.type_of_estimator == 'classifier':
            skf = RepeatedStratifiedKFold(
                n_splits=folds,
                n_repeats=n_repeats,
                random_state=model._random_state,
            )
        else:
            skf = RepeatedKFold(
                n_splits=folds,
                n_repeats=n_repeats,
                random_state=model._random_state,
            )

        folds_scores = []
        stacking_y_pred_train = np.zeros(len(X))
        stacking_y_pred_test = np.zeros(len(X_test))
        feature_importance_df = pd.DataFrame(np.zeros(len(X.columns)),
                                             index=X.columns)

        for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

            train_x, train_y = X.iloc[train_idx], y.iloc[train_idx]
            val_x, val_y = X.iloc[valid_idx], y.iloc[valid_idx]

            # TargetEncoders
            train_x, val_x, X_test = model.preproc_data_in_cv(
                train_x, train_y, val_x, X_test)

            # Fit
            model._fit(
                model=model,
                X_train=train_x.reset_index(drop=True),
                y_train=train_y.reset_index(drop=True),
                X_test=val_x.reset_index(drop=True),
                y_test=val_y.reset_index(drop=True),
            )

            # Predict
            if (model.metric.__name__ in predict_proba_metrics) and (
                    model.is_possible_predict_proba()):
                y_pred = model._predict_proba(val_x)
                if predict:
                    y_pred_test = model._predict_proba(X_test)
            else:
                y_pred = model._predict(val_x)
                if predict:
                    y_pred_test = model._predict(X_test)

            score_model = model.metric(val_y, y_pred)
            folds_scores.append(score_model)

            if get_feature_importance:
                feature_importance_df += model._get_feature_importance(train_x)

            if predict:
                stacking_y_pred_train[valid_idx] += y_pred
                stacking_y_pred_test += y_pred_test
            else:
                # score_folds
                if i + 1 >= score_folds:
                    break

        if predict:
            stacking_y_pred_train = stacking_y_pred_train / n_repeats
            stacking_y_pred_test = stacking_y_pred_test / (folds * n_repeats)

        if score_folds > 1 or predict:
            score = round(np.mean(folds_scores), metric_round)
            score_std = round(np.std(folds_scores), metric_round + 2)
        else:
            score = round(score_model, metric_round)
            score_std = 0

        if print_metric:
            print(
                f'\n Mean Score {model.metric.__name__} on {i+1} Folds: {score} std: {score_std}'
            )

        # Total
        result = {
            'Score': score,
            'Score_Std': score_std,
            'Test_predict': stacking_y_pred_test,
            'Train_predict': stacking_y_pred_train,
            'Feature_importance': dict(feature_importance_df[0]),
        }
        return result
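A self-contained sketch of the out-of-fold bookkeeping this method performs, using plain scikit-learn (the model and data are illustrative): each sample lands in the validation fold once per repeat, so accumulated train predictions are divided by n_repeats, while test predictions are averaged over all folds * n_repeats fits.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

X, y = make_classification(n_samples=200, random_state=0)
X_test = X[:20]
folds, n_repeats = 5, 2

oof = np.zeros(len(X))
test_pred = np.zeros(len(X_test))
skf = RepeatedStratifiedKFold(n_splits=folds, n_repeats=n_repeats,
                              random_state=0)
for train_idx, valid_idx in skf.split(X, y):
    clf = LogisticRegression(max_iter=1000).fit(X[train_idx], y[train_idx])
    oof[valid_idx] += clf.predict_proba(X[valid_idx])[:, 1]
    test_pred += clf.predict_proba(X_test)[:, 1]

oof /= n_repeats                # each sample validated once per repeat
test_pred /= folds * n_repeats  # averaged over every fitted model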
Example #8
0
import logging
import math

import numpy as np
from sklearn import model_selection
from sklearn.model_selection import RepeatedStratifiedKFold

# NOTE: _make_classifier, MODEL_INFO_SPARSITY_KEY and _RANDOM_STATE are
# defined elsewhere in the source module.


def cross_validate(feature_name, classifier_name, X, y, cv_num_folds,
                   cv_num_repeats):
    """Runs repeated stratified $k$-fold cross-validation.

  Returns multiple cross-validation metrics as a dictionary, where for each
  metric mean and variance across multiple repeats and folds is summarized.

  Args:
    feature_name: (string) Name of the WALS feature.
    classifier_name: (string) Classifier name.
    X: (numpy array) Input features.
    y: (numpy array) Labels.
    cv_num_folds: (int) Number of folds ($k$).
    cv_num_repeats: (int) Number of repetitions.

  Returns:
    Dictionary containing cross-validation scores and stats.
  """
    model = _make_classifier(classifier_name)
    scoring = ["f1_micro", "precision_micro", "recall_micro", "accuracy"]
    try:
        # Really primitive logic to figure out class distribution.
        _, y_counts = np.unique(y, return_counts=True)
        y_max_freq = np.max(y_counts)

        # Check if the class counts are not reliable to run cross-validation.
        if y_max_freq < cv_num_folds:
            logging.warning(
                "[%s] %s: Not enough data. Fitting the model instead "
                "of running CV", feature_name, classifier_name)
            # Simply fit the model.
            model.fit(X, y)
            cv_scores = {}
            cv_scores["accuracy"] = (model.score(X, y), 0.0)
            cv_scores[MODEL_INFO_SPARSITY_KEY] = True
            return cv_scores
        else:
            logging.info(
                "[%s] Running cross-validation of %s (k=%d, n=%d) ...",
                feature_name, classifier_name, cv_num_folds, cv_num_repeats)
            # Run cross-validation.
            cv = RepeatedStratifiedKFold(n_splits=cv_num_folds,
                                         n_repeats=cv_num_repeats,
                                         random_state=_RANDOM_STATE)
            cv_scores = model_selection.cross_validate(model,
                                                       X,
                                                       y,
                                                       cv=cv,
                                                       scoring=scoring,
                                                       n_jobs=cv_num_folds)
            cv_scores[MODEL_INFO_SPARSITY_KEY] = False
    except Exception as e:  # pylint: disable=broad-except
        logging.error("[%s] %s: CV: Exception: %s", feature_name,
                      classifier_name, e)
        return None

    del cv_scores["fit_time"]
    del cv_scores["score_time"]
    for score_name in scoring:
        scores_vec_key = "test_" + score_name
        cv_scores[score_name] = (np.mean(cv_scores[scores_vec_key]),
                                 np.var(cv_scores[scores_vec_key]))
        del cv_scores[scores_vec_key]
    # Sanity check.
    if math.isnan(cv_scores["accuracy"][0]):
        return None
    logging.info("[train] %s: CV scores for %s: %s", feature_name,
                 classifier_name, cv_scores)
    return cv_scores
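An illustrative call under the surrounding module's assumptions (the feature and classifier names accepted by _make_classifier are hypothetical):

import numpy as np

X = np.random.RandomState(0).rand(60, 4)
y = np.repeat([0, 1], 30)
scores = cross_validate("1A", "logistic", X, y,
                        cv_num_folds=5, cv_num_repeats=2)
# e.g. {"f1_micro": (mean, var), ..., "accuracy": (mean, var),
#       MODEL_INFO_SPARSITY_KEY: False}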
Example #9
0
        'use_particle_clamp_each_iteration': False,
        'unchanged_iterations_stop': 20000,
        'use_only_early_stopping': False
        # no early stopping used, that is why 20k
    },
    'pso_velocity_clamp': (-1, 1),
    'n_particles': 100,
    'pso_iters': 5000,
    'pso_optimizer': PSO,
}
CONFIG['cv'] = StratifiedKFold(n_splits=CONFIG['n_splits'], shuffle=True, random_state=CONFIG['random_state']) \
    if CONFIG['n_repeats'] == 1 else RepeatedStratifiedKFold(n_splits=CONFIG['n_splits'],
                                                             n_repeats=CONFIG['n_repeats'],
                                                             random_state=CONFIG['random_state'])
INPUT_FEATURES = torch.load(CONFIG['labels_features_common_name'] +
                            "_features.tr").numpy()
INPUT_LABELS = torch.load(CONFIG['labels_features_common_name'] +
                          "_labels.tr").numpy()
make_experiment_reproducible(CONFIG['random_state'])


def run_cross_validation_psobp(file_to_print) -> torch.Tensor:
    logger = logging.getLogger('10_fold_cv')
    configure_logger_by_default(logger)
    logger.info("START run_cross_validation")

    def print_info(info):
        logger.info(info)
Example #10
0
import numpy as np
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.model_selection import RepeatedStratifiedKFold

np.set_printoptions(suppress=True)

# Load the dataset
digits = load_digits()
# Images
images = digits.images
# Labels
y = digits.target

# Flatten each image into a feature vector
X = images.reshape((images.shape[0], -1))
print(X.shape)

# Stratified split into 5 train/test folds
kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1410)
results = []
# Iterate over the train/test index sets of each fold
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # PCA feature extraction
    pca = PCA(n_components=X_train.shape[1], random_state=1410)
    pca.fit(X_train)

    # Explained variance ratio per component
    evr = pca.explained_variance_ratio_
    evr_acc = np.add.accumulate(evr)
    print(evr_acc)
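    # A possible continuation (threshold illustrative): the smallest number
    # of components explaining at least 95% of the variance in this fold.
    n_components_95 = int(np.argmax(evr_acc >= 0.95)) + 1
    print('components for 95% variance:', n_components_95)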
Example #11
0
import sys
import time

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn import linear_model as lm
from sklearn.model_selection import RepeatedStratifiedKFold

# NOTE: project helpers (reformatFRDat, getPerm_Z, getPerm_Pval, bac) are
# defined elsewhere in the source module.


def CueDesc_SegDecAnalysis(dat):
    nPe = 100
    nRepeats = 10
    nSh = 50
    njobs = 20

    trConds = dat['TrialConds']
    trDat = dat['TrialLongMat']
    nUnits = dat['fitTable2'].shape[0]

    gTrialsIDs = trConds['Good']
    Trials = trConds[gTrialsIDs].index.values
    nTrials = len(Trials)

    allZoneFR,unitIDs = reformatFRDat(dat,Trials)

    CoTrials =  trConds[gTrialsIDs & (trConds['Co']=='Co')].index.values
    InCoTrials = trConds[gTrialsIDs & (trConds['Co']=='InCo')].index.values

    nInCo = len(InCoTrials)
    TrSets = {}
    TrSets['all'] = np.arange(nTrials)
    _,idx,_=np.intersect1d(np.array(Trials),np.array(CoTrials),return_indices=True)
    TrSets['co'] = idx
    _,idx,_=np.intersect1d(np.array(Trials),np.array(InCoTrials),return_indices=True)
    TrSets['inco'] = idx

    cueVec = trConds.loc[gTrialsIDs]['Cues'].values
    descVec = trConds.loc[gTrialsIDs]['Desc'].values
    predVec = {'Cue':cueVec, 'Desc':descVec}

    nFeatures = {'h':np.arange(1),'a':np.arange(2),'center':np.arange(3),'be':np.arange(4),'int':np.arange(5),'cdfg':np.arange(6),'goal':np.arange(7)}

    def correctTrials_Decoder(train,test):
        res = pd.DataFrame(np.zeros((3,4)),columns=['Test','BAc','P','Z'])

        temp = mod.fit(X_train[train],y_train[train])

        res.loc[0,'Test'] = 'Model'
        y_hat = temp.predict(X_train[test])
        res.loc[0,'BAc'] = bac(y_train[test],y_hat)*100

        # shuffle for held out train set
        mod_sh = np.zeros(nSh)
        for sh in np.arange(nSh):
            y_perm_hat = np.random.permutation(y_hat)
            mod_sh[sh] = bac(y_train[test],y_perm_hat)*100
        res.loc[0,'Z'] = getPerm_Z(mod_sh, res.loc[0,'BAc'] )
        res.loc[0,'P'] = getPerm_Pval(mod_sh, res.loc[0,'BAc'] )

        # predictions on x test
        y_hat = temp.predict(X_test)
        res.loc[1,'Test'] = 'Cue'
        res.loc[1,'BAc'] = bac(y_test_cue,y_hat)*100

        res.loc[2,'Test'] = 'Desc'
        res.loc[2,'BAc'] = bac(y_test_desc,y_hat)*100

        # shuffles for ytest cue/desc
        cue_sh = np.zeros(nSh)
        desc_sh = np.zeros(nSh)
        for sh in np.arange(nSh):
            y_perm_hat = np.random.permutation(y_hat)
            cue_sh[sh] = bac(y_test_cue,y_perm_hat)*100
            desc_sh[sh] = bac(y_test_desc,y_perm_hat)*100

        res.loc[1,'Z'] = getPerm_Z(cue_sh, res.loc[1,'BAc'] )
        res.loc[1,'P'] = getPerm_Pval(cue_sh, res.loc[1,'BAc'] )

        res.loc[2,'Z'] = getPerm_Z(desc_sh, res.loc[2,'BAc'] )
        res.loc[2,'P'] = getPerm_Pval(desc_sh, res.loc[2,'BAc'] )

        res['nSeUnits'] = nUnits
        return res

    def balancedCoIncoTrial_Decoder(pe,feats):

        res = pd.DataFrame(np.zeros((2,4)),columns=['Test','BAc','P','Z'])

        # sample correct trials to match the number of incorrect trials.
        samp_co_trials = np.random.choice(TrSets['co'],nInCo,replace=False)

        train = np.concatenate( (TrSets['inco'], samp_co_trials ))
        test = np.setdiff1d(TrSets['co'], samp_co_trials)

        X_train = allZoneFR.loc[train,feats].values
        X_test = allZoneFR.loc[test,feats].values

        Y_cue_train = predVec['Cue'][train]
        Y_desc_train = predVec['Desc'][train]

        Y_test = predVec['Cue'][test] # on correct trials the cue and desc labels coincide, so one test vector suffices.

        # model trained on the cue
        res.loc[0,'Test'] = 'Cue'
        cue_mod = mod.fit(X_train,Y_cue_train)
        y_cue_hat = cue_mod.predict(X_test)
        res.loc[0,'BAc']  = bac(Y_test,y_cue_hat)*100

        cue_sh = np.zeros(nSh)
        for sh in np.arange(nSh):
            y_perm = np.random.permutation(Y_test)
            cue_sh[sh] = bac(y_perm,y_cue_hat)*100

        res.loc[0,'Z'] = getPerm_Z(cue_sh, res.loc[0,'BAc'] )
        res.loc[0,'P'] = getPerm_Pval(cue_sh, res.loc[0,'BAc'] )

        # model trained on the desc
        res.loc[1,'Test'] = 'Desc'
        desc_mod = mod.fit(X_train,Y_desc_train)
        y_desc_hat = desc_mod.predict(X_test)
        res.loc[1,'BAc']  = bac(Y_test,y_desc_hat)*100

        desc_sh = np.zeros(nSh)
        for sh in np.arange(nSh):
            y_perm = np.random.permutation(Y_test)
            desc_sh[sh] = bac(y_perm,y_desc_hat)*100
        res.loc[1,'Z'] = getPerm_Z(desc_sh, res.loc[1,'BAc'] )
        res.loc[1,'P'] = getPerm_Pval(desc_sh, res.loc[1,'BAc'] )

        return res

    def IncoTrial_Decoder(train,test):

        res = pd.DataFrame(np.zeros((3,4)),columns=['Test','BAc','P','Z'])
        temp = mod.fit(X_train[train],y_train[train])

        res.loc[0,'Test'] = 'Model'
        y_hat = temp.predict(X_train[test])
        res.loc[0,'BAc'] = bac(y_train[test],y_hat)*100

        # shuffle for held out train set
        mod_sh = np.zeros(nSh)
        for sh in np.arange(nSh):
            y_perm_hat = np.random.permutation(y_hat)
            mod_sh[sh] = bac(y_train[test],y_perm_hat)*100
        res.loc[0,'Z'] = getPerm_Z(mod_sh, res.loc[0,'BAc'] )
        res.loc[0,'P'] = getPerm_Pval(mod_sh, res.loc[0,'BAc'] )

        # predictions on x test
        y_hat = temp.predict(X_test)
        res.loc[1,'Test'] = 'Cue'
        res.loc[1,'BAc'] = bac(y_test_cue,y_hat)*100

        res.loc[2,'Test'] = 'Desc'
        res.loc[2,'BAc'] = 100-res.loc[1,'BAc']

        # shuffles for ytest cue/desc
        cue_sh = np.zeros(nSh)
        for sh in np.arange(nSh):
            y_perm_hat = np.random.permutation(y_hat)
            cue_sh[sh] = bac(y_test_cue,y_perm_hat)*100

        res.loc[1,'Z'] = getPerm_Z(cue_sh, res.loc[1,'BAc'] )
        res.loc[1,'P'] = getPerm_Pval(cue_sh, res.loc[1,'BAc'] )

        res.loc[2,'Z'] = getPerm_Z(100-cue_sh, res.loc[2,'BAc'] )
        res.loc[2,'P'] = getPerm_Pval(100-cue_sh, res.loc[2,'BAc'] )

        return res

    with Parallel(n_jobs=njobs) as parallel:
        # correct trials Model:
        coModsDec = pd.DataFrame()
        popCoModsDec = pd.DataFrame()

        try:
            nFolds = 10
            y_train = predVec['Cue'][TrSets['co']]
            y_test_cue = predVec['Cue'][TrSets['inco']]
            y_test_desc = predVec['Desc'][TrSets['inco']]
            rskf = RepeatedStratifiedKFold(n_splits=nFolds,n_repeats=nRepeats, random_state=0)

            t0=time.time()
            for unitNum in np.arange(nUnits):
                for p,nF in nFeatures.items():

                    feats = unitIDs[unitNum][nF]
                    mod = lm.LogisticRegression(class_weight='balanced',C=1/np.sqrt(len(feats)))

                    X_train = allZoneFR.loc[TrSets['co'], feats ].values
                    X_test = allZoneFR.loc[TrSets['inco'], feats ].values

                    cnt=0

                    r = parallel(delayed(correctTrials_Decoder)(train,test) for train,test in rskf.split(X_train,y_train))
                    t1=time.time()

                    res = pd.DataFrame()
                    for jj in r:
                        res = pd.concat((jj,res))
                    res['Loc'] = p
                    res['-log(P)'] = -np.log(res['P'])
                    res['unit'] = unitNum

                    coModsDec = pd.concat((coModsDec,res))
                    print(end='.')
            coModsDec['Decoder'] = 'Correct'
            # -population
            for p,nF in nFeatures.items():
                feats=np.array([])
                for f in nF:
                    feats=np.concatenate((feats,np.arange(f,nUnits*7,7)))
                feats=feats.astype(int)
                mod = lm.LogisticRegression(class_weight='balanced',C=1/np.sqrt(len(feats)))

                X_train = allZoneFR.loc[TrSets['co'], feats ].values
                X_test = allZoneFR.loc[TrSets['inco'], feats ].values

                cnt=0
                r = parallel(delayed(correctTrials_Decoder)(train,test) for train,test in rskf.split(X_train,y_train))

                res = pd.DataFrame()
                for jj in r:
                    res = pd.concat((jj,res))
                res['Loc'] = p
                res['-log(P)'] = -np.log(res['P'])

                popCoModsDec = pd.concat((popCoModsDec,res))
                print(end='.')
            print('\nDecoding Correct Model Completed. Time  = {0:.2f}s \n'.format(time.time()-t0))
            popCoModsDec['Decoder'] = 'Correct'
        except:
            print('CorrectTrials Model Failed.')
            print ("Error", sys.exc_info()[0],sys.exc_info()[1],sys.exc_info()[2].tb_lineno)

        # balanced correct/inco model:
        baModsDec = pd.DataFrame()
        popBaModsDec = pd.DataFrame()
        try:
            t0=time.time()
            for unitNum in np.arange(nUnits):
                for p,nF in nFeatures.items():
                    feats = unitIDs[unitNum][nF]
                    mod = lm.LogisticRegression(class_weight='balanced',C=1/np.sqrt(len(feats)))
                    r = parallel(delayed(balancedCoIncoTrial_Decoder)(pe, feats) for pe in np.arange(nPe))
                    res = pd.DataFrame()
                    for jj in r:
                        res = pd.concat((jj,res))
                    res['Loc'] = p
                    res['-log(P)'] = -np.log(res['P'])
                    res['unit'] = unitNum

                    baModsDec = pd.concat((baModsDec,res))
                    print(end='.')
            baModsDec['Decoder'] = 'Balanced'
            # -population
            for p,nF in nFeatures.items():
                feats=np.array([])
                for f in nF:
                    feats=np.concatenate((feats,np.arange(f,nUnits*7,7)))
                feats=feats.astype(int)
                mod = lm.LogisticRegression(class_weight='balanced',C=1/np.sqrt(len(feats)))
                r = parallel(delayed(balancedCoIncoTrial_Decoder)(pe, feats) for pe in np.arange(nPe))
                res = pd.DataFrame()
                for jj in r:
                    res = pd.concat((jj,res))
                res['Loc'] = p
                res['-log(P)'] = -np.log(res['P'])

                popBaModsDec = pd.concat((popBaModsDec,res))
                print(end='.')
            print('\nDecoding Balanced  Model Completed. Time  = {0:.2f}s \n'.format(time.time()-t0))
            popBaModsDec['Decoder'] = 'Balanced'
        except:
            print('Balanced Model Failed.')
            print ("Error", sys.exc_info()[0],sys.exc_info()[1],sys.exc_info()[2].tb_lineno)

        # incorrect trials model:
        InCoModsDec = pd.DataFrame()
        popInCoModsDec = pd.DataFrame()
        try:
            t0=time.time()
            nFolds = 5
            y_train = predVec['Cue'][TrSets['inco']]
            y_test_cue = predVec['Cue'][TrSets['co']]
            y_test_desc = predVec['Desc'][TrSets['co']]
            rskf = RepeatedStratifiedKFold(n_splits=nFolds,n_repeats=nRepeats, random_state=0)

            for unitNum in np.arange(nUnits):
                for p,nF in nFeatures.items():
                    feats = unitIDs[unitNum][nF]
                    mod = lm.LogisticRegression(class_weight='balanced',C=1/np.sqrt(len(feats)))

                    X_train = allZoneFR.loc[TrSets['inco'], feats ].values
                    X_test = allZoneFR.loc[TrSets['co'], feats ].values

                    cnt=0
                    r = parallel(delayed(IncoTrial_Decoder)(train,test) for train,test in rskf.split(X_train,y_train))
                    res = pd.DataFrame()
                    for jj in r:
                        res = pd.concat((jj,res))
                    res['Loc'] = p
                    res['-log(P)'] = -np.log(res['P'])
                    res['unit'] = unitNum

                    InCoModsDec = pd.concat((InCoModsDec,res))
                    print(end='.')
            InCoModsDec['Decoder'] = 'Incorrect'

            #-population
            for p,nF in nFeatures.items():
                feats=np.array([])
                for f in nF:
                    feats=np.concatenate((feats,np.arange(f,nUnits*7,7)))
                feats=feats.astype(int)
                mod = lm.LogisticRegression(class_weight='balanced',C=1/np.sqrt(len(feats)))

                X_train = allZoneFR.loc[TrSets['inco'], feats ].values
                X_test = allZoneFR.loc[TrSets['co'], feats ].values

                cnt=0
                r = parallel(delayed(IncoTrial_Decoder)(train,test) for train,test in rskf.split(X_train,y_train))
                res = pd.DataFrame()
                for jj in r:
                    res = pd.concat((jj,res))
                res['Loc'] = p
                res['-log(P)'] = -np.log(res['P'])

                popInCoModsDec = pd.concat((popInCoModsDec,res))
                print(end='.')
            print('\nDecoding Incorrect Model Completed. Time  = {0:.2f}s \n'.format(time.time()-t0))

            popInCoModsDec['Decoder'] = 'Incorrect'
        except:
            print('Incorrect Model Failed.')
            print ("Error", sys.exc_info()[0],sys.exc_info()[1],sys.exc_info()[2].tb_lineno)

        # group results.
        singCellDec = pd.concat((coModsDec,baModsDec,InCoModsDec))
        popDec = pd.concat((popCoModsDec,popBaModsDec,popInCoModsDec))

        singCellDecSummary = singCellDec.groupby(['Loc','Test','unit','Decoder']).mean()
        singCellDecSummary = singCellDecSummary.reset_index()
        singCellDecSummary['Test'] = pd.Categorical(singCellDecSummary['Test'],categories=['Model','Cue','Desc'],ordered=True)
        singCellDecSummary.sort_values('Test',inplace=True)
        singCellDecSummary['Loc'] = pd.Categorical(singCellDecSummary['Loc'],categories=nFeatures.keys(),ordered=True)
        singCellDecSummary.sort_values('Loc',inplace=True)

    return singCellDec,singCellDecSummary, popDec
Example #12
0
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Create a ColumnTransformer for the StandardScaler
scaler = ColumnTransformer([('scaler_media', scaler_media, slice(0, 8)),
                            ('scaler_moda', scaler_moda,
                             slice(8, len(X.columns)))])

# Create the Pipeline combining the ColumnTransformer and the classifier
pipeline = Pipeline([('imputer', imputer), ('scaler', scaler),
                     ('svm',
                      SVC(random_state=RANDOM_STATE,
                          class_weight=CLASS_WEIGHT,
                          probability=True))])

# Inner CV (2-fold, 5-times repeated stratified GridSearchCV to find the best parameters)
rskf = RepeatedStratifiedKFold(n_splits=2,
                               n_repeats=5,
                               random_state=RANDOM_STATE)  # inner
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=PARAM_GRID,
                           scoring=SCORING,
                           cv=rskf)

# # Outer CV (stratified 5-fold cross-validation to estimate accuracy)
# scores = cross_validate(estimator=grid_search, X=X, y=y, cv=5, error_score='raise', return_estimator=True, scoring=SCORING)  # outer
# print('Scores: {}' .format(scores['test_score']))
# print('Mean score: {}' .format(np.mean(scores['test_score'])))

# # Build a 'dummy' classifier and evaluate it with cross-validation (CV=5) as a more realistic baseline
# dummy_clf = DummyClassifier(strategy='most_frequent', random_state=RANDOM_STATE)
# dummy_scores = cross_validate(estimator=dummy_clf, X=X, y=y, cv=5, error_score='raise', return_estimator=True, scoring=SCORING)
# print('Dummy scores: {}' .format(dummy_scores['test_score']))
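A minimal way to actually run the nested procedure sketched in the comments above, reusing X, y, SCORING, and grid_search from this script:

import numpy as np
from sklearn.model_selection import cross_validate

scores = cross_validate(estimator=grid_search, X=X, y=y, cv=5,
                        scoring=SCORING, error_score='raise')
print('Mean outer score: {}'.format(np.mean(scores['test_score'])))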
Example #13
0
    def launch(self) -> int:
        """Execute the :class:`Resampling <resampling.resampling.Resampling>` resampling.resampling.Resampling object."""

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart(): return 0
        self.stage_files()

        # check mandatory properties
        method, over, under = getCombinedMethod(self.method, self.out_log,
                                                self.__class__.__name__)
        checkResamplingType(self.type, self.out_log, self.__class__.__name__)
        sampling_strategy_over = getSamplingStrategy(
            self.sampling_strategy_over, self.out_log, self.__class__.__name__)
        sampling_strategy_under = getSamplingStrategy(
            self.sampling_strategy_under, self.out_log,
            self.__class__.__name__)

        # load dataset
        fu.log(
            'Getting dataset from %s' %
            self.io_dict["in"]["input_dataset_path"], self.out_log,
            self.global_log)
        if 'column' in self.target:
            labels = getHeader(self.io_dict["in"]["input_dataset_path"])
            skiprows = 1
            header = 0
        else:
            labels = None
            skiprows = None
            header = None
        data = pd.read_csv(self.io_dict["in"]["input_dataset_path"],
                           header=None,
                           sep="\s+|;|:|,|\t",
                           engine="python",
                           skiprows=skiprows,
                           names=labels)

        train_df = data
        ranges = None

        le = preprocessing.LabelEncoder()

        cols_encoded = []
        for column in train_df:
            # if type object, LabelEncoder.fit_transform
            if train_df[column].dtypes == 'object':
                cols_encoded.append(column)
                train_df[column] = le.fit_transform(train_df[column])

        # defining X
        X = train_df.loc[:, train_df.columns != getTargetValue(
            self.target, self.out_log, self.__class__.__name__)]
        # calling resample method
        if self.method == 'smotetomek':
            method = method(
                smote=over(sampling_strategy=sampling_strategy_over),
                tomek=under(sampling_strategy=sampling_strategy_under),
                random_state=self.random_state_method)
        elif self.method == 'smotenn':
            method = method(
                smote=over(sampling_strategy=sampling_strategy_over),
                enn=under(sampling_strategy=sampling_strategy_under),
                random_state=self.random_state_method)

        fu.log(
            'Target: %s' % (getTargetValue(self.target, self.out_log,
                                           self.__class__.__name__)),
            self.out_log, self.global_log)

        # resampling
        if self.type == 'regression':
            fu.log(
                'Resampling regression dataset, continuous data will be classified',
                self.out_log, self.global_log)
            # call resampler class for Regression ReSampling
            rs = resampler()
            # Create n_bins classes for the dataset
            ranges, y, target_pos = rs.fit(
                train_df,
                target=getTargetValue(self.target, self.out_log,
                                      self.__class__.__name__),
                bins=self.n_bins,
                balanced_binning=self.balanced_binning,
                verbose=0)
            # Get the re-sampled data
            final_X, final_y = rs.resample(method, train_df, y)
        elif self.type == 'classification':
            # get X and y
            y = getTarget(self.target, train_df, self.out_log,
                          self.__class__.__name__)
            # fit and resample
            final_X, final_y = method.fit_resample(X, y)
            target_pos = None

        # evaluate resampling
        if self.evaluate:
            fu.log(
                'Evaluating data before resampling with RandomForestClassifier',
                self.out_log, self.global_log)
            cv = RepeatedStratifiedKFold(
                n_splits=self.evaluate_splits,
                n_repeats=self.evaluate_repeats,
                random_state=self.random_state_evaluate)
            # evaluate model
            scores = cross_val_score(
                RandomForestClassifier(class_weight='balanced'),
                X,
                y,
                scoring='accuracy',
                cv=cv,
                n_jobs=-1)
            if not np.isnan(np.mean(scores)):
                fu.log(
                    'Mean Accuracy before resampling: %.3f' %
                    (np.mean(scores)), self.out_log, self.global_log)
            else:
                fu.log(
                    'Unable to calculate cross validation score, NaN was returned.',
                    self.out_log, self.global_log)

        # log distribution before resampling
        dist = ''
        for k, v in Counter(y).items():
            per = v / len(y) * 100
            rng = ''
            if ranges: rng = str(ranges[k])
            dist = dist + 'Class=%d, n=%d (%.3f%%) %s\n' % (k, v, per, rng)
        fu.log('Classes distribution before resampling:\n\n%s' % dist,
               self.out_log, self.global_log)

        # join final_X and final_y in the output dataframe
        if header is None:
            # numpy
            out_df = np.column_stack((final_X, final_y))
        else:
            # pandas
            out_df = final_X.join(final_y)

        # if no header, convert np to pd
        if header is None: out_df = pd.DataFrame(data=out_df)

        # if cols encoded, decode them
        if cols_encoded:
            for column in cols_encoded:
                if header is None:
                    out_df = out_df.astype({column: int})
                out_df[column] = le.inverse_transform(
                    out_df[column].values.ravel())

        # if no header, target is in a different column
        if target_pos: t = target_pos
        else:
            t = getTargetValue(self.target, self.out_log,
                               self.__class__.__name__)
        # log distribution after resampling
        if self.type == 'regression':
            ranges, y_out, _ = rs.fit(out_df,
                                      target=t,
                                      bins=self.n_bins,
                                      balanced_binning=self.balanced_binning,
                                      verbose=0)
        elif self.type == 'classification':
            y_out = getTarget(self.target, out_df, self.out_log,
                              self.__class__.__name__)

        dist = ''
        for k, v in Counter(y_out).items():
            per = v / len(y_out) * 100
            rng = ''
            if ranges: rng = str(ranges[k])
            dist = dist + 'Class=%d, n=%d (%.3f%%) %s\n' % (k, v, per, rng)
        fu.log('Classes distribution after resampling:\n\n%s' % dist,
               self.out_log, self.global_log)

        # evaluate resampling
        if self.evaluate:
            fu.log(
                'Evaluating data after resampling with RandomForestClassifier',
                self.out_log, self.global_log)
            cv = RepeatedStratifiedKFold(
                n_splits=self.evaluate_splits,
                n_repeats=self.evaluate_repeats,
                random_state=self.random_state_evaluate)
            # evaluate model
            scores = cross_val_score(
                RandomForestClassifier(class_weight='balanced'),
                final_X,
                y_out,
                scoring='accuracy',
                cv=cv,
                n_jobs=-1)
            if not np.isnan(np.mean(scores)):
                fu.log(
                    'Mean Accuracy after resampling a %s dataset with %s method: %.3f'
                    % (self.type, resampling_methods[self.method]['method'],
                       np.mean(scores)), self.out_log, self.global_log)
            else:
                fu.log(
                    'Unable to calculate cross validation score, NaN was returned.',
                    self.out_log, self.global_log)

        # save output
        hdr = False
        if header == 0: hdr = True
        fu.log(
            'Saving resampled dataset to %s' %
            self.io_dict["out"]["output_dataset_path"], self.out_log,
            self.global_log)
        out_df.to_csv(self.io_dict["out"]["output_dataset_path"],
                      index=False,
                      header=hdr)

        return 0
Example #14
0
#evaluate knn with uncalibrated probabilities for imbalanced classification
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

#generate dataset
X, y = make_classification(n_samples=10000,
                           n_features=2,
                           n_redundant=0,
                           n_clusters_per_class=1,
                           weights=[0.99],
                           flip_y=0,
                           random_state=4)
#define model
model = KNeighborsClassifier()
#define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#evaluate model
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
#summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))
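For comparison, the same CV procedure can score the model with calibrated probabilities; this is a minimal sketch (CalibratedClassifierCV and its settings are an assumption, reusing the X, y and cv defined above):
#evaluate knn with calibrated probabilities (sketch for comparison)
from sklearn.calibration import CalibratedClassifierCV

calibrated = CalibratedClassifierCV(KNeighborsClassifier(), method='isotonic', cv=3)
scores = cross_val_score(calibrated, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC (calibrated): %.3f' % mean(scores))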
Example #15
0
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=test_split,
                                                    stratify=y)
x_train = x_train[:int(x_train.shape[0] * training_fraction)]
y_train = y_train[:int(y_train.shape[0] * training_fraction)]

# Train model
print('Training model...')
# history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=val_split)

train_accuracies = []
train_losses = []
val_accuracies = []
val_losses = []
rskf = RepeatedStratifiedKFold(n_splits=kfolds_splits,
                               n_repeats=kfolds_repeats)
for train_index, test_index in rskf.split(x_train, y_train):
    history = model.fit(x_train[train_index],
                        y_train[train_index],
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=(x_train[test_index],
                                         y_train[test_index]))
    train_accuracies.append(history.history['acc'])
    train_losses.append(history.history['loss'])
    val_accuracies.append(history.history['val_acc'])
    val_losses.append(history.history['val_loss'])

train_accuracies = np.array(train_accuracies)
train_losses = np.array(train_losses)
val_accuracies = np.array(val_accuracies)
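To summarize the collected histories, the per-epoch curves can be averaged across folds (a sketch; assumes every fold trained for the same number of epochs, which holds here since epochs is fixed and no early stopping is used):
val_losses = np.array(val_losses)

# each array has shape (n_folds, n_epochs); average across folds
mean_val_acc = val_accuracies.mean(axis=0)
mean_val_loss = val_losses.mean(axis=0)
print('Final mean val acc/loss: %.3f / %.3f' % (mean_val_acc[-1], mean_val_loss[-1]))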
Example #16
0
def test_get_n_splits_for_repeated_stratified_kfold():
    n_splits = 3
    n_repeats = 4
    rskf = RepeatedStratifiedKFold(n_splits, n_repeats)
    expected_n_splits = n_splits * n_repeats
    assert_equal(expected_n_splits, rskf.get_n_splits())
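A quick sanity check (a toy sketch) that split() actually yields get_n_splits() train/test pairs:
X = [[1], [2], [3], [4], [5], [6]]
y = [0, 0, 0, 1, 1, 1]
rskf = RepeatedStratifiedKFold(n_splits=3, n_repeats=4, random_state=0)
assert len(list(rskf.split(X, y))) == rskf.get_n_splits()  # 3 * 4 = 12 splits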
Example #17
0
def evaluate_model(X, y, model):
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    # evaluate model
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return scores
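Typical usage of this helper (a sketch; the data and the estimator below are placeholders):
from numpy import mean, std
from sklearn.tree import DecisionTreeClassifier

scores = evaluate_model(X, y, DecisionTreeClassifier())
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))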
Example #18
0
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1337)

# lgb =========================================================================
import lightgbm as lgb

clf_lgb = lgb.LGBMClassifier(objective='binary',
                             boosting_type='dart',
                             verbose=-1,
                             random_state=1337)

scorer = make_scorer(roc_auc_score, greater_is_better=True, needs_proba=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1337)

acc = cross_val_score(estimator=clf_lgb,
                      X=X_train,
                      y=y_train,
                      cv=rskf,
                      scoring='roc_auc')
acc.mean(), acc.std()

# GridSearchCV needs the grid of experiments defined up front
param_grid = {
    'learning_rate': [0.1],
    'max_depth': [5, 7, 9, -1],
    'min_data_in_leaf': [5, 10, 15],
    'num_leaves': [10, 20, 30],
    'bagging_freq': [7],
Example #19
0
print(">> loading dataset ... ")
path = Path("data/")
train = pd.read_csv(path / "train.csv")
#train = train[:100]
train_ID_code = train["ID_code"].tolist()
train = train.drop("ID_code", axis=1)

test = pd.read_csv(path / "test.csv")
test_ID_code = test["ID_code"].tolist()
test = test.drop("ID_code", axis=1)

##
valid_df = pd.DataFrame({"ID_code": train_ID_code, 'target': -1})
result = np.zeros(test.shape[0])
#
rskf = RepeatedStratifiedKFold(n_splits=4, n_repeats=1, random_state=SEED)
for counter, (train_index, valid_index) in enumerate(rskf.split(train, train.target), 1):
    K.clear_session()
    model = None  # Clearing the NN.
    model, model_name = create_model(init_dim=200, n0=200, n1=100, n2=50, act='relu')
    print("fold:", counter, "   -- model name:", model_name)
    sys.stdout.flush()
    #Train data
    t=train.iloc[train_index]
    v = train.iloc[valid_index]
    early_stopping = EarlyStopping(monitor='val_auc_roc', patience=2, mode='max')
    model_path = model_name + '.h5'
    model_checkpoint = ModelCheckpoint(model_path, monitor='val_auc_roc', mode='max', save_best_only=True, verbose=1)
    results = model.fit(t.drop("target",axis=1),
                        t.target,
                        validation_data=(v.drop("target",axis=1),v.target),
Example #20
0
 def __init__(self, n_splits=10, n_repeats=2, groupcount=10, random_state=0, strategy='quantile'):
     self.groupcount = groupcount
     self.strategy = strategy
     self.cvkwargs = dict(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
     self.cv = RepeatedStratifiedKFold(**self.cvkwargs)
     self.discretizer = KBinsDiscretizer(n_bins=self.groupcount, encode='ordinal', strategy=self.strategy)
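The snippet above only builds the pieces; a plausible companion split method (hypothetical, not part of the original, and assuming numpy is imported as np) would bin the continuous target and stratify on the bin labels:
 def split(self, X, y, groups=None):
     # discretize the continuous target, then stratify on the bin labels
     binned_y = self.discretizer.fit_transform(np.asarray(y).reshape(-1, 1)).ravel()
     return self.cv.split(X, binned_y, groups)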
Example #21
0
def logi(request):
    hospital.objects.all().delete()
    inp = request.FILES['testinput'].name
    print(inp)

    train_df = pd.read_csv("junapp/static/junapp/data/train.csv")

    # Read CSV test data file into DataFrame
    test_df = pd.read_csv("junapp/static/junapp/data/" + inp)
    print('The number of samples into the train data is {}.'.format(
        train_df.shape[0]))
    a = train_df.isnull().sum()

    train_data = train_df.copy()
    train_data["How many days, immunization service is provided?"].fillna(
        train_df["How many days, immunization service is provided?"].median(
            skipna=True),
        inplace=True)
    train_data["How many bed are available in this hospital?"].fillna(
        train_df["How many bed are available in this hospital?"].median(
            skipna=True),
        inplace=True)

    train_data.isnull().sum()

    test_data = test_df.copy()
    test_data["How many days, immunization service is provided?"].fillna(
        test_df["How many days, immunization service is provided?"].median(
            skipna=True),
        inplace=True)

    a = test_data.isnull().sum()
    print(a)
    cols = [
        "Does this health facility have its own building?",
        "Infrastructure Needs Repairing",
        "Number of rooms available in the health facilities? Number",
        "How many bed are available in this hospital?",
        "OPD service avaliable?", "Immunization Service Avaliable",
        "How many days, immunization service is provided?",
        "Laboraotry Service Avaliable",
        "ASRH (Adolescent Friendly Services) Service Avaliable",
        "Mental health Service Avaliable", "Substance abuse Service Avaliable",
        "Oral Health Service Avaliable"
    ]

    X = train_data[cols]
    y = train_data['Passed Threshold']
    # Build a logreg and compute the feature importances
    model = LogisticRegression()
    # create the RFE model and select 8 attributes
    rfe = RFE(model, 13)
    rfe = rfe.fit(X, y)
    # summarize the selection of the attributes
    print('Selected features: %s' % list(X.columns[rfe.support_]))

    # -------------------------
    rfecv = RFECV(estimator=LogisticRegression(),
                  step=1,
                  cv=10,
                  scoring='accuracy')
    rfecv.fit(X, y)

    print("Optimal number of features: %d" % rfecv.n_features_)
    print('Selected features: %s' % list(X.columns[rfecv.support_]))

    Selected_features = [
        "Does this health facility have its own building?",
        "Infrastructure Needs Repairing",
        "Number of rooms available in the health facilities? Number",
        "How many bed are available in this hospital?",
        "OPD service avaliable?", "Immunization Service Avaliable",
        "How many days, immunization service is provided?",
        "Laboraotry Service Avaliable",
        "ASRH (Adolescent Friendly Services) Service Avaliable",
        "Mental health Service Avaliable", "Substance abuse Service Avaliable",
        "Oral Health Service Avaliable"
    ]

    X = train_data[Selected_features]
    C = np.arange(1e-05, 5.5, 0.1)
    scoring = {
        'Accuracy': 'accuracy',
        'AUC': 'roc_auc',
        'Log_loss': 'neg_log_loss'
    }
    log_reg = LogisticRegression()

    # Simple pre-processing estimators
    ###############################################################################
    std_scale = StandardScaler(with_mean=False, with_std=False)
    # std_scale = StandardScaler()

    # Defining the CV method: Using the Repeated Stratified K Fold
    ###############################################################################

    n_folds = 5
    n_repeats = 5

    rskfold = RepeatedStratifiedKFold(n_splits=n_folds,
                                      n_repeats=n_repeats,
                                      random_state=2)

    # Creating simple pipeline and defining the gridsearch
    ###############################################################################

    log_clf_pipe = Pipeline(steps=[('scale', std_scale), ('clf', log_reg)])

    log_clf = GridSearchCV(estimator=log_clf_pipe,
                           cv=rskfold,
                           scoring=scoring,
                           return_train_score=True,
                           param_grid=dict(clf__C=C),
                           refit='Accuracy')

    log_clf.fit(X, y)
    results = log_clf.cv_results_

    # print('=' * 20)
    print("best params: " + str(log_clf.best_estimator_))
    print("best params: " + str(log_clf.best_params_))
    print('best score:', (log_clf.best_score_) * 100)
    # print('=' * 20)
    test_data['Passed Threshold'] = log_clf.predict(
        test_data[Selected_features])
    test_data['Ward No'] = test_df['Ward No']
    test_data['Address'] = test_df['Address']
    test_data['Does this health facility have its own building?'] = test_df[
        'Does this health facility have its own building?']
    test_data['Infrastructure Needs Repairing'] = test_df[
        'Infrastructure Needs Repairing']
    test_data[
        'Number of rooms available in the health facilities? Number'] = test_df[
            'Number of rooms available in the health facilities? Number']
    test_data['How many bed are available in this hospital?'] = test_df[
        'How many bed are available in this hospital?']
    test_data['OPD service avaliable?'] = test_df['OPD service avaliable?']
    test_data['Immunization Service Avaliable'] = test_df[
        'Immunization Service Avaliable']
    test_data['Oral Health Service Avaliable'] = test_df[
        'Oral Health Service Avaliable']
    test_data['Type of Health facility'] = test_df['Type of Health facility']
    test_data['longt'] = test_df['longt']
    test_data['lat'] = test_df['lat']
    submission = test_data[[
        'Ward No', 'Address', 'Type of Health facility', 'Passed Threshold',
        'longt', 'lat', 'Does this health facility have its own building?',
        'Infrastructure Needs Repairing',
        'Number of rooms available in the health facilities? Number',
        'How many bed are available in this hospital?',
        'OPD service avaliable?', 'Immunization Service Avaliable',
        'Oral Health Service Avaliable', 'Type of Health facility'
    ]]
    submission.to_csv("submission.csv", index=False)
    submission.tail()
    # dict = {}
    result = pd.read_csv("submission.csv")
    # dict = {
    #     'ward': result['Ward No'],
    #     'address': result['Address'],
    #     'type': result['Type of Health facility'],
    #     'pass': result['Passed Threshold'],
    #     'longt': result['longt'],
    #     'lat': result['lat']
    # }
    # out = hospital.objects.create(ward_no= dict['ward'], address= dict['address'], type= dict['type'], passed= dict['pass'])
    # out.save()
    print('Here')
    a = len(result['Ward No'])
    for i in range(a):
        hospitals = hospital.objects.create(
            ward_no=result['Ward No'][i],
            address=result['Address'][i],
            type=result['Type of Health facility'][i],
            passed=result['Passed Threshold'][i],
            lat=result['lat'][i],
            log=result['longt'][i],
            building=result['Does this health facility have its own building?']
            [i],
            repair=result['Infrastructure Needs Repairing'][i],
            noofrooms=result[
                'Number of rooms available in the health facilities? Number']
            [i],
            beds=result['How many bed are available in this hospital?'][i],
            optservice=result['OPD service avaliable?'][i],
            immunizationservice=result['Immunization Service Avaliable'][i],
            oralhealth=result['Oral Health Service Avaliable'][i]

            # lat=str(round(result['lat'][i],4)),
            # log=str(round(result['long'][i],4))
        )
        hospitals.save()
        # print(result['Ward No'][i])
    # gethospitals = hospital.objects.all()
    # finaldata = []
    # for i in gethospitals:
    #     finaldata.append(
    #         {'address': i.address, 'building': i.building, 'repair': i.repair, 'beds': i.beds, 'room': i.noofrooms})
    #
    # return render(request, 'junapp/logistic.html', {'result': finaldata})
    # return render(request, 'junapp/logistic.html')
    passed = hospital.objects.filter(passed=1).count()
    failed = hospital.objects.filter(passed=0).count()
    total = passed + failed
    gethospitals = hospital.objects.all()

    finaldata = []
    for i in gethospitals:
        finaldata.append({
            'address': i.address,
            'building': i.building,
            'immunizationservice': i.immunizationservice,
            'beds': i.beds,
            'optservice': i.optservice,
            'oralhealth': i.oralhealth,
            'type': i.type,
            'repair': i.repair,
            'pass': i.passed,
            'room': i.noofrooms
        })
    finalresult = {'finaldata': finaldata, 'pass': passed, 'fail': failed}
    return render(request, 'junapp/logistic.html', {'result': finalresult})
Example #22
0
    def calculate_reliability(self, input, output, models_dict, file_path):
        """
        Calculates reliability estimations for selected models based on test data within the single CV procedure.
        The calculated estimations are compared with prediction accuracy by using Spearman's test.

        :param input: array-like, shape (n_samples, n_features)
            Training data.
        :param output: array-like, shape (n_samples, )
            True labels
        :param models_dict: dictionary
            Selected classifiers with details for grid-search and feature selection.
        :param file_path: String
            File path for saving data

        :return: DataFrame
            Calculated correlation coefficients.
        """
        from ReliabilityEstimation import ReliabilityEstimation
        X, y = input, output

        df_res = pd.DataFrame(columns=[
            'Classifier', 'Corr_Oref', 'p_Oref', 'Corr_DENS', 'p_DENS',
            'Corr_CNK', 'p_CNK', 'Corr_LCV', 'p_LCV'
        ])

        for m in models_dict:
            rel_ref = []
            rel_dens = []
            rel_cnk = []
            rel_lcv = []
            acc_probabilities = []
            predicted_label = []
            true_label = []
            clf = m['classifier']
            fs = m['fs_method']

            df_raw_res = pd.DataFrame(columns=[
                'Oref', 'DENS', 'CNK', 'LCV', 'Accuracy', 'Predicted_label',
                'True_label'
            ])

            # repeated stratified k-fold cross-validation (single repeat)
            skf = RepeatedStratifiedKFold(n_splits=self.N_inner,
                                          n_repeats=1,
                                          random_state=88)

            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                fs_smote_clf = Pipeline([('oversampling',
                                          SMOTE(random_state=88,
                                                k_neighbors=3)),
                                         ('feature_selection', fs),
                                         ('classifier', clf)])

                param_grid = m['grid']

                classifier = copy.deepcopy(fs_smote_clf)

                if m['name'] == 'MLP_mRMR_50':
                    gridsearch_cv = fs_smote_clf
                else:
                    gridsearch_cv = GridSearchCV(fs_smote_clf,
                                                 param_grid,
                                                 cv=10,
                                                 scoring='roc_auc')

                gridsearch_cv.fit(X_train, y_train)

                # predicted class
                y_predict = gridsearch_cv.predict(X_test)
                predicted_label.append(y_predict)

                # predicted probabilities
                probas_ = gridsearch_cv.predict_proba(X_test)
                acc = self.ind_classification_accuracy(probas_, y_test)
                acc_probabilities.append(acc)
                true_label.append(y_test)

                rel = ReliabilityEstimation()
                ref = list(
                    map(lambda prob: rel.o_ref(prob), np.max(probas_, axis=1)))
                dens = list(map(lambda test: rel.DENS(X_train, test), X_test))
                cnk = list(
                    map(
                        lambda test: rel.CNK(
                            X_train, y_train, test,
                            gridsearch_cv.predict_proba(test.reshape(1, -1))),
                        X_test))
                lcv = list(
                    map(
                        lambda test: rel.LCV(X_train, y_train, test, 40,
                                             classifier), X_test))

                rel_ref.append(ref)
                rel_dens.append(dens)
                rel_cnk.append(cnk)
                rel_lcv.append(lcv)

            merged_rel_ref = np.concatenate(rel_ref).ravel()
            merged_rel_dens = np.concatenate(rel_dens).ravel()
            merged_rel_cnk = np.concatenate(rel_cnk).ravel()
            merged_rel_lcv = np.concatenate(rel_lcv).ravel()
            merged_acc = np.concatenate(acc_probabilities).ravel()
            merged_predicted_labels = np.concatenate(predicted_label).ravel()
            merged_true_labels = np.concatenate(true_label).ravel()

            df_raw_res["Oref"] = merged_rel_ref
            df_raw_res["DENS"] = merged_rel_dens
            df_raw_res["CNK"] = merged_rel_cnk
            df_raw_res["LCV"] = merged_rel_lcv
            df_raw_res["Accuracy"] = merged_acc
            df_raw_res["Predicted_label"] = merged_predicted_labels
            df_raw_res["True_label"] = merged_true_labels

            df_raw_res.to_csv(file_path + 'Reliability_data_' + m['name'] +
                              '.csv',
                              header=True)

            correlation_ref, p_ref = spearmanr(merged_rel_ref, merged_acc)
            correlation_dens, p_dens = spearmanr(merged_rel_dens, merged_acc)
            correlation_cnk, p_cnk = spearmanr(merged_rel_cnk, merged_acc)
            correlation_lcv, p_lcv = spearmanr(merged_rel_lcv, merged_acc)

            df_res = df_res.append(
                {
                    'Classifier': m['name'],
                    'Corr_Oref': correlation_ref,
                    'p_Oref': p_ref,
                    'Corr_DENS': correlation_dens,
                    'p_DENS': p_dens,
                    'Corr_CNK': correlation_cnk,
                    'p_CNK': p_cnk,
                    'Corr_LCV': correlation_lcv,
                    'p_LCV': p_lcv
                },
                ignore_index=True)

        return df_res
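For reference, a hypothetical models_dict entry with the keys this method reads ('name', 'classifier', 'fs_method', 'grid'); the concrete estimators below are placeholders, and the grid key matches the 'classifier' pipeline step used above:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif

models_dict = [{
    'name': 'RF_kbest_10',  # used to label the output csv
    'classifier': RandomForestClassifier(random_state=88),
    'fs_method': SelectKBest(f_classif, k=10),  # becomes the 'feature_selection' step
    'grid': {'classifier__n_estimators': [100, 300]},  # searched over the pipeline
}]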
Example #23
0
def fit_model_kfold(
    features,
    model,
    analysis_type="classification",
    reduce_set=True,
    reduced_set_size=100,
    reduced_set_max_correlation=0.9,
    n_repeats=1,
    random_state=42,
    n_splits=None,
    compute_shap=True,
):
    """Classify graphs from extracted features with kfold.

    Args:
        features (dataframe): extracted features
        model (str): model to perform the analysis
        analysis_type (str): 'classification' or 'regression'
        reduce_set (bool): if True, the classification will be rerun
                           on a reduced set of top features (from SHAP analysis)
        reduced_set_size (int): number of features to keep for the reduced set
        reduced_set_max_correlation (float): to discard highly correlated top features
                                             in the reduced set of features
        n_repeats (int): number of k-fold repeats
        random_state (int): rng seed
        n_splits (int): number of splits for k-fold; None = automatic estimation
        compute_shap (bool): compute SHAP values or not

    Returns:
        (dict): dictionary with results
    """
    if model is None:
        raise Exception("Please provide a model for classification")

    X, y = features_to_Xy(features)

    if analysis_type == "classification":
        if n_splits is None:
            n_splits = _number_folds(y)
        L.info("Using %s splits", str(n_splits))
        folds = RepeatedStratifiedKFold(n_splits=n_splits,
                                        n_repeats=n_repeats,
                                        random_state=random_state)
    elif analysis_type == "regression":
        if n_splits is None:
            n_splits = _number_folds(y)
        L.info("Using %s splits", str(n_splits))
        folds = RepeatedKFold(n_splits=n_splits,
                              n_repeats=n_repeats,
                              random_state=random_state)

    acc_scores, shap_values = _evaluate_kfold(X, y, model, folds,
                                              analysis_type, compute_shap)
    _print_accuracy(acc_scores, analysis_type)

    if compute_shap:
        mean_shap_values, shap_feature_importance = _get_shap_feature_importance(
            shap_values)
    else:
        mean_shap_values = None
        shap_feature_importance = None

    analysis_results = {
        "X": X,
        "y": y,
        "acc_scores": acc_scores,
        "mean_shap_values": mean_shap_values,
        "shap_values": shap_values,
        "shap_feature_importance": shap_feature_importance,
        "reduced_features": None,
    }
    if not reduce_set:
        return analysis_results

    if not compute_shap:
        return analysis_results

    reduced_features = _get_reduced_feature_set(
        X,
        shap_feature_importance,
        n_top_features=reduced_set_size,
        alpha=reduced_set_max_correlation,
    )
    reduced_acc_scores, reduced_shap_values = _evaluate_kfold(
        X[reduced_features], y, model, folds, analysis_type, compute_shap)
    _print_accuracy(reduced_acc_scores, analysis_type, reduced=True)
    (
        reduced_mean_shap_values,
        reduced_shap_feature_importance,
    ) = _get_shap_feature_importance(reduced_shap_values)

    analysis_results.update({
        "reduced_features":
        reduced_features,
        "reduced_shap_values":
        reduced_shap_values,
        "shap_values":
        shap_values,
        "reduced_acc_scores":
        reduced_acc_scores,
        "reduced_mean_shap_values":
        reduced_mean_shap_values,
        "reduced_shap_feature_importance":
        reduced_shap_feature_importance,
    })
    return analysis_results
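_number_folds is not shown here; a purely illustrative sketch would cap the fold count by the size of the rarest class:
import numpy as np

def _number_folds(y, max_splits=10):
    """Pick a fold count each class can support (illustrative only)."""
    _, counts = np.unique(y, return_counts=True)
    return int(max(2, min(max_splits, counts.min())))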
Example #24
0
)

param_grid = [
    {
        "regressor__C": np.logspace(-3, 0, 4),
        "regressor__solver": ["liblinear"],
        "regressor__penalty": ["l1"]
    },
    {
        "regressor__C": np.logspace(-3, 0, 4),
        "regressor__solver": ["lbfgs"],
        "regressor__penalty": ["l2"] # these are actually just the defaults
    }
]

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5)

grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="recall",
    cv=cv,
    return_train_score=True,
    verbose=10
)
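
Fitting then follows the usual GridSearchCV flow (a sketch; assumes pipe and a training split already exist):
grid.fit(X_train, y_train)
print("Best recall: %.3f with %r" % (grid.best_score_, grid.best_params_))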


### Scoring ###
def print_model_scores(model,
                       scoring_list = [
                           accuracy_score,
Example #25
0
def main(dataset_name):

    dataset = load_dataset()

    raw_data = np.asarray(dataset['raw']['data'])
    raw_label = np.asarray(dataset['raw']['label'])
    num_classes = len(np.unique(raw_label))

    rskf = RepeatedStratifiedKFold(n_splits=k_folds, n_repeats=k_fold_reps, random_state=42)

    for fs_method, fs_range in fs_methods:
        print('FS-Method : ', fs_method.__name__)

        nfeats = []
        accuracies = []
        svc_accuracies = []
        BAs = []
        svc_BAs = []
        mAPs = []
        svc_mAPs = []
        mus = []
        name = dataset_name + '_mu_' + str(mu)
        print(name)

        for j, (train_index, test_index) in enumerate(rskf.split(raw_data, raw_label)):
            print('k_fold', j, 'of', k_folds*k_fold_reps)

            train_data, train_labels = raw_data[train_index].copy(), raw_label[train_index].copy()
            test_data, test_labels = raw_data[test_index].copy(), raw_label[test_index].copy()

            train_labels = to_categorical(train_labels, num_classes=num_classes)
            test_labels = to_categorical(test_labels, num_classes=num_classes)

            valid_features = np.where(np.abs(train_data).sum(axis=0) > 0)[0]
            if len(valid_features) < train_data.shape[1]:
                print('Removing', train_data.shape[1] - len(valid_features), 'zero features')
                train_data = train_data[:, valid_features]
                test_data = test_data[:, valid_features]

            model_kwargs = {
                # 'nclasses': num_classes,
                'mu': mu / len(train_data),
                'degree': 3
            }
            print('mu :', model_kwargs['mu'], ', batch_size :', batch_size)

            svc_kwargs = {
                'C': 1.0,
                'solver': 0.
            }

            print('Starting feature selection')
            best_fs = 0
            best_value = None
            for fs_value in fs_range:
                fs_class = fs_method(10, fs_value, matlab_engine=matlab_engine)
                fs_class.fit(train_data, 2. * train_labels[:, -1] - 1.)
                svc_train_data = fs_class.transform(train_data)

                norm = normalization_func()
                svc_train_data_norm = norm.fit_transform(svc_train_data)
                for s in [0, 1, 2, 3]:
                    for my_c in [0.001, 0.01, 0.1, 0.5, 1.0, 1.4, 1.5, 1.6, 2.0, 2.5, 5.0, 25.0, 50.0, 100.0]:
                        cmd = '-v 5 -s ' + str(s) + ' -c ' + str(my_c) + ' -q'
                        cv = liblinearutil.train((2 * train_labels[:, -1] - 1).tolist(), svc_train_data_norm.tolist(), cmd)
                        if cv > best_fs:
                            best_fs = cv
                            best_value = fs_value
            print('best fs_value: ', best_value)
            fs_class = fs_method(200, best_value, matlab_engine=matlab_engine)
            fs_class.fit(train_data, 2. * train_labels[:, -1] - 1.)
            print('Finishing feature selection')

            for i, n_features in enumerate([10, 50, 100, 150, 200]):
                n_accuracies = []
                n_svc_accuracies = []
                n_BAs = []
                n_svc_BAs = []
                n_mAPs = []
                n_svc_mAPs = []
                n_train_accuracies = []
                print('n_features : ', n_features)

                fs_class.n_features_to_select = n_features
                svc_train_data = fs_class.transform(train_data)
                svc_test_data = fs_class.transform(test_data)

                norm = normalization_func()
                svc_train_data_norm = norm.fit_transform(svc_train_data)
                svc_test_data_norm = norm.transform(svc_test_data)

                bestcv = -1
                bestc = None
                bestSolver = None
                for s in [0, 1, 2, 3]:
                    for my_c in [0.001, 0.01, 0.1, 0.5, 1.0, 1.4, 1.5, 1.6, 2.0, 2.5, 5.0, 25.0, 50.0, 100.0]:
                        cmd = '-v 5 -s ' + str(s) + ' -c ' + str(my_c) + ' -q'
                        cv = liblinearutil.train((2 * train_labels[:, -1] - 1).tolist(), svc_train_data_norm.tolist(), cmd)
                        if cv > bestcv:
                            bestcv = cv
                            bestc = my_c
                            bestSolver = s
                svc_kwargs['C'] = bestc
                svc_kwargs['solver'] = bestSolver
                print('Best -> C:', bestc, ', s:', bestSolver, ', acc:', bestcv)

                for r in range(reps):
                    model = train_SVC(svc_train_data_norm, train_labels, svc_kwargs)
                    _, accuracy, test_pred = liblinearutil.predict(
                        (2 * test_labels[:, -1] - 1).tolist(), svc_test_data_norm.tolist(), model, '-q'
                    )
                    test_pred = np.asarray(test_pred)
                    n_svc_accuracies.append(accuracy[0])
                    n_svc_BAs.append(balance_accuracy(test_labels, test_pred))
                    n_svc_mAPs.append(average_precision_score(test_labels[:, -1], test_pred))
                    del model
                    model = train_Keras(svc_train_data, train_labels, svc_test_data, test_labels, model_kwargs)
                    train_data_norm = model.normalization.transform(svc_train_data)
                    test_data_norm = model.normalization.transform(svc_test_data)
                    test_pred = model.predict(test_data_norm)
                    n_BAs.append(balance_accuracy(test_labels, test_pred))
                    n_mAPs.append(average_precision_score(test_labels[:, -1], test_pred))
                    n_accuracies.append(model.evaluate(test_data_norm, test_labels, verbose=0)[-1])
                    n_train_accuracies.append(model.evaluate(train_data_norm, train_labels, verbose=0)[-1])
                    del model
                    K.clear_session()
                    print(
                        'n_features : ', n_features,
                        ', acc : ', n_accuracies[-1],
                        ', BA : ', n_BAs[-1],
                        ', mAP : ', n_mAPs[-1],
                        ', train_acc : ', n_train_accuracies[-1],
                        ', svc_acc : ', n_svc_accuracies[-1],
                        ', svc_BA : ', n_svc_BAs[-1],
                        ', svc_mAP : ', n_svc_mAPs[-1],
                    )
                if i >= len(accuracies):
                    accuracies.append(n_accuracies)
                    svc_accuracies.append(n_svc_accuracies)
                    BAs.append(n_BAs)
                    mAPs.append(n_mAPs)
                    svc_BAs.append(n_svc_BAs)
                    svc_mAPs.append(n_svc_mAPs)
                    nfeats.append(n_features)
                    mus.append(model_kwargs['mu'])
                else:
                    accuracies[i] += n_accuracies
                    svc_accuracies[i] += n_svc_accuracies
                    BAs[i] += n_BAs
                    mAPs[i] += n_mAPs
                    svc_BAs[i] += n_svc_BAs
                    svc_mAPs[i] += n_svc_mAPs


        output_filename = directory + 'LinearSVC_' + fs_method.__name__ + '.json'

        if not os.path.isdir(directory):
            os.makedirs(directory)

        info_data = {
            'reps': reps,
            'classification': {
                'mus': mus,
                'n_features': nfeats,
                'accuracy': accuracies,
                'mean_accuracy': np.array(accuracies).mean(axis=1).tolist(),
                'svc_accuracy': svc_accuracies,
                'mean_svc_accuracy': np.array(svc_accuracies).mean(axis=1).tolist(),
                'BA': BAs,
                'mean_BA': np.array(BAs).mean(axis=1).tolist(),
                'mAP': mAPs,
                'mean_mAP': np.array(mAPs).mean(axis=1).tolist(),
                'svc_BA': svc_BAs,
                'svc_mean_BA': np.array(svc_BAs).mean(axis=1).tolist(),
                'svc_mAP': svc_mAPs,
                'svc_mean_mAP': np.array(svc_mAPs).mean(axis=1).tolist(),
            }
        }

        for k, v in info_data['classification'].items():
            if 'mean' in k:
                print(k, v)

        with open(output_filename, 'w') as outfile:
            json.dump(info_data, outfile)
Example #26
0
         sbsMinIdx = np.random.choice(minIdx,
                                      int(majSize * imbRatio),
                                      replace=False)
     minDel = np.setdiff1d(minIdx, sbsMinIdx)
     if len(minDel) > 0:
         imbY = np.delete(imbY, minDel)
         imbX = np.delete(imbX, minDel, axis=0)
         print("to size: " + str(np.sum(imbY == c)))
 techNames = []
 imbSizesTxt = []
 for techType, a in [['none', 0], ['WeightedBase', 0], ['SMOTE', 0],
                     ['mixup', 0.1], ['remix', 0.1]]:
     techNames = np.append(techNames, techType + str(a))
     imbSizesTxt = np.append(imbSizesTxt, str(imbRatio))
     rskf = RepeatedStratifiedKFold(n_splits=2,
                                    n_repeats=10,
                                    random_state=36851234)
     tmpGm = np.array([])
     tmpFm = np.array([])
     tmpBa = np.array([])
     tmpBp = np.array([])
     tmpBmc = np.array([])
     tmpBb = np.array([])
     for train_index, test_index in rskf.split(imbX, imbY):
         X_train, X_test = imbX[train_index, :], imbX[test_index, :]
         y_train, y_test = imbY[train_index], imbY[test_index]
         scaler = StandardScaler()
         X_train = scaler.fit_transform(X_train)
         X_test = scaler.transform(X_test)
         X_train = np.clip(X_train, -5, 5)
         X_test = np.clip(X_test, -5, 5)
Example #27
0
def runBS(input, output, classifier, oDim, lenTrainer):

    inputClass = input[:lenTrainer, :]
    inputValid = input[lenTrainer:, :]

    outputClass = output[:lenTrainer]
    outputValid = output[lenTrainer:]

    decision = []
    success = 0

    probs = np.zeros((len(outputClass), oDim))
    nt = np.zeros(len(outputClass))
    #preds = [[]]*len(outputClass)

    random_state = np.random.RandomState(0)
    rskf = RepeatedStratifiedKFold(n_splits=2,
                                   n_repeats=np.power(2, 10),
                                   random_state=random_state)

    for train, test in rskf.split(inputClass, outputClass):

        random_state = np.random.RandomState(0)
        clf = None

        if classifier['type'] == 'svm':
            clf = svm.SVC(kernel=classifier['kernel'],
                          degree=classifier['degree'],
                          probability=True,
                          random_state=random_state)
        elif classifier['type'] == 'lda':
            clf = LDA(solver="svd", store_covariance=True)
        elif classifier['type'] == 'knn':
            clf = KNeighborsClassifier(5)

        probas_ = clf.fit(inputClass[train],
                          outputClass[train]).predict_proba(inputClass[test])
        #pred_ = clf.fit(inputClass[train], outputClass[train]).predict(inputClass[test])
        '''
        print(probas_)
        print(pred_)
        print(outputClass[train])
        '''
        for ip, it in enumerate(test):
            probs[it, :] += probas_[ip, :]
            #preds[it].append(pred_[ip])
            nt[it] += 1.0

    # Validation part
    clf_v = None
    if classifier['type'] == 'svm':
        clf_v = svm.SVC(kernel=classifier['kernel'],
                        degree=classifier['degree'],
                        probability=True,
                        random_state=random_state)
    elif classifier['type'] == 'lda':
        clf_v = LDA(solver="svd", store_covariance=True)
    elif classifier['type'] == 'knn':
        clf_v = KNeighborsClassifier(5)

    probas_v = clf_v.fit(inputClass, outputClass).predict_proba(inputValid)
    #pred_v = clf_v.fit(inputClass, outputClass).predict(inputValid)

    prob_v = []
    pred_v = []
    for ipv in probas_v:
        prob_v.append(max(ipv))
        pred_v.append(np.argmax(ipv))
    prob_v = np.array(prob_v)
    pred_v = np.array(pred_v)
    '''
    prob_t = []; pred_t = []
    for ie in range(len(nt)):
        med = probs[it,:]/nt[it]
        prob_t.append(max(med))
        (values, counts) = np.unique(preds[it], return_counts=True)
        #pred_t.append(values[np.argmax(counts)])
        pred_t.append(np.argmax(med))
    prob_t = np.array(prob_t)
    pred_t = np.array(pred_t)
    '''
    score_t = np.zeros((len(outputClass), oDim))
    for ip in range(len(outputClass)):
        score_t[ip, :] = probs[ip, :] / nt[ip]

    pred_t = []
    prob_t = []
    for il in score_t:
        posLab = np.argmax(il)
        prob_t.append(max(il))
        if posLab == 0:
            pred_t.append(0)
        elif posLab == 1:
            pred_t.append(1)
        else:
            pred_t.append(2)
    pred_t = np.array(pred_t)
    prob_t = np.array(prob_t)

    return pred_t, prob_t, pred_v, prob_v
Example #28
0
# Define models and parameters
model = RandomForestClassifier(random_state=0,
                               n_jobs=-1,
                               class_weight='balanced_subsample')
n_estimators = [400, 600, 700, 1000]
max_features = [2, 3]
max_depth = [4, 5]

# Define grid search
grid = dict(n_estimators=n_estimators,
            max_features=max_features,
            max_depth=max_depth)

# 3 - Define how the search will run: stratified to preserve the two classes, splitting into 5 folds and repeating 6 random times
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=6, random_state=0)

# Create model with grid and CV structure
grid_search = GridSearchCV(estimator=model,
                           param_grid=grid,
                           n_jobs=-1,
                           cv=cv,
                           error_score=0,
                           verbose=2)

# 4 - Find hyperparameters
grid_result = grid_search.fit(X_train, y_train.values.ravel())

# Summarize results ################################
print("!! Best: %f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
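Per-candidate scores can also be pulled from cv_results_ (a short sketch):
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
for mean_, std_, params in zip(means, stds, grid_result.cv_results_['params']):
    print("%f (%f) with: %r" % (mean_, std_, params))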
Example #29
0
def main():
    #if dataset is not provided on call terminate
    if len(sys.argv) < 3:
        print(
            "usage: python classifier_metrics.py <train_data_file> <test_data_file> "
        )
        sys.exit()

    #pass dataset and get the matrix containing the data vectors and data targets
    ret_value = data_preprocessing(sys.argv[1])
    data_matrix = ret_value[0]
    category_labels = ret_value[1]

    #create stratified k_fold iterator to calculate metrics
    k_fold = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
    sk_fold = StratifiedKFold(n_splits=10)
    metrics = [
        'accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'
    ]

    #create RandomForest classifier and calculate metrics
    rf_clf = RandomForestClassifier(n_jobs=-1)
    rf_result = cross_validate(rf_clf,
                               data_matrix,
                               category_labels,
                               cv=k_fold,
                               scoring=metrics,
                               return_train_score=False,
                               n_jobs=-1)
    print "RANDOM FOREST:"
    for key, value in rf_result.iteritems():
        print key + " : " + str(np.round_(np.mean(value), decimals=5))
    print("\n")

    #create MNB classifier and calculate metrics
    #scale data matrix to positive values because MNB does not accept negative values
    #increasing scaling range increases accuracy up until scale is around 10
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 10), copy=True)
    scaled_data_matrix = scaler.fit_transform(data_matrix)
    mnb_clf = MultinomialNB()
    mnb_result = cross_validate(mnb_clf,
                                scaled_data_matrix,
                                category_labels,
                                cv=k_fold,
                                scoring=metrics,
                                return_train_score=False,
                                n_jobs=-1)
    print "MULTINOMIAL NAIVE BAYES:"
    for key, value in mnb_result.iteritems():
        print key + " : " + str(np.round_(np.mean(value), decimals=5))
    print("\n")

    #load hyperparameters for svc classifier from file hyperparameter_values.py
    kernel_hp = HYPERPARAMETER_VALUES['kernel']
    c_hp = HYPERPARAMETER_VALUES['C']
    gamma_hp = 'auto'
    if kernel_hp == 'rbf':
        gamma_hp = HYPERPARAMETER_VALUES['gamma']
    #create svc classifier and calculate metrics
    svc_clf = SVC(kernel=kernel_hp, C=c_hp, gamma=gamma_hp)
    svc_result = cross_validate(svc_clf,
                                data_matrix,
                                category_labels,
                                cv=k_fold,
                                scoring=metrics,
                                return_train_score=False,
                                n_jobs=-1)
    print "svm.SVC (kernel=" + kernel_hp + " ,C=" + str(
        c_hp) + ", gamma=" + str(gamma_hp) + ")"
    for key, value in svc_result.iteritems():
        print key + " : " + str(np.round_(np.mean(value), decimals=5))
    print("\n")

    #create KNN(my implementation) classifier and calculate metrics
    knn_clf = MyKNN(k=10)
    knn_result = cross_validate(knn_clf,
                                data_matrix,
                                category_labels,
                                cv=sk_fold,
                                scoring=metrics,
                                return_train_score=False)
    print "My implementation of KNN(brute force):"
    for key, value in knn_result.iteritems():
        print key + " : " + str(np.round_(np.mean(value), decimals=5))
    print("\n")

    #Beat the benchmark
    TITLE_WEIGHT = 5
    #preprocess the data differently to achieve better score
    btb_ret_value = btb_data_preprocessing(sys.argv[1],
                                           title_weight=TITLE_WEIGHT,
                                           n_comp=250,
                                           ret_vectorizers=True)
    btb_data_matrix = btb_ret_value[0]
    btb_category_labels = btb_ret_value[1]

    #i chose svc because it was the better scoring classifier
    #calculate metrics
    btb_clf = SVC(kernel=kernel_hp,
                  C=c_hp,
                  gamma=gamma_hp,
                  class_weight='balanced',
                  probability=False)
    btb_result = cross_validate(btb_clf,
                                btb_data_matrix,
                                btb_category_labels,
                                cv=k_fold,
                                scoring=metrics,
                                return_train_score=False,
                                n_jobs=-1)
    print "(Beat the benchmark)svm.SVC (kernel=" + kernel_hp + " ,C=" + str(
        c_hp) + ", gamma=" + str(gamma_hp) + ")"
    for key, value in btb_result.iteritems():
        print key + " : " + str(np.round_(np.mean(value), decimals=5))
    print("\n")

    #train the classifier with the train data
    #cross_validate() does not train the classifier object passed to it but a copy of it
    btb_clf.fit(btb_data_matrix, btb_category_labels)  #refit

    #get the vectorizers and transformers used to fit and trasform the train data
    count_vectorizer = btb_ret_value[2]
    tfidf_transformer = btb_ret_value[3]
    svd = btb_ret_value[4]
    le = btb_ret_value[5]
    #read test data and trasform them using the above vectorizers/transformers
    test_data = pd.read_csv(sys.argv[2], sep="\t")
    test_redu_matrix = test_data_transformation(test_data, count_vectorizer,
                                                tfidf_transformer, svd,
                                                TITLE_WEIGHT)

    #do the class predictions for the test data
    test_category_pred = btb_clf.predict(test_redu_matrix)

    #store predictions to file
    create_pred_file(test_data, test_category_pred, le)

    #store metrics to file
    create_eval_file(mnb_result, rf_result, svc_result, knn_result, btb_result,
                     metrics)
Example #30
0
def train_job(train_cfg, train_dmatrix, val_dmatrix, train_val_dmatrix,
              model_dir, checkpoint_dir, is_master):
    """Train and save XGBoost model using data on current node.

    If doing distributed training, XGBoost will use rabit to sync the trained model between each boosting iteration.
    Trained model is only saved if 'is_master' is True.

    :param train_cfg: Training hyperparameter configurations
    :param train_dmatrix: Training Data Matrix
    :param val_dmatrix: Validation Data Matrix
    :param train_val_dmatrix: Training + Validation Data Matrix
    :param model_dir: Directory where model will be saved
    :param checkpoint_dir: Directory where intermediate checkpoints are saved
    :param is_master: True if single node training, or the current node is the master node in distributed training.
    """
    # Parse arguments for train() API
    num_round = train_cfg.pop("num_round")
    # Parse arguments for intermediate model callback
    save_model_on_termination = train_cfg.pop('save_model_on_termination',
                                              "false")

    # Evaluation metrics to use with train() API
    tuning_objective_metric_param = train_cfg.pop("_tuning_objective_metric",
                                                  None)
    eval_metric = train_cfg.get("eval_metric")
    cleaned_eval_metric, configured_feval, tuning_objective_metric = train_utils.get_eval_metrics_and_feval(
        tuning_objective_metric_param, eval_metric)
    if cleaned_eval_metric:
        train_cfg['eval_metric'] = cleaned_eval_metric
    else:
        train_cfg.pop('eval_metric', None)

    early_stopping_rounds = train_cfg.pop('early_stopping_rounds', None)
    early_stopping_data_name = 'validation' if val_dmatrix else None
    early_stopping_metric = None
    if early_stopping_rounds:
        if tuning_objective_metric:
            early_stopping_metric = tuning_objective_metric[-1]
        elif eval_metric:
            early_stopping_metric = eval_metric[-1]

    logging.info("Train matrix has {} rows and {} columns".format(
        train_dmatrix.num_row(), train_dmatrix.num_col()))
    if val_dmatrix:
        logging.info("Validation matrix has {} rows".format(
            val_dmatrix.num_row()))

    try:
        kfold = train_cfg.pop("_kfold", None)

        if kfold is None:
            xgb_model, iteration, callbacks, watchlist = get_callbacks_watchlist(
                train_dmatrix=train_dmatrix,
                val_dmatrix=val_dmatrix,
                model_dir=model_dir,
                checkpoint_dir=checkpoint_dir,
                early_stopping_data_name=early_stopping_data_name,
                early_stopping_metric=early_stopping_metric,
                early_stopping_rounds=early_stopping_rounds,
                save_model_on_termination=save_model_on_termination,
                is_master=is_master)
            add_debugging(callbacks=callbacks,
                          hyperparameters=train_cfg,
                          train_dmatrix=train_dmatrix,
                          val_dmatrix=val_dmatrix)

            bst = xgb.train(train_cfg,
                            train_dmatrix,
                            num_boost_round=num_round - iteration,
                            evals=watchlist,
                            feval=configured_feval,
                            callbacks=callbacks,
                            xgb_model=xgb_model,
                            verbose_eval=False)

        else:
            num_cv_round = train_cfg.pop("_num_cv_round", 1)
            logging.info(
                "Run {}-round of {}-fold cross validation with {} rows".format(
                    num_cv_round, kfold, train_val_dmatrix.num_row()))

            bst = []
            evals_results = []

            num_class = train_cfg.get("num_class", None)
            objective = train_cfg.get("objective", None)
            # RepeatedStratifiedKFold expects X as array-like of shape (n_samples, n_features)
            X = range(train_val_dmatrix.num_row())
            # objective may be absent, so guard before calling startswith()
            y = train_val_dmatrix.get_label() if num_class or (
                objective and objective.startswith("binary:")) else None
            rkf = RepeatedStratifiedKFold(n_splits=kfold, n_repeats=num_cv_round) if y is not None \
                else RepeatedKFold(n_splits=kfold, n_repeats=num_cv_round)

            for train_index, val_index in rkf.split(X=X, y=y):
                cv_train_dmatrix = train_val_dmatrix.slice(train_index)
                cv_val_dmatrix = train_val_dmatrix.slice(val_index)

                xgb_model, iteration, callbacks, watchlist = get_callbacks_watchlist(
                    train_dmatrix=cv_train_dmatrix,
                    val_dmatrix=cv_val_dmatrix,
                    model_dir=model_dir,
                    checkpoint_dir=checkpoint_dir,
                    early_stopping_data_name=early_stopping_data_name,
                    early_stopping_metric=early_stopping_metric,
                    early_stopping_rounds=early_stopping_rounds,
                    save_model_on_termination=save_model_on_termination,
                    is_master=is_master,
                    fold=len(bst))
                add_debugging(callbacks=callbacks,
                              hyperparameters=train_cfg,
                              train_dmatrix=cv_train_dmatrix,
                              val_dmatrix=cv_val_dmatrix)

                evals_result = {}
                logging.info(
                    "Train cross validation fold {}".format((len(bst) %
                                                             kfold) + 1))
                booster = xgb.train(train_cfg,
                                    cv_train_dmatrix,
                                    num_boost_round=num_round - iteration,
                                    evals=watchlist,
                                    feval=configured_feval,
                                    evals_result=evals_result,
                                    callbacks=callbacks,
                                    xgb_model=xgb_model,
                                    verbose_eval=False)
                bst.append(booster)
                evals_results.append(evals_result)

                if len(bst) % kfold == 0:
                    logging.info(
                        "The metrics of round {} cross validation".format(
                            int(len(bst) / kfold)))
                    print_cv_metric(num_round, evals_results[-kfold:])

            if num_cv_round > 1:
                logging.info(
                    "The overall metrics of {}-round cross validation".format(
                        num_cv_round))
                print_cv_metric(num_round, evals_results)
    except Exception as e:
        for customer_error_message in CUSTOMER_ERRORS:
            if customer_error_message in str(e):
                raise exc.UserError(str(e))

        exception_prefix = "XGB train call failed with exception"
        raise exc.AlgorithmError("{}:\n {}".format(exception_prefix, str(e)))

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    if is_master:
        if type(bst) is not list:
            model_location = os.path.join(model_dir, MODEL_NAME)
            bst.save_model(model_location)
            logging.debug("Stored trained model at {}".format(model_location))
        else:
            for fold in range(len(bst)):
                model_location = os.path.join(model_dir,
                                              f"{MODEL_NAME}-{fold}")
                bst[fold].save_model(model_location)
                logging.debug("Stored trained model {} at {}".format(
                    fold, model_location))
Example #31
0
traindataall = datapreparing['traindataall']
testdataall = datapreparing['testdataall']
WHO_train = traindataall['WHO']
WHO_test = testdataall['WHO']
print("Preparing data done.")

ALLgroup_bestmodscore = []
ALLgroup_bestmodmeanscore = []
ALLgroupresults = {}
for key in select_features.keys():
    val = select_features[key]
    select_features_mod = val
    select_U_features_mod = list(val.values())[0]
    select_U_data_mod = list(val.values())[1]
    select_U_data_test_mod = list(val.values())[2]
    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0)

    k = 0
    ALLmodscores = []
    ALLmod = {}
    LRscores = []
    SVMscores = []
    KNNscores = []
    NBscores = []
    RFscores = []
    Stackscores = []
    Lassoscores = []

    param_grid_svm = {
        'kernel': ['linear', 'rbf'],
        'C': [0.1, 1, 10, 50],
Example #32
0
# Create a ColumnTransformer for the StandardScaler
scaler = ColumnTransformer([
    ('scaler_media', scaler_media, slice(0, 8)),
    ('scaler_moda', scaler_moda, slice(8, len(X.columns)))
])

# Create the Pipeline combining the ColumnTransformer and the classifier
pipeline = Pipeline([
    ('imputer', imputer),
    ('scaler', scaler),
    ('svm', SVC(random_state=random_state, class_weight=class_weight))
])

# Inner CV (2-fold, 5-repeat stratified GridSearchCV to find the best parameters)
rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=random_state)  # inner
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=SCORING, cv=rskf)

# # Outer CV (stratified 5-fold cross-validation to estimate Accuracy)
# scores = cross_validate(estimator=grid_search, X=X, y=y, cv=5, error_score='raise', return_estimator=True, scoring=SCORING)  # outer
# print('Scores: {}' .format(scores['test_score']))
# print('Mean score: {}' .format(np.mean(scores['test_score'])))
#
# # Create a dummy classifier and also evaluate it with cross-validation (CV=5) for more realistic results
# dummy_clf = DummyClassifier(strategy='most_frequent', random_state=random_state)
# dummy_scores = cross_validate(estimator=dummy_clf, X=X, y=y, cv=5, error_score='raise', return_estimator=True, scoring=SCORING)
# print('Dummy scores: {}' .format(dummy_scores['test_score']))
# print('Dummy mean score: {}' .format(np.mean(dummy_scores['test_score'])))

# Confusion matrix
results = cross_val_predict(grid_search, X, y, cv=5)
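The out-of-fold predictions from cross_val_predict can then feed the confusion matrix directly (a sketch):
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, results))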
Example #33
0
    'feature_fraction': 0.02,
    'learning_rate': 0.001,
    'max_depth': 6,
    'metric':'auc',
    'min_data_in_leaf': 100,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'n_jobs': 30,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': -1
    }

result=np.zeros(test.shape[0])

rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=10)
best_iteration, best_valid_auc = 0, 0
for counter, (train_index, valid_index) in enumerate(rskf.split(train, train.target), 1):
    print("Rep-Fold:", counter)
    sys.stdout.flush()
    #Train data
    t=train.iloc[train_index]
    trn_data = lgb.Dataset(t.drop("target",axis=1), label=t.target)
    #Validation data
    v=train.iloc[valid_index]
    val_data = lgb.Dataset(v.drop("target",axis=1), label=v.target)
    #Training
    model = lgb.train(param, trn_data, 1000000, feature_name=train.columns.tolist()[1:], valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 4000)
    result +=model.predict(test)
    ## feat imp
    gain = model.feature_importance('gain')