# Imports assumed by the snippets below (the original file headers are not shown):
import time
import datetime

import numpy as np
import pandas as pd
from sklearn.metrics import (accuracy_score, auc, f1_score, precision_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import RepeatedKFold
from sklearn.utils import resample
from statsmodels.stats.contingency_tables import mcnemar

# Project-internal helpers used below (exact import paths depend on the package
# layout): du (ROC utilities), dv (plotting/LaTeX tables), mm (model factory),
# data_preparation, genetic_selection and mcnemar_table.


def score_value(self, Y_test, y_pred, model_type, score_type):
    '''Compute the requested score for the given model type'''
    Y_test = Y_test.astype(float).values  # cast so Y_test matches the dtype of y_pred
    if score_type == 'auc' and model_type == 'absv':
        TPR, FPR = du.roc_curve_adaboost(y_pred, Y_test)
        score_value = auc(FPR, TPR)
    elif score_type == 'auc' and model_type != 'absv':
        score_value = roc_auc_score(Y_test, y_pred)
    elif score_type == 'acc':
        score_value = accuracy_score(Y_test, y_pred)
    elif score_type == 'prec':
        score_value = precision_score(Y_test, y_pred)
    elif score_type == 'f1':
        score_value = f1_score(Y_test, y_pred)
    elif score_type == 'rec':
        score_value = recall_score(Y_test, y_pred)
    elif score_type == 'gmean':
        score_value = np.sqrt(
            precision_score(Y_test, y_pred) * recall_score(Y_test, y_pred))
    else:
        raise ValueError('Unknown score_type: {}'.format(score_type))
    return score_value
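
# Usage sketch (hypothetical names): `ev` is an instance of the class that owns
# score_value and `clf` is any fitted binary classifier with a predict method:
#   auc_val = ev.score_value(Y_test, clf.predict(X_test),
#                            model_type='prob', score_type='auc')
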
                                my_gamma_end=100,
                                myKernel='rbf',
                                myDegree=1,
                                myCoef0=+1)  # "trad-rbf-NOTdiv"

        start = datetime.datetime.now()
        model.fit(X_train, Y_train)
        end = datetime.datetime.now()
        elapsed_time = pd.DataFrame({"Elapsed time": [end - start]})

        elapsed_time.to_csv("output/" + name + "/" + "AdaBoostSVM_time.csv",
                            index=False)
        y_preda = model.predict(X_test)
        print("Final test accuracy:   ", accuracy_score(Y_test, y_preda))
        y_thresholds = model.decision_thresholds(X_test, glob_dec=True)
        TPR, FPR = du.roc_curve_adaboost(y_thresholds, Y_test)

        prec = precision_score(Y_test, y_preda)
        print("Final test precision:   ", prec)

        area = auc(FPR, TPR)
        print("Final test AUC:   ", area)

        nWeaks = len(model.alphas)  # number of weak classifiers, shown on the plot
        dv.plot_roc_curve(TPR,
                          FPR,
                          name,
                          "sorted",
                          glob_local=True,
                          name="nom",
                          kernel=myKernel,
def bootstrap(sample_name,
              model,
              roc_area,
              selection,
              GA_mut=0.3,
              GA_score='',
              GA_selec='',
              GA_coef=0.5,
              n_cycles=1,
              split_frac=0.6,
              path='.'):
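    '''Bootstrap-resample `sample_name` n_cycles times, train `model` on each
    resample and collect per-cycle AUC, precision, F1, recall, accuracy,
    g-mean, wall-clock time, number of base classifiers and training size.
    `roc_area` selects how the AUC is computed ("absv", "prob" or "deci");
    `selection` picks traditional ("trad") or genetic ("gene") training-set
    selection.'''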

    # fetch data_frame without preparation
    data = data_preparation(path)
    sample_df_temp = data.fetch_data(sample_name)
    train_test = isinstance(sample_df_temp, tuple)  # is the data already split into train and test?
    if not train_test:
        sample_df = sample_df_temp
    else:
        sample_train_df, sample_test_df = sample_df_temp

    area_scores, prec_scores, f1_scores, recall_scores, acc_scores, \
        gmean_scores, time_scores = [], [], [], [], [], [], []
    n_class_scores, n_train_scores = [], []

    if not train_test:  # sample_df only exists when a single data set was provided
        data_size = sample_df.shape[0]
        n_samples = int(split_frac * data_size)

    # bootstrap score calculations
    for i_sample in range(1, n_cycles + 1):  # arbitrary number of bootstrap samples to produce
        start = time.time()

        if not train_test:  # train_test == True means we're given separate files for train and test
            sampled_data_train = resample(sample_df,
                                          replace=True,
                                          n_samples=n_samples,
                                          random_state=i_sample)

            if selection == 'trad':
                # the test data is the complement of the bootstrap draw: rows of
                # the full input sample not used for training
                sampled_train_no_dup = sampled_data_train.drop_duplicates(
                    keep=False)
                sampled_data_test = pd.concat(
                    [sample_df,
                     sampled_train_no_dup]).drop_duplicates(keep=False)

                X_train, Y_train = data.dataset(sample_name=sample_name,
                                                data_set=sampled_data_train,
                                                sampling=True,
                                                split_sample=0.0)
                X_test, Y_test = data.dataset(sample_name=sample_name,
                                              data_set=sampled_data_test,
                                              sampling=True,
                                              split_sample=0.0)
            elif selection == 'gene':  # genetic selection
                train_indexes = sampled_data_train.index
                X, Y = data.dataset(sample_name=sample_name,
                                    data_set=sample_df,
                                    sampling=True)
                X_train, X_test, Y_train, Y_test = data.indexes_split(
                    X, Y, split_indexes=train_indexes, train_test=train_test)
                print(len(X_train.index), len(Y_train.index),
                      'X_train, Y_train sizes')

                GA_selection = genetic_selection(
                    model,
                    roc_area,
                    X_train,
                    Y_train,
                    X_test,
                    Y_test,
                    pop_size=10,
                    chrom_len=int(len(Y_train.index) * 0.20),
                    n_gen=50,
                    coef=GA_coef,
                    mut_rate=GA_mut,
                    score_type=GA_score,
                    selec_type=GA_selec)
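                # execute() runs the genetic algorithm; best_population() then
                # returns the selected training indexes used to rebuild the split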
                GA_selection.execute()
                GA_train_indexes = GA_selection.best_population()
                X_train, Y_train, X_test, Y_test = data.dataset(
                    sample_name=sample_name, indexes=GA_train_indexes)
        else:
            sampled_data_train = resample(sample_train_df,
                                          replace=True,
                                          n_samples=5000,
                                          random_state=None)
            sampled_data_test = resample(sample_test_df,
                                         replace=True,
                                         n_samples=10000,
                                         random_state=None)
            X_train, Y_train, X_test, Y_test = data.dataset(
                sample_name=sample_name,
                data_set='',
                data_train=sampled_data_train,
                data_test=sampled_data_test,
                sampling=True,
                split_sample=0.4)
        model.fit(X_train, Y_train)
        n_base_class = 0
        if (model.n_classifiers != 0):
            y_pred = model.predict(X_test)
            prec = precision_score(Y_test, y_pred)
            f1 = f1_score(Y_test, y_pred)
            recall = recall_score(Y_test, y_pred)
            acc = accuracy_score(Y_test, y_pred)
            gmean = np.sqrt(prec * recall)
            n_base_class = model.n_classifiers
            # calculate the ROC AUC according to how the model exposes scores:
            # "absv" uses AdaBoost-SVM decision thresholds, "prob" uses
            # predict_proba and "deci" uses decision_function
            if roc_area == "absv":
                y_thresholds = model.decision_thresholds(X_test, glob_dec=True)
                TPR, FPR = du.roc_curve_adaboost(y_thresholds, Y_test)
                area = auc(FPR, TPR)
                model.clean()
            elif roc_area == "prob":
                Y_pred_prob = model.predict_proba(X_test)[:, 1]
                area = roc_auc_score(Y_test, Y_pred_prob)
            elif roc_area == "deci":
                Y_pred_dec = model.decision_function(X_test)
                area = roc_auc_score(Y_test, Y_pred_dec)

            end = time.time()
            time_scores = np.append(time_scores, end - start)
            area_scores = np.append(area_scores, area)
            prec_scores = np.append(prec_scores, prec)
            f1_scores = np.append(f1_scores, f1)
            recall_scores = np.append(recall_scores, recall)
            acc_scores = np.append(acc_scores, acc)
            gmean_scores = np.append(gmean_scores, gmean)
            n_class_scores = np.append(n_class_scores, n_base_class)
            n_train_scores = np.append(n_train_scores, len(X_train))
        else:  # this needs to be re-checked carefully
            end = time.time()
            time_scores = np.append(time_scores, end - start)
            area_scores = np.append(area_scores, 0)
            prec_scores = np.append(prec_scores, 0)
            f1_scores = np.append(f1_scores, 0)
            recall_scores = np.append(recall_scores, 0)
            acc_scores = np.append(acc_scores, 0)
            gmean_scores = np.append(gmean_scores, 0)
            n_class_scores = np.append(n_class_scores, 0)
            n_train_scores = np.append(n_train_scores, len(X_train))

    return area_scores, prec_scores, f1_scores, recall_scores, acc_scores, gmean_scores, time_scores, n_class_scores, n_train_scores
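
# Usage sketch (placeholder sample name; mm.adaboost_svm as used in mcnemar_test below):
#   scores = bootstrap('my_sample', mm.adaboost_svm(False), roc_area='absv',
#                      selection='trad', n_cycles=10, split_frac=0.6)
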
def mcnemar_test(sample_name,
                 selection='gene',
                 model='no_div',
                 train_test=False,
                 GA_score='',
                 GA_selec=''):
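    '''Train a reference AdaBoost-SVM (model1) and compare it against the
    models returned by mm.model_loader using McNemar's test; p-values,
    statistics and scores are written to a LaTeX table via dv.latex_table_mcnemar.'''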

    if model == 'diverse':
        model1 = mm.adaboost_svm(True)
    elif model == 'no_div':
        model1 = mm.adaboost_svm(False)
    else:
        raise ValueError('Unsupported model option: ' + str(model))

    # fetch data
    data = data_preparation()
    X_train, Y_train, X_test, Y_test = data.dataset(sample_name=sample_name,
                                                    sampling=False,
                                                    split_sample=0.4)
    if selection == 'gene':
        GA_selection = genetic_selection(model1,
                                         "absv",
                                         X_train,
                                         Y_train,
                                         X_test,
                                         Y_test,
                                         pop_size=10,
                                         chrom_len=50,
                                         n_gen=50,
                                         coef=0.5,
                                         mut_rate=0.3,
                                         score_type=GA_score,
                                         selec_type=GA_selec)
        GA_selection.execute()
        GA_train_indexes = GA_selection.best_population()
        X_train, Y_train, X_test, Y_test = data.dataset(
            sample_name=sample_name, indexes=GA_train_indexes)

    # train the model we are analyzing
    model1.fit(X_train, Y_train)
    y_pred1 = model1.predict(X_test)
    prec1 = precision_score(Y_test, y_pred1)
    f1_1 = f1_score(Y_test, y_pred1)
    recall1 = recall_score(Y_test, y_pred1)
    acc1 = accuracy_score(Y_test, y_pred1)
    gmean1 = np.sqrt(prec1 * recall1)
    y_thresholds = model1.decision_thresholds(X_test, glob_dec=True)
    TPR, FPR = du.roc_curve_adaboost(y_thresholds, Y_test)
    area1 = auc(FPR, TPR)
    model1.clean()

    p_values, stats, rejects, areas2, precs2, f1_s2, recalls2, accs2, gmeans2 = (
        []), ([]), ([]), ([]), ([]), ([]), ([]), ([]), ([])
    names = []
    # instantiate and train the models to compare against; each entry also
    # carries its AUC-calculation method and its name
    model_auc2 = mm.model_loader(model, sample_name)
    for i in range(len(model_auc2)):
        model_auc2[i][0].fit(X_train, Y_train)
        y_pred2 = model_auc2[i][0].predict(X_test)
        prec2 = precision_score(Y_test, y_pred2)
        f1_2 = f1_score(Y_test, y_pred2)
        recall2 = recall_score(Y_test, y_pred2)
        acc2 = accuracy_score(Y_test, y_pred2)
        gmean2 = np.sqrt(prec2 * recall2)

        if model_auc2[i][1] == "absv":
            y_thresholds_2 = model_auc2[i][0].decision_thresholds(
                X_test, glob_dec=True)
            TPR_2, FPR_2 = du.roc_curve_adaboost(y_thresholds_2, Y_test)
            area2 = auc(FPR_2, TPR_2)
        elif model_auc2[i][1] == "prob":
            Y_pred_prob = model_auc2[i][0].predict_proba(X_test)[:, 1]
            area2 = roc_auc_score(Y_test, Y_pred_prob)
        elif model_auc2[i][1] == "deci":
            Y_pred_dec = model_auc2[i][0].decision_function(X_test)
            area2 = roc_auc_score(Y_test, Y_pred_dec)

        contingency, corrected = mcnemar_table(y_pred1, y_pred2, Y_test)

        if corrected:
            result = mcnemar(contingency, exact=False, correction=True)
        else:
            result = mcnemar(contingency, exact=True)

        print('statistic=%.3f, p-value=%.3f' %
              (result.statistic, result.pvalue))
        alpha = 0.05
        if result.pvalue > alpha:
            reject_null = False
            print('Same proportions of errors, fail to reject H0')
        else:
            reject_null = True
            print('Different proportions of errors, reject H0')

        p_values = np.append(p_values, result.pvalue)
        stats = np.append(stats, result.statistic)
        rejects = np.append(rejects, reject_null)
        areas2 = np.append(areas2, area2)
        precs2 = np.append(precs2, prec2)
        f1_s2 = np.append(f1_s2, f1_2)
        recalls2 = np.append(recalls2, recall2)
        accs2 = np.append(accs2, acc2)
        gmeans2 = np.append(gmeans2, gmean2)

        names.append(model_auc2[i][2])

    with open('./tables/mcnemar_' + sample_name + '_' + model + '.tex',
              'w') as f_mcnemar:
        dv.latex_table_mcnemar(names, p_values, stats, rejects, areas2, precs2,
                               f1_s2, recalls2, accs2, gmeans2, area1, prec1,
                               f1_1, recall1, acc1, gmean1, f_mcnemar)
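
# Usage sketch (placeholder sample name; GA options left at their defaults):
#   mcnemar_test('my_sample', selection='gene', model='no_div')
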
def cross_validation(sample_name,
                     model,
                     roc_area,
                     selection,
                     GA_mut=0.25,
                     GA_score='',
                     GA_selec='',
                     GA_coef=0.5,
                     kfolds=1,
                     n_reps=1,
                     path='.'):
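    '''Evaluate `model` on `sample_name` with a (repeated) k-fold style split
    and collect AUC, precision, F1, recall, accuracy, g-mean, timing, number
    of base classifiers and training-size scores; selection='gene' picks the
    training subset with the genetic algorithm.'''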

    # fetch data_frame without preparation
    data = data_preparation(path)
    sample_df_temp = data.fetch_data(sample_name)
    train_test = isinstance(sample_df_temp, tuple)  # is the data already split into train and test?
    if not train_test:
        sample_df = sample_df_temp
    else:
        sample_train_df, sample_test_df = sample_df_temp

    area_scores, prec_scores, f1_scores, recall_scores, acc_scores, \
        gmean_scores, time_scores = [], [], [], [], [], [], []
    n_class_scores, n_train_scores = [], []

    X, Y = data.dataset(sample_name=sample_name,
                        data_set=sample_df,
                        sampling=True,
                        split_sample=0.0)

    from sklearn.model_selection import train_test_split
    # repeated k-fold cross validation: total number of folds = n_splits * n_repeats
    rkf = RepeatedKFold(
        n_splits=kfolds, n_repeats=n_reps,
        random_state=1)  # set random state=1 for reproducibility
    # NOTE: the rkf.split(X) loop is currently disabled; a single
    # train_test_split below stands in for the k-fold iteration
    for _ in range(1):  # for train_index, test_index in rkf.split(X):
        # X_train, X_test = X.loc[train_index], X.loc[test_index]
        # Y_train, Y_test = Y.loc[train_index], Y.loc[test_index]
        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.1,
                                                            random_state=1)
        start = time.time()

        # keep the chromosome size within the range [100, 1000]
        sample_chromn_len = int(len(Y_train) * 0.25)
        if sample_chromn_len > 1000:
            sample_chromn_len = 1000
        elif sample_chromn_len < 100:
            sample_chromn_len = 100

        if selection == 'gene':  # genetic selection
            GA_selection = genetic_selection(model,
                                             roc_area,
                                             X_train,
                                             Y_train,
                                             X_test,
                                             Y_test,
                                             pop_size=10,
                                             chrom_len=sample_chromn_len,
                                             n_gen=50,
                                             coef=GA_coef,
                                             mut_rate=GA_mut,
                                             score_type=GA_score,
                                             selec_type=GA_selec)
            GA_selection.execute()
            GA_train_indexes = GA_selection.best_population()
            X_train, Y_train, X_test, Y_test = data.dataset(
                sample_name=sample_name, indexes=GA_train_indexes)
            print(len(X_train), len(Y_test), len(GA_train_indexes),
                  'sizes of X_train, Y_test and GA indexes after selection')
            print(len(Y_train[Y_train == 1]),
                  'positive-class rows in Y_train after selection')

        model.fit(X_train, Y_train)
        n_base_class = 0
        no_zero_classifiers = True
        if roc_area == "absv":
            n_base_class = model.n_classifiers
            if n_base_class == 0:
                no_zero_classifiers = False

        if no_zero_classifiers:
            y_pred = model.predict(X_test)
            prec = precision_score(Y_test, y_pred)
            f1 = f1_score(Y_test, y_pred)
            recall = recall_score(Y_test, y_pred)
            acc = accuracy_score(Y_test, y_pred)
            gmean = np.sqrt(prec * recall)
            # calculate the ROC AUC depending on the classifier
            if roc_area == "absv":
                y_thresholds = model.decision_thresholds(X_test, glob_dec=True)
                TPR, FPR = du.roc_curve_adaboost(y_thresholds, Y_test)
                area = auc(FPR, TPR)
                model.clean()
            elif roc_area == "prob":
                Y_pred_prob = model.predict_proba(X_test)[:, 1]
                area = roc_auc_score(Y_test, Y_pred_prob)
            elif roc_area == "deci":
                Y_pred_dec = model.decision_function(X_test)
                area = roc_auc_score(Y_test, Y_pred_dec)

            end = time.time()
            time_scores = np.append(time_scores, end - start)
            area_scores = np.append(area_scores, area)
            prec_scores = np.append(prec_scores, prec)
            f1_scores = np.append(f1_scores, f1)
            recall_scores = np.append(recall_scores, recall)
            acc_scores = np.append(acc_scores, acc)
            gmean_scores = np.append(gmean_scores, gmean)
            n_class_scores = np.append(n_class_scores, n_base_class)
            n_train_scores = np.append(n_train_scores, len(X_train))
        else:  # this needs to be re-checked carefully
            end = time.time()
            time_scores = np.append(time_scores, end - start)
            area_scores = np.append(area_scores, 0)
            prec_scores = np.append(prec_scores, 0)
            f1_scores = np.append(f1_scores, 0)
            recall_scores = np.append(recall_scores, 0)
            acc_scores = np.append(acc_scores, 0)
            gmean_scores = np.append(gmean_scores, 0)
            n_class_scores = np.append(n_class_scores, 0)
            n_train_scores = np.append(n_train_scores, len(X_train))

    return area_scores, prec_scores, f1_scores, recall_scores, acc_scores, gmean_scores, time_scores, n_class_scores, n_train_scores
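
# Usage sketch (placeholder names): `clf` is any classifier exposing fit/predict
# and, when roc_area='prob', predict_proba:
#   results = cross_validation('my_sample', clf, roc_area='prob',
#                              selection='trad', kfolds=5, n_reps=2)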