Example #1
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

import preprocessing as prc
# `main` and `test_tree_depth` come from the surrounding project file and are
# not shown in this excerpt.

startTime = datetime.now()

def run_bs_adaboost():
    df = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
    df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
    df = prc.handle_outlier(
        prc.detect_outlier_iterative_IQR(df).dropna(thresh=20))
    df = prc.standarize(df)  # or normalize
    dt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1,
                                                   class_weight={
                                                       1: 20,
                                                       0: 1
                                                   }),
                            n_estimators=20)
    print(main(df, "AdaBoost", dt, bs_estimate=True, verbose=True))
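
`main` belongs to the surrounding project and is not shown; as a rough sketch of what its bootstrap path (`bs_estimate=True`) might do, assuming a Pr@Re>50 scoring convention (all names below are illustrative):

# Hypothetical sketch, not the project's actual `main`: refit the model on
# bootstrap resamples and average precision at the operating point where
# recall exceeds 50% (Pr@Re>50).
from sklearn.metrics import precision_recall_curve
from sklearn.utils import resample

def bootstrap_pr_at_re50(model, df, n_iterations=20):
    X, y = df.drop(['class'], axis=1), df['class']
    scores = []
    for _ in range(n_iterations):
        Xb, yb = resample(X, y, stratify=y)       # sample with replacement
        model.fit(Xb, yb)
        probs = model.predict_proba(X)[:, 1]
        precision, recall, _ = precision_recall_curve(y, probs)
        scores.append(precision[recall > 0.5].max())
    return sum(scores) / len(scores)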


# run_depth_test()
# run_bs_dt()
# run_bs_adaboost()

# Test meta learning example
# abc = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100)
# main(df=df, name="AdaBoost Decision Stumps", model=abc)
# Print PR Curves from test
# plt.legend(loc=1)
# plt.title("Precision Recall Curve")
# plt.show()
def run_depth_test():
    df = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',') 
    df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
    df = prc.handle_outlier(prc.detect_outlier_iterative_IQR(df).dropna(thresh=20))
    df = prc.standarize(df) # or normalize
    rslt = test_tree_depth(df)

    print("Run Time: " + str(datetime.now() - startTime))

    # Print PR Curves from test
    plt.legend(loc=1)
    plt.title("Precision Recall Curve")
    plt.show()

    # Plot the Pr@Re>50 score against tree depth
    plt.plot(list(range(2, len(rslt))), rslt[2:])
    plt.xlabel("Depth of Tree")
    plt.ylabel("Pr@Re>50")
    plt.title("Testing Decision Tree Depth")
    plt.xticks(list(range(2, len(rslt))))
    plt.show()
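
`test_tree_depth` is also project code the excerpt does not include; a plausible sketch consistent with the plotting above (the returned list is indexed by tree depth, starting at 2) could look like this, where every name is an assumption:

# Hypothetical sketch of `test_tree_depth`: sweep max_depth and record
# Pr@Re>50 per depth, padding indices 0-1 so rslt[d] is the score at depth d.
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

def test_tree_depth_sketch(df, max_depth=20, class_weight=None):
    X, y = df.drop(['class'], axis=1), df['class']
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, test_size=0.3)
    rslt = [0.0, 0.0]
    for d in range(2, max_depth + 1):
        clf = DecisionTreeClassifier(max_depth=d, class_weight=class_weight)
        clf.fit(X_tr, y_tr)
        probs = clf.predict_proba(X_te)[:, 1]
        precision, recall, _ = precision_recall_curve(y_te, probs)
        rslt.append(precision[recall > 0.5].max())
    return rslt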
Example #4
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml  # fetch_mldata was removed from scikit-learn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

import preprocessing as prc
import feature_selection as fs

df = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
df = prc.handle_outlier(prc.detect_outlier_iterative_IQR(df).dropna(thresh=20))
df_norm = prc.normalize(df)  # or standarize

features = df_norm.iloc[:, :-1]
target = df_norm.iloc[:, -1]

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
f_anova = fs.select_k_best(features, target, f_classif, 2)
# Series.append was removed in pandas 2.0; pd.concat keeps the two selected
# feature columns plus 'class'.
df = df_norm[pd.concat([f_anova.iloc[:, 0], pd.Series(['class'])])]

#sns.set()
#plt.title("Distribution of Feature 15")
#sns.distplot(df['Pb_NO_sideR35_S'])
#plt.show()

#plt.figure(figsize=(16,7))
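
With only the two ANOVA-selected columns left in `df`, a quick class-colored scatter is a natural sanity check; a minimal sketch (the column order is assumed to be feature, feature, class):

# Sketch: visualize the two top-ANOVA features, colored by class label.
feat_x, feat_y = df.columns[0], df.columns[1]
plt.figure(figsize=(16, 7))
plt.scatter(df[feat_x], df[feat_y], c=df['class'], cmap='coolwarm', s=8, alpha=0.5)
plt.xlabel(feat_x)
plt.ylabel(feat_y)
plt.title("Top-2 ANOVA-selected features by class")
plt.show()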
        
        df = fs.select_k_best_ANOVA(data, k=n_features)
        out = old_main.test_tree_depth(df, class_weight="balanced")
        summary_balance.append([data_str_name + '-ANOVA', i, out.index(max(out)), max(out)])
        
        df = fs.RFECV_DT(data, min_features_to_select=n_features, max_depth=max_dapth)
        out = old_main.test_tree_depth(df, class_weight="balanced")
        summary_balance.append([data_str_name + '-RFECV', i,  out.index(max(out)), max(out)])

    return summary_balance

summary_balance = []

df = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
df = prc.handle_outlier(prc.detect_outlier_iterative_IQR(df))
df = prc.standarize(df) # or normalize


# =============================================================================
# Unsupervised optimal feature selection | optimal tree depth
# =============================================================================
vt = fs.variance_threshold(df, threshold=1)
rslt_vt = main.test_tree_depth(vt, class_weight="balanced")
summary_balance.append(['variance-threshold', rslt_vt.index(max(rslt_vt)), max(rslt_vt)])

pca_2 = fs.pca_linear(df, n=2)  # project onto the 2 axes that retain the most information
rslt_pca = main.test_tree_depth(pca_2, class_weight="balanced")
summary_balance.append(['pca-2', rslt_pca.index(max(rslt_pca)), max(rslt_pca)])
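
`fs.pca_linear` is a project helper; presumably it fits scikit-learn's PCA on the feature columns and re-attaches the class label, roughly like this sketch (an assumption, not the project's code):

# Hypothetical sketch of `fs.pca_linear`: reduce the features to n principal
# components and return them as a DataFrame together with the class column.
import pandas as pd
from sklearn.decomposition import PCA

def pca_linear_sketch(df, n):
    X, y = df.drop(['class'], axis=1), df['class']
    comps = PCA(n_components=n).fit_transform(X)
    out = pd.DataFrame(comps, columns=['pc%d' % (i + 1) for i in range(n)],
                       index=df.index)
    out['class'] = y
    return out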

pca_7 = fs.pca_linear(df, n=7)  # n chosen from the VarianceThreshold results (9 components kept)
import time

import pandas as pd
import sklearn.metrics as sklm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.utils import resample

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import permutation_test_score

import preprocessing as prc

start_time = time.time()

# load datasets
inTrainData = pd.read_csv('csv_result-Descriptors_Training.csv', sep=',')
inTrainData = inTrainData.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
inTrainData = prc.detect_outlier_iterative_IQR(inTrainData).dropna()
# inTrainData = prc.detect_outlier_iterative_IQR(inTrainData).fillna(0)
# Split into data and class
train_data = inTrainData.drop(['class'], axis=1)
train_class = inTrainData['class']

inTestData = pd.read_csv('csv_result-Descriptors_Calibration.csv', sep=',')
inTestData = inTestData.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
inTestData = prc.detect_outlier_iterative_IQR(inTestData).dropna()
# inTestData = prc.detect_outlier_iterative_IQR(inTestData).fillna(0)
# Split into data and class
test_data = inTestData.drop(['class'], axis=1)
test_class = inTestData['class']

# configure bootstrap
n_iterations = 20
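
The excerpt stops after configuring the bootstrap; a plausible continuation using the `resample` import above would refit AdaBoost on each resample and score it against the calibration set (a sketch under those assumptions, not the original code):

# Plausible continuation (assumption): one AdaBoost fit per bootstrap round,
# scored on the held-out calibration split loaded above.
scores = []
for _ in range(n_iterations):
    bs_data, bs_class = resample(train_data, train_class, stratify=train_class)
    model = AdaBoostClassifier(n_estimators=100)
    model.fit(bs_data, bs_class)
    scores.append(sklm.f1_score(test_class, model.predict(test_data)))
print("mean F1 over", n_iterations, "bootstrap rounds:", sum(scores) / len(scores))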
Example #7
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.model_selection import *
from sklearn.tree import *
from sklearn.metrics import *

import preprocessing as prc

# Import data
df = pd.read_csv('csv_result-Descriptors_Training.csv', sep=',')
df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
df = prc.detect_outlier_iterative_IQR(df).fillna(0)

# Split into train and test
X = df.drop(['class'], axis=1)
y = df['class']

# NOTE Stratified KFold!
kf = StratifiedKFold(n_splits=5, shuffle=True)
kf.get_n_splits(X)

for i in range(1, 20):
    # for j in range (1,10):
    y_true = []
    y_pred = []
    y_prob = []
    for train_index, test_index in kf.split(X, y):
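        # The original excerpt ends at the line above. A plausible fold body,
        # consistent with the y_true/y_pred/y_prob lists and the depth loop
        # variable i (an assumption, not the original code):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        clf = DecisionTreeClassifier(max_depth=i, class_weight="balanced")
        clf.fit(X_train, y_train)
        y_true.extend(y_test)
        y_pred.extend(clf.predict(X_test))
        y_prob.extend(clf.predict_proba(X_test)[:, 1])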