def run_bs_adaboost():
    """Bootstrap-evaluate an AdaBoost-of-stumps classifier on the training descriptors.

    Loads the descriptor CSV, cleans it (iterative-IQR outlier handling, a
    drop of rows with fewer than 20 non-NaN values, standardization), fits
    AdaBoost over class-weighted decision stumps, and prints the bootstrap
    estimate returned by ``main``.
    """
    # Load and encode: drop the id column, map class labels P/N -> 1/0.
    data = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
    data = data.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])

    # Outlier treatment: flag via iterative IQR, keep rows with >= 20
    # non-NaN values, then handle the remaining flagged cells.
    data = prc.handle_outlier(
        prc.detect_outlier_iterative_IQR(data).dropna(thresh=20))
    data = prc.standarize(data)  # or normalize

    # Decision stumps (depth-1 trees); the positive class is up-weighted
    # 20:1 to counter class imbalance.
    stump = DecisionTreeClassifier(max_depth=1, class_weight={1: 20, 0: 1})
    booster = AdaBoostClassifier(stump, n_estimators=20)
    print(main(data, "AdaBoost", booster, bs_estimate=True, verbose=True))


# run_depth_test()
# run_bs_dt()
# run_bs_adaboost()

# Test meta learning example
# abc = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100)
# main(df=df, name = "AdaBoost Decision Stumps", model=abc)

# Print PR Curves from test
# plt.legend(loc=1)
# plt.title("Precision Recall Curve")
# plt.show()
def run_bs_adaboost():
    """Bootstrap-evaluate AdaBoost over class-weighted decision stumps.

    NOTE(review): this is a second definition of ``run_bs_adaboost`` in the
    same module and shadows the earlier one at import time — consider
    deleting one copy.
    """
    # Load, encode (P/N -> 1/0) and drop the id column in one chain.
    frame = (pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
             .drop(['id'], axis=1)
             .replace(['P', 'N'], [1, 0]))

    # IQR-based outlier pass, keep rows with at least 20 non-NaN values,
    # then standardize the features.
    frame = prc.handle_outlier(
        prc.detect_outlier_iterative_IQR(frame).dropna(thresh=20))
    frame = prc.standarize(frame)  # or normalize

    # AdaBoost over depth-1 trees; positive class weighted 20:1.
    model = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1, class_weight={1: 20, 0: 1}),
        n_estimators=20,
    )
    print(main(frame, "AdaBoost", model, bs_estimate=True, verbose=True))
def run_depth_test():
    """Sweep decision-tree depth on the cleaned training descriptors and plot results.

    Loads and preprocesses the descriptor CSV, runs ``test_tree_depth`` over
    it, prints elapsed wall time, shows the PR curves accumulated on the
    current figure during the sweep, then plots the Pr@Re>50 score for each
    tested depth.
    """
    df = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
    df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
    df = prc.handle_outlier(
        prc.detect_outlier_iterative_IQR(df).dropna(thresh=20))
    df = prc.standarize(df)  # or normalize

    rslt = test_tree_depth(df)
    # NOTE(review): relies on a module-level `startTime` set elsewhere in the file.
    print("Run Time: " + str(datetime.now() - startTime))

    # Print PR Curves from test (added to the current figure by the sweep).
    plt.legend(loc=1)
    plt.title("Precision Recall Curve")
    plt.show()

    # Print out the distribution of scores per depth (depths < 2 skipped).
    # FIX: x is the tree depth and y is the Pr@Re>50 score, but the original
    # code had the two axis labels swapped.
    plt.plot(list(range(2, len(rslt))), rslt[2:])
    plt.xlabel("Depth of Tree")
    plt.ylabel("Pr@Re>50")
    plt.title("Testing Decision Tree Depth")
    plt.xticks(list(range(2, len(rslt))))
    plt.show()
import numpy as np
import pandas as pd
# NOTE(review): fetch_mldata was removed in scikit-learn >= 0.22; if this
# import is still needed, migrate to sklearn.datasets.fetch_openml.
from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

import preprocessing as prc
import feature_selection as fs

# Load the training descriptors. FIX: use a forward-slash path — the original
# 'Files\csv_...' backslash is Windows-only and fragile in a non-raw string,
# and every other chunk of this project uses 'Files/...'.
df = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
df = prc.handle_outlier(prc.detect_outlier_iterative_IQR(df).dropna(thresh=20))

df_norm = prc.normalize(df)  # normalize

# Split normalized frame into feature columns and the trailing class column.
features = df_norm.iloc[:, :-1]
target = df_norm.iloc[:, -1]

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

# Keep the 2 features ranked best by the one-way ANOVA F-test, plus 'class'.
f_anova = fs.select_k_best(features, target, f_classif, 2)
# FIX: Series.append was removed in pandas 2.0 — build the label list with
# pd.concat instead (same resulting selection).
selected_cols = pd.concat([f_anova.iloc[:, 0], pd.Series(['class'])],
                          ignore_index=True)
df = df_norm[selected_cols]

#
# sns.set()
# plt.title("Distribution of Feature 15")
# sns.distplot(df['Pb_NO_sideR35_S'])
# plt.show()
# plt.figure(figsize=(16,7))
# NOTE(review): this chunk begins INSIDE a function whose `def` line falls
# outside the visible region — the fragment below is its tail. `data`,
# `n_features`, `max_dapth`, `i`, `data_str_name` and `summary_balance`
# are presumably bound earlier in that function; confirm against the full file.
    # ANOVA top-k selection, then a balanced-weight tree-depth sweep;
    # record (name, iteration, best depth index, best score).
    df = fs.select_k_best_ANOVA(data, k=n_features)
    out = old_main.test_tree_depth(df, class_weight="balanced")
    summary_balance.append([data_str_name + '-ANOVA', i, out.index(max(out)), max(out)])
    # Recursive feature elimination with CV on a depth-capped decision tree,
    # then the same depth sweep and bookkeeping.
    df = fs.RFECV_DT(data, min_features_to_select=n_features, max_depth=max_dapth)
    out = old_main.test_tree_depth(df, class_weight="balanced")
    summary_balance.append([data_str_name + '-RFECV', i, out.index(max(out)), max(out)])
    return summary_balance


# Accumulates [selector-name, best-depth-index, best-score] rows below.
summary_balance = []

# Load and clean the training descriptors; outliers handled in place
# (no row-dropping threshold here, unlike the other script chunks).
df = pd.read_csv('Files\csv_result-Descriptors_Training.csv', sep=',')
df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
df = prc.handle_outlier(prc.detect_outlier_iterative_IQR(df))
df = prc.standarize(df)  # or normalize

# =============================================================================
# Unsupervised optimal feature selection | optimal tree depth
# =============================================================================
# Variance threshold: keep features with variance above 1, sweep tree depth.
vt = fs.variance_threshold(df, threshold=1)
rslt_vt = main.test_tree_depth(vt, class_weight="balanced")
summary_balance.append(['variance-threshold', rslt_vt.index(max(rslt_vt)), max(rslt_vt)])

pca_2 = fs.pca_linear(df, n=2)  # n_c9 is 9, based VarianceThreshold results, axis to gain most information
rslt_pca = main.test_tree_depth(pca_2, class_weight="balanced")
summary_balance.append(['pca-2', rslt_pca.index(max(rslt_pca)), max(rslt_pca)])

pca_7 = fs.pca_linear(df, n=7)  # n_c9 is 9, based VarianceThreshold results, axis to gain most information
import sklearn.metrics as sklm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.utils import resample
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import permutation_test_score

import preprocessing as prc

start_time = time.time()

# --- load datasets ---------------------------------------------------------
# Training set: strip the id column, encode class P/N as 1/0, then drop every
# row the iterative-IQR pass marks as an outlier (left NaN by the detector).
inTrainData = (pd.read_csv('csv_result-Descriptors_Training.csv', sep=',')
               .drop(['id'], axis=1)
               .replace(['P', 'N'], [1, 0]))
inTrainData = prc.detect_outlier_iterative_IQR(inTrainData).dropna()
# inTrainData = prc.detect_outlier_iterative_IQR(inTrainData).fillna(0)

# Split into data and class
train_data = inTrainData.drop(['class'], axis=1)
train_class = inTrainData['class']

# Calibration set, prepared exactly the same way.
inTestData = (pd.read_csv('csv_result-Descriptors_Calibration.csv', sep=',')
              .drop(['id'], axis=1)
              .replace(['P', 'N'], [1, 0]))
inTestData = prc.detect_outlier_iterative_IQR(inTestData).dropna()
# inTestData = prc.detect_outlier_iterative_IQR(inTestData).fillna(0)

# Split into data and class
test_data = inTestData.drop(['class'], axis=1)
test_class = inTestData['class']

# configure bootstrap
n_iterations = 20
# Flat script: load the descriptor training set, clean it, and evaluate a
# classifier per-fold with stratified cross-validation. The inner loop body
# continues past the end of this chunk.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import *
from sklearn.tree import *
from sklearn.metrics import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import scipy.stats as stats
import preprocessing as prc

# Import data
df = pd.read_csv('csv_result-Descriptors_Training.csv', sep=',')
df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
# Outlier cells are zero-filled here rather than dropped (contrast with the
# dropna() variants used elsewhere in the project).
df = prc.detect_outlier_iterative_IQR(df).fillna(0)

# Split into train and test
X = df.drop(['class'], axis=1)
y = df['class']

# NOTE Stratified KFold!
kf = StratifiedKFold(n_splits=5, shuffle=True)
kf.get_n_splits(X)

# Sweep a hyper-parameter i from 1 to 19; per value, accumulate labels,
# predictions and probabilities across all 5 folds.
for i in range(1, 20):
    # for j in range (1,10):
    y_true = []
    y_pred = []
    y_prob = []
    for train_index, test_index in kf.split(X, y):