def run_dt(data, title, solved_params=None):
    """Train and evaluate a pruned decision tree on one dataset.

    Args:
        data: A ``(x, y, pipeline)`` triple. ``pipeline`` is an iterable of
            pipeline steps that are placed in front of the ``'DT'`` step.
        title: Dataset label; forwarded to ``basicResults`` and used in the
            name of the confusion-matrix CSV.
        solved_params: Optional pipeline parameters (keys such as
            ``'DT__alpha'``). When given, the grid search is skipped and the
            pipeline is fitted directly with these values.

    Side effects:
        Prints progress messages and the row-normalised confusion matrix and
        writes that matrix to ``./output/DT_<title>_confusion.csv``.
    """
    x, y, pipeline = data
    pipe = Pipeline([
        *pipeline,
        ('DT', dtclf_pruned()),
    ])

    print("Splitting into train/test")
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)

    if solved_params is None:
        print("Doing a GridSearch for best hyperparameters")
        params = {
            'DT__criterion': ['gini', 'entropy'],
            'DT__alpha': ALPHAS,
            'DT__class_weight': ['balanced'],
            'DT__min_samples_split': [2, 3, 4, 5],
        }
        # NOTE(review): basicResults is presumably returning a fitted search
        # object, since predict() is called on its result below - confirm.
        clf = basicResults(pipe, x_train, y_train, x_test, y_test, params,
                           'DT', title)
    else:
        print("Using pre-solved hyperparameters")
        clf = pipe.set_params(**solved_params)
        # set_params() only configures the freshly built pipeline; it has to
        # be fitted before predict() can be called on it.
        clf.fit(x_train, y_train)

    # NOTE: learning-curve plotting (plot_learning_curve + savefig to
    # ./graphs/<title>-dt.png) is currently disabled.

    y_pred = clf.predict(x_test)
    conf = confusion_matrix(y_test, y_pred)
    # Normalise each row so the entries are fractions of the true class.
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/DT_{}_confusion.csv'.format(title),
               conf,
               delimiter=',',
               fmt='%.2f')
def run_boost(data, dataset, dtparams=None):
    """Grid-search an AdaBoost ensemble of pruned decision trees.

    Args:
        data: A ``(x, y, pipeline)`` triple. ``pipeline`` is an iterable of
            pipeline steps that are placed in front of the ``'Boost'`` step.
        dataset: Dataset label; forwarded to ``basicResults`` and used in the
            name of the confusion-matrix CSV.
        dtparams: Optional keyword arguments for the ``dtclf_pruned`` base
            estimator. ``None`` (the default) means no extra arguments.

    Side effects:
        Prints the row-normalised confusion matrix and writes it to
        ``./output/Boosted_<dataset>_confusion.csv``.
    """
    # Avoid a shared mutable default: build a fresh dict per call.
    if dtparams is None:
        dtparams = {}

    x, y, pipeline = data
    pipe = Pipeline([
        *pipeline,
        ('Boost',
         ensemble.AdaBoostClassifier(algorithm='SAMME',
                                     base_estimator=dtclf_pruned(**dtparams))),
    ])

    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)

    params = {
        'Boost__n_estimators': [2**i for i in range(8)],
        'Boost__algorithm': ['SAMME', 'SAMME.R'],
    }
    clf = basicResults(pipe, x_train, y_train, x_test, y_test, params,
                       'boosted', dataset)

    # NOTE: the learning-curve, timing-curve and iteration-curve plots
    # (saved under ./graphs/<dataset>-boost*.png) are currently disabled.

    y_pred = clf.predict(x_test)
    conf = confusion_matrix(y_test, y_pred)
    # Normalise each row so the entries are fractions of the true class.
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/Boosted_{}_confusion.csv'.format(dataset),
               conf,
               delimiter=',',
               fmt='%.2f')
# Candidate values for the tree's `alpha` parameter tried by the search below.
alphas = [
    -1,
    -1e-3, -(1e-3) * 10**-0.5,
    -1e-2, -(1e-2) * 10**-0.5,
    -1e-1, -(1e-1) * 10**-0.5,
    0,
    (1e-1) * 10**-0.5, 1e-1,
    (1e-2) * 10**-0.5, 1e-2,
    (1e-3) * 10**-0.5, 1e-3,
]

# NOTE: the madelon variant of this experiment (a pipeline with four
# SelectFromModel culling stages) is disabled; only the adult data is run.
pipeA = Pipeline(steps=[
    ('Scale', StandardScaler()),
    ('DT', dtclf_pruned(random_state=55)),
])

# Search grid, keyed by <step>__<parameter> of the pipeline above.
params = dict(
    DT__criterion=['gini', 'entropy'],
    DT__alpha=alphas,
    DT__class_weight=['balanced'],
)

adult_clf = basicResults(
    pipeA,
    adult_trgX, adult_trgY,
    adult_tstX, adult_tstY,
    params, 'DT', 'adult',
)

# Best parameter combination reported by the search result.
adult_final_params = adult_clf.best_params_
# Feature matrix / label vector for the adult data ('income' is the label).
# `axis` is passed by keyword: newer pandas rejects it as a positional arg.
adultX = adult.drop('income', axis=1).copy().values
adultY = adult['income'].copy().values

# Same layout for the cancer data ('class' is the label).
cancer = pd.read_hdf('cancer.hdf', 'cancer')
cancerX = cancer.drop('class', axis=1).copy().values
cancerY = cancer['class'].copy().values

# Candidate values for the base tree's `alpha` parameter.
alphas = [
    -1, -1e-3, -(1e-3) * 10**-0.5, -1e-2, -(1e-2) * 10**-0.5, -1e-1,
    -(1e-1) * 10**-0.5, 0, (1e-1) * 10**-0.5, 1e-1, (1e-2) * 10**-0.5, 1e-2,
    (1e-3) * 10**-0.5, 1e-3
]

# Stratified 75/25 train/test splits with a fixed seed.
adult_trgX, adult_tstX, adult_trgY, adult_tstY = ms.train_test_split(
    adultX, adultY, test_size=0.25, random_state=0, stratify=adultY)
cancer_trgX, cancer_tstX, cancer_trgY, cancer_tstY = ms.train_test_split(
    cancerX, cancerY, test_size=0.25, random_state=0, stratify=cancerY)

# Base learners for the boosted ensembles.
cancer_base = dtclf_pruned(criterion='entropy',
                           class_weight='balanced',
                           random_state=55)
adult_base = dtclf_pruned(criterion='entropy',
                          class_weight='balanced',
                          random_state=55)
OF_base = dtclf_pruned(criterion='gini',
                       class_weight='balanced',
                       random_state=55)

# Search grids over ensemble size and base-tree alpha.
# NOTE: earlier grids that searched Boost__learning_rate are disabled.
paramsA = {
    'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
    'Boost__base_estimator__alpha': alphas
}
paramsM = {
    'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
    'Boost__base_estimator__alpha': alphas
}

cancer_booster = AdaBoostClassifier(algorithm='SAMME',
                                    learning_rate=1,
                                    base_estimator=cancer_base,
                                    random_state=55)
adult_booster = AdaBoostClassifier(algorithm='SAMME',
                                   learning_rate=1,
                                   base_estimator=adult_base,
                                   random_state=55)
from sklearn import metrics, preprocessing
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import pandas as pd

from helpers import dtclf_pruned

# Read the semicolon-separated file, label-encode every column, then convert
# to an ndarray so that integer/column-list slicing works.
raw = pd.read_csv("student-prf.csv", sep=';', header=0)
encoder = preprocessing.LabelEncoder()
df = np.array(raw.apply(encoder.fit_transform))

# Columns used as features: 0-24 plus 28-31. Column 26 is the target.
all_column = np.append(np.arange(25), [28, 29, 30, 31])

# The first 400 rows are fitted on; the remaining rows are held out.
train_rows = slice(None, 400)
test_rows = slice(400, None)

X = df[train_rows, all_column]
y = df[train_rows, 26]

# Fit one 5-tree AdaBoost ensemble per alpha and report on the held-out rows.
for j, alpha in enumerate([-1, -0.01, 0, 0.01, 0.1]):
    boost = AdaBoostClassifier(dtclf_pruned(alpha=alpha), n_estimators=5)
    boost.fit(X, y)

    predicted = boost.predict(df[test_rows, all_column])
    expected = df[test_rows, 26]

    print('Booster number {}'.format(j))
    for i, dt in enumerate(boost.estimators_):
        tree_no = i + 1
        print('pruned tree {}. Alpha is {}. There are {} nodes'.format(
            tree_no, dt.alpha, dt.numNodes()))

    report = metrics.classification_report(expected, predicted)
    print("Classification report for classifier %s:\n%s\n" % (boost, report))
stratify=adultY) mushrooms_trgX, mushrooms_tstX, mushrooms_trgY, mushrooms_tstY = ms.train_test_split( mushroomsX, mushroomsY, test_size=0.3, random_state=0, stratify=mushroomsY) redwine_trgX, redwine_tstX, redwine_trgY, redwine_tstY = ms.train_test_split( redwineX, redwineY, test_size=0.3, random_state=0, stratify=redwineY) # Search for good alphas alphas = [ -1, -1e-3, -(1e-3) * 10**-0.5, -1e-2, -(1e-2) * 10**-0.5, -1e-1, -(1e-1) * 10**-0.5, 0, (1e-1) * 10**-0.5, 1e-1, (1e-2) * 10**-0.5, 1e-2, (1e-3) * 10**-0.5, 1e-3 ] #madelon_base = dtclf_pruned(criterion='gini',class_weight='balanced',random_state=55) adult_base = dtclf_pruned(criterion='entropy', class_weight='balanced', random_state=55) mushrooms_base = dtclf_pruned(criterion='entropy', class_weight='balanced', random_state=55) redwine_base = dtclf_pruned(criterion='entropy', class_weight='balanced', random_state=55) OF_base = dtclf_pruned(criterion='gini', class_weight='balanced', random_state=55) # Define parameters for grid search cross validation #paramsA= {'Boost__n_estimators':[1,2,5,10,20,30,40,50],'Boost__learning_rate':[(2**x)/100 for x in range(8)]+[1]} paramsA = { 'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import pandas as pd

from helpers import dtclf_pruned

# read_csv yields a DataFrame; convert it to an ndarray so the positional
# slicing below works.
frame = pd.read_csv("alpha-recognition.csv")
df = np.array(frame)

# The first 16000 rows are fitted on, the rest are held out.
# Column 0 is the label; every other column is a feature.
train_rows = slice(0, 16000)
test_rows = slice(16000, None)

X = df[train_rows, 1:]
y = df[train_rows, 0]

# Fit one pruned tree per alpha and report its size and held-out metrics.
for j, alpha in enumerate(
        [-1000, -0.1, -0.01, -0.001, -0.0001, 0, 0.0001, 0.01, 0.1, 10]):
    boost = dtclf_pruned(alpha=alpha)
    boost.fit(X, y)

    predicted = boost.predict(df[test_rows, 1:])
    expected = df[test_rows, 0]

    print('Booster number {}'.format(j))
    print('There are {} nodes'.format(boost.numNodes()))

    report = metrics.classification_report(expected, predicted)
    print("Classification report for classifier %s:\n%s\n" % (boost, report))
def main():
    """Run the AdaBoost experiments on the cars and madelon datasets.

    For each dataset this builds a pipeline ending in an AdaBoost step over a
    ``dtclf_pruned`` base tree, passes it with a parameter grid to
    ``basicResults``, applies the resulting ``best_params_``, and then calls
    ``makeTimingCurve`` and ``iterationLC``. ``iterationLC`` is run a second
    time per dataset with the fixed ``OF_params`` (labelled ``'Boost_OF'``).
    """
    # Load both datasets; 'Class' is the label column.
    # `axis` is passed by keyword: newer pandas rejects it as a positional arg.
    cars = pd.read_hdf('data/processed/datasets.hdf', 'cars')
    carsX = cars.drop('Class', axis=1).copy().values
    carsY = cars['Class'].copy().values

    madelon = pd.read_hdf('data/processed/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values

    # Candidate values for the base tree's `alpha` parameter.
    alphas = [
        -1, -1e-3, -(1e-3) * 10**-0.5, -1e-2, -(1e-2) * 10**-0.5, -1e-1,
        -(1e-1) * 10**-0.5, 0, (1e-1) * 10**-0.5, 1e-1, (1e-2) * 10**-0.5,
        1e-2, (1e-3) * 10**-0.5, 1e-3
    ]

    # Stratified 70/30 train/test splits with a fixed seed.
    cars_trgX, cars_tstX, cars_trgY, cars_tstY = ms.train_test_split(
        carsX, carsY, test_size=0.3, random_state=0, stratify=carsY)
    madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
        madelonX, madelonY, test_size=0.3, random_state=0, stratify=madelonY)

    madelon_base = dtclf_pruned(criterion='gini',
                                class_weight='balanced',
                                random_state=55)
    cars_base = dtclf_pruned(criterion='entropy',
                             class_weight='balanced',
                             random_state=55)

    # Search grids over ensemble size and base-tree alpha.
    # NOTE: earlier grids that searched Boost__learning_rate are disabled.
    paramsA = {
        'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
        'Boost__base_estimator__alpha': alphas
    }
    paramsM = {
        'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
        'Boost__base_estimator__alpha': alphas
    }

    madelon_booster = AdaBoostClassifier(algorithm='SAMME',
                                         learning_rate=1,
                                         base_estimator=madelon_base,
                                         random_state=55)
    cars_booster = AdaBoostClassifier(algorithm='SAMME',
                                      learning_rate=1,
                                      base_estimator=cars_base,
                                      random_state=55)

    # madelon: scale, four median-threshold feature-culling stages, boost.
    pipeM = Pipeline([
        ('Scale', StandardScaler()),
        ('Cull1', SelectFromModel(RandomForestClassifier(random_state=1),
                                  threshold='median')),
        ('Cull2', SelectFromModel(RandomForestClassifier(random_state=2),
                                  threshold='median')),
        ('Cull3', SelectFromModel(RandomForestClassifier(random_state=3),
                                  threshold='median')),
        ('Cull4', SelectFromModel(RandomForestClassifier(random_state=4),
                                  threshold='median')),
        ('Boost', madelon_booster),
    ])
    # cars: scale, then boost.
    pipeA = Pipeline([('Scale', StandardScaler()), ('Boost', cars_booster)])

    madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY,
                               madelon_tstX, madelon_tstY, paramsM, 'Boost',
                               'madelon')
    cars_clf = basicResults(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                            paramsA, 'Boost', 'cars')

    madelon_final_params = madelon_clf.best_params_
    cars_final_params = cars_clf.best_params_
    # Fixed settings used for the separately labelled 'Boost_OF' curves.
    OF_params = {'Boost__base_estimator__alpha': -1, 'Boost__n_estimators': 50}

    pipeM.set_params(**madelon_final_params)
    pipeA.set_params(**cars_final_params)
    makeTimingCurve(madelonX, madelonY, pipeM, 'Boost', 'madelon')
    makeTimingCurve(carsX, carsY, pipeA, 'Boost', 'cars')

    # Iteration curves with the selected parameters.
    pipeM.set_params(**madelon_final_params)
    iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY, {
        'Boost__n_estimators':
        [1, 2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    }, 'Boost', 'madelon')
    pipeA.set_params(**cars_final_params)
    iterationLC(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50]},
                'Boost', 'cars')

    # Iteration curves with the fixed OF_params.
    pipeM.set_params(**OF_params)
    iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY, {
        'Boost__n_estimators':
        [1, 2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    }, 'Boost_OF', 'madelon')
    pipeA.set_params(**OF_params)
    # The cars pipeline must be evaluated on the cars split (this call
    # previously received the madelon arrays while being labelled 'cars').
    iterationLC(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50]},
                'Boost_OF', 'cars')
def main():
    """Tune and profile a pruned decision tree on the white-wine data.

    Reads ``data/winequality_white.csv``, labels a wine as positive when its
    ``quality`` is at least 7, passes a scale + ``dtclf_pruned`` pipeline and
    a parameter grid to ``basicResults``, applies the resulting
    ``best_params_`` and then calls ``makeTimingCurve`` and
    ``DTpruningVSnodes``.
    """
    # NOTE: an adult-income variant of this experiment (CSV parsing, manual
    # category encoding, grid search and curves) was commented out here and
    # is not run.

    # Candidate alpha values: 0.001 up to (not including) 0.05 in steps of
    # 0.001, plus 0.
    alphas = np.append(np.arange(0.001, 0.05, 0.001), 0)

    pipeA = Pipeline([('Scale', StandardScaler()),
                      ('DT', dtclf_pruned(random_state=55))])

    params = {
        'DT__criterion': ['gini', 'entropy'],
        'DT__alpha': alphas,
        'DT__class_weight': ['balanced']
    }

    # Data parsing for the wine quality dataset.
    wine_data = pd.read_csv('data/winequality_white.csv')
    wine_data['category'] = wine_data['quality'] >= 7

    # The first 11 columns are the features; 'category' is the binary label.
    wineX = wine_data[wine_data.columns[0:11]].values
    # Builtin int replaces the np.int alias, which NumPy has removed.
    wineY = wine_data['category'].values.astype(int)

    # Stratified 70/30 train/test split with a fixed seed.
    wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(
        wineX, wineY, test_size=0.3, random_state=0, stratify=wineY)

    wine_clf = basicResults(pipeA, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
                            params, 'DT', 'wine')
    wine_final_params = wine_clf.best_params_

    pipeA.set_params(**wine_final_params)
    makeTimingCurve(wineX, wineY, pipeA, 'DT', 'wine')
    DTpruningVSnodes(pipeA, alphas, wine_trgX, wine_trgY, 'wine')
def main():
    """Run the pruned decision-tree experiments on cars and madelon.

    For each dataset this passes a pipeline ending in a ``dtclf_pruned`` step
    and a parameter grid to ``basicResults``, applies the resulting
    ``best_params_``, and then calls ``makeTimingCurve`` and
    ``DTpruningVSnodes``.
    """
    # Load data; 'Class' is the label column.
    # `axis` is passed by keyword: newer pandas rejects it as a positional arg.
    cars = pd.read_hdf('data/processed/datasets.hdf', 'cars')
    carsX = cars.drop('Class', axis=1).copy().values
    carsY = cars['Class'].copy().values

    madelon = pd.read_hdf('data/processed/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values

    # Stratified 70/30 train/test splits with a fixed seed.
    cars_trgX, cars_tstX, cars_trgY, cars_tstY = ms.train_test_split(
        carsX, carsY, test_size=0.3, random_state=0, stratify=carsY)
    madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
        madelonX, madelonY, test_size=0.3, random_state=0, stratify=madelonY)

    # Search for good alphas.
    alphas = [
        -1, -1e-3, -(1e-3) * 10**-0.5, -1e-2, -(1e-2) * 10**-0.5, -1e-1,
        -(1e-1) * 10**-0.5, 0, (1e-1) * 10**-0.5, 1e-1, (1e-2) * 10**-0.5,
        1e-2, (1e-3) * 10**-0.5, 1e-3
    ]

    # madelon: scale, four median-threshold feature-culling stages, tree.
    pipeM = Pipeline([
        ('Scale', StandardScaler()),
        ('Cull1', SelectFromModel(RandomForestClassifier(random_state=1),
                                  threshold='median')),
        ('Cull2', SelectFromModel(RandomForestClassifier(random_state=2),
                                  threshold='median')),
        ('Cull3', SelectFromModel(RandomForestClassifier(random_state=3),
                                  threshold='median')),
        ('Cull4', SelectFromModel(RandomForestClassifier(random_state=4),
                                  threshold='median')),
        ('DT', dtclf_pruned(random_state=55)),
    ])
    # cars: scale, then tree.
    pipeA = Pipeline([('Scale', StandardScaler()),
                      ('DT', dtclf_pruned(random_state=55))])

    params = {
        'DT__criterion': ['gini', 'entropy'],
        'DT__alpha': alphas,
        'DT__class_weight': ['balanced']
    }

    madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY,
                               madelon_tstX, madelon_tstY, params, 'DT',
                               'madelon')
    cars_clf = basicResults(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                            params, 'DT', 'cars')

    madelon_final_params = madelon_clf.best_params_
    cars_final_params = cars_clf.best_params_

    pipeM.set_params(**madelon_final_params)
    makeTimingCurve(madelonX, madelonY, pipeM, 'DT', 'madelon')
    pipeA.set_params(**cars_final_params)
    makeTimingCurve(carsX, carsY, pipeA, 'DT', 'cars')

    DTpruningVSnodes(pipeM, alphas, madelon_trgX, madelon_trgY, 'madelon')
    DTpruningVSnodes(pipeA, alphas, cars_trgX, cars_trgY, 'cars')
def main():
    """Run the AdaBoost experiments on the white-wine quality data.

    Reads ``data/winequality_white.csv``, labels a wine as positive when its
    ``quality`` is at least 7, passes a scale + AdaBoost pipeline and a
    parameter grid to ``basicResults``, applies the resulting
    ``best_params_``, and then calls ``makeTimingCurve`` and ``iterationLC``.
    ``iterationLC`` is run a second time with the fixed ``OF_params``
    (labelled ``'Boost_OF'``).
    """
    # NOTE: an adult-income variant of this experiment (CSV parsing, manual
    # category encoding, its own booster and curves) was commented out here
    # and is not run.

    wine_data = pd.read_csv('data/winequality_white.csv')
    wine_data['category'] = wine_data['quality'] >= 7

    # The first 11 columns are the features; 'category' is the binary label.
    wineX = wine_data[wine_data.columns[0:11]].values
    # Builtin int replaces the np.int alias, which NumPy has removed.
    wineY = wine_data['category'].values.astype(int)

    # Candidate alpha values: 0.001 up to (not including) 0.05 in steps of
    # 0.001, plus 0.
    alphas = np.append(np.arange(0.001, 0.05, 0.001), 0)

    # Stratified 70/30 train/test split with a fixed seed.
    wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(
        wineX, wineY, test_size=0.3, random_state=0, stratify=wineY)

    wine_base = dtclf_pruned(criterion='gini',
                             class_weight='balanced',
                             random_state=55)

    # Search grid over ensemble size and base-tree alpha.
    # NOTE: an earlier grid that searched Boost__learning_rate is disabled.
    paramsA = {
        'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
        'Boost__base_estimator__alpha': alphas
    }

    wine_booster = AdaBoostClassifier(algorithm='SAMME',
                                      learning_rate=1,
                                      base_estimator=wine_base,
                                      random_state=55)

    pipeA = Pipeline([('Scale', StandardScaler()), ('Boost', wine_booster)])

    wine_clf = basicResults(pipeA, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
                            paramsA, 'Boost', 'wine')

    wine_final_params = wine_clf.best_params_
    # Fixed settings used for the separately labelled 'Boost_OF' curve.
    OF_params = {'Boost__base_estimator__alpha': -1, 'Boost__n_estimators': 50}

    pipeA.set_params(**wine_final_params)
    makeTimingCurve(wineX, wineY, pipeA, 'Boost', 'wine')

    # Re-apply the selected parameters to the pipeline that is actually
    # evaluated next (this call previously targeted a second, otherwise
    # unused pipeline that wrapped the same booster object).
    pipeA.set_params(**wine_final_params)
    iterationLC(pipeA, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
                {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50]},
                'Boost', 'wine')

    pipeA.set_params(**OF_params)
    iterationLC(pipeA, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
                {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50]},
                'Boost_OF', 'wine')
# Label vector for the red-wine data ('quality' column).
redwineY = redwine['quality'].copy().values

# Stratified, fixed-seed splits. The adult data uses a 16.66% train / 5% test
# sample; the red-wine data uses 30% for test.
adult_trgX, adult_tstX, adult_trgY, adult_tstY = ms.train_test_split(
    adultX,
    adultY,
    test_size=0.05,
    train_size=0.1666,
    random_state=0,
    stratify=adultY,
)
redwine_trgX, redwine_tstX, redwine_trgY, redwine_tstY = ms.train_test_split(
    redwineX,
    redwineY,
    test_size=0.3,
    random_state=0,
    stratify=redwineY,
)

# Decision-tree pipelines: scale, then pruned tree (one per dataset).
pipeA = Pipeline(steps=[
    ('Scale', StandardScaler()),
    ('DT', dtclf_pruned(random_state=55)),
])
pipeR = Pipeline(steps=[
    ('Scale', StandardScaler()),
    ('DT', dtclf_pruned(random_state=55)),
])

# Hard-coded tree parameters per dataset, keyed by <step>__<parameter>.
adult_final_params = dict(
    DT__alpha=0.0031622776601683794,
    DT__class_weight='balanced',
    DT__criterion='entropy',
)
redwine_final_params = dict(
    DT__alpha=-0.0316227766016838,
    DT__class_weight='balanced',
    DT__criterion='entropy',
)