def main():
    """KNN experiments on the cars and madelon datasets.

    Loads both datasets from the processed HDF store, stratified-splits them
    70/30, grid-searches KNN hyperparameters via basicResults, then produces
    timing curves with the best parameters found.
    """
    cars = pd.read_hdf('data/processed/datasets.hdf', 'cars')
    carsX = cars.drop('Class', 1).copy().values
    carsY = cars['Class'].copy().values
    madelon = pd.read_hdf('data/processed/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', 1).copy().values
    madelonY = madelon['Class'].copy().values
    # Fixed seed + stratification for reproducible, class-balanced splits.
    cars_trgX, cars_tstX, cars_trgY, cars_tstY = ms.train_test_split(
        carsX, carsY, test_size=0.3, random_state=0, stratify=carsY)
    madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
        madelonX, madelonY, test_size=0.3, random_state=0, stratify=madelonY)
    # NOTE(review): hiddens_cars/hiddens_madelon/alphas are computed but never
    # used in this function — presumably copied from the ANN experiment.
    d = carsX.shape[1]
    hiddens_cars = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    alphas = [10**-x for x in np.arange(1, 9.01, 1 / 2)]
    d = madelonX.shape[1]
    hiddens_madelon = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    # Madelon pipeline: scale, four rounds of median-threshold random-forest
    # feature culling (madelon has many noise features), then KNN.
    pipeM = Pipeline([
        ('Scale', StandardScaler()),
        ('Cull1', SelectFromModel(RandomForestClassifier(), threshold='median')),
        ('Cull2', SelectFromModel(RandomForestClassifier(), threshold='median')),
        ('Cull3', SelectFromModel(RandomForestClassifier(), threshold='median')),
        ('Cull4', SelectFromModel(RandomForestClassifier(), threshold='median')),
        ('KNN', knnC())
    ])
    # Cars pipeline: scaling only, no feature selection.
    pipeA = Pipeline([('Scale', StandardScaler()), ('KNN', knnC())])
    params_madelon = {
        'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
        'KNN__n_neighbors': np.arange(1, 51, 3),
        'KNN__weights': ['uniform', 'distance']
    }
    params_cars = {
        'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
        'KNN__n_neighbors': np.arange(1, 51, 3),
        'KNN__weights': ['uniform', 'distance']
    }
    madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX,
                               madelon_tstY, params_madelon, 'KNN', 'madelon')
    cars_clf = basicResults(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                            params_cars, 'KNN', 'cars')
    #madelon_final_params={'KNN__n_neighbors': 43, 'KNN__weights': 'uniform', 'KNN__p': 1}
    #cars_final_params={'KNN__n_neighbors': 142, 'KNN__p': 1, 'KNN__weights': 'uniform'}
    madelon_final_params = madelon_clf.best_params_
    cars_final_params = cars_clf.best_params_
    # Timing curves with the tuned hyperparameters.
    pipeM.set_params(**madelon_final_params)
    makeTimingCurve(madelonX, madelonY, pipeM, 'KNN', 'madelon')
    pipeA.set_params(**cars_final_params)
    makeTimingCurve(carsX, carsY, pipeA, 'KNN', 'cars')
def run_dt(data, title, solved_params=None):
    """Run the pruned decision tree experiment on the given data.

    Args:
        data: tuple ``(x, y, pipeline_steps)`` where ``pipeline_steps`` is a
            list of (name, transformer) steps prepended to the DT step.
        title: dataset label used in output file names.
        solved_params: optional dict of pre-solved pipeline hyperparameters;
            when given, the grid search is skipped.

    Side effects: prints progress, writes the row-normalized confusion matrix
    to ``./output/DT_<title>_confusion.csv``.
    """
    x, y, pipeline = data
    pipe = Pipeline([
        *pipeline,
        ('DT', dtclf_pruned()),
    ])
    print("Splitting into train/test")
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)
    if solved_params is None:
        print("Doing a GridSearch for best hyperparameters")
        params = {
            'DT__criterion': ['gini', 'entropy'],
            'DT__alpha': ALPHAS,
            'DT__class_weight': ['balanced'],
            'DT__min_samples_split': [2, 3, 4, 5],
        }
        # basicResults fits the pipeline as part of the grid search.
        clf = basicResults(pipe, x_train, y_train, x_test, y_test, params,
                           'DT', title)
    else:
        print("Using pre-solved hyperparameters")
        clf = pipe.set_params(**solved_params)
        # BUG FIX: the original never fitted the pipeline in this branch, so
        # predict() below raised NotFittedError.
        clf.fit(x_train, y_train)
    # Predict once and reuse (the original called predict twice).
    y_pred = clf.predict(x_test)
    # Row-normalize so each true-class row sums to 1.
    conf = confusion_matrix(y_test, y_pred)
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/DT_{}_confusion.csv'.format(title), conf,
               delimiter=',', fmt='%.2f')
def run_boost(data, dataset, dtparams=None):
    """Run an AdaBoost-over-pruned-decision-trees experiment.

    Args:
        data: tuple ``(x, y, pipeline_steps)``.
        dataset: dataset label used in output file names.
        dtparams: optional dict of dtclf_pruned keyword arguments for the
            base estimator (default: use the estimator's own defaults).

    Side effects: writes the row-normalized confusion matrix to
    ``./output/Boosted_<dataset>_confusion.csv``.
    """
    # FIX: avoid a mutable default argument; None means "estimator defaults".
    if dtparams is None:
        dtparams = {}
    x, y, pipeline = data
    pipe = Pipeline([
        *pipeline,
        ('Boost', ensemble.AdaBoostClassifier(
            algorithm='SAMME', base_estimator=dtclf_pruned(**dtparams))),
    ])
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)
    params = {
        'Boost__n_estimators': [2**i for i in range(8)],
        'Boost__algorithm': ['SAMME', 'SAMME.R'],
    }
    clf = basicResults(pipe, x_train, y_train, x_test, y_test, params,
                       'boosted', dataset)
    # Predict once and reuse for the confusion matrix.
    y_pred = clf.predict(x_test)
    conf = confusion_matrix(y_test, y_pred)
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/Boosted_{}_confusion.csv'.format(dataset), conf,
               delimiter=',', fmt='%.2f')
def run_knn(data, dataset):
    """Run the K-nearest-neighbors experiment on the given data.

    Args:
        data: tuple ``(x, y, pipeline_steps)``.
        dataset: dataset label used in output file names.

    Side effects: writes the row-normalized confusion matrix to
    ``./output/KNN_<dataset>_confusion.csv``.
    """
    x, y, pipeline = data
    pipe = Pipeline([
        *pipeline,
        ('KNN', neighbors.KNeighborsClassifier()),
    ])
    print('Splitting data ' + dataset)
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)
    print('Calculating hyperparameters ' + dataset)
    # BUG FIX: `params` was referenced but never defined in this function
    # (NameError at runtime). Use the same KNN grid as the other experiments
    # in this project.
    params = {
        'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
        'KNN__n_neighbors': np.arange(1, 51, 3),
        'KNN__weights': ['uniform', 'distance'],
    }
    clf = basicResults(pipe, x_train, y_train, x_test, y_test, params,
                       'KNN', dataset)
    conf = confusion_matrix(y_test, clf.predict(x_test))
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/KNN_{}_confusion.csv'.format(dataset), conf,
               delimiter=',', fmt='%.2f')
def run_svm_rbf(data, dataset):
    """Run the RBF-kernel SVM experiment on the given data.

    Args:
        data: tuple ``(x, y, pipeline_steps)``.
        dataset: dataset label used in output file names.

    Side effects: writes the row-normalized confusion matrix to
    ``./output/SVM-RBF_<dataset>_confusion.csv``.
    """
    x, y, pipeline = data
    pipe = Pipeline([
        *pipeline,
        ('SVM', svm.SVC(class_weight='balanced')),
    ])
    print('Splitting data SVM RBF -- ' + dataset)
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)
    print('Computing hyperparameters SVM RBF -- ' + dataset)
    # BUG FIX: `params` was referenced but never defined in this function
    # (NameError at runtime). Standard C/gamma grid for an RBF SVC —
    # TODO confirm ranges against the original experiment design.
    params = {
        'SVM__C': [0.01, 0.1, 1, 10, 100],
        'SVM__gamma': [0.001, 0.01, 0.1, 1],
    }
    clf = basicResults(pipe, x_train, y_train, x_test, y_test, params,
                       'SVM-RBF', dataset)
    conf = confusion_matrix(y_test, clf.predict(x_test))
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/SVM-RBF_{}_confusion.csv'.format(dataset), conf,
               delimiter=',', fmt='%.2f')
def run_ann(data, dataset, solved_params=None):
    """Run the neural-network (MLP) experiment on the given data.

    Args:
        data: tuple ``(x, y, pipeline_steps)``.
        dataset: dataset label used in output file names.
        solved_params: optional dict of pre-solved pipeline hyperparameters;
            when given, the grid search is skipped.

    Side effects: writes the row-normalized confusion matrix to
    ``./output/ANN_<dataset>_confusion.csv``.
    """
    x, y, pipeline = data
    print('Data size: ', x.shape)
    pipe = Pipeline([
        *pipeline,
        ('ANN', neural_network.MLPClassifier(max_iter=1000,
                                             early_stopping=True)),
    ])
    print('Splitting dataset for ' + dataset)
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)
    if solved_params is None:
        print('Calculating hyperparameters for ' + dataset)
        # Hidden layer widths scale with the input dimensionality.
        dim = x.shape[1]
        params = {
            'ANN__hidden_layer_sizes': [(d,) for d in [dim, dim//2]],
            'ANN__solver': ['adam'],
            'ANN__alpha': 10.0 ** -np.arange(1, 7),
            'ANN__activation': ['relu', 'tanh', 'logistic'],
        }
        # basicResults fits the pipeline as part of the grid search.
        clf = basicResults(pipe, x_train, y_train, x_test, y_test, params,
                           'ANN', dataset)
    else:
        print('Using presolved hyperparameters for ' + dataset)
        clf = pipe.set_params(**solved_params)
        # BUG FIX: the original never fitted the pipeline in this branch, so
        # predict() below raised NotFittedError.
        clf.fit(x_train, y_train)
    conf = confusion_matrix(y_test, clf.predict(x_test))
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/ANN_{}_confusion.csv'.format(dataset), conf,
               delimiter=',', fmt='%.2f')
#Linear SVM pipeS = Pipeline([('Scale', StandardScaler()), ('SVM', SGDClassifier(loss='hinge', l1_ratio=0, penalty='l2', class_weight='balanced', random_state=55))]) params_spam = { 'SVM__alpha': alphas, 'SVM__n_iter': [int((1e6 / N_spam) / .8) + 1] } spam_clf = basicResults(pipeS, spam_trgX, spam_trgY, spam_tstX, spam_tstY, params_spam, 'SVM_Lin', 'spam') y_score = spam_clf.decision_function(spam_tstX) fpr, tpr, thresholds = roc_curve(spam_tstY, y_score) import matplotlib.pyplot as plt plt.figure() plt.plot(fpr, tpr) plt.xlabel('FPR') plt.ylabel('TPR') plt.title('ROC_Curve(Spambase)')
def main():
    """Linear- and RBF-SVM experiments on the adult-income dataset.

    Grid-searches hyperparameters via basicResults, then produces timing and
    iteration curves with the best (and deliberately-overfit) parameters.
    Commented-out wine-dataset code is kept for provenance.
    """
    adult = pd.read_csv('data/adult_parsed.csv')
    adult_income_X = adult.drop('income', 1).copy().values
    adult_income_Y = adult['income'].copy().values
    # wine_data = pd.read_csv('data/winequality_white.csv')
    # wine_data['category'] = wine_data['quality'] >= 7
    # wineX = wine_data[wine_data.columns[0:11]].values
    # wineY = wine_data['category'].values.astype(np.int)
    adult_income_trgX, adult_income_tstX, adult_income_trgY, adult_income_tstY = \
        ms.train_test_split(adult_income_X, adult_income_Y, test_size=0.3,
                            random_state=0, stratify=adult_income_Y)
    # wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(wineX, wineY, test_size=0.3, random_state=0,stratify=wineY)
    N_adult_income = adult_income_trgX.shape[0]
    # N_wine = wine_trgX.shape[0]
    # alphas = [10**-x for x in np.arange(1,9.01,1/2)]

    # ---------------- Linear SVM ----------------
    # pipeM (with feature culling) is built but only pipeA is exercised below;
    # the culling variant was used for the (commented-out) wine dataset.
    pipeM = Pipeline([('Scale', StandardScaler()),
                      ('Cull1', SelectFromModel(RandomForestClassifier(random_state=1), threshold='median')),
                      ('Cull2', SelectFromModel(RandomForestClassifier(random_state=2), threshold='median')),
                      ('SVM', SGDClassifier(loss='hinge', l1_ratio=0, penalty='l2',
                                            class_weight='balanced', random_state=55))])
    pipeA = Pipeline([('Scale', StandardScaler()),
                      ('SVM', SGDClassifier(loss='hinge', l1_ratio=0, penalty='l2',
                                            class_weight='balanced', random_state=55))])
    params_adult_income = {
        'SVM__alpha': [100, 10, 1, 0.1, 0.001, 0.0001],
        # NOTE(review): np.arange(0.1, 1, 10) evaluates to [0.1] — a single,
        # non-integer n_iter value. This looks unintended; confirm the grid.
        'SVM__n_iter': np.arange(0.1, 1, 10)
    }
    # params_wine = {'SVM__alpha':alphas,'SVM__n_iter':[int((1e6/N_wine)/.8)+1]}
    adult_income_clf = basicResults(pipeA, adult_income_trgX, adult_income_trgY,
                                    adult_income_tstX, adult_income_tstY,
                                    params_adult_income, 'SVM_Lin', 'adult_income')
    # wine_clf = basicResults(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,params_wine,'SVM_Lin','wine')
    # wine_final_params = wine_clf.best_params_
    # wine_OF_params = {'SVM__n_iter': 1303, 'SVM__alpha': 1e-16}
    #adult_income_final_params ={'SVM__alpha': 0.0001, 'SVM__n_iter': 428}
    adult_income_final_params = adult_income_clf.best_params_
    # Tiny alpha + fixed n_iter deliberately overfits for the OF curves.
    adult_income_OF_params = {'SVM__n_iter': 55, 'SVM__alpha': 1e-16}
    # pipeM.set_params(**wine_final_params)
    # makeTimingCurve(wineX,wineY,pipeM,'SVM_Lin','wine')
    pipeA.set_params(**adult_income_final_params)
    makeTimingCurve(adult_income_X, adult_income_Y, pipeA, 'SVM_Lin',
                    'adult_income')
    # pipeM.set_params(**wine_final_params)
    # iterationLC(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,{'SVM__n_iter':[2**x for x in range(12)]},'SVM_Lin','wine')
    pipeA.set_params(**adult_income_final_params)
    iterationLC(pipeA, adult_income_trgX, adult_income_trgY, adult_income_tstX,
                adult_income_tstY, {'SVM__n_iter': np.arange(1, 75, 3)},
                'SVM_Lin', 'adult_income')
    pipeA.set_params(**adult_income_OF_params)
    iterationLC(pipeA, adult_income_trgX, adult_income_trgY, adult_income_tstX,
                adult_income_tstY, {'SVM__n_iter': np.arange(1, 200, 5)},
                'SVM_LinOF', 'adult_income')
    # pipeM.set_params(**wine_OF_params)
    # iterationLC(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,{'SVM__n_iter':np.arange(100,2600,100)},'SVM_LinOF','wine')

    # ---------------- RBF SVM ----------------
    gamma_fracsA = np.arange(0.2, 2.1, 0.2)
    gamma_fracsM = np.arange(0.05, 1.01, 0.1)
    # pipeM = Pipeline([('Scale', StandardScaler()), ('Cull1', SelectFromModel(RandomForestClassifier(random_state=1), threshold='median')), ('Cull2', SelectFromModel(RandomForestClassifier(random_state=2), threshold='median')), ('SVM', primalSVM_RBF())])
    pipeA = Pipeline([('Scale', StandardScaler()), ('SVM', primalSVM_RBF())])
    params_adult_income = {
        'SVM__alpha': [100, 10, 1, 0.1, 0.001, 0.0001],
        'SVM__n_iter': [int((1e6 / N_adult_income) / .8) + 1],
        'SVM__gamma_frac': gamma_fracsA
    }
    # params_wine = {'SVM__alpha':alphas,'SVM__n_iter':[int((1e6/N_wine)/.8)+1],'SVM__gamma_frac':gamma_fracsM}
    # wine_clf = basicResults(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,params_wine,'SVM_RBF','wine')
    adult_income_clf = basicResults(pipeA, adult_income_trgX, adult_income_trgY,
                                    adult_income_tstX, adult_income_tstY,
                                    params_adult_income, 'SVM_RBF', 'adult_income')
    # wine_final_params = wine_clf.best_params_
    # wine_OF_params = wine_final_params.copy()
    # wine_OF_params['SVM__alpha'] = 1e-16
    adult_income_final_params = adult_income_clf.best_params_
    adult_income_OF_params = adult_income_final_params.copy()
    adult_income_OF_params['SVM__alpha'] = 1e-16
    # pipeM.set_params(**wine_final_params)
    # makeTimingCurve(wineX,wineY,pipeM,'SVM_RBF','wine')
    pipeA.set_params(**adult_income_final_params)
    # BUG FIX: the original passed pipeM (the *linear* SVM pipeline) here even
    # though pipeA is the pipeline that was just tuned for the RBF SVM.
    makeTimingCurve(adult_income_X, adult_income_Y, pipeA, 'SVM_RBF',
                    'adult_income')
    # pipeM.set_params(**wine_final_params)
    # iterationLC(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,{'SVM__n_iter':[2**x for x in range(12)]},'SVM_RBF','wine')
    pipeA.set_params(**adult_income_final_params)
    iterationLC(pipeA, adult_income_trgX, adult_income_trgY, adult_income_tstX,
                adult_income_tstY, {'SVM__n_iter': np.arange(1, 75, 3)},
                'SVM_RBF', 'adult_income')
    pipeA.set_params(**adult_income_OF_params)
    iterationLC(pipeA, adult_income_trgX, adult_income_trgY, adult_income_tstX,
                adult_income_tstY, {'SVM__n_iter': np.arange(1, 75, 3)},
                'SVM_RBF_OF', 'adult_income')
def main():
    """ANN (MLP) experiment on the adult-income dataset.

    Preprocesses the raw adult CSV (feature engineering + ordinal encoding of
    categoricals), grid-searches MLP hyperparameters, then produces timing and
    iteration curves. Commented-out wine-dataset code is kept for provenance.
    """
    adult = pd.read_csv('data/adult_parsed.csv')
    # Collapse the two capital columns into a single net figure and drop
    # columns not used by the experiment.
    adult['net_capital'] = adult['capital-gain'] - adult['capital-loss']
    adult = adult.drop(["fnlwgt", "capital-gain", "capital-loss", "workclass"],
                       axis=1)
    # Ordinal-encode the remaining categorical columns.
    adult['income'] = adult['income'].map({'<=50K': 0, '>50K': 1})
    adult['gender'] = adult['gender'].map({'Male': 0, 'Female': 1}).astype(int)
    adult['race'] = adult['race'].map({
        'Black': 0,
        'Asian-Pac-Islander': 1,
        'Other': 2,
        'White': 3,
        'Amer-Indian-Eskimo': 4
    }).astype(int)
    adult['marital-status'] = adult['marital-status'].map({
        'Never-married': 0,
        'Widowed': 1,
        'Divorced': 2,
        'Separated': 3,
        'Married-spouse-absent': 4,
        'Married-civ-spouse': 5,
        'Married-AF-spouse': 6
    })
    # Education encoded in increasing order of attainment.
    adult['education'] = adult['education'].map({
        'Preschool': 0,
        '1st-4th': 1,
        '5th-6th': 2,
        '7th-8th': 3,
        '9th': 4,
        '10th': 5,
        '11th': 6,
        '12th': 7,
        'Prof-school': 8,
        'HS-grad': 9,
        'Some-college': 10,
        'Assoc-voc': 11,
        'Assoc-acdm': 12,
        'Bachelors': 13,
        'Masters': 14,
        'Doctorate': 15
    })
    adult['occupation'] = adult['occupation'].map({
        'Priv-house-serv': 0,
        'Protective-serv': 1,
        'Handlers-cleaners': 2,
        'Machine-op-inspct': 3,
        'Adm-clerical': 4,
        'Farming-fishing': 5,
        'Transport-moving': 6,
        'Craft-repair': 7,
        'Other-service': 8,
        'Tech-support': 9,
        'Sales': 10,
        'Exec-managerial': 11,
        'Prof-specialty': 12,
        'Armed-Forces': 13
    })
    adult['native-country'] = adult['native-country'].map({
        '?': -1,
        'Puerto-Rico': 0,
        'Haiti': 1,
        'Cuba': 2,
        'Iran': 3,
        'Honduras': 4,
        'Jamaica': 5,
        'Vietnam': 6,
        'Mexico': 7,
        'Dominican-Republic': 8,
        'Laos': 9,
        'Ecuador': 10,
        'El-Salvador': 11,
        'Cambodia': 12,
        'Columbia': 13,
        'Guatemala': 14,
        'South': 15,
        'India': 16,
        'Nicaragua': 17,
        'Yugoslavia': 18,
        'Philippines': 19,
        'Thailand': 20,
        'Trinadad&Tobago': 21,
        'Peru': 22,
        'Poland': 23,
        'China': 24,
        'Hungary': 25,
        'Greece': 26,
        'Taiwan': 27,
        'Italy': 28,
        'Portugal': 29,
        'France': 30,
        'Hong': 31,
        'England': 32,
        'Scotland': 33,
        'Ireland': 34,
        'Holand-Netherlands': 35,
        'Canada': 36,
        'Germany': 37,
        'Japan': 38,
        'Outlying-US(Guam-USVI-etc)': 39,
        'United-States': 40
    })
    adult['relationship'] = adult['relationship'].map({
        'Unmarried': 0,
        'Other-relative': 1,
        'Not-in-family': 2,
        'Wife': 3,
        'Husband': 4,
        'Own-child': 5
    })
    # One-hot encode whatever object columns remain.
    adult = pd.get_dummies(adult)
    adult_income_X = adult.drop('income', 1).copy().values
    adult_income_Y = adult['income'].copy().values
    # wine_data = pd.read_csv('data/wine-red-white-merge.csv')
    # wineX = wine_data.drop('quality',1).copy().values
    # wineY = wine_data['quality'].copy().values
    adult_trgX, adult_tstX, adult_trgY, adult_tstY = ms.train_test_split(
        adult_income_X, adult_income_Y, test_size=0.3, random_state=0,
        stratify=adult_income_Y)
    # wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(wineX, wineY, test_size=0.3, random_state=0,stratify=wineY)
    pipeA = Pipeline([('Scale', StandardScaler()),
                      ('MLP', MLPClassifier(max_iter=2000, early_stopping=True,
                                            random_state=55))])
    # Feature-culling variant (only used by the commented-out wine runs).
    pipeM = Pipeline([('Scale', StandardScaler()),
                      ('Cull1', SelectFromModel(RandomForestClassifier(random_state=1), threshold='median')),
                      ('Cull2', SelectFromModel(RandomForestClassifier(random_state=2), threshold='median')),
                      ('Cull3', SelectFromModel(RandomForestClassifier(random_state=3), threshold='median')),
                      ('Cull4', SelectFromModel(RandomForestClassifier(random_state=4), threshold='median')),
                      ('MLP', MLPClassifier(max_iter=2000, early_stopping=True,
                                            random_state=55))])
    # Hidden layer widths scale with input dimensionality d.
    d = adult_income_X.shape[1]
    hiddens_adult = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    alphas = [10**-x for x in np.arange(-1, 5.01, 1 / 2)]
    alphasM = [10**-x for x in np.arange(-1, 9.01, 1 / 2)]
    # d = wineX.shape[1]
    # NOTE(review): hiddens_wine is computed from the *adult* d because the
    # wine d assignment above is commented out; it is unused here anyway.
    hiddens_wine = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    params_adult = {
        'MLP__activation': ['relu', 'logistic'],
        'MLP__alpha': alphas,
        'MLP__hidden_layer_sizes': hiddens_adult
    }
    # params_wine = {'MLP__activation':['relu','logistic'],'MLP__alpha':alphas,'MLP__hidden_layer_sizes':hiddens_wine}
    #
    # wine_clf = basicResults(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,params_wine,'ANN','wine')
    adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX,
                             adult_tstY, params_adult, 'ANN', 'adult')
    #wine_final_params = {'MLP__hidden_layer_sizes': (500,), 'MLP__activation': 'logistic', 'MLP__alpha': 10.0}
    #adult_final_params ={'MLP__hidden_layer_sizes': (28, 28, 28), 'MLP__activation': 'logistic', 'MLP__alpha': 0.0031622776601683794}
    # wine_final_params = wine_clf.best_params_
    adult_final_params = adult_clf.best_params_
    # alpha=0 removes regularization to deliberately overfit (OF curves).
    adult_OF_params = adult_final_params.copy()
    adult_OF_params['MLP__alpha'] = 0
    # wine_OF_params =wine_final_params.copy()
    # wine_OF_params['MLP__alpha'] = 0
    #raise
    # Early stopping is disabled for timing/iteration curves so max_iter is
    # actually honored.
    # pipeM.set_params(**wine_final_params)
    # NOTE(review): these live pipeM.set_params calls pair with wine runs that
    # are commented out — they have no observable effect; confirm intent.
    pipeM.set_params(**{'MLP__early_stopping': False})
    # makeTimingCurve(wineX,wineY,pipeM,'ANN','wine')
    pipeA.set_params(**adult_final_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    makeTimingCurve(adult_income_X, adult_income_Y, pipeA, 'ANN', 'adult')
    # pipeM.set_params(**wine_final_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    # iterationLC(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,{'MLP__max_iter':[2**x for x in range(12)]+[2100,2200,2300,2400,2500,2600,2700,2800,2900,3000]},'ANN','wine')
    pipeA.set_params(**adult_final_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    iterationLC(
        pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, {
            'MLP__max_iter':
            [2**x for x in range(12)] +
            [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        }, 'ANN', 'adult')
    # pipeM.set_params(**wine_OF_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    # iterationLC(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,{'MLP__max_iter':[2**x for x in range(12)]+[2100,2200,2300,2400,2500,2600,2700,2800,2900,3000]},'ANN_OF','wine')
    pipeA.set_params(**adult_OF_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    iterationLC(
        pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, {
            'MLP__max_iter':
            [2**x for x in range(12)] +
            [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        }, 'ANN_OF', 'adult')
# AdaBoost (SAMME) over project-defined base trees; fixed seed for
# reproducibility. adult_base/OF_base/paramsA come from earlier in the script.
adult_booster = AdaBoostClassifier(algorithm='SAMME', learning_rate=1,
                                   base_estimator=adult_base, random_state=55)
OF_booster = AdaBoostClassifier(algorithm='SAMME', learning_rate=1,
                                base_estimator=OF_base, random_state=55)
#pipeM = Pipeline([('Scale',StandardScaler()),
#                 ('Cull1',SelectFromModel(RandomForestClassifier(random_state=1),threshold='median')),
#                 ('Cull2',SelectFromModel(RandomForestClassifier(random_state=2),threshold='median')),
#                 ('Cull3',SelectFromModel(RandomForestClassifier(random_state=3),threshold='median')),
#                 ('Cull4',SelectFromModel(RandomForestClassifier(random_state=4),threshold='median')),
#                 ('Boost',madelon_booster)])
pipeA = Pipeline([('Scale', StandardScaler()), ('Boost', adult_booster)])
#
#madelon_clf = basicResults(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,paramsM,'Boost','madelon')
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         paramsA, 'Boost', 'adult')
#
#
#madelon_final_params = {'n_estimators': 20, 'learning_rate': 0.02}
#adult_final_params = {'n_estimators': 10, 'learning_rate': 1}
#OF_params = {'learning_rate':1}
#madelon_final_params = madelon_clf.best_params_
adult_final_params = adult_clf.best_params_
# Negative pruning alpha deliberately overfits the base trees (OF curves).
OF_params = {'Boost__base_estimator__alpha': -1, 'Boost__n_estimators': 50}
##
#pipeM.set_params(**madelon_final_params)
pipeA.set_params(**adult_final_params)
#makeTimingCurve(madelonX,madelonY,pipeM,'Boost','madelon')
threshold='median')),
                    # Continuation of the feature-selecting pipeline
                    # (pipeS_fs) whose head is defined above this chunk.
                    ('Cull4', SelectFromModel(RandomForestClassifier(random_state=4),
                                              threshold='median')),
                    ('DT', dtclf_pruned(random_state=55))])
# Variant without feature selection: scale + pruned decision tree only.
pipeS = Pipeline([('Scale', StandardScaler()),
                  ('DT', dtclf_pruned(random_state=55))])
# Shared DT grid; `alphas` (pruning strengths) comes from earlier in the script.
params = {
    'DT__criterion': ['gini', 'entropy'],
    'DT__alpha': alphas,
    'DT__class_weight': ['balanced']
}
spam_clf_fs = basicResults(pipeS_fs, spam_trgX, spam_trgY, spam_tstX,
                           spam_tstY, params, 'DT', 'spam_fs')
spam_clf = basicResults(pipeS, spam_trgX, spam_trgY, spam_tstX, spam_tstY,
                        params, 'DT', 'spam')
#madelon_final_params = {'DT__alpha': -0.00031622776601683794, 'DT__class_weight': 'balanced', 'DT__criterion': 'entropy'}
#adult_final_params = {'class_weight': 'balanced', 'alpha': 0.0031622776601683794, 'criterion': 'entropy'}
spam_fs_final_params = spam_clf_fs.best_params_
spam_final_params = spam_clf.best_params_
# Timing curves with each variant's tuned hyperparameters.
pipeS_fs.set_params(**spam_fs_final_params)
makeTimingCurve(spamX, spamY, pipeS_fs, 'DT', 'spam_fs')
pipeS.set_params(**spam_final_params)
makeTimingCurve(spamX, spamY, pipeS, 'DT', 'spam')
# Node count vs. pruning strength for both variants.
DTpruningVSnodes(pipeS_fs, alphas, spam_trgX, spam_trgY, 'spam_fs')
DTpruningVSnodes(pipeS, alphas, spam_trgX, spam_trgY, 'spam')
# AdaBoost (SAMME) boosters over project-defined base trees; fixed seed.
# cancer_base/adult_base/OF_base/paramsM/paramsA come from earlier in the script.
cancer_booster = AdaBoostClassifier(algorithm='SAMME', learning_rate=1,
                                    base_estimator=cancer_base, random_state=55)
adult_booster = AdaBoostClassifier(algorithm='SAMME', learning_rate=1,
                                   base_estimator=adult_base, random_state=55)
OF_booster = AdaBoostClassifier(algorithm='SAMME', learning_rate=1,
                                base_estimator=OF_base, random_state=55)
# Cancer pipeline: scale + four rounds of median-threshold feature culling.
pipeM = Pipeline([('Scale', StandardScaler()),
                  ('Cull1', SelectFromModel(RandomForestClassifier(random_state=1), threshold='median')),
                  ('Cull2', SelectFromModel(RandomForestClassifier(random_state=2), threshold='median')),
                  ('Cull3', SelectFromModel(RandomForestClassifier(random_state=3), threshold='median')),
                  ('Cull4', SelectFromModel(RandomForestClassifier(random_state=4), threshold='median')),
                  ('Boost', cancer_booster)])
# Adult pipeline: scaling only.
pipeA = Pipeline([('Scale', StandardScaler()), ('Boost', adult_booster)])
#
cancer_clf = basicResults(pipeM, cancer_trgX, cancer_trgY, cancer_tstX,
                          cancer_tstY, paramsM, 'Boost', 'cancer')
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         paramsA, 'Boost', 'adult')
#
#madelon_final_params = {'n_estimators': 20, 'learning_rate': 0.02}
#adult_final_params = {'n_estimators': 10, 'learning_rate': 1}
#OF_params = {'learning_rate':1}
cancer_final_params = cancer_clf.best_params_
adult_final_params = adult_clf.best_params_
# Negative pruning alpha deliberately overfits the base trees (OF curves).
OF_params = {'Boost__base_estimator__alpha': -1, 'Boost__n_estimators': 50}
##
pipeM.set_params(**cancer_final_params)
pipeA.set_params(**adult_final_params)
makeTimingCurve(cancerX, cancerY, pipeM, 'Boost', 'cancer')
# Boosting grid: ensemble size plus base-tree pruning strength.
# `alphas`, `ab_base`, `OF_base` and the ab_* splits come from earlier in the
# script.
paramsA = {
    'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
    'Boost__base_estimator__alpha': alphas
}
ab_booster = AdaBoostClassifier(algorithm='SAMME', learning_rate=1,
                                base_estimator=ab_base, random_state=55)
OF_booster = AdaBoostClassifier(algorithm='SAMME', learning_rate=1,
                                base_estimator=OF_base, random_state=55)
pipeA = Pipeline([('Scale', StandardScaler()), ('Boost', ab_booster)])
ab_clf = basicResults(pipeA, ab_trgX, ab_trgY, ab_tstX, ab_tstY, paramsA,
                      'Boost', 'ab')
ab_final_params = ab_clf.best_params_
# Negative pruning alpha deliberately overfits the base trees (OF curves).
OF_params = {'Boost__base_estimator__alpha': -1, 'Boost__n_estimators': 50}
# Timing and iteration curves with the tuned hyperparameters.
pipeA.set_params(**ab_final_params)
makeTimingCurve(abX, abY, pipeA, 'Boost', 'ab')
pipeA.set_params(**ab_final_params)
iterationLC(pipeA, ab_trgX, ab_trgY, ab_tstX, ab_tstY,
            {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50]},
            'Boost', 'ab')
#pipeM = Pipeline([('Scale',StandardScaler()), # ('Cull1',SelectFromModel(RandomForestClassifier(random_state=1),threshold='median')), # ('Cull2',SelectFromModel(RandomForestClassifier(random_state=2),threshold='median')), # ('Cull3',SelectFromModel(RandomForestClassifier(random_state=3),threshold='median')), # ('Cull4',SelectFromModel(RandomForestClassifier(random_state=4),threshold='median')), # ('Boost',madelon_booster)]) # Build pipeline for feature scaling and learner pipeA = Pipeline([('Scale', StandardScaler()), ('Boost', adult_booster)]) pipeM = Pipeline([('Scale', StandardScaler()), ('Boost', mushrooms_booster)]) pipeR = Pipeline([('Scale', StandardScaler()), ('Boost', redwine_booster)]) # Perform grid search cross validation over the hyperparameter grid #madelon_clf = basicResults(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,paramsM,'Boost','madelon') adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, paramsA, 'Boost', 'adult') mushrooms_clf = basicResults(pipeM, mushrooms_trgX, mushrooms_trgY, mushrooms_tstX, mushrooms_tstY, paramsM, 'Boost', 'mushrooms') redwine_clf = basicResults(pipeR, redwine_trgX, redwine_trgY, redwine_tstX, redwine_tstY, paramsR, 'Boost', 'redwine') # #madelon_final_params = {'n_estimators': 20, 'learning_rate': 0.02} #adult_final_params = {'n_estimators': 10, 'learning_rate': 1} #OF_params = {'learning_rate':1} # Save hyperparameters that grid search cross validation has identified as optimal #madelon_final_params = madelon_clf.best_params_ adult_final_params = adult_clf.best_params_ mushrooms_final_params = mushrooms_clf.best_params_
('SVM', primalSVM_RBF())])
# Adult pipeline: scaling + primal RBF SVM (pipeM's head is defined above
# this chunk).
pipeA = Pipeline([('Scale', StandardScaler()), ('SVM', primalSVM_RBF())])
# n_iter chosen so n_iter * n_training_samples ~ 1e6 updates.
params_adult = {
    'SVM__alpha': alphas,
    'SVM__n_iter': [int((1e6 / N_adult) / .8) + 1],
    'SVM__gamma_frac': gamma_fracsA
}
params_madelon = {
    'SVM__alpha': alphas,
    'SVM__n_iter': [int((1e6 / N_madelon) / .8) + 1],
    'SVM__gamma_frac': gamma_fracsM
}
#
madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX,
                           madelon_tstY, params_madelon, 'SVM_RBF', 'madelon')
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         params_adult, 'SVM_RBF', 'adult')
madelon_final_params = madelon_clf.best_params_
# alpha -> 1e-16 effectively removes regularization (deliberate overfit).
madelon_OF_params = madelon_final_params.copy()
madelon_OF_params['SVM__alpha'] = 1e-16
adult_final_params = adult_clf.best_params_
adult_OF_params = adult_final_params.copy()
adult_OF_params['SVM__alpha'] = 1e-16
pipeM.set_params(**madelon_final_params)
makeTimingCurve(madelonX, madelonY, pipeM, 'SVM_RBF', 'madelon')
pipeA.set_params(**adult_final_params)
# NOTE(review): pipeM is passed here although pipeA was just configured with
# the adult parameters — looks like a copy-paste slip; confirm the intended
# pipeline is pipeA.
makeTimingCurve(adultX, adultY, pipeM, 'SVM_RBF', 'adult')
# Regularization grid (unused here — the runs below use `alphas` from earlier
# in the script).
alphasM = [10**-x for x in np.arange(-1, 9.01, 1 / 2)]
# Hidden layer widths scale with madelon's input dimensionality.
d = madelonX.shape[1]
# d = d//(2**4)
hiddens_madelon = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
params_adult = {
    'MLP__activation': ['relu', 'logistic'],
    'MLP__alpha': alphas,
    'MLP__hidden_layer_sizes': hiddens_adult
}
params_madelon = {
    'MLP__activation': ['relu', 'logistic'],
    'MLP__alpha': alphas,
    'MLP__hidden_layer_sizes': hiddens_madelon
}
#
# NOTE(review): the output label 'cancer' is used with the madelon data —
# looks like a copy-paste mislabel; confirm the intended dataset name.
madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX,
                           madelon_tstY, params_madelon, 'ANN', 'cancer')
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         params_adult, 'ANN', 'adult')
#madelon_final_params = {'MLP__hidden_layer_sizes': (500,), 'MLP__activation': 'logistic', 'MLP__alpha': 10.0}
#adult_final_params ={'MLP__hidden_layer_sizes': (28, 28, 28), 'MLP__activation': 'logistic', 'MLP__alpha': 0.0031622776601683794}
madelon_final_params = madelon_clf.best_params_
adult_final_params = adult_clf.best_params_
# alpha=0 removes regularization to deliberately overfit (OF curves).
adult_OF_params = adult_final_params.copy()
adult_OF_params['MLP__alpha'] = 0
madelon_OF_params = madelon_final_params.copy()
madelon_OF_params['MLP__alpha'] = 0
#raise
# Target vector; `adult`/`adultX` are prepared above this chunk.
adultY = adult['income'].copy().values
# Stratified 70/30 split, fixed seed for reproducibility.
adult_trgX, adult_tstX, adult_trgY, adult_tstY = ms.train_test_split(adultX, adultY, test_size=0.3, random_state=0, stratify=adultY)
N_adult = adult_trgX.shape[0]
alphas = [10**-x for x in np.arange(1, 9.01, 1 / 2)]
#Linear SVM
pipeA = Pipeline([('Scale', StandardScaler()),
                  ('SVM', SGDClassifier(loss='hinge', l1_ratio=0, penalty='l2',
                                        class_weight='balanced', random_state=55))])
# n_iter chosen so n_iter * n_training_samples ~ 1e6 updates.
params_adult = {'SVM__alpha': alphas, 'SVM__n_iter': [int((1e6 / N_adult) / .8) + 1]}
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         params_adult, 'SVM_Lin', 'adult')
adult_final_params = adult_clf.best_params_
#adult_OF_params ={'SVM__n_iter': 55, 'SVM__alpha': 1e-16}
#
#
# alpha -> 1e-16 effectively removes regularization (deliberate overfit).
adult_OF_params = adult_final_params.copy()
adult_OF_params['SVM__alpha'] = 1e-16
pipeA.set_params(**adult_final_params)
makeTimingCurve(adultX, adultY, pipeA, 'SVM_Lin', 'adult')
pipeA.set_params(**adult_final_params)
iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
            {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_Lin', 'adult')
#
pipeA.set_params(**adult_OF_params)
# --- KNN experiment on the "ab" (abalone) dataset: grid-search metric, ---
# --- k and weighting, then produce a timing curve with the best model. ---
from sklearn.feature_selection import SelectFromModel  # NOTE(review): unused in this fragment
ab = pd.read_hdf('datasets.hdf', 'ab')
# NOTE(review): positional axis in drop() ('rings', 1) is removed in
# pandas >= 2.0; requires an older pandas.
abX = ab.drop('rings', 1).copy().values
abY = ab['rings'].copy().values
# Stratified 70/30 split, fixed seed.
ab_trgX, ab_tstX, ab_trgY, ab_tstY = ms.train_test_split(abX, abY, test_size=0.3, random_state=0, stratify=abY)
d = abX.shape[1]
# NOTE(review): hiddens_ab and alphas are leftovers from the ANN template —
# nothing below uses them.
hiddens_ab = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
alphas = [10**-x for x in np.arange(1, 9.01, 1 / 2)]
pipeA = Pipeline([('Scale', StandardScaler()), ('KNN', knnC())])
params_ab = {
    'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
    'KNN__n_neighbors': np.arange(1, 51, 3),
    'KNN__weights': ['uniform', 'distance']
}
ab_clf = basicResults(pipeA, ab_trgX, ab_trgY, ab_tstX, ab_tstY, params_ab, 'KNN', 'ab')
ab_final_params = ab_clf.best_params_
pipeA.set_params(**ab_final_params)
makeTimingCurve(abX, abY, pipeA, 'KNN', 'ab')
# --- ANN experiment on "spam" with feature selection (the *_fs pipeline). ---
# Fragment starts mid-expression: the line below closes a Pipeline([...])
# whose opening (pipeS_fs with Cull/feature-selection steps) is off-screen.
                         MLPClassifier(max_iter=2000, early_stopping=True, random_state=55))])
d = spamX.shape[1]
# 1-3 hidden layers, widths d, d/2 and 2d.
hiddens_spam = [(h, ) * l for l in [1, 2, 3] for h in [d, int(d // 2), d * 2]]
# Alpha grid from 10^1 down to 10^-8 in half-decade steps.
alphas = [10**-x for x in np.arange(-1, 8.01, 1 / 2)]
params_spam = {
    'MLP__activation': ['relu', 'logistic'],
    'MLP__alpha': alphas,
    'MLP__hidden_layer_sizes': hiddens_spam
}
#spam_clf = basicResults(pipeS,spam_trgX,spam_trgY,spam_tstX,spam_tstY,params_spam,'ANN','spam')
spam_clf_fs = basicResults(pipeS_fs, spam_trgX, spam_trgY, spam_tstX, spam_tstY, params_spam, 'ANN', 'spam_fs')
#spam_final_params = spam_clf.best_params_
#spam_OF_params =spam_final_params.copy()
#spam_OF_params['MLP__alpha'] = 0
spam_fs_final_params = spam_clf_fs.best_params_
# Overfit variant: best params with regularization disabled.
spam_fs_OF_params = spam_fs_final_params.copy()
spam_fs_OF_params['MLP__alpha'] = 0
#pipeS.set_params(**spam_final_params)
#pipeS.set_params(**{'MLP__early_stopping':False})
#makeTimingCurve(spamX,spamY,pipeS,'ANN','spam')
# Early stopping disabled so later iteration curves see the full training run.
pipeS_fs.set_params(**spam_fs_final_params)
pipeS_fs.set_params(**{'MLP__early_stopping': False})
# ('Cull2',SelectFromModel(RandomForestClassifier(random_state=2),threshold='median')), # ('Cull3',SelectFromModel(RandomForestClassifier(random_state=3),threshold='median')), # ('Cull4',SelectFromModel(RandomForestClassifier(random_state=4),threshold='median')), # ('MLP',MLPClassifier(max_iter=2000,early_stopping=True,random_state=55))]) d = spamX.shape[1] hiddens_spam = [(h, ) * l for l in [3] for h in [d * 2]] alphas = [10**-x for x in np.arange(-1, 8.01, 1 / 2)] params_spam = { 'MLP__activation': ['relu'], 'MLP__alpha': alphas, 'MLP__hidden_layer_sizes': hiddens_spam } spam_clf = basicResults(pipeS, spam_trgX, spam_trgY, spam_tstX, spam_tstY, params_spam, 'ANN', 'spam') #spam_clf_fs = basicResults(pipeS_fs,spam_trgX,spam_trgY,spam_tstX,spam_tstY,params_spam,'ANN','spam_fs') spam_final_params = spam_clf.best_params_ spam_OF_params = spam_final_params.copy() spam_OF_params['MLP__alpha'] = 0 #spam_fs_final_params = spam_clf_fs.best_params_ #spam_fs_OF_params =spam_fs_final_params.copy() #spam_fs_OF_params['MLP__alpha'] = 0 pipeS.set_params(**spam_final_params) pipeS.set_params(**{'MLP__early_stopping': False}) makeTimingCurve(spamX, spamY, pipeS, 'ANN', 'spam') #pipeS_fs.set_params(**spam_fs_final_params)
# --- Linear SVM (SGD, hinge loss) on "adult": grid search, timing curve, ---
# --- and iteration learning curve. alphas/N_adult/adult_* come from off-screen. ---
pipeA = Pipeline([('Scale', StandardScaler()),
                  ('SVM', SGDClassifier(loss='hinge', l1_ratio=0, penalty='l2', class_weight='balanced', random_state=55))])
params_adult = {
    'SVM__alpha': alphas,
    # Enough epochs to see ~1e6 samples; NOTE(review): `n_iter` only exists
    # in old sklearn (renamed `max_iter` later) — verify pinned version.
    'SVM__n_iter': [int((1e6 / N_adult) / .8) + 1]
}
# print("target",np.unique(adult_trgY))
# print("train",np.unique(adult_tstY))
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, params_adult, 'SVM_Lin', 'adult')
adult_final_params = adult_clf.best_params_
#adult_OF_params ={'SVM__n_iter': 55, 'SVM__alpha': 1e-16}
#
#
# Overfit variant: effectively unregularized.
adult_OF_params = adult_final_params.copy()
adult_OF_params['SVM__alpha'] = 1e-16
pipeA.set_params(**adult_final_params)
makeTimingCurve(adultX, adultY, pipeA, 'SVM_Lin', 'adult')
pipeA.set_params(**adult_final_params)
iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_Lin', 'adult')
#
# ('Cull1',SelectFromModel(RandomForestClassifier(),threshold='median')), # ('Cull2',SelectFromModel(RandomForestClassifier(),threshold='median')), # ('Cull3',SelectFromModel(RandomForestClassifier(),threshold='median')), # ('Cull4',SelectFromModel(RandomForestClassifier(),threshold='median')), # ('KNN',knnC())]) pipeA = Pipeline([('Scale',StandardScaler()), ('KNN',knnC())]) #params_madelon= {'KNN__metric':['manhattan','euclidean','chebyshev'],'KNN__n_neighbors':np.arange(1,51,3),'KNN__weights':['uniform','distance']} params_adult= {'KNN__metric':['manhattan','euclidean','chebyshev'],'KNN__n_neighbors':np.arange(1,51,3),'KNN__weights':['uniform','distance']} #madelon_clf = basicResults(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,params_madelon,'KNN','madelon') adult_clf = basicResults(pipeA,adult_trgX,adult_trgY,adult_tstX,adult_tstY,params_adult,'KNN','adult') #madelon_final_params={'KNN__n_neighbors': 43, 'KNN__weights': 'uniform', 'KNN__p': 1} #adult_final_params={'KNN__n_neighbors': 142, 'KNN__p': 1, 'KNN__weights': 'uniform'} #madelon_final_params=madelon_clf.best_params_ adult_final_params=adult_clf.best_params_ #pipeM.set_params(**madelon_final_params) #makeTimingCurve(madelonX,madelonY,pipeM,'KNN','madelon') pipeA.set_params(**adult_final_params) makeTimingCurve(adultX,adultY,pipeA,'KNN','adult')
# ('Cull3',SelectFromModel(RandomForestClassifier(random_state=3),threshold='median')), # ('Cull4',SelectFromModel(RandomForestClassifier(random_state=4),threshold='median')), # ('DT',dtclf_pruned(random_state=55))]) # pipeA = Pipeline([('Scale', StandardScaler()), ('DT', dtclf_pruned(random_state=55))]) params = { 'DT__criterion': ['gini', 'entropy'], 'DT__alpha': alphas, 'DT__class_weight': ['balanced'] } #madelon_clf = basicResults(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,params,'DT','madelon') adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, params, 'DT', 'adult') #madelon_final_params = {'DT__alpha': -0.00031622776601683794, 'DT__class_weight': 'balanced', 'DT__criterion': 'entropy'} #adult_final_params = {'class_weight': 'balanced', 'alpha': 0.0031622776601683794, 'criterion': 'entropy'} #madelon_final_params = madelon_clf.best_params_ adult_final_params = adult_clf.best_params_ #pipeM.set_params(**madelon_final_params) #makeTimingCurve(madelonX,madelonY,pipeM,'DT','madelon') pipeA.set_params(**adult_final_params) makeTimingCurve(adultX, adultY, pipeA, 'DT', 'adult') #DTpruningVSnodes(pipeM,alphas,madelon_trgX,madelon_trgY,'madelon') DTpruningVSnodes(pipeA, alphas, adult_trgX, adult_trgY, 'adult') # SPAM
# --- Generic SVM experiment fragment (dataset-parameterized variant). ---
# Starts mid-dict: the opening of `params = {` is off-screen; the trailing
# iterationLC call is also cut off mid-arguments.
    'SVM__kernel': ['linear', 'poly', 'rbf'],
    'SVM__C': [.1, .5, 1],
    'SVM__gamma': ['scale']
}
# Model-complexity sweep over the penalty parameter C.
complexity_params = {
    'name': 'SVM__C',
    'display_name': 'Penalty',
    'values': np.arange(0.001, 2.5, 0.1)
}
# NOTE(review): this basicResults signature (scorer/complexity_curve/
# complexity_params/clf_name kwargs) differs from the other call sites —
# presumably a different helper version; verify.
data_clf = basicResults(pipeM, data_train_x, data_train_y, data_test_x, data_test_y, params, 'SVM', dataset,
                        scorer='f1', complexity_curve=True, complexity_params=complexity_params, clf_name='SVM')
data_final_params = data_clf.best_params_
pipeM.set_params(**data_final_params)
makeTimingCurve(data_x, data_y, pipeM, 'SVM', dataset)
iterationLC(pipeM, data_train_x, data_train_y, data_test_x, data_test_y, {'SVM__max_iter': range(1, 250, 10)},
# --- Generic ANN experiment fragment (dataset-parameterized variant): ---
# --- grid search, timing curve, iteration learning curve. ---
pipeM = Pipeline([
    #('Scale',StandardScaler()),  # scaling deliberately disabled here
    ('MLP', MLPClassifier(max_iter=2000, early_stopping=False, random_state=55))
])
d = data_x.shape[1]
# 1-3 hidden layers with widths d, d/2 and 2d.
hiddens_data = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
# Coarser alpha grid than siblings: whole-decade steps, 10^1 .. 10^-3.
alphas = [10**-x for x in np.arange(-1, 3.01, 1)]
params = {
    'MLP__activation': ['relu', 'logistic'],
    'MLP__hidden_layer_sizes': hiddens_data,
    'MLP__alpha': alphas
}
data_clf = basicResults(pipeM, data_train_x, data_train_y, data_test_x, data_test_y, params, 'ANN', dataset)
data_final_params = data_clf.best_params_
pipeM.set_params(**data_final_params)
makeTimingCurve(data_x, data_y, pipeM, 'ANN', dataset)
# Learning curve over training iterations: 1, 2, 4, ... 128.
iterationLC(pipeM, data_train_x, data_train_y, data_test_x, data_test_y, {'MLP__max_iter': [2**x for x in range(8)]}, 'ANN', dataset=dataset)
def main():
    """Run the ANN (MLP) experiments for the 'cars' and 'madelon' datasets.

    Loads both datasets from HDF5, grid-searches MLP hyper-parameters,
    then produces timing curves and iteration learning curves for the
    best and deliberately-overfit (alpha=0) parameter sets. Relies on
    module-level helpers basicResults/makeTimingCurve/iterationLC and
    imports (pd, ms, np, Pipeline, ...) defined elsewhere in the file.
    """
    cars = pd.read_hdf('data/processed/datasets.hdf', 'cars')
    # NOTE(review): positional axis in drop('Class', 1) requires pandas < 2.0.
    carsX = cars.drop('Class', 1).copy().values
    carsY = cars['Class'].copy().values
    madelon = pd.read_hdf('data/processed/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', 1).copy().values
    madelonY = madelon['Class'].copy().values
    # Stratified 70/30 splits with a fixed seed.
    cars_trgX, cars_tstX, cars_trgY, cars_tstY = ms.train_test_split(
        carsX, carsY, test_size=0.3, random_state=0, stratify=carsY)
    madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
        madelonX, madelonY, test_size=0.3, random_state=0, stratify=madelonY)
    # pipeA: plain scaled MLP for cars. pipeM: madelon adds four rounds of
    # random-forest feature culling (each keeps the top half by importance).
    pipeA = Pipeline([('Scale', StandardScaler()),
                      ('MLP', MLPClassifier(max_iter=2000, early_stopping=True, random_state=55))])
    pipeM = Pipeline([('Scale', StandardScaler()),
                      ('Cull1', SelectFromModel(RandomForestClassifier(random_state=1), threshold='median')),
                      ('Cull2', SelectFromModel(RandomForestClassifier(random_state=2), threshold='median')),
                      ('Cull3', SelectFromModel(RandomForestClassifier(random_state=3), threshold='median')),
                      ('Cull4', SelectFromModel(RandomForestClassifier(random_state=4), threshold='median')),
                      ('MLP', MLPClassifier(max_iter=2000, early_stopping=True, random_state=55))])
    d = carsX.shape[1]
    # 1-3 hidden layers with widths d, d/2 and 2d.
    hiddens_cars = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    alphas = [10**-x for x in np.arange(-1, 5.01, 1 / 2)]
    alphasM = [10**-x for x in np.arange(-1, 9.01, 1 / 2)]  # NOTE(review): unused — params_madelon uses `alphas`; confirm intent
    d = madelonX.shape[1]
    d = d // (2**4)  # madelon widths sized to the post-culling feature count (4 halvings)
    hiddens_madelon = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    params_cars = {
        'MLP__activation': ['relu', 'logistic'],
        'MLP__alpha': alphas,
        'MLP__hidden_layer_sizes': hiddens_cars
    }
    params_madelon = {
        'MLP__activation': ['relu', 'logistic'],
        'MLP__alpha': alphas,
        'MLP__hidden_layer_sizes': hiddens_madelon
    }
    # madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY, params_madelon, 'ANN', 'madelon')
    cars_clf = basicResults(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY, params_cars, 'ANN', 'cars')
    #madelon_final_params = {'MLP__hidden_layer_sizes': (500,), 'MLP__activation': 'logistic', 'MLP__alpha': 10.0}
    #cars_final_params ={'MLP__hidden_layer_sizes': (28, 28, 28), 'MLP__activation': 'logistic', 'MLP__alpha': 0.0031622776601683794}
    # NOTE(review): the madelon basicResults call is commented out above, so
    # `madelon_clf` is undefined here — this line raises NameError as written.
    # Either re-enable the search or use the hard-coded params above.
    madelon_final_params = madelon_clf.best_params_
    cars_final_params = cars_clf.best_params_
    # "OF" variants: best params with regularization removed, to demonstrate
    # overfitting in the learning curves below.
    cars_OF_params = cars_final_params.copy()
    cars_OF_params['MLP__alpha'] = 0
    madelon_OF_params = madelon_final_params.copy()
    madelon_OF_params['MLP__alpha'] = 0
    #raise
    #
    # Timing curves with early stopping off so runs are full-length.
    pipeM.set_params(**madelon_final_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    makeTimingCurve(madelonX, madelonY, pipeM, 'ANN', 'madelon')
    pipeA.set_params(**cars_final_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    makeTimingCurve(carsX, carsY, pipeA, 'ANN', 'cars')
    # Iteration learning curves: 1..2048 (powers of two) then 2100..3000.
    pipeM.set_params(**madelon_final_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    iterationLC(
        pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY, {
            'MLP__max_iter':
            [2**x for x in range(12)] +
            [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        }, 'ANN', 'madelon')
    pipeA.set_params(**cars_final_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    iterationLC(
        pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY, {
            'MLP__max_iter':
            [2**x for x in range(12)] +
            [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        }, 'ANN', 'cars')
    # Same curves for the unregularized ("OF") parameter sets.
    pipeM.set_params(**madelon_OF_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    iterationLC(
        pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY, {
            'MLP__max_iter':
            [2**x for x in range(12)] +
            [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        }, 'ANN_OF', 'madelon')
    pipeA.set_params(**cars_OF_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    iterationLC(
        pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY, {
            'MLP__max_iter':
            [2**x for x in range(12)] +
            [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        }, 'ANN_OF', 'cars')
def main():
    """Run the KNN experiment on the adult-income dataset.

    Parses data/adult_parsed.csv, manually ordinal-encodes the categorical
    columns, grid-searches KNN hyper-parameters, and produces a timing
    curve for the best model. Relies on module-level imports (pd, ms, np,
    Pipeline, knnC, ...) and helpers (basicResults, makeTimingCurve)
    defined elsewhere in the file. May continue past the visible chunk.
    """
    adult = pd.read_csv('data/adult_parsed.csv')
    # plt.figure(figsize=(15,12))
    # cor_map = adult.corr()
    # sns.heatmap(cor_map, annot=True, fmt='.3f', cmap='YlGnBu')
    # plt.show()
    # Collapse the two capital columns into one net figure, then drop
    # columns judged uninformative/redundant.
    adult['net_capital'] = adult['capital-gain'] - adult['capital-loss']
    adult = adult.drop(["fnlwgt", "capital-gain", "capital-loss", "workclass"], axis=1)
    # Manual ordinal encodings. NOTE(review): .map() leaves NaN for any
    # category not listed (e.g. '?' occupation) — confirm the CSV is clean,
    # otherwise the .astype(int) calls below would raise.
    adult['income'] = adult['income'].map({'<=50K': 0, '>50K': 1})
    adult['gender'] = adult['gender'].map({'Male': 0, 'Female': 1}).astype(int)
    adult['race'] = adult['race'].map({'Black': 0, 'Asian-Pac-Islander': 1, 'Other': 2, 'White': 3, 'Amer-Indian-Eskimo': 4}).astype(int)
    adult['marital-status'] = adult['marital-status'].map({'Never-married': 0, 'Widowed': 1, 'Divorced': 2, 'Separated': 3, 'Married-spouse-absent': 4, 'Married-civ-spouse': 5, 'Married-AF-spouse': 6})
    # Education encoded in increasing order of attainment.
    adult['education'] = adult['education'].map({'Preschool': 0, '1st-4th': 1, '5th-6th': 2, '7th-8th': 3, '9th': 4, '10th': 5, '11th': 6, '12th': 7, 'Prof-school': 8, 'HS-grad': 9, 'Some-college': 10, 'Assoc-voc': 11, 'Assoc-acdm': 12, 'Bachelors': 13, 'Masters': 14, 'Doctorate': 15})
    adult['occupation'] = adult['occupation'].map({'Priv-house-serv': 0, 'Protective-serv': 1, 'Handlers-cleaners': 2, 'Machine-op-inspct': 3, 'Adm-clerical': 4, 'Farming-fishing': 5, 'Transport-moving': 6, 'Craft-repair': 7, 'Other-service': 8, 'Tech-support': 9, 'Sales': 10, 'Exec-managerial': 11, 'Prof-specialty': 12, 'Armed-Forces': 13})
    adult['native-country'] = adult['native-country'].map({'?': -1, 'Puerto-Rico': 0, 'Haiti': 1, 'Cuba': 2, 'Iran': 3, 'Honduras': 4, 'Jamaica': 5, 'Vietnam': 6, 'Mexico': 7, 'Dominican-Republic': 8, 'Laos': 9, 'Ecuador': 10, 'El-Salvador': 11, 'Cambodia': 12, 'Columbia': 13, 'Guatemala': 14, 'South': 15, 'India': 16, 'Nicaragua': 17, 'Yugoslavia': 18, 'Philippines': 19, 'Thailand': 20, 'Trinadad&Tobago': 21, 'Peru': 22, 'Poland': 23, 'China': 24, 'Hungary': 25, 'Greece': 26, 'Taiwan': 27, 'Italy': 28, 'Portugal': 29, 'France': 30, 'Hong': 31, 'England': 32, 'Scotland': 33, 'Ireland': 34, 'Holand-Netherlands': 35, 'Canada': 36, 'Germany': 37, 'Japan': 38, 'Outlying-US(Guam-USVI-etc)': 39, 'United-States': 40})
    adult['relationship'] = adult['relationship'].map({'Unmarried': 0, 'Other-relative': 1, 'Not-in-family': 2, 'Wife': 3, 'Husband': 4, 'Own-child': 5})
    # One-hot any remaining object columns (everything above is now numeric).
    adult = pd.get_dummies(adult)
    # NOTE(review): positional axis in drop('income', 1) requires pandas < 2.0.
    adult_income_X = adult.drop('income', 1).copy().values
    adult_income_Y = adult['income'].copy().values
    # wine_data = pd.read_csv('data/wine-red-white-merge.csv')
    # wineX = wine_data.drop('quality',1).copy().values
    # wineY = wine_data['quality'].copy().values
    # Stratified 70/30 split, fixed seed.
    adult_income_trgX, adult_income_tstX, adult_income_trgY, adult_income_tstY = ms.train_test_split(adult_income_X, adult_income_Y, test_size=0.3, random_state=0, stratify=adult_income_Y)
    # wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(wineX, wineY, test_size=0.3, random_state=0,stratify=wineY)
    d = adult_income_X.shape[1]
    # NOTE(review): hiddens/alphas are leftovers from the ANN template —
    # unused in this KNN experiment.
    hiddens_adult_income = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    alphas = [10**-x for x in np.arange(1, 9.01, 1 / 2)]
    # d = wineX.shape[1]
    # hiddens_wine = [(h,)*l for l in [1,2,3] for h in [d,d//2,d*2]]
    # pipeM (with 4 rounds of random-forest feature culling) is built but
    # only used by the commented-out wine experiment below.
    pipeM = Pipeline([('Scale', StandardScaler()),
                      ('Cull1', SelectFromModel(RandomForestClassifier(), threshold='median')),
                      ('Cull2', SelectFromModel(RandomForestClassifier(), threshold='median')),
                      ('Cull3', SelectFromModel(RandomForestClassifier(), threshold='median')),
                      ('Cull4', SelectFromModel(RandomForestClassifier(), threshold='median')),
                      ('KNN', knnC())])
    pipeA = Pipeline([('Scale', StandardScaler()), ('KNN', knnC())])
    params_adult_income = {'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'], 'KNN__n_neighbors': np.arange(1, 51, 3), 'KNN__weights': ['uniform', 'distance']}
    # params_wine= {'KNN__metric':['manhattan','euclidean','chebyshev'],'KNN__n_neighbors':np.arange(1,51,3),'KNN__weights':['uniform','distance']}
    adult_income_clf = basicResults(pipeA, adult_income_trgX, adult_income_trgY, adult_income_tstX, adult_income_tstY, params_adult_income, 'KNN', 'adult_income')
    # wine_clf = basicResults(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,params_wine,'KNN','wine')
    # wine_final_params={'KNN__n_neighbors': 43, 'KNN__weights': 'uniform', 'KNN__p': 1}
    #adult_income_final_params={'KNN__n_neighbors': 142, 'KNN__p': 1, 'KNN__weights': 'uniform'}
    # wine_final_params=wine_clf.best_params_
    adult_income_final_params = adult_income_clf.best_params_
    # pipeM.set_params(**wine_final_params)
    # makeTimingCurve(wineX,wineY,pipeM,'KNN','wine')
    pipeA.set_params(**adult_income_final_params)
    makeTimingCurve(adult_income_X, adult_income_Y, pipeA, 'KNN', 'adult_income')
# --- ANN experiment on "adult" with a narrowed, single-candidate grid. ---
# Fragment starts mid-expression: the lines below close a Pipeline([...])
# (pipeA) whose opening is off-screen.
                                   early_stopping=True,
                                   random_state=55))])
d = adultX.shape[1]
#hiddens_adult = [(h,)*l for l in [1,2,3] for h in [d/4,d/2,d,int(round(d*1.2,0))]]
# NOTE(review): as written this grid tries three identical single-layer
# (56-unit) networks; a 3-layer network was almost certainly intended —
# i.e. [(56, 56, 56)]. Confirm before relying on results.
hiddens_adult = [56, 56, 56]
#alphas = [10**-x for x in np.arange(-1, 5.01, 0.5)]
alphas = [0.01]  # single fixed regularization value
params_adult = {
    'MLP__activation': ['logistic'],
    'MLP__alpha': alphas,
    'MLP__hidden_layer_sizes': hiddens_adult
}
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, params_adult, 'ANN', 'adult')
#
adult_final_params = adult_clf.best_params_
# Overfit variant: regularization disabled.
adult_OF_params = adult_final_params.copy()
adult_OF_params['MLP__alpha'] = 0
#raise
# Make timing curve of final model
#pipeA.set_params(**adult_final_params)
#pipeA.set_params(**{'MLP__early_stopping':False})
#makeTimingCurve(adultX,adultY,pipeA,'ANN','adult')
# Find opt number of iterations; in range 1 to 3000
pipeA.set_params(**adult_final_params)
# ('Cull3',SelectFromModel(RandomForestClassifier(random_state=3),threshold='median')), # ('Cull4',SelectFromModel(RandomForestClassifier(random_state=4),threshold='median')), # ('DT',dtclf_pruned(random_state=55))]) # pipeA = Pipeline([('Scale', StandardScaler()), ('DT', dtclf_pruned(random_state=55))]) params = { 'DT__criterion': ['gini', 'entropy'], 'DT__alpha': alphas, 'DT__class_weight': ['balanced'] } #madelon_clf = basicResults(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,params,'DT','madelon') adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, params, 'DT', 'adult') #madelon_final_params = {'DT__alpha': -0.00031622776601683794, 'DT__class_weight': 'balanced', 'DT__criterion': 'entropy'} #adult_final_params = {'class_weight': 'balanced', 'alpha': 0.0031622776601683794, 'criterion': 'entropy'} #madelon_final_params = madelon_clf.best_params_ adult_final_params = adult_clf.best_params_ #pipeM.set_params(**madelon_final_params) #makeTimingCurve(madelonX,madelonY,pipeM,'DT','madelon') pipeA.set_params(**adult_final_params) makeTimingCurve(adultX, adultY, pipeA, 'DT', 'adult') #DTpruningVSnodes(pipeM,alphas,madelon_trgX,madelon_trgY,'madelon') DTpruningVSnodes(pipeA, alphas, adult_trgX, adult_trgY, 'adult') ###################################################################################################
# --- KNN experiments on "adult" and "cancer". Fragment starts with the ---
# closing bracket of an off-screen pipeM Pipeline([...]).
])
pipeA = Pipeline([('Scale', StandardScaler()), ('KNN', knnC())])
params_adult = {
    'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
    'KNN__n_neighbors': np.arange(1, 51, 3),
    'KNN__weights': ['uniform', 'distance']
}
params_cancer = {
    'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
    'KNN__n_neighbors': np.arange(1, 51, 3),
    'KNN__weights': ['uniform', 'distance']
}
cancer_clf = basicResults(pipeM, cancer_trgX, cancer_trgY, cancer_tstX, cancer_tstY, params_cancer, 'KNN', 'cancer')
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, params_adult, 'KNN', 'adult')
# NOTE(review): both hard-coded dicts below are dead code — immediately
# overwritten by best_params_ on the following lines.
adult_final_params = {
    'KNN__n_neighbors': 160,
    'KNN__p': 1,
    'KNN__weights': 'uniform'
}
adult_final_params = adult_clf.best_params_
cancer_final_params = {
    'KNN__n_neighbors': 90,
    'KNN__p': 1,
    'KNN__weights': 'uniform'
}
cancer_final_params = cancer_clf.best_params_