def main():
    """Tune and time a k-NN classifier on the cars and madelon datasets.

    Both datasets are read from ``data/processed/datasets.hdf`` and split
    70/30 with stratification on the ``Class`` column.  The madelon pipeline
    scales the features and then halves them four times with
    ``SelectFromModel(..., threshold='median')``; the cars pipeline only
    scales.  ``basicResults`` is run over the same k-NN grid for both, and the
    ``best_params_`` it exposes are applied before ``makeTimingCurve``.
    """
    cars = pd.read_hdf('data/processed/datasets.hdf', 'cars')
    # ``axis`` must be passed by keyword: pandas 2.x rejects drop('Class', 1).
    carsX = cars.drop('Class', axis=1).copy().values
    carsY = cars['Class'].copy().values

    madelon = pd.read_hdf('data/processed/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values

    cars_trgX, cars_tstX, cars_trgY, cars_tstY = ms.train_test_split(
        carsX, carsY, test_size=0.3, random_state=0, stratify=carsY)
    madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
        madelonX, madelonY, test_size=0.3, random_state=0, stratify=madelonY)

    # Each cull keeps the features whose forest importance is at or above the
    # median.  The forests are seeded so the surviving feature set is the same
    # on every run.
    pipeM = Pipeline([
        ('Scale', StandardScaler()),
        ('Cull1', SelectFromModel(RandomForestClassifier(random_state=1),
                                  threshold='median')),
        ('Cull2', SelectFromModel(RandomForestClassifier(random_state=2),
                                  threshold='median')),
        ('Cull3', SelectFromModel(RandomForestClassifier(random_state=3),
                                  threshold='median')),
        ('Cull4', SelectFromModel(RandomForestClassifier(random_state=4),
                                  threshold='median')),
        ('KNN', knnC()),
    ])
    pipeA = Pipeline([('Scale', StandardScaler()), ('KNN', knnC())])

    params_madelon = {
        'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
        'KNN__n_neighbors': np.arange(1, 51, 3),
        'KNN__weights': ['uniform', 'distance'],
    }
    params_cars = {
        'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
        'KNN__n_neighbors': np.arange(1, 51, 3),
        'KNN__weights': ['uniform', 'distance'],
    }

    madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY,
                               madelon_tstX, madelon_tstY, params_madelon,
                               'KNN', 'madelon')
    cars_clf = basicResults(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                            params_cars, 'KNN', 'cars')

    # Hard-coded alternatives left in place by the original author:
    # madelon_final_params={'KNN__n_neighbors': 43, 'KNN__weights': 'uniform', 'KNN__p': 1}
    # cars_final_params={'KNN__n_neighbors': 142, 'KNN__p': 1, 'KNN__weights': 'uniform'}
    madelon_final_params = madelon_clf.best_params_
    cars_final_params = cars_clf.best_params_

    # Timing curves use the full (unsplit) data with the tuned parameters.
    pipeM.set_params(**madelon_final_params)
    makeTimingCurve(madelonX, madelonY, pipeM, 'KNN', 'madelon')

    pipeA.set_params(**cars_final_params)
    makeTimingCurve(carsX, carsY, pipeA, 'KNN', 'cars')
def main():
    """Tune, time and prune-profile a pruned decision tree on white-wine quality.

    Reads ``data/winequality_white.csv``, labels a wine as positive when its
    ``quality`` is at least 7, and uses the first eleven columns as features.
    After a stratified 70/30 split the scaled ``dtclf_pruned`` pipeline is
    searched over ``criterion`` and ``alpha`` with ``basicResults``; the best
    parameters are then applied for ``makeTimingCurve`` and
    ``DTpruningVSnodes``.

    The adult-income variant of this experiment, which previously sat here as
    a long run of commented-out code, has been removed; only the wine run is
    live.
    """
    # Candidate pruning strengths: 0.001 .. 0.049 in steps of 0.001, plus 0.
    alphas = np.append(np.arange(0.001, 0.05, 0.001), 0)

    pipeA = Pipeline([('Scale', StandardScaler()),
                      ('DT', dtclf_pruned(random_state=55))])
    params = {
        'DT__criterion': ['gini', 'entropy'],
        'DT__alpha': alphas,
        'DT__class_weight': ['balanced'],
    }

    # Data parsing for the wine quality dataset.
    wine_data = pd.read_csv('data/winequality_white.csv')
    wine_data['category'] = wine_data['quality'] >= 7
    wineX = wine_data[wine_data.columns[0:11]].values
    # ``np.int`` was only an alias of the builtin and no longer exists in
    # NumPy >= 1.24, so cast with ``int`` directly.
    wineY = wine_data['category'].values.astype(int)

    wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(
        wineX, wineY, test_size=0.3, random_state=0, stratify=wineY)

    wine_clf = basicResults(pipeA, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
                            params, 'DT', 'wine')

    # Hard-coded alternative left in place by the original author:
    # wine_final_params = {'DT__alpha': -0.00031622776601683794, 'DT__class_weight': 'balanced', 'DT__criterion': 'entropy'}
    wine_final_params = wine_clf.best_params_
    pipeA.set_params(**wine_final_params)

    makeTimingCurve(wineX, wineY, pipeA, 'DT', 'wine')
    DTpruningVSnodes(pipeA, alphas, wine_trgX, wine_trgY, 'wine')
} # madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY, params_madelon, 'SVM_RBF', 'madelon') adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, params_adult, 'SVM_RBF', 'adult') madelon_final_params = madelon_clf.best_params_ madelon_OF_params = madelon_final_params.copy() madelon_OF_params['SVM__alpha'] = 1e-16 adult_final_params = adult_clf.best_params_ adult_OF_params = adult_final_params.copy() adult_OF_params['SVM__alpha'] = 1e-16 pipeM.set_params(**madelon_final_params) makeTimingCurve(madelonX, madelonY, pipeM, 'SVM_RBF', 'madelon') pipeA.set_params(**adult_final_params) makeTimingCurve(adultX, adultY, pipeM, 'SVM_RBF', 'adult') pipeM.set_params(**madelon_final_params) iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY, {'SVM__n_iter': [2**x for x in range(12)]}, 'SVM_RBF', 'madelon') pipeA.set_params(**adult_final_params) iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_RBF', 'adult') pipeA.set_params(**adult_OF_params) iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_RBF_OF', 'adult') pipeM.set_params(**madelon_OF_params) iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY,
# cancer_clf = basicResults(pipeM,cancer_trgX,cancer_trgY,cancer_tstX,cancer_tstY,paramsM,'Boost','cancer') adult_clf = basicResults(pipeA,adult_trgX,adult_trgY,adult_tstX,adult_tstY,paramsA,'Boost','adult') # #madelon_final_params = {'n_estimators': 20, 'learning_rate': 0.02} #adult_final_params = {'n_estimators': 10, 'learning_rate': 1} #OF_params = {'learning_rate':1} cancer_final_params = cancer_clf.best_params_ adult_final_params = adult_clf.best_params_ OF_params = {'Boost__base_estimator__alpha':-1, 'Boost__n_estimators':50} ## pipeM.set_params(**cancer_final_params) pipeA.set_params(**adult_final_params) makeTimingCurve(cancerX,cancerY,pipeM,'Boost','cancer') makeTimingCurve(adultX,adultY,pipeA,'Boost','adult') # pipeM.set_params(**cancer_final_params) iterationLC(pipeM,cancer_trgX,cancer_trgY,cancer_tstX,cancer_tstY,{'Boost__n_estimators':[1,2,5,10,20,30,40,50,60,70,80,90,100]},'Boost','cancer') pipeA.set_params(**adult_final_params) iterationLC(pipeA,adult_trgX,adult_trgY,adult_tstX,adult_tstY,{'Boost__n_estimators':[1,2,5,10,20,30,40,50]},'Boost','adult') pipeM.set_params(**OF_params) iterationLC(pipeM,cancer_trgX,cancer_trgY,cancer_tstX,cancer_tstY,{'Boost__n_estimators':[1,2,5,10,20,30,40,50,60,70,80,90,100]},'Boost_OF','cancer') pipeA.set_params(**OF_params) iterationLC(pipeA,adult_trgX,adult_trgY,adult_tstX,adult_tstY,{'Boost__n_estimators':[1,2,5,10,20,30,40,50]},'Boost_OF','adult')
} # print("target",np.unique(adult_trgY)) # print("train",np.unique(adult_tstY)) adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, params_adult, 'SVM_Lin', 'adult') adult_final_params = adult_clf.best_params_ #adult_OF_params ={'SVM__n_iter': 55, 'SVM__alpha': 1e-16} # # adult_OF_params = adult_final_params.copy() adult_OF_params['SVM__alpha'] = 1e-16 pipeA.set_params(**adult_final_params) makeTimingCurve(adultX, adultY, pipeA, 'SVM_Lin', 'adult') pipeA.set_params(**adult_final_params) iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_Lin', 'adult') # pipeA.set_params(**adult_OF_params) iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, {'SVM__n_iter': np.arange(1, 200, 5)}, 'SVM_LinOF', 'adult') #pipeM.set_params(**madelon_OF_params) #iterationLC(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,{'SVM__n_iter':np.arange(100,2600,100)},'SVM_LinOF','madelon') #RBF SVM gamma_fracsA = np.arange(0.2, 2.1, 0.2) pipeA = Pipeline([('Scale', StandardScaler()), ('SVM', primalSVM_RBF())])
def _load_adult_income(path):
    """Read the parsed adult CSV at *path* and return ``(X, y)`` as arrays.

    Adds ``net_capital`` (gain minus loss), drops ``fnlwgt``, the two capital
    columns and ``workclass``, replaces the categorical columns with the
    integer codes below, applies ``pd.get_dummies`` and separates ``income``
    as the target.
    """
    adult = pd.read_csv(path)
    adult['net_capital'] = adult['capital-gain'] - adult['capital-loss']
    adult = adult.drop(
        ["fnlwgt", "capital-gain", "capital-loss", "workclass"], axis=1)

    # column -> (value-to-code mapping, cast result to int?).  Only gender and
    # race are cast; the other columns keep whatever dtype ``map`` yields.
    codes = {
        'income': ({'<=50K': 0, '>50K': 1}, False),
        'gender': ({'Male': 0, 'Female': 1}, True),
        'race': ({
            'Black': 0, 'Asian-Pac-Islander': 1, 'Other': 2, 'White': 3,
            'Amer-Indian-Eskimo': 4,
        }, True),
        'marital-status': ({
            'Never-married': 0, 'Widowed': 1, 'Divorced': 2, 'Separated': 3,
            'Married-spouse-absent': 4, 'Married-civ-spouse': 5,
            'Married-AF-spouse': 6,
        }, False),
        'education': ({
            'Preschool': 0, '1st-4th': 1, '5th-6th': 2, '7th-8th': 3,
            '9th': 4, '10th': 5, '11th': 6, '12th': 7, 'Prof-school': 8,
            'HS-grad': 9, 'Some-college': 10, 'Assoc-voc': 11,
            'Assoc-acdm': 12, 'Bachelors': 13, 'Masters': 14, 'Doctorate': 15,
        }, False),
        'occupation': ({
            'Priv-house-serv': 0, 'Protective-serv': 1,
            'Handlers-cleaners': 2, 'Machine-op-inspct': 3,
            'Adm-clerical': 4, 'Farming-fishing': 5, 'Transport-moving': 6,
            'Craft-repair': 7, 'Other-service': 8, 'Tech-support': 9,
            'Sales': 10, 'Exec-managerial': 11, 'Prof-specialty': 12,
            'Armed-Forces': 13,
        }, False),
        'native-country': ({
            '?': -1, 'Puerto-Rico': 0, 'Haiti': 1, 'Cuba': 2, 'Iran': 3,
            'Honduras': 4, 'Jamaica': 5, 'Vietnam': 6, 'Mexico': 7,
            'Dominican-Republic': 8, 'Laos': 9, 'Ecuador': 10,
            'El-Salvador': 11, 'Cambodia': 12, 'Columbia': 13,
            'Guatemala': 14, 'South': 15, 'India': 16, 'Nicaragua': 17,
            'Yugoslavia': 18, 'Philippines': 19, 'Thailand': 20,
            'Trinadad&Tobago': 21, 'Peru': 22, 'Poland': 23, 'China': 24,
            'Hungary': 25, 'Greece': 26, 'Taiwan': 27, 'Italy': 28,
            'Portugal': 29, 'France': 30, 'Hong': 31, 'England': 32,
            'Scotland': 33, 'Ireland': 34, 'Holand-Netherlands': 35,
            'Canada': 36, 'Germany': 37, 'Japan': 38,
            'Outlying-US(Guam-USVI-etc)': 39, 'United-States': 40,
        }, False),
        'relationship': ({
            'Unmarried': 0, 'Other-relative': 1, 'Not-in-family': 2,
            'Wife': 3, 'Husband': 4, 'Own-child': 5,
        }, False),
    }
    for column, (mapping, as_int) in codes.items():
        encoded = adult[column].map(mapping)
        adult[column] = encoded.astype(int) if as_int else encoded

    adult = pd.get_dummies(adult)
    # ``axis`` must be passed by keyword: pandas 2.x rejects drop('income', 1).
    features = adult.drop('income', axis=1).copy().values
    target = adult['income'].copy().values
    return features, target


def main():
    """Tune an MLP on the adult-income data and plot timing/iteration curves.

    The data is prepared by ``_load_adult_income`` and split 70/30 with
    stratification.  ``basicResults`` searches activation, ``alpha`` and
    hidden-layer shape; the best parameters drive ``makeTimingCurve`` and an
    ``iterationLC`` sweep over ``max_iter``.  A second sweep repeats the run
    with ``alpha`` forced to 0 and is labelled ``'ANN_OF'``.

    The disabled wine experiment, together with the ``pipeM`` pipeline,
    ``hiddens_wine`` and ``alphasM`` that only it referenced, has been removed
    as dead code.
    """
    adult_income_X, adult_income_Y = _load_adult_income(
        'data/adult_parsed.csv')

    adult_trgX, adult_tstX, adult_trgY, adult_tstY = ms.train_test_split(
        adult_income_X, adult_income_Y, test_size=0.3, random_state=0,
        stratify=adult_income_Y)

    pipeA = Pipeline([
        ('Scale', StandardScaler()),
        ('MLP', MLPClassifier(max_iter=2000, early_stopping=True,
                              random_state=55)),
    ])

    # Hidden layouts: 1-3 layers, each d, d // 2 or 2 * d units wide, where d
    # is the number of input features.
    d = adult_income_X.shape[1]
    hiddens_adult = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    alphas = [10**-x for x in np.arange(-1, 5.01, 1 / 2)]

    params_adult = {
        'MLP__activation': ['relu', 'logistic'],
        'MLP__alpha': alphas,
        'MLP__hidden_layer_sizes': hiddens_adult,
    }

    adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX,
                             adult_tstY, params_adult, 'ANN', 'adult')

    # Hard-coded alternative left in place by the original author:
    # adult_final_params ={'MLP__hidden_layer_sizes': (28, 28, 28), 'MLP__activation': 'logistic', 'MLP__alpha': 0.0031622776601683794}
    adult_final_params = adult_clf.best_params_
    # Same architecture with the regularisation term switched off.
    adult_OF_params = adult_final_params.copy()
    adult_OF_params['MLP__alpha'] = 0

    max_iter_grid = [2**x for x in range(12)] + [
        2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000
    ]

    # Early stopping is disabled for every curve below; the parameters are
    # re-applied before each step exactly as the original script did.
    pipeA.set_params(**adult_final_params)
    pipeA.set_params(MLP__early_stopping=False)
    makeTimingCurve(adult_income_X, adult_income_Y, pipeA, 'ANN', 'adult')

    pipeA.set_params(**adult_final_params)
    pipeA.set_params(MLP__early_stopping=False)
    iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                {'MLP__max_iter': list(max_iter_grid)}, 'ANN', 'adult')

    pipeA.set_params(**adult_OF_params)
    pipeA.set_params(MLP__early_stopping=False)
    iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                {'MLP__max_iter': list(max_iter_grid)}, 'ANN_OF', 'adult')
'Boost__n_estimators': [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110], 'Boost__learning_rate': [(2**x) / 100 for x in range(8)] + [1] } # paramsA= {'Boost__n_estimators':[1,2,5,10,20,30,45,60,80,100], # 'Boost__base_estimator__alpha':alphas} booster = AdaBoostClassifier(algorithm='SAMME', learning_rate=1, base_estimator=base, random_state=55) pipeM = Pipeline([ #('Scale',StandardScaler()), # ('Cull1',SelectFromModel(RandomForestClassifier(random_state=1),threshold='median')), ('Boost', booster) ]) data_clf = basicResults(pipeM, data_train_x, data_train_y, data_test_x, data_test_y, params, 'Boost', dataset) data_final_params = data_clf.best_params_ pipeM.set_params(**data_final_params) makeTimingCurve(data_x, data_y, pipeM, 'Boost', dataset) iterationLC(pipeM, data_train_x, data_train_y, data_test_x, data_test_y, {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50]}, 'Boost', dataset=dataset)
} params_cancer = { 'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'], 'KNN__n_neighbors': np.arange(1, 51, 3), 'KNN__weights': ['uniform', 'distance'] } cancer_clf = basicResults(pipeM, cancer_trgX, cancer_trgY, cancer_tstX, cancer_tstY, params_cancer, 'KNN', 'cancer') adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, params_adult, 'KNN', 'adult') adult_final_params = { 'KNN__n_neighbors': 160, 'KNN__p': 1, 'KNN__weights': 'uniform' } adult_final_params = adult_clf.best_params_ cancer_final_params = { 'KNN__n_neighbors': 90, 'KNN__p': 1, 'KNN__weights': 'uniform' } cancer_final_params = cancer_clf.best_params_ pipeM.set_params(**cancer_final_params) makeTimingCurve(cancerX, cancerY, pipeA, 'KNN', 'cancer') pipeA.set_params(**adult_final_params) makeTimingCurve(adultX, adultY, pipeA, 'KNN', 'adult')
('Cull4', SelectFromModel(RandomForestClassifier(random_state=4), threshold='median')), ('DT', dtclf_pruned(random_state=55))]) pipeS = Pipeline([('Scale', StandardScaler()), ('DT', dtclf_pruned(random_state=55))]) params = { 'DT__criterion': ['gini', 'entropy'], 'DT__alpha': alphas, 'DT__class_weight': ['balanced'] } spam_clf_fs = basicResults(pipeS_fs, spam_trgX, spam_trgY, spam_tstX, spam_tstY, params, 'DT', 'spam_fs') spam_clf = basicResults(pipeS, spam_trgX, spam_trgY, spam_tstX, spam_tstY, params, 'DT', 'spam') #madelon_final_params = {'DT__alpha': -0.00031622776601683794, 'DT__class_weight': 'balanced', 'DT__criterion': 'entropy'} #adult_final_params = {'class_weight': 'balanced', 'alpha': 0.0031622776601683794, 'criterion': 'entropy'} spam_fs_final_params = spam_clf_fs.best_params_ spam_final_params = spam_clf.best_params_ pipeS_fs.set_params(**spam_fs_final_params) makeTimingCurve(spamX, spamY, pipeS_fs, 'DT', 'spam_fs') pipeS.set_params(**spam_final_params) makeTimingCurve(spamX, spamY, pipeS, 'DT', 'spam') DTpruningVSnodes(pipeS_fs, alphas, spam_trgX, spam_trgY, 'spam_fs') DTpruningVSnodes(pipeS, alphas, spam_trgX, spam_trgY, 'spam')
def _prepare_adult_income(path):
    """Read the parsed adult CSV at *path* and return ``(X, y)`` as arrays.

    Adds ``net_capital`` (gain minus loss), drops ``fnlwgt``, the two capital
    columns and ``workclass``, replaces the categorical columns with the
    integer codes below, applies ``pd.get_dummies`` and separates ``income``
    as the target.
    """
    adult = pd.read_csv(path)
    adult['net_capital'] = adult['capital-gain'] - adult['capital-loss']
    adult = adult.drop(
        ["fnlwgt", "capital-gain", "capital-loss", "workclass"], axis=1)

    adult['income'] = adult['income'].map({'<=50K': 0, '>50K': 1})
    # Only gender and race are cast to int; the remaining mapped columns keep
    # whatever dtype ``map`` yields.
    adult['gender'] = adult['gender'].map(
        {'Male': 0, 'Female': 1}).astype(int)
    adult['race'] = adult['race'].map({
        'Black': 0, 'Asian-Pac-Islander': 1, 'Other': 2, 'White': 3,
        'Amer-Indian-Eskimo': 4,
    }).astype(int)

    uncast_codes = {
        'marital-status': {
            'Never-married': 0, 'Widowed': 1, 'Divorced': 2, 'Separated': 3,
            'Married-spouse-absent': 4, 'Married-civ-spouse': 5,
            'Married-AF-spouse': 6,
        },
        'education': {
            'Preschool': 0, '1st-4th': 1, '5th-6th': 2, '7th-8th': 3,
            '9th': 4, '10th': 5, '11th': 6, '12th': 7, 'Prof-school': 8,
            'HS-grad': 9, 'Some-college': 10, 'Assoc-voc': 11,
            'Assoc-acdm': 12, 'Bachelors': 13, 'Masters': 14, 'Doctorate': 15,
        },
        'occupation': {
            'Priv-house-serv': 0, 'Protective-serv': 1,
            'Handlers-cleaners': 2, 'Machine-op-inspct': 3,
            'Adm-clerical': 4, 'Farming-fishing': 5, 'Transport-moving': 6,
            'Craft-repair': 7, 'Other-service': 8, 'Tech-support': 9,
            'Sales': 10, 'Exec-managerial': 11, 'Prof-specialty': 12,
            'Armed-Forces': 13,
        },
        'native-country': {
            '?': -1, 'Puerto-Rico': 0, 'Haiti': 1, 'Cuba': 2, 'Iran': 3,
            'Honduras': 4, 'Jamaica': 5, 'Vietnam': 6, 'Mexico': 7,
            'Dominican-Republic': 8, 'Laos': 9, 'Ecuador': 10,
            'El-Salvador': 11, 'Cambodia': 12, 'Columbia': 13,
            'Guatemala': 14, 'South': 15, 'India': 16, 'Nicaragua': 17,
            'Yugoslavia': 18, 'Philippines': 19, 'Thailand': 20,
            'Trinadad&Tobago': 21, 'Peru': 22, 'Poland': 23, 'China': 24,
            'Hungary': 25, 'Greece': 26, 'Taiwan': 27, 'Italy': 28,
            'Portugal': 29, 'France': 30, 'Hong': 31, 'England': 32,
            'Scotland': 33, 'Ireland': 34, 'Holand-Netherlands': 35,
            'Canada': 36, 'Germany': 37, 'Japan': 38,
            'Outlying-US(Guam-USVI-etc)': 39, 'United-States': 40,
        },
        'relationship': {
            'Unmarried': 0, 'Other-relative': 1, 'Not-in-family': 2,
            'Wife': 3, 'Husband': 4, 'Own-child': 5,
        },
    }
    for column, mapping in uncast_codes.items():
        adult[column] = adult[column].map(mapping)

    adult = pd.get_dummies(adult)
    # ``axis`` must be passed by keyword: pandas 2.x rejects drop('income', 1).
    features = adult.drop('income', axis=1).copy().values
    target = adult['income'].copy().values
    return features, target


def main():
    """Tune and time a k-NN classifier on the adult-income dataset.

    The data is prepared by ``_prepare_adult_income`` and split 70/30 with
    stratification.  ``basicResults`` searches ``metric``, ``n_neighbors`` and
    ``weights`` on a scaled k-NN pipeline, and the resulting ``best_params_``
    are applied before ``makeTimingCurve`` runs on the full data.

    The commented-out wine experiment and the locals only it (or nothing)
    used -- ``pipeM``, ``d``, ``hiddens_adult_income`` and ``alphas`` -- have
    been removed as dead code.
    """
    adult_income_X, adult_income_Y = _prepare_adult_income(
        'data/adult_parsed.csv')

    (adult_income_trgX, adult_income_tstX,
     adult_income_trgY, adult_income_tstY) = ms.train_test_split(
         adult_income_X, adult_income_Y, test_size=0.3, random_state=0,
         stratify=adult_income_Y)

    pipeA = Pipeline([('Scale', StandardScaler()), ('KNN', knnC())])

    params_adult_income = {
        'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
        'KNN__n_neighbors': np.arange(1, 51, 3),
        'KNN__weights': ['uniform', 'distance'],
    }

    adult_income_clf = basicResults(
        pipeA, adult_income_trgX, adult_income_trgY, adult_income_tstX,
        adult_income_tstY, params_adult_income, 'KNN', 'adult_income')

    # Hard-coded alternative left in place by the original author:
    # adult_income_final_params={'KNN__n_neighbors': 142, 'KNN__p': 1, 'KNN__weights': 'uniform'}
    adult_income_final_params = adult_income_clf.best_params_

    pipeA.set_params(**adult_income_final_params)
    makeTimingCurve(adult_income_X, adult_income_Y, pipeA, 'KNN',
                    'adult_income')
def _madelon_svm_pipeline(svm):
    """Return the madelon pipeline: scaling, four median culls, then *svm*.

    The culling steps are named ``Cull1``..``Cull4`` and each wraps a random
    forest seeded with the matching number; *svm* becomes the ``'SVM'`` step.
    """
    steps = [('Scale', StandardScaler())]
    for seed in (1, 2, 3, 4):
        forest = RandomForestClassifier(random_state=seed)
        steps.append(('Cull%d' % seed,
                      SelectFromModel(forest, threshold='median')))
    steps.append(('SVM', svm))
    return Pipeline(steps)


def main():
    """Run the linear and RBF SVM experiments on abalone and madelon.

    For each kernel the function grid-searches with ``basicResults``, applies
    the resulting ``best_params_``, draws timing curves with
    ``makeTimingCurve`` and iteration learning curves with ``iterationLC``.
    The ``*_OF`` runs repeat the iteration curves with ``alpha`` set to 1e-16.
    """
    abalone = pd.read_hdf('data/processed/datasets.hdf', 'abalone')
    # ``axis`` must be passed by keyword: pandas 2.x rejects drop('Class', 1).
    abaloneX = abalone.drop('Class', axis=1).copy().values
    abaloneY = abalone['Class'].copy().values

    madelon = pd.read_hdf('data/processed/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values

    abalone_trgX, abalone_tstX, abalone_trgY, abalone_tstY = ms.train_test_split(
        abaloneX, abaloneY, test_size=0.3, random_state=0, stratify=abaloneY)
    madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
        madelonX, madelonY, test_size=0.3, random_state=0, stratify=madelonY)

    N_abalone = abalone_trgX.shape[0]
    N_madelon = madelon_trgX.shape[0]
    # Single-value iteration grids derived from the training-set sizes.
    n_iter_abalone = int((1e6 / N_abalone) / .8) + 1
    n_iter_madelon = int((1e6 / N_madelon) / .8) + 1

    alphas = [10**-x for x in np.arange(1, 9.01, 1 / 2)]

    # ------------------------------------------------------------------
    # Linear SVM
    # ------------------------------------------------------------------
    pipeM = _madelon_svm_pipeline(
        SGDClassifier(loss='hinge', l1_ratio=0, penalty='l2',
                      class_weight='balanced', random_state=55))
    pipeA = Pipeline([
        ('Scale', StandardScaler()),
        ('SVM', SGDClassifier(loss='hinge', l1_ratio=0, penalty='l2',
                              class_weight='balanced', random_state=55)),
    ])

    params_abalone = {'SVM__alpha': alphas, 'SVM__n_iter': [n_iter_abalone]}
    params_madelon = {'SVM__alpha': alphas, 'SVM__n_iter': [n_iter_madelon]}

    madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY,
                               madelon_tstX, madelon_tstY, params_madelon,
                               'SVM_Lin', 'madelon')
    abalone_clf = basicResults(pipeA, abalone_trgX, abalone_trgY,
                               abalone_tstX, abalone_tstY, params_abalone,
                               'SVM_Lin', 'abalone')

    # Hard-coded alternatives left in place by the original author:
    # madelon_final_params = {'SVM__alpha': 0.031622776601683791, 'SVM__n_iter': 687.25}
    # abalone_final_params ={'SVM__alpha': 0.0001, 'SVM__n_iter': 428}
    madelon_final_params = madelon_clf.best_params_
    madelon_OF_params = {'SVM__n_iter': 1303, 'SVM__alpha': 1e-16}
    abalone_final_params = abalone_clf.best_params_
    abalone_OF_params = {'SVM__n_iter': 55, 'SVM__alpha': 1e-16}

    pipeM.set_params(**madelon_final_params)
    makeTimingCurve(madelonX, madelonY, pipeM, 'SVM_Lin', 'madelon')
    pipeA.set_params(**abalone_final_params)
    makeTimingCurve(abaloneX, abaloneY, pipeA, 'SVM_Lin', 'abalone')

    pipeM.set_params(**madelon_final_params)
    iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY,
                {'SVM__n_iter': [2**x for x in range(12)]},
                'SVM_Lin', 'madelon')
    pipeA.set_params(**abalone_final_params)
    iterationLC(pipeA, abalone_trgX, abalone_trgY, abalone_tstX, abalone_tstY,
                {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_Lin', 'abalone')

    pipeA.set_params(**abalone_OF_params)
    iterationLC(pipeA, abalone_trgX, abalone_trgY, abalone_tstX, abalone_tstY,
                {'SVM__n_iter': np.arange(1, 200, 5)}, 'SVM_LinOF', 'abalone')
    pipeM.set_params(**madelon_OF_params)
    iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY,
                {'SVM__n_iter': np.arange(100, 2600, 100)},
                'SVM_LinOF', 'madelon')

    # ------------------------------------------------------------------
    # RBF SVM
    # ------------------------------------------------------------------
    gamma_fracsA = np.arange(0.2, 2.1, 0.2)
    gamma_fracsM = np.arange(0.05, 1.01, 0.1)

    pipeM = _madelon_svm_pipeline(primalSVM_RBF())
    pipeA = Pipeline([('Scale', StandardScaler()), ('SVM', primalSVM_RBF())])

    params_abalone = {
        'SVM__alpha': alphas,
        'SVM__n_iter': [n_iter_abalone],
        'SVM__gamma_frac': gamma_fracsA,
    }
    params_madelon = {
        'SVM__alpha': alphas,
        'SVM__n_iter': [n_iter_madelon],
        'SVM__gamma_frac': gamma_fracsM,
    }

    madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY,
                               madelon_tstX, madelon_tstY, params_madelon,
                               'SVM_RBF', 'madelon')
    abalone_clf = basicResults(pipeA, abalone_trgX, abalone_trgY,
                               abalone_tstX, abalone_tstY, params_abalone,
                               'SVM_RBF', 'abalone')

    madelon_final_params = madelon_clf.best_params_
    madelon_OF_params = madelon_final_params.copy()
    madelon_OF_params['SVM__alpha'] = 1e-16
    abalone_final_params = abalone_clf.best_params_
    abalone_OF_params = abalone_final_params.copy()
    abalone_OF_params['SVM__alpha'] = 1e-16

    pipeM.set_params(**madelon_final_params)
    makeTimingCurve(madelonX, madelonY, pipeM, 'SVM_RBF', 'madelon')
    pipeA.set_params(**abalone_final_params)
    # Fixed: this used to pass pipeM, so the abalone timing curve was measured
    # with the madelon pipeline instead of the one configured just above.
    makeTimingCurve(abaloneX, abaloneY, pipeA, 'SVM_RBF', 'abalone')

    pipeM.set_params(**madelon_final_params)
    iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY,
                {'SVM__n_iter': [2**x for x in range(12)]},
                'SVM_RBF', 'madelon')
    pipeA.set_params(**abalone_final_params)
    iterationLC(pipeA, abalone_trgX, abalone_trgY, abalone_tstX, abalone_tstY,
                {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_RBF', 'abalone')

    pipeA.set_params(**abalone_OF_params)
    iterationLC(pipeA, abalone_trgX, abalone_trgY, abalone_tstX, abalone_tstY,
                {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_RBF_OF', 'abalone')
    pipeM.set_params(**madelon_OF_params)
    iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY,
                {'SVM__n_iter': np.arange(100, 2600, 100)},
                'SVM_RBF_OF', 'madelon')
SelectFromModel(RandomForestClassifier(random_state=4, n_estimators=10), threshold='median')), ('DT', dtclf_pruned(random_state=55))]) pipeA = Pipeline([('Scale', StandardScaler()), ('DT', dtclf_pruned(random_state=55))]) params = { 'DT__criterion': ['gini', 'entropy'], 'DT__alpha': alphas, 'DT__class_weight': ['balanced'] } madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY, params, 'DT', 'madelon') adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, params, 'DT', 'adult') #madelon_final_params = {'DT__alpha': -0.00031622776601683794, 'DT__class_weight': 'balanced', 'DT__criterion': 'entropy'} #adult_final_params = {'class_weight': 'balanced', 'alpha': 0.0031622776601683794, 'criterion': 'entropy'} madelon_final_params = madelon_clf.best_params_ adult_final_params = adult_clf.best_params_ pipeM.set_params(**madelon_final_params) makeTimingCurve(madelonX, madelonY, pipeM, 'DT', 'madelon') pipeA.set_params(**adult_final_params) makeTimingCurve(adultX, adultY, pipeA, 'DT', 'adult') DTpruningVSnodes(pipeM, alphas, madelon_trgX, madelon_trgY, 'madelon') DTpruningVSnodes(pipeA, alphas, adult_trgX, adult_trgY, 'adult')
## DECISION TREE ## ============================================================================= pipeSeg = Pipeline([('Scale',StandardScaler()), ('DT',DecisionTreeClassifier(random_state=55))]) params_seg = {'DT__max_depth':np.arange(5,30,1),'DT__min_samples_leaf':np.arange(1,16,1),'DT__class_weight':['balanced']} seg_clf = basicResults(pipeSeg,seg_trgX,seg_trgY,seg_tstX,seg_tstY,params_seg,'DT','seg') seg_final_params = seg_clf.best_params_ pipeSeg.set_params(**seg_final_params) makeTimingCurve(segX,segY,pipeSeg,'DT','seg') ## ============================================================================= ## BOOST ## ============================================================================= pipeSeg = Pipeline([('Scale',StandardScaler()), ('Boost',AdaBoostClassifier(DecisionTreeClassifier(), random_state=1))]) max_depth_seg = np.arange(1,21,1) params_seg = {'Boost__n_estimators':[1,2,5,10,20,30,40,50,100],'Boost__base_estimator__max_depth':max_depth_seg, 'Boost__base_estimator__class_weight':['balanced']}
#madelon_clf = basicResults(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,paramsM,'Boost','madelon') spam_clf = basicResults(pipeS, spam_trgX, spam_trgY, spam_tstX, spam_tstY, paramsS, 'Boost', 'spam') # # #madelon_final_params = {'n_estimators': 20, 'learning_rate': 0.02} #adult_final_params = {'n_estimators': 10, 'learning_rate': 1} #OF_params = {'learning_rate':1} #madelon_final_params = madelon_clf.best_params_ spam_final_params = spam_clf.best_params_ OF_params = {'Boost__base_estimator__alpha': -1, 'Boost__n_estimators': 50} ## #pipeM.set_params(**madelon_final_params) pipeS.set_params(**spam_final_params) #makeTimingCurve(madelonX,madelonY,pipeM,'Boost','madelon') makeTimingCurve(spamX, spamY, pipeS, 'Boost', 'spam') # #pipeM.set_params(**madelon_final_params) #iterationLC(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,{'Boost__n_estimators':[1,2,5,10,20,30,40,50,60,70,80,90,100]},'Boost','madelon') pipeS.set_params(**spam_final_params) iterationLC(pipeS, spam_trgX, spam_trgY, spam_tstX, spam_tstY, {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50]}, 'Boost', 'spam') #pipeM.set_params(**OF_params) #iterationLC(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,{'Boost__n_estimators':[1,2,5,10,20,30,40,50,60,70,80,90,100]},'Boost_OF','madelon') #pipeA.set_params(**OF_params) #iterationLC(pipeA,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,{'Boost__n_estimators':[1,2,5,10,20,30,40,50]},'Boost_OF','adult')
def main():
    """Run the AdaBoost experiments for the cars and madelon datasets.

    Both tables are read from ``data/processed/datasets.hdf`` and split 70/30
    (stratified on ``Class``). Each boosted pipeline is grid-searched through
    ``basicResults``; the selected parameters are then used for timing curves
    and for learning curves over ``Boost__n_estimators``. A final pair of
    learning curves uses the fixed ``OF_params`` configuration.
    """
    cars = pd.read_hdf('data/processed/datasets.hdf', 'cars')
    # ``axis`` is passed by keyword: pandas 2.0 no longer accepts it
    # positionally in DataFrame.drop.
    carsX = cars.drop('Class', axis=1).copy().values
    carsY = cars['Class'].copy().values

    madelon = pd.read_hdf('data/processed/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values

    # Candidate values for the ``alpha`` parameter of the pruned base tree.
    alphas = [
        -1, -1e-3, -(1e-3) * 10**-0.5, -1e-2, -(1e-2) * 10**-0.5, -1e-1,
        -(1e-1) * 10**-0.5, 0, (1e-1) * 10**-0.5, 1e-1, (1e-2) * 10**-0.5,
        1e-2, (1e-3) * 10**-0.5, 1e-3
    ]

    cars_trgX, cars_tstX, cars_trgY, cars_tstY = ms.train_test_split(
        carsX, carsY, test_size=0.3, random_state=0, stratify=carsY)
    madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
        madelonX, madelonY, test_size=0.3, random_state=0, stratify=madelonY)

    madelon_base = dtclf_pruned(criterion='gini',
                                class_weight='balanced',
                                random_state=55)
    cars_base = dtclf_pruned(criterion='entropy',
                             class_weight='balanced',
                             random_state=55)

    # Both datasets are searched over the same grid.
    paramsA = {
        'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
        'Boost__base_estimator__alpha': alphas
    }
    paramsM = {
        'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
        'Boost__base_estimator__alpha': alphas
    }

    madelon_booster = AdaBoostClassifier(algorithm='SAMME',
                                         learning_rate=1,
                                         base_estimator=madelon_base,
                                         random_state=55)
    cars_booster = AdaBoostClassifier(algorithm='SAMME',
                                      learning_rate=1,
                                      base_estimator=cars_base,
                                      random_state=55)

    # madelon: scale, then four successive median-threshold feature culls
    # driven by random forests, then boost.
    pipeM = Pipeline([
        ('Scale', StandardScaler()),
        ('Cull1', SelectFromModel(RandomForestClassifier(random_state=1),
                                  threshold='median')),
        ('Cull2', SelectFromModel(RandomForestClassifier(random_state=2),
                                  threshold='median')),
        ('Cull3', SelectFromModel(RandomForestClassifier(random_state=3),
                                  threshold='median')),
        ('Cull4', SelectFromModel(RandomForestClassifier(random_state=4),
                                  threshold='median')),
        ('Boost', madelon_booster),
    ])
    # cars: scale, then boost.
    pipeA = Pipeline([('Scale', StandardScaler()), ('Boost', cars_booster)])

    madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY,
                               madelon_tstX, madelon_tstY, paramsM, 'Boost',
                               'madelon')
    cars_clf = basicResults(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                            paramsA, 'Boost', 'cars')

    madelon_final_params = madelon_clf.best_params_
    cars_final_params = cars_clf.best_params_
    # Fixed configuration used for the 'Boost_OF' learning curves below.
    OF_params = {'Boost__base_estimator__alpha': -1, 'Boost__n_estimators': 50}

    # Timing curves with the selected parameters.
    pipeM.set_params(**madelon_final_params)
    pipeA.set_params(**cars_final_params)
    makeTimingCurve(madelonX, madelonY, pipeM, 'Boost', 'madelon')
    makeTimingCurve(carsX, carsY, pipeA, 'Boost', 'cars')

    madelon_rounds = [1, 2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    cars_rounds = [1, 2, 5, 10, 20, 30, 40, 50]

    # Learning curves over the number of boosting rounds, selected parameters.
    pipeM.set_params(**madelon_final_params)
    iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY,
                {'Boost__n_estimators': list(madelon_rounds)}, 'Boost',
                'madelon')
    pipeA.set_params(**cars_final_params)
    iterationLC(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                {'Boost__n_estimators': list(cars_rounds)}, 'Boost', 'cars')

    # Same curves with the fixed OF_params configuration.
    pipeM.set_params(**OF_params)
    iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY,
                {'Boost__n_estimators': list(madelon_rounds)}, 'Boost_OF',
                'madelon')
    pipeA.set_params(**OF_params)
    # The cars pipeline must be evaluated on the cars splits; this call
    # previously received the madelon splits while being labelled 'cars'.
    iterationLC(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                {'Boost__n_estimators': list(cars_rounds)}, 'Boost_OF', 'cars')
#madelon_final_params = {'n_estimators': 20, 'learning_rate': 0.02} #adult_final_params = {'n_estimators': 10, 'learning_rate': 1} #OF_params = {'learning_rate':1} # Save hyperparameters that grid search cross validation has identified as optimal #madelon_final_params = madelon_clf.best_params_ adult_final_params = adult_clf.best_params_ mushrooms_final_params = mushrooms_clf.best_params_ redwine_final_params = redwine_clf.best_params_ OF_params = {'Boost__base_estimator__alpha': -1, 'Boost__n_estimators': 50} # Feed learning algorithm optimal hyperparameters and output train/test timing curves over various train/test split ratios #pipeM.set_params(**madelon_final_params) #makeTimingCurve(madelonX,madelonY,pipeM,'Boost','madelon') pipeA.set_params(**adult_final_params) makeTimingCurve(adultX, adultY, pipeA, 'Boost', 'adult') pipeM.set_params(**mushrooms_final_params) makeTimingCurve(mushroomsX, mushroomsY, pipeM, 'Boost', 'mushrooms') pipeR.set_params(**redwine_final_params) makeTimingCurve(redwineX, redwineY, pipeR, 'Boost', 'redwine') # #pipeM.set_params(**madelon_final_params) #iterationLC(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,{'Boost__n_estimators':[1,2,5,10,20,30,40,50,60,70,80,90,100]},'Boost','madelon') pipeA.set_params(**adult_final_params) iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50]}, 'Boost', 'adult') pipeM.set_params(**mushrooms_final_params) iterationLC(pipeM, mushrooms_trgX, mushrooms_trgY, mushrooms_tstX, mushrooms_tstY,
def main():
    """Run the MLP (neural network) experiments for cars and madelon.

    Both tables are read from ``data/processed/datasets.hdf`` and split 70/30
    (stratified on ``Class``). Each pipeline is grid-searched through
    ``basicResults`` over activation, ``alpha`` and hidden-layer layout. With
    early stopping disabled, the selected parameters are then used for timing
    curves and for learning curves over ``MLP__max_iter``. The learning curves
    are repeated with ``MLP__alpha`` forced to 0 (the ``*_OF`` runs).
    """
    cars = pd.read_hdf('data/processed/datasets.hdf', 'cars')
    # ``axis`` is passed by keyword: pandas 2.0 no longer accepts it
    # positionally in DataFrame.drop.
    carsX = cars.drop('Class', axis=1).copy().values
    carsY = cars['Class'].copy().values

    madelon = pd.read_hdf('data/processed/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values

    cars_trgX, cars_tstX, cars_trgY, cars_tstY = ms.train_test_split(
        carsX, carsY, test_size=0.3, random_state=0, stratify=carsY)
    madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
        madelonX, madelonY, test_size=0.3, random_state=0, stratify=madelonY)

    pipeA = Pipeline([
        ('Scale', StandardScaler()),
        ('MLP', MLPClassifier(max_iter=2000, early_stopping=True,
                              random_state=55)),
    ])
    pipeM = Pipeline([
        ('Scale', StandardScaler()),
        ('Cull1', SelectFromModel(RandomForestClassifier(random_state=1),
                                  threshold='median')),
        ('Cull2', SelectFromModel(RandomForestClassifier(random_state=2),
                                  threshold='median')),
        ('Cull3', SelectFromModel(RandomForestClassifier(random_state=3),
                                  threshold='median')),
        ('Cull4', SelectFromModel(RandomForestClassifier(random_state=4),
                                  threshold='median')),
        ('MLP', MLPClassifier(max_iter=2000, early_stopping=True,
                              random_state=55)),
    ])

    # Hidden layouts: 1-3 layers, each d, d // 2 or 2 * d units wide.
    d = carsX.shape[1]
    hiddens_cars = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    alphas = [10**-x for x in np.arange(-1, 5.01, 1 / 2)]
    # NOTE(review): a wider grid ``alphasM`` (exponents -1..9) used to be built
    # here but was never passed to any search; madelon has always been searched
    # over ``alphas``. Confirm whether the wider grid was intended.

    # Presumably the width is divided by 2**4 because the four median culls in
    # pipeM each keep roughly half of the features -- TODO confirm.
    d = madelonX.shape[1]
    d = d // (2**4)
    hiddens_madelon = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]

    params_cars = {
        'MLP__activation': ['relu', 'logistic'],
        'MLP__alpha': alphas,
        'MLP__hidden_layer_sizes': hiddens_cars
    }
    params_madelon = {
        'MLP__activation': ['relu', 'logistic'],
        'MLP__alpha': alphas,
        'MLP__hidden_layer_sizes': hiddens_madelon
    }

    madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY,
                               madelon_tstX, madelon_tstY, params_madelon,
                               'ANN', 'madelon')
    cars_clf = basicResults(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                            params_cars, 'ANN', 'cars')

    madelon_final_params = madelon_clf.best_params_
    cars_final_params = cars_clf.best_params_
    # The *_OF variants keep the selected layout but force alpha to 0.
    cars_OF_params = cars_final_params.copy()
    cars_OF_params['MLP__alpha'] = 0
    madelon_OF_params = madelon_final_params.copy()
    madelon_OF_params['MLP__alpha'] = 0

    # Timing curves; early stopping is switched off for all runs below.
    pipeM.set_params(**madelon_final_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    makeTimingCurve(madelonX, madelonY, pipeM, 'ANN', 'madelon')
    pipeA.set_params(**cars_final_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    makeTimingCurve(carsX, carsY, pipeA, 'ANN', 'cars')

    # Iteration budgets shared by every learning curve: powers of two up to
    # 2048, then 2100..3000 in steps of 100.
    max_iters = [2**x for x in range(12)] + [
        2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000
    ]

    pipeM.set_params(**madelon_final_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY,
                {'MLP__max_iter': list(max_iters)}, 'ANN', 'madelon')
    pipeA.set_params(**cars_final_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    iterationLC(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                {'MLP__max_iter': list(max_iters)}, 'ANN', 'cars')

    pipeM.set_params(**madelon_OF_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    iterationLC(pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY,
                {'MLP__max_iter': list(max_iters)}, 'ANN_OF', 'madelon')
    pipeA.set_params(**cars_OF_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    iterationLC(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                {'MLP__max_iter': list(max_iters)}, 'ANN_OF', 'cars')
# Grid searched for the boosted 'ab' model: number of rounds x base-tree alpha.
paramsA = {
    'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
    'Boost__base_estimator__alpha': alphas,
}

# Both boosters share every setting except the base estimator they wrap.
shared_boost_kwargs = dict(algorithm='SAMME', learning_rate=1, random_state=55)
ab_booster = AdaBoostClassifier(base_estimator=ab_base, **shared_boost_kwargs)
OF_booster = AdaBoostClassifier(base_estimator=OF_base, **shared_boost_kwargs)

pipeA = Pipeline([
    ('Scale', StandardScaler()),
    ('Boost', ab_booster),
])

ab_clf = basicResults(pipeA, ab_trgX, ab_trgY, ab_tstX, ab_tstY,
                      paramsA, 'Boost', 'ab')

# Parameters selected by the search, plus a fixed alternative configuration
# that no call in this stretch consumes.
ab_final_params = ab_clf.best_params_
OF_params = {
    'Boost__base_estimator__alpha': -1,
    'Boost__n_estimators': 50,
}

# Timing curve on the full data with the selected parameters.
pipeA.set_params(**ab_final_params)
makeTimingCurve(abX, abY, pipeA, 'Boost', 'ab')

# Learning curve over the number of boosting rounds.
pipeA.set_params(**ab_final_params)
round_counts = [1, 2, 5, 10, 20, 30, 40, 50]
iterationLC(pipeA, ab_trgX, ab_trgY, ab_tstX, ab_tstY,
            {'Boost__n_estimators': round_counts}, 'Boost', 'ab')
# Description of the complexity curve requested from basicResults: sweep the
# SVM ``C`` parameter from 0.001 upward in steps of 0.1 (below 2.5).
complexity_params = dict(
    name='SVM__C',
    display_name='Penalty',
    values=np.arange(0.001, 2.5, 0.1),
)

# Grid search scored with F1; the complexity curve is produced in the same call.
data_clf = basicResults(
    pipeM,
    data_train_x,
    data_train_y,
    data_test_x,
    data_test_y,
    params,
    'SVM',
    dataset,
    scorer='f1',
    complexity_curve=True,
    complexity_params=complexity_params,
    clf_name='SVM',
)

# Apply the selected parameters, then time the pipeline on the full data.
data_final_params = data_clf.best_params_
pipeM.set_params(**data_final_params)
makeTimingCurve(data_x, data_y, pipeM, 'SVM', dataset)

# Learning curve over the solver iteration cap: 1, 11, ..., 241.
iteration_caps = range(1, 250, 10)
iterationLC(pipeM, data_train_x, data_train_y, data_test_x, data_test_y,
            {'SVM__max_iter': iteration_caps}, 'SVM',
            dataset=dataset, scorer='f1')
# MLP experiments for the spam dataset. The feature-selected ('spam_fs')
# variant of every step is disabled here.
spam_clf = basicResults(pipeS, spam_trgX, spam_trgY, spam_tstX, spam_tstY,
                        params_spam, 'ANN', 'spam')

# Selected parameters, and a copy of them with alpha forced to 0.
spam_final_params = spam_clf.best_params_
spam_OF_params = {**spam_final_params, 'MLP__alpha': 0}

# Timing curve with the selected parameters and early stopping switched off.
pipeS.set_params(**spam_final_params)
pipeS.set_params(MLP__early_stopping=False)
makeTimingCurve(spamX, spamY, pipeS, 'ANN', 'spam')

# Learning curve over the iteration cap: powers of two from 1 to 2048.
pipeS.set_params(**spam_final_params)
pipeS.set_params(MLP__early_stopping=False)
iteration_caps = [2 ** k for k in range(12)]
iterationLC(pipeS, spam_trgX, spam_trgY, spam_tstX, spam_tstY,
            {'MLP__max_iter': iteration_caps}, 'ANN', 'spam')

# Switch the pipeline to the alpha = 0 configuration for what follows.
pipeS.set_params(**spam_OF_params)
def main():
    """Run the linear and RBF SVM experiments on the adult-income data.

    Reads ``data/adult_parsed.csv`` and splits it 70/30 (stratified on
    ``income``). For a linear SGD-based SVM and then for ``primalSVM_RBF``,
    the pipeline is grid-searched through ``basicResults``, timed with the
    selected parameters, and run through learning curves over ``SVM__n_iter``,
    both with the selected parameters and with an ``alpha`` of 1e-16 (the
    ``*OF`` runs).

    The wine-quality variant of these experiments is disabled, so the
    feature-culling pipelines that served only it are not built.
    """
    adult = pd.read_csv('data/adult_parsed.csv')
    # ``axis`` is passed by keyword: pandas 2.0 no longer accepts it
    # positionally in DataFrame.drop.
    adult_income_X = adult.drop('income', axis=1).copy().values
    adult_income_Y = adult['income'].copy().values

    (adult_income_trgX, adult_income_tstX,
     adult_income_trgY, adult_income_tstY) = ms.train_test_split(
         adult_income_X, adult_income_Y, test_size=0.3, random_state=0,
         stratify=adult_income_Y)

    N_adult_income = adult_income_trgX.shape[0]

    # ---- Linear SVM ---------------------------------------------------------
    pipeA = Pipeline([
        ('Scale', StandardScaler()),
        ('SVM', SGDClassifier(loss='hinge', l1_ratio=0, penalty='l2',
                              class_weight='balanced', random_state=55)),
    ])

    params_adult_income = {
        'SVM__alpha': [100, 10, 1, 0.1, 0.001, 0.0001],
        # Was ``np.arange(0.1, 1, 10)``, which is the single value 0.1 and not
        # a usable iteration count. Use the same integer budget as the RBF
        # search below.
        'SVM__n_iter': [int((1e6 / N_adult_income) / .8) + 1]
    }

    adult_income_clf = basicResults(pipeA, adult_income_trgX,
                                    adult_income_trgY, adult_income_tstX,
                                    adult_income_tstY, params_adult_income,
                                    'SVM_Lin', 'adult_income')

    adult_income_final_params = adult_income_clf.best_params_
    # Fixed configuration for the linear 'SVM_LinOF' learning curve.
    adult_income_OF_params = {'SVM__n_iter': 55, 'SVM__alpha': 1e-16}

    pipeA.set_params(**adult_income_final_params)
    makeTimingCurve(adult_income_X, adult_income_Y, pipeA, 'SVM_Lin',
                    'adult_income')

    pipeA.set_params(**adult_income_final_params)
    iterationLC(pipeA, adult_income_trgX, adult_income_trgY,
                adult_income_tstX, adult_income_tstY,
                {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_Lin',
                'adult_income')

    pipeA.set_params(**adult_income_OF_params)
    iterationLC(pipeA, adult_income_trgX, adult_income_trgY,
                adult_income_tstX, adult_income_tstY,
                {'SVM__n_iter': np.arange(1, 200, 5)}, 'SVM_LinOF',
                'adult_income')

    # ---- RBF SVM ------------------------------------------------------------
    gamma_fracsA = np.arange(0.2, 2.1, 0.2)

    pipeA = Pipeline([('Scale', StandardScaler()), ('SVM', primalSVM_RBF())])

    params_adult_income = {
        'SVM__alpha': [100, 10, 1, 0.1, 0.001, 0.0001],
        'SVM__n_iter': [int((1e6 / N_adult_income) / .8) + 1],
        'SVM__gamma_frac': gamma_fracsA
    }

    adult_income_clf = basicResults(pipeA, adult_income_trgX,
                                    adult_income_trgY, adult_income_tstX,
                                    adult_income_tstY, params_adult_income,
                                    'SVM_RBF', 'adult_income')

    adult_income_final_params = adult_income_clf.best_params_
    # The OF variant keeps the selected parameters but sets alpha to 1e-16.
    adult_income_OF_params = adult_income_final_params.copy()
    adult_income_OF_params['SVM__alpha'] = 1e-16

    pipeA.set_params(**adult_income_final_params)
    # Time the pipeline that was just configured. This call previously passed
    # ``pipeM``, which never received the selected parameters.
    makeTimingCurve(adult_income_X, adult_income_Y, pipeA, 'SVM_RBF',
                    'adult_income')

    pipeA.set_params(**adult_income_final_params)
    iterationLC(pipeA, adult_income_trgX, adult_income_trgY,
                adult_income_tstX, adult_income_tstY,
                {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_RBF',
                'adult_income')

    pipeA.set_params(**adult_income_OF_params)
    iterationLC(pipeA, adult_income_trgX, adult_income_trgY,
                adult_income_tstX, adult_income_tstY,
                {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_RBF_OF',
                'adult_income')
# MLP experiments for the feature-selected spam pipeline ('spam_fs'). The
# plain 'spam' variant of every step is disabled here.

# Selected parameters, and a copy of them with alpha forced to 0.
spam_fs_final_params = spam_clf_fs.best_params_
spam_fs_OF_params = {**spam_fs_final_params, 'MLP__alpha': 0}

# Timing curve with the selected parameters and early stopping switched off.
pipeS_fs.set_params(**spam_fs_final_params)
pipeS_fs.set_params(MLP__early_stopping=False)
makeTimingCurve(spamX, spamY, pipeS_fs, 'ANN', 'spam_fs')

# Learning curve over the iteration cap: powers of two up to 2048, then
# 2100..3000 in steps of 100.
pipeS_fs.set_params(**spam_fs_final_params)
pipeS_fs.set_params(MLP__early_stopping=False)
iteration_caps = [2 ** k for k in range(12)] + list(range(2100, 3001, 100))
iterationLC(pipeS_fs, spam_trgX, spam_trgY, spam_tstX, spam_tstY,
            {'MLP__max_iter': iteration_caps}, 'ANN', 'spam_fs')
plt.title('ROC_Curve(Spambase)') plt.savefig('./output/SVM_Lin_ROC_Curve.png') plt.clf() cm = pd.DataFrame(confusion_matrix(spam_tstY, spam_clf.predict(spam_tstX))) cm.to_csv('./output/SVM_Lin_Confusion_matrix.csv') spam_final_params = spam_clf.best_params_ spam_OF_params = {'SVM__n_iter': 55, 'SVM__alpha': 1e-16} pipeS.set_params(**spam_final_params) makeTimingCurve(spamX, spamY, pipeS, 'SVM_Lin', 'spam') pipeS.set_params(**spam_final_params) iterationLC(pipeS, spam_trgX, spam_trgY, spam_tstX, spam_tstY, {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_Lin', 'spam') pipeS.set_params(**spam_OF_params) iterationLC(pipeS, spam_trgX, spam_trgY, spam_tstX, spam_tstY, {'SVM__n_iter': np.arange(1, 200, 5)}, 'SVM_LinOF', 'spam') #RBF SVM gamma_fracsS = np.arange(0.2, 2.1, 0.2) # pipeS_fs = Pipeline([('Scale', StandardScaler()), ('Cull1',
# KNN experiment for the 'ab' dataset: load, split, grid-search, time.
ab = pd.read_hdf('datasets.hdf', 'ab')
# ``axis`` is passed by keyword: pandas 2.0 no longer accepts it positionally
# in DataFrame.drop.
abX = ab.drop('rings', axis=1).copy().values
abY = ab['rings'].copy().values

# Stratified 70/30 train/test split with a fixed seed.
ab_trgX, ab_tstX, ab_trgY, ab_tstY = ms.train_test_split(abX,
                                                         abY,
                                                         test_size=0.3,
                                                         random_state=0,
                                                         stratify=abY)

# NOTE(review): ``hiddens_ab`` and ``alphas`` are not used by the KNN search
# below; they are kept in case code outside this stretch reads them.
d = abX.shape[1]
hiddens_ab = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
alphas = [10**-x for x in np.arange(1, 9.01, 1 / 2)]

pipeA = Pipeline([('Scale', StandardScaler()), ('KNN', knnC())])

# Grid: three distance metrics, k = 1, 4, ..., 49, and both weighting schemes.
params_ab = {
    'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
    'KNN__n_neighbors': np.arange(1, 51, 3),
    'KNN__weights': ['uniform', 'distance']
}

ab_clf = basicResults(pipeA, ab_trgX, ab_trgY, ab_tstX, ab_tstY, params_ab,
                      'KNN', 'ab')

# Apply the selected parameters, then time the pipeline on the full data.
ab_final_params = ab_clf.best_params_
pipeA.set_params(**ab_final_params)
makeTimingCurve(abX, abY, pipeA, 'KNN', 'ab')
'DT__class_weight': ['balanced'] } #madelon_clf = basicResults(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,params,'DT','madelon') adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, params, 'DT', 'adult') #madelon_final_params = {'DT__alpha': -0.00031622776601683794, 'DT__class_weight': 'balanced', 'DT__criterion': 'entropy'} #adult_final_params = {'class_weight': 'balanced', 'alpha': 0.0031622776601683794, 'criterion': 'entropy'} #madelon_final_params = madelon_clf.best_params_ adult_final_params = adult_clf.best_params_ #pipeM.set_params(**madelon_final_params) #makeTimingCurve(madelonX,madelonY,pipeM,'DT','madelon') pipeA.set_params(**adult_final_params) makeTimingCurve(adultX, adultY, pipeA, 'DT', 'adult') #DTpruningVSnodes(pipeM,alphas,madelon_trgX,madelon_trgY,'madelon') DTpruningVSnodes(pipeA, alphas, adult_trgX, adult_trgY, 'adult') # SPAM spam = pd.read_hdf('spam.hdf', 'spam') spam.describe() spamX = spam.drop('clas', 1).copy().values spamY = spam['clas'].copy().values spam_trgX, spam_tstX, spam_trgY, spam_tstY = ms.train_test_split( spamX, spamY, test_size=0.3, random_state=0, stratify=spamY) alphas = [ -1, -1e-3, -(1e-3) * 10**-0.5, -1e-2, -(1e-2) * 10**-0.5, -1e-1,
#madelon_final_params = {'MLP__hidden_layer_sizes': (500,), 'MLP__activation': 'logistic', 'MLP__alpha': 10.0} #adult_final_params ={'MLP__hidden_layer_sizes': (28, 28, 28), 'MLP__activation': 'logistic', 'MLP__alpha': 0.0031622776601683794} madelon_final_params = madelon_clf.best_params_ adult_final_params = adult_clf.best_params_ adult_OF_params = adult_final_params.copy() adult_OF_params['MLP__alpha'] = 0 madelon_OF_params = madelon_final_params.copy() madelon_OF_params['MLP__alpha'] = 0 #raise # pipeM.set_params(**madelon_final_params) pipeM.set_params(**{'MLP__early_stopping': False}) makeTimingCurve(madelonX, madelonY, pipeM, 'ANN', 'cancer') pipeA.set_params(**adult_final_params) pipeA.set_params(**{'MLP__early_stopping': False}) makeTimingCurve(adultX, adultY, pipeA, 'ANN', 'adult') pipeM.set_params(**madelon_final_params) pipeM.set_params(**{'MLP__early_stopping': False}) iterationLC( pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY, { 'MLP__max_iter': [2**x for x in range(12)] + [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000] }, 'ANN', 'cancer') pipeA.set_params(**adult_final_params) pipeA.set_params(**{'MLP__early_stopping': False}) iterationLC( pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, {
# MLP pipeline for the current dataset; no scaling step is included.
mlp = MLPClassifier(max_iter=2000, early_stopping=False, random_state=55)
pipeM = Pipeline([('MLP', mlp)])

# Hidden layouts: 1-3 layers, each d, d // 2 or 2 * d units wide, where d is
# the number of input columns.
d = data_x.shape[1]
layer_widths = [d, d // 2, d * 2]
hiddens_data = [(width, ) * depth
                for depth in [1, 2, 3]
                for width in layer_widths]

# alpha candidates: 10**1 down to 10**-3.
alphas = [10**-x for x in np.arange(-1, 3.01, 1)]

params = {
    'MLP__activation': ['relu', 'logistic'],
    'MLP__hidden_layer_sizes': hiddens_data,
    'MLP__alpha': alphas,
}

data_clf = basicResults(pipeM, data_train_x, data_train_y,
                        data_test_x, data_test_y, params, 'ANN', dataset)

# Apply the selected parameters, then time the pipeline on the full data.
data_final_params = data_clf.best_params_
pipeM.set_params(**data_final_params)
makeTimingCurve(data_x, data_y, pipeM, 'ANN', dataset)

# Learning curve over the iteration cap: powers of two from 1 to 128.
iteration_caps = [2 ** k for k in range(8)]
iterationLC(pipeM, data_train_x, data_train_y, data_test_x, data_test_y,
            {'MLP__max_iter': iteration_caps}, 'ANN', dataset=dataset)
# Linear SVM (hinge-loss SGD, L2 penalty) for the adult dataset.
linear_svm = SGDClassifier(loss='hinge', l1_ratio=0, penalty='l2',
                           class_weight='balanced', random_state=55)
pipeA = Pipeline([
    ('Scale', StandardScaler()),
    ('SVM', linear_svm),
])

# One iteration count, derived from the training-set size, for every alpha.
adult_n_iter = int((1e6 / N_adult) / .8) + 1
params_adult = {
    'SVM__alpha': alphas,
    'SVM__n_iter': [adult_n_iter],
}
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         params_adult, 'SVM_Lin', 'adult')

# Selected parameters, and a copy of them with alpha set to 1e-16.
adult_final_params = adult_clf.best_params_
adult_OF_params = {**adult_final_params, 'SVM__alpha': 1e-16}

# Timing curve with the selected parameters.
pipeA.set_params(**adult_final_params)
makeTimingCurve(adultX, adultY, pipeA, 'SVM_Lin', 'adult')

# Learning curve over the iteration count, selected parameters.
pipeA.set_params(**adult_final_params)
iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
            {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_Lin', 'adult')

# Same curve over a longer range with the alpha = 1e-16 configuration.
pipeA.set_params(**adult_OF_params)
iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
            {'SVM__n_iter': np.arange(1, 200, 5)}, 'SVM_LinOF', 'adult')

# RBF SVM
'DT__class_weight': ['balanced'] } #madelon_clf = basicResults(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,params,'DT','madelon') adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, params, 'DT', 'adult') #madelon_final_params = {'DT__alpha': -0.00031622776601683794, 'DT__class_weight': 'balanced', 'DT__criterion': 'entropy'} #adult_final_params = {'class_weight': 'balanced', 'alpha': 0.0031622776601683794, 'criterion': 'entropy'} #madelon_final_params = madelon_clf.best_params_ adult_final_params = adult_clf.best_params_ #pipeM.set_params(**madelon_final_params) #makeTimingCurve(madelonX,madelonY,pipeM,'DT','madelon') pipeA.set_params(**adult_final_params) makeTimingCurve(adultX, adultY, pipeA, 'DT', 'adult') #DTpruningVSnodes(pipeM,alphas,madelon_trgX,madelon_trgY,'madelon') DTpruningVSnodes(pipeA, alphas, adult_trgX, adult_trgY, 'adult') ################################################################################################### # # #banknote= pd.read_hdf('datasets.hdf','banknote') #banknote.describe() #banknoteX = banknote.drop('clas',1).copy().values #banknoteY = banknote['clas'].copy().values # #banknote_trgX, banknote_tstX, banknote_trgY, banknote_tstY = ms.train_test_split(banknoteX, banknoteY, test_size=0.7, random_state=0,stratify=banknoteY) # #alphas = [-1,-1e-3,-(1e-3)*10**-0.5, -1e-2, -(1e-2)*10**-0.5,-1e-1,-(1e-1)*10**-0.5, 0, (1e-1)*10**-0.5,1e-1,(1e-2)*10**-0.5,1e-2,(1e-3)*10**-0.5,1e-3]
'SVM__n_iter': [int((1e6 / N_redwine) / .8) + 1] } adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, params_adult, 'SVM_Lin', 'adult') redwine_clf = basicResults(pipeR, redwine_trgX, redwine_trgY, redwine_tstX, redwine_tstY, params_redwine, 'SVM_Lin', 'redwine') #adult_final_params ={'SVM__alpha': 0.001, 'SVM__n_iter': 54.75} adult_final_params = adult_clf.best_params_ adult_OF_params = {'SVM__n_iter': 55, 'SVM__alpha': 1e-16} redwine_final_params = redwine_clf.best_params_ redwine_OF_params = {'SVM__n_iter': 55, 'SVM__alpha': 1e-16} pipeA.set_params(**adult_final_params) makeTimingCurve(adultX, adultY, pipeA, 'SVM_Lin', 'adult') pipeR.set_params(**redwine_final_params) makeTimingCurve(redwineX, redwineY, pipeR, 'SVM_Lin', 'redwine') pipeA.set_params(**adult_final_params) iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_Lin', 'adult') pipeR.set_params(**redwine_final_params) iterationLC(pipeR, redwine_trgX, redwine_trgY, redwine_tstX, redwine_tstY, {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_Lin', 'redwine') pipeA.set_params(**adult_OF_params) iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, {'SVM__n_iter': np.arange(1, 200, 5)}, 'SVM_LinOF', 'adult') pipeR.set_params(**redwine_OF_params) iterationLC(pipeR, redwine_trgX, redwine_trgY, redwine_tstX, redwine_tstY,