# Tail of the NN grid-search parameter dict opened above this chunk (not
# visible here): sweeps the MLP activation function alongside the other keys.
'NN__activation': nn_activation }
# Part 2: grid-search an MLP on RF-importance-filtered digits features,
# scoring with macro-F1, and dump the full CV table for charting.
mlp = MLPClassifier(max_iter=2000, early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5, scoring='f1_macro')
gs.fit(digitsX, digitsY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'digits dim red.csv')
# raise
#%% data for 3
# Set this from chart 2 and dump, use clustering script to finish up
# Reduce abalone to the `dim` most important RF features, re-attach the
# labels as a final column named 'Class', and persist for the clustering
# script.
dim = 7
filtr = ImportanceSelect(rfc, dim)
abaloneX2 = filtr.fit_transform(abaloneX, abaloneY)
# atleast_2d(...).T turns the 1-D label vector into an (n, 1) column so it
# can be hstacked next to the (n, dim) feature matrix.
abalone2 = pd.DataFrame(np.hstack((abaloneX2, np.atleast_2d(abaloneY).T)))
cols = list(range(abalone2.shape[1]))
cols[-1] = 'Class'
abalone2.columns = cols
abalone2.to_hdf(out + 'datasets.hdf', 'abalone', complib='blosc', complevel=9)
# Same dump for digits at its chosen dimensionality.
dim = 41
filtr = ImportanceSelect(rfc, dim)
digitsX2 = filtr.fit_transform(digitsX, digitsY)
digits2 = pd.DataFrame(np.hstack((digitsX2, np.atleast_2d(digitsY).T)))
cols = list(range(digits2.shape[1]))
cols[-1] = 'Class'
# Tail of an MLPClassifier(...) call whose opening arguments are above this
# chunk (not visible here).
max_iter=2000, early_stopping=True, random_state=5)
# Part 2: grid-search the filter+NN pipeline on digits and dump CV results.
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, n_jobs=num_jobs, verbose=10, cv=5)
gs.fit(digitsX, digitsY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'digits dim red.csv')
# raise
#%% data for 3
# Set this from chart 2 and dump, use clustering script to finish up
# Reduce madelon to its chosen `dim` RF features, append labels as a final
# 'Class' column, and persist for the clustering stage.
dim = 20
filtr = ImportanceSelect(rfc, dim)
madelonX2 = filtr.fit_transform(madelonX, madelonY)
# atleast_2d(...).T makes the 1-D label vector an (n, 1) column for hstack.
madelon2 = pd.DataFrame(np.hstack((madelonX2, np.atleast_2d(madelonY).T)))
cols = list(range(madelon2.shape[1]))
cols[-1] = 'Class'
madelon2.columns = cols
madelon2.to_hdf(out + 'datasets.hdf', 'madelon', complib='blosc', complevel=9)
# Same dump for digits at its chosen dimensionality.
dim = 40
filtr = ImportanceSelect(rfc, dim)
digitsX2 = filtr.fit_transform(digitsX, digitsY)
digits2 = pd.DataFrame(np.hstack((digitsX2, np.atleast_2d(digitsY).T)))
cols = list(range(digits2.shape[1]))
cols[-1] = 'Class'
# Part 2 (tail): grid-search the RF-filter + NN pipeline on the page-blocks
# data and dump the CV table.
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(blocks_X, blocks_Y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'pageblocks_RF_dim_red.csv')
#%% For part 4
# Benchmark: reduce to the 5 most important RF features up front, then
# grid-search only the NN alpha on the reduced data, timing the fit.
out = './Part4/'
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=5)
dim = 5
filtr = ImportanceSelect(rfc, dim)
blocks_X2 = filtr.fit_transform(blocks_X, blocks_Y)
grid = {'NN__alpha': nn_reg}
mlp = MLPClassifier(solver='lbfgs', activation='logistic', hidden_layer_sizes=(50, ), max_iter=2000, early_stopping=True, random_state=5)
pipe = Pipeline([('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
start = time()
gs.fit(blocks_X2, blocks_Y)
# FIX: was a Python-2-only `print "..." % ...` statement; the print()
# call form produces identical output on both Python 2 and Python 3.
print("Benchmark run time: %.2f seconds" % (time() - start))
tmp = pd.DataFrame(gs.cv_results_)
max_iter=2000, early_stopping=True, random_state=5) pipe = Pipeline([('filter', filtr), ('NN', mlp)]) gs = GridSearchCV(pipe, grid, verbose=10, cv=5) gs.fit(digitX, digitY) tmp = pd.DataFrame(gs.cv_results_) tmp.to_csv(out + 'digit dim red.csv') # data for 3 # Set this from chart 2 and dump, use clustering script to finish up dim = 6 filtr = ImportanceSelect(rfc, dim) wineX2 = filtr.fit_transform(wineX, wineY) wine2 = pd.DataFrame(np.hstack((wineX2, np.atleast_2d(wineY)))) cols = list(range(wine2.shape[1])) cols[-1] = 'Class' wine2.columns = cols wine2.to_csv(out + 'wine_datasets.csv', index=False, header=False) dim = 60 filtr = ImportanceSelect(rfc, dim) digitX2 = filtr.fit_transform(digitX, digitY) digit2 = pd.DataFrame(np.hstack((digitX2, np.atleast_2d(digitY)))) cols = list(range(digit2.shape[1])) cols[-1] = 'Class' digit2.columns = cols digit2.to_csv(out + 'digit_datasets.csv', index=False, header=False)
def main():
    """Run the random-forest feature-importance experiments.

    Loads the madelon and character datasets from ./BASES/datasets.hdf,
    produces (1) sorted feature-importance scree curves, (2) grid-search
    results for an RF-filter + MLP pipeline over candidate dimensionalities,
    and (3) dimensionality-reduced copies of both datasets written back to
    the HDF store for the downstream clustering script.
    """
    out = './BASES/'
    np.random.seed(0)
    character = pd.read_hdf('./BASES/datasets.hdf', 'character')
    # FIX: the positional `axis` argument to DataFrame.drop was deprecated
    # and removed in pandas 2.0 -- use the explicit keyword (same behavior
    # on all pandas versions).
    character_X = character.drop('Class', axis=1).copy().values
    character_Y = character['Class'].copy().values
    madelon = pd.read_hdf('./BASES/datasets.hdf', 'madelon')
    madelon_X = madelon.drop('Class', axis=1).copy().values
    madelon_Y = madelon['Class'].copy().values
    madelon_X = StandardScaler().fit_transform(madelon_X)
    character_X = StandardScaler().fit_transform(character_X)
    # clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    # Candidate dimensionalities to sweep (large set for madelon, small for
    # character).
    dim_red = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    dims_red_s = [2, 4, 6, 8, 10, 12, 14, 16]
    # %% data for 1: sorted RF feature importances ("scree" curves).
    rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=5, n_jobs=7)
    fs_madelon = rfc.fit(madelon_X, madelon_Y).feature_importances_
    fs_character = rfc.fit(character_X, character_Y).feature_importances_
    tmp = pd.Series(np.sort(fs_madelon)[::-1])
    tmp.to_csv(out + 'madelon scree.csv')
    tmp = pd.Series(np.sort(fs_character)[::-1])
    tmp.to_csv(out + 'character_scree.csv')
    # %% Data for 2: grid-search filter dimensionality + NN hyperparameters.
    filtr = ImportanceSelect(rfc)
    grid = {'filter__n': dim_red, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(madelon_X, madelon_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon dim red.csv')
    grid = {'filter__n': dims_red_s, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(character_X, character_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'character_dim_red.csv')
    # raise
    # %% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up.
    # NOTE: this overwrites the 'madelon'/'character' keys in the same HDF
    # store the raw data was read from.
    dim = 10
    filtr = ImportanceSelect(rfc, dim)
    madelon_X2 = filtr.fit_transform(madelon_X, madelon_Y)
    # atleast_2d(...).T makes the 1-D labels an (n, 1) column for hstack.
    madelon_2 = pd.DataFrame(np.hstack((madelon_X2, np.atleast_2d(madelon_Y).T)))
    cols = list(range(madelon_2.shape[1]))
    cols[-1] = 'Class'
    madelon_2.columns = cols
    madelon_2.to_hdf(out + 'datasets.hdf', 'madelon', complib='blosc', complevel=9)
    dim = 10
    filtr = ImportanceSelect(rfc, dim)
    character_X2 = filtr.fit_transform(character_X, character_Y)
    character_2 = pd.DataFrame(np.hstack((character_X2, np.atleast_2d(character_Y).T)))
    cols = list(range(character_2.shape[1]))
    cols[-1] = 'Class'
    character_2.columns = cols
    character_2.to_hdf(out + 'datasets.hdf', 'character', complib='blosc', complevel=9)
# Grid-search the RF-filter + MLP pipeline on the perm data (using the
# `grid` built above this chunk) and write the CV table out for charting.
mlp = MLPClassifier(
    activation='relu',
    max_iter=nn_iter,
    early_stopping=True,
    random_state=5,
)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(perm_x, perm_y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'perm dim red.csv')

# Repeat for housing with its (larger) dimensionality sweep.
grid = dict(
    filter__n=dims_big,
    NN__alpha=nn_reg,
    NN__hidden_layer_sizes=nn_layers,
)
mlp = MLPClassifier(
    activation='relu',
    max_iter=nn_iter,
    early_stopping=True,
    random_state=5,
)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(housing_x, housing_y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'housing dim red.csv')

#3
# Reduce each dataset to its chosen dimensionality and hand off to the
# shared clustering routine.
dim = 5
filtr = ImportanceSelect(rfc, dim)
perm_x2 = filtr.fit_transform(perm_x, perm_y)
dim = 9
filtr = ImportanceSelect(rfc, dim)
housing_x2 = filtr.fit_transform(housing_x, housing_y)
run_clustering(out, perm_x2, perm_y, housing_x2, housing_y)
# Fit the grid search built above this chunk on digits and dump CV results.
gs.fit(digitsX, digitsY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'digits dim red.csv')
#raise
#%% data for task 3 but find the good dim values first, use clustering script to finish up
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=5, n_jobs=1)
# Reduce diamonds to its chosen `dim` RF features, append labels as a final
# 'Class' column, and persist for the clustering stage.
dim = 6
filtr = ImportanceSelect(rfc, dim)
diamondsX2 = filtr.fit_transform(diamondsX, diamondsY)
# atleast_2d(...).T makes the 1-D label vector an (n, 1) column for hstack.
diamonds2 = pd.DataFrame( np.hstack((diamondsX2, np.atleast_2d(diamondsY).T)))
cols = list(range(diamonds2.shape[1]))
cols[-1] = 'Class'
diamonds2.columns = cols
diamonds2.to_hdf(out + 'datasets.hdf', 'diamonds', complib='blosc', complevel=9)
# Same dump for digits at its chosen dimensionality (continues past this
# chunk).
dim = 45
filtr = ImportanceSelect(rfc, dim)
digitsX2 = filtr.fit_transform(digitsX, digitsY)
digits2 = pd.DataFrame(np.hstack((digitsX2, np.atleast_2d(digitsY).T)))
cols = list(range(digits2.shape[1]))
# Tail of an MLPClassifier(...) call whose opening arguments are above this
# chunk (not visible here).
max_iter=2000, early_stopping=True, random_state=5)
# Grid-search the filter+NN pipeline on the cancer data and dump CV results.
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(cancerX, cancerY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'cancer dim red.csv')
# raise
#%% data for 3
# Set this from chart 2 and dump, use clustering script to finish up
# Reduce wine to its chosen `dim` RF features, append labels as a final
# 'class' column, and persist for the clustering stage.
dim = 7
filtr = ImportanceSelect(rfc, dim)
wineX2 = filtr.fit_transform(wineX, wineY)
# atleast_2d(...).T makes the 1-D label vector an (n, 1) column for hstack.
wine2 = pd.DataFrame(np.hstack((wineX2, np.atleast_2d(wineY).T)))
cols = list(range(wine2.shape[1]))
cols[-1] = 'class'
wine2.columns = cols
wine2.to_hdf(out + 'datasets.hdf', 'wine', complib='blosc', complevel=9)
# Same dump for cancer; the to_hdf call is cut off at this chunk boundary
# and continues below.
dim = 6
filtr = ImportanceSelect(rfc, dim)
cancerX2 = filtr.fit_transform(cancerX, cancerY)
cancer2 = pd.DataFrame(np.hstack((cancerX2, np.atleast_2d(cancerY).T)))
cols = list(range(cancer2.shape[1]))
cols[-1] = 'class'
cancer2.columns = cols
cancer2.to_hdf(out + 'datasets.hdf', 'cancer',
# Breast-cancer dataset: dump the sorted RF feature-importance scree, plot
# it, then persist a 10-feature reduced copy with labels in a 'Class' column.
cluster_range = range(1, 11)
dims = range(1, 30)
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=5, n_jobs=-1)
fs_br = rfc.fit(brX, brY).feature_importances_
tmp = pd.Series(np.sort(fs_br)[::-1])
tmp.to_csv('./RF/breast_scree.csv')
barplot_breast(tmp)
dim = 10
filtr = ImportanceSelect(rfc, dim)
brX2 = filtr.fit_transform(brX, brY)
# atleast_2d(...).T makes the 1-D label vector an (n, 1) column for hstack.
br2 = pd.DataFrame(np.hstack((brX2, np.atleast_2d(brY).T)))
cols = list(range(br2.shape[1]))
cols[-1] = 'Class'
br2.columns = cols
br2.to_csv('./RF/breast.csv')
# Abalone Dataset
abalone = pd.read_csv('./BASE/abalone.csv')
# NOTE(review): positional `axis` in DataFrame.drop is deprecated (removed
# in pandas 2.0); prefer drop('Class', axis=1) when touching this line.
abaloneX = abalone.drop('Class', 1).copy().values
abaloneY = abalone['Class'].copy().values
abaloneX = StandardScaler().fit_transform(abaloneX)
# Reset the sweep ranges for abalone (processing continues past this chunk).
cluster_range = range(1, 11)
dims = range(1, 10)
# Tail of an MLPClassifier(...) call whose opening arguments are above this
# chunk (not visible here).
max_iter=2000, early_stopping=True, random_state=5)
# Grid-search the filter+NN pipeline on digits and dump CV results.
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(digitsX, digitsY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'digits dim red.csv')
# raise
#%% data for 3
# Set this from chart 2 and dump, use clustering script to finish up
# Reduce biodeg to its chosen `dim` RF features, append labels as a final
# 'Class' column, and persist for the clustering stage.
dim = 35
filtr = ImportanceSelect(rfc, dim)
biodegX2 = filtr.fit_transform(biodegX, biodegY)
# atleast_2d(...).T makes the 1-D label vector an (n, 1) column for hstack.
biodeg2 = pd.DataFrame(np.hstack((biodegX2, np.atleast_2d(biodegY).T)))
cols = list(range(biodeg2.shape[1]))
cols[-1] = 'Class'
biodeg2.columns = cols
biodeg2.to_hdf(out + 'datasets.hdf', 'biodeg', complib='blosc', complevel=9)
# Same dump for digits at its chosen dimensionality.
dim = 40
filtr = ImportanceSelect(rfc, dim)
digitsX2 = filtr.fit_transform(digitsX, digitsY)
digits2 = pd.DataFrame(np.hstack((digitsX2, np.atleast_2d(digitsY).T)))
cols = list(range(digits2.shape[1]))
cols[-1] = 'Class'
# Tail of an MLPClassifier(...) call whose opening arguments are above this
# chunk (not visible here).
momentum=0.3)
# Grid-search the filter+NN pipeline on the breast-cancer (bc) data and
# dump CV results.
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(bcX, bcY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out1 + 'bc dim red.csv')
# raise
#%% data for 3
# Set this from chart 2 and dump, use clustering script to finish up
# Reduce the steel-faults data to its chosen `dim` RF features, append
# labels as a final 'labels' column, and persist for the clustering stage.
dim = 24
filtr = ImportanceSelect(rfc, dim)
faultsX2 = filtr.fit_transform(faultsX, faultsY)
# atleast_2d(...).T makes the 1-D label vector an (n, 1) column for hstack.
faults2 = pd.DataFrame(np.hstack((faultsX2, np.atleast_2d(faultsY).T)))
cols = list(range(faults2.shape[1]))
cols[-1] = 'labels'
faults2.columns = cols
faults2.to_hdf(out + 'datasets.hdf', 'train_faults', complib='blosc', complevel=9)
# Same treatment for bc; note its label column is named 'diagnosis'
# (processing continues past this chunk).
dim = 10
filtr = ImportanceSelect(rfc, dim)
bcX2 = filtr.fit_transform(bcX, bcY)
bc2 = pd.DataFrame(np.hstack((bcX2, np.atleast_2d(bcY).T)))
cols = list(range(bc2.shape[1]))
cols[-1] = 'diagnosis'
# NOTE(review): this first statement appears to sit inside a loop over class
# labels `n` that begins above this chunk -- indentation was lost in this
# source; confirm the loop structure against the original file.
# FIX: DataFrame.append was deprecated and removed in pandas 2.0; pd.concat
# produces the identical concatenated frame on all pandas versions.
blocks_balanced = pd.concat([blocks_balanced, blocks[blocks['Class'] == n]])
# FIX: positional `axis` argument to DataFrame.drop was deprecated and
# removed in pandas 2.0 -- use the explicit keyword (same behavior).
blocks_X = blocks_balanced.drop('Class', axis=1).copy().values
blocks_Y = blocks_balanced['Class'].copy().values
blocks_X = StandardScaler().fit_transform(blocks_X)
# FIX: was a Python-2-only `print` statement; print() of the single tuple
# produces the same output on both Python 2 and Python 3.
print(blocks_X.shape)
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=5, n_jobs=7)
#%% Select features by random forests
# Keep the 5 most important RF features of the loans data and re-attach the
# labels as a final column named 'Class'.
dim = 5
filtr = ImportanceSelect(rfc, dim)
loansX2 = filtr.fit_transform(loans_X, loans_Y)
# atleast_2d(...).T makes the 1-D label vector an (n, 1) column for hstack.
loans2 = pd.DataFrame(np.hstack((loansX2, np.atleast_2d(loans_Y).T)))
cols = list(range(loans2.shape[1]))
cols[-1] = 'Class'
loans2.columns = cols
#madelon2.to_hdf(out+'datasets.hdf','madelon',complib='blosc',complevel=9)
#%%Clustering on selected data
# Set up k-means and GMM sweeps over the candidate cluster counts;
# accumulator lists are filled by code past this chunk.
km = kmeans(random_state=5)
gmm = GMM(random_state=5)
clusters = [2, 3, 4, 5, 8, 12, 15, 18, 21, 25]
loans_km_acc = []
loans_gmm_acc = []
loans_km_score = []
# Tail of an MLPClassifier(...) call whose opening arguments are above this
# chunk (not visible here).
max_iter=2000, early_stopping=True, random_state=5)
# Grid-search the filter+NN pipeline on the cancer data and dump CV results.
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(cancerX, cancerY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'cancer dim red.csv')
# raise
#%% data for 3
# Set this from chart 2 and dump, use clustering script to finish up
# Reduce the contraceptive data to its chosen `dim` RF features, append
# labels as a final 'class' column, and persist for the clustering stage.
dim = 7
filtr = ImportanceSelect(rfc, dim)
contraX2 = filtr.fit_transform(contraX, contraY)
# atleast_2d(...).T makes the 1-D label vector an (n, 1) column for hstack.
contra2 = pd.DataFrame(np.hstack((contraX2, np.atleast_2d(contraY).T)))
cols = list(range(contra2.shape[1]))
cols[-1] = 'class'
contra2.columns = cols
contra2.to_hdf(out + 'datasets.hdf', 'contra', complib='blosc', complevel=9)
# Same treatment for cancer (processing continues past this chunk).
dim = 6
filtr = ImportanceSelect(rfc, dim)
cancerX2 = filtr.fit_transform(cancerX, cancerY)
cancer2 = pd.DataFrame(np.hstack((cancerX2, np.atleast_2d(cancerY).T)))
cols = list(range(cancer2.shape[1]))
cols[-1] = 'class'
# Part 4: grid-search the RF-filter + MLP pipeline on cancer, sweeping the
# filter dimensionality and NN hyperparameters, and dump the CV table.
grid = dict(
    filter__n=dims,
    NN__alpha=nn_reg,
    NN__hidden_layer_sizes=nn_layers,
)
mlp = MLPClassifier(
    activation='relu',
    max_iter=nn_iter,
    early_stopping=True,
    random_state=5,
)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(cancer_x, cancer_y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'cancer part 4.csv')

# Repeat for housing with its (larger) dimensionality sweep.
grid = dict(
    filter__n=dims_big,
    NN__alpha=nn_reg,
    NN__hidden_layer_sizes=nn_layers,
)
mlp = MLPClassifier(
    activation='relu',
    max_iter=nn_iter,
    early_stopping=True,
    random_state=5,
)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(housing_x, housing_y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'housing part 4.csv')

#3
# Reduce each dataset to its chosen dimensionality and hand off to the
# shared clustering routine.
dim = 7
filtr = ImportanceSelect(rfc, dim)
cancer_x2 = filtr.fit_transform(cancer_x, cancer_y)
dim = 7
filtr = ImportanceSelect(rfc, dim)
housing_x2 = filtr.fit_transform(housing_x, housing_y)
run_clustering(out, cancer_x2, cancer_y, housing_x2, housing_y)