def reduction_cluster_nn(X, y, problem):
    # `out` is a module-level output directory defined elsewhere.
    n = len(X[0])
    sm = SMOTE()
    rf = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                                random_state=5, n_jobs=7)
    filtr = ImportanceSelect(rf)
    km = KMeans(random_state=5)
    mlp = MLPClassifier(solver='adam', alpha=1e-5, shuffle=True,
                        early_stopping=True, activation='relu', verbose=True)
    # Oversample the minority class before the grid search.
    # (fit_sample in the original; renamed fit_resample in imblearn >= 0.4)
    X_res, y_res = sm.fit_resample(X, y)
    parameters = {
        'NN__hidden_layer_sizes': [(n, n, n, n, n)],
        'filtr__n': [2, 5, 10, 15, 20],
        'km__n_clusters': [2, 3, 4, 5, 6],
    }
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2)  # no need for this given 50000 random sample
    pipe = Pipeline([('filtr', filtr), ('km', km), ('NN', mlp)])
    gs = GridSearchCV(pipe, parameters, verbose=10, cv=sss)
    gs.fit(X_res, y_res)
    clf = gs.best_estimator_
    print(clf)
    print(gs.best_score_)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + problem + ' dr_cluster_nn.csv')
    return clf, gs.best_score_, gs
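# Every snippet in this section depends on ImportanceSelect, a custom
# transformer that is never defined here. Below is a minimal sketch of what it
# presumably looks like (keep the top-n features ranked by the wrapped model's
# feature_importances_), together with the imports these scripts assume.
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE


class ImportanceSelect(BaseEstimator, TransformerMixin):
    """Select the n most important features according to a fitted tree model."""

    def __init__(self, model, n=1):
        self.model = model
        self.n = n

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X):
        # Column indices of the n largest importances, in descending order.
        return X[:, self.model.feature_importances_.argsort()[::-1][:self.n]]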
fi_RF_pageblocks = rfc.fit(blocks_X, blocks_Y).feature_importances_
plt.plot(range(10), fi_RF_pageblocks, marker='o', markersize=4, linestyle='-')
plt.title("Pageblocks Feature Importance by RF")
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.savefig(out + 'Pageblock_part2_RF.png')
plt.close()
tmp2 = pd.Series(np.sort(fi_RF_pageblocks)[::-1])
tmp2.to_csv(out + 'fi_RF_pageblocks.csv')

#%% Validation for part 2
filtr = ImportanceSelect(rfc)
dims1 = [2, 4, 5, 10, 15, 20, 26]
grid = {'filter__n': dims1}
# Note: early_stopping only takes effect with solver='sgd' or 'adam';
# it is ignored by 'lbfgs'.
mlp = MLPClassifier(solver='lbfgs', activation='identity', alpha=0.1,
                    hidden_layer_sizes=(50,), max_iter=2000,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(loans_X, loans_Y)
tmp = pd.DataFrame(gs.cv_results_)
                             random_state=5, n_jobs=7)
fs_madelon = rfc.fit(madelonX, madelonY).feature_importances_
fs_digits = rfc.fit(digitsX, digitsY).feature_importances_
tmp = pd.Series(np.sort(fs_madelon)[::-1])
tmp.to_csv(out + 'madelon scree.csv')
tmp = pd.Series(np.sort(fs_digits)[::-1])
tmp.to_csv(out + 'digits scree.csv')
# raise

#%% Data for 2
if flag == 1:
    nn_arch = nn_arch_madelon
filtr = ImportanceSelect(rfc)
grid = {
    'filter__n': dims,
    'NN__alpha': nn_reg,
    'NN__hidden_layer_sizes': nn_arch
}
mlp = MLPClassifier(activation='relu', max_iter=2000,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, n_jobs=num_jobs, verbose=10, cv=5)
gs.fit(madelonX, madelonY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'Madelon dim red.csv')
def main():
    out = './BASES/'
    np.random.seed(0)

    character = pd.read_hdf('./BASES/datasets.hdf', 'character')
    character_X = character.drop('Class', axis=1).copy().values
    character_Y = character['Class'].copy().values
    madelon = pd.read_hdf('./BASES/datasets.hdf', 'madelon')
    madelon_X = madelon.drop('Class', axis=1).copy().values
    madelon_Y = madelon['Class'].copy().values
    madelon_X = StandardScaler().fit_transform(madelon_X)
    character_X = StandardScaler().fit_transform(character_X)
    # clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    dim_red = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    dims_red_s = [2, 4, 6, 8, 10, 12, 14, 16]

    # %% data for 1
    rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                                 random_state=5, n_jobs=7)
    fs_madelon = rfc.fit(madelon_X, madelon_Y).feature_importances_
    fs_character = rfc.fit(character_X, character_Y).feature_importances_
    tmp = pd.Series(np.sort(fs_madelon)[::-1])
    tmp.to_csv(out + 'madelon scree.csv')
    tmp = pd.Series(np.sort(fs_character)[::-1])
    tmp.to_csv(out + 'character_scree.csv')

    # %% Data for 2
    filtr = ImportanceSelect(rfc)
    grid = {'filter__n': dim_red, 'NN__alpha': nn_reg,
            'NN__hidden_layer_sizes': nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000,
                        early_stopping=True, random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(madelon_X, madelon_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon dim red.csv')

    grid = {'filter__n': dims_red_s, 'NN__alpha': nn_reg,
            'NN__hidden_layer_sizes': nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000,
                        early_stopping=True, random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
    gs.fit(character_X, character_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'character_dim_red.csv')
    # raise

    # %% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 10
    filtr = ImportanceSelect(rfc, dim)
    madelon_X2 = filtr.fit_transform(madelon_X, madelon_Y)
    madelon_2 = pd.DataFrame(np.hstack((madelon_X2, np.atleast_2d(madelon_Y).T)))
    cols = list(range(madelon_2.shape[1]))
    cols[-1] = 'Class'
    madelon_2.columns = cols
    # CAUTION: out + 'datasets.hdf' is the same file/key the raw data was read
    # from above, so this overwrites the original 'madelon' table.
    madelon_2.to_hdf(out + 'datasets.hdf', 'madelon', complib='blosc', complevel=9)

    dim = 10
    filtr = ImportanceSelect(rfc, dim)
    character_X2 = filtr.fit_transform(character_X, character_Y)
    character_2 = pd.DataFrame(np.hstack((character_X2, np.atleast_2d(character_Y).T)))
    cols = list(range(character_2.shape[1]))
    cols[-1] = 'Class'
    character_2.columns = cols
    character_2.to_hdf(out + 'datasets.hdf', 'character', complib='blosc', complevel=9)
                    max_iter=1000, early_stopping=True, random_state=5)
pipe = Pipeline([('rp', rp), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(wineX, wineY)
results[i]['RP'] = 100. * gs.best_score_
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv('./RP/nn.csv')

rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                             random_state=5, n_jobs=-1)
filtr = ImportanceSelect(rfc)
grid = {
    'filter__n': [i],
    'NN__alpha': nn_reg,
    'NN__hidden_layer_sizes': nn_arch
}
mlp = MLPClassifier(activation='relu', max_iter=1000,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(wineX, wineY)
results[i]['RF'] = 100. * gs.best_score_
tmp = pd.DataFrame(gs.cv_results_)
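# The snippet above indexes into results[i], which presumes an outer loop over
# candidate dimensionalities and a nested results dict. A minimal sketch of
# that scaffolding (the name `dims` and the output path are assumptions, not
# from the source):
from collections import defaultdict

results = defaultdict(dict)  # results[dim][method] -> best CV accuracy (%)
for i in dims:
    pass  # ... run the RP and RF grid searches above for dimensionality i ...

pd.DataFrame(results).T.to_csv('./nn_dim_comparison.csv')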
indices = np.argsort(fs_contra)[::-1]
for f in range(contraX.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], fs_contra[indices[f]]))

tmp = pd.Series(np.sort(fs_cancer)[::-1])
tmp.to_csv(out + 'cancer scree.csv')
indices = np.argsort(fs_cancer)[::-1]
for f in range(cancerX.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], fs_cancer[indices[f]]))

#%% Data for 2
filtr = ImportanceSelect(rfc)
grid = {
    'filter__n': dims_contra,
    'NN__alpha': nn_reg,
    'NN__hidden_layer_sizes': nn_arch
}
mlp = MLPClassifier(activation='relu', max_iter=2000,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(contraX, contraY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'contra dim red.csv')
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                             random_state=5, n_jobs=7)
fs_wine = rfc.fit(wineX, wineY).feature_importances_
fs_digit = rfc.fit(digitX, digitY).feature_importances_
tmp = pd.Series(np.sort(fs_wine)[::-1])
tmp.to_csv(out + 'wine scree.csv')
tmp = pd.Series(np.sort(fs_digit)[::-1])
tmp.to_csv(out + 'digit scree.csv')

# Data for 2
filtr = ImportanceSelect(rfc)
grid = {
    'filter__n': dims_wine,
    'NN__alpha': nn_reg,
    'NN__hidden_layer_sizes': nn_arch
}
mlp = MLPClassifier(activation='relu', max_iter=2000,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(wineX, wineY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'wine dim red.csv')
cluster_range = range(1, 11)
dims = range(1, 30)
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                             random_state=5, n_jobs=-1)
fs_br = rfc.fit(brX, brY).feature_importances_
tmp = pd.Series(np.sort(fs_br)[::-1])
tmp.to_csv('./RF/breast_scree.csv')
barplot_breast(tmp)

dim = 10
filtr = ImportanceSelect(rfc, dim)
brX2 = filtr.fit_transform(brX, brY)
br2 = pd.DataFrame(np.hstack((brX2, np.atleast_2d(brY).T)))
cols = list(range(br2.shape[1]))
cols[-1] = 'Class'
br2.columns = cols
br2.to_csv('./RF/breast.csv')

# Abalone Dataset
abalone = pd.read_csv('./BASE/abalone.csv')
abaloneX = abalone.drop('Class', axis=1).copy().values
abaloneY = abalone['Class'].copy().values
abaloneX = StandardScaler().fit_transform(abaloneX)
cluster_range = range(1, 11)
dims = range(1, 10)
# raise Exception('Remove this line to run code')

# 2
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                             random_state=5, n_jobs=7)
fs_perm = rfc.fit(perm_x, perm_y).feature_importances_
fs_housing = rfc.fit(housing_x, housing_y).feature_importances_
tmp = pd.Series(np.sort(fs_perm)[::-1])
tmp.to_csv(out + 'perm scree.csv')
tmp = pd.Series(np.sort(fs_housing)[::-1])
tmp.to_csv(out + 'housing scree.csv')

# 4
filtr = ImportanceSelect(rfc)
grid = {'filter__n': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_layers}
mlp = MLPClassifier(activation='relu', max_iter=nn_iter, early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(perm_x, perm_y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'perm dim red.csv')

grid = {'filter__n': dims_big, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_layers}
mlp = MLPClassifier(activation='relu', max_iter=nn_iter, early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                             random_state=5, n_jobs=7)
fs_biodeg = rfc.fit(biodegX, biodegY).feature_importances_
fs_digits = rfc.fit(digitsX, digitsY).feature_importances_
tmp = pd.Series(np.sort(fs_biodeg)[::-1])
tmp.to_csv(out + 'biodeg scree.csv')
tmp = pd.Series(np.sort(fs_digits)[::-1])
tmp.to_csv(out + 'digits scree.csv')

#%% Data for 2
filtr = ImportanceSelect(rfc)
grid = {
    'filter__n': dimsb,
    'NN__alpha': nn_reg,
    'NN__hidden_layer_sizes': nn_arch
}
mlp = MLPClassifier(activation='relu', max_iter=2000,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(biodegX, biodegY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'Biodeg dim red.csv')
tmp.to_csv(out + 'wine scree.csv')
indices = np.argsort(fs_wine)[::-1]
for f in range(wineX.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], fs_wine[indices[f]]))

tmp = pd.Series(np.sort(fs_cancer)[::-1])
tmp.to_csv(out + 'cancer scree.csv')
indices = np.argsort(fs_cancer)[::-1]
for f in range(cancerX.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], fs_cancer[indices[f]]))

#%% Data for 2
filtr = ImportanceSelect(rfc)
grid = {
    'filter__n': dims_wine,
    'NN__alpha': nn_reg,
    'NN__hidden_layer_sizes': nn_arch
}
mlp = MLPClassifier(activation='relu', max_iter=2000,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(wineX, wineY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'wine dim red.csv')
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                             random_state=5, n_jobs=7)
fs_faults = rfc.fit(faultsX, faultsY).feature_importances_
fs_bc = rfc.fit(bcX, bcY).feature_importances_
tmp = pd.Series(np.sort(fs_faults)[::-1])
tmp.to_csv(out1 + 'faults scree.csv')
tmp = pd.Series(np.sort(fs_bc)[::-1])
tmp.to_csv(out1 + 'bc scree.csv')

#%% Data for 2
filtr = ImportanceSelect(rfc)
grid = {
    'filter__n': dims,
    'NN__alpha': nn_reg,
    'NN__hidden_layer_sizes': nn_arch
}
mlp = MLPClassifier(activation='relu', max_iter=200, early_stopping=True,
                    random_state=5, learning_rate_init=0.1, momentum=0.3)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(faultsX, faultsY)
fs_digits = rfc.fit(digitsX, digitsY).feature_importances_
print('Part 2D - Starting RF for segmentation dataset...')
fs_seg = rfc.fit(segX, segY).feature_importances_
tmp = pd.Series(np.sort(fs_digits)[::-1])
tmp.to_csv('./P2_Dimensionality_Reduction/digits_RF_feature_importance.csv')
tmp = pd.Series(np.sort(fs_seg)[::-1])
tmp.to_csv('./P2_Dimensionality_Reduction/seg_RF_feature_importance.csv')

# Run Neural Networks
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                             random_state=5, n_jobs=7)
filtr = ImportanceSelect(rfc)
grid = {
    'filter__n': dims_digits,
    'NN__learning_rate_init': nn_lr,
    'NN__hidden_layer_sizes': nn_arch
}
mlp = MLPClassifier(activation='relu', max_iter=2000,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(digitsX, digitsY)
nn_results = pd.DataFrame(gs.cv_results_)
nn_results.to_csv('./P4_Neural_Networks_Reduced/digits_RF_nn_results.csv')
blocks_balanced = class1
for n in range(2, 6):
    # DataFrame.append was removed in pandas 2.0; pd.concat is the modern form.
    blocks_balanced = pd.concat([blocks_balanced, blocks[blocks['Class'] == n]])
blocks_X = blocks_balanced.drop('Class', axis=1).copy().values
blocks_Y = blocks_balanced['Class'].copy().values
blocks_X = StandardScaler().fit_transform(blocks_X)
print(blocks_X.shape)
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                             random_state=5, n_jobs=7)

#%% Select features by random forests
dim = 5
filtr = ImportanceSelect(rfc, dim)
loansX2 = filtr.fit_transform(loans_X, loans_Y)
loans2 = pd.DataFrame(np.hstack((loansX2, np.atleast_2d(loans_Y).T)))
cols = list(range(loans2.shape[1]))
cols[-1] = 'Class'
loans2.columns = cols
#madelon2.to_hdf(out+'datasets.hdf','madelon',complib='blosc',complevel=9)

#%% Clustering on selected data
km = KMeans(random_state=5)
gmm = GMM(random_state=5)
clusters = [2, 3, 4, 5, 8, 12, 15, 18, 21, 25]
loans_km_acc = []
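# The snippet stops right after initializing loans_km_acc. A hypothetical
# continuation (not in the source): score each k by mapping every k-means
# cluster to its majority class and measuring accuracy against loans_Y.
from sklearn.metrics import accuracy_score

for k in clusters:
    labels = KMeans(n_clusters=k, random_state=5).fit_predict(loansX2)
    pred = np.empty_like(loans_Y)
    for c in np.unique(labels):
        mask = labels == c
        pred[mask] = pd.Series(loans_Y[mask]).mode()[0]  # majority class in cluster c
    loans_km_acc.append(accuracy_score(loans_Y, pred))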
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                             random_state=5, n_jobs=7)
fs_abalone = rfc.fit(abaloneX, abaloneY).feature_importances_
fs_digits = rfc.fit(digitsX, digitsY).feature_importances_
tmp = pd.Series(np.sort(fs_abalone)[::-1])
tmp.to_csv(out + 'abalone scree.csv')
tmp = pd.Series(np.sort(fs_digits)[::-1])
tmp.to_csv(out + 'digits scree.csv')

#%% Data for 2
filtr = ImportanceSelect(rfc)
grid = {
    'filter__n': abalone_dims,
    'NN__alpha': nn_reg,
    'NN__hidden_layer_sizes': nn_arch,
    'NN__activation': nn_activation
}
mlp = MLPClassifier(max_iter=2000, early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5, scoring='f1_macro')
gs.fit(abaloneX, abaloneY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'abalone dim red.csv')

grid = {
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                             random_state=5, n_jobs=1)
fs_diamonds = rfc.fit(diamondsX, diamondsY).feature_importances_
fs_digits = rfc.fit(digitsX, digitsY).feature_importances_
tmp = pd.Series(np.sort(fs_diamonds)[::-1])
tmp.to_csv(out + 'diamonds scree.csv')
tmp = pd.Series(np.sort(fs_digits)[::-1])
tmp.to_csv(out + 'digits scree.csv')

#%% task 4
filtr = ImportanceSelect(rfc)
grid = {
    'filter__n': dims1,
    'NN__alpha': nn_reg,
    'NN__hidden_layer_sizes': nn_arch
}
mlp = MLPClassifier(activation='relu', max_iter=2000,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(diamondsX, diamondsY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'diamonds dim red.csv')
fs_spam = rfc.fit(spamX, spamY).feature_importances_
print('Part 2D - Starting RF for letter dataset...')
fs_letter = rfc.fit(letterX, letterY).feature_importances_
tmp = pd.Series(np.sort(fs_spam)[::-1])
tmp.to_csv('./P2_Dimensionality_Reduction/spam_RF_feature_importance.csv')
tmp = pd.Series(np.sort(fs_letter)[::-1])
tmp.to_csv('./P2_Dimensionality_Reduction/letter_RF_feature_importance.csv')

# Run Neural Networks
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                             random_state=5, n_jobs=7)
filtr = ImportanceSelect(rfc)
grid = {
    'filter__n': dims_spam,
    'NN__learning_rate_init': nn_lr,
    'NN__hidden_layer_sizes': nn_arch
}
mlp = MLPClassifier(activation='relu', max_iter=2000,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(spamX, spamY)
nn_results = pd.DataFrame(gs.cv_results_)
nn_results.to_csv('./P4_Neural_Networks_Reduced/spam_RF_nn_results.csv')
out = '../results/random_forest/'
cancer_x, cancer_y, housing_x, housing_y = load_data()  # cancer, housing
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                             random_state=5, n_jobs=7)
fs_cancer = rfc.fit(cancer_x, cancer_y).feature_importances_
fs_housing = rfc.fit(housing_x, housing_y).feature_importances_
tmp = pd.Series(np.sort(fs_cancer)[::-1])
tmp.to_csv(out + 'cancer part 2.csv')
tmp = pd.Series(np.sort(fs_housing)[::-1])
tmp.to_csv(out + 'housing part 2.csv')

dims = list(range(1, 31))
filtr = ImportanceSelect(rfc)
grid = {'filter__n': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_layers}
mlp = MLPClassifier(activation='relu', max_iter=nn_iter, early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
gs.fit(cancer_x, cancer_y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'cancer part 4.csv')

grid = {'filter__n': dims_big, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_layers}
mlp = MLPClassifier(activation='relu', max_iter=nn_iter, early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)