Esempio n. 1
0
        'NN__activation': nn_activation
    }
    mlp = MLPClassifier(max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5, scoring='f1_macro')

    gs.fit(digitsX, digitsY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'digits dim red.csv')
    # raise
    #%% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 7
    filtr = ImportanceSelect(rfc, dim)

    abaloneX2 = filtr.fit_transform(abaloneX, abaloneY)
    abalone2 = pd.DataFrame(np.hstack((abaloneX2, np.atleast_2d(abaloneY).T)))
    cols = list(range(abalone2.shape[1]))
    cols[-1] = 'Class'
    abalone2.columns = cols
    abalone2.to_hdf(out + 'datasets.hdf',
                    'abalone',
                    complib='blosc',
                    complevel=9)

    dim = 41
    filtr = ImportanceSelect(rfc, dim)
    digitsX2 = filtr.fit_transform(digitsX, digitsY)
    digits2 = pd.DataFrame(np.hstack((digitsX2, np.atleast_2d(digitsY).T)))
    cols = list(range(digits2.shape[1]))
    cols[-1] = 'Class'
Esempio n. 2
0
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, n_jobs=num_jobs, verbose=10, cv=5)

    gs.fit(digitsX, digitsY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'digits dim red.csv')
    # raise
    #%% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 20
    filtr = ImportanceSelect(rfc, dim)

    madelonX2 = filtr.fit_transform(madelonX, madelonY)
    madelon2 = pd.DataFrame(np.hstack((madelonX2, np.atleast_2d(madelonY).T)))
    cols = list(range(madelon2.shape[1]))
    cols[-1] = 'Class'
    madelon2.columns = cols
    madelon2.to_hdf(out + 'datasets.hdf',
                    'madelon',
                    complib='blosc',
                    complevel=9)

    dim = 40
    filtr = ImportanceSelect(rfc, dim)
    digitsX2 = filtr.fit_transform(digitsX, digitsY)
    digits2 = pd.DataFrame(np.hstack((digitsX2, np.atleast_2d(digitsY).T)))
    cols = list(range(digits2.shape[1]))
    cols[-1] = 'Class'
Esempio n. 3
0
# Part 3: NN grid search on RF-selected features for the page-blocks data.
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(blocks_X, blocks_Y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'pageblocks_RF_dim_red.csv')

#%% For part 4
out = './Part4/'

rfc = RandomForestClassifier(n_estimators=100,
                             class_weight='balanced',
                             random_state=5)
dim = 5
filtr = ImportanceSelect(rfc, dim)
blocks_X2 = filtr.fit_transform(blocks_X, blocks_Y)

grid = {'NN__alpha': nn_reg}
mlp = MLPClassifier(solver='lbfgs',
                    activation='logistic',
                    hidden_layer_sizes=(50, ),
                    max_iter=2000,
                    early_stopping=True,
                    random_state=5)
pipe = Pipeline([('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

start = time()
gs.fit(blocks_X2, blocks_Y)
# print() call form runs on both Python 2 and 3; the bare print statement
# used originally is a SyntaxError under Python 3.
print("Benchmark run time: %.2f seconds" % (time() - start))
tmp = pd.DataFrame(gs.cv_results_)
Esempio n. 4
0
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(digitX, digitY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'digit dim red.csv')

    # data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 6
    filtr = ImportanceSelect(rfc, dim)

    wineX2 = filtr.fit_transform(wineX, wineY)
    wine2 = pd.DataFrame(np.hstack((wineX2, np.atleast_2d(wineY))))
    cols = list(range(wine2.shape[1]))
    cols[-1] = 'Class'
    wine2.columns = cols
    wine2.to_csv(out + 'wine_datasets.csv', index=False, header=False)

    dim = 60
    filtr = ImportanceSelect(rfc, dim)
    digitX2 = filtr.fit_transform(digitX, digitY)
    digit2 = pd.DataFrame(np.hstack((digitX2, np.atleast_2d(digitY))))
    cols = list(range(digit2.shape[1]))
    cols[-1] = 'Class'
    digit2.columns = cols
    digit2.to_csv(out + 'digit_datasets.csv', index=False, header=False)
Esempio n. 5
0
def main():
    """Run RF-importance feature selection on the madelon and character
    datasets: dump scree data, grid-search an NN over the number of kept
    features, and write the reduced datasets back to HDF for clustering.
    """
    out = './BASES/'

    np.random.seed(0)
    character = pd.read_hdf('./BASES/datasets.hdf', 'character')
    # drop(columns=...): the positional `axis` argument to DataFrame.drop
    # was deprecated in pandas 1.1 and removed in 2.0.
    character_X = character.drop(columns='Class').copy().values
    character_Y = character['Class'].copy().values

    madelon = pd.read_hdf('./BASES/datasets.hdf', 'madelon')
    madelon_X = madelon.drop(columns='Class').copy().values
    madelon_Y = madelon['Class'].copy().values

    madelon_X = StandardScaler().fit_transform(madelon_X)
    character_X = StandardScaler().fit_transform(character_X)

    # clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    dim_red = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    dims_red_s = [2, 4, 6, 8, 10, 12, 14, 16]

    # %% data for 1: scree curves of sorted feature importances
    rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=5, n_jobs=7)
    fs_madelon = rfc.fit(madelon_X, madelon_Y).feature_importances_
    fs_character = rfc.fit(character_X, character_Y).feature_importances_

    tmp = pd.Series(np.sort(fs_madelon)[::-1])
    tmp.to_csv(out + 'madelon scree.csv')

    tmp = pd.Series(np.sort(fs_character)[::-1])
    tmp.to_csv(out + 'character_scree.csv')

    # %% Data for 2: NN accuracy vs. number of selected features
    filtr = ImportanceSelect(rfc)
    grid = {'filter__n': dim_red, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(madelon_X, madelon_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon dim red.csv')

    grid = {'filter__n': dims_red_s, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch}
    mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(character_X, character_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'character_dim_red.csv')
    #    raise
    # %% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 10
    filtr = ImportanceSelect(rfc, dim)

    madelon_X2 = filtr.fit_transform(madelon_X, madelon_Y)
    madelon_2 = pd.DataFrame(np.hstack((madelon_X2, np.atleast_2d(madelon_Y).T)))
    cols = list(range(madelon_2.shape[1]))
    cols[-1] = 'Class'
    madelon_2.columns = cols
    madelon_2.to_hdf(out + 'datasets.hdf', 'madelon', complib='blosc', complevel=9)

    dim = 10
    filtr = ImportanceSelect(rfc, dim)
    character_X2 = filtr.fit_transform(character_X, character_Y)
    character_2 = pd.DataFrame(np.hstack((character_X2, np.atleast_2d(character_Y).T)))
    cols = list(range(character_2.shape[1]))
    cols[-1] = 'Class'
    character_2.columns = cols
    character_2.to_hdf(out + 'datasets.hdf', 'character', complib='blosc', complevel=9)
# Part 2 (continued): grid-search an NN over the number of RF-selected
# features for the permutation dataset.
mlp = MLPClassifier(activation='relu', max_iter=nn_iter,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(perm_x, perm_y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'perm dim red.csv')


# Repeat the search for the housing dataset over the larger dimension grid.
grid = {'filter__n': dims_big, 'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_layers}
mlp = MLPClassifier(activation='relu', max_iter=nn_iter,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(housing_x, housing_y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'housing dim red.csv')


# Part 3: project each dataset onto its chosen number of features and
# hand the reduced data to the clustering script.
dim = 5
filtr = ImportanceSelect(rfc, dim)
perm_x2 = filtr.fit_transform(perm_x, perm_y)


dim = 9
filtr = ImportanceSelect(rfc, dim)
housing_x2 = filtr.fit_transform(housing_x, housing_y)

run_clustering(out, perm_x2, perm_y, housing_x2, housing_y)
Esempio n. 7
0
    gs.fit(digitsX, digitsY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'digits dim red.csv')
    #raise

    #%% data for task 3 but find the good dim values first, use clustering script to finish up
    rfc = RandomForestClassifier(n_estimators=100,
                                 class_weight='balanced',
                                 random_state=5,
                                 n_jobs=1)

    dim = 6
    filtr = ImportanceSelect(rfc, dim)

    diamondsX2 = filtr.fit_transform(diamondsX, diamondsY)
    diamonds2 = pd.DataFrame(
        np.hstack((diamondsX2, np.atleast_2d(diamondsY).T)))
    cols = list(range(diamonds2.shape[1]))
    cols[-1] = 'Class'
    diamonds2.columns = cols
    diamonds2.to_hdf(out + 'datasets.hdf',
                     'diamonds',
                     complib='blosc',
                     complevel=9)

    dim = 45
    filtr = ImportanceSelect(rfc, dim)
    digitsX2 = filtr.fit_transform(digitsX, digitsY)
    digits2 = pd.DataFrame(np.hstack((digitsX2, np.atleast_2d(digitsY).T)))
    cols = list(range(digits2.shape[1]))
Esempio n. 8
0
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(cancerX, cancerY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'cancer dim red.csv')
    #    raise
    #%% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 7
    filtr = ImportanceSelect(rfc, dim)

    wineX2 = filtr.fit_transform(wineX, wineY)
    wine2 = pd.DataFrame(np.hstack((wineX2, np.atleast_2d(wineY).T)))
    cols = list(range(wine2.shape[1]))
    cols[-1] = 'class'
    wine2.columns = cols
    wine2.to_hdf(out + 'datasets.hdf', 'wine', complib='blosc', complevel=9)

    dim = 6
    filtr = ImportanceSelect(rfc, dim)
    cancerX2 = filtr.fit_transform(cancerX, cancerY)
    cancer2 = pd.DataFrame(np.hstack((cancerX2, np.atleast_2d(cancerY).T)))
    cols = list(range(cancer2.shape[1]))
    cols[-1] = 'class'
    cancer2.columns = cols
    cancer2.to_hdf(out + 'datasets.hdf',
                   'cancer',
Esempio n. 9
0
# RF feature importances for the breast-cancer data: scree values, bar
# plot, then a 10-feature reduced dataset dumped to CSV.
cluster_range = range(1, 11)
dims = range(1, 30)

rfc = RandomForestClassifier(n_estimators=100,
                             class_weight='balanced',
                             random_state=5,
                             n_jobs=-1)
fs_br = rfc.fit(brX, brY).feature_importances_
tmp = pd.Series(np.sort(fs_br)[::-1])
tmp.to_csv('./RF/breast_scree.csv')

barplot_breast(tmp)

dim = 10
filtr = ImportanceSelect(rfc, dim)
brX2 = filtr.fit_transform(brX, brY)
# Append labels as the last column of the reduced feature matrix.
br2 = pd.DataFrame(np.hstack((brX2, np.atleast_2d(brY).T)))
cols = list(range(br2.shape[1]))
cols[-1] = 'Class'
br2.columns = cols
br2.to_csv('./RF/breast.csv')

# Abalone Dataset
abalone = pd.read_csv('./BASE/abalone.csv')
# drop(columns=...): the positional `axis` argument to DataFrame.drop
# was deprecated in pandas 1.1 and removed in 2.0.
abaloneX = abalone.drop(columns='Class').copy().values
abaloneY = abalone['Class'].copy().values
abaloneX = StandardScaler().fit_transform(abaloneX)

cluster_range = range(1, 11)
dims = range(1, 10)
Esempio n. 10
0
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(digitsX, digitsY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'digits dim red.csv')
    #    raise
    #%% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 35
    filtr = ImportanceSelect(rfc, dim)

    biodegX2 = filtr.fit_transform(biodegX, biodegY)
    biodeg2 = pd.DataFrame(np.hstack((biodegX2, np.atleast_2d(biodegY).T)))
    cols = list(range(biodeg2.shape[1]))
    cols[-1] = 'Class'
    biodeg2.columns = cols
    biodeg2.to_hdf(out + 'datasets.hdf',
                   'biodeg',
                   complib='blosc',
                   complevel=9)

    dim = 40
    filtr = ImportanceSelect(rfc, dim)
    digitsX2 = filtr.fit_transform(digitsX, digitsY)
    digits2 = pd.DataFrame(np.hstack((digitsX2, np.atleast_2d(digitsY).T)))
    cols = list(range(digits2.shape[1]))
    cols[-1] = 'Class'
Esempio n. 11
0
                        momentum=0.3)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(bcX, bcY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out1 + 'bc dim red.csv')
    #    raise

    #%% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up

    dim = 24
    filtr = ImportanceSelect(rfc, dim)

    faultsX2 = filtr.fit_transform(faultsX, faultsY)
    faults2 = pd.DataFrame(np.hstack((faultsX2, np.atleast_2d(faultsY).T)))
    cols = list(range(faults2.shape[1]))
    cols[-1] = 'labels'
    faults2.columns = cols
    faults2.to_hdf(out + 'datasets.hdf',
                   'train_faults',
                   complib='blosc',
                   complevel=9)

    dim = 10
    filtr = ImportanceSelect(rfc, dim)
    bcX2 = filtr.fit_transform(bcX, bcY)
    bc2 = pd.DataFrame(np.hstack((bcX2, np.atleast_2d(bcY).T)))
    cols = list(range(bc2.shape[1]))
    cols[-1] = 'diagnosis'
Esempio n. 12
0
    blocks_balanced = blocks_balanced.append(blocks[blocks['Class'] == n])
# Balanced page-blocks data: split features/labels, scale, then select
# features by RF importance for the loans data.
# drop(columns=...): the positional `axis` argument to DataFrame.drop
# was deprecated in pandas 1.1 and removed in 2.0.
blocks_X = blocks_balanced.drop(columns='Class').copy().values
blocks_Y = blocks_balanced['Class'].copy().values
blocks_X = StandardScaler().fit_transform(blocks_X)
# print() call form runs on both Python 2 and 3; the bare print
# statement used originally is a SyntaxError under Python 3.
print(blocks_X.shape)

rfc = RandomForestClassifier(n_estimators=100,
                             class_weight='balanced',
                             random_state=5,
                             n_jobs=7)
#%% Select features by random forests

dim = 5
filtr = ImportanceSelect(rfc, dim)

loansX2 = filtr.fit_transform(loans_X, loans_Y)
# Append labels as the last column of the reduced feature matrix.
loans2 = pd.DataFrame(np.hstack((loansX2, np.atleast_2d(loans_Y).T)))
cols = list(range(loans2.shape[1]))
cols[-1] = 'Class'
loans2.columns = cols
#madelon2.to_hdf(out+'datasets.hdf','madelon',complib='blosc',complevel=9)

#%%Clustering on selected data
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

clusters = [2, 3, 4, 5, 8, 12, 15, 18, 21, 25]

loans_km_acc = []
loans_gmm_acc = []
loans_km_score = []
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('filter', filtr), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(cancerX, cancerY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'cancer dim red.csv')
    #    raise
    #%% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 7
    filtr = ImportanceSelect(rfc, dim)

    contraX2 = filtr.fit_transform(contraX, contraY)
    contra2 = pd.DataFrame(np.hstack((contraX2, np.atleast_2d(contraY).T)))
    cols = list(range(contra2.shape[1]))
    cols[-1] = 'class'
    contra2.columns = cols
    contra2.to_hdf(out + 'datasets.hdf',
                   'contra',
                   complib='blosc',
                   complevel=9)

    dim = 6
    filtr = ImportanceSelect(rfc, dim)
    cancerX2 = filtr.fit_transform(cancerX, cancerY)
    cancer2 = pd.DataFrame(np.hstack((cancerX2, np.atleast_2d(cancerY).T)))
    cols = list(range(cancer2.shape[1]))
    cols[-1] = 'class'
Esempio n. 14
0
# Part 4: NN grid search on RF-selected features for the cancer data.
grid = {'filter__n': dims, 'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_layers}
mlp = MLPClassifier(activation='relu', max_iter=nn_iter,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(cancer_x, cancer_y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'cancer part 4.csv')


# Repeat the search for the housing data over the larger dimension grid.
grid = {'filter__n': dims_big, 'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_layers}
mlp = MLPClassifier(activation='relu', max_iter=nn_iter,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('filter', filtr), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(housing_x, housing_y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'housing part 4.csv')

# Part 3: reduce each dataset to its chosen dimensionality and run the
# clustering script on the result.
dim = 7
filtr = ImportanceSelect(rfc, dim)
cancer_x2 = filtr.fit_transform(cancer_x, cancer_y)


dim = 7
filtr = ImportanceSelect(rfc, dim)
housing_x2 = filtr.fit_transform(housing_x, housing_y)

run_clustering(out, cancer_x2, cancer_y, housing_x2, housing_y)