def main():
    cars = pd.read_hdf('data/processed/datasets.hdf', 'cars')
    carsX = cars.drop('Class', axis=1).copy().values
    carsY = cars['Class'].copy().values

    madelon = pd.read_hdf('data/processed/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values

    cars_trgX, cars_tstX, cars_trgY, cars_tstY = ms.train_test_split(
        carsX, carsY, test_size=0.3, random_state=0, stratify=carsY)
    madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
        madelonX, madelonY, test_size=0.3, random_state=0, stratify=madelonY)

    d = carsX.shape[1]
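    # Width grids built from the feature count d (1-3 layers of d//2, d, or 2*d
    # units); note these hidden-layer grids and the alphas below are not used
    # by the KNN search in this example.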
    hiddens_cars = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    alphas = [10**-x for x in np.arange(1, 9.01, 1 / 2)]
    d = madelonX.shape[1]
    hiddens_madelon = [(h, ) * l for l in [1, 2, 3]
                       for h in [d, d // 2, d * 2]]

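    # pipeM: standardize, then four rounds of median-threshold feature culling
    # via random-forest importances, then KNN; pipeA (below) skips the culling.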
    pipeM = Pipeline([
        ('Scale', StandardScaler()),
        ('Cull1', SelectFromModel(RandomForestClassifier(),
                                  threshold='median')),
        ('Cull2', SelectFromModel(RandomForestClassifier(),
                                  threshold='median')),
        ('Cull3', SelectFromModel(RandomForestClassifier(),
                                  threshold='median')),
        ('Cull4', SelectFromModel(RandomForestClassifier(),
                                  threshold='median')), ('KNN', knnC())
    ])

    pipeA = Pipeline([('Scale', StandardScaler()), ('KNN', knnC())])

    params_madelon = {
        'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
        'KNN__n_neighbors': np.arange(1, 51, 3),
        'KNN__weights': ['uniform', 'distance']
    }
    params_cars = {
        'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
        'KNN__n_neighbors': np.arange(1, 51, 3),
        'KNN__weights': ['uniform', 'distance']
    }

    madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX,
                               madelon_tstY, params_madelon, 'KNN', 'madelon')
    cars_clf = basicResults(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                            params_cars, 'KNN', 'cars')

    #madelon_final_params={'KNN__n_neighbors': 43, 'KNN__weights': 'uniform', 'KNN__p': 1}
    #cars_final_params={'KNN__n_neighbors': 142, 'KNN__p': 1, 'KNN__weights': 'uniform'}
    madelon_final_params = madelon_clf.best_params_
    cars_final_params = cars_clf.best_params_

    pipeM.set_params(**madelon_final_params)
    makeTimingCurve(madelonX, madelonY, pipeM, 'KNN', 'madelon')
    pipeA.set_params(**cars_final_params)
    makeTimingCurve(carsX, carsY, pipeA, 'KNN', 'cars')
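
# A standard script entry point is assumed here (not shown in the original snippet):
if __name__ == '__main__':
    main()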
Example #2
def run_dt(data, title, solved_params=None):
    """
    run the decision tree algo on the data given
    """
    x, y, pipeline = data
    pipe = Pipeline([
        *pipeline,
        ('DT', dtclf_pruned()),
    ])
    print("Splitting into train/test")
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)
    if solved_params is None:
        print("Doing a GridSearch for best hyperparameters")
        params = {
            'DT__criterion': ['gini', 'entropy'],
            'DT__alpha': ALPHAS,
            'DT__class_weight': ['balanced'],
            'DT__min_samples_split': [2, 3, 4, 5],
        }
        clf = basicResults(pipe, x_train, y_train, x_test,
                           y_test, params, 'DT', title)
    else:
        print("Using pre-solved hyperparameters")
        clf = pipe.set_params(**solved_params)
        clf.fit(x_train, y_train)
    # print ("Plotting learning curve")
    # plot_learning_curve(clf, title + ' decision tree', x,
    #                     y, n_jobs=4, scoring=scorer, ylim=(0, 1))
    # plt.savefig('./graphs/' + title + '-dt.png')
    y_pred = clf.predict(x_test)
    conf = confusion_matrix(y_test, y_pred)
    # Row-normalize so each true-class row sums to 1.
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/DT_{}_confusion.csv'.format(title), conf, delimiter=',', fmt='%.2f')
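
# Hypothetical usage sketch (the data tuple and pipeline steps are assumptions):
# data = (x, y, [('Scale', StandardScaler())])
# run_dt(data, 'adult')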
Example #3
def run_boost(data, dataset, dtparams=None):
    # Avoid a mutable default argument for the base-estimator parameters.
    dtparams = dtparams if dtparams is not None else {}
    x, y, pipeline = data
    pipe = Pipeline([
        *pipeline,
        ('Boost',
         ensemble.AdaBoostClassifier(algorithm='SAMME',
                                     base_estimator=dtclf_pruned(**dtparams))),
    ])
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)
    params = {
        'Boost__n_estimators': [2**i for i in range(8)],
        'Boost__algorithm': ['SAMME', 'SAMME.R'],
    }
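    # Sweep boosting rounds over powers of two; SAMME.R weights by predicted
    # class probabilities, while SAMME uses the pruned trees' discrete labels.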
    clf = basicResults(pipe, x_train, y_train, x_test, y_test, params,
                       'boosted', dataset)
    # plot_learning_curve(clf, dataset + ' boosted', x, y,
    #                     ylim=(0.0, 1.01), cv=5, n_jobs=4, scoring=scorer)
    # plt.savefig('./graphs/' + dataset + '-boost.png')
    # plot_timing_curve(clf, x, y, 'boost', dataset)
    # plt.savefig('./graphs/' + dataset + '-boost-timing.png')
    # plot_iteration_curve(clf, x_train, y_train, x_test, y_test, params, 'boosted', dataset)
    # plt.savefig('./graphs/' + dataset + '-boost-iteration.png')
    conf = confusion_matrix(y_test, clf.predict(x_test))
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/Boosted_{}_confusion.csv'.format(dataset),
               conf,
               delimiter=',',
               fmt='%.2f')
Example #4
def run_knn(data, dataset):
    x, y, pipeline = data
    pipe = Pipeline([
        *pipeline,
        ('KNN', neighbors.KNeighborsClassifier()),
    ])
    print('Splitting data ' + dataset)
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)
    print('Calculating hyperparameters ' + dataset)
    # NOTE: the original snippet references a `params` that is not defined
    # here; a KNN grid matching the other examples in this file is assumed.
    params = {
        'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
        'KNN__n_neighbors': np.arange(1, 51, 3),
        'KNN__weights': ['uniform', 'distance'],
    }
    clf = basicResults(pipe, x_train, y_train, x_test, y_test, params, 'KNN',
                       dataset)
    conf = confusion_matrix(y_test, clf.predict(x_test))
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/KNN_{}_confusion.csv'.format(dataset),
               conf,
               delimiter=',',
               fmt='%.2f')
Example #5
def run_svm_rbf(data, dataset):
    x, y, pipeline = data
    pipe = Pipeline([
        *pipeline,
        ('SVM', svm.SVC(class_weight='balanced')),
    ])
    print('Splitting data SVM RBF -- ' + dataset)
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)
    print('Computing hyperparameters SVM RBF -- ' + dataset)
    # NOTE: the original snippet references a `params` that is not defined
    # here; a small RBF-SVM grid consistent with the SVC step is assumed.
    params = {
        'SVM__C': [0.1, 0.5, 1],
        'SVM__gamma': ['scale'],
    }
    clf = basicResults(pipe, x_train, y_train, x_test,
                       y_test, params, 'SVM-RBF', dataset)

    # plot_timing_curve(clf, x, y, 'rbf svm', dataset)
    # plt.savefig('./graphs/' + dataset + '-svm-rbf-timing.png')
    # plot_iteration_curve(clf, x_train, y_train, x_test,
    #                      y_test, iter_adjust, 'rbf svm', dataset)
    # plt.savefig('./graphs/' + dataset + '-svm-rbf-iteration.png')
    conf = confusion_matrix(y_test, clf.predict(x_test))
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/SVM-RBF_{}_confusion.csv'.format(dataset), conf, delimiter=',', fmt='%.2f')
Example #6
def run_ann(data, dataset, solved_params=None):
    x, y, pipeline = data
    print('Data size: ', x.shape)
    pipe = Pipeline([
        *pipeline,
        ('ANN', neural_network.MLPClassifier(max_iter=1000, early_stopping=True)),
    ])
    print('Splitting dataset for ' + dataset)
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)
    if solved_params is None:
        print('Calculating hyperparameters for ' + dataset)
        dim = x.shape[1]
        params = {
            'ANN__hidden_layer_sizes': [(d,) for d in [dim, dim//2]],
            'ANN__solver': ['adam'],
            'ANN__alpha': 10.0 ** -np.arange(1, 7),
            'ANN__activation': ['relu', 'tanh', 'logistic'],
        }
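        # One hidden layer of d or d//2 units; alpha is log-spaced from 1e-1 to 1e-6.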
        clf = basicResults(pipe, x_train, y_train, x_test,
                           y_test, params, 'ANN', dataset)
    else:
        print('Using presolved hyperparameters for ' + dataset)
        clf = pipe.set_params(**solved_params)
        clf.fit(x_train, y_train)
    # plot_learning_curve(clf, dataset + ' neural network',
    #                     x, y, cv=5, n_jobs=4, scoring=scorer)
    # plt.savefig('./graphs/' + dataset + '-ann.png')
    # print('Creating timing curve for ' + dataset)
    # plot_timing_curve(clf, x, y, 'neural network', dataset)
    # plt.savefig('./graphs/' + dataset + '-ANN-timing.png')
    # print('Creating iteration curve for ' + dataset)
    # plot_iteration_curve(clf, x_train, y_train, x_test, y_test, iter_adjust, 'neural network', dataset)
    # plt.savefig('./graphs/' + dataset + '-ANN-iteration.png')
    conf = confusion_matrix(y_test, clf.predict(x_test))
    conf = conf.astype('float') / conf.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix:')
    print(conf)
    np.savetxt('./output/ANN_{}_confusion.csv'.format(dataset), conf, delimiter=',', fmt='%.2f')
Example #7
#Linear SVM
pipeS = Pipeline([('Scale', StandardScaler()),
                  ('SVM',
                   SGDClassifier(loss='hinge',
                                 l1_ratio=0,
                                 penalty='l2',
                                 class_weight='balanced',
                                 random_state=55))])

params_spam = {
    'SVM__alpha': alphas,
    'SVM__n_iter': [int((1e6 / N_spam) / .8) + 1]
}
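# n_iter follows the common SGD heuristic of seeing roughly 1e6 samples in
# total (about 10**6 / n_train epochs), scaled up for the ~80% training folds.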

spam_clf = basicResults(pipeS, spam_trgX, spam_trgY, spam_tstX, spam_tstY,
                        params_spam, 'SVM_Lin', 'spam')

y_score = spam_clf.decision_function(spam_tstX)

fpr, tpr, thresholds = roc_curve(spam_tstY, y_score)

import matplotlib.pyplot as plt

plt.figure()

plt.plot(fpr, tpr)

plt.xlabel('FPR')
plt.ylabel('TPR')

plt.title('ROC Curve (Spambase)')
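
# Persisting or displaying the figure is assumed here (the path is illustrative):
# plt.savefig('./output/spam_roc.png')
plt.show()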
Example #8
def main():

    adult = pd.read_csv('data/adult_parsed.csv')

    adult_income_X = adult.drop('income', axis=1).copy().values
    adult_income_Y = adult['income'].copy().values

    # wine_data = pd.read_csv('data/winequality_white.csv')
    # wine_data['category'] = wine_data['quality'] >= 7
    #
    # wineX = wine_data[wine_data.columns[0:11]].values
    # wineY = wine_data['category'].values.astype(np.int)

    adult_income_trgX, adult_income_tstX, adult_income_trgY, adult_income_tstY = ms.train_test_split(
        adult_income_X,
        adult_income_Y,
        test_size=0.3,
        random_state=0,
        stratify=adult_income_Y)
    # wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(wineX, wineY, test_size=0.3, random_state=0,stratify=wineY)

    N_adult_income = adult_income_trgX.shape[0]
    # N_wine = wine_trgX.shape[0]

    # alphas = [10**-x for x in np.arange(1,9.01,1/2)]

    #Linear SVM
    pipeM = Pipeline([('Scale', StandardScaler()),
                      ('Cull1',
                       SelectFromModel(RandomForestClassifier(random_state=1),
                                       threshold='median')),
                      ('Cull2',
                       SelectFromModel(RandomForestClassifier(random_state=2),
                                       threshold='median')),
                      ('SVM',
                       SGDClassifier(loss='hinge',
                                     l1_ratio=0,
                                     penalty='l2',
                                     class_weight='balanced',
                                     random_state=55))])
    pipeA = Pipeline([('Scale', StandardScaler()),
                      ('SVM',
                       SGDClassifier(loss='hinge',
                                     l1_ratio=0,
                                     penalty='l2',
                                     class_weight='balanced',
                                     random_state=55))])

    params_adult_income = {
        'SVM__alpha': [100, 10, 1, 0.1, 0.001, 0.0001],
        'SVM__n_iter': [int((1e6 / N_adult_income) / .8) + 1]
    }
    # params_wine = {'SVM__alpha':alphas,'SVM__n_iter':[int((1e6/N_wine)/.8)+1]}

    adult_income_clf = basicResults(pipeA, adult_income_trgX,
                                    adult_income_trgY, adult_income_tstX,
                                    adult_income_tstY, params_adult_income,
                                    'SVM_Lin', 'adult_income')
    # wine_clf = basicResults(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,params_wine,'SVM_Lin','wine')

    #wine_final_params = {'SVM__alpha': 0.031622776601683791, 'SVM__n_iter': 687.25}
    # wine_final_params = wine_clf.best_params_
    # wine_OF_params = {'SVM__n_iter': 1303, 'SVM__alpha': 1e-16}
    #adult_income_final_params ={'SVM__alpha': 0.0001, 'SVM__n_iter': 428}
    adult_income_final_params = adult_income_clf.best_params_
    adult_income_OF_params = {'SVM__n_iter': 55, 'SVM__alpha': 1e-16}

    # pipeM.set_params(**wine_final_params)
    # makeTimingCurve(wineX,wineY,pipeM,'SVM_Lin','wine')
    pipeA.set_params(**adult_income_final_params)
    makeTimingCurve(adult_income_X, adult_income_Y, pipeA, 'SVM_Lin',
                    'adult_income')

    # pipeM.set_params(**wine_final_params)
    # iterationLC(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,{'SVM__n_iter':[2**x for x in range(12)]},'SVM_Lin','wine')
    pipeA.set_params(**adult_income_final_params)
    iterationLC(pipeA, adult_income_trgX, adult_income_trgY, adult_income_tstX,
                adult_income_tstY, {'SVM__n_iter': np.arange(1, 75, 3)},
                'SVM_Lin', 'adult_income')

    pipeA.set_params(**adult_income_OF_params)
    iterationLC(pipeA, adult_income_trgX, adult_income_trgY, adult_income_tstX,
                adult_income_tstY, {'SVM__n_iter': np.arange(1, 200, 5)},
                'SVM_LinOF', 'adult_income')
    # pipeM.set_params(**wine_OF_params)
    # iterationLC(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,{'SVM__n_iter':np.arange(100,2600,100)},'SVM_LinOF','wine')

    #RBF SVM
    gamma_fracsA = np.arange(0.2, 2.1, 0.2)
    gamma_fracsM = np.arange(0.05, 1.01, 0.1)
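    # gamma_frac grids for the primalSVM_RBF helper (a custom estimator defined
    # outside this snippet); gamma_fracsM pairs with the commented-out wine pipeline.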

    #
    pipeM = Pipeline([('Scale', StandardScaler()),
                      ('Cull1',
                       SelectFromModel(RandomForestClassifier(random_state=1),
                                       threshold='median')),
                      ('Cull2',
                       SelectFromModel(RandomForestClassifier(random_state=2),
                                       threshold='median')),
                      ('SVM', primalSVM_RBF())])

    pipeA = Pipeline([('Scale', StandardScaler()), ('SVM', primalSVM_RBF())])

    params_adult_income = {
        'SVM__alpha': [100, 10, 1, 0.1, 0.001, 0.0001],
        'SVM__n_iter': [int((1e6 / N_adult_income) / .8) + 1],
        'SVM__gamma_frac': gamma_fracsA
    }
    # params_wine = {'SVM__alpha':alphas,'SVM__n_iter':[int((1e6/N_wine)/.8)+1],'SVM__gamma_frac':gamma_fracsM}
    #
    # wine_clf = basicResults(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,params_wine,'SVM_RBF','wine')
    adult_income_clf = basicResults(pipeA, adult_income_trgX,
                                    adult_income_trgY, adult_income_tstX,
                                    adult_income_tstY, params_adult_income,
                                    'SVM_RBF', 'adult_income')

    # wine_final_params = wine_clf.best_params_
    # wine_OF_params = wine_final_params.copy()
    # wine_OF_params['SVM__alpha'] = 1e-16
    adult_income_final_params = adult_income_clf.best_params_
    adult_income_OF_params = adult_income_final_params.copy()
    adult_income_OF_params['SVM__alpha'] = 1e-16

    # pipeM.set_params(**wine_final_params)
    # makeTimingCurve(wineX,wineY,pipeM,'SVM_RBF','wine')
    pipeA.set_params(**adult_income_final_params)
    makeTimingCurve(adult_income_X, adult_income_Y, pipeA, 'SVM_RBF',
                    'adult_income')

    # pipeM.set_params(**wine_final_params)
    # iterationLC(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,{'SVM__n_iter':[2**x for x in range(12)]},'SVM_RBF','wine')
    pipeA.set_params(**adult_income_final_params)
    iterationLC(pipeA, adult_income_trgX, adult_income_trgY, adult_income_tstX,
                adult_income_tstY, {'SVM__n_iter': np.arange(1, 75, 3)},
                'SVM_RBF', 'adult_income')

    pipeA.set_params(**adult_income_OF_params)
    iterationLC(pipeA, adult_income_trgX, adult_income_trgY, adult_income_tstX,
                adult_income_tstY, {'SVM__n_iter': np.arange(1, 75, 3)},
                'SVM_RBF_OF', 'adult_income')
Example #9
def main():

    adult = pd.read_csv('data/adult_parsed.csv')
    adult['net_capital'] = adult['capital-gain'] - adult['capital-loss']
    adult = adult.drop(["fnlwgt", "capital-gain", "capital-loss", "workclass"],
                       axis=1)

    adult['income'] = adult['income'].map({'<=50K': 0, '>50K': 1})
    adult['gender'] = adult['gender'].map({'Male': 0, 'Female': 1}).astype(int)
    adult['race'] = adult['race'].map({
        'Black': 0,
        'Asian-Pac-Islander': 1,
        'Other': 2,
        'White': 3,
        'Amer-Indian-Eskimo': 4
    }).astype(int)
    adult['marital-status'] = adult['marital-status'].map({
        'Never-married': 0,
        'Widowed': 1,
        'Divorced': 2,
        'Separated': 3,
        'Married-spouse-absent': 4,
        'Married-civ-spouse': 5,
        'Married-AF-spouse': 6
    })
    adult['education'] = adult['education'].map({
        'Preschool': 0,
        '1st-4th': 1,
        '5th-6th': 2,
        '7th-8th': 3,
        '9th': 4,
        '10th': 5,
        '11th': 6,
        '12th': 7,
        'Prof-school': 8,
        'HS-grad': 9,
        'Some-college': 10,
        'Assoc-voc': 11,
        'Assoc-acdm': 12,
        'Bachelors': 13,
        'Masters': 14,
        'Doctorate': 15
    })

    adult['occupation'] = adult['occupation'].map({
        'Priv-house-serv': 0,
        'Protective-serv': 1,
        'Handlers-cleaners': 2,
        'Machine-op-inspct': 3,
        'Adm-clerical': 4,
        'Farming-fishing': 5,
        'Transport-moving': 6,
        'Craft-repair': 7,
        'Other-service': 8,
        'Tech-support': 9,
        'Sales': 10,
        'Exec-managerial': 11,
        'Prof-specialty': 12,
        'Armed-Forces': 13
    })
    adult['native-country'] = adult['native-country'].map({
        '?': -1,
        'Puerto-Rico': 0,
        'Haiti': 1,
        'Cuba': 2,
        'Iran': 3,
        'Honduras': 4,
        'Jamaica': 5,
        'Vietnam': 6,
        'Mexico': 7,
        'Dominican-Republic': 8,
        'Laos': 9,
        'Ecuador': 10,
        'El-Salvador': 11,
        'Cambodia': 12,
        'Columbia': 13,
        'Guatemala': 14,
        'South': 15,
        'India': 16,
        'Nicaragua': 17,
        'Yugoslavia': 18,
        'Philippines': 19,
        'Thailand': 20,
        'Trinadad&Tobago': 21,
        'Peru': 22,
        'Poland': 23,
        'China': 24,
        'Hungary': 25,
        'Greece': 26,
        'Taiwan': 27,
        'Italy': 28,
        'Portugal': 29,
        'France': 30,
        'Hong': 31,
        'England': 32,
        'Scotland': 33,
        'Ireland': 34,
        'Holand-Netherlands': 35,
        'Canada': 36,
        'Germany': 37,
        'Japan': 38,
        'Outlying-US(Guam-USVI-etc)': 39,
        'United-States': 40
    })

    adult['relationship'] = adult['relationship'].map({
        'Unmarried': 0,
        'Other-relative': 1,
        'Not-in-family': 2,
        'Wife': 3,
        'Husband': 4,
        'Own-child': 5
    })

    adult = pd.get_dummies(adult)
    adult_income_X = adult.drop('income', axis=1).copy().values
    adult_income_Y = adult['income'].copy().values

    # wine_data = pd.read_csv('data/wine-red-white-merge.csv')
    # wineX = wine_data.drop('quality',1).copy().values
    # wineY = wine_data['quality'].copy().values

    adult_trgX, adult_tstX, adult_trgY, adult_tstY = ms.train_test_split(
        adult_income_X,
        adult_income_Y,
        test_size=0.3,
        random_state=0,
        stratify=adult_income_Y)
    # wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(wineX, wineY, test_size=0.3, random_state=0,stratify=wineY)

    pipeA = Pipeline([('Scale', StandardScaler()),
                      ('MLP',
                       MLPClassifier(max_iter=2000,
                                     early_stopping=True,
                                     random_state=55))])

    pipeM = Pipeline([('Scale', StandardScaler()),
                      ('Cull1',
                       SelectFromModel(RandomForestClassifier(random_state=1),
                                       threshold='median')),
                      ('Cull2',
                       SelectFromModel(RandomForestClassifier(random_state=2),
                                       threshold='median')),
                      ('Cull3',
                       SelectFromModel(RandomForestClassifier(random_state=3),
                                       threshold='median')),
                      ('Cull4',
                       SelectFromModel(RandomForestClassifier(random_state=4),
                                       threshold='median')),
                      ('MLP',
                       MLPClassifier(max_iter=2000,
                                     early_stopping=True,
                                     random_state=55))])

    d = adult_income_X.shape[1]
    hiddens_adult = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    alphas = [10**-x for x in np.arange(-1, 5.01, 1 / 2)]
    alphasM = [10**-x for x in np.arange(-1, 9.01, 1 / 2)]
    # d = wineX.shape[1]
    hiddens_wine = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
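    # alphas spans 1e1 down to 1e-5 (log-spaced); alphasM and hiddens_wine are
    # defined but unused in this example.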
    params_adult = {
        'MLP__activation': ['relu', 'logistic'],
        'MLP__alpha': alphas,
        'MLP__hidden_layer_sizes': hiddens_adult
    }
    # params_wine = {'MLP__activation':['relu','logistic'],'MLP__alpha':alphas,'MLP__hidden_layer_sizes':hiddens_wine}
    #
    # wine_clf = basicResults(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,params_wine,'ANN','wine')
    adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX,
                             adult_tstY, params_adult, 'ANN', 'adult')

    #wine_final_params = {'MLP__hidden_layer_sizes': (500,), 'MLP__activation': 'logistic', 'MLP__alpha': 10.0}
    #adult_final_params ={'MLP__hidden_layer_sizes': (28, 28, 28), 'MLP__activation': 'logistic', 'MLP__alpha': 0.0031622776601683794}

    # wine_final_params = wine_clf.best_params_
    adult_final_params = adult_clf.best_params_
    adult_OF_params = adult_final_params.copy()
    adult_OF_params['MLP__alpha'] = 0
    # wine_OF_params =wine_final_params.copy()
    # wine_OF_params['MLP__alpha'] = 0

    #raise

    #
    # pipeM.set_params(**wine_final_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    # makeTimingCurve(wineX,wineY,pipeM,'ANN','wine')
    pipeA.set_params(**adult_final_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    makeTimingCurve(adult_income_X, adult_income_Y, pipeA, 'ANN', 'adult')

    # pipeM.set_params(**wine_final_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    # iterationLC(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,{'MLP__max_iter':[2**x for x in range(12)]+[2100,2200,2300,2400,2500,2600,2700,2800,2900,3000]},'ANN','wine')
    pipeA.set_params(**adult_final_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    iterationLC(
        pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, {
            'MLP__max_iter': [2**x for x in range(12)] +
            [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        }, 'ANN', 'adult')

    # pipeM.set_params(**wine_OF_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    # iterationLC(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,{'MLP__max_iter':[2**x for x in range(12)]+[2100,2200,2300,2400,2500,2600,2700,2800,2900,3000]},'ANN_OF','wine')
    pipeA.set_params(**adult_OF_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    iterationLC(
        pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY, {
            'MLP__max_iter': [2**x for x in range(12)] +
            [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        }, 'ANN_OF', 'adult')
Example #10
adult_booster = AdaBoostClassifier(algorithm='SAMME',learning_rate=1,base_estimator=adult_base,random_state=55)
OF_booster = AdaBoostClassifier(algorithm='SAMME',learning_rate=1,base_estimator=OF_base,random_state=55)

#pipeM = Pipeline([('Scale',StandardScaler()),
#                 ('Cull1',SelectFromModel(RandomForestClassifier(random_state=1),threshold='median')),
#                 ('Cull2',SelectFromModel(RandomForestClassifier(random_state=2),threshold='median')),
#                 ('Cull3',SelectFromModel(RandomForestClassifier(random_state=3),threshold='median')),
#                 ('Cull4',SelectFromModel(RandomForestClassifier(random_state=4),threshold='median')),
#                 ('Boost',madelon_booster)])

pipeA = Pipeline([('Scale',StandardScaler()),
                 ('Boost',adult_booster)])

#
#madelon_clf = basicResults(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,paramsM,'Boost','madelon')
adult_clf = basicResults(pipeA,adult_trgX,adult_trgY,adult_tstX,adult_tstY,paramsA,'Boost','adult')

#
#
#madelon_final_params = {'n_estimators': 20, 'learning_rate': 0.02}
#adult_final_params = {'n_estimators': 10, 'learning_rate': 1}
#OF_params = {'learning_rate':1}

#madelon_final_params = madelon_clf.best_params_
adult_final_params = adult_clf.best_params_
OF_params = {'Boost__base_estimator__alpha':-1, 'Boost__n_estimators':50}

##
#pipeM.set_params(**madelon_final_params)
pipeA.set_params(**adult_final_params)
#makeTimingCurve(madelonX,madelonY,pipeM,'Boost','madelon')
Example #11
                                      threshold='median')),
                     ('Cull4',
                      SelectFromModel(RandomForestClassifier(random_state=4),
                                      threshold='median')),
                     ('DT', dtclf_pruned(random_state=55))])

pipeS = Pipeline([('Scale', StandardScaler()),
                  ('DT', dtclf_pruned(random_state=55))])

params = {
    'DT__criterion': ['gini', 'entropy'],
    'DT__alpha': alphas,
    'DT__class_weight': ['balanced']
}

spam_clf_fs = basicResults(pipeS_fs, spam_trgX, spam_trgY, spam_tstX,
                           spam_tstY, params, 'DT', 'spam_fs')
spam_clf = basicResults(pipeS, spam_trgX, spam_trgY, spam_tstX, spam_tstY,
                        params, 'DT', 'spam')

#madelon_final_params = {'DT__alpha': -0.00031622776601683794, 'DT__class_weight': 'balanced', 'DT__criterion': 'entropy'}
#adult_final_params = {'class_weight': 'balanced', 'alpha': 0.0031622776601683794, 'criterion': 'entropy'}
spam_fs_final_params = spam_clf_fs.best_params_
spam_final_params = spam_clf.best_params_

pipeS_fs.set_params(**spam_fs_final_params)
makeTimingCurve(spamX, spamY, pipeS_fs, 'DT', 'spam_fs')
pipeS.set_params(**spam_final_params)
makeTimingCurve(spamX, spamY, pipeS, 'DT', 'spam')

DTpruningVSnodes(pipeS_fs, alphas, spam_trgX, spam_trgY, 'spam_fs')
DTpruningVSnodes(pipeS, alphas, spam_trgX, spam_trgY, 'spam')
Example #12
cancer_booster = AdaBoostClassifier(algorithm='SAMME',learning_rate=1,base_estimator=cancer_base,random_state=55)
adult_booster = AdaBoostClassifier(algorithm='SAMME',learning_rate=1,base_estimator=adult_base,random_state=55)
OF_booster = AdaBoostClassifier(algorithm='SAMME',learning_rate=1,base_estimator=OF_base,random_state=55)

pipeM = Pipeline([('Scale',StandardScaler()),
                 ('Cull1',SelectFromModel(RandomForestClassifier(random_state=1),threshold='median')),
                 ('Cull2',SelectFromModel(RandomForestClassifier(random_state=2),threshold='median')),
                 ('Cull3',SelectFromModel(RandomForestClassifier(random_state=3),threshold='median')),
                 ('Cull4',SelectFromModel(RandomForestClassifier(random_state=4),threshold='median')),
                 ('Boost',cancer_booster)])

pipeA = Pipeline([('Scale',StandardScaler()),                
                 ('Boost',adult_booster)])

#
cancer_clf = basicResults(pipeM,cancer_trgX,cancer_trgY,cancer_tstX,cancer_tstY,paramsM,'Boost','cancer')        
adult_clf = basicResults(pipeA,adult_trgX,adult_trgY,adult_tstX,adult_tstY,paramsA,'Boost','adult')        

#
#madelon_final_params = {'n_estimators': 20, 'learning_rate': 0.02}
#adult_final_params = {'n_estimators': 10, 'learning_rate': 1}
#OF_params = {'learning_rate':1}

cancer_final_params = cancer_clf.best_params_
adult_final_params = adult_clf.best_params_
OF_params = {'Boost__base_estimator__alpha':-1, 'Boost__n_estimators':50}

##
pipeM.set_params(**cancer_final_params)
pipeA.set_params(**adult_final_params)
makeTimingCurve(cancerX,cancerY,pipeM,'Boost','cancer')
Example #13
paramsA = {
    'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
    'Boost__base_estimator__alpha': alphas
}

ab_booster = AdaBoostClassifier(algorithm='SAMME',
                                learning_rate=1,
                                base_estimator=ab_base,
                                random_state=55)
OF_booster = AdaBoostClassifier(algorithm='SAMME',
                                learning_rate=1,
                                base_estimator=OF_base,
                                random_state=55)

pipeA = Pipeline([('Scale', StandardScaler()), ('Boost', ab_booster)])

ab_clf = basicResults(pipeA, ab_trgX, ab_trgY, ab_tstX, ab_tstY, paramsA,
                      'Boost', 'ab')

ab_final_params = ab_clf.best_params_
OF_params = {'Boost__base_estimator__alpha': -1, 'Boost__n_estimators': 50}

pipeA.set_params(**ab_final_params)

makeTimingCurve(abX, abY, pipeA, 'Boost', 'ab')
pipeA.set_params(**ab_final_params)
iterationLC(pipeA, ab_trgX, ab_trgY, ab_tstX, ab_tstY,
            {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 40, 50]}, 'Boost',
            'ab')
#pipeM = Pipeline([('Scale',StandardScaler()),
#                 ('Cull1',SelectFromModel(RandomForestClassifier(random_state=1),threshold='median')),
#                 ('Cull2',SelectFromModel(RandomForestClassifier(random_state=2),threshold='median')),
#                 ('Cull3',SelectFromModel(RandomForestClassifier(random_state=3),threshold='median')),
#                 ('Cull4',SelectFromModel(RandomForestClassifier(random_state=4),threshold='median')),
#                 ('Boost',madelon_booster)])

# Build pipeline for feature scaling and learner
pipeA = Pipeline([('Scale', StandardScaler()), ('Boost', adult_booster)])
pipeM = Pipeline([('Scale', StandardScaler()), ('Boost', mushrooms_booster)])
pipeR = Pipeline([('Scale', StandardScaler()), ('Boost', redwine_booster)])

# Perform grid search cross validation over the hyperparameter grid
#madelon_clf = basicResults(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,paramsM,'Boost','madelon')
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         paramsA, 'Boost', 'adult')
mushrooms_clf = basicResults(pipeM, mushrooms_trgX, mushrooms_trgY,
                             mushrooms_tstX, mushrooms_tstY, paramsM, 'Boost',
                             'mushrooms')
redwine_clf = basicResults(pipeR, redwine_trgX, redwine_trgY, redwine_tstX,
                           redwine_tstY, paramsR, 'Boost', 'redwine')

#
#madelon_final_params = {'n_estimators': 20, 'learning_rate': 0.02}
#adult_final_params = {'n_estimators': 10, 'learning_rate': 1}
#OF_params = {'learning_rate':1}

# Save hyperparameters that grid search cross validation has identified as optimal
#madelon_final_params = madelon_clf.best_params_
adult_final_params = adult_clf.best_params_
mushrooms_final_params = mushrooms_clf.best_params_
Example #15
                  ('SVM', primalSVM_RBF())])

pipeA = Pipeline([('Scale', StandardScaler()), ('SVM', primalSVM_RBF())])

params_adult = {
    'SVM__alpha': alphas,
    'SVM__n_iter': [int((1e6 / N_adult) / .8) + 1],
    'SVM__gamma_frac': gamma_fracsA
}
params_madelon = {
    'SVM__alpha': alphas,
    'SVM__n_iter': [int((1e6 / N_madelon) / .8) + 1],
    'SVM__gamma_frac': gamma_fracsM
}
#
madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX,
                           madelon_tstY, params_madelon, 'SVM_RBF', 'madelon')
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         params_adult, 'SVM_RBF', 'adult')

madelon_final_params = madelon_clf.best_params_
madelon_OF_params = madelon_final_params.copy()
madelon_OF_params['SVM__alpha'] = 1e-16
adult_final_params = adult_clf.best_params_
adult_OF_params = adult_final_params.copy()
adult_OF_params['SVM__alpha'] = 1e-16

pipeM.set_params(**madelon_final_params)
makeTimingCurve(madelonX, madelonY, pipeM, 'SVM_RBF', 'madelon')
pipeA.set_params(**adult_final_params)
makeTimingCurve(adultX, adultY, pipeA, 'SVM_RBF', 'adult')
Example #16
alphasM = [10**-x for x in np.arange(-1, 9.01, 1 / 2)]
d = madelonX.shape[1]
# d = d//(2**4)
hiddens_madelon = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
params_adult = {
    'MLP__activation': ['relu', 'logistic'],
    'MLP__alpha': alphas,
    'MLP__hidden_layer_sizes': hiddens_adult
}
params_madelon = {
    'MLP__activation': ['relu', 'logistic'],
    'MLP__alpha': alphas,
    'MLP__hidden_layer_sizes': hiddens_madelon
}
#
madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX,
                           madelon_tstY, params_madelon, 'ANN', 'madelon')
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         params_adult, 'ANN', 'adult')

#madelon_final_params = {'MLP__hidden_layer_sizes': (500,), 'MLP__activation': 'logistic', 'MLP__alpha': 10.0}
#adult_final_params ={'MLP__hidden_layer_sizes': (28, 28, 28), 'MLP__activation': 'logistic', 'MLP__alpha': 0.0031622776601683794}

madelon_final_params = madelon_clf.best_params_
adult_final_params = adult_clf.best_params_
adult_OF_params = adult_final_params.copy()
adult_OF_params['MLP__alpha'] = 0
madelon_OF_params = madelon_final_params.copy()
madelon_OF_params['MLP__alpha'] = 0

#raise
adultY = adult['income'].copy().values

adult_trgX, adult_tstX, adult_trgY, adult_tstY = ms.train_test_split(adultX, adultY, test_size=0.3, random_state=0,stratify=adultY)     

N_adult = adult_trgX.shape[0]

alphas = [10**-x for x in np.arange(1,9.01,1/2)]


#Linear SVM
pipeA = Pipeline([('Scale',StandardScaler()),                
                 ('SVM',SGDClassifier(loss='hinge',l1_ratio=0,penalty='l2',class_weight='balanced',random_state=55))])

params_adult = {'SVM__alpha':alphas,'SVM__n_iter':[int((1e6/N_adult)/.8)+1]}

adult_clf = basicResults(pipeA,adult_trgX,adult_trgY,adult_tstX,adult_tstY,params_adult,'SVM_Lin','adult')        

adult_final_params =adult_clf.best_params_
#adult_OF_params ={'SVM__n_iter': 55, 'SVM__alpha': 1e-16}
#
#
adult_OF_params = adult_final_params.copy()
adult_OF_params['SVM__alpha'] = 1e-16

pipeA.set_params(**adult_final_params)
makeTimingCurve(adultX,adultY,pipeA,'SVM_Lin','adult')

pipeA.set_params(**adult_final_params)
iterationLC(pipeA,adult_trgX,adult_trgY,adult_tstX,adult_tstY,{'SVM__n_iter':np.arange(1,75,3)},'SVM_Lin','adult')                
#
pipeA.set_params(**adult_OF_params)
Example #18
from sklearn.feature_selection import SelectFromModel

ab = pd.read_hdf('datasets.hdf', 'ab')
abX = ab.drop('rings', axis=1).copy().values
abY = ab['rings'].copy().values

ab_trgX, ab_tstX, ab_trgY, ab_tstY = ms.train_test_split(abX,
                                                         abY,
                                                         test_size=0.3,
                                                         random_state=0,
                                                         stratify=abY)

d = abX.shape[1]
hiddens_ab = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
alphas = [10**-x for x in np.arange(1, 9.01, 1 / 2)]
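# hiddens_ab and alphas are prepared for an MLP-style search, but the KNN grid
# below only varies the metric, n_neighbors, and weights.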

pipeA = Pipeline([('Scale', StandardScaler()), ('KNN', knnC())])

params_ab = {
    'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
    'KNN__n_neighbors': np.arange(1, 51, 3),
    'KNN__weights': ['uniform', 'distance']
}

ab_clf = basicResults(pipeA, ab_trgX, ab_trgY, ab_tstX, ab_tstY, params_ab,
                      'KNN', 'ab')

ab_final_params = ab_clf.best_params_

pipeA.set_params(**ab_final_params)
makeTimingCurve(abX, abY, pipeA, 'KNN', 'ab')
Example #19
                      MLPClassifier(max_iter=2000,
                                    early_stopping=True,
                                    random_state=55))])

d = spamX.shape[1]
hiddens_spam = [(h, ) * l for l in [1, 2, 3] for h in [d, int(d // 2), d * 2]]
alphas = [10**-x for x in np.arange(-1, 8.01, 1 / 2)]

params_spam = {
    'MLP__activation': ['relu', 'logistic'],
    'MLP__alpha': alphas,
    'MLP__hidden_layer_sizes': hiddens_spam
}

#spam_clf = basicResults(pipeS,spam_trgX,spam_trgY,spam_tstX,spam_tstY,params_spam,'ANN','spam')
spam_clf_fs = basicResults(pipeS_fs, spam_trgX, spam_trgY, spam_tstX,
                           spam_tstY, params_spam, 'ANN', 'spam_fs')

#spam_final_params = spam_clf.best_params_
#spam_OF_params =spam_final_params.copy()
#spam_OF_params['MLP__alpha'] = 0

spam_fs_final_params = spam_clf_fs.best_params_
spam_fs_OF_params = spam_fs_final_params.copy()
spam_fs_OF_params['MLP__alpha'] = 0

#pipeS.set_params(**spam_final_params)
#pipeS.set_params(**{'MLP__early_stopping':False})
#makeTimingCurve(spamX,spamY,pipeS,'ANN','spam')

pipeS_fs.set_params(**spam_fs_final_params)
pipeS_fs.set_params(**{'MLP__early_stopping': False})
#                 ('Cull2',SelectFromModel(RandomForestClassifier(random_state=2),threshold='median')),
#                 ('Cull3',SelectFromModel(RandomForestClassifier(random_state=3),threshold='median')),
#                 ('Cull4',SelectFromModel(RandomForestClassifier(random_state=4),threshold='median')),
#                 ('MLP',MLPClassifier(max_iter=2000,early_stopping=True,random_state=55))])

d = spamX.shape[1]
hiddens_spam = [(h, ) * l for l in [3] for h in [d * 2]]
alphas = [10**-x for x in np.arange(-1, 8.01, 1 / 2)]

params_spam = {
    'MLP__activation': ['relu'],
    'MLP__alpha': alphas,
    'MLP__hidden_layer_sizes': hiddens_spam
}

spam_clf = basicResults(pipeS, spam_trgX, spam_trgY, spam_tstX, spam_tstY,
                        params_spam, 'ANN', 'spam')
#spam_clf_fs = basicResults(pipeS_fs,spam_trgX,spam_trgY,spam_tstX,spam_tstY,params_spam,'ANN','spam_fs')

spam_final_params = spam_clf.best_params_
spam_OF_params = spam_final_params.copy()
spam_OF_params['MLP__alpha'] = 0

#spam_fs_final_params = spam_clf_fs.best_params_
#spam_fs_OF_params =spam_fs_final_params.copy()
#spam_fs_OF_params['MLP__alpha'] = 0

pipeS.set_params(**spam_final_params)
pipeS.set_params(**{'MLP__early_stopping': False})
makeTimingCurve(spamX, spamY, pipeS, 'ANN', 'spam')

#pipeS_fs.set_params(**spam_fs_final_params)
Example #21
pipeA = Pipeline([('Scale', StandardScaler()),
                  ('SVM',
                   SGDClassifier(loss='hinge',
                                 l1_ratio=0,
                                 penalty='l2',
                                 class_weight='balanced',
                                 random_state=55))])

params_adult = {
    'SVM__alpha': alphas,
    'SVM__n_iter': [int((1e6 / N_adult) / .8) + 1]
}

# print("target",np.unique(adult_trgY))
# print("train",np.unique(adult_tstY))
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         params_adult, 'SVM_Lin', 'adult')

adult_final_params = adult_clf.best_params_
#adult_OF_params ={'SVM__n_iter': 55, 'SVM__alpha': 1e-16}
#
#
adult_OF_params = adult_final_params.copy()
adult_OF_params['SVM__alpha'] = 1e-16

pipeA.set_params(**adult_final_params)
makeTimingCurve(adultX, adultY, pipeA, 'SVM_Lin', 'adult')

pipeA.set_params(**adult_final_params)
iterationLC(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
            {'SVM__n_iter': np.arange(1, 75, 3)}, 'SVM_Lin', 'adult')
#
#                 ('Cull1',SelectFromModel(RandomForestClassifier(),threshold='median')),
#                 ('Cull2',SelectFromModel(RandomForestClassifier(),threshold='median')),
#                 ('Cull3',SelectFromModel(RandomForestClassifier(),threshold='median')),
#                 ('Cull4',SelectFromModel(RandomForestClassifier(),threshold='median')),
#                 ('KNN',knnC())])  

pipeA = Pipeline([('Scale',StandardScaler()),                
                 ('KNN',knnC())])  



#params_madelon= {'KNN__metric':['manhattan','euclidean','chebyshev'],'KNN__n_neighbors':np.arange(1,51,3),'KNN__weights':['uniform','distance']}
params_adult= {'KNN__metric':['manhattan','euclidean','chebyshev'],'KNN__n_neighbors':np.arange(1,51,3),'KNN__weights':['uniform','distance']}

#madelon_clf = basicResults(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,params_madelon,'KNN','madelon')        
adult_clf = basicResults(pipeA,adult_trgX,adult_trgY,adult_tstX,adult_tstY,params_adult,'KNN','adult')        


#madelon_final_params={'KNN__n_neighbors': 43, 'KNN__weights': 'uniform', 'KNN__p': 1}
#adult_final_params={'KNN__n_neighbors': 142, 'KNN__p': 1, 'KNN__weights': 'uniform'}
#madelon_final_params=madelon_clf.best_params_
adult_final_params=adult_clf.best_params_



#pipeM.set_params(**madelon_final_params)
#makeTimingCurve(madelonX,madelonY,pipeM,'KNN','madelon')
pipeA.set_params(**adult_final_params)
makeTimingCurve(adultX,adultY,pipeA,'KNN','adult')

Example #23
#                 ('Cull3',SelectFromModel(RandomForestClassifier(random_state=3),threshold='median')),
#                 ('Cull4',SelectFromModel(RandomForestClassifier(random_state=4),threshold='median')),
#                 ('DT',dtclf_pruned(random_state=55))])
#

pipeA = Pipeline([('Scale', StandardScaler()),
                  ('DT', dtclf_pruned(random_state=55))])

params = {
    'DT__criterion': ['gini', 'entropy'],
    'DT__alpha': alphas,
    'DT__class_weight': ['balanced']
}

#madelon_clf = basicResults(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,params,'DT','madelon')
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         params, 'DT', 'adult')

#madelon_final_params = {'DT__alpha': -0.00031622776601683794, 'DT__class_weight': 'balanced', 'DT__criterion': 'entropy'}
#adult_final_params = {'class_weight': 'balanced', 'alpha': 0.0031622776601683794, 'criterion': 'entropy'}
#madelon_final_params = madelon_clf.best_params_
adult_final_params = adult_clf.best_params_

#pipeM.set_params(**madelon_final_params)
#makeTimingCurve(madelonX,madelonY,pipeM,'DT','madelon')
pipeA.set_params(**adult_final_params)
makeTimingCurve(adultX, adultY, pipeA, 'DT', 'adult')

#DTpruningVSnodes(pipeM,alphas,madelon_trgX,madelon_trgY,'madelon')
DTpruningVSnodes(pipeA, alphas, adult_trgX, adult_trgY, 'adult')

# SPAM
Example #24
    'SVM__kernel': ['linear', 'poly', 'rbf'],
    'SVM__C': [.1, .5, 1],
    'SVM__gamma': ['scale']
}
complexity_params = {
    'name': 'SVM__C',
    'display_name': 'Penalty',
    'values': np.arange(0.001, 2.5, 0.1)
}
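
# basicResults here also sweeps the SVM penalty C from 0.001 to about 2.4 to
# build a model-complexity curve, with F1 as the scoring metric.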

data_clf = basicResults(pipeM,
                        data_train_x,
                        data_train_y,
                        data_test_x,
                        data_test_y,
                        params,
                        'SVM',
                        dataset,
                        scorer='f1',
                        complexity_curve=True,
                        complexity_params=complexity_params,
                        clf_name='SVM')
data_final_params = data_clf.best_params_

pipeM.set_params(**data_final_params)
makeTimingCurve(data_x, data_y, pipeM, 'SVM', dataset)

iterationLC(pipeM,
            data_train_x,
            data_train_y,
            data_test_x,
            data_test_y, {'SVM__max_iter': range(1, 250, 10)},
Example #25
pipeM = Pipeline([  #('Scale',StandardScaler()),
    ('MLP', MLPClassifier(max_iter=2000, early_stopping=False,
                          random_state=55))
])

d = data_x.shape[1]
hiddens_data = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
alphas = [10**-x for x in np.arange(-1, 3.01, 1)]
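
# Grid: 1-3 hidden layers of width d//2, d, or 2*d, with alpha log-spaced from
# 1e1 down to 1e-3.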

params = {
    'MLP__activation': ['relu', 'logistic'],
    'MLP__hidden_layer_sizes': hiddens_data,
    'MLP__alpha': alphas
}

data_clf = basicResults(pipeM, data_train_x, data_train_y, data_test_x,
                        data_test_y, params, 'ANN', dataset)

data_final_params = data_clf.best_params_

pipeM.set_params(**data_final_params)
makeTimingCurve(data_x, data_y, pipeM, 'ANN', dataset)

iterationLC(pipeM,
            data_train_x,
            data_train_y,
            data_test_x,
            data_test_y, {'MLP__max_iter': [2**x for x in range(8)]},
            'ANN',
            dataset=dataset)
Example #26
def main():

    cars = pd.read_hdf('data/processed/datasets.hdf', 'cars')
    carsX = cars.drop('Class', axis=1).copy().values
    carsY = cars['Class'].copy().values

    madelon = pd.read_hdf('data/processed/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values

    cars_trgX, cars_tstX, cars_trgY, cars_tstY = ms.train_test_split(
        carsX, carsY, test_size=0.3, random_state=0, stratify=carsY)
    madelon_trgX, madelon_tstX, madelon_trgY, madelon_tstY = ms.train_test_split(
        madelonX, madelonY, test_size=0.3, random_state=0, stratify=madelonY)

    pipeA = Pipeline([('Scale', StandardScaler()),
                      ('MLP',
                       MLPClassifier(max_iter=2000,
                                     early_stopping=True,
                                     random_state=55))])

    pipeM = Pipeline([('Scale', StandardScaler()),
                      ('Cull1',
                       SelectFromModel(RandomForestClassifier(random_state=1),
                                       threshold='median')),
                      ('Cull2',
                       SelectFromModel(RandomForestClassifier(random_state=2),
                                       threshold='median')),
                      ('Cull3',
                       SelectFromModel(RandomForestClassifier(random_state=3),
                                       threshold='median')),
                      ('Cull4',
                       SelectFromModel(RandomForestClassifier(random_state=4),
                                       threshold='median')),
                      ('MLP',
                       MLPClassifier(max_iter=2000,
                                     early_stopping=True,
                                     random_state=55))])

    d = carsX.shape[1]
    hiddens_cars = [(h, ) * l for l in [1, 2, 3] for h in [d, d // 2, d * 2]]
    alphas = [10**-x for x in np.arange(-1, 5.01, 1 / 2)]
    alphasM = [10**-x for x in np.arange(-1, 9.01, 1 / 2)]
    d = madelonX.shape[1]
    d = d // (2**4)
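    # Shrink madelon's width by 2**4: the four median-threshold culls in pipeM
    # each keep half the features, so the MLP layer widths track ~d/16 inputs.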
    hiddens_madelon = [(h, ) * l for l in [1, 2, 3]
                       for h in [d, d // 2, d * 2]]
    params_cars = {
        'MLP__activation': ['relu', 'logistic'],
        'MLP__alpha': alphas,
        'MLP__hidden_layer_sizes': hiddens_cars
    }
    params_madelon = {
        'MLP__activation': ['relu', 'logistic'],
        'MLP__alpha': alphas,
        'MLP__hidden_layer_sizes': hiddens_madelon
    }
    #
    madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX,
                               madelon_tstY, params_madelon, 'ANN', 'madelon')
    cars_clf = basicResults(pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY,
                            params_cars, 'ANN', 'cars')

    #madelon_final_params = {'MLP__hidden_layer_sizes': (500,), 'MLP__activation': 'logistic', 'MLP__alpha': 10.0}
    #cars_final_params ={'MLP__hidden_layer_sizes': (28, 28, 28), 'MLP__activation': 'logistic', 'MLP__alpha': 0.0031622776601683794}

    madelon_final_params = madelon_clf.best_params_
    cars_final_params = cars_clf.best_params_
    cars_OF_params = cars_final_params.copy()
    cars_OF_params['MLP__alpha'] = 0
    madelon_OF_params = madelon_final_params.copy()
    madelon_OF_params['MLP__alpha'] = 0

    #raise

    #
    pipeM.set_params(**madelon_final_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    makeTimingCurve(madelonX, madelonY, pipeM, 'ANN', 'madelon')
    pipeA.set_params(**cars_final_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    makeTimingCurve(carsX, carsY, pipeA, 'ANN', 'cars')

    pipeM.set_params(**madelon_final_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    iterationLC(
        pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY, {
            'MLP__max_iter': [2**x for x in range(12)] +
            [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        }, 'ANN', 'madelon')
    pipeA.set_params(**cars_final_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    iterationLC(
        pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY, {
            'MLP__max_iter': [2**x for x in range(12)] +
            [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        }, 'ANN', 'cars')

    pipeM.set_params(**madelon_OF_params)
    pipeM.set_params(**{'MLP__early_stopping': False})
    iterationLC(
        pipeM, madelon_trgX, madelon_trgY, madelon_tstX, madelon_tstY, {
            'MLP__max_iter': [2**x for x in range(12)] +
            [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        }, 'ANN_OF', 'madelon')
    pipeA.set_params(**cars_OF_params)
    pipeA.set_params(**{'MLP__early_stopping': False})
    iterationLC(
        pipeA, cars_trgX, cars_trgY, cars_tstX, cars_tstY, {
            'MLP__max_iter': [2**x for x in range(12)] +
            [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]
        }, 'ANN_OF', 'cars')
Example #27
def main():
    adult = pd.read_csv('data/adult_parsed.csv')
    # plt.figure(figsize=(15,12))
    # cor_map = adult.corr()
    # sns.heatmap(cor_map, annot=True, fmt='.3f', cmap='YlGnBu')
    # plt.show()

    adult['net_capital'] = adult['capital-gain']-adult['capital-loss']
    adult = adult.drop(["fnlwgt","capital-gain","capital-loss","workclass"],axis=1)

    adult['income']=adult['income'].map({'<=50K': 0, '>50K': 1})
    adult['gender'] = adult['gender'].map({'Male': 0, 'Female': 1}).astype(int)
    adult['race'] = adult['race'].map({'Black': 0, 'Asian-Pac-Islander': 1, 'Other': 2, 'White': 3,
                                       'Amer-Indian-Eskimo': 4}).astype(int)
    adult['marital-status'] = adult['marital-status'].map({'Never-married':0,'Widowed':1,'Divorced':2, 'Separated':3,
                                                           'Married-spouse-absent':4, 'Married-civ-spouse':5, 'Married-AF-spouse':6})
    adult['education'] = adult['education'].map({'Preschool':0,'1st-4th':1,'5th-6th':2, '7th-8th':3,
                                                 '9th':4, '10th':5, '11th':6, '12th':7, 'Prof-school':8,
                                                 'HS-grad':9, 'Some-college':10, 'Assoc-voc':11, 'Assoc-acdm':12,
                                                 'Bachelors':13, 'Masters':14, 'Doctorate':15})

    adult['occupation'] = adult['occupation'].map({'Priv-house-serv':0,'Protective-serv':1,'Handlers-cleaners':2, 'Machine-op-inspct':3,
                                                   'Adm-clerical':4, 'Farming-fishing':5, 'Transport-moving':6, 'Craft-repair':7, 'Other-service':8,
                                                   'Tech-support':9, 'Sales':10, 'Exec-managerial':11, 'Prof-specialty':12, 'Armed-Forces':13 })
    adult['native-country'] = adult['native-country'].map({'?':-1,'Puerto-Rico':0,'Haiti':1,'Cuba':2, 'Iran':3,
                                                           'Honduras':4, 'Jamaica':5, 'Vietnam':6, 'Mexico':7, 'Dominican-Republic':8,
                                                           'Laos':9, 'Ecuador':10, 'El-Salvador':11, 'Cambodia':12, 'Columbia':13,
                                                           'Guatemala':14, 'South':15, 'India':16, 'Nicaragua':17, 'Yugoslavia':18,
                                                           'Philippines':19, 'Thailand':20, 'Trinadad&Tobago':21, 'Peru':22, 'Poland':23,
                                                           'China':24, 'Hungary':25, 'Greece':26, 'Taiwan':27, 'Italy':28, 'Portugal':29,
                                                           'France':30, 'Hong':31, 'England':32, 'Scotland':33, 'Ireland':34,
                                                           'Holand-Netherlands':35, 'Canada':36, 'Germany':37, 'Japan':38,
                                                           'Outlying-US(Guam-USVI-etc)':39, 'United-States':40
                                                           })

    adult['relationship'] = adult['relationship'].map({'Unmarried':0,'Other-relative':1, 'Not-in-family':2,
                                                       'Wife':3, 'Husband':4,'Own-child':5})

    adult = pd.get_dummies(adult)
    adult_income_X = adult.drop('income', axis=1).copy().values
    adult_income_Y = adult['income'].copy().values

    # wine_data = pd.read_csv('data/wine-red-white-merge.csv')

    # wineX = wine_data.drop('quality',1).copy().values
    # wineY = wine_data['quality'].copy().values





    adult_income_trgX, adult_income_tstX, adult_income_trgY, adult_income_tstY = ms.train_test_split(adult_income_X, adult_income_Y, test_size=0.3, random_state=0,stratify=adult_income_Y)
    # wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(wineX, wineY, test_size=0.3, random_state=0,stratify=wineY)


    d = adult_income_X.shape[1]
    hiddens_adult_income = [(h,)*l for l in [1,2,3] for h in [d,d//2,d*2]]
    alphas = [10**-x for x in np.arange(1,9.01,1/2)]
    # d = wineX.shape[1]
    # hiddens_wine = [(h,)*l for l in [1,2,3] for h in [d,d//2,d*2]]


    pipeM = Pipeline([('Scale',StandardScaler()),
                     ('Cull1',SelectFromModel(RandomForestClassifier(),threshold='median')),
                     ('Cull2',SelectFromModel(RandomForestClassifier(),threshold='median')),
                     ('Cull3',SelectFromModel(RandomForestClassifier(),threshold='median')),
                     ('Cull4',SelectFromModel(RandomForestClassifier(),threshold='median')),
                     ('KNN',knnC())])

    pipeA = Pipeline([('Scale',StandardScaler()),
                     ('KNN',knnC())])



    params_adult_income= {'KNN__metric':['manhattan','euclidean','chebyshev'],'KNN__n_neighbors':np.arange(1,51,3),'KNN__weights':['uniform','distance']}
    # params_wine= {'KNN__metric':['manhattan','euclidean','chebyshev'],'KNN__n_neighbors':np.arange(1,51,3),'KNN__weights':['uniform','distance']}


    adult_income_clf = basicResults(pipeA,adult_income_trgX,adult_income_trgY,adult_income_tstX,adult_income_tstY,params_adult_income,'KNN','adult_income')
    # wine_clf = basicResults(pipeM,wine_trgX,wine_trgY,wine_tstX,wine_tstY,params_wine,'KNN','wine')


    # wine_final_params={'KNN__n_neighbors': 43, 'KNN__weights': 'uniform', 'KNN__p': 1}
    #adult_income_final_params={'KNN__n_neighbors': 142, 'KNN__p': 1, 'KNN__weights': 'uniform'}
    # wine_final_params=wine_clf.best_params_
    adult_income_final_params=adult_income_clf.best_params_



    # pipeM.set_params(**wine_final_params)
    # makeTimingCurve(wineX,wineY,pipeM,'KNN','wine')
    pipeA.set_params(**adult_income_final_params)
    makeTimingCurve(adult_income_X,adult_income_Y,pipeA,'KNN','adult_income')
Example #28
                                 early_stopping=True,
                                 random_state=55))])

d = adultX.shape[1]
#hiddens_adult = [(h,)*l for l in [1,2,3] for h in [d/4,d/2,d,int(round(d*1.2,0))]]
hiddens_adult = [(56, 56, 56)]  # a single candidate: three hidden layers of 56 units
#alphas = [10**-x for x in np.arange(-1, 5.01, 0.5)]
alphas = [0.01]

params_adult = {
    'MLP__activation': ['logistic'],
    'MLP__alpha': alphas,
    'MLP__hidden_layer_sizes': hiddens_adult
}

adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         params_adult, 'ANN', 'adult')

#
adult_final_params = adult_clf.best_params_
adult_OF_params = adult_final_params.copy()
adult_OF_params['MLP__alpha'] = 0

#raise

# Make timing curve of final model
#pipeA.set_params(**adult_final_params)
#pipeA.set_params(**{'MLP__early_stopping':False})
#makeTimingCurve(adultX,adultY,pipeA,'ANN','adult')

# Find opt number of iterations; in range 1 to 3000
pipeA.set_params(**adult_final_params)
Example #29
#                 ('Cull3',SelectFromModel(RandomForestClassifier(random_state=3),threshold='median')),
#                 ('Cull4',SelectFromModel(RandomForestClassifier(random_state=4),threshold='median')),
#                 ('DT',dtclf_pruned(random_state=55))])
#

pipeA = Pipeline([('Scale', StandardScaler()),
                  ('DT', dtclf_pruned(random_state=55))])

params = {
    'DT__criterion': ['gini', 'entropy'],
    'DT__alpha': alphas,
    'DT__class_weight': ['balanced']
}

#madelon_clf = basicResults(pipeM,madelon_trgX,madelon_trgY,madelon_tstX,madelon_tstY,params,'DT','madelon')
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         params, 'DT', 'adult')

#madelon_final_params = {'DT__alpha': -0.00031622776601683794, 'DT__class_weight': 'balanced', 'DT__criterion': 'entropy'}
#adult_final_params = {'class_weight': 'balanced', 'alpha': 0.0031622776601683794, 'criterion': 'entropy'}
#madelon_final_params = madelon_clf.best_params_
adult_final_params = adult_clf.best_params_

#pipeM.set_params(**madelon_final_params)
#makeTimingCurve(madelonX,madelonY,pipeM,'DT','madelon')
pipeA.set_params(**adult_final_params)
makeTimingCurve(adultX, adultY, pipeA, 'DT', 'adult')

#DTpruningVSnodes(pipeM,alphas,madelon_trgX,madelon_trgY,'madelon')
DTpruningVSnodes(pipeA, alphas, adult_trgX, adult_trgY, 'adult')

###################################################################################################
Example #30
])

pipeA = Pipeline([('Scale', StandardScaler()), ('KNN', knnC())])

params_adult = {
    'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
    'KNN__n_neighbors': np.arange(1, 51, 3),
    'KNN__weights': ['uniform', 'distance']
}
params_cancer = {
    'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'],
    'KNN__n_neighbors': np.arange(1, 51, 3),
    'KNN__weights': ['uniform', 'distance']
}

cancer_clf = basicResults(pipeM, cancer_trgX, cancer_trgY, cancer_tstX,
                          cancer_tstY, params_cancer, 'KNN', 'cancer')
adult_clf = basicResults(pipeA, adult_trgX, adult_trgY, adult_tstX, adult_tstY,
                         params_adult, 'KNN', 'adult')

adult_final_params = {
    'KNN__n_neighbors': 160,
    'KNN__p': 1,
    'KNN__weights': 'uniform'
}
adult_final_params = adult_clf.best_params_
cancer_final_params = {
    'KNN__n_neighbors': 90,
    'KNN__p': 1,
    'KNN__weights': 'uniform'
}
cancer_final_params = cancer_clf.best_params_