Example #1
def dt_classifier():
    dt_clf = DecisionTreeClassifier(max_depth=25)
    dt_clf.fit(data_train, target_train)
    
    missing_data_rows, op = common.load_train_data_and_split(targetcol=6, file='data/processed_only_missing.csv', split=False)
    
    preds = list(dt_clf.predict(missing_data_rows))
    # print([[x, preds.count(x)] for x in set(preds)])  # distribution of predicted classes
    return preds
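
Every example below depends on common.load_train_data_and_split, whose source is not included. The following is a minimal, hypothetical reconstruction inferred from the call sites; the pandas loading, the defaults, and the 0.33/42 split settings are assumptions, and the real helper also takes num_samples_per_class, num_classes, and smote arguments that are omitted here.

# Hypothetical sketch -- not the project's actual helper.
import pandas as pd
from sklearn.model_selection import train_test_split

def load_train_data_and_split(file='data/processed.csv', targetcol=6,
                              split=True, test_size=0.33):
    """Load a CSV, split off the target column, optionally make a train/test split."""
    df = pd.read_csv(file)
    target = df.iloc[:, targetcol].values                 # class labels
    data = df.drop(df.columns[targetcol], axis=1).values  # feature matrix
    if not split:
        return data, target  # two values, as in Example #1
    return train_test_split(data, target, test_size=test_size, random_state=42)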
Example #2
def dt_classifier():
    dt_clf = DecisionTreeClassifier(max_depth=25)
    dt_clf.fit(data_train, target_train)

    missing_data_rows, op = common.load_train_data_and_split(
        targetcol=6, file='data/processed_only_missing.csv', split=False)

    preds = list(dt_clf.predict(missing_data_rows))
    # print([[x, preds.count(x)] for x in set(preds)])  # distribution of predicted classes
    return preds
Example #3
def main():
    data_train, data_test, target_train, target_test = common.load_train_data_and_split(file='data/processed_missing_filled_in.csv')
   
    data_train = np.asarray(data_train)
    target_train = np.array(target_train)
    target_train = target_train.astype(np.int32)
    
    print(target_train)
    
    data_train, target_train = smote.smote_data(data_train, target_train)
    
    classify(data_train, target_train, data_test, target_test)
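
Examples #3, #5, and #9 also call smote.smote_data, a project-local module that is not shown. A stand-in with the same (X, y) -> (X, y) contract, built here on the imbalanced-learn package (an assumption; the original may implement SMOTE oversampling by hand):

# Hypothetical stand-in for the project's smote module.
from imblearn.over_sampling import SMOTE

def smote_data(X, y):
    """Oversample minority classes until all classes are equally represented."""
    return SMOTE(random_state=42).fit_resample(X, y)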
Example #4
def main():
    datafiles = ['data/processed_missing_filled_in.csv', 'data/processed_without_missing.csv', 'data/processed.csv']
    datanames = ['md=imputed', 'md=deleted', 'md=0s']

    num_samples_per_class = [-1] #, 6000]
    nsnames = ['ns=all'] #, 'ns=6000']
    
    num_classes = [2, 3]
    cnames = ['nc=2', 'nc=3']
    
    oversample = [True] #, False]
    osnames = ["os=t"] #, "os=f"]
    
    algnames = ["NN", "DT", "RandomForest", "AdaBoost", "GaussianNB", "LDA", "QDA", "SGD", "NNet"]
    algs = [
        KNeighborsClassifier(5),
        DecisionTreeClassifier(max_depth=25),
        RandomForestClassifier(max_depth=25, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA(),
        SGDClassifier(penalty='elasticnet', alpha=0.1, loss='modified_huber'),
        None  # placeholder for "NNet"; a NeuralNet is constructed in the loop below
    ]
    
    for alg, algname in zip(algs, algnames):
        for dat, datname in zip(datafiles, datanames):
            for numspl, sname in zip(num_samples_per_class, nsnames):
                for numcls, cname in zip(num_classes, cnames):
                    for os, osname in zip(oversample, osnames):
                        algdesc = algname + "_" + datname + "_" + sname + "_" + cname + "_" + osname
                        print(algdesc)
                        input_train, input_test, output_train, output_test = common.load_train_data_and_split(
                            file=dat, num_samples_per_class=numspl,
                            num_classes=numcls, smote=os)
                        
                        if algname == "NNet":
                            alg = NeuralNet(
                                layers=[('input', InputLayer),
                                        ('dense0', DenseLayer),
                                        ('dropout0', DropoutLayer),
                                        ('dense1', DenseLayer),
                                        ('dropout1', DropoutLayer),
                                        ('output', DenseLayer)],
                                input_shape=(None, input_train.shape[1]),
                                dense0_num_units=300,
                                dropout0_p=0.075,
                                dropout1_p=0.1,
                                dense1_num_units=750,
                                output_num_units=numcls,
                                output_nonlinearity=softmax,
                                update=nesterov_momentum,
                                update_learning_rate=0.001,
                                update_momentum=0.99,
                                eval_size=0.33,
                                verbose=1,
                                max_epochs=15)
                        
                        model = alg.fit(input_train, output_train)
                        
                        print("TRAIN ", algdesc)
                        predictions_train = model.predict(input_train)
                        save_results(output_train, predictions_train, algdesc+"_train", algname)
                        
                        print("TEST ", algdesc)
                        predictions_test = model.predict(input_test)
                        save_results(output_test, predictions_test, algdesc+"_test", algname)
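
save_results is another helper that is not shown; the call sites pass true labels, predictions, a run description, and the algorithm name. A plausible sketch, in which the results/ directory and file layout are invented:

# Hypothetical sketch of save_results -- the file layout is a guess.
import os
from sklearn import metrics

def save_results(y_true, y_pred, run_desc, alg_name):
    """Append a classification report for one run to results/<alg_name>.txt."""
    os.makedirs('results', exist_ok=True)
    with open(os.path.join('results', alg_name + '.txt'), 'a') as f:
        f.write(run_desc + '\n')
        f.write(metrics.classification_report(y_true, y_pred) + '\n')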
Example #5
names = ["SGD", "Nearest Neighbors",  # "Linear SVM", "RBF SVM",
         "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
classifiers = [
    SGDClassifier(loss='hinge', penalty='l2', alpha=0.005,
                  n_iter=10,
                  random_state=42,
                  n_jobs=-1,
                  average=True),
    KNeighborsClassifier(3),
    #   SVC(kernel="linear", C=0.025),
    #   SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=15),
    RandomForestClassifier(max_depth=15, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()
]

X_train, X_test, y_train, y_test = common.load_train_data_and_split(
    file='data/processed_missing_filled_in.csv')

X_train = np.asarray(X_train)
y_train = np.array(y_train)
y_train = y_train.astype(np.int32)

X_train, y_train = smote.smote_data(X_train, y_train)

# iterate over classifiers
for name, clf in zip(names, classifiers):
    print("Fitting " + name + "...")

    predicted_test = clf.fit(X_train, y_train).predict(X_test)
    test_p = (np.asarray(y_test) != predicted_test).sum() / len(X_test) * 100
    print("Error on test set: %.2f%%" % test_p)
Example #6
selectors = {
    ('Percentile', SelectPercentile()): {
        'Percentile__percentile': (1, 5)
    },

    ('PCA', PCA()): {
        'PCA__n_components': (2, 4, 8, 16, 32)
    }
}

learners = {
    ('SGD', SGDClassifier()): {
        'SGD__loss': ('hinge', 'squared_hinge', 'modified_huber'),
        'SGD__penalty': ('l2', 'l1', 'elasticnet'),
        'SGD__alpha': tuple([0.1 ** x for x in range(1, 5)])
    }
}

data_train, data_test, target_train, target_test = common.load_train_data_and_split(
    num_samples_per_class=6000, file='data/processed_missing_filled_in.csv')  # 0.21

for alg_name, pipeline, params in build_pipelines(learners, selectors):
    grid = GridSearchCV(pipeline, params, cv=3, scoring='f1_weighted')
    grid.fit(data_train, target_train)
    predictions = grid.predict(data_test)
    get_results.save_results(target_test, predictions, alg_name, alg_name)

    print(grid.best_estimator_)
    print(grid.best_params_)
    print(grid.best_score_)
    print(metrics.classification_report(target_test, predictions))
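
build_pipelines is not shown either. Given the dictionaries above, where each key is a (step name, estimator) pair and each value is a GridSearchCV parameter grid keyed with that step's prefix, it presumably crosses every selector with every learner. A sketch in that spirit (the pairing and naming scheme are assumptions):

# Hypothetical reconstruction of build_pipelines, inferred from its inputs.
from sklearn.pipeline import Pipeline

def build_pipelines(learners, selectors):
    """Yield (name, pipeline, merged param grid) for each selector/learner pair."""
    for (sel_name, selector), sel_params in selectors.items():
        for (lrn_name, learner), lrn_params in learners.items():
            pipeline = Pipeline([(sel_name, selector), (lrn_name, learner)])
            params = dict(sel_params, **lrn_params)  # grids already carry step prefixes
            yield sel_name + '_' + lrn_name, pipeline, params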
Example #7
# learners = {
#     ('sgd', SGDClassifier()): {
#         'sgd__loss': ('hinge', 'squared_hinge', 'modified_huber'),
#         'sgd__penalty': ('l2', 'l1', 'elasticnet'),
#         'sgd__kernel': ('rbf', 'sigmoid', 'linear'),
#         'sgd__alpha': tuple([0.1 ** x for x in range(1, 5)])
#     }
# }

params = {
    'base_estimator__loss': ['hinge', 'modified_huber'],
    'base_estimator__penalty': ['l2', 'l1', 'elasticnet'],
    'base_estimator__alpha': [0.1 ** x for x in range(1, 5)]
}

data_train, data_test, target_train, target_test = common.load_train_data_and_split()

sgd = SGDClassifier()
bagger = BaggingClassifier(sgd)

grid = GridSearchCV(bagger, params, cv=10)
grid.fit(data_train, target_train)

print(grid.best_estimator_)
print(grid.best_params_)
print(grid.best_score_)

predictions = grid.predict(data_test)
print(metrics.precision_recall_fscore_support(target_test, predictions))

# for score in grid.grid_scores_:
Example #8
# learners = {
#     ('sgd', SGDClassifier()): {
#         'sgd__loss': ('hinge', 'squared_hinge', 'modified_huber'),
#         'sgd__penalty': ('l2', 'l1', 'elasticnet'),
#         'sgd__kernel': ('rbf', 'sigmoid', 'linear'),
#         'sgd__alpha': tuple([0.1 ** x for x in range(1, 5)])
#     }
# }

params = {
    'base_estimator__loss': ['hinge', 'modified_huber'],
    'base_estimator__penalty': ['l2', 'l1', 'elasticnet'],
    'base_estimator__alpha': [0.1**x for x in range(1, 5)]
}

data_train, data_test, target_train, target_test = common.load_train_data_and_split()

sgd = SGDClassifier()
bagger = BaggingClassifier(sgd)

grid = GridSearchCV(bagger, params, cv=10)
grid.fit(data_train, target_train)

print(grid.best_estimator_)
print(grid.best_params_)
print(grid.best_score_)

predictions = grid.predict(data_test)
print(metrics.precision_recall_fscore_support(target_test, predictions))

# for score in grid.grid_scores_:
Example #9
names = ["SGD", "Nearest Neighbors", # "Linear SVM", "RBF SVM", 
         "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
classifiers = [
    SGDClassifier(loss='hinge', penalty='l2', alpha=0.005, n_iter=10, random_state=42, n_jobs=-1, average=True),
    KNeighborsClassifier(3),
 #   SVC(kernel="linear", C=0.025),
 #   SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=15),
    RandomForestClassifier(max_depth=15, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()
]

X_train, X_test, y_train, y_test = common.load_train_data_and_split(file='data/processed_missing_filled_in.csv')

X_train = np.asarray(X_train)
y_train = np.array(y_train)
y_train = y_train.astype(np.int32)

X_train, y_train = smote.smote_data(X_train, y_train)

# iterate over classifiers
for name, clf in zip(names, classifiers):
    print("Fitting " + name + "...")

    predicted_test = clf.fit(X_train, y_train).predict(X_test)
    test_p = (np.asarray(y_test) != predicted_test).sum() / len(X_test) * 100
    print("Error on test set: %.2f%%" % test_p)
    
Example #10
params = {
    'loss': ['hinge', 'squared_hinge', 'modified_huber'],
    # 'loss': ['hinge'],

    'penalty': ['l2', 'l1', 'elasticnet'],
    # 'penalty': ['l2', 'l1', 'elasticnet'],

    'alpha': [0.1 ** x for x in range(1, 5)]
    # 'alpha': [.001]
}

#data_train, data_test, target_train, target_test = common.load_test_train_as_two_class(f='data/processed_missing_filled_in.csv')
#data_train, data_test, target_train, target_test = common.load_test_train_as_two_class(f='data/processed_without_missing.csv')
#data_train, data_test, target_train, target_test = common.load_train_data_and_split() # 0.53
#data_train, data_test, target_train, target_test = common.load_train_data_and_split(num_samples_per_class=3000) # 0.24
#data_train, data_test, target_train, target_test = common.load_train_data_and_split(num_samples_per_class=6000, file='data/processed_missing_filled_in.csv') # 0.21
data_train, data_test, target_train, target_test = common.load_train_data_and_split(file='data/processed_missing_filled_in.csv') # 0.49
sgd = SGDClassifier()
grid = GridSearchCV(sgd, params, cv=10, verbose=10)
grid.fit(data_train, target_train)

print(grid.best_estimator_)
print(grid.best_params_)
print(grid.best_score_)

predictions = grid.predict(data_test)
np.save('data/predictions', predictions)

#print(metrics.precision_recall_fscore_support(target_test, predictions))
print(metrics.classification_report(target_test, predictions))

cm = confusion_matrix(target_test, predictions)
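
The snippet ends immediately after computing cm, so the original presumably went on to display it. A minimal continuation; the row normalization is an addition, not from the source:

print(cm)                                  # raw counts per (true, predicted) class
print(cm / cm.sum(axis=1, keepdims=True))  # per-class rates, each row sums to 1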
Example #11
def main():
    datafiles = [
        'data/processed_missing_filled_in.csv',
        'data/processed_without_missing.csv', 'data/processed.csv'
    ]
    datanames = ['md=imputed', 'md=deleted', 'md=0s']

    num_samples_per_class = [-1]  #, 6000]
    nsnames = ['ns=all']  #, 'ns=6000']

    num_classes = [2, 3]
    cnames = ['nc=2', 'nc=3']

    oversample = [True]  #, False]
    osnames = ["os=t"]  #, "os=f"]

    algnames = [
        "NN", "DT", "RandomForest", "AdaBoost", "GaussianNB", "LDA", "QDA",
        "SGD", "NNet"
    ]
    algs = [
        KNeighborsClassifier(5),
        DecisionTreeClassifier(max_depth=25),
        RandomForestClassifier(max_depth=25, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA(),
        SGDClassifier(penalty='elasticnet', alpha=0.1, loss='modified_huber'),
        None  # placeholder for "NNet"; a NeuralNet is constructed in the loop below
    ]

    for alg, algname in zip(algs, algnames):
        for dat, datname in zip(datafiles, datanames):
            for numspl, sname in zip(num_samples_per_class, nsnames):
                for numcls, cname in zip(num_classes, cnames):
                    for os, osname in zip(oversample, osnames):
                        algdesc = algname + "_" + datname + "_" + sname + "_" + cname + "_" + osname
                        print(algdesc)
                        input_train, input_test, output_train, output_test = common.load_train_data_and_split(
                            file=dat,
                            num_samples_per_class=numspl,
                            num_classes=numcls,
                            smote=os)

                        if algname == "NNet":
                            alg = NeuralNet(layers=[('input', InputLayer),
                                                    ('dense0', DenseLayer),
                                                    ('dropout0', DropoutLayer),
                                                    ('dense1', DenseLayer),
                                                    ('dropout1', DropoutLayer),
                                                    ('output', DenseLayer)],
                                            input_shape=(None,
                                                         input_train.shape[1]),
                                            dense0_num_units=300,
                                            dropout0_p=0.075,
                                            dropout1_p=0.1,
                                            dense1_num_units=750,
                                            output_num_units=numcls,
                                            output_nonlinearity=softmax,
                                            update=nesterov_momentum,
                                            update_learning_rate=0.001,
                                            update_momentum=0.99,
                                            eval_size=0.33,
                                            verbose=1,
                                            max_epochs=15)

                        model = alg.fit(input_train, output_train)

                        print("TRAIN ", algdesc)
                        predictions_train = model.predict(input_train)
                        save_results(output_train, predictions_train,
                                     algdesc + "_train", algname)

                        print("TEST ", algdesc)
                        predictions_test = model.predict(input_test)
                        save_results(output_test, predictions_test,
                                     algdesc + "_test", algname)
Example #12
params = {
    'loss': ['hinge', 'squared_hinge', 'modified_huber'],
    # 'loss': ['hinge'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    # 'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [0.1**x for x in range(1, 5)]
    # 'alpha': [.001]
}

#data_train, data_test, target_train, target_test = common.load_test_train_as_two_class(f='data/processed_missing_filled_in.csv')
#data_train, data_test, target_train, target_test = common.load_test_train_as_two_class(f='data/processed_without_missing.csv')
#data_train, data_test, target_train, target_test = common.load_train_data_and_split() # 0.53
#data_train, data_test, target_train, target_test = common.load_train_data_and_split(num_samples_per_class=3000) # 0.24
#data_train, data_test, target_train, target_test = common.load_train_data_and_split(num_samples_per_class=6000, file='data/processed_missing_filled_in.csv') # 0.21
data_train, data_test, target_train, target_test = common.load_train_data_and_split(
    file='data/processed_missing_filled_in.csv')  # 0.49
sgd = SGDClassifier()
grid = GridSearchCV(sgd, params, cv=10, verbose=10)
grid.fit(data_train, target_train)

print(grid.best_estimator_)
print(grid.best_params_)
print(grid.best_score_)

predictions = grid.predict(data_test)
np.save('data/predictions', predictions)

#print(metrics.precision_recall_fscore_support(target_test, predictions))
print(metrics.classification_report(target_test, predictions))

cm = confusion_matrix(target_test, predictions)