def dt_classifier():
    # data_train and target_train are expected to be defined at module level.
    dt_clf = DecisionTreeClassifier(max_depth=25)
    dt_clf.fit(data_train, target_train)
    # Load only the rows that had missing values, without a train/test split.
    missing_data_rows, op = common.load_train_data_and_split(
        targetcol=6, file='data/processed_only_missing.csv', split=False)
    preds = list(dt_clf.predict(missing_data_rows))
    # print([[x, preds.count(x)] for x in set(preds)])
    return preds
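# For context only: dt_classifier() reads data_train and target_train from
# module scope, so they must be prepared before the call. A minimal sketch of
# that setup (assumed, not part of the original script; the file and targetcol
# used here mirror the call inside dt_classifier() but are guesses):
import common
from sklearn.tree import DecisionTreeClassifier

data_train, _, target_train, _ = common.load_train_data_and_split(
    targetcol=6, file='data/processed_without_missing.csv')
imputed_values = dt_classifier()  # predictions for the rows that had missing data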
def main():
    data_train, data_test, target_train, target_test = common.load_train_data_and_split(
        file='data/processed_missing_filled_in.csv')
    data_train = np.asarray(data_train)
    target_train = np.array(target_train).astype(np.int32)
    print(target_train)
    # Oversample the minority classes with SMOTE before fitting.
    data_train, target_train = smote.smote_data(data_train, target_train)
    # classify() is a project-local helper (not shown in this excerpt).
    classify(data_train, target_train, data_test, target_test)
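# A minimal sketch of what the project-local classify() helper might do;
# the classifier choice and reporting below are assumptions for illustration,
# not the project's actual implementation.
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics


def classify(data_train, target_train, data_test, target_test):
    clf = RandomForestClassifier(max_depth=25, n_estimators=10)
    clf.fit(data_train, target_train)
    predictions = clf.predict(data_test)
    print(metrics.classification_report(target_test, predictions))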
def main():
    datafiles = [
        'data/processed_missing_filled_in.csv',
        'data/processed_without_missing.csv',
        'data/processed.csv',
    ]
    datanames = ['md=imputed', 'md=deleted', 'md=0s']
    num_samples_per_class = [-1]  # , 6000]
    nsnames = ['ns=all']  # , 'ns=6000']
    num_classes = [2, 3]
    cnames = ['nc=2', 'nc=3']
    oversample = [True]  # , False]
    osnames = ["os=t"]  # , "os=f"]
    algnames = ["NN", "DT", "RandomForest", "AdaBoost", "GaussianNB", "LDA",
                "QDA", "SGD", "NNet"]
    algs = [
        KNeighborsClassifier(5),
        DecisionTreeClassifier(max_depth=25),
        RandomForestClassifier(max_depth=25, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        LDA(),
        QDA(),
        SGDClassifier(penalty='elasticnet', alpha=0.1, loss='modified_huber'),
        0,  # placeholder: the NNet is constructed inside the loop below
    ]
    # Run every algorithm against every combination of dataset, sample count,
    # class count and oversampling setting.
    for alg, algname in zip(algs, algnames):
        for dat, datname in zip(datafiles, datanames):
            for numspl, sname in zip(num_samples_per_class, nsnames):
                for numcls, cname in zip(num_classes, cnames):
                    for os, osname in zip(oversample, osnames):
                        algdesc = algname + "_" + datname + "_" + sname + "_" + cname + "_" + osname
                        print(algdesc)
                        input_train, input_test, output_train, output_test = common.load_train_data_and_split(
                            file=dat, num_samples_per_class=numspl,
                            num_classes=numcls, smote=os)
                        if algname == "NNet":  # `is` compared identity, not string equality
                            alg = NeuralNet(
                                layers=[('input', InputLayer),
                                        ('dense0', DenseLayer),
                                        ('dropout0', DropoutLayer),
                                        ('dense1', DenseLayer),
                                        ('dropout1', DropoutLayer),
                                        ('output', DenseLayer)],
                                input_shape=(None, input_train.shape[1]),
                                dense0_num_units=300,
                                dropout0_p=0.075,
                                dropout1_p=0.1,
                                dense1_num_units=750,
                                output_num_units=numcls,
                                output_nonlinearity=softmax,
                                update=nesterov_momentum,
                                update_learning_rate=0.001,
                                update_momentum=0.99,
                                eval_size=0.33,
                                verbose=1,
                                max_epochs=15)
                        model = alg.fit(input_train, output_train)
                        print("TRAIN ", algdesc)
                        predictions_train = model.predict(input_train)
                        save_results(output_train, predictions_train,
                                     algdesc + "_train", algname)
                        print("TEST ", algdesc)
                        predictions_test = model.predict(input_test)
                        save_results(output_test, predictions_test,
                                     algdesc + "_test", algname)
selectors = {
    ('Percentile', SelectPercentile()): {
        'Percentile__percentile': (1, 5)
    },
    ('PCA', PCA()): {
        'PCA__n_components': (2, 4, 8, 16, 32)
    }
}
learners = {
    ('SGD', SGDClassifier()): {
        'SGD__loss': ('hinge', 'squared_hinge', 'modified_huber'),
        'SGD__penalty': ('l2', 'l1', 'elasticnet'),
        'SGD__alpha': tuple([0.1 ** x for x in range(1, 5)])
    }
}
data_train, data_test, target_train, target_test = common.load_train_data_and_split(
    num_samples_per_class=6000, file='data/processed_missing_filled_in.csv')  # 0.21

for alg_name, pipeline, params in build_pipelines(learners, selectors):
    grid = GridSearchCV(pipeline, params, cv=3, scoring='f1_weighted')
    grid.fit(data_train, target_train)
    predictions = grid.predict(data_test)
    get_results.save_results(target_test, predictions, alg_name, alg_name)
    print(grid.best_estimator_)
    print(grid.best_params_)
    print(grid.best_score_)
    print(metrics.classification_report(target_test, predictions))
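# build_pipelines() used above is a project-local helper that is not shown in
# this excerpt. A minimal sketch of one plausible implementation, assuming it
# pairs every feature selector with every learner into a two-step Pipeline and
# merges their parameter grids (the names and behavior here are assumptions):
from sklearn.pipeline import Pipeline


def build_pipelines(learners, selectors):
    for (sel_name, selector), sel_params in selectors.items():
        for (learner_name, learner), learner_params in learners.items():
            pipeline = Pipeline([(sel_name, selector), (learner_name, learner)])
            params = dict(sel_params, **learner_params)
            yield learner_name + "_" + sel_name, pipeline, params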
# learners = {
#     ('sgd', SGDClassifier()): {
#         'sgd__loss': ('hinge', 'squared_hinge', 'modified_huber'),
#         'sgd__penalty': ('l2', 'l1', 'elasticnet'),
#         'sgd__kernel': ('rbf', 'sigmoid', 'linear'),
#         'sgd__alpha': tuple([0.1 ** x for x in range(1, 5)])
#     }
# }
params = {
    'base_estimator__loss': ['hinge', 'modified_huber'],
    'base_estimator__penalty': ['l2', 'l1', 'elasticnet'],
    'base_estimator__alpha': [0.1 ** x for x in range(1, 5)]
}
data_train, data_test, target_train, target_test = common.load_train_data_and_split()

# Grid-search over the SGD base estimator's parameters inside a bagging ensemble.
sgd = SGDClassifier()
bagger = BaggingClassifier(sgd)
grid = GridSearchCV(bagger, params, cv=10)
grid.fit(data_train, target_train)
print(grid.best_estimator_)
print(grid.best_params_)
print(grid.best_score_)
predictions = grid.predict(data_test)
print(metrics.precision_recall_fscore_support(target_test, predictions))
# for score in grid.grid_scores_:
names = ["SGD", "Nearest Neighbors", # "Linear SVM", "RBF SVM", "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"] classifiers = [ SGDClassifier(loss='hinge', penalty='l2', alpha=0.005, n_iter=10, random_state=42, n_jobs=-1, average=True), KNeighborsClassifier(3), # SVC(kernel="linear", C=0.025), # SVC(gamma=2, C=1), DecisionTreeClassifier(max_depth=15), RandomForestClassifier(max_depth=15, n_estimators=10, max_features=1), AdaBoostClassifier(), GaussianNB(), LDA(), QDA() ] X_train, X_test, y_train, y_test = common.load_train_data_and_split(file='data/processed_missing_filled_in.csv') X_train = np.asarray(X_train) y_train = np.array(y_train) y_train = y_train.astype(np.int32) X_train, y_train = smote.smote_data(X_train, y_train) # iterate over classifiers for name, clf in zip(names, classifiers): print("Fitting " + name + "...") predicted_test = clf.fit(X_train, y_train).predict(X_test) test_p = ((y_test != predicted_test).sum())/(len(X_test))*100 print("Error on test set: %d" % test_p)
params = {
    'loss': ['hinge', 'squared_hinge', 'modified_huber'],
    # 'loss': ['hinge'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    # 'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [0.1 ** x for x in range(1, 5)]
    # 'alpha': [.001]
}
# data_train, data_test, target_train, target_test = common.load_test_train_as_two_class(f='data/processed_missing_filled_in.csv')
# data_train, data_test, target_train, target_test = common.load_test_train_as_two_class(f='data/processed_without_missing.csv')
# data_train, data_test, target_train, target_test = common.load_train_data_and_split()  # 0.53
# data_train, data_test, target_train, target_test = common.load_train_data_and_split(num_samples_per_class=3000)  # 0.24
# data_train, data_test, target_train, target_test = common.load_train_data_and_split(num_samples_per_class=6000, file='data/processed_missing_filled_in.csv')  # 0.21
data_train, data_test, target_train, target_test = common.load_train_data_and_split(
    file='data/processed_missing_filled_in.csv')  # 0.49

sgd = SGDClassifier()
grid = GridSearchCV(sgd, params, cv=10, verbose=10)
grid.fit(data_train, target_train)
print(grid.best_estimator_)
print(grid.best_params_)
print(grid.best_score_)
predictions = grid.predict(data_test)
np.save('data/predictions', predictions)
# print(metrics.precision_recall_fscore_support(target_test, predictions))
print(metrics.classification_report(target_test, predictions))
cm = confusion_matrix(target_test, predictions)
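# cm is computed above but not used in this excerpt. A minimal follow-up
# sketch (assumed, not from the original script): print the raw counts and a
# row-normalized version so per-class recall can be read off directly.
print(cm)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm_normalized)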