def get_model(classifier_name, training_data, target_data, model_settings=None):
    """Return a trained model for the requested classifier.

    Creation and training of models has been separated into their own
    file: models.py.

    Arguments:
        classifier_name: key selecting the model -- one of "linreg",
            "svm", "cart", "knn", "lda", "nb", "lr"
        training_data: the data the training is performed on
        target_data: the targets for supervised learning
        model_settings: dict of settings for the classifier (only
            forwarded to "svm", "knn" and "lda")

    Returns:
        model: the trained model, or None for an unrecognised
        classifier_name (matching the original behavior).
    """
    # Dispatch table replaces the original 7-way chained conditional
    # expression; lambdas defer training until the name is matched, so
    # exactly one model is ever built.
    builders = {
        "linreg": lambda: models.linreg(training_data, target_data),
        "svm": lambda: models.svm(training_data, target_data, model_settings),
        "cart": lambda: models.cart(training_data, target_data),
        "knn": lambda: models.knn(training_data, target_data, model_settings),
        "lda": lambda: models.lda(training_data, target_data, model_settings),
        "nb": lambda: models.nb(training_data, target_data),
        "lr": lambda: models.lr(training_data, target_data),
    }
    builder = builders.get(classifier_name)
    return builder() if builder is not None else None
def main(argv):
    """Train every classifier on the data file and print its accuracy.

    Arguments:
        argv: command-line argument list.
            NOTE(review): currently unused -- the path is read from
            sys.argv[1] directly, as in the original; consider using
            argv[1] instead (confirm how callers invoke main).
    """
    param_fpath = sys.argv[1]

    # Prepare training data. X: feature list, y: label list.
    X, y = get_train_data(param_fpath)

    # (banner, result label, classifier) triples, run in the original
    # order. Single-argument print(...) is valid in both Python 2 and 3,
    # so this stays compatible while matching the print() style used
    # elsewhere in the file.
    classifiers = [
        ("Logistic Regression", "LR", lr),
        ("KNN", "KNN", knn),
        ("GNB", "GNB", gnb),
        ("MNB", "MNB", mnb),
        ("BNB", "BNB", bnb),
        ("Decision Tree", "Decision Tree", dtree),
        ("SVM", "SVM", svm),
    ]
    for banner, label, classifier in classifiers:
        print('\nBegin Running ' + banner + '...')
        acc = classifier(X, y)
        print(label + " Accuracy: " + str(acc))
def training_KFold(index, data, name):
    """K-fold training: keep the model with the lowest validation error.

    Arguments:
        index: iterable of (train, validation) index arrays, one pair
            per fold (e.g. from sklearn's KFold.split)
        data: dataset indexable by those index arrays
        name: model key -- 'gbr', 'svm', 'rfr' or 'mlr'

    Returns:
        The trained model with the smallest validation error seen
        across all folds (None if index is empty).

    Raises:
        ValueError: if name is not a recognised model key. (The
            original code fell through to an unbound-variable
            NameError in that case.)
    """
    trainers = {
        'gbr': models.gbr,
        'svm': models.svm,
        'rfr': models.rfr,
        'mlr': models.mlr,
    }
    if name not in trainers:
        raise ValueError("unknown model name: " + repr(name))
    trainer = trainers[name]

    best_error = sys.maxsize
    best_model = None
    for train, validation in index:
        training_X, training_y = models.features_labels(data[train])
        validation_X, validation_y = models.features_labels(data[validation])
        model, error = trainer(training_X, training_y,
                               validation_X, validation_y)
        if error < best_error:
            best_error = error
            best_model = model
            print("update model")
            print(error)
            print()
    return best_model
def runAllModels(X_train, X_test, y_test, y_train):
    """Train each model, plot its predictions, and evaluate them.

    Runs xgb, rf, nn and svm in that order (same as the original
    copy-pasted stanzas). Each model's (cm, fscore, a) from evaluate()
    is computed but, as before, not returned -- plotter()/evaluate()
    are presumably responsible for the visible output.

    Arguments (note the original, unusual order is preserved):
        X_train: training features
        X_test: test features
        y_test: test labels
        y_train: training labels
    """
    for model in (xgb, rf, nn, svm):
        y_pred = model(X_train, y_train, X_test)
        plotter(y_test, y_pred)
        cm, fscore, a = evaluate(y_test, y_pred)
def runModels(testdata, testlabel):
    """Run every classifier in models on the data and print mean scores.

    Arguments:
        testdata: feature data passed to each models.* classifier
        testlabel: labels passed alongside the data

    Each classifier is assumed to return an object with a .mean()
    method (e.g. a cross-validation score array).
    """
    # (label, classifier) pairs, in the original order.
    classifiers = [
        ("knn", models.knn),
        ("svm", models.svm),
        ("decision tree", models.decisionTree),
        ("Gaussian NB", models.naiveBayes),
        ("random forest", models.randomForest),
        ("nearest centroid", models.nearestCentroid),
        ("extra tree", models.extraTree),
        ("extra trees", models.extraTrees),
    ]
    for label, classifier in classifiers:
        res = classifier(testdata, testlabel)
        print(label + ": ", res.mean())
def training(data, name, arg):
    """Train one model on a 75/25 train/validation split of data.

    Arguments:
        data: sliceable dataset; the first 3/4 becomes the training
            set, the remainder the validation set
        name: model key -- 'gbr', 'svm', 'rfr' or 'mlr'
        arg: only meaningful for 'gbr', where it is passed as
            min_samples_split; ignored for the other models

    Returns:
        The trained model.

    Raises:
        ValueError: if name is not a recognised model key. (The
            original code fell through to an unbound-variable
            NameError in that case.)
    """
    split = 3 * len(data) // 4
    training_X, training_y = models.features_labels(data[:split])
    validation_X, validation_y = models.features_labels(data[split:])

    if name == 'gbr':
        model, error = models.gbr(training_X, training_y,
                                  validation_X, validation_y,
                                  n_estimators=10, min_samples_split=arg)
    elif name == 'svm':
        model, error = models.svm(training_X, training_y,
                                  validation_X, validation_y)
    elif name == 'rfr':
        model, error = models.rfr(training_X, training_y,
                                  validation_X, validation_y)
    elif name == 'mlr':
        model, error = models.mlr(training_X, training_y,
                                  validation_X, validation_y)
    else:
        raise ValueError("unknown model name: " + repr(name))

    print(error)
    print()
    return model
training_file, development_file) print("Word length training: " + str(training_performance)) print("Word length development: " + str(development_performance)) frequency_training_performance, frequency_development_performance = \ word_frequency_threshold(training_file, development_file, counts) print("Word frequency training: " + str(frequency_training_performance)) print("Word frequency development: " + str(frequency_development_performance)) nb_development_performance = naive_bayes(training_file, development_file, counts) print("Naive bayes development: " + str(nb_development_performance)) lr_development_performance = logistic_regression(training_file, development_file, counts) print("Logistic Regression development: " + str(lr_development_performance)) svm_development_performance = models.svm(training_file, development_file, test_file, counts) print("Support vector machine development: " + str(svm_development_performance)) random_forest_development_performance = models.random_forest( training_file, development_file, test_file, counts) print("Random forest development: " + str(random_forest_development_performance)) # tune_parameter(training_file, development_file, counts)
import pandas as pd import preprocessingfile as preprocess import models data = 'pc2.csv' original_data, original_X, original_Y, combined_training_data, x_train1, x_train2, x_train, x_test, x_val, y_train1, y_train2, y_train, y_test, y_val = preprocess.my_sdp_preprocessor( data) all_data = [ original_data, original_X, original_Y, combined_training_data, x_train1, x_train2, x_train, x_test, x_val, y_train1, y_train2, y_train, y_test, y_val ] cnn_clf = models.cnn(*all_data) svm_clf = models.svm(*all_data) rf_clf = models.random_forest(*all_data) nn_clf = models.NN(*all_data) from sklearn.metrics import * def print_accuracy(model): #nn,cnn,svm,clf if (model == nn_clf): y_pred_on_val = model.predict(x_val) > 0.5 y_pred_on_test = model.predict(x_test) > 0.5 elif (model == cnn_clf): x_val_matrix = x_val.values x_val1 = x_val_matrix.reshape(x_val_matrix.shape[0], 1, len(x_val.columns), 1) y_pred_on_val = model.predict(x_val1) > 0.5