def experimentDefaultSetting(self, trainset, testset):
    """Baseline experiment: linear SVM with C=1.0.

    Reads ``trainset``, vectorizes it, splits it into train/dev/test,
    fits on the training portion and reports accuracy plus macro-F1 on
    the dev-test portion.

    NOTE(review): ``testset``, ``conversion_dict`` and the dev-train
    split are produced but never used here.
    """
    print("Reading data")
    x, y = DataService().read_corpus(trainset)
    clf = SVM().construct_classifier("linear", 1.0)
    # Vectorize the text data and return an (n_samples, n_features) matrix.
    vectors = DataService().vectorize_input(x)
    conversion_dict, y = DataService().labels_string_to_float(y)
    x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(vectors, y)
    x_dev_train, y_dev_train, x_dev_test, y_dev_test = DataService(
    ).test_train_split(x_dev, y_dev)
    t0 = datetime.utcnow()
    print('Fitting training data on', len(x_train), 'Samples')
    clf.fit(x_train, y_train)
    elapsed = (datetime.utcnow() - t0).seconds
    print("Training took", elapsed, 'seconds..')
    predictions = clf.predict(x_dev_test)
    print("Accuracy score:",
          accuracy_score(y_pred=predictions, y_true=y_dev_test))
    print("F1 score (macro):",
          f1_score(y_pred=predictions, y_true=y_dev_test, average='macro'))
def experiment_probabilities_binary(self):
    """Fit the default pipeline on the binary sentiment task and collect
    its probabilistic output for the held-out split.

    NOTE(review): the posteriors, hard predictions and parameters are
    computed but neither printed nor returned.
    """
    x, y = DataService().read_corpus('trainset.txt', use_sentiment=True)
    x_train, y_train, x_test, y_test = DataService().test_train_split(x, y)
    model = ClassifierService().construct_classifier()
    model.fit(x_train, y_train)
    # Per-class posterior probabilities, then the hard class labels.
    posteriors = model.predict_proba(x_test)
    hard_labels = model.predict(x_test)
    hyper_params = model.get_params()
def experiment_probabilities_multi(self):
    """Fit the default pipeline on the multi-class task and print the
    posterior probabilities it assigns to the test split.

    Fixes: removed a leftover debugging statement (``print("bug stop")``)
    and a dead ``params = classifier.get_params(deep=True)`` assignment
    whose result was never used.
    """
    x, y = DataService().read_corpus('trainset.txt', use_sentiment=False)
    x_train, y_train, x_test, y_test = DataService().test_train_split(x, y)
    classifier = ClassifierService().construct_classifier()
    classifier.fit(x_train, y_train)
    # One probability per class per test sentence.
    y_pred_prob = classifier.predict_proba(x_test)
    print("\nPosterior probabilities multi-class:")
    print('\t', y_pred_prob)
def experiment_binary(self):
    """Run the binary (sentiment) classification experiment end to end:
    read the corpus, split it, fit the pipeline and print precision,
    recall, F1, accuracy and the confusion matrix.
    """
    x, y = DataService().read_corpus('trainset.txt', use_sentiment=True)
    # A split-point divides the whole dataset into 75% training and
    # 25% test sets.
    x_train, y_train, x_test, y_test = DataService().test_train_split(x, y)
    classifier = ClassifierService().construct_classifier()
    # The classifier is a scikit-learn Pipeline whose final estimator is
    # a MultinomialNB (Naive Bayes for multinomial data — the sentences
    # contain more than one word).  fit() trains the estimator on every
    # (sentence, label) pair; it returns nothing but leaves the pipeline
    # parameters trained.
    classifier.fit(x_train, y_train)
    # predict() performs a forward pass over the test sentences without
    # updating any parameters and returns one sentiment label per input
    # sentence (1500 here, per the original authors' note).
    y_pred = classifier.predict(x_test)
    print("\nPrinting scores for binary problem")
    PrintScores().print_precision_score(y_test=y_test, y_pred=y_pred)
    PrintScores().print_recall_score(y_test=y_test, y_pred=y_pred)
    PrintScores().print_f1_score(y_test=y_test, y_pred=y_pred)
    print("\nPrinting accuracy score")
    PrintScores().print_accuracy_score(y_test=y_test, y_pred=y_pred)
    PrintScores().print_confusion_matrix(y_test=y_test, y_pred=y_pred)
def __init__(self):
    """Build the "add" view: initialise the ViewAdd base with the
    ``def_view_data`` layout, create the backing data service, and wire
    the submit signal to the private save slot.
    """
    ViewAdd.__init__(self, def_view_data)
    # Name-mangled attributes used by the private save slot.
    self.__service = DataService()
    self.__id = None  # no record selected yet — presumably set later; TODO confirm
    self.sig_submit.connect(self.__save)
def experiment_multi_class(self):
    """Multi-class counterpart of the binary experiment: read the corpus
    with topic labels (use_sentiment=False), fit the default pipeline
    and print precision, recall, F1, accuracy and the confusion matrix.
    """
    x, y = DataService().read_corpus('trainset.txt', use_sentiment=False)
    x_train, y_train, x_test, y_test = DataService().test_train_split(x, y)
    model = ClassifierService().construct_classifier()
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)
    print("\nPrinting scores for multi-class problem")
    PrintScores().print_precision_score(y_test=y_test, y_pred=predicted)
    PrintScores().print_recall_score(y_test=y_test, y_pred=predicted)
    PrintScores().print_f1_score(y_test=y_test, y_pred=predicted)
    print("\nPrinting accuracy score")
    PrintScores().print_accuracy_score(y_test=y_test, y_pred=predicted)
    PrintScores().print_confusion_matrix(y_test=y_test, y_pred=predicted)
def experimentBestModel(self, trainset, testset):
    """Train the pre-tuned "best" SVM configuration on the training split
    and report macro-F1 on the held-out test split.

    Fixes: removed dead leftovers copied from the cross-validation
    experiment (``dev_sets``, ``best_accuracy``, ``best_classifier``)
    that were assigned but never read.

    NOTE(review): ``testset`` is unused — evaluation happens on the
    split carved out of ``trainset``.
    """
    print("Reading data")
    x, y = DataService().read_corpus(trainset)
    clf = SVM().construct_best_classifier()
    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(x)
    conversion_dict, y = DataService().labels_string_to_float(y)
    x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(x_vec, y)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("F1 score (macro):",
          f1_score(y_pred=y_pred, y_true=y_test, average='macro'))
def main(params):
    """CLI dispatcher.

    ``swap <env> <db>`` re-points an environment to another database;
    ``which <env>`` prints the environment's current database.

    :param params: argv-style list; params[1] is the sub-command.
    """
    data_service = DataService()
    env_manager = EnvManager(data_service)
    command = params[1]
    if command == "swap":
        env_manager.swap_db(params[2], params[3])
        print(params[2] + " swapped to: " + params[3])
    elif command == "which":
        db = env_manager.get_current(params[2])
        msg = db["name"] + " - " + db["conf_data"]["DB_HOST"]
        print("current db: " + msg)
def experimentCombinatorialCrossValidation(self, trainset, testset):
    """Grid-search gamma and C for an RBF SVM with cross-validation over
    the training folds, then evaluate the best fold-model on the test
    split and print its macro-F1.

    Bug fixed: results were stored as ``cv_results[[C, gamma]]`` — a
    list is unhashable, so the very first iteration raised TypeError.
    A tuple key is used instead.  The loop variable ``set`` was also
    renamed to ``fold`` to stop shadowing the builtin.
    """
    print("Reading data")
    x, y = DataService().read_corpus(trainset)
    clf = SVM().construct_classifier("linear", 1.0)
    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(x)
    conversion_dict, y = DataService().labels_string_to_float(y)
    x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(x_vec, y)
    dev_sets = DataService().cross_validation_split(x_train, y_train)
    best_accuracy = -inf
    best_classifier = None
    cv_results = {}
    for gamma in arange(0.5, 1.4, 0.15):
        for C in arange(0.5, 2.1, 0.25):
            print("\nProcessing Gamma:", gamma, "C:", C)
            average_score = []
            for fold in dev_sets:
                clf = SVM().construct_rbf_classifier(kernel='rbf',
                                                     gamma=gamma, C=C)
                validation_set = fold
                union_set = DataService().construct_union_set(
                    fold.copy(), dev_sets.copy())
                # fit on the rest of the data
                clf.fit(union_set[0], union_set[1])
                # validate on validation set
                y_pred = clf.predict(validation_set[0])
                score = f1_score(y_true=validation_set[1], y_pred=y_pred,
                                 average='binary')
                average_score.append(score)
            score = round(mean(average_score), 3)
            # Tuple key: the original list key ([C, gamma]) is unhashable.
            cv_results[(C, gamma)] = score
            print("Average F1 score for C:", str(C) + ".", score)
            # save the best model and use that to classify the testset
            if score > best_accuracy:
                best_accuracy = score
                best_classifier = clf
    y_pred = best_classifier.predict(x_test)
    print("F1 score (macro):",
          f1_score(y_pred=y_pred, y_true=y_test, average='macro'))
def experimentLinearKernel(self, trainset, testset):
    """Cross-validate a linear-kernel SVM over a range of C values and
    evaluate the best configuration on the test split.

    Bug fixed: the selection score was taken from ``average_score2``, a
    list that was never filled — ``mean([])`` yields NaN, every NaN
    comparison is False, so ``best_classifier`` stayed ``None`` and the
    final ``predict`` crashed with AttributeError.  The score now comes
    from the fold scores that are actually collected; the dead
    ``cv_results2`` dict was removed along with it.
    """
    print("Reading data")
    x, y = DataService().read_corpus(trainset)
    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(x)
    conversion_dict, y = DataService().labels_string_to_float(y)
    x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(x_vec, y)
    dev_sets = DataService().cross_validation_split(x_train, y_train)
    best_accuracy = -inf
    best_classifier = None
    cv_results = {}
    for C in arange(0.5, 2.25, 0.25):
        print("\nProcessing C:", C)
        fold_scores = []
        for fold in dev_sets:
            clf = SVM().construct_linear_classifier(penalty='l2', C=C)
            validation_set = fold
            union_set = DataService().construct_union_set(
                fold.copy(), dev_sets.copy())
            # fit on the rest of the data
            clf.fit(union_set[0], union_set[1])
            # validate on validation set
            y_pred = clf.predict(validation_set[0])
            score = f1_score(y_true=validation_set[1], y_pred=y_pred,
                             average='binary')
            fold_scores.append(score)
        cv_results[C] = mean(fold_scores)
        score = round(mean(fold_scores), 3)
        print("Average F1 score for C:", str(C) + ".", score)
        # save the best model and use that to classify the testset
        if score > best_accuracy:
            best_accuracy = score
            best_classifier = clf
    y_pred = best_classifier.predict(x_test)
    print("F1 score (macro):",
          f1_score(y_pred=y_pred, y_true=y_test, average='macro'))
search['values'] = self.populateMenu(self.deptList) # menuChoices = ['Dept 1', 'Dept 2'] # tkvar.set(menuChoices[0]) print(menuChoices[1]) # popupMenu = OptionMenu(root, tkvar, *menuChoices) Label(root, text="Choose a Dept").place(x=50, y=250) # popupMenu.place(x=50, y=300) search.place(x=50, y=250) sumText = Label(self, text=("Dept Total" + (self.getDeptTotal))) sumText.place(x=200, y=550) def change_dropdown(*args): print(self.tkvar.get()) # link function to change dropdown self.tkvar.trace('w', change_dropdown) print(filename) def getDeptTotal(self): ws = self.currentFile["Sheet1"] dept = self.tkvar.get() total = DataService.sumDept(dept, ws) return str(total) root = Tk() root.geometry("800x600") DataService = DataService() app = Window(root) app.mainloop()
def experimentFeatures(self, trainset, testset):
    """Feature-analysis experiment: train a linear SVM, extract the 200
    strongest positive and 200 strongest negative features from its
    coefficients, rebuild the corpus restricted to those features, then
    retrain and re-evaluate on the trimmed data.

    Fixes: the first "Fitting training data on N Samples" message
    reported ``len(x_dev_train)`` although the model is fitted on
    ``x_train``; the unused ``non_zero`` accumulators were removed.

    NOTE(review): ``testset`` and ``conversion_dict`` are unused.
    """
    print("Reading data")
    x, y = DataService().read_corpus(trainset)
    clf = SVM().construct_classifier("linear", 1.0)
    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(x)
    conversion_dict, y = DataService().labels_string_to_float(y)
    x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(x_vec, y)
    x_dev_train, y_dev_train, x_dev_test, y_dev_test = DataService(
    ).test_train_split(x_dev, y_dev)
    start_time = datetime.utcnow()
    # Message corrected to match the data actually fitted (x_train).
    print('Fitting training data on', len(x_train), 'Samples')
    clf.fit(x_train, y_train)
    training_time = (datetime.utcnow() - start_time).seconds
    print("Training took", training_time, 'seconds..')
    y_pred = clf.predict(x_dev_test)
    print("Accuracy score:", accuracy_score(y_pred=y_pred, y_true=y_dev_test))
    print("F1 score (macro):", f1_score(y_pred=y_pred, y_true=y_dev_test,
                                        average='macro'))

    # Rank every vocabulary feature by the weight the linear model
    # assigned to it.
    coef = clf.coef_

    def identity(x):
        # The corpus is pre-tokenized, so the vectorizer must not
        # re-process it.
        return x

    vec = TfidfVectorizer(preprocessor=identity, tokenizer=identity)
    vec.fit_transform(x)
    names = vec.get_feature_names()
    coefs_and_features = list(zip(coef[0], names))
    list_sorted_pos = sorted(coefs_and_features, key=lambda p: p[0],
                             reverse=True)
    list_sorted_neg = sorted(coefs_and_features, key=lambda p: p[0])
    # Keep the 200 most positive and 200 most negative features.
    features = [list_sorted_pos[i][1] for i in range(200)]
    features += [list_sorted_neg[i][1] for i in range(200)]
    print("\nneg", list_sorted_neg[:100], "\npos", list_sorted_pos[:100])

    # Rebuild the corpus restricted to the selected features and retrain
    # a fresh classifier on it.
    new_data = DataService().get_features_from_data(x, features)
    clf2 = SVM().construct_classifier("linear", 1.0)
    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(new_data)
    conversion_dict, y = DataService().labels_string_to_float(y)
    x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(x_vec, y)
    x_dev_train, y_dev_train, x_dev_test, y_dev_test = DataService(
    ).test_train_split(x_dev, y_dev)
    start_time = datetime.utcnow()
    print("\nTRIMMED DATA SET\n----------")
    print('Fitting training data on', len(x_dev_train), 'Samples')
    clf2.fit(x_dev_train, y_dev_train)
    training_time = (datetime.utcnow() - start_time).seconds
    print("Training took", training_time, 'seconds..')
    y_pred = clf2.predict(x_dev_test)
    print("Accuracy score:", accuracy_score(y_pred=y_pred, y_true=y_dev_test))
    print("F1 score (macro):", f1_score(y_pred=y_pred, y_true=y_dev_test,
                                        average='macro'))
def __init__(self, window, dataservice):
    """Store the window handle and the data service.

    Bug fixed: the injected ``dataservice`` argument was ignored and a
    fresh ``DataService()`` was always constructed, defeating dependency
    injection (and any test double a caller passed in).  The provided
    service is now used; a new ``DataService()`` is built only when the
    caller passes None, keeping old call sites working.

    :param window: the owning window/widget handle.
    :param dataservice: data-access service to use; None for a default.
    """
    self.window = window
    self.dataservice = dataservice if dataservice is not None else DataService()
from DataService import DataService
from RecommenderEngine import RecommenderEngine
from Server import Server

# Application entry point: wire the data layer into the recommender
# engine and expose both through the server.
data = DataService()
recommender = RecommenderEngine(data)
server = Server(data, recommender)
# Presumably blocks here serving requests — NOTE(review): consider an
# `if __name__ == "__main__":` guard so importing this module does not
# start the server.
server.run()
def __init__(self):
    """Initialise the base table model and attach a fresh DataService
    instance as its backing service."""
    ModelTable.__init__(self)
    self.usr_set_service(DataService())
#!/usr/bin/python import sys from tkinter import * from DataService import DataService from EnvManager import EnvManager from GuiService import GuiService master = Tk() data_service = DataService() env_manager = EnvManager(data_service) gui_service = GuiService(master, env_manager) configs = data_service.get_db_opts() values = [x["name"] for x in configs] envs = data_service.get_envs() gui_service.create_main_menu() for i, item in enumerate(envs): item["current"] = env_manager.get_current(item["name"])["name"] gui_service.init_env_menu(item, values, i) Button(master, text='Save', command=lambda: gui_service.save_changes(envs)).grid(row=len(envs), column=1, sticky=W, pady=4) Button(master, text='Reload', command=lambda: gui_service.reload_state(envs)).grid(row=len(envs), column=2,