def main():
    """Import a pending dataset if one is present, then evaluate all models."""
    pending_import = "data/import_me.tsv"

    # The presence of this file signals that a new dataset has to be imported
    if os.path.exists(pending_import):
        print("Importing new dataset")
        CSVHandler.import_new_dataset()

    # Hyperparameter tuning is currently disabled:
    # evaluate_best_parameters()
    evaluate_best_model()
def evaluate_best_TFIDF_parameters():
    """
    Try every combination of TF-IDF vectorizer settings for each algorithm.

    For each entry of ``algorithms_list`` all combinations of max_features,
    min_df and max_df are passed to ``Algorithm.run``. After each algorithm
    the collected results are printed sorted by score (descending) and the
    top entry is remembered. The remembered entries are printed at the end.
    """
    corpus = CSVHandler.get_document("normal", 2, True)
    vectorizer_name = "TF IDF"

    # Parameter grid
    max_features_grid = [1500, 1000, 700, 2000]
    min_df_grid = [5, 2, 6, 9]
    max_df_grid = [0.5, 0.6, 0.7, 0.8, 0.9]

    label_template = ("text-mode: 'normal', 2-grams, reduced-categories: True, "
                      "max_features: {}, min_df: {}, max_df: {}")

    results = []
    winners = []

    for entry in algorithms_list:
        for max_features in max_features_grid:
            for min_df in min_df_grid:
                for max_df in max_df_grid:
                    label = label_template.format(max_features, min_df, max_df)
                    estimator = entry["algorithm"]
                    outcome = Algorithm.run(corpus, estimator, vectorizer_name, label, True, False,
                                            max_features, min_df, max_df, 0, 0, 0)
                    results.append(outcome)

        # NOTE(review): ``results`` is not reset between algorithms, so the
        # overview and the "best" entry below also cover all earlier
        # algorithms - confirm that this is intended.
        print("\n\n\n\nOverview:")
        results.sort(key=lambda item: (-item[1], item[0]))
        for item in results:
            print(item)

        print("\nBest model:")
        top = max(results, key=itemgetter(1))
        print(top)
        winners.append(top)

    print("Best scores:")
    print(winners)
def evaluate_best_model():
    """
    Run every configured model/preprocessing combination and print a ranking.

    Iterates over all reduced-category settings, text modes, n-gram sizes,
    feature models and algorithms, calls ``Algorithm.run`` for each
    combination and prints the returned results sorted by score (descending),
    followed by the entry with the highest score.
    """
    # Keys of an ``algorithms_list`` entry, in the order ``Algorithm.run`` expects them
    vectorizer_keys = (
        "tfidf_max_features", "tfidf_min_df", "tfidf_max_df",
        "bow_max_features", "bow_min_df", "bow_max_df",
    )

    results = []
    for reduced in reduced_categories_possibilities:
        for mode in text_modes:
            for n_gram in possible_n_grams:
                # The documents only depend on the three outer loop variables
                corpus = CSVHandler.get_document(mode, n_gram, reduced)
                label = "text-mode: '{}', {}-grams, reduced-categories: {}".format(mode, n_gram, reduced)

                for vectorizer_name in feature_models:
                    for config in algorithms_list:
                        estimator = config["algorithm"]
                        vectorizer_args = [config[key] for key in vectorizer_keys]
                        results.append(Algorithm.run(corpus, estimator, vectorizer_name, label,
                                                     True, False, *vectorizer_args))

    # Ranking: highest score first, ties ordered by identifier
    print("\n\n\n\nOverview:")
    results.sort(key=lambda entry: (-entry[1], entry[0]))
    for entry in results:
        print(entry)

    # "Best" here only considers accuracy, which is not the deciding metric for this project!
    print("\nBest model:")
    print(max(results, key=itemgetter(1)))
def runUpdateCheck(option, server, type, scope=None, path=None):
    """
    Load two data sets according to ``option`` and hand them to getUpdateInfo.output.

    Supported options and where the two data sets come from:

    * ``csv2csv`` - two CSV locations (``path`` is a comma separated pair)
    * ``csv2db``  - one CSV location and the database ``server``
    * ``xml2xml`` - two XML locations (``path`` is a comma separated pair)
    * ``xml2db``  - one XML location and the database ``server``
    * ``db2db``   - two databases (``server`` is a comma separated pair)

    Comparisons against a database use the keyword ``'install'``, all other
    comparisons use ``'upgrade'``.

    :param option: One of the option names listed above.
    :param server: Database identifier, or two comma separated identifiers for ``db2db``.
    :param type: Kind of data to compare; appended to every path and used as key into
                 the result of getUpdateInfo.getData. The value ``'report'`` switches
                 getUpdateInfo.getXML into report mode.
    :param scope: Not used by this function.
    :param path: Base path, or two comma separated base paths for ``csv2csv``/``xml2xml``.
    :raises ValueError: If ``option`` is not one of the supported values.
    """
    keywords = {
        'csv2csv': 'upgrade',
        'csv2db': 'install',
        'xml2xml': 'upgrade',
        'xml2db': 'install',
        'db2db': 'upgrade',
    }
    # Fail early with a clear message instead of hitting an unbound variable further down
    if option not in keywords:
        raise ValueError('Unsupported option: %r' % (option,))
    keyWord = keywords[option]

    myPath = ''
    if path:
        if option in ('csv2csv', 'xml2xml'):
            # Two comma separated locations, each extended by the data type
            myPath = [item + '\\' + type for item in path.split(',')]
        else:
            myPath = path + '\\' + type

    report = type == 'report'

    if option == 'csv2csv':
        myData1 = CSVHandler.getData(myPath[0])
        myData2 = CSVHandler.getData(myPath[1])
    elif option == 'csv2db':
        myData1 = CSVHandler.getData(myPath)
        myrawData = getUpdateInfo.getData(server, type)
        myData2 = myrawData[type]
    elif option == 'xml2xml':
        myData1 = getUpdateInfo.getXML(path=myPath[0], report=report)
        myData2 = getUpdateInfo.getXML(path=myPath[1], report=report)
    elif option == 'xml2db':
        myData1 = getUpdateInfo.getXML(path=myPath, report=report)
        myrawData = getUpdateInfo.getData(server, type)
        # Fixed: this used to read the undefined name ``myData``
        myData2 = myrawData[type]
    else:  # 'db2db'
        db = server.split(',')
        myrawData1 = getUpdateInfo.getData(db[0], type)
        myrawData2 = getUpdateInfo.getData(db[1], type)
        myData1 = myrawData1[type]
        myData2 = myrawData2[type]

    getUpdateInfo.output(myData1, myData2, type, keyWord)
    # Parenthesised single argument prints identically on Python 2 and 3
    print('\nDone')
def post(self):
    """
    Store the file sent with the request.

    The upload is read from ``request.files`` under the key ``Const.FILE + '[0]'``,
    the request is logged and the file is handed to ``csvh.add_file``.

    :return: The first two elements of the result of ``csvh.add_file``
             (success or fail; presumably response body and status code - confirm).
    """
    # Get file from request content
    fin = request.files[Const.FILE + '[0]']

    # Read the name from the upload object itself. Parsing ``str(fin)`` and splitting
    # on apostrophes returns a wrong name as soon as the file name contains a quote
    # or characters that repr() escapes.
    file_name = fin.filename or ''

    fu.log(fu.get_current_time() + '[' + Const.SEND_CSV + ' ' + request.method
           + '] Received request to store csv file:' + file_name + '\n')

    # Store file
    result = csvh.add_file(fin, file_name)
    return result[0], result[1]
def getAutoParams(self):
    """
    Return ``self.auto_params``, loading it on first use.

    While ``self.auto_params`` is falsy it is (re)loaded through
    ``CSVHandler.getDataFromFile('incidentData', csv_path, None, None)``.
    """
    # Already loaded: hand back the stored value
    if self.auto_params:
        return self.auto_params

    self.auto_params = CSVHandler.getDataFromFile(
        'incidentData', csv_path, None, None)
    return self.auto_params
def evaluate_best_parameters():
    """Run the random forest hyperparameter tuning on the 'normal' 2-gram documents with TF IDF features."""
    text_mode, n_grams, reduced_categories = "normal", 2, True
    corpus = CSVHandler.get_document(text_mode, n_grams, reduced_categories)
    algorithms.hyperparameter_tuning__random_forest(
        feature_model="TF IDF",
        documents=corpus,
    )
def getCVSData(self):
    """Fetch the data returned by ``CSVHandler.getData()`` and store it in ``self.csvEventType``."""
    self.csvEventType = CSVHandler.getData()
def _prediction_file_name(identifier):
    """Derive a file name (without extension) from a run identifier."""
    return identifier.replace(" ", "_").replace("'", "").replace(":", "").replace(",", "_")


def _write_predictions(path, rows):
    """Write prediction rows to ``path`` as a '|'-delimited CSV file with a header row."""
    with open(path, 'w') as out:
        csv_out = csv.writer(out, delimiter="|")
        csv_out.writerow(['test_set', 'prediction'])
        csv_out.writerows(rows)


def run(documents, model, feature_model, identifier_addition, write_output, predict_uncategorized,
        tfidf_max_features, tfidf_min_df, tfidf_max_df, bow_max_features, bow_min_df, bow_max_df):
    """
    Run training and 10-fold cross validation on the given model as well as out-of-sample
    predictions and print the results. Depending on parametrization also writes the
    predictions to a CSV file named after the identifier.

    The last 200 samples are held back as out-of-sample data; the cross validation and the
    final fit only use the remaining samples.

    :param documents: A list of preprocessed documents used for training and testing the model.
    :param model: The sklearn model to be trained and tested (e.g. RandomForestClassifier)
    :param feature_model: A String being either 'Bag of Words' or 'TF IDF' used for vectorizing
                          the document.
    :param identifier_addition: A String of additional information to add to the identifier in
                                order to make it distinguishable and used for file name.
    :param write_output: A boolean value whether or not to write the predictions into a CSV
                         file. If predict_uncategorized is set to True, the data used for
                         prediction is the remaining data entries which are not categorized.
                         If it is set to False, it will use out-of-sample data (which is
                         categorized) for the prediction.
    :param predict_uncategorized: A boolean value used in combination with write_output whether
                                  or not to predict on uncategorized data. Is only showing
                                  results when write_output is true.
    :param tfidf_max_features: An Integer value passed to TfidfVectorizer.
    :param tfidf_min_df: A Float value passed to TfidfVectorizer.
    :param tfidf_max_df: A Float value passed to TfidfVectorizer.
    :param bow_max_features: An Integer value passed to CountVectorizer.
    :param bow_min_df: A Float value passed to CountVectorizer.
    :param bow_max_df: A Float value passed to CountVectorizer.
    :return: A tuple consisting of the identifier and the mean score of the 10-fold cross
             validation.
    """
    identifier = "Algorithm: '{}', feature-model: '{}', {}".format(model.__class__.__name__, feature_model,
                                                                  identifier_addition)
    print("\n\nRunning: '{}'".format(identifier))

    X, y, docs = get_X_and_y(documents, feature_model, True, tfidf_max_features, tfidf_min_df, tfidf_max_df,
                             bow_max_features, bow_min_df, bow_max_df)

    # Hold back the last 200 samples as out-of-sample data
    out_of_sample_threshold = len(X) - 200
    out_of_sample_X = X[out_of_sample_threshold:]
    out_of_sample_y = y[out_of_sample_threshold:]
    X = X[:out_of_sample_threshold]
    y = y[:out_of_sample_threshold]

    k_folds = 10
    X_folds = np.array_split(X, k_folds)
    y_folds = np.array_split(y, k_folds)
    scores = []

    # Perform 10-fold cross validation: fold k is the test set, all other folds are the training set
    for k in range(k_folds):
        X_train = np.concatenate(X_folds[:k] + X_folds[k + 1:])
        y_train = np.concatenate(y_folds[:k] + y_folds[k + 1:])
        scores.append(model.fit(X_train, y_train).score(X_folds[k], y_folds[k]))

    mean_score = np.mean(scores)
    print("Average score after {} fold crossvalidation: {}".format(k_folds, mean_score))

    # Perform predictions on out-of-sample data
    print("\nOut of sample:")

    # Model must be fitted again with the whole dataset (without out-of-sample data), because
    # 10-fold cross validation overwrites the model on each iteration and 1 fold is always held
    # out for testing, thus the model is missing potentially important data for the fitting process
    model.fit(X, y)

    y_pred = model.predict(out_of_sample_X)

    print("Confusion matrix:")
    print(confusion_matrix(out_of_sample_y, y_pred))
    print("\n\nClassification report:")
    print(classification_report(out_of_sample_y, y_pred))
    print("\n\nAccuracy score:")
    accuracy = accuracy_score(out_of_sample_y, y_pred)
    print(accuracy)

    if write_output:
        if not predict_uncategorized:
            # Write out-of-sample predictions to CSV file.
            # Fixed: pair each prediction with the held-out document and label it belongs to.
            # Previously the first training documents/labels were written next to the
            # out-of-sample predictions.
            out_of_sample_docs = docs[out_of_sample_threshold:]
            prediction = list(zip(zip(out_of_sample_docs, out_of_sample_y), y_pred))
            file_name = _prediction_file_name(identifier)
            _write_predictions("data/output/" + file_name + ".csv", prediction)
        else:
            # Write new data predictions to CSV file.
            # NOTE(review): the uncategorized documents are always loaded with text_mode="normal"
            # and n=3, independent of how ``documents`` was preprocessed - confirm this is intended.
            uncategorized_documents = CSVHandler.get_document(text_mode="normal", n=3, reduced_categories=True,
                                                              categorized=False)
            X_new, _, u_docs = get_X_and_y(uncategorized_documents, feature_model, False, tfidf_max_features,
                                           tfidf_min_df, tfidf_max_df, bow_max_features, bow_min_df, bow_max_df)
            new_pred = model.predict(X_new)
            prediction = list(zip(u_docs, new_pred))
            file_name = _prediction_file_name("uncategorized_" + identifier)
            _write_predictions("data/output/uncategorized/" + file_name + ".csv", prediction)

    return (identifier, mean_score)  # Return tuple