def main():
    # Check if import is needed
    if os.path.exists("data/import_me.tsv"):
        print("Importing new dataset")
        CSVHandler.import_new_dataset()

    # evaluate_best_parameters()
    evaluate_best_model()


def evaluate_best_TFIDF_parameters():
    documents = CSVHandler.get_document("normal", 2, True)
    best_scores = []
    scores = []
    feature_model = "TF IDF"

    tfidf_max_features = [1500, 1000, 700, 2000]
    tfidf_min_df = [5, 2, 6, 9]
    tfidf_max_df = [0.5, 0.6, 0.7, 0.8, 0.9]

    for algo in algorithms_list:
        for mf in tfidf_max_features:
            for min_df in tfidf_min_df:
                for max_df in tfidf_max_df:
                    identifier_addition = "text-mode: 'normal', 2-grams, reduced-categories: True, max_features: {}, min_df: {}, max_df: {}".format(mf, min_df, max_df)
                    algorithm = algo["algorithm"]
                    scores.append(Algorithm.run(documents, algorithm, feature_model, identifier_addition, True, False, mf, min_df, max_df, 0, 0, 0))

        # Print models sorted by accuracy
        print("\n\n\n\nOverview:")
        scores = sorted(scores, key=lambda s: (-s[1], s[0]))
        for s in scores:
            print(s)

        print("\nBest model:")
        print(max(scores, key=itemgetter(1)))
        best_scores.append(max(scores, key=itemgetter(1)))

    print("Best scores:")
    print(best_scores)


def evaluate_best_model():
    scores = []

    # Go through all possible parameters and run the algorithm in order to evaluate the best model
    for reduced_categories in reduced_categories_possibilities:
        for text_mode in text_modes:
            for n in possible_n_grams:
                documents = CSVHandler.get_document(text_mode, n, reduced_categories)
                identifier_addition = "text-mode: '{}', {}-grams, reduced-categories: {}".format(text_mode, n, reduced_categories)

                for feature_model in feature_models:
                    for algorithm in algorithms_list:
                        model = algorithm["algorithm"]
                        tfidf_max_features = algorithm["tfidf_max_features"]
                        tfidf_min_df = algorithm["tfidf_min_df"]
                        tfidf_max_df = algorithm["tfidf_max_df"]
                        bow_max_features = algorithm["bow_max_features"]
                        bow_min_df = algorithm["bow_min_df"]
                        bow_max_df = algorithm["bow_max_df"]
                        scores.append(Algorithm.run(documents, model, feature_model, identifier_addition, True, False, tfidf_max_features, tfidf_min_df, tfidf_max_df, bow_max_features, bow_min_df, bow_max_df))

    # Print models sorted by accuracy
    print("\n\n\n\nOverview:")
    scores = sorted(scores, key=lambda s: (-s[1], s[0]))
    for s in scores:
        print(s)

    # Only the best model when considering accuracy, which is not the case for this project!
    print("\nBest model:")
    print(max(scores, key=itemgetter(1)))
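
The evaluation loops above iterate over module-level configuration globals (algorithms_list, feature_models, text_modes, possible_n_grams, reduced_categories_possibilities) defined elsewhere in the project. A minimal, hypothetical sketch of what these globals could look like; the dictionary keys are the ones the code actually reads, while the concrete estimators and values are illustrative assumptions only:

# Hypothetical configuration globals for the evaluation loops above.
# The keys mirror the lookups in evaluate_best_model(); the estimators and numbers are assumptions.
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

algorithms_list = [
    {"algorithm": RandomForestClassifier(n_estimators=100),
     "tfidf_max_features": 1500, "tfidf_min_df": 5, "tfidf_max_df": 0.7,
     "bow_max_features": 1500, "bow_min_df": 5, "bow_max_df": 0.7},
    {"algorithm": MultinomialNB(),
     "tfidf_max_features": 1000, "tfidf_min_df": 2, "tfidf_max_df": 0.8,
     "bow_max_features": 1000, "bow_min_df": 2, "bow_max_df": 0.8},
]
feature_models = ["Bag of Words", "TF IDF"]
text_modes = ["normal", "stemmed"]                 # assumed text preprocessing modes
possible_n_grams = [1, 2, 3]                       # assumed n-gram sizes
reduced_categories_possibilities = [True, False]
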
Example 4
def runUpdateCheck(option, server, type, scope=None, path=None):
    """Load update data from two sources (CSV, XML or database, depending on option)
    and pass both data sets to getUpdateInfo.output."""
    myPath = ''
    if path:
        if option in ['csv2csv', 'xml2xml']:
            myPath = []
            for item in path.split(','):
                myPath.append(item + '\\' + type)
        else:
            myPath = path + '\\' + type

    if type == 'report':
        report = True
    else:
        report = False

    if option == 'csv2csv':
        keyWord = 'upgrade'
        myData1 = CSVHandler.getData(myPath[0])
        myData2 = CSVHandler.getData(myPath[1])
    elif option == 'csv2db':
        keyWord = 'install'
        db = server
        myData1 = CSVHandler.getData(myPath)
        myrawData = getUpdateInfo.getData(db, type)
        myData2 = myrawData[type]
    elif option == 'xml2xml':
        keyWord = 'upgrade'
        myData1 = getUpdateInfo.getXML(path=myPath[0], report=report)
        myData2 = getUpdateInfo.getXML(path=myPath[1], report=report)
    elif option == 'xml2db':
        keyWord = 'install'
        db = server
        myData1 = getUpdateInfo.getXML(path=myPath, report=report)
        myrawData = getUpdateInfo.getData(db, type)
        myData2 = myrawData[type]
    elif option == 'db2db':
        keyWord = 'upgrade'
        db = server.split(',')
        myrawData1 = getUpdateInfo.getData(db[0], type)
        myrawData2 = getUpdateInfo.getData(db[1], type)
        myData1 = myrawData1[type]
        myData2 = myrawData2[type]

    getUpdateInfo.output(myData1, myData2, type, keyWord)
    print('\nDone')
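
A hypothetical invocation of runUpdateCheck() for the csv2csv case; the directory names and the 'report' type value are placeholders rather than values from the original project:

# Hypothetical usage: compare two exported CSV trees (paths and type are placeholders)
runUpdateCheck(option='csv2csv',
               server=None,
               type='report',
               path='C:\\old_export,C:\\new_export')
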
    def post(self):
        """
        Store sent file
        :return: Success or fail
        """
        # Get file from request content
        fin = request.files[Const.FILE + '[0]']
        file_name = str(fin).split('\'')[1].split('\'')[0]
        fu.log(fu.get_current_time() + '[' + Const.SEND_CSV + ' ' +
               request.method + '] Received request to store csv file:' +
               file_name + '\n')
        # Store file
        result = csvh.add_file(fin, file_name)
        return result[0], result[1]
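
The post() handler above reads the upload from flask.request and returns a (body, status) pair, which matches the Flask-RESTful resource pattern. A minimal sketch of how such a handler might be exposed, assuming it lives on a Flask-RESTful Resource; the class name CsvUpload and the route '/csv' are illustrative assumptions:

from flask import Flask
from flask_restful import Api, Resource

app = Flask(__name__)
api = Api(app)

class CsvUpload(Resource):  # hypothetical resource name
    def post(self):
        ...  # handler body as in the snippet above

api.add_resource(CsvUpload, '/csv')  # hypothetical route

if __name__ == '__main__':
    app.run()
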
Example 6
    def getAutoParams(self):
        # Load the incident data from the CSV file on first access and cache it for later calls
        if not self.auto_params:
            self.auto_params = CSVHandler.getDataFromFile(
                'incidentData', csv_path, None, None)

        return self.auto_params
def evaluate_best_parameters():
    documents = CSVHandler.get_document("normal", 2, True)

    algorithms.hyperparameter_tuning__random_forest(documents=documents, feature_model="TF IDF")
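
evaluate_best_parameters() delegates the actual tuning to algorithms.hyperparameter_tuning__random_forest, whose implementation is not shown here. A minimal sketch of one way such a helper could look, assuming it reuses get_X_and_y from the run() function further below; the parameter grid and vectorizer settings are illustrative assumptions:

# Hypothetical sketch of a random forest tuning helper built on sklearn's RandomizedSearchCV.
# The parameter ranges and the vectorizer settings passed to get_X_and_y are assumptions.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

def hyperparameter_tuning__random_forest(documents, feature_model):
    X, y, _ = get_X_and_y(documents, feature_model, True, 1500, 5, 0.7, 0, 0, 0)
    param_distributions = {
        "n_estimators": [100, 300, 500],
        "max_depth": [None, 10, 30],
        "min_samples_split": [2, 5, 10],
    }
    search = RandomizedSearchCV(RandomForestClassifier(), param_distributions,
                                n_iter=10, cv=10, scoring="accuracy", random_state=42)
    search.fit(X, y)
    print("Best parameters:", search.best_params_)
    print("Best cross-validation accuracy:", search.best_score_)
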
Example 8
    def getCVSData(self):
        param = CSVHandler.getData()
        self.csvEventType = param
Example 9
def run(documents, model, feature_model, identifier_addition, write_output, predict_uncategorized, tfidf_max_features, tfidf_min_df, tfidf_max_df, bow_max_features, bow_min_df, bow_max_df):
    """
    Run training and 10-fold cross validation on the given model, perform out-of-sample predictions, and print the
    results. Depending on the parametrization, the predictions are also written to a CSV file named after the identifier.

    :param documents: A list of preprocessed documents used for training and testing the model.

    :param model: The sklearn model to be trained and tested (e.g. RandomForestClassifier)

    :param feature_model: A String, either 'Bag of Words' or 'TF IDF', selecting how the documents are vectorized.

    :param identifier_addition: A String of additional information appended to the identifier to make it
                                distinguishable; it is also used for the output file name.

    :param write_output: A Boolean indicating whether to write the predictions to a CSV file. If predict_uncategorized
                         is set to True, the predictions are made on the remaining data entries which are not yet categorized.
                         If it is set to False, the (categorized) out-of-sample data is used for the prediction.

    :param predict_uncategorized: A Boolean, used in combination with write_output, indicating whether to predict
                                  on uncategorized data. It only has an effect when write_output is True.

    :param tfidf_max_features: An Integer value passed to TfidfVectorizer. See TfidfVectorizer documentation for more information.

    :param tfidf_min_df: A Float value passed to TfidfVectorizer. See TfidfVectorizer documentation for more information.

    :param tfidf_max_df: A Float value passed to TfidfVectorizer. See TfidfVectorizer documentation for more information.

    :param bow_max_features: An Integer value passed to CountVectorizer. See CountVectorizer documentation for more information.

    :param bow_min_df: A Float value passed to CountVectorizer. See CountVectorizer documentation for more information.

    :param bow_max_df: A Float value passed to CountVectorizer. See CountVectorizer documentation for more information.

    :return: A tuple consisting of the identifier and the mean score of the 10-fold cross validation.
    """

    identifier = "Algorithm: '{}', feature-model: '{}', {}".format(model.__class__.__name__, feature_model, identifier_addition)
    print("\n\nRunning: '{}'".format(identifier))

    X, y, docs = get_X_and_y(documents, feature_model, True, tfidf_max_features, tfidf_min_df, tfidf_max_df, bow_max_features, bow_min_df, bow_max_df)
    # Hold out the last 200 entries as an out-of-sample test set
    out_of_sample_threshold = len(X) - 200

    out_of_sample_X = X[out_of_sample_threshold:]
    X = X[:out_of_sample_threshold]

    out_of_sample_y = y[out_of_sample_threshold:]
    y = y[:out_of_sample_threshold]

    k_folds = 10
    X_folds = np.array_split(X, k_folds)
    y_folds = np.array_split(y, k_folds)
    scores = []

    # Perform 10-fold cross validation
    for k in range(k_folds):
        X_train = list(X_folds)
        X_test = X_train.pop(k)
        X_train = np.concatenate(X_train)

        y_train = list(y_folds)
        y_test = y_train.pop(k)
        y_train = np.concatenate(y_train)

        scores.append(model.fit(X_train, y_train).score(X_test, y_test))

    print("Average score after {} fold crossvalidation: {}".format(k_folds, np.mean(scores)))

    # Perform predictions on out-of-sample data
    print("\nOut of sample:")

    # The model must be fitted again with the whole training set (excluding the out-of-sample data), because the
    # 10-fold cross validation refits the model on each iteration and one fold is always held out for testing,
    # so the last fitted model is missing potentially important data
    model.fit(X, y)

    X_pred = out_of_sample_X
    y_test = out_of_sample_y

    y_pred = model.predict(X_pred)

    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\n\nClassification report:")
    print(classification_report(y_test, y_pred))

    print("\n\nAccuracy score:")
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)

    # Write out-of-sample predictions to CSV file
    if write_output and not predict_uncategorized:
        out_of_sample_x_data = docs[out_of_sample_threshold:]
        out_of_sample_y_data = out_of_sample_y
        prediction = list(zip(zip(out_of_sample_x_data, out_of_sample_y_data), y_pred))

        # Write to CSV
        file_name = identifier.replace(" ", "_").replace("'", "").replace(":", "").replace(",", "_")
        with open("data/output/" + file_name + ".csv", 'w') as out:
            csv_out = csv.writer(out, delimiter="|")
            csv_out.writerow(['test_set', 'prediction'])
            for row in prediction:
                csv_out.writerow(row)

    # Write new data predictions to CSV file
    elif write_output and predict_uncategorized:
        uncategorized_documents = CSVHandler.get_document(text_mode="normal", n=3, reduced_categories=True, categorized=False)

        X_pred, _, u_docs = get_X_and_y(uncategorized_documents, feature_model, False, tfidf_max_features, tfidf_min_df, tfidf_max_df, bow_max_features, bow_min_df, bow_max_df)
        y_pred = model.predict(X_pred)
        uncategorized_identifier = "uncategorized_" + identifier
        prediction = list(zip(u_docs, y_pred))

        # Write to CSV
        file_name = uncategorized_identifier.replace(" ", "_").replace("'", "").replace(":", "").replace(",", "_")
        with open("data/output/uncategorized/" + file_name + ".csv", 'w') as out:
            csv_out = csv.writer(out, delimiter="|")
            csv_out.writerow(['test_set', 'prediction'])
            for row in prediction:
                csv_out.writerow(row)

    return (identifier, np.mean(scores))  # (identifier, mean 10-fold cross validation score)
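
A hypothetical call to run() with a random forest on TF-IDF features; the vectorizer settings mirror values explored in evaluate_best_TFIDF_parameters() above and are assumptions, not tuned results:

# Hypothetical usage of run(): evaluate a random forest on TF-IDF features and keep the mean CV score.
from sklearn.ensemble import RandomForestClassifier

documents = CSVHandler.get_document("normal", 2, True)
identifier, mean_score = run(documents,
                             RandomForestClassifier(n_estimators=100),
                             "TF IDF",
                             "text-mode: 'normal', 2-grams, reduced-categories: True",
                             False,          # write_output
                             False,          # predict_uncategorized
                             1500, 5, 0.7,   # TF-IDF settings (assumed)
                             0, 0, 0)        # Bag of Words settings (passed as 0, following evaluate_best_TFIDF_parameters)
print(identifier, mean_score)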