def test_svm_configurations(kernels: list,
                            c_values: list,
                            X_train: pd.DataFrame,
                            y_train: pd.Series,
                            X_cv_set: pd.DataFrame,
                            y_cv_set: pd.Series,
                            printConfusionMatrices: bool = False):
    """Fit an SVC for every kernel / C combination and pick the best one.

    Each combination is fitted on the training data and scored with
    ``computeAccuracy`` on both the training and the cross-validation set.

    Args:
        kernels: Kernel names handed to ``svm.SVC(kernel=...)``.
        c_values: Values handed to ``svm.SVC(C=...)``.
        X_train: Training features.
        y_train: Training labels.
        X_cv_set: Cross-validation features.
        y_cv_set: Cross-validation labels.
        printConfusionMatrices: When true, print every configuration followed
            by its cross-validation confusion matrix.

    Returns:
        A tuple ``(svm_values, best_svm)``: the table of all evaluated
        configurations (one row each, in evaluation order) and a new,
        *unfitted* ``svm.SVC`` configured with the kernel and C value of the
        row ranking first by "CV set accuracy".
    """

    def score_configuration(kernel, c_value):
        """Fit one classifier; return (train acc., CV acc., CV predictions)."""
        classifier = svm.SVC(kernel=kernel,
                             C=c_value,
                             gamma="scale",
                             cache_size=1000)
        classifier.fit(X_train, y_train)
        train_predictions = pd.Series(classifier.predict(X_train))
        cv_predictions = pd.Series(classifier.predict(X_cv_set))
        train_accuracy = computeAccuracy(train_predictions, y_train)
        cv_accuracy = computeAccuracy(cv_predictions, y_cv_set)
        return train_accuracy, cv_accuracy, cv_predictions

    svm_values = pd.DataFrame(columns=[
        "Kernel", "C value", "Training set accuracy", "CV set accuracy"
    ])

    for kernel in kernels:
        for c_value in c_values:
            train_accuracy, cv_accuracy, cv_predictions = score_configuration(
                kernel, c_value)

            # Rows are labelled 0, 1, 2, ... so the current length is always
            # the next free label.
            svm_values.loc[len(svm_values)] = [
                kernel, c_value, train_accuracy, cv_accuracy
            ]

            if printConfusionMatrices:
                print("\n", [kernel, c_value])
                print(getConfusionMatrix(cv_predictions, y_cv_set))

    # Highest cross-validation accuracy first; only the top row is needed.
    ranked = svm_values.sort_values(by="CV set accuracy", ascending=False)
    best_svm_values = ranked.head(1)
    best_kernel = best_svm_values.iat[0, 0]
    best_c_value = best_svm_values.iat[0, 1]
    best_svm = svm.SVC(kernel=best_kernel,
                       C=best_c_value,
                       gamma="scale",
                       cache_size=500)

    return svm_values, best_svm
# Example #2
# 0
def main():
    """Train, tune and evaluate SVM classifiers on the configured dataset.

    Steps (letters follow the exercise statement):
      a) Load the spreadsheet at ``DEFAULT_FILEPATH``, drop incomplete rows
         and the "tvdlm" column, min-max scale it and split it into
         training, cross-validation and test sets.
      b) Fit an SVC with (explicitly spelled out) default parameters and
         measure its accuracy on the training and cross-validation sets.
      c) Evaluate a grid of kernels and C values with
         ``test_svm_configurations``, fit the winning configuration on the
         training set and measure its accuracy on the held-out test set.

    Prints the runtime of steps b) and c) followed by the collected results.
    Returns nothing.
    """
    # a)  Divide dataset randomly into training and evaluation set
    dataset = pd.read_excel(DEFAULT_FILEPATH)
    dataset = dataset.dropna()
    dataset = dataset.drop(
        "tvdlm", axis=1)  # Drop tvdlm column which does not add information
    dataset_scaled = scale_dataset(dataset=dataset,
                                   objective=DEFAULT_OBJECTIVE,
                                   scaling_type="minmax")

    train, testing_sets = divide_in_training_test_datasets(
        dataset_scaled, train_pctg=DEFAULT_TRAIN_PCTG)
    # The CV share is expressed relative to what is left after removing the
    # training share, hence the division.
    cv_set, test = divide_in_training_test_datasets(
        testing_sets, train_pctg=DEFAULT_CV_PCTG / (1 - DEFAULT_TRAIN_PCTG))

    X_train, y_train = separate_dataset_objective_data(train,
                                                       DEFAULT_OBJECTIVE)
    X_cv_set, y_cv_set = separate_dataset_objective_data(
        cv_set, DEFAULT_OBJECTIVE)
    X_test, y_test = separate_dataset_objective_data(test, DEFAULT_OBJECTIVE)

    # b)  Classify categorical variable "sigdz" using default SVC SVM
    default_start = datetime.datetime.now()
    c_value1 = 1
    kernel1 = "rbf"
    clf1 = svm.SVC(
        kernel=kernel1, gamma='scale', C=c_value1
    )  # using default parameters, written down for illustrative purposes
    clf1.fit(X_train, y_train)

    # Predict each set once and reuse the predictions for every metric.
    predictions_train1 = pd.Series(clf1.predict(X_train))
    predictions_cv1 = pd.Series(clf1.predict(X_cv_set))

    confusion_matrix = getConfusionMatrix(predictions_cv1, y_cv_set)
    accuracy_train1 = computeAccuracy(predictions_train1, y_train)
    accuracy_cv1 = computeAccuracy(predictions_cv1, y_cv_set)

    data_default_svm = pd.DataFrame(columns=[
        "Kernel", "C value", "Training set accuracy", "CV set accuracy"
    ])
    data_default_svm.loc[0] = [
        kernel1, c_value1, accuracy_train1, accuracy_cv1
    ]

    default_end = datetime.datetime.now()
    print("Runtime Default SVM fitting and testing: ",
          divmod((default_end - default_start).total_seconds(), 60), "\n")

    # c)  Evaluate different values for C and different kernels to find best performing classifier
    kernels = ["rbf", "poly", "linear", "sigmoid"]
    c_values = list(np.logspace(-3, 2, 6))
    svm_values, best_svm = test_svm_configurations(kernels, c_values, X_train,
                                                   y_train, X_cv_set, y_cv_set)

    grid_end = datetime.datetime.now()
    print("\n\nRuntime parameter and kernel testing: ",
          divmod((grid_end - default_end).total_seconds(), 60), "\n")

    # Calculate real performance on test set (data never used for tuning)
    best_svm.fit(X_train, y_train)
    predictions_best_clf = pd.Series(best_svm.predict(X_test))
    winner_test_accuracy = computeAccuracy(predictions_best_clf, y_test)

    # ============== Final printout ==============
    # Without this section every metric computed above would be discarded.
    print("\n========== Default SVM ==========")
    print(data_default_svm)
    print("Confusion matrix (CV set):\n", confusion_matrix)

    print("\n========== Kernel and C value testing ==========")
    print(svm_values)

    print("\n========== Best configuration ==========")
    print("Test set accuracy: ", winner_test_accuracy)
def main():
    """Compare decision trees and random forests on the configured dataset.

    Reads the tab-separated file at ``DEFAULT_FILEPATH``, splits it into a
    training and a test set, builds a decision tree and a random forest for
    both the Shannon-entropy and the Gini gain function, prints accuracy and
    confusion matrix of each model on the test set and finally plots accuracy
    against node count for the unpruned and several pruned decision trees.
    """
    objective = DEFAULT_OBJECTIVE
    training_percentage = DEFAULT_TRAIN_PCTG
    view_trees = False

    dataset = pour_titanic_dataset(pd.read_csv(DEFAULT_FILEPATH, sep="\t"))

    # =========== a) Divide data set in two parts, training and evaluation set
    train, test = divide_in_training_test_datasets(
        dataset, train_pctg=training_percentage)

    def accuracy_on(model, data):
        """Accuracy of ``model`` on ``data`` w.r.t. the objective column."""
        return computeAccuracy(model.getPredictions(data, objective),
                               data[objective])

    # =========== b), c), d) Build and plot the four models:
    # decision tree (Shannon, Gini), then random forest (Shannon, Gini)
    model_specs = (
        ("Decision Tree - Shannon:", DecisionTree, "shannon", "Shannon"),
        ("Decision Tree - Gini:", DecisionTree, "gini", "Gini"),
        ("Random Forest - Shannon:", RandomForest, "shannon", "Shannon"),
        ("Random Forest - Gini:", RandomForest, "gini", "Gini"),
    )
    headings = []
    models = []
    for heading, model_class, gain_function, plot_prefix in model_specs:
        model = model_class(train, objective=objective, gain_f=gain_function)
        model.plot(name_prefix=plot_prefix, view=view_trees)
        headings.append(heading)
        models.append(model)

    # =========== e) Confusion matrix and accuracy for every model
    predictions = [model.getPredictions(test, objective) for model in models]
    confusion_matrices = [
        getConfusionMatrix(predicted, test[objective])
        for predicted in predictions
    ]
    accuracies = [
        computeAccuracy(predicted, test[objective])
        for predicted in predictions
    ]

    print("\n\n=======================================")
    final_position = len(models) - 1
    for position, heading in enumerate(headings):
        print(heading)
        print("\tAccuracy = ", accuracies[position])
        if position == final_position:
            # The last matrix is printed without the trailing separator.
            print(confusion_matrices[position])
        else:
            print(confusion_matrices[position], "\n")

    # =========== f) Graph accuracy of decision tree vs. no. of nodes
    # Every list starts with the value of the unpruned tree; the pruned
    # variants are appended below.
    tree_shannon, tree_gini = models[0], models[1]

    shannon_test_accuracy = [accuracies[0]]
    shannon_train_accuracy = [accuracy_on(tree_shannon, train)]
    shannon_node_counts = [tree_shannon.no_of_nodes()]

    gini_test_accuracy = [accuracies[1]]
    gini_train_accuracy = [accuracy_on(tree_gini, train)]
    gini_node_counts = [tree_gini.no_of_nodes()]

    # Try different pruning variations: nine rounds, each pruning one and
    # then two branches of freshly built trees.
    for _ in range(9):
        for branches_to_prune in (1, 2):
            pruned_shannon = DecisionTree(
                train, objective=objective,
                gain_f="shannon").prune_tree(branches_to_prune)
            pruned_gini = DecisionTree(
                train, objective=objective,
                gain_f="gini").prune_tree(branches_to_prune)
            # TODO Random Forests

            shannon_test_accuracy.append(accuracy_on(pruned_shannon, test))
            shannon_train_accuracy.append(accuracy_on(pruned_shannon, train))
            gini_test_accuracy.append(accuracy_on(pruned_gini, test))
            gini_train_accuracy.append(accuracy_on(pruned_gini, train))

            shannon_node_counts.append(pruned_shannon.no_of_nodes())
            gini_node_counts.append(pruned_gini.no_of_nodes())

    # Plot one marker series per gain function and data set.
    plot_series = (
        (shannon_node_counts, shannon_test_accuracy, 'ro', "Shannon - Test"),
        (shannon_node_counts, shannon_train_accuracy, 'rx',
         "Shannon - Train"),
        (gini_node_counts, gini_test_accuracy, 'go', "Gini - Test"),
        (gini_node_counts, gini_train_accuracy, 'gx', "Gini - Train"),
    )
    for node_counts, accuracy_values, marker, legend_label in plot_series:
        plt.plot(node_counts, accuracy_values, marker, label=legend_label)
    plt.gca().legend()
    plt.xlabel("No. of nodes")
    plt.ylabel("Accuracy")
    plt.show()

    # No-op kept from the original; presumably a debugger breakpoint anchor.
    a = 1
def main():
    """Predict review star ratings with plain and distance-weighted KNN.

    Reads the semicolon-separated file at ``DEFAULT_FILEPATH``, cleans it,
    reports the mean word count of 1-star reviews, splits the data into a
    training and an evaluation set, classifies every evaluation row with
    ``knn`` (once with the last argument ``False``, once with ``True`` --
    presumably the switch for distance weighting, confirm against ``knn``)
    and prints runtime, data set information and evaluation metrics for both
    runs. Returns nothing.
    """
    # Data import and cleaning
    dataset = pd.read_csv(
        DEFAULT_FILEPATH,
        sep=';')  # review_sentiments.csv is semicolon-separated (;)
    dataset = rewrite_positives_negatives(dataset)
    dataset = delete_non_numeric_columns(dataset)
    dataset["titleSentiment"] = dataset["titleSentiment"].fillna(
        dataset["textSentiment"])  # Handle NaN

    # ========== a) Mean no. of words of reviews valued with 1 star
    one_star_ratings = dataset[dataset["Star Rating"] == 1]
    # Series.mean() yields NaN instead of raising ZeroDivisionError when the
    # data set contains no 1-star review.
    one_star_review_mean_words = one_star_ratings["wordcount"].mean()

    # ========== b) Divide data set into two parts, training and evaluation set
    training_set, evaluation_set = divide_in_training_test_datasets(
        dataset=dataset, train_pctg=TRAIN_PCTG)
    evaluation_set_without_objective, orig_ratings = separate_dataset_objective_data(
        dataset=evaluation_set, objective=DEFAULT_OBJECTIVE)

    # ========== c) Apply KNN and Weighted-distances KNN to predict review ratings (stars)
    time1 = datetime.datetime.now()
    predicted_ratings = evaluation_set_without_objective.apply(
        knn, axis=1, args=(training_set, DEFAULT_OBJECTIVE, DEFAULT_K, False))
    time2 = datetime.datetime.now()
    print("Runtime Unweighted: ", divmod((time2 - time1).total_seconds(), 60),
          "\n")

    predicted_ratings_weighted = evaluation_set_without_objective.apply(
        knn, axis=1, args=(training_set, DEFAULT_OBJECTIVE, DEFAULT_K, True))
    print("Runtime Weighted: ",
          divmod((datetime.datetime.now() - time2).total_seconds(), 60), "\n")

    # ========== d) Calculate classifier precision and confusion matrix
    # Same metric set for both classifiers, standard KNN first.
    evaluations = []
    for header, predictions in (
        ("\n========== Evaluation metrics standard KNN ==========",
         predicted_ratings),
        ("\n========== Evaluation metrics KNN with weighted distances ==========",
         predicted_ratings_weighted),
    ):
        confusion_matrix = getConfusionMatrix(predictions, orig_ratings)
        accuracy = computeAccuracy(predictions, orig_ratings)
        true_positive_rate = computeTruePositiveRate(predictions,
                                                     orig_ratings)
        precision = computePrecision(predictions, orig_ratings)
        recall = computeRecall(predictions, orig_ratings)
        f1 = f1_score(precision, recall)
        evaluations.append((header, confusion_matrix, accuracy,
                            true_positive_rate, precision, recall, f1))

    # ============== Final printout ==============
    print("\n========== Ejercicio a) ==========")
    print("Mean no. of words of 1-star-reviews:", one_star_review_mean_words)

    print("\n\n========== Data info ==========")
    print("Data set dimensions: ", dataset.shape)
    print("Training set dimensions: ", training_set.shape)
    print("Evaluation set dimensions: ", evaluation_set.shape)
    # round() instead of int(): int() truncates float artefacts such as
    # 0.29 * 100 == 28.999999999999996 down to 28.
    print("Percentage of data set used for training: ",
          round(TRAIN_PCTG * 100), "%")
    print("Classification objective: ", DEFAULT_OBJECTIVE)

    for (header, confusion_matrix, accuracy, true_positive_rate, precision,
         recall, f1) in evaluations:
        print(header)
        print("Accuracy: ", accuracy, "\n")

        print("Confusion matrix:\n", confusion_matrix)

        print("\nTrue positive rate (TP) (= Recall): ", true_positive_rate)
        print("Precision: ", precision)
        print("Recall: ", recall)
        print("F1-score: ", f1)
# Example #5
# 0
def main(data_filepath, training_percentage, keyword_amount,
         validation_amount):
    """Classify news headlines by category with a keyword-based Bayes scheme.

    The tab-separated file is split by the "categoria" column; the words of
    the first ``training_percentage`` share of "titular" entries of every
    category are scored with ``tf_idf`` and the highest scoring words become
    that category's keywords. A random sample of rows is then classified and
    confusion matrix, accuracy, precision, recall and F1-score are printed.

    Args:
        data_filepath: Path of the tab-separated input file; it must contain
            the columns "categoria" and "titular".
        training_percentage: Fraction of every category's rows (taken from
            the top) that is used for training.
        keyword_amount: Number of highest scoring words kept per category.
        validation_amount: Number of rows sampled for validation. They are
            drawn from the whole filtered data set, so they may overlap with
            training rows.

    Returns:
        None. All results are printed.
    """
    initial_time = datetime.datetime.now()

    # ============== Variable setup ==============
    path = data_filepath  # Set path containing text documents as .txt files
    no_of_keywords = keyword_amount  # how many highest scoring words on TF-IDF are selected as features
    no_of_validation_examples = validation_amount

    # ============== Get and process data ==============
    # Extract information and save in DataFrame
    objective = "categoria"
    predicted = "titular"

    data_set = pd.read_csv(path, sep="\t")
    data_set = data_set[
        data_set[objective] !=
        "Noticias destacadas"]  # Leave out massive, unspecific "Noticias destacadas" category

    # Split data set into data subsets by category
    available_classes = pd.Series(
        data_set[objective].unique()).dropna().sort_values()
    categories = {}
    for cls in available_classes:
        categories[cls] = data_set[data_set[objective] == cls]

    # Extract words from each data (sub-)set
    # TODO Consider implementing Porter stemming to reduce redundancy
    #  http://www.3engine.net/wp/2015/02/stemming-con-python/
    words_then = datetime.datetime.now()  # for measuring runtime

    words = list(
    )  # will contain words from all data subsets, each as one list element

    for category_name, category_data in categories.items():
        words_this_category = pd.DataFrame()
        training_limit = int(len(category_data) * training_percentage)
        for counter, row in enumerate(category_data[predicted]):
            if counter >= training_limit:
                break
            words_one_title = extract_words_from_text(
                text=row, prevent_uppercase_duplicates=True)
            words_one_title.columns = [
                category_name + "_" + predicted + "_" + str(counter)
            ]
            # One column per training headline
            words_this_category = pd.concat(
                [words_this_category, words_one_title], axis=1)

        words.append(words_this_category)

    print("Runtime of word parsing:",
          divmod((datetime.datetime.now() - words_then).total_seconds(), 60),
          "\n")

    # ============== Compute TF-IDF scores and, based on those, choose keywords ==============
    then = datetime.datetime.now()  # perf measurement

    tf_idf_scores = list()
    for words_this_category in words:
        tf_idf_scores.append(
            tf_idf(words_this_category
                   ))  # word frequencies for Bayes classifier, contains NaN

    print("Runtime of TF-IDF:",
          divmod((datetime.datetime.now() - then).total_seconds(), 60))

    # Get x words with maximum TF-IDF scores for each category, with associated words as indices
    keywords = list()
    for scores_this_category in tf_idf_scores:
        keywords_this_category = scores_this_category.max(axis=1).sort_values(
            ascending=False).head(no_of_keywords)
        keywords.append(keywords_this_category)

    # ======= "Train" parameters: Retrieve frequency in respective category for each keyword =======
    # NOTE(review): keyword_frequency is never read afterwards; the classifier
    # below looks words up in `keywords` (TF-IDF scores) instead. Confirm
    # which of the two was meant to serve as P(word | category).
    keyword_frequency = list()

    for i, cat_dataset in enumerate(categories.values()):
        current_category_word_count = pd.DataFrame()

        for j in range(0, int(len(cat_dataset) * training_percentage)):
            counts_one_example = words[i].iloc[:, j].value_counts(
            )  # get word count in one example (column)
            current_category_word_count = pd.concat(
                [current_category_word_count, counts_one_example],
                axis=1,
                sort=True)

        category_no_of_words = current_category_word_count.sum().sum(
        )  # get overall number of words in this category
        temp = current_category_word_count.sum(
            axis=1) / category_no_of_words  # frequency of words in category
        temp = temp[temp.index.isin(
            keywords[i].index
        )]  # choose subset of words as keywords selected above
        keyword_frequency.append(temp)

    # ============== Bayes classifier ==============
    validation_examples = data_set.sample(
        n=no_of_validation_examples)  # random sample from data set
    validation_example_predictions = list()

    no_of_categories = len(categories)
    # Every category gets the same prior, P(cat) = 1 / number of categories.
    category_prior = 1 / no_of_categories if no_of_categories else 0.0

    for i in range(0, no_of_validation_examples):  # get one example at a time
        example_words = extract_words_from_text(
            validation_examples[predicted].iat[i],
            True)  # get words for given example
        category_wise_prob = list()

        for j in range(0, no_of_categories):
            prob_this_category = 0

            for word in example_words.iterrows():
                current_word = word[1].iat[0]
                try:  # TODO maybe it's necessary to smoothen the results here (Laplace smoothing)
                    prob_keyword_in_category = keywords[j][current_word]
                except KeyError:  # when word not found in list of trained keywords
                    continue

                # P(P_i) = P(P_i|cat1)*P(cat1) + P(P_i|cat2)*P(cat2) + ...
                # This sum belongs to the current word only, so it must start
                # at zero for every word; otherwise the shares of previously
                # seen words leak into it.
                prob_keyword_in_entire_dataset = 0
                for k in range(0, no_of_categories):
                    try:
                        prob_keyword_in_entire_dataset += keywords[k][
                            current_word] * category_prior
                    except KeyError:
                        continue

                prob_this_category += prob_keyword_in_category * prob_keyword_in_entire_dataset  # P(Cat) = P(Cat|Key1)*P(Key1) + P(Cat|Key2)*P(Key2) + ...
            category_wise_prob.append(prob_this_category)

        predicted_class = category_wise_prob.index(
            max(category_wise_prob))  # find class with highest probability
        predicted_class_name = list(
            categories.keys())[predicted_class]  # find associated class name
        validation_example_predictions.append(predicted_class_name)

    # ============== Evaluation ==============
    predictions = pd.Series(validation_example_predictions)
    actual = validation_examples[objective]
    confusion_matrix = getConfusionMatrix(predictions, actual)

    # Eval metrics
    accuracy = computeAccuracy(predictions, actual)
    precision = computePrecision(predictions, actual)
    recall = computeRecall(predictions, actual)
    f1 = f1_score(precision, recall)

    # ============== Final printout ==============
    print("\n========== Data set info ==========")
    print("Number of entries in data set: ", data_set.shape[0],
          " Number of attributes: ", data_set.shape[1])
    print("Categories found:", categories.keys())

    print("\n========== Classifier info ==========")
    # Each per-category word table holds one column per parsed training
    # headline, so the column counts add up to the number of training
    # examples.
    no_of_training_examples = sum(
        words_this_category.shape[1] for words_this_category in words)
    print("Number of training examples: ", no_of_training_examples)
    print("Number of validation examples: ", no_of_validation_examples)

    print("\n========== Evaluation metrics ==========")
    print("Confusion matrix:", confusion_matrix)
    metrics = pd.Series({
        "Accuracy:": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1
    })
    print(pd.DataFrame(metrics, columns=["Evaluation metrics"]))

    print("\nTotal runtime:",
          divmod((datetime.datetime.now() - initial_time).total_seconds(), 60))