def test_svm_configurations(kernels: list,
                            c_values: list,
                            X_train: pd.DataFrame,
                            y_train: pd.Series,
                            X_cv_set: pd.DataFrame,
                            y_cv_set: pd.Series,
                            printConfusionMatrices: bool = False):
    """Fit one SVC per (kernel, C) combination and collect training and CV accuracies.

    Returns the full results table and an unfitted SVC configured with the
    parameters that achieved the best CV set accuracy."""
    svm_values = pd.DataFrame(columns=[
        "Kernel", "C value", "Training set accuracy", "CV set accuracy"
    ])
    i = 0
    for kernel in kernels:
        for c_value in c_values:
            clf = svm.SVC(kernel=kernel,
                          C=c_value,
                          gamma="scale",
                          cache_size=1000)
            clf.fit(X_train, y_train)
            predictions_train = pd.Series(clf.predict(X_train))
            predictions_cv = pd.Series(clf.predict(X_cv_set))
            accuracy_train = computeAccuracy(predictions_train, y_train)
            accuracy_cv = computeAccuracy(predictions_cv, y_cv_set)
            configuration_data = [kernel, c_value, accuracy_train, accuracy_cv]
            svm_values.loc[i] = configuration_data
            i += 1
            if printConfusionMatrices:
                print("\n", configuration_data[:2])
                print(getConfusionMatrix(predictions_cv, y_cv_set))
    # Build a fresh, unfitted classifier with the best-performing parameters
    best_svm_values = svm_values.sort_values(by="CV set accuracy",
                                             ascending=False).head(1)
    best_svm = svm.SVC(kernel=best_svm_values.iat[0, 0],
                       C=best_svm_values.iat[0, 1],
                       gamma="scale",
                       cache_size=500)
    return svm_values, best_svm
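# ---------------------------------------------------------------------------
# Hedged cross-check (not part of the original script): the same (kernel, C)
# grid can be ranked with scikit-learn's GridSearchCV. Note that GridSearchCV
# scores via k-fold cross-validation on the training data rather than the
# fixed CV set used above, so the rankings should be similar but not identical.
# ---------------------------------------------------------------------------
from sklearn import svm
from sklearn.model_selection import GridSearchCV


def grid_search_svm(X_train, y_train, kernels, c_values):
    """Fit a GridSearchCV over the same kernel/C grid (assumes the X_train /
    y_train frames produced in main())."""
    param_grid = {"kernel": kernels, "C": c_values}
    search = GridSearchCV(svm.SVC(gamma="scale", cache_size=1000),
                          param_grid=param_grid,
                          scoring="accuracy",
                          cv=5)  # 5-fold CV on the training set
    search.fit(X_train, y_train)
    return search

# Example usage:
# search = grid_search_svm(X_train, y_train,
#                          ["rbf", "poly", "linear", "sigmoid"],
#                          list(np.logspace(-3, 2, 6)))
# print(search.best_params_, search.best_score_)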
def main():
    # a) Divide the dataset randomly into training and evaluation sets
    dataset = pd.read_excel(DEFAULT_FILEPATH)
    dataset = dataset.dropna()
    dataset = dataset.drop(
        "tvdlm", axis=1)  # Drop the tvdlm column, which does not add information
    dataset_scaled = scale_dataset(dataset=dataset,
                                   objective=DEFAULT_OBJECTIVE,
                                   scaling_type="minmax")
    train, testing_sets = divide_in_training_test_datasets(
        dataset_scaled, train_pctg=DEFAULT_TRAIN_PCTG)
    cv_set, test = divide_in_training_test_datasets(
        testing_sets, train_pctg=DEFAULT_CV_PCTG / (1 - DEFAULT_TRAIN_PCTG))
    X_train, y_train = separate_dataset_objective_data(train, DEFAULT_OBJECTIVE)
    X_cv_set, y_cv_set = separate_dataset_objective_data(cv_set, DEFAULT_OBJECTIVE)
    X_test, y_test = separate_dataset_objective_data(test, DEFAULT_OBJECTIVE)

    # b) Classify the categorical variable "sigdz" using a default SVC SVM
    words_then = datetime.datetime.now()
    c_value1 = 1
    kernel1 = "rbf"
    clf1 = svm.SVC(
        kernel=kernel1, gamma="scale", C=c_value1
    )  # default parameters, written out explicitly for illustrative purposes
    clf1.fit(X_train, y_train)
    predictions_train1 = pd.Series(clf1.predict(X_train))
    accuracy_train1 = computeAccuracy(predictions_train1, y_train)
    predictions_cv1 = pd.Series(clf1.predict(X_cv_set))
    accuracy_cv1 = computeAccuracy(predictions_cv1, y_cv_set)
    confusion_matrix = getConfusionMatrix(predictions_cv1, y_cv_set)
    data_default_svm = pd.DataFrame(columns=[
        "Kernel", "C value", "Training set accuracy", "CV set accuracy"
    ])
    data_default_svm.loc[0] = [kernel1, c_value1, accuracy_train1, accuracy_cv1]
    words_now = datetime.datetime.now()
    print("Runtime default SVM fitting and testing: ",
          divmod((words_now - words_then).total_seconds(), 60), "\n")

    # c) Evaluate different values for C and different kernels to find the
    #    best-performing classifier
    kernels = ["rbf", "poly", "linear", "sigmoid"]
    c_values = list(np.logspace(-3, 2, 6))
    svm_values, best_svm = test_svm_configurations(kernels, c_values, X_train,
                                                   y_train, X_cv_set, y_cv_set)
    time_now = datetime.datetime.now()
    print("\n\nRuntime parameter and kernel testing: ",
          divmod((time_now - words_now).total_seconds(), 60), "\n")

    # Estimate real-world performance of the best configuration on the test set
    best_svm.fit(X_train, y_train)
    predictions_best_clf = pd.Series(best_svm.predict(X_test))
    winner_test_accuracy = computeAccuracy(predictions_best_clf, y_test)
    print("Test set accuracy of best SVM: ", winner_test_accuracy)
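# ---------------------------------------------------------------------------
# The body of scale_dataset() is not shown in this file. Below is a minimal
# sketch of the min-max scaling it is assumed to perform for
# scaling_type="minmax" (every feature mapped to [0, 1], the objective column
# left untouched); the real helper may differ in detail.
# ---------------------------------------------------------------------------
import pandas as pd


def minmax_scale_dataset(dataset: pd.DataFrame, objective: str) -> pd.DataFrame:
    """Scale all non-objective columns to [0, 1] (assumed behaviour)."""
    scaled = dataset.copy()
    feature_cols = [col for col in scaled.columns if col != objective]
    for col in feature_cols:
        col_min, col_max = scaled[col].min(), scaled[col].max()
        if col_max != col_min:  # avoid division by zero for constant columns
            scaled[col] = (scaled[col] - col_min) / (col_max - col_min)
        else:
            scaled[col] = 0.0
    return scaled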
def main():
    objective = DEFAULT_OBJECTIVE
    training_percentage = DEFAULT_TRAIN_PCTG
    view_trees = False
    dataset = pd.read_csv(DEFAULT_FILEPATH, sep="\t")
    dataset = pour_titanic_dataset(dataset)

    # =========== a) Divide data set in two parts, training and evaluation set
    train, test = divide_in_training_test_datasets(
        dataset, train_pctg=training_percentage)

    # =========== b) Decision tree using Shannon entropy
    decision_tree_shannon = DecisionTree(train,
                                         objective=objective,
                                         gain_f="shannon")
    decision_tree_shannon.plot(name_prefix="Shannon", view=view_trees)

    # =========== c) Decision tree using Gini index
    decision_tree_gini = DecisionTree(train, objective=objective, gain_f="gini")
    decision_tree_gini.plot(name_prefix="Gini", view=view_trees)

    # =========== d) Random forest for b) and c)
    random_forest_shannon = RandomForest(train,
                                         objective=objective,
                                         gain_f="shannon")
    random_forest_shannon.plot(name_prefix="Shannon", view=view_trees)
    random_forest_gini = RandomForest(train, objective=objective, gain_f="gini")
    random_forest_gini.plot(name_prefix="Gini", view=view_trees)

    # =========== e) Confusion matrix for b), c), d).1 and d).2
    predictions_dt_shannon = decision_tree_shannon.getPredictions(test, objective)  # b)
    predictions_dt_gini = decision_tree_gini.getPredictions(test, objective)  # c)
    predictions_rf_shannon = random_forest_shannon.getPredictions(test, objective)  # d).1
    predictions_rf_gini = random_forest_gini.getPredictions(test, objective)  # d).2

    conf_matrix_dt_shannon = getConfusionMatrix(predictions_dt_shannon, test[objective])
    conf_matrix_dt_gini = getConfusionMatrix(predictions_dt_gini, test[objective])
    conf_matrix_rf_shannon = getConfusionMatrix(predictions_rf_shannon, test[objective])
    conf_matrix_rf_gini = getConfusionMatrix(predictions_rf_gini, test[objective])

    accuracy_dt_shannon = computeAccuracy(predictions_dt_shannon, test[objective])
    accuracy_dt_gini = computeAccuracy(predictions_dt_gini, test[objective])
    accuracy_rf_shannon = computeAccuracy(predictions_rf_shannon, test[objective])
    accuracy_rf_gini = computeAccuracy(predictions_rf_gini, test[objective])

    print("\n\n=======================================")
    print("Decision Tree - Shannon:")
    print("\tAccuracy = ", accuracy_dt_shannon)
    print(conf_matrix_dt_shannon, "\n")
    print("Decision Tree - Gini:")
    print("\tAccuracy = ", accuracy_dt_gini)
    print(conf_matrix_dt_gini, "\n")
    print("Random Forest - Shannon:")
    print("\tAccuracy = ", accuracy_rf_shannon)
    print(conf_matrix_rf_shannon, "\n")
    print("Random Forest - Gini:")
    print("\tAccuracy = ", accuracy_rf_gini)
    print(conf_matrix_rf_gini)

    # =========== f) Graph accuracy of decision tree vs. no. of nodes for each case
    # Decision tree pruning
    # Graph: accuracy vs. no. of nodes for each case: b), c), d).1, d).2
    accuracy_dt_shannon_table = [accuracy_dt_shannon]
    accuracy_dt_shannon_train = [
        computeAccuracy(decision_tree_shannon.getPredictions(train, objective),
                        train[objective])
    ]
    no_nodes_dt_shannon_table = [decision_tree_shannon.no_of_nodes()]
    accuracy_dt_gini_table = [accuracy_dt_gini]
    accuracy_dt_gini_train = [
        computeAccuracy(decision_tree_gini.getPredictions(train, objective),
                        train[objective])
    ]
    no_nodes_dt_gini_table = [decision_tree_gini.no_of_nodes()]

    # Try different pruning variations
    for i in range(1, 10):
        for no_branches_to_be_pruned in range(1, 3):  # prune one or two branches
            decision_tree_shannon_pruned = DecisionTree(
                train, objective=objective,
                gain_f="shannon").prune_tree(no_branches_to_be_pruned)
            decision_tree_gini_pruned = DecisionTree(
                train, objective=objective,
                gain_f="gini").prune_tree(no_branches_to_be_pruned)
            # TODO Random Forests

            accuracy_dt_shannon_pruned = computeAccuracy(
                decision_tree_shannon_pruned.getPredictions(test, objective),
                test[objective])
            accuracy_dt_shannon_table.append(accuracy_dt_shannon_pruned)
            accuracy_dt_shannon_pruned_train = computeAccuracy(
                decision_tree_shannon_pruned.getPredictions(train, objective),
                train[objective])
            accuracy_dt_shannon_train.append(accuracy_dt_shannon_pruned_train)

            accuracy_dt_gini_pruned = computeAccuracy(
                decision_tree_gini_pruned.getPredictions(test, objective),
                test[objective])
            accuracy_dt_gini_table.append(accuracy_dt_gini_pruned)
            accuracy_dt_gini_pruned_train = computeAccuracy(
                decision_tree_gini_pruned.getPredictions(train, objective),
                train[objective])
            accuracy_dt_gini_train.append(accuracy_dt_gini_pruned_train)

            no_nodes_dt_shannon_table.append(
                decision_tree_shannon_pruned.no_of_nodes())
            no_nodes_dt_gini_table.append(decision_tree_gini_pruned.no_of_nodes())

    # Plot graph, 4 lines (DT - Shannon; DT - Gini; RF - Shannon; RF - Gini)
    plt.plot(no_nodes_dt_shannon_table, accuracy_dt_shannon_table, 'ro',
             label="Shannon - Test")
    plt.plot(no_nodes_dt_shannon_table, accuracy_dt_shannon_train, 'rx',
             label="Shannon - Train")
    plt.plot(no_nodes_dt_gini_table, accuracy_dt_gini_table, 'go',
             label="Gini - Test")
    plt.plot(no_nodes_dt_gini_table, accuracy_dt_gini_train, 'gx',
             label="Gini - Train")
    plt.gca().legend()
    plt.xlabel("No. of nodes")
    plt.ylabel("Accuracy")
    plt.show()
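# ---------------------------------------------------------------------------
# The DecisionTree class is configured with gain_f="shannon" or gain_f="gini",
# but the gain functions themselves are not defined in this file. Below is a
# minimal sketch of the two impurity measures as they are typically computed
# on a label column (hypothetical helper names; the project's actual
# implementation may differ).
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd


def shannon_entropy(labels: pd.Series) -> float:
    """H = -sum(p_i * log2(p_i)) over the class proportions p_i."""
    p = labels.value_counts(normalize=True)
    return float(-(p * np.log2(p)).sum())


def gini_index(labels: pd.Series) -> float:
    """G = 1 - sum(p_i^2) over the class proportions p_i."""
    p = labels.value_counts(normalize=True)
    return float(1.0 - (p ** 2).sum())

# Both measures are 0 for a pure node and maximal for a 50/50 split, e.g.:
# shannon_entropy(pd.Series([1, 1, 0, 0]))  -> 1.0
# gini_index(pd.Series([1, 1, 0, 0]))       -> 0.5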
def main():
    # Data import and cleaning
    dataset = pd.read_csv(
        DEFAULT_FILEPATH, sep=';')  # review_sentiments.csv is semicolon-separated (;)
    dataset = rewrite_positives_negatives(dataset)
    dataset = delete_non_numeric_columns(dataset)
    dataset["titleSentiment"] = dataset["titleSentiment"].fillna(
        dataset["textSentiment"])  # Handle NaN

    # ========== a) Mean no. of words of reviews rated with 1 star
    one_star_ratings = dataset[dataset["Star Rating"] == 1]
    one_star_review_mean_words = sum(
        one_star_ratings["wordcount"]) / len(one_star_ratings)

    # ========== b) Divide data set into two parts, training and evaluation set
    training_set, evaluation_set = divide_in_training_test_datasets(
        dataset=dataset, train_pctg=TRAIN_PCTG)
    evaluation_set_without_objective, orig_ratings = separate_dataset_objective_data(
        dataset=evaluation_set, objective=DEFAULT_OBJECTIVE)

    # ========== c) Apply KNN and weighted-distances KNN to predict review ratings (stars)
    time1 = datetime.datetime.now()
    predicted_ratings = evaluation_set_without_objective.apply(
        knn, axis=1, args=(training_set, DEFAULT_OBJECTIVE, DEFAULT_K, False))
    time2 = datetime.datetime.now()
    print("Runtime unweighted: ", divmod((time2 - time1).total_seconds(), 60), "\n")
    predicted_ratings_weighted = evaluation_set_without_objective.apply(
        knn, axis=1, args=(training_set, DEFAULT_OBJECTIVE, DEFAULT_K, True))
    print("Runtime weighted: ",
          divmod((datetime.datetime.now() - time2).total_seconds(), 60), "\n")

    # ========== d) Calculate classifier precision and confusion matrix
    # Standard KNN
    confusion_matrix = getConfusionMatrix(predicted_ratings, orig_ratings)
    accuracy = computeAccuracy(predicted_ratings, orig_ratings)
    true_positive_rate = computeTruePositiveRate(predicted_ratings, orig_ratings)
    precision = computePrecision(predicted_ratings, orig_ratings)
    recall = computeRecall(predicted_ratings, orig_ratings)
    f1 = f1_score(precision, recall)

    # KNN with weighted distances
    confusion_matrix_weighted = getConfusionMatrix(predicted_ratings_weighted,
                                                   orig_ratings)
    accuracy_weighted = computeAccuracy(predicted_ratings_weighted, orig_ratings)
    true_positive_rate_weighted = computeTruePositiveRate(
        predicted_ratings_weighted, orig_ratings)
    precision_weighted = computePrecision(predicted_ratings_weighted, orig_ratings)
    recall_weighted = computeRecall(predicted_ratings_weighted, orig_ratings)
    f1_weighted = f1_score(precision_weighted, recall_weighted)

    # ============== Final printout ==============
    print("\n========== Exercise a) ==========")
    print("Mean no. of words of 1-star reviews:", one_star_review_mean_words)

    print("\n\n========== Data info ==========")
    print("Data set dimensions: ", dataset.shape)
    print("Training set dimensions: ", training_set.shape)
    print("Evaluation set dimensions: ", evaluation_set.shape)
    print("Percentage of data set used for training: ", int(TRAIN_PCTG * 100), "%")
    print("Classification objective: ", DEFAULT_OBJECTIVE)

    print("\n========== Evaluation metrics standard KNN ==========")
    print("Accuracy: ", accuracy, "\n")
    print("Confusion matrix:\n", confusion_matrix)
    print("\nTrue positive rate (TPR) (= recall): ", true_positive_rate)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1-score: ", f1)

    print("\n========== Evaluation metrics KNN with weighted distances ==========")
    print("Accuracy: ", accuracy_weighted, "\n")
    print("Confusion matrix:\n", confusion_matrix_weighted)
    print("\nTrue positive rate (TPR) (= recall): ", true_positive_rate_weighted)
    print("Precision: ", precision_weighted)
    print("Recall: ", recall_weighted)
    print("F1-score: ", f1_weighted)
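# ---------------------------------------------------------------------------
# knn() is applied above with a boolean flag for weighted distances, but its
# body is not shown here. Below is a minimal sketch of the inverse-distance
# weighted vote that flag is assumed to toggle (hypothetical helper; Euclidean
# distance over the numeric feature columns, with a small epsilon to avoid
# division by zero). The project's actual knn() may differ.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd


def knn_sketch(example: pd.Series, training_set: pd.DataFrame, objective: str,
               k: int, weighted: bool):
    """Predict the objective for one example via (optionally weighted) KNN."""
    features = [col for col in training_set.columns if col != objective]
    distances = np.sqrt(
        ((training_set[features] - example[features]) ** 2).sum(axis=1))
    nearest = distances.nsmallest(k)
    labels = training_set.loc[nearest.index, objective]
    if not weighted:
        return labels.mode().iat[0]  # plain majority vote among the k neighbours
    weights = 1.0 / (nearest ** 2 + 1e-9)  # closer neighbours count more
    return labels.groupby(labels).apply(
        lambda grp: weights[grp.index].sum()).idxmax()

# Usage mirrors the .apply() calls above, e.g.:
# evaluation_set_without_objective.apply(
#     knn_sketch, axis=1, args=(training_set, DEFAULT_OBJECTIVE, DEFAULT_K, True))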
def main(data_filepath, training_percentage, keyword_amount, validation_amount):
    initial_time = datetime.datetime.now()

    # ============== Variable setup ==============
    path = data_filepath  # path to the input data file
    no_of_keywords = keyword_amount  # how many of the highest-scoring words on TF-IDF are selected as features
    no_of_validation_examples = validation_amount

    # ============== Get and process data ==============
    # Extract information and save it in a DataFrame
    objective = "categoria"
    predicted = "titular"
    data_set = pd.read_csv(path, sep="\t")
    data_set = data_set[
        data_set[objective] != "Noticias destacadas"]  # Leave out the massive, unspecific "Noticias destacadas" category

    # Split data set into data subsets by category
    available_classes = pd.Series(
        data_set[objective].unique()).dropna().sort_values()
    categories = {}
    for cls in available_classes:
        categories[cls] = data_set[data_set[objective] == cls]

    # Extract words from each data (sub-)set
    # TODO Consider implementing Porter stemming to reduce redundancy
    # http://www.3engine.net/wp/2015/02/stemming-con-python/
    words_then = datetime.datetime.now()  # for measuring runtime
    words = list()  # words of each data subset, one list element per category
    for category_name, category_data in categories.items():
        words_this_category = pd.DataFrame()
        counter = 0
        for row in category_data[predicted]:
            if counter >= int(len(category_data) * training_percentage):
                break
            words_one_title = extract_words_from_text(
                text=row, prevent_uppercase_duplicates=True)
            words_one_title.columns = [
                category_name + "_" + predicted + "_" + str(counter)
            ]
            words_this_category = pd.concat(
                [words_this_category, words_one_title], axis=1)
            counter += 1
        words.append(words_this_category)
    print("Runtime of word parsing:",
          divmod((datetime.datetime.now() - words_then).total_seconds(), 60), "\n")

    # ============== Compute TF-IDF scores and, based on those, choose keywords ==============
    then = datetime.datetime.now()  # performance measurement
    tf_idf_scores = list()
    for words_this_category in words:
        tf_idf_scores.append(
            tf_idf(words_this_category))  # word frequencies for the Bayes classifier, contains NaN
    print("Runtime of TF-IDF:",
          divmod((datetime.datetime.now() - then).total_seconds(), 60))

    # Get the x words with maximum TF-IDF scores for each category, with the associated words as indices
    keywords = list()
    for scores_this_category in tf_idf_scores:
        keywords_this_category = scores_this_category.max(axis=1).sort_values(
            ascending=False).head(no_of_keywords)
        keywords.append(keywords_this_category)

    # ======= "Train" parameters: retrieve the frequency in the respective category for each keyword =======
    keyword_frequency = list()
    for i, cat_dataset in enumerate(categories.values()):
        current_category_word_count = pd.DataFrame()
        for j in range(0, int(len(cat_dataset) * training_percentage)):
            counts_one_example = words[i].iloc[:, j].value_counts()  # word count in one example (column)
            current_category_word_count = pd.concat(
                [current_category_word_count, counts_one_example],
                axis=1,
                sort=True)
        category_no_of_words = current_category_word_count.sum().sum()  # overall number of words in this category
        temp = current_category_word_count.sum(
            axis=1) / category_no_of_words  # frequency of each word in this category
        temp = temp[temp.index.isin(
            keywords[i].index)]  # keep only the keywords selected above
        keyword_frequency.append(temp)

    # ============== Bayes classifier ==============
    validation_examples = data_set.sample(
        n=no_of_validation_examples)  # random sample from the data set
    validation_example_predictions = list()
    for i in range(0, no_of_validation_examples):  # classify one example at a time
        example_words = extract_words_from_text(
            validation_examples[predicted].iat[i], True)  # words of the given example
        category_wise_prob = list()
        for j in range(0, len(categories)):
            prob_this_category = 0
            prob_keyword_in_entire_dataset = 0
            for word in example_words.iterrows():
                try:
                    # TODO It may be necessary to smooth the estimates here (Laplace smoothing)
                    prob_keyword_in_category = keywords[j][word[1].iat[0]]
                except KeyError:  # word not found in the list of trained keywords
                    continue
                for k in range(0, len(categories)):  # probability of this word over the entire data set
                    try:
                        # P(P_i) = P(P_i|cat1)*P(cat1) + P(P_i|cat2)*P(cat2) + ...
                        prob_keyword_in_entire_dataset += keywords[k][
                            word[1].iat[0]] * (1 / len(categories))
                    except KeyError:
                        continue
                # P(Cat) = P(Cat|Key1)*P(Key1) + P(Cat|Key2)*P(Key2) + ...
                prob_this_category += prob_keyword_in_category * prob_keyword_in_entire_dataset
            category_wise_prob.append(prob_this_category)
        predicted_class = category_wise_prob.index(
            max(category_wise_prob))  # find the class with the highest probability
        predicted_class_name = list(
            categories.keys())[predicted_class]  # find the associated class name
        validation_example_predictions.append(predicted_class_name)

    # ============== Evaluation ==============
    predictions = pd.Series(validation_example_predictions)
    actual = validation_examples[objective]
    confusion_matrix = getConfusionMatrix(predictions, actual)

    # Eval metrics
    accuracy = computeAccuracy(predictions, actual)
    precision = computePrecision(predictions, actual)
    recall = computeRecall(predictions, actual)
    f1 = f1_score(precision, recall)

    # ============== Final printout ==============
    print("\n========== Data set info ==========")
    print("Number of entries in data set: ", data_set.shape[0],
          " Number of attributes: ", data_set.shape[1])
    print("Categories found:", list(categories.keys()))
    print("\n========== Classifier info ==========")
    print("Number of training examples: ", current_category_word_count.shape[0],
          "x", len(categories), "=",
          current_category_word_count.shape[0] * len(categories))
    print("Number of validation examples: ", no_of_validation_examples)
    print("\n========== Evaluation metrics ==========")
    print("Confusion matrix:", confusion_matrix)
    metrics = pd.Series({
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1
    })
    print(pd.DataFrame(metrics, columns=["Evaluation metrics"]))
    print("\nTotal runtime:",
          divmod((datetime.datetime.now() - initial_time).total_seconds(), 60))
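# ---------------------------------------------------------------------------
# The TODO inside the classifier loop mentions Laplace smoothing. Below is a
# minimal sketch of how the per-category keyword frequencies could be smoothed
# before being used as P(keyword | category), so that keywords unseen in a
# category receive a small non-zero probability instead of being skipped via
# KeyError (hypothetical helper, not part of the original pipeline).
# ---------------------------------------------------------------------------
import pandas as pd


def laplace_smoothed_frequencies(word_counts: pd.Series, vocabulary: pd.Index,
                                 alpha: float = 1.0) -> pd.Series:
    """P(word | category) = (count(word) + alpha) / (total_words + alpha * |V|)."""
    counts = word_counts.reindex(vocabulary, fill_value=0)
    total = counts.sum()
    return (counts + alpha) / (total + alpha * len(vocabulary))

# Example usage against the structures built above (assumed shapes):
# vocab = keyword_frequency[j].index.union(example_vocabulary)
# smoothed = laplace_smoothed_frequencies(current_category_word_count.sum(axis=1), vocab)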