# Main.py — model-evaluation entry point.
import CSVHandler
import os
import Algorithms
import Algorithm
from operator import itemgetter
# Parameter grids swept by evaluate_best_model().
text_modes = ["normal", "title_only", "text_only"]  # which parts of each document to use
feature_models = ["Bag of Words", "TF IDF"]  # feature-extraction strategies passed to Algorithm.run
possible_n_grams = [1, 2, 3]  # n-gram sizes to try
reduced_categories_possibilities = [True, False]  # whether to use the reduced category set
# Shorthand for the project's Algorithms class; its factory methods appear to
# return dicts holding a model plus vectorizer settings -- TODO confirm in Algorithms.py.
algorithms = Algorithms.Algorithms
# One configuration dict per classifier under evaluation (keys read below:
# "algorithm", "tfidf_max_features", "tfidf_min_df", "tfidf_max_df",
# "bow_max_features", "bow_min_df", "bow_max_df").
algorithms_list = [
    algorithms.multinomial_naive_bayes(),
    algorithms.complement_naive_bayes(),
    algorithms.gaussian_naive_bayes(),
    algorithms.random_forest(),
    algorithms.svc(),
    algorithms.k_neighbors(),
    algorithms.ada_boost()
]
def main():
    """Entry point: import a freshly staged dataset if present, then evaluate models."""
    dataset_staged = os.path.exists("data/import_me.tsv")
    if dataset_staged:
        print("Importing new dataset")
        CSVHandler.import_new_dataset()
    # Toggle exactly one of the evaluation routines below per run.
    # evaluate_best_parameters()
    evaluate_best_model()
    # evaluate_best_TFIDF_parameters()
    # evaluate_best_BoW_parameters()
def evaluate_best_model():
    """Grid-search every parameter combination and print models ranked by accuracy.

    Iterates category reduction x text mode x n-gram size x feature model x
    algorithm, runs each combination through Algorithm.run, then prints all
    results sorted by accuracy (descending) and the single best model.
    """
    results = []
    for reduced in reduced_categories_possibilities:
        for mode in text_modes:
            for gram in possible_n_grams:
                # Documents only depend on mode/gram/reduced, so fetch them
                # once per combination before the inner sweeps.
                docs = CSVHandler.get_document(mode, gram, reduced)
                label = "text-mode: '{}', {}-grams, reduced-categories: {}".format(mode, gram, reduced)
                for feat in feature_models:
                    for config in algorithms_list:
                        outcome = Algorithm.run(
                            docs,
                            config["algorithm"],
                            feat,
                            label,
                            True,
                            False,
                            config["tfidf_max_features"],
                            config["tfidf_min_df"],
                            config["tfidf_max_df"],
                            config["bow_max_features"],
                            config["bow_min_df"],
                            config["bow_max_df"],
                        )
                        results.append(outcome)
    # Print every model, best accuracy first (ties broken by the first field).
    print("\n\n\n\nOverview:")
    results = sorted(results, key=lambda entry: (-entry[1], entry[0]))
    for entry in results:
        print(entry)
    # Only the best model when considering accuracy, which is not the case for this project!
    print("\nBest model:")
    print(max(results, key=itemgetter(1)))
def evaluate_best_parameters():
    """Run hyperparameter tuning for one chosen algorithm on a fixed document set."""
    docs = CSVHandler.get_document("normal", 2, True)
    # Swap the active line to tune a different classifier.
    algorithms.hyperparameter_tuning__random_forest(documents=docs, feature_model="TF IDF")
    # algorithms.hyperparameter_tuning__k_neighbors(documents=docs, feature_model="TF IDF")
    # algorithms.hyperparameter_tuning__multinomial_naive_bayes(documents=docs, feature_model="TF IDF")
    # algorithms.hyperparameter_tuning__svc(documents=docs, feature_model="TF IDF")
def evaluate_best_TFIDF_parameters():
    """Grid-search TF-IDF vectorizer parameters for every algorithm.

    Prints all results sorted by accuracy (descending), the single best model
    overall, and the best configuration found for each individual algorithm.
    """
    documents = CSVHandler.get_document("normal", 2, True)
    feature_model = "TF IDF"
    # Candidate vectorizer settings to sweep.
    tfidf_max_features = [1500, 1000, 700, 2000]
    tfidf_min_df = [5, 2, 6, 9]
    tfidf_max_df = [0.5, 0.6, 0.7, 0.8, 0.9]
    scores = []
    best_scores = []
    for algo in algorithms_list:
        algo_scores = []  # results for this algorithm only
        for mf in tfidf_max_features:
            for min_df in tfidf_min_df:
                for max_df in tfidf_max_df:
                    identifier_addition = "text-mode: 'normal', 2-grams, reduced-categories: True, max_features: {}, min_df: {}, max_df: {}".format(mf, min_df, max_df)
                    algo_scores.append(Algorithm.run(documents, algo["algorithm"], feature_model, identifier_addition, True, False, mf, min_df, max_df, 0, 0, 0))
        scores.extend(algo_scores)
        # FIX: best_scores was previously appended once after ALL algorithms
        # finished, so it always held a single entry (duplicating "Best
        # model"); record the best configuration per algorithm instead.
        best_scores.append(max(algo_scores, key=itemgetter(1)))
    # Print models sorted by accuracy
    print("\n\n\n\nOverview:")
    scores = sorted(scores, key=lambda s: (-s[1], s[0]))
    for s in scores:
        print(s)
    print("\nBest model:")
    print(max(scores, key=itemgetter(1)))
    print("Best scores:")
    print(best_scores)
def evaluate_best_BoW_parameters():
    """Grid-search Bag-of-Words vectorizer parameters for every algorithm.

    Prints all results sorted by accuracy (descending), the single best model
    overall, and the best configuration found for each individual algorithm.
    """
    documents = CSVHandler.get_document("normal", 2, True)
    feature_model = "Bag of Words"
    # Candidate vectorizer settings to sweep.
    bow_max_features = [1500, 1000, 700, 2000]
    bow_min_df = [5, 2, 6, 9]
    bow_max_df = [0.5, 0.6, 0.7, 0.8, 0.9]
    scores = []
    best_scores = []
    for algo in algorithms_list:
        algo_scores = []  # results for this algorithm only
        for mf in bow_max_features:
            for min_df in bow_min_df:
                for max_df in bow_max_df:
                    identifier_addition = "text-mode: 'normal', 2-grams, reduced-categories: True, max_features: {}, min_df: {}, max_df: {}".format(mf, min_df, max_df)
                    algo_scores.append(Algorithm.run(documents, algo["algorithm"], feature_model, identifier_addition, True, False, 0, 0, 0, mf, min_df, max_df))
        scores.extend(algo_scores)
        # FIX: best_scores was previously appended once after ALL algorithms
        # finished, so it always held a single entry (duplicating "Best
        # model"); record the best configuration per algorithm instead.
        best_scores.append(max(algo_scores, key=itemgetter(1)))
    # Print models sorted by accuracy
    print("\n\n\n\nOverview:")
    scores = sorted(scores, key=lambda s: (-s[1], s[0]))
    for s in scores:
        print(s)
    print("\nBest model:")
    print(max(scores, key=itemgetter(1)))
    print("Best scores:")
    print(best_scores)
# Run the evaluation pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()