/
template.py
200 lines (166 loc) · 6.02 KB
/
template.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.4.1
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
# %% [markdown]
# # Text Classification
# %% [markdown]
# ## Setup
# %%
# Importing Libraries
import hyperopt
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from hyperopt import fmin, hp, space_eval, tpe
from scipy.sparse import csr_matrix
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import (
accuracy_score,
classification_report,
confusion_matrix,
f1_score,
make_scorer,
plot_confusion_matrix,
precision_score,
recall_score,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
# Use a larger default figure size for every plot in this notebook.
plt.rcParams["figure.figsize"] = (12, 8)
# Fix the global NumPy RNG seed so runs are reproducible.
np.random.seed(42)
# %%
# Load the train/test splits; rows are keyed by their article_number.
_csv_kwargs = dict(low_memory=False, index_col="article_number")
train = pd.read_csv("training.csv", **_csv_kwargs)
test = pd.read_csv("test.csv", **_csv_kwargs)
# %%
# Encode topic strings as integer class labels (fit on train, reuse on test).
le = LabelEncoder()
le.fit(train.topic)
train["label"] = le.transform(train.topic)
test["label"] = le.transform(test.topic)
# Separate the feature columns from the target column.
train_x = train.drop(columns=["label", "topic"])
test_x = test.drop(columns=["label", "topic"])
train_y = train["label"]
test_y = test["label"]
# %% [markdown]
# ## Word Representation + Additional Processing
# Here we select how the words will be converted into input for the model. E.g. bag of words, word2vec, TF-IDF etc.
# If you would like to do any additional preprocessing, this is the place to do it as well.
# %%
# Additional preprocessing (none for this example)
# %%
# Create 3 representations for the documents: Bag of Words, TF, TF-IDF.
# Parameters of each repr need to be tuned separately
tfidf = TfidfVectorizer(max_features=500).fit(train_x.article_words)
tf = TfidfVectorizer(max_features=500, use_idf=False).fit(train_x.article_words)
bow = CountVectorizer(max_features=500).fit(train_x.article_words)


def _to_dense(vectorizer, docs):
    """Transform `docs` with a fitted vectorizer and return a dense ndarray.

    Calling ``.toarray()`` on the sparse result is the idiomatic form of the
    original unbound-method call ``csr_matrix.toarray(...)``.
    """
    return vectorizer.transform(docs).toarray()


# Transform words and convert from sparse matrix to array
train_tfidf = _to_dense(tfidf, train_x.article_words)
test_tfidf = _to_dense(tfidf, test_x.article_words)
train_tf = _to_dense(tf, train_x.article_words)
test_tf = _to_dense(tf, test_x.article_words)
train_bow = _to_dense(bow, train_x.article_words)
test_bow = _to_dense(bow, test_x.article_words)
# %% [markdown]
# ## Model Selection
# Choose what type of model you would like to use. Pick some hyperparameters (these will be tuned in the next section) and ensure the model runs as expected on the data.
# %%
# Baseline: a default LogisticRegression trained on the bag-of-words features.
# fit() returns the estimator itself, so the two steps chain into one line.
model = LogisticRegression().fit(train_bow, train_y)
model.score(test_bow, test_y)
# %% [markdown]
# ## Hyper Parameter Optimisation
# This step is optional, but will use Bayesian optimisation to find the best hyperparameters in the search space. This is done using the [hyperopt](https://github.com/hyperopt/hyperopt) package. The algorithm searches the hyperparameter space by minimising the f1 score across many trials. The larger the `max_evals` parameter, the higher the likelihood of obtaining the optimal hyperparameters (should be >1000 ideally). Note: HPO can take a significant amount of time.
#
# If you are using an sklearn model, consider the package [hyperopt-sklearn](https://github.com/hyperopt/hyperopt-sklearn) which will automatically search over the supported hyperparameters of the model. This isn't used in this example however.
# %%
model_type = LogisticRegression
word_reps = {"tfidf": train_tfidf, "tf": train_tf, "bow": train_bow}


def objective(args):
    """Hyperopt objective: negated mean macro-F1 over 3-fold cross-validation.

    `args` is a sampled point from `space`: "word_rep" selects the document
    representation to train on, and every remaining key is forwarded to the
    model constructor.
    """
    # BUG FIX: the original popped "model_type" from args, but the search
    # space below defines no such key, so every trial raised KeyError.
    # The model class comes from the module-level `model_type` instead.
    # Copy first so hyperopt's own record of the trial isn't mutated by pop().
    args = dict(args)
    words = word_reps[args.pop("word_rep")]
    model = model_type(**args)
    # hyperopt minimises, so return the negative of the score we maximise.
    return -np.mean(cross_val_score(model, words, train_y, cv=3, scoring="f1_macro"))


# Define a search space - Logistic regression so let's search for a value of C and which penalty to use.
# For full list of configuration options see http://hyperopt.github.io/hyperopt/getting-started/search_spaces/
space = {
    "C": hp.uniform("C", 0, 10),
    "penalty": hp.choice("penalty", ["l2", "none"]),
    "word_rep": hp.choice("word_rep", ["tfidf", "tf", "bow"]),
}
# minimize the objective over the space
best = fmin(objective, space, algo=tpe.suggest, max_evals=5000)
print(hyperopt.space_eval(space, best))
# %%
# Rebuild the model with the best hyperparameters found above and evaluate it.
args = hyperopt.space_eval(space, best)
rep_name = args.pop("word_rep")
train_words = word_reps[rep_name]
# FIX: select the test representation that matches the chosen training one.
# The original hard-coded `test_words = test_tfidf`, silently mismatching
# train/test features whenever the best trial used "tf" or "bow".
test_words = {"tfidf": test_tfidf, "tf": test_tf, "bow": test_bow}[rep_name]
model = model_type(**args)
# Fit to training data
model.fit(train_words, train_y)
# Print metrics
print(
    "Test",
    classification_report(test_y, model.predict(test_words), target_names=le.classes_),
)
# Get training metrics (5-fold CV on the training set only)
train_scores = cross_validate(
    model,
    train_words,
    train_y,
    cv=5,
    scoring=["precision_macro", "recall_macro", "f1_macro"],
)
print("Precision", np.mean(train_scores["test_precision_macro"]))
print("Recall", np.mean(train_scores["test_recall_macro"]))
print("f1", np.mean(train_scores["test_f1_macro"]))
# %%
# Investigate confusion matrices: column-normalised ("pred"), row-normalised
# ("true"), and raw counts (None, the documented default) in turn.
for norm in ("pred", "true", None):
    plot_confusion_matrix(
        model,
        test_words,
        test_y,
        normalize=norm,
        cmap="Blues",
        display_labels=le.classes_,
        xticks_rotation=75,
    )