/
main.py
70 lines (56 loc) · 2.24 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from bayesian_classifier import BayesianClassifier
import re
import pandas as pd
# Load the stop-word list once, when the module is imported.
with open('stop_words.txt') as in_file:
    _stop_words_text = in_file.read()

# Split on '\n' exactly (not splitlines()), so a trailing newline in the
# file still produces a final empty-string entry in the tuple.
STOP_WORDS = tuple(_stop_words_text.split('\n'))
def process_train_data(data_file, stop_words=STOP_WORDS):
    """
    Function for training data processing and splitting it into X and y sets.

    Each tweet is reduced to a list of lowercase words (stored in the
    "Processed Tweet" column); words that never occur in a tweet labelled
    'discrim' are then removed from every list.

    :param data_file: str - path to the train CSV; its "tweet" and "label"
        columns are read and its "tweet", "id" and "Unnamed: 0" columns
        are dropped
    :param stop_words: iterable of str - words to leave out of the
        processed tweets
    :return: pd.DataFrame, pd.DataFrame - X (remaining columns without
        "label") and y (remaining columns without "Processed Tweet")
    """
    # Build a set once: the default STOP_WORDS is a tuple, and testing
    # membership on a tuple scans it linearly for every single word.
    excluded = set(stop_words)
    excluded.add('user')

    # Compile the patterns once instead of looking them up for every row.
    non_letters = re.compile(r"[^a-zA-Z]")
    repeated_chars = re.compile(r"(.)\1(\1+)")

    def clear_text(tweet, min_len=4):
        """
        Clears text from unnecessary
        information and words of length 3 or less
        """
        text = non_letters.sub(" ", tweet)
        # Collapse a character repeated three or more times into one.
        text = repeated_chars.sub(r"\1", text)
        return [word for word in text.lower().split()
                if word not in excluded and len(word) >= min_len]

    df = pd.read_csv(data_file, encoding="utf8")

    # Map over the tweet column rather than mutating rows inside
    # DataFrame.apply(axis=1): pandas does not support functions that
    # mutate the object passed to apply.
    tokens = df["tweet"].map(clear_text)

    # Vocabulary of every word seen in at least one 'discrim' tweet.
    word_discrim = set()
    for words in tokens[df["label"] == 'discrim']:
        word_discrim.update(words)

    def remove_neutral_words(words):
        """
        Removes words that don't appear in
        discriminating tweets (for better accuracy)
        """
        return [word for word in words if word in word_discrim]

    df = df.drop(labels=["tweet", "id", "Unnamed: 0"], axis=1)
    df["Processed Tweet"] = tokens.map(remove_neutral_words)

    return df.drop("label", axis=1), df.drop("Processed Tweet", axis=1)
def test_BayesianClassifier():
    """
    Fits a Bayesian Classifier on the processed "train.csv" data, then
    prints the value returned by its score method for "test.csv",
    multiplied by 100
    """
    features, labels = process_train_data("train.csv")

    model = BayesianClassifier()
    model.fit(features, labels)

    # The test set is used as-is: only the bookkeeping columns are dropped.
    raw_test = pd.read_csv("test.csv", encoding="utf8")
    held_out = raw_test.drop(labels=["id", "Unnamed: 0"], axis=1)
    test_X = held_out.drop("label", axis=1)
    test_y = held_out.drop("tweet", axis=1)

    percent = model.score(test_X, test_y) * 100
    print("model score: ", percent, "%")
# Run the train-and-score routine only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    test_BayesianClassifier()