-
Notifications
You must be signed in to change notification settings - Fork 2
/
NaiveBayes.py
118 lines (81 loc) · 3.42 KB
/
NaiveBayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 1 20:12:22 2019
@author: apple
"""
import pandas as pd
import numpy as np
import preprocessor as p
import re
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from sklearn.model_selection import KFold
from nltk.stem import WordNetLemmatizer
# Module-level WordNet lemmatizer instance. None of the functions visible in
# this file reference it.
lemmatizer = WordNetLemmatizer()
def load_data(data):
    """Read a tab-separated tweet file into a pandas DataFrame.

    Every non-blank line is lower-cased, stripped of surrounding whitespace
    and split on tab characters.  The first three fields are taken as the
    tweet id, the label and the text; any further fields are ignored.

    Args:
        data: Path of the text file to read.

    Returns:
        A DataFrame with the columns ``id``, ``label`` and ``text`` (all
        values are strings), one row per non-blank line, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    ids = []
    labels = []
    texts = []
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so non-ASCII tweet text (emoji, accents) loads the same way
    # on every machine.  NOTE(review): assumes the data file is UTF-8.
    with open(data, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.lower().strip()
            if not record:
                # Skip blank / whitespace-only lines.
                continue
            fields = record.split("\t")
            ids.append(fields[0])
            labels.append(fields[1])
            texts.append(fields[2])
    df = pd.DataFrame(
        {'id': ids,
         'label': labels,
         'text': texts
         })
    return df
def clean_tweet(text):
    """Return ``text`` with tweet artefacts and punctuation removed.

    The ``preprocessor`` package is configured for URLs, mentions, emojis,
    smileys and hashtags and ``text`` is passed through ``p.clean``.  Every
    remaining character that is not an ASCII letter, a digit or whitespace
    is then replaced by a single space.

    NOTE(review): ``p.set_options`` is invoked on every call; presumably it
    sets module-wide configuration of ``preprocessor`` -- confirm before
    hoisting it out of this function.
    """
    p.set_options(
        p.OPT.URL,
        p.OPT.MENTION,
        p.OPT.EMOJI,
        p.OPT.SMILEY,
        p.OPT.HASHTAG,
    )
    without_artefacts = p.clean(text)
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9\s]')
    return non_alphanumeric.sub(' ', without_artefacts)
### THE MODEL ###
def features(sentence):
    """Build a bag-of-words feature dict for one sentence.

    The sentence is lower-cased and split on whitespace; each distinct word
    ``w`` becomes the key ``'contains(w)'`` mapped to ``True``.

    Args:
        sentence: The text to featurize.

    Returns:
        A dict of ``'contains(<word>)': True`` entries; empty when the
        sentence has no words.
    """
    words = sentence.lower().split()
    return {'contains(%s)' % word: True for word in words}
def get_nb_features(cleaned_df):
    """Turn a labelled tweet DataFrame into a list of labelled feature sets.

    Rows whose ``label`` column equals 'positive', 'negative' or 'neutral'
    are selected (in that order), and each row's ``text`` is converted with
    ``features``.  Rows carrying any other label are left out.

    Args:
        cleaned_df: DataFrame with ``label`` and ``text`` columns.

    Returns:
        A list of ``(feature_dict, label)`` tuples: all positive tweets
        first, then the negative ones, then the neutral ones.
    """
    training_features = []
    for sentiment in ('positive', 'negative', 'neutral'):
        # Extract the tweets of this class ...
        subset = cleaned_df[cleaned_df['label'] == sentiment]
        tweets = subset['text'].tolist()
        # ... and create their training features.
        training_features += [(features(tweet), sentiment) for tweet in tweets]
    return training_features
def nb_cv(cleaned_df, n_splits=10, random_state=0):
    """Estimate Naive Bayes accuracy with shuffled k-fold cross-validation.

    Feature sets are built with ``get_nb_features``; for every fold an NLTK
    ``NaiveBayesClassifier`` is trained through a ``SentimentAnalyzer`` on
    the training part and evaluated on the held-out part.

    Args:
        cleaned_df: DataFrame handed to ``get_nb_features``.
        n_splits: Number of folds passed to ``KFold`` (default 10).
        random_state: Seed for the fold shuffling (default 0).

    Returns:
        The mean of the per-fold ``'Accuracy'`` values.
    """
    # Get features.
    training_set = get_nb_features(cleaned_df)

    # Important: shuffle=True.  get_nb_features returns the samples grouped
    # by label, so unshuffled folds would be dominated by a single class.
    cv = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    # Model
    sentiment_analyzer = SentimentAnalyzer()
    trainer = NaiveBayesClassifier.train

    # Store the accuracy of each fold.
    accuracies = []

    # For each fold: train the model, then evaluate it.
    for train_index, test_index in cv.split(training_set):
        # Index the Python list directly rather than converting the whole
        # training set to a NumPy object array twice per fold.
        train_fold = [training_set[i] for i in train_index]
        test_fold = [training_set[i] for i in test_index]

        classifier = sentiment_analyzer.train(trainer, train_fold)
        performance = sentiment_analyzer.evaluate(test_fold, classifier)
        accuracies.append(performance['Accuracy'])
        # Other measures can be collected here as well.  Sample result:
        # {'Accuracy': 0.525,
        #  'Precision [negative]': 0.28337874659400547, 'Recall [negative]': 0.7272727272727273, 'F-measure [negative]': 0.407843137254902,
        #  'Precision [neutral]': 0.5011933174224343, 'Recall [neutral]': 0.30837004405286345, 'F-measure [neutral]': 0.38181818181818183,
        #  'Precision [positive]': 0.7461629279811098, 'Recall [positive]': 0.611810261374637, 'F-measure [positive]': 0.672340425531915}

    return np.mean(accuracies)
if __name__ == '__main__':
    import sys

    # Script entry point: the data file path is taken from the command line,
    # loaded, cleaned tweet by tweet, and cross-validated.
    # NOTE(review): `cleaned_df` was previously referenced here without ever
    # being defined; this load -> clean pipeline is the presumed intent --
    # confirm against how the script is meant to be run.
    if len(sys.argv) != 2:
        sys.exit('usage: NaiveBayes.py <tab-separated tweet file>')

    cleaned_df = load_data(sys.argv[1])
    cleaned_df['text'] = cleaned_df['text'].apply(clean_tweet)
    print('Mean cross-validated accuracy:', nb_cv(cleaned_df))