-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentenceNaiveBayes.py
149 lines (123 loc) · 4.35 KB
/
sentenceNaiveBayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
##########Naive Bayes Sentence Classification########
'''
Date: 4/14/2016
Trying to write the classifier in general terms with many parameters (useful for comparative analysis)
Fix from prev Sentence classifiers:
Planning to use the top NUMFEA frequent ngrams from each language.
Previously we used the top k features from the entire dataset.
Parameters:
character n - grams = 1, 2, 3, 4, 5, 6
Number of features from each language - NUMFEA (k)
'''
##Dictionary Functions
#Not Working
def printDict(diction):
    """Debug helper: print the full key list, then one key per line.

    Fix for the "#Not Working" note above: the original indexed
    ``diction.keys()[i]``, which fails on Python 3 where ``keys()`` is a
    view, and used Python-2-only print statements. This version behaves
    the same on Python 2 and 3.
    """
    keys = list(diction.keys())
    print(keys)
    for key in keys:
        print(key)
def addKey(diction, key):
    """Insert *key* into *diction* with an initial count of 1."""
    diction.update({key: 1})
def updateKey(diction, key, value):
    """Increment the existing count stored under *key* by *value*.

    The key must already be present (KeyError otherwise).
    """
    diction[key] += value
def addEntry(diction, st):
    """Count one occurrence of *st* in the frequency dict *diction*.

    Fix: ``dict.has_key`` was removed in Python 3; ``dict.get`` with a
    default does the same job in one lookup and runs on both Python 2
    and 3.
    """
    diction[st] = diction.get(st, 0) + 1
# Experiment hyper-parameters (read by selectFeatures / charNgramfeatureDict).
LOW = 0  # rank offset: features come from the [LOW:NUMFEA] slice of ranked n-grams
NUMFEA = 1000  # number of top n-gram features kept per language cluster
NGRAM = 4  # character n-gram length
import time
import nltk
import re
import codecs
from nltk.classify import apply_features
#adds st to the corresponding cluster dictionary. In the else case, it creates a new dictionary for the cluster and then adds
def langAddEntry(diction, st, cluster):
    """Count one occurrence of *st* in the per-cluster frequency dict
    ``diction[cluster]``, creating the cluster's dict on first use.

    Fix: ``dict.has_key`` was removed in Python 3. ``setdefault`` both
    creates the cluster bucket when missing and returns it, collapsing
    the original if/else into one step with identical behavior.
    """
    bucket = diction.setdefault(cluster, {})
    bucket[st] = bucket.get(st, 0) + 1
#Given a sentence(or word) build the feature vector using selected features set
def charNgramfeatureDict(word, feature_set=None, ngram=None):
    """Build the NLTK feature dict for one sentence (or word).

    Every selected feature starts at 1 (presumably so absent n-grams
    still carry a value — confirm intent), and each occurrence of a
    selected n-gram in *word* adds 1 on top.

    Generalization (backward-compatible): *feature_set* and *ngram*
    default to the module-level ``features`` / ``NGRAM`` globals, so
    existing one-argument callers (e.g. ``apply_features``) behave as
    before, while tests and comparative runs can pass them explicitly —
    addressing the "doesn't accept more than one argument" note by the
    ``features`` global below.

    :param word: the sentence/word to featurize
    :param feature_set: set of character n-grams to count (default: global ``features``)
    :param ngram: character n-gram length (default: global ``NGRAM``)
    :returns: dict mapping each feature n-gram to 1 + its count in *word*
    """
    if feature_set is None:
        feature_set = features
    if ngram is None:
        ngram = NGRAM
    # NOTE: an earlier experiment padded the word as "@#" + word + "$%".
    chNgramFea = {key: 1 for key in feature_set}
    for i in range(ngram - 1, len(word)):
        st = word[i - (ngram - 1):i + 1]
        if st in feature_set:
            chNgramFea[st] += 1
    return chNgramFea
#Build Char Ngram vocabulary for forming features. Space is considered in Ngrams. Builds a dictionary for each language cluster
def charNgramVoca(filename, ngram=None):
    """Build per-language character n-gram frequency dictionaries from a
    tab-separated training file.

    Each line is split on runs of tabs; ``entr[0]`` is the language
    label and ``entr[2]`` the sentence (the last character — normally
    the newline — is stripped). Spaces are kept inside the n-grams.

    Fixes: the file is now closed even on an exception (``with``), and
    the manual readline loop is replaced by line iteration — same lines,
    same order. *ngram* is a backward-compatible parameter defaulting to
    the global ``NGRAM``.

    NOTE(review): ``entr[2][:-1]`` drops a real character if the final
    line has no trailing newline — confirm the data always ends with one.

    :param filename: path to the tab-separated, UTF-8 training file
    :param ngram: character n-gram length (default: global ``NGRAM``)
    :returns: dict mapping language label -> {ngram: count}
    """
    if ngram is None:
        ngram = NGRAM
    chNgramVoca = {}
    with codecs.open(filename, 'r', encoding='utf8') as f:
        for line in f:
            entr = re.split(r'\t+', line)
            sentence = entr[2][:-1]
            # earlier experiment padded: sentence = u"@#" + entr[2] + u"$%"
            bucket = chNgramVoca.setdefault(entr[0], {})
            for i in range(ngram - 1, len(sentence)):
                st = sentence[i - (ngram - 1):i + 1]
                bucket[st] = bucket.get(st, 0) + 1
    return chNgramVoca
#Select top NUMFEA Ngrams from each cluster
def selectFeatures(chNgramVoca, low=None, top=None):
    """Select the top-ranked character n-grams from each language cluster
    and merge them into one feature set.

    Within each cluster the n-grams are sorted by frequency (descending)
    and the ``[low:top]`` slice is kept — matching the original
    ``[LOW:NUMFEA]`` behavior.

    Generalization (backward-compatible): *low* and *top* default to the
    module-level ``LOW`` / ``NUMFEA`` globals so existing callers are
    unchanged, while comparative runs can pass them explicitly.

    :param chNgramVoca: dict mapping cluster label -> {ngram: count}
    :param low: rank offset into the sorted list (default: global ``LOW``)
    :param top: end of the rank slice (default: global ``NUMFEA``)
    :returns: set of selected n-gram strings
    """
    import operator  # kept function-local, as in the original
    if low is None:
        low = LOW
    if top is None:
        top = NUMFEA
    features = set()
    for cluster, counts in chNgramVoca.items():
        ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
        for ngram_str, _count in ranked[low:top]:
            features.add(ngram_str)
    return features
###Training and testing
# Return data should be used by apply features of NLTK to resolve memory error, But it is not working sometimes
# [(word, LangClass),....]
def buildLabelData(filename):
    """Read a tab-separated file into ``[(sentence, language_label), ...]``
    suitable for NLTK's ``apply_features``.

    Fixes: the file is now closed even on an exception (``with``), and
    the manual readline loop is replaced by line iteration — same lines,
    same order, same values.

    NOTE(review): unlike charNgramVoca, the trailing newline of
    ``entr[2]`` is NOT stripped here — preserved as-is, but confirm
    whether that asymmetry is intentional.

    :param filename: path to the tab-separated, UTF-8 data file
    :returns: list of (sentence, label) tuples
    """
    labelData = []
    with codecs.open(filename, 'r', encoding='utf8') as f:
        for line in f:
            entr = re.split(r'\t+', line)
            labelData.append((entr[2], entr[0]))
    return labelData
def trainNaivebayes(trainFile):
    """Train an NLTK Naive Bayes classifier on the labelled sentences in
    *trainFile*, lazily featurized via ``apply_features`` to limit memory
    use.
    """
    train_set = apply_features(charNgramfeatureDict, buildLabelData(trainFile))
    return nltk.NaiveBayesClassifier.train(train_set)
def classifyTestData(filename, classifier):
    """Return *classifier*'s accuracy on the labelled sentences in
    *filename*, lazily featurized via ``apply_features``.
    """
    test_set = apply_features(charNgramfeatureDict, buildLabelData(filename))
    return nltk.classify.accuracy(classifier, test_set)
# Input data files: tab-separated lines.
# NOTE(review): column layout inferred from the re.split(r'\t+', line) usage
# (entr[0] = language label, entr[2] = sentence) — confirm against the data.
trainFile = "C:/Users/gsr/Desktop/NLP Project/code/data/twitter/train_recall_oriented.txt"
testFile = "C:/Users/gsr/Desktop/NLP Project/code/data/twitter/test_recall_oriented.txt"
smallFile = "C:/Users/gsr/Desktop/NLP Project/code/data/twitter/small.txt"
features = set() #needs to be global because charNgramfeatureDict doesn't accept more than one argument
def NgramMain(trainFile, testFile):
    """Run the full experiment: build the per-language n-gram vocabulary,
    select the feature set, train the Naive Bayes classifier, and report
    accuracy on the test file, printing the elapsed time of the training
    and testing phases.

    Fix: the Python-2-only print statements are replaced with
    parenthesized single-argument ``print(...)`` calls, which produce
    identical output on both Python 2 and Python 3.
    """
    vocab = charNgramVoca(trainFile)
    # charNgramfeatureDict reads the module-level `features` set, so it
    # must be rebound before any featurization happens.
    global features
    features = selectFeatures(vocab)
    print("Training on file:" + trainFile)
    tstart = time.time()
    classifier = trainNaivebayes(trainFile)
    tend = time.time()
    print(tend - tstart)
    print("Testing on file:" + testFile)
    accuracy = classifyTestData(testFile, classifier)
    print("Accuracy of the classifier:")
    print(accuracy)
    # NOTE(review): this interval also includes the accuracy printing,
    # matching the original measurement.
    cend = time.time()
    print(cend - tend)