-
Notifications
You must be signed in to change notification settings - Fork 0
/
unigram.py
111 lines (86 loc) · 2.58 KB
/
unigram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from sets import Set
from get_unigram import get_unigram
import nltk
import re
import os
from math import *
def perplexity(Prob_dist, filename, N, smoothing=0.00004):
    """Score how well a unigram model explains the text stored in a file.

    Despite its name, this function does not return a perplexity: it returns
    the sum of base-2 logarithms of the smoothed word probabilities (a
    log-likelihood).  A larger value means the model fits the text better,
    which is how the callers in this file compare the two class models.

    Args:
        Prob_dist: Mapping from word to its probability under the model.
        filename: Path of the text file to score.
        N: Unused.  Kept only so existing three-argument calls keep working
            (the previous implementation overwrote it immediately).
        smoothing: Constant added to every word's probability so that words
            missing from ``Prob_dist`` do not produce ``log(0)``.

    Returns:
        float: Sum over all tokens of ``log2(P(token) + smoothing)``;
        ``0.0`` when the file yields no tokens.
    """
    # Use a context manager so the handle is closed even if reading fails.
    with open(filename, 'r') as fh:
        text = fh.read()

    # Drop parentheses, '+', '.', ',' and apostrophes before tokenizing.
    text = re.sub("[()+.,']", '', text)
    words = nltk.tokenize.word_tokenize(text)

    # Unknown words contribute probability 0, i.e. log2(smoothing) after
    # smoothing -- identical to the former explicit has_key/else branches.
    return sum(
        (log(float(Prob_dist.get(word, 0) + smoothing), 2) for word in words),
        0.0,
    )
# Evaluation driver: for every dataset folder id in `myset`, build positive
# and negative unigram models with get_unigram() from the *other* folder ids,
# classify each file under the held-out folder's pos/ and neg/ directories,
# and print a scikit-learn classification report for that folder.
# NOTE(review): get_unigram is assumed to treat its third argument as the set
# of folders to build the model from -- confirm against get_unigram.py.
from sklearn.metrics import classification_report

# Root directory that contains one sub-folder per fold (1..5).
DATASET_ROOT = '/home/avj/Documents/NLP/NLP_BinaryClassifier/dataset/'


def _normalize_counts(counts, total):
    """Return a new dict with every value of `counts` divided by `total`."""
    return dict((word, float(count) / total) for word, count in counts.items())


def _classify_folder(folder, pos_model, neg_model, pos_total, neg_total):
    """Return a 1 (positive) / 0 (negative) prediction per file in `folder`."""
    predictions = []
    for name in os.listdir(folder):
        path = folder + "/" + name
        pos_score = perplexity(pos_model, path, pos_total)
        neg_score = perplexity(neg_model, path, neg_total)
        # The class whose model gives the larger score wins; ties go to 0.
        predictions.append(1 if pos_score > neg_score else 0)
    return predictions


myset = Set([1, 2, 3, 4, 5])
Pos_Dict = dict()
Neg_Dict = dict()
target_names = ['0', '1']

# The context manager guarantees the results file is closed on any error.
with open('results_unigram', 'w+') as fp:
    # Iterate over a sorted snapshot and build a fresh set of the remaining
    # folders each time, instead of removing from / re-adding to an alias of
    # the very set that is being iterated.
    for x in sorted(myset):
        train_folds = Set(fold for fold in myset if fold != x)

        Pos_Dict = get_unigram('pos', 'dataset', train_folds)
        print(len(Pos_Dict))
        Neg_Dict = get_unigram('neg', 'dataset', train_folds)
        print(len(Neg_Dict))

        fpath = DATASET_ROOT + str(x)
        fp.write("Test folder:" + str(x) + "\n")

        # Turn the raw values returned by get_unigram into relative
        # frequencies by dividing each one by the sum of all values.
        Npos = float(sum(Pos_Dict.values()))
        Nneg = float(sum(Neg_Dict.values()))
        Pos_Dict = _normalize_counts(Pos_Dict, Npos)
        Neg_Dict = _normalize_counts(Neg_Dict, Nneg)

        # Files under pos/ have true label 1, files under neg/ true label 0.
        pos_pred = _classify_folder(fpath + '/pos', Pos_Dict, Neg_Dict, Npos, Nneg)
        neg_pred = _classify_folder(fpath + '/neg', Pos_Dict, Neg_Dict, Npos, Nneg)
        y_true = [1] * len(pos_pred) + [0] * len(neg_pred)
        y_pred = pos_pred + neg_pred

        print(classification_report(y_true, y_pred, target_names=target_names))
        print("in progress")