-
Notifications
You must be signed in to change notification settings - Fork 0
/
pool.py
81 lines (70 loc) · 2.96 KB
/
pool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
__author__ = 'techbk'
import os
from bagofword import BagOfWords
from documentclass import DocumentClass
from document import Document
class Pool(object):
def __init__(self):
self.__document_classes = {} #dictionaries of document in a class
self.__vocabulary = BagOfWords() #dictionaries of word in a class
def sum_words_in_class(self, dclass):
""" The number of times all different words of a dclass appear in a class """
sum = 0
for word in self.__vocabulary.Words():
WaF = self.__document_classes[dclass].WordsAndFreq()
if word in WaF:
sum += WaF[word]
return sum
def learn(self, directory, dclass_name):
""" directory is a path, where the files of the class with the name dclass_name can be found """
x = DocumentClass(self.__vocabulary)
dir = os.listdir(directory)
for file in dir:
d = Document(self.__vocabulary)
print(directory + "/" + file)
d.read_document(directory + "/" + file, learn = True)
x = x + d
self.__document_classes[dclass_name] = x
x.SetNumberOfDocs(len(dir))
def Probability(self, doc, dclass = ""):
"""Calculates the probability for a class dclass given a document doc"""
if dclass:
sum_dclass = self.sum_words_in_class(dclass)
prob = 0
d = Document(self.__vocabulary)
d.read_document(doc)
for j in self.__document_classes:
sum_j = self.sum_words_in_class(j)
prod = 1
for i in d.Words():
wf_dclass = 1 + self.__document_classes[dclass].WordFreq(i)
wf = 1 + self.__document_classes[j].WordFreq(i)
r = wf * sum_dclass / (wf_dclass * sum_j)
prod *= r
prob += prod * self.__document_classes[j].NumberOfDocuments() / self.__document_classes[dclass].NumberOfDocuments()
if prob != 0:
return 1 / prob
else:
return -1
else:
prob_list = []
for dclass in self.__document_classes:
prob = self.Probability(doc, dclass)
prob_list.append([dclass,prob])
prob_list.sort(key = lambda x: x[1], reverse = True)
return prob_list
def DocumentIntersectionWithClasses(self, doc_name):
res = [doc_name]
for dc in self.__document_classes:
d = Document(self.__vocabulary)
d.read_document(doc_name, learn=False)
o = self.__document_classes[dc] & d
intersection_ratio = len(o) / len(d.Words())
res += (dc, intersection_ratio)
return res
def test(self):
document_class = self.__document_classes["clinton"]
bag = document_class.WordsAndFreq()
#print(bag)
for key,value in bag.items():
print(key,':',value)