/
IDF.py
65 lines (59 loc) · 2.23 KB
/
IDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from __future__ import division
from stop_words import get_stop_words
import string
import math
import operator
class IDF:
def __init__(self, str_1, str_2, corpus):
self.list_corpus = corpus
self.stop_words = get_stop_words('en')
# remove punctuation
str_1 = str_1.translate(string.maketrans("", ""), string.punctuation)
str_2 = str_2.translate(string.maketrans("", ""), string.punctuation)
#self.list_corpus = []
# Tokenize the sentence
self.str_1 = self.tokenize(str_1)
self.str_2 = self.tokenize(str_2)
#print "st1",str_1,"st2",str_2
#print "cccc",corpus
# for i in range(len(corpus)):
# item = str(corpus[i]).split(" ")
# for token in item:
# if token in self.stop_words:
# pass
# else:
# self.list_corpus.append(token)
def intersect(self):
return [i for i in self.str_1 if i in self.str_2]
def union(self):
return list(self.str_1 + self.str_2)
# tokenize str and remove stop words
def tokenize(self, item):
item = item.split(" ")
for token in item:
if token in self.stop_words:
item.remove(token)
return item
def cal_overlap(self):
list_intersect = self.intersect()
list_union = self.union()
freq_inter = 0.0
freq_union = 0.0
for item in list_intersect:
matches_intersect = [x for x in self.list_corpus if item == x]
freq_inter += math.log10(operator.pow(len(matches_intersect) + 1, -1))
for item in list_union:
if len(list_union)==0:
print self.str_1,self.str_2
matches_union = [x for x in self.list_corpus if item == x]
freq_union += math.log10(operator.pow(len(matches_union) + 1, -1))
#print "cor", self.list_corpus
#print list_intersect,list_union
idf_overlap = freq_inter / freq_union
return idf_overlap
# print matches_intersect, freq_inter
# print matches_union, freq_union
#
# print "intersect", list_intersect
# print "union", list_union
# print "corpus", self.list_corpus