forked from eric-sai/Data-Mining-on-Twitter
/
index.py
164 lines (155 loc) · 6.57 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import glob
import os
import re
import math
from parser1 import parseTokensFromText
import json
import nltk
def read_file(document, extend):
    """Build per-tweet term-frequency tables from JSON-lines files.

    The process working directory is changed to ``document`` (this side
    effect is kept from the original implementation), then every file
    matching the glob pattern ``extend`` is read line by line. Each
    non-blank line is parsed as a JSON object; its ``text`` field is cleaned
    and handed to ``parseTokensFromText``.

    Args:
        document: Directory to ``os.chdir`` into before globbing.
        extend: Glob pattern selecting the input files inside ``document``.

    Returns:
        A tuple ``(tf_by_docid, atid)``. ``tf_by_docid`` maps
        ``<file name><id_str>`` to a ``{term: count}`` dict. ``atid`` is
        always an empty dict; it is returned so callers can keep unpacking
        two values.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``text`` or ``id_str`` field.
    """
    # Raw string: the escapes (\w, \/, \S) are regex escapes, not Python string
    # escapes. The pattern blanks out @mentions, any character that is not an
    # ASCII letter/digit/space/tab, and scheme://... URLs.
    cleaner = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

    tf_by_docid = {}
    atid = {}
    os.chdir(document)
    for filename in glob.glob(extend):
        # The context manager guarantees the handle is closed even when a
        # line fails to parse.
        with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
            for line in handle:
                if not line.strip():
                    # Blank lines carry no record and would crash json.loads.
                    continue
                line_object = json.loads(line)
                text = cleaner.sub(" ", line_object['text'])
                # The file name is part of the key, so equal ids coming from
                # different files do not collide.
                tweet_id = filename + line_object['id_str']
                tf = tf_by_docid[tweet_id] = {}
                for term in parseTokensFromText(text):
                    tf[term] = tf.get(term, 0) + 1
    return tf_by_docid, atid
def calculate_df(tf_by_docid):
    """Count, for every term, the number of documents that contain it.

    Args:
        tf_by_docid: Mapping ``docid -> {term: count}``.

    Returns:
        A dict ``{term: document frequency}``.
    """
    doc_freq = {}
    for term_counts in tf_by_docid.values():
        # Each key of a per-document table contributes exactly one document.
        for term in term_counts:
            doc_freq[term] = doc_freq.get(term, 0) + 1
    return doc_freq
def cal_avel(tf_by_docid, N):
    """Return the summed size of all per-document term tables divided by N.

    The size of a document is ``len()`` of its term table, i.e. the number
    of distinct keys it holds, not the sum of the counts.

    Args:
        tf_by_docid: Mapping ``docid -> {term: count}``.
        N: Divisor supplied by the caller (the callers in this file pass
            ``len(tf_by_docid)``).

    Returns:
        The average as a float.

    Raises:
        ZeroDivisionError: If ``N`` is 0.
    """
    total_size = float(sum(len(term_counts) for term_counts in tf_by_docid.values()))
    return total_size / N
def AIG(tf_by_docid, query):
    """Collect document frequencies and query-term co-occurrence counts.

    Args:
        tf_by_docid: Mapping ``docid -> {term: count}``.
        query: Iterable of query terms.

    Returns:
        A tuple ``(df, df1)``:

        * ``df`` maps each term to the number of documents containing it.
          Every document is counted once, regardless of how many query
          terms there are.
        * ``df1`` maps each query term that occurs in at least one document
          to ``{other_term: n}``, where ``n`` is the number of documents in
          which both terms appear. Counts accumulate across documents.
    """
    # Drop repeated query terms (order preserved) so a duplicate cannot
    # count the same document twice.
    query_terms = list(dict.fromkeys(query))

    df = {}
    df1 = {}
    for terms in tf_by_docid.values():
        # Document frequency is independent of the query: one bump per doc.
        for term in terms:
            df[term] = df.get(term, 0) + 1

        for term1 in query_terms:
            if term1 not in terms:
                continue
            # setdefault keeps the table built from earlier documents
            # instead of replacing it with a fresh dict.
            co_counts = df1.setdefault(term1, {})
            for term2 in terms:
                if term2 != term1:
                    co_counts[term2] = co_counts.get(term2, 0) + 1
    return df, df1
def calculate_Dice(df, df1, query, N):
    """Score candidate terms by their Dice coefficient with the query terms.

    For a query term ``t1`` and a co-occurring term ``t2`` the contribution
    is ``2 * df1[t1][t2] / (df[t1] + df[t2])``. Contributions from
    different query terms are summed per candidate.

    Args:
        df: Mapping ``term -> document frequency``.
        df1: Mapping ``query term -> {other term: co-occurrence count}``.
        query: Iterable of query terms; terms missing from ``df1`` are
            skipped.
        N: Unused; kept so the signature matches the other scoring
            functions.

    Returns:
        A dict ``{candidate term: summed Dice score}``.
    """
    ret = {}
    for term1 in query:
        if term1 not in df1:
            continue
        # Index with term1: the original used an undefined name here and
        # raised NameError on the first matching query term.
        for term2, co_count in df1[term1].items():
            ret[term2] = ret.get(term2, 0) + 2 * co_count / (df[term1] + df[term2])
    return ret
def calculate_Jaccard(df, df1, query, N):
    """Score candidate terms by their Jaccard coefficient with the query terms.

    For a query term ``t1`` and a co-occurring term ``t2`` the contribution
    is ``co / (df[t1] + df[t2] - co)`` with ``co = df1[t1][t2]``.
    Contributions from different query terms are summed per candidate.

    Args:
        df: Mapping ``term -> document frequency``.
        df1: Mapping ``query term -> {other term: co-occurrence count}``.
        query: Iterable of query terms; terms missing from ``df1`` are
            skipped.
        N: Unused; kept so the signature matches the other scoring
            functions.

    Returns:
        A dict ``{candidate term: summed Jaccard score}``.
    """
    ret = {}
    for term1 in query:
        if term1 not in df1:
            continue
        # Index with term1: the original used an undefined name here and
        # raised NameError on the first matching query term.
        for term2, co_count in df1[term1].items():
            union_size = df[term1] + df[term2] - co_count
            ret[term2] = ret.get(term2, 0) + co_count / union_size
    return ret
def calculate_okapiBM25(tf, df, N, avel, k1=1.2, b=0.75):
    """Compute an Okapi BM25 weight for every term of one document.

    The per-term weight is::

        tf * (k1 + 1) * log((N - df + 0.5) / (df + 0.5))
        ------------------------------------------------
             tf + k1 * (1 - b + b * len(tf) / avel)

    The document length used for normalisation is ``len(tf)``, i.e. the
    number of distinct terms in the document.

    Args:
        tf: Mapping ``term -> count`` for the document.
        df: Mapping ``term -> document frequency``; must contain every term
            of ``tf``.
        N: Number of documents in the collection.
        avel: Average document length, in the same unit as ``len(tf)``.
        k1: Term-frequency saturation parameter (previously hard-coded
            to 1.2).
        b: Length-normalisation parameter (previously hard-coded to 0.75).

    Returns:
        A dict ``{term: BM25 weight}``. Weights are negative for terms that
        occur in more than half of the documents.

    Raises:
        ZeroDivisionError: If ``avel`` is 0 and ``tf`` is non-empty.
    """
    doc_len = len(tf)
    weights = {}
    for term, count in tf.items():
        idf = math.log((float(N) - df[term] + 0.5) / (df[term] + 0.5))
        length_norm = count + k1 * (1 - b + (b * doc_len / avel))
        weights[term] = (count * (k1 + 1)) * idf / length_norm
    return weights
def calculate_okapiBM25s(tf, df, tsv, R, N):
    """Weight one document's terms with a relevance-feedback BM25 variant.

    For a term present in ``tsv`` the weight is::

        tf * (1.2 + 1) * (1/3)
           * log( ((r + 0.5) / (R - r + 0.5))
                  / ((n - r + 0.5) / (N - n - R + r + 0.5)) )

    with ``r = tsv[term]`` and ``n = df[term]``. Terms absent from ``tsv``
    get ``0.0``.

    Args:
        tf: Mapping ``term -> count`` for the document.
        df: Mapping ``term -> document frequency``.
        tsv: Mapping ``term -> r``. Presumably the number of relevant
            documents containing the term; confirm against the caller.
        R: Size of the relevant set that ``tsv`` refers to.
        N: Number of documents in the collection.

    Returns:
        A dict ``{term: weight}`` holding every term of ``tf``.

    Raises:
        ValueError: If the ratio passed to ``math.log`` is not positive.
        ZeroDivisionError: If one of the smoothed denominators is exactly 0.
    """
    # NOTE(review): the 1/3 factor is carried over unchanged from the original
    # formula; its derivation is not visible in this file.
    scale = float(1 / 3)
    tfidf = {}
    for term, count in tf.items():
        if term not in tsv:
            tfidf[term] = 0.0
            continue
        r = tsv[term]
        n = df[term]
        # The +0.5 on (R - r) was missing; without it r == R divides by zero,
        # while the three sibling terms were already smoothed.
        relevant_odds = (r + 0.5) / (R - r + 0.5)
        other_odds = (n - r + 0.5) / (N - n - R + r + 0.5)
        tfidf[term] = (count * (1.2 + 1)) * scale * math.log(relevant_odds / other_odds)
    return tfidf
def calculate_tfidf(tf, df, N):
    """Compute a log-scaled tf-idf weight for every term of one document.

    The weight of a term is ``log(tf + 1) * log(N / df)``.

    Args:
        tf: Mapping ``term -> count`` for the document.
        df: Mapping ``term -> document frequency``; must contain every term
            of ``tf``.
        N: Number of documents in the collection.

    Returns:
        A dict ``{term: weight}``.
    """
    weights = {}
    for term, count in tf.items():
        damped_tf = math.log(count + 1)
        idf = math.log(float(N) / df[term])
        weights[term] = damped_tf * idf
    return weights
def calculate_AIG(df, df1, query, N):
    """Score candidate terms by their association with the query terms.

    For a query term ``t1`` and a co-occurring term ``t2`` the contribution
    is ``log(1 + N * (df1[t1][t2] / (df[t1] * df[t2])))``. Contributions
    from different query terms are summed per candidate.

    Args:
        df: Mapping ``term -> document frequency``.
        df1: Mapping ``query term -> {other term: co-occurrence count}``.
        query: Iterable of query terms; terms missing from ``df1`` are
            skipped.
        N: Number of documents in the collection.

    Returns:
        A dict ``{candidate term: summed score}``.
    """
    scores = {}
    for query_term in query:
        if query_term not in df1:
            continue
        neighbours = df1[query_term]
        for candidate in neighbours:
            ratio = neighbours[candidate] / (df[query_term] * df[candidate])
            scores[candidate] = scores.get(candidate, 0) + math.log(1 + N * ratio)
    return scores
def make_invidx_by_OkapiBM25s(document, extend, tsv):
    """Build an inverted index weighted by ``calculate_okapiBM25s``.

    Args:
        document: Directory passed through to ``read_file``.
        extend: Glob pattern passed through to ``read_file``.
        tsv: Mapping handed to ``calculate_okapiBM25s``; its length is used
            as that function's ``R`` argument.

    Returns:
        A dict ``{term: {docid: weight}}`` with an entry for every term of
        the collection.
    """
    tf_by_docid, _atid = read_file(document, extend)
    df = calculate_df(tf_by_docid)
    doc_count = len(tf_by_docid)
    tsv_size = len(tsv)

    # Create all posting lists first so the assignments below never miss.
    invidx = {}
    for term in df:
        invidx[term] = {}

    for docid, term_counts in tf_by_docid.items():
        weights = calculate_okapiBM25s(term_counts, df, tsv, tsv_size, doc_count)
        for term, weight in weights.items():
            invidx[term][docid] = weight
    return invidx
def make_index_AIG(document, extend, query):
    """Build an inverted index restricted to the terms scored by ``calculate_AIG``.

    Args:
        document: Directory passed through to ``read_file``.
        extend: Glob pattern passed through to ``read_file``.
        query: Iterable of query terms.

    Returns:
        A dict ``{term: {docid: score}}``. Only terms that received a score
        are present, and a term carries the same score in every document
        that contains it.
    """
    tf_by_docid, _atid = read_file(document, extend)
    df, df1 = AIG(tf_by_docid, query)
    score = calculate_AIG(df, df1, query, len(tf_by_docid))

    invidx = {term: {} for term in score}
    for docid, term_counts in tf_by_docid.items():
        for term in term_counts:
            if term in score:
                invidx[term][docid] = score[term]
    return invidx
def make_invidx(document, extend):
    """Build a length-normalised tf-idf inverted index.

    Args:
        document: Directory passed through to ``read_file``.
        extend: Glob pattern passed through to ``read_file``.

    Returns:
        A tuple ``(invidx, length_by_docid)``. ``invidx`` maps
        ``term -> {docid: weight}``, where each weight is the term's tf-idf
        value divided by the Euclidean norm of the document's tf-idf
        vector. ``length_by_docid`` maps each docid to that norm.
    """
    tf_by_docid, _atid = read_file(document, extend)
    df, N = calculate_df(tf_by_docid), len(tf_by_docid)
    invidx, length_by_docid = {term: {} for term in df}, {}
    for docid, term_counts in tf_by_docid.items():
        tfidf = calculate_tfidf(term_counts, df, N)
        length = length_by_docid[docid] = math.sqrt(
            sum(value ** 2 for value in tfidf.values())
        )
        for term, weight in tfidf.items():
            # The norm is 0 when every term of the document occurs in all
            # documents (idf == 0), e.g. a single-document collection. All
            # weights are 0 in that case, so store 0.0 instead of dividing.
            invidx[term][docid] = weight / length if length else 0.0
    return invidx, length_by_docid
def make_invidx_by_OkapiBM25(document, extend):
    """Build an inverted index weighted by ``calculate_okapiBM25``.

    Args:
        document: Directory passed through to ``read_file``.
        extend: Glob pattern passed through to ``read_file``.

    Returns:
        A dict ``{term: {docid: weight}}``.
    """
    tf_by_docid, _atid = read_file(document, extend)
    df = calculate_df(tf_by_docid)
    doc_count = len(tf_by_docid)
    avel = cal_avel(tf_by_docid, doc_count)

    invidx = {term: {} for term in df}
    for docid, term_counts in tf_by_docid.items():
        weights = calculate_okapiBM25(term_counts, df, doc_count, avel)
        for term, weight in weights.items():
            invidx[term][docid] = weight
    return invidx
def make_invidx_by_OkapiBM251(document, extend):
    """Build a BM25 inverted index and also return the collection statistics.

    Args:
        document: Directory passed through to ``read_file``.
        extend: Glob pattern passed through to ``read_file``.

    Returns:
        A tuple ``(invidx, tf_by_docid, df, N)``: the
        ``{term: {docid: weight}}`` index, the per-document term counts, the
        document-frequency table, and the number of documents.
    """
    tf_by_docid, _atid = read_file(document, extend)
    df = calculate_df(tf_by_docid)
    N = len(tf_by_docid)
    avel = cal_avel(tf_by_docid, N)

    invidx = {term: {} for term in df}
    for docid, term_counts in tf_by_docid.items():
        weights = calculate_okapiBM25(term_counts, df, N, avel)
        for term, weight in weights.items():
            invidx[term][docid] = weight
    return invidx, tf_by_docid, df, N