-
Notifications
You must be signed in to change notification settings - Fork 0
/
ranker.py
202 lines (167 loc) · 6.67 KB
/
ranker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#!/usr/bin/env python
# Author: Jared Hancock
# NOTE: if you get an nltk.data LookupError, run "import nltk; nltk.download()" in a Python terminal
import json, re, math, nltk, string
from urllib2 import urlopen
from nltk.corpus import stopwords
from collections import Counter
from nltk import PorterStemmer
# Google Custom Search credentials; both values are placeholders.
# NOTE(review): googleSearch() builds its request URL from literal key/cx
# values and does not read these two constants -- confirm which is intended.
API_KEY = 'insert key here'
CX_KEY = 'insert key here'
# NLTK's English stop-word list, with every word encoded to UTF-8.
# Presumably encoded so the words compare equal to the UTF-8 tokens that
# preprocess() and searchRank() test against this list -- TODO confirm.
STOPS = [word.encode('utf-8') for word in stopwords.words('english')]
class Result(object):
    '''
    One search hit together with the similarity data computed for it
    @param int rank: position of the hit in the original result list
    @param string title: title of the hit
    @param string url: link of the hit
    @param string snippet: text excerpt of the hit
    '''

    def __init__(self, rank, title, url, snippet):
        self.rank = rank
        self.title = title
        self.url = url
        self.snippet = snippet
        # Filled in later by the ranking pipeline; None means "not computed yet".
        self.tokens = None
        self.vector = None
        # Similarity scores against the relevance set; 0 until calculated.
        self.jaccard = 0
        self.cosine = 0

    def __repr__(self):
        # Short, unambiguous form for debugging; the snippet is left out for brevity.
        return "Result(rank=%r, title=%r, url=%r)" % (self.rank, self.title, self.url)
def removePunct( text ):
    '''
    Removes ASCII punctuation, bullets, and middots from text
    @param string text: UTF-8 encoded byte string, or unicode text, to be edited
    @return string: UTF-8 encoded text without punctuation
    '''
    # Decode byte strings only: calling .decode() on text that is already
    # unicode forces an implicit ASCII encode first and fails on non-ASCII input.
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    text = text.replace(u"\u2022", u"").replace(u"\u00B7", u"") # get rid of bullets and middots
    # string.punctuation is escaped so characters such as ']' and '\' are
    # literal members of the character class.
    return re.sub(u'[%s]' % re.escape(string.punctuation), u'', text).encode("utf-8")
def preprocess( result ):
    '''
    Pre-processes a search result by tokenizing its title and snippet, removing
    stop words, and stemming, to prepare it for comparison
    @param Result result: search result whose title and snippet are processed
    @return Result: the same object, with its tokens attribute set to the list
                    of lower-cased, stemmed, UTF-8 encoded tokens
    '''
    words = removePunct(result.title)
    words += " "
    words += removePunct(result.snippet)
    stops = frozenset(STOPS)      # constant-time membership tests
    stemmer = PorterStemmer()     # one stemmer for the whole result
    tokens = []
    for tok in nltk.word_tokenize(words):
        # Lower-case before the lookup so capitalized stop words ("The") are dropped too.
        if tok.lower() in stops:
            continue
        stem = stemmer.stem(tok.decode('utf-8'))
        tokens.append(stem.lower().encode('utf-8'))
    # Store the processed list; rebinding the loop variable alone would leave
    # the raw tokens in place.
    result.tokens = tokens
    return result
def jaccard( result, relevant ):
    '''
    Calculates jaccard coefficient between two collections of tokens
    @param set result: tokens rendered from a search result (any iterable is accepted)
    @param set relevant: tokens to test the result against for similarity (any iterable is accepted)
    @return float: jaccard coefficient, size of intersection / size of union;
                   0.0 when both collections are empty
    '''
    result = set(result)
    relevant = set(relevant)
    unionSize = len(result | relevant)
    # Two empty collections have an empty union; report "no similarity"
    # instead of dividing by zero.
    if not unionSize:
        return 0.0
    return len(result & relevant) / float(unionSize)
def calc_cos( vec1, vec2 ):
    '''
    Calculates cosine similarity based on term frequency
    @param dict vec1: maps each term to its number of occurrences
    @param dict vec2: maps each term to its number of occurrences
    @return float: dot product of the two vectors divided by the product of
                   their lengths; 0.0 when either vector has zero length
    '''
    # Dot product: only terms present in both vectors contribute.
    dot = 0
    for term in vec1:
        if term in vec2:
            dot += vec1[term] * vec2[term]
    length1 = math.sqrt(sum([count ** 2 for count in vec1.values()]))
    length2 = math.sqrt(sum([count ** 2 for count in vec2.values()]))
    magnitude = length1 * length2
    if magnitude:
        return float(dot) / magnitude
    return 0.0
def googleSearch( query, resList, start):
    '''
    Searches google for a search term and appends the hits to resList
    @param string query: search term; '+' may be used to separate words
    @param list Result resList: list that receives one Result object per hit
    @param int start: Index of results to begin from (Google API only allows 10 results per request)
    @return None
    '''
    from urllib import quote  # Python 2 stdlib, imported locally so the file's import block is untouched

    if not isinstance(query, str):
        query = query.encode('utf-8')  # quote() needs a byte string
    # NOTE(review): the API key and search-engine id are hard-coded in this URL,
    # while the module-level API_KEY / CX_KEY constants go unused. They are kept
    # as-is to preserve behavior -- move them into configuration and rotate the key.
    # The query is percent-escaped so characters such as '&' or '#' cannot
    # truncate or alter the request; '+' is kept because callers use it as the
    # word separator.
    url = ('https://www.googleapis.com/customsearch/v1'
           '?key=AIzaSyACR0s18hhnQD52hqGGEMEMO3ESpHh65k4'
           '&start=' + str(start) +
           '&prettyPrint=true'
           '&cx=014898526197053737379:tql4sbmedis'
           '&q=' + quote(query, safe='+'))
    req = urlopen(url)
    try:
        data = json.load(req)
    finally:
        req.close()  # release the connection even if the body is not valid JSON
    # A response without hits has no 'items' key, and a hit may lack a
    # snippet; treat both as empty instead of raising KeyError.
    for rank, item in enumerate(data.get('items', []), start):
        resList.append(Result(rank,
                              item['title'].encode('utf8'),
                              item['link'].encode('utf8'),
                              item.get('snippet', u'').encode('utf8')))
def _printResult( r ):
    '''
    Prints the rank, title, url, and snippet of a search result, one per line
    @param Result r: search result to display
    @return None
    '''
    print(r.rank)
    print(r.title)
    print(r.url)
    print(r.snippet)

def _askRelevant( validRanks, limit ):
    '''
    Asks the user for the numbers of the results that were relevant
    @param set validRanks: result numbers that may be chosen
    @param int limit: maximum number of results that may be chosen
    @return list of int: distinct chosen result numbers, in the order entered
    '''
    chosen = []
    prompt = "Enter a result number: "
    # Stop as soon as the limit is reached, so no extra answer is requested
    # and then thrown away.
    while len(chosen) < limit:
        # raw_input + int(): Python 2's input() would evaluate whatever the user types.
        answer = raw_input(prompt)
        prompt = "Enter a result number ( negative to quit ): "
        try:
            relNum = int(answer)
        except ValueError:
            print("Error: Please enter a whole number")
            continue
        if relNum < 0:
            break
        if relNum in chosen:
            print("Error: You already entered that result")
        elif relNum not in validRanks:
            print("Error: There is no result with that number")
        else:
            chosen.append(relNum)
    return chosen

def searchRank( query ):
    '''
    Main controller; Conducts user interface and I/O
    Fetches two pages of search results, asks which ones are relevant, scores
    every result against the relevant ones, and prints the results re-sorted by
    the similarity measure the user picks. Writes the relevant text to
    <query>.txt and its tokens to <query>-clean.txt.
    @param string query: search term, with '+' in place of spaces
    @return None
    '''
    resList = [] # list of search result objects
    googleSearch(query, resList, 1)
    googleSearch(query, resList, 11)
    if not resList:
        print("No results found.")
        return
    for r in resList:
        r = preprocess(r) # initialize tokens attribute with pre-processed words
        r.vector = Counter(r.tokens)
        _printResult(r)
        print("")
    # ask user which results are relevant
    print("Choose up to 5 results that were relevant to your search.")
    print("Enter a negative number to quit.")
    relList = _askRelevant(set(r.rank for r in resList), 5)
    # write relevant data to file
    byRank = dict((r.rank, r) for r in resList)
    relWords = ''.join(byRank[n].title + ' ' + byRank[n].snippet + ' ' for n in relList)
    with open(query+'.txt', 'wb') as outfile:
        outfile.write(relWords)
    # --------------------pre-process our relevance test set-------------------------
    # Run the relevance text through preprocess() too, so both sides of the
    # comparison are tokenized and normalized by exactly the same code.
    relTokens = preprocess(Result(0, relWords, '', '')).tokens
    with open(query+'-clean.txt', 'w') as outfile:
        outfile.write(''.join(tok + ' ' for tok in relTokens))
    # --------------------calculate, sort, and display----------------------------------
    relevanceVector = Counter(relTokens) # get vector for relevance data to calc similarity
    relevantSet = set(relTokens)
    print("Calculating relevancy of your search results......")
    # calculate similarity
    for r in resList:
        r.cosine = calc_cos(r.vector, relevanceVector)
        tokenSet = set(r.tokens)
        # With nothing on either side there is no union to divide by; score it 0.
        if tokenSet or relevantSet:
            r.jaccard = jaccard(tokenSet, relevantSet)
        else:
            r.jaccard = 0.0
    print("Select sorting preference:")
    print("[1] Jaccard Coefficient")
    print("[2] Cosine Similarity")
    print("")
    sortChoice = raw_input("Enter choice here: ").strip().lower()
    if sortChoice in ['1', 'j', 'jaccard', 'jaccard coefficient']:
        resList.sort(key = lambda x: x.jaccard, reverse=True)
        print("Showing results based on jaccard coeffecient: ")
    elif sortChoice in ['2', 'c', 'cosine','cosine similarity']:
        resList.sort(key = lambda x: x.cosine, reverse=True)
        print("Showing results based on cosine similarity: ")
    else:
        # Say so instead of silently listing the results unsorted.
        print("Unrecognized choice; showing results in their original order: ")
    for r in resList:
        print("")
        _printResult(r)
        print("")
if __name__ == "__main__":
    # Script entry point: read the search phrase and join its words with '+'
    # before handing it to the controller.
    q = "+".join(raw_input("Query to search for: ").split(' '))
    searchRank(q)