# queryExpander.py
import base64
import json
import urllib2
from collections import defaultdict
from math import log
import utilities
#run the program
def main():
print "Enter precision:",
targetPrecision = float(raw_input())
precision = -1.0
print "Enter Query:",
inputQuery = raw_input()
qTerms = inputQuery.split()
bingAccountKey = "QgPWn9g/wi5g0BrHFTheQNwEjD+/m98WcIi8ps2G6V8="
while(precision < targetPrecision):
#Print bing API key
print "Client Key\t= " + bingAccountKey
#Print query terms
query = qTerms[0]
for qTerm in xrange(len(qTerms) - 1):
query += (" "+qTerms[qTerm + 1])
print "Query\t\t= " + query
query = query.replace(" ", "+")
#Print precision
print "Precision\t= " + str(targetPrecision)
#Print URL
url = "https://api.datamarket.azure.com/Bing/Search/Web?Query=%27" + query + "%27&$top=10&$format=json"
print "URL: " + url
#query the API and get docs
documents = utilities.search(query, url, bingAccountKey)
#show results and get feedback
documents = utilities.getFeedback(documents)
#show new results
print "======================"
print "FEEDBACK SUMMARY"
print "Query " + query.replace("+", " ")
#intermediate check
if utilities.checkTargetPrecision(documents, targetPrecision):
break
print "Indexing results..."
dFrequency = defaultdict(list)
for index in xrange(len(documents)):
#initialize scores to 0
documents[index]['score'] = defaultdict(int)
#get rid of characters
documents[index] = utilities.removeCharacters(documents[index])
#terms from description and title of document
terms = utilities.getTerms(documents[index])
wCount = float(len(terms))
for term in terms:
documents[index]['score'][term] = documents[index]['score'][term] + 1/wCount
if(len(dFrequency[term]) is 0):
dFrequency[term] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
dFrequency[term][index] = 1
dFrequency = utilities.computeDocumentFrequency(dFrequency, len(documents))
#use the document frequency to update the scores
documents = utilities.documentFrequencyTermScore(documents, dFrequency)
#we then create to dicts: one that is relevant and the other that is not
dRelevant, dNRelevant = utilities.divideDictionary(documents)
#run Rocchio Algorithm
mDict = utilities.getMasterDictionary(dRelevant, dNRelevant, documents, qTerms)
#get the two most important terms to add
bestTerm1, bestTerm2 = utilities.getBestTerms(mDict, qTerms)
print "Augmenting by "+ bestTerm1 + " " + bestTerm2
if(bestTerm1 != " "):
qTerms.append(bestTerm1)
if(bestTerm2 != " "):
qTerms.append(bestTerm2)
else:
print "No further augumentation!"
break
def score(m): return mDict[m.lower()]
qTerms.sort(key=score, reverse=True)
# Script entry point: start the interactive query-expansion loop.
# The call is unconditional (there is no __name__ guard), so it runs whenever
# this module is executed or imported.
main()