-
Notifications
You must be signed in to change notification settings - Fork 0
/
links_from_search.py
121 lines (108 loc) · 4.65 KB
/
links_from_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import pprint
import xml.etree.ElementTree
from xml.sax.saxutils import escape
from googleapiclient.discovery import build
import common_lib
import passage_retrieval
from py_bing_search import PyBingSearch
import sys
import json
def GetLinksForQueryBing(query):
    """Return up to 20 result URLs for ``query`` using Bing.

    Args:
        query: The search string passed to ``PyBingSearch.search_all``.

    Returns:
        A list of at most 20 URLs taken from the ``url`` attribute of each
        result, or ``None`` if creating the client or running the search
        raised an exception.
    """
    # NOTE(review): the API key is embedded in source control. It should be
    # loaded from configuration or the environment, and this key rotated.
    try:
        bing = PyBingSearch('3Bybyj2qcK/w5FXbBqBUjI9MajN51efC2uYldmzvvnY')
        result_list = bing.search_all(query, limit=20, format='json')
        results = [result.url for result in result_list]
    except Exception:
        # Best-effort lookup: callers receive None instead of an exception.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the script can still be stopped.
        return None
    # Slicing already clamps to the list length, so no min() is needed.
    return results[:20]
def GetLinksForQueryGoogle(query):
    """Return up to 20 result URLs for ``query`` using Google Custom Search.

    Args:
        query: The search string passed as ``q`` to the custom search engine.

    Returns:
        A list of at most 20 URLs taken from the ``link`` field of each entry
        in the response's ``items``, or ``None`` if building the service,
        executing the request, or reading the response raised an exception
        (this includes a response that has no ``items`` key).
    """
    # NOTE(review): the developer key and engine id are embedded in source
    # control. They should be loaded from configuration or the environment,
    # and this key rotated.
    try:
        service = build("customsearch", "v1",
                        developerKey="AIzaSyDBh9qkTpuXSWbsjCfnCTQJFuFGKOYCElM")
        res = service.cse().list(
            q=query,
            cx='000504779742960611072:dpmv5fihhu8',
        ).execute()
        results = [item['link'] for item in res['items']]
    except Exception:
        # Best-effort lookup: callers receive None instead of an exception.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the script can still be stopped.
        return None
    # Slicing already clamps to the list length, so no min() is needed.
    return results[:20]
# Driver script: for each <document> in 500_sample.xml, build a keyword query,
# fetch search links, extract and score the best passage per link against the
# document's <bestanswer>, and write the results to small.xml.
#
# Single-argument print(...) is used so the script stays runnable under
# Python 2, which the str writes to a "wb" file below rely on.

# Parsed JSON from output_links.txt. Loaded eagerly, so a missing or malformed
# file still fails at startup; not referenced by the loop below.
with open('output_links.txt') as data_file:
    links = json.load(data_file)

parser, st, stop = common_lib.Init()

# Debugging filter: only the document at this 1-based position is processed.
# Every other document is emitted as an empty <doc> element.
ONLY_DOC_NUMBER = 396

# Subtracted from the document count before averaging the scores.
# NOTE(review): the reason for 20 is not documented in this file - confirm.
AVERAGE_DOC_OFFSET = 20

# The with-block guarantees small.xml is closed even if parsing or a write
# raises part-way through.
with open("small.xml", "wb") as f:
    f.write("<data>\n")
    e = xml.etree.ElementTree.parse('500_sample.xml').getroot()
    i = 1
    all_scores = 0
    top_scores = 0
    for ves in e.findall('vespaadd'):
        for doc in ves.findall('document'):
            f.write("<doc number = \"" + str(i) + "\">\n")
            # The filter is tested outside the try block so that skipped
            # documents still reach the closing </doc> below.
            if i == ONLY_DOC_NUMBER:
                try:
                    print("doc " + str(i))
                    content_node = doc.find('content')
                    cont = content_node.text if content_node is not None else ""
                    keyword_query = common_lib.buildFullQuery(
                        doc.find('subject').text, cont, parser, stop)
                    # Both helpers return None on failure; treat that as
                    # "no links" so the <links> element is still well formed.
                    links_bing = GetLinksForQueryBing(keyword_query) or []
                    links_google = GetLinksForQueryGoogle(keyword_query) or []
                    f.write("<keyword_query>" + escape(keyword_query) + "</keyword_query>\n")

                    sum_score = 0
                    max_score = 0
                    passages = []
                    scores = {}
                    # The links above were fetched with the full query; passage
                    # retrieval uses the query with synonyms removed.
                    keyword_query = passage_retrieval.RemoveSynonymsFromKeywords(keyword_query)
                    print(keyword_query)

                    f.write("<links>\n")
                    try:
                        for j, link in enumerate(links_bing, 1):
                            f.write("<link_passage number = \"" + str(j) + "\">\n")
                            try:
                                f.write("<link>" + escape(link) + "</link>\n")
                                f.write("<in_intersection>" + str(link in links_google)
                                        + "</in_intersection>\n")
                                passage, _ = passage_retrieval.GetTopPassageFromLink(
                                    keyword_query, link)
                                if passage:
                                    score = passage_retrieval.GetSimilarity(
                                        doc.find('bestanswer').text, passage)
                                    passages.append(passage)
                                    scores[passage] = score
                                    sum_score += score
                                    max_score = max(max_score, score)
                                    # Written as one string so a failure while
                                    # encoding cannot leave the element half open.
                                    f.write(
                                        "<best_passage_per_link>\n"
                                        + "<passage>" + escape(passage.encode('utf-8')) + "</passage>\n"
                                        + "<score>" + escape(str(score)) + "</score>\n"
                                        + "</best_passage_per_link>\n")
                            finally:
                                # Close the element even when this link fails.
                                f.write("</link_passage>\n")
                    finally:
                        f.write("</links>\n")

                    # Summary elements are only emitted when at least one
                    # passage was scored; otherwise there is nothing to
                    # average or rank.
                    if passages:
                        avg_score = float(sum_score) / len(passages)
                        all_scores += avg_score
                        f.write("<avg_score>" + escape(str(avg_score)) + "</avg_score>\n")
                        f.write("<max_score>" + escape(str(max_score)) + "</max_score>\n")
                        top_passage = passage_retrieval.GetTopPassageFromList(
                            keyword_query, passages)
                        # top_passage[0] is used as the key into the scores dict.
                        f.write("<top_passage>" + escape(top_passage[0].encode('utf-8'))
                                + "</top_passage>\n")
                        top_scores += scores[top_passage[0]]
                        f.write("<top_passage_score>" + str(float(scores[top_passage[0]]))
                                + "</top_passage_score>\n")
                except Exception as exc:
                    # Best effort per document: report the cause and move on.
                    # KeyboardInterrupt and SystemExit are not caught.
                    print("Error for doc " + str(i) + ": " + repr(exc))
            f.write("</doc>\n")
            i += 1

    # i is one past the last document number at this point.
    averaged_docs = i - 1 - AVERAGE_DOC_OFFSET
    if averaged_docs != 0:
        avg_total_score = float(all_scores) / averaged_docs
        avg_top_score = float(top_scores) / averaged_docs
    else:
        # Avoid ZeroDivisionError, which would abort before </data> is written.
        avg_total_score = avg_top_score = 0.0
    f.write("<avg_total_score>" + escape(str(avg_total_score)) + "</avg_total_score>\n")
    f.write("<avg_top_score>" + escape(str(avg_top_score)) + "</avg_top_score>\n")
    f.write("</data>\n")