-
Notifications
You must be signed in to change notification settings - Fork 0
/
query.py
201 lines (160 loc) · 7.56 KB
/
query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
"""
Tangent
Copyright (c) 2013, 2015 David Stalnaker, Richard Zanibbi, Nidhin Pattaniyil,
Andrew Kane, Frank Tompa, Kenny Davila Castellanos
This file is part of Tangent.
Tangent is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Tangent is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Tangent. If not, see <http://www.gnu.org/licenses/>.
Contact:
- Richard Zanibbi: rlaz@cs.rit.edu
"""
from concurrent.futures import ProcessPoolExecutor
import os
from sys import argv
import sys
import codecs
from bs4 import BeautifulSoup
import time
from tangent.math.version03_index import Version03Index
from tangent.math.math_extractor import MathExtractor
from tangent.utility.control import Control
from tangent.utility.Stats import Stats
sys.setrecursionlimit(10000)
"""
The main application: given an NTCIR-like query file, it queries the collection and returns the results.
Code is based on tangent/ntcir/ntcir11.py
"""
def print_help_and_exit():
    """
    Print the usage statement for query.py and terminate the program.

    The tab separators of the control file are shown to the user as a
    literal backslash followed by "t", hence the doubled backslashes below.

    :raises SystemExit: always, with the default exit status
    """
    usage_lines = (
        "Usage: python query.py [<cntl-file>] or python query.py help",
        " default <cntl-file> is tangent.cntl",
        "",
        "where <cntl-file> is a tsv file that contains a list of parameter-value pairs",
        "and must include at least the following entries:",
        " database\\t<directory for storing database files>",
        " queries\\t<file with queries in NTCIR format>",
        "and may optionally include:",
        " window\\t<window-size>",
        " run\\t<arbitrary name for query run>",
        " weights\\t['math_only' | 'math_focused' | 'ntcir_default' | 'math_text_equal']",
        " where 'math_only' is default",
        " system\\t['Wikipedia' | 'NTCIR Test' | 'NTCIR Actual']",
        " where 'Wikipedia' is default",
        "as well as other pairs.",
    )
    for line in usage_lines:
        print(line)
    # sys.exit() instead of the builtin exit(): the latter is injected by the
    # `site` module for interactive sessions and is not guaranteed to exist
    # (e.g. when Python runs with -S).
    sys.exit()
def process_query_batch(args):
    """
    Run every query of one batch against the math index.

    :param args: 7-tuple (system, db, run_tag, query_list, topk, math_index,
                 strategy). query_list holds (query_num, query_string) pairs.
                 Only query_list, topk and math_index are used here; the other
                 fields are unpacked solely to check the tuple's arity.
    :return: (fileid, stats) -- the id of the current process and the Stats
             object filled while parsing the queries
    """
    stats = Stats()
    # The process id is the identifier handed to openDB/search/closeDB.
    fileid = os.getpid()
    _system, _db, _run_tag, query_list, topk, math_index, _strategy = args
    stats.num_documents = len(query_list)

    math_index.openDB(fileid, topk)
    try:
        for query_num, query_string in query_list:
            trees = MathExtractor.parse_from_xml(
                query_string, query_num, stats.missing_tags, stats.problem_files
            )
            stats.num_expressions += len(trees)
            math_index.search(fileid, query_num, trees)
            # also need to handle keyword queries if present
    finally:
        # Close the index even when parsing or searching raises, so a failed
        # batch does not leave it open.
        math_index.closeDB(fileid)
    return (fileid, stats)
def get_query(query_obj):
    """
    Extract the query number, formulas and keywords from one parsed topic.

    :param query_obj: parsed topic element exposing ``num.text`` and
                      ``find_all`` (a BeautifulSoup tag in this script)
    :return: (query_num, doc) where doc = '<doc>' formula* keyword* '</doc>',
             items separated by single spaces
    """
    # Newlines/tabs inside the query number are replaced by the two-character
    # sequences backslash-n / backslash-t so the number stays on one line.
    query_num = query_obj.num.text.strip().translate({10: r"\n", 9: r"\t"})
    query_parts = []

    # get formulas
    for formula in query_obj.find_all("formula"):
        math = formula.find("m:math")  # assumes m is used for namespace
        if math is None:
            # A formula without an m:math child would otherwise contribute
            # the literal text "None" to the query document.
            continue
        query_parts.append(str(math))

    # get keywords (surrounding whitespace stripped)
    query_parts.extend(keyword.text.strip() for keyword in query_obj.find_all("keyword"))

    return query_num, "<doc>" + " ".join(query_parts) + "</doc>"
if __name__ == '__main__':
    # Script entry point: read the control file, validate its parameters,
    # then either print previously stored results or run the query batch.
    ntcir_main_count = 1000  # main task require 1000 results returned
    ntcir_wiki_count = 100

    # Re-wrap stdout/stderr only when they are not already UTF-8. The codec
    # name is normalised first because the stream usually reports 'utf-8' or
    # 'UTF-8', which a plain comparison with 'utf8' never matches.
    for stream_name in ('stdout', 'stderr'):
        stream = getattr(sys, stream_name)
        try:
            is_utf8 = codecs.lookup(stream.encoding or '').name == 'utf-8'
        except LookupError:
            is_utf8 = False
        if not is_utf8:
            setattr(sys, stream_name, codecs.getwriter('utf8')(stream.buffer, 'strict'))

    if (len(argv) > 2 or (len(argv) == 2 and argv[1] == 'help')):  # uses control file to control all parameters
        print_help_and_exit()
    else:
        start = time.time()

        try:
            cntl = Control(argv[1]) if len(argv) == 2 else Control()
        except Exception as err:
            print("Error in reading <cntl-file>: " + str(err))
            print_help_and_exit()

        # Mandatory entries
        db = cntl.read("database")
        if not db:
            print("<cntl-file> missing database")
            print_help_and_exit()
        query_file = cntl.read("queries")
        if not query_file:
            print("<cntl-file> missing queries")
            print_help_and_exit()

        # Optional entries, falling back to defaults on invalid values
        window = cntl.read("window", num=True)
        if window and window < 2:  # window values smaller than 2 make no sense
            print('Window values smaller than 2 not permitted -- using 2')
            window = 2
        run_tag = cntl.read("run", default="")
        run_tag = 'rit_' + run_tag
        weighting_strategy = cntl.read("weights", default='math_only')
        if weighting_strategy not in ['math_only', 'math_focused', 'ntcir_default', 'math_text_equal']:
            print("Invalid weighting strategy. Using 'math_only' instead of %s\n" % weighting_strategy)
            weighting_strategy = 'math_only'
        system = cntl.read("system", default='Wikipedia')
        if system not in ['Wikipedia', 'NTCIR Test', 'NTCIR Actual']:
            print("Invalid system. Using 'Wikipedia' instead of %s\n" % system)
            system = 'Wikipedia'

        math_index = Version03Index(db=db, window=window)

        if cntl.read("results"):
            # try ingesting and processing results (temporary setting)
            tuples = math_index.get(query_file)
            for qid, hit in tuples.items():
                print(qid, hit)
        else:
            topk = ntcir_wiki_count if system == 'Wikipedia' else ntcir_main_count

            with open(query_file, encoding='utf-8') as file:
                parsed = BeautifulSoup(file, "lxml")

            query_list = parsed.find_all("topic")
            print("There are %s queries." % (len(query_list)), flush=True)

            combined_stats = Stats()
            fileids = set()

            try:
                query_list_m = list(map(get_query, query_list))  # whole batch for now
                args = [(system, db, run_tag, query_list_m, topk, math_index, weighting_strategy)]

                for p in args:  # single-process execution
                    (fileid, stats) = process_query_batch(p)
                    fileids.add(fileid)
                    combined_stats.add(stats)
            except Exception as err:
                # Record the failure against the query file; the name
                # `filename` used here previously was never defined, so any
                # error surfaced as a NameError instead of being reported.
                reason = str(err)
                print("Failed to process document " + query_file + ": " + reason, file=sys.stderr)
                combined_stats.problem_files[reason] = combined_stats.problem_files.get(reason, set())
                combined_stats.problem_files[reason].add(query_file)

            cntl.store("query_fileids", str(fileids))

            print("Done preparing query batch for %s against %s" % (query_file, db))
            combined_stats.dump()

            cntl.dump()  # output the revised cntl file

        end = time.time()
        elapsed = end - start

        print("Elapsed time %s" % (elapsed))