def process_query_batch(args):
    """
    Run a batch of queries against the math index.

    Each query is parsed for formula trees and optional <keyword> terms,
    and both are handed to the index's search method.

    :param args: tuple (query_list, topk, math_index) where
        query_list -- list of (query_num, query_string) pairs,
        topk       -- number of results to request per query,
        math_index -- index object exposing openDB/search/closeDB
    :return: (fileid, stats) -- the per-process DB id and collected Stats
    """
    stats = Stats()
    fileid = os.getpid()  # one DB handle per worker process
    query_list, topk, math_index = args

    # Compile once: the same keyword pattern is applied to every query.
    keyword_pat = re.compile(r"<keyword[^>]*>\s*([^<]*\S)\s*</keyword>")

    math_index.openDB(fileid, topk)
    try:
        stats.num_documents = len(query_list)

        for (query_num, query_string) in query_list:
            # Extract all formula trees from the query's XML.
            trees = MathExtractor.parse_from_xml(query_string, query_num,
                                                 stats.missing_tags,
                                                 stats.problem_files)
            stats.num_expressions += len(trees)

            # Also handle keyword (text) queries if present.
            terms = keyword_pat.findall(query_string)
            stats.num_keywords += len(terms)

            math_index.search(fileid, query_num, trees, terms, topk)
    finally:
        # Always release the DB handle, even if a query fails mid-batch.
        math_index.closeDB(fileid)

    return (fileid, stats)
def math_indexer_task(pargs) -> (str, list):
    """
    Create index tuples for the expressions in this subcollection (chunk).

    Reads the chunk's document mapping, parses every document's symbol
    trees, and adds them to the math index in one bulk call. Per-document
    failures are recorded in the stats and do not abort the chunk.

    :param pargs: tuple (math_index, cntl, chunkid) where
        math_index -- index object exposing add(),
        cntl       -- control/configuration object for MathDocument,
        chunkid    -- integer id of the subcollection to index
    :return: (fileid, combined_stats) -- the id returned by the index's
        add() call and the accumulated Stats for the chunk
    """
    math_index, cntl, chunkid = pargs
    combined_stats = Stats()

    docs = MathDocument(cntl)
    (chunk_size, mappings) = docs.read_mapping_file(chunkid)
    combined_stats.num_documents += len(mappings)

    seen_docs = []  # just dump them as they come

    # doc_ids are globally unique: offset by this chunk's position.
    for (doc_id, filename) in enumerate(mappings, start=chunkid * chunk_size):
        try:
            # get all the symbol trees found in file
            for tree in read_file(filename, doc_id,
                                  missing_tags=combined_stats.missing_tags,
                                  problem_files=combined_stats.problem_files):
                combined_stats.num_expressions += 1
                combined_stats.global_expressions += len(tree.position)
                # pairs are not stored here -- they will be created in the C++ module
                seen_docs.append(tree)
        except Exception as err:
            # Best-effort: record the failure and keep indexing the rest.
            reason = str(err)
            print("Failed to process document " + filename + ": " + reason,
                  file=sys.stderr)
            combined_stats.problem_files.setdefault(reason, set()).add(doc_id)

    fileid = math_index.add(seen_docs)
    print("%s is done saving to database %s" % (chunkid, fileid), flush=True)
    return fileid, combined_stats
math_index = Version03Index(cntl, window=window) ## if cntl.read("results"): ## # try ingesting and processing results (temporary setting) ## tuples = math_index.get(query_file) ## for qid,hit in tuples.items(): ## print(qid,hit) ## else: with open(query_file, encoding='utf-8') as file: parsed = BeautifulSoup(file, "html.parser") query_list = parsed.find_all("topic") print("There are %s queries." % len(query_list)) combined_stats = Stats() fileids = set() ## try: query_list_m = list(map(get_query, query_list)) # whole batch for now args = [(query_list_m, topk, math_index)] for p in args: # single-process execution (fileid, stats) = process_query_batch(p) fileids.add(fileid) combined_stats.add(stats) ## except Exception as err: ## reason = str(err) ## print("Failed to process queries: "+reason, file=sys.stderr) cntl.store("query_fileids", str(fileids))