def process_query_batch(args):
    """
    Run one batch of queries against the math index.

    :param args: 3-tuple (query_list, topk, math_index) where query_list is a
        sequence of (query_num, query_string) pairs
    :return: (fileid, stats) -- the per-process file id and its Stats record
    """
    batch, topk, math_index = args

    stats = Stats()
    stats.num_documents = len(batch)

    # one DB handle per worker process, keyed by pid
    fileid = os.getpid()
    math_index.openDB(fileid, topk)

    # compiled once; keyword queries ride along in <keyword> tags
    keyword_pattern = re.compile(r"<keyword[^>]*>\s*([^<]*\S)\s*</keyword>")

    for query_num, query_string in batch:
        # extract the formula trees present in the query markup
        trees = MathExtractor.parse_from_xml(
            query_string, query_num, stats.missing_tags, stats.problem_files)
        stats.num_expressions += len(trees)

        terms = keyword_pattern.findall(query_string)
        stats.num_keywords += len(terms)

        math_index.search(fileid, query_num, trees, terms, topk)

    math_index.closeDB(fileid)
    return fileid, stats
# Example #2
def math_indexer_task(pargs) -> (str, list):
    """
    Create index tuples for the expressions in this subcollection.

    :param pargs: 3-tuple (math_index, cntl, chunkid)
    :return: (fileid, combined_stats)
    """
    math_index, cntl, chunkid = pargs

    combined_stats = Stats()
    docs = MathDocument(cntl)

    chunk_size, mappings = docs.read_mapping_file(chunkid)
    combined_stats.num_documents += len(mappings)

    # document ids are globally numbered by chunk offset
    first_id = chunkid * chunk_size

    seen_docs = []  # trees accumulated in arrival order
    for doc_id, filename in enumerate(mappings, start=first_id):
        try:
            # collect every symbol tree found in this document
            # (pairs are not stored here -- they are created in the C++ module)
            for tree in read_file(filename, doc_id,
                                  missing_tags=combined_stats.missing_tags,
                                  problem_files=combined_stats.problem_files):
                combined_stats.num_expressions += 1
                combined_stats.global_expressions += len(tree.position)
                seen_docs.append(tree)
        except Exception as err:
            # best-effort: log the failure and remember which doc hit it
            reason = str(err)
            print("Failed to process document " + filename + ": " + reason,
                  file=sys.stderr)
            combined_stats.problem_files.setdefault(reason, set()).add(doc_id)

    fileid = math_index.add(seen_docs)
    print("%s is done saving to database %s" % (chunkid, fileid), flush=True)
    return fileid, combined_stats
        # NOTE(review): this fragment is the interior of an enclosing function
        # whose header is not visible here; names like cntl, window, query_file
        # and topk are presumably bound by that enclosing scope -- confirm.
        math_index = Version03Index(cntl, window=window)

        ##        if cntl.read("results"):
        ##            # try ingesting and processing results (temporary setting)
        ##            tuples = math_index.get(query_file)
        ##            for qid,hit in tuples.items():
        ##                print(qid,hit)
        ##        else:

        # parse the query file once; the handle is closed by the with-block
        with open(query_file, encoding='utf-8') as file:
            parsed = BeautifulSoup(file, "html.parser")

        # each <topic> element holds one query
        query_list = parsed.find_all("topic")
        print("There are %s queries." % len(query_list))
        combined_stats = Stats()
        fileids = set()

        ##          try:
        # the whole batch is processed as a single work item for now
        query_list_m = list(map(get_query, query_list))  # whole batch for now
        args = [(query_list_m, topk, math_index)]

        for p in args:  # single-process execution
            (fileid, stats) = process_query_batch(p)
            fileids.add(fileid)
            combined_stats.add(stats)
##          except Exception as err:
##              reason = str(err)
##              print("Failed to process queries: "+reason, file=sys.stderr)

        # persist which per-process file ids were produced by this run
        cntl.store("query_fileids", str(fileids))