def process_query_batch(args):
    """
    Given a query, generate query tuples for the math index
    :param args:
    :return: nil
    """
    stats = Stats()
    fileid = os.getpid()

    query_list, topk, math_index = args
    math_index.openDB(fileid, topk)

    stats.num_documents = len(query_list)

    for (query_num, query_string) in query_list:
        trees = MathExtractor.parse_from_xml(query_string, query_num,
                                             stats.missing_tags,
                                             stats.problem_files)
        stats.num_expressions += len(trees)

        # also need to handle keyword queries if present
        terms = re.findall(r"<keyword[^>]*>\s*([^<]*\S)\s*</keyword>",
                           query_string)
        stats.num_keywords += len(terms)

        math_index.search(fileid, query_num, trees, terms, topk)

    math_index.closeDB(fileid)
    return (fileid, stats)
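A quick way to sanity-check the <keyword> pattern used above is to run it on a toy query string. A minimal sketch; only the regular expression comes from the code, and the sample XML is made up:

import re

sample_query = ("<topic><keyword> binomial coefficient </keyword>"
                "<keyword>Pascal's triangle</keyword></topic>")
terms = re.findall(r"<keyword[^>]*>\s*([^<]*\S)\s*</keyword>", sample_query)
print(terms)  # ['binomial coefficient', "Pascal's triangle"]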
Example #2
def math_indexer_task(pargs) -> (str, list):
    """
    creates index tuples for the expressions in this subcollection
    :param pargs:
    :return: (fileid, combined_stats)
    """
    math_index, cntl, chunkid = pargs
    combined_stats = Stats()

    docs = MathDocument(cntl)

    (chunk_size, mappings) = docs.read_mapping_file(chunkid)
    combined_stats.num_documents += len(mappings)

    seen_docs = []  # just dump them as they come
    for (doc_id, filename) in enumerate(mappings, start=chunkid * chunk_size):
##        print('parsing %s, id:%s ' % (filename, doc_id),flush=True)
        try:
            # get all the symbol trees found in file
            for tree in read_file(filename, doc_id,
                                  missing_tags=combined_stats.missing_tags,
                                  problem_files=combined_stats.problem_files):
                combined_stats.num_expressions += 1
                # pairs = tree.get_pairs(window) do not store pairs -- will be created in C++ module
                seen_docs.append(tree)
        except Exception as err:
            reason = str(err)
            print("Failed to process document "+filename+": "+reason, file=sys.stderr)
            combined_stats.problem_files[reason] = combined_stats.problem_files.get(reason, set())
            combined_stats.problem_files[reason].add(doc_id)

    fileid = math_index.add(seen_docs)
    print("%s is done saving to database %s" % (chunkid,fileid), flush=True)
    return fileid, combined_stats
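The enumerate(mappings, start=chunkid * chunk_size) idiom above gives every document a collection-wide id: chunk k of size s numbers its documents from k*s upward. A minimal sketch with made-up chunks and filenames:

chunk_size = 3
chunks = [["a.xml", "b.xml", "c.xml"], ["d.xml", "e.xml"]]
for chunkid, mappings in enumerate(chunks):
    for doc_id, filename in enumerate(mappings, start=chunkid * chunk_size):
        print(chunkid, doc_id, filename)
# chunk 0 assigns ids 0, 1, 2; chunk 1 continues at 3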
Example #3
def math_indexer_task(pargs) -> (str, list):
    """
    creates index tuples for the expressions in this subcollection
    :param pargs:
    :return: (fileid, combined_stats)
    """
    math_index, cntl, chunkid = pargs
    combined_stats = Stats()

    docs = MathDocument(cntl)

    (chunk_size, mappings) = docs.read_mapping_file(chunkid)
    combined_stats.num_documents += len(mappings)

    seen_docs = []  # just dump them as they come
    for (doc_id, filename) in enumerate(mappings, start=chunkid * chunk_size):
        ##        print('parsing %s, id:%s ' % (filename, doc_id),flush=True)
        try:
            # get all the symbol trees found in file
            for tree in read_file(filename,
                                  doc_id,
                                  missing_tags=combined_stats.missing_tags,
                                  problem_files=combined_stats.problem_files):
                combined_stats.num_expressions += 1
                combined_stats.global_expressions += len(tree.position)
                # pairs = tree.get_pairs(window) do not store pairs -- will be created in C++ module
                seen_docs.append(tree)
        except Exception as err:
            reason = str(err)
            print("Failed to process document " + filename + ": " + reason,
                  file=sys.stderr)
            combined_stats.problem_files[
                reason] = combined_stats.problem_files.get(reason, set())
            combined_stats.problem_files[reason].add(doc_id)

    fileid = math_index.add(seen_docs)
    print("%s is done saving to database %s" % (chunkid, fileid), flush=True)
    return fileid, combined_stats
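The two-step problem_files bookkeeping in the except block above (get a set, then add to it) can be written more tightly with dict.setdefault; both forms build a mapping from error reason to the set of affected doc ids. A minimal sketch with invented reasons and ids:

problem_files = {}

def record_failure(reason, doc_id):
    # same effect as the get(...) / add(...) pair used above
    problem_files.setdefault(reason, set()).add(doc_id)

record_failure("missing math tag", 7)
record_failure("missing math tag", 12)
print(problem_files)  # {'missing math tag': {7, 12}}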
Example #4
def process_query_batch(args):
    """
    Given a query, generate query tuples for the math index
    :param args:
    :return: nil
    """
    stats = Stats()
    fileid = os.getpid()

    system, db, run_tag, query_list, topk, math_index, strategy = args
    math_index.openDB(fileid, topk)

    stats.num_documents = len(query_list)

    for (query_num, query_string) in query_list:
        trees = MathExtractor.parse_from_xml(query_string, query_num,
                                             stats.missing_tags,
                                             stats.problem_files)
        stats.num_expressions += len(trees)
        math_index.search(fileid, query_num, trees)

        # also need to handle keyword queries if present

    math_index.closeDB(fileid)
    return (fileid, stats)
Example #5
        math_index = Version03Index(cntl, window=window)

        ##        if cntl.read("results"):
        ##            # try ingesting and processing results (temporary setting)
        ##            tuples = math_index.get(query_file)
        ##            for qid,hit in tuples.items():
        ##                print(qid,hit)
        ##        else:

        with open(query_file, encoding='utf-8') as file:
            parsed = BeautifulSoup(file, "html.parser")

        query_list = parsed.find_all("topic")
        print("There are %s queries." % len(query_list))
        combined_stats = Stats()
        fileids = set()

        ##          try:
        query_list_m = list(map(get_query, query_list))  # whole batch for now
        args = [(query_list_m, topk, math_index)]

        for p in args:  # single-process execution
            (fileid, stats) = process_query_batch(p)
            fileids.add(fileid)
            combined_stats.add(stats)
##          except Exception as err:
##              reason = str(err)
##              print("Failed to process queries: "+reason, file=sys.stderr)

        cntl.store("query_fileids", str(fileids))
Example #6
        if cntl.read("results"):
            # try ingesting and processing results (temporary setting)
            tuples = math_index.get(query_file)
            for qid,hit in tuples.items():
                print(qid,hit)
        else:

            topk = ntcir_wiki_count if system == 'Wikipedia' else ntcir_main_count

            with open(query_file, encoding='utf-8') as file:
                parsed = BeautifulSoup(file, "lxml")

            query_list = parsed.find_all("topic")
            print("There are %s queries." % (len(query_list)), flush=True)
            combined_stats = Stats()
            fileids = set()

            try:
                query_list_m = list(map(get_query, query_list))  # whole batch for now
                args = [(system, db, run_tag, query_list_m, topk, math_index,
                         weighting_strategy)]

                for p in args:  # single-process execution
                    (fileid, stats) = process_query_batch(p)
                    fileids.add(fileid)
                    combined_stats.add(stats)
            except Exception as err:
                reason = str(err)
                print("Failed to process queries: " + reason, file=sys.stderr)
                combined_stats.problem_files[reason] = combined_stats.problem_files.get(reason, set())
                combined_stats.problem_files[reason].add(query_file)
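The args list above is already shaped for a worker pool, even though the loop runs each batch in a single process. A hedged sketch of the parallel variant, reusing process_query_batch and args as defined above and assuming the tuple contents are picklable:

import multiprocessing

def run_query_batches(args):
    # one worker per batch; each call returns (fileid, stats)
    with multiprocessing.Pool(processes=len(args)) as pool:
        return pool.map(process_query_batch, args)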
Example #7
        if cntl.read("results"):
            # try ingesting and processing results (temporary setting)
            tuples = math_index.get(query_file)
            for qid, hit in tuples.items():
                print(qid, hit)
        else:

            topk = ntcir_wiki_count if system == 'Wikipedia' else ntcir_main_count

            with open(query_file, encoding='utf-8') as file:
                parsed = BeautifulSoup(file, "html.parser")

            query_list = parsed.find_all("topic")
            print("There are %s queries." % (len(query_list)), flush=True)

            combined_stats = Stats()
            fileids = set()

            try:
                query_list_m = list(map(get_query,
                                        query_list))  # whole batch for now
                args = [(system, db, run_tag, query_list_m, topk, math_index,
                         weighting_strategy)]

                for p in args:  # single-process execution
                    (fileid, stats) = process_query_batch(p)
                    fileids.add(fileid)
                    combined_stats.add(stats)
            except Exception as err:
                reason = str(err)
                print("Failed to process queries: " + reason, file=sys.stderr)
Example #8
        row = "-"
        with open(doc_id_mapping_path, newline='', encoding='utf-8') as mapping_file:
            while True:
                if num_docs % chunk_size == 0:
                    filepos.append(mapping_file.tell())
                num_docs += 1
                row = mapping_file.readline()
                if row == "":
                    num_docs -= 1
                    if num_docs % chunk_size == 0:
                        del filepos[-1]
                    break
        cntl.store("file_skips",str(filepos))
        
        print("There are " + str(num_docs) + " documents to index", flush=True)
        combined_stats = Stats()

        if num_docs > 0:
            math_index = Version03Index(db=database_name, window=window)

            max_jobs = min(10, num_docs)
            manager = multiprocessing.Manager()
            lock = manager.Lock()

            # identify chunks to be indexed by each process
            args = [(math_index, cntl, chunkid) for chunkid in range(len(filepos))]

            fileids = set()
            
##            for p in args:  # single-process execution, for debugging
##                fileid, stats = math_indexer_task(p)
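The while loop in the example above records the byte offset of the start of every chunk with tell(), so each indexing process can later seek() directly to its chunk without rereading the mapping file. A self-contained sketch of the same idea against an in-memory file:

import io

chunk_size = 2
mapping_file = io.StringIO("doc0\ndoc1\ndoc2\ndoc3\ndoc4\n")

filepos, num_docs = [], 0
while True:
    if num_docs % chunk_size == 0:
        filepos.append(mapping_file.tell())
    if mapping_file.readline() == "":
        break
    num_docs += 1
if num_docs % chunk_size == 0:
    del filepos[-1]  # an offset was recorded at end of file; drop it

mapping_file.seek(filepos[1])           # jump straight to chunk 1
print(mapping_file.readline().strip())  # doc2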