Ejemplo n.º 1
0
def math_indexer_task(pargs) -> (str, list):
    """
    Parse every file of one chunk and pass the collected trees to the index.

    :param pargs: 3-tuple ``(math_index, cntl, chunkid)``; ``math_index`` must
        provide ``add``, ``cntl`` is handed to ``MathDocument`` and ``chunkid``
        selects which mapping chunk is read
    :return: (fileid, combined_stats) -- the value returned by
        ``math_index.add`` and the ``Stats`` object filled in while parsing
        (note: the second item is a ``Stats`` instance, not a list)
    """
    math_index, cntl, chunkid = pargs
    combined_stats = Stats()

    chunk_size, mappings = MathDocument(cntl).read_mapping_file(chunkid)
    combined_stats.num_documents += len(mappings)

    # Ids within this chunk are numbered upward from chunkid * chunk_size.
    first_doc_id = chunkid * chunk_size

    collected_trees = []  # trees are kept in the order they are read
    for doc_id, filename in enumerate(mappings, start=first_doc_id):
        try:
            file_trees = read_file(
                filename,
                doc_id,
                missing_tags=combined_stats.missing_tags,
                problem_files=combined_stats.problem_files,
            )
            # Append one tree at a time so that trees read before a failure
            # are still kept and counted.
            for tree in file_trees:
                combined_stats.num_expressions += 1
                # Pairs are not stored here; they are created in the C++ module.
                collected_trees.append(tree)
        except Exception as err:
            reason = str(err)
            message = "Failed to process document " + filename + ": " + reason
            print(message, file=sys.stderr)
            # Group the ids of failing documents under the error text.
            combined_stats.problem_files.setdefault(reason, set()).add(doc_id)

    fileid = math_index.add(collected_trees)
    print("%s is done saving to database %s" % (chunkid, fileid), flush=True)
    return fileid, combined_stats
Ejemplo n.º 2
0
def math_indexer_task(pargs) -> (str, list):
    """
    Parse every file of one chunk, tally statistics and store the trees.

    :param pargs: 3-tuple ``(math_index, cntl, chunkid)``; ``math_index`` must
        provide ``add``, ``cntl`` is handed to ``MathDocument`` and ``chunkid``
        selects which mapping chunk is read
    :return: (fileid, combined_stats) -- the value returned by
        ``math_index.add`` and the ``Stats`` object filled in while parsing
        (note: the second item is a ``Stats`` instance, not a list)
    """
    math_index, cntl, chunkid = pargs
    combined_stats = Stats()

    chunk_size, mappings = MathDocument(cntl).read_mapping_file(chunkid)
    combined_stats.num_documents += len(mappings)

    # Ids within this chunk are numbered upward from chunkid * chunk_size.
    first_doc_id = chunkid * chunk_size

    collected_trees = []  # trees are kept in the order they are read
    for doc_id, filename in enumerate(mappings, start=first_doc_id):
        try:
            file_trees = read_file(
                filename,
                doc_id,
                missing_tags=combined_stats.missing_tags,
                problem_files=combined_stats.problem_files,
            )
            # Append one tree at a time so that trees read before a failure
            # are still kept and counted.
            for tree in file_trees:
                combined_stats.num_expressions += 1
                # Each tree adds the length of its ``position`` to the
                # global expression tally.
                combined_stats.global_expressions += len(tree.position)
                # Pairs are not stored here; they are created in the C++ module.
                collected_trees.append(tree)
        except Exception as err:
            reason = str(err)
            message = "Failed to process document " + filename + ": " + reason
            print(message, file=sys.stderr)
            # Group the ids of failing documents under the error text.
            combined_stats.problem_files.setdefault(reason, set()).add(doc_id)

    fileid = math_index.add(collected_trees)
    print("%s is done saving to database %s" % (chunkid, fileid), flush=True)
    return fileid, combined_stats