Example #1
from glob import glob

# Project-local modules; the module paths below are assumed from the class
# names used in this snippet and may differ in the actual repository.
from reader import ReadFile
from parser import Parse
from indexer import Indexer
import utils


def run_engine(corpus_path, output_path, stemming=False):
    """
    Builds the retrieval model.
    Preprocess, parse and index corpus.
    :return: a tuple of number_of_documents in the corpus and average_document_length
    """

    number_of_documents = 0
    total_document_length = 0

    reader = ReadFile(corpus_path)
    parser = Parse()
    indexer = Indexer(output_path)

    # read all parquet data files
    files = glob(corpus_path + "/**/*.parquet", recursive=True)

    # read, parse, and index documents in batches; posting files are partitioned
    # by the letters of the English alphabet. A batch is the set of documents read
    # from two consecutive parquet files. Each batch's postings are first written
    # as partial posting files tagged with batch_index and later merged into one
    # coherent set of postings.
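    # For illustration only (the exact file-naming scheme is an assumption, not
    # taken from Indexer): postings for terms starting with "a" would live in an
    # "a" posting file, and batch 3's partial "a" file might be named "a_3"
    # until consolidation merges it into the final "a" file.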
    batch_index = 0
    file_index = 0
    while file_index < len(files):

        # batch two files at a time to reduce the disk-seek penalty
        first_file = files[file_index]
        first_documents_list = reader.read_file(first_file)

        if file_index + 1 < len(files):
            second_file = files[file_index + 1]
            second_documents_list = reader.read_file(second_file)
            documents_list = first_documents_list + second_documents_list

        else:  # odd number of files: the last batch contains a single file
            documents_list = first_documents_list

        file_index += 2

        # parse every document in the batch
        parsed_file = set()
        for document_as_list in documents_list:
            parsed_document = parser.parse_doc(document_as_list, stemming)
            parsed_file.add(parsed_document)
            total_document_length += parsed_document.doc_length
            number_of_documents += 1

        # index parsed documents
        indexer.index_batch(parsed_file, str(batch_index))

        batch_index += 1

    # calculate average document length (guard against an empty corpus)
    average_document_length = (total_document_length / number_of_documents
                               if number_of_documents else 0.0)

    # after indexing all non-entity terms in the corpus, index the legal
    # (i.e., validated) entities
    indexer.index_entities()

    # save the index dictionary to disk; output_path is used as a raw string
    # prefix here, so it should end with a path separator
    utils.save_obj(indexer.inverted_idx, output_path + "inverted_idx")

    # after indexing the whole corpus, consolidate all partial posting files
    indexer.consolidate_postings()

    return number_of_documents, average_document_length
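
A minimal usage sketch follows; the directory paths and the stemming flag value are hypothetical, chosen only for illustration:

if __name__ == "__main__":
    corpus_dir = "data/corpus"   # hypothetical tree containing the *.parquet files
    output_dir = "data/index/"   # hypothetical; keep the trailing separator, since it is used as a raw prefix

    num_docs, avg_len = run_engine(corpus_dir, output_dir, stemming=True)
    print(f"Indexed {num_docs} documents; average document length: {avg_len:.2f} tokens")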