Example #1
0
def _find_corpus_files(corpus_path):
    """Return (directory, file_name) pairs for corpus files under corpus_path."""
    # str.endswith accepts a tuple, so one call covers both extensions.
    return [
        (root, name)
        for root, _dirs, files in os.walk(corpus_path)
        for name in files
        if name.endswith((".parquet", ".htm"))
    ]


def run_engine(corpus_path='', output_path='', stemming=False):
    """
    Read, parse and index every corpus file found under ``corpus_path``.

    The directory tree is walked for files ending in ``.parquet`` or
    ``.htm``. Each file is read with ``ReadFile.read_file``, its documents
    are parsed in a worker pool via ``Parse.parse_doc``, and every parsed
    document is handed to ``Indexer.add_new_doc``. When all files are
    processed the index is finalised and the indexer's ``term_dict`` and
    ``document_dict`` are saved with ``save_obj`` under ``output_path``
    (as ``inverted_idx`` and ``doc_dictionary``) and then cleared.

    :param corpus_path: root directory that is walked for corpus files.
    :param output_path: passed to ``Indexer`` and used as the prefix of the
        saved dictionaries.
    :param stemming: forwarded to ``Parse``.
    :return: None
    """
    config = ConfigClass()
    r = ReadFile(corpus_path=corpus_path)
    p = Parse(stemming)
    indexer = Indexer(config, output_path)

    corpus_files = _find_corpus_files(corpus_path)

    if corpus_files:
        # One pool is shared by all files: starting and tearing down
        # CPUCOUNT worker processes per file is pure overhead.
        with Pool(CPUCOUNT) as pool:
            for directory, file_name in corpus_files:
                # The reader resolves file names relative to its corpus_path,
                # so point it at the directory that holds the current file.
                r.corpus_path = directory
                documents_list = r.read_file(file_name=file_name)
                # Results arrive in completion order, not input order.
                for parsed_doc in pool.imap_unordered(p.parse_doc, documents_list):
                    indexer.add_new_doc(parsed_doc)
            # Shut the workers down gracefully before the context manager
            # terminates the pool on exit.
            pool.close()
            pool.join()

    p.entities.clear()
    indexer.finish_index()
    # NOTE: plain '/' concatenation is kept on purpose so the saved file
    # locations stay exactly as callers already expect them.
    save_obj(indexer.term_dict, output_path + '/' + "inverted_idx")
    save_obj(indexer.document_dict, output_path + '/' + "doc_dictionary")
    # Drop the in-memory dictionaries once they are persisted.
    indexer.document_dict.clear()
    indexer.term_dict.clear()