Esempio n. 1
0
def run_engine(corpus_path, output_path, stemming, queries,
               num_docs_to_retrieve):
    """
    Parse and index every document of the corpus, then persist the results.

    Files written (paths are relative to the current working directory):

    * ``spell_dict.json`` - ``indexer.spell_dict`` serialized as JSON.
    * ``docs_dict_and_extras`` - a sequence of pickles: ``indexer.docs_dict``
      (dumped before the merge step), followed by the number of indexed
      documents, ``Parse.AMOUNT_OF_NUMBERS_IN_CORPUS`` and
      ``indexer.dump_path`` (appended after the merge step).

    The inverted index is saved through ``utils.save_obj`` under the name
    ``"inverted_index"``.

    :param corpus_path: passed to ``ConfigClass``; the reader uses the
        config's corpus path.
    :param output_path: passed to ``ConfigClass``.
    :param stemming: passed to ``ConfigClass`` and stored on the
        class attribute ``Parse.stemmer``.
    :param queries: not used by this function.
    :param num_docs_to_retrieve: not used by this function.
    :return: None
    """
    config = ConfigClass(corpus_path, output_path, stemming)
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    Parse.stemmer = stemming

    corpus_list = r.read_corpus()
    last_file_idx = len(corpus_list) - 1

    for idx, file_name in enumerate(corpus_list):
        documents_list = r.read_file(file_name=file_name, read_corpus=True)
        last_doc_idx = len(documents_list) - 1
        for i, document in enumerate(tqdm(documents_list)):
            parsed_document = p.parse_doc(document)
            # Flag the very last document of the very last file so the
            # indexer knows no more documents will follow.
            if i == last_doc_idx and idx == last_file_idx:
                indexer.is_last_doc = True
            indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        indexer.is_last_doc = False
    # Drop the reference to the last file's documents before the merge step.
    documents_list = []

    with open('spell_dict.json', 'w') as f:
        json.dump(indexer.spell_dict, f)

    # docs_dict is pickled *before* merge_files() runs; keep this ordering.
    with open("docs_dict_and_extras", "wb") as pickle_out:
        pickle.dump(indexer.docs_dict, pickle_out)

    start = time.time()
    indexer.merge_files()
    end = time.time()
    print("merge time was: {}".format(end - start))

    utils.save_obj(indexer.inverted_idx, "inverted_index")
    # Append the extras after the docs_dict pickle; a reader must call
    # pickle.load() repeatedly, in this same order, to get them back.
    with open("docs_dict_and_extras", "ab") as pickle_out:
        pickle.dump(number_of_documents, pickle_out)
        pickle.dump(Parse.AMOUNT_OF_NUMBERS_IN_CORPUS, pickle_out)
        pickle.dump(indexer.dump_path, pickle_out)
Esempio n. 2
0
    def run_engine(self):
        """
        Parse and index every document of the corpus, then save the index.

        Reads the corpus from the path given by ``self._config``, feeds each
        parsed document to ``self._indexer``, runs the indexer's
        post-processing steps (``entities_and_small_big`` and
        ``calculate_idf`` with the parser's document count) and finally
        saves the index to ``"idx_bench.pkl"``.

        :return: None
        """
        r = ReadFile(corpus_path=self._config.get__corpusPath())

        for file in r.read_corpus():
            # Iterate over every document in the file
            for document in file:
                # parse the document and hand it straight to the indexer
                parsed_document = self._parser.parse_doc(document)
                self._indexer.add_new_doc(parsed_document)

        self._indexer.entities_and_small_big()
        self._indexer.calculate_idf(self._parser.number_of_documents)
        # TODO - check the name of inverted_idx (an earlier version saved
        # the index under "inverted_idx" instead of "idx_bench.pkl")
        self._indexer.save_index("idx_bench.pkl")
Esempio n. 3
0
def run_engine(config):
    """
    Parse and index every document of the corpus, then persist the results.

    After all documents are indexed, the indexer's ``check_last``,
    ``merge_sort_parallel(3)`` and ``calculate_idf`` steps are run, and three
    objects are saved with ``utils.save_obj`` under the folder returned by
    ``config.get_savedFileMainFolder()``: the average document length
    (``data``), ``indexer.inverted_idx`` (``inverted_idx``) and
    ``indexer.docs_inverted`` (``docs_inverted``).

    :param config: configuration object; provides the corpus path and the
        output folder, and is passed to ``Parse`` and ``Indexer``.
    :return: None
    :raises ZeroDivisionError: if ``parser.number_of_documents`` is 0 when
        the average document length is computed.
    """
    parser = Parse(config)
    reader = ReadFile(corpus_path=config.get__corpusPath())
    indexer = Indexer(config)

    for corpus_file in reader.read_corpus():
        # Iterate over every document in the file
        for document in corpus_file:
            # parse the document and hand it straight to the indexer
            parsed_document = parser.parse_doc(document)
            indexer.add_new_doc(parsed_document)

    indexer.check_last()
    indexer.merge_sort_parallel(3)
    indexer.calculate_idf(parser.number_of_documents)
    avg_doc_len = parser.total_len_docs / parser.number_of_documents

    # NOTE(review): the "\\" separator is Windows-specific; it is kept as-is
    # because any code loading these files must use the identical path
    # string - confirm before switching to os.path.join.
    out_dir = config.get_savedFileMainFolder()
    utils.save_obj(avg_doc_len, out_dir + "\\data")
    utils.save_obj(indexer.inverted_idx, out_dir + "\\inverted_idx")
    utils.save_obj(indexer.docs_inverted, out_dir + "\\docs_inverted")