Example 1
def run_engine(corpus_path='', output_path='', stemming=False):
    """

    :return:
    """
    # Create PostingFile directory if it doesn't exist
    number_of_documents = 0
    config = ConfigClass()
    r = ReadFile(corpus_path=corpus_path)
    p = Parse(stemming)
    indexer = Indexer(config, output_path)
    # Get all parquet files from corpus path
    parquets = []
    for root, dirs, files in os.walk(corpus_path):
        for name in files:
            if name.endswith((".parquet", ".htm")):
                parquets.append((root, name))

    for index in range(len(parquets)):
        r.corpus_path = parquets[index][0]
        documents_list = r.read_file(file_name=parquets[index][1])
        # Parse the documents in parallel with a pool of worker processes
        with Pool(CPUCOUNT) as _p:
            for parsed_doc in _p.imap_unordered(p.parse_doc, documents_list):
                number_of_documents += 1
                indexer.add_new_doc(parsed_doc)
            _p.close()
            _p.join()

    p.entities.clear()
    indexer.finish_index()
    save_obj(indexer.term_dict, output_path + '/' + "inverted_idx")
    save_obj(indexer.document_dict, output_path + '/' + "doc_dictionary")
    indexer.document_dict.clear()
    indexer.term_dict.clear()
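
Note: the Pool/imap_unordered parsing pattern in the example above can be tried in isolation. A minimal, self-contained sketch; the stub parse_doc and the sample documents are placeholders, not the project's Parse class:

import os
from multiprocessing import Pool

def parse_doc(document):
    # stand-in for Parse.parse_doc: tokenize the raw text
    return document.split()

if __name__ == '__main__':
    documents_list = ["first tweet text", "second tweet text", "third tweet"]
    number_of_documents = 0
    with Pool(os.cpu_count()) as pool:
        # results arrive in completion order, which is fine for counting and indexing
        for parsed_doc in pool.imap_unordered(parse_doc, documents_list):
            number_of_documents += 1
    print(number_of_documents, 'documents parsed')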
Example 2
def run_engine(corpus_path="testData", output_path="posting", stemming=True, glove_dict=None):
    """
    This function builds the inverted index over the corpus,
    sending each tweet to the parser and then to the indexer.
    If stemming is True, the parser applies a stemmer to the tokens.
    :param glove_dict: GloVe dictionary containing all word vectors
    :param corpus_path: root folder containing the raw tweet files
    :param output_path: destination for the inverted index, posting files and tweets dictionary
    :param stemming: if True, use a stemmer on terms
    """

    config = ConfigClass(corpus_path, number_of_term_buckets=26, number_of_entities_buckets=2, output_path=output_path)
    r = ReadFile(corpus_path=config.get_corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config)
    all_files_paths = glob.glob(config.get_corpusPath() + "\\*\\*.snappy.parquet")
    all_files_names = [file_name[file_name.find("\\") + 1:] for file_name in all_files_paths]
    start_time = time.time()
    file_counter = 0
    for file_name in all_files_names:
        file_start_time = time.time()
        # print("start file :", file_counter)
        documents_list = [document for document in r.read_file(file_name=file_name)]
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            parsed_document = p.parse_doc(document)
            indexer.add_new_doc(parsed_document, glove_dict)
        # print("end file number ", file_counter, " in: ", time.time() - file_start_time)
        file_counter += 1
    total_time = time.time() - start_time
    indexer.finish_indexing()
Example 3
def run_engine(corpus_path, output_path, stemming=False):
    """

    :param corpus_path: path for parquet files
    :param output_path: path to write pickle files
    :param stemming: boolean to use stemming or not
    :return:
    """

    ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path)
    p = Parse(stemming)
    indexer = Indexer(output_path, stemming)

    if corpus_path.endswith('parquet'):
        documents_list = r.read_file(corpus_path)
        parseAndIndexDocuments(documents_list, p, indexer)
    else:
        documents_list = r.read_dir()

        while documents_list:
            parseAndIndexDocuments(documents_list, p, indexer)
            documents_list = r.read_dir()

    documents_list.clear()
    indexer.merge_posting_files()

    lda = LDA(output_path, indexer.dictdoc, stemming)
    lda.build_ldaModel()
Example 4
def run_engine(corpus_path, output_path, stemming, queries, num_docs_to_retrieve):
    """
    :return:
    """
    number_of_documents = 0

    config = ConfigClass(corpus_path, output_path, stemming)
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(stemming)
    indexer = Indexer(config, p.terms_dic_to_document)
    # Iterate over every document in the file
    for i in r.filesPath:
        documents_list = r.read_file(i)
        start_time = time.time()
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            # update the number of doc in system
            number_of_documents += 1
            # index the document data
            indexer.add_new_doc(parsed_document)
        # print(time.time() - start_time)
    print('--------------------------')
    print('Start writing to disk left overs')
    indexer.save_all_left_overs()
    print('Finish without waiting ' + str(time.time() - start_time))
    print('Start waiting')
    indexer.wait_untill_all_finish()
    print('End Waiting')
    print('Finished writing to disk left overs')
    print('--------------------------')
    print('Finished parsing and indexing. Starting to export files')
    print('Finish all Time ' + str(time.time() - start_time))
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
Example 5
def run_engine(config):
    """
    :return:
    """
    number_of_documents = 0
    output_path = config.savedFileMainFolder
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    m_Indexer = Indexer(output_path)
    parquetPaths = []
    for (dirPath, dirNames, fileNames) in os.walk(config.get__corpusPath()):
        for fileName in fileNames:
            parquetPaths.append((dirPath + '\\' + fileName))
    for i in range(len(parquetPaths)):
        parquetPaths[i] = parquetPaths[i][parquetPaths[i].find('\\') + 1:]
        if ".DS_Store" in parquetPaths[i]:
            continue
        parquet = r.read_file(file_name=parquetPaths[i])
        for document in parquet:
            number_of_documents += 1
            parsed_document = p.parse_doc(document)
            # index the document data
            m_Indexer.add_new_doc(parsed_document)
    # if there's more postings to flush, do it.
    if len(m_Indexer.postingDictionary) > 0:
        utils.save_obj(m_Indexer.postingDictionary,
                       m_Indexer.postingsPath + '/' + str(m_Indexer.pkl_key))
    # Clear single terms and entities, updated inverted index to disk.
    clearSingleEntities(m_Indexer.inverted_idx, p, output_path,
                        m_Indexer.num_of_docs_in_corpus)
    utils.save_obj(m_Indexer.inverted_idx, output_path + '/inverted_idx')
    m_Indexer.inverted_idx.clear()
    utils.save_obj(number_of_documents,
                   output_path + '/PostingFiles/num_of_docs_in_corpus')
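
Note: utils.save_obj / utils.load_obj are project helpers that are not shown in these examples. A common pickle-based stand-in (the '.pkl' suffix is an assumption) looks like this:

import pickle

def save_obj(obj, name):
    # serialize obj to <name>.pkl
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    # load the object previously saved as <name>.pkl
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)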
Example 6
    def test_add_new_doc(self):
        config = ConfigClass()
        r = ReadFile(corpus_path=config.get__corpusPath())
        p = Parse()
        indexer = Indexer(config)
        documents_list = r.read_file(file_name='sample3.parquet')
        # text1 = '@ampalombo I was going to my grandsons baseball games and the dumb F****s made a mask mandatory, are you kidding me'
        assert indexer.add_new_doc()

        text = 'i wad born in 2019'
Example 7
def test_reader():
    global num_test_failed, results_summary
    num_test_failed = 0
    r = ReadFile(corpus_path)
    correct_answers = [x['len'] for x in reader_inputs]
    student_answers = [
        len(r.read_file(x['file'])) for x in reader_inputs
    ]
    test_part(correct_answers, student_answers, error_str="read")
    if num_test_failed == 0:
        results_summary.append('All Reader tests passed')
Example 8
def run_engine(config, indexer):
    """
    :return:
    """
    number_of_documents = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config)

    doc = r.read_file('benchmark_data_train.snappy.parquet')
    for document in doc:
        parsed_document = p.parse_doc(document)
        indexer.add_new_doc(parsed_document)
        number_of_documents += 1
    capital_letters = p.caps_dict
    indexer.change_inverted_by_caps(capital_letters)
    indexer.save_index('idx_bench')
Example 9
def run_engine(config):
    """

    :return:
    """

    number_of_documents = 0
    sum_of_doc_lengths = 0

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem)
    indexer = Indexer(config, glove_dict)
    # documents_list = r.read_file(file_name=config.get__corpusPath())
    parquet_documents_list = r.read_folder(config.get__corpusPath())
    for parquet_file in parquet_documents_list:
        documents_list = r.read_file(file_name=parquet_file)
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = p.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            sum_of_doc_lengths += parsed_document.doc_length
            # index the document data
            indexer.add_new_doc(parsed_document)

    # saves last posting file after indexer has done adding documents.
    indexer.save_postings()
    if len(indexer.doc_posting_dict) > 0:
        indexer.save_doc_posting()
    utils.save_dict(indexer.document_dict, "documents_dict", config.get_out_path())
    if len(indexer.document_posting_covid) > 0:
        indexer.save_doc_covid()

    indexer.delete_dict_after_saving()

    # merges posting files.
    indexer.merge_chunks()
    utils.save_dict(indexer.inverted_idx, "inverted_idx", config.get_out_path())

    dits = {'number_of_documents': number_of_documents, "avg_length_per_doc": sum_of_doc_lengths/number_of_documents }

    utils.save_dict(dits, 'details', config.get_out_path())
Example 10
def run_engine(corpus_path, output_path, stemming, queries,
               num_docs_to_retrieve):
    """
    :return:
    """
    config = ConfigClass(corpus_path, output_path, stemming)
    number_of_documents = 0
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)
    Parse.stemmer = stemming

    corpus_list = r.read_corpus()

    for idx in range(len(corpus_list)):
        documents_list = r.read_file(file_name=corpus_list[idx],
                                     read_corpus=True)
        for i in tqdm(range(len(documents_list))):
            parsed_document = p.parse_doc(documents_list[i])
            if i == len(documents_list) - 1 and idx == len(corpus_list) - 1:
                indexer.is_last_doc = True
            indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        indexer.is_last_doc = False
    documents_list = []

    with open('spell_dict.json', 'w') as f:
        json.dump(indexer.spell_dict, f)

    pickle_out = open("docs_dict_and_extras", "wb")
    pickle.dump(indexer.docs_dict, pickle_out)
    pickle_out.close()

    start = time.time()
    indexer.merge_files()
    end = time.time()
    print("merge time was: {}".format(end - start))

    utils.save_obj(indexer.inverted_idx, "inverted_index")
    pickle_out = open("docs_dict_and_extras", "ab")
    pickle.dump(number_of_documents, pickle_out)
    pickle.dump(Parse.AMOUNT_OF_NUMBERS_IN_CORPUS, pickle_out)
    pickle.dump(indexer.dump_path, pickle_out)
    pickle_out.close()
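
Note: the example above appends several objects to the same pickle file ('wb' then 'ab'); they must be read back with one pickle.load call per object, in the same order. A small self-contained sketch, with an illustrative file name and values:

import pickle

with open('docs_dict_and_extras', 'wb') as f:
    pickle.dump({'doc_1': 42}, f)       # docs_dict (illustrative)
with open('docs_dict_and_extras', 'ab') as f:
    pickle.dump(3, f)                   # number_of_documents
    pickle.dump(17, f)                  # count of numbers in the corpus
    pickle.dump('posting/', f)          # dump_path

with open('docs_dict_and_extras', 'rb') as f:
    docs_dict = pickle.load(f)
    number_of_documents = pickle.load(f)
    numbers_in_corpus = pickle.load(f)
    dump_path = pickle.load(f)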
Example 11
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        r = ReadFile()
        documents_list = r.read_file(fn)
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
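
Note: ReadFile is project-specific code that is not shown in these examples. As a rough stand-in, assuming the parser expects each document as a plain list of column values and that pandas with a parquet engine (pyarrow or fastparquet) is installed, a minimal reader might look like:

import os
import pandas as pd

class ReadFile:
    def __init__(self, corpus_path=''):
        self.corpus_path = corpus_path

    def read_file(self, file_name):
        # read one parquet file and return its rows as plain Python lists
        path = os.path.join(self.corpus_path, file_name) if self.corpus_path else file_name
        return pd.read_parquet(path).values.tolist()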
Example 12
def run_engine(corpus_path=None,
               output_path=None,
               stemming=False,
               lemma=False,
               queries=None,
               num_docs_to_retrieve=None):
    """
    :return:
    """
    global config, number_of_documents

    number_of_documents = 0

    config = ConfigClass()
    config.corpusPath = corpus_path
    config.set_output_path(output_path)
    config.toStem = stemming
    config.toLemm = lemma
    if os.path.exists(config.get_output_path()):
        shutil.rmtree(config.get_output_path())

    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse(config.toStem, config.toLemm)
    indexer = Indexer(config)

    documents_list = []
    for root, dirs, files in os.walk(corpus_path):
        r.set_corpus_path(root)
        for file in files:
            if file.endswith(".parquet"):
                documents_list += r.read_file(file)
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)
    documents_list.clear()  # Finished parsing and indexing all files - need to clean all the used memory
    indexer.cleanup(number_of_documents)
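
Note: the os.walk collection step above can also be written with pathlib; a compact, behavior-similar alternative (not project code):

from pathlib import Path

def collect_parquet_files(corpus_path):
    # recursively gather every .parquet file under corpus_path
    return sorted(str(p) for p in Path(corpus_path).rglob('*.parquet'))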
Example 13
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        number_of_documents = 0

        r = ReadFile(corpus_path=self._config.get__corpusPath())

        doc = r.read_file(fn)
        for document in doc:
            parsed_document = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        capital_letters = self._parser.caps_dict
        self._indexer.change_inverted_by_caps(capital_letters)
        self._indexer.save_index('idx_bench')
        print('Finished parsing and indexing.')
Example 14
def run_engine():
    """

    :return:
    """
    number_of_documents = 0
    timer = True
    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()  #p = Parse(with_stemmer=True)
    indexer = Indexer(config)

    data_dir = 'Data' + os.sep + 'Data'
    npy_dirs = [root for root, dirs, files in os.walk(data_dir)]
    for dir_path in npy_dirs:
        files = [
            os.path.join(dir_path, fname) for fname in os.listdir(dir_path)
            if fname.endswith('.parquet')
        ]
        for file in files:
            tweets = r.read_file(file_name=file)
            start_time = time.perf_counter()
            with multiprocessing.Pool(12) as pool:  # make sure the pool is released after parsing
                documents_list = pool.map(p.parse_doc, tweets)
            end_time = time.perf_counter()
            avg_time_per_tweet = (end_time - start_time) / len(tweets)
            print(
                f'Parsed {len(tweets)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds, average per tweet: {avg_time_per_tweet:0.8f} seconds'
            )

            start_time = time.perf_counter()
            for parsed_document in documents_list:
                indexer.add_new_doc(parsed_document)
            end_time = time.perf_counter()
            print(
                f'Indexing {len(documents_list)} tweets, Elapsed time: {end_time - start_time:0.4f} seconds'
            )
    print('Finished parsing and indexing. Starting to export files')
    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
Example 15
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """

        rd = ReadFile(fn)
        documents_list = rd.read_file()

        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.thresh_hold = 100000
        self._indexer.thresh_hold_handler()
        self._indexer.save_index("inverted_idx")
Example 16
def run_engine():
    """

    :return:
    """
    number_of_documents = 0

    config = ConfigClass()
    r = ReadFile(corpus_path=config.get__corpusPath())
    p = Parse()
    indexer = Indexer(config)

    documents_list = r.read_file(file_name='sample3.parquet')
    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document)
    print('Finished parsing and indexing. Starting to export files')

    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.postingDict, "posting")
Example 17
def run_engine():
    """

    :return:
    """
    number_of_documents = 0
    corpus_path = config.get__corpusPath()
    r = ReadFile(corpus_path)
    indexer = Indexer(config)
    p = Parse(config)

    #reading per folder
    r.create_files_name_list()
    files_list = []  # every index contains all tweets per folder
    for file_name in r.dates_list:
        tweets_per_date = r.read_file(file_name)
        files_list.append(tweets_per_date)
    #print("files_list", len(files_list))

    num_of_tweets = 0
    for folder_list in files_list:
        num_of_tweets += len(folder_list)
    #print("num_of_tweets", num_of_tweets)
    """#reading per folder
    r.create_files_name_list()
    threads = []
    for file_name in r.dates_list:
        t = threading.Thread(target=r.read_file(file_name))
        threads.append(t)
        t.start()
    print("files_list", r.files_list)"""
    """counter = 1
    procs = []
    # Iterate over every folder in the DATA
    for folder_list in files_list:
        proc = Process(target=test, args=(folder_list, counter, indexer, number_of_documents,))
        procs.append(proc)
        proc.start()
    # complete the processes
    for proc in procs:
        proc.join()
    print('Finished parsing and indexing. Starting to export files')"""

    counter = 1
    # Iterate over every folder in the DATA
    for folder_list in files_list:
        #print(counter)
        #print(datetime.now())
        # Iterate over every tweet in the folder
        for idx, tweet in enumerate(folder_list):
            # parse the tweet
            parsed_document = p.parse_doc(tweet)
            number_of_documents += 1
            # index the tweet data
            indexer.add_new_doc(parsed_document, num_of_tweets)

        #print("number of tweets", number_of_documents)
        #print(datetime.now())
        counter += 1
    #print('Finished parsing and indexing. Starting to export files')
    """#read only one folder
    documents_list = r.read_file(file_name='')
    num_indexed = len(documents_list)

    # Iterate over every document in the file
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = p.parse_doc(document)
        number_of_documents += 1
        # index the document data
        indexer.add_new_doc(parsed_document, num_indexed)
    #print('Finished parsing and indexing. Starting to export files')"""

    utils.save_obj(indexer.inverted_idx, "inverted_idx")
    utils.save_obj(indexer.tf_idf_dict, "tf_idf_dict")
    return indexer.get__lda__()
Example 18
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-15-2020\covid19_07-15.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-16-2020\covid19_07-16.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-18-2020\covid19_07-18.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-20-2020\covid19_07-20.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=08-04-2020\covid19_08-04.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-27-2020\covid19_07-27.snappy.parquet",
# ]

files_to_process = [
    r"C:\Users\Owner\Desktop\SearchEngine\Data\date=08-07-2020\covid19_08-07.snappy.parquet",
    r"C:\Users\Owner\Desktop\SearchEngine\Data\date=08-06-2020\covid19_08-06.snappy.parquet",
    r"C:\Users\Owner\Desktop\SearchEngine\Data\date=08-05-2020\covid19_08-05.snappy.parquet",
]

for file in files_to_process:
    documents_list += reader.read_file(file)

with open(preprocessed_file, "a+") as f:
    for idx, document in enumerate(documents_list):
        # parse the document
        parsed_document = parser.parse_doc(document)
        doc = ""
        for i, word in enumerate(parsed_document.term_doc_dictionary):
            if word == "#" or "#_" in word:
                continue

            if i == len(parsed_document.term_doc_dictionary) - 1:
                doc = doc.replace('\n', "")
                doc += "\n"
                break
Example 19
last_tweet_num = 0
if start_from_backup:
    with open("backup_file.pkl", 'rb') as f:
        ls = pickle.load(f)
        tweet_annotation_dict = ls[0]
        last_tweet_num = ls[1]
for dir in os.listdir(data_path):
    dir_path = os.path.join(data_path, dir)
    if not os.path.isdir(dir_path):
        continue
    for file in os.listdir(dir_path):
        file_path = os.path.join(dir, file)
        if file[-7:] != "parquet":
            continue
        print(f'Reading {file}...')
        file_as_list = rdr.read_file(file_path)
        for tweet in file_as_list:
            if int(tweet[0]) in tweets_queries_dict:
                if start_from_backup:
                    tweet_num += 1
                    if tweet_num == last_tweet_num:
                        start_from_backup = False
                    break
                print(
                    f"Query: {queries[tweets_queries_dict[int(tweet[0])]-1]}")
                print(
                    f"Tweet {tweet_num}/{len(tweets_queries_dict)}:\n{tweet}")
                annotation = ""
                while annotation != 0 and annotation != 1:
                    try:
                        annotation = int(
Example 20
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        """
               init engine with the relevant model - Thesaurus_Searcher
               :param config:
               """
        self._config = config
        try:
            self._reader = ReadFile(corpus_path=config.get__corpusPath())
        except:
            self._reader = ReadFile("")
        self._parser = Parse()
        self._parser.STEMMER = config.toStem
        self._indexer = Indexer(config)
        self._model = Thesaurus_Searcher(self._indexer)
        self.last_parquet = False

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        documents_list = self._reader.read_file(fn)

        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            if self.last_parquet and idx == len(documents_list) - 1:
                self._indexer.last_doc = True
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query, 1500)
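
Note: a hypothetical driver for the SearchEngine class above; the config construction and the parquet path are placeholders, not values from the original project:

config = ConfigClass()                      # placeholder; the real ConfigClass may need arguments
engine = SearchEngine(config)
engine.build_index_from_parquet(r'Data\date=08-07-2020\covid19_08-07.snappy.parquet')
n_relevant, tweet_ids = engine.search('covid vaccine side effects')
print(n_relevant, tweet_ids[:10])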
Example 21
def run_engine(corpus_path, output_path, stemming=False):
    """
    Builds the retrieval model: preprocesses, parses and indexes the corpus.
    :return: a tuple of (number_of_documents in the corpus, average_document_length)
    """

    number_of_documents = 0
    total_document_length = 0

    reader = ReadFile(corpus_path)
    parser = Parse()
    indexer = Indexer(output_path)

    # read all parquet data files
    files = glob(corpus_path + "/**/*.parquet", recursive=True)

    # Read, parse and index documents in batches; posting files are divided by the English alphabet.
    # A batch is defined as all the documents in a single parquet file.
    # Each batch is first written as several sub-batches, identified by an index, and later merged into one coherent batch.
    batch_index = 0
    file_index = 0
    while file_index < len(files):

        # batch two files at a time to reduce disk seek time penalty
        first_file = files[file_index]
        first_documents_list = reader.read_file(first_file)

        if file_index + 1 < len(files):
            second_file = files[file_index + 1]
            second_documents_list = reader.read_file(second_file)
            documents_list = first_documents_list + second_documents_list

    else:  # only one file is left for the last batch
            documents_list = first_documents_list

        file_index += 2

        # Iterate over every document in the file

        # parse documents
        parsed_file = set()
        for document_as_list in documents_list:
            parsed_document = parser.parse_doc(document_as_list, stemming)
            parsed_file.add(parsed_document)
            total_document_length += parsed_document.doc_length
            number_of_documents += 1

        # index parsed documents
        indexer.index_batch(parsed_file, str(batch_index))

        batch_index += 1

    # calculate average document length
    average_document_length = float(
        total_document_length) / number_of_documents

    # after indexing all non-entity terms in the corpus, index legal entities
    indexer.index_entities()

    # save index dictionary to disk
    utils.save_obj(indexer.inverted_idx, output_path + "inverted_idx")

    # after indexing the whole corpus, consolidate all partial posting files
    indexer.consolidate_postings()

    return number_of_documents, average_document_length
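
Note: a hypothetical invocation of the function above; the paths are placeholders:

num_docs, avg_len = run_engine('Data', 'posting/', stemming=False)
print(f'{num_docs} documents indexed, average document length {avg_len:.1f} terms')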