def build_index_and_doc_collection_from_csv(self, fileName):
    """Import documents from a CSV file and build index structures in batches.

    The first row of the file is treated as a header and skipped. For every
    other row, column 2 is parsed as the integer document id and column 0 is
    taken as the document content. Each document is inserted through
    ``DbManager.insert_document`` and also collected into a batch. Whenever
    the row number reaches a multiple of the batch size, the batch is mapped
    over ``StructureBuilder.get_stemmed_terms_frequencies_from_doc`` in a
    process pool and the results are passed to
    ``PickleManager.save_index_and_max_freq`` together with the row number
    of the last document in the batch (as a string).

    Before reading, previous state is discarded by calling
    ``DbManager.rebuild_structure`` and ``PickleManager.remove_all_files``.

    Reading stops at row ``to_list`` or at the end of the file, whichever
    comes first. Any documents still pending at that point are flushed as a
    final, smaller batch, so the index always covers every inserted
    document even when the file is shorter than the expected row count.

    Args:
        fileName: Path of the CSV file to import.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a data row has fewer than three columns.
        ValueError: If column 2 of a data row is not an integer.
    """
    docs_count = 1662756  # expected data rows: 1662757 lines minus the header
    batch_size = 10000
    # Countdown of remaining batches; only used in the progress output.
    loops = docs_count // batch_size + 1
    from_list = 1  # first data row (1-based, header excluded) to process
    to_list = 1662756  # last data row to process

    builder = StructureBuilder()
    db_manager = DbManager()
    pickle_manager = PickleManager()

    # Drop and re-create the collections in the database.
    db_manager.rebuild_structure()
    # Delete all previously written pickle files.
    pickle_manager.remove_all_files()

    def index_batch(pool, batch, last_row, message):
        """Compute the index structures for one batch, save and report them."""
        index_structures = pool.map(
            builder.get_stemmed_terms_frequencies_from_doc, batch)
        pickle_manager.save_index_and_max_freq(index_structures, str(last_row))
        print(message % (loops, last_row, datetime.datetime.now()))

    # Leave one core free for this process, which keeps reading and inserting.
    workers = max(multiprocessing.cpu_count() - 1, 1)
    pending = []  # documents waiting for the next batch operation
    last_row = 0

    # One pool serves every batch; starting a new pool per batch only adds
    # process start-up cost. newline='' lets the csv module handle line
    # endings itself, as its documentation requires.
    with multiprocessing.Pool(processes=workers) as pool, \
            open(fileName, newline='') as csvfile:
        for row_number, row in enumerate(csv.reader(csvfile, delimiter=',')):
            if row_number == 0 or row_number < from_list:
                # Skip the header and any rows before the requested start.
                continue

            last_row = row_number
            doc_id = int(row[2])
            doc_content = row[0]

            # Two separate dicts on purpose: the one handed to the database
            # layer may be modified by it and must not leak into the batch.
            db_manager.insert_document({'id': doc_id, 'content': doc_content})
            pending.append({'id': doc_id, 'content': doc_content})

            if row_number % batch_size == 0:
                index_batch(pool, pending, row_number, "%d : %d : %s")
                pending = []
                loops -= 1

            if row_number == to_list:
                break

        # Flush whatever is left, whether we stopped at to_list or simply
        # ran out of rows.
        if pending:
            index_batch(pool, pending, last_row, "%d : %d : reminder: %s")

    print(
        "Saved docs and max_freq in mongo. Saved index structures in pickles: %s"
        % (datetime.datetime.now()))