def test_merging(self):
    voc = dict()
    currentWorkspace = './tests/workspace/test3/'
    filename = 'test3'
    pathlist = Path("./tests/data/test3/").glob('**/la*')
    filemanager = fm.FileManager(filename, currentWorkspace)
    # Build one partial VOC/PL per newspaper, then merge them.
    for path in pathlist:
        analysis.analyse_newspaper(path, voc)
        filemanager.save_vocabularyAndPL_file(voc, True)
        voc = dict()
    filemanager.mergePartialVocsAndPL(False)
    # TODO: change this once a direct function is available
    savedVoc = filemanager.read_vocabulary()

    mot, sortedByScore = query.get_posting_list(
        savedVoc, "aa", filemanager, True)
    self.assertEqual(mot, {1: [0, 3], 2: [0, 2], 3: [0, 1],
                           4: [0, 3], 5: [0, 2], 6: [0, 1]})
    # The score is equal to zero
    self.assertEqual(sortedByScore,
                     [(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6)])

    mot, sortedByScore = query.get_posting_list(
        savedVoc, "bb", filemanager, True)
    # The score is equal to zero
    self.assertEqual(sortedByScore, [(0, 1), (0, 2), (0, 4), (0, 5)])
    self.assertEqual(mot, {1: [0, 1], 2: [0, 1], 4: [0, 1], 5: [0, 1]})

    mot, sortedByScore = query.get_posting_list(
        savedVoc, "cc", filemanager, True)
    self.assertEqual(mot, {3: [0, 1], 6: [0, 1]})
    self.assertEqual(sortedByScore, [(0, 3), (0, 6)])
def test_topk_trivial_file(self):
    pathlist = Path("./tests/data/testtrivialtopk/").glob('**/la*')
    filemana = filemanager.FileManager(
        "TestFaginsTopK", "./tests/workspace/testsfaginstopk/")
    tempVoc = SortedDict()
    for path in pathlist:
        analysis.analyse_newspaper(path, tempVoc, computeIDF=True)
    filemana.save_vocabularyAndPL_file(tempVoc)
    # Extraction of the saved Voc
    savedVoc = filemana.read_vocabulary()

    topk = faginstatopk.apply_fagins_ta(['aa', 'bb'], savedVoc, filemana, 0, 5)
    # If conjunctive:
    # self.checkResultApproximative(topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 2)])
    self.checkResultApproximative(
        topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 2),
               (1, math.log(3 / 4) / 2),
               (3, math.log(3 / 4) / 2)])

    topk = faginstatopk.apply_fagins_ta(['bb'], savedVoc, filemana, 0, 5)
    self.checkResultApproximative(topk, [(2, math.log(3 / 2))])

    topk = faginstatopk.apply_fagins_ta(['cc'], savedVoc, filemana, 0, 5)
    self.checkResultApproximative(topk, [])

    topk = faginstatopk.apply_fagins_ta(['cc', 'dd'], savedVoc, filemana, 0, 5)
    self.checkResultApproximative(topk, [])
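# The assertions above rely on a checkResultApproximative helper defined elsewhere
# in the test class. A minimal sketch of what such a helper could look like, assuming
# the results are (doc_id, score) pairs whose scores only need to match within a
# floating-point tolerance (the implementation and tolerance below are illustrative
# guesses, not the project's actual code):
def checkResultApproximative(self, actual, expected):
    # Same number of results, same document ids in the same order,
    # and scores equal up to floating-point noise.
    self.assertEqual(len(actual), len(expected))
    for (doc_id, score), (exp_id, exp_score) in zip(actual, expected):
        self.assertEqual(doc_id, exp_id)
        self.assertTrue(math.isclose(score, exp_score, rel_tol=1e-9))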
def test_merging_3_files_scores(self):
    voc = dict()
    currentWorkspace = './tests/workspace/test4/'
    filename = 'test4merging3filesscores'
    pathlist = Path("./tests/data/test4/").glob('**/la*')
    filemanager = fm.FileManager(filename, currentWorkspace)
    # Build one partial VOC/PL per newspaper, then merge them with scores.
    for path in pathlist:
        analysis.analyse_newspaper(path, voc, computeIDF=True)
        filemanager.save_vocabularyAndPL_file(voc, True)
        voc = dict()
    filemanager.mergePartialVocsAndPL(True)
    savedVoc = filemanager.read_vocabulary()

    mot, sortedByScore = query.get_posting_list(savedVoc, "aa", filemanager, True)
    self.assertEqual(mot, {1: [0.24718092381954193, 3.0],
                           2: [0.32882189750671387, 6.0],
                           5: [0.11778303235769272, 1.0],
                           6: [0.11778303235769272, 1.0],
                           20: [0.24718092381954193, 3.0],
                           21: [0.19942401349544525, 2.0],
                           22: [0.11778303235769272, 1.0]})
    self.assertEqual(sortedByScore, [(0.32882189750671387, 2),
                                     (0.24718092381954193, 1),
                                     (0.24718092381954193, 20),
                                     (0.19942401349544525, 21),
                                     (0.11778303235769272, 5),
                                     (0.11778303235769272, 6),
                                     (0.11778303235769272, 22)])

    mot, sortedByScore = query.get_posting_list(savedVoc, "bb", filemanager, True)
    self.assertEqual(mot, {1: [0.5274115204811096, 3.0],
                           2: [0.7016094326972961, 6.0],
                           4: [0.2513144314289093, 1.0],
                           5: [0.2513144314289093, 1.0],
                           20: [0.2513144314289093, 1.0],
                           21: [0.2513144314289093, 1.0]})
    self.assertEqual(sortedByScore, [(0.7016094326972961, 2),
                                     (0.5274115204811096, 1),
                                     (0.2513144314289093, 4),
                                     (0.2513144314289093, 5),
                                     (0.2513144314289093, 20),
                                     (0.2513144314289093, 21)])
def test_merging_3_files(self):
    voc = dict()
    currentWorkspace = './tests/workspace/test4/'
    filename = 'test4merging3files'
    pathlist = Path("./tests/data/test4/").glob('**/la*')
    filemanager = fm.FileManager(filename, currentWorkspace)
    # Build one partial VOC/PL per newspaper, then merge them.
    for path in pathlist:
        analysis.analyse_newspaper(path, voc)
        filemanager.save_vocabularyAndPL_file(voc, True)
        voc = dict()
    filemanager.mergePartialVocsAndPL(False)
    # TODO: change this once a direct function is available
    savedVoc = filemanager.read_vocabulary()

    mot = query.get_posting_list(savedVoc, "aa", filemanager)
    self.assertEqual(mot, {1: [0, 3.0], 2: [0, 6.0], 5: [0, 1.0],
                           6: [0, 1.0], 20: [0, 3.0], 21: [0, 2.0],
                           22: [0, 1.0]})
    mot = query.get_posting_list(savedVoc, "bb", filemanager)
    self.assertEqual(mot, {1: [0, 3], 2: [0, 6], 20: [0, 1],
                           21: [0, 1], 4: [0, 1], 5: [0, 1]})
    mot = query.get_posting_list(savedVoc, "cc", filemanager)
    self.assertEqual(mot, {1: [0, 1], 2: [0, 3], 22: [0, 1],
                           4: [0, 1], 6: [0, 1]})
    mot = query.get_posting_list(savedVoc, "dd", filemanager)
    self.assertEqual(mot, {1: [0, 2], 2: [0, 1]})
    mot = query.get_posting_list(savedVoc, "ff", filemanager)
    self.assertEqual(mot, {1: [0, 1], 20: [0, 1], 6: [0, 1]}, "FF")
    mot = query.get_posting_list(savedVoc, "qq", filemanager)
    self.assertEqual(mot, {1: [0, 1], 5: [0, 1]})
    mot = query.get_posting_list(savedVoc, "rr", filemanager)
    self.assertEqual(mot, {1: [0, 5], 21: [0, 1]})
    mot = query.get_posting_list(savedVoc, "ee", filemanager)
    self.assertEqual(mot, {1: [0, 1], 23: [0, 1]})
    mot = query.get_posting_list(savedVoc, "vv", filemanager)
    self.assertEqual(mot, {1: [0, 1]})
    mot = query.get_posting_list(savedVoc, "yy", filemanager)
    self.assertEqual(mot, {1: [0, 1]})
    mot = query.get_posting_list(savedVoc, "kk", filemanager)
    self.assertEqual(mot, {2: [0, 1], 23: [0, 1]})
    mot = query.get_posting_list(savedVoc, "ii", filemanager)
    self.assertEqual(mot, {2: [0, 1]})
    mot = query.get_posting_list(savedVoc, "jj", filemanager)
    self.assertEqual(mot, {2: [0, 1]})
    mot = query.get_posting_list(savedVoc, "hh", filemanager)
    self.assertEqual(mot, {23: [0, 1]})
    mot = query.get_posting_list(savedVoc, "ll", filemanager)
    self.assertEqual(mot, {}, 'll is considered a stopword')
def test_topk_trivial_file(self):
    pathlist = Path("./tests/data/testtrivialtopk/").glob('**/la*')
    filemana = filemanager.FileManager(
        "TestFaginsTopK", "./tests/workspace/testsfaginstopk")
    tempVoc = dict()
    for path in pathlist:
        analysis.analyse_newspaper(path, tempVoc, computeIDF=True)
    filemana.save_vocabularyAndPL_file(tempVoc)
    # Extraction of the saved Voc
    savedVoc = filemana.read_vocabulary()

    topk = naivetopk.apply_naive_top_k_algo(['bb'], savedVoc, filemana, 0, 5,
                                            naivetopk.conjunctive_queries)
    self.checkResultApproximative(topk, [(2, math.log(3 / 2))])

    topk = naivetopk.apply_naive_top_k_algo(['cc'], savedVoc, filemana, 0, 5,
                                            naivetopk.conjunctive_queries)
    self.checkResultApproximative(topk, [])

    topk = naivetopk.apply_naive_top_k_algo(['cc', 'dd'], savedVoc, filemana,
                                            0, 5,
                                            naivetopk.conjunctive_queries)
    self.checkResultApproximative(topk, [])

    topk = naivetopk.apply_naive_top_k_algo(['cc', 'dd'], savedVoc, filemana,
                                            0, 5,
                                            naivetopk.disjunctive_queries)
    self.checkResultApproximative(topk, [])

    topk = naivetopk.apply_naive_top_k_algo(['bb'], savedVoc, filemana, 0, 5,
                                            naivetopk.disjunctive_queries)
    self.checkResultApproximative(topk, [(2, math.log(3 / 2))])

    topk = naivetopk.apply_naive_top_k_algo(['aa', 'bb'], savedVoc, filemana,
                                            0, 1,
                                            naivetopk.disjunctive_queries)
    self.checkResultApproximative(
        topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 2)])

    topk = naivetopk.apply_naive_top_k_algo(['aa', 'bb', 'cc'], savedVoc,
                                            filemana, 0, 1,
                                            naivetopk.disjunctive_queries)
    self.checkResultApproximative(
        topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 3)])
def test_simple(self):
    voc = dict()
    currentWorkspace = './tests/workspace/test1/'
    filename = 'test1'
    pathlist = Path("./tests/data/test1/").glob('**/la*')
    for path in pathlist:
        analysis.analyse_newspaper(path, voc)
    filemanager = fm.FileManager(filename, currentWorkspace)
    filemanager.save_vocabularyAndPL_file(voc, False)
    savedVoc = filemanager.read_vocabulary()

    mot1 = query.get_posting_list(savedVoc, "aa", filemanager)
    mot2 = query.get_posting_list(savedVoc, "bb", filemanager)
    mot3 = query.get_posting_list(savedVoc, "cc", filemanager)
    self.assertEqual(mot1, {1: [0, 3], 2: [0, 2], 3: [0, 1]})
    self.assertEqual(mot2, {1: [0, 1], 2: [0, 1]})
    self.assertEqual(mot3, {3: [0, 1]})
def test_with_stopwords(self):
    voc = dict()
    currentWorkspace = './tests/workspace/test2/'
    filename = 'test2'
    pathlist = Path("./tests/data/test2/").glob('**/la*')
    for path in pathlist:
        analysis.analyse_newspaper(path, voc)
    filemanager = fm.FileManager(filename, currentWorkspace)
    filemanager.save_vocabularyAndPL_file(voc)
    # TODO: change this once a direct function is available
    savedVoc = filemanager.read_vocabulary()

    mot1 = query.get_posting_list(savedVoc, "aa", filemanager)
    mot2 = query.get_posting_list(savedVoc, "bb", filemanager)
    mot3 = query.get_posting_list(savedVoc, "cc", filemanager)
    self.assertEqual(mot1, {1: [0, 1], 2: [0, 2]})
    self.assertEqual(mot2, {1: [0, 4], 2: [0, 1]})
    self.assertEqual(mot3, {2: [0, 2]})

    stop1 = query.get_posting_list(savedVoc, "doing", filemanager)
    self.assertEqual(stop1, {})
def analysis_parameters():
    global MAX_RANDOM_INDEXING
    parser = argparse.ArgumentParser()
    parser.add_argument("-d",
                        type=str,
                        help="folder containing the documents",
                        required=True)
    parser.add_argument(
        "-f",
        type=str,
        help="file name used to save the files produced by the indexing",
        required=True)
    parser.add_argument(
        "-o",
        type=str,
        default='./workspace/',
        help="folder in which the files produced by the indexing are saved")
    parser.add_argument("--zip",
                        action='store_true',
                        help="zip compression at the end")
    parser.add_argument(
        "--partial",
        type=int,
        default=-1,
        help='build the files by merging several partial files, with a chosen '
        'granularity in documents. If -2, the granularity is one newspaper. '
        'Recommended value: 2000.')
    parser.add_argument("--stemmer",
                        action='store_true',
                        help='enable the stemmer')
    parser.add_argument("--randomindexing",
                        action='store_true',
                        help='enable random indexing')
    args = parser.parse_args()

    latimes_path = args.d
    if not args.d.endswith("/"):
        latimes_path += "/"
    workspace_path = args.o
    if not args.o.endswith("/"):
        workspace_path += "/"

    pathlist = Path(latimes_path).glob('**/la*')
    vocabulary = dict()
    filemanager = fm.FileManager(args.f, workspace_path)
    random_indexing = None
    if args.randomindexing:
        random_indexing = ri.RandomIndexing()
    if args.stemmer:
        analysis.setPreprocessor(preprocessing.Preprocessor(True))

    if args.partial == -2:
        # Partial indexing with a granularity of one newspaper per partial file.
        print("Partial analysis in progress")
        for newspaper_path in tqdm(list(pathlist)):
            docsRedInDocIteration = analysis.analyse_newspaper(
                newspaper_path, vocabulary, None, False)
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)
            vocabulary = dict()
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        print("PL and VOC merged successfully")
    elif args.partial != -1:
        # Partial indexing with a granularity of args.partial documents.
        nbDocsInMemory = 0
        stepFlush = args.partial
        rand_indexing_counter = 0
        for newspaper_path in tqdm(list(pathlist)):
            docsRedInDocIteration = -1
            nbDocsRedInThisJournal = 0
            while (docsRedInDocIteration != 0):
                if rand_indexing_counter < MAX_RANDOM_INDEXING:
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, random_indexing, False,
                        nbDocsRedInThisJournal,
                        nbDocsRedInThisJournal + stepFlush)
                else:
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, None, False,
                        nbDocsRedInThisJournal,
                        nbDocsRedInThisJournal + stepFlush)
                nbDocsInMemory += docsRedInDocIteration
                nbDocsRedInThisJournal += docsRedInDocIteration
                if nbDocsInMemory >= stepFlush:
                    filemanager.save_vocabularyAndPL_file(vocabulary,
                                                          isPartial=True)
                    vocabulary = dict()
                    nbDocsInMemory = 0
            rand_indexing_counter += 1
        if nbDocsInMemory != 0:
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        print("PL and VOC merged successfully")
        print("Inverted file created !")
    else:
        print("Non partial")
        rand_indexing_counter = 0
        for newspaper_path in tqdm(list(pathlist)):
            if rand_indexing_counter < MAX_RANDOM_INDEXING:
                rand_indexing_counter += 1
                analysis.analyse_newspaper(newspaper_path, vocabulary,
                                           random_indexing, False)
            else:
                analysis.analyse_newspaper(newspaper_path, vocabulary, None,
                                           False)
        analysis.computeIDF(vocabulary)
        filemanager.save_vocabularyAndPL_file(vocabulary)
        print("Inverted file created !")

    if args.zip:
        print("Compressing…")
        filemanager = fm.FileManager(args.f, args.o)
        zip.compressPLVBYTEFromSavedVocAndPL(filemanager)
        zip.compressZip(filemanager.getPathPLCompressed())
        zip.compressZip(filemanager.getPathVocCompressed())
        zip.compressZip(filemanager.getPathPLScore())
print("Compressed !") if args.randomindexing: filemanager.save_random_indexing(random_indexing.getTermsVectors(), random_indexing.getTermDimension()) print("Random indexing created")
    postingListsOrderedById = dict()
    postingListsOrderedById['aaa'] = pl1_id
    postingListsOrderedById['bbb'] = pl2_id
    print('postingListsOrderedById : {}'.format(postingListsOrderedById))
    print('postingListsOrderedByScore : {}'.format(postingListsOrderedByScore))
    return postingListsOrderedById, postingListsOrderedByScore


if __name__ == "__main__":
    # Applying the top-k algorithm to mock data:
    # postingListsOrderedById, postingListsOrderedByScore = createMockData()
    # c = find_fagins_ta(postingListsOrderedById, postingListsOrderedByScore, 3, aggregative_function_mean)
    # print("Result c : {}".format(c))
    currentWorkspace = './workspace/testfaginsta/'
    filename = 'test1'
    filemanag = fm.FileManager(filename, currentWorkspace)
    tempVoc = SortedDict()
    pathlist = Path("./tests/data/test4/").glob('**/la*')
    for path in pathlist:
        analysis.analyse_newspaper(path, tempVoc, True)
    filemanag.save_vocabularyAndPL_file(tempVoc)
    savedVoc = filemanag.read_vocabulary()
    faginsta = apply_fagins_ta(['aa', 'bb'], savedVoc, filemanag, 0.2, 2)
    print("result faginsTA : {}".format(faginsta))
def analyseAndMergeDocuments(array_of_iterations, stepFlush):
    path = ""
    print("analyse_newspaper")
    print("Merging involved, flush frequency : Every " + str(stepFlush) +
          " document.")
    pathlist = Path("./../data/latimes/").glob('**/la*')
    tmpPreprocessor = analysis.preprocessor
    analysis.setPreprocessor(
        preprocessing.Preprocessor(activate_stemmer=False))
    timeToExtract = []
    timeToMerge = []
    timeToFlush = [0] * len(array_of_iterations)
    timeTotal = []
    timeToAnalyse = []
    for numBatch, nbDocsToRead in enumerate(array_of_iterations):
        startBatch = time.time()
        # Empty the workspace before each batch.
        folder = './workspace/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
        filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                     str(nbDocsToRead))
        start = time.time()
        pathlist = Path("./../data/latimes/").glob('**/la*')
        vocabulary = dict()
        nbDocsRed = 0
        nbDocsInMemory = 0
        print("analysis in progress")
        for i, newspaper_path in enumerate(pathlist):
            if nbDocsRed >= nbDocsToRead:
                break
            docsRedInDocIteration = -1
            nbDocsRedInThisJournal = 0
            while (docsRedInDocIteration != 0):
                docsRedInDocIteration = analysis.analyse_newspaper(
                    newspaper_path, vocabulary, None, False,
                    nbDocsRedInThisJournal,
                    nbDocsRedInThisJournal + stepFlush)
                nbDocsRed = docsRedInDocIteration + nbDocsRed
                nbDocsInMemory += docsRedInDocIteration
                nbDocsRedInThisJournal += docsRedInDocIteration
                if nbDocsInMemory == stepFlush or nbDocsRed >= nbDocsToRead:
                    startFlush = time.time()
                    filemanager.save_vocabularyAndPL_file(vocabulary,
                                                          isPartial=True)
                    vocabulary = dict()
                    nbDocsInMemory = 0
                    timeToFlush[numBatch] += (time.time() - startFlush)
                if nbDocsRed >= nbDocsToRead:
                    break
            if nbDocsRed >= nbDocsToRead:
                break
        if nbDocsRed < nbDocsToRead:
            print("Benchmark invalid, as we ran out of documents to read.")
        timeToExtract.append(time.time() - start)
        start = time.time()
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        timeToMerge.append(time.time() - start)
        timeTotal.append(time.time() - startBatch)
    analysis.setPreprocessor(tmpPreprocessor)

    print(array_of_iterations)
    print("Ttmerge")
    print(timeToMerge)
    plt.plot(array_of_iterations, timeToMerge, label="Time to merge")
    print("Ttextract")
    print(timeToExtract)
    plt.plot(array_of_iterations,
             timeToExtract,
             label="Time to analyse document (with flushing)")
    print("Ttflush")
    print(timeToFlush)
    plt.plot(array_of_iterations, timeToFlush, label="Time to flush documents")
    print("Overalltime")
    print(timeTotal)
    plt.plot(array_of_iterations, timeTotal, label="Overall time")
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
def analyseAndSaveDocuments(array_of_iterations,
                            computeIDF=False,
                            numberIterations=1):
    totaltimeToExtract = []
    totaltimeToSave = []
    totaltimeTotal = []
    totaltimeToComputeIDF = []
    for i in range(0, numberIterations):
        path = ""
        print("analyse_newspaper")
        print("Save only in the end, no merging involved")
        pathlist = Path("./../data/latimes/").glob('**/la*')
        tmpPreprocessor = analysis.preprocessor
        analysis.setPreprocessor(
            preprocessing.Preprocessor(activate_stemmer=False))
        timeToExtract = []
        timeToSave = []
        timeTotal = []
        timeToComputeIDF = []
        for numBatch, nbDocsToRead in enumerate(array_of_iterations):
            startBatch = time.time()
            # Empty the workspace before each batch.
            folder = './workspace/'
            for the_file in os.listdir(folder):
                file_path = os.path.join(folder, the_file)
                try:
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
                except Exception as e:
                    print(e)
            filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                         str(nbDocsToRead))
            start = time.time()
            pathlist = Path("./../data/latimes/").glob('**/la*')
            vocabulary = dict()
            nbDocsRed = 0
            print("analysis in progress")
            for i, newspaper_path in enumerate(pathlist):
                if nbDocsRed >= nbDocsToRead:
                    break
                docsRedInDocIteration = -1
                while (docsRedInDocIteration != 0):
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, None, False, 0,
                        nbDocsToRead - nbDocsRed)
                    nbDocsRed = docsRedInDocIteration + nbDocsRed
                    if nbDocsRed >= nbDocsToRead:
                        break
                if nbDocsRed >= nbDocsToRead:
                    break
            if nbDocsRed < nbDocsToRead:
                print("Benchmark invalid, as we ran out of documents to read.")
            timeToExtract.append(time.time() - start)
            if computeIDF:
                startComputeIDF = time.time()
                analysis.computeIDF(vocabulary)
                timeToComputeIDF.append(time.time() - startComputeIDF)
            start = time.time()
            print("Saving in progress…")
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=False)
            timeToSave.append(time.time() - start)
            timeTotal.append(time.time() - startBatch)
        analysis.setPreprocessor(tmpPreprocessor)

        print("Number of documents :")
        print(array_of_iterations)
        plt.plot(array_of_iterations,
                 timeToExtract,
                 label="Time to analyse documents")
        print("Time to extract :")
        print(timeToExtract)
        if computeIDF:
            plt.plot(array_of_iterations,
                     timeToComputeIDF,
                     label="Time to compute IDF")
            print("Time to compute IDF :")
            print(timeToComputeIDF)
        plt.plot(array_of_iterations, timeToSave, label="Time to save")
        print("Time to save :")
        print(timeToSave)
        plt.plot(array_of_iterations, timeTotal, label="Overall time")
        print("Overall Time :")
        print(timeTotal)
        plt.xlabel("Number of Documents")
        plt.ylabel("Time (s)")
        plt.legend()
        plt.show()

        totaltimeToExtract.append(timeToExtract)
        if computeIDF:
            totaltimeToComputeIDF.append(timeToComputeIDF)
        totaltimeToSave.append(timeToSave)
        totaltimeTotal.append(timeTotal)

    # Average the timings over all iterations.
    if computeIDF:
        print("computeidf")
        resIDF = [0] * len(totaltimeToComputeIDF[0])
        for arr in totaltimeToComputeIDF:
            for i, elt in enumerate(arr):
                resIDF[i] = resIDF[i] + elt / len(totaltimeToComputeIDF)
        print(totaltimeToComputeIDF)
        print(resIDF)
    print("extract")
    resextract = [0] * len(totaltimeToExtract[0])
    for arr in totaltimeToExtract:
        for i, elt in enumerate(arr):
            resextract[i] = resextract[i] + elt / len(totaltimeToExtract)
    print(totaltimeToExtract)
    print(resextract)
    print("save")
    ressave = [0] * len(totaltimeToSave[0])
    for arr in totaltimeToSave:
        for i, elt in enumerate(arr):
            ressave[i] = ressave[i] + elt / len(totaltimeToSave)
    print(totaltimeToSave)
    print(ressave)
    print("total")
    restotal = [0] * len(totaltimeTotal[0])
    for arr in totaltimeTotal:
        for i, elt in enumerate(arr):
            restotal[i] = restotal[i] + elt / len(totaltimeTotal)
    print(totaltimeTotal)
    print(restotal)

    plt.plot(array_of_iterations, resextract, label="Time to analyse documents")
    if computeIDF:
        plt.plot(array_of_iterations, resIDF, label="Time to compute IDF")
    plt.plot(array_of_iterations, ressave, label="Time to save")
    plt.plot(array_of_iterations, restotal, label="Overall time")
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
def analyseAndSaveDocumentsMultithread(array_of_newspapers, computeIDF=False):
    path = ""
    print("analyse_newspaper")
    print("Save only in the end, no merging involved")
    pathlist = Path("./../data/latimes/").glob('**/la*')
    tmpPreprocessor = analysis.preprocessor
    analysis.setPreprocessor(
        preprocessing.Preprocessor(activate_stemmer=False))
    timeToExtract = []
    timeToSave = []
    timeTotal = []
    timeToAnalyse = []
    timeToComputeIDF = []
    for numBatch, nbNewsPaperToRead in enumerate(array_of_newspapers):
        startBatch = time.time()
        # Empty the workspace before each batch.
        folder = './workspace/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
        filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                     str(nbNewsPaperToRead))
        start = time.time()
        pathlist = Path("./../data/latimes/").glob('**/la*')
        vocabulary = dict()
        nbNewspaperRed = 0
        nbDocsRed = 0
        print("analysis in progress")
        for i, newspaper_path in enumerate(pathlist):
            if nbNewspaperRed >= nbNewsPaperToRead:
                break
            docsRedInDocIteration = analysis.analyse_newspaper(
                newspaper_path, vocabulary, None, False)
            nbDocsRed = docsRedInDocIteration + nbDocsRed
            nbNewspaperRed += 1
        if nbNewspaperRed < nbNewsPaperToRead:
            print("Benchmark invalid, as we ran out of newspapers to read.")
        timeToExtract.append(time.time() - start)
        print("We read documents :")
        print(nbDocsRed)
        if computeIDF:
            startComputeIDF = time.time()
            analysis.computeIDF(vocabulary)
            timeToComputeIDF.append(time.time() - startComputeIDF)
        start = time.time()
        print("Saving in progress…")
        filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=False)
        timeToSave.append(time.time() - start)
        timeTotal.append(time.time() - startBatch)
    analysis.setPreprocessor(tmpPreprocessor)

    print("Number of newspapers :")
    print(array_of_newspapers)
    plt.plot(array_of_newspapers,
             timeToExtract,
             label="Time to analyse documents")
    print("Time to extract :")
    print(timeToExtract)
    if computeIDF:
        plt.plot(array_of_newspapers,
                 timeToComputeIDF,
                 label="Time to compute IDF")
        print("Time to compute IDF :")
        print(timeToComputeIDF)
    plt.plot(array_of_newspapers, timeToSave, label="Time to save")
    print("Time to save :")
    print(timeToSave)
    plt.plot(array_of_newspapers, timeTotal, label="Overall time")
    print("Overall Time :")
    print(timeTotal)
    plt.xlabel("Number of Newspapers")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
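# A minimal usage sketch, assuming this benchmark module is run directly and the
# ./../data/latimes/ corpus referenced above is available; the corpus sizes, flush
# step and iteration count below are illustrative only.
if __name__ == "__main__":
    analyseAndMergeDocuments([100, 500, 1000], stepFlush=200)
    analyseAndSaveDocuments([100, 500, 1000], computeIDF=True, numberIterations=2)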