Example No. 1
    def test_merging(self):
        voc = dict()
        currentWorkspace = './tests/workspace/test3/'
        filename = 'test3'

        pathlist = Path("./tests/data/test3/").glob('**/la*')

        filemanager = fm.FileManager(filename, currentWorkspace)

        for path in pathlist:
            analysis.analyse_newspaper(path, voc)
            filemanager.save_vocabularyAndPL_file(voc, True)
            voc = dict()

        filemanager.mergePartialVocsAndPL(False)

        # TODO: change this once a direct function is available
        savedVoc = filemanager.read_vocabulary()
        mot, sortedByScore = query.get_posting_list(
            savedVoc, "aa", filemanager, True)
        self.assertEqual(mot, {1: [0, 3], 2: [0, 2], 3: [0, 1],
                               4: [0, 3], 5: [0, 2], 6: [0, 1]})
        # The score is equal to zero
        self.assertEqual(sortedByScore, [(0, 1), (0, 2), (0, 3),
                                         (0, 4), (0, 5), (0, 6)])
        mot, sortedByScore = query.get_posting_list(
            savedVoc, "bb", filemanager, True)
        # The score is equal to zero
        self.assertEqual(sortedByScore, [(0, 1), (0, 2), (0, 4), (0, 5)])
        self.assertEqual(mot, {1: [0, 1], 2: [0, 1], 4: [0, 1], 5: [0, 1]})
        mot, sortedByScore = query.get_posting_list(
            savedVoc, "cc", filemanager, True)
        self.assertEqual(mot, {3: [0, 1], 6: [0, 1]})
        self.assertEqual(sortedByScore, [(0, 3), (0, 6)])
Example No. 2
    def test_topk_trivial_file(self):

        pathlist = Path("./tests/data/testtrivialtopk/").glob('**/la*')

        filemana = filemanager.FileManager(
            "TestFaginsTopK", "./tests/workspace/testsfaginstopk/")
        tempVoc = SortedDict()
        for path in pathlist:
            analysis.analyse_newspaper(path, tempVoc, computeIDF=True)
        filemana.save_vocabularyAndPL_file(tempVoc)

        # Extraction of the saved Voc
        savedVoc = filemana.read_vocabulary()

        topk = faginstatopk.apply_fagins_ta(['aa', 'bb'], savedVoc, filemana,
                                            0, 5)
        # If the query were conjunctive, the expected result would be:
        # self.checkResultApproximative(topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 2)])
        self.checkResultApproximative(
            topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 2),
                   (1, math.log(3 / 4) / 2), (3, math.log(3 / 4) / 2)])

        topk = faginstatopk.apply_fagins_ta(['bb'], savedVoc, filemana, 0, 5)
        self.checkResultApproximative(topk, [(2, math.log(3 / 2))])

        topk = faginstatopk.apply_fagins_ta(['cc'], savedVoc, filemana, 0, 5)
        self.checkResultApproximative(topk, [])

        topk = faginstatopk.apply_fagins_ta(['cc', 'dd'], savedVoc, filemana,
                                            0, 5)
        self.checkResultApproximative(topk, [])
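The expected values above appear to come from averaging per-term scores over the query terms (Example No. 9 below passes an aggregative_function_mean to the same algorithm). A minimal arithmetic sketch of that assumption:

import math

# Hedged sketch: reproduce the expected top score for document 2 on the query ['aa', 'bb'],
# assuming a document's score is the mean of its per-term log-based scores.
expected_doc2 = (math.log(3 / 4) + math.log(3 / 2)) / 2
print(round(expected_doc2, 4))  # ~0.0589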
Example No. 3
    def test_merging_3_files_scores(self):
        voc = dict()
        currentWorkspace = './tests/workspace/test4/'
        filename = 'test4merging3filesscores'

        pathlist = Path("./tests/data/test4/").glob('**/la*')

        filemanager = fm.FileManager(filename, currentWorkspace)

        for path in pathlist:
            analysis.analyse_newspaper(path, voc, computeIDF=True)
            filemanager.save_vocabularyAndPL_file(voc, True)
            voc = dict()

        filemanager.mergePartialVocsAndPL(True)

        savedVoc = filemanager.read_vocabulary()
        mot, sortedByScore = query.get_posting_list(savedVoc, "aa", filemanager, True)
        self.assertEqual(mot, {1: [0.24718092381954193, 3.0], 2: [0.32882189750671387, 6.0],
                               5: [0.11778303235769272, 1.0], 6: [0.11778303235769272, 1.0],
                               20: [0.24718092381954193, 3.0], 21: [0.19942401349544525, 2.0],
                               22: [0.11778303235769272, 1.0]})

        self.assertEqual(sortedByScore, [(0.32882189750671387, 2), (0.24718092381954193, 1),
                                         (0.24718092381954193, 20), (0.19942401349544525, 21),
                                         (0.11778303235769272, 5), (0.11778303235769272, 6),
                                         (0.11778303235769272, 22)])

        mot, sortedByScore = query.get_posting_list(savedVoc, "bb", filemanager, True)
        self.assertEqual(mot, {1: [0.5274115204811096, 3.0], 2: [0.7016094326972961, 6.0],
                               4: [0.2513144314289093, 1.0], 5: [0.2513144314289093, 1.0],
                               20: [0.2513144314289093, 1.0], 21: [0.2513144314289093, 1.0]})
        self.assertEqual(sortedByScore, [(0.7016094326972961, 2), (0.5274115204811096, 1),
                                         (0.2513144314289093, 4), (0.2513144314289093, 5),
                                         (0.2513144314289093, 20), (0.2513144314289093, 21)])
Example No. 4
    def test_merging_3_files(self):
        voc = dict()
        currentWorkspace = './tests/workspace/test4/'
        filename = 'test4merging3files'

        pathlist = Path("./tests/data/test4/").glob('**/la*')

        filemanager = fm.FileManager(filename, currentWorkspace)

        for path in pathlist:
            analysis.analyse_newspaper(path, voc)
            filemanager.save_vocabularyAndPL_file(voc, True)
            voc = dict()

        filemanager.mergePartialVocsAndPL(False)

        # TODO: change this once a direct function is available
        savedVoc = filemanager.read_vocabulary()
        mot = query.get_posting_list(savedVoc, "aa", filemanager)
        self.assertEqual(mot, {1: [0, 3.0], 2: [0, 6.0], 5: [0, 1.0], 6: [0, 1.0],
                               20: [0, 3.0], 21: [0, 2.0], 22: [0, 1.0]})
        mot = query.get_posting_list(savedVoc, "bb", filemanager)
        self.assertEqual(mot, {1: [0, 3], 2: [0, 6], 20: [0, 1],
                               21: [0, 1], 4: [0, 1], 5: [0, 1]})
        mot = query.get_posting_list(savedVoc, "cc", filemanager)
        self.assertEqual(mot, {1: [0, 1], 2: [0, 3], 22: [0, 1],
                               4: [0, 1], 6: [0, 1]})
        mot = query.get_posting_list(savedVoc, "dd", filemanager)
        self.assertEqual(mot, {1: [0, 2], 2: [0, 1]})
        mot = query.get_posting_list(savedVoc, "ff", filemanager)
        self.assertEqual(mot, {1: [0, 1], 20: [0, 1], 6: [0, 1]}, "FF")
        mot = query.get_posting_list(savedVoc, "qq", filemanager)
        self.assertEqual(mot, {1: [0, 1], 5: [0, 1]})
        mot = query.get_posting_list(savedVoc, "rr", filemanager)
        self.assertEqual(mot, {1: [0, 5], 21: [0, 1]})
        mot = query.get_posting_list(savedVoc, "ee", filemanager)
        self.assertEqual(mot, {1: [0, 1], 23: [0, 1]})
        mot = query.get_posting_list(savedVoc, "vv", filemanager)
        self.assertEqual(mot, {1: [0, 1]})
        mot = query.get_posting_list(savedVoc, "yy", filemanager)
        self.assertEqual(mot, {1: [0, 1]})
        mot = query.get_posting_list(savedVoc, "kk", filemanager)
        self.assertEqual(mot, {2: [0, 1], 23: [0, 1]})
        mot = query.get_posting_list(savedVoc, "ii", filemanager)
        self.assertEqual(mot, {2: [0, 1]})
        mot = query.get_posting_list(savedVoc, "jj", filemanager)
        self.assertEqual(mot, {2: [0, 1]})
        mot = query.get_posting_list(savedVoc, "hh", filemanager)
        self.assertEqual(mot, {23: [0, 1]})
        mot = query.get_posting_list(savedVoc, "ll", filemanager)
        self.assertEqual(mot, {}, 'll is considered a stopword')
Example No. 5
    def test_topk_trivial_file(self):

        pathlist = Path("./tests/data/testtrivialtopk/").glob('**/la*')

        filemana = filemanager.FileManager(
            "TestFaginsTopK", "./tests/workspace/testsfaginstopk")
        tempVoc = dict()
        for path in pathlist:
            analysis.analyse_newspaper(path, tempVoc, computeIDF=True)
        filemana.save_vocabularyAndPL_file(tempVoc)

        # Extraction of the saved Voc
        savedVoc = filemana.read_vocabulary()
        topk = naivetopk.apply_naive_top_k_algo(['bb'], savedVoc, filemana, 0,
                                                5,
                                                naivetopk.conjunctive_queries)
        self.checkResultApproximative(topk, [(2, math.log(3 / 2))])

        topk = naivetopk.apply_naive_top_k_algo(['cc'], savedVoc, filemana, 0,
                                                5,
                                                naivetopk.conjunctive_queries)
        self.checkResultApproximative(topk, [])

        topk = naivetopk.apply_naive_top_k_algo(['cc', 'dd'], savedVoc,
                                                filemana, 0, 5,
                                                naivetopk.conjunctive_queries)
        self.checkResultApproximative(topk, [])

        topk = naivetopk.apply_naive_top_k_algo(['cc', 'dd'], savedVoc,
                                                filemana, 0, 5,
                                                naivetopk.disjunctive_queries)
        self.checkResultApproximative(topk, [])

        topk = naivetopk.apply_naive_top_k_algo(['bb'], savedVoc, filemana, 0,
                                                5,
                                                naivetopk.disjunctive_queries)
        self.checkResultApproximative(topk, [(2, math.log(3 / 2))])

        topk = naivetopk.apply_naive_top_k_algo(['aa', 'bb'], savedVoc,
                                                filemana, 0, 1,
                                                naivetopk.disjunctive_queries)
        self.checkResultApproximative(
            topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 2)])

        topk = naivetopk.apply_naive_top_k_algo(['aa', 'bb', 'cc'], savedVoc,
                                                filemana, 0, 1,
                                                naivetopk.disjunctive_queries)
        self.checkResultApproximative(
            topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 3)])
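In the last disjunctive query, the aggregate is divided by all three query terms even though 'cc' matches nothing, which suggests that absent terms are assumed to contribute zero to the mean. A short arithmetic sketch of that reading:

import math

# Hedged sketch: expected score of document 2 for the disjunctive query ['aa', 'bb', 'cc'],
# assuming the missing term 'cc' contributes 0 to the average.
expected_doc2 = (math.log(3 / 4) + math.log(3 / 2) + 0) / 3
print(round(expected_doc2, 4))  # ~0.0393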
Example No. 6
    def test_simple(self):
        voc = dict()
        currentWorkspace = './tests/workspace/test1/'
        filename = 'test1'

        pathlist = Path("./tests/data/test1/").glob('**/la*')
        for path in pathlist:
            analysis.analyse_newspaper(path, voc)
        filemanager = fm.FileManager(filename, currentWorkspace)
        filemanager.save_vocabularyAndPL_file(voc, False)
        savedVoc = filemanager.read_vocabulary()

        mot1 = query.get_posting_list(savedVoc, "aa", filemanager)
        mot2 = query.get_posting_list(savedVoc, "bb", filemanager)
        mot3 = query.get_posting_list(savedVoc, "cc", filemanager)
        self.assertEqual(mot1, {1: [0, 3], 2: [0, 2], 3: [0, 1]})
        self.assertEqual(mot2, {1: [0, 1], 2: [0, 1]})
        self.assertEqual(mot3, {3: [0, 1]})
Example No. 7
    def test_with_stopwords(self):
        voc = dict()
        currentWorkspace = './tests/workspace/test2/'
        filename = 'test2'

        pathlist = Path("./tests/data/test2/").glob('**/la*')
        for path in pathlist:
            analysis.analyse_newspaper(path, voc)

        filemanager = fm.FileManager(filename, currentWorkspace)
        filemanager.save_vocabularyAndPL_file(voc)
        # TODO: change this once a direct function is available
        savedVoc = filemanager.read_vocabulary()
        mot1 = query.get_posting_list(savedVoc, "aa", filemanager)
        mot2 = query.get_posting_list(savedVoc, "bb", filemanager)
        mot3 = query.get_posting_list(savedVoc, "cc", filemanager)
        self.assertEqual(mot1, {1: [0, 1], 2: [0, 2]})
        self.assertEqual(mot2, {1: [0, 4], 2: [0, 1]})
        self.assertEqual(mot3, {2: [0, 2]})

        stop1 = query.get_posting_list(savedVoc, "doing", filemanager)
        self.assertEqual(stop1, {})
Example No. 8
def analysis_parameters():
    global MAX_RANDOM_INDEXING

    parser = argparse.ArgumentParser()

    parser.add_argument("-d",
                        type=str,
                        help="dossier avec les documents",
                        required=True)
    parser.add_argument(
        "-f",
        type=str,
        help="nom de fichier pour enregistrer les fichiers après l'indexation ",
        required=True)
    parser.add_argument(
        "-o",
        type=str,
        default='./workspace/',
        help="dossier pour enregistrer les fichiers après l'indexation ")
    parser.add_argument("--zip",
                        action='store_true',
                        help="compression zip à la fin")
    parser.add_argument(
        "--partial",
        type=int,
        default=-1,
        help=
        'créer les fichiers par réunion de plusieurs fichiers avec une granularité de documents choisie. Si -2, alors granularité d\'un journal. Valeur conseillée : 2000.'
    )
    parser.add_argument("--stemmer",
                        action='store_true',
                        help='activer stemmer')
    parser.add_argument("--randomindexing",
                        action='store_true',
                        help='activer random indexing')
    args = parser.parse_args()

    latimes_path = args.d
    if not args.d.endswith("/"):
        latimes_path += "/"

    workspace_path = args.o
    if not args.o.endswith("/"):
        workspace_path += "/"

    pathlist = Path(latimes_path).glob('**/la*')

    vocabulary = dict()
    filemanager = fm.FileManager(args.f, workspace_path)
    random_indexing = None
    if args.randomindexing:
        random_indexing = ri.RandomIndexing()

    if args.stemmer:
        analysis.setPreprocessor(preprocessing.Preprocessor(True))

    if args.partial == -2:
        print("Partial analysis in progress")
        for newspaper_path in tqdm(list(pathlist)):
            docsRedInDocIteration = analysis.analyse_newspaper(
                newspaper_path, vocabulary, None, False)
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)
            vocabulary = dict()
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        print("PL and VOC merged succesfully")
    elif args.partial != -1:
        nbDocsInMemory = 0
        stepFlush = args.partial

        rand_indexing_counter = 0
        for newspaper_path in tqdm(list(pathlist)):

            docsRedInDocIteration = -1
            nbDocsRedInThisJournal = 0
            while (docsRedInDocIteration != 0):
                if rand_indexing_counter < MAX_RANDOM_INDEXING:
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, random_indexing, False,
                        nbDocsRedInThisJournal,
                        nbDocsRedInThisJournal + stepFlush)
                else:
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, None, False,
                        nbDocsRedInThisJournal,
                        nbDocsRedInThisJournal + stepFlush)
                nbDocsInMemory += docsRedInDocIteration
                nbDocsRedInThisJournal += docsRedInDocIteration
                if nbDocsInMemory >= stepFlush:
                    filemanager.save_vocabularyAndPL_file(vocabulary,
                                                          isPartial=True)
                    vocabulary = dict()
                    nbDocsInMemory = 0
                rand_indexing_counter += 1

        if nbDocsInMemory != 0:
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)

        print("Merging in progress…")

        filemanager.mergePartialVocsAndPL()
        print("PL and VOC merged succesfully")
        print("Inverted file created !")

    else:
        print("Non partial")
        rand_indexing_counter = 0
        for newspaper_path in tqdm(list(pathlist)):
            if rand_indexing_counter < MAX_RANDOM_INDEXING:
                rand_indexing_counter += 1
                analysis.analyse_newspaper(newspaper_path, vocabulary,
                                           random_indexing, False)
            else:
                analysis.analyse_newspaper(newspaper_path, vocabulary, None,
                                           False)
        analysis.computeIDF(vocabulary)
        filemanager.save_vocabularyAndPL_file(vocabulary)

        print("Inverted file created !")

    if args.zip:

        print("Compressing…")
        filemanager = fm.FileManager(args.f, args.o)

        zip.compressPLVBYTEFromSavedVocAndPL(filemanager)

        zip.compressZip(filemanager.getPathPLCompressed())

        zip.compressZip(filemanager.getPathVocCompressed())

        zip.compressZip(filemanager.getPathPLScore())

        print("Compressed !")

    if args.randomindexing:
        filemanager.save_random_indexing(random_indexing.getTermsVectors(),
                                         random_indexing.getTermDimension())
        print("Random indexing created")
Example No. 9
    postingListsOrderedById = dict()
    postingListsOrderedById['aaa'] = pl1_id
    postingListsOrderedById['bbb'] = pl2_id
    print('postingListsOrderedById : {}'.format(postingListsOrderedById))
    print('postingListsOrderedByScore : {}'.format(postingListsOrderedByScore))
    return postingListsOrderedById, postingListsOrderedByScore


if __name__ == "__main__":

    # Applying Top K Algorithm to mockData
    # postingListsOrderedById, postingListsOrderedByScore = createMockData()
    # c = find_fagins_ta(postingListsOrderedById, postingListsOrderedByScore, 3, aggregative_function_mean)
    # print("Resulta c : {}".format(c))

    currentWorkspace = './workspace/testfaginsta/'
    filename = 'test1'
    filemanag = fm.FileManager(filename, currentWorkspace)

    tempVoc = SortedDict()

    pathlist = Path("./tests/data/test4/").glob('**/la*')
    for path in pathlist:
        analysis.analyse_newspaper(path, tempVoc, computeIDF=True)
    filemanag.save_vocabularyAndPL_file(tempVoc)

    savedVoc = filemanag.read_vocabulary()
    faginsta = apply_fagins_ta(['aa', 'bb'], savedVoc, filemanag, 0.2, 2)
    print("result faginsTA : {}".format(faginsta))
Example No. 10
def analyseAndMergeDocuments(array_of_iterations, stepFlush):
    path = ""
    print("analyse_newspaper")
    print("Merging involved, flush frequency : Every " + str(stepFlush) +
          " document.")
    pathlist = Path("./../data/latimes/").glob('**/la*')

    tmpPreprocessor = analysis.preprocessor
    analysis.setPreprocessor(
        preprocessing.Preprocessor(activate_stemmer=False))
    timeToExtract = []
    timeToMerge = []
    timeToFlush = [0] * len(array_of_iterations)
    timeTotal = []
    timeToAnalyse = []
    for numBatch, nbDocsToRead in enumerate(array_of_iterations):
        startBatch = time.time()
        folder = './workspace/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
        filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                     str(nbDocsToRead))
        start = time.time()
        pathlist = Path("./../data/latimes/").glob('**/la*')
        vocabulary = dict()
        nbDocsRed = 0
        nbDocsInMemory = 0
        print("analysis in progress")
        for i, newspaper_path in enumerate(pathlist):

            if nbDocsRed >= nbDocsToRead:
                break
            docsRedInDocIteration = -1
            nbDocsRedInThisJournal = 0
            while (docsRedInDocIteration != 0):
                docsRedInDocIteration = analysis.analyse_newspaper(
                    newspaper_path, vocabulary, None, False,
                    nbDocsRedInThisJournal, nbDocsRedInThisJournal + stepFlush)
                nbDocsRed = docsRedInDocIteration + nbDocsRed
                nbDocsInMemory += docsRedInDocIteration
                nbDocsRedInThisJournal += docsRedInDocIteration
                if nbDocsInMemory == stepFlush or nbDocsRed >= nbDocsToRead:
                    startFlush = time.time()
                    filemanager.save_vocabularyAndPL_file(vocabulary,
                                                          isPartial=True)
                    vocabulary = dict()
                    nbDocsInMemory = 0
                    timeToFlush[numBatch] += (time.time() - startFlush)
                if nbDocsRed >= nbDocsToRead:
                    break
            if nbDocsRed >= nbDocsToRead:
                break
        if nbDocsRed < nbDocsToRead:
            print("Benchmark invalid, as we ran out of documents to read.")
        timeToExtract.append(time.time() - start)
        start = time.time()
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        timeToMerge.append(time.time() - start)
        timeTotal.append(time.time() - startBatch)

    analysis.setPreprocessor(tmpPreprocessor)
    print(array_of_iterations)
    print("Ttmerge")
    print(timeToMerge)
    plt.plot(array_of_iterations, timeToMerge, label="Time to merge")
    print("Ttextract")
    print(timeToExtract)
    plt.plot(array_of_iterations,
             timeToExtract,
             label="Time to analyse document (with flushing)")
    print("Ttflush")
    print(timeToFlush)
    plt.plot(array_of_iterations, timeToFlush, label="Time to flush documents")
    print("Overalltime")
    print(timeTotal)
    plt.plot(array_of_iterations, timeTotal, label="Overall time")
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
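A hypothetical call to the benchmark above; the document counts and the flush step are placeholder values:

# Hedged usage sketch: benchmark indexing-with-merging for growing batches of documents,
# flushing partial vocabularies and posting lists to disk every 500 documents.
analyseAndMergeDocuments([1000, 5000, 10000], stepFlush=500)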
Example No. 11
def analyseAndSaveDocuments(array_of_iterations,
                            computeIDF=False,
                            numberIterations=1):
    totaltimeToExtract = []
    totaltimeToSave = []
    totaltimeTotal = []
    totaltimeToComputeIDF = []
    for i in range(0, numberIterations):
        path = ""
        print("analyse_newspaper")
        print("Save only in the end, no merging involved")
        pathlist = Path("./../data/latimes/").glob('**/la*')

        tmpPreprocessor = analysis.preprocessor
        analysis.setPreprocessor(
            preprocessing.Preprocessor(activate_stemmer=False))
        timeToExtract = []
        timeToSave = []
        timeTotal = []
        timeToComputeIDF = []
        for numBatch, nbDocsToRead in enumerate(array_of_iterations):
            startBatch = time.time()
            folder = './workspace/'
            for the_file in os.listdir(folder):
                file_path = os.path.join(folder, the_file)
                try:
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
                except Exception as e:
                    print(e)
            filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                         str(nbDocsToRead))
            start = time.time()
            pathlist = Path("./../data/latimes/").glob('**/la*')
            vocabulary = dict()
            nbDocsRed = 0
            print("analysis in progress")
            for i, newspaper_path in enumerate(pathlist):

                if nbDocsRed >= nbDocsToRead:
                    break
                docsRedInDocIteration = -1

                while (docsRedInDocIteration != 0):
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, None, False, 0,
                        nbDocsToRead - nbDocsRed)
                    nbDocsRed = docsRedInDocIteration + nbDocsRed

                    if nbDocsRed >= nbDocsToRead:
                        break
                if nbDocsRed >= nbDocsToRead:
                    break
            if nbDocsRed < nbDocsToRead:
                print("Benchmark invalid, as we ran out of documents to read.")
            timeToExtract.append(time.time() - start)
            if computeIDF:
                startComputeIDF = time.time()
                analysis.computeIDF(vocabulary)
                timeToComputeIDF.append(time.time() - startComputeIDF)
            start = time.time()
            print("Saving in progress…")
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=False)
            timeToSave.append(time.time() - start)
            timeTotal.append(time.time() - startBatch)

        analysis.setPreprocessor(tmpPreprocessor)
        print("Number of documents :")
        print(array_of_iterations)
        plt.plot(array_of_iterations,
                 timeToExtract,
                 label="Time to analyse documents")
        print("Time to extract :")
        print(timeToExtract)

        if computeIDF:
            plt.plot(array_of_iterations,
                     timeToComputeIDF,
                     label="Time to compute IDF")

            print("Time to compute IDF :")
            print(timeToComputeIDF)

        plt.plot(array_of_iterations, timeToSave, label="Time to save")
        print("Time to save :")
        print(timeToSave)
        plt.plot(array_of_iterations, timeTotal, label="Overall time")
        print("Overall Time :")
        print(timeTotal)
        plt.xlabel("Number of Documents")
        plt.ylabel("Time (s)")
        plt.legend()
        plt.show()
        totaltimeToExtract.append(timeToExtract)
        if computeIDF:
            totaltimeToComputeIDF.append(timeToComputeIDF)
        totaltimeToSave.append(timeToSave)
        totaltimeTotal.append(timeTotal)

    if computeIDF:
        print("computeidf")
        resIDF = [0] * len(totaltimeToComputeIDF[0])
        for arr in totaltimeToComputeIDF:
            for i, elt in enumerate(arr):
                resIDF[i] = resIDF[i] + elt / len(totaltimeToComputeIDF)

        print(totaltimeToComputeIDF)
        print(resIDF)
    print("extract")
    resextract = [0] * len(totaltimeToExtract[0])
    for arr in totaltimeToExtract:
        for i, elt in enumerate(arr):
            resextract[i] = resextract[i] + elt / len(totaltimeToExtract)
    print(totaltimeToExtract)
    print(resextract)
    print("save")
    ressave = [0] * len(totaltimeToSave[0])
    for arr in totaltimeToSave:
        for i, elt in enumerate(arr):
            ressave[i] = ressave[i] + elt / len(totaltimeToSave)
    print(totaltimeToSave)
    print(ressave)
    print("total")
    restotal = [0] * len(totaltimeTotal[0])
    for arr in totaltimeTotal:
        for i, elt in enumerate(arr):
            restotal[i] = restotal[i] + elt / len(totaltimeTotal)
    print(totaltimeTotal)
    print(restotal)
    plt.plot(array_of_iterations,
             resextract,
             label="Time to analyse documents")
    if computeIDF:
        plt.plot(array_of_iterations, resIDF, label="Time to compute IDF")
    plt.plot(array_of_iterations, ressave, label="Time to save")
    plt.plot(array_of_iterations, restotal, label="Overall time")
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
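A hypothetical call to this variant; the batch sizes and the number of repetitions are placeholders:

# Hedged usage sketch: benchmark single-save indexing (no merging), including IDF computation,
# averaged over 3 runs.
analyseAndSaveDocuments([1000, 5000, 10000], computeIDF=True, numberIterations=3)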
Example No. 12
def analyseAndSaveDocumentsMultithread(array_of_newspapers, computeIDF=False):
    path = ""
    print("analyse_newspaper")
    print("Save only in the end, no merging involved")

    pathlist = Path("./../data/latimes/").glob('**/la*')
    tmpPreprocessor = analysis.preprocessor
    analysis.setPreprocessor(
        preprocessing.Preprocessor(activate_stemmer=False))
    timeToExtract = []
    timeToSave = []
    timeTotal = []
    timeToAnalyse = []
    timeToComputeIDF = []
    for numBatch, nbNewsPaperToRead in enumerate(array_of_newspapers):
        startBatch = time.time()
        folder = './workspace/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
        filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                     str(nbNewsPaperToRead))
        start = time.time()
        pathlist = Path("./../data/latimes/").glob('**/la*')
        vocabulary = dict()
        nbNewspaperRed = 0
        nbDocsRed = 0
        print("analysis in progress")
        for i, newspaper_path in enumerate(pathlist):

            if nbNewspaperRed >= nbNewsPaperToRead:
                break
            docsRedInDocIteration = analysis.analyse_newspaper(
                newspaper_path, vocabulary, None, False)
            nbDocsRed = docsRedInDocIteration + nbDocsRed
            nbNewspaperRed += 1

        if nbNewspaperRed < nbNewsPaperToRead:
            print("Benchmark invalid, as we ran out of newspapers to read.")
        timeToExtract.append(time.time() - start)
        print("Documents read: ")
        print(nbDocsRed)
        if computeIDF:
            startComputeIDF = time.time()
            analysis.computeIDF(vocabulary)
            timeToComputeIDF.append(time.time() - startComputeIDF)
        start = time.time()
        print("Saving in progress…")
        filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=False)
        timeToSave.append(time.time() - start)
        timeTotal.append(time.time() - startBatch)

    analysis.setPreprocessor(tmpPreprocessor)
    print("Number of documents :")
    print(array_of_newspapers)
    plt.plot(array_of_newspapers,
             timeToExtract,
             label="Time to analyse documents")
    print("Time to extract :")
    print(timeToExtract)

    if computeIDF:
        plt.plot(array_of_newspapers,
                 timeToComputeIDF,
                 label="Time to compute IDF")

        print("Time to compute IDF :")
        print(timeToComputeIDF)

    plt.plot(array_of_newspapers, timeToSave, label="Time to save")
    print("Time to save :")
    print(timeToSave)
    plt.plot(array_of_newspapers, timeTotal, label="Overall time")
    print("Overall Time :")
    print(timeTotal)
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
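A hypothetical call to the newspaper-granularity benchmark; the newspaper counts are placeholders:

# Hedged usage sketch: benchmark whole-newspaper analysis for 5, 10 and 20 newspapers,
# timing the IDF computation as well.
analyseAndSaveDocumentsMultithread([5, 10, 20], computeIDF=True)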