Example #1
    def test_merging(self):
        voc = dict()
        currentWorkspace = './tests/workspace/test3/'
        filename = 'test3'

        pathlist = Path("./tests/data/test3/").glob('**/la*')

        filemanager = fm.FileManager(filename, currentWorkspace)

        for path in pathlist:
            analysis.analyse_newspaper(path, voc)
            filemanager.save_vocabularyAndPL_file(voc, True)
            voc = dict()

        filemanager.mergePartialVocsAndPL(False)

        # TODO: change this once we have a direct function
        savedVoc = filemanager.read_vocabulary()
        mot, sortedByScore = query.get_posting_list(
            savedVoc, "aa", filemanager, True)
        self.assertEqual(mot, {1: [0, 3], 2: [0, 2], 3: [
                         0, 1], 4: [0, 3], 5: [0, 2], 6: [0, 1]})
        # The score is equal to zero
        self.assertEqual(sortedByScore, [(0, 1),(0, 2),(0, 3),(0, 4),(0, 5),(0, 6)])
        mot, sortedByScore = query.get_posting_list(
            savedVoc, "bb", filemanager, True)
        # The score is equal to zero
        self.assertEqual(sortedByScore, [(0, 1), (0,2), (0,4), (0,5)])
        self.assertEqual(mot, {1: [0, 1], 2: [0, 1], 4: [0, 1], 5: [0, 1]})
        mot, sortedByScore = query.get_posting_list(
            savedVoc, "cc", filemanager, True)
        self.assertEqual(mot, {3: [0, 1], 6: [0, 1]})
        self.assertEqual(sortedByScore, [(0, 3),(0, 6)])
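The assertions above pin down the data shapes: query.get_posting_list returns a dict mapping each document id to [score, frequency], together with a list of (score, doc_id) pairs ordered by decreasing score, ties broken by ascending document id. A minimal, self-contained sketch of that ordering, using only values taken from the assertions (it is not part of the project's code):

def sort_postings_by_score(postings):
    # Highest score first, ties broken by ascending document id; with all
    # scores equal to zero this degenerates to plain document-id order.
    return sorted(((score, doc_id) for doc_id, (score, _freq) in postings.items()),
                  key=lambda pair: (-pair[0], pair[1]))

assert sort_postings_by_score({3: [0, 1], 6: [0, 1]}) == [(0, 3), (0, 6)]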
Example #2
    def test_topk_trivial_file(self):

        pathlist = Path("./tests/data/testtrivialtopk/").glob('**/la*')

        filemana = filemanager.FileManager(
            "TestFaginsTopK", "./tests/workspace/testsfaginstopk/")
        tempVoc = SortedDict()
        for path in pathlist:
            analysis.analyse_newspaper(path, tempVoc, computeIDF=True)
        filemana.save_vocabularyAndPL_file(tempVoc)

        # Extraction of the saved Voc
        savedVoc = filemana.read_vocabulary()

        topk = faginstatopk.apply_fagins_ta(['aa', 'bb'], savedVoc, filemana,
                                            0, 5)
        #If conjunctive :
        #self.checkResultApproximative(topk,[(2,(math.log(3/4)+math.log(3/2))/2)])
        self.checkResultApproximative(
            topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 2),
                   (1, math.log(3 / 4) / 2), (3, math.log(3 / 4) / 2)])

        topk = faginstatopk.apply_fagins_ta(['bb'], savedVoc, filemana, 0, 5)
        self.checkResultApproximative(topk, [(2, math.log(3 / 2))])

        topk = faginstatopk.apply_fagins_ta(['cc'], savedVoc, filemana, 0, 5)
        self.checkResultApproximative(topk, [])

        topk = faginstatopk.apply_fagins_ta(['cc', 'dd'], savedVoc, filemana,
                                            0, 5)
        self.checkResultApproximative(topk, [])
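The expected tuples above are (document id, aggregated score), where the aggregated score is the mean of the per-term scores (natural logarithms in these fixtures), e.g. (math.log(3 / 4) + math.log(3 / 2)) / 2 for document 2 on the query ['aa', 'bb']. A tiny sketch of that mean aggregation, independent of the project's code:

import math

def aggregative_mean(per_term_scores):
    # Average of the per-term scores for one document, matching the values
    # asserted above, e.g. (log(3/4) + log(3/2)) / 2 for document 2.
    return sum(per_term_scores) / len(per_term_scores)

assert math.isclose(aggregative_mean([math.log(3 / 4), math.log(3 / 2)]),
                    (math.log(3 / 4) + math.log(3 / 2)) / 2)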
Example #3
    def test_merging_3_files_scores(self):
        voc = dict()
        currentWorkspace = './tests/workspace/test4/'
        filename = 'test4merging3filesscores'

        pathlist = Path("./tests/data/test4/").glob('**/la*')

        filemanager = fm.FileManager(filename, currentWorkspace)

        for path in pathlist:
            analysis.analyse_newspaper(path, voc, computeIDF=True)
            filemanager.save_vocabularyAndPL_file(voc, True)
            voc = dict()

        filemanager.mergePartialVocsAndPL(True)

        savedVoc = filemanager.read_vocabulary()
        mot,sortedByScore = query.get_posting_list(savedVoc, "aa", filemanager, True)
        self.assertEqual(mot, {1: [0.24718092381954193, 3.0], 2: [0.32882189750671387, 6.0], 5: [0.11778303235769272, 1.0], 6: [
                         0.11778303235769272, 1.0], 20: [0.24718092381954193, 3.0], 21: [0.19942401349544525, 2.0], 22: [0.11778303235769272, 1.0]})
        
        self.assertEqual(sortedByScore, [(0.32882189750671387, 2), (0.24718092381954193, 1), (0.24718092381954193, 20), (0.19942401349544525, 21), (0.11778303235769272, 5),(0.11778303235769272, 6),(0.11778303235769272, 22)])
        
        mot,sortedByScore = query.get_posting_list(savedVoc, "bb", filemanager,True)
        self.assertEqual(mot, {1: [0.5274115204811096, 3.0], 2: [0.7016094326972961, 6.0], 4: [0.2513144314289093, 1.0], 5: [0.2513144314289093, 1.0], 20: [0.2513144314289093, 1.0], 21: [0.2513144314289093, 1.0]})
        self.assertEqual(sortedByScore,[(0.7016094326972961, 2), (0.5274115204811096, 1), (0.2513144314289093,4),(0.2513144314289093, 5),(0.2513144314289093,20),(0.2513144314289093, 21)])
    def test_read_postingList(self):
        currentWorkspace = './tests/workspace/testfilemanager2/'
        filename = 'testfm2'
        postingList = dict()
        postingList[1] = [0, 101]
        postingList[2] = [0, 30023]
        postingList[294] = [0, 159]
        postingList[23445] = [0, 3006]
        filemanager = fm.FileManager(filename, currentWorkspace)
        filemanager.save_postList(postingList, 0)
        pl = filemanager.read_postList(0, 4)
        self.assertEqual(pl, {1: [0, 101], 2: [0, 30023], 294: [0, 159], 23445: [0, 3006]},
                         "The sorted Dict should be the same")
Example #5
    def test_merging_3_files(self):
        voc = dict()
        currentWorkspace = './tests/workspace/test4/'
        filename = 'test4merging3files'

        pathlist = Path("./tests/data/test4/").glob('**/la*')

        filemanager = fm.FileManager(filename, currentWorkspace)


        for path in pathlist:
            analysis.analyse_newspaper(path, voc)
            filemanager.save_vocabularyAndPL_file(voc, True)
            voc = dict()

        filemanager.mergePartialVocsAndPL(False)

        # TODO: change this once we have a direct function
        savedVoc = filemanager.read_vocabulary()
        mot = query.get_posting_list(savedVoc, "aa", filemanager)
        self.assertEqual(mot, {1: [0, 3.0], 2: [0, 6.0], 5: [0, 1.0], 6: [
                         0, 1.0], 20: [0, 3.0], 21: [0, 2.0], 22: [0, 1.0]})
        mot = query.get_posting_list(savedVoc, "bb", filemanager)
        self.assertEqual(mot, {1: [0, 3], 2: [0, 6], 20: [
                         0, 1], 21: [0, 1], 4: [0, 1], 5: [0, 1]})
        mot = query.get_posting_list(savedVoc, "cc", filemanager)
        self.assertEqual(mot, {1: [0, 1], 2: [0, 3], 22: [
                         0, 1], 4: [0, 1], 6: [0, 1]})
        mot = query.get_posting_list(savedVoc, "dd", filemanager)
        self.assertEqual(mot, {1: [0, 2], 2: [0, 1]})
        mot = query.get_posting_list(savedVoc, "ff", filemanager)
        self.assertEqual(mot, {1: [0, 1], 20: [0, 1], 6: [0, 1]}, "FF")
        mot = query.get_posting_list(savedVoc, "qq", filemanager)
        self.assertEqual(mot, {1: [0, 1], 5: [0, 1]})
        mot = query.get_posting_list(savedVoc, "rr", filemanager)
        self.assertEqual(mot, {1: [0, 5], 21: [0, 1]})
        mot = query.get_posting_list(savedVoc, "ee", filemanager)
        self.assertEqual(mot, {1: [0, 1], 23: [0, 1]})
        mot = query.get_posting_list(savedVoc, "vv", filemanager)
        self.assertEqual(mot, {1: [0, 1]})
        mot = query.get_posting_list(savedVoc, "yy", filemanager)
        self.assertEqual(mot, {1: [0, 1]})
        mot = query.get_posting_list(savedVoc, "kk", filemanager)
        self.assertEqual(mot, {2: [0, 1], 23: [0, 1]})
        mot = query.get_posting_list(savedVoc, "ii", filemanager)
        self.assertEqual(mot, {2: [0, 1]})
        mot = query.get_posting_list(savedVoc, "jj", filemanager)
        self.assertEqual(mot, {2: [0, 1]})
        mot = query.get_posting_list(savedVoc, "hh", filemanager)
        self.assertEqual(mot, {23: [0, 1]})
        mot = query.get_posting_list(savedVoc, "ll", filemanager)
        self.assertEqual(mot, {}, 'll is considered a stopword')
    def test_creation_postingLists(self):
        currentWorkspace = './tests/workspace/testfilemanager1/'
        filename = 'testfm1'
        postingList = dict()
        postingList[1] = [0, 101]
        postingList[2] = [0, 30023]
        postingList[34] = [0, 308.0]
        postingList[294] = [0, 159]
        postingList[2324] = [0, 3005]
        postingList[23445] = [0, 3006]
        filemanager = fm.FileManager(filename, currentWorkspace)
        filemanager.save_postList(postingList, 0)
        self.assertTrue(os.path.isfile(currentWorkspace + filename + '.pl'),
                        "The file .pl should exist")
        self.assertTrue(os.path.isfile(currentWorkspace + filename + '.vo'),
                        "The file .vo should exist")
Example #7
    def test_topk_trivial_file(self):

        pathlist = Path("./tests/data/testtrivialtopk/").glob('**/la*')

        filemana = filemanager.FileManager(
            "TestFaginsTopK", "./tests/workspace/testsfaginstopk")
        tempVoc = dict()
        for path in pathlist:
            analysis.analyse_newspaper(path, tempVoc, computeIDF=True)
        filemana.save_vocabularyAndPL_file(tempVoc)

        # Extraction of the saved Voc
        savedVoc = filemana.read_vocabulary()
        topk = naivetopk.apply_naive_top_k_algo(['bb'], savedVoc, filemana, 0,
                                                5,
                                                naivetopk.conjunctive_queries)
        self.checkResultApproximative(topk, [(2, math.log(3 / 2))])

        topk = naivetopk.apply_naive_top_k_algo(['cc'], savedVoc, filemana, 0,
                                                5,
                                                naivetopk.conjunctive_queries)
        self.checkResultApproximative(topk, [])

        topk = naivetopk.apply_naive_top_k_algo(['cc', 'dd'], savedVoc,
                                                filemana, 0, 5,
                                                naivetopk.conjunctive_queries)
        self.checkResultApproximative(topk, [])

        topk = naivetopk.apply_naive_top_k_algo(['cc', 'dd'], savedVoc,
                                                filemana, 0, 5,
                                                naivetopk.disjunctive_queries)
        self.checkResultApproximative(topk, [])

        topk = naivetopk.apply_naive_top_k_algo(['bb'], savedVoc, filemana, 0,
                                                5,
                                                naivetopk.disjunctive_queries)
        self.checkResultApproximative(topk, [(2, math.log(3 / 2))])

        topk = naivetopk.apply_naive_top_k_algo(['aa', 'bb'], savedVoc,
                                                filemana, 0, 1,
                                                naivetopk.disjunctive_queries)
        self.checkResultApproximative(
            topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 2)])

        topk = naivetopk.apply_naive_top_k_algo(['aa', 'bb', 'cc'], savedVoc,
                                                filemana, 0, 1,
                                                naivetopk.disjunctive_queries)
        self.checkResultApproximative(
            topk, [(2, (math.log(3 / 4) + math.log(3 / 2)) / 3)])
    def test_modify_postingList(self):
        currentWorkspace = './tests/workspace/testfilemanager3/'
        filename = 'testfm3'
        postingList = dict()
        postingList[1] = [0, 101]
        postingList[23] = [0, 30023]
        postingList[234] = [0, 3006]
        filemanager = fm.FileManager(filename, currentWorkspace)
        # TODO: what does the offset change?
        filemanager.save_postList(postingList, 0)
        postingList[1] = [0, 201]
        filemanager.save_postList(postingList, 0)
        postingList[1] = [0, 301]
        filemanager.save_postList(postingList, 0)
        pl = filemanager.read_postList(0, 3)
        self.assertEqual(pl, {1: [0, 301], 23: [0, 30023], 234: [0, 3006]},
                         "The sorted Dict should be the same")
Example #9
    def test_simple(self):
        voc = dict()
        currentWorkspace = './tests/workspace/test1/'
        filename = 'test1'

        pathlist = Path("./tests/data/test1/").glob('**/la*')
        for path in pathlist:
            analysis.analyse_newspaper(path, voc)
        filemanager = fm.FileManager(filename, currentWorkspace)
        filemanager.save_vocabularyAndPL_file(voc, False)
        savedVoc = filemanager.read_vocabulary()

        mot1 = query.get_posting_list(savedVoc, "aa", filemanager)
        mot2 = query.get_posting_list(savedVoc, "bb", filemanager)
        mot3 = query.get_posting_list(savedVoc, "cc", filemanager)
        self.assertEqual(mot1, {1: [0, 3], 2: [0, 2], 3: [0, 1]})
        self.assertEqual(mot2, {1: [0, 1], 2: [0, 1]})
        self.assertEqual(mot3, {3: [0, 1]})
Example #10
def analyse(nbNewspaper,
            path="./latimes/",
            flushEvery=1,
            analysisApproach=analysis.analyse_newspaper,
            mergeInTheEnd=True,
            useStemmer=True,
            sizeDocument=medium):
    """
  This benchmark will analyse documents, put the VOC and PL in memory
  and eventually flush it to the hardrive if requested. 
  In the end, a VOC and PL file will be created on the hardrive
  
  nbNewspaper is the number of newspaper we will go through in path
  path is the path to the directory
  flushEvery is the frequency of flush. (-1 if we never flush)
  mergeInTheEnd : if false, no merge in the end is proceeded and vocabulary is reset at the end of each loop
  """
    pathlist = Path(path).glob('**/la*')
    vocabulary = SortedDict()
    filemanager = fm.FileManager("benchmarkAnalysisTest")
    flushCounter = 0
    tmpPreprocessor = analysis.preprocessor
    if not useStemmer:
        analysis.setPreprocessor(
            preprocessing.Preprocessor(activateStemmer=False))
    for i, newspaper_path in enumerate(pathlist):
        if i >= nbNewspaper:
            break

        flushCounter += 1
        analysisApproach(newspaper_path, vocabulary, False)
        if mergeInTheEnd == False:
            vocabulary = SortedDict()
            continue
        if flushCounter >= flushEvery and flushEvery != -1:
            flushCounter = 0
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)
            vocabulary = SortedDict()
    if mergeInTheEnd:
        filemanager.mergePartialVocsAndPL()
    analysis.setPreprocessor(tmpPreprocessor)
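For reference, a call to the benchmark defined above might look as follows; the newspaper count, flush frequency and stemmer setting are illustrative values, not taken from the project:

# Hypothetical invocation of the benchmark: analyse 10 newspapers from the
# default ./latimes/ directory, flush partial VOC/PL files every 2 newspapers
# and merge them at the end (values are illustrative only).
analyse(10,
        flushEvery=2,
        mergeInTheEnd=True,
        useStemmer=False)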
Example #11
    def test_with_stopwords(self):
        voc = dict()
        currentWorkspace = './tests/workspace/test2/'
        filename = 'test2'

        pathlist = Path("./tests/data/test2/").glob('**/la*')
        for path in pathlist:
            analysis.analyse_newspaper(path, voc)

        filemanager = fm.FileManager(filename, currentWorkspace)
        filemanager.save_vocabularyAndPL_file(voc)
        # TODO: change this once we have a direct function
        savedVoc = filemanager.read_vocabulary()
        mot1 = query.get_posting_list(savedVoc, "aa", filemanager)
        mot2 = query.get_posting_list(savedVoc, "bb", filemanager)
        mot3 = query.get_posting_list(savedVoc, "cc", filemanager)
        self.assertEqual(mot1, {1: [0, 1], 2: [0, 2]})
        self.assertEqual(mot2, {1: [0, 4], 2: [0, 1]})
        self.assertEqual(mot3, {2: [0, 2]})

        stop1 = query.get_posting_list(savedVoc, "doing", filemanager)
        self.assertEqual(stop1, {})
Example #12
def analysis_parameters():
    global MAX_RANDOM_INDEXING

    parser = argparse.ArgumentParser()

    parser.add_argument("-d",
                        type=str,
                        help="dossier avec les documents",
                        required=True)
    parser.add_argument(
        "-f",
        type=str,
        help="nom de fichier pour enregistrer les fichiers après l'indexation ",
        required=True)
    parser.add_argument(
        "-o",
        type=str,
        default='./workspace/',
        help="dossier pour enregistrer les fichiers après l'indexation ")
    parser.add_argument("--zip",
                        action='store_true',
                        help="compression zip à la fin")
    parser.add_argument(
        "--partial",
        type=int,
        default=-1,
        help=
        'créer les fichiers par réunion de plusieurs fichiers avec une granularité de documents choisie. Si -2, alors granularité d\'un journal. Valeur conseillée : 2000.'
    )
    parser.add_argument("--stemmer",
                        action='store_true',
                        help='activer stemmer')
    parser.add_argument("--randomindexing",
                        action='store_true',
                        help='activer random indexing')
    args = parser.parse_args()

    latimes_path = args.d
    if not args.d.endswith("/"):
        latimes_path += "/"

    workspace_path = args.o
    if not args.o.endswith("/"):
        workspace_path += "/"

    pathlist = Path(latimes_path).glob('**/la*')

    vocabulary = dict()
    filemanager = fm.FileManager(args.f, workspace_path)
    random_indexing = None
    if args.randomindexing:
        random_indexing = ri.RandomIndexing()

    if args.stemmer:
        analysis.setPreprocessor(preprocessing.Preprocessor(True))

    if args.partial == -2:
        print("Partial analysis in progress")
        for newspaper_path in tqdm(list(pathlist)):
            docsRedInDocIteration = analysis.analyse_newspaper(
                newspaper_path, vocabulary, None, False)
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)
            vocabulary = dict()
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        print("PL and VOC merged succesfully")
    elif args.partial != -1:
        nbDocsInMemory = 0
        stepFlush = args.partial

        rand_indexing_counter = 0
        for newspaper_path in tqdm(list(pathlist)):

            docsRedInDocIteration = -1
            nbDocsRedInThisJournal = 0
            while (docsRedInDocIteration != 0):
                if rand_indexing_counter < MAX_RANDOM_INDEXING:
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, random_indexing, False,
                        nbDocsRedInThisJournal,
                        nbDocsRedInThisJournal + stepFlush)
                else:
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, None, False,
                        nbDocsRedInThisJournal,
                        nbDocsRedInThisJournal + stepFlush)
                nbDocsInMemory += docsRedInDocIteration
                nbDocsRedInThisJournal += docsRedInDocIteration
                if nbDocsInMemory >= stepFlush:
                    filemanager.save_vocabularyAndPL_file(vocabulary,
                                                          isPartial=True)
                    vocabulary = dict()
                    nbDocsInMemory = 0
                rand_indexing_counter += 1

        if nbDocsInMemory != 0:
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=True)

        print("Merging in progress…")

        filemanager.mergePartialVocsAndPL()
        print("PL and VOC merged succesfully")
        print("Inverted file created !")

    else:
        print("Non partial")
        rand_indexing_counter = 0
        for newspaper_path in tqdm(list(pathlist)):
            if rand_indexing_counter < MAX_RANDOM_INDEXING:
                rand_indexing_counter += 1
                analysis.analyse_newspaper(newspaper_path, vocabulary,
                                           random_indexing, False)
            else:
                analysis.analyse_newspaper(newspaper_path, vocabulary, None,
                                           False)
        analysis.computeIDF(vocabulary)
        filemanager.save_vocabularyAndPL_file(vocabulary)

        print("Inverted file created !")

    if args.zip:

        print("Compressing…")
        filemanager = fm.FileManager(args.f, args.o)

        zip.compressPLVBYTEFromSavedVocAndPL(filemanager)

        zip.compressZip(filemanager.getPathPLCompressed())

        zip.compressZip(filemanager.getPathVocCompressed())

        zip.compressZip(filemanager.getPathPLScore())

        print("Compressed !")

    if args.randomindexing:
        filemanager.save_random_indexing(random_indexing.getTermsVectors(),
                                         random_indexing.getTermDimension())
        print("Random indexing created")
Example #13
    postingListsOrderedById = dict()
    postingListsOrderedById['aaa'] = pl1_id
    postingListsOrderedById['bbb'] = pl2_id
    print('postingListsOrderedById : {}'.format(postingListsOrderedById))
    print('postingListsOrderedByScore : {}'.format(postingListsOrderedByScore))
    return postingListsOrderedById, postingListsOrderedByScore


if __name__ == "__main__":

    # Applying Top K Algorithm to mockData
    # postingListsOrderedById, postingListsOrderedByScore = createMockData()
    # c = find_fagins_ta(postingListsOrderedById, postingListsOrderedByScore, 3, aggregative_function_mean)
    # print("Resulta c : {}".format(c))

    currentWorkspace = './workspace/testfaginsta/'
    filename = 'test1'
    filemanag = fm.FileManager(filename, currentWorkspace)

    tempVoc = SortedDict()

    pathlist = Path("./tests/data/test4/").glob('**/la*')
    for path in pathlist:
        analysis.analyse_newspaper(path, tempVoc, True)
    filemanag.save_vocabularyAndPL_file(tempVoc)

    savedVoc = filemanag.read_vocabulary()
    faginsta = apply_fagins_ta(['aa', 'bb'], savedVoc, filemanag, 0.2, 2)
    print("result faginsTA : {}".format(faginsta))
    parser.add_argument("--stemmer",
                        action='store_true',
                        help='activer stemmer')
    parser.add_argument("-n",
                        type=str,
                        required=True,
                        help='nombre de synonymes pour la requête')

    args = parser.parse_args()

    workspace_path = args.d
    if not args.d.endswith("/"):
        workspace_path += "/"

    random_indexing = ri.RandomIndexing()
    filemanager = fm.FileManager(args.f, workspace_path)

    ri_term, ri_voc = filemanager.read_random_indexing(
        random_indexing.getTermDimension())
    if args.stemmer:
        preprocessor = pp.Preprocessor(True)
    else:
        preprocessor = pp.Preprocessor(False)

    stemmed = preprocessor.process(args.t)
    try:
        indexToSearch = ri_term.index(stemmed[0])
        print("Synonyms for: {}".format(ri_term[indexToSearch]))
        res = classify(ri_voc[indexToSearch], ri_voc, int(args.n))
        for i, term_index in enumerate(res):
            print("{:<3} : {}".format(i, ri_term[term_index]))
    except ValueError:
        # The processed term is not in the random-indexing vocabulary.
        print("No synonyms found for: {}".format(args.t))
Example #15
def analyseAndSaveDocumentsMultithread(array_of_newspapers, computeIDF=False):
    path = ""
    print("analyse_newspaper")
    print("Save only in the end, no merging involved")

    pathlist = Path("./../data/latimes/").glob('**/la*')
    tmpPreprocessor = analysis.preprocessor
    analysis.setPreprocessor(
        preprocessing.Preprocessor(activate_stemmer=False))
    timeToExtract = []
    timeToSave = []
    timeTotal = []
    timeToAnalyse = []
    timeToComputeIDF = []
    for numBatch, nbNewsPaperToRead in enumerate(array_of_newspapers):
        startBatch = time.time()
        folder = './workspace/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
        filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                     str(nbNewsPaperToRead))
        start = time.time()
        pathlist = Path("./../data/latimes/").glob('**/la*')
        vocabulary = dict()
        nbNewspaperRed = 0
        nbDocsRed = 0
        print("analysis in progress")
        for i, newspaper_path in enumerate(pathlist):

            if nbNewspaperRed >= nbNewsPaperToRead:
                break
            docsRedInDocIteration = analysis.analyse_newspaper(
                newspaper_path, vocabulary, None, False)
            nbDocsRed = docsRedInDocIteration + nbDocsRed
            nbNewspaperRed += 1

        if nbNewspaperRed < nbNewsPaperToRead:
            print("Benchmark invalid, as we ran out of newspapers to read.")
        timeToExtract.append(time.time() - start)
        print("We read documents:")
        print(nbDocsRed)
        if computeIDF:
            startComputeIDF = time.time()
            analysis.computeIDF(vocabulary)
            timeToComputeIDF.append(time.time() - startComputeIDF)
        start = time.time()
        print("Saving in progress…")
        filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=False)
        timeToSave.append(time.time() - start)
        timeTotal.append(time.time() - startBatch)

    analysis.setPreprocessor(tmpPreprocessor)
    print("Number of documents :")
    print(array_of_newspapers)
    plt.plot(array_of_newspapers,
             timeToExtract,
             label="Time to analyse documents")
    print("Time to extract :")
    print(timeToExtract)

    if computeIDF:
        plt.plot(array_of_newspapers,
                 timeToComputeIDF,
                 label="Time to compute IDF")

        print("Time to compute IDF :")
        print(timeToComputeIDF)

    plt.plot(array_of_newspapers, timeToSave, label="Time to save")
    print("Time to save :")
    print(timeToSave)
    plt.plot(array_of_newspapers, timeTotal, label="Overall time")
    print("Overall Time :")
    print(timeTotal)
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
Example #16
def analyseAndSaveDocuments(array_of_iterations,
                            computeIDF=False,
                            numberIterations=1):
    totaltimeToExtract = []
    totaltimeToSave = []
    totaltimeTotal = []
    totaltimeToComputeIDF = []
    for i in range(0, numberIterations):
        path = ""
        print("analyse_newspaper")
        print("Save only in the end, no merging involved")
        pathlist = Path("./../data/latimes/").glob('**/la*')

        tmpPreprocessor = analysis.preprocessor
        analysis.setPreprocessor(
            preprocessing.Preprocessor(activate_stemmer=False))
        timeToExtract = []
        timeToSave = []
        timeTotal = []
        timeToComputeIDF = []
        for numBatch, nbDocsToRead in enumerate(array_of_iterations):
            startBatch = time.time()
            folder = './workspace/'
            for the_file in os.listdir(folder):
                file_path = os.path.join(folder, the_file)
                try:
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
                except Exception as e:
                    print(e)
            filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                         str(nbDocsToRead))
            start = time.time()
            pathlist = Path("./../data/latimes/").glob('**/la*')
            vocabulary = dict()
            nbDocsRed = 0
            print("analysis in progress")
            for i, newspaper_path in enumerate(pathlist):

                if nbDocsRed >= nbDocsToRead:
                    break
                docsRedInDocIteration = -1

                while (docsRedInDocIteration != 0):
                    docsRedInDocIteration = analysis.analyse_newspaper(
                        newspaper_path, vocabulary, None, False, 0,
                        nbDocsToRead - nbDocsRed)
                    nbDocsRed = docsRedInDocIteration + nbDocsRed

                    if nbDocsRed >= nbDocsToRead:
                        break
                if nbDocsRed >= nbDocsToRead:
                    break
            if nbDocsRed < nbDocsToRead:
                print("Benchmark invalid, as we ran out of documents to read.")
            timeToExtract.append(time.time() - start)
            if computeIDF:
                startComputeIDF = time.time()
                analysis.computeIDF(vocabulary)
                timeToComputeIDF.append(time.time() - startComputeIDF)
            start = time.time()
            print("Saving in progress…")
            filemanager.save_vocabularyAndPL_file(vocabulary, isPartial=False)
            timeToSave.append(time.time() - start)
            timeTotal.append(time.time() - startBatch)

        analysis.setPreprocessor(tmpPreprocessor)
        print("Number of documents :")
        print(array_of_iterations)
        plt.plot(array_of_iterations,
                 timeToExtract,
                 label="Time to analyse documents")
        print("Time to extract :")
        print(timeToExtract)

        if computeIDF:
            plt.plot(array_of_iterations,
                     timeToComputeIDF,
                     label="Time to compute IDF")

            print("Time to compute IDF :")
            print(timeToComputeIDF)

        plt.plot(array_of_iterations, timeToSave, label="Time to save")
        print("Time to save :")
        print(timeToSave)
        plt.plot(array_of_iterations, timeTotal, label="Overall time")
        print("Overall Time :")
        print(timeTotal)
        plt.xlabel("Number of Documents")
        plt.ylabel("Time (s)")
        plt.legend()
        plt.show()
        totaltimeToExtract.append(timeToExtract)
        if computeIDF:
            totaltimeToComputeIDF.append(timeToComputeIDF)
        totaltimeToSave.append(timeToSave)
        totaltimeTotal.append(timeTotal)

    if computeIDF:
        print("computeidf")
        resIDF = [0] * len(totaltimeToComputeIDF[0])
        for arr in totaltimeToComputeIDF:
            for i, elt in enumerate(arr):
                resIDF[i] = resIDF[i] + elt / len(totaltimeToComputeIDF)

        print(totaltimeToComputeIDF)
        print(resIDF)
    print("extract")
    resextract = [0] * len(totaltimeToExtract[0])
    for arr in totaltimeToExtract:
        for i, elt in enumerate(arr):
            resextract[i] = resextract[i] + elt / len(totaltimeToExtract)
    print(totaltimeToExtract)
    print(resextract)
    print("save")
    ressave = [0] * len(totaltimeToSave[0])
    for arr in totaltimeToSave:
        for i, elt in enumerate(arr):
            ressave[i] = ressave[i] + elt / len(totaltimeToSave)
    print(totaltimeToSave)
    print(ressave)
    print("total")
    restotal = [0] * len(totaltimeTotal[0])
    for arr in totaltimeTotal:
        for i, elt in enumerate(arr):
            restotal[i] = restotal[i] + elt / len(totaltimeTotal)
    print(totaltimeTotal)
    print(restotal)
    plt.plot(array_of_iterations,
             resextract,
             label="Time to analyse documents")
    if computeIDF:
        plt.plot(array_of_iterations, resIDF, label="Time to compute IDF")
    plt.plot(array_of_iterations, ressave, label="Time to save")
    plt.plot(array_of_iterations, restotal, label="Overall time")
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
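The averaging loops at the end of this benchmark compute the element-wise mean of each timing series across iterations. For illustration, an equivalent and more compact formulation (not the code used above):

def elementwise_mean(runs):
    # Element-wise mean across benchmark runs; equivalent to the accumulation
    # loops above (each element divided by the number of runs).
    return [sum(column) / len(column) for column in zip(*runs)]

assert elementwise_mean([[1.0, 2.0], [3.0, 4.0]]) == [2.0, 3.0]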
Example #17
def analyseAndMergeDocuments(array_of_iterations, stepFlush):
    path = ""
    print("analyse_newspaper")
    print("Merging involved, flush frequency : Every " + str(stepFlush) +
          " document.")
    pathlist = Path("./../data/latimes/").glob('**/la*')

    tmpPreprocessor = analysis.preprocessor
    analysis.setPreprocessor(
        preprocessing.Preprocessor(activate_stemmer=False))
    timeToExtract = []
    timeToMerge = []
    timeToFlush = [0] * len(array_of_iterations)
    timeTotal = []
    timeToAnalyse = []
    for numBatch, nbDocsToRead in enumerate(array_of_iterations):
        startBatch = time.time()
        folder = './workspace/'
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(e)
        filemanager = fm.FileManager("benchmarkAnalysisTest" +
                                     str(nbDocsToRead))
        start = time.time()
        pathlist = Path("./../data/latimes/").glob('**/la*')
        vocabulary = dict()
        nbDocsRed = 0
        nbDocsInMemory = 0
        print("analysis in progress")
        for i, newspaper_path in enumerate(pathlist):

            if nbDocsRed >= nbDocsToRead:
                break
            docsRedInDocIteration = -1
            nbDocsRedInThisJournal = 0
            while (docsRedInDocIteration != 0):
                docsRedInDocIteration = analysis.analyse_newspaper(
                    newspaper_path, vocabulary, None, False,
                    nbDocsRedInThisJournal, nbDocsRedInThisJournal + stepFlush)
                nbDocsRed = docsRedInDocIteration + nbDocsRed
                nbDocsInMemory += docsRedInDocIteration
                nbDocsRedInThisJournal += docsRedInDocIteration
                if nbDocsInMemory == stepFlush or nbDocsRed >= nbDocsToRead:
                    startFlush = time.time()
                    filemanager.save_vocabularyAndPL_file(vocabulary,
                                                          isPartial=True)
                    vocabulary = dict()
                    nbDocsInMemory = 0
                    timeToFlush[numBatch] += (time.time() - startFlush)
                if nbDocsRed >= nbDocsToRead:
                    break
            if nbDocsRed >= nbDocsToRead:
                break
        if nbDocsRed < nbDocsToRead:
            print("Benchmark invalid, as we ran out of documents to read.")
        timeToExtract.append(time.time() - start)
        start = time.time()
        print("Merging in progress…")
        filemanager.mergePartialVocsAndPL()
        timeToMerge.append(time.time() - start)
        timeTotal.append(time.time() - startBatch)

    analysis.setPreprocessor(tmpPreprocessor)
    print(array_of_iterations)
    print("Ttmerge")
    print(timeToMerge)
    plt.plot(array_of_iterations, timeToMerge, label="Time to merge")
    print("Ttextract")
    print(timeToExtract)
    plt.plot(array_of_iterations,
             timeToExtract,
             label="Time to analyse document (with flushing)")
    print("Ttflush")
    print(timeToFlush)
    plt.plot(array_of_iterations, timeToFlush, label="Time to flush documents")
    print("Overalltime")
    print(timeTotal)
    plt.plot(array_of_iterations, timeTotal, label="Overall time")
    plt.xlabel("Number of Documents")
    plt.ylabel("Time (s)")
    plt.legend()
    plt.show()
Example #18
def analysis_parameters():
    parser = argparse.ArgumentParser()

    parser.add_argument("-d", type=str, default='./workspace/',
                        help="dossier avec les fichier VOC et PL résultat de l'indexation")
    parser.add_argument("-f", type=str,
                        help="nom de fichier VOC et PL ", required=True)
    parser.add_argument("-q", type=str,
                        help="requête des termes separés par un virgule. Ex: voiture,maison ", required=True)
    parser.add_argument("-n", type=int, default=3,
                        help="nombre de résultats souhaité de documents ")
    parser.add_argument("--stemmer", action='store_true',
                        help="activer le stemming sur les termes de la requête")
    parser.add_argument("--algo", type=str, default="naive",
                        help="algorithme souhaité pour la requête ")
    parser.add_argument("--view", type=str, default="simple",
                        help="type de visualisation. Options possible: simple ou fullText ")
    parser.add_argument("--vpath", type=str, default="./data/latimes/",
                        help="path des fichier sources pour --view fullText")
    parser.add_argument("--improvedquery", action='store_true',
                        help="activer recherche de synonymes pour l'amélioration de la requête")

    args = parser.parse_args()
    latimes_path = args.d
    if not args.d.endswith("/"):
        latimes_path += "/"
    filemanager = fm.FileManager(args.f, latimes_path)
    savedVoc = filemanager.read_vocabulary()
    if args.stemmer:
        print("Stemmer activated")
        preprocessor = preprocessing.Preprocessor(True)
    else:
        preprocessor = preprocessing.Preprocessor(False)
    epsilon = 0

    switchAlgo = {"naive": naivetopk.apply_naive_top_k_algo,
                  "fagins": faginstopk.apply_top_k_algo,
                  "faginsTA": faginsta.apply_fagins_ta}

    algoFunct = switchAlgo[args.algo]

    words = preprocessor.process(args.q)
    words_request = []
    if args.improvedquery:
        random_indexing = ri.RandomIndexing()
        for word in words:
            words_request.append(word)

            try:
                synonymes = synknn.get_synonyms(
                    word, 2, random_indexing.getTermDimension(), filemanager)
                if len(synonymes) == 2:
                    words_request.append(synonymes[1])
            except Exception as e:
                print(e)
        print("Improved query: {}".format(words_request))
    else:
        words_request = words

    if (not filemanager.doesUnCompressedVersionExists()) and filemanager.doesCompressedVersionExists():
        print("Unzipping in progress…")
        compressor.decompressZip(filemanager.getPathPLCompressed(),filemanager.getPathPLCompressed())
        compressor.decompressZip(filemanager.getPathVocCompressed(),filemanager.getPathVocCompressed())
        compressor.decompressZip(filemanager.getPathPLScore(),filemanager.getPathPLScore())
        compressor.decompressPLVBYTE(filemanager)

    result = algoFunct(words_request, savedVoc, filemanager, epsilon, args.n)

    switchView = {"simple": view.displayResults,
                  "fullText": view.displayResultsText}
    viewFunct = switchView[args.view]
    print("\nResults: ")
    viewFunct(result, args.vpath)
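Finally, with the options defined above, a query run might be launched as follows; the script name and index file name are placeholders:

# Hypothetical command line (script and names are placeholders):
#   python search.py -d ./workspace/ -f latimes_index -q "car,house" \
#          -n 5 --algo fagins --view simple
#
# Programmatic equivalent, since analysis_parameters() parses sys.argv:
import sys

sys.argv = ["search.py", "-d", "./workspace/", "-f", "latimes_index",
            "-q", "car,house", "-n", "5", "--algo", "fagins"]
analysis_parameters()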