Example #1
0
    def test_merging(self):
        """
        Check posting lists read back after merging partial index files.

        Each file matched by '**/la*' under ./tests/data/test3/ is analysed
        into its own vocabulary, which is saved through the file manager
        before a fresh vocabulary is started. The saved parts are then merged
        and the posting lists of "aa", "bb" and "cc" are compared with the
        expected values, both as a dict and as a list of (score, key) tuples.
        """
        workspace = './tests/workspace/test3/'
        index_name = 'test3'

        newspapers = Path("./tests/data/test3/").glob('**/la*')

        filemanager = fm.FileManager(index_name, workspace)

        partial_voc = {}
        for newspaper in newspapers:
            analysis.analyse_newspaper(newspaper, partial_voc)
            # NOTE(review): the True flag presumably marks a partial save - confirm.
            filemanager.save_vocabularyAndPL_file(partial_voc, True)
            # Each newspaper file gets its own, initially empty, vocabulary.
            partial_voc = {}

        filemanager.mergePartialVocsAndPL(False)

        # TODO: switch to a direct lookup function once one exists.
        saved_voc = filemanager.read_vocabulary()

        postings, by_score = query.get_posting_list(
            saved_voc, "aa", filemanager, True)
        self.assertEqual(
            postings,
            {
                1: [0, 3],
                2: [0, 2],
                3: [0, 1],
                4: [0, 3],
                5: [0, 2],
                6: [0, 1],
            })
        # Every score is zero in this scenario.
        self.assertEqual(
            by_score,
            [(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6)])

        postings, by_score = query.get_posting_list(
            saved_voc, "bb", filemanager, True)
        # Every score is zero in this scenario.
        self.assertEqual(by_score, [(0, 1), (0, 2), (0, 4), (0, 5)])
        self.assertEqual(
            postings,
            {1: [0, 1], 2: [0, 1], 4: [0, 1], 5: [0, 1]})

        postings, by_score = query.get_posting_list(
            saved_voc, "cc", filemanager, True)
        self.assertEqual(postings, {3: [0, 1], 6: [0, 1]})
        self.assertEqual(by_score, [(0, 3), (0, 6)])
Example #2
0
    def test_merging_3_files_scores(self):
        """
        Check the scored posting lists obtained after merging the test4 files.

        Each file matched by '**/la*' under ./tests/data/test4/ is analysed
        with computeIDF=True into its own vocabulary and saved through the
        file manager. After merging, the posting lists of "aa" and "bb" are
        compared with the expected values, both as a dict and as a list of
        (score, key) tuples.
        """
        workspace = './tests/workspace/test4/'
        index_name = 'test4merging3filesscores'

        newspapers = Path("./tests/data/test4/").glob('**/la*')

        filemanager = fm.FileManager(index_name, workspace)

        partial_voc = {}
        for newspaper in newspapers:
            analysis.analyse_newspaper(newspaper, partial_voc, computeIDF=True)
            # NOTE(review): the True flag presumably marks a partial save - confirm.
            filemanager.save_vocabularyAndPL_file(partial_voc, True)
            # Each newspaper file gets its own, initially empty, vocabulary.
            partial_voc = {}

        filemanager.mergePartialVocsAndPL(True)

        saved_voc = filemanager.read_vocabulary()

        postings, by_score = query.get_posting_list(
            saved_voc, "aa", filemanager, True)
        self.assertEqual(
            postings,
            {
                1: [0.24718092381954193, 3.0],
                2: [0.32882189750671387, 6.0],
                5: [0.11778303235769272, 1.0],
                6: [0.11778303235769272, 1.0],
                20: [0.24718092381954193, 3.0],
                21: [0.19942401349544525, 2.0],
                22: [0.11778303235769272, 1.0],
            })
        self.assertEqual(
            by_score,
            [
                (0.32882189750671387, 2),
                (0.24718092381954193, 1),
                (0.24718092381954193, 20),
                (0.19942401349544525, 21),
                (0.11778303235769272, 5),
                (0.11778303235769272, 6),
                (0.11778303235769272, 22),
            ])

        postings, by_score = query.get_posting_list(
            saved_voc, "bb", filemanager, True)
        self.assertEqual(
            postings,
            {
                1: [0.5274115204811096, 3.0],
                2: [0.7016094326972961, 6.0],
                4: [0.2513144314289093, 1.0],
                5: [0.2513144314289093, 1.0],
                20: [0.2513144314289093, 1.0],
                21: [0.2513144314289093, 1.0],
            })
        self.assertEqual(
            by_score,
            [
                (0.7016094326972961, 2),
                (0.5274115204811096, 1),
                (0.2513144314289093, 4),
                (0.2513144314289093, 5),
                (0.2513144314289093, 20),
                (0.2513144314289093, 21),
            ])
Example #3
0
def apply_top_k_algo(words,
                     voc,
                     filemanager,
                     epsilon,
                     k,
                     typeRequest='disjunctive'):
    """
    Gather the posting lists of the query words and run Fagin's top-k on them.
    Preconditions:
        words : the words to search for
        voc : vocabulary handed to query.get_posting_list
        filemanager : file manager handed to query.get_posting_list
        epsilon : accepted for interface compatibility; not used here
        k : number of results, forwarded to find_fagins_top_k
        typeRequest : type of request, forwarded to find_fagins_top_k
    Postconditions:
        Returns the result of find_fagins_top_k (the top k documents)
    """
    lists_by_id = {}
    lists_by_score = {}

    for term in words:
        id_ordered, score_ordered = query.get_posting_list(
            voc, term, filemanager, True)

        # Words for which either posting list is empty are left out.
        if not (id_ordered and score_ordered):
            continue

        lists_by_id[term] = id_ordered
        lists_by_score[term] = score_ordered

    return find_fagins_top_k(lists_by_id, lists_by_score, k, typeRequest)
Example #4
0
    def test_simple(self):
        """
        Check the posting lists of a vocabulary saved in a single step.

        Every file matched by '**/la*' under ./tests/data/test1/ is analysed
        into one shared vocabulary, which is saved once and read back. The
        posting lists of "aa", "bb" and "cc" are then compared with the
        expected values.
        """
        workspace = './tests/workspace/test1/'
        index_name = 'test1'

        vocabulary = {}
        for newspaper in Path("./tests/data/test1/").glob('**/la*'):
            analysis.analyse_newspaper(newspaper, vocabulary)

        filemanager = fm.FileManager(index_name, workspace)
        filemanager.save_vocabularyAndPL_file(vocabulary, False)
        saved_voc = filemanager.read_vocabulary()

        # All three lookups are performed before the first assertion.
        aa_postings, bb_postings, cc_postings = [
            query.get_posting_list(saved_voc, word, filemanager)
            for word in ("aa", "bb", "cc")
        ]

        self.assertEqual(aa_postings, {1: [0, 3], 2: [0, 2], 3: [0, 1]})
        self.assertEqual(bb_postings, {1: [0, 1], 2: [0, 1]})
        self.assertEqual(cc_postings, {3: [0, 1]})
Example #5
0
def apply_fagins_ta(words, voc, filemanager, epsilon, k):
    """
    Gather the posting lists of the query words and pass them to find_fagins_ta.
    Preconditions:
        words : the words to search for
        voc : vocabulary handed to query.get_posting_list
        filemanager : file manager handed to query.get_posting_list
        epsilon : forwarded to find_fagins_ta
        k : forwarded to find_fagins_ta
    Postconditions:
        Returns the result of find_fagins_ta
    """
    lists_by_id = SortedDict()
    lists_by_score = SortedDict()

    for term in words:
        id_ordered, score_ordered = query.get_posting_list(
            voc, term, filemanager, returnPostingListOrderedByScore=True)

        # Words for which either posting list is empty are left out.
        if not (id_ordered and score_ordered):
            continue

        lists_by_id[term] = id_ordered
        lists_by_score[term] = score_ordered

    return find_fagins_ta(lists_by_id, lists_by_score, epsilon, k)
Example #6
0
    def test_with_stopwords(self):
        """
        Check posting lists for the test2 data, including an absent word.

        Every file matched by '**/la*' under ./tests/data/test2/ is analysed
        into one shared vocabulary, which is saved and read back. The posting
        lists of "aa", "bb" and "cc" are compared with the expected values,
        and the lookup of "doing" must give an empty dict.
        """
        workspace = './tests/workspace/test2/'
        index_name = 'test2'

        vocabulary = {}
        for newspaper in Path("./tests/data/test2/").glob('**/la*'):
            analysis.analyse_newspaper(newspaper, vocabulary)

        filemanager = fm.FileManager(index_name, workspace)
        filemanager.save_vocabularyAndPL_file(vocabulary)
        # TODO: switch to a direct lookup function once one exists.
        saved_voc = filemanager.read_vocabulary()

        # All three lookups are performed before the first assertion.
        aa_postings, bb_postings, cc_postings = [
            query.get_posting_list(saved_voc, word, filemanager)
            for word in ("aa", "bb", "cc")
        ]

        self.assertEqual(aa_postings, {1: [0, 1], 2: [0, 2]})
        self.assertEqual(bb_postings, {1: [0, 4], 2: [0, 1]})
        self.assertEqual(cc_postings, {2: [0, 2]})

        # NOTE(review): "doing" is presumably filtered out as a stopword - confirm.
        stopword_postings = query.get_posting_list(
            saved_voc, "doing", filemanager)
        self.assertEqual(stopword_postings, {})
Example #7
0
def apply_top_k_algo(words, voc, filemanager, epsilon, k, typeRequest = 'disjunctive'):
    """
    Gather the posting lists of the query words and run find_fagins_top_k.
    Preconditions:
        words : the words to search for
        voc : vocabulary handed to query.get_posting_list
        filemanager : file manager handed to query.get_posting_list
        epsilon : accepted for interface compatibility; not used here
        k : forwarded to find_fagins_top_k
        typeRequest : forwarded to find_fagins_top_k
    Postconditions:
        Returns the result of find_fagins_top_k
    """
    lists_by_id = SortedDict()
    lists_by_score = SortedDict()

    for term in words:
        id_ordered, score_ordered = query.get_posting_list(
            voc, term, filemanager, returnPostingListOrderedByScore=True)

        # Words for which either posting list is empty are left out.
        if not (id_ordered and score_ordered):
            continue

        lists_by_id[term] = id_ordered
        lists_by_score[term] = score_ordered

    return find_fagins_top_k(lists_by_id, lists_by_score, k, typeRequest)
Example #8
0
def apply_naive_top_k_algo(words,
                           voc,
                           filemanager,
                           epsilon,
                           k,
                           get_docs_func=disjunctive_queries):
    """
    Fetch one posting list per query word and run the naive top k algorithm.
    Preconditions:
        words : the words to search for
        voc : vocabulary handed to query.get_posting_list
        filemanager : file manager handed to query.get_posting_list
        epsilon : accepted for interface compatibility; not used here
        k : number of results
        get_docs_func : type of request (can be conjunctive_queries or disjunctive_queries)
    Postconditions:
        Returns [] when every posting list is empty (or words is empty),
        otherwise the result of naive_top_k_algo (the top k documents)
    """
    posting_lists = []
    for term in words:
        posting_lists.append(query.get_posting_list(voc, term, filemanager))

    # Nothing to rank when no word has a non-empty posting list.
    if not any(posting_lists):
        return []

    return naive_top_k_algo(posting_lists, k, get_docs_func)
Example #9
0
    def test_merging_3_files(self):
        """
        Check the merged posting lists built from the test4 data files.

        Each file matched by '**/la*' under ./tests/data/test4/ is analysed
        into its own vocabulary and saved through the file manager. After
        merging, the posting list of every word in the table below is looked
        up in order and compared with its expected value.
        """
        workspace = './tests/workspace/test4/'
        index_name = 'test4merging3files'

        newspapers = Path("./tests/data/test4/").glob('**/la*')

        filemanager = fm.FileManager(index_name, workspace)

        partial_voc = {}
        for newspaper in newspapers:
            analysis.analyse_newspaper(newspaper, partial_voc)
            # NOTE(review): the True flag presumably marks a partial save - confirm.
            filemanager.save_vocabularyAndPL_file(partial_voc, True)
            # Each newspaper file gets its own, initially empty, vocabulary.
            partial_voc = {}

        filemanager.mergePartialVocsAndPL(False)

        # TODO: switch to a direct lookup function once one exists.
        saved_voc = filemanager.read_vocabulary()

        # (word, expected posting list, assertion message or None)
        expectations = [
            ("aa", {1: [0, 3.0], 2: [0, 6.0], 5: [0, 1.0], 6: [0, 1.0],
                    20: [0, 3.0], 21: [0, 2.0], 22: [0, 1.0]}, None),
            ("bb", {1: [0, 3], 2: [0, 6], 20: [0, 1], 21: [0, 1],
                    4: [0, 1], 5: [0, 1]}, None),
            ("cc", {1: [0, 1], 2: [0, 3], 22: [0, 1], 4: [0, 1],
                    6: [0, 1]}, None),
            ("dd", {1: [0, 2], 2: [0, 1]}, None),
            ("ff", {1: [0, 1], 20: [0, 1], 6: [0, 1]}, "FF"),
            ("qq", {1: [0, 1], 5: [0, 1]}, None),
            ("rr", {1: [0, 5], 21: [0, 1]}, None),
            ("ee", {1: [0, 1], 23: [0, 1]}, None),
            ("vv", {1: [0, 1]}, None),
            ("yy", {1: [0, 1]}, None),
            ("kk", {2: [0, 1], 23: [0, 1]}, None),
            ("ii", {2: [0, 1]}, None),
            ("jj", {2: [0, 1]}, None),
            ("hh", {23: [0, 1]}, None),
            ("ll", {}, 'll is considered a stopword'),
        ]

        for word, expected, message in expectations:
            postings = query.get_posting_list(saved_voc, word, filemanager)
            self.assertEqual(postings, expected, message)