def test_merging(self):
    """Check posting lists read back after merging partial vocabularies.

    Each file matched under ./tests/data/test3/ is analysed into its own
    vocabulary, which is saved through the file manager before the partial
    files are merged. The posting lists of "aa", "bb" and "cc" are then
    compared, both ordered by document id and ordered by score.
    """
    workspace = './tests/workspace/test3/'
    newspapers = Path("./tests/data/test3/").glob('**/la*')
    manager = fm.FileManager('test3', workspace)

    for newspaper in newspapers:
        # A fresh vocabulary per file: every file is saved on its own.
        partial_voc = {}
        analysis.analyse_newspaper(newspaper, partial_voc)
        # NOTE(review): the True flag presumably marks a partial save - confirm
        # against FileManager.save_vocabularyAndPL_file.
        manager.save_vocabularyAndPL_file(partial_voc, True)

    manager.mergePartialVocsAndPL(False)

    # TODO: switch to a direct function once one exists.
    merged_voc = manager.read_vocabulary()

    def postings_for(word):
        """Return the (by id, by score) posting lists of *word*."""
        return query.get_posting_list(merged_voc, word, manager, True)

    by_id, by_score = postings_for("aa")
    self.assertEqual(
        by_id,
        {
            1: [0, 3],
            2: [0, 2],
            3: [0, 1],
            4: [0, 3],
            5: [0, 2],
            6: [0, 1],
        },
    )
    # Every score is expected to be zero.
    self.assertEqual(by_score, [(0, doc_id) for doc_id in range(1, 7)])

    by_id, by_score = postings_for("bb")
    # Every score is expected to be zero.
    self.assertEqual(by_score, [(0, 1), (0, 2), (0, 4), (0, 5)])
    self.assertEqual(
        by_id,
        {1: [0, 1], 2: [0, 1], 4: [0, 1], 5: [0, 1]},
    )

    by_id, by_score = postings_for("cc")
    self.assertEqual(by_id, {3: [0, 1], 6: [0, 1]})
    self.assertEqual(by_score, [(0, 3), (0, 6)])
def test_merging_3_files_scores(self):
    """Check scored posting lists after merging the test4 partial files.

    Each file matched under ./tests/data/test4/ is analysed with
    computeIDF=True, saved on its own, and the partial files are merged
    with the merge flag set to True. The posting lists of "aa" and "bb"
    are compared ordered by document id and ordered by score.
    """
    workspace = './tests/workspace/test4/'
    newspapers = Path("./tests/data/test4/").glob('**/la*')
    manager = fm.FileManager('test4merging3filesscores', workspace)

    for newspaper in newspapers:
        # A fresh vocabulary per file: every file is saved on its own.
        partial_voc = {}
        analysis.analyse_newspaper(newspaper, partial_voc, computeIDF=True)
        manager.save_vocabularyAndPL_file(partial_voc, True)

    # NOTE(review): True presumably enables score computation during the
    # merge - confirm against FileManager.mergePartialVocsAndPL.
    manager.mergePartialVocsAndPL(True)
    merged_voc = manager.read_vocabulary()

    def postings_for(word):
        """Return the (by id, by score) posting lists of *word*."""
        return query.get_posting_list(merged_voc, word, manager, True)

    expected_aa_by_id = {
        1: [0.24718092381954193, 3.0],
        2: [0.32882189750671387, 6.0],
        5: [0.11778303235769272, 1.0],
        6: [0.11778303235769272, 1.0],
        20: [0.24718092381954193, 3.0],
        21: [0.19942401349544525, 2.0],
        22: [0.11778303235769272, 1.0],
    }
    expected_aa_by_score = [
        (0.32882189750671387, 2),
        (0.24718092381954193, 1),
        (0.24718092381954193, 20),
        (0.19942401349544525, 21),
        (0.11778303235769272, 5),
        (0.11778303235769272, 6),
        (0.11778303235769272, 22),
    ]
    by_id, by_score = postings_for("aa")
    self.assertEqual(by_id, expected_aa_by_id)
    self.assertEqual(by_score, expected_aa_by_score)

    expected_bb_by_id = {
        1: [0.5274115204811096, 3.0],
        2: [0.7016094326972961, 6.0],
        4: [0.2513144314289093, 1.0],
        5: [0.2513144314289093, 1.0],
        20: [0.2513144314289093, 1.0],
        21: [0.2513144314289093, 1.0],
    }
    expected_bb_by_score = [
        (0.7016094326972961, 2),
        (0.5274115204811096, 1),
        (0.2513144314289093, 4),
        (0.2513144314289093, 5),
        (0.2513144314289093, 20),
        (0.2513144314289093, 21),
    ]
    by_id, by_score = postings_for("bb")
    self.assertEqual(by_id, expected_bb_by_id)
    self.assertEqual(by_score, expected_bb_by_score)
def apply_top_k_algo(words, voc, filemanager, epsilon, k, typeRequest='disjunctive'):
    """
    Run Fagin's top-k search over the posting lists of the query words.

    Preconditions:
        words : iterable of the words to search for
        voc : vocabulary passed through to query.get_posting_list
        filemanager : file manager passed through to query.get_posting_list
        epsilon : accepted for signature compatibility; not used here
        k : number of results, forwarded to find_fagins_top_k
        typeRequest : type of request, forwarded to find_fagins_top_k
    Postconditions:
        Returns the result of find_fagins_top_k. Words for which either
        posting list comes back empty are left out of the search.
    """
    by_id = {}
    by_score = {}

    for term in words:
        id_ordered, score_ordered = query.get_posting_list(
            voc, term, filemanager, True)
        if not (id_ordered and score_ordered):
            # Nothing usable for this word: skip it.
            continue
        by_score[term] = score_ordered
        by_id[term] = id_ordered

    return find_fagins_top_k(by_id, by_score, k, typeRequest)
def test_simple(self):
    """Check posting lists for a vocabulary saved in a single pass.

    All files matched under ./tests/data/test1/ are analysed into one
    vocabulary, which is saved once and read back before the posting
    lists of "aa", "bb" and "cc" are compared.
    """
    vocabulary = {}
    workspace = './tests/workspace/test1/'
    for newspaper in Path("./tests/data/test1/").glob('**/la*'):
        analysis.analyse_newspaper(newspaper, vocabulary)

    manager = fm.FileManager('test1', workspace)
    manager.save_vocabularyAndPL_file(vocabulary, False)
    stored_voc = manager.read_vocabulary()

    # Fetch all three posting lists before asserting on any of them.
    found = {
        word: query.get_posting_list(stored_voc, word, manager)
        for word in ("aa", "bb", "cc")
    }

    self.assertEqual(found["aa"], {1: [0, 3], 2: [0, 2], 3: [0, 1]})
    self.assertEqual(found["bb"], {1: [0, 1], 2: [0, 1]})
    self.assertEqual(found["cc"], {3: [0, 1]})
def apply_fagins_ta(words, voc, filemanager, epsilon, k):
    """
    Gather the posting lists of the query words and run find_fagins_ta.

    Preconditions:
        words : iterable of the words to search for
        voc : vocabulary passed through to query.get_posting_list
        filemanager : file manager passed through to query.get_posting_list
        epsilon : forwarded unchanged to find_fagins_ta
        k : forwarded unchanged to find_fagins_ta
    Postconditions:
        Returns the result of find_fagins_ta. Words for which either
        posting list comes back empty are left out of the search.
    """
    by_id = SortedDict()
    by_score = SortedDict()

    for term in words:
        id_ordered, score_ordered = query.get_posting_list(
            voc, term, filemanager, returnPostingListOrderedByScore=True)
        if not (id_ordered and score_ordered):
            # Nothing usable for this word: skip it.
            continue
        by_score[term] = score_ordered
        by_id[term] = id_ordered

    return find_fagins_ta(by_id, by_score, epsilon, k)
def test_with_stopwords(self):
    """Check posting lists for test2 and that "doing" has no postings.

    All files matched under ./tests/data/test2/ are analysed into one
    vocabulary, which is saved once and read back. The posting lists of
    "aa", "bb" and "cc" are compared first; the lookup of "doing" must
    then yield an empty dict.
    """
    vocabulary = {}
    workspace = './tests/workspace/test2/'
    for newspaper in Path("./tests/data/test2/").glob('**/la*'):
        analysis.analyse_newspaper(newspaper, vocabulary)

    manager = fm.FileManager('test2', workspace)
    manager.save_vocabularyAndPL_file(vocabulary)

    # TODO: switch to a direct function once one exists.
    stored_voc = manager.read_vocabulary()

    # Fetch the three regular words before asserting on any of them.
    found = {
        word: query.get_posting_list(stored_voc, word, manager)
        for word in ("aa", "bb", "cc")
    }
    self.assertEqual(found["aa"], {1: [0, 1], 2: [0, 2]})
    self.assertEqual(found["bb"], {1: [0, 4], 2: [0, 1]})
    self.assertEqual(found["cc"], {2: [0, 2]})

    # "doing" is expected to come back with no postings at all.
    stopword_postings = query.get_posting_list(stored_voc, "doing", manager)
    self.assertEqual(stopword_postings, {})
def apply_top_k_algo(words, voc, filemanager, epsilon, k, typeRequest='disjunctive'):
    """
    Gather the posting lists of the query words and run find_fagins_top_k.

    Preconditions:
        words : iterable of the words to search for
        voc : vocabulary passed through to query.get_posting_list
        filemanager : file manager passed through to query.get_posting_list
        epsilon : accepted for signature compatibility; not used here
        k : forwarded unchanged to find_fagins_top_k
        typeRequest : forwarded unchanged to find_fagins_top_k
    Postconditions:
        Returns the result of find_fagins_top_k. Words for which either
        posting list comes back empty are left out of the search.
    """
    by_id = SortedDict()
    by_score = SortedDict()

    for term in words:
        id_ordered, score_ordered = query.get_posting_list(
            voc, term, filemanager, returnPostingListOrderedByScore=True)
        if not (id_ordered and score_ordered):
            # Nothing usable for this word: skip it.
            continue
        by_score[term] = score_ordered
        by_id[term] = id_ordered

    return find_fagins_top_k(by_id, by_score, k, typeRequest)
def apply_naive_top_k_algo(words, voc, filemanager, epsilon, k, get_docs_func=disjunctive_queries):
    """
    Run the naive top-k search over the posting lists of the query words.

    Preconditions:
        words : iterable of the words to search for
        voc : vocabulary passed through to query.get_posting_list
        filemanager : file manager passed through to query.get_posting_list
        epsilon : accepted for signature compatibility; not used here
        k : number of results, forwarded to naive_top_k_algo
        get_docs_func : type of request (can be conjunctive_queries or
            disjunctive_queries), forwarded to naive_top_k_algo
    Postconditions:
        Returns an empty list when no word has a non-empty posting list
        (this includes an empty words argument); otherwise returns the
        result of naive_top_k_algo.
    """
    postings_per_word = [
        query.get_posting_list(voc, term, filemanager) for term in words
    ]

    # Only search when at least one word actually has postings.
    if any(postings_per_word):
        return naive_top_k_algo(postings_per_word, k, get_docs_func)
    return []
def test_merging_3_files(self):
    """Check unscored posting lists after merging the test4 partial files.

    Each file matched under ./tests/data/test4/ is analysed into its own
    vocabulary and saved on its own; the partial files are then merged
    with the merge flag set to False. Every listed word is looked up and
    compared with its expected posting list, in order.
    """
    workspace = './tests/workspace/test4/'
    newspapers = Path("./tests/data/test4/").glob('**/la*')
    manager = fm.FileManager('test4merging3files', workspace)

    for newspaper in newspapers:
        # A fresh vocabulary per file: every file is saved on its own.
        partial_voc = {}
        analysis.analyse_newspaper(newspaper, partial_voc)
        manager.save_vocabularyAndPL_file(partial_voc, True)

    manager.mergePartialVocsAndPL(False)

    # TODO: switch to a direct function once one exists.
    merged_voc = manager.read_vocabulary()

    # (word, expected posting list, assertion message or None)
    expectations = [
        ("aa", {1: [0, 3.0], 2: [0, 6.0], 5: [0, 1.0], 6: [0, 1.0],
                20: [0, 3.0], 21: [0, 2.0], 22: [0, 1.0]}, None),
        ("bb", {1: [0, 3], 2: [0, 6], 20: [0, 1], 21: [0, 1],
                4: [0, 1], 5: [0, 1]}, None),
        ("cc", {1: [0, 1], 2: [0, 3], 22: [0, 1], 4: [0, 1],
                6: [0, 1]}, None),
        ("dd", {1: [0, 2], 2: [0, 1]}, None),
        ("ff", {1: [0, 1], 20: [0, 1], 6: [0, 1]}, "FF"),
        ("qq", {1: [0, 1], 5: [0, 1]}, None),
        ("rr", {1: [0, 5], 21: [0, 1]}, None),
        ("ee", {1: [0, 1], 23: [0, 1]}, None),
        ("vv", {1: [0, 1]}, None),
        ("yy", {1: [0, 1]}, None),
        ("kk", {2: [0, 1], 23: [0, 1]}, None),
        ("ii", {2: [0, 1]}, None),
        ("jj", {2: [0, 1]}, None),
        ("hh", {23: [0, 1]}, None),
        ("ll", {}, 'll is considered a stopword'),
    ]

    for word, expected, message in expectations:
        postings = query.get_posting_list(merged_voc, word, manager)
        # A None message behaves exactly like omitting the argument.
        self.assertEqual(postings, expected, message)