def test__update_indexes():
    """Check that indexes stored via ``update_indexes`` are fetched back intact.

    The expected data is written with ``Indexing.update_indexes``, read back
    with ``fetch_indexes`` for keys 0-3, converted with
    ``_to_list_memory_indexes`` and compared entry by entry against the
    original input.

    The index collection is always dropped afterwards, whether the test
    passes or fails, so later tests start from a clean collection.
    """
    # Each entry is [key, list of rows]; the keys are the ids later passed to
    # fetch_indexes. NOTE(review): rows look like postings of the form
    # [doc id, count, positions...] -- confirm against Indexing.
    index = [
        [0, [[0, 2, 0, 3]]],
        [1, [[0, 1, 1], [1, 2, 0, 1]]],
        [2, [[0, 1, 2], [1, 2, 2, 3]]],
        [3, [[0, 1, 4], [1, 1, 4]]],
    ]
    b = Indexing(db=db)
    try:
        b.update_indexes(index)
        fetched_index = b.fetch_indexes([0, 1, 2, 3])
        fetched_index = b._to_list_memory_indexes(fetched_index)

        assert len(fetched_index) == len(index)
        for position, expected_entry in enumerate(index):
            fetched_entry = fetched_index[position]
            assert fetched_entry[0] == expected_entry[0]
            assert len(fetched_entry[1]) == len(expected_entry[1])
            for fetched_row, expected_row in zip(
                    fetched_entry[1], expected_entry[1]):
                assert len(fetched_row) == len(expected_row)
                for fetched_value, expected_value in zip(
                        fetched_row, expected_row):
                    assert fetched_value == expected_value
    finally:
        # Clean up unconditionally and let any failure propagate with its
        # original traceback instead of replacing it with ``assert False``.
        b._index_coll.drop()
def test_index():
    """Check that ``Indexing.index`` merges new documents into existing state.

    The test seeds the vocabulary collection, the stored indexes and the
    in-memory index cache, then indexes two documents and verifies that:

    * the vocabulary returned by ``get_vocabulary`` has the expected
      ``termid`` and ``df`` for every term, and
    * the indexes fetched for keys 0-6 match ``expected_index`` entry by
      entry.

    The vocabulary, index and document-vector collections are always dropped
    afterwards, including when setup or an assertion fails.
    """
    # Documents to index. NOTE(review): each row appears to be
    # [doc id, text, <flag>] -- confirm the meaning of the third field.
    collection = [
        [0, 'xx yy zz. xx tt.', 1],
        [10, 'yy yy zz. zz tt kk.', 0],
    ]
    # Vocabulary documents inserted before indexing.
    vocabulary = [
        {'term': 'xx', 'termid': 0, 'df': 1},
        {'term': 'yy', 'termid': 1, 'df': 2},
        {'term': 'zz', 'termid': 2, 'df': 2},
        {'term': 'tt', 'termid': 3, 'df': 2},
        {'term': 'nn', 'termid': 4, 'df': 1},
        {'term': 'mm', 'termid': 5, 'df': 1},
    ]
    # Indexes saved through save_indexes before indexing.
    disk_indexes = [
        [0, [[0, 3, 10, 19]]],
        [1, [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]]],
        [2, [[0, 1, 5], [1, 2, 2, 7]]],
        [3, [[0, 1, 4], [1, 1, 4]]],
        [4, [[0, 1, 16]]],
        [5, [[0, 1, 17]]],
    ]
    # In-memory cache assigned to b._indexes; missing keys resolve to None.
    cache_indexes = {
        1: [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]],
        5: [[0, 1, 16]],
    }
    cache_indexes = defaultdict(lambda: None, cache_indexes)
    expected_index = [
        [0, [[0, 2, 0, 3]]],
        [1, [[0, 1, 1], [1, 2, 0, 20], [3, 1, 10], [10, 2, 0, 1]]],
        [2, [[0, 1, 2], [1, 2, 2, 7], [10, 2, 2, 3]]],
        [3, [[0, 1, 4], [1, 1, 4], [10, 1, 4]]],
        [4, [[0, 1, 16]]],
        [5, [[0, 1, 17]]],
        [6, [[10, 1, 5]]],
    ]
    exp_vocabulary = {
        'xx': {'termid': 0, 'df': 2},
        'yy': {'termid': 1, 'df': 4},
        'zz': {'termid': 2, 'df': 4},
        'tt': {'termid': 3, 'df': 4},
        'nn': {'termid': 4, 'df': 1},
        'mm': {'termid': 5, 'df': 1},
        'kk': {'termid': 6, 'df': 1},
    }

    preprocessing = Preprocessing()
    b = Indexing(db=db, preprocessing=preprocessing)
    try:
        # Setup runs inside the try so a failure here still triggers cleanup.
        b._vocabulary_coll.insert_many(vocabulary)
        b._create_vocabulary_cache()
        b.save_indexes(disk_indexes)
        b._indexes = cache_indexes

        b.index(collection)

        # test vocabulary
        _vocabulary = b.get_vocabulary()
        assert len(_vocabulary) == len(exp_vocabulary)
        for term, info in _vocabulary.items():
            assert info['termid'] == exp_vocabulary[term]['termid']
            assert info['df'] == exp_vocabulary[term]['df']

        # test indexes
        fetched_index = b.fetch_indexes([0, 1, 2, 3, 4, 5, 6])
        fetched_index = b._to_list_memory_indexes(fetched_index)
        assert len(fetched_index) == len(expected_index)
        for position, expected_entry in enumerate(expected_index):
            fetched_entry = fetched_index[position]
            assert fetched_entry[0] == expected_entry[0]
            assert len(fetched_entry[1]) == len(expected_entry[1])
            for fetched_row, expected_row in zip(
                    fetched_entry[1], expected_entry[1]):
                assert len(fetched_row) == len(expected_row)
                for fetched_value, expected_value in zip(
                        fetched_row, expected_row):
                    assert fetched_value == expected_value

        # test document vectors
        # TODO: no document-vector assertions are implemented yet.
    finally:
        # Clean up unconditionally and let any failure propagate with its
        # original traceback instead of replacing it with ``assert False``.
        b._vocabulary_coll.drop()
        b._index_coll.drop()
        b._doc_vector_coll.drop()