def test_index():
    """End-to-end test of Indexing.index().

    Seeds a pre-existing vocabulary, on-disk postings and an in-memory
    postings cache, indexes a two-document collection, then verifies that
    the merged vocabulary (termids / document frequencies) and the fetched
    inverted index match the expected values.
    """
    # [docid, raw text, <flag>] pairs to index.
    collection = [
        [0, 'xx yy zz. xx tt.', 1],
        [10, 'yy yy zz. zz tt kk.', 0],
    ]
    # Vocabulary already present in the DB before indexing.
    vocabulary = [
        {'term': 'xx', 'termid': 0, 'df': 1},
        {'term': 'yy', 'termid': 1, 'df': 2},
        {'term': 'zz', 'termid': 2, 'df': 2},
        {'term': 'tt', 'termid': 3, 'df': 2},
        {'term': 'nn', 'termid': 4, 'df': 1},
        {'term': 'mm', 'termid': 5, 'df': 1},
    ]
    # Postings already persisted on disk: [termid, [[docid, tf, pos...], ...]].
    disk_indexes = [
        [0, [[0, 3, 10, 19]]],
        [1, [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]]],
        [2, [[0, 1, 5], [1, 2, 2, 7]]],
        [3, [[0, 1, 4], [1, 1, 4]]],
        [4, [[0, 1, 16]]],
        [5, [[0, 1, 17]]],
    ]
    # In-memory cache holds a subset of the disk postings; missing termids
    # must resolve to None, hence the defaultdict wrapper.
    cache_indexes = {
        1: [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]],
        5: [[0, 1, 16]],
    }
    cache_indexes = defaultdict(lambda: None, cache_indexes)
    expected_index = [
        [0, [[0, 2, 0, 3]]],
        [1, [[0, 1, 1], [1, 2, 0, 20], [3, 1, 10], [10, 2, 0, 1]]],
        [2, [[0, 1, 2], [1, 2, 2, 7], [10, 2, 2, 3]]],
        [3, [[0, 1, 4], [1, 1, 4], [10, 1, 4]]],
        [4, [[0, 1, 16]]],
        [5, [[0, 1, 17]]],
        [6, [[10, 1, 5]]],
    ]
    exp_vocabulary = {
        'xx': {'termid': 0, 'df': 2},
        'yy': {'termid': 1, 'df': 4},
        'zz': {'termid': 2, 'df': 4},
        'tt': {'termid': 3, 'df': 4},
        'nn': {'termid': 4, 'df': 1},
        'mm': {'termid': 5, 'df': 1},
        'kk': {'termid': 6, 'df': 1},
    }

    preprocessing = Preprocessing()
    b = Indexing(db=db, preprocessing=preprocessing)
    b._vocabulary_coll.insert_many(vocabulary)
    b._create_vocabulary_cache()
    b.save_indexes(disk_indexes)
    b._indexes = cache_indexes

    # try/finally (rather than except/else with duplicated drops) guarantees
    # the collections are cleaned up while letting a failing assertion
    # propagate with its original traceback intact.
    try:
        b.index(collection)

        # test vocabulary
        _vocabulary = b.get_vocabulary()
        assert len(_vocabulary) == len(exp_vocabulary)
        for term, entry in _vocabulary.items():
            assert entry['termid'] == exp_vocabulary[term]['termid']
            assert entry['df'] == exp_vocabulary[term]['df']

        # test indexes
        fetched_index = b.fetch_indexes([0, 1, 2, 3, 4, 5, 6])
        fetched_index = b._to_list_memory_indexes(fetched_index)
        assert len(fetched_index) == len(expected_index)
        for got, exp in zip(fetched_index, expected_index):
            assert got[0] == exp[0]
            assert len(got[1]) == len(exp[1])
            for got_posting, exp_posting in zip(got[1], exp[1]):
                # List equality covers both length and element-wise checks.
                assert got_posting == exp_posting
        # test document vectors
    finally:
        b._vocabulary_coll.drop()
        b._index_coll.drop()
        b._doc_vector_coll.drop()
def test__parse2():
    """Test Indexing._parse() when state already exists.

    Test: Vocabulary exists before.
    Test: document vectors exists before.

    Verifies that _parse() extends the existing vocabulary (new term 'kk'
    gets the next termid, dfs of seen terms are incremented), emits the
    expected postings, and upserts document vectors without disturbing
    untouched docids.
    """
    prev_vocabulary = [
        {'term': 'xx', 'termid': 0, 'df': 1},
        {'term': 'yy', 'termid': 1, 'df': 2},
        {'term': 'zz', 'termid': 2, 'df': 2},
        {'term': 'tt', 'termid': 3, 'df': 2},
        {'term': 'nn', 'termid': 4, 'df': 1},
        {'term': 'mm', 'termid': 5, 'df': 1},
    ]
    prev_doc_vectors = [
        {'docid': 0, 'tf': [(0, 1), (1, 1), (2, 3), (3, 1)]},
        {'docid': 1, 'tf': [(2, 2), (3, 2), (4, 1)]},
    ]
    # Pre-tokenized documents: one list of sentences (token lists) per doc.
    tokens = [
        [['xx', 'yy', 'zz'], ['xx', 'tt']],
        [['yy', 'yy', 'zz'], ['zz', 'tt', 'kk']],
    ]
    docIDs = [0, 2]

    db.vocabulary_coll.insert_many(prev_vocabulary)
    db.contentvectors_coll.insert_many(prev_doc_vectors)
    indexing = Indexing(db=db)

    # try/finally (instead of duplicated drops in except/else) guarantees
    # cleanup and preserves the original traceback on failure.
    try:
        postings = indexing._parse(tokens, docIDs)

        # test vocabulary: dfs updated, new term 'kk' appended as termid 6.
        vocabulary = indexing.get_vocabulary()
        exp_vocabulary = {
            'xx': {'termid': 0, 'df': 2},
            'yy': {'termid': 1, 'df': 4},
            'zz': {'termid': 2, 'df': 4},
            'tt': {'termid': 3, 'df': 4},
            'nn': {'termid': 4, 'df': 1},
            'mm': {'termid': 5, 'df': 1},
            'kk': {'termid': 6, 'df': 1},
        }
        assert len(exp_vocabulary) == len(vocabulary)
        for term, expected in exp_vocabulary.items():
            assert vocabulary[term]['termid'] == expected['termid']
            assert vocabulary[term]['df'] == expected['df']

        # test postings: [termid, docid, position] triples in parse order.
        expected_postings = [
            [0, 0, 0], [1, 0, 1], [2, 0, 2], [0, 0, 3], [3, 0, 4],
            [1, 2, 0], [1, 2, 1], [2, 2, 2], [2, 2, 3], [3, 2, 4],
            [6, 2, 5],
        ]
        assert len(postings) == len(expected_postings)
        for got, exp in zip(postings, expected_postings):
            assert len(got) == len(exp)
            for got_field, exp_field in zip(got, exp):
                assert got_field == exp_field

        # test document vectors: docids 0 and 2 rewritten, docid 1 untouched.
        expected_doc_vectors = [
            {'docid': 0, 'tf': [(0, 2), (1, 1), (2, 1), (3, 1)]},
            {'docid': 1, 'tf': [(2, 2), (3, 2), (4, 1)]},
            {'docid': 2, 'tf': [(1, 2), (2, 2), (3, 1), (6, 1)]},
        ]
        doc_vectors = list(
            indexing._doc_vector_coll._coll.find().sort('docid', 1))
        assert len(expected_doc_vectors) == len(doc_vectors)
        for exp, got in zip(expected_doc_vectors, doc_vectors):
            assert exp['docid'] == got['docid']
            assert len(exp['tf']) == len(got['tf'])
            for exp_pair, got_pair in zip(exp['tf'], got['tf']):
                for exp_val, got_val in zip(exp_pair, got_pair):
                    assert exp_val == got_val
    finally:
        indexing._doc_vector_coll.drop()
        indexing._vocabulary_coll.drop()
def test__parse():
    """Test Indexing._parse() from a clean slate.

    Test: No vocabulary existing before.
    Test: No document vector before.

    Verifies that _parse() builds the vocabulary from scratch (termids
    assigned in first-seen order), returns the expected postings, and
    persists the per-document term-frequency vectors.
    """
    # Pre-tokenized documents: one list of sentences (token lists) per doc.
    tokens = [
        [['xx', 'yy', 'zz'], ['xx', 'tt']],
        [['yy', 'yy', 'zz'], ['zz', 'tt']],
    ]
    docIDs = [0, 1]
    indexing = Indexing(db=db)

    # try/finally (instead of duplicated drops in except/else) guarantees
    # cleanup and preserves the original traceback on failure.
    # NOTE(review): only _doc_vector_coll is dropped here, unlike
    # test__parse2 which also drops _vocabulary_coll — confirm _parse does
    # not persist vocabulary in this path; behavior kept as-is.
    try:
        postings = indexing._parse(tokens, docIDs)

        # test vocabulary: termids assigned in first-seen order.
        vocabulary = indexing.get_vocabulary()
        exp_vocabulary = {
            'xx': {'termid': 0, 'df': 1},
            'yy': {'termid': 1, 'df': 2},
            'zz': {'termid': 2, 'df': 2},
            'tt': {'termid': 3, 'df': 2},
        }
        assert len(exp_vocabulary) == len(vocabulary)
        for term, expected in exp_vocabulary.items():
            assert vocabulary[term]['termid'] == expected['termid']
            assert vocabulary[term]['df'] == expected['df']

        # test postings: [termid, docid, position] triples in parse order.
        expected_postings = [
            [0, 0, 0], [1, 0, 1], [2, 0, 2], [0, 0, 3], [3, 0, 4],
            [1, 1, 0], [1, 1, 1], [2, 1, 2], [2, 1, 3], [3, 1, 4],
        ]
        assert len(postings) == len(expected_postings)
        for got, exp in zip(postings, expected_postings):
            assert len(got) == len(exp)
            for got_field, exp_field in zip(got, exp):
                assert got_field == exp_field

        # test document vectors: (termid, tf) pairs per docid.
        expected_doc_vectors = [
            {'docid': 0, 'tf': [(0, 2), (1, 1), (2, 1), (3, 1)]},
            {'docid': 1, 'tf': [(1, 2), (2, 2), (3, 1)]},
        ]
        doc_vectors = list(
            indexing._doc_vector_coll._coll.find().sort('docid', 1))
        assert len(expected_doc_vectors) == len(doc_vectors)
        for exp, got in zip(expected_doc_vectors, doc_vectors):
            assert exp['docid'] == got['docid']
            assert len(exp['tf']) == len(got['tf'])
            for exp_pair, got_pair in zip(exp['tf'], got['tf']):
                for exp_val, got_val in zip(exp_pair, got_pair):
                    assert exp_val == got_val
    finally:
        indexing._doc_vector_coll.drop()