Code Example #1
from collections import defaultdict  # used by the cache_indexes fixture below


def test_index():
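    '''
    Test: index a new collection when a vocabulary, on-disk posting lists,
    and an in-memory posting cache already exist, then verify the merged
    vocabulary and posting lists.

    Indexing, Preprocessing, and the db handle are assumed to be provided by
    the project under test; their import paths are not part of this snippet.
    '''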

    collection = [
        [0, 'xx yy zz. xx tt.', 1],
        [10, 'yy yy zz. zz tt kk.', 0],
    ]

    vocabulary = [{
        'term': 'xx',
        'termid': 0,
        'df': 1
    }, {
        'term': 'yy',
        'termid': 1,
        'df': 2
    }, {
        'term': 'zz',
        'termid': 2,
        'df': 2
    }, {
        'term': 'tt',
        'termid': 3,
        'df': 2
    }, {
        'term': 'nn',
        'termid': 4,
        'df': 1
    }, {
        'term': 'mm',
        'termid': 5,
        'df': 1
    }]

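    # On-disk posting lists, one entry per term:
    # [termid, [[docid, tf, pos1, ..., posN], ...]]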
    disk_indexes = [
        [0, [[0, 3, 10, 19]]],
        [1, [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]]],
        [2, [[0, 1, 5], [1, 2, 2, 7]]],
        [3, [[0, 1, 4], [1, 1, 4]]],
        [4, [[0, 1, 16]]],
        [5, [[0, 1, 17]]],
    ]

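    # In-memory posting cache keyed by termid; the defaultdict below returns
    # None for termids that are not cached.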
    cache_indexes = {
        1: [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]],
        5: [[0, 1, 16]]
    }
    cache_indexes = defaultdict(lambda: None, cache_indexes)

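    # Expected posting lists after the two collection documents (docids 0 and 10)
    # have been merged into the existing indexes.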
    expected_index = [[0, [[0, 2, 0, 3]]],
                      [
                          1,
                          [[0, 1, 1], [1, 2, 0, 20], [3, 1, 10], [10, 2, 0, 1]]
                      ], [2, [[0, 1, 2], [1, 2, 2, 7], [10, 2, 2, 3]]],
                      [3, [[0, 1, 4], [1, 1, 4], [10, 1, 4]]],
                      [4, [[0, 1, 16]]], [5, [[0, 1, 17]]], [6, [[10, 1, 5]]]]

    exp_vocabulary = {
        'xx': {
            'termid': 0,
            'df': 2
        },
        'yy': {
            'termid': 1,
            'df': 4
        },
        'zz': {
            'termid': 2,
            'df': 4
        },
        'tt': {
            'termid': 3,
            'df': 4
        },
        'nn': {
            'termid': 4,
            'df': 1
        },
        'mm': {
            'termid': 5,
            'df': 1
        },
        'kk': {
            'termid': 6,
            'df': 1
        },
    }

    preprocessing = Preprocessing()
    b = Indexing(db=db, preprocessing=preprocessing)
    b._vocabulary_coll.insert_many(vocabulary)
    b._create_vocabulary_cache()
    b.save_indexes(disk_indexes)
    b._indexes = cache_indexes

    try:
        b.index(collection)
        _vocabulary = b.get_vocabulary()

        # test vocabulary
        assert len(_vocabulary) == len(exp_vocabulary)
        for k, v in _vocabulary.items():
            assert v['termid'] == exp_vocabulary[k]['termid']
            assert v['df'] == exp_vocabulary[k]['df']

        fetched_index = b.fetch_indexes([0, 1, 2, 3, 4, 5, 6])
        fetched_index = b._to_list_memory_indexes(fetched_index)

        # test indexes
        assert len(fetched_index) == len(expected_index)
        for j in range(len(fetched_index)):
            i = fetched_index[j]
            ei = expected_index[j]
            assert i[0] == ei[0]
            assert len(i[1]) == len(ei[1])
            for k, t in zip(i[1], ei[1]):
                assert len(k) == len(t)
                for m in range(len(k)):
                    assert k[m] == t[m]

        # test document vectors

    except Exception as ex:
        print(ex)
        b._vocabulary_coll.drop()
        b._index_coll.drop()
        b._doc_vector_coll.drop()
        assert False
    else:
        b._vocabulary_coll.drop()
        b._index_coll.drop()
        b._doc_vector_coll.drop()
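
The posting-list fixtures above are dense, so a small helper makes them easier to read. This is a reader-side aid only, not part of the Indexing implementation; the layout [termid, [[docid, tf, pos1, ..., posN], ...]] is inferred from how the expected values line up with the document texts in the test.

def explain_postings(entry):
    '''Render one index entry of the form [termid, [[docid, tf, pos1, ..., posN], ...]].'''
    termid, doc_entries = entry
    lines = [f'termid {termid}:']
    for docid, tf, *positions in doc_entries:
        lines.append(f'  doc {docid}: tf={tf}, positions={positions}')
    return '\n'.join(lines)

# For example, explain_postings([1, [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]]]) reports
# that termid 1 occurs once in doc 0 (position 6), twice in doc 1 (positions 0 and 20),
# and once in doc 3 (position 10).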
Code Example #2
def test__parse2():
    '''
    Test: a vocabulary already exists.
    Test: document vectors already exist.
    '''

    prev_vocabulary = [{
        'term': 'xx',
        'termid': 0,
        'df': 1
    }, {
        'term': 'yy',
        'termid': 1,
        'df': 2
    }, {
        'term': 'zz',
        'termid': 2,
        'df': 2
    }, {
        'term': 'tt',
        'termid': 3,
        'df': 2
    }, {
        'term': 'nn',
        'termid': 4,
        'df': 1
    }, {
        'term': 'mm',
        'termid': 5,
        'df': 1
    }]

    prev_doc_vectors = [
        {
            'docid': 0,
            'tf': [(0, 1), (1, 1), (2, 3), (3, 1)]
        },
        {
            'docid': 1,
            'tf': [(2, 2), (3, 2), (4, 1)]
        },
    ]

    tokens = [
        [['xx', 'yy', 'zz'], ['xx', 'tt']],
        [['yy', 'yy', 'zz'], ['zz', 'tt', 'kk']],
    ]

    docIDs = [0, 2]
    db.vocabulary_coll.insert_many(prev_vocabulary)
    db.contentvectors_coll.insert_many(prev_doc_vectors)
    indexing = Indexing(db=db)

    try:
        postings = indexing._parse(tokens, docIDs)
        vocabulary = indexing.get_vocabulary()

        exp_vocabulary = {
            'xx': {
                'termid': 0,
                'df': 2
            },
            'yy': {
                'termid': 1,
                'df': 4
            },
            'zz': {
                'termid': 2,
                'df': 4
            },
            'tt': {
                'termid': 3,
                'df': 4
            },
            'nn': {
                'termid': 4,
                'df': 1
            },
            'mm': {
                'termid': 5,
                'df': 1
            },
            'kk': {
                'termid': 6,
                'df': 1
            },
        }

        assert len(exp_vocabulary) == len(vocabulary)
        for k, v in exp_vocabulary.items():
            assert vocabulary[k]['termid'] == v['termid']
            assert vocabulary[k]['df'] == v['df']

        expected_postings = [[0, 0, 0], [1, 0, 1], [2, 0, 2], [0, 0, 3],
                             [3, 0, 4], [1, 2, 0], [1, 2, 1], [2, 2, 2],
                             [2, 2, 3], [3, 2, 4], [6, 2, 5]]

        assert len(postings) == len(expected_postings)
        for a, b in zip(postings, expected_postings):
            assert len(a) == len(b)
            for c, d in zip(a, b):
                assert c == d

        expected_doc_vectors = [
            {
                'docid': 0,
                'tf': [(0, 2), (1, 1), (2, 1), (3, 1)]
            },
            {
                'docid': 1,
                'tf': [(2, 2), (3, 2), (4, 1)]
            },
            {
                'docid': 2,
                'tf': [(1, 2), (2, 2), (3, 1), (6, 1)]
            },
        ]

        doc_vectors = list(indexing._doc_vector_coll._coll.find().sort(
            'docid', 1))

        assert len(expected_doc_vectors) == len(doc_vectors)
        for a, b in zip(expected_doc_vectors, doc_vectors):
            assert a['docid'] == b['docid']
            assert len(a['tf']) == len(b['tf'])
            for c, d in zip(a['tf'], b['tf']):
                for e, f in zip(c, d):
                    assert e == f
    except Exception as ex:
        print(ex)
        indexing._doc_vector_coll.drop()
        indexing._vocabulary_coll.drop()
        assert False
    else:
        indexing._doc_vector_coll.drop()
        indexing._vocabulary_coll.drop()
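
The expected vocabulary above follows a rule that can be checked by hand: a term's df is incremented once for every newly parsed document that contains it, and a term not seen before receives the next free termid. The sketch below is a reader-side reconstruction of that rule under those assumptions, not the project's _parse implementation.

def update_vocabulary(vocab, docs_tokens):
    '''vocab: {term: {'termid': ..., 'df': ...}}; docs_tokens: one list of sentences per new document.'''
    next_id = max((v['termid'] for v in vocab.values()), default=-1) + 1
    for sentences in docs_tokens:
        # each distinct term counts once per document, in first-occurrence order
        seen = dict.fromkeys(t for sentence in sentences for t in sentence)
        for term in seen:
            if term not in vocab:
                vocab[term] = {'termid': next_id, 'df': 0}
                next_id += 1
            vocab[term]['df'] += 1
    return vocab

# Re-keying prev_vocabulary by term and applying this to the two token lists above
# reproduces exp_vocabulary, including the new term 'kk' at termid 6 with df 1.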
Code Example #3
def test__parse():
    '''
    Test: no vocabulary exists beforehand.
    Test: no document vectors exist beforehand.
    '''
    tokens = [
        [['xx', 'yy', 'zz'], ['xx', 'tt']],
        [['yy', 'yy', 'zz'], ['zz', 'tt']],
    ]

    docIDs = [0, 1]
    indexing = Indexing(db=db)

    try:
        postings = indexing._parse(tokens, docIDs)
        vocabulary = indexing.get_vocabulary()

        exp_vocabulary = {
            'xx': {
                'termid': 0,
                'df': 1
            },
            'yy': {
                'termid': 1,
                'df': 2
            },
            'zz': {
                'termid': 2,
                'df': 2
            },
            'tt': {
                'termid': 3,
                'df': 2
            },
        }

        assert len(exp_vocabulary) == len(vocabulary)
        for k, v in exp_vocabulary.items():
            assert vocabulary[k]['termid'] == v['termid']
            assert vocabulary[k]['df'] == v['df']

        expected_postings = [[0, 0, 0], [1, 0, 1], [2, 0, 2], [0, 0, 3],
                             [3, 0, 4], [1, 1, 0], [1, 1, 1], [2, 1, 2],
                             [2, 1, 3], [3, 1, 4]]

        assert len(postings) == len(expected_postings)
        for a, b in zip(postings, expected_postings):
            assert len(a) == len(b)
            for c, d in zip(a, b):
                assert c == d

        expected_doc_vectors = [
            {
                'docid': 0,
                'tf': [(0, 2), (1, 1), (2, 1), (3, 1)]
            },
            {
                'docid': 1,
                'tf': [(1, 2), (2, 2), (3, 1)]
            },
        ]
        doc_vectors = list(indexing._doc_vector_coll._coll.find().sort(
            'docid', 1))

        assert len(expected_doc_vectors) == len(doc_vectors)
        for a, b in zip(expected_doc_vectors, doc_vectors):
            assert a['docid'] == b['docid']
            assert len(a['tf']) == len(b['tf'])
            for c, d in zip(a['tf'], b['tf']):
                for e, f in zip(c, d):
                    assert e == f
    except Exception as ex:
        print(ex)
        indexing._doc_vector_coll.drop()
        assert False
    else:
        indexing._doc_vector_coll.drop()
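
The postings and document vectors asserted above can be reconstructed directly from tokens and docIDs, which makes the expected values easier to verify. The sketch below is a hand-rolled reference for reading the test, not Indexing._parse itself; it assumes the term-to-termid mapping the test expects and that positions run across sentence boundaries, as the expected postings indicate.

from collections import Counter

def reference_parse(tokens, docIDs, termids):
    '''Rebuild [[termid, docid, position], ...] postings and per-doc (termid, tf) vectors.'''
    postings, doc_vectors = [], []
    for sentences, docid in zip(tokens, docIDs):
        flat = [t for sentence in sentences for t in sentence]  # positions span sentences
        for pos, term in enumerate(flat):
            postings.append([termids[term], docid, pos])
        tf = Counter(termids[t] for t in flat)
        doc_vectors.append({'docid': docid, 'tf': sorted(tf.items())})
    return postings, doc_vectors

# With termids = {'xx': 0, 'yy': 1, 'zz': 2, 'tt': 3} this reproduces
# expected_postings and expected_doc_vectors for test__parse above.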