예제 #1
0
def test__update_indexes():
    """Check that indexes stored via ``update_indexes`` are fetched back unchanged.

    The fixture is a list of ``[term_id, postings]`` entries. It is written
    with ``Indexing.update_indexes``, read back with ``fetch_indexes`` and
    ``_to_list_memory_indexes``, and compared element by element.

    NOTE(review): each posting looks like ``[doc_id, frequency, *positions]``;
    confirm against the ``Indexing`` implementation.
    """
    index = [
        [0, [[0, 2, 0, 3]]],
        [1, [[0, 1, 1], [1, 2, 0, 1]]],
        [2, [[0, 1, 2], [1, 2, 2, 3]]],
        [3, [[0, 1, 4], [1, 1, 4]]],
    ]

    b = Indexing(db=db)
    try:
        b.update_indexes(index)
        # Ask for exactly the term ids that were stored above.
        fetched_index = b.fetch_indexes([entry[0] for entry in index])
        fetched_index = b._to_list_memory_indexes(fetched_index)

        assert len(fetched_index) == len(index)
        for fetched_entry, expected_entry in zip(fetched_index, index):
            fetched_termid, fetched_postings = fetched_entry[0], fetched_entry[1]
            expected_termid, expected_postings = expected_entry

            assert fetched_termid == expected_termid
            assert len(fetched_postings) == len(expected_postings)
            for fetched_posting, expected_posting in zip(
                    fetched_postings, expected_postings):
                # Compare value by value so the check does not depend on the
                # concrete sequence type returned by the storage layer.
                assert len(fetched_posting) == len(expected_posting)
                for got, want in zip(fetched_posting, expected_posting):
                    assert got == want
    finally:
        # Always drop the collection, and let the original exception (with
        # its traceback) propagate instead of masking it with `assert False`.
        b._index_coll.drop()
예제 #2
0
def test_index():
    """Check that ``Indexing.index`` merges new documents into existing state.

    Setup:
      * a vocabulary of six terms is inserted into the vocabulary collection
        and loaded through ``_create_vocabulary_cache``;
      * ``disk_indexes`` is stored through ``save_indexes``;
      * ``cache_indexes`` is installed as the in-memory ``_indexes`` cache
        (a ``defaultdict`` returning ``None`` for missing term ids).

    After indexing ``collection`` the test verifies the vocabulary returned by
    ``get_vocabulary`` and the indexes returned by ``fetch_indexes`` against
    the expected values.

    NOTE(review): each posting looks like ``[doc_id, frequency, *positions]``
    and each collection row like ``[doc_id, text, <flag>]``; the meaning of the
    third row element is not visible here -- confirm against ``Indexing``.
    """
    collection = [
        [0, 'xx yy zz. xx tt.', 1],
        [10, 'yy yy zz. zz tt kk.', 0],
    ]

    vocabulary = [
        {'term': 'xx', 'termid': 0, 'df': 1},
        {'term': 'yy', 'termid': 1, 'df': 2},
        {'term': 'zz', 'termid': 2, 'df': 2},
        {'term': 'tt', 'termid': 3, 'df': 2},
        {'term': 'nn', 'termid': 4, 'df': 1},
        {'term': 'mm', 'termid': 5, 'df': 1},
    ]

    disk_indexes = [
        [0, [[0, 3, 10, 19]]],
        [1, [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]]],
        [2, [[0, 1, 5], [1, 2, 2, 7]]],
        [3, [[0, 1, 4], [1, 1, 4]]],
        [4, [[0, 1, 16]]],
        [5, [[0, 1, 17]]],
    ]

    # In-memory cache: term ids that are not cached resolve to None.
    cache_indexes = defaultdict(lambda: None, {
        1: [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]],
        5: [[0, 1, 16]],
    })

    expected_index = [
        [0, [[0, 2, 0, 3]]],
        [1, [[0, 1, 1], [1, 2, 0, 20], [3, 1, 10], [10, 2, 0, 1]]],
        [2, [[0, 1, 2], [1, 2, 2, 7], [10, 2, 2, 3]]],
        [3, [[0, 1, 4], [1, 1, 4], [10, 1, 4]]],
        [4, [[0, 1, 16]]],
        [5, [[0, 1, 17]]],
        [6, [[10, 1, 5]]],
    ]

    exp_vocabulary = {
        'xx': {'termid': 0, 'df': 2},
        'yy': {'termid': 1, 'df': 4},
        'zz': {'termid': 2, 'df': 4},
        'tt': {'termid': 3, 'df': 4},
        'nn': {'termid': 4, 'df': 1},
        'mm': {'termid': 5, 'df': 1},
        'kk': {'termid': 6, 'df': 1},
    }

    preprocessing = Preprocessing()
    b = Indexing(db=db, preprocessing=preprocessing)

    try:
        # Fixture setup lives inside the try so that a failure half-way
        # through still triggers the cleanup below.
        b._vocabulary_coll.insert_many(vocabulary)
        b._create_vocabulary_cache()
        b.save_indexes(disk_indexes)
        b._indexes = cache_indexes

        b.index(collection)
        _vocabulary = b.get_vocabulary()

        # test vocabulary
        assert len(_vocabulary) == len(exp_vocabulary)
        for term, entry in _vocabulary.items():
            assert term in exp_vocabulary, 'unexpected term: %r' % (term,)
            expected_entry = exp_vocabulary[term]
            assert entry['termid'] == expected_entry['termid']
            assert entry['df'] == expected_entry['df']

        fetched_index = b.fetch_indexes([0, 1, 2, 3, 4, 5, 6])
        fetched_index = b._to_list_memory_indexes(fetched_index)

        # test indexes
        assert len(fetched_index) == len(expected_index)
        for fetched_entry, expected_entry in zip(fetched_index, expected_index):
            fetched_termid, fetched_postings = fetched_entry[0], fetched_entry[1]
            expected_termid, expected_postings = expected_entry

            assert fetched_termid == expected_termid
            assert len(fetched_postings) == len(expected_postings)
            for fetched_posting, expected_posting in zip(
                    fetched_postings, expected_postings):
                # Compare value by value so the check does not depend on the
                # concrete sequence type returned by the storage layer.
                assert len(fetched_posting) == len(expected_posting)
                for got, want in zip(fetched_posting, expected_posting):
                    assert got == want

        # TODO: document vectors are not verified yet.
    finally:
        # Always drop the collections, and let the original exception (with
        # its traceback) propagate instead of masking it with `assert False`.
        b._vocabulary_coll.drop()
        b._index_coll.drop()
        b._doc_vector_coll.drop()