Example #1
def test__left_join_postings_lists():
    # Test that postings from target_pl whose docids are absent from pl
    # are merged into pl in docid order.
    pl = [[0, 1, 1], [3, 2, 0, 1]]
    target_pl = [[0, 2, 4, 5], [2, 1, 10]]
    b = Indexing(db=db)
    pl = b._left_join_postings_lists(pl, target_pl)
    expected_pl = [[0, 1, 1], [2, 1, 10], [3, 2, 0, 1]]

    assert len(pl) == len(expected_pl)
    for k, m in zip(pl, expected_pl):
        assert len(k) == len(m)
        for t, n in zip(k, m):
            assert t == n
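Each posting above has the form [docid, tf, pos1, ..., posN]. A minimal sketch of the left join this test expects, assuming that format (an illustration, not the project's implementation):

def left_join_postings_lists(pl, target_pl):
    # Keep every posting already in pl and add postings from target_pl
    # whose docid does not occur in pl; the result stays sorted by docid.
    docids = {p[0] for p in pl}
    merged = pl + [p for p in target_pl if p[0] not in docids]
    return sorted(merged, key=lambda p: p[0])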
Example #2
def test__index():
    postings = [[0, 0, 0], [1, 0, 1], [2, 0, 2], [0, 0, 3], [3, 0, 4],
                [1, 1, 0], [1, 1, 1], [2, 1, 2], [2, 1, 3], [3, 1, 4]]

    b = Indexing(db=db)
    index = b._index(postings)
    expected_index = [[0, [[0, 2, 0, 3]]], [1, [[0, 1, 1], [1, 2, 0, 1]]],
                      [2, [[0, 1, 2], [1, 2, 2, 3]]],
                      [3, [[0, 1, 4], [1, 1, 4]]]]
    assert len(index) == len(expected_index)
    for j in range(4):
        i = index[j]
        ei = expected_index[j]
        assert i[0] == ei[0]
        assert len(i[1]) == len(ei[1])
        for k, t in zip(i[1], ei[1]):
            assert len(k) == len(t)
            for m in range(len(k)):
                assert k[m] == t[m]
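For reference, postings here are [termid, docid, position] triples, and the index groups them into [termid, [[docid, tf, pos1, ..., posN], ...]] entries. A sketch of that grouping under those assumptions (illustration only, not the project's code):

from itertools import groupby

def build_index(postings):
    # Sort so equal termids and docids are adjacent, then turn each
    # term's postings into [docid, tf, pos1, ..., posN] lists.
    postings = sorted(postings)
    index = []
    for termid, term_group in groupby(postings, key=lambda p: p[0]):
        pls = []
        for docid, doc_group in groupby(term_group, key=lambda p: p[1]):
            positions = [p[2] for p in doc_group]
            pls.append([docid, len(positions)] + positions)
        index.append([termid, pls])
    return index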
Example #3
def test__update_indexes():
    index = [[0, [[0, 2, 0, 3]]], [1, [[0, 1, 1], [1, 2, 0, 1]]],
             [2, [[0, 1, 2], [1, 2, 2, 3]]], [3, [[0, 1, 4], [1, 1, 4]]]]

    b = Indexing(db=db)
    try:
        b.update_indexes(index)
        fetched_index = b.fetch_indexes([0, 1, 2, 3])
        fetched_index = b._to_list_memory_indexes(fetched_index)
        assert len(fetched_index) == len(index)
        for j in range(4):
            i = fetched_index[j]
            ei = index[j]
            assert i[0] == ei[0]
            assert len(i[1]) == len(ei[1])
            for k, t in zip(i[1], ei[1]):
                assert len(k) == len(t)
                for m in range(len(k)):
                    assert k[m] == t[m]

    except Exception as ex:
        print(ex)
        b._index_coll.drop()
        assert False
    else:
        b._index_coll.drop()
Example #4
def test__merge_indexes():
    # Test that cache indexes contain posting lists of docs that the edited indexes do not have.
    # Test that cache indexes contain terms that the edited indexes do not have.
    # Test that new indexes contain new terms.
    # Test that new indexes contain existing terms.

    new_indexes = [
        [0, [[10, 2, 0, 3]]],
        [1, [[11, 1, 1], [12, 2, 0, 1]]],
        [2, [[10, 1, 2], [12, 2, 2, 3]]],
        [3, [[10, 1, 4], [13, 1, 4]]],
        [10, [[10, 1, 5], [13, 1, 5]]],
    ]

    edited_indexes = [[0, [[0, 2, 0, 3]]], [1, [[0, 1, 1], [1, 2, 0, 1]]],
                      [2, [[0, 1, 2], [1, 2, 2, 3]]],
                      [3, [[0, 1, 4], [1, 1, 4]]]]

    cache_indexes = {
        1: [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]],
        5: [[0, 1, 16]]
    }
    cache_indexes = defaultdict(lambda: None, cache_indexes)

    disk_indexes = [[0, [[0, 3, 10, 19]]], [1, [[0, 1, 6], [1, 2, 0, 20]]],
                    [2, [[0, 1, 5], [1, 2, 2, 7]]],
                    [3, [[0, 1, 4], [1, 1, 4]]], [5, [[0, 1, 16]]]]

    expected_index = [
        [0, [[0, 2, 0, 3], [10, 2, 0, 3]]],
        [1, [[0, 1, 1], [1, 2, 0, 1], [3, 1, 10], [11, 1, 1], [12, 2, 0, 1]]],
        [2, [[0, 1, 2], [1, 2, 2, 3], [10, 1, 2], [12, 2, 2, 3]]],
        [3, [[0, 1, 4], [1, 1, 4], [10, 1, 4], [13, 1, 4]]],
        [10, [[10, 1, 5], [13, 1, 5]]]
    ]

    b = Indexing(db=db)
    b.save_indexes(disk_indexes)
    b._indexes = cache_indexes

    try:
        merged_indexes = b._merge_indexes(new_indexes, edited_indexes)
        assert len(merged_indexes) == len(expected_index)
        for j in range(len(merged_indexes)):
            i = merged_indexes[j]
            ei = expected_index[j]
            assert i[0] == ei[0]
            assert len(i[1]) == len(ei[1])
            for k, t in zip(i[1], ei[1]):
                assert len(k) == len(t)
                for m in range(len(k)):
                    assert k[m] == t[m]

    except Exception as ex:
        print(ex)
        b._index_coll.drop()
        assert False
    else:
        b._index_coll.drop()
Example #5
def test__index_new_docs():
    collection = [
        [0, [['xx', 'yy', 'zz'], ['xx', 'tt']]],
        [1, [['yy', 'yy', 'zz'], ['zz', 'tt']]],
    ]

    b = Indexing(db=db)
    index = b._index_new_docs(collection)

    expected_index = [[0, [[0, 2, 0, 3]]], [1, [[0, 1, 1], [1, 2, 0, 1]]],
                      [2, [[0, 1, 2], [1, 2, 2, 3]]],
                      [3, [[0, 1, 4], [1, 1, 4]]]]
    assert len(index) == len(expected_index)
    for j in range(4):
        i = index[j]
        ei = expected_index[j]
        assert i[0] == ei[0]
        assert len(i[1]) == len(ei[1])
        for k, t in zip(i[1], ei[1]):
            assert len(k) == len(t)
            for m in range(len(k)):
                assert k[m] == t[m]
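_index_new_docs appears to compose the two steps tested separately in this listing: parse the tokenized documents into postings, then group the postings per term. A hypothetical composition, reusing the build_index sketch after Example #2 and the parse sketch after Example #14:

def index_new_docs(collection):
    # collection: [docid, [[sentence tokens], ...]] pairs, as above.
    docids = [doc[0] for doc in collection]
    tokens = [doc[1] for doc in collection]
    postings, _, _ = parse(tokens, docids)  # sketch after Example #14
    return build_index(postings)            # sketch after Example #2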
Example #6
def test_retrieve():
    q = 'tt yy zz'

    collection = ['xx yy zz. xx tt.', 'yy yy zz. zz tt kk.', 'kk gh mk']
    db.content_coll.insert_many(collection)
    collection = list(db.content_coll.find_many())
    collection = [[d['_id'], d['content'], 0] for d in collection]

    preprocessing = Preprocessing()
    ranking = CosineScoring()
    indexing = Indexing(db=db, preprocessing=preprocessing)
    indexing.index(collection)
    b = Retrieval(db=db,
                  indexing=indexing,
                  preprocessing=preprocessing,
                  ranking=ranking)

    try:
        ranked_docs = b.retrieve(q)
        ranked_docids = [d[0] for d in ranked_docs]
        expected_docids = [collection[1][0], collection[0][0]]

        assert len(ranked_docids) == len(expected_docids)
        for got, exp in zip(ranked_docids, expected_docids):
            assert got == exp

        db.index_coll.drop()
        db.vocabulary_coll.drop()
        db.contentvectors_coll.drop()
        db.content_coll.drop()
    except Exception as ex:
        print(ex)
        db.index_coll.drop()
        db.vocabulary_coll.drop()
        db.contentvectors_coll.drop()
        db.content_coll.drop()
        assert False
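CosineScoring itself is not shown in this listing; the ranking asserted above comes from cosine similarity between the query vector and each document vector. A generic sketch over sparse {termid: tf} vectors (the project's actual weighting, e.g. tf-idf with normalization, may differ):

import math

def cosine_score(query_tf, doc_tf):
    # Dot product over shared terms, normalized by both vector norms.
    dot = sum(tf * doc_tf.get(t, 0) for t, tf in query_tf.items())
    q_norm = math.sqrt(sum(tf * tf for tf in query_tf.values()))
    d_norm = math.sqrt(sum(tf * tf for tf in doc_tf.values()))
    return dot / (q_norm * d_norm) if q_norm and d_norm else 0.0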
Example #7
def test__merge_pl1():
    # test more than one keyword found
    indexes = [
        {
            'termid': 1,
            'pl': [[0, 1, 11], [1, 1, 1]]
        },
        {
            'termid': 3,
            'pl': [[0, 1, 14], [1, 1, 2]]
        },
        {
            'termid': 0,
            'pl': [[0, 2, 10, 15], [1, 1, 3]]
        },
        {
            'termid': 2,
            'pl': [[0, 1, 12], [1, 1, 12], [2, 1, 12], [3, 1, 12]]
        },
    ]

    indexing = Indexing(db=db)
    preprocessing = Preprocessing()
    ranking = CosineScoring()
    b = Retrieval(db=db,
                  indexing=indexing,
                  preprocessing=preprocessing,
                  ranking=ranking)
    merged_indexes, docids = b._merge_indexes(indexes)
    merged_indexes = sorted(merged_indexes,
                            key=lambda x: x[0])  # for testing only

    expected_merged_indexes = [
        [0, [(0, 2), (1, 1), (2, 1), (3, 1)], [10, 11, 12, 14, 15]],
        [1, [(0, 1), (1, 1), (2, 1), (3, 1)], [1, 2, 3, 12]],
    ]

    assert len(merged_indexes) == len(expected_merged_indexes)
    for i, j in zip(merged_indexes, expected_merged_indexes):
        assert len(i) == len(j)
        for t, k in zip(i, j):
            assert t == k
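A sketch of the merge this test expects, assuming an AND over query terms: only docids present in every term's posting list survive, each paired with its (termid, tf) list and the sorted union of positions. Illustration only; the single-term case in Example #9 shows a slightly different output shape, and the second return value (the candidate docids) is an assumption.

def merge_term_postings(indexes):
    # Gather (termid, tf) pairs and positions per docid, then keep
    # only docids that were matched by every term.
    by_doc = {}
    for entry in indexes:
        for posting in entry['pl']:
            docid, tf, positions = posting[0], posting[1], posting[2:]
            tfs, pos = by_doc.setdefault(docid, ([], []))
            tfs.append((entry['termid'], tf))
            pos.extend(positions)
    merged = [[docid, sorted(tfs), sorted(pos)]
              for docid, (tfs, pos) in by_doc.items()
              if len(tfs) == len(indexes)]
    return merged, sorted(by_doc)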
Example #8
def test_rank():
    docs = [[1, 'xx xx'], [3, 'yy xy'], [7, 'xx zz']]

    scores = [[3, 5], [1, 4], [7, 1]]

    indexing = Indexing(db=db)
    preprocessing = Preprocessing()
    ranking = CosineScoring()
    b = Retrieval(db=db,
                  indexing=indexing,
                  preprocessing=preprocessing,
                  ranking=ranking)

    ranked_docs = b._rank(docs, scores)

    exp_ranked_docs = [[3, 'yy xy'], [1, 'xx xx'], [7, 'xx zz']]

    assert len(ranked_docs) == len(exp_ranked_docs)
    for got, exp in zip(ranked_docs, exp_ranked_docs):
        assert len(got) == len(exp)
        for c, d in zip(got, exp):
            assert c == d
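The _rank step pairs each document with its score and orders by descending score. A minimal sketch consistent with the expected order above (illustration only):

def rank(docs, scores):
    # scores: [docid, score] pairs; docs: [docid, content] pairs.
    score_by_docid = {docid: score for docid, score in scores}
    return sorted(docs, key=lambda d: score_by_docid[d[0]], reverse=True)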
Example #9
def test__merge_pl0():
    # Test: only one term found
    indexes = [
        {
            'termid': 1,
            'pl': [[0, 1, 11]]
        },
        {
            'termid': 3,
            'pl': [[0, 1, 14]]
        },
        {
            'termid': 0,
            'pl': [[0, 1, 10], [1, 1, 20]]
        },
        {
            'termid': 2,
            'pl': [[0, 1, 12], [1, 1, 12], [2, 1, 12], [3, 1, 12]]
        },
    ]

    indexing = Indexing(db=db)
    preprocessing = Preprocessing()
    ranking = CosineScoring()
    b = Retrieval(db=db,
                  indexing=indexing,
                  preprocessing=preprocessing,
                  ranking=ranking)
    merged_indexes, docids = b._merge_indexes([indexes[0]])

    expected_merged_indexes = [[0, (1, 1), [11]]]
    assert len(merged_indexes) == len(expected_merged_indexes)
    for i, j in zip(merged_indexes, expected_merged_indexes):
        assert len(i) == len(j)
        for t, k in zip(i, j):
            assert t == k
Example #10
File: irsys.py Project: phammanhhiep/azir
class IRSYS:
	def __init__ (self, db=None, dbname='market', max_queue=100, max_delete_cache=100, max_update_wait_time=300, max_delete_wait_time=300):
		'''
		Set up parameters and index documents from the last queue and cache.

		::param max_queue:: max number of docids in the queue
		::param max_delete_cache:: max number of docids in the delete cache
		::param max_update_wait_time:: max waiting time in seconds before updating documents whose docids are in the queue
		::param max_delete_wait_time:: max waiting time in seconds before deleting documents whose docids are in the delete cache.
		'''
		self.max_queue = max_queue
		self.max_delete_cache = max_delete_cache
		self.max_update_wait_time = max_update_wait_time
		self.max_delete_wait_time = max_delete_wait_time
		self.db = db
		self._create_cache ()
		self._ranking = CosineScoring ()
		self._preprocessing = Preprocessing () 		
		self._indexing = Indexing (db=db, preprocessing=self._preprocessing) 
		self._retrieval = Retrieval (db=db, preprocessing=self._preprocessing, ranking=self._ranking, indexing=self._indexing)
		self.index ()

	def _create_cache (self):
		'''
		Create the in-memory caches:
			+ delete cache
			+ queue of docids waiting to be indexed
		'''
		self._create_delete_cache ()
		self._create_queue ()

	def _create_delete_cache (self): 
		'''
		Used to keep docids of documents being deleted by users.  
		'''

	def _create_queue (self):
		'''
		Used to store docids and their states in the queue
		'''
		self._queue = []

	def _get_queue (self):
		return self._queue

	def _reset_queue (self):
		'''
		Reset queue and its states
		'''
		self._create_queue()
	
	def _queue_add (self, doc):
		'''
		Add docid and its state to the queue
		'''
		status = False
		if len (self._queue) < self.max_queue:
			self._queue.append (doc)
			status = True
		return status

	def _drop_queue (self):
		'''
		Delete the on-disk queue once all docs in the queue are processed.
		'''
		self.db.queued_content_coll.drop ()

	def save_queue (self):
		'''
		In some situations the queue needs to be saved,
		e.g. when the system goes down suddenly.
		'''
		self.db.queued_content_coll.insert_many (self._queue)

	def _fetch_doc_content (self, docs=None):
		'''
		Fetch document content and append each document's state to the fetched entry.
		::param docs:: a list of [docid, state] pairs
		::return:: a list of lists, each of the form [docid, content, state]
		'''
		DOCID = 0
		STATE = 1
		F_DOCID = 0
		F_CONTENT = 1
		F_STATE = 2
		if docs is None:
			raise ValueError ('No documents are provided.')

		docids = [d[DOCID] for d in docs]
		states = [d[STATE] for d in docs]
		foundDocs = self._retrieval.fetch_docs (docids)
		foundDocs = sorted (foundDocs, key=lambda x: docids.index (x[F_DOCID]))
		for s, d in zip (states, foundDocs):
			d.append (s)
		return foundDocs

	def index (self, doc=None):
		'''
		Add the document to the queue if possible; otherwise index all documents whose docids are in the queue.

		::param doc:: a [docid, state] pair for a single document
		'''
		result = None
		if doc is None:
			last_docs = self._get_queue ()
			last_collection = self._fetch_doc_content (last_docs)
			self._reset_queue ()
			result = self._indexing.index (last_collection)
		else:
			if self._queue_add (doc):
				result = True
			else:
				last_docs = self._get_queue ()
				last_collection = self._fetch_doc_content (last_docs)
				result = self._indexing.index (last_collection)
				self._reset_queue ()
				if not self._queue_add (doc):
					raise ValueError ('Cannot add a document to queue.')
		if result is True:
			self._drop_queue ()		
		return result

	def retrieve (self, query):
		result = self._retrieval.retrieve (query)
		return result
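A hypothetical usage sketch for IRSYS follows; the db wiring and the [docid, state] document format are assumptions read off the methods above, not documented API.

# `db` is assumed to expose the *_coll wrappers (content_coll,
# queued_content_coll, ...) used by IRSYS and its collaborators.
irsys = IRSYS(db=db, max_queue=100, max_update_wait_time=300)

# Queue a document for (re)indexing; once the queue is full, all
# queued documents are fetched and indexed in one batch.
irsys.index([42, 1])  # [docid, state]

# Retrieve ranked documents for a free-text query.
ranked_docs = irsys.retrieve('tt yy zz')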
Example #11
def test__fetch_indexes():
    '''
    Test: fetch existing terms.
    '''

    vocabulary = [
        {
            'termid': 0,
            'term': 'xx',
            'df': 1
        },
        {
            'termid': 1,
            'term': 'yy',
            'df': 1
        },
        {
            'termid': 2,
            'term': 'zz',
            'df': 1
        },
        {
            'termid': 3,
            'term': 'tt',
            'df': 1
        },
    ]

    index = [
        {
            'termid': 0,
            'pl': [[0, 1, 10], [1, 1, 20]]
        },
        {
            'termid': 1,
            'pl': [[0, 1, 11]]
        },
        {
            'termid': 2,
            'pl': [[0, 1, 12], [1, 1, 12], [2, 1, 12], [3, 1, 12]]
        },
        {
            'termid': 3,
            'pl': [[0, 1, 14]]
        },
    ]

    tokens = ['xx', 'yy', 'zz', 'tt']

    expected_pl = [
        {
            'termid': 1,
            'pl': [[0, 1, 11]]
        },
        {
            'termid': 3,
            'pl': [[0, 1, 14]]
        },
        {
            'termid': 0,
            'pl': [[0, 1, 10], [1, 1, 20]]
        },
        {
            'termid': 2,
            'pl': [[0, 1, 12], [1, 1, 12], [2, 1, 12], [3, 1, 12]]
        },
    ]

    indexing = Indexing(db=db)
    preprocessing = Preprocessing()
    ranking = CosineScoring()
    indexing._vocabulary_coll.insert_many(vocabulary)
    indexing._index_coll.insert_many(index)
    indexing.create_cache()
    r = Retrieval(db=db,
                  indexing=indexing,
                  preprocessing=preprocessing,
                  ranking=ranking)

    try:
        pl = r._fetch_indexes(tokens)

        assert len(pl) == len(expected_pl)
        for a, b in zip(pl, expected_pl):
            assert a['termid'] == b['termid']
            assert len(a['pl']) == len(b['pl'])
            for c, d in zip(a['pl'], b['pl']):
                assert len(c) == len(d)
                for e, f in zip(c, d):
                    assert e == f

    except Exception as ex:
        print(ex)
        indexing._vocabulary_coll.drop()
        indexing._index_coll.drop()
        assert False
    else:
        indexing._vocabulary_coll.drop()
        indexing._index_coll.drop()
Example #12
def test__parse2():
    '''
    Test: vocabulary exists beforehand.
    Test: document vectors exist beforehand.
    '''

    prev_vocabulary = [{
        'term': 'xx',
        'termid': 0,
        'df': 1
    }, {
        'term': 'yy',
        'termid': 1,
        'df': 2
    }, {
        'term': 'zz',
        'termid': 2,
        'df': 2
    }, {
        'term': 'tt',
        'termid': 3,
        'df': 2
    }, {
        'term': 'nn',
        'termid': 4,
        'df': 1
    }, {
        'term': 'mm',
        'termid': 5,
        'df': 1
    }]

    prev_doc_vectors = [
        {
            'docid': 0,
            'tf': [(0, 1), (1, 1), (2, 3), (3, 1)]
        },
        {
            'docid': 1,
            'tf': [(2, 2), (3, 2), (4, 1)]
        },
    ]

    tokens = [
        [['xx', 'yy', 'zz'], ['xx', 'tt']],
        [['yy', 'yy', 'zz'], ['zz', 'tt', 'kk']],
    ]

    docIDs = [0, 2]
    db.vocabulary_coll.insert_many(prev_vocabulary)
    db.contentvectors_coll.insert_many(prev_doc_vectors)
    indexing = Indexing(db=db)

    try:
        postings = indexing._parse(tokens, docIDs)
        vocabulary = indexing.get_vocabulary()

        exp_vocabulary = {
            'xx': {
                'termid': 0,
                'df': 2
            },
            'yy': {
                'termid': 1,
                'df': 4
            },
            'zz': {
                'termid': 2,
                'df': 4
            },
            'tt': {
                'termid': 3,
                'df': 4
            },
            'nn': {
                'termid': 4,
                'df': 1
            },
            'mm': {
                'termid': 5,
                'df': 1
            },
            'kk': {
                'termid': 6,
                'df': 1
            },
        }

        assert len(exp_vocabulary) == len(vocabulary)
        for k, v in exp_vocabulary.items():
            assert vocabulary[k]['termid'] == v['termid']
            assert vocabulary[k]['df'] == v['df']

        expected_postings = [[0, 0, 0], [1, 0, 1], [2, 0, 2], [0, 0, 3],
                             [3, 0, 4], [1, 2, 0], [1, 2, 1], [2, 2, 2],
                             [2, 2, 3], [3, 2, 4], [6, 2, 5]]

        assert len(postings) == len(expected_postings)
        for a, b in zip(postings, expected_postings):
            assert len(a) == len(b)
            for c, d in zip(a, b):
                assert c == d

        expected_doc_vectors = [
            {
                'docid': 0,
                'tf': [(0, 2), (1, 1), (2, 1), (3, 1)]
            },
            {
                'docid': 1,
                'tf': [(2, 2), (3, 2), (4, 1)]
            },
            {
                'docid': 2,
                'tf': [(1, 2), (2, 2), (3, 1), (6, 1)]
            },
        ]

        doc_vectors = list(indexing._doc_vector_coll._coll.find().sort(
            'docid', 1))

        assert len(expected_doc_vectors) == len(doc_vectors)
        for a, b in zip(expected_doc_vectors, doc_vectors):
            assert a['docid'] == b['docid']
            assert len(a['tf']) == len(b['tf'])
            for c, d in zip(a['tf'], b['tf']):
                for e, f in zip(c, d):
                    assert e == f
    except Exception as ex:
        print(ex)
        indexing._doc_vector_coll.drop()
        indexing._vocabulary_coll.drop()
        assert False
    else:
        indexing._doc_vector_coll.drop()
        indexing._vocabulary_coll.drop()
Example #13
def test_index():

    collection = [
        [0, 'xx yy zz. xx tt.', 1],
        [10, 'yy yy zz. zz tt kk.', 0],
    ]

    vocabulary = [{
        'term': 'xx',
        'termid': 0,
        'df': 1
    }, {
        'term': 'yy',
        'termid': 1,
        'df': 2
    }, {
        'term': 'zz',
        'termid': 2,
        'df': 2
    }, {
        'term': 'tt',
        'termid': 3,
        'df': 2
    }, {
        'term': 'nn',
        'termid': 4,
        'df': 1
    }, {
        'term': 'mm',
        'termid': 5,
        'df': 1
    }]

    disk_indexes = [
        [0, [[0, 3, 10, 19]]],
        [1, [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]]],
        [2, [[0, 1, 5], [1, 2, 2, 7]]],
        [3, [[0, 1, 4], [1, 1, 4]]],
        [4, [[0, 1, 16]]],
        [5, [[0, 1, 17]]],
    ]

    cache_indexes = {
        1: [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]],
        5: [[0, 1, 16]]
    }
    cache_indexes = defaultdict(lambda: None, cache_indexes)

    expected_index = [
        [0, [[0, 2, 0, 3]]],
        [1, [[0, 1, 1], [1, 2, 0, 20], [3, 1, 10], [10, 2, 0, 1]]],
        [2, [[0, 1, 2], [1, 2, 2, 7], [10, 2, 2, 3]]],
        [3, [[0, 1, 4], [1, 1, 4], [10, 1, 4]]],
        [4, [[0, 1, 16]]],
        [5, [[0, 1, 17]]],
        [6, [[10, 1, 5]]],
    ]

    exp_vocabulary = {
        'xx': {
            'termid': 0,
            'df': 2
        },
        'yy': {
            'termid': 1,
            'df': 4
        },
        'zz': {
            'termid': 2,
            'df': 4
        },
        'tt': {
            'termid': 3,
            'df': 4
        },
        'nn': {
            'termid': 4,
            'df': 1
        },
        'mm': {
            'termid': 5,
            'df': 1
        },
        'kk': {
            'termid': 6,
            'df': 1
        },
    }

    preprocessing = Preprocessing()
    b = Indexing(db=db, preprocessing=preprocessing)
    b._vocabulary_coll.insert_many(vocabulary)
    b._create_vocabulary_cache()
    b.save_indexes(disk_indexes)
    b._indexes = cache_indexes

    try:
        b.index(collection)
        _vocabulary = b.get_vocabulary()

        # test vocabulary
        assert len(_vocabulary) == len(exp_vocabulary)
        for k, v in _vocabulary.items():
            assert v['termid'] == exp_vocabulary[k]['termid']
            assert v['df'] == exp_vocabulary[k]['df']

        fetched_index = b.fetch_indexes([0, 1, 2, 3, 4, 5, 6])
        fetched_index = b._to_list_memory_indexes(fetched_index)

        # test indexes
        assert len(fetched_index) == len(expected_index)
        for j in range(len(fetched_index)):
            i = fetched_index[j]
            ei = expected_index[j]
            assert i[0] == ei[0]
            assert len(i[1]) == len(ei[1])
            for k, t in zip(i[1], ei[1]):
                assert len(k) == len(t)
                for m in range(len(k)):
                    assert k[m] == t[m]

        # test document vectors

    except Exception as ex:
        print(ex)
        b._vocabulary_coll.drop()
        b._index_coll.drop()
        b._doc_vector_coll.drop()
        assert False
    else:
        b._vocabulary_coll.drop()
        b._index_coll.drop()
        b._doc_vector_coll.drop()
Example #14
def test__parse():
    '''
    Test: no vocabulary exists beforehand.
    Test: no document vectors exist beforehand.
    '''
    tokens = [
        [['xx', 'yy', 'zz'], ['xx', 'tt']],
        [['yy', 'yy', 'zz'], ['zz', 'tt']],
    ]

    docIDs = [0, 1]
    indexing = Indexing(db=db)

    try:
        postings = indexing._parse(tokens, docIDs)
        vocabulary = indexing.get_vocabulary()

        exp_vocabulary = {
            'xx': {
                'termid': 0,
                'df': 1
            },
            'yy': {
                'termid': 1,
                'df': 2
            },
            'zz': {
                'termid': 2,
                'df': 2
            },
            'tt': {
                'termid': 3,
                'df': 2
            },
        }

        assert len(exp_vocabulary) == len(vocabulary)
        for k, v in exp_vocabulary.items():
            assert vocabulary[k]['termid'] == v['termid']
            assert vocabulary[k]['df'] == v['df']

        expected_postings = [[0, 0, 0], [1, 0, 1], [2, 0, 2], [0, 0, 3],
                             [3, 0, 4], [1, 1, 0], [1, 1, 1], [2, 1, 2],
                             [2, 1, 3], [3, 1, 4]]

        assert len(postings) == len(expected_postings)
        for a, b in zip(postings, expected_postings):
            assert len(a) == len(b)
            for c, d in zip(a, b):
                assert c == d

        expected_doc_vectors = [
            {
                'docid': 0,
                'tf': [(0, 2), (1, 1), (2, 1), (3, 1)]
            },
            {
                'docid': 1,
                'tf': [(1, 2), (2, 2), (3, 1)]
            },
        ]
        doc_vectors = list(indexing._doc_vector_coll._coll.find().sort(
            'docid', 1))

        assert len(expected_doc_vectors) == len(doc_vectors)
        for a, b in zip(expected_doc_vectors, doc_vectors):
            assert a['docid'] == b['docid']
            assert len(a['tf']) == len(b['tf'])
            for c, d in zip(a['tf'], b['tf']):
                for e, f in zip(c, d):
                    assert e == f
    except Exception as ex:
        print(ex)
        indexing._doc_vector_coll.drop()
        assert False
    else:
        indexing._doc_vector_coll.drop()
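Putting this test's expectations together, a minimal sketch of the parse step (illustration only): termids are assigned in first-seen order, df counts one occurrence per document, postings are [termid, docid, position] triples over the flattened sentence tokens, and each document gets a sorted (termid, tf) vector.

from collections import defaultdict

def parse(tokens, docIDs):
    vocabulary = {}   # term -> {'termid': ..., 'df': ...}
    postings = []     # [termid, docid, position] triples
    doc_vectors = []  # {'docid': ..., 'tf': [(termid, tf), ...]}
    for docid, sentences in zip(docIDs, tokens):
        flat = [t for sentence in sentences for t in sentence]
        tf = defaultdict(int)
        for pos, term in enumerate(flat):
            entry = vocabulary.setdefault(
                term, {'termid': len(vocabulary), 'df': 0})
            postings.append([entry['termid'], docid, pos])
            tf[entry['termid']] += 1
        doc_vectors.append({'docid': docid, 'tf': sorted(tf.items())})
        for term in set(flat):
            vocabulary[term]['df'] += 1
    return postings, vocabulary, doc_vectors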