def test__left_join_postings_lists():
    # test that the left join keeps the left posting list for shared docids
    # and adds posting lists for docids only present in the target
    pl = [[0, 1, 1], [3, 2, 0, 1]]
    target_pl = [[0, 2, 4, 5], [2, 1, 10]]
    b = Indexing(db=db)
    pl = b._left_join_postings_lists(pl, target_pl)
    expected_pl = [[0, 1, 1], [2, 1, 10], [3, 2, 0, 1]]
    assert len(pl) == len(expected_pl)
    for k, m in zip(pl, expected_pl):
        assert len(k) == len(m)
        for t, n in zip(k, m):
            assert t == n
def test__index():
    postings = [[0, 0, 0], [1, 0, 1], [2, 0, 2], [0, 0, 3], [3, 0, 4],
                [1, 1, 0], [1, 1, 1], [2, 1, 2], [2, 1, 3], [3, 1, 4]]
    b = Indexing(db=db)
    index = b._index(postings)
    expected_index = [[0, [[0, 2, 0, 3]]],
                      [1, [[0, 1, 1], [1, 2, 0, 1]]],
                      [2, [[0, 1, 2], [1, 2, 2, 3]]],
                      [3, [[0, 1, 4], [1, 1, 4]]]]
    assert len(index) == len(expected_index)
    for j in range(4):
        i = index[j]
        ei = expected_index[j]
        assert i[0] == ei[0]
        assert len(i[1]) == len(ei[1])
        for k, t in zip(i[1], ei[1]):
            assert len(k) == len(t)
            for m in range(len(k)):
                assert k[m] == t[m]
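# Note on the data layouts, inferred from the fixtures above rather than from
# any separate spec: a raw posting appears to be a [termid, docid, position]
# triple, and each entry of the built index appears to be
# [termid, [posting_list, ...]] where a posting list is
# [docid, tf, pos1, pos2, ...]. For example, [0, [[0, 2, 0, 3]]] reads as:
# term 0 occurs in doc 0 twice, at positions 0 and 3.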
def test__update_indexes():
    index = [[0, [[0, 2, 0, 3]]],
             [1, [[0, 1, 1], [1, 2, 0, 1]]],
             [2, [[0, 1, 2], [1, 2, 2, 3]]],
             [3, [[0, 1, 4], [1, 1, 4]]]]
    b = Indexing(db=db)
    try:
        b.update_indexes(index)
        fetched_index = b.fetch_indexes([0, 1, 2, 3])
        fetched_index = b._to_list_memory_indexes(fetched_index)
        assert len(fetched_index) == len(index)
        for j in range(4):
            i = fetched_index[j]
            ei = index[j]
            assert i[0] == ei[0]
            assert len(i[1]) == len(ei[1])
            for k, t in zip(i[1], ei[1]):
                assert len(k) == len(t)
                for m in range(len(k)):
                    assert k[m] == t[m]
    except Exception as ex:
        print(ex)
        b._index_coll.drop()
        assert False
    else:
        b._index_coll.drop()
def test__merge_indexes():
    # Test: cache indexes contain posting lists of docs that the edited index does not have.
    # Test: cache indexes contain terms that the edited index does not have.
    # Test: new indexes contain new terms.
    # Test: new indexes contain existing terms.
    new_indexes = [
        [0, [[10, 2, 0, 3]]],
        [1, [[11, 1, 1], [12, 2, 0, 1]]],
        [2, [[10, 1, 2], [12, 2, 2, 3]]],
        [3, [[10, 1, 4], [13, 1, 4]]],
        [10, [[10, 1, 5], [13, 1, 5]]],
    ]
    edited_indexes = [[0, [[0, 2, 0, 3]]],
                      [1, [[0, 1, 1], [1, 2, 0, 1]]],
                      [2, [[0, 1, 2], [1, 2, 2, 3]]],
                      [3, [[0, 1, 4], [1, 1, 4]]]]
    cache_indexes = {
        1: [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]],
        5: [[0, 1, 16]]
    }
    cache_indexes = defaultdict(lambda: None, cache_indexes)
    disk_indexes = [[0, [[0, 3, 10, 19]]],
                    [1, [[0, 1, 6], [1, 2, 0, 20]]],
                    [2, [[0, 1, 5], [1, 2, 2, 7]]],
                    [3, [[0, 1, 4], [1, 1, 4]]],
                    [5, [[0, 1, 16]]]]
    expected_index = [
        [0, [[0, 2, 0, 3], [10, 2, 0, 3]]],
        [1, [[0, 1, 1], [1, 2, 0, 1], [3, 1, 10], [11, 1, 1], [12, 2, 0, 1]]],
        [2, [[0, 1, 2], [1, 2, 2, 3], [10, 1, 2], [12, 2, 2, 3]]],
        [3, [[0, 1, 4], [1, 1, 4], [10, 1, 4], [13, 1, 4]]],
        [10, [[10, 1, 5], [13, 1, 5]]]
    ]
    b = Indexing(db=db)
    b.save_indexes(disk_indexes)
    b._indexes = cache_indexes
    try:
        merged_indexes = b._merge_indexes(new_indexes, edited_indexes)
        assert len(merged_indexes) == len(expected_index)
        for j in range(len(merged_indexes)):
            i = merged_indexes[j]
            ei = expected_index[j]
            assert i[0] == ei[0]
            assert len(i[1]) == len(ei[1])
            for k, t in zip(i[1], ei[1]):
                assert len(k) == len(t)
                for m in range(len(k)):
                    assert k[m] == t[m]
    except Exception as ex:
        print(ex)
        b._index_coll.drop()
        assert False
    else:
        b._index_coll.drop()
def test__index_new_docs():
    collection = [
        [0, [['xx', 'yy', 'zz'], ['xx', 'tt']]],
        [1, [['yy', 'yy', 'zz'], ['zz', 'tt']]],
    ]
    b = Indexing(db=db)
    index = b._index_new_docs(collection)
    expected_index = [[0, [[0, 2, 0, 3]]],
                      [1, [[0, 1, 1], [1, 2, 0, 1]]],
                      [2, [[0, 1, 2], [1, 2, 2, 3]]],
                      [3, [[0, 1, 4], [1, 1, 4]]]]
    assert len(index) == len(expected_index)
    for j in range(4):
        i = index[j]
        ei = expected_index[j]
        assert i[0] == ei[0]
        assert len(i[1]) == len(ei[1])
        for k, t in zip(i[1], ei[1]):
            assert len(k) == len(t)
            for m in range(len(k)):
                assert k[m] == t[m]
def test_retrieve():
    q = 'tt yy zz'
    collection = ['xx yy zz. xx tt.', 'yy yy zz. zz tt kk.', 'kk gh mk']
    db.content_coll.insert_many(collection)
    collection = list(db.content_coll.find_many())
    collection = [[d['_id'], d['content'], 0] for d in collection]
    preprocessing = Preprocessing()
    ranking = CosineScoring()
    indexing = Indexing(db=db, preprocessing=preprocessing)
    indexing.index(collection)
    b = Retrieval(db=db, indexing=indexing, preprocessing=preprocessing, ranking=ranking)
    try:
        ranked_docs = b.retrieve(q)
        ranked_docsids = [d[0] for d in ranked_docs]
        expected_docids = [collection[1][0], collection[0][0]]
        assert len(ranked_docsids) == len(expected_docids)
        for a, b in zip(ranked_docsids, expected_docids):
            assert a == b
        db.index_coll.drop()
        db.vocabulary_coll.drop()
        db.contentvectors_coll.drop()
        db.content_coll.drop()
    except Exception as ex:
        print(ex)
        db.index_coll.drop()
        db.vocabulary_coll.drop()
        db.contentvectors_coll.drop()
        db.content_coll.drop()
        assert False
def test__merge_pl1():
    # test more than one keyword found
    indexes = [
        {'termid': 1, 'pl': [[0, 1, 11], [1, 1, 1]]},
        {'termid': 3, 'pl': [[0, 1, 14], [1, 1, 2]]},
        {'termid': 0, 'pl': [[0, 2, 10, 15], [1, 1, 3]]},
        {'termid': 2, 'pl': [[0, 1, 12], [1, 1, 12], [2, 1, 12], [3, 1, 12]]},
    ]
    indexing = Indexing(db=db)
    preprocessing = Preprocessing()
    ranking = CosineScoring()
    b = Retrieval(db=db, indexing=indexing, preprocessing=preprocessing, ranking=ranking)
    merged_indexes, docids = b._merge_indexes(indexes)
    merged_indexes = sorted(merged_indexes, key=lambda x: x[0])  # for testing only
    expected_merged_indexes = [
        [0, [(0, 2), (1, 1), (2, 1), (3, 1)], [10, 11, 12, 14, 15]],
        [1, [(0, 1), (1, 1), (2, 1), (3, 1)], [1, 2, 3, 12]],
    ]
    assert len(merged_indexes) == len(expected_merged_indexes)
    for i, j in zip(merged_indexes, expected_merged_indexes):
        assert len(i) == len(j)
        for t, k in zip(i, j):
            assert t == k
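# The merged structure appears to group hits per document: each entry is
# [docid, [(termid, tf), ...], [merged positions]]. For instance, the first
# expected entry above says doc 0 matches terms 0-3 with the listed term
# frequencies, and all match positions are merged and sorted into one list.
# This is an inference from the fixture, not a documented contract.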
def test_rank():
    docs = [[1, 'xx xx'], [3, 'yy xy'], [7, 'xx zz']]
    scores = [[3, 5], [1, 4], [7, 1]]
    indexing = Indexing(db=db)
    preprocessing = Preprocessing()
    ranking = CosineScoring()
    b = Retrieval(db=db, indexing=indexing, preprocessing=preprocessing, ranking=ranking)
    ranked_docs = b._rank(docs, scores)
    exp_ranked_docs = [[3, 'yy xy'], [1, 'xx xx'], [7, 'xx zz']]
    assert len(ranked_docs) == len(exp_ranked_docs)
    for a, b in zip(ranked_docs, exp_ranked_docs):
        assert len(a) == len(b)
        for c, d in zip(a, b):
            assert c == d
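# _rank appears to take docs as [docid, content] pairs and scores as
# [docid, score] pairs, returning the docs reordered by descending score
# (here: doc 3 with score 5, then doc 1 with score 4, then doc 7 with
# score 1). Again an inference from the fixture above.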
def test__merge_pl0():
    # Test: only one term found
    indexes = [
        {'termid': 1, 'pl': [[0, 1, 11]]},
        {'termid': 3, 'pl': [[0, 1, 14]]},
        {'termid': 0, 'pl': [[0, 1, 10], [1, 1, 20]]},
        {'termid': 2, 'pl': [[0, 1, 12], [1, 1, 12], [2, 1, 12], [3, 1, 12]]},
    ]
    indexing = Indexing(db=db)
    preprocessing = Preprocessing()
    ranking = CosineScoring()
    b = Retrieval(db=db, indexing=indexing, preprocessing=preprocessing, ranking=ranking)
    merged_indexes, docids = b._merge_indexes([indexes[0]])
    expected_merged_indexes = [[0, (1, 1), [11]]]
    assert len(merged_indexes) == len(expected_merged_indexes)
    for i, j in zip(merged_indexes, expected_merged_indexes):
        assert len(i) == len(j)
        for t, k in zip(i, j):
            assert t == k
class IRSYS:
    def __init__(self, db=None, dbname='market', max_queue=100,
                 max_delete_cache=100, max_update_wait_time=300,
                 max_delete_wait_time=300):
        '''
        Set up parameters and index documents from the last queue and cache.

        ::param max_queue:: max number of docids in the queue
        ::param max_delete_cache:: max number of docids in the delete cache
        ::param max_update_wait_time:: max waiting time in seconds before updating documents with docids in the queue
        ::param max_delete_wait_time:: max waiting time in seconds before deleting documents with docids in the delete cache
        '''

        self.max_queue = max_queue
        self.max_delete_cache = max_delete_cache
        self.max_update_wait_time = max_update_wait_time
        self.max_delete_wait_time = max_delete_wait_time
        self.db = db
        self._create_cache()
        self._ranking = CosineScoring()
        self._preprocessing = Preprocessing()
        self._indexing = Indexing(db=db, preprocessing=self._preprocessing)
        self._retrieval = Retrieval(db=db, preprocessing=self._preprocessing,
                                    ranking=self._ranking, indexing=self._indexing)
        self.index()

    def _create_cache(self):
        '''
        Cache:
        + vocabulary and most frequently used index.
        + delete cache
        '''

        self._create_delete_cache()
        self._create_queue()

    def _create_delete_cache(self):
        '''
        Used to keep docids of documents being deleted by users.
        '''

    def _create_queue(self):
        '''
        Used to store docids and their states in the queue.
        '''

        self._queue = []

    def _get_queue(self):
        return self._queue

    def _reset_queue(self):
        '''
        Reset the queue and its states.
        '''

        self._create_queue()

    def _queue_add(self, doc):
        '''
        Add a docid and its state to the queue.
        '''

        status = False
        if len(self._queue) < self.max_queue:
            self._queue.append(doc)
            status = True
        return status

    def _drop_queue(self):
        '''
        Delete the queue on disk once all docs in the queue are processed.
        '''

        self.db.queued_content_coll.drop()

    def save_queue(self):
        '''
        In some situations the queue needs to be saved, e.g. when the system goes down suddenly.
        '''

        self.db.queued_content_coll.insert_many(self._queue)

    def _fetch_doc_content(self, docs=None):
        '''
        Get document content and append it to the corresponding doc entry.

        ::param docs:: a list of [docid, state] pairs
        ::return:: a list of lists, each of which has the format [docid, content, state]
        '''

        DOCID = 0
        STATE = 1
        F_DOCID = 0
        F_CONTENT = 1
        F_STATE = 2
        if docs is None:
            raise ValueError('No documents are provided.')
        docids = [d[DOCID] for d in docs]
        states = [d[STATE] for d in docs]
        foundDocs = self._retrieval.fetch_docs(docids)
        foundDocs = sorted(foundDocs, key=lambda x: docids.index(x[F_DOCID]))
        for s, d in zip(states, foundDocs):
            d.append(s)
        return foundDocs

    def index(self, doc=None):
        '''
        Add a document to the queue if possible, or index all documents whose docids are in the queue.

        ::param doc:: a [docid, state] pair
        '''

        result = None
        if doc is None:
            last_docs = self._get_queue()
            last_collection = self._fetch_doc_content(last_docs)
            self._reset_queue()
            result = self._indexing.index(last_collection)
        else:
            if self._queue_add(doc):
                result = True
            else:
                last_docs = self._get_queue()
                last_collection = self._fetch_doc_content(last_docs)
                result = self._indexing.index(last_collection)
                self._reset_queue()
                if not self._queue_add(doc):
                    raise ValueError('Cannot add a document to queue.')
        if result is True:
            self._drop_queue()
        return result

    def retrieve(self, query):
        result = self._retrieval.retrieve(query)
        return result
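# The sketch below is illustrative only and is not called anywhere. It assumes
# `db` is the same database wrapper object the tests use (one exposing
# content_coll, queued_content_coll, index_coll, vocabulary_coll and
# contentvectors_coll) and that some documents already exist in content_coll.
def _example_irsys_usage():
    '''
    Minimal usage sketch for IRSYS, under the assumptions noted above.
    '''
    irsys = IRSYS(db=db, max_queue=100, max_update_wait_time=300)
    # Queue a [docid, state] pair; once the queue is full, the queued
    # documents are fetched and indexed in a single batch.
    irsys.index([0, 0])
    # Retrieve documents ranked by cosine score for a free-text query.
    return irsys.retrieve('tt yy zz')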
def test__fetch_indexes():
    '''
    Test: Fetch existing terms.
    '''
    vocabulary = [
        {'termid': 0, 'term': 'xx', 'df': 1},
        {'termid': 1, 'term': 'yy', 'df': 1},
        {'termid': 2, 'term': 'zz', 'df': 1},
        {'termid': 3, 'term': 'tt', 'df': 1},
    ]
    index = [
        {'termid': 0, 'pl': [[0, 1, 10], [1, 1, 20]]},
        {'termid': 1, 'pl': [[0, 1, 11]]},
        {'termid': 2, 'pl': [[0, 1, 12], [1, 1, 12], [2, 1, 12], [3, 1, 12]]},
        {'termid': 3, 'pl': [[0, 1, 14]]},
    ]
    tokens = ['xx', 'yy', 'zz', 'tt']
    expected_pl = [
        {'termid': 1, 'pl': [[0, 1, 11]]},
        {'termid': 3, 'pl': [[0, 1, 14]]},
        {'termid': 0, 'pl': [[0, 1, 10], [1, 1, 20]]},
        {'termid': 2, 'pl': [[0, 1, 12], [1, 1, 12], [2, 1, 12], [3, 1, 12]]},
    ]
    indexing = Indexing(db=db)
    preprocessing = Preprocessing()
    ranking = CosineScoring()
    indexing._vocabulary_coll.insert_many(vocabulary)
    indexing._index_coll.insert_many(index)
    indexing.create_cache()
    r = Retrieval(db=db, indexing=indexing, preprocessing=preprocessing, ranking=ranking)
    try:
        pl = r._fetch_indexes(tokens)
        assert len(pl) == len(expected_pl)
        for a, b in zip(pl, expected_pl):
            assert a['termid'] == b['termid']
            assert len(a['pl']) == len(b['pl'])
            for c, d in zip(a['pl'], b['pl']):
                assert len(c) == len(d)
                for e, f in zip(c, d):
                    assert e == f
    except Exception as ex:
        print(ex)
        indexing._vocabulary_coll.drop()
        indexing._index_coll.drop()
        assert False
    else:
        indexing._vocabulary_coll.drop()
        indexing._index_coll.drop()
def test__parse2():
    '''
    Test: Vocabulary exists beforehand.
    Test: Document vectors exist beforehand.
    '''
    prev_vocabulary = [
        {'term': 'xx', 'termid': 0, 'df': 1},
        {'term': 'yy', 'termid': 1, 'df': 2},
        {'term': 'zz', 'termid': 2, 'df': 2},
        {'term': 'tt', 'termid': 3, 'df': 2},
        {'term': 'nn', 'termid': 4, 'df': 1},
        {'term': 'mm', 'termid': 5, 'df': 1},
    ]
    prev_doc_vectors = [
        {'docid': 0, 'tf': [(0, 1), (1, 1), (2, 3), (3, 1)]},
        {'docid': 1, 'tf': [(2, 2), (3, 2), (4, 1)]},
    ]
    tokens = [
        [['xx', 'yy', 'zz'], ['xx', 'tt']],
        [['yy', 'yy', 'zz'], ['zz', 'tt', 'kk']],
    ]
    docIDs = [0, 2]
    db.vocabulary_coll.insert_many(prev_vocabulary)
    db.contentvectors_coll.insert_many(prev_doc_vectors)
    indexing = Indexing(db=db)
    try:
        postings = indexing._parse(tokens, docIDs)
        vocabulary = indexing.get_vocabulary()
        exp_vocabulary = {
            'xx': {'termid': 0, 'df': 2},
            'yy': {'termid': 1, 'df': 4},
            'zz': {'termid': 2, 'df': 4},
            'tt': {'termid': 3, 'df': 4},
            'nn': {'termid': 4, 'df': 1},
            'mm': {'termid': 5, 'df': 1},
            'kk': {'termid': 6, 'df': 1},
        }
        assert len(exp_vocabulary) == len(vocabulary)
        for k, v in exp_vocabulary.items():
            assert vocabulary[k]['termid'] == v['termid']
            assert vocabulary[k]['df'] == v['df']
        expected_postings = [[0, 0, 0], [1, 0, 1], [2, 0, 2], [0, 0, 3], [3, 0, 4],
                             [1, 2, 0], [1, 2, 1], [2, 2, 2], [2, 2, 3], [3, 2, 4],
                             [6, 2, 5]]
        assert len(postings) == len(expected_postings)
        for a, b in zip(postings, expected_postings):
            assert len(a) == len(b)
            for c, d in zip(a, b):
                assert c == d
        expected_doc_vectors = [
            {'docid': 0, 'tf': [(0, 2), (1, 1), (2, 1), (3, 1)]},
            {'docid': 1, 'tf': [(2, 2), (3, 2), (4, 1)]},
            {'docid': 2, 'tf': [(1, 2), (2, 2), (3, 1), (6, 1)]},
        ]
        doc_vectors = list(indexing._doc_vector_coll._coll.find().sort('docid', 1))
        assert len(expected_doc_vectors) == len(doc_vectors)
        for a, b in zip(expected_doc_vectors, doc_vectors):
            assert a['docid'] == b['docid']
            assert len(a['tf']) == len(b['tf'])
            for c, d in zip(a['tf'], b['tf']):
                for e, f in zip(c, d):
                    assert e == f
    except Exception as ex:
        print(ex)
        indexing._doc_vector_coll.drop()
        indexing._vocabulary_coll.drop()
        assert False
    else:
        indexing._doc_vector_coll.drop()
        indexing._vocabulary_coll.drop()
def test_index():
    collection = [
        [0, 'xx yy zz. xx tt.', 1],
        [10, 'yy yy zz. zz tt kk.', 0],
    ]
    vocabulary = [
        {'term': 'xx', 'termid': 0, 'df': 1},
        {'term': 'yy', 'termid': 1, 'df': 2},
        {'term': 'zz', 'termid': 2, 'df': 2},
        {'term': 'tt', 'termid': 3, 'df': 2},
        {'term': 'nn', 'termid': 4, 'df': 1},
        {'term': 'mm', 'termid': 5, 'df': 1},
    ]
    disk_indexes = [
        [0, [[0, 3, 10, 19]]],
        [1, [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]]],
        [2, [[0, 1, 5], [1, 2, 2, 7]]],
        [3, [[0, 1, 4], [1, 1, 4]]],
        [4, [[0, 1, 16]]],
        [5, [[0, 1, 17]]],
    ]
    cache_indexes = {
        1: [[0, 1, 6], [1, 2, 0, 20], [3, 1, 10]],
        5: [[0, 1, 16]]
    }
    cache_indexes = defaultdict(lambda: None, cache_indexes)
    expected_index = [
        [0, [[0, 2, 0, 3]]],
        [1, [[0, 1, 1], [1, 2, 0, 20], [3, 1, 10], [10, 2, 0, 1]]],
        [2, [[0, 1, 2], [1, 2, 2, 7], [10, 2, 2, 3]]],
        [3, [[0, 1, 4], [1, 1, 4], [10, 1, 4]]],
        [4, [[0, 1, 16]]],
        [5, [[0, 1, 17]]],
        [6, [[10, 1, 5]]],
    ]
    exp_vocabulary = {
        'xx': {'termid': 0, 'df': 2},
        'yy': {'termid': 1, 'df': 4},
        'zz': {'termid': 2, 'df': 4},
        'tt': {'termid': 3, 'df': 4},
        'nn': {'termid': 4, 'df': 1},
        'mm': {'termid': 5, 'df': 1},
        'kk': {'termid': 6, 'df': 1},
    }
    preprocessing = Preprocessing()
    b = Indexing(db=db, preprocessing=preprocessing)
    b._vocabulary_coll.insert_many(vocabulary)
    b._create_vocabulary_cache()
    b.save_indexes(disk_indexes)
    b._indexes = cache_indexes
    try:
        b.index(collection)
        _vocabulary = b.get_vocabulary()
        # test vocabulary
        assert len(_vocabulary) == len(exp_vocabulary)
        for k, v in _vocabulary.items():
            assert v['termid'] == exp_vocabulary[k]['termid']
            assert v['df'] == exp_vocabulary[k]['df']
        fetched_index = b.fetch_indexes([0, 1, 2, 3, 4, 5, 6])
        fetched_index = b._to_list_memory_indexes(fetched_index)
        # test indexes
        assert len(fetched_index) == len(expected_index)
        for j in range(len(fetched_index)):
            i = fetched_index[j]
            ei = expected_index[j]
            assert i[0] == ei[0]
            assert len(i[1]) == len(ei[1])
            for k, t in zip(i[1], ei[1]):
                assert len(k) == len(t)
                for m in range(len(k)):
                    assert k[m] == t[m]
        # test document vectors
    except Exception as ex:
        print(ex)
        b._vocabulary_coll.drop()
        b._index_coll.drop()
        b._doc_vector_coll.drop()
        assert False
    else:
        b._vocabulary_coll.drop()
        b._index_coll.drop()
        b._doc_vector_coll.drop()
def test__parse():
    '''
    Test: No vocabulary exists beforehand.
    Test: No document vectors exist beforehand.
    '''
    tokens = [
        [['xx', 'yy', 'zz'], ['xx', 'tt']],
        [['yy', 'yy', 'zz'], ['zz', 'tt']],
    ]
    docIDs = [0, 1]
    indexing = Indexing(db=db)
    try:
        postings = indexing._parse(tokens, docIDs)
        vocabulary = indexing.get_vocabulary()
        exp_vocabulary = {
            'xx': {'termid': 0, 'df': 1},
            'yy': {'termid': 1, 'df': 2},
            'zz': {'termid': 2, 'df': 2},
            'tt': {'termid': 3, 'df': 2},
        }
        assert len(exp_vocabulary) == len(vocabulary)
        for k, v in exp_vocabulary.items():
            assert vocabulary[k]['termid'] == v['termid']
            assert vocabulary[k]['df'] == v['df']
        expected_postings = [[0, 0, 0], [1, 0, 1], [2, 0, 2], [0, 0, 3], [3, 0, 4],
                             [1, 1, 0], [1, 1, 1], [2, 1, 2], [2, 1, 3], [3, 1, 4]]
        assert len(postings) == len(expected_postings)
        for a, b in zip(postings, expected_postings):
            assert len(a) == len(b)
            for c, d in zip(a, b):
                assert c == d
        expected_doc_vectors = [
            {'docid': 0, 'tf': [(0, 2), (1, 1), (2, 1), (3, 1)]},
            {'docid': 1, 'tf': [(1, 2), (2, 2), (3, 1)]},
        ]
        doc_vectors = list(indexing._doc_vector_coll._coll.find().sort('docid', 1))
        assert len(expected_doc_vectors) == len(doc_vectors)
        for a, b in zip(expected_doc_vectors, doc_vectors):
            assert a['docid'] == b['docid']
            assert len(a['tf']) == len(b['tf'])
            for c, d in zip(a['tf'], b['tf']):
                for e, f in zip(c, d):
                    assert e == f
    except Exception as ex:
        print(ex)
        indexing._doc_vector_coll.drop()
        assert False
    else:
        indexing._doc_vector_coll.drop()