def __init__(self, rules):
    """
    Init the Index with an iterable of Rule objects.

    Builds the underlying license index from each rule's token stream and
    keeps a numeric rid -> Rule mapping (self.rules_by_id) for later lookups.
    When DEBUG_PERF is set, prints timing information for the build.
    """
    self.license_index = index.Index()
    if DEBUG_PERF:
        start = time.time()
        print('LicenseIndex: Starting building index.')
    # index rules text and keep a mapping of rules rid --> rule object
    self.rules_by_id = {}
    # note: we use numeric ids
    for rid, rule in enumerate(rules):
        # FIXME: we should pass these len and counts downstream
        tokens, _min_len, _max_len, _gaps_count = rule.get_tokens()
        self.license_index.index_one_from_tokens(rid, tokens)
        self.rules_by_id[rid] = rule
    if DEBUG_PERF:
        duration = time.time() - start
        len_rules_by_id = len(self.rules_by_id)
        # %(name)s-style formatting against locals() pulls duration and
        # len_rules_by_id from the local scope above
        print('Finished building index with %(len_rules_by_id)d rules '
              'in %(duration)f seconds.' % locals())
def test_Index_exact_match_ngrams_templates_perfect_minimalist(self):
    # Index a minimal templated doc, then match a query where the
    # template gap is filled with extra words.
    indexed_lines = [u'name is joker, {{}} name is joker']
    searcher = index.Index(ngram_len=3)
    searcher.index_one('tst', text_lines(indexed_lines), template=True)

    queried_lines = [u'Hi my name is joker the joker name is joker yes.']
    # char offsets:   0         1         2         3         4
    #                 0123456789012345678901234567890123456789012345678
    expected = {
        'tst': [
            (Token(start=0, start_line=0, start_char=0,
                   end_line=0, end_char=33, end=5),
             Token(start=2, start_line=0, start_char=6,
                   end_line=0, end_char=43, end=9)),
        ]
    }

    matches = searcher.match(text_lines(queried_lines))
    assert matches != {}
    for docid, positions in matches.items():
        assert expected[docid] == positions
def test_Index_exact_match_to_indexed_template_with_short_tokens_around_gaps(
        self):
    # was failing when a gapped token (from a template) starts at a
    # beginning of an index doc and at a position less than ngram length

    # setup
    idx = index.Index(ngram_len=4)
    index_doc = text_lines(self.get_test_loc('index/templates/idx.txt'))
    # NOTE(review): index_doc is already the result of text_lines(); wrapping
    # it in text_lines() again looks redundant — presumably text_lines passes
    # an iterable of lines through unchanged. TODO: confirm and simplify.
    idx.index_one('idx', text_lines(index_doc), template=True)

    # test index: only the 4-gram index should be populated
    quad_grams_index = idx._get_index_for_len(4)
    assert 205 == len(quad_grams_index)
    assert u'software without prior written' in quad_grams_index

    # test match: we expect a single match to the idx doc
    query_doc = text_lines(self.get_test_loc('index/templates/query.txt'))
    matches = idx.match(query_doc)
    assert 1 == len(matches)

    # check the query-side position of the (single) match
    matched_query_doc_position = matches['idx'][0][1]
    expected = Token(start=0, start_line=0, start_char=0,
                     end_line=39, end_char=34, end=276)
    assert expected == matched_query_doc_position
def test_Index_exact_match_ngrams_perfect_minimalist(self):
    """An exact ngram match of a repeated phrase must report both the
    index-side and the query-side token positions."""
    index_doc = [u'name is joker, name is joker']
    # token pos:     0    1  2      3    4  5
    idx = index.Index(ngram_len=3)
    idx.index_one('tst', text_lines(index_doc), template=False)

    query_doc = [u'Hi my name is joker, name is joker yes.']
    # match         0  1 |2    3  4      5    6  7| 8
    expected = {
        'tst': [(Token(start=0, start_line=0, start_char=0,
                       end_line=0, end_char=28, end=5),
                 Token(start=2, start_line=0, start_char=6,
                       end_line=0, end_char=34, end=7))]
    }

    matches = idx.match(query_doc)
    assert {} != matches
    # removed a leftover commented-out debug assertion here
    for k, val in matches.items():
        assert expected[k] == val
def test_get_tokens_count(self):
    """get_tokens_count must return the number of tokens indexed for
    each doc; docs named tmpl* are indexed as templates."""
    base = self.get_test_loc('index/tokens_count', copy=True)
    docids = os.listdir(base)
    idx = index.Index(ngram_len=3)
    for docid in docids:
        doc = text_lines(location=os.path.join(base, docid))
        template = docid.startswith('tmpl')
        idx.index_one(docid, doc, template=template)

    # each ngram-length index must hold exactly these keys
    expected_keys_by_len = {
        1: set(['all', 'redistribution', 'for', 'is']),
        2: set([
            'is allowed',
            'all and',
            'redistribution is',
            'allowed for',
        ]),
        3: set([
            'for all and',
            'and any thing',
            'is allowed for',
            'all and any',
            'redistribution is allowed',
            'allowed for all',
        ]),
    }
    for ngram_len, expected_keys in expected_keys_by_len.items():
        assert expected_keys == set(idx.indexes[ngram_len].keys())

    expected = {
        'plain1': 1,
        'plain2': 2,
        'plain3': 3,
        'plain4': 4,
        'plain5': 5,
        'tmpl10': 10,
        'tmpl2': 2,
        'tmpl3': 3,
        'tmpl4': 4,
        'tmpl5': 5,
        'tmpl5_2': 5,
        'tmpl6': 6,
        'tmpl7': 7,
        'tmpl8': 8,
        'tmpl9': 9,
    }
    result = {docid: idx.get_tokens_count(docid) for docid in docids}
    assert expected == result
def test_Index_index_many_unigrams(self):
    # Batch-index two BSD docs with unigrams and check token counts,
    # index size and one posting (keyed by a 1-tuple of the token).
    docs = self.get_test_docs('index/bsd', ['bsd-new', 'bsd-no-mod'])
    idx = index.Index(ngram_len=1)
    idx._index_many(docs)

    assert 213 == idx.get_tokens_count('bsd-new')
    assert 234 == idx.get_tokens_count('bsd-no-mod')

    uni_index = idx.indexes[1]
    assert 138 == len(uni_index)

    expected_pos = Token(start=61, end=61, start_line=8, start_char=52,
                         end_line=8, end_char=59, value=(u'minimum',))
    first_posting = uni_index[('minimum',)].items()[0]
    assert ('bsd-no-mod', [expected_pos],) == first_posting
def test_Index_index_one_trigrams_no_templates(self):
    # With plain (non-template) docs and ngram_len=3, only the trigram
    # index is populated.
    docs = self.get_test_docs('index/bsd', ['bsd-new', 'bsd-no-mod'])
    idx = index.Index(ngram_len=3)
    for docid, doc in docs:
        idx.index_one(docid, doc)

    expected_len_by_ngram_len = {1: 0, 2: 0, 3: 280}
    for ngram_len, expected_len in expected_len_by_ngram_len.items():
        assert expected_len == len(idx.indexes[ngram_len])

    assert 213 == idx.get_tokens_count('bsd-new')
    assert 234 == idx.get_tokens_count('bsd-no-mod')
def test_Index_index_one_trigrams_with_templates(self):
    # Templated docs spill a few short ngrams (around gaps) into the
    # smaller-length indexes alongside the main trigram index.
    docs = self.get_test_docs('index/bsd_templates2')
    idx = index.Index(ngram_len=3)
    for docid, doc in docs:
        idx.index_one(docid, doc, template=True)

    expected_len_by_ngram_len = {1: 2, 2: 5, 3: 267}
    for ngram_len, expected_len in expected_len_by_ngram_len.items():
        assert expected_len == len(idx.indexes[ngram_len])

    assert 211 == idx.get_tokens_count('bsd-new')
    assert 232 == idx.get_tokens_count('bsd-no-mod')
def test_Index_exact_match_return_one_match_with_correct_offsets(self):
    # A single exact match must carry correct character offsets on both
    # the index side and the query side.
    indexed_lines = [u'A one. A two. A three.']
    idx = index.Index(ngram_len=4)
    idx.index_one('tst', text_lines(indexed_lines), template=False)

    query_doc = [u'some junk. A one. A two. A three.']
    # char offsets:              1111111111222222222233
    #               012345678901234567890123456789012
    match = idx.match(query_doc)['tst']
    assert 1 == len(match)

    index_pos, query_pos = match[0]
    # index side spans the whole indexed doc
    assert 0 == index_pos.start_char
    assert 21 == index_pos.end_char
    # query side starts just after the junk prefix
    assert 11 == query_pos.start_char
    assert 32 == query_pos.end_char
def test_Index_match_ngrams_templates_perfect_minimalist(self):
    """A perfect match against a templated doc must return the expected
    index-side and query-side positions."""
    index_doc = [u'name is joker, {{}} name is joker']
    idx = index.Index(ngram_len=3)
    idx.index_one('tst', text_lines(index_doc), template=True)

    query_doc = [u'Hi my name is joker the joker name is joker yes.']
    expected = {
        'tst': [
            (Token(start=0, start_line=0, start_char=0,
                   end_line=0, end_char=33, end=5),
             Token(start=2, start_line=0, start_char=6,
                   end_line=0, end_char=43, end=9))
        ]
    }

    test = dict(idx.match(text_lines(query_doc), perfect=True))
    # plain asserts for consistency with the other tests in this module
    # (was self.assertNotEqual, the only unittest-style assertion here)
    assert {} != test
    for k, val in test.items():
        assert expected[k] == val
def test_Index_index_several_unigrams(self):
    # Index the two BSD docs one at a time with unigrams; postings in
    # this case are keyed by the plain token string.
    docs = self.get_test_docs('index/bsd', ['bsd-new', 'bsd-no-mod'])
    idx = index.Index(ngram_len=1)
    for docid, doc in docs:
        idx.index_one(docid, doc)

    assert 213 == idx.get_tokens_count('bsd-new')
    assert 234 == idx.get_tokens_count('bsd-no-mod')

    uni_index = idx.indexes[1]
    assert 138 == len(uni_index)

    expected_pos = Token(start=61, end=61, start_line=8, start_char=52,
                         end_line=8, end_char=59, value=u'minimum')
    first_posting = uni_index['minimum'].items()[0]
    assert ('bsd-no-mod', [expected_pos],) == first_posting
def get_test_index(self, docs, ngram_len=3, template=False):
    """Return a new Index built over docs in a single batch call."""
    built = index.Index(ngram_len)
    built._index_many(docs, template)
    return built
def get_test_index(self, docs, ngram_len=3, template=False):
    """Return a new Index built over docs, indexing one doc at a time."""
    built = index.Index(ngram_len)
    for doc_id, doc_lines in docs:
        built.index_one(doc_id, doc_lines, template)
    return built