Example #1
0
    def __init__(self, rules):
        """
        Build the index from an iterable of Rule objects.

        Each rule gets a numeric rule id (its position in `rules`). The rule
        tokens are indexed in `self.license_index` under that id and the rule
        itself is kept in `self.rules_by_id` under the same id.

        When the DEBUG_PERF flag is true, a start message and the elapsed
        build time are printed.
        """
        self.license_index = index.Index()

        # mapping of numeric rule id --> rule object
        self.rules_by_id = {}

        if DEBUG_PERF:
            start = time.time()
            print('LicenseIndex: Starting building index.')

        for rule_id, rule in enumerate(rules):
            # FIXME: we should pass these lengths and counts downstream
            rule_tokens, _shortest, _longest, _gaps = rule.get_tokens()
            self.license_index.index_one_from_tokens(rule_id, rule_tokens)
            self.rules_by_id[rule_id] = rule

        if DEBUG_PERF:
            elapsed = time.time() - start
            stats = {
                'len_rules_by_id': len(self.rules_by_id),
                'duration': elapsed,
            }
            print('Finished building index with %(len_rules_by_id)d rules '
                  'in %(duration)f seconds.' % stats)
    def test_Index_exact_match_ngrams_templates_perfect_minimalist(self):
        """
        Match a query against a small doc indexed with template=True whose
        text contains a `{{}}` marker, and check the reported index and query
        positions.
        """
        idx = index.Index(ngram_len=3)
        index_doc = [u'name is joker, {{}} name is joker']
        idx.index_one('tst', text_lines(index_doc), template=True)

        # the two comment lines below are a character offset ruler for the
        # query text: units digit first, then tens digit
        query_doc = [u'Hi my name is joker the joker name is joker yes.']
        #              012345678901234567890123456789012345678901234567
        #                        11111111112222222222333333333344444444
        index_position = Token(
            start=0, end=5,
            start_line=0, start_char=0,
            end_line=0, end_char=33,
        )
        query_position = Token(
            start=2, end=9,
            start_line=0, start_char=6,
            end_line=0, end_char=43,
        )
        expected = {'tst': [(index_position, query_position)]}

        matches = idx.match(text_lines(query_doc))

        # there must be at least one matched doc, and each matched doc must
        # carry exactly the expected positions
        assert {} != matches
        for docid, doc_matches in matches.items():
            assert expected[docid] == doc_matches
    def test_Index_exact_match_to_indexed_template_with_short_tokens_around_gaps(
            self):
        """
        Regression test: matching was failing when a gapped token (from a
        template) starts at the beginning of an index doc and at a position
        less than the ngram length.
        """
        ngram_len = 4

        # setup: index a single template doc
        idx = index.Index(ngram_len=ngram_len)
        idx_location = self.get_test_loc('index/templates/idx.txt')
        index_doc = text_lines(idx_location)
        # NOTE(review): index_doc already comes from text_lines(); the second
        # text_lines() call below looks redundant -- confirm before removing.
        idx.index_one('idx', text_lines(index_doc), template=True)

        # check the content of the index for ngrams of length four
        quad_grams_index = idx._get_index_for_len(ngram_len)
        assert 205 == len(quad_grams_index)
        assert u'software without prior written' in quad_grams_index

        # match a query doc: we expect a single match to the idx doc
        query_location = self.get_test_loc('index/templates/query.txt')
        matches = idx.match(text_lines(query_location))
        assert 1 == len(matches)

        # the second item of a match is the position in the query doc
        first_match = matches['idx'][0]
        matched_query_doc_position = first_match[1]
        expected = Token(
            start=0, end=276,
            start_line=0, start_char=0,
            end_line=39, end_char=34,
        )
        assert expected == matched_query_doc_position
    def test_Index_exact_match_ngrams_perfect_minimalist(self):
        """
        Match a query against a small doc indexed with template=False and
        check the reported index and query positions.
        """
        idx = index.Index(ngram_len=3)
        index_doc = [u'name is joker, name is joker']
        #                 0  1     2     3  4     5
        idx.index_one('tst', text_lines(index_doc), template=False)

        query_doc = [u'Hi my name is joker, name is joker yes.']
        # match         0  1   |2  3     4     5  6     7|  8
        index_position = Token(
            start=0, end=5,
            start_line=0, start_char=0,
            end_line=0, end_char=28,
        )
        query_position = Token(
            start=2, end=7,
            start_line=0, start_char=6,
            end_line=0, end_char=34,
        )
        expected = {'tst': [(index_position, query_position)]}

        # the list of query lines is passed directly to match()
        matches = idx.match(query_doc)

        assert {} != matches
        for docid, doc_matches in matches.items():
            assert expected[docid] == doc_matches
    def test_get_tokens_count(self):
        """
        Index every doc found in the 'index/tokens_count' test directory, then
        check the keys of the indexes for ngram lengths 1, 2 and 3 and the
        tokens count reported for each doc id.
        """
        test_dir = self.get_test_loc('index/tokens_count', copy=True)
        docids = os.listdir(test_dir)

        idx = index.Index(ngram_len=3)
        for docid in docids:
            # docs whose file name starts with 'tmpl' are indexed as templates
            is_template = docid.startswith('tmpl')
            doc = text_lines(location=os.path.join(test_dir, docid))
            idx.index_one(docid, doc, template=is_template)

        expected_unigrams = {'all', 'redistribution', 'for', 'is'}
        expected_bigrams = {
            'is allowed',
            'all and',
            'redistribution is',
            'allowed for',
        }
        expected_trigrams = {
            'for all and',
            'and any thing',
            'is allowed for',
            'all and any',
            'redistribution is allowed',
            'allowed for all',
        }
        indexes = [
            (idx.indexes[1], expected_unigrams),
            (idx.indexes[2], expected_bigrams),
            (idx.indexes[3], expected_trigrams),
        ]
        for ngram_index, expected_keys in indexes:
            assert expected_keys == set(ngram_index.keys())

        # expected tokens count for each doc id
        expected_counts = {
            'plain1': 1,
            'plain2': 2,
            'plain3': 3,
            'plain4': 4,
            'plain5': 5,
            'tmpl10': 10,
            'tmpl2': 2,
            'tmpl3': 3,
            'tmpl4': 4,
            'tmpl5': 5,
            'tmpl5_2': 5,
            'tmpl6': 6,
            'tmpl7': 7,
            'tmpl8': 8,
            'tmpl9': 9,
        }
        counts = dict((docid, idx.get_tokens_count(docid)) for docid in docids)
        assert expected_counts == counts
Example #6
0
    def test_Index_index_many_unigrams(self):
        """
        Index the 'bsd-new' and 'bsd-no-mod' test docs in one `_index_many`
        call with an ngram length of one, then check the tokens counts, the
        size of the unigrams index and the first posting for 'minimum'.
        """
        test_docs = self.get_test_docs('index/bsd', ['bsd-new', 'bsd-no-mod'])
        idx = index.Index(ngram_len=1)
        idx._index_many(test_docs)
        unigrams_index = idx.indexes[1]

        assert 213 == idx.get_tokens_count('bsd-new')
        assert 234 == idx.get_tokens_count('bsd-no-mod')
        assert 138 == len(unigrams_index)

        pos = Token(
            start=61, end=61,
            start_line=8, start_char=52,
            end_line=8, end_char=59,
            value=(u'minimum',),
        )
        expected_posting = ('bsd-no-mod', [pos],)

        # Build a list before indexing: on Python 3, dict.items() returns a
        # view that cannot be subscripted. On Python 2 this is only a copy of
        # the list that items() already returned.
        postings = list(unigrams_index[('minimum',)].items())
        assert expected_posting == postings[0]
Example #7
0
    def test_Index_index_one_trigrams_no_templates(self):
        """
        Index the 'bsd-new' and 'bsd-no-mod' test docs one at a time with an
        ngram length of three and no template, then check the size of each
        ngram index and the tokens count of each doc.
        """
        idx = index.Index(ngram_len=3)
        test_docs = self.get_test_docs('index/bsd', ['bsd-new', 'bsd-no-mod'])
        for docid, doc in test_docs:
            idx.index_one(docid, doc)

        # pairs of (ngram length, expected number of entries in its index)
        expected_sizes = ((1, 0), (2, 0), (3, 280))
        indexes = [(idx.indexes[ngram_len], size)
                   for ngram_len, size in expected_sizes]
        for ngram_index, expected_len in indexes:
            assert expected_len == len(ngram_index)

        assert 213 == idx.get_tokens_count('bsd-new')
        assert 234 == idx.get_tokens_count('bsd-no-mod')
Example #8
0
    def test_Index_index_one_trigrams_with_templates(self):
        """
        Index the 'index/bsd_templates2' test docs one at a time as templates
        with an ngram length of three, then check the size of each ngram index
        and the tokens count of each doc.
        """
        idx = index.Index(ngram_len=3)
        test_docs = self.get_test_docs('index/bsd_templates2')
        for docid, doc in test_docs:
            idx.index_one(docid, doc, template=True)

        # pairs of (ngram length, expected number of entries in its index)
        expected_sizes = ((1, 2), (2, 5), (3, 267))
        indexes = [(idx.indexes[ngram_len], size)
                   for ngram_len, size in expected_sizes]
        for ngram_index, expected_len in indexes:
            assert expected_len == len(ngram_index)

        assert 211 == idx.get_tokens_count('bsd-new')
        assert 232 == idx.get_tokens_count('bsd-no-mod')
    def test_Index_exact_match_return_one_match_with_correct_offsets(self):
        """
        Match a query that starts with extra text before the indexed text and
        check that a single match is returned with the expected start and end
        character offsets on both the index side and the query side.
        """
        idx = index.Index(ngram_len=4)
        index_doc = [u'A one. A two. A three.']
        idx.index_one('tst', text_lines(index_doc), template=False)

        # the two comment lines below are a character offset ruler for the
        # query text: tens digit first, then units digit
        query_doc = [u'some junk. A one. A two. A three.']
        #                         1111111111222222222233
        #              012345678901234567890123456789012

        matches = idx.match(query_doc)
        tst_matches = matches['tst']
        assert 1 == len(tst_matches)

        # a match is a pair of (position in index doc, position in query doc)
        index_pos, query_pos = tst_matches[0]
        assert 11 == query_pos.start_char
        assert 32 == query_pos.end_char
        assert 0 == index_pos.start_char
        assert 21 == index_pos.end_char
Example #10
0
    def test_Index_match_ngrams_templates_perfect_minimalist(self):
        """
        Match a query with perfect=True against a small doc indexed with
        template=True whose text contains a `{{}}` marker, and check the
        reported index and query positions.
        """
        idx = index.Index(ngram_len=3)
        index_doc = [u'name is joker, {{}} name is joker']
        idx.index_one('tst', text_lines(index_doc), template=True)

        query_doc = [u'Hi my name is joker the joker name is joker yes.']
        index_position = Token(
            start=0, end=5,
            start_line=0, start_char=0,
            end_line=0, end_char=33,
        )
        query_position = Token(
            start=2, end=9,
            start_line=0, start_char=6,
            end_line=0, end_char=43,
        )
        expected = {'tst': [(index_position, query_position)]}

        # collect the result of match() in a plain dict
        matches = dict(idx.match(text_lines(query_doc), perfect=True))

        self.assertNotEqual({}, matches)
        for docid, doc_matches in matches.items():
            assert expected[docid] == doc_matches
    def test_Index_index_several_unigrams(self):
        """
        Index the 'bsd-new' and 'bsd-no-mod' test docs one at a time with an
        ngram length of one, then check the tokens counts, the size of the
        unigrams index and the first posting for 'minimum'.
        """
        test_docs = self.get_test_docs('index/bsd', ['bsd-new', 'bsd-no-mod'])
        idx = index.Index(ngram_len=1)
        for docid, doc in test_docs:
            idx.index_one(docid, doc)
        unigrams_index = idx.indexes[1]

        assert 213 == idx.get_tokens_count('bsd-new')
        assert 234 == idx.get_tokens_count('bsd-no-mod')
        assert 138 == len(unigrams_index)

        pos = Token(
            start=61, end=61,
            start_line=8, start_char=52,
            end_line=8, end_char=59,
            value=u'minimum',
        )
        expected_posting = ('bsd-no-mod', [pos],)

        # Build a list before indexing: on Python 3, dict.items() returns a
        # view that cannot be subscripted. On Python 2 this is only a copy of
        # the list that items() already returned.
        postings = list(unigrams_index['minimum'].items())
        assert expected_posting == postings[0]
Example #12
0
 def get_test_index(self, docs, ngram_len=3, template=False):
     """
     Return a new Index created with `ngram_len` and populated from `docs`
     in a single `_index_many` call, passing `template` through.
     """
     test_index = index.Index(ngram_len)
     test_index._index_many(docs, template)
     return test_index
 def get_test_index(self, docs, ngram_len=3, template=False):
     """
     Return a new Index created with `ngram_len` and populated by indexing
     each (docid, doc) pair of `docs` one at a time, passing `template`
     through to `index_one`.
     """
     test_index = index.Index(ngram_len)
     for doc_id, doc_lines in docs:
         test_index.index_one(doc_id, doc_lines, template)
     return test_index