Beispiel #1
0
    def test_Query_tokenize_from_string(self):
        # Build a one-rule index, tokenize a multi-line query string by hand
        # and check both the query-level tokens and the single query run.
        rule_text = 'Redistribution and use in source and binary forms with or without modification are permitted'
        bsd_rule = Rule(stored_text=rule_text, license_expression='bsd')
        idx = index.LicenseIndex([bsd_rule])

        query_text = '''
            The
            Redistribution and use in source and binary are permitted.

            Athena capital of Grece
            Paris and Athene
            Always'''

        # The query is created with _test_mode=True and the tokenization steps
        # are then driven explicitly.
        qry = Query(query_string=query_text, idx=idx, _test_mode=True)
        line_tokens = list(qry.tokens_by_line(query_string=query_text))
        qry.tokenize_and_build_runs(line_tokens)

        # Tokens expected at the query level and, further down, in the run.
        known_tokens = [
            'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
            'are', 'permitted', 'and',
        ]
        assert tks_as_str(qry.tokens, idx=idx) == known_tokens

        # Same sequence with None entries interleaved (presumably one per
        # token that is unknown to the index).
        query_with_unknowns = [
            None,
            'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
            'are', 'permitted',
            None, None, None, None, None,
            'and',
            None, None,
        ]
        assert tks_as_str(query_tokens_with_unknowns(qry), idx=idx) == query_with_unknowns

        # Everything is expected to land in one run spanning positions 0..9.
        assert len(qry.query_runs) == 1
        only_run = qry.query_runs[0]
        assert only_run.start == 0
        assert only_run.end == 9
        assert len(only_run) == 10
        assert tks_as_str(only_run.tokens, idx=idx) == known_tokens

        # The expected run-level sequence has no None entries after the final
        # 'and', unlike the query-level sequence above.
        run_with_unknowns = [
            None,
            'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
            'are', 'permitted',
            None, None, None, None, None,
            'and',
        ]
        assert tks_as_str(query_run_tokens_with_unknowns(only_run), idx=idx) == run_with_unknowns
Beispiel #2
0
    def test_match_with_templates_with_redundant_tokens_yield_single_exact_match(self):
        # A rule that repeats the same phrase twice is matched against a query
        # holding both occurrences separated by extra words; exactly one match
        # with full coverage is expected.
        from licensedcode_test_utils import query_tokens_with_unknowns  # NOQA

        rule_text = 'copyright reserved mit is license, copyright reserved mit is license'
        # position:  0         1        2   3  4        5         6        7   8  9
        rule = models.Rule(license_expression='tst', stored_text=rule_text)
        idx = MiniLicenseIndex([rule])

        query_text = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.'
        # position:    0  1         2        3   4  5       6  7   8         9       10  11 12      13  14
        qry = Query(query_string=query_text, idx=idx)

        def as_strings(tids):
            # Map each token id to its token string; None passes through.
            return [None if tid is None else idx.tokens_by_tid[tid] for tid in tids]

        expected_tokens = [
            None,  # 0
            None,  # 1
            u'copyright',  # 2
            u'reserved',  # 3
            u'mit',  # 4
            u'is',  # 5
            u'license',  # 6
            u'is',  # 7
            None,  # 8
            u'copyright',  # 9
            u'reserved',  # 10
            u'mit',  # 11
            u'is',  # 12
            u'license',  # 13
            None,  # 14
        ]
        assert as_strings(query_tokens_with_unknowns(qry)) == expected_tokens

        matches = idx.match(query_string=query_text)
        assert len(matches) == 1

        only_match = matches[0]
        # Reading span positions as indexes into the non-None tokens above, the
        # expected query span leaves out position 5 (the extra 'is'), while the
        # rule side covers all ten positions 0..9.
        assert only_match.qspan == Span(0, 4) | Span(6, 10)
        assert only_match.ispan == Span(0, 9)
        assert only_match.coverage() == 100

        # The expected query text shows the words outside the match in brackets.
        qtext, itext = get_texts(only_match)
        assert qtext == 'copyright reserved mit is license [is] [the] copyright reserved mit is license'
        assert itext == 'copyright reserved mit is license copyright reserved mit is license'
    def test_QueryRun_does_not_end_with_None(self):
        # A query with a full rule text followed by scattered words is expected
        # to split into two runs whose token lists contain no None entries.
        rule_text = 'Redistribution and use in source and binary forms, with or without modification, are permitted'
        bsd_rule = Rule(stored_text=rule_text, license_expression='bsd')
        idx = index.LicenseIndex([bsd_rule])

        query_text = '''
            The
            Redistribution and use in source and binary forms, with or without modification, are permitted.

            Always



            bar
             modification
             foo
            '''
        qry = Query(query_string=query_text, idx=idx)

        # Full token sequence, None entries included.
        tokens_with_unknowns = [
            None,
            'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
            'forms', 'with', 'or', 'without', 'modification', 'are',
            'permitted',
            None, None,
            'modification',
            None,
        ]
        # qry.tokens is expected to be the same sequence with the None
        # entries dropped.
        tokens_only = [tok for tok in tokens_with_unknowns if tok]
        assert tks_as_str(qry.tokens, idx=idx) == tokens_only
        assert tks_as_str(query_tokens_with_unknowns(qry), idx=idx) == tokens_with_unknowns

        assert len(qry.query_runs) == 2

        # First run: the rule text, positions 0 through 13.
        first_run = qry.query_runs[0]
        first_run_tokens = [
            'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
            'forms', 'with', 'or', 'without', 'modification', 'are',
            'permitted',
        ]
        assert tks_as_str(first_run.tokens, idx=idx) == first_run_tokens
        assert first_run.start == 0
        assert first_run.end == 13

        # Second run: the lone 'modification' at position 14.
        second_run = qry.query_runs[1]
        assert tks_as_str(second_run.tokens, idx=idx) == ['modification']
        assert second_run.start == 14
        assert second_run.end == 14
    def test_Query_known_and_unknown_positions(self):
        # Check how a query records known token positions and the counts of
        # unknown tokens around them.
        rule = Rule(
            stored_text='Redistribution and use in source and binary forms',
            license_expression='bsd',
        )
        legalese = {'redistribution', 'form'}
        idx = index.LicenseIndex([rule], _legalese=legalese)

        query_text = 'The new Redistribution and use in other form always'
        qry = Query(query_string=query_text, idx=idx, _test_mode=False)

        # Five known positions are expected, all of them on line 1.
        assert qry.line_by_pos == [1, 1, 1, 1, 1]

        # Token ids at those five positions: "Redistribution and use in" are
        # 1..4 and "form" is 0 (presumably because 'form' is in the legalese
        # set passed to the index).
        assert qry.tokens == [1, 2, 3, 4, 0]

        # Read against the blended list below: two unknowns come before the
        # first known token (key -1), one follows position 3 ("other") and one
        # follows position 4 ("always").
        assert qry.unknowns_by_pos == {3: 1, 4: 1, -1: 2}

        # Known token ids and unknowns (None) blended in query order.
        blended = list(query_tokens_with_unknowns(qry))
        expected_blend = [
            None,  # The
            None,  # new
            1,  # Redistribution
            2,  # and
            3,  # use
            4,  # in
            None,  # other
            0,  # form
            None,  # always
        ]
        assert blended == expected_blend