Example #1
    def test_Query_tokens_by_line_from_string(self):
        rule_text = 'Redistribution and use in source and binary forms with or without modification are permitted'
        rule = Rule(stored_text=rule_text, license_expression='bsd')
        legalese = set([
            'redistribution',
            'form',
        ])
        idx = index.LicenseIndex([rule], _legalese=legalese)
        querys = '''
            The
            Redistribution and use in source and binary are permitted

            Athena capital of Grece
            Paris and Athene
            Always'''

        qry = Query(query_string=querys, idx=idx, _test_mode=True)
        result = list(qry.tokens_by_line())
        expected = [
            [],
            [None],
            [1, 2, 3, 4, 5, 2, 6, 12, 13],
            [],
            [None, None, None, None],
            [None, 2, None],
            [None],
        ]

        assert expected == result

        # convert tid to actual token strings
        qtbl_as_str = lambda qtbl: [[
            None if tid is None else idx.tokens_by_tid[tid] for tid in tids
        ] for tids in qtbl]

        result_str = qtbl_as_str(result)
        expected_str = [
            [],
            [None],
            [
                'redistribution', 'and', 'use', 'in', 'source', 'and',
                'binary', 'are', 'permitted'
            ],
            [],
            [None, None, None, None],
            [None, 'and', None],
            [None],
        ]

        assert expected_str == result_str

        assert [3, 3, 3, 3, 3, 3, 3, 3, 3, 6] == qry.line_by_pos

        idx = index.LicenseIndex(
            [Rule(stored_text=rule_text, license_expression='bsd')])
        querys = 'and this is not a license'
        qry = Query(query_string=querys, idx=idx, _test_mode=True)
        result = list(qry.tokens_by_line())
        expected = [['and', None, None, None, 'license']]
        assert expected == qtbl_as_str(result)
Example #2
    def test_Query_tokens_by_line_from_string(self):
        rule_text = 'Redistribution and use in source and binary forms with or without modification are permitted'
        idx = index.LicenseIndex([Rule(_text=rule_text, licenses=['bsd'])])
        querys = '''
            The
            Redistribution and use in source and binary are permitted

            Athena capital of Grece
            Paris and Athene
            Always'''

        qry = Query(query_string=querys, idx=idx, _test_mode=True)
        result = list(qry.tokens_by_line())
        expected = [
            [],
            [None],
            [11, 0, 6, 4, 3, 0, 1, 9, 2],
            [],
            [None, None, None, None],
            [None, 0, None],
            [None],
        ]

        assert expected == result

        # convert tid to actual token strings
        qtbl_as_str = lambda qtbl: [[None if tid is None else idx.tokens_by_tid[tid] for tid in tids] for tids in qtbl]

        result_str = qtbl_as_str(result)
        expected_str = [
            [],
            [None],
            ['redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted'],
            [],
            [None, None, None, None],
            [None, 'and', None],
            [None],
        ]

        assert expected_str == result_str

        assert [3, 3, 3, 3, 3, 3, 3, 3, 3, 6] == qry.line_by_pos

        idx = index.LicenseIndex([Rule(_text=rule_text, licenses=['bsd'])])
        querys = 'and this is not a license'
        qry = Query(query_string=querys, idx=idx, _test_mode=True)
        result = list(qry.tokens_by_line())
        expected = [['and', None, None, None, None, None]]
        assert expected == qtbl_as_str(result)
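Example #3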
    def test_Query_tokenize_from_string(self):
        rule_text = 'Redistribution and use in source and binary forms with or without modification are permitted'
        idx = index.LicenseIndex([Rule(_text=rule_text, licenses=['bsd'])])
        querys = '''
            The
            Redistribution and use in source and binary are permitted.

            Athena capital of Grece
            Paris and Athene
            Always'''

        qry = Query(query_string=querys, idx=idx, _test_mode=True)
        qry.tokenize_and_build_runs(qry.tokens_by_line())
        # convert tid to actual token strings
        tks_as_str = lambda tks: [None if tid is None else idx.tokens_by_tid[tid] for tid  in tks]

        expected = ['redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', 'and']
        result = tks_as_str(qry.tokens)
        assert expected == result

        expected = [None, 'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', None, None, None, None, None, 'and', None, None]
        result = tks_as_str(qry.tokens_with_unknowns())
        assert expected == result

        assert 1 == len(qry.query_runs)
        qr1 = qry.query_runs[0]
        assert 0 == qr1.start
        assert 9 == qr1.end
        assert 10 == len(qr1)
        expected = ['redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', 'and']
        result = tks_as_str(qr1.tokens)
        assert expected == result
        expected = [None, 'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', None, None, None, None, None, 'and']
        result = tks_as_str(qr1.tokens_with_unknowns())
        assert expected == result
Example #4
    def test_Query_tokenize_from_string(self):
        rule_text = 'Redistribution and use in source and binary forms with or without modification are permitted'
        idx = index.LicenseIndex([Rule(stored_text=rule_text, license_expression='bsd')])
        querys = '''
            The
            Redistribution and use in source and binary are permitted.

            Athena capital of Grece
            Paris and Athene
            Always'''

        qry = Query(query_string=querys, idx=idx, _test_mode=True)
        tokens_by_line = list(qry.tokens_by_line(query_string=querys))
        qry.tokenize_and_build_runs(tokens_by_line)

        expected = ['redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', 'and']
        result = tks_as_str(qry.tokens, idx=idx)
        assert result == expected

        expected = [None, 'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', None, None, None, None, None, 'and', None, None]
        result = tks_as_str(query_tokens_with_unknowns(qry), idx=idx)
        assert result == expected

        assert len(qry.query_runs) == 1
        qr1 = qry.query_runs[0]
        assert qr1.start == 0
        assert qr1.end == 9
        assert len(qr1) == 10
        expected = ['redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', 'and']
        result = tks_as_str(qr1.tokens, idx=idx)
        assert result == expected
        expected = [None, 'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary', 'are', 'permitted', None, None, None, None, None, 'and']
        result = tks_as_str(query_run_tokens_with_unknowns(qr1), idx=idx)
        assert result == expected
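The helpers tks_as_str, query_tokens_with_unknowns and query_run_tokens_with_unknowns called in Example #4 are module-level utilities defined elsewhere in the test module and not shown in this snippet. A minimal sketch of what tks_as_str is assumed to do, mirroring the inline lambdas of Examples #1 to #3; the body is an assumption, not the actual helper:

def tks_as_str(tokens, idx):
    # map token ids back to their token strings, keeping None for unknown positions
    return [None if tid is None else idx.tokens_by_tid[tid] for tid in tokens]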
Example #5
    def test_Query_tokens_by_line_behaves_the_same_on_various_python_2(self):
        location = self.get_test_loc('query/query_lines/yahoo-eula.txt')
        idx = cache.get_index()
        query = Query(location, idx=idx)
        tbl = list(query.tokens_by_line())
        # inject the actual token string for sanity
        tbt = idx.tokens_by_tid
        results = [[[i, i and tbt[i] or i] for i in line] for line in tbl]
        expected = self.get_test_loc('query/query_lines/yahoo-eula.txt.json')
        check_result_equals_expected_json(results, expected, regen=False)
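Example #5 compares the per-line token dump against a stored JSON expectation file through check_result_equals_expected_json, another helper from the test support code. A rough sketch of what such a JSON-expectation helper typically does; this is hypothetical and the real helper may differ, for example in how it regenerates expectations:

import json

def check_result_equals_expected_json(result, expected_loc, regen=False):
    # when regen is True, rewrite the expectation file from the current result
    if regen:
        with open(expected_loc, 'w') as out:
            json.dump(result, out, indent=2)
    # load the stored expectation and compare it with the computed result
    with open(expected_loc) as inp:
        expected = json.load(inp)
    assert result == expected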
Example #6
    def test_match_does_not_change_query_unknown_positions(self):
        from licensedcode.match import LicenseMatch
        from licensedcode.spans import Span

        location = self.get_test_loc('query/unknown_positions/lz4.license.txt')
        idx = cache.get_index()
        # build a query first
        qry1 = Query(location, idx=idx)
        # this has the side effect to populate the unknown
        txt = ' '.join(f'{i}-{idx.tokens_by_tid[t]}'
                       for i, t in enumerate(qry1.tokens))
        assert txt == (
            '0-this 1-repository 2-uses 3-2 4-different 5-licenses '
            '6-all 7-files 8-in 9-the 10-lib 11-directory 12-use 13-bsd 14-2 15-clause 16-license '
            '17-all 18-other 19-files 20-use 21-gplv2 22-license 23-unless 24-explicitly 25-stated 26-otherwise '
            '27-relevant 28-license 29-is 30-reminded 31-at 32-the 33-top 34-of 35-each 36-source 37-file '
            '38-and 39-with 40-presence 41-of 42-copying 43-or 44-license 45-file 46-in 47-associated 48-directories '
            '49-this 50-model 51-is 52-selected 53-to 54-emphasize 55-that '
            '56-files 57-in 58-the 59-lib 60-directory 61-are 62-designed 63-to 64-be 65-included 66-into 67-3rd 68-party 69-applications '
            '70-while 71-all 72-other 73-files 74-in 75-programs 76-tests 77-or 78-examples '
            '79-receive 80-more 81-limited 82-attention 83-and 84-support 85-for 86-such 87-scenario'
        )
        list(qry1.tokens_by_line())
        assert qry1.unknowns_by_pos == {}

        # run matching
        matches = idx.match(location=location)
        match = matches[0]

        rule = [
            r for r in idx.rules_by_rid
            if r.identifier == 'bsd-simplified_and_gpl-2.0_1.RULE'
        ][0]

        expected = LicenseMatch(
            matcher='2-aho',
            rule=rule,
            qspan=Span(0, 48),
            ispan=Span(0, 48),
        )

        assert match == expected

        # check that query unknown by pos is the same and empty
        qry2 = match.query

        # this was incorrectly returned as {15: 0, 20: 0, 21: 0, 41: 0, 43: 0}
        # after querying done during matching
        assert qry2.unknowns_by_pos == {}
Example #7
    def test_query_unknowns_by_pos_and_stopwords_are_not_set_on_last_query_position(self):
        print('\nINDEX')
        idx = index.LicenseIndex(
            [Rule(stored_text='is the binary a')],
            _legalese=set(['binary']),
            _spdx_tokens=set()
        )
        print('\nQUERY')
        q = Query(query_string='a bar binary that was a binary a is the foo bar a', idx=idx)

        tids = list(q.tokens_by_line())
        assert tids == [[None, 0, None, None, 0, 1, 2, None, None]]
        # word:   a  bar  binary  that  was   a    binary  a   is    the  foo   bar   a
        # tids:  [   None 0,      None, None,      0,          1,    2,   None, None   ]
        # known:  st uk   kn      uk    uk    st   kn      st  kn    kn   uk    uk    st
        # pos:            0                        1           2     3
        assert q.unknowns_by_pos == {-1: 1, 0: 2, 3: 2}
        assert q.stopwords_by_pos == {-1: 1, 0: 1, 1: 1, 3: 1}
Example #8
    def test_query_unknowns_by_pos_and_stopwords_are_not_defaultdic_and_not_changed_on_query(
            self):
        idx = index.LicenseIndex([Rule(stored_text='a is the binary')],
                                 _legalese=set(['binary']),
                                 _spdx_tokens=set())
        q = Query(query_string='binary that was a binary', idx=idx)
        list(q.tokens_by_line())
        assert q.unknowns_by_pos == {0: 2}
        assert q.stopwords_by_pos == {0: 1}

        assert not isinstance(q.unknowns_by_pos, defaultdict)
        assert not isinstance(q.stopwords_by_pos, defaultdict)

        try:
            q.unknowns_by_pos[1]
            assert q.unknowns_by_pos == {0: 2}
        except KeyError:
            pass
        try:
            q.stopwords_by_pos[1]
            assert q.stopwords_by_pos == {0: 1}
        except KeyError:
            pass
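The try/except blocks in Example #8 exist because a defaultdict silently inserts a default value when a missing key is read, which would mutate unknowns_by_pos or stopwords_by_pos as a side effect of a lookup. A standalone illustration of the behavior the test guards against; it uses only the standard library and is not part of the test suite:

from collections import defaultdict

plain = {0: 2}
dd = defaultdict(int, plain)

dd[1]                        # reading a missing key inserts a default entry...
assert dd == {0: 2, 1: 0}    # ...so the mapping changes as a side effect

try:
    plain[1]                 # a plain dict raises instead of mutating
except KeyError:
    pass
assert plain == {0: 2}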