def test_approximate_match_to_indexed_template_with_few_tokens_around_gaps_on_limited_index(self):
        # Build an index holding a single rule loaded from a test file, then
        # hand it to the profiling helper together with one query location.
        template_rule = models.Rule(
            text_file=self.get_test_loc('index/templates/idx.txt'),
            license_expression='test',
        )
        limited_index = index.LicenseIndex([template_rule])
        query_locations = [self.get_test_loc('index/templates/query.txt')]

        self.profile_match(
            limited_index,
            query_locations,
            'license_approx_match_limited_index_profile_log.txt',
        )
Ejemplo n.º 2
0
    def test_QueryRuns_tokens_with_unknowns(self):
        # Index one rule, build a query from a string and check the single
        # query run: its known tokens, its tokens interleaved with unknown
        # (None) positions and its start and end positions.
        bsd_text = 'Redistribution and use in source and binary forms with or without modification are permitted'
        bsd_rule = Rule(stored_text=bsd_text, license_expression='bsd')
        license_index = index.LicenseIndex([bsd_rule])

        query_text = '''
            The
            Redistribution and use in source and binary are permitted.

            Athena capital of Grece
            Paris and Athene
            Always'''

        query = Query(query_string=query_text, idx=license_index)
        assert set(query.matchables) == set(range(10))

        assert len(query.query_runs) == 1
        run = query.query_runs[0]

        # words of the query that are known to the index, in query order
        leading_words = 'redistribution and use in source and binary are permitted'.split()

        known_tokens = leading_words + ['and']
        assert tks_as_str(run.tokens, idx=license_index) == known_tokens

        # same tokens with a None for each unknown word position
        tokens_and_unknowns = [None] + leading_words + [None] * 5 + ['and']
        with_unknowns = query_run_tokens_with_unknowns(run)
        assert tks_as_str(with_unknowns, idx=license_index) == tokens_and_unknowns

        assert run.start == 0
        assert run.end == 9
Ejemplo n.º 3
0
    def test_query_run_and_tokenizing_breaking_works__with_plus_as_expected(self):
        # Index the rules found in a test directory, build a query from a
        # test document and check that a single query run is produced.
        rules_location = self.get_test_loc('query/run_breaking/rules')
        license_index = index.LicenseIndex(list(models.load_rules(rules_location)))
        query_location = self.get_test_loc('query/run_breaking/query.txt')
        query = Query(query_location, idx=license_index)

        actual_runs = [run.to_dict() for run in query.query_runs]

        expected_tokens = (
            'this library is free software you can redistribute it '
            'and or modify it under the terms of the gnu library '
            'general public license as published by the free software '
            'foundation either version 2 of the license or at your '
            'option any later version this library is distributed in '
            'the hope that it will be useful but without any warranty '
            'without even the implied warranty of merchantability or '
            'fitness for particular purpose see the gnu library '
            'general public license for more details you should have '
            'received copy of the gnu library general public '
            'license along with this library see the file copying lib '
            'if not write to the free software foundation 51 franklin '
            'street fifth floor boston ma 02110 1301 usa'
        )
        expected_runs = [{'start': 0, 'end': 119, 'tokens': expected_tokens}]
        assert actual_runs == expected_runs

        # the tokens of the first indexed rule must be the same sequence as
        # the tokens of the last query run
        tokens_by_tid = license_index.tokens_by_tid
        last_run_tokens = [tokens_by_tid[tid] for tid in query.query_runs[-1].tokens]
        first_rule_tokens = [tokens_by_tid[tid] for tid in license_index.tids_by_rid[0]]
        assert first_rule_tokens == last_run_tokens
Ejemplo n.º 4
0
    def test_LicenseMatch_small(self):
        # Check LicenseMatch.small() for several span combinations against a
        # short rule and a rule made of a ten times repeated text.
        short_text = u'licensed under the GPL, licensed under the GPL distribute extent of law'
        small_rule = Rule(text_file='small_rule', licenses=['apache-1.1'], _text=short_text)

        repeated_text = u'licensed under the GPL, licensed under the GPL re distribute extent of law' * 10
        long_rule = Rule(text_file='long_rule', licenses=['apache-1.1'], _text=repeated_text)

        # NOTE(review): the index object is not used afterwards; presumably
        # building it prepares the rules for matching -- confirm.
        _idx = index.LicenseIndex([small_rule, long_rule])

        # keyword arguments of matches that are expected to be small
        small_cases = [
            dict(rule=small_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12)),
            dict(rule=small_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(11, 12)),
            dict(rule=small_rule, qspan=Span(10, 11, 12), ispan=Span(10, 11, 12), hispan=Span(11, 12)),
            dict(rule=small_rule, qspan=Span(1, 6), ispan=Span(1, 6)),
            dict(rule=long_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12)),
            dict(rule=long_rule, qspan=Span(5, 10), ispan=Span(5, 10), hispan=Span(5, 6)),
        ]
        for match_kwargs in small_cases:
            candidate = LicenseMatch(**match_kwargs)
            assert candidate.small()

        # the one combination that is expected not to be small
        not_small = LicenseMatch(
            rule=small_rule,
            qspan=Span(1, 10),
            ispan=Span(1, 10),
            hispan=Span(3, 6),
        )
        assert not not_small.small()
Ejemplo n.º 5
0
    def test_match_exact_from_string_twice_with_repeated_text(self):
        # the rule text has eight tokens, at positions 0 through 7
        rule_text = u'licensed under the GPL, licensed under the GPL'
        rule = models.Rule(licenses=['tst'], _text=rule_text)
        license_index = index.LicenseIndex([rule])

        # the query is the rule text wrapped between two extra words
        query_text = u'Hi licensed under the GPL, licensed under the GPL yes.'
        matched_text = 'licensed under the GPL licensed under the GPL'

        first_matches = license_index.match(query_string=query_text)
        assert 1 == len(first_matches)
        first = first_matches[0]
        first_qtext, first_itext = get_texts(
            first, query_string=query_text, idx=license_index)
        assert matched_text == first_qtext
        assert matched_text == first_itext

        assert Span(0, 7) == first.qspan
        assert Span(0, 7) == first.ispan

        # matching a second time with the same index must give the same
        # results: the first run must not leave state behind
        second_matches = license_index.match(query_string=query_text)
        assert 1 == len(second_matches)
        second = second_matches[0]
        assert Span(0, 7) == second.qspan
        assert Span(0, 7) == second.ispan

        second_qtext, second_itext = get_texts(
            second, query_string=query_text, idx=license_index)
        assert matched_text == second_qtext
        assert matched_text == second_itext
Ejemplo n.º 6
0
 def test_index_fails_on_duplicated_rules(self):
     # Building an index from a directory of rules that contains duplicates
     # must raise an AssertionError whose message mentions 'Duplicate rules'.
     rule_dir = self.get_test_loc('index/no_duplicated_rule')
     try:
         index.LicenseIndex(models.load_rules(rule_dir))
     except AssertionError as e:
         assert u'Duplicate rules' in str(e)
     else:
         # Fail from the else clause and not from the try body: self.fail()
         # raises an AssertionError itself, which the except clause above
         # would catch and turn into a misleading message-mismatch failure.
         self.fail('Exception on dupes not raised')
Ejemplo n.º 7
0
    def test_match_license_performance_profiling_on_index_with_single_license(
            self):
        # Time 100 runs of approximate matching of one query against an index
        # built from the rules of a test directory, then report the duration.
        from time import time
        from licensedcode import query

        # pre-index : we are profiling only the detection, not the indexing
        rule_dir = self.get_test_loc('perf/idx/rules')
        rules = models.load_rules(rule_dir)
        idx = index.LicenseIndex(rules)
        location = self.get_test_loc('perf/idx/query.txt')
        # use a context manager so that the file handle is always closed
        with open(location, 'rb') as query_file:
            querys = query_file.read()

        # the query is built once, outside of the timed loop
        qry = query.build_query(query_string=querys, idx=idx)

        def mini_seq_match(idx):
            # consume the whole iterable so that all the matching work runs
            list(idx.get_approximate_matches(qry, [], []))

        start = time()
        for _ in range(100):
            mini_seq_match(idx)
        duration = time() - start
        values = ('ScanCode diff:', duration)
        print(*values)
        # NOTE(review): this unconditional raise makes the test always fail;
        # presumably this is a deliberate way to surface the timing in the
        # test report -- confirm before enabling this test in a regular run.
        raise Exception(values)
Ejemplo n.º 8
0
    def test_query_run_has_correct_offset(self):
        # With a line threshold of 4 the test document is expected to be
        # broken in two query runs: a one-token run at position 0 followed
        # by a run spanning positions 1 to 121.
        rules_location = self.get_test_loc('query/runs/rules')
        license_index = index.LicenseIndex(list(models.load_rules(rules_location)))
        query_location = self.get_test_loc('query/runs/query.txt')
        query = Query(location=query_location, idx=license_index, line_threshold=4)

        actual_runs = [run.to_dict() for run in query.query_runs]

        first_run = {u'start': 0, u'end': 0, u'tokens': u'inc'}
        second_run = {
            u'start': 1,
            u'end': 121,
            u'tokens': (
                u'this library is free software you can redistribute it and or modify '
                u'it under the terms of the gnu library general public license as '
                u'published by the free software foundation either version 2 of the '
                u'license or at your option any later version this library is '
                u'distributed in the hope that it will be useful but without any '
                u'warranty without even the implied warranty of merchantability or '
                u'fitness for particular purpose see the gnu library general public '
                u'license for more details you should have received copy of the gnu '
                u'library general public license along with this library see the file '
                u'copying lib if not write to the free software foundation inc 51 '
                u'franklin street fifth floor boston ma 02110 1301 usa'
            ),
        }

        assert actual_runs == [first_run, second_run]
Ejemplo n.º 9
0
    def test_LicenseMatch_small(self):
        # Check that LicenseMatch.small() is true for several span
        # combinations, against a short rule and a ten times longer rule.
        base_text = u'licensed under the GPL, licensed under the GPL'
        short_rule = Rule(text_file='r1', licenses=['apache-1.1'], _text=base_text)
        long_rule = Rule(text_file='r1', licenses=['apache-1.1'], _text=base_text * 10)
        # NOTE(review): the index object is not used afterwards; presumably
        # building it prepares the rules for matching -- confirm.
        _idx = index.LicenseIndex([short_rule, long_rule])

        # keyword arguments of matches that are all expected to be small
        small_cases = (
            dict(rule=short_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12)),
            dict(rule=short_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(11, 12)),
            dict(rule=short_rule, qspan=Span(10, 11, 12), ispan=Span(10, 11, 12), hispan=Span(11, 12)),
            dict(rule=short_rule, qspan=Span(1, 6), ispan=Span(1, 6)),
            dict(rule=long_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12)),
            dict(rule=long_rule, qspan=Span(5, 10), ispan=Span(5, 10), hispan=Span(5, 6)),
            dict(rule=long_rule, qspan=Span(1, 6), ispan=Span(1, 6)),
        )
        for match_kwargs in small_cases:
            assert LicenseMatch(**match_kwargs).small()
Ejemplo n.º 10
0
    def test_query_runs_with_plain_rule(self):
        # Index a single rule, build a query from a test file and check that
        # the query has one run of 214 tokens at positions 0 through 213.
        x11_text = u'''X11 License
            Copyright (C) 1996 X Consortium Permission is hereby granted, free
            of charge, to any person obtaining a copy of this software and
            associated documentation files (the "Software"), to deal in the
            Software without restriction, including without limitation the
            rights to use, copy, modify, merge, publish, distribute, sublicense,
            and/or sell copies of the Software, and to permit persons to whom
            the Software is furnished to do so, subject to the following
            conditions: The above copyright notice and this permission notice
            shall be included in all copies or substantial portions of the
            Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
            KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
            WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
            NONINFRINGEMENT. IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR
            ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
            CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
            WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
            Except as contained in this notice, the name of the X Consortium
            shall not be used in advertising or otherwise to promote the sale,
            use or other dealings in this Software without prior written
            authorization from the X Consortium. X Window System is a trademark
            of X Consortium, Inc.
        '''
        x11_rule = Rule(stored_text=x11_text, license_expression='x-consortium')
        license_index = index.LicenseIndex([x11_rule])

        location = self.get_test_loc(
            'detect/simple_detection/x11-xconsortium_text.txt')
        query = Query(location=location, idx=license_index)
        actual_runs = [run.to_dict(brief=False) for run in query.query_runs]

        expected_tokens = (
            u'x11 license copyright c 1996 x consortium permission is hereby '
            u'granted free of charge to any person obtaining copy of this '
            u'software and associated documentation files the software to deal in '
            u'the software without restriction including without limitation the '
            u'rights to use copy modify merge publish distribute sublicense and or '
            u'sell copies of the software and to permit persons to whom the '
            u'software is furnished to do so subject to the following conditions '
            u'the above copyright notice and this permission notice shall be '
            u'included in all copies or substantial portions of the software the '
            u'software is provided as is without warranty of any kind express or '
            u'implied including but not limited to the warranties of '
            u'merchantability fitness for particular purpose and noninfringement '
            u'in no event shall the x consortium be liable for any claim damages or '
            u'other liability whether in an action of contract tort or otherwise '
            u'arising from out of or in connection with the software or the use or '
            u'other dealings in the software except as contained in this notice the '
            u'name of the x consortium shall not be used in advertising or '
            u'otherwise to promote the sale use or other dealings in this software '
            u'without prior written authorization from the x consortium x window '
            u'system is trademark of x consortium inc'
        )
        expected_runs = [{'start': 0, 'end': 213, 'tokens': expected_tokens}]

        assert len(query.query_runs[0].tokens) == 214
        assert actual_runs == expected_runs
Ejemplo n.º 11
0
    def test_Query_from_real_index_and_location(self):
        # Build a query from a test file with a line threshold of 4 and check
        # both its query runs and the line number recorded for each position.
        license_index = index.LicenseIndex(self.get_test_rules('index/bsd'))
        location = self.get_test_loc('index/querytokens')

        query = Query(location=location, idx=license_index, line_threshold=4)
        actual_runs = [run.to_dict() for run in query.query_runs]

        main_run = {
            'start': 0,
            'end': 35,
            'tokens': (
                u'redistribution and use in source and binary forms '
                u'redistributions of source code must the this that is not '
                u'to redistributions in binary form must this software is '
                u'provided by the copyright holders and contributors as is'
            ),
        }
        trailing_run = {'start': 36, 'end': 36, 'tokens': u'redistributions'}
        assert actual_runs == [main_run, trailing_run]

        # one line number per token position, grouped here by line
        expected_lines = [
            4, 4, 4, 4, 4, 4, 4, 4,
            6, 6, 6, 6, 6,
            7, 7, 7, 7, 7,
            8,
            9, 9, 9, 9, 9,
            11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
            15,
        ]
        assert query.line_by_pos == expected_lines
Ejemplo n.º 12
0
    def test_match_with_templates_with_redundant_tokens_yield_single_exact_match(self):
        # the rule has ten tokens at positions 0 to 9: the same five-word
        # phrase on each side of the {{}} template marker
        template_text = u'copyright reserved mit is license, {{}} copyright reserved mit is license'
        rule = models.Rule(licenses=['tst'], _text=template_text)
        license_index = index.LicenseIndex([rule])

        expected_index = {'_tst_73_0': {u'copyright': [0, 5], u'license': [4, 9], u'mit': [2, 7]}}
        assert expected_index == license_index.to_dict()

        # the query has fifteen words, at positions 0 to 14
        query_text = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.'
        query = Query(query_string=query_text, idx=license_index)

        def as_strings(token_ids):
            # map token ids back to their token strings, keeping None as is
            tokens_by_tid = license_index.tokens_by_tid
            return [None if tid is None else tokens_by_tid[tid] for tid in token_ids]

        phrase = [u'copyright', u'reserved', u'mit', u'is', u'license']
        expected_tokens = [None, None] + phrase + [u'is', None] + phrase + [None]
        assert expected_tokens == as_strings(query.tokens_with_unknowns())

        matches = license_index.match(query_string=query_text)
        assert 1 == len(matches)

        only_match = matches[0]
        expected_qspan = Span(0, 4) | Span(6, 10)
        assert expected_qspan == only_match.qspan
        assert Span(0, 9) == only_match.ispan
        assert 100 == only_match.coverage()

        qtext, itext = get_texts(only_match, query_string=query_text, idx=license_index)
        assert 'copyright reserved mit is license <is> [the] copyright reserved mit is license' == qtext
        assert 'copyright reserved mit is license copyright reserved mit is license' == itext
Ejemplo n.º 13
0
    def test_get_full_matched_text_base(self):
        # Match a query against one rule that has {{...}} template parts and
        # check the full matched text with its default highlighting.
        template_text = u'''
            Copyright {{some copyright}}
            THIS IS FROM {{THE CODEHAUS}} AND CONTRIBUTORS
            IN NO EVENT SHALL {{THE CODEHAUS}} OR ITS CONTRIBUTORS BE LIABLE
            EVEN IF ADVISED OF THE {{POSSIBILITY OF SUCH}} DAMAGE
        '''
        template_rule = Rule(stored_text=template_text, license_expression='test')
        license_index = index.LicenseIndex([template_rule])

        query_text = u'''
            foobar 45 . Copyright 2003 (C) James. All Rights Reserved.
            THIS IS FROM THE CODEHAUS AND CONTRIBUTORS
            IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE
            EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC dasdasda .
        '''
        matches = license_index.match(query_string=query_text)
        assert 1 == len(matches)
        only_match = matches[0]

        expected_text = u"""Copyright [2003] ([C]) [James]. [All] [Rights] [Reserved].
            THIS IS FROM THE CODEHAUS AND CONTRIBUTORS
            IN NO EVENT SHALL THE [best] CODEHAUS OR ITS CONTRIBUTORS BE LIABLE
            EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."""
        text_parts = get_full_matched_text(
            only_match, query_string=query_text, idx=license_index)
        assert expected_text == u''.join(text_parts)
Ejemplo n.º 14
0
    def test_Query_tokenize_from_string(self):
        # Tokenize a query string in test mode, build its runs explicitly and
        # check the tokens of the query and of its single query run.
        bsd_text = 'Redistribution and use in source and binary forms with or without modification are permitted'
        bsd_rule = Rule(stored_text=bsd_text, license_expression='bsd')
        license_index = index.LicenseIndex([bsd_rule])

        query_text = '''
            The
            Redistribution and use in source and binary are permitted.

            Athena capital of Grece
            Paris and Athene
            Always'''

        query = Query(query_string=query_text, idx=license_index, _test_mode=True)
        line_tokens = list(query.tokens_by_line(query_string=query_text))
        query.tokenize_and_build_runs(line_tokens)

        # expected token strings, built from their common leading words
        leading_words = 'redistribution and use in source and binary are permitted'.split()
        known_tokens = leading_words + ['and']
        run_tokens_and_unknowns = [None] + leading_words + [None] * 5 + ['and']
        query_tokens_and_unknowns = run_tokens_and_unknowns + [None, None]

        assert tks_as_str(query.tokens, idx=license_index) == known_tokens

        actual = tks_as_str(query_tokens_with_unknowns(query), idx=license_index)
        assert actual == query_tokens_and_unknowns

        assert len(query.query_runs) == 1
        only_run = query.query_runs[0]
        assert only_run.start == 0
        assert only_run.end == 9
        assert len(only_run) == 10

        assert tks_as_str(only_run.tokens, idx=license_index) == known_tokens

        actual = tks_as_str(query_run_tokens_with_unknowns(only_run), idx=license_index)
        assert actual == run_tokens_and_unknowns
Ejemplo n.º 15
0
    def test_QueryRun_with_all_digit_lines(self):
        # Query a text made only of numbers against a rule that mixes words
        # and numbers: query runs are built but none of them is matchable.
        digits_rule = Rule(stored_text='''
            redistributions 0 1 2 3 4 1568 5 6 7 368 8 9 10 80 12213 232312 in
            binary 345 in 256
            free 1953
             software 406
             foundation 1151
            free 429
             software 634
             foundation 1955
            free 724
             software 932
             foundation 234
             software 694
             foundation 110
        ''')

        license_index = index.LicenseIndex(
            [digits_rule],
            _legalese={'binary', 'redistributions', 'foundation'},
        )

        digits_text = '''
              25  17   1   -80.00000      .25000    37.00000      .25000
            0: 5107 -2502 -700 496 -656 468 -587 418 -481 347 -325 256 -111 152 166 50
            493 -37 854 -96 1221 -118 1568 -125 1953 -143 2433 -195 2464 -281 2529 -395
            1987 -729 447 -916 -3011 -1181 -5559 -406 -6094 541 -5714 1110 -5247 1289
            -4993 1254 -4960 1151
            1: 4757 -1695 -644 429 -627 411 -602 368 -555 299 -470 206 -328 96 -125 -15
            126 -105 391 -146 634 -120 762 -58 911 -13 1583 -8 1049 -28 1451 123 1377 -464
            907 -603 -4056 -1955 -6769 -485 -5797 929 -4254 1413 -3251 1295 -2871 993
            -2899 724
            2: 4413 -932 -563 355 -566 354 -582 322 -597 258 -579 164 -499 45 -341 -84
            -127 -192 93 -234 288 -157 190 -25 -145 65 1065 74 -1087 -40 -877 1058 -994 18
            1208 694 -5540 -3840 -7658 -332 -4130 1732 -1668 1786 -634 1127 -525 501
            -856 110
        '''

        query = Query(query_string=digits_text, idx=license_index)
        actual_runs = [run.to_dict() for run in query.query_runs]

        # FIXME: we should not even have a query run for things that are all digits
        expected_runs = [
            {'start': start, 'end': end, 'tokens': tokens}
            for start, end, tokens in (
                (0, 5, '1 80 0 256 1568 1953'),
                (6, 12, '406 1151 1 429 368 634 8'),
                (13, 17, '1955 724 2 932 234'),
            )
        ]
        assert actual_runs == expected_runs

        assert not any(run.is_matchable() for run in query.query_runs)
    def test_match_seq_are_correct_on_apache(self):
        # Match a query file against the rules of a test directory and check
        # that the single match comes from the sequence matcher and that its
        # matched query text is as expected, ignoring whitespace differences.
        rules_location = self.get_test_loc('match_seq/rules')

        extra_legalese = {
            'redistributions', 'written', 'registered', 'derived',
            'damage', 'due', 'alternately', 'nor',
        }
        license_index = index.LicenseIndex(
            load_rules(rules_location),
            _legalese=mini_legalese | extra_legalese,
        )

        query_location = self.get_test_loc('match_seq/query')
        matches = license_index.match(location=query_location)
        assert len(matches) == 1
        only_match = matches[0]
        assert only_match.matcher == match_seq.MATCH_SEQ

        matched_query_text, _matched_rule_text = get_texts(only_match)
        expected_text = u'''
            The OpenSymphony Group. All rights reserved.

            Redistribution and use in source and binary forms, with or without modification,
            are permitted provided that the following conditions are met:

            1. Redistributions of source code must retain the above copyright notice, this
            list of conditions and the following disclaimer.

            2. Redistributions in binary form must reproduce the above copyright notice,
            this list of conditions and the following disclaimer in the documentation and/or
            other materials provided with the distribution.

            3. The end-user documentation included with the redistribution, if any, must
            include the following acknowledgment:

            [4]. "[This] [product] [includes] [software] [developed] [by] [the] [OpenSymphony] [Group]
            ([http]://[www].[opensymphony].[com]/)."

            [5]. Alternately, this acknowledgment may appear in the software itself, if and
            wherever such third-party acknowledgments normally appear.

            The names "OpenSymphony" and "The OpenSymphony Group" must not be used to
            endorse or promote products derived from this software without prior written
            permission. For written permission, please contact [email protected] .

            Products derived from this software may not be called "OpenSymphony" or
            "[OsCore]", nor may "OpenSymphony" or "[OsCore]" appear in their name, without prior
            written permission of the OpenSymphony Group.

            THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
            INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
            FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE APACHE
            SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
            INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
            LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
            PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
            LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
            OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
            ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        '''
        # compare word by word so that line breaks and indentation do not matter
        assert matched_query_text.split() == expected_text.split()
Ejemplo n.º 17
0
 def test_match_to_threshold_words_has_hundred_score(self):
     # A rule and a query made of the same word repeated 18 times must give
     # exactly one match, with a score of 100.
     word_count = 18
     lgpl_rule = Rule(
         stored_text=' LGPL ' * word_count,
         license_expression='lgpl-2.0',
     )
     license_index = index.LicenseIndex([lgpl_rule])

     found = license_index.match(query_string=' LGPL ' * word_count)
     assert 1 == len(found)
     assert 100.0 == found[0].score()
Ejemplo n.º 18
0
    def test_Thresholds(self):
        # The thresholds of a short rule and of a ten times repeated rule are
        # expected to be the same whether or not the short rule text contains
        # a {{}} template marker.
        short_texts = (
            'licensed under the GPL, licensed under the GPL',
            'licensed under the GPL,{{}} licensed under the GPL',
        )
        long_text = 'licensed under the GPL, licensed under the GPL' * 10

        for short_text in short_texts:
            short_rule = models.Rule(text_file='r1', license_expression='apache-1.1', stored_text=short_text)
            long_rule = models.Rule(text_file='r1', license_expression='apache-1.1', stored_text=long_text)

            # NOTE(review): the index object is not used afterwards;
            # presumably building it prepares the rules -- confirm.
            _idx = index.LicenseIndex([short_rule, long_rule])

            expected_short = models.Thresholds(high_len=4, low_len=4, length=8, small=True, min_high=4, min_len=8)
            assert expected_short == short_rule.thresholds()

            expected_long = models.Thresholds(high_len=31, low_len=40, length=71, small=False, min_high=3, min_len=4)
            assert expected_long == long_rule.thresholds()
Ejemplo n.º 19
0
    def test_match_does_not_return_matches_for_empty_query(self):
        # Neither an empty string nor None as a query string yields matches.
        rule = Rule(_text='A one. A two. license A three.')
        license_index = index.LicenseIndex([rule])

        for empty_query in ('', None):
            found = license_index.match(query_string=empty_query)
            assert [] == found
Ejemplo n.º 20
0
    def test_match_return_correct_positions_with_short_index_and_queries(self):
        # Match several short queries against a two-word rule and check the
        # matched texts and the query and rule positions of each match.
        mit_rule = Rule(stored_text='MIT License', license_expression='mit')
        idx = index.LicenseIndex([mit_rule])
        assert {'_tst_11_0': {'license': [1]}} == idx.to_dict(True)

        def check_match(match, query_string, expected_qtext, expected_qspan):
            # every match of this test covers the whole rule: only the query
            # side text and span vary from one match to the next
            qtext, itext = get_texts(match, query_string=query_string, idx=idx)
            assert expected_qtext == qtext
            assert 'mit license' == itext
            assert expected_qspan == match.qspan
            assert Span(0, 1) == match.ispan

        exact_query = 'MIT License'
        matches = idx.match(query_string=exact_query)
        assert 1 == len(matches)
        check_match(matches[0], exact_query, 'MIT License', Span(0, 1))

        repeated_word_query = 'MIT MIT License'
        matches = idx.match(query_string=repeated_word_query)
        assert 1 == len(matches)
        check_match(matches[0], repeated_word_query, 'MIT License', Span(1, 2))

        # the rule text occurs twice in each of these queries, once on a
        # single line and once spread over several lines
        single_line_query = 'do you think I am a mit license MIT License, yes, I think so'
        multi_line_query = '''do you think I am a mit license
                        MIT License
                        yes, I think so'''
        for two_hits_query in (single_line_query, multi_line_query):
            matches = idx.match(query_string=two_hits_query)
            assert 2 == len(matches)
            check_match(matches[0], two_hits_query, 'mit license', Span(0, 1))
            check_match(matches[1], two_hits_query, 'MIT License', Span(2, 3))
Ejemplo n.º 21
0
    def test_match_license_performance_profiling_on_limited_index(self):
        # the index is built up front: only the detection is profiled, not
        # the indexing
        rules_location = self.get_test_loc('detect/rule_template/rules')
        license_index = index.LicenseIndex(models.load_rules(rules_location))
        query_locations = [self.get_test_loc('detect/rule_template/query.txt')]

        self.profile_match(
            license_index,
            query_locations,
            'license_match_limited_index_profile_log.txt',
        )
Ejemplo n.º 22
0
    def test_get_full_matched_text(self):
        # Match a query against one rule that has {{...}} template parts and
        # check the full matched text in three ways: default highlighting,
        # custom highlighting template and whole lines.
        template_text = u'''
            Copyright {{some copyright}}
            THIS IS FROM {{THE CODEHAUS}} AND CONTRIBUTORS
            IN NO EVENT SHALL {{THE CODEHAUS}} OR ITS CONTRIBUTORS BE LIABLE
            EVEN IF ADVISED OF THE {{POSSIBILITY OF SUCH}} DAMAGE
        '''
        template_rule = Rule(_text=template_text, licenses=['test'])
        license_index = index.LicenseIndex([template_rule])

        query_text = u'''
            foobar 45 Copyright 2003 (C) James. All Rights Reserved.
            THIS IS FROM THE CODEHAUS AND CONTRIBUTORS
            IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE
            EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC
        '''
        matches = license_index.match(query_string=query_text)
        assert 1 == len(matches)
        only_match = matches[0]

        # default highlighting of the query words that are not matched
        expected_default = u"""Copyright [2003] ([C]) [James]. [All] [Rights] [Reserved].
            THIS IS FROM [THE] [CODEHAUS] AND CONTRIBUTORS
            IN NO EVENT SHALL [THE] [best] [CODEHAUS] OR ITS CONTRIBUTORS BE LIABLE
            EVEN IF ADVISED OF THE [POSSIBILITY] [OF] [SUCH] DAMAGE"""
        default_parts = get_full_matched_text(
            only_match, query_string=query_text, idx=license_index)
        assert expected_default == u''.join(default_parts)

        # same check with a custom highlighting template
        expected_templated = u"""Copyright <br>2003</br> (<br>C</br>) <br>James</br>. <br>All</br> <br>Rights</br> <br>Reserved</br>.
            THIS IS FROM <br>THE</br> <br>CODEHAUS</br> AND CONTRIBUTORS
            IN NO EVENT SHALL <br>THE</br> <br>best</br> <br>CODEHAUS</br> OR ITS CONTRIBUTORS BE LIABLE
            EVEN IF ADVISED OF THE <br>POSSIBILITY</br> <br>OF</br> <br>SUCH</br> DAMAGE"""
        templated_parts = get_full_matched_text(
            only_match,
            query_string=query_text,
            idx=license_index,
            highlight_not_matched=u'<br>%s</br>',
        )
        assert expected_templated == u''.join(templated_parts)

        # same check asking for whole lines, with a pass-through template
        expected_whole_lines = u"""            foobar 45 Copyright 2003 (C) James. All Rights Reserved.
            THIS IS FROM THE CODEHAUS AND CONTRIBUTORS
            IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE
            EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC\n"""
        whole_lines_parts = get_full_matched_text(
            only_match,
            query_string=query_text,
            idx=license_index,
            highlight_not_matched=u'%s',
            whole_lines=True,
        )
        assert expected_whole_lines == u''.join(whole_lines_parts)
Ejemplo n.º 23
0
    def test_match_multiple(self):
        """
        Match the 'index/querysimple' file against an index of the 'index/bsd'
        test rules and expect a single match spanning positions 0 through 209
        on both the query side and the rule side.
        """
        bsd_index = index.LicenseIndex(self.get_test_rules('index/bsd'))
        query_location = self.get_test_loc('index/querysimple')

        matches = bsd_index.match(location=query_location)
        assert 1 == len(matches)

        only_match = matches[0]
        # Query and index spans are expected to be the same range.
        for actual_span in (only_match.qspan, only_match.ispan):
            assert Span(0, 209) == actual_span
 def test_QueryRun(self):
     """
     Build a Query whose text is identical to the single indexed rule and
     check that its only query run maps back to the original word sequence.
     """
     text = 'redistributions in binary form must redistributions in'
     idx = index.LicenseIndex([Rule(_text=text)])

     query_runs = Query(query_string=text, idx=idx).query_runs
     assert 1 == len(query_runs)
     first_run = query_runs[0]

     # Translate the run's token ids back to token strings through the index.
     tokens_by_tid = idx.tokens_by_tid
     token_strings = [tokens_by_tid[token_id] for token_id in first_run.tokens]
     assert [
         'redistributions', 'in', 'binary', 'form', 'must',
         'redistributions', 'in',
     ] == token_strings
Ejemplo n.º 25
0
    def test_match_can_match_with_index_built_from_rule_directory_with_sun_bcls(self):
        """
        Match the rule_template query against an index loaded from the
        rule_template rules directory and expect one sequence-matcher match.
        """
        rules_location = self.get_test_loc('detect/rule_template/rules')
        rules_index = index.LicenseIndex(load_rules(rules_location))

        # at line 151 the query has an extra "Software" word inserted to avoid hash matching
        query_location = self.get_test_loc('detect/rule_template/query.txt')
        found = rules_index.match(location=query_location)
        assert 1 == len(found)

        only_match = found[0]
        # The expected query span skips position 958 -- presumably the
        # inserted extra word mentioned above.
        expected_qspan = Span(0, 957) | Span(959, 1756)
        assert expected_qspan == only_match.qspan
        assert match_seq.MATCH_SEQ == only_match.matcher
    def test_QueryRun_repr(self):
        """
        Check both textual representations of a QueryRun: the plain ``repr``
        and the ``trace_repr=True`` variant that also carries the token text.
        """
        text = 'redistributions in binary form must redistributions in'
        idx = index.LicenseIndex([Rule(_text=text)])
        query_run = Query(query_string=text, idx=idx).query_runs[0]

        # Plain form: start, length and line range only.
        plain_repr = 'QueryRun(start=0, len=7, start_line=1, end_line=1)'
        assert plain_repr == repr(query_run)

        # Trace form: same fields plus the tokens of the run as text.
        trace_repr = 'QueryRun(start=0, len=7, start_line=1, end_line=1, tokens="redistributions in binary form must redistributions in")'
        assert trace_repr == query_run.__repr__(trace_repr=True)
Ejemplo n.º 27
0
 def test_index_rules_with_key_phrases_and_without_are_duplicates(self):
     """
     Check that building a LicenseIndex from the 'duplicate-key-phrases' test
     rules and licenses raises ``index.DuplicateRuleError`` with a message
     starting with 'Duplicate rules'.

     If the index builds without error, the indexed rules are printed as a
     debugging aid and the test fails with an explicit exception.
     """
     rules_dir = self.get_test_loc('index/duplicate-key-phrases/rules')
     lics_dir = self.get_test_loc('index/duplicate-key-phrases/licenses')
     rules = models.get_rules(licenses_data_dir=lics_dir, rules_data_dir=rules_dir)

     # Only the index construction is guarded: it is the one statement
     # expected to raise DuplicateRuleError. Keeping the diagnostics and the
     # failure signal out of the try body means an error raised there can
     # never be mistaken for the expected one.
     try:
         idx = index.LicenseIndex(rules)
     except index.DuplicateRuleError as e:
         assert str(e).startswith('Duplicate rules')
     else:
         # No duplicate was detected: dump each rule id with its token text
         # to help diagnose why, then fail.
         for rid, tids in enumerate(idx.tids_by_rid):
             print(idx.rules_by_rid[rid].rid, repr(" ".join(idx.tokens_by_tid[t] for t in tids)))
         raise Exception("Exception not raised for duplicated rules")
Ejemplo n.º 28
0
    def test_match_can_match_with_plain_rule_simple2(self):
        """
        Index a single plain-text X11 / X Consortium rule, match the
        x11-xconsortium test file against it, and check that the one match
        yields the expected sequence of query words.
        """
        x11_rule_text = u'''X11 License
        Copyright (C) 1996 X Consortium
        Permission is hereby granted, free of charge, to any person obtaining a copy
        of this software and associated documentation files (the "Software"), to deal
        in the Software without restriction, including without limitation the rights
        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
        copies of the Software, and to permit persons to whom the Software is
        furnished to do so, subject to the following conditions: The above copyright
        notice and this permission notice shall be included in all copies or
        substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS",
        WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
        TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
        NONINFRINGEMENT. IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR ANY CLAIM,
        DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
        OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
        OR OTHER DEALINGS IN THE SOFTWARE. Except as contained in this notice, the
        name of the X Consortium shall not be used in advertising or otherwise to
        promote the sale, use or other dealings in this Software without prior
        written authorization from the X Consortium. X Window System is a trademark
        of X Consortium, Inc.
        '''
        x11_index = index.LicenseIndex(
            [Rule(_text=x11_rule_text, licenses=['x-consortium'])])

        query_location = self.get_test_loc(
            'detect/simple_detection/x11-xconsortium_text.txt')
        found = x11_index.match(location=query_location)
        assert 1 == len(found)

        # Both sides are compared as word lists, so line wrapping and
        # indentation of the expected text below are irrelevant.
        expected_words = u'''
        X11 License Copyright C 1996 X Consortium Permission is hereby granted free
        of charge to any person obtaining a copy of this software and associated
        documentation files the Software to deal in the Software without restriction
        including without limitation the rights to use copy modify merge publish
        distribute sublicense and or sell copies of the Software and to permit
        persons to whom the Software is furnished to do so subject to the following
        conditions The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software THE SOFTWARE
        IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND EXPRESS OR IMPLIED INCLUDING
        BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY FITNESS FOR A PARTICULAR
        PURPOSE AND NONINFRINGEMENT IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR
        ANY CLAIM DAMAGES OR OTHER LIABILITY WHETHER IN AN ACTION OF CONTRACT TORT OR
        OTHERWISE ARISING FROM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
        OR OTHER DEALINGS IN THE SOFTWARE Except as contained in this notice the name
        of the X Consortium shall not be used in advertising or otherwise to promote
        the sale use or other dealings in this Software without prior written
        authorization from the X Consortium X Window System is a trademark of X
        Consortium Inc
        '''.split()

        # Only the query-side text is checked; the rule-side text is ignored.
        matched_query_text, _ignored_rule_text = get_texts(
            found[0], location=query_location, idx=x11_index)
        assert expected_words == matched_query_text.split()
Ejemplo n.º 29
0
    def test_filter_matches_handles_interlaced_matches_with_overlap_and_same_license(self):
        """
        Match the 'match_filter/query' file against the 'match_filter/rules'
        index and expect only the rule2.RULE match to be returned.
        """
        rules_location = self.get_test_loc('match_filter/rules')
        idx = index.LicenseIndex(load_rules(rules_location))
        # Look rules up by identifier to build the expected match below.
        rules_by_identifier = {rule.identifier: rule for rule in idx.rules_by_rid}

        query_location = self.get_test_loc('match_filter/query')
        matches = idx.match(location=query_location)

        # Noted as filtered out and therefore not expected in the results:
        #   LicenseMatch(matcher='3-seq', rule=rules['rule1.RULE'],
        #                qspan=Span(4, 47) | Span(50, 59), ispan=Span(1, 53))
        surviving_match = LicenseMatch(
            matcher='2-aho',
            rule=rules_by_identifier['rule2.RULE'],
            qspan=Span(24, 86),
            ispan=Span(0, 62),
        )

        assert [surviving_match] == matches
Ejemplo n.º 30
0
    def test_match_matches_correctly_simple_exact_query_1(self):
        """
        Index a single 'mit' rule built from 'detect/mit/mit.c', match
        'detect/mit/mit2.c' against it, and expect exactly one match for that
        rule spanning positions 0 through 86 on both the query and rule side.
        """
        tf1 = self.get_test_loc('detect/mit/mit.c')
        ftr = Rule(text_file=tf1, licenses=['mit'])
        idx = index.LicenseIndex([ftr])

        query_doc = self.get_test_loc('detect/mit/mit2.c')
        matches = idx.match(query_doc)
        assert 1 == len(matches)
        match = matches[0]
        assert ftr == match.rule
        assert Span(0, 86) == match.qspan
        assert Span(0, 86) == match.ispan