Example #1
0
    def test_LicenseMatch_score_100_non_contiguous(self):
        """Check the score of a match whose query span is made of two runs."""
        rule = Rule(text_file='r1', license_expression='apache-2.0')
        rule.relevance = 100
        rule.length = 42

        # The query side is the union of two separate runs; the index side
        # is one single run.
        gapped_qspan = Span(0, 19) | Span(30, 51)
        match = LicenseMatch(rule=rule, qspan=gapped_qspan, ispan=Span(0, 41))
        assert match.score() == 80.77
Example #2
0
    def test_merge_contiguous_touching_matches_in_sequence(self):
        """Merge two matches of one rule where the second starts right after
        the first ends, and compare the first merged match."""
        rule = Rule(stored_text='r1', license_expression='apache-2.0 OR gpl')
        head = LicenseMatch(rule=rule, qspan=Span(0, 2), ispan=Span(0, 2))
        tail = LicenseMatch(rule=rule, qspan=Span(3, 6), ispan=Span(3, 6))

        merged = merge_matches([head, tail])
        first_merged = merged[0]
        # Only the first returned match is checked.
        expected = LicenseMatch(rule=rule, qspan=Span(0, 6), ispan=Span(0, 6))
        assert expected == first_merged
Example #3
0
    def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_sequence_with_gaps_for_long_match(self):
        """Merge two in-sequence matches of a rule of length 20 whose spans
        are separated by a gap, and compare with one gapped match."""
        rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        rule.length = 20
        before_gap = LicenseMatch(rule=rule, qspan=Span(1, 10), ispan=Span(1, 10))
        after_gap = LicenseMatch(rule=rule, qspan=Span(14, 20), ispan=Span(14, 20))

        # The expected match has spans built as the union of both inputs.
        combined = LicenseMatch(
            rule=rule,
            qspan=Span(1, 10) | Span(14, 20),
            ispan=Span(1, 10) | Span(14, 20),
        )
        expected = [combined]
        results = merge_matches([before_gap, after_gap])
        assert expected == results
Example #4
0
    def test_merge_contiguous_contained_matches(self):
        """Merge three back-to-back matches of one rule and expect a single
        match spanning 0 to 8."""
        rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
        bounds = ((0, 2), (3, 6), (7, 8))
        pieces = [
            LicenseMatch(rule=rule, qspan=Span(lo, hi), ispan=Span(lo, hi))
            for lo, hi in bounds
        ]

        result = merge_matches(pieces)
        expected = [LicenseMatch(rule=rule, qspan=Span(0, 8), ispan=Span(0, 8))]
        assert expected == result
Example #5
0
    def test_merge_does_merge_overlapping_matches_of_same_rules_if_in_sequence(
            self):
        """Merge two overlapping matches of one rule and expect one match
        spanning 0 to 6."""
        rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')

        left = LicenseMatch(rule=rule, qspan=Span(0, 5), ispan=Span(0, 5))
        right = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))

        expected = [LicenseMatch(rule=rule, qspan=Span(0, 6), ispan=Span(0, 6))]
        merged = merge_matches([left, right])
        assert expected == merged
Example #6
0
    def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_not_sequence(
            self):
        """Merge two matches that share an ispan but have distant qspans and
        expect both to come back unchanged."""
        rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')

        # Same index positions (1-3) at two different query positions.
        early = LicenseMatch(rule=rule, qspan=Span(1, 3), ispan=Span(1, 3))
        late = LicenseMatch(rule=rule, qspan=Span(14, 20), ispan=Span(1, 3))

        merged = merge_matches([early, late])
        untouched = sorted([early, late])
        assert untouched == sorted(merged)
Example #7
0
    def test_combine_matches_with_same_rules(self):
        """Combine two overlapping matches of one rule and check both spans
        of the combined match."""
        rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')

        left = LicenseMatch(rule=rule, qspan=Span(0, 5), ispan=Span(0, 5))
        right = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))

        combined = left.combine(right)
        # Query and index spans are both expected to cover 0 to 6.
        expected_span = Span(0, 6)
        assert expected_span == combined.qspan
        assert expected_span == combined.ispan
Example #8
0
    def test_merge_does_merge_non_contiguous_matches_in_sequence(self):
        """Merge two gapped matches plus a third that bridges the gap and
        expect a single match spanning 0 to 6."""
        rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
        # The last pair (1, 6) overlaps both of the first two.
        bounds = ((0, 2), (4, 6), (1, 6))
        candidates = [
            LicenseMatch(rule=rule, qspan=Span(lo, hi), ispan=Span(lo, hi))
            for lo, hi in bounds
        ]

        results = merge_matches(candidates)
        expected = [LicenseMatch(rule=rule, qspan=Span(0, 6), ispan=Span(0, 6))]
        assert expected == results
 def test_match_does_not_return_incorrect_matches(self):
     """Match several unrelated query strings against a one-rule index and
     expect no match for any of them."""
     rule_file = self.create_test_file('A one. A two. A three.')
     idx = detect.LicenseIndex([Rule(text_file=rule_file)])
     unrelated = (
         u'some other path',
         u'some junk',
         u'some path',
         u'some other junk',
     )
     for text in unrelated:
         # Each query is passed as a one-item list.
         found = idx.match([text])
         self.assertEqual([], found)
Example #10
0
 def test_filter_matches_filters_multiple_nested_contained_matches_and_large_overlapping(self):
     """Filter nested and largely overlapping matches of one rule and expect
     only the 0-5 match to be kept, with a non-empty discarded result."""
     rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
     outer = LicenseMatch(rule=rule, qspan=Span(0, 5), ispan=Span(0, 5))
     large_overlap = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))
     contained = LicenseMatch(rule=rule, qspan=Span(1, 4), ispan=Span(1, 4))
     in_contained = LicenseMatch(rule=rule, qspan=Span(2, 3), ispan=Span(2, 3))

     candidates = [outer, contained, in_contained, large_overlap]
     kept, dropped = filter_contained_matches(candidates)
     assert [outer] == kept
     assert dropped
Example #11
0
    def check_detection(self, doc_file, rule_file, expected_matches):
        """Index `rule_file` as a single 'mit' rule, match `doc_file` against
        it, and assert exactly one match whose query_position equals
        `expected_matches`.

        Both file arguments are resolved through self.get_test_loc().
        """
        rule_loc = self.get_test_loc(rule_file)
        idx = detect.LicenseIndex([Rule(text_file=rule_loc, licenses=['mit'])])

        doc_loc = self.get_test_loc(doc_file)
        found = idx.match(doc_loc)
        assert len(found) == 1
        assert expected_matches == found[0].query_position
Example #12
0
    def test_get_full_matched_text(self):
        """Check get_full_matched_text() output for one match, rendered with
        default arguments, with a custom highlight template, and with
        whole_lines enabled."""
        rule_text = u'''
            Copyright {{some copyright}}
            THIS IS FROM {{THE CODEHAUS}} AND CONTRIBUTORS
            IN NO EVENT SHALL {{THE CODEHAUS}} OR ITS CONTRIBUTORS BE LIABLE
            EVEN IF ADVISED OF THE {{POSSIBILITY OF SUCH}} DAMAGE
        '''

        templated_rule = Rule(_text=rule_text, licenses=['test'])
        idx = index.LicenseIndex([templated_rule])

        querys = u'''
            foobar 45 Copyright 2003 (C) James. All Rights Reserved.
            THIS IS FROM THE CODEHAUS AND CONTRIBUTORS
            IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE
            EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC
        '''
        found = idx.match(query_string=querys)
        assert len(found) == 1
        match = found[0]

        # 1. default arguments: some tokens come back in square brackets
        bracketed = u"""Copyright [2003] ([C]) [James]. [All] [Rights] [Reserved].
            THIS IS FROM [THE] [CODEHAUS] AND CONTRIBUTORS
            IN NO EVENT SHALL [THE] [best] [CODEHAUS] OR ITS CONTRIBUTORS BE LIABLE
            EVEN IF ADVISED OF THE [POSSIBILITY] [OF] [SUCH] DAMAGE"""
        chunks = get_full_matched_text(match, query_string=querys, idx=idx)
        assert bracketed == u''.join(chunks)

        # 2. same match, with a custom highlight_not_matched template
        tagged = u"""Copyright <br>2003</br> (<br>C</br>) <br>James</br>. <br>All</br> <br>Rights</br> <br>Reserved</br>.
            THIS IS FROM <br>THE</br> <br>CODEHAUS</br> AND CONTRIBUTORS
            IN NO EVENT SHALL <br>THE</br> <br>best</br> <br>CODEHAUS</br> OR ITS CONTRIBUTORS BE LIABLE
            EVEN IF ADVISED OF THE <br>POSSIBILITY</br> <br>OF</br> <br>SUCH</br> DAMAGE"""
        chunks = get_full_matched_text(
            match,
            query_string=querys,
            idx=idx,
            highlight_not_matched=u'<br>%s</br>',
        )
        assert tagged == u''.join(chunks)

        # 3. same match, with whole_lines=True and a pass-through template
        full_lines = u"""            foobar 45 Copyright 2003 (C) James. All Rights Reserved.
            THIS IS FROM THE CODEHAUS AND CONTRIBUTORS
            IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE
            EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC\n"""
        chunks = get_full_matched_text(
            match,
            query_string=querys,
            idx=idx,
            highlight_not_matched=u'%s',
            whole_lines=True,
        )
        assert full_lines == u''.join(chunks)
Example #13
0
    def test_match_return_correct_positions_with_short_index_and_queries(self):
        """Match a two-token 'MIT License' rule against several short queries
        and check the match count, matched texts and spans each time."""
        mit_rule = Rule(_text='MIT License', licenses=['mit'])
        idx = index.LicenseIndex([mit_rule])

        # query identical to the rule text
        exact_query = 'MIT License'
        found = idx.match(query_string=exact_query)
        assert len(found) == 1

        assert {'_tst_11_0': {'mit': [0]}} == idx.to_dict()

        only = found[0]
        qtext, itext = get_texts(only, query_string=exact_query, idx=idx)
        assert 'MIT License' == qtext
        assert 'MIT License' == itext
        assert Span(0, 1) == only.qspan
        assert Span(0, 1) == only.ispan

        # query with an extra leading token
        prefixed_query = 'MIT MIT License'
        found = idx.match(query_string=prefixed_query)
        assert len(found) == 1

        only = found[0]
        qtext, itext = get_texts(only, query_string=prefixed_query, idx=idx)
        assert 'MIT License' == qtext
        assert 'MIT License' == itext
        assert Span(1, 2) == only.qspan
        assert Span(0, 1) == only.ispan

        # query with two occurrences on a single line
        query_doc1 = 'do you think I am a mit license MIT License, yes, I think so'
        found = idx.match(query_string=query_doc1)
        assert len(found) == 2

        lower, upper = found[0], found[1]
        qtext, itext = get_texts(lower, query_string=query_doc1, idx=idx)
        assert 'mit license' == qtext
        assert 'MIT License' == itext
        assert Span(0, 1) == lower.qspan
        assert Span(0, 1) == lower.ispan

        qtext, itext = get_texts(upper, query_string=query_doc1, idx=idx)
        assert 'MIT License' == qtext
        assert 'MIT License' == itext
        assert Span(2, 3) == upper.qspan
        assert Span(0, 1) == upper.ispan

        # same two occurrences, spread over several lines
        query_doc2 = '''do you think I am a mit license
                        MIT License
                        yes, I think so'''
        found = idx.match(query_string=query_doc2)
        assert len(found) == 2

        lower, upper = found[0], found[1]
        qtext, itext = get_texts(lower, query_string=query_doc2, idx=idx)
        assert 'mit license' == qtext
        assert 'MIT License' == itext
        assert Span(0, 1) == lower.qspan
        assert Span(0, 1) == lower.ispan

        qtext, itext = get_texts(upper, query_string=query_doc2, idx=idx)
        assert 'MIT License' == qtext
        assert 'MIT License' == itext
        assert Span(2, 3) == upper.qspan
        assert Span(0, 1) == upper.ispan
Example #14
0
 def test_merge_merges_contained_and_overlapping_match(self):
     """Merge a match with one contained match and one overlapping match
     and expect a single match spanning 0 to 6."""
     rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
     base = LicenseMatch(rule=rule, qspan=Span(0, 5), ispan=Span(0, 5))
     contained = LicenseMatch(rule=rule, qspan=Span(1, 4), ispan=Span(1, 4))
     overlapping = LicenseMatch(rule=rule, qspan=Span(1, 6), ispan=Span(1, 6))

     # Precondition: the small match sits inside both larger ones.
     assert contained in overlapping
     assert contained in base

     merged = merge_matches([base, contained, overlapping])
     expected = [LicenseMatch(rule=rule, qspan=Span(0, 6), ispan=Span(0, 6))]
     assert expected == merged
Example #15
0
    def test_rule_cannot_contain_extra_unknown_attributes(self):
        """Building a Rule from a data file with an unknown attribute must
        raise an exception whose message names that attribute."""
        data_file = self.get_test_loc('models/rule_with_extra_attributes/sun-bcl.yml')
        text_file = self.get_test_loc('models/rule_with_extra_attributes/sun-bcl.RULE')

        expected = 'data file has unknown attributes: license_expressionnotuce'
        try:
            Rule(data_file=data_file, text_file=text_file)
        except Exception as e:
            assert expected in str(e)
        else:
            # Kept outside the try body: self.fail() raises an AssertionError,
            # which `except Exception` would otherwise catch and report as a
            # misleading message mismatch instead of "Exception not raised.".
            self.fail('Exception not raised.')
Example #16
0
    def get_test_rules(self, base, subset=None):
        """Return a list of Rule objects, one per entry of the test directory
        `base`, in sorted name order.

        `base` is resolved through self.get_test_loc(). When `subset` is
        truthy, only entries whose name is in `subset` are kept. Each entry
        name is used as the rule's license_expression.
        """
        base_dir = self.get_test_loc(base)
        names = sorted(os.listdir(base_dir))
        if subset:
            names = [name for name in names if name in subset]

        rules = []
        for license_key in names:
            rule_path = os.path.join(base_dir, license_key)
            rules.append(Rule(text_file=rule_path, license_expression=license_key))
        return rules
 def test_QueryRun(self):
     """Query an index with its own rule text and check that the single
     query run maps back to the original token strings."""
     text = 'redistributions in binary form must redistributions in'
     idx = index.LicenseIndex([Rule(_text=text)])
     query = Query(query_string=text, idx=idx)
     runs = query.query_runs
     assert len(runs) == 1
     run = runs[0]
     # Translate token ids back to strings through the index.
     result = []
     for tid in run.tokens:
         result.append(idx.tokens_by_tid[tid])
     expected = ['redistributions', 'in', 'binary', 'form', 'must', 'redistributions', 'in']
     assert expected == result
    def test_non_contiguous_matches_are_not_filtered(self):
        """Filter three matches of one rule at positions 0-2, 4-6 and 1-6 and
        expect the first and the last to be returned."""
        rule = Rule(licenses=['apache-2.0', 'gpl'])
        bounds = ((0, 2), (4, 6), (1, 6))
        m1, m2, m5 = [
            LicenseMatch(rule=rule,
                         query_position=analysis.Token(start=lo, end=hi))
            for lo, hi in bounds
        ]

        expected = [m1, m5]
        filtered = detect.filter_matches([m1, m2, m5])
        self.assertEqual(expected, filtered)
    def test_overlap_detection3(self):
        """Match a query that wraps the text of license 2, which itself wraps
        the text of license 1, and expect one match on the license 2 rule."""
        # Containment relationship between the indexed texts and the query:
        #
        #   Indexed licenses:
        #   +-license 2 --------+
        #   |  +-license 1 --+  |
        #   +-------------------+
        #
        #   Query text:
        #   +- license 3 -----------+
        #   | +-license 2 --------+ |
        #   | |  +-license 1 --+  | |
        #   | +-------------------+ |
        #   +-----------------------+

        # build the index from the two nested texts
        inner_text = '''Redistribution and use permitted.'''

        outer_text = '''Redistributions of source must retain copyright.
        Redistribution and use permitted.
        Redistributions in binary form is permitted.'''

        inner_rule = Rule(_text=inner_text, licenses=['overlap'])
        outer_rule = Rule(_text=outer_text, licenses=['overlap'])
        idx = index.LicenseIndex([inner_rule, outer_rule])

        querys = '''My source.
            Redistributions of source must retain copyright.
            Redistribution and use permitted.
            Redistributions in binary form is permitted.
            My code.'''

        # the query contains license 2, which contains license 1: a single
        # match on the license 2 rule is expected
        found = idx.match(query_string=querys)
        assert len(found) == 1
        match = found[0]
        assert outer_rule == match.rule

        qtext, _itext = get_texts(match, query_string=querys, idx=idx)
        expected_words = '''
            Redistributions of source must retain copyright
            Redistribution and use permitted
            Redistributions in binary form is permitted'''.split()
        assert expected_words == qtext.split()
    def test_QueryRun_repr(self):
        """Check the repr of a query run, with and without trace_repr."""
        text = 'redistributions in binary form must redistributions in'
        idx = index.LicenseIndex([Rule(_text=text)])
        run = Query(query_string=text, idx=idx).query_runs[0]

        # plain repr
        short_repr = 'QueryRun(start=0, len=7, start_line=1, end_line=1)'
        assert short_repr == repr(run)

        # repr with trace_repr=True also carries the tokens
        traced_repr = 'QueryRun(start=0, len=7, start_line=1, end_line=1, tokens="redistributions in binary form must redistributions in")'
        assert traced_repr == run.__repr__(trace_repr=True)
    def test_single_contained_matche_is_filtered(self):
        """Filter three matches of one rule where the 1-4 match sits inside
        the 0-5 match, and expect only the 0-5 and 1-6 matches back."""
        rule = Rule(licenses=['apache-2.0', 'gpl'])
        bounds = ((0, 5), (1, 4), (1, 6))
        m1, contained, m5 = [
            LicenseMatch(rule=rule,
                         query_position=analysis.Token(start=lo, end=hi))
            for lo, hi in bounds
        ]

        filtered = detect.filter_matches([m1, contained, m5])
        self.assertEqual([m1, m5], filtered)
    def test_simple_detection_against_same_text(self):
        """Index a file as an 'mit' rule, match the same file against it, and
        check the rule and span bounds of the single match."""
        location = self.get_test_loc('detect/mit/mit.c')
        mit_rule = Rule(text_file=location, licenses=['mit'])
        idx = detect.LicenseIndex([mit_rule])

        found = idx.match(location)
        assert len(found) == 1
        only = found[0]
        assert mit_rule == only.rule
        assert 0 == only.span.start
        assert 86 == only.span.end
Example #23
0
    def test_filter_matches_filters_non_contiguous_or_overlapping__but_contained_matches(self):
        """Filter five matches of one rule that all sit inside the 0-7 match
        and expect only that match to be kept."""
        rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        # The fourth pair (0, 7) covers every other pair.
        bounds = ((1, 2), (3, 6), (1, 6), (0, 7), (1, 6))
        candidates = [
            LicenseMatch(rule=rule, qspan=Span(lo, hi), ispan=Span(lo, hi))
            for lo, hi in bounds
        ]
        widest = candidates[3]

        kept, dropped = filter_contained_matches(candidates)
        assert [widest] == kept
        assert dropped
    def test_match_matches_correctly_simple_exact_query_1(self):
        """Index mit.c as an 'mit' rule, match mit2.c against it, and check
        the rule and both spans of the single match."""
        rule_loc = self.get_test_loc('detect/mit/mit.c')
        mit_rule = Rule(text_file=rule_loc, licenses=['mit'])
        idx = index.LicenseIndex([mit_rule])

        query_loc = self.get_test_loc('detect/mit/mit2.c')
        found = idx.match(query_loc)
        assert len(found) == 1
        only = found[0]
        assert mit_rule == only.rule
        # Query and index spans are expected to be the same 0-86 range.
        assert Span(0, 86) == only.qspan
        assert Span(0, 86) == only.ispan
    def test_overlapping_matches_are_filtered(self):
        """Filter a 0-5 match together with two matches at the same 1-6
        position and expect the 0-5 match plus the first 1-6 match back."""
        rule = Rule(licenses=['apache-2.0', 'gpl'])
        bounds = ((0, 5), (1, 6), (1, 6))
        m1, same_span, same_span_too = [
            LicenseMatch(rule=rule,
                         query_position=analysis.Token(start=lo, end=hi))
            for lo, hi in bounds
        ]

        filtered = detect.filter_matches([m1, same_span, same_span_too])
        self.assertEqual([m1, same_span], filtered)
    def test_special_characters_detection(self):
        """Index each kerberos sample file as its own rule, match the same
        file against it, and expect exactly one match every time."""
        sample_paths = (
            'detect/specialcharacter/kerberos.txt',
            'detect/specialcharacter/kerberos1.txt',
            'detect/specialcharacter/kerberos2.txt',
            'detect/specialcharacter/kerberos3.txt',
        )
        # All locations are resolved before any index is built.
        locations = [self.get_test_loc(path) for path in sample_paths]

        for location in locations:
            kerberos_rule = Rule(text_file=location, licenses=['kerberos'])
            idx = detect.LicenseIndex([kerberos_rule])
            found = idx.match(location)
            self.assertEqual(1, len(found))
    def test_simple_detection1(self):
        """Index mit.c as an 'mit' rule, match mit2.c against it, and check
        the rule and span bounds of the single match."""
        rule_loc = self.get_test_loc('detect/mit/mit.c')
        mit_rule = Rule(text_file=rule_loc, licenses=['mit'])
        idx = detect.LicenseIndex([mit_rule])

        query_loc = self.get_test_loc('detect/mit/mit2.c')
        found = idx.match(query_loc)
        assert len(found) == 1
        only = found[0]
        assert mit_rule == only.rule
        assert 5 == only.span.start
        assert 91 == only.span.end
    def test_match_index_return_one_match_with_correct_offsets(self):
        """Match a text that holds the rule text after some junk and check
        the character offsets of the single match."""
        rule_file = self.create_test_file('A one. A two. A three.')
        idx = detect.LicenseIndex([Rule(text_file=rule_file)])
        _path, text = (u'/some/path/', u'some junk. A one. A two. A three.')
        # character offsets in `text`:
        #            1111111111222222222233
        #  012345678901234567890123456789012

        found = idx.match([text])
        self.assertEqual(1, len(found))

        position = found[0].query_position
        self.assertEqual(11, position.start_char)
        self.assertEqual(32, found[0].query_position.end_char)
    def test_match_can_match_with_plain_rule_simple2(self):
        """Index an X11 license text as an 'x-consortium' rule, match a test
        file against it, and compare the matched query words."""
        rule_text = u'''X11 License
        Copyright (C) 1996 X Consortium
        Permission is hereby granted, free of charge, to any person obtaining a copy
        of this software and associated documentation files (the "Software"), to deal
        in the Software without restriction, including without limitation the rights
        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
        copies of the Software, and to permit persons to whom the Software is
        furnished to do so, subject to the following conditions: The above copyright
        notice and this permission notice shall be included in all copies or
        substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS",
        WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
        TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
        NONINFRINGEMENT. IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR ANY CLAIM,
        DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
        OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
        OR OTHER DEALINGS IN THE SOFTWARE. Except as contained in this notice, the
        name of the X Consortium shall not be used in advertising or otherwise to
        promote the sale, use or other dealings in this Software without prior
        written authorization from the X Consortium. X Window System is a trademark
        of X Consortium, Inc.
        '''
        x11_rule = Rule(_text=rule_text, licenses=['x-consortium'])
        idx = index.LicenseIndex([x11_rule])

        query_loc = self.get_test_loc(
            'detect/simple_detection/x11-xconsortium_text.txt')
        found = idx.match(location=query_loc)
        assert len(found) == 1

        # Expected words carry no punctuation; the comparison is word by word.
        expected_words = u'''
        X11 License Copyright C 1996 X Consortium Permission is hereby granted free
        of charge to any person obtaining a copy of this software and associated
        documentation files the Software to deal in the Software without restriction
        including without limitation the rights to use copy modify merge publish
        distribute sublicense and or sell copies of the Software and to permit
        persons to whom the Software is furnished to do so subject to the following
        conditions The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software THE SOFTWARE
        IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND EXPRESS OR IMPLIED INCLUDING
        BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY FITNESS FOR A PARTICULAR
        PURPOSE AND NONINFRINGEMENT IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR
        ANY CLAIM DAMAGES OR OTHER LIABILITY WHETHER IN AN ACTION OF CONTRACT TORT OR
        OTHERWISE ARISING FROM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
        OR OTHER DEALINGS IN THE SOFTWARE Except as contained in this notice the name
        of the X Consortium shall not be used in advertising or otherwise to promote
        the sale use or other dealings in this Software without prior written
        authorization from the X Consortium X Window System is a trademark of X
        Consortium Inc
        '''.split()
        only = found[0]
        qtext, _itext = get_texts(only, location=query_loc, idx=idx)
        assert expected_words == qtext.split()
Example #30
0
    def test_filter_multiple_contained_matches(self):
        """Filter four matches, each from its own rule, and expect only the
        0-5 match to be kept while the three others are discarded."""
        expression = 'apache-2.0 OR gpl'

        outer_rule = Rule(text_file='r1', license_expression=expression)
        m1 = LicenseMatch(rule=outer_rule, qspan=Span(0, 5), ispan=Span(0, 5))

        first_rule = Rule(text_file='r2', license_expression=expression)
        contained1 = LicenseMatch(rule=first_rule, qspan=Span(1, 2), ispan=Span(1, 2))

        second_rule = Rule(text_file='r3', license_expression=expression)
        contained2 = LicenseMatch(rule=second_rule, qspan=Span(3, 4), ispan=Span(3, 4))

        overlap_rule = Rule(text_file='r5', license_expression=expression)
        m5 = LicenseMatch(rule=overlap_rule, qspan=Span(1, 6), ispan=Span(1, 6))

        candidates = [m1, contained1, contained2, m5]
        matches, discarded = filter_contained_matches(candidates)
        assert [m1] == matches

        # Discarded matches are compared regardless of their order.
        expected_discarded = sorted([m5, contained1, contained2])
        assert expected_discarded == sorted(discarded)