Example #1
0
    def test_template_detection_with_short_tokens_around_gaps(self):
        # Regression test: detection used to fail when a gapped token started
        # at the beginning of a rule, at a position smaller than the ngram
        # length.
        rule_loc = self.get_test_loc('detect/templates/license7.txt')
        rule = Rule(text_file=rule_loc, template=True)

        # the index uses quadrigrams by default
        index = detect.LicenseIndex([rule])

        # sanity-check the quadrigram index contents
        quad_grams = index.license_index.indexes[4]
        assert 205 == len(quad_grams)
        assert u'software without prior written' in quad_grams

        # run the detection and check the matched position
        query_loc = self.get_test_loc('detect/templates/license8.txt')
        matches = index.match(query_loc)
        assert 1 == len(matches)
        expected = Token(start=0, start_line=1, start_char=0,
                         end_line=40, end_char=34, end=276)
        assert expected == matches[0].query_position
    def test_simple_detection_xcon_crlf_template(self):
        # build a template rule from an inline X11/X Consortium license text
        license_text = u'''X11 License
        Copyright (C) 1996 X Consortium
        Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
        The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
        Except as contained in this notice, the name of the X Consortium shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization from the X Consortium.
        X Window System is a trademark of X Consortium, Inc.
        '''
        rule = Rule(text_file=self.create_test_file(license_text),
                    licenses=['x-consortium'],
                    template=True)
        index = detect.LicenseIndex([rule])

        # match against a query document (with CRLF line endings per the
        # test name) and check the matched position
        query_loc = self.get_test_loc(
            'detect/simple_detection/x11-xconsortium_text.txt')
        matches = index.match(query_loc)
        expected = Token(start=0, start_line=1, start_char=0,
                         end_line=13, end_char=51, end=216)
        self.assertEqual(expected, matches[0].query_position)
 def test_index_template2(self):
     # index a template rule with an explicit {{10}} gap and check the
     # exact trigram index structure
     rule = Rule(
         text_file=self.create_test_file(u'A one. A {{10}}two. A three.'),
         template=True)
     index = detect.LicenseIndex([rule])
     # the ngram before the gap carries the gap size
     leading = Token(start=0, start_line=0, start_char=0, end_line=0,
                     end_char=8, end=2, gap=10,
                     value=(u'a', u'one', u'a'))
     trailing = Token(start=3, start_line=0, start_char=15, end_line=0,
                      end_char=27, end=5, gap=0,
                      value=(u'two', u'a', u'three'))
     expected = {
         (u'a', u'one', u'a'): {0: [leading]},
         (u'two', u'a', u'three'): {0: [trailing]},
     }
     self.assertEqual(expected, index.license_index.indexes[3])
Example #4
0
 def test_index_template2(self):
     # index a template rule with an explicit {{10}} gap and check the
     # exact trigram index structure (string-keyed variant)
     rule = Rule(
         text_file=self.create_test_file(u'A one. A {{10}}two. A three.'),
         template=True)
     index = detect.LicenseIndex([rule])
     # the ngram before the gap carries the gap size
     leading = Token(start=0, start_line=0, start_char=0, end_line=0,
                     end_char=8, end=2, gap=10, value=u'a one a')
     trailing = Token(start=3, start_line=0, start_char=15, end_line=0,
                      end_char=27, end=5, gap=0, value=u'two a three')
     expected = {
         u'a one a': {0: [leading]},
         u'two a three': {0: [trailing]},
     }
     assert expected == index.license_index.indexes[3]
    def test_template_detection_with_short_tokens_around_gaps(self):
        # Regression test: detection failed when a gapped token starts at a
        # beginning of rule and at a position less than ngram length.
        # setup
        tf7 = self.get_test_loc('detect/templates/license7.txt')
        ttr = Rule(text_file=tf7, template=True)

        # use trigrams by default
        index = detect.LicenseIndex([ttr])

        # test the index: indexes[3] holds trigrams, so name the local
        # accordingly (it was misleadingly called "four_grams_index")
        tri_grams_index = index.license_index.indexes[3]
        self.assertEqual(211, len(tri_grams_index))
        self.assertIn((u'software', u'without', u'prior'), tri_grams_index)

        # test: one full match with the expected query position
        tf8 = self.get_test_loc('detect/templates/license8.txt')
        matches = index.match(tf8)
        self.assertEqual(1, len(matches))
        expected = Token(start=0,
                         start_line=1,
                         start_char=0,
                         end_line=40,
                         end_char=34,
                         end=276)
        self.assertEqual(expected, matches[0].query_position)
    def test_simple_detection_no_result(self):
        # a query document with no indexed license text yields no matches
        rule_loc = self.get_test_loc('detect/mit/mit.c')
        rule = Rule(text_file=rule_loc, licenses=['mit'])
        index = detect.LicenseIndex([rule])

        query_loc = self.get_test_loc('detect/mit/mit4.c')
        assert not index.match(query_loc)
    def check_detection(self, doc_file, rule_file, expected_matches):
        """Index `rule_file` as an 'mit'-licensed rule, match it against
        `doc_file` and check that exactly one match is found whose query
        position equals `expected_matches`.
        """
        rule = Rule(text_file=self.get_test_loc(rule_file), licenses=['mit'])
        index = detect.LicenseIndex([rule])

        matches = index.match(self.get_test_loc(doc_file))
        self.assertEqual(1, len(matches))
        self.assertEqual(expected_matches, matches[0].query_position)
 def test_match_does_not_return_incorrect_matches(self):
     # none of these junk queries overlaps the indexed rule text, so no
     # match should ever be returned
     rule = Rule(text_file=self.create_test_file('A one. A two. A three.'))
     index = detect.LicenseIndex([rule])
     junk_queries = [
         u'some other path', u'some junk', u'some path', u'some other junk'
     ]
     for query in junk_queries:
         self.assertEqual([], index.match([query]))
    def test_overlap_detection(self):
        # Check the containment relationships between query and index
        # licenses: the largest contained indexed rule should win.
        #
        #   * Index licenses:
        #
        #   +-license 2 --------+
        #   |  +-license 1 --+  |
        #   |  +-------------+  |
        #   +-------------------+
        #
        #   * License texts to detect:
        #
        #   +- license 3 -----------+
        #   | +-license 2 --------+ |
        #   | |  +-license 1 --+  | |
        #   | |  +-------------+  | |
        #   | +-------------------+ |
        #   +-----------------------+
        #
        #   +-license 4 --------+
        #   |  +-license 1 --+  |
        #   |  +-------------+  |
        #   +-------------------+

        tf1 = self.get_test_loc('detect/overlap/license.txt')
        tf2 = self.get_test_loc('detect/overlap/license2.txt')
        tf3 = self.get_test_loc('detect/overlap/license3.txt')
        tf4 = self.get_test_loc('detect/overlap/license4.txt')

        # setup index with licenses 1 and 2 only
        rule1 = Rule(text_file=tf1, licenses=['overlap_license'])
        rule2 = Rule(text_file=tf2, licenses=['overlap_license'])
        index = detect.LicenseIndex([rule1, rule2])

        def check_single_match(query_loc, expected_rule):
            # every query must yield exactly one match to the expected rule
            matches = index.match(query_loc)
            self.assertEqual(1, len(matches))
            self.assertEqual(expected_rule, matches[0].rule)

        # 1 contains nothing: return 1
        check_single_match(tf1, rule1)
        # 2 contains 1: return 2
        check_single_match(tf2, rule2)
        # 3 contains 2 that contains 1: return 2
        check_single_match(tf3, rule2)
        # 4 contains 1: return 1
        check_single_match(tf4, rule1)
    def test_simple_detection_against_same_text(self):
        # matching a rule text against itself yields one full-span match
        rule_loc = self.get_test_loc('detect/mit/mit.c')
        rule = Rule(text_file=rule_loc, licenses=['mit'])
        index = detect.LicenseIndex([rule])

        matches = index.match(rule_loc)
        assert 1 == len(matches)
        match = matches[0]
        assert rule == match.rule
        assert 0 == match.span.start
        assert 86 == match.span.end
    def test_special_characters_detection(self):
        # each doc contains special characters: indexing a doc and matching
        # it against itself should still return a single match
        locations = [
            self.get_test_loc('detect/specialcharacter/kerberos.txt'),
            self.get_test_loc('detect/specialcharacter/kerberos1.txt'),
            self.get_test_loc('detect/specialcharacter/kerberos2.txt'),
            self.get_test_loc('detect/specialcharacter/kerberos3.txt'),
        ]

        for location in locations:
            rule = Rule(text_file=location, licenses=['kerberos'])
            index = detect.LicenseIndex([rule])
            self.assertEqual(1, len(index.match(location)))
    def test_simple_detection1(self):
        # match a variant of the indexed MIT text and check the matched span
        rule_loc = self.get_test_loc('detect/mit/mit.c')
        rule = Rule(text_file=rule_loc, licenses=['mit'])
        index = detect.LicenseIndex([rule])

        matches = index.match(self.get_test_loc('detect/mit/mit2.c'))
        assert 1 == len(matches)
        match = matches[0]
        assert rule == match.rule
        assert 5 == match.span.start
        assert 91 == match.span.end
    def test_match_index_return_one_match_with_correct_offsets(self):
        rule = Rule(text_file=self.create_test_file('A one. A two. A three.'))
        index = detect.LicenseIndex([rule])
        # the rule text starts at char 11 and ends at char 32 of the query:
        #                                      1111111111222222222233
        #                           012345678901234567890123456789012
        query = (u'/some/path/', u'some junk. A one. A two. A three.')

        matches = index.match([query[1]])
        self.assertEqual(1, len(matches))

        position = matches[0].query_position
        self.assertEqual(11, position.start_char)
        self.assertEqual(32, position.end_char)
    def test_bsd_rule_detection(self):
        # index a BSD-original rule and match it against a second document
        rule_loc = self.get_test_loc('detect/mit/t1.txt')
        rule = Rule(text_file=rule_loc, licenses=['bsd-original'])
        index = detect.LicenseIndex([rule])

        matches = index.match(self.get_test_loc('detect/mit/t2.txt'))
        self.assertEqual(1, len(matches))
        expected = Token(start=0, start_line=1, start_char=0,
                         end_line=27, end_char=59, end=241)
        self.assertEqual(expected, matches[0].query_position)
 def test_fulltext_detection_works_with_partial_overlap_from_location(self):
     # setup: index a full-text (non-template) rule
     rule = Rule(
         text_file=self.get_test_loc('detect/templates/license3.txt'),
         licenses=['mylicense'])
     index = detect.LicenseIndex([rule])

     # test: the query doc only partially overlaps the rule text
     matches = index.match(self.get_test_loc('detect/templates/license4.txt'))
     self.assertEqual(1, len(matches))
     expected = Token(start=1, start_line=1, start_char=7,
                      end_line=4, end_char=67, end=42)
     self.assertEqual(expected, matches[0].query_position)
    def test_template_detection_publicdomain(self):
        # setup: index a public-domain template rule
        rule_loc = self.get_test_loc('detect/templates/license5.txt')
        rule = Rule(text_file=rule_loc, licenses=['public-domain'],
                    template=True)
        index = detect.LicenseIndex([rule])

        # test: the match starts mid-document (line 16 of the query)
        query_loc = self.get_test_loc('detect/templates/license6.txt')
        matches = index.match(query_loc)
        self.assertEqual(1, len(matches))
        expected = Token(start=82, start_line=16, start_char=0,
                         end_line=18, end_char=67, end=118)
        self.assertEqual(expected, matches[0].query_position)
Example #17
0
 def test_index_template(self):
     # index a template rule with an unsized {{}} gap and check the exact
     # index structure across all ngram lengths
     rule = Rule(
         text_file=self.create_test_file(u'A one. A {{}}two. A three.'),
         template=True)
     index = detect.LicenseIndex([rule])

     # only trigrams are indexed; the ngram before the gap carries the gap
     # (an unsized {{}} is indexed here with gap=5)
     leading = Token(start=0, start_line=0, start_char=0, end_line=0,
                     end_char=8, end=2, gap=5, value=u'a one a', length=3)
     trailing = Token(start=3, start_line=0, start_char=13, end_line=0,
                      end_char=25, end=5, gap=0, value=u'two a three',
                      length=3)
     expected = {
         1: {},
         2: {},
         3: {
             u'a one a': {0: [leading]},
             u'two a three': {0: [trailing]},
         },
         4: {},
     }
     assert expected == index.license_index.indexes
    def test_detection_template_with_inter_gap_smaller_than_ngram_len(self):
        # in this template text there are only 2 tokens between the two
        # gaps: this is smaller than the ngram_len of 3 and can never be
        # caught by ngrams of that length
        rule_text = u'''Redistributions in binary form must
        {{}} reproduce the {{}}above copyright notice'''
        rule = Rule(text_file=self.create_test_file(rule_text),
                    licenses=['mylicense'],
                    template=True)
        index = detect.LicenseIndex([rule])  # default to ngram_len=3

        # test: the gaps are filled with extra words in the query
        query_lines = u'''Redistributions in binary form must nexB company
        reproduce the word for word above copyright notice.'''.splitlines()
        matches = index.match(query_lines)
        self.assertEqual(1, len(matches))
        expected = Token(start=0, start_line=1, start_char=0,
                         end_line=2, end_char=58, end=14)
        self.assertEqual(expected, matches[0].query_position)
Example #19
0
    def test_detection_template_with_inter_gap_equal_to_ngram_len(self):
        # in this template there are exactly 3 tokens between the two gaps:
        # the same as the ngram_len of 3
        rule_text = u'''Redistributions in binary form must
        {{}} reproduce the stipulated {{}}above copyright notice'''
        rule = Rule(text_file=self.create_test_file(rule_text),
                    licenses=['mylicense'],
                    template=True)
        index = detect.LicenseIndex([rule])  # default to ngram_len=3

        # test: the gaps are filled with extra words in the query
        query_lines = (u'''Redistributions in binary form must nexB company
        reproduce the stipulated word for word above copyright notice.'''
                       .splitlines())
        matches = index.match(query_lines)
        assert 1 == len(matches)
        expected = Token(start=0, start_line=1, start_char=0,
                         end_line=2, end_char=69, end=15)
        assert expected == matches[0].query_position
    def test_detection_template_with_inter_gap_bigger_than_ngram_len(self):
        # setup: in this template there are 4 tokens between the two gaps:
        # this is bigger than the ngram_len of 3
        rule_text = u'''Redistributions in binary form must
        {{}} reproduce as is stipulated {{}}above copyright notice'''
        rule = Rule(text_file=self.create_test_file(rule_text),
                    licenses=['mylicense'],
                    template=True)
        index = detect.LicenseIndex([rule])  # default to ngram_len=3

        # test: the gaps are filled with extra words in the query
        query_lines = (u'''Redistributions in binary form must nexB company
        reproduce as is stipulated the word for word above copyright notice.'''
                       .splitlines())
        matches = index.match(query_lines)
        self.assertEqual(1, len(matches))
        expected = Token(start=0, start_line=1, start_char=0,
                         end_line=2, end_char=75, end=17)
        self.assertEqual(expected, matches[0].query_position)
Example #21
0
    def test_match_index_return_one_match_with_correct_offsets(self):
        rule = Rule(text_file=self.create_test_file('A one. A two. A three.'))
        index = detect.LicenseIndex([rule])
        query_doc = (u'/some/path/', u'some junk. A one. A two. A three.')
        #                              t1   t2    t3 t4  t5 t6  t7 t8
        #                                         1111111111222222222233
        #                              012345678901234567890123456789012

        matches = index.match([query_doc[1]])
        assert 1 == len(matches)
        # abs token positions are zero-based; lines are one-based and chars
        # are zero-based on each line
        expected = Token(start=2, end=7,
                         start_line=1, start_char=11,
                         end_line=1, end_char=32)
        assert expected == matches[0].query_position
    def test_simple_detection_text_shorter_than_ngram_len_using_trigrams(self):
        # index a rule text shorter than the trigram length, then match it
        # against several query docs and check exact match positions
        rule_loc = self.get_test_loc('detect/string/mit.txt')
        rule = Rule(text_file=rule_loc, licenses=['mit'])
        index = detect.LicenseIndex([rule])  # default to ngram_len=3

        doc2 = self.get_test_loc('detect/string/mit2.txt')
        doc3 = self.get_test_loc('detect/string/mit3.txt')
        doc4 = self.get_test_loc('detect/string/mit4.txt')

        # each case: (query doc, expected match count, expected positions)
        test_cases = [
            (rule_loc, 1, [
                Token(start=0, start_line=1, start_char=0,
                      end_line=1, end_char=11, end=1),
            ]),
            (doc4, 1, [
                Token(start=1, start_line=1, start_char=4,
                      end_line=1, end_char=15, end=2),
            ]),
            (doc2, 2, [
                Token(start=6, start_line=1, start_char=20,
                      end_line=1, end_char=31, end=7),
                Token(start=8, start_line=1, start_char=32,
                      end_line=1, end_char=43, end=9),
            ]),
            (doc3, 2, [
                Token(start=6, start_line=1, start_char=20,
                      end_line=1, end_char=31, end=7),
                Token(start=8, start_line=2, start_char=0,
                      end_line=2, end_char=11, end=9),
            ]),
        ]

        for query_loc, expected_count, expected_positions in test_cases:
            matches = list(index.match(query_loc, perfect=True))
            self.assertEqual(expected_count, len(matches))
            for match, expected_pos in zip(matches, expected_positions):
                assert expected_pos == match.query_position
 def test_match_with_empty_query_does_not_return_matches(self):
     # an empty query text yields an empty match list
     rule = Rule(text_file=self.create_test_file('A one. A two. A three.'))
     index = detect.LicenseIndex([rule])
     self.assertEqual([], index.match(['']))