Example #1
0
def add_rule(spdx_text, license_obj):
    """
    Add a new rule with text `spdx_text` for the `license_obj` License.

    The rule text is written to a new `.RULE` file under `rules_data_dir`.
    A Rule pointing to this text file and to a sibling `.yml` data file is
    then created and its `dump()` method is called.

    Raise an Exception if either the `.RULE` or the `.yml` file already
    exists.
    """
    rule_base_name = 'spdx_license_id_' + spdx_text.lower(
    ) + '_for_' + license_obj.key
    text_file = os.path.join(rules_data_dir, rule_base_name + '.RULE')
    data_file = os.path.join(rules_data_dir, rule_base_name + '.yml')
    if os.path.exists(text_file) or os.path.exists(data_file):
        # Name the format arguments explicitly: there is no local variable
        # named `text`, so formatting with **locals() raised a KeyError
        # instead of this exception.
        raise Exception(
            'Cannot create new SPDX rules text file for {text}. '
            'File already exists at: {text_file}'.format(
                text=spdx_text, text_file=text_file))

    # The file is opened in binary mode: encode text explicitly so that the
    # write works for both byte strings and unicode strings.
    content = spdx_text
    if not isinstance(content, bytes):
        content = content.encode('utf-8')
    with open(text_file, 'wb') as tf:
        tf.write(content)

    rule = Rule(
        text_file=text_file,
        license_expression=license_obj.key,
        relevance=80,
        minimum_coverage=100,
        notes='Used to detect a bare SPDX license id',
    )
    rule.data_file = data_file
    rule.dump()
    click.echo('Added new rule: ' + repr(rule))
    def test_Query_tokens_by_line_from_string(self):
        # Check Query.tokens_by_line() and Query.line_by_pos for a multi-line
        # query string, then tokens_by_line() again with a one-line query.
        rule_text = 'Redistribution and use in source and binary forms with or without modification are permitted'
        bsd_rule = Rule(stored_text=rule_text, license_expression='bsd')
        legalese = {'redistribution', 'form'}
        idx = index.LicenseIndex([bsd_rule], _legalese=legalese)
        query_text = '''
            The
            Redistribution and use in source and binary are permitted

            Athena capital of Grece
            Paris and Athene
            Always'''

        qry = Query(query_string=query_text, idx=idx, _test_mode=True)
        tids_by_line = list(qry.tokens_by_line())
        expected_tids = [
            [],
            [None],
            [1, 2, 3, 4, 5, 2, 6, 12, 13],
            [],
            [None, None, None, None],
            [None, 2, None],
            [None],
        ]

        assert expected_tids == tids_by_line

        def qtbl_as_str(qtbl):
            # Convert each token id to its token string, keeping None as-is.
            # `idx` is looked up when called, so the index rebound further
            # down is the one used by the last assertion.
            converted = []
            for tids in qtbl:
                converted.append([
                    None if tid is None else idx.tokens_by_tid[tid]
                    for tid in tids
                ])
            return converted

        tokens_by_line = qtbl_as_str(tids_by_line)
        expected_tokens = [
            [],
            [None],
            [
                'redistribution', 'and', 'use', 'in', 'source', 'and',
                'binary', 'are', 'permitted'
            ],
            [],
            [None, None, None, None],
            [None, 'and', None],
            [None],
        ]

        assert expected_tokens == tokens_by_line

        assert [3, 3, 3, 3, 3, 3, 3, 3, 3, 6] == qry.line_by_pos

        # Second scenario: a new index built without the _legalese argument.
        idx = index.LicenseIndex(
            [Rule(stored_text=rule_text, license_expression='bsd')])
        query_text = 'and this is not a license'
        qry = Query(query_string=query_text, idx=idx, _test_mode=True)
        tids_by_line = list(qry.tokens_by_line())
        expected_tokens = [['and', None, None, None, 'license']]
        assert expected_tokens == qtbl_as_str(tids_by_line)
def add_rule(spdx_text, license_obj):
    """
    Add a new rule with text `spdx_text` for the `license_obj` License.

    The rule text is written as UTF-8 to a new `.RULE` file under
    `rules_data_dir`. A Rule pointing to this text file and to a sibling
    `.yml` data file is then created and its `dump()` method is called.

    Raise an Exception if either the `.RULE` or the `.yml` file already
    exists.
    """
    rule_base_name = "spdx_license_id_" + spdx_text.lower(
    ) + "_for_" + license_obj.key
    text_file = os.path.join(rules_data_dir, rule_base_name + ".RULE")
    data_file = os.path.join(rules_data_dir, rule_base_name + ".yml")
    if os.path.exists(text_file) or os.path.exists(data_file):
        # Name the format arguments explicitly: there is no local variable
        # named `text`, so formatting with **locals() raised a KeyError
        # instead of this exception.
        raise Exception(
            "Cannot create new SPDX rules text file for {text}. "
            "File already exists at: {text_file}".format(
                text=spdx_text, text_file=text_file))

    with io.open(text_file, "w", encoding="utf-8") as tf:
        tf.write(spdx_text)

    rule = Rule(
        text_file=text_file,
        license_expression=license_obj.key,
        relevance=80,
        minimum_coverage=100,
        notes="Used to detect a bare SPDX license id",
    )
    rule.data_file = data_file
    rule.dump()
    click.echo("Added new rule: " + repr(rule))
Example #4
0
    def test_multiple_contained_matches_are_filtered(self):
        # Four matches from four rules with the same licenses: positions 1-2
        # and 3-4 both lie inside 0-5; only the 0-5 and 1-6 matches are
        # expected back from filter_overlapping_matches().
        rule_a = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        pos_a = analysis.Token(start=0, end=5)
        match_a = LicenseMatch(rule=rule_a, query_position=pos_a, score=100)

        rule_b = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])
        pos_b = analysis.Token(start=1, end=2)
        inner_b = LicenseMatch(rule=rule_b, query_position=pos_b, score=100)

        rule_c = Rule(text_file='r3', licenses=['apache-2.0', 'gpl'])
        pos_c = analysis.Token(start=3, end=4)
        inner_c = LicenseMatch(rule=rule_c, query_position=pos_c, score=100)

        rule_d = Rule(text_file='r5', licenses=['apache-2.0', 'gpl'])
        pos_d = analysis.Token(start=1, end=6)
        match_d = LicenseMatch(rule=rule_d, query_position=pos_d, score=100)

        candidates = [match_a, inner_b, inner_c, match_d]
        filtered = detect.filter_overlapping_matches(candidates)
        assert [match_a, match_d] == filtered
Example #5
0
    def test_LicenseMatch_small(self):
        # Every match built below is expected to report small() as true.
        short_text = u'licensed under the GPL, licensed under the GPL'
        short_rule = Rule(text_file='r1', licenses=['apache-1.1'], _text=short_text)
        long_text = u'licensed under the GPL, licensed under the GPL' * 10
        long_rule = Rule(text_file='r1', licenses=['apache-1.1'], _text=long_text)
        # The index is built from both rules but not used directly afterwards.
        _idx = index.LicenseIndex([short_rule, long_rule])

        # Matches against the short rule.
        match = LicenseMatch(
            rule=short_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12))
        assert match.small()

        match = LicenseMatch(
            rule=short_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(11, 12))
        assert match.small()

        match = LicenseMatch(
            rule=short_rule, qspan=Span(10, 11, 12), ispan=Span(10, 11, 12), hispan=Span(11, 12))
        assert match.small()

        match = LicenseMatch(rule=short_rule, qspan=Span(1, 6), ispan=Span(1, 6))
        assert match.small()

        # Matches against the long rule.
        match = LicenseMatch(
            rule=long_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12))
        assert match.small()

        match = LicenseMatch(
            rule=long_rule, qspan=Span(5, 10), ispan=Span(5, 10), hispan=Span(5, 6))
        assert match.small()

        match = LicenseMatch(rule=long_rule, qspan=Span(1, 6), ispan=Span(1, 6))
        assert match.small()
    def test_LicenseMatch_score_0_relevance(self):
        # A match on a rule whose relevance is 0 is expected to score 0.
        rule = Rule(text_file='r1', licenses=['apache-2.0'])
        rule.relevance = 0
        rule.length = 6

        query_span = Span(0, 2)
        index_span = Span(0, 2)
        match = LicenseMatch(rule=rule, qspan=query_span, ispan=index_span)
        assert match.score() == 0
    def test_LicenseMatch_score_0(self):
        # A match built with empty spans is expected to score 0.
        rule = Rule(text_file='r1', licenses=['apache-2.0'])
        rule.relevance = 0
        rule.length = 6

        query_span = Span()
        index_span = Span()
        match = LicenseMatch(rule=rule, qspan=query_span, ispan=index_span)
        assert match.score() == 0
Example #8
0
    def test_LicenseMatch_score_0_relevance(self):
        # A match on a rule whose relevance is 0 is expected to score 0.
        rule = Rule(text_file='r1', license_expression='apache-2.0')
        rule.relevance = 0
        rule.length = 6

        query_span = Span(0, 2)
        index_span = Span(0, 2)
        match = LicenseMatch(rule=rule, qspan=query_span, ispan=index_span)
        assert match.score() == 0
Example #9
0
    def test_LicenseMatch_small(self):
        # All matches but the last one are expected to report small() as true.
        small_text = u'licensed under the GPL, licensed under the GPL distribute extent of law'
        small_rule = Rule(text_file='small_rule', licenses=['apache-1.1'], _text=small_text)

        long_text = u'licensed under the GPL, licensed under the GPL re distribute extent of law' * 10
        long_rule = Rule(text_file='long_rule', licenses=['apache-1.1'], _text=long_text)

        # The index is built from both rules but not used directly afterwards.
        _idx = index.LicenseIndex([small_rule, long_rule])

        assert LicenseMatch(
            rule=small_rule,
            qspan=Span(0, 10),
            ispan=Span(0, 10),
            hispan=Span(12),
        ).small()

        assert LicenseMatch(
            rule=small_rule,
            qspan=Span(0, 10),
            ispan=Span(0, 10),
            hispan=Span(11, 12),
        ).small()

        assert LicenseMatch(
            rule=small_rule,
            qspan=Span(10, 11, 12),
            ispan=Span(10, 11, 12),
            hispan=Span(11, 12),
        ).small()

        assert LicenseMatch(
            rule=small_rule,
            qspan=Span(1, 6),
            ispan=Span(1, 6),
        ).small()

        assert LicenseMatch(
            rule=long_rule,
            qspan=Span(0, 10),
            ispan=Span(0, 10),
            hispan=Span(12),
        ).small()

        assert LicenseMatch(
            rule=long_rule,
            qspan=Span(5, 10),
            ispan=Span(5, 10),
            hispan=Span(5, 6),
        ).small()

        # The only case expected not to be small.
        assert not LicenseMatch(
            rule=small_rule,
            qspan=Span(1, 10),
            ispan=Span(1, 10),
            hispan=Span(3, 6),
        ).small()
Example #10
0
    def test_LicenseMatch_score_100_contiguous(self):
        # Spans 0-41 on both sides of a rule with length 42 and relevance
        # 100: the expected score is 100.
        rule = Rule(text_file='r1', license_expression='apache-2.0')
        rule.relevance = 100
        rule.length = 42

        query_span = Span(0, 41)
        index_span = Span(0, 41)
        match = LicenseMatch(rule=rule, qspan=query_span, ispan=index_span)
        assert match.score() == 100
    def test_overlap_detection5(self):
        #  test this containment relationship between test and index licenses:
        #   * Index licenses:
        #   +-license 2 --------+
        #   |  +-license 1 --+  |
        #   +-------------------+
        #
        #   +-license 4 --------+
        #   |  +-license 1 --+  |
        #   +-------------------+

        # index setup: two rules, the second text contains the first
        license1 = '''Redistribution and use permitted for MIT license.'''

        license2 = '''Redistributions of source must retain copyright.
        Redistribution and use permitted for MIT license.
        Redistributions in binary form is permitted.'''

        inner_rule = Rule(_text=license1, licenses=['overlap'])
        outer_rule = Rule(_text=license2, licenses=['overlap'])
        idx = index.LicenseIndex([inner_rule, outer_rule])

        query_text = '''My source.
        Redistribution and use permitted for MIT license.
        My code.'''

        # the query contains license1: expect a single match to its rule
        matches = idx.match(query_string=query_text)
        assert 1 == len(matches)

        found = matches[0]
        assert inner_rule == found.rule
        matched_query_text, _matched_rule_text = get_texts(
            found, query_string=query_text, idx=idx)
        assert 'Redistribution and use permitted for MIT license' == matched_query_text
Example #12
0
    def test_LicenseMatch_score_100_non_contiguous(self):
        # The query span is made of two separate ranges (0-19 and 30-51)
        # while the index span is 0-41: the expected score is 80.77.
        rule = Rule(text_file='r1', licenses=['apache-2.0'])
        rule.relevance = 100
        rule.length = 42

        query_span = Span(0, 19) | Span(30, 51)
        index_span = Span(0, 41)
        match = LicenseMatch(rule=rule, qspan=query_span, ispan=index_span)
        assert match.score() == 80.77
    def test_overlap_detection2_exact(self):
        #  test this containment relationship between test and index licenses:
        #   * Index licenses:
        #   +-license 2 --------+
        #   |  +-license 1 --+  |
        #   +-------------------+

        # index setup: two rules, the second text contains the first
        license1 = '''Redistribution and use permitted.'''

        license2 = '''Redistributions of source must retain copyright.
        Redistribution and use permitted.
        Redistributions in binary form is permitted.'''

        inner_rule = Rule(stored_text=license1, license_expression='overlap')
        outer_rule = Rule(stored_text=license2, license_expression='overlap')
        idx = index.LicenseIndex([inner_rule, outer_rule])

        # the query is license1 with an extra "bla" word: expect a single
        # match to the license1 rule with "bla" bracketed in the matched text

        query_text = 'Redistribution and use bla permitted.'
        matches = idx.match(query_string=query_text)
        assert 1 == len(matches)
        found = matches[0]
        assert inner_rule == found.rule
        matched_query_text, _matched_rule_text = get_texts(
            found, query_string=query_text, idx=idx)
        assert 'Redistribution and use [bla] permitted' == matched_query_text
Example #14
0
    def test_LicenseMatch_score_25_with_stored_relvance(self):
        # NOTE(review): the test name says 25 but the assertion below
        # expects 50 -- confirm which one is intended.
        rule = Rule(text_file='r1', licenses=['apache-2.0'])
        rule.relevance = 50
        rule.length = 6

        query_span = Span(0, 2)
        index_span = Span(0, 2)
        match = LicenseMatch(rule=rule, qspan=query_span, ispan=index_span)
        # NB we do not have a query here
        assert match.score() == 50
Example #15
0
 def test_merge_should_not_merge_repeated_matches_out_of_sequence(self):
     # Three matches of the same rule share index span 0-7 but have
     # successive query spans: all three are expected back unmerged.
     gpl_rule = Rule(text_file='gpl-2.0_49.RULE', licenses=[u'gpl-2.0'])
     gpl_rule.rid = 2615
     first = LicenseMatch(
         rule=gpl_rule, matcher='chunk1', qspan=Span(0, 7), ispan=Span(0, 7))
     second = LicenseMatch(
         rule=gpl_rule, matcher='chunk2', qspan=Span(8, 15), ispan=Span(0, 7))
     third = LicenseMatch(
         rule=gpl_rule, matcher='chunk3', qspan=Span(16, 23), ispan=Span(0, 7))
     merged = merge_matches([first, second, third])
     assert [first, second, third] == merged
Example #16
0
    def test_merge_does_not_merge_overlapping_matches_of_different_rules_with_different_licensing(self):
        # Two matches with overlapping spans, from rules with different
        # licenses: both are expected back from merge_matches().
        gpl_rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        gpl2_rule = Rule(text_file='r2', licenses=['apache-2.0', 'gpl2'])

        gpl_match = LicenseMatch(rule=gpl_rule, qspan=Span(0, 5), ispan=Span(0, 5))
        gpl2_match = LicenseMatch(rule=gpl2_rule, qspan=Span(1, 6), ispan=Span(1, 6))

        candidates = [gpl_match, gpl2_match]
        assert [gpl_match, gpl2_match] == merge_matches(candidates)
Example #17
0
    def test_LicenseMatch_score_25_with_stored_relevance(self):
        # Spans 0-2 on a rule with length 6 and relevance 50: the expected
        # score is 25.
        rule = Rule(text_file='r1', license_expression='apache-2.0')
        rule.relevance = 50
        rule.length = 6

        query_span = Span(0, 2)
        index_span = Span(0, 2)
        match = LicenseMatch(rule=rule, qspan=query_span, ispan=index_span)
        # NB we do not have a query here
        assert match.score() == 25
Example #18
0
    def test_merge_does_not_merge_overlapping_matches_of_same_rules_if_in_sequence_with_gaps_for_long_match(self):
        # Two matches of the same rule with spans 1-10 and 14-20: the expected
        # result is one match whose spans are the union of both.
        rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        rule.length = 20
        head = LicenseMatch(rule=rule, qspan=Span(1, 10), ispan=Span(1, 10))
        tail = LicenseMatch(rule=rule, qspan=Span(14, 20), ispan=Span(14, 20))

        combined = LicenseMatch(
            rule=rule,
            qspan=Span(1, 10) | Span(14, 20),
            ispan=Span(1, 10) | Span(14, 20),
        )
        expected = [combined]
        merged = merge_matches([head, tail])
        assert expected == merged
Example #19
0
    def test_merge_does_not_merge_contained_matches_of_different_rules_with_same_licensing(self):
        # Two matches with identical spans, from two rules with the same
        # licenses: both are expected back from merge_matches().
        first_rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        second_rule = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])

        first = LicenseMatch(rule=first_rule, qspan=Span(1, 6), ispan=Span(1, 6))
        second = LicenseMatch(rule=second_rule, qspan=Span(1, 6), ispan=Span(1, 6))

        merged = merge_matches([first, second])
        # Compare sorted lists so that the order of the results is ignored.
        assert sorted([first, second]) == sorted(merged)
    def test_overlap_detection(self):
        #  test this containment relationship between test and index licenses:
        #   * Index licenses:
        #
        #   +-license 2 --------+
        #   |  +-license 1 --+  |
        #   |  +-------------+  |
        #   +-------------------+
        #
        #   * License texts to detect:
        #
        #   +- license 3 -----------+
        #   | +-license 2 --------+ |
        #   | |  +-license 1 --+  | |
        #   | |  +-------------+  | |
        #   | +-------------------+ |
        #   +-----------------------+
        #
        #   +-license 4 --------+
        #   |  +-license 1 --+  |
        #   |  +-------------+  |
        #   +-------------------+

        tf1 = self.get_test_loc('detect/overlap/license.txt')
        tf2 = self.get_test_loc('detect/overlap/license2.txt')
        tf3 = self.get_test_loc('detect/overlap/license3.txt')
        tf4 = self.get_test_loc('detect/overlap/license4.txt')

        # index setup: only licenses 1 and 2 are indexed as rules
        ftr1 = Rule(text_file=tf1, licenses=['overlap_license'])
        ftr2 = Rule(text_file=tf2, licenses=['overlap_license'])
        idx = detect.LicenseIndex([ftr1, ftr2])

        # (location to match, rule expected as the single match)
        scenarios = [
            # 1 contains nothing: return 1
            (tf1, ftr1),
            # 2 contains 1: return 2
            (tf2, ftr2),
            # 3 contains 2 that contains 1: return 2
            (tf3, ftr2),
            # 4 contains 1: return 1
            (tf4, ftr1),
        ]
        for location, expected_rule in scenarios:
            matches = idx.match(location)
            self.assertEqual(1, len(matches))
            found = matches[0]
            self.assertEqual(expected_rule, found.rule)
Example #21
0
    def test_files_does_filter_contained_matches_of_different_rules_with_same_licensing(self):
        # Two matches with identical spans, from two rules with the same
        # licenses: the second one is expected to be kept and the first one
        # discarded.
        first_rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        second_rule = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])

        first = LicenseMatch(rule=first_rule, qspan=Span(1, 6), ispan=Span(1, 6))
        second = LicenseMatch(rule=second_rule, qspan=Span(1, 6), ispan=Span(1, 6))

        kept, dropped = filter_contained_matches([first, second])
        assert [second] == kept
        assert [first] == dropped
Example #22
0
    def test_merge_does_not_merges_matches_with_same_spans_if_rules_are_different(self):
        # The two matches of the first rule (spans 0-2 and 1-6) are expected
        # to merge into a single 0-6 match; the 0-2 match of the second rule
        # is expected back as-is.
        first_rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        first_head = LicenseMatch(rule=first_rule, qspan=Span(0, 2), ispan=Span(0, 2))
        first_tail = LicenseMatch(rule=first_rule, qspan=Span(1, 6), ispan=Span(1, 6))

        second_rule = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])
        second = LicenseMatch(rule=second_rule, qspan=Span(0, 2), ispan=Span(0, 2))

        merged = merge_matches([first_head, second, first_tail])
        first_combined = LicenseMatch(rule=first_rule, qspan=Span(0, 6), ispan=Span(0, 6))
        # Compare sorted lists so that the order of the results is ignored.
        assert sorted([first_combined, second]) == sorted(merged)
    def test_match_is_same(self):
        # Two matches at the same query position, from rules listing the same
        # licenses in a different order: is_same() is expected to be true in
        # both directions.
        forward_rule = Rule(licenses=['apache-2.0', 'gpl'])
        forward_pos = analysis.Token(start=0, end=2)
        forward = LicenseMatch(rule=forward_rule, query_position=forward_pos)

        reversed_rule = Rule(licenses=['gpl', 'apache-2.0'])
        reversed_pos = analysis.Token(start=0, end=2)
        backward = LicenseMatch(rule=reversed_rule, query_position=reversed_pos)

        self.assertTrue(forward.is_same(backward))
        self.assertTrue(backward.is_same(forward))
Example #24
0
    def test_filter_prefers_longer_overlaping_matches(self):
        # Of three overlapping matches, only the one with the 1-8 spans is
        # expected to be kept; something must have been discarded.
        first_rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        second_rule = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])

        leading = LicenseMatch(rule=first_rule, qspan=Span(0, 5), ispan=Span(0, 5))
        shorter = LicenseMatch(rule=first_rule, qspan=Span(1, 6), ispan=Span(1, 6))
        longer = LicenseMatch(rule=second_rule, qspan=Span(1, 8), ispan=Span(1, 8))

        candidates = [leading, shorter, longer]
        kept, dropped = filter_contained_matches(candidates)
        assert [longer] == kept
        assert dropped
    def test_matches_with_same_span_are_kept_if_licenses_are_different(self):
        # Two matches at the same position with different licenses, plus a
        # third match: filter_matches() is expected to return all three.
        apache2_rule = Rule(licenses=['apache-2.0'])
        apache2_pos = analysis.Token(start=0, end=2)
        apache2_match = LicenseMatch(rule=apache2_rule, query_position=apache2_pos)

        apache11_rule = Rule(licenses=['apache-1.1'])
        apache11_pos = analysis.Token(start=0, end=2)
        apache11_match = LicenseMatch(rule=apache11_rule, query_position=apache11_pos)

        other_pos = analysis.Token(start=1, end=6)
        other_match = LicenseMatch(rule=apache2_rule, query_position=other_pos)

        candidates = [apache2_match, apache11_match, other_match]
        self.assertEqual(
            [apache2_match, apache11_match, other_match],
            detect.filter_matches(candidates))
Example #26
0
    def test_combine_raise_TypeError_for_matches_of_different_rules(self):
        # Combining two matches that come from different rules must raise a
        # TypeError.
        r1 = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
        r2 = Rule(text_file='r2', license_expression='apache-2.0 OR gpl2')

        m1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
        m2 = LicenseMatch(rule=r2, qspan=Span(1, 6), ispan=Span(1, 6))

        try:
            m1.combine(m2)
        except TypeError:
            pass
        else:
            # Without this branch the test would pass silently when combine()
            # does not raise at all.
            raise AssertionError('Should fail')
Example #27
0
    def test_filter_matches_filters_matches_with_medium_overlap_only_if_license_are_the_same(self):
        # Two overlapping matches of the same rule plus one match of a rule
        # with other licenses: the 3-11 match is expected to be discarded and
        # the other two kept.
        apache_rule = Rule(text_file='r1', licenses=['apache-1.1'])
        apache_wide = LicenseMatch(rule=apache_rule, qspan=Span(0, 10), ispan=Span(0, 10))
        apache_inner = LicenseMatch(rule=apache_rule, qspan=Span(3, 11), ispan=Span(3, 11))

        dual_rule = Rule(text_file='r2', licenses=['gpl', 'apache-2.0'])
        dual_match = LicenseMatch(rule=dual_rule, qspan=Span(7, 15), ispan=Span(7, 15))

        candidates = [apache_wide, apache_inner, dual_match]
        kept, dropped = filter_contained_matches(candidates)
        # Compare sorted lists so that the order of the results is ignored.
        assert sorted([apache_wide, dual_match]) == sorted(kept)
        assert dropped
Example #28
0
    def test_filter_does_filter_overlaping_matches_with_same_licensings(self):
        # Three overlapping matches from two rules with the same license
        # expression: only the 0-5 match is expected to be kept.
        first_rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
        second_rule = Rule(text_file='r2', license_expression='apache-2.0 OR gpl')

        leading = LicenseMatch(rule=first_rule, qspan=Span(0, 5), ispan=Span(0, 5))
        twin_a = LicenseMatch(rule=first_rule, qspan=Span(1, 6), ispan=Span(1, 6))
        twin_b = LicenseMatch(rule=second_rule, qspan=Span(1, 6), ispan=Span(1, 6))

        candidates = [leading, twin_a, twin_b]
        kept, dropped = filter_contained_matches(candidates)
        assert [leading] == kept
        assert dropped
Example #29
0
    def test_filter_filters_matches_with_same_spans_if_licenses_are_identical_but_rule_differ(self):
        # Two 0-2 matches from two rules with the same license, plus a 1-6
        # match: only the 1-6 match is expected to be kept.
        first_rule = Rule(text_file='r1', licenses=['apache-2.0'])
        first_short = LicenseMatch(rule=first_rule, qspan=Span(0, 2), ispan=Span(0, 2))
        first_long = LicenseMatch(rule=first_rule, qspan=Span(1, 6), ispan=Span(1, 6))

        second_rule = Rule(text_file='r2', licenses=['apache-2.0'])
        second_short = LicenseMatch(rule=second_rule, qspan=Span(0, 2), ispan=Span(0, 2))

        candidates = [first_short, second_short, first_long]
        kept, dropped = filter_contained_matches(candidates)

        assert [first_long] == kept
        assert dropped
Example #30
0
    def test_combine_matches_cannot_combine_matches_with_same_licensing_and_different_rules(self):
        # Combining matches of two different rules is expected to raise a
        # TypeError, even though both rules have the same licenses.
        first_rule = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
        second_rule = Rule(text_file='r2', licenses=['apache-2.0', 'gpl'])

        first = LicenseMatch(rule=first_rule, qspan=Span(0, 5), ispan=Span(0, 5))
        second = LicenseMatch(rule=second_rule, qspan=Span(1, 6), ispan=Span(1, 6))

        try:
            first.combine(second)
        except TypeError:
            pass
        else:
            # Reached only when combine() did not raise.
            self.fail('Should fail')
Example #31
0
    def test_merge_does_not_merges_matches_with_same_spans_if_licenses_are_the_same_but_have_different_licenses_ordering(
            self):
        # The two rules have the same licenses in a different order in their
        # expressions. The two matches of the first rule are expected to
        # merge into one 0-6 match; the match of the second rule is expected
        # back as-is.
        first_rule = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
        first_head = LicenseMatch(rule=first_rule, qspan=Span(0, 2), ispan=Span(0, 2))
        first_tail = LicenseMatch(rule=first_rule, qspan=Span(1, 6), ispan=Span(1, 6))

        second_rule = Rule(text_file='r2', license_expression='gpl OR apache-2.0')
        second = LicenseMatch(rule=second_rule, qspan=Span(0, 2), ispan=Span(0, 2))

        merged = merge_matches([first_head, second, first_tail])
        first_combined = LicenseMatch(
            rule=first_rule, qspan=Span(0, 6), ispan=Span(0, 6))
        # Compare sorted lists so that the order of the results is ignored.
        assert sorted([first_combined, second]) == sorted(merged)
Example #32
0
    def test_merge_does_not_merge_matches_with_same_spans_if_licenses_are_identical_but_rule_differ(
            self):
        # Two rules with the same license expression. The two matches of the
        # first rule are expected to merge into one 0-6 match; the match of
        # the second rule is expected back as-is.
        first_rule = Rule(text_file='r1', license_expression='apache-2.0')
        first_head = LicenseMatch(rule=first_rule, qspan=Span(0, 2), ispan=Span(0, 2))
        first_tail = LicenseMatch(rule=first_rule, qspan=Span(1, 6), ispan=Span(1, 6))

        second_rule = Rule(text_file='r2', license_expression='apache-2.0')
        second = LicenseMatch(rule=second_rule, qspan=Span(0, 2), ispan=Span(0, 2))

        merged = merge_matches([first_head, second, first_tail])
        first_combined = LicenseMatch(
            rule=first_rule, qspan=Span(0, 6), ispan=Span(0, 6))
        # Compare sorted lists so that the order of the results is ignored.
        assert sorted([first_combined, second]) == sorted(merged)
    def test_index_starters_with_inter_gap_equal_to_ngram_length(self):
        # Check the tokens and gaps of a rule text holding two {{...}}
        # sections, then the starters returned by index_starters() with an
        # ngram length of 4.
        test_text = '''I hereby abandon any{{SAX 2.0 (the)}}, and release all of {{the SAX 2.0 }}source code of his'''
        rule = Rule(_text=test_text, licenses=['public-domain'])
        rule_tokens = list(rule.tokens())
        expected_tokens = [
            'i', 'hereby', 'abandon', 'any',
            'and', 'release', 'all', 'of',
            'source', 'code', 'of', 'his',
        ]
        assert expected_tokens == rule_tokens

        gaps = rule.gaps
        assert {3, 7} == gaps

        starters = match_chunk.index_starters(rule_tokens, gaps, _ngram_length=4)
        expected_starters = [
            (('i', 'hereby', 'abandon', 'any'), 0),
            (('and', 'release', 'all', 'of'), 4),
            (('source', 'code', 'of', 'his'), 8)
        ]
        assert expected_starters == list(starters)
    def test_index_starters_with_multiple_gaps_and_short_start(self):
        # Check the gaps of a rule text holding many {{...}} sections, then
        # the starters returned by index_starters() with an ngram length of 4.
        test_text = """
        Copyright {{10 Copyright}}. 
        All 
        Rights 
        Reserved.
        Redistribution
        materials 
        provided
        The
        name {{5 Author}} 
        must 
        not 
        be 
        used 
        to 
        endorse
        or
        promote {{5 Author}}.
        For 
        written
         permission, 
         please 
         contact {{5 Author Contact}}.
        4. 
        Products 
        derived 
        from 
        this 
        Software 
        may 
        not 
        be 
        called {{5 Product}}
        nor 
        may {{5 Product}} 
        appear 
        in 
        their 
        names 
        without 
        prior {{10 Author}}
        is 
        a 
        registered 
        trademark 
        of {{5 Author}}.
        5. 
        Due 
        credit 
        should
        be 
        given 
        to {{10 Author and URL}}
        THIS 
        SOFTWARE 
        IS 
        PROVIDED 
        BY {{10 org}}
        ``AS 
        IS'' 
        AND 
        ANY 
        EXPRESSED
         OR
          IMPLIED 
         IN 
         NO 
         EVENT 
         SHALL {{5 Author}} 
         OR 
         ITS 
         CONTRIBUTORS 
         BE 
         LIABLE {{tail gap}}"""
        rule = Rule(_text=test_text, licenses=['public-domain'])
        rule_tokens = list(rule.tokens())

        gaps = rule.gaps

        expected_gaps = {0, 8, 16, 21, 31, 33, 39, 44, 51, 56, 67}
        assert expected_gaps == gaps

        starters = match_chunk.index_starters(rule_tokens, gaps, _ngram_length=4)
        expected_starters = [
            (('all', 'rights', 'reserved', 'redistribution'), 1),
            (('must', 'not', 'be', 'used'), 9),
            (('for', 'written', 'permission', 'please'), 17),
            (('4', 'products', 'derived', 'from'), 22),
            (('appear', 'in', 'their', 'names'), 34),
            (('is', 'a', 'registered', 'trademark'), 40),
            (('5', 'due', 'credit', 'should'), 45),
            (('this', 'software', 'is', 'provided'), 52),
            (('as', 'is', 'and', 'any'), 57),
            (('or', 'its', 'contributors', 'be'), 68)
        ]

        assert expected_starters == list(starters)