def test_index_starters_with_inter_gap_equal_to_ngram_length(self):
        """
        Check index_starters on a rule whose two gaps (after token
        positions 3 and 7) are exactly one ngram length (4) apart.
        """
        template = (
            'I hereby abandon any{{SAX 2.0 (the)}}, and release all of '
            '{{the SAX 2.0 }}source code of his'
        )
        public_domain_rule = Rule(_text=template, licenses=['public-domain'])

        # Tokens are materialized before reading rule.gaps, same as the
        # original ordering -- presumably tokenization is what populates
        # the gaps; TODO confirm against Rule.tokens().
        tokens = list(public_domain_rule.tokens())
        expected_tokens = (
            'i hereby abandon any and release all of source code of his'
        ).split()
        assert expected_tokens == tokens

        gap_positions = public_domain_rule.gaps
        assert {3, 7} == gap_positions

        expected_starters = [
            (tuple('i hereby abandon any'.split()), 0),
            (tuple('and release all of'.split()), 4),
            (tuple('source code of his'.split()), 8),
        ]
        starters = match_chunk.index_starters(
            tokens, gap_positions, _ngram_length=4)
        assert expected_starters == list(starters)
    def test_index_starters_with_multiple_gaps_and_short_start(self):
        """
        Check index_starters on a long rule with many gaps, where the
        first gap comes right after the very first token (position 0) so
        the first expected starter begins at position 1.
        """
        # NOTE: the template text below is whitespace-sensitive test data
        # (trailing spaces and uneven indents are kept as-is on purpose).
        template = """
        Copyright {{10 Copyright}}. 
        All 
        Rights 
        Reserved.
        Redistribution
        materials 
        provided
        The
        name {{5 Author}} 
        must 
        not 
        be 
        used 
        to 
        endorse
        or
        promote {{5 Author}}.
        For 
        written
         permission, 
         please 
         contact {{5 Author Contact}}.
        4. 
        Products 
        derived 
        from 
        this 
        Software 
        may 
        not 
        be 
        called {{5 Product}}
        nor 
        may {{5 Product}} 
        appear 
        in 
        their 
        names 
        without 
        prior {{10 Author}}
        is 
        a 
        registered 
        trademark 
        of {{5 Author}}.
        5. 
        Due 
        credit 
        should
        be 
        given 
        to {{10 Author and URL}}
        THIS 
        SOFTWARE 
        IS 
        PROVIDED 
        BY {{10 org}}
        ``AS 
        IS'' 
        AND 
        ANY 
        EXPRESSED
         OR
          IMPLIED 
         IN 
         NO 
         EVENT 
         SHALL {{5 Author}} 
         OR 
         ITS 
         CONTRIBUTORS 
         BE 
         LIABLE {{tail gap}}"""
        public_domain_rule = Rule(_text=template, licenses=['public-domain'])

        # Tokens are materialized before reading rule.gaps, same as the
        # original ordering -- presumably tokenization is what populates
        # the gaps; TODO confirm against Rule.tokens().
        tokens = list(public_domain_rule.tokens())
        gap_positions = public_domain_rule.gaps

        expected_gaps = {0, 8, 16, 21, 31, 33, 39, 44, 51, 56, 67}
        assert expected_gaps == gap_positions

        starters = match_chunk.index_starters(
            tokens, gap_positions, _ngram_length=4)

        # Each entry pairs the 4-token ngram with its start position.
        expected_starters = [
            (tuple(phrase.split()), position)
            for phrase, position in (
                ('all rights reserved redistribution', 1),
                ('must not be used', 9),
                ('for written permission please', 17),
                ('4 products derived from', 22),
                ('appear in their names', 34),
                ('is a registered trademark', 40),
                ('5 due credit should', 45),
                ('this software is provided', 52),
                ('as is and any', 57),
                ('or its contributors be', 68),
            )
        ]

        assert expected_starters == list(starters)