def test_index_starters_with_inter_gap_equal_to_ngram_length(self):
    # Rule text with two `{{...}}` sections. The expected token list below
    # contains none of the words written inside those sections, and the
    # expected gaps are {3, 7}, so each gap-free run is exactly four tokens
    # long -- the same as the n-gram length passed to index_starters().
    rule_text = '''I hereby abandon any{{SAX 2.0 (the)}}, and release all of {{the SAX 2.0 }}source code of his'''
    rule = Rule(_text=rule_text, licenses=['public-domain'])

    # Tokens are materialized before rule.gaps is read, as in the original
    # test. NOTE(review): presumably gaps are populated while tokenizing --
    # confirm before reordering these statements.
    tokens = list(rule.tokens())
    expected_tokens = [
        'i', 'hereby', 'abandon', 'any',
        'and', 'release', 'all', 'of',
        'source', 'code', 'of', 'his',
    ]
    assert expected_tokens == tokens

    rule_gaps = rule.gaps
    assert {3, 7} == rule_gaps

    starters = match_chunk.index_starters(tokens, rule_gaps, _ngram_length=4)

    # One (4-gram, start position) pair per gap-free run: 0, 4 and 8.
    expected_starters = [
        (('i', 'hereby', 'abandon', 'any'), 0),
        (('and', 'release', 'all', 'of'), 4),
        (('source', 'code', 'of', 'his'), 8),
    ]
    assert expected_starters == list(starters)
def test_index_starters_with_multiple_gaps_and_short_start(self):
    # Rule text containing many `{{...}}` sections, the first of which comes
    # right after the very first word. Checks the gaps reported by the rule
    # and the (4-gram, start position) pairs yielded by index_starters().
    rule_text = """ Copyright {{10 Copyright}}. All Rights Reserved. Redistribution materials provided The name {{5 Author}} must not be used to endorse or promote {{5 Author}}. For written permission, please contact {{5 Author Contact}}. 4. Products derived from this Software may not be called {{5 Product}} nor may {{5 Product}} appear in their names without prior {{10 Author}} is a registered trademark of {{5 Author}}. 5. Due credit should be given to {{10 Author and URL}} THIS SOFTWARE IS PROVIDED BY {{10 org}} ``AS IS'' AND ANY EXPRESSED OR IMPLIED IN NO EVENT SHALL {{5 Author}} OR ITS CONTRIBUTORS BE LIABLE {{tail gap}}"""
    rule = Rule(_text=rule_text, licenses=['public-domain'])

    # Tokens are materialized before rule.gaps is read, as in the original
    # test. NOTE(review): presumably gaps are populated while tokenizing --
    # confirm before reordering these statements.
    tokens = list(rule.tokens())
    rule_gaps = rule.gaps
    assert {0, 8, 16, 21, 31, 33, 39, 44, 51, 56, 67} == rule_gaps

    starters = match_chunk.index_starters(tokens, rule_gaps, _ngram_length=4)

    # Every expected start position is one past a gap value. No starter is
    # expected at position 0 or after gap 31 -- presumably because fewer
    # than four tokens are available there before the next gap (verify
    # against index_starters).
    expected_starters = [
        (('all', 'rights', 'reserved', 'redistribution'), 1),
        (('must', 'not', 'be', 'used'), 9),
        (('for', 'written', 'permission', 'please'), 17),
        (('4', 'products', 'derived', 'from'), 22),
        (('appear', 'in', 'their', 'names'), 34),
        (('is', 'a', 'registered', 'trademark'), 40),
        (('5', 'due', 'credit', 'should'), 45),
        (('this', 'software', 'is', 'provided'), 52),
        (('as', 'is', 'and', 'any'), 57),
        (('or', 'its', 'contributors', 'be'), 68),
    ]
    assert expected_starters == list(starters)