def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contains one of more blocks such as this:

    \b
            ----------------------------------------
            license_expression: lgpl-2.1
            relevance: 100
            is_license_notice: yes
            ---
            This program is free software; you can redistribute it and/or modify
            it under the terms of the GNU Lesser General Public License
            version 2.1 as published by the Free Software Foundation;
            ----------------------------------------
    """
    parsed_rules = load_data(licenses_file)
    known_tokens = all_rule_tokens()

    licenses = cache.get_licenses_db()
    licensing = Licensing(licenses.values())

    print()
    problems = validate_license_rules(parsed_rules, licensing)
    if problems:
        # Report every validation error, then abort before writing anything.
        print('Invalid rules: exiting....')
        for problem in problems:
            print(problem)
            print()

        raise Exception('Invalid rules: exiting....')

    print()
    for entry in parsed_rules:
        negative = entry.data.get('is_negative')
        false_positive = entry.data.get('is_false_positive')

        duplicate = rule_exists(entry.text)
        if duplicate and not negative:
            print(
                'Skipping existing non-negative rule:', duplicate,
                'with text:\n', entry.text[:50].strip(), '...')
            continue

        # The base file name comes from the normalized license expression,
        # except for negative rules which share a fixed name.
        if not negative:
            expression = entry.data.get('license_expression')
            base_name = str(
                licensing.parse(expression, validate=True, simple=True))
            if false_positive:
                base_name = 'false-positive_' + base_name
        else:
            base_name = 'not-a-license'

        base_loc = find_rule_base_loc(base_name)
        data_file = base_loc + '.yml'
        text_file = base_loc + '.RULE'

        with io.open(data_file, 'w', encoding='utf-8') as out:
            out.write(entry.raw_data)

        with io.open(text_file, 'w', encoding='utf-8') as out:
            out.write(entry.text)

        rulerec = models.Rule(data_file=data_file, text_file=text_file)
        tokens = tuple(rulerec.tokens())

        if tokens not in known_tokens:
            known_tokens.add(tokens)
            rulerec.dump()
            models.update_ignorables(rulerec, verbose=False)
            print(
                'Rule added:',
                'file://' + rulerec.data_file,
                '\n',
                'file://' + rulerec.text_file,
            )
            continue

        # Same token sequence as a known rule: remove the files just written.
        os.remove(text_file)
        os.remove(data_file)
        print('Skipping already added rule with text for:', base_name)
def test_compute_relevance_is_using_rule_length(self):
    # Check that set_relevance() derives the relevance from rule.length when
    # has_stored_relevance is False.
    rule = models.Rule(stored_text='1', license_expression='some-license')
    rule.relevance = 13
    rule.has_stored_relevance = False
    rule.is_false_positive = False

    # (rule length, expected relevance), checked in this order.
    relevance_by_length = (
        (1000, 100),
        (21, 100),
        (20, 100),
        (18, 100),
        (17, 94),
        (16, 88),
        (15, 83),
        (14, 77),
        (13, 72),
        (12, 66),
        (11, 61),
        (10, 55),
        (8, 44),
        (5, 27),
        (2, 11),
        (1, 5),
        (0, 0),
    )
    for length, expected_relevance in relevance_by_length:
        rule.length = length
        rule.set_relevance()
        assert rule.relevance == expected_relevance
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contains one of more blocks such as this:

    \b
            ----------------------------------------
            license_expression: lgpl-2.1
            relevance: 100
            is_license_notice: yes
            ---
            This program is free software; you can redistribute it and/or modify
            it under the terms of the GNU Lesser General Public License
            version 2.1 as published by the Free Software Foundation;
            ----------------------------------------
    """
    loaded = load_data(licenses_file)
    rules_tokens = all_rule_tokens()
    licenses_by_key = cache.get_licenses_db()

    # Build lightweight rules first so the whole batch can be validated
    # before any rule is dumped.
    skinny_rules = []
    for item in loaded:
        attributes = item.data
        attributes['has_stored_relevance'] = bool(attributes.get('relevance'))
        attributes['has_stored_minimum_coverage'] = bool(
            attributes.get('minimum_coverage'))

        basic = models.BasicRule(**attributes)
        basic.stored_text = item.text
        skinny_rules.append(basic)

    models.validate_rules(skinny_rules, licenses_by_key, with_text=True)

    print()
    for rule in skinny_rules:
        existing = rule_exists(rule.text())
        if existing:
            print(
                'Skipping existing rule:', existing,
                'with text:\n', rule.text()[:50].strip(), '...')
            continue

        base_name = (
            'false-positive' if rule.is_false_positive
            else 'license-intro' if rule.is_license_intro
            else rule.license_expression
        )
        base_loc = find_rule_base_loc(base_name)

        # Rebuild a full Rule from the basic rule data plus the stored fields.
        rd = rule.to_dict()
        rd.update(
            stored_text=rule.stored_text,
            has_stored_relevance=rule.has_stored_relevance,
            has_stored_minimum_coverage=rule.has_stored_minimum_coverage,
        )
        rulerec = models.Rule(**rd)
        rulerec.data_file = base_loc + '.yml'
        rulerec.text_file = base_loc + '.RULE'

        tokens = tuple(rulerec.tokens())
        if tokens in rules_tokens:
            print('Skipping already added rule with text for:', base_name)
            continue

        rules_tokens.add(tokens)
        rulerec.dump()
        models.update_ignorables(rulerec, verbose=False)
        print(
            'Rule added:',
            'file://' + rulerec.data_file,
            '\n',
            'file://' + rulerec.text_file,
        )
def test_match_to_indexed_template_with_few_tokens_around_gaps(self):
    # Was failing when a gap in a template starts very close to the start of
    # a rule tokens seq. We may still skip that, but we capture a large
    # match anyway.
    template_loc = self.get_test_loc('index/templates/idx.txt')
    rule = models.Rule(text_file=template_loc, licenses=['test'])
    idx = index.LicenseIndex([rule])

    query_loc = self.get_test_loc('index/templates/query.txt')
    matches = idx.match(location=query_loc)
    assert len(matches) == 1
    match = matches[0]

    # Expected texts are compared word by word, so only the word sequence of
    # these literals matters, not their line layout.
    expected_query_words = u"""
        All Rights Reserved Redistribution and use of this software and
        associated documentation Software with or without modification are
        permitted provided that the following conditions are met
        1 Redistributions of source code must retain copyright statements and
        notices Redistributions must also contain a copy of this document
        2 Redistributions in binary form must reproduce the above copyright
        notice this list of conditions and the following disclaimer in the
        documentation and or other materials provided with the distribution
        3 The name [groovy] must not be used to endorse or promote products
        derived from this Software without prior written permission of <The>
        [Codehaus] For written permission please contact [info] [codehaus]
        [org]
        4 Products derived from this Software may not be called [groovy] nor
        may [groovy] appear in their names without prior written permission
        of <The> [Codehaus] [groovy] is a registered trademark of <The>
        [Codehaus]
        5 Due credit should be given to <The> [Codehaus] [http] [groovy]
        [codehaus] [org] <THIS> <SOFTWARE> <IS> <PROVIDED> <BY> <THE>
        [CODEHAUS] AND CONTRIBUTORS AS IS AND ANY EXPRESSED OR IMPLIED
        WARRANTIES INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF
        MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED
        IN NO EVENT SHALL <THE> [CODEHAUS] OR ITS CONTRIBUTORS BE LIABLE FOR
        ANY DIRECT INDIRECT INCIDENTAL SPECIAL EXEMPLARY OR CONSEQUENTIAL
        DAMAGES INCLUDING BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS
        OR SERVICES LOSS OF USE DATA OR PROFITS OR BUSINESS INTERRUPTION
        HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY WHETHER IN CONTRACT
        STRICT LIABILITY OR TORT INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN
        ANY WAY OUT OF THE USE OF THIS SOFTWARE EVEN IF ADVISED OF THE
        [POSSIBILITY] <OF> [SUCH] DAMAGE
    """.split()

    expected_rule_words = u"""
        All Rights Reserved Redistribution and use of this software and
        associated documentation Software with or without modification are
        permitted provided that the following conditions are met
        1 Redistributions of source code must retain copyright statements and
        notices Redistributions must also contain a copy of this document
        2 Redistributions in binary form must reproduce the above copyright
        notice this list of conditions and the following disclaimer in the
        documentation and or other materials provided with the distribution
        3 The name must not be used to endorse or promote products derived
        from this Software without prior written permission of For written
        permission please contact
        4 Products derived from this Software may not be called nor may
        appear in their names without prior written permission of is a
        registered trademark of
        5 Due credit should be given to <THIS> <SOFTWARE> <IS> <PROVIDED>
        <BY> AND CONTRIBUTORS AS IS AND ANY EXPRESSED OR IMPLIED WARRANTIES
        INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF
        MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED
        IN NO EVENT SHALL OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT
        INDIRECT INCIDENTAL SPECIAL EXEMPLARY OR CONSEQUENTIAL DAMAGES
        INCLUDING BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR
        SERVICES LOSS OF USE DATA OR PROFITS OR BUSINESS INTERRUPTION HOWEVER
        CAUSED AND ON ANY THEORY OF LIABILITY WHETHER IN CONTRACT STRICT
        LIABILITY OR TORT INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY
        WAY OUT OF THE USE OF THIS SOFTWARE EVEN IF ADVISED OF THE DAMAGE
    """.split()

    qtext, itext = get_texts(match, location=query_loc, idx=idx)
    assert qtext.split() == expected_query_words
    assert itext.split() == expected_rule_words

    assert match.coverage() > 97
    assert match.matcher == match_seq.MATCH_SEQ
def test_index_structures(self):
    # Each entry: rule text, then the expected
    # (low_unique, high_unique, low_length, high_length) values.
    rule_specs = [
        (u'a one a two a three licensed.', 4, 1, 6, 1),
        (u'a four a five a six licensed.', 2, 3, 4, 3),
        (u'one two three four five gpl', 4, 2, 4, 2),
        (u'The rose is a rose mit', 3, 2, 3, 3),
        (u'The license is GPL', 3, 1, 3, 1),
        (u'The license is a GPL', 4, 1, 4, 1),
        (u'a license is a rose', 3, 1, 4, 1),
        (u'the gpl', 1, 1, 1, 1),
        (u'the mit', 1, 1, 1, 1),
        (u'the bsd', 1, 1, 1, 1),
        (u'the lgpl', 1, 1, 1, 1),
    ]

    idx = index.LicenseIndex()
    rules = []
    for spec in rule_specs:
        rules.append(models.Rule(_text=spec[0]))
    idx._add_rules(rules)

    assert idx.len_junk == 8

    expected_lengths = [spec[1:] for spec in rule_specs]
    actual_lengths = []
    for rule in rules:
        actual_lengths.append((
            rule.low_unique,
            rule.high_unique,
            rule.low_length,
            rule.high_length,
        ))
    assert actual_lengths == expected_lengths

    expected_dictionary = {
        u'a': 0,
        u'bsd': 15,
        u'five': 11,
        u'four': 5,
        u'gpl': 8,
        u'is': 2,
        u'lgpl': 13,
        u'license': 3,
        u'licensed': 10,
        u'mit': 12,
        u'one': 7,
        u'rose': 9,
        u'six': 14,
        u'the': 1,
        u'three': 4,
        u'two': 6,
    }
    assert idx.dictionary == expected_dictionary

    expected_tokens_by_tid = [
        u'a',
        u'the',
        u'is',
        u'license',
        u'three',
        u'four',
        u'two',
        u'one',
        u'gpl',
        u'rose',
        u'licensed',
        u'five',
        u'mit',
        u'lgpl',
        u'six',
        u'bsd',
    ]
    assert idx.tokens_by_tid == expected_tokens_by_tid

    expected_index = {
        '_tst_18_4': {u'gpl': [3]},
        '_tst_19_6': {u'rose': [4]},
        '_tst_20_5': {u'gpl': [4]},
        '_tst_22_3': {u'mit': [5], u'rose': [1, 4]},
        '_tst_27_2': {u'five': [4], u'gpl': [5]},
        '_tst_29_0': {u'licensed': [6]},
        '_tst_29_1': {u'five': [3], u'licensed': [6], u'six': [5]},
        '_tst_7_7': {u'gpl': [1]},
        '_tst_7_8': {u'mit': [1]},
        '_tst_7_9': {u'bsd': [1]},
        '_tst_8_10': {u'lgpl': [1]},
    }
    assert idx.to_dict() == expected_index
def test_compute_relevance_is_using_rule_length(self):
    # Check that compute_relevance() derives the relevance from rule.length
    # when has_stored_relevance is False.
    rule = models.Rule(stored_text='1', license_expression='some-license')
    rule.relevance = 13
    rule.has_stored_relevance = False
    rule.is_false_positive = False

    # Maps each rule length to its expected relevance, checked in this order.
    cases = [
        (1000, 100),
        (21, 100),
        (20, 100),
        (18, 100),
        (17, 94),
        (16, 88),
        (15, 83),
        (14, 77),
        (13, 72),
        (12, 66),
        (11, 61),
        (10, 55),
        (8, 44),
        (5, 27),
        (2, 11),
        (1, 5),
        (0, 0),
    ]
    for rule_length, wanted in cases:
        rule.length = rule_length
        rule.compute_relevance()
        assert wanted == rule.relevance
def test_negative(self):
    # A rule is negative when it has no licenses: none given or an empty list.
    without_licenses = models.Rule(_text='test_text')
    assert without_licenses.negative()

    with_license = models.Rule(_text='test_text', licenses=['mylicense'])
    assert not with_license.negative()

    with_empty_licenses = models.Rule(_text='test_text', licenses=[])
    assert with_empty_licenses.negative()
def test_index_structures_with__add_rules(self):
    # Index one rule per file of the test directory, in sorted file name order.
    base = self.get_test_loc('index/tokens_count')
    idx = MiniLicenseIndex()

    rules = [
        models.Rule(
            text_file=os.path.join(base, file_name),
            license_expression='gpl-2.0',
        )
        for file_name in sorted(os.listdir(base))
    ]
    idx._add_rules(rules, _legalese=mini_legalese)

    assert idx.len_legalese == 40

    # Only tokens with an id at or above len_legalese are checked below.
    non_legalese = [
        'all',
        'allowed',
        'and',
        'any',
        'for',
        'is',
        'redistribution',
        'thing',
        'yes',
    ]

    dictionary_tokens = set()
    for token, tid in idx.dictionary.items():
        if tid >= idx.len_legalese:
            dictionary_tokens.add(token)
    assert dictionary_tokens == set(non_legalese)

    tokens_above_legalese = [
        token for tid, token in enumerate(idx.tokens_by_tid)
        if tid >= idx.len_legalese
    ]
    assert sorted(tokens_above_legalese) == sorted(non_legalese)

    expected_msets_by_rid = [
        {u'redistribution': 1},
        {u'is': 1, u'redistribution': 1, u'yes': 1},
        {u'allowed': 1, u'is': 1, u'redistribution': 1, u'yes': 1},
        {u'allowed': 1, u'for': 1, u'is': 1, u'redistribution': 1, u'yes': 1},
        {u'all': 1, u'allowed': 1, u'for': 1, u'is': 1, u'redistribution': 1},
        {
            u'all': 1,
            u'allowed': 1,
            u'and': 1,
            u'any': 1,
            u'is': 1,
            u'redistribution': 1,
            u'thing': 1,
        },
        {u'is': 1, u'redistribution': 1},
        {u'allowed': 1, u'is': 1, u'redistribution': 1},
        {u'allowed': 1, u'for': 1, u'is': 1, u'redistribution': 1},
        {u'all': 1, u'allowed': 1, u'is': 1, u'redistribution': 1, u'yes': 1},
        {u'all': 1, u'allowed': 1, u'and': 1, u'is': 1, u'redistribution': 1},
        {u'all': 1, u'allowed': 1, u'is': 1, u'redistribution': 1},
        {
            u'all': 1,
            u'allowed': 1,
            u'and': 1,
            u'any': 1,
            u'is': 1,
            u'redistribution': 1,
        },
    ]

    # Translate token ids back to token strings for a readable comparison.
    msets_by_token = []
    for tids_mset in idx.msets_by_rid:
        msets_by_token.append({
            idx.tokens_by_tid[tid]: freq for tid, freq in tids_mset.items()
        })
    assert msets_by_token == expected_msets_by_rid
def test_create_template_rule(self):
    # The {{}} marker yields no token and the rule length is the token count.
    template_rule = models.Rule(stored_text='A one. A {{}}two. A three.')
    tokens = list(template_rule.tokens())
    assert tokens == ['a', 'one', 'a', 'two', 'a', 'three']
    assert template_rule.length == 6
def test_rule_identifier_ignores_small_text_differences(self):
    # Texts differing only by letter case and whitespace share an identifier.
    plain_file = self.create_test_file('Some text')
    plain_rule = models.Rule(text_file=plain_file, template=False)

    spaced_file = self.create_test_file(' some \n text ')
    spaced_rule = models.Rule(text_file=spaced_file, template=False)

    plain_id = models.rule_identifier(plain_rule)
    spaced_id = models.rule_identifier(spaced_rule)
    assert plain_id == spaced_id
def test_rule_identifier_includes_structure(self):
    # Same text with a different license_choice gives a different identifier.
    no_choice_file = self.create_test_file('Some text')
    no_choice_rule = models.Rule(text_file=no_choice_file, license_choice=False)

    choice_file = self.create_test_file('Some text')
    choice_rule = models.Rule(text_file=choice_file, license_choice=True)

    no_choice_id = models.rule_identifier(no_choice_rule)
    choice_id = models.rule_identifier(choice_rule)
    assert no_choice_id != choice_id
def test_rule_identifier_includes_rule_type(self):
    # Same text with a different template flag gives a different identifier.
    template_file = self.create_test_file('Some text')
    template_rule = models.Rule(text_file=template_file, template=True)

    plain_file = self.create_test_file('Some text')
    plain_rule = models.Rule(text_file=plain_file, template=False)

    template_id = models.rule_identifier(template_rule)
    plain_id = models.rule_identifier(plain_rule)
    assert template_id != plain_id
def test_index_structures(self):
    # Each entry: rule text, then the expected
    # (length_unique, high_length_unique, length, high_length) tuple.
    rule_specs = [
        (u'a one a two a three licensed.', (4, 1, 4, 1)),
        (u'a four a five a six licensed.', (4, 1, 4, 1)),
        (u'one two three four five gpl', (6, 0, 6, 0)),
        (u'The rose is a rose mit', (4, 0, 5, 0)),
        (u'The license is GPL', (4, 1, 4, 1)),
        (u'The license is this GPL', (5, 1, 5, 1)),
        (u'a license is a rose', (3, 1, 3, 1)),
        (u'the gpl', (2, 0, 2, 0)),
        (u'the mit', (2, 0, 2, 0)),
        (u'the bsd', (2, 0, 2, 0)),
        (u'the lgpl', (2, 0, 2, 0)),
    ]

    idx = MiniLicenseIndex()
    rules = []
    for text, _lengths in rule_specs:
        rules.append(models.Rule(stored_text=text))
    idx._add_rules(rules, _legalese=mini_legalese)

    assert idx.len_legalese == 40

    expected_lengths = [lengths for _text, lengths in rule_specs]
    actual_lengths = []
    for rule in rules:
        actual_lengths.append((
            rule.length_unique,
            rule.high_length_unique,
            rule.length,
            rule.high_length,
        ))
    assert actual_lengths == expected_lengths

    # Only tokens with an id at or above len_legalese are checked below.
    expected_tokens = set([
        'bsd',
        'five',
        'four',
        'gpl',
        'is',
        'lgpl',
        'mit',
        'one',
        'rose',
        'six',
        'the',
        'this',
        'three',
        'two',
    ])
    dictionary_tokens = set()
    for token, tid in idx.dictionary.items():
        if tid >= idx.len_legalese:
            dictionary_tokens.add(token)
    assert dictionary_tokens == expected_tokens

    expected_by_tid = sorted([
        'one',
        'two',
        'three',
        'four',
        'five',
        'six',
        'gpl',
        'the',
        'rose',
        'is',
        'mit',
        'this',
        'bsd',
        'lgpl',
    ])
    tokens_above_legalese = [
        token for tid, token in enumerate(idx.tokens_by_tid)
        if tid >= idx.len_legalese
    ]
    assert sorted(tokens_above_legalese) == expected_by_tid
def cli(licenses_file):
    """
    Create rules from a text file with delimited blocks of metadata and texts.

    As an example a file would contains one of more blocks such as this:

    \b
            ----------------------------------------
            license_expression: lgpl-2.1
            relevance: 100
            is_license_notice: yes
            ---
            This program is free software; you can redistribute it and/or modify
            it under the terms of the GNU Lesser General Public License
            version 2.1 as published by the Free Software Foundation;
            ----------------------------------------
    """

    def skip_message(base_name, existing_rule, skinny_text):
        # Build the message with f-strings only. Running str.format() on a
        # string that already embeds the rule text would re-parse any "{" or
        # "}" of that text as a format field and fail (or silently collapse
        # doubled braces).
        return (
            f'Skipping rule for: {base_name!r}, '
            f'dupe of: {existing_rule} '
            f'with text: {skinny_text!r}...'
        )

    rules_data = load_data(licenses_file)
    rule_by_tokens = all_rule_by_tokens()
    licenses_by_key = cache.get_licenses_db()

    # Build lightweight rules first so the whole batch is validated before
    # any rule is dumped.
    skinny_rules = []
    for rdata in rules_data:
        relevance = rdata.data.get('relevance')
        rdata.data['has_stored_relevance'] = bool(relevance)

        minimum_coverage = rdata.data.get('minimum_coverage')
        rdata.data['has_stored_minimum_coverage'] = bool(minimum_coverage)

        rl = models.BasicRule(**rdata.data)
        rl.stored_text = rdata.text
        skinny_rules.append(rl)

    models.validate_rules(skinny_rules, licenses_by_key, with_text=True)

    print()
    for rule in skinny_rules:
        if rule.is_false_positive:
            base_name = 'false-positive'
        elif rule.is_license_intro:
            base_name = 'license-intro'
        else:
            base_name = rule.license_expression

        text = rule.text()
        # Short, whitespace-normalized excerpt used in the skip messages.
        skinny_text = ' '.join(text[:80].split())

        # First duplicate check: rule_exists() on the rule text.
        existing_rule = rule_exists(text)
        if existing_rule:
            print(skip_message(base_name, existing_rule, skinny_text))
            continue

        base_loc = find_rule_base_loc(base_name)

        rd = rule.to_dict()
        rd['stored_text'] = rule.stored_text
        rd['has_stored_relevance'] = rule.has_stored_relevance
        rd['has_stored_minimum_coverage'] = rule.has_stored_minimum_coverage
        rulerec = models.Rule(**rd)

        # force recomputing relevance to remove junk stored relevance for long rules
        rulerec.set_relevance()

        rulerec.data_file = base_loc + '.yml'
        rulerec.text_file = base_loc + '.RULE'

        # Second duplicate check: same token sequence as an already known rule
        # or as a rule added earlier in this run.
        rule_tokens = tuple(rulerec.tokens())
        existing_rule = rule_by_tokens.get(rule_tokens)
        if existing_rule:
            print(skip_message(base_name, existing_rule, skinny_text))
            continue

        print(f'Adding new rule: {base_name}')
        print('  file://' + rulerec.data_file)
        print('  file://' + rulerec.text_file,)

        # NOTE(review): dump() runs both before and after update_ignorables(),
        # as in the original; presumably update_ignorables() works from the
        # saved files and then changes rulerec - confirm before simplifying.
        rulerec.dump()
        models.update_ignorables(rulerec, verbose=False)
        rulerec.dump()

        rule_by_tokens[rule_tokens] = base_name
def test_create_rule_ignore_punctuation(self):
    # Punctuation and the {{}} marker yield no tokens; here the "a" words are
    # not expected among the tokens either.
    punctuated_rule = models.Rule(stored_text='A one. A {{}}two. A three.')
    tokens = list(punctuated_rule.tokens())
    assert tokens == ['one', 'two', 'three']
    assert punctuated_rule.length == 3
def test_index_structures_with__add_rules(self):
    # Index one rule per file of the test directory, in sorted file name order.
    base = self.get_test_loc('index/tokens_count')
    idx = index.LicenseIndex()

    rules = [
        models.Rule(text_file=os.path.join(base, file_name))
        for file_name in sorted(os.listdir(base))
    ]
    idx._add_rules(rules)

    assert idx.len_junk == 4

    def with_token_strings(tids_msets):
        # Translate token ids back to token strings for readable comparisons.
        return [
            {idx.tokens_by_tid[tid]: freq for tid, freq in tids_mset.items()}
            for tids_mset in tids_msets
        ]

    expected_index = {
        'plain1_0': {u'redistribution': [0]},
        'plain2_1': {u'is': [1], u'redistribution': [0], u'yes': [2]},
        'plain3_2': {u'is': [1], u'redistribution': [0], u'yes': [3]},
        'plain4_3': {u'is': [1], u'redistribution': [0], u'yes': [4]},
        'plain5_4': {u'is': [1], u'redistribution': [0]},
        'tmpl10_5': {
            u'any': [8],
            u'is': [1],
            u'redistribution': [0],
            u'thing': [9],
        },
        'tmpl2_6': {u'is': [1], u'redistribution': [0]},
        'tmpl3_7': {u'is': [1], u'redistribution': [0]},
        'tmpl4_8': {u'is': [1], u'redistribution': [0]},
        'tmpl5_2_10': {u'is': [1], u'redistribution': [0], u'yes': [5]},
        'tmpl5_9': {u'is': [1, 2], u'redistribution': [0]},
        'tmpl6_11': {u'is': [1], u'redistribution': [0]},
        'tmpl7_12': {u'is': [1], u'redistribution': [0]},
        'tmpl8_13': {u'is': [1], u'redistribution': [0]},
        'tmpl9_14': {u'any': [8], u'is': [1], u'redistribution': [0]},
    }
    assert idx.to_dict() == expected_index

    expected_dictionary = {
        u'all': 1,
        u'allowed': 0,
        u'and': 3,
        u'any': 7,
        u'for': 2,
        u'is': 4,
        u'redistribution': 5,
        u'thing': 8,
        u'yes': 6,
    }
    assert idx.dictionary == expected_dictionary

    expected_tokens_by_tid = [
        u'allowed',
        u'all',
        u'for',
        u'and',
        u'is',
        u'redistribution',
        u'yes',
        u'any',
        u'thing',
    ]
    assert idx.tokens_by_tid == expected_tokens_by_tid

    # tids_msets_by_rid holds one (low, high) pair per rule.
    low_msets, high_msets = zip(*idx.tids_msets_by_rid)

    expected_high_msets = [
        {u'redistribution': 1},
        {u'is': 1, u'redistribution': 1, u'yes': 1},
        {u'is': 1, u'redistribution': 1, u'yes': 1},
        {u'is': 1, u'redistribution': 1, u'yes': 1},
        {u'is': 1, u'redistribution': 1},
        {u'any': 1, u'is': 1, u'redistribution': 1, u'thing': 1},
        {u'is': 1, u'redistribution': 1},
        {u'is': 1, u'redistribution': 1},
        {u'is': 1, u'redistribution': 1},
        {u'is': 2, u'redistribution': 1},
        {u'is': 1, u'redistribution': 1, u'yes': 1},
        {u'is': 1, u'redistribution': 1},
        {u'is': 1, u'redistribution': 1},
        {u'is': 1, u'redistribution': 1},
        {u'any': 1, u'is': 1, u'redistribution': 1},
    ]
    assert with_token_strings(high_msets) == expected_high_msets

    expected_low_msets = [
        {},
        {},
        {u'allowed': 1},
        {u'allowed': 1, u'for': 1},
        {u'all': 1, u'allowed': 1, u'for': 1},
        {u'all': 2, u'allowed': 1, u'and': 2, u'for': 1},
        {},
        {u'allowed': 1},
        {u'allowed': 1, u'for': 1},
        {u'allowed': 1, u'for': 1},
        {u'all': 1, u'allowed': 1, u'for': 1},
        {u'all': 1, u'allowed': 1, u'and': 1, u'for': 1},
        {u'all': 2, u'allowed': 1, u'and': 1, u'for': 1},
        {u'all': 2, u'allowed': 1, u'and': 2, u'for': 1},
        {u'all': 2, u'allowed': 1, u'and': 2, u'for': 1},
    ]
    assert with_token_strings(low_msets) == expected_low_msets
def test_rule_templates_are_ignored(self):
    # The braces are dropped but the words inside them are kept as tokens.
    templated_text = '''{{gap0}}zero one two three{{gap2}}'''
    rule = models.Rule(stored_text=templated_text)
    tokens = list(rule.tokens())
    assert tokens == ['gap0', 'zero', 'one', 'two', 'three', 'gap2']
def get_test_rules(self, base, subset=None):
    # Return one Rule per file found in the `base` test directory, in sorted
    # file name order. Each file name is used as the rule's single license
    # key. When `subset` is truthy, only file names contained in it are kept.
    rules_dir = self.get_test_loc(base)

    rules = []
    for license_key in sorted(os.listdir(rules_dir)):
        if subset and license_key not in subset:
            continue
        rule = models.Rule(
            text_file=os.path.join(rules_dir, license_key),
            licenses=[license_key],
        )
        rules.append(rule)
    return rules
def test_match_to_indexed_template_with_few_tokens_around_gaps(self):
    # Was failing when a gap in a template starts very close to the start of
    # a rule tokens seq. We may still skip that, but we capture a large
    # match anyway.
    template_loc = self.get_test_loc('index/templates/idx.txt')
    rule = models.Rule(text_file=template_loc, license_expression='test')

    extra_legalese = {
        'permission',
        'written',
        'registered',
        'derived',
        'damage',
        'due',
    }
    legalese = mini_legalese | extra_legalese
    idx = index.LicenseIndex([rule], _legalese=legalese)

    query_loc = self.get_test_loc('index/templates/query.txt')
    matches = idx.match(location=query_loc)
    assert len(matches) == 1
    match = matches[0]

    # Expected texts are compared word by word, so only the word sequence of
    # these literals matters, not their line layout.
    expected_query_words = u"""
        All Rights Reserved. Redistribution and use of this software and
        associated documentation ("Software"), with or without modification,
        are permitted provided that the following conditions are met:
        1. Redistributions of source code must retain copyright statements
        and notices. Redistributions must also contain a copy of this
        document.
        2. Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.
        3. The name "[groovy]" must not be used to endorse or promote
        products derived from this Software without prior written permission
        of [The] [Codehaus]. For written permission, please contact
        [info]@[codehaus].[org].
        4. Products derived from this Software may not be called "[groovy]"
        nor may "[groovy]" appear in their names without prior written
        permission of [The] [Codehaus]. "[groovy]" is a registered trademark
        of [The] [Codehaus].
        5. Due credit should be given to [The] [Codehaus] -
        [http]://[groovy].[codehaus].[org]/
        [THIS] [SOFTWARE] [IS] [PROVIDED] [BY] [THE] [CODEHAUS] AND
        CONTRIBUTORS ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
        INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
        MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
        IN NO EVENT SHALL [THE] [CODEHAUS] OR ITS CONTRIBUTORS BE LIABLE FOR
        ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
        CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
        SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
        BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
        WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
        OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
        EVEN IF ADVISED OF THE [POSSIBILITY] [OF] [SUCH] DAMAGE.
    """.split()

    expected_rule_words = u"""
        All Rights Reserved Redistribution and use of this software and
        associated documentation Software with or without modification are
        permitted provided that the following conditions are met
        1 Redistributions of source code must retain copyright statements and
        notices Redistributions must also contain copy of this document
        2 Redistributions in binary form must reproduce the above copyright
        notice this list of conditions and the following disclaimer in the
        documentation and or other materials provided with the distribution
        3 The name must not be used to endorse or promote products derived
        from this Software without prior written permission of For written
        permission please contact
        4 Products derived from this Software may not be called nor may
        appear in their names without prior written permission of is
        registered trademark of
        5 Due credit should be given to <THIS> <SOFTWARE> <IS> <PROVIDED>
        <BY> AND CONTRIBUTORS AS IS AND ANY EXPRESSED OR IMPLIED WARRANTIES
        INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF
        MERCHANTABILITY AND FITNESS FOR PARTICULAR PURPOSE ARE DISCLAIMED
        IN NO EVENT SHALL OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT
        INDIRECT INCIDENTAL SPECIAL EXEMPLARY OR CONSEQUENTIAL DAMAGES
        INCLUDING BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR
        SERVICES LOSS OF USE DATA OR PROFITS OR BUSINESS INTERRUPTION HOWEVER
        CAUSED AND ON ANY THEORY OF LIABILITY WHETHER IN CONTRACT STRICT
        LIABILITY OR TORT INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY
        WAY OUT OF THE USE OF THIS SOFTWARE EVEN IF ADVISED OF THE DAMAGE
    """.lower().split()

    qtext, itext = get_texts(match)
    assert qtext.split() == expected_query_words
    assert itext.split() == expected_rule_words

    assert match.coverage() > 97
    assert match.matcher == match_seq.MATCH_SEQ
def test_gaps_at_start_and_end_are_ignored(self):
    # The {{...}} parts at both ends of the text yield no tokens.
    gapped_text = '''{{gap0}}zero one two three{{gap2}}'''
    rule = models.Rule(_text=gapped_text)
    tokens = list(rule.tokens())
    assert tokens == ['zero', 'one', 'two', 'three']