def test_Query_tokenize_from_string(self):
    """
    Tokenize a query string against a one-rule index and check the known
    tokens, the known/unknown interleaving and the single query run built.
    """
    bsd_text = 'Redistribution and use in source and binary forms with or without modification are permitted'
    idx = index.LicenseIndex([Rule(_text=bsd_text, licenses=['bsd'])])

    def as_strings(tids):
        # map token ids back to token strings; unknown positions stay None
        return [idx.tokens_by_tid[tid] if tid is not None else None for tid in tids]

    query_text = ''' The Redistribution and use in source and binary are permitted. Athena capital of Grece Paris and Athene Always'''
    query = Query(query_string=query_text, idx=idx, _test_mode=True)
    # tokens and runs are built explicitly from the per-line tokens
    query.tokenize_and_build_runs(query.tokens_by_line())

    # only the words that are known to the index
    known_only = [
        'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
        'are', 'permitted', 'and',
    ]
    assert known_only == as_strings(query.tokens)

    # the whole query: every unknown word shows up as None
    whole_query = [
        None,
        'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
        'are', 'permitted',
        None, None, None, None, None,
        'and',
        None, None,
    ]
    assert whole_query == as_strings(query.tokens_with_unknowns())

    # a single run covering the ten known positions 0..9
    assert 1 == len(query.query_runs)
    first_run = query.query_runs[0]
    assert 0 == first_run.start
    assert 9 == first_run.end
    assert 10 == len(first_run)

    run_known = [
        'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
        'are', 'permitted', 'and',
    ]
    assert run_known == as_strings(first_run.tokens)

    # for the run, the expected sequence stops at the last known token
    run_whole = [
        None,
        'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
        'are', 'permitted',
        None, None, None, None, None,
        'and',
    ]
    assert run_whole == as_strings(first_run.tokens_with_unknowns())
def test_match_with_templates_with_redundant_tokens_yield_single_exact_match(self):
    """
    Match a query holding the same phrase twice against a templated rule
    that repeats that phrase, and expect one match with 100% coverage.
    """
    # token positions in the rule text: 0 through 9, five per copy of the phrase
    _text = u'copyright reserved mit is license, {{}} copyright reserved mit is license'
    rule = models.Rule(licenses=['tst'], _text=_text)
    idx = index.LicenseIndex([rule])

    indexed_positions = {
        '_tst_73_0': {
            u'copyright': [0, 5],
            u'license': [4, 9],
            u'mit': [2, 7],
        },
    }
    assert indexed_positions == idx.to_dict()

    # word positions in the query: 0 through 14, unknown words included
    querys = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.'
    qry = Query(query_string=querys, idx=idx)

    def as_strings(tids):
        # convert token ids to token strings, None marks an unknown word
        return [idx.tokens_by_tid[tid] if tid is not None else None for tid in tids]

    query_words = [
        None, None,
        u'copyright', u'reserved', u'mit', u'is', u'license',
        u'is', None,
        u'copyright', u'reserved', u'mit', u'is', u'license',
        None,
    ]
    assert query_words == as_strings(qry.tokens_with_unknowns())

    matches = idx.match(query_string=querys)
    assert 1 == len(matches)
    match = matches[0]

    # the matched query span skips known position 5
    assert (Span(0, 4) | Span(6, 10)) == match.qspan
    assert Span(0, 9) == match.ispan
    assert 100 == match.coverage()

    qtext, itext = get_texts(match, query_string=querys, idx=idx)
    assert 'copyright reserved mit is license <is> [the] copyright reserved mit is license' == qtext
    assert 'copyright reserved mit is license copyright reserved mit is license' == itext
def test_match_with_templates_with_redundant_tokens_yield_single_exact_match(self):
    """
    A rule made of a phrase, a template gap and the same phrase again must
    yield exactly one full-coverage match for a query repeating the phrase.
    """
    # rule tokens are numbered 0..9 (two runs of five words)
    _text = u'copyright reserved mit is license, {{}} copyright reserved mit is license'
    licenses = ['tst']
    template_rule = models.Rule(licenses=licenses, _text=_text)
    idx = index.LicenseIndex([template_rule])

    expected_idx = {'_tst_73_0': {u'copyright': [0, 5], u'license': [4, 9], u'mit': [2, 7]}}
    actual_idx = idx.to_dict()
    assert expected_idx == actual_idx

    # query words are numbered 0..14, counting the unknown ones
    querys = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.'
    qry = Query(query_string=querys, idx=idx)

    def tokens_as_text(token_ids):
        # turn each token id into its string; unknowns are kept as None
        texts = []
        for token_id in token_ids:
            texts.append(None if token_id is None else idx.tokens_by_tid[token_id])
        return texts

    phrase = [u'copyright', u'reserved', u'mit', u'is', u'license']
    expected = [None, None] + phrase + [u'is', None] + phrase + [None]
    assert expected == tokens_as_text(qry.tokens_with_unknowns())

    result = idx.match(query_string=querys)
    assert 1 == len(result)

    only_match = result[0]
    expected_qspan = Span(0, 4) | Span(6, 10)
    assert expected_qspan == only_match.qspan
    assert Span(0, 9) == only_match.ispan
    assert 100 == only_match.coverage()

    qtext, itext = get_texts(only_match, query_string=querys, idx=idx)
    assert 'copyright reserved mit is license <is> [the] copyright reserved mit is license' == qtext
    assert 'copyright reserved mit is license copyright reserved mit is license' == itext
def test_QueryRun_does_not_end_with_None(self):
    """
    Check the two query runs built for a query with trailing unknown words:
    each run's tokens are known tokens only and each run ends on a known
    position.
    """
    rule_text = 'Redistribution and use in source and binary forms, with or without modification, are permitted'
    bsd_rule = Rule(stored_text=rule_text, license_expression='bsd')
    idx = index.LicenseIndex([bsd_rule])

    # NOTE(review): this literal sits on a single line in the source under
    # review; if the two-run split asserted below depends on line breaks
    # inside the query, they need to be restored -- confirm against upstream.
    querys = ''' The Redistribution and use in source and binary forms, with or without modification, are permitted. Always bar modification foo '''

    def as_strings(tids):
        # convert token ids to token strings, None marks an unknown word
        return [idx.tokens_by_tid[tid] if tid is not None else None for tid in tids]

    qry = Query(query_string=querys, idx=idx)

    with_unknowns = [
        None,
        'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
        'forms', 'with', 'or', 'without', 'modification', 'are', 'permitted',
        None, None,
        'modification',
        None,
    ]
    # dropping the None entries leaves exactly the known tokens
    knowns = [tok for tok in with_unknowns if tok]
    assert knowns == as_strings(qry.tokens)
    assert with_unknowns == as_strings(qry.tokens_with_unknowns())

    assert 2 == len(qry.query_runs)

    # first run: the fourteen known tokens at positions 0..13
    first_run = qry.query_runs[0]
    first_expected = [
        'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
        'forms', 'with', 'or', 'without', 'modification', 'are', 'permitted',
    ]
    assert first_expected == as_strings(first_run.tokens)
    assert 0 == first_run.start
    assert 13 == first_run.end

    # second run: the lone known token at position 14
    second_run = qry.query_runs[1]
    assert ['modification'] == as_strings(second_run.tokens)
    assert 14 == second_run.start
    assert 14 == second_run.end
def test_QueryRun_does_not_end_with_None(self):
    """
    Build a query with unknown words after the rule text and verify that
    it yields two runs whose start/end fall on known token positions.
    """
    rule_text = 'Redistribution and use in source and binary forms, with or without modification, are permitted'
    rules = [Rule(_text=rule_text, licenses=['bsd'])]
    idx = index.LicenseIndex(rules)

    # NOTE(review): the query literal appears on one line in the source
    # under review; the expectation of two runs presumably relies on line
    # breaks inside it -- verify against the upstream test.
    querys = ''' The Redistribution and use in source and binary forms, with or without modification, are permitted. Always bar modification foo '''

    def tokens_as_text(token_ids):
        # turn each token id into its string; unknowns are kept as None
        texts = []
        for token_id in token_ids:
            texts.append(None if token_id is None else idx.tokens_by_tid[token_id])
        return texts

    qry = Query(query_string=querys, idx=idx)

    rule_words = [
        'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
        'forms', 'with', 'or', 'without', 'modification', 'are', 'permitted',
    ]
    expected = [None] + rule_words + [None, None, 'modification', None]

    # known tokens are the expected sequence without its None entries
    assert [x for x in expected if x] == tokens_as_text(qry.tokens)
    assert expected == tokens_as_text(qry.tokens_with_unknowns())

    runs = qry.query_runs
    assert 2 == len(runs)

    qrun = runs[0]
    assert rule_words == tokens_as_text(qrun.tokens)
    assert 0 == qrun.start
    assert 13 == qrun.end

    qrun = runs[1]
    assert ['modification'] == tokens_as_text(qrun.tokens)
    assert 14 == qrun.start
    assert 14 == qrun.end
def test_Query_known_and_unknown_positions(self):
    """
    Check how a Query records known token positions and the counts of
    unknown words around them, for an index built with a custom legalese set.
    """
    rule_text = 'Redistribution and use in source and binary forms'
    rule = Rule(stored_text=rule_text, license_expression='bsd')
    # NOTE(review): 'form' is not a word of the rule text; it is presumably
    # a known token only because it is part of this legalese set -- confirm.
    legalese = {
        'redistribution',
        'form',
    }
    idx = index.LicenseIndex([rule], _legalese=legalese)

    querys = 'The new Redistribution and use in other form always'
    qry = Query(query_string=querys, idx=idx, _test_mode=False)

    # there are five known positions in this query, all on line 1:
    # "Redistribution and use in" followed later by "form"
    assert [1, 1, 1, 1, 1] == qry.line_by_pos

    # the token ids of these five known tokens, in query order
    assert [1, 2, 3, 4, 0] == qry.tokens

    # unknown words counted by the known position they follow: two before
    # the first known token (key -1: "The new"), one after position 3
    # ("other") and one after position 4 ("always")
    assert {3: 1, 4: 1, -1: 2} == qry.unknowns_by_pos

    # This shows how knowns and unknowns are blended
    result = list(qry.tokens_with_unknowns())
    expected = [
        # The new
        None, None,
        # Redistribution
        1,
        # and
        2,
        # use
        3,
        # in
        4,
        # other (unknown), form (known), always (unknown)
        None, 0, None,
    ]
    assert expected == result
def test_Query_known_and_unknown_positions(self):
    """
    Check the known token positions of a Query and the counts of unknown
    words recorded before and after them.
    """
    rule_text = 'Redistribution and use in source and binary forms'
    bsd_rule = Rule(stored_text=rule_text, license_expression='bsd')
    idx = index.LicenseIndex([bsd_rule])

    querys = 'The new Redistribution and use in other form always'
    qry = Query(query_string=querys, idx=idx, _test_mode=False)

    # four known positions, all on line 1: "Redistribution and use in"
    known_lines = [1, 1, 1, 1]
    assert known_lines == qry.line_by_pos

    # token ids of these four known tokens, in query order
    known_tids = [6, 0, 3, 5]
    assert known_tids == qry.tokens

    # two unknown words come before the first known token (key -1) and
    # three unknown words trail the known token "in" at position 3
    unknown_counts = {-1: 2, 3: 3}
    assert unknown_counts == qry.unknowns_by_pos

    # knowns and unknowns blended back together in query order
    blended = list(qry.tokens_with_unknowns())
    expected = (
        [None, None]            # The new
        + [6, 0, 3, 5]          # Redistribution and use in
        + [None, None, None]    # other form always
    )
    assert blended == expected
def test_match_with_templates_with_redundant_tokens_yield_single_exact_match(self):
    """
    Match a query repeating a phrase against a templated rule that holds
    the same phrase twice; a single match with 100% coverage is expected.
    """
    # token positions in the rule text: 0 through 9
    _stored_text = u'copyright reserved mit is license, {{}} copyright reserved mit is license'
    rule = models.Rule(license_expression='tst', stored_text=_stored_text)
    idx = MiniLicenseIndex([rule])

    # word positions in the query: 0 through 14, unknown words included
    querys = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.'
    qry = Query(query_string=querys, idx=idx)

    def as_strings(tids):
        # convert token ids to token strings, None marks an unknown word
        return [idx.tokens_by_tid[tid] if tid is not None else None for tid in tids]

    query_words = [
        None, None,
        u'copyright', u'reserved', u'mit', u'is', u'license',
        u'is', None,
        u'copyright', u'reserved', u'mit', u'is', u'license',
        None,
    ]
    assert as_strings(qry.tokens_with_unknowns()) == query_words

    matches = idx.match(query_string=querys)
    assert len(matches) == 1
    match = matches[0]

    # the matched query span skips known position 5
    assert match.qspan == (Span(0, 4) | Span(6, 10))
    assert match.ispan == Span(0, 9)
    assert match.coverage() == 100

    qtext, itext = get_texts(match)
    assert qtext == 'copyright reserved mit is license [is] [the] copyright reserved mit is license'
    assert itext == 'copyright reserved mit is license copyright reserved mit is license'