def test_Query_tokens_by_line_from_string(self):
    """
    Check that Query.tokens_by_line() yields one sequence of token ids per
    query line, with None for tokens unknown to the index, and that
    Query.line_by_pos maps each known-token position to its 1-based line.
    """
    rule_text = (
        'Redistribution and use in source and binary forms with or '
        'without modification are permitted'
    )
    rule = Rule(stored_text=rule_text, license_expression='bsd')
    # minimal legalese vocabulary so token ids are small and stable
    legalese = set([
        'redistribution',
        'form',
    ])
    idx = index.LicenseIndex([rule], _legalese=legalese)
    # NOTE(review): the line breaks of this triple-quoted string were
    # reconstructed from the 7-entry per-line expectations asserted below
    # (one entry per query line, [] for blank lines) — confirm against the
    # original test if available.
    querys = '''
The
Redistribution and use in source and binary are permitted

Athena capital of Grece
Paris and Athene
Always'''
    qry = Query(query_string=querys, idx=idx, _test_mode=True)
    result = list(qry.tokens_by_line())
    expected = [
        [],
        [None],
        [1, 2, 3, 4, 5, 2, 6, 12, 13],
        [],
        [None, None, None, None],
        [None, 2, None],
        [None],
    ]
    assert expected == result

    # convert tid to actual token strings
    qtbl_as_str = lambda qtbl: [[
        None if tid is None else idx.tokens_by_tid[tid] for tid in tids
    ] for tids in qtbl]

    result_str = qtbl_as_str(result)
    expected_str = [
        [],
        [None],
        [
            'redistribution', 'and', 'use', 'in', 'source', 'and',
            'binary', 'are', 'permitted'
        ],
        [],
        [None, None, None, None],
        [None, 'and', None],
        [None],
    ]
    assert expected_str == result_str

    # all nine known tokens are on line 3, the final 'and' is on line 6
    assert [3, 3, 3, 3, 3, 3, 3, 3, 3, 6] == qry.line_by_pos

    # a single-line query with mostly-unknown words still yields one row
    idx = index.LicenseIndex(
        [Rule(stored_text=rule_text, license_expression='bsd')])
    querys = 'and this is not a license'
    qry = Query(query_string=querys, idx=idx, _test_mode=True)
    result = list(qry.tokens_by_line())
    expected = [['and', None, None, None, 'license']]
    assert expected == qtbl_as_str(result)
def test_Query_tokens_by_line_from_string(self):
    """
    Check Query.tokens_by_line() per-line token ids (None for unknown
    tokens) and Query.line_by_pos, using the legacy Rule(_text=...,
    licenses=[...]) constructor.
    """
    rule_text = (
        'Redistribution and use in source and binary forms with or '
        'without modification are permitted'
    )
    idx = index.LicenseIndex([Rule(_text=rule_text, licenses=['bsd'])])
    # NOTE(review): the line breaks of this triple-quoted string were
    # reconstructed from the 7-entry per-line expectations asserted below
    # (one entry per query line, [] for blank lines) — confirm against the
    # original test if available.
    querys = '''
The
Redistribution and use in source and binary are permitted

Athena capital of Grece
Paris and Athene
Always'''
    qry = Query(query_string=querys, idx=idx, _test_mode=True)
    result = list(qry.tokens_by_line())
    expected = [
        [],
        [None],
        [11, 0, 6, 4, 3, 0, 1, 9, 2],
        [],
        [None, None, None, None],
        [None, 0, None],
        [None],
    ]
    assert expected == result

    # convert tid to actual token strings
    qtbl_as_str = lambda qtbl: [[
        None if tid is None else idx.tokens_by_tid[tid] for tid in tids
    ] for tids in qtbl]

    result_str = qtbl_as_str(result)
    expected_str = [
        [],
        [None],
        [
            'redistribution', 'and', 'use', 'in', 'source', 'and',
            'binary', 'are', 'permitted'
        ],
        [],
        [None, None, None, None],
        [None, 'and', None],
        [None],
    ]
    assert expected_str == result_str

    # all nine known tokens are on line 3, the final 'and' is on line 6
    assert [3, 3, 3, 3, 3, 3, 3, 3, 3, 6] == qry.line_by_pos

    # a single-line query with mostly-unknown words still yields one row
    idx = index.LicenseIndex([Rule(_text=rule_text, licenses=['bsd'])])
    querys = 'and this is not a license'
    qry = Query(query_string=querys, idx=idx, _test_mode=True)
    result = list(qry.tokens_by_line())
    expected = [['and', None, None, None, None, None]]
    assert expected == qtbl_as_str(result)
def test_Query_tokenize_from_string(self):
    """
    Check Query.tokenize_and_build_runs(): the query tokens, the
    tokens-with-unknowns view, and the single resulting QueryRun with its
    start/end positions and token sequences.
    """
    rule_text = (
        'Redistribution and use in source and binary forms with or '
        'without modification are permitted'
    )
    idx = index.LicenseIndex([Rule(_text=rule_text, licenses=['bsd'])])
    # NOTE(review): the line breaks of this triple-quoted string were
    # reconstructed during review of a whitespace-mangled copy — confirm
    # against the original test if available.
    querys = '''
The
Redistribution and use in source and binary are permitted.

Athena capital of Grece
Paris and Athene
Always'''
    qry = Query(query_string=querys, idx=idx, _test_mode=True)
    qry.tokenize_and_build_runs(qry.tokens_by_line())

    # convert tid to actual token strings
    tks_as_str = lambda tks: [
        None if tid is None else idx.tokens_by_tid[tid] for tid in tks
    ]

    expected = [
        'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
        'are', 'permitted', 'and'
    ]
    result = tks_as_str(qry.tokens)
    assert expected == result

    # with unknowns: None for 'the', the Greek-city words, and trailing words
    expected = [
        None, 'redistribution', 'and', 'use', 'in', 'source', 'and',
        'binary', 'are', 'permitted', None, None, None, None, None,
        'and', None, None
    ]
    result = tks_as_str(qry.tokens_with_unknowns())
    assert expected == result

    # all known tokens fall in one contiguous run of 10 positions
    assert 1 == len(qry.query_runs)
    qr1 = qry.query_runs[0]
    assert 0 == qr1.start
    assert 9 == qr1.end
    assert 10 == len(qr1)
    expected = [
        'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
        'are', 'permitted', 'and'
    ]
    result = tks_as_str(qr1.tokens)
    assert expected == result

    expected = [
        None, 'redistribution', 'and', 'use', 'in', 'source', 'and',
        'binary', 'are', 'permitted', None, None, None, None, None, 'and'
    ]
    result = tks_as_str(qr1.tokens_with_unknowns())
    assert expected == result
def test_Query_tokenize_from_string(self):
    """
    Check Query.tokenize_and_build_runs() using the module-level helpers
    tks_as_str/query_tokens_with_unknowns/query_run_tokens_with_unknowns:
    query tokens, tokens-with-unknowns, and the single resulting QueryRun.
    """
    rule_text = (
        'Redistribution and use in source and binary forms with or '
        'without modification are permitted'
    )
    idx = index.LicenseIndex(
        [Rule(stored_text=rule_text, license_expression='bsd')])
    # NOTE(review): the line breaks of this triple-quoted string were
    # reconstructed during review of a whitespace-mangled copy — confirm
    # against the original test if available.
    querys = '''
The
Redistribution and use in source and binary are permitted.

Athena capital of Grece
Paris and Athene
Always'''
    qry = Query(query_string=querys, idx=idx, _test_mode=True)
    tokens_by_line = list(qry.tokens_by_line(query_string=querys))
    qry.tokenize_and_build_runs(tokens_by_line)

    expected = [
        'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
        'are', 'permitted', 'and'
    ]
    result = tks_as_str(qry.tokens, idx=idx)
    assert result == expected

    # with unknowns: None for 'the', the Greek-city words, and trailing words
    expected = [
        None, 'redistribution', 'and', 'use', 'in', 'source', 'and',
        'binary', 'are', 'permitted', None, None, None, None, None,
        'and', None, None
    ]
    result = tks_as_str(query_tokens_with_unknowns(qry), idx=idx)
    assert result == expected

    # all known tokens fall in one contiguous run of 10 positions
    assert len(qry.query_runs) == 1
    qr1 = qry.query_runs[0]
    assert qr1.start == 0
    assert qr1.end == 9
    assert len(qr1) == 10
    expected = [
        'redistribution', 'and', 'use', 'in', 'source', 'and', 'binary',
        'are', 'permitted', 'and'
    ]
    result = tks_as_str(qr1.tokens, idx=idx)
    assert result == expected

    expected = [
        None, 'redistribution', 'and', 'use', 'in', 'source', 'and',
        'binary', 'are', 'permitted', None, None, None, None, None, 'and'
    ]
    result = tks_as_str(query_run_tokens_with_unknowns(qr1), idx=idx)
    assert result == expected
def test_Query_tokens_by_line_behaves_the_same_on_various_python_2(self):
    """
    Regression check: tokens_by_line() on a real EULA text file must match
    a stored JSON expectation, so results stay stable across environments.
    """
    location = self.get_test_loc('query/query_lines/yahoo-eula.txt')
    idx = cache.get_index()
    query = Query(location, idx=idx)
    tbl = list(query.tokens_by_line())
    # inject the actual token string for sanity
    tbt = idx.tokens_by_tid
    results = [
        [[i, i and tbt[i] or i] for i in line]
        for line in tbl
    ]
    expected = self.get_test_loc('query/query_lines/yahoo-eula.txt.json')
    check_result_equals_expected_json(results, expected, regen=False)
def test_match_does_not_change_query_unknown_positions(self):
    """
    Regression check: running index.match() must not mutate the query's
    unknowns_by_pos mapping as a side effect of matching.
    """
    from licensedcode.match import LicenseMatch
    from licensedcode.spans import Span

    location = self.get_test_loc('query/unknown_positions/lz4.license.txt')
    idx = cache.get_index()

    # build a query first
    qry1 = Query(location, idx=idx)
    # this has the side effect to populate the unknown
    txt = ' '.join(
        f'{i}-{idx.tokens_by_tid[t]}' for i, t in enumerate(qry1.tokens))
    assert txt == (
        '0-this 1-repository 2-uses 3-2 4-different 5-licenses '
        '6-all 7-files 8-in 9-the 10-lib 11-directory 12-use 13-bsd 14-2 15-clause 16-license '
        '17-all 18-other 19-files 20-use 21-gplv2 22-license 23-unless 24-explicitly 25-stated 26-otherwise '
        '27-relevant 28-license 29-is 30-reminded 31-at 32-the 33-top 34-of 35-each 36-source 37-file '
        '38-and 39-with 40-presence 41-of 42-copying 43-or 44-license 45-file 46-in 47-associated 48-directories '
        '49-this 50-model 51-is 52-selected 53-to 54-emphasize 55-that '
        '56-files 57-in 58-the 59-lib 60-directory 61-are 62-designed 63-to 64-be 65-included 66-into 67-3rd 68-party 69-applications '
        '70-while 71-all 72-other 73-files 74-in 75-programs 76-tests 77-or 78-examples '
        '79-receive 80-more 81-limited 82-attention 83-and 84-support 85-for 86-such 87-scenario'
    )
    list(qry1.tokens_by_line())
    assert qry1.unknowns_by_pos == {}

    # run matching
    matches = idx.match(location=location)
    match = matches[0]
    rule = [
        r for r in idx.rules_by_rid
        if r.identifier == 'bsd-simplified_and_gpl-2.0_1.RULE'
    ][0]
    expected = LicenseMatch(
        matcher='2-aho',
        rule=rule,
        qspan=Span(0, 48),
        ispan=Span(0, 48),
    )
    assert match == expected

    # check that query unknown by pos is the same and empty
    qry2 = match.query
    # this was incorrectly returned as {15: 0, 20: 0, 21: 0, 41: 0, 43: 0}
    # after querying done during matching
    assert qry2.unknowns_by_pos == {}
def test_query_unknowns_by_pos_and_stopwords_are_not_set_on_last_query_position(self):
    """
    Check that unknown and stopword counters are keyed only on existing
    known-token positions (with -1 for a leading run), never past the end.
    """
    print('\nINDEX')
    idx = index.LicenseIndex(
        [Rule(stored_text='is the binary a')],
        _legalese=set(['binary']),
        _spdx_tokens=set()
    )
    print('\nQUERY')
    q = Query(
        query_string='a bar binary that was a binary a is the foo bar a',
        idx=idx,
    )
    tids = list(q.tokens_by_line())
    assert tids == [[None, 0, None, None, 0, 1, 2, None, None]]
    # word:  a    bar   binary that  was  a    binary a    is  the  foo  bar  a
    # tids:  [    None  0,     None, None,     0,          1,  2,   None None ]
    # known: st   uk    kn     uk    uk   st   kn     st   kn  kn   uk   uk   st
    # pos:               0                      1           2   3
    assert q.unknowns_by_pos == {-1: 1, 0: 2, 3: 2}
    assert q.stopwords_by_pos == {-1: 1, 0: 1, 1: 1, 3: 1}
def test_query_unknowns_by_pos_and_stopwords_are_not_defaultdic_and_not_changed_on_query(
    self):
    """
    Check that unknowns_by_pos and stopwords_by_pos end up as plain dicts
    (not defaultdicts), so that merely reading a missing key cannot insert
    a spurious entry.
    """
    idx = index.LicenseIndex(
        [Rule(stored_text='a is the binary')],
        _legalese=set(['binary']),
        _spdx_tokens=set())
    q = Query(query_string='binary that was a binary', idx=idx)
    list(q.tokens_by_line())
    assert q.unknowns_by_pos == {0: 2}
    assert q.stopwords_by_pos == {0: 1}
    assert not isinstance(q.unknowns_by_pos, defaultdict)
    assert not isinstance(q.stopwords_by_pos, defaultdict)

    # a lookup of a missing position must raise rather than grow the dict
    try:
        q.unknowns_by_pos[1]
        assert q.unknowns_by_pos == {0: 2}
    except KeyError:
        pass

    try:
        q.stopwords_by_pos[1]
        assert q.stopwords_by_pos == {0: 1}
    except KeyError:
        pass