def test_approximate_match_to_indexed_template_with_few_tokens_around_gaps_on_limited_index(self):
    # Profile matching of one query file against an index holding a single
    # rule loaded from a template text file.
    template_loc = self.get_test_loc('index/templates/idx.txt')
    template_rule = models.Rule(text_file=template_loc, license_expression='test')
    idx = index.LicenseIndex([template_rule])
    query_locs = [self.get_test_loc('index/templates/query.txt')]
    self.profile_match(
        idx,
        query_locs,
        'license_approx_match_limited_index_profile_log.txt',
    )
def test_QueryRuns_tokens_with_unknowns(self):
    # A query run lists only the tokens known to the index; the
    # "with unknowns" view yields None for every unknown token.
    bsd_text = 'Redistribution and use in source and binary forms with or without modification are permitted'
    bsd_rule = Rule(stored_text=bsd_text, license_expression='bsd')
    idx = index.LicenseIndex([bsd_rule])

    querys = ''' The Redistribution and use in source and binary are permitted. Athena capital of Grece Paris and Athene Always'''
    qry = Query(query_string=querys, idx=idx)
    assert set(qry.matchables) == set(range(10))
    assert len(qry.query_runs) == 1
    only_run = qry.query_runs[0]

    known_tokens = 'redistribution and use in source and binary are permitted and'.split()
    assert tks_as_str(only_run.tokens, idx=idx) == known_tokens

    # one leading unknown, nine known tokens, five unknowns, then the last "and"
    with_unknowns = [None] + known_tokens[:-1] + [None] * 5 + known_tokens[-1:]
    assert tks_as_str(query_run_tokens_with_unknowns(only_run), idx=idx) == with_unknowns

    assert only_run.start == 0
    assert only_run.end == 9
def test_query_run_and_tokenizing_breaking_works__with_plus_as_expected(self):
    # The query file must yield a single query run whose tokens are the same
    # as the tokens of the first indexed rule.
    rules_loc = self.get_test_loc('query/run_breaking/rules')
    idx = index.LicenseIndex(list(models.load_rules(rules_loc)))
    query_loc = self.get_test_loc('query/run_breaking/query.txt')
    qry = Query(query_loc, idx=idx)
    result = [run.to_dict() for run in qry.query_runs]

    expected_tokens = (
        'this library is free software you can redistribute it '
        'and or modify it under the terms of the gnu library '
        'general public license as published by the free software '
        'foundation either version 2 of the license or at your '
        'option any later version this library is distributed in '
        'the hope that it will be useful but without any warranty '
        'without even the implied warranty of merchantability or '
        'fitness for particular purpose see the gnu library '
        'general public license for more details you should have '
        'received copy of the gnu library general public '
        'license along with this library see the file copying lib '
        'if not write to the free software foundation 51 franklin '
        'street fifth floor boston ma 02110 1301 usa'
    )
    expected = [{'start': 0, 'end': 119, 'tokens': expected_tokens}]
    assert result == expected

    # the rule tokens must be the same sequence as the tokens of the last query run
    token_by_tid = idx.tokens_by_tid
    last_run_tokens = [token_by_tid[tid] for tid in qry.query_runs[-1].tokens]
    first_rule_tokens = [token_by_tid[tid] for tid in idx.tids_by_rid[0]]
    assert first_rule_tokens == last_run_tokens
def test_LicenseMatch_small(self):
    # Exercise LicenseMatch.small() on matches built against a short rule and
    # against a rule made of ten repetitions of a similar text.
    short_text = u'licensed under the GPL, licensed under the GPL distribute extent of law'
    small_rule = Rule(text_file='small_rule', licenses=['apache-1.1'], _text=short_text)

    repeated_text = u'licensed under the GPL, licensed under the GPL re distribute extent of law' * 10
    long_rule = Rule(text_file='long_rule', licenses=['apache-1.1'], _text=repeated_text)

    # the index object itself is not used after construction
    _idx = index.LicenseIndex([small_rule, long_rule])

    assert LicenseMatch(
        rule=small_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12)).small()
    assert LicenseMatch(
        rule=small_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(11, 12)).small()
    assert LicenseMatch(
        rule=small_rule, qspan=Span(10, 11, 12), ispan=Span(10, 11, 12), hispan=Span(11, 12)).small()
    assert LicenseMatch(
        rule=small_rule, qspan=Span(1, 6), ispan=Span(1, 6)).small()
    assert LicenseMatch(
        rule=long_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12)).small()
    assert LicenseMatch(
        rule=long_rule, qspan=Span(5, 10), ispan=Span(5, 10), hispan=Span(5, 6)).small()

    # the only case in this test expected not to be small
    assert not LicenseMatch(
        rule=small_rule, qspan=Span(1, 10), ispan=Span(1, 10), hispan=Span(3, 6)).small()
def test_match_exact_from_string_twice_with_repeated_text(self):
    # Match the same query string twice against a one-rule index and check
    # that both passes report the same single match.
    rule_text = u'licensed under the GPL, licensed under the GPL'
    gpl_rule = models.Rule(licenses=['tst'], _text=rule_text)
    idx = index.LicenseIndex([gpl_rule])

    querys = u'Hi licensed under the GPL, licensed under the GPL yes.'

    first_pass = idx.match(query_string=querys)
    assert 1 == len(first_pass)
    first = first_pass[0]
    qtext, itext = get_texts(first, query_string=querys, idx=idx)
    assert 'licensed under the GPL licensed under the GPL' == qtext
    assert 'licensed under the GPL licensed under the GPL' == itext
    assert Span(0, 7) == first.qspan
    assert Span(0, 7) == first.ispan

    # match again to ensure that there are no state side effects
    second_pass = idx.match(query_string=querys)
    assert 1 == len(second_pass)
    second = second_pass[0]
    assert Span(0, 7) == second.qspan
    assert Span(0, 7) == second.ispan
    qtext, itext = get_texts(second, query_string=querys, idx=idx)
    assert u'licensed under the GPL licensed under the GPL' == qtext
    assert u'licensed under the GPL licensed under the GPL' == itext
def test_index_fails_on_duplicated_rules(self):
    # Building an index from a directory of duplicated rules must raise an
    # AssertionError whose message mentions 'Duplicate rules'.
    rule_dir = self.get_test_loc('index/no_duplicated_rule')
    try:
        index.LicenseIndex(models.load_rules(rule_dir))
    except AssertionError as e:
        assert u'Duplicate rules' in str(e)
    else:
        # self.fail() raises AssertionError itself: it must stay out of the
        # ``try`` body, otherwise the handler above catches it and the
        # 'not raised' message is replaced by an unrelated assertion failure.
        self.fail('Exception on dupes not raised')
def test_match_license_performance_profiling_on_index_with_single_license(self):
    # Time 100 rounds of approximate matching of one query against a
    # pre-built index, print the duration, then raise with the measurement.
    from time import time
    from licensedcode import query

    # pre-index : we are profiling only the detection, not the indexing
    rule_dir = self.get_test_loc('perf/idx/rules')
    rules = models.load_rules(rule_dir)
    idx = index.LicenseIndex(rules)

    location = self.get_test_loc('perf/idx/query.txt')
    # read through a context manager so the file handle is always closed
    with open(location, 'rb') as query_file:
        querys = query_file.read()

    qry = query.build_query(query_string=querys, idx=idx)

    def mini_seq_match(idx):
        # consume the whole iterable so that all the matching work is done
        list(idx.get_approximate_matches(qry, [], []))

    start = time()
    for _ in range(100):
        mini_seq_match(idx)
    duration = time() - start

    values = ('ScanCode diff:', duration)
    print(*values)
    # NOTE(review): this unconditional raise makes the test always fail;
    # presumably intentional so that the timing is reported -- confirm.
    raise Exception(values)
def test_query_run_has_correct_offset(self):
    # With line_threshold=4 the query file must split in two runs: a
    # one-token run at position 0 and a second run covering positions 1 to 121.
    rules_loc = self.get_test_loc('query/runs/rules')
    idx = index.LicenseIndex(list(models.load_rules(rules_loc)))
    query_loc = self.get_test_loc('query/runs/query.txt')
    qry = Query(location=query_loc, idx=idx, line_threshold=4)
    result = [run.to_dict() for run in qry.query_runs]

    first_run = {u'start': 0, u'end': 0, u'tokens': u'inc'}
    second_run = {
        u'start': 1,
        u'end': 121,
        u'tokens': (
            u'this library is free software you can redistribute it and or modify '
            u'it under the terms of the gnu library general public license as '
            u'published by the free software foundation either version 2 of the '
            u'license or at your option any later version this library is '
            u'distributed in the hope that it will be useful but without any '
            u'warranty without even the implied warranty of merchantability or '
            u'fitness for particular purpose see the gnu library general public '
            u'license for more details you should have received copy of the gnu '
            u'library general public license along with this library see the file '
            u'copying lib if not write to the free software foundation inc 51 '
            u'franklin street fifth floor boston ma 02110 1301 usa'
        ),
    }
    assert result == [first_run, second_run]
def test_LicenseMatch_small(self):
    # Every match built below, on the short rule or on the ten-times
    # repeated rule, is expected to be reported as small.
    short_text = u'licensed under the GPL, licensed under the GPL'
    short_rule = Rule(text_file='r1', licenses=['apache-1.1'], _text=short_text)
    repeated_text = u'licensed under the GPL, licensed under the GPL' * 10
    repeated_rule = Rule(text_file='r1', licenses=['apache-1.1'], _text=repeated_text)

    # the index object itself is not used after construction
    _idx = index.LicenseIndex([short_rule, repeated_rule])

    match = LicenseMatch(rule=short_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12))
    assert match.small()

    match = LicenseMatch(rule=short_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(11, 12))
    assert match.small()

    match = LicenseMatch(rule=short_rule, qspan=Span(10, 11, 12), ispan=Span(10, 11, 12), hispan=Span(11, 12))
    assert match.small()

    match = LicenseMatch(rule=short_rule, qspan=Span(1, 6), ispan=Span(1, 6))
    assert match.small()

    match = LicenseMatch(rule=repeated_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12))
    assert match.small()

    match = LicenseMatch(rule=repeated_rule, qspan=Span(5, 10), ispan=Span(5, 10), hispan=Span(5, 6))
    assert match.small()

    match = LicenseMatch(rule=repeated_rule, qspan=Span(1, 6), ispan=Span(1, 6))
    assert match.small()
def test_query_runs_with_plain_rule(self):
    # A query file checked against an index holding one X11 rule must produce
    # a single query run of 214 tokens.
    x11_text = u'''X11 License Copyright (C) 1996 X Consortium Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Except as contained in this notice, the name of the X Consortium shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization from the X Consortium. X Window System is a trademark of X Consortium, Inc. 
'''
    x11_rule = Rule(stored_text=x11_text, license_expression='x-consortium')
    idx = index.LicenseIndex([x11_rule])

    query_loc = self.get_test_loc(
        'detect/simple_detection/x11-xconsortium_text.txt')
    qry = Query(location=query_loc, idx=idx)
    result = [run.to_dict(brief=False) for run in qry.query_runs]

    expected_tokens = (
        u'x11 license copyright c 1996 x consortium permission is hereby '
        u'granted free of charge to any person obtaining copy of this '
        u'software and associated documentation files the software to deal in '
        u'the software without restriction including without limitation the '
        u'rights to use copy modify merge publish distribute sublicense and or '
        u'sell copies of the software and to permit persons to whom the '
        u'software is furnished to do so subject to the following conditions '
        u'the above copyright notice and this permission notice shall be '
        u'included in all copies or substantial portions of the software the '
        u'software is provided as is without warranty of any kind express or '
        u'implied including but not limited to the warranties of '
        u'merchantability fitness for particular purpose and noninfringement '
        u'in no event shall the x consortium be liable for any claim damages or '
        u'other liability whether in an action of contract tort or otherwise '
        u'arising from out of or in connection with the software or the use or '
        u'other dealings in the software except as contained in this notice the '
        u'name of the x consortium shall not be used in advertising or '
        u'otherwise to promote the sale use or other dealings in this software '
        u'without prior written authorization from the x consortium x window '
        u'system is trademark of x consortium inc'
    )
    expected = [{'start': 0, 'end': 213, 'tokens': expected_tokens}]

    assert len(qry.query_runs[0].tokens) == 214
    assert result == expected
def test_Query_from_real_index_and_location(self):
    # Check the query runs and the line-by-position mapping computed for a
    # query file against the 'index/bsd' test rules with line_threshold=4.
    idx = index.LicenseIndex(self.get_test_rules('index/bsd'))
    query_loc = self.get_test_loc('index/querytokens')
    qry = Query(location=query_loc, idx=idx, line_threshold=4)
    result = [run.to_dict() for run in qry.query_runs]

    main_run = {
        'start': 0,
        'end': 35,
        'tokens': (
            u'redistribution and use in source and binary forms '
            u'redistributions of source code must the this that is not '
            u'to redistributions in binary form must this software is '
            u'provided by the copyright holders and contributors as is'
        ),
    }
    trailing_run = {'start': 36, 'end': 36, 'tokens': u'redistributions'}
    assert result == [main_run, trailing_run]

    # one line number per token position, positions 0 to 36
    expected_lbp = [
        4, 4, 4, 4, 4, 4, 4, 4,
        6, 6, 6, 6, 6,
        7, 7, 7, 7, 7,
        8,
        9, 9, 9, 9, 9,
        11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
        15,
    ]
    assert qry.line_by_pos == expected_lbp
def test_match_with_templates_with_redundant_tokens_yield_single_exact_match(self):
    # A templated rule repeating the same phrase on both sides of a {{}} gap
    # must match the query once, with 100 coverage.
    rule_text = u'copyright reserved mit is license, {{}} copyright reserved mit is license'
    template_rule = models.Rule(licenses=['tst'], _text=rule_text)
    idx = index.LicenseIndex([template_rule])

    expected_idx = {'_tst_73_0': {u'copyright': [0, 5], u'license': [4, 9], u'mit': [2, 7]}}
    assert expected_idx == idx.to_dict()

    querys = u'Hi my copyright reserved mit is license is the copyright reserved mit is license yes.'
    qry = Query(query_string=querys, idx=idx)

    def tokens_as_strings(tids):
        # convert token ids to actual token strings, keeping None as-is
        return [None if tid is None else idx.tokens_by_tid[tid] for tid in tids]

    expected = [
        None, None,
        u'copyright', u'reserved', u'mit', u'is', u'license', u'is',
        None,
        u'copyright', u'reserved', u'mit', u'is', u'license',
        None,
    ]
    assert expected == tokens_as_strings(qry.tokens_with_unknowns())

    result = idx.match(query_string=querys)
    assert 1 == len(result)
    match = result[0]
    assert (Span(0, 4) | Span(6, 10)) == match.qspan
    assert Span(0, 9) == match.ispan
    assert 100 == match.coverage()

    qtext, itext = get_texts(match, query_string=querys, idx=idx)
    assert 'copyright reserved mit is license <is> [the] copyright reserved mit is license' == qtext
    assert 'copyright reserved mit is license copyright reserved mit is license' == itext
# Builds a one-rule index from a rule text containing {{...}} template
# regions, matches a query string against it, expects exactly one match, and
# asserts that the joined output of get_full_matched_text() is strictly equal
# to the expected text, in which some query words are wrapped in [brackets].
# NOTE(review): the triple-quoted literals below appear to have lost their
# original line breaks; the final equality check is whitespace-sensitive, so
# restore the layout from the upstream file before reformatting this test.
def test_get_full_matched_text_base(self): rule_text = u''' Copyright {{some copyright}} THIS IS FROM {{THE CODEHAUS}} AND CONTRIBUTORS IN NO EVENT SHALL {{THE CODEHAUS}} OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE {{POSSIBILITY OF SUCH}} DAMAGE ''' rule = Rule(stored_text=rule_text, license_expression='test') idx = index.LicenseIndex([rule]) querys = u''' foobar 45 . Copyright 2003 (C) James. All Rights Reserved. THIS IS FROM THE CODEHAUS AND CONTRIBUTORS IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC dasdasda . ''' result = idx.match(query_string=querys) assert 1 == len(result) match = result[0] expected = u"""Copyright [2003] ([C]) [James]. [All] [Rights] [Reserved]. THIS IS FROM THE CODEHAUS AND CONTRIBUTORS IN NO EVENT SHALL THE [best] CODEHAUS OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.""" matched_text = u''.join( get_full_matched_text(match, query_string=querys, idx=idx)) assert expected == matched_text
def test_Query_tokenize_from_string(self):
    # Tokenize a query string by hand on a Query built in test mode, then
    # check known tokens and None-padded tokens, for the whole query and for
    # its single query run.
    bsd_text = 'Redistribution and use in source and binary forms with or without modification are permitted'
    bsd_rule = Rule(stored_text=bsd_text, license_expression='bsd')
    idx = index.LicenseIndex([bsd_rule])

    querys = ''' The Redistribution and use in source and binary are permitted. Athena capital of Grece Paris and Athene Always'''
    qry = Query(query_string=querys, idx=idx, _test_mode=True)
    line_tokens = list(qry.tokens_by_line(query_string=querys))
    qry.tokenize_and_build_runs(line_tokens)

    known_tokens = [
        'redistribution', 'and', 'use', 'in', 'source',
        'and', 'binary', 'are', 'permitted', 'and',
    ]
    assert tks_as_str(qry.tokens, idx=idx) == known_tokens

    query_with_unknowns = [
        None,
        'redistribution', 'and', 'use', 'in', 'source',
        'and', 'binary', 'are', 'permitted',
        None, None, None, None, None,
        'and',
        None, None,
    ]
    assert tks_as_str(query_tokens_with_unknowns(qry), idx=idx) == query_with_unknowns

    assert len(qry.query_runs) == 1
    only_run = qry.query_runs[0]
    assert only_run.start == 0
    assert only_run.end == 9
    assert len(only_run) == 10

    assert tks_as_str(only_run.tokens, idx=idx) == known_tokens

    # same as the whole-query expectation minus its two trailing None
    run_with_unknowns = [
        None,
        'redistribution', 'and', 'use', 'in', 'source',
        'and', 'binary', 'are', 'permitted',
        None, None, None, None, None,
        'and',
    ]
    assert tks_as_str(query_run_tokens_with_unknowns(only_run), idx=idx) == run_with_unknowns
# Indexes one rule mixing words and numbers with a three-word legalese set,
# builds a Query from a string made of numbers only, and asserts both the
# exact list of query run dicts and that no query run is matchable.
# NOTE(review): the triple-quoted literals below appear to have lost their
# original line breaks; the three expected query runs presumably depend on
# the line layout of ``qs`` -- restore it from the upstream file before
# reformatting this test.
def test_QueryRun_with_all_digit_lines(self): rule = Rule(stored_text=''' redistributions 0 1 2 3 4 1568 5 6 7 368 8 9 10 80 12213 232312 in binary 345 in 256 free 1953 software 406 foundation 1151 free 429 software 634 foundation 1955 free 724 software 932 foundation 234 software 694 foundation 110 ''') legalese = set(['binary', 'redistributions', 'foundation']) idx = index.LicenseIndex([rule], _legalese=legalese) qs = ''' 25 17 1 -80.00000 .25000 37.00000 .25000 0: 5107 -2502 -700 496 -656 468 -587 418 -481 347 -325 256 -111 152 166 50 493 -37 854 -96 1221 -118 1568 -125 1953 -143 2433 -195 2464 -281 2529 -395 1987 -729 447 -916 -3011 -1181 -5559 -406 -6094 541 -5714 1110 -5247 1289 -4993 1254 -4960 1151 1: 4757 -1695 -644 429 -627 411 -602 368 -555 299 -470 206 -328 96 -125 -15 126 -105 391 -146 634 -120 762 -58 911 -13 1583 -8 1049 -28 1451 123 1377 -464 907 -603 -4056 -1955 -6769 -485 -5797 929 -4254 1413 -3251 1295 -2871 993 -2899 724 2: 4413 -932 -563 355 -566 354 -582 322 -597 258 -579 164 -499 45 -341 -84 -127 -192 93 -234 288 -157 190 -25 -145 65 1065 74 -1087 -40 -877 1058 -994 18 1208 694 -5540 -3840 -7658 -332 -4130 1732 -1668 1786 -634 1127 -525 501 -856 110 ''' qry = Query(query_string=qs, idx=idx) result = [qr.to_dict() for qr in qry.query_runs] # FIXME: we should not even have a query run for things that are all digits expected = [ { 'end': 5, 'start': 0, 'tokens': '1 80 0 256 1568 1953' }, { 'end': 12, 'start': 6, 'tokens': '406 1151 1 429 368 634 8' }, { 'end': 17, 'start': 13, 'tokens': '1955 724 2 932 234' }, ] assert result == expected assert not any(qr.is_matchable() for qr in qry.query_runs)
def test_match_seq_are_correct_on_apache(self):
    # The query must yield a single match from the sequence matcher, and the
    # matched query text must equal the expected text word for word.
    rules_loc = self.get_test_loc('match_seq/rules')
    extra_legalese = set([
        'redistributions',
        'written',
        'registered',
        'derived',
        'damage',
        'due',
        'alternately',
        'nor',
    ])
    legalese = mini_legalese | extra_legalese
    idx = index.LicenseIndex(load_rules(rules_loc), _legalese=legalese)

    query_loc = self.get_test_loc('match_seq/query')
    matches = idx.match(location=query_loc)
    assert len(matches) == 1
    match = matches[0]
    assert match.matcher == match_seq.MATCH_SEQ

    qtext, _itext = get_texts(match)
    expected = u''' The OpenSymphony Group. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The end-user documentation included with the redistribution, if any, must include the following acknowledgment: [4]. "[This] [product] [includes] [software] [developed] [by] [the] [OpenSymphony] [Group] ([http]://[www].[opensymphony].[com]/)." [5]. Alternately, this acknowledgment may appear in the software itself, if and wherever such third-party acknowledgments normally appear. The names "OpenSymphony" and "The OpenSymphony Group" must not be used to endorse or promote products derived from this software without prior written permission. For written permission, please contact [email protected] . Products derived from this software may not be called "OpenSymphony" or "[OsCore]", nor may "OpenSymphony" or "[OsCore]" appear in their name, without prior written permission of the OpenSymphony Group. 
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. '''
    # compare word by word so that whitespace differences are ignored
    assert qtext.split() == expected.split()
def test_match_to_threshold_words_has_hundred_score(self):
    # A query identical to the single indexed rule (one word repeated 18
    # times) must give exactly one match with a score of 100.
    threshold = 18
    repeated_word = ' LGPL ' * threshold
    lgpl_rule = Rule(stored_text=repeated_word, license_expression='lgpl-2.0')
    idx = index.LicenseIndex([lgpl_rule])

    matches = idx.match(query_string=' LGPL ' * threshold)
    assert 1 == len(matches)
    assert 100.0 == matches[0].score()
def test_Thresholds(self):
    # The same thresholds are expected whether or not the short rule text
    # contains a {{}} template marker.
    short_texts = (
        'licensed under the GPL, licensed under the GPL',
        'licensed under the GPL,{{}} licensed under the GPL',
    )
    for short_text in short_texts:
        r1 = models.Rule(text_file='r1', license_expression='apache-1.1', stored_text=short_text)
        repeated_text = 'licensed under the GPL, licensed under the GPL' * 10
        r2 = models.Rule(text_file='r1', license_expression='apache-1.1', stored_text=repeated_text)

        # the index object itself is not used after construction
        _idx = index.LicenseIndex([r1, r2])

        expected_short = models.Thresholds(
            high_len=4, low_len=4, length=8, small=True, min_high=4, min_len=8)
        assert expected_short == r1.thresholds()

        expected_repeated = models.Thresholds(
            high_len=31, low_len=40, length=71, small=False, min_high=3, min_len=4)
        assert expected_repeated == r2.thresholds()
def test_match_does_not_return_matches_for_empty_query(self):
    # Neither an empty string nor None as a query string may yield matches.
    some_rule = Rule(_text='A one. A two. license A three.')
    idx = index.LicenseIndex([some_rule])

    for empty_query in ('', None):
        matches = idx.match(query_string=empty_query)
        assert [] == matches
def test_match_return_correct_positions_with_short_index_and_queries(self):
    # Check matched texts and spans for several short queries run against an
    # index holding a single two-word 'MIT License' rule.
    mit_rule = Rule(stored_text='MIT License', license_expression='mit')
    idx = index.LicenseIndex([mit_rule])
    assert {'_tst_11_0': {'license': [1]}} == idx.to_dict(True)

    def check_match(match, querys, expected_qtext, expected_qspan):
        # every match in this test has the same rule text and the same ispan
        qtext, itext = get_texts(match, query_string=querys, idx=idx)
        assert expected_qtext == qtext
        assert 'mit license' == itext
        assert expected_qspan == match.qspan
        assert Span(0, 1) == match.ispan

    querys = 'MIT License'
    matches = idx.match(query_string=querys)
    assert 1 == len(matches)
    check_match(matches[0], querys, 'MIT License', Span(0, 1))

    querys = 'MIT MIT License'
    matches = idx.match(query_string=querys)
    assert 1 == len(matches)
    check_match(matches[0], querys, 'MIT License', Span(1, 2))

    query_doc1 = 'do you think I am a mit license MIT License, yes, I think so'
    matches = idx.match(query_string=query_doc1)
    assert 2 == len(matches)
    check_match(matches[0], query_doc1, 'mit license', Span(0, 1))
    check_match(matches[1], query_doc1, 'MIT License', Span(2, 3))

    query_doc2 = '''do you think I am a mit license MIT License yes, I think so'''
    matches = idx.match(query_string=query_doc2)
    assert 2 == len(matches)
    check_match(matches[0], query_doc2, 'mit license', Span(0, 1))
    check_match(matches[1], query_doc2, 'MIT License', Span(2, 3))
def test_match_license_performance_profiling_on_limited_index(self):
    # pre-index : we are profiling only the detection, not the indexing
    rules_loc = self.get_test_loc('detect/rule_template/rules')
    idx = index.LicenseIndex(models.load_rules(rules_loc))

    query_locs = [self.get_test_loc('detect/rule_template/query.txt')]
    self.profile_match(
        idx,
        query_locs,
        'license_match_limited_index_profile_log.txt',
    )
# Builds a one-rule index from a rule text containing {{...}} template
# regions, matches a query string, expects exactly one match, then compares
# the joined output of get_full_matched_text() with an expected text three
# times: with default arguments, with highlight_not_matched set to a
# '<br>%s</br>' template, and with whole_lines=True.
# NOTE(review): the triple-quoted literals below appear to have lost their
# original line breaks; all three equality checks are whitespace-sensitive,
# so restore the layout from the upstream file before reformatting this test.
def test_get_full_matched_text(self): rule_text = u''' Copyright {{some copyright}} THIS IS FROM {{THE CODEHAUS}} AND CONTRIBUTORS IN NO EVENT SHALL {{THE CODEHAUS}} OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE {{POSSIBILITY OF SUCH}} DAMAGE ''' rule = Rule( _text=rule_text, licenses=['test'], ) idx = index.LicenseIndex([rule]) querys = u''' foobar 45 Copyright 2003 (C) James. All Rights Reserved. THIS IS FROM THE CODEHAUS AND CONTRIBUTORS IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. chabada DAMAGE 12 ABC ''' result = idx.match(query_string=querys) assert 1 == len(result) match = result[0] expected = u"""Copyright [2003] ([C]) [James]. [All] [Rights] [Reserved]. THIS IS FROM [THE] [CODEHAUS] AND CONTRIBUTORS IN NO EVENT SHALL [THE] [best] [CODEHAUS] OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE [POSSIBILITY] [OF] [SUCH] DAMAGE""" matched_text = u''.join( get_full_matched_text(match, query_string=querys, idx=idx)) assert expected == matched_text # test again using a template expected = u"""Copyright <br>2003</br> (<br>C</br>) <br>James</br>. <br>All</br> <br>Rights</br> <br>Reserved</br>. THIS IS FROM <br>THE</br> <br>CODEHAUS</br> AND CONTRIBUTORS IN NO EVENT SHALL <br>THE</br> <br>best</br> <br>CODEHAUS</br> OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE <br>POSSIBILITY</br> <br>OF</br> <br>SUCH</br> DAMAGE""" matched_text = u''.join( get_full_matched_text(match, query_string=querys, idx=idx, highlight_not_matched=u'<br>%s</br>')) assert expected == matched_text # test again using whole_lines expected = u""" foobar 45 Copyright 2003 (C) James. All Rights Reserved. THIS IS FROM THE CODEHAUS AND CONTRIBUTORS IN NO EVENT SHALL THE best CODEHAUS OR ITS CONTRIBUTORS BE LIABLE EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
chabada DAMAGE 12 ABC\n""" matched_text = u''.join( get_full_matched_text(match, query_string=querys, idx=idx, highlight_not_matched=u'%s', whole_lines=True)) assert expected == matched_text
def test_match_multiple(self):
    # The simple query file must be matched once against the 'index/bsd'
    # test rules, over positions 0 to 209 on both query and rule sides.
    idx = index.LicenseIndex(self.get_test_rules('index/bsd'))
    query_loc = self.get_test_loc('index/querysimple')

    matches = idx.match(location=query_loc)
    assert 1 == len(matches)

    only_match = matches[0]
    assert Span(0, 209) == only_match.qspan
    assert Span(0, 209) == only_match.ispan
def test_QueryRun(self):
    # A query identical to the single indexed rule must produce one query run
    # whose tokens are the words of that text, in order.
    text = 'redistributions in binary form must redistributions in'
    idx = index.LicenseIndex([Rule(_text=text)])
    qry = Query(query_string=text, idx=idx)

    runs = qry.query_runs
    assert 1 == len(runs)
    only_run = runs[0]

    # test
    run_tokens = [idx.tokens_by_tid[tid] for tid in only_run.tokens]
    expected = [
        'redistributions',
        'in',
        'binary',
        'form',
        'must',
        'redistributions',
        'in',
    ]
    assert expected == run_tokens
def test_match_can_match_with_index_built_from_rule_directory_with_sun_bcls(self):
    # The query must be matched once by the sequence matcher, with a query
    # span that skips position 958.
    rules_loc = self.get_test_loc('detect/rule_template/rules')
    idx = index.LicenseIndex(load_rules(rules_loc))

    # at line 151 the query has an extra "Software" word inserted to avoid hash matching
    query_loc = self.get_test_loc('detect/rule_template/query.txt')
    matches = idx.match(location=query_loc)
    assert 1 == len(matches)

    only_match = matches[0]
    expected_qspan = Span(0, 957) | Span(959, 1756)
    assert expected_qspan == only_match.qspan
    assert match_seq.MATCH_SEQ == only_match.matcher
def test_QueryRun_repr(self):
    # Check both the default repr of a query run and its trace repr, which
    # also carries the tokens.
    text = 'redistributions in binary form must redistributions in'
    idx = index.LicenseIndex([Rule(_text=text)])
    qry = Query(query_string=text, idx=idx)
    first_run = qry.query_runs[0]

    # test
    assert 'QueryRun(start=0, len=7, start_line=1, end_line=1)' == repr(first_run)

    expected_trace = 'QueryRun(start=0, len=7, start_line=1, end_line=1, tokens="redistributions in binary form must redistributions in")'
    assert expected_trace == first_run.__repr__(trace_repr=True)
def test_index_rules_with_key_phrases_and_without_are_duplicates(self):
    # Indexing these rules must raise a DuplicateRuleError; if it does not,
    # print each rule id with its tokens and raise a plain Exception.
    rules_loc = self.get_test_loc('index/duplicate-key-phrases/rules')
    licenses_loc = self.get_test_loc('index/duplicate-key-phrases/licenses')
    rules = models.get_rules(licenses_data_dir=licenses_loc, rules_data_dir=rules_loc)

    try:
        idx = index.LicenseIndex(rules)
        # only reached when indexing did not raise
        for rid, tids in enumerate(idx.tids_by_rid):
            rule_tokens = " ".join(idx.tokens_by_tid[tid] for tid in tids)
            print(idx.rules_by_rid[rid].rid, repr(rule_tokens))
        raise Exception("Exception not raised for duplicated rules")
    except index.DuplicateRuleError as dupe_error:
        assert str(dupe_error).startswith('Duplicate rules')
def test_match_can_match_with_plain_rule_simple2(self):
    # A query file matched against an index holding one X11 rule must give a
    # single match whose query text equals the expected words.
    x11_text = u'''X11 License Copyright (C) 1996 X Consortium Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Except as contained in this notice, the name of the X Consortium shall not be used in advertising or otherwise to promote the sale, use or other dealings in this Software without prior written authorization from the X Consortium. X Window System is a trademark of X Consortium, Inc. 
'''
    x11_rule = Rule(_text=x11_text, licenses=['x-consortium'])
    idx = index.LicenseIndex([x11_rule])

    query_loc = self.get_test_loc(
        'detect/simple_detection/x11-xconsortium_text.txt')
    matches = idx.match(location=query_loc)
    assert 1 == len(matches)

    expected_words = u''' X11 License Copyright C 1996 X Consortium Permission is hereby granted free of charge to any person obtaining a copy of this software and associated documentation files the Software to deal in the Software without restriction including without limitation the rights to use copy modify merge publish distribute sublicense and or sell copies of the Software and to permit persons to whom the Software is furnished to do so subject to the following conditions The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software THE SOFTWARE IS PROVIDED AS IS WITHOUT WARRANTY OF ANY KIND EXPRESS OR IMPLIED INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT IN NO EVENT SHALL THE X CONSORTIUM BE LIABLE FOR ANY CLAIM DAMAGES OR OTHER LIABILITY WHETHER IN AN ACTION OF CONTRACT TORT OR OTHERWISE ARISING FROM OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE Except as contained in this notice the name of the X Consortium shall not be used in advertising or otherwise to promote the sale use or other dealings in this Software without prior written authorization from the X Consortium X Window System is a trademark of X Consortium Inc '''.split()

    only_match = matches[0]
    qtext, _itext = get_texts(only_match, location=query_loc, idx=idx)
    # compare word by word so that whitespace differences are ignored
    assert expected_words == qtext.split()
def test_filter_matches_handles_interlaced_matches_with_overlap_and_same_license(self):
    # Only the '2-aho' match on rule2.RULE is expected in the results.
    rules_loc = self.get_test_loc('match_filter/rules')
    idx = index.LicenseIndex(load_rules(rules_loc))
    rules_by_identifier = {rule.identifier: rule for rule in idx.rules_by_rid}

    query_loc = self.get_test_loc('match_filter/query')
    matches = idx.match(location=query_loc)

    # filtered: LicenseMatch(matcher='3-seq', rule=rules['rule1.RULE'], qspan=Span(4, 47) | Span(50, 59), ispan=Span(1, 53)),
    kept_match = LicenseMatch(
        matcher='2-aho',
        rule=rules_by_identifier['rule2.RULE'],
        qspan=Span(24, 86),
        ispan=Span(0, 62),
    )
    assert [kept_match] == matches
def test_match_matches_correctly_simple_exact_query_1(self):
    # A query file matched against a one-rule index built from another file
    # must give a single match on that rule, over positions 0 to 86.
    rule_loc = self.get_test_loc('detect/mit/mit.c')
    mit_rule = Rule(text_file=rule_loc, licenses=['mit'])
    idx = index.LicenseIndex([mit_rule])

    query_loc = self.get_test_loc('detect/mit/mit2.c')
    matches = idx.match(query_loc)
    assert 1 == len(matches)

    only_match = matches[0]
    assert mit_rule == only_match.rule
    assert Span(0, 86) == only_match.qspan
    assert Span(0, 86) == only_match.ispan