def test_LicenseMatch_small(self):
    r1_text = u'licensed under the GPL, licensed under the GPL distribute extent of law'
    small_rule = Rule(text_file='small_rule', license_expression='apache-1.1', stored_text=r1_text)

    r2_text = u'licensed under the GPL, licensed under the GPL re distribute extent of law' * 10
    long_rule = Rule(text_file='long_rule', license_expression='apache-1.1', stored_text=r2_text)

    _idx = index.LicenseIndex([small_rule, long_rule])

    test = LicenseMatch(rule=small_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12))
    assert test.small()

    test = LicenseMatch(rule=small_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(11, 12))
    assert test.small()

    test = LicenseMatch(rule=small_rule, qspan=Span(10, 11, 12), ispan=Span(10, 11, 12), hispan=Span(11, 12))
    assert test.small()

    test = LicenseMatch(rule=small_rule, qspan=Span(1, 6), ispan=Span(1, 6))
    assert test.small()

    test = LicenseMatch(rule=long_rule, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12))
    assert test.small()

    test = LicenseMatch(rule=long_rule, qspan=Span(5, 10), ispan=Span(5, 10), hispan=Span(5, 6))
    assert test.small()

    test = LicenseMatch(rule=small_rule, qspan=Span(1, 10), ispan=Span(1, 10), hispan=Span(3, 6))
    assert not test.small()
def tokens_by_line(self):
    """
    Yield one sequence of tokens for each line in this query. Populate the
    query `line_by_pos`, `unknowns_by_pos`, `shorts_and_digits_pos` and
    `spdx_lines` as a side effect.
    """
    from licensedcode.match_spdx_lid import split_spdx_lid

    # bind frequently called functions to local scope
    tokenizer = query_tokenizer
    line_by_pos_append = self.line_by_pos.append
    self_unknowns_by_pos = self.unknowns_by_pos
    unknowns_pos = set()
    unknowns_pos_add = unknowns_pos.add
    self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
    dic_get = self.idx.dictionary.get

    # note: positions start at zero

    # absolute position in a query, including all known and unknown tokens
    abs_pos = -1

    # absolute position in a query, including only known tokens
    known_pos = -1

    # flag set to True when we have found the first known token globally
    # across all query lines
    started = False

    spdx_lid_token_ids = self.spdx_lid_token_ids

    if TRACE:
        logger_debug('tokens_by_line: query lines')
        for line_num, line in query_lines(self.location, self.query_string):
            logger_debug(' ', line_num, ':', line)

    for line_num, line in query_lines(self.location, self.query_string):
        # keep track of tokens in a line
        line_tokens = []
        line_tokens_append = line_tokens.append
        line_first_known_pos = None

        # FIXME: the implicit update of abs_pos is not clear
        for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
            tid = dic_get(token)
            if tid is not None:
                # this is a known token
                known_pos += 1
                started = True
                line_by_pos_append(line_num)
                if len(token) == 1 or token.isdigit():
                    self_shorts_and_digits_pos_add(known_pos)
                if line_first_known_pos is None:
                    line_first_known_pos = known_pos
            else:
                if not started:
                    # If we have not yet started globally, then all tokens
                    # seen so far are unknowns and we keep a count of them
                    # in the magic "-1" position.
                    self_unknowns_by_pos[-1] += 1
                else:
                    # here we have a new unknown token positioned right after
                    # the current known_pos
                    self_unknowns_by_pos[known_pos] += 1
                    unknowns_pos_add(known_pos)

            line_tokens_append(tid)

        # last known token position in the current line
        line_last_known_pos = known_pos

        # ONLY collect as SPDX a line that starts with an SPDX License
        # Identifier. There are cases where this prefix does not start with
        # the first tokens, such as when one or two words (e.g. a comment
        # indicator such as DNL, REM etc.) start the line, followed by an
        # SPDX license identifier.
        spdx_start_offset = None
        if line_tokens[:3] in spdx_lid_token_ids:
            spdx_start_offset = 0
        elif line_tokens[1:4] in spdx_lid_token_ids:
            spdx_start_offset = 1
        elif line_tokens[2:5] in spdx_lid_token_ids:
            spdx_start_offset = 2

        if spdx_start_offset is not None:
            # keep the line, start/end known pos for SPDX matching
            spdx_prefix, spdx_expression = split_spdx_lid(line)
            spdx_text = ' '.join([spdx_prefix or '', spdx_expression])
            spdx_start_known_pos = line_first_known_pos + spdx_start_offset

            if spdx_start_known_pos <= line_last_known_pos:
                self.spdx_lines.append((spdx_text, spdx_start_known_pos, line_last_known_pos))

        yield line_tokens

    # finally create a Span of positions followed by unknowns, used
    # for intersection with the query span for scoring matches
    self.unknowns_span = Span(unknowns_pos)
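# The position bookkeeping above is subtle: unknown tokens seen before the
# first known token accumulate under the magic "-1" key, and later unknowns
# are counted at the last known position. A minimal self-contained sketch of
# that logic (toy token dictionary, not the real index):
from collections import defaultdict

def demo_unknown_positions(tokens, known=('mit', 'license')):
    unknowns_by_pos = defaultdict(int)
    known_pos = -1
    started = False
    for token in tokens:
        if token in known:
            known_pos += 1
            started = True
        elif not started:
            # nothing known seen yet: count under the magic -1 position
            unknowns_by_pos[-1] += 1
        else:
            # an unknown token right after the current known position
            unknowns_by_pos[known_pos] += 1
    return dict(unknowns_by_pos)

# two unknowns before the first known token, one unknown after 'license'
assert demo_unknown_positions(['foo', 'bar', 'mit', 'license', 'baz']) == {-1: 2, 1: 1}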
def test_LicenseMatch_comparisons(self):
    r1 = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    contained1 = LicenseMatch(rule=r1, qspan=Span(0, 5), ispan=Span(0, 5))
    contained2 = LicenseMatch(rule=r1, qspan=Span(1, 4), ispan=Span(1, 4))
    same_span1 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
    same_span2 = LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6))
    before_after = LicenseMatch(rule=r1, qspan=Span(8, 9), ispan=Span(8, 9))
    touching = LicenseMatch(rule=r1, qspan=Span(7, 7), ispan=Span(7, 7))
    overlapping = LicenseMatch(rule=r1, qspan=Span(4, 7), ispan=Span(4, 7))

    assert same_span1 == same_span2
    assert same_span1 in same_span2
    assert same_span1.overlap(same_span2)
    assert same_span2.overlap(same_span1)

    assert contained1 not in same_span1
    assert same_span1 not in contained1
    assert contained1.overlap(same_span2)
    assert contained1.surround(contained2)

    assert contained2 in same_span2
    assert contained2 in contained1
    assert contained2.overlap(overlapping)

    assert overlapping.overlap(contained2)
    assert overlapping.overlap(same_span1)
    assert not overlapping.overlap(before_after)

    assert before_after.is_after(same_span1)
    assert before_after.is_after(touching)
    assert before_after.is_after(contained1)
def test_match_return_correct_positions_with_short_index_and_queries(self):
    idx = index.LicenseIndex(
        [Rule(stored_text='MIT License', license_expression='mit')]
    )
    assert {'_tst_11_0': {'license': [1]}} == idx.to_dict(True)

    matches = idx.match(query_string='MIT License')
    assert 1 == len(matches)

    qtext, itext = get_texts(matches[0], query_string='MIT License', idx=idx)
    assert 'MIT License' == qtext
    assert 'mit license' == itext
    assert Span(0, 1) == matches[0].qspan
    assert Span(0, 1) == matches[0].ispan

    matches = idx.match(query_string='MIT MIT License')
    assert 1 == len(matches)

    qtext, itext = get_texts(matches[0], query_string='MIT MIT License', idx=idx)
    assert 'MIT License' == qtext
    assert 'mit license' == itext
    assert Span(1, 2) == matches[0].qspan
    assert Span(0, 1) == matches[0].ispan

    query_doc1 = 'do you think I am a mit license MIT License, yes, I think so'
    # known tokens:                   0   1       2   3
    matches = idx.match(query_string=query_doc1)
    assert 2 == len(matches)

    qtext, itext = get_texts(matches[0], query_string=query_doc1, idx=idx)
    assert 'mit license' == qtext
    assert 'mit license' == itext
    assert Span(0, 1) == matches[0].qspan
    assert Span(0, 1) == matches[0].ispan

    qtext, itext = get_texts(matches[1], query_string=query_doc1, idx=idx)
    assert 'MIT License' == qtext
    assert 'mit license' == itext
    assert Span(2, 3) == matches[1].qspan
    assert Span(0, 1) == matches[1].ispan

    query_doc2 = '''do you think I am a mit license
        MIT License
        yes, I think so'''
    matches = idx.match(query_string=query_doc2)
    assert 2 == len(matches)

    qtext, itext = get_texts(matches[0], query_string=query_doc2, idx=idx)
    assert 'mit license' == qtext
    assert 'mit license' == itext
    assert Span(0, 1) == matches[0].qspan
    assert Span(0, 1) == matches[0].ispan

    qtext, itext = get_texts(matches[1], query_string=query_doc2, idx=idx)
    assert 'MIT License' == qtext
    assert 'mit license' == itext
    assert Span(2, 3) == matches[1].qspan
    assert Span(0, 1) == matches[1].ispan
def test_match_can_match_with_rule_template_with_gap_near_start_with_few_tokens_before(self):
    # failed when a gapped token starts at a beginning of rule with few tokens before
    test_file = self.get_test_loc('detect/templates/license7.txt')
    rule = Rule(text_file=test_file, license_expression='lic')
    idx = index.LicenseIndex([rule])

    qloc = self.get_test_loc('detect/templates/license8.txt')
    matches = idx.match(qloc)
    assert 1 == len(matches)
    match = matches[0]

    expected_qtokens = u"""
        All Rights Reserved Redistribution and use of this software and
        associated documentation Software with or without modification are
        permitted provided that the following conditions are met 1
        Redistributions of source code must retain copyright statements and
        notices Redistributions must also contain copy of this document 2
        Redistributions in binary form must reproduce the above copyright
        notice this of conditions and the following disclaimer in the
        documentation and or other materials provided with the distribution 3
        The name [groovy] must not be used to endorse or promote products
        derived from this Software without prior written permission of <The>
        [Codehaus] For written permission please contact [info] [codehaus]
        [org] 4 Products derived from this Software may not be called
        [groovy] nor may [groovy] appear in their names without prior written
        permission of <The> [Codehaus] [groovy] is registered trademark of
        <The> [Codehaus] 5 Due credit should be given to <The> [Codehaus]
        [http] [groovy] [codehaus] [org] <THIS> <SOFTWARE> <IS> <PROVIDED>
        <BY> <THE> [CODEHAUS] <AND> <CONTRIBUTORS> AS IS AND ANY EXPRESSED OR
        IMPLIED WARRANTIES INCLUDING BUT NOT LIMITED TO THE IMPLIED
        WARRANTIES OF MERCHANTABILITY AND FITNESS FOR PARTICULAR PURPOSE ARE
        DISCLAIMED IN NO EVENT SHALL <THE> [CODEHAUS] OR ITS CONTRIBUTORS BE
        LIABLE FOR ANY DIRECT INDIRECT INCIDENTAL SPECIAL EXEMPLARY OR
        CONSEQUENTIAL DAMAGES INCLUDING BUT NOT LIMITED TO PROCUREMENT OF
        SUBSTITUTE GOODS OR SERVICES LOSS OF USE DATA OR PROFITS OR BUSINESS
        INTERRUPTION HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY WHETHER IN
        CONTRACT STRICT LIABILITY OR TORT INCLUDING NEGLIGENCE OR OTHERWISE
        ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE EVEN IF ADVISED OF
        THE POSSIBILITY OF SUCH DAMAGE
    """.split()

    expected_itokens = u'''
        All Rights Reserved Redistribution and use of this software and
        associated documentation Software with or without modification are
        permitted provided that the following conditions are met 1
        Redistributions of source code must retain copyright statements and
        notices Redistributions must also contain copy of this document 2
        Redistributions in binary form must reproduce the above copyright
        notice this of conditions and the following disclaimer in the
        documentation and or other materials provided with the distribution 3
        The name must not be used to endorse or promote products derived from
        this Software without prior written permission of For written
        permission please contact 4 Products derived from this Software may
        not be called nor may appear in their names without prior written
        permission of is registered trademark of 5 Due credit should be given
        to <THIS> <SOFTWARE> <IS> <PROVIDED> <BY> AS IS AND ANY EXPRESSED OR
        IMPLIED WARRANTIES INCLUDING BUT NOT LIMITED TO THE IMPLIED
        WARRANTIES OF MERCHANTABILITY AND FITNESS FOR PARTICULAR PURPOSE ARE
        DISCLAIMED IN NO EVENT SHALL OR ITS CONTRIBUTORS BE LIABLE FOR ANY
        DIRECT INDIRECT INCIDENTAL SPECIAL EXEMPLARY OR CONSEQUENTIAL DAMAGES
        INCLUDING BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR
        SERVICES LOSS OF USE DATA OR PROFITS OR BUSINESS INTERRUPTION HOWEVER
        CAUSED AND ON ANY THEORY OF LIABILITY WHETHER IN CONTRACT STRICT
        LIABILITY OR TORT INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY
        WAY OUT OF THE USE OF THIS SOFTWARE EVEN IF ADVISED OF THE
        POSSIBILITY OF SUCH DAMAGE
    '''.lower().split()

    qtext, itext = get_texts(match, location=qloc, idx=idx)
    assert expected_qtokens == qtext.split()
    assert expected_itokens == itext.split()

    assert 97.51 == match.coverage()
    assert 97.51 == match.score()
    expected = Span(2, 96) | Span(98, 123) | Span(125, 128) | Span(130, 136) | Span(146, 174) | Span(176, 249)
    assert expected == match.qspan
    assert Span(1, 132) | Span(138, 240) == match.ispan
def test_match_with_template_and_multiple_rules(self):
    test_rules = self.get_test_rules('index/bsd_templates',)
    idx = index.LicenseIndex(test_rules)
    querys = u'''
        Hello, what about this

        Redistribution and use in source and binary forms, with or without
        modification, are permitted provided that the following conditions
        are met:

        * Redistributions of source code must retain the above copyright
          notice, this list of conditions and the following disclaimer.
        * Redistributions in binary form must reproduce the above copyright
          notice, this list of conditions and the following disclaimer in the
          documentation and/or other materials provided with the
          distribution.
        * Neither the name of nexB Inc. nor the names of its contributors may
          be used to endorse or promote products derived from this software
          without specific prior written permission.

        THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
        "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
        LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
        A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
        OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
        SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
        LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
        DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
        THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
        (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
        OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

        Goodbye
        No part of match
    '''
    result = idx.match(query_string=querys)
    assert 1 == len(result)
    match = result[0]
    assert match_seq.MATCH_SEQ == match.matcher

    exp_qtext = u"""
        Redistribution and use in source and binary forms with or without
        modification are permitted provided that the following conditions are
        met Redistributions of source code must retain the above copyright
        notice this list of conditions and the following disclaimer
        Redistributions in binary form must reproduce the above copyright
        notice this list of conditions and the following disclaimer in the
        documentation and or other materials provided with the distribution
        Neither the name of [nexB] <Inc> nor the names of its contributors
        may be used to endorse or promote products derived from this software
        without specific prior written permission THIS SOFTWARE IS PROVIDED
        BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS OR
        IMPLIED WARRANTIES INCLUDING BUT NOT LIMITED TO THE IMPLIED
        WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
        ARE DISCLAIMED IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
        BE LIABLE FOR ANY DIRECT INDIRECT INCIDENTAL SPECIAL EXEMPLARY OR
        CONSEQUENTIAL DAMAGES INCLUDING BUT NOT LIMITED TO PROCUREMENT OF
        SUBSTITUTE GOODS OR SERVICES LOSS OF USE DATA OR PROFITS OR BUSINESS
        INTERRUPTION HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY WHETHER IN
        CONTRACT STRICT LIABILITY OR TORT INCLUDING NEGLIGENCE OR OTHERWISE
        ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE EVEN IF ADVISED OF
        THE POSSIBILITY OF SUCH DAMAGE
    """.split()

    exp_itext = u"""
        Redistribution and use in source and binary forms with or without
        modification are permitted provided that the following conditions are
        met Redistributions of source code must retain the above copyright
        notice this list of conditions and the following disclaimer
        Redistributions in binary form must reproduce the above copyright
        notice this list of conditions and the following disclaimer in the
        documentation and or other materials provided with the distribution
        Neither the name of nor the names of its contributors may be used to
        endorse or promote products derived from this software without
        specific prior written permission THIS SOFTWARE IS PROVIDED BY THE
        COPYRIGHT HOLDERS AND CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED
        WARRANTIES INCLUDING BUT NOT LIMITED TO THE IMPLIED WARRANTIES OF
        MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED
        IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
        ANY DIRECT INDIRECT INCIDENTAL SPECIAL EXEMPLARY OR CONSEQUENTIAL
        DAMAGES INCLUDING BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS
        OR SERVICES LOSS OF USE DATA OR PROFITS OR BUSINESS INTERRUPTION
        HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY WHETHER IN CONTRACT
        STRICT LIABILITY OR TORT INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN
        ANY WAY OUT OF THE USE OF THIS SOFTWARE EVEN IF ADVISED OF THE
        POSSIBILITY OF SUCH DAMAGE
    """.split()

    # q = Query(query_string=querys, idx=idx)
    # print('######################')
    # print('######################')
    # print('q=', querys.lower().replace('*', ' ').replace('/', ' ').split())
    # print('q2=', [None if t is None else idx.tokens_by_tid[t] for t in q.tokens_with_unknowns()])
    # print('######################')

    qtext, itext = get_texts(match, query_string=querys, idx=idx)
    assert exp_qtext == qtext.split()
    assert exp_itext == itext.split()
    assert Span(Span(1, 72) | Span(74, 212)) == match.qspan
    assert Span(0, 210) == match.ispan
    assert 100 == match.coverage()
def tokens_by_line(
    self,
    location=None,
    query_string=None,
    start_line=1,
):
    """
    Yield multiple sequences of tokens, one for each line in this query.
    Line numbers start at ``start_line`` which is 1-based by default.

    SIDE EFFECT: This populates the query `line_by_pos`, `unknowns_by_pos`,
    `unknowns_span`, `stopwords_by_pos`, `stopwords_span`,
    `shorts_and_digits_pos` and `spdx_lines`.
    """
    from licensedcode.match_spdx_lid import split_spdx_lid
    from licensedcode.stopwords import STOPWORDS

    location = location or self.location
    query_string = query_string or self.query_string

    # bind frequently called functions to local scope
    line_by_pos_append = self.line_by_pos.append

    self_unknowns_by_pos = self.unknowns_by_pos
    unknowns_pos = set()
    unknowns_pos_add = unknowns_pos.add

    self_stopwords_by_pos = self.stopwords_by_pos
    stopwords_pos = set()
    stopwords_pos_add = stopwords_pos.add

    self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
    dic_get = self.idx.dictionary.get

    # note: positions start at zero

    # absolute position in a query, including only known tokens
    known_pos = -1

    # flag set to True when we have found the first known token globally
    # across all query lines
    started = False

    spdx_lid_token_ids = self.spdx_lid_token_ids

    qlines = query_lines(
        location=location,
        query_string=query_string,
        start_line=start_line,
    )
    if TRACE:
        logger_debug('tokens_by_line: query lines:')
        qlines = list(qlines)
        for line_num, line in qlines:
            logger_debug(' ', line_num, ':', line)

    for line_num, line in qlines:
        # keep track of tokens in a line
        line_tokens = []
        line_tokens_append = line_tokens.append
        line_first_known_pos = None

        for token in query_tokenizer(line):
            tid = dic_get(token)
            is_stopword = token in STOPWORDS
            if tid is not None and not is_stopword:
                # this is a known token
                known_pos += 1
                started = True
                line_by_pos_append(line_num)
                if len(token) == 1 or token.isdigit():
                    self_shorts_and_digits_pos_add(known_pos)
                if line_first_known_pos is None:
                    line_first_known_pos = known_pos
            else:
                # process STOPWORDS and unknown words
                if is_stopword:
                    if not started:
                        # If we have not yet started globally, then all tokens
                        # seen so far are stopwords and we keep a count of
                        # them in the magic "-1" position.
                        self_stopwords_by_pos[-1] += 1
                    else:
                        # here we have a new stopword positioned right after
                        # the current known_pos
                        self_stopwords_by_pos[known_pos] += 1
                        stopwords_pos_add(known_pos)
                    # we do not track stopwords, only their position
                    continue
                else:
                    if not started:
                        # If we have not yet started globally, then all tokens
                        # seen so far are unknowns and we keep a count of them
                        # in the magic "-1" position.
                        self_unknowns_by_pos[-1] += 1
                    else:
                        # here we have a new unknown token positioned right
                        # after the current known_pos
                        self_unknowns_by_pos[known_pos] += 1
                        unknowns_pos_add(known_pos)

            line_tokens_append(tid)

        # last known token position in the current line
        line_last_known_pos = known_pos

        # ONLY collect as SPDX a line that starts with an SPDX License
        # Identifier. There are cases where this prefix does not start with
        # the first tokens, such as when one or two words (e.g. a comment
        # indicator such as DNL, REM etc.) start the line, followed by an
        # SPDX license identifier.
        spdx_start_offset = None
        if line_tokens[:3] in spdx_lid_token_ids:
            spdx_start_offset = 0
        elif line_tokens[1:4] in spdx_lid_token_ids:
            spdx_start_offset = 1
        elif line_tokens[2:5] in spdx_lid_token_ids:
            spdx_start_offset = 2

        if spdx_start_offset is not None:
            # keep the line, start/end known pos for SPDX matching
            spdx_prefix, spdx_expression = split_spdx_lid(line)
            spdx_text = ' '.join([spdx_prefix or '', spdx_expression])
            spdx_start_known_pos = line_first_known_pos + spdx_start_offset

            if spdx_start_known_pos <= line_last_known_pos:
                self.spdx_lines.append(
                    (spdx_text, spdx_start_known_pos, line_last_known_pos))

        yield line_tokens

    # finally create a Span of positions followed by unknowns and another
    # for positions followed by stopwords, used for intersection with the
    # query span to score matches correctly
    self.unknowns_span = Span(unknowns_pos)
    self.stopwords_span = Span(stopwords_pos)
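# A small illustration (toy word classes, not the real tokenizer or index) of
# the three-way handling above: known tokens advance known_pos, while
# stopwords and unknowns are only counted at the current position, so
# stopwords never shift the positions of the known tokens around them.
def demo_stopword_positions(words, known=('mit', 'license'), stop=('the',)):
    known_pos, started = -1, False
    stopwords_by_pos, unknowns_by_pos = {}, {}
    for w in words:
        if w in known:
            known_pos += 1
            started = True
        else:
            pos = known_pos if started else -1
            counter = stopwords_by_pos if w in stop else unknowns_by_pos
            counter[pos] = counter.get(pos, 0) + 1
    return known_pos, stopwords_by_pos, unknowns_by_pos

# 'the' does not advance positions: 'mit' and 'license' stay adjacent at 0 and 1
assert demo_stopword_positions(['mit', 'the', 'license']) == (1, {0: 1}, {})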
def exact_match(idx, query_run, automaton):
    """
    Return a list of exact LicenseMatch by matching the `query_run` against
    the `automaton` and `idx` index.
    """
    if TRACE:
        logger_debug(' #exact_AHO: start ... ')
    if TRACE_DEEP:
        logger_debug(' #exact_AHO: query_run:', query_run)

    len_junk = idx.len_junk
    rules_by_rid = idx.rules_by_rid

    qtokens = query_run.tokens
    qbegin = query_run.start
    query_run_matchables = query_run.matchables

    qtokens_as_str = array('h', qtokens).tostring()

    matches = []
    # iterate over matched strings: the matched value is
    # (rule id, index start pos, index end pos)
    for qend, matched_rule_segments in automaton.iter(qtokens_as_str):
        for rid, istart, iend in matched_rule_segments:
            rule = rules_by_rid[rid]
            if TRACE_DEEP:
                logger_debug(' #exact_AHO: found match to rule:', rule.identifier)

            ################################
            # FIXME: use a trie of ints or a trie of Unicode characters to
            # avoid these shenanigans
            ################################
            # Since the trie stores bytes and we have two bytes per token id,
            # the real end must be adjusted...
            real_qend = (qend - 1) / 2

            # ... and there is now a real possibility of a false match.
            # For instance, say we have these tokens: gpl encoded as 0012,
            # lgpl encoded as 1200 and mit as 2600. If we scan "mit lgpl" we
            # get this encoding: 2600 1200. The automaton will find a false
            # match of 0012 (gpl) straddling the middle, so we check that the
            # corrected end qposition is always an integer.
            real_qend_int = int(real_qend)
            if real_qend != real_qend_int:
                if TRACE:
                    logger_debug(
                        ' #exact_AHO: real_qend != int(real_qend), '
                        'discarding rule match:', rule.identifier)
                continue

            match_len = iend + 1 - istart
            matcher = MATCH_AHO_EXACT if match_len == rule.length else MATCH_AHO_FRAG
            real_qend = real_qend_int

            qposses = range(qbegin + real_qend - match_len + 1, qbegin + real_qend + 1)

            if any(p not in query_run_matchables for p in qposses):
                if TRACE:
                    logger_debug(
                        ' #exact_AHO: match has non-matchable positions, '
                        'discarding rule:', rule.identifier)
                continue

            qspan = Span(qposses)
            ispan = Span(range(istart, iend + 1))
            itokens = idx.tids_by_rid[rid]
            hispan = Span(p for p in ispan if itokens[p] >= len_junk)
            match = LicenseMatch(rule, qspan, ispan, hispan, query_run.start, matcher=matcher)
            matches.append(match)

    if TRACE and matches:
        logger_debug(' ##exact_AHO: matches found#', matches)
        for m in matches:
            print(m)

    return matches
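# Why the integer check above works, in isolation: the automaton matches over
# a byte string with two bytes per token id, so a hit can end in the middle
# of a token. A valid match always ends on the second byte of a token (an odd
# byte offset), making (qend - 1) / 2 a whole number. A sketch with assumed
# byte offsets (this relies on true division, as the function above does):
valid_qend = 3                              # last byte of the second token
assert (valid_qend - 1) / 2 == int((valid_qend - 1) / 2)

bogus_qend = 2                              # ends on the first byte of a token
assert (bogus_qend - 1) / 2 != int((bogus_qend - 1) / 2)  # 0.5: discarded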
def match_sequence(idx, candidate, query_run, start_offset=0):
    """
    Return a list of LicenseMatch by matching the `query_run` tokens sequence
    against the `idx` index for the `candidate` rule tuple
    (rid, rule, intersection).
    """
    if not candidate:
        return []

    rid, rule, _intersection = candidate
    high_postings = idx.high_postings_by_rid[rid]
    itokens = idx.tids_by_rid[rid]

    len_junk = idx.len_junk

    qbegin = query_run.start + start_offset
    qfinish = query_run.end
    qtokens = query_run.query.tokens

    matches = []
    qstart = qbegin
    qlen = len(query_run)

    # match as long as we find alignments and have high matchable tokens:
    # this allows finding repeated instances of the same rule in the query run
    query_run_matchables = query_run.matchables

    while qstart <= qfinish:
        if not query_run_matchables:
            break

        block_matches = match_blocks(
            qtokens, itokens, qstart, qlen,
            high_postings, len_junk, query_run_matchables)

        if not block_matches:
            break

        if TRACE2:
            logger_debug('block_matches:')
            for m in block_matches:
                i, j, k = m
                print(m)
                print('qtokens:', ' '.join(idx.tokens_by_tid[t] for t in qtokens[i:i + k]))
                print('itokens:', ' '.join(idx.tokens_by_tid[t] for t in itokens[j:j + k]))

        # create one match for each matching block: this is not entirely
        # correct but will be sorted out at LicenseMatch merging and
        # filtering time
        for qpos, ipos, mlen in block_matches:
            qspan = Span(range(qpos, qpos + mlen))
            iposses = range(ipos, ipos + mlen)
            hispan = Span(p for p in iposses if itokens[p] >= len_junk)
            ispan = Span(iposses)
            match = LicenseMatch(rule, qspan, ispan, hispan, qbegin, MATCH_SEQ)

            if TRACE2:
                qt, it = get_texts(
                    match,
                    location=query_run.query.location,
                    query_string=query_run.query.query_string,
                    idx=idx)
                print('###########################')
                print(match)
                print('###########################')
                print(qt)
                print('###########################')
                print(it)
                print('###########################')

            matches.append(match)
            qstart = max([qstart, qspan.end + 1])

    if TRACE:
        for m in matches:
            logger_debug(m)

    return matches
def test_LicenseMatch_small(self):
    r1_text = u'licensed under the GPL, licensed under the GPL'
    r1 = Rule(text_file='r1', licenses=['apache-1.1'], _text=r1_text)
    r2_text = u'licensed under the GPL, licensed under the GPL' * 10
    r2 = Rule(text_file='r2', licenses=['apache-1.1'], _text=r2_text)
    _idx = index.LicenseIndex([r1, r2])

    assert LicenseMatch(rule=r1, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12)).small()
    assert LicenseMatch(rule=r1, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(11, 12)).small()
    assert LicenseMatch(rule=r1, qspan=Span(10, 11, 12), ispan=Span(10, 11, 12), hispan=Span(11, 12)).small()
    assert LicenseMatch(rule=r1, qspan=Span(1, 6), ispan=Span(1, 6)).small()

    assert LicenseMatch(rule=r2, qspan=Span(0, 10), ispan=Span(0, 10), hispan=Span(12)).small()
    assert LicenseMatch(rule=r2, qspan=Span(5, 10), ispan=Span(5, 10), hispan=Span(5, 6)).small()
    assert LicenseMatch(rule=r2, qspan=Span(1, 6), ispan=Span(1, 6)).small()
def get_normalized_expression(query_string, try_as_expression=True):
    """
    Given a text `query_string` return a single detected license expression.
    `query_string` is typically the value of a license field as found in
    package manifests.
    If `try_as_expression` is True, first try to parse this as a license
    expression.
    Return None if the `query_string` is empty. Return "unknown" as a license
    expression if there is a `query_string` but nothing was detected.
    """
    if not query_string or not query_string.strip():
        return

    if TRACE:
        logger_debug('get_normalized_expression: query_string: "{}"'.format(query_string))

    from licensedcode.cache import get_index
    idx = get_index()
    licensing = Licensing()

    # we match twice in a cascade: as an expression, then as plain text if we
    # did not succeed.
    matches = None
    if try_as_expression:
        try:
            matched_as_expression = True
            matches = idx.match(query_string=query_string, as_expression=True)
            if matches_have_unknown(matches, licensing):
                # rematch also if we have unknowns
                matched_as_expression = False
                matches = idx.match(query_string=query_string, as_expression=False)
        except Exception:
            matched_as_expression = False
            matches = idx.match(query_string=query_string, as_expression=False)
    else:
        matched_as_expression = False
        matches = idx.match(query_string=query_string, as_expression=False)

    if not matches:
        # we have a query_string text but there was no match: return an
        # unknown key
        return 'unknown'

    if TRACE:
        logger_debug('get_normalized_expression: matches:', matches)

    # join the possible multiple detected license expressions with an AND
    expression_objects = [m.rule.license_expression_object for m in matches]
    if len(expression_objects) == 1:
        combined_expression_object = expression_objects[0]
    else:
        combined_expression_object = licensing.AND(*expression_objects)

    if matched_as_expression:
        # then just return the expression(s)
        return str(combined_expression_object)

    # Otherwise, verify that we consumed 100% of the query string, e.g. that
    # we have no unknown leftover.

    # 1. have all matches 100% coverage?
    all_matches_have_full_coverage = all(m.coverage() == 100 for m in matches)

    # TODO: have all matches a high enough score?

    # 2. are all declared license tokens consumed?
    query = matches[0].query
    # the query object should be the same for all matches. Is this always true??
    for mt in matches:
        if mt.query != query:
            # FIXME: the exception may be swallowed in callers!!!
            raise Exception(
                'Inconsistent package.declared_license: text with multiple "queries".'
                'Please report this issue to the scancode-toolkit team.\n'
                '{}'.format(query_string))

    query_len = len(query.tokens)
    matched_qspans = [m.qspan for m in matches]
    matched_qpositions = Span.union(*matched_qspans)
    len_all_matches = len(matched_qpositions)
    declared_license_is_fully_matched = query_len == len_all_matches

    if not all_matches_have_full_coverage or not declared_license_is_fully_matched:
        # We inject an 'unknown' symbol in the expression
        unknown = licensing.parse('unknown', simple=True)
        combined_expression_object = licensing.AND(combined_expression_object, unknown)

    return str(combined_expression_object)
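# A minimal usage sketch for get_normalized_expression (hypothetical inputs;
# assumes the cached license index can be built, i.e. a full installation):
def demo_get_normalized_expression():
    # empty or blank inputs return None
    assert get_normalized_expression('   ') is None
    # a non-empty input always yields an expression string, falling back to
    # 'unknown' (possibly AND-ed in) when detection is absent or incomplete
    expr = get_normalized_expression('MIT or Apache 2.0')
    assert isinstance(expr, str)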
def match_sequence(idx, rule, query_run, high_postings, start_offset=0,
                   match_blocks=None, deadline=sys.maxsize):
    """
    Return a list of LicenseMatch by matching the `query_run` tokens sequence
    starting at `start_offset` against the `idx` index for the candidate
    `rule`. Stop processing when reaching the deadline time.
    """
    if not rule:
        return []

    if not match_blocks:
        from licensedcode.seq import match_blocks

    rid = rule.rid
    itokens = idx.tids_by_rid[rid]

    len_legalese = idx.len_legalese

    qbegin = query_run.start + start_offset
    qfinish = query_run.end
    qtokens = query_run.query.tokens
    query = query_run.query

    matches = []
    qstart = qbegin

    # match as long as we find alignments and have high matchable tokens:
    # this allows finding repeated instances of the same rule in the query run

    while qstart <= qfinish:
        if TRACE2:
            logger_debug('\n\nmatch_seq:==========================LOOP=============================')

        if not query_run.is_matchable(include_low=False):
            break

        if TRACE2:
            logger_debug('match_seq:running block_matches:', 'a_start:', qstart, 'a_end', qfinish + 1)

        block_matches = match_blocks(
            a=qtokens, b=itokens, a_start=qstart, a_end=qfinish + 1,
            b2j=high_postings, len_good=len_legalese,
            matchables=query_run.matchables)

        if not block_matches:
            break

        # create one match for each matching block: they will be further
        # merged at LicenseMatch merging and filtering time
        for qpos, ipos, mlen in block_matches:
            qspan_end = qpos + mlen
            # skip a single non-high word matched as a sequence
            if mlen > 1 or (mlen == 1 and qtokens[qpos] < len_legalese):
                qspan = Span(range(qpos, qspan_end))
                ispan = Span(range(ipos, ipos + mlen))
                hispan = Span(p for p in ispan if itokens[p] < len_legalese)
                match = LicenseMatch(
                    rule, qspan, ispan, hispan, qbegin,
                    matcher=MATCH_SEQ, query=query)
                matches.append(match)

                if TRACE2:
                    from licensedcode.tracing import get_texts
                    qt, it = get_texts(match)
                    logger_debug('###########################')
                    logger_debug(match)
                    logger_debug('###########################')
                    logger_debug(qt)
                    logger_debug('###########################')
                    logger_debug(it)
                    logger_debug('###########################')

            qstart = max([qstart, qspan_end])

            if time() > deadline:
                break

        if time() > deadline:
            break

    if TRACE:
        logger_debug('match_seq: FINAL LicenseMatch(es)')
        for m in matches:
            logger_debug(m)
        logger_debug('\n\n')

    return matches
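# How hispan is derived above, in isolation: in this index layout, token ids
# below len_legalese are the "high" legalese tokens. A sketch with assumed
# token ids and a hypothetical len_legalese value:
from licensedcode.spans import Span

def demo_hispan():
    itokens = [3, 120, 7, 250, 12]  # hypothetical rule token ids
    len_legalese = 100
    ispan = Span(range(0, 5))
    hispan = Span(p for p in ispan if itokens[p] < len_legalese)
    # only the positions holding legalese tokens (ids 3, 7 and 12) remain
    assert hispan == Span([0, 2, 4])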
def match_fragments(idx, query_run):
    """
    Return a list of LicenseMatch by matching the `query_run` against the
    `idx` index using its fragments automaton.

    This is using a BLAST-like matching approach: we match ngram fragments of
    rules (e.g. a seed) and then we extend left and right.
    """
    if TRACE_FRAG:
        logger_debug('-------------->match_fragments')

    # Get matches using the AHO Fragments automaton
    matches = exact_match(
        idx, query_run, automaton=idx.fragments_automaton, matcher=MATCH_AHO_FRAG)

    if TRACE_FRAG:
        logger_debug('match_fragments')
        for m in matches:
            print(m)

    # Discard fragments that have any already matched positions in previous matches
    from licensedcode.match import filter_already_matched_matches
    matches, _discarded = filter_already_matched_matches(matches, query_run.query)

    # Merge matches with a zero max distance, e.g. contiguous or overlapping
    # with matches to the same rule
    from licensedcode.match import merge_matches
    matches = merge_matches(matches, max_dist=0)

    # extend matched fragments left and right. We group by rule
    from licensedcode.seq import extend_match

    rules_by_rid = idx.rules_by_rid
    tids_by_rid = idx.tids_by_rid
    len_legalese = idx.len_legalese

    alo = qbegin = query_run.start
    ahi = query_run.end
    query = query_run.query
    qtokens = query.tokens
    matchables = query_run.matchables

    frag_matches = []

    keyf = lambda m: m.rule.rid
    matches.sort(key=keyf)
    matches_by_rule = groupby(matches, key=keyf)

    for rid, rule_matches in matches_by_rule:
        itokens = tids_by_rid[rid]
        blo, bhi = 0, len(itokens)
        rule = rules_by_rid[rid]

        for match in rule_matches:
            i, j, k = match.qstart, match.istart, match.len()
            # extend alignment left and right as long as we have matchables
            qpos, ipos, mlen = extend_match(
                i, j, k, qtokens, itokens, alo, ahi, blo, bhi, matchables)

            qspan = Span(range(qpos, qpos + mlen))
            ispan = Span(range(ipos, ipos + mlen))
            hispan = Span(p for p in ispan if itokens[p] < len_legalese)
            match = LicenseMatch(
                rule, qspan, ispan, hispan, qbegin,
                matcher=MATCH_AHO_FRAG, query=query)
            frag_matches.append(match)

    # Merge matches as usual
    frag_matches = merge_matches(frag_matches)

    return frag_matches
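# A toy sketch of the BLAST-like seed-and-extend idea used above. This is a
# hypothetical helper, not the real licensedcode.seq.extend_match: it grows a
# seed alignment (i, j, k) left and right while tokens keep agreeing.
def toy_extend(i, j, k, a, b):
    # extend left while preceding tokens match
    while i > 0 and j > 0 and a[i - 1] == b[j - 1]:
        i, j, k = i - 1, j - 1, k + 1
    # extend right while following tokens match
    while i + k < len(a) and j + k < len(b) and a[i + k] == b[j + k]:
        k += 1
    return i, j, k

# a one-token seed on 'license' grows to cover the shared 'mit license is' run
a = ['this', 'mit', 'license', 'is', 'permissive']
b = ['mit', 'license', 'is']
assert toy_extend(2, 1, 1, a, b) == (1, 0, 3)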
def spdx_id_match(idx, query_run, text):
    """
    Return one LicenseMatch by matching the `text` as an SPDX license
    expression using the `query_run` positions and `idx` index for support.
    """
    from licensedcode.cache import get_spdx_symbols
    from licensedcode.cache import get_unknown_spdx_symbol

    if TRACE:
        logger_debug('spdx_id_match: start:', 'text:', text, 'query_run:', query_run)

    licensing = Licensing()
    symbols_by_spdx = get_spdx_symbols()
    unknown_symbol = get_unknown_spdx_symbol()

    expression = get_expression(text, licensing, symbols_by_spdx, unknown_symbol)
    expression_str = expression.render()

    if TRACE:
        logger_debug('spdx_id_match: expression:', repr(expression_str))

    # how many known or unknown-SPDX symbol occurrences do we have?
    known_syms = 0
    unknown_syms = 0
    for sym in licensing.license_symbols(expression, unique=False, decompose=True):
        if sym == unknown_symbol:
            unknown_syms += 1
        else:
            known_syms += 1

    match_len = len(query_run)
    match_start = query_run.start
    matched_tokens = query_run.tokens

    if TRACE:
        logger_debug(
            'spdx_id_match: matched_tokens: 1:', matched_tokens,
            [idx.tokens_by_tid[tid] for tid in matched_tokens])

    cleaned = clean_text(text).lower()
    if TRACE:
        logger_debug('spdx_id_match: cleaned :', cleaned)

    # build synthetic rule
    # TODO: ensure that all the SPDX license keys are known symbols
    rule = SpdxRule(
        license_expression=expression_str,
        # FIXME: for now we are putting the original query text as a
        # rule text: this is likely incorrect when it comes to properly
        # computing the known and unknowns and high and lows for this rule.
        # alternatively we could use the expression string, padded with
        # spdx-license-identifier: this may be wrong too, if the line was
        # not padded originally with this tag
        stored_text=text,
        length=match_len)

    if TRACE:
        logger_debug('spdx_id_match: synthetic rule:', rule.relevance)
        logger_debug('spdx_id_match: synthetic rule:', rule)

    # build match from parsed expression
    # collect match start and end: e.g. the whole text
    qspan = Span(range(match_start, query_run.end + 1))

    # we use the query side to build the ispans
    ispan = Span(range(0, match_len))

    len_legalese = idx.len_legalese
    hispan = Span(p for p, t in enumerate(matched_tokens) if t < len_legalese)

    match = LicenseMatch(
        rule=rule,
        qspan=qspan,
        ispan=ispan,
        hispan=hispan,
        query_run_start=match_start,
        matcher=MATCH_SPDX_ID,
        query=query_run.query,
    )

    if TRACE:
        logger_debug('spdx_id_match: match found:', match)

    return match
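# A minimal sketch of the input this matcher consumes (hypothetical line;
# split_spdx_lid is the same helper used by Query.tokens_by_line above to
# carve out SPDX lines before they reach this function):
from licensedcode.match_spdx_lid import split_spdx_lid

def demo_split_spdx_lid():
    prefix, expression = split_spdx_lid('SPDX-License-Identifier: MIT OR Apache-2.0')
    # prefix holds the identifier tag portion (or None) and expression holds
    # the remainder that get_expression() then parses against SPDX symbols
    return prefix, expression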
def test_merge_does_not_merge_overlapping_matches_in_sequence_with_asymmetric_overlap(self):
    r1 = Rule(text_file='r1', license_expression=u'lgpl-2.0-plus')

    # ---> merge_matches: current: LicenseMatch<'3-seq', lines=(9, 28),
    #      'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=87.5,
    #      qlen=126, ilen=126, hilen=20, rlen=144, qreg=(50, 200), ireg=(5, 142),
    #      qspan=Span(50, 90)|Span(92, 142)|Span(151, 182)|Span(199, 200),
    #      ispan=Span(5, 21)|Span(23, 46)|Span(48, 77)|Span(79, 93)|Span(95, 100)
    #          |Span(108, 128)|Span(130, 142),
    #      hispan=Span(10)|Span(14)|Span(18)|Span(24)|Span(27)|Span(52)|Span(57)
    #          |Span(61)|Span(65, 66)|Span(68)|Span(70)|Span(80)|Span(88)|Span(96)
    #          |Span(111)|Span(113)|Span(115)|Span(131)|Span(141)>
    # ---> merge_matches: next: LicenseMatch<'2-aho', lines=(28, 44),
    #      'lgpl-2.0-plus_9.RULE', u'lgpl-2.0-plus', choice=False, score=100.0,
    #      qlen=144, ilen=144, hilen=21, rlen=144, qreg=(198, 341), ireg=(0, 143),
    #      qspan=Span(198, 341), ispan=Span(0, 143),
    #      hispan=Span(1)|Span(10)|Span(14)|Span(18)|Span(24)|Span(27)|Span(52)
    #          |Span(57)|Span(61)|Span(65, 66)|Span(68)|Span(70)|Span(80)|Span(88)
    #          |Span(96)|Span(111)|Span(113)|Span(115)|Span(131)|Span(141)>
    # ---> ###merge_matches: next overlaps in sequence current, merged as new:
    #      LicenseMatch<'3-seq 2-aho', lines=(9, 44), 'lgpl-2.0-plus_9.RULE',
    #      u'lgpl-2.0-plus', choice=False, score=100.0, qlen=268, ilen=144,
    #      hilen=21, rlen=144, qreg=(50, 341), ireg=(0, 143),
    #      qspan=Span(50, 90)|Span(92, 142)|Span(151, 182)|Span(198, 341),
    #      ispan=Span(0, 143), his
    # ---> merge_matches: current: qlen=126, ilen=126, hilen=20, rlen=144,
    #      qreg=(50, 200), ireg=(5, 142)
    # ---> merge_matches: next: qlen=144, ilen=144, hilen=21, rlen=144,
    #      qreg=(198, 341), ireg=(0, 143)

    m1 = LicenseMatch(
        rule=r1,
        qspan=Span(50, 90) | Span(92, 142) | Span(151, 182) | Span(199, 200),
        ispan=(
            Span(5, 21) | Span(23, 46) | Span(48, 77) | Span(79, 93)
            | Span(95, 100) | Span(108, 128) | Span(130, 142)),
        hispan=(
            Span(10) | Span(14) | Span(18) | Span(24) | Span(27) | Span(52)
            | Span(57) | Span(61) | Span(65, 66) | Span(68) | Span(70)
            | Span(80) | Span(88) | Span(96) | Span(111) | Span(113)
            | Span(115) | Span(131) | Span(141)),
    )

    m2 = LicenseMatch(
        rule=r1,
        qspan=Span(198, 341),
        ispan=Span(0, 143),
        hispan=(
            Span(1) | Span(10) | Span(14) | Span(18) | Span(24) | Span(27)
            | Span(52) | Span(57) | Span(61) | Span(65, 66) | Span(68)
            | Span(70) | Span(80) | Span(88) | Span(96) | Span(111)
            | Span(113) | Span(115) | Span(131) | Span(141)),
    )

    matches = merge_matches([m1, m2])
    assert [m1, m2] == matches
def test_LicenseMatch_equality(self):
    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
    m2 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))
    assert m1 == m2

    r2 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    m3 = LicenseMatch(rule=r2, qspan=Span(0, 2), ispan=Span(0, 2))
    assert m1 != m3

    r1 = Rule(text_file='r1', licenses=['apache-2.0', 'gpl'])
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))

    r2 = Rule(text_file='r2', licenses=['gpl', 'apache-2.0'])
    m2 = LicenseMatch(rule=r2, qspan=Span(0, 2), ispan=Span(0, 2))
    assert m1 != m2
    assert m2 != m1

    r3 = Rule(text_file='r3', licenses=['gpl', 'apache-2.0'])
    m3 = LicenseMatch(rule=r3, qspan=Span(0, 2), ispan=Span(0, 2))
    assert m2 != m3
def test_filter_matches_filters_non_contiguous_or_overlapping_contained_matches_with_touching_boundaries(self):
    r1 = Rule(text_file='r1', license_expression='apache-2.0 OR gpl')
    m1 = LicenseMatch(rule=r1, qspan=Span(0, 2), ispan=Span(0, 2))

    r2 = Rule(text_file='r2', license_expression='apache-2.0 OR gpl')
    m2 = LicenseMatch(rule=r2, qspan=Span(3, 7), ispan=Span(3, 7))

    r3 = Rule(text_file='r3', license_expression='apache-2.0 OR gpl')
    m3 = LicenseMatch(rule=r3, qspan=Span(0, 6), ispan=Span(0, 6))

    r6 = Rule(text_file='r6', license_expression='apache-2.0 OR gpl')
    m6 = LicenseMatch(rule=r6, qspan=Span(1, 7), ispan=Span(1, 7))

    r5 = Rule(text_file='r5', license_expression='apache-2.0 OR gpl')
    m5 = LicenseMatch(rule=r5, qspan=Span(1, 6), ispan=Span(1, 6))

    r4 = Rule(text_file='r4', license_expression='apache-2.0 OR gpl')
    m4 = LicenseMatch(rule=r4, qspan=Span(0, 7), ispan=Span(0, 7))

    result, discarded = filter_contained_matches([m1, m2, m3, m4, m5, m6])
    assert [m4] == result
    assert discarded
def get_licenses(location, min_score=0, include_text=False,
                 license_text_diagnostics=False,
                 license_url_template=SCANCODE_LICENSEDB_URL,
                 deadline=sys.maxsize, **kwargs):
    """
    Return a mapping of license detection data for licenses detected in the
    file at `location`. This mapping contains these keys:
     - 'licenses' with a value that is a list of mappings of license
       information.
     - 'license_expressions' with a value that is a list of license
       expression strings.
     - 'spdx_license_expressions' with a value that is a list of SPDX license
       expression strings.
     - 'percentage_of_license_text' with a value that is the overall
       proportion of detected license text and license notice words in the
       file, used to determine if a file contains mostly licensing
       information.

    `min_score` is a minimum score threshold from 0 to 100. The default is 0,
    meaning that all license matches are returned; otherwise, matches with a
    score below `min_score` are not returned.

    If `include_text` is True, the matched text is included in the returned
    `licenses` data.
    """
    from licensedcode import cache
    from licensedcode.spans import Span

    idx = cache.get_index()

    detected_licenses = []
    detected_expressions = []

    matches = idx.match(location=location, min_score=min_score, deadline=deadline, **kwargs)

    qspans = []
    match = None
    for match in matches:
        qspans.append(match.qspan)

        detected_expressions.append(match.rule.license_expression)

        detected_licenses.extend(
            _licenses_data_from_match(
                match=match,
                include_text=include_text,
                license_text_diagnostics=license_text_diagnostics,
                license_url_template=license_url_template))

    percentage_of_license_text = 0
    if match:
        # we need at least one match to compute a license_coverage
        matched_tokens_length = len(Span().union(*qspans))
        query_tokens_length = match.query.tokens_length(with_unknown=True)
        percentage_of_license_text = round(
            (matched_tokens_length / query_tokens_length) * 100, 2)

    detected_spdx_expressions = []
    return dict([
        ('licenses', detected_licenses),
        ('license_expressions', detected_expressions),
        ('spdx_license_expressions', detected_spdx_expressions),
        ('percentage_of_license_text', percentage_of_license_text),
    ])
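# A minimal usage sketch for get_licenses (hypothetical path; assumes a
# working installation with a cached index):
def demo_get_licenses():
    results = get_licenses('path/to/some/file.c')  # hypothetical location
    # the returned mapping always carries these keys, per the function above
    expected = {'licenses', 'license_expressions',
                'spdx_license_expressions', 'percentage_of_license_text'}
    assert expected == set(results)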
def test_get_key_phrases_ignores_stopwords_in_positions(self):
    text = 'The word comma is a stop word so comma does not increase the span position {{MIT license}}.'
    key_phrase_spans = get_key_phrase_spans(text)
    assert list(key_phrase_spans) == [Span(11, 12)]
def tokens_by_line(self):
    """
    Yield one sequence of tokens for each line in this query. Populate the
    query `line_by_pos`, `unknowns_by_pos`, `shorts_and_digits_pos` and
    `spdx_lines` as a side effect.
    """
    # bind frequently called functions to local scope
    tokenizer = query_tokenizer
    line_by_pos_append = self.line_by_pos.append
    self_unknowns_by_pos = self.unknowns_by_pos
    unknowns_pos = set()
    unknowns_pos_add = unknowns_pos.add
    self_shorts_and_digits_pos_add = self.shorts_and_digits_pos.add
    dic_get = self.idx.dictionary.get

    # note: positions start at zero

    # absolute position in a query, including all known and unknown tokens
    abs_pos = -1

    # absolute position in a query, including only known tokens
    known_pos = -1

    started = False

    spdx_lid_token_ids = self.spdx_lid_token_ids
    do_collect_spdx_lines = spdx_lid_token_ids is not None

    if TRACE:
        logger_debug('tokens_by_line: query lines')
        for line_num, line in query_lines(self.location, self.query_string):
            logger_debug(' ', line_num, ':', line)

    for line_num, line in query_lines(self.location, self.query_string):
        line_tokens = []
        line_tokens_append = line_tokens.append
        line_start_known_pos = None

        # FIXME: the implicit update of abs_pos is not clear
        for abs_pos, token in enumerate(tokenizer(line), abs_pos + 1):
            tid = dic_get(token)
            if tid is not None:
                known_pos += 1
                started = True
                line_by_pos_append(line_num)
                if len(token) == 1 or token.isdigit():
                    self_shorts_and_digits_pos_add(known_pos)
                if line_start_known_pos is None:
                    line_start_known_pos = known_pos
            else:
                if not started:
                    # we have not yet started
                    self_unknowns_by_pos[-1] += 1
                else:
                    self_unknowns_by_pos[known_pos] += 1
                    unknowns_pos_add(known_pos)

            line_tokens_append(tid)

        line_end_known_pos = known_pos

        # this works ONLY if the line starts with SPDX, or if we have one
        # word (such as a comment indicator DNL, REM etc.) followed by an
        # SPDX id
        if do_collect_spdx_lines and (
            line_tokens[:3] == spdx_lid_token_ids
            or line_tokens[1:4] == spdx_lid_token_ids
        ):
            # keep the line, start/end known pos for SPDX matching
            self.spdx_lines.append((line, line_start_known_pos, line_end_known_pos))

        yield line_tokens

    # finally create a Span of positions followed by unknowns, used
    # for intersection with the query span for scoring matches
    self.unknowns_span = Span(unknowns_pos)