def get_approximate_matches(self, query):
    """
    Approximate matching strategy using query_runs and multiple local
    alignments (aka. diff). Return a list of matches.

    For each matchable query run: first try a cheap hash match of the whole
    run; otherwise rank candidate rules with the inverted index and run
    sequence alignment against each candidate, restarting past the end of
    each found match so several matches per candidate can be collected.
    """
    matches = []

    # we exclude small and "weak" rules from the subset entirely: they are
    # unlikely to be matchable with a seq match
    rules_subset = (self.regular_rids | self.small_rids).difference(
        self.weak_rids)

    for query_run in query.query_runs:
        if not query_run.is_matchable(include_low=True):
            continue

        # per query run hash matching just in case we are lucky: an exact
        # whole-run hash hit makes the costly set/seq matching unnecessary
        hash_matches = match_hash.hash_match(self, query_run)
        if hash_matches:
            matches.extend(hash_matches)
            continue

        # inverted index match and ranking, query run-level
        # FIXME: we should consider aho matches to exclude them from candidates
        # FIXME: also exclude from candidates any rule that is only aho-matchable
        run_matches = []
        MAX_CANDIDATES = 50
        candidates = match_set.compute_candidates(
            query_run, self, rules_subset=rules_subset, top=MAX_CANDIDATES)

        # multiple sequence matching/alignment, query run-level
        for candidate in candidates:
            start_offset = 0
            while True:
                rule_matches = match_seq.match_sequence(
                    self, candidate, query_run, start_offset=start_offset)
                if not rule_matches:
                    break
                else:
                    # resume matching right after the last matched query
                    # position so one candidate can yield several matches
                    matches_end = max(m.qend for m in rule_matches)
                    run_matches.extend(rule_matches)
                    if matches_end + 1 < query_run.end:
                        start_offset = matches_end + 1
                        continue
                    else:
                        break

        # merge contiguous/overlapping matches of this run before collecting
        matches.extend(match.merge_matches(run_matches, max_dist=MAX_DIST))
    return matches
def match_query(
    self,
    qry,
    min_score=0,
    as_expression=False,
    expression_symbols=None,
    approximate=True,
    unknown_licenses=False,
    deadline=sys.maxsize,
    _skip_hash_match=False,
    **kwargs,
):
    """
    Return a sequence of LicenseMatch by matching the ``qry`` Query against
    this index. See Index.match() for arguments documentation.

    Matching pipeline: whole-query hash match (short-circuit), then SPDX
    expression-only matching when ``as_expression``, then a cascade of
    matchers (aho exact, spdx_lid, optionally approximate seq), then
    refinement; when ``unknown_licenses`` is True also run unknown-license
    detection on the query positions not covered by good matches.
    """
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        return []

    if not _skip_hash_match:
        # a whole-query hash hit is an exact match: return it immediately
        matches = match_hash.hash_match(self, whole_query_run)
        if matches:
            match.set_matched_lines(matches, qry.line_by_pos)
            return matches

    # bind the caller-provided SPDX symbols once for all later calls
    get_spdx_id_matches = partial(
        self.get_spdx_id_matches,
        expression_symbols=expression_symbols,
    )

    if as_expression:
        matches = get_spdx_id_matches(qry, from_spdx_id_lines=False)
        match.set_matched_lines(matches, qry.line_by_pos)
        return matches

    matches = []

    if USE_AHO_FRAGMENTS:
        approx = self.get_fragments_matches
    else:
        approx = self.get_approximate_matches

    matchers = [
        # matcher, include_low in post-matching remaining matchable check
        (self.get_exact_matches, False, 'aho'),
        (get_spdx_id_matches, True, 'spdx_lid'),
    ]

    if approximate:
        matchers += [(approx, False, 'seq'), ]

    already_matched_qspans = []
    for matcher, include_low, matcher_name in matchers:
        if TRACE:
            logger_debug()
            logger_debug('matching with matcher:', matcher_name)

        matched = matcher(
            qry,
            matched_qspans=already_matched_qspans,
            existing_matches=matches,
            deadline=deadline,
        )
        if TRACE:
            self.debug_matches(
                matches=matched,
                message='matched with: ' + matcher_name,
                location=qry.location,
                query_string=qry.query_string,
            )

        matched = match.merge_matches(matched)
        matches.extend(matched)

        # Subtract whole text matched if this is long enough so later
        # matchers do not re-match the same region
        for m in matched:
            if (m.rule.is_license_text
                and m.rule.length > 120
                and m.coverage() > 98
            ):
                qry.subtract(m.qspan)

        # Check if we have some matchable left; do not match further if we
        # do not need to. Collect qspans matched exactly e.g. with coverage
        # 100%: this coverage check is because we have provision to match
        # fragments (unused for now).
        already_matched_qspans.extend(
            m.qspan for m in matched if m.coverage() == 100)

        if not whole_query_run.is_matchable(
            include_low=include_low, qspans=already_matched_qspans):
            break

        # break if deadline has passed
        if time() > deadline:
            break

    # refining matches without filtering false positives yet: weak matches
    # may still be needed below for unknown-license detection
    matches, _discarded = match.refine_matches(
        matches=matches,
        query=qry,
        min_score=min_score,
        filter_false_positive=False,
        merge=True,
    )

    if unknown_licenses:
        good_matches, weak_matches = match.split_weak_matches(matches)

        # collect the positions that are "good matches" to exclude from
        # matching for unknown_licenses. Create a Span to check for unknown
        # based on this.
        original_qspan = Span(0, len(qry.tokens) - 1)
        good_qspans = (m.qspan for m in good_matches)
        good_qspan = Span().union(*good_qspans)
        unmatched_qspan = original_qspan.difference(good_qspan)

        # for each subspan, run unknown license detection
        unknown_matches = []
        for unspan in unmatched_qspan.subspans():
            unquery_run = query.QueryRun(
                query=qry,
                start=unspan.start,
                end=unspan.end,
            )
            unknown_match = match_unknown.match_unknowns(
                idx=self,
                query_run=unquery_run,
                automaton=self.unknown_automaton,
            )
            if unknown_match:
                unknown_matches.append(unknown_match)

        # drop unknown matches fully contained in good matches
        unknown_matches = match.filter_invalid_contained_unknown_matches(
            unknown_matches=unknown_matches,
            good_matches=good_matches,
        )
        matches.extend(unknown_matches)

        # reinject weak matches and let refine matches keep the bests
        matches.extend(weak_matches)

    if not matches:
        return []

    if TRACE:
        logger_debug()
        self.debug_matches(
            matches=matches,
            message='matches before final merge',
            location=qry.location,
            query_string=qry.query_string,
            with_text=True,
            qry=qry)

    # final refinement, this time filtering false positives
    matches, _discarded = match.refine_matches(
        matches=matches,
        query=qry,
        min_score=min_score,
        filter_false_positive=True,
        merge=True,
    )

    matches.sort()

    if TRACE:
        self.debug_matches(
            matches=matches,
            message='final matches',
            location=qry.location,
            query_string=qry.query_string,
            with_text=True,
            qry=qry,
        )

    return matches
def match(self, location=None, query_string=None, min_score=0,
          as_expression=False, deadline=sys.maxsize, _skip_hash_match=False,
          **kwargs):
    """
    This is the main entry point to match licenses.

    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.

    If `as_expression` is True, treat the whole text as a single SPDX license
    expression and use only expression matching.

    `deadline` is a time.time() value in seconds by which the processing
    should stop and return whatever was matched so far.

    `_skip_hash_match` is used only for testing.
    """
    assert 0 <= min_score <= 100

    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, idx=self,
                            text_line_threshold=15, bin_line_threshold=50)

    if TRACE:
        logger_debug('match: for:', location, 'query:', qry)

    if not qry:
        return []

    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        return []

    if not _skip_hash_match:
        # a whole-query hash hit is an exact match: return it immediately
        matches = match_hash.hash_match(self, whole_query_run)
        if matches:
            match.set_lines(matches, qry.line_by_pos)
            return matches

    # TODO: add match to degenerated expressions with custom symbols
    if as_expression:
        matches = self.get_spdx_id_matches(qry, from_spdx_id_lines=False)
        match.set_lines(matches, qry.line_by_pos)
        return matches

    # subtract negative-rule matches from the matchable query positions
    negative_matches = []
    if self.negative_rids:
        negative_matches = self.negative_match(whole_query_run)
        for neg in negative_matches:
            whole_query_run.subtract(neg.qspan)
        if TRACE_NEGATIVE:
            self.debug_matches(
                matches=negative_matches,
                message='negative_matches',
                location=location,
                query_string=query_string)  # , with_text, query)

    matches = []

    if USE_AHO_FRAGMENTS:
        approx = self.get_fragments_matches
    else:
        approx = self.get_approximate_matches

    matchers = [
        # matcher, include_low in post-matching remaining matchable check
        (self.get_spdx_id_matches, True, 'spdx_lid'),
        (self.get_exact_matches, False, 'aho'),
        (approx, False, 'seq'),
    ]

    already_matched_qspans = []
    for matcher, include_low, matcher_name in matchers:
        if TRACE:
            logger_debug()
            logger_debug('matching with matcher:', matcher_name)

        matched = matcher(qry, matched_qspans=already_matched_qspans,
                          existing_matches=matches, deadline=deadline)
        if TRACE:
            self.debug_matches(
                matches=matched,
                message='matched with: ' + matcher_name,
                location=location,
                query_string=query_string)  # , with_text, query)

        matched = match.merge_matches(matched)
        matches.extend(matched)

        # subtract whole text matched if this is long enough so later
        # matchers do not re-match the same region
        for m in matched:
            if m.rule.is_license_text and m.rule.length > 120 and m.coverage() > 98:
                qry.subtract(m.qspan)

        # check if we have some matchable left
        # do not match further if we do not need to
        # collect qspans matched exactly e.g. with coverage 100%
        # this coverage check is because we have provision to match fragments (unused for now)
        already_matched_qspans.extend(m.qspan for m in matched if m.coverage() == 100)

        if not whole_query_run.is_matchable(
            include_low=include_low, qspans=already_matched_qspans):
            break

        # break if deadline has passed
        if time() > deadline:
            break

    if not matches:
        return []

    if TRACE:
        logger_debug()
        self.debug_matches(matches=matches, message='matches before final merge',
                           location=location, query_string=query_string,
                           with_text=True, qry=qry)

    # final refinement, filtering false positives
    matches, _discarded = match.refine_matches(
        matches, idx=self, query=qry, min_score=min_score,
        max_dist=MAX_DIST // 2, filter_false_positive=True, merge=True)

    matches.sort()
    match.set_lines(matches, qry.line_by_pos)

    if TRACE:
        print()
        self.debug_matches(matches=matches, message='final matches',
                           location=location, query_string=query_string,
                           with_text=True, qry=qry)

    return matches
def match(self, location=None, query_string=None, min_score=0,
          as_expression=False, **kwargs):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.

    If `as_expression` is True, treat the whole text as a single SPDX license
    expression and use only expression matching.
    """
    assert 0 <= min_score <= 100

    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, idx=self,
                            text_line_threshold=15, bin_line_threshold=50)

    if not qry:
        return []

    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        return []

    # a whole-query hash hit is an exact match: return it immediately
    hash_matches = match_hash.hash_match(self, whole_query_run)
    if hash_matches:
        match.set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    if as_expression:
        matches = self.get_spdx_id_matches(qry, from_spdx_id_lines=False)
        match.set_lines(matches, qry.line_by_pos)
        return matches

    # subtract negative-rule matches from the matchable query positions
    negative_matches = []
    if self.negative_rids:
        negative_matches = self.negative_match(whole_query_run)
        for neg in negative_matches:
            whole_query_run.subtract(neg.qspan)
        if TRACE_NEGATIVE:
            self.debug_matches(negative_matches, 'negative_matches',
                               location, query_string)  # , with_text, query)

    matches = []
    # matchers are tried in order, from cheapest to most expensive
    matchers = [
        self.get_spdx_id_matches,
        self.get_exact_matches,
        self.get_approximate_matches
    ]
    for matcher in matchers:
        matched = matcher(qry)
        if TRACE:
            logger_debug('matching with matcher:', matcher)
            self.debug_matches(matched, 'matched', location, query_string)  # , with_text, query)

        matches.extend(matched)

        # check if we have some matchable left
        # do not match further if we do not need to
        # collect qspans matched exactly e.g. with coverage 100%
        # this coverage check is because we have provision to match fragments (unused for now)
        matched_qspans = [m.qspan for m in matches if m.coverage() == 100]
        if not whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):
            break

    if not matches:
        return []

    # refine, filtering false positives
    matches, _discarded = match.refine_matches(
        matches, idx=self, query=qry, min_score=min_score,
        max_dist=MAX_DIST // 2, filter_false_positive=True)

    matches.sort()
    match.set_lines(matches, qry.line_by_pos)
    return matches
def match_query(
    self,
    qry,
    min_score=0,
    as_expression=False,
    expression_symbols=None,
    approximate=True,
    deadline=sys.maxsize,
    _skip_hash_match=False,
    **kwargs,
):
    """
    Return a sequence of LicenseMatch by matching the `qry` Query against
    this index. See Index.match() for arguments documentation.

    Matching pipeline: whole-query hash match (short-circuit), then SPDX
    expression-only matching when `as_expression`, then a cascade of
    matchers (aho exact, spdx_lid, optionally approximate seq), then final
    refinement filtering false positives.
    """
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        return []

    if not _skip_hash_match:
        # a whole-query hash hit is an exact match: return it immediately
        matches = match_hash.hash_match(self, whole_query_run)
        if matches:
            match.set_lines(matches, qry.line_by_pos)
            return matches

    # bind the caller-provided SPDX symbols once for all later calls
    get_spdx_id_matches = partial(
        self.get_spdx_id_matches,
        expression_symbols=expression_symbols,
    )

    if as_expression:
        matches = get_spdx_id_matches(qry, from_spdx_id_lines=False)
        match.set_lines(matches, qry.line_by_pos)
        return matches

    matches = []

    if USE_AHO_FRAGMENTS:
        approx = self.get_fragments_matches
    else:
        approx = self.get_approximate_matches

    matchers = [
        # matcher, include_low in post-matching remaining matchable check
        (self.get_exact_matches, False, 'aho'),
        (get_spdx_id_matches, True, 'spdx_lid'),
    ]

    if approximate:
        matchers += [
            (approx, False, 'seq'),
        ]

    already_matched_qspans = []
    for matcher, include_low, matcher_name in matchers:
        if TRACE:
            logger_debug()
            logger_debug('matching with matcher:', matcher_name)

        matched = matcher(
            qry,
            matched_qspans=already_matched_qspans,
            existing_matches=matches,
            deadline=deadline,
        )
        if TRACE:
            # BUGFIX: this function has no `location`/`query_string`
            # parameters: referencing them raised a NameError under TRACE.
            # Use the query attributes instead.
            self.debug_matches(
                matches=matched,
                message='matched with: ' + matcher_name,
                location=qry.location,
                query_string=qry.query_string,
            )

        matched = match.merge_matches(matched)
        matches.extend(matched)

        # Subtract whole text matched if this is long enough so later
        # matchers do not re-match the same region
        for m in matched:
            if (m.rule.is_license_text and m.rule.length > 120 and m.coverage() > 98):
                qry.subtract(m.qspan)

        # Check if we have some matchable left; do not match further if we
        # do not need to. Collect qspans matched exactly e.g. with coverage
        # 100%: this coverage check is because we have provision to match
        # fragments (unused for now).
        already_matched_qspans.extend(m.qspan for m in matched if m.coverage() == 100)

        if not whole_query_run.is_matchable(include_low=include_low,
                                            qspans=already_matched_qspans):
            break

        # break if deadline has passed
        if time() > deadline:
            break

    if not matches:
        return []

    if TRACE:
        logger_debug()
        # BUGFIX: same undefined-name fix as above
        self.debug_matches(matches=matches, message='matches before final merge',
                           location=qry.location, query_string=qry.query_string,
                           with_text=True, qry=qry)

    # final refinement, filtering false positives
    matches, _discarded = match.refine_matches(
        matches=matches,
        idx=self,
        query=qry,
        min_score=min_score,
        filter_false_positive=True,
        merge=True,
    )

    matches.sort()
    match.set_lines(matches, qry.line_by_pos)

    if TRACE:
        # BUGFIX: same undefined-name fix as above
        self.debug_matches(
            matches=matches,
            message='final matches',
            location=qry.location,
            query_string=qry.query_string,
            with_text=True,
            qry=qry,
        )

    return matches
def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.

    `detect_negative` is for testing purpose only.
    """
    assert 0 <= min_score <= 100
    if TRACE:
        print()
        logger_debug('match start....')

    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, self)
    if not qry:
        logger_debug('#match: No query returned for:', location)
        return []

    #######################################################################
    # Whole file matching: hash and exact matching
    #######################################################################
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        logger_debug('#match: whole query not matchable')
        return []

    # hash: a whole-query hash hit is an exact match, return it immediately
    hash_matches = match_hash.hash_match(self, whole_query_run)
    if hash_matches:
        if TRACE:
            self.debug_matches(hash_matches, '#match FINAL Hash matched', location, query_string)
        match.set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    # negative rules exact matching: subtract matched spans from matchables
    negative_matches = []
    # note: detect_negative is false only to test negative rules detection proper
    if detect_negative and self.negative_rids:
        if TRACE:
            logger_debug('#match: NEGATIVE')
        negative_matches = self.negative_match(whole_query_run)
        for neg in negative_matches:
            whole_query_run.subtract(neg.qspan)

    # exact matches
    if TRACE_EXACT:
        logger_debug('#match: EXACT')
    exact_matches = match_aho.exact_match(self, whole_query_run, self.rules_automaton)

    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: EXACT matches#:', location, query_string)

    exact_matches, exact_discarded = match.refine_matches(
        exact_matches, self, query=qry, filter_false_positive=False, merge=False)

    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: ===> exact matches refined')
    if TRACE_EXACT:
        self.debug_matches(exact_discarded, ' #match: ===> exact matches discarded')

    matches = exact_matches
    discarded = exact_discarded

    #######################################################################
    # Per query run matching.
    #######################################################################
    if TRACE:
        logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

    # check if we have some matchable left
    # collect qspans matched exactly e.g. with coverage 100%
    # this coverage check is because we have provision to match fragments (unused for now)
    matched_qspans = [m.qspan for m in exact_matches if m.coverage() == 100]
    # do not match further if we do not need to
    if whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):

        # FIXME: we should exclude small and "weak" rules from the subset entirely
        # they are unlikely to be matchable with a seq match
        rules_subset = (self.regular_rids | self.small_rids)

        for qrnum, query_run in enumerate(qry.query_runs, 1):
            if TRACE_QUERY_RUN_SIMPLE:
                logger_debug('#match: ===> processing query run #:', qrnum)
                logger_debug(' #match:query_run:', query_run)

            if not query_run.is_matchable(include_low=True):
                if TRACE:
                    logger_debug('#match: query_run NOT MATCHABLE')
                continue

            # hash match
            #########################
            hash_matches = match_hash.hash_match(self, query_run)
            if hash_matches:
                if TRACE:
                    self.debug_matches(hash_matches, ' #match Query run matches (hash)', location, query_string)
                matches.extend(hash_matches)
                continue

            # FIXME: why do not we aho match again here? This would avoid
            # going into the costly set and seq re-match that may not be needed at all
            # alternatively we should consider aho matches to exclude them from candidates

            # query run match proper using sequence matching
            #########################################
            if TRACE:
                logger_debug(' #match: Query run MATCHING proper....')

            run_matches = []
            candidates = match_set.compute_candidates(query_run, self, rules_subset=rules_subset, top=40)

            if TRACE_CANDIDATES:
                logger_debug(' #match: query_run: number of candidates for seq match #', len(candidates))

            for candidate_num, candidate in enumerate(candidates):
                if TRACE_QUERY_RUN:
                    _, canrule, _ = candidate
                    logger_debug(' #match: query_run: seq matching candidate#:', candidate_num, 'candidate:', canrule)
                start_offset = 0
                while True:
                    rule_matches = match_seq.match_sequence(self, candidate, query_run, start_offset=start_offset)
                    if TRACE_QUERY_RUN and rule_matches:
                        self.debug_matches(rule_matches, ' #match: query_run: seq matches for candidate', with_text=True, query=qry)
                    if not rule_matches:
                        break
                    else:
                        # resume matching right after the last matched query
                        # position so one candidate can yield several matches
                        matches_end = max(m.qend for m in rule_matches)
                        run_matches.extend(rule_matches)
                        if matches_end + 1 < query_run.end:
                            start_offset = matches_end + 1
                            continue
                        else:
                            break

            ############################################################################
            if TRACE_QUERY_RUN:
                self.debug_matches(run_matches, ' #match: ===> Query run matches', location, query_string, with_text=True, query=qry)

            run_matches = match.merge_matches(run_matches, max_dist=MAX_DIST)
            matches.extend(run_matches)
            if TRACE:
                self.debug_matches(run_matches, ' #match: Query run matches merged', location, query_string)

    # final matching merge, refinement and filtering
    ################################################
    if matches:
        logger_debug()
        logger_debug('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

        matches, whole_discarded = match.refine_matches(
            matches, idx=self, query=qry, min_score=min_score,
            max_dist=MAX_DIST // 2, filter_false_positive=True)

        if TRACE_MATCHES_DISCARD:
            discarded.extend(whole_discarded)

    matches.sort()
    match.set_lines(matches, qry.line_by_pos)

    self.debug_matches(matches, '#match: FINAL MERGED', location, query_string)
    if TRACE_MATCHES_DISCARD:
        self.debug_matches(discarded, '#match: FINAL DISCARDED', location, query_string)
    self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True, query=qry)

    return matches