def test_match_license_performance_profiling_on_index_with_single_license(self):
    """
    Profile only the approximate (sequence) matching step of license
    detection, on a pre-built index, and report the timing by raising.

    The index is built before timing starts so indexing cost is excluded.
    The final raise is deliberate: it surfaces the measured duration in
    the test report.
    """
    from time import time

    from licensedcode import query

    # pre-index : we are profiling only the detection, not the indexing
    rule_dir = self.get_test_loc('perf/idx/rules')
    rules = models.load_rules(rule_dir)
    idx = index.LicenseIndex(rules)

    location = self.get_test_loc('perf/idx/query.txt')
    # use a context manager so the file handle is closed deterministically
    # (the original left the handle open for the GC to collect)
    with open(location, 'rb') as queried:
        querys = queried.read()
    qry = query.build_query(query_string=querys, idx=idx)

    def mini_seq_match(idx):
        # materialize the generator so the full matching work is done
        list(idx.get_approximate_matches(qry, [], []))

    start = time()
    for _ in range(100):
        mini_seq_match(idx)
    duration = time() - start

    values = ('ScanCode diff:', duration)
    print(*values)
    # failing on purpose so the timing shows up prominently in the run
    raise Exception(values)
def test_match_freertos(self):
    """
    Matching the FreeRTOS GPL-2.0 rule file against an index built from
    the same rules directory yields exactly one AHO exact match.
    """
    rules_dir = self.get_test_loc('mach_aho/rtos_exact/')
    license_index = index.LicenseIndex(models.load_rules(rules_dir))

    rule_file = self.get_test_loc('mach_aho/rtos_exact/gpl-2.0-freertos.RULE')
    qry = query.build_query(location=rule_file, idx=license_index)

    whole_run = qry.whole_query_run()
    matches = match_aho.exact_match(license_index, whole_run, license_index.rules_automaton)

    assert len(matches) == 1
    only_match = matches[0]
    assert only_match.matcher == match_aho.MATCH_AHO_EXACT
def test_match_freertos(self):
    """
    Match the FreeRTOS GPL-2.0 rule text against an index built from the
    same rules and check a single AHO exact match is returned.
    """
    rule_dir = self.get_test_loc('mach_aho/rtos_exact/')
    idx = index.LicenseIndex(models.load_rules(rule_dir))
    # query with the rule's own text: an exact match is expected
    query_loc = self.get_test_loc(
        'mach_aho/rtos_exact/gpl-2.0-freertos.RULE')
    qry = query.build_query(location=query_loc, idx=idx)
    matches = match_aho.exact_match(idx, qry.whole_query_run(), idx.rules_automaton)
    assert len(matches) == 1
    match = matches[0]
    # the match must come from the AHO exact matcher, not another strategy
    assert match.matcher == match_aho.MATCH_AHO_EXACT
def get_license_matches_from_query_string(query_string, start_line=1):
    """
    Return a sequence of LicenseMatch objects detected in ``query_string``,
    numbering lines from ``start_line``.

    Useful to match a text fragment on its own while keeping line numbers
    consistent with the larger text it came from.
    """
    if not query_string:
        return []

    # imported lazily: loading the cached index is expensive and only
    # needed when there is actually something to match
    from licensedcode import cache

    idx = cache.get_index()
    qry = query.build_query(
        query_string=query_string,
        idx=idx,
        start_line=start_line,
    )
    return idx.match_query(qry=qry)
def match(
    self,
    location=None,
    query_string=None,
    min_score=0,
    as_expression=False,
    expression_symbols=None,
    approximate=True,
    unknown_licenses=False,
    deadline=sys.maxsize,
    _skip_hash_match=False,
    **kwargs,
):
    """
    This is the main entry point to match licenses.

    Return a sequence of LicenseMatch by matching the file at ``location``
    or the ``query_string`` string against this index. Only include matches
    with scores greater or equal to ``min_score``.

    If ``as_expression`` is True, treat the whole text as a single SPDX
    license expression and use only expression matching. Use the
    ``expression_symbols`` mapping of {lowered key: LicenseSymbol} if
    provided. Otherwise use the standard SPDX license symbols mapping.

    If ``approximate`` is True, perform approximate matching as a last
    matching step. Otherwise, only do hash, exact and expression matching.

    If ``unknown_licenses`` is True, perform unknown licenses matching
    after all regular matching steps.

    ``deadline`` is a time.time() value in seconds by which the processing
    should stop and return whatever was matched so far.

    ``_skip_hash_match`` is used only for testing.
    """
    assert 0 <= min_score <= 100

    # nothing to match at all
    if not (location or query_string):
        return []

    # tokenize the input once, then delegate all matching to match_query
    qry = query.build_query(
        location=location,
        query_string=query_string,
        idx=self,
        text_line_threshold=15,
        bin_line_threshold=50,
    )

    if TRACE:
        logger_debug('Index.match: for:', location, 'query:', qry)

    if not qry:
        return []

    return self.match_query(
        qry=qry,
        min_score=min_score,
        as_expression=as_expression,
        expression_symbols=expression_symbols,
        approximate=approximate,
        unknown_licenses=unknown_licenses,
        deadline=deadline,
        _skip_hash_match=_skip_hash_match,
        **kwargs,
    )
def match(self, location=None, query_string=None, min_score=0,
          as_expression=False, deadline=sys.maxsize, _skip_hash_match=False,
          **kwargs):
    """
    This is the main entry point to match licenses.

    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.

    If `as_expression` is True, treat the whole text as a single SPDX
    license expression and use only expression matching.

    `deadline` is a time.time() value in seconds by which the processing
    should stop and return whatever was matched so far.

    `_skip_hash_match` is used only for testing.

    Matching proceeds in stages: whole-text hash match first (cheapest),
    then SPDX expression matching (if requested), negative-rule subtraction,
    and finally a pipeline of spdx-id, exact (aho) and approximate matchers.
    """
    assert 0 <= min_score <= 100
    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, idx=self,
        text_line_threshold=15, bin_line_threshold=50)

    if TRACE:
        logger_debug('match: for:', location, 'query:', qry)

    if not qry:
        return []

    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        return []

    # a hash match covers the whole text exactly: nothing else to do
    if not _skip_hash_match:
        matches = match_hash.hash_match(self, whole_query_run)
        if matches:
            match.set_lines(matches, qry.line_by_pos)
            return matches

    # TODO: add match to degenerated expressions with custom symbols
    if as_expression:
        matches = self.get_spdx_id_matches(qry, from_spdx_id_lines=False)
        match.set_lines(matches, qry.line_by_pos)
        return matches

    # subtract negative-rule matches so their positions are not matchable
    negative_matches = []
    if self.negative_rids:
        negative_matches = self.negative_match(whole_query_run)
        for neg in negative_matches:
            whole_query_run.subtract(neg.qspan)
        if TRACE_NEGATIVE:
            self.debug_matches(
                matches=negative_matches, message='negative_matches',
                location=location, query_string=query_string)  # , with_text, query)

    matches = []

    # choose the approximate matching strategy for the last pipeline stage
    if USE_AHO_FRAGMENTS:
        approx = self.get_fragments_matches
    else:
        approx = self.get_approximate_matches

    matchers = [
        # matcher, include_low in post-matching remaining matchable check
        (self.get_spdx_id_matches, True, 'spdx_lid'),
        (self.get_exact_matches, False, 'aho'),
        (approx, False, 'seq'),
    ]

    already_matched_qspans = []
    for matcher, include_low, matcher_name in matchers:
        if TRACE:
            logger_debug()
            logger_debug('matching with matcher:', matcher_name)

        matched = matcher(qry, matched_qspans=already_matched_qspans,
                          existing_matches=matches, deadline=deadline)
        if TRACE:
            self.debug_matches(
                matches=matched, message='matched with: ' + matcher_name,
                location=location, query_string=query_string)  # , with_text, query)

        matched = match.merge_matches(matched)
        matches.extend(matched)

        # subtract whole text matched if this is long enough
        for m in matched:
            if m.rule.is_license_text and m.rule.length > 120 and m.coverage() > 98:
                qry.subtract(m.qspan)

        # check if we have some matchable left
        # do not match futher if we do not need to
        # collect qspans matched exactly e.g. with coverage 100%
        # this coverage check is because we have provision to match fragments (unused for now)
        already_matched_qspans.extend(m.qspan for m in matched if m.coverage() == 100)

        if not whole_query_run.is_matchable(
                include_low=include_low, qspans=already_matched_qspans):
            break

        # break if deadline has passed
        if time() > deadline:
            break

    if not matches:
        return []

    if TRACE:
        logger_debug()
        self.debug_matches(matches=matches, message='matches before final merge',
                           location=location, query_string=query_string,
                           with_text=True, qry=qry)

    # final merge/refine pass across all matchers' results
    matches, _discarded = match.refine_matches(
        matches, idx=self, query=qry, min_score=min_score,
        max_dist=MAX_DIST // 2, filter_false_positive=True, merge=True)

    matches.sort()
    match.set_lines(matches, qry.line_by_pos)

    if TRACE:
        print()
        self.debug_matches(matches=matches, message='final matches',
                           location=location, query_string=query_string,
                           with_text=True, qry=qry)

    return matches
def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.

    `detect_negative` is for testing purpose only.

    Strategy: try whole-text hash matching first, then subtract negative
    rules and run exact (automaton) matching over the whole text; finally,
    if matchable positions remain, do per-query-run hash then sequence
    matching against top candidate rules, and refine/merge everything.
    """
    assert 0 <= min_score <= 100
    if TRACE:
        print()
        logger_debug('match start....')
    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, self)
    if not qry:
        logger_debug('#match: No query returned for:', location)
        return []

    #######################################################################
    # Whole file matching: hash, negative and exact matching
    #######################################################################
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        logger_debug('#match: whole query not matchable')
        return []

    # hash
    # an exact whole-text hash match is final: return immediately
    hash_matches = match_hash(self, whole_query_run)
    if hash_matches:
        self.debug_matches(hash_matches, '#match FINAL Hash matched', location, query_string)
        set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    # negative rules exact matching
    negative = []
    # note: detect_negative is false only to test negative rules detection proper
    if detect_negative and self.negative_rids:
        if TRACE:
            logger_debug('#match: NEGATIVE')
        negative = self.negative_match(whole_query_run)
        for neg in negative:
            if TRACE_NEGATIVE:
                self.debug_matches(negative, ' ##match: NEGATIVE subtracting #:', location, query_string)
            # remove negative-matched positions from further matching
            whole_query_run.subtract(neg.qspan)
        if TRACE_NEGATIVE:
            logger_debug(' #match: NEGATIVE found', negative)

    # exact matches
    if TRACE_EXACT:
        logger_debug('#match: EXACT')
    exact_matches = exact_match(self, whole_query_run, self.rules_automaton)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: EXACT matches#:', location, query_string)

    exact_matches, exact_discarded = refine_matches(exact_matches, self, query=qry)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: ===> exact matches refined')
    if TRACE_EXACT:
        self.debug_matches(exact_discarded, ' #match: ===> exact matches discarded')

    matches = exact_matches
    discarded = exact_discarded

    #######################################################################
    # Per query run matching.
    #######################################################################
    if TRACE:
        logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

    # check if we have some matchable left
    # collect qspans matched exactly e.g. with coverage 100%
    # this coverage check is because we have provision to match fragments (unused for now)
    matched_qspans = [
        m.qspan for m in exact_matches if m.coverage() == 100
    ]
    # do not match futher if we do not need to
    if whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):

        rules_subset = (self.regular_rids | self.small_rids)

        for qrnum, query_run in enumerate(qry.query_runs, 1):
            if TRACE_QUERY_RUN_SIMPLE:
                logger_debug('#match: ===> processing query run #:', qrnum)
                logger_debug(' #match:', query_run)

            if not query_run.is_matchable(include_low=True):
                if TRACE:
                    logger_debug('#match: query_run NOT MATCHABLE')
                continue

            # hash match
            #########################
            # a run-level hash match settles the whole run: skip seq matching
            hash_matches = match_hash(self, query_run)
            if hash_matches:
                if TRACE:
                    self.debug_matches(
                        hash_matches, ' #match Query run matches (hash)',
                        location, query_string)
                matches.extend(hash_matches)
                continue

            # query run match proper using sequence matching
            #########################################
            if TRACE:
                logger_debug(' #match: Query run MATCHING proper....')

            run_matches = []
            candidates = compute_candidates(query_run, self, rules_subset=rules_subset, top=40)

            if TRACE_QUERY_RUN:
                logger_debug(
                    ' #match: query_run: number of candidates for seq match #',
                    len(candidates))

            for candidate_num, candidate in enumerate(candidates):
                if TRACE_QUERY_RUN:
                    logger_debug(
                        ' #match: query_run: seq matching candidate#:',
                        candidate_num, 'candidate:', candidate)
                start_offset = 0
                # re-run sequence matching past each match end so a rule can
                # be matched multiple times within the same run
                while True:
                    rule_matches = match_sequence(
                        self, candidate, query_run, start_offset=start_offset)
                    if TRACE_QUERY_RUN and rule_matches:
                        self.debug_matches(
                            rule_matches,
                            ' #match: query_run: seq matches for candidate'
                        )
                    if not rule_matches:
                        break
                    else:
                        matches_end = max(m.qend for m in rule_matches)
                        run_matches.extend(rule_matches)
                        if matches_end + 1 < query_run.end:
                            start_offset = matches_end + 1
                            continue
                        else:
                            break

            ############################################################################
            if TRACE_QUERY_RUN:
                self.debug_matches(run_matches, ' #match: ===> Query run matches',
                                   location, query_string, with_text=True)

            run_matches = merge_matches(run_matches, max_dist=MAX_DIST)
            matches.extend(run_matches)
            if TRACE:
                self.debug_matches(
                    run_matches, ' #match: Query run matches merged',
                    location, query_string)

    # final matching merge, refinement and filtering
    ################################################
    if matches:
        logger_debug()
        logger_debug(
            '!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

        # note: max_dist is halved for the final whole-text refinement pass
        matches, whole_discarded = refine_matches(matches, idx=self, query=qry, min_score=min_score, max_dist=MAX_DIST // 2)
        if TRACE_MATCHES_DISCARD:
            discarded.extend(whole_discarded)
        matches.sort()
        set_lines(matches, qry.line_by_pos)
        self.debug_matches(matches, '#match: FINAL MERGED', location, query_string)
        if TRACE_MATCHES_DISCARD:
            self.debug_matches(discarded, '#match: FINAL DISCARDED', location, query_string)

    self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True)
    return matches
def match(self, location=None, query_string=None, min_score=0, as_expression=False, **kwargs):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.

    If `as_expression` is True, treat the whole text as a single SPDX
    license expression and use only expression matching.

    Stages: whole-text hash match, optional SPDX expression matching,
    negative-rule subtraction, then spdx-id / exact / approximate matchers
    in order, stopping early once nothing matchable remains.
    """
    assert 0 <= min_score <= 100
    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, idx=self,
        text_line_threshold=15, bin_line_threshold=50)

    if not qry:
        return []

    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        return []

    # an exact whole-text hash match is final: return immediately
    hash_matches = match_hash.hash_match(self, whole_query_run)
    if hash_matches:
        match.set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    if as_expression:
        matches = self.get_spdx_id_matches(qry, from_spdx_id_lines=False)
        match.set_lines(matches, qry.line_by_pos)
        return matches

    # subtract negative-rule matches so their positions are not matchable
    negative_matches = []
    if self.negative_rids:
        negative_matches = self.negative_match(whole_query_run)
        for neg in negative_matches:
            whole_query_run.subtract(neg.qspan)
        if TRACE_NEGATIVE:
            self.debug_matches(negative_matches, 'negative_matches', location, query_string)  #, with_text, query)

    matches = []

    # matchers are tried in order from cheapest/strongest to most approximate
    matchers = [self.get_spdx_id_matches, self.get_exact_matches, self.get_approximate_matches]
    for matcher in matchers:
        matched = matcher(qry)
        if TRACE:
            logger_debug('matching with matcher:', matcher)
            self.debug_matches(matched, 'matched', location, query_string)  #, with_text, query)

        matches.extend(matched)
        # check if we have some matchable left
        # do not match futher if we do not need to
        # collect qspans matched exactly e.g. with coverage 100%
        # this coverage check is because we have provision to match fragments (unused for now)
        matched_qspans = [m.qspan for m in matches if m.coverage() == 100]
        if not whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):
            break

    if not matches:
        return []

    # final merge/refine pass across all matchers' results
    matches, _discarded = match.refine_matches(matches, idx=self, query=qry,
        min_score=min_score, max_dist=MAX_DIST // 2, filter_false_positive=True)
    matches.sort()
    match.set_lines(matches, qry.line_by_pos)
    return matches
def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.

    `detect_negative` is for testing purpose only.

    Strategy: whole-text hash match first, then negative-rule subtraction
    and exact (automaton) matching; if matchable positions remain, run
    per-query-run hash then candidate-based sequence matching, and finally
    refine/merge all matches.
    """
    assert 0 <= min_score <= 100
    if TRACE:
        print()
        logger_debug('match start....')
    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, self)
    if not qry:
        logger_debug('#match: No query returned for:', location)
        return []

    #######################################################################
    # Whole file matching: hash and exact matching
    #######################################################################
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        logger_debug('#match: whole query not matchable')
        return []

    # hash
    # an exact whole-text hash match is final: return immediately
    hash_matches = match_hash.hash_match(self, whole_query_run)
    if hash_matches:
        if TRACE:
            self.debug_matches(hash_matches, '#match FINAL Hash matched', location, query_string)
        match.set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    # negative rules exact matching
    negative_matches = []
    # note: detect_negative is false only to test negative rules detection proper
    if detect_negative and self.negative_rids:
        if TRACE:
            logger_debug('#match: NEGATIVE')
        negative_matches = self.negative_match(whole_query_run)
        for neg in negative_matches:
            # remove negative-matched positions from further matching
            whole_query_run.subtract(neg.qspan)

    # exact matches
    if TRACE_EXACT:
        logger_debug('#match: EXACT')
    exact_matches = match_aho.exact_match(self, whole_query_run, self.rules_automaton)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: EXACT matches#:', location, query_string)

    exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry, filter_false_positive=False, merge=False)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: ===> exact matches refined')
    if TRACE_EXACT:
        self.debug_matches(exact_discarded, ' #match: ===> exact matches discarded')

    matches = exact_matches
    discarded = exact_discarded

    #######################################################################
    # Per query run matching.
    #######################################################################
    if TRACE:
        logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

    # check if we have some matchable left
    # collect qspans matched exactly e.g. with coverage 100%
    # this coverage check is because we have provision to match fragments (unused for now)
    matched_qspans = [m.qspan for m in exact_matches if m.coverage() == 100]
    # do not match futher if we do not need to
    if whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):

        # FIXME: we should exclude small and "weak" rules from the subset entirely
        # they are unlikely to be matchable with a seq match
        rules_subset = (self.regular_rids | self.small_rids)

        for qrnum, query_run in enumerate(qry.query_runs, 1):
            if TRACE_QUERY_RUN_SIMPLE:
                logger_debug('#match: ===> processing query run #:', qrnum)
                logger_debug(' #match:query_run:', query_run)

            if not query_run.is_matchable(include_low=True):
                if TRACE:
                    logger_debug('#match: query_run NOT MATCHABLE')
                continue

            # hash match
            #########################
            # a run-level hash match settles the whole run: skip seq matching
            hash_matches = match_hash.hash_match(self, query_run)
            if hash_matches:
                if TRACE:
                    self.debug_matches(hash_matches, ' #match Query run matches (hash)', location, query_string)
                matches.extend(hash_matches)
                continue

            # FIXME: why do not we aho match again here? This would avoid
            # going into the costly set and seq re-match that may not be needed at all
            # alternatively we should consider aho matches to excludes them from candidates

            # query run match proper using sequence matching
            #########################################
            if TRACE:
                logger_debug(' #match: Query run MATCHING proper....')

            run_matches = []
            candidates = match_set.compute_candidates(query_run, self, rules_subset=rules_subset, top=40)

            if TRACE_CANDIDATES:
                logger_debug(' #match: query_run: number of candidates for seq match #', len(candidates))

            for candidate_num, candidate in enumerate(candidates):
                if TRACE_QUERY_RUN:
                    _, canrule, _ = candidate
                    logger_debug(' #match: query_run: seq matching candidate#:', candidate_num, 'candidate:', canrule)
                start_offset = 0
                # re-run sequence matching past each match end so a rule can
                # be matched multiple times within the same run
                while True:
                    rule_matches = match_seq.match_sequence(self, candidate, query_run, start_offset=start_offset)
                    if TRACE_QUERY_RUN and rule_matches:
                        self.debug_matches(rule_matches, ' #match: query_run: seq matches for candidate', with_text=True, query=qry)
                    if not rule_matches:
                        break
                    else:
                        matches_end = max(m.qend for m in rule_matches)
                        run_matches.extend(rule_matches)
                        if matches_end + 1 < query_run.end:
                            start_offset = matches_end + 1
                            continue
                        else:
                            break

            ############################################################################
            if TRACE_QUERY_RUN:
                self.debug_matches(run_matches, ' #match: ===> Query run matches', location, query_string, with_text=True, query=qry)

            run_matches = match.merge_matches(run_matches, max_dist=MAX_DIST)
            matches.extend(run_matches)
            if TRACE:
                self.debug_matches(run_matches, ' #match: Query run matches merged', location, query_string)

    # final matching merge, refinement and filtering
    ################################################
    if matches:
        logger_debug()
        logger_debug('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

        # note: max_dist is halved for the final whole-text refinement pass
        matches, whole_discarded = match.refine_matches(matches, idx=self,
            query=qry, min_score=min_score, max_dist=MAX_DIST // 2, filter_false_positive=True)
        if TRACE_MATCHES_DISCARD:
            discarded.extend(whole_discarded)
        matches.sort()
        match.set_lines(matches, qry.line_by_pos)
        self.debug_matches(matches, '#match: FINAL MERGED', location, query_string)
        if TRACE_MATCHES_DISCARD:
            self.debug_matches(discarded, '#match: FINAL DISCARDED', location, query_string)

    self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True, query=qry)
    return matches