def get_approximate_matches(self, query):
    """
    Approximate matching strategy using query_runs and multiple local
    alignments (aka. diff).
    Return a list of matches found in `query`, possibly empty.

    NOTE(review): assumes `self` is a license index exposing `regular_rids`,
    `small_rids` and `weak_rids` rule-id sets -- confirm against the
    enclosing class.
    """
    matches = []
    # we exclude small and "weak" rules from the subset entirely: they are
    # unlikely to be matchable with a seq match
    rules_subset = (self.regular_rids | self.small_rids).difference(self.weak_rids)

    for query_run in query.query_runs:
        # skip runs with nothing left to match
        if not query_run.is_matchable(include_low=True):
            continue

        # per query run hash matching just in case we are lucky: a hash hit
        # is exact, so we do not need any further matching for this run
        hash_matches = match_hash.hash_match(self, query_run)
        if hash_matches:
            matches.extend(hash_matches)
            continue

        # inverted index match and ranking, query run-level
        # FIXME: we should consider aho matches to exclude them from candidates
        # FIXME: also exclude from candidates any rule that is only aho-matchable
        run_matches = []
        MAX_CANDIDATES = 50
        candidates = match_set.compute_candidates(
            query_run, self, rules_subset=rules_subset, top=MAX_CANDIDATES)

        # multiple sequence matching/alignment, query run-level: each
        # candidate rule may match several times in the same run, so we
        # restart the alignment after the end of each match group
        for candidate in candidates:
            start_offset = 0
            while True:
                rule_matches = match_seq.match_sequence(
                    self, candidate, query_run, start_offset=start_offset)
                if not rule_matches:
                    break
                else:
                    # resume matching right after the last matched position
                    matches_end = max(m.qend for m in rule_matches)
                    run_matches.extend(rule_matches)
                    if matches_end + 1 < query_run.end:
                        start_offset = matches_end + 1
                        continue
                    else:
                        break

        # merge contiguous/overlapping matches for this run before collecting
        matches.extend(match.merge_matches(run_matches, max_dist=MAX_DIST))

    return matches
def get_approximate_matches(self, query, matched_qspans=None, **kwargs):
    """
    Approximate matching strategy breaking a query in query_runs and using
    exact matching then multiple local alignments (aka. diff).
    Return a list of matches found in `query`, possibly empty.

    `matched_qspans` is an optional list of qspans already matched by earlier
    strategies; runs fully covered by these are skipped.
    `kwargs` is accepted for call-signature compatibility and ignored here.
    """
    matches = []
    # precomputed subset of rule ids worth attempting with approximate matching
    rules_subset = self.approx_matchable_rules_subset

    for query_run in query.query_runs:
        # skip runs with nothing left to match (low tokens excluded)
        if not query_run.is_matchable(include_low=False, qspans=matched_qspans):
            continue

        # inverted index match and ranking, query run-level
        # FIXME: we should consider aho matches to exclude them from candidates
        # FIXME: also exclude from candidates any rule that is only aho-matchable
        run_matches = []
        MAX_CANDIDATES = 50
        candidates = match_set.compute_candidates(
            query_run, self, rules_subset=rules_subset, top=MAX_CANDIDATES)

        # multiple sequence matching/alignment, query run-level: each
        # candidate rule may match several times in the same run, so restart
        # the alignment after the end of each match group
        for candidate in candidates:
            start_offset = 0
            while True:
                rule_matches = match_seq.match_sequence(
                    self, candidate, query_run, start_offset=start_offset)
                if not rule_matches:
                    break
                else:
                    # resume matching right after the last matched position
                    matches_end = max(m.qend for m in rule_matches)
                    run_matches.extend(rule_matches)
                    if matches_end + 1 < query_run.end:
                        start_offset = matches_end + 1
                        continue
                    else:
                        break

        # merge contiguous/overlapping matches for this run before collecting
        matches.extend(match.merge_matches(run_matches, max_dist=MAX_DIST))

    return matches
def get_approximate_matches(self, query, matched_qspans, existing_matches,
                            deadline=sys.maxsize, **kwargs):
    """
    Approximate matching strategy breaking a query in query_runs and using
    multiple local alignments (aka. diff).
    Return a list of matches found in `query`, possibly empty.

    `matched_qspans` is a list of qspans already matched by earlier strategies
    (copied locally before being extended).
    `existing_matches` is accepted but not used in this body -- presumably for
    call-signature compatibility with sibling strategies; TODO confirm.
    `deadline` is an absolute time() value; matching stops once it has passed.
    `kwargs` is accepted for call-signature compatibility and ignored here.
    """
    matches = []
    matchable_rids = self.approx_matchable_rids

    # work on a copy so we do not mutate the caller's list
    already_matched_qspans = matched_qspans[:]

    MAX_NEAR_DUPE_CANDIDATES = 10

    # first check if the whole file may be a close, near-dupe match
    whole_query_run = query.whole_query_run()
    near_dupe_candidates = match_set.compute_candidates(
        query_run=whole_query_run,
        idx=self,
        matchable_rids=matchable_rids,
        top=MAX_NEAR_DUPE_CANDIDATES,
        high_resemblance=True,
        _use_bigrams=USE_BIGRAM_MULTISETS,
    )

    # if near duplicates, we only match the whole file at once against these
    # candidates
    if near_dupe_candidates:
        if TRACE_APPROX_CANDIDATES:
            logger_debug('get_query_run_approximate_matches: near dupe candidates:')
            for rank, ((sv1, sv2), _rid, can, _inter) in enumerate(near_dupe_candidates, 1):
                logger_debug(rank, sv1, sv2, can.identifier)

        matched = self.get_query_run_approximate_matches(
            whole_query_run, near_dupe_candidates, already_matched_qspans, deadline)

        matches.extend(matched)

        # subtract these matched positions so they are not re-matched below
        # NOTE(review): the loop variable `match` shadows any module-level
        # `match` name in this file -- harmless here but worth confirming
        for match in matched:
            qspan = match.qspan
            query.subtract(qspan)
            already_matched_qspans.append(qspan)

        # break if deadline has passed
        if time() > deadline:
            return matches

    # otherwise, and in all cases we break things in smaller query runs and
    # match each separately
    if USE_RULE_STARTS:
        query.refine_runs()

    if TRACE_APPROX:
        logger_debug('get_approximate_matches: len(query.query_runs):',
                     len(query.query_runs))

    MAX_CANDIDATES = 70
    for query_run in query.query_runs:
        # inverted index match and ranking, query run-level
        candidates = match_set.compute_candidates(
            query_run=query_run,
            idx=self,
            matchable_rids=matchable_rids,
            top=MAX_CANDIDATES,
            high_resemblance=False,
            _use_bigrams=USE_BIGRAM_MULTISETS,
        )

        if TRACE_APPROX_CANDIDATES:
            logger_debug('get_query_run_approximate_matches: candidates:')
            for rank, ((sv1, sv2), _rid, can, _inter) in enumerate(candidates, 1):
                logger_debug(rank, sv1, sv2, can.identifier)

        matched = self.get_query_run_approximate_matches(
            query_run, candidates, matched_qspans, deadline)

        matches.extend(matched)

        # break if deadline has passed
        if time() > deadline:
            break

    return matches
def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.
    `detect_negative` is for testing purpose only: when False, negative rules
    are not subtracted before exact matching.
    """
    assert 0 <= min_score <= 100

    if TRACE:
        print()
        logger_debug('match start....')

    # nothing to match
    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, self)
    if not qry:
        logger_debug('#match: No query returned for:', location)
        return []

    #######################################################################
    # Whole file matching: hash, negative and exact matching
    #######################################################################
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        logger_debug('#match: whole query not matchable')
        return []

    # hash: an exact whole-file hit means we are done -- return immediately
    hash_matches = match_hash(self, whole_query_run)
    if hash_matches:
        self.debug_matches(hash_matches, '#match FINAL Hash matched', location, query_string)
        set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    # negative rules exact matching: subtract matched spans so they cannot
    # produce false-positive matches later
    negative = []
    # note: detect_negative is false only to test negative rules detection proper
    if detect_negative and self.negative_rids:
        if TRACE:
            logger_debug('#match: NEGATIVE')
        negative = self.negative_match(whole_query_run)
        for neg in negative:
            if TRACE_NEGATIVE:
                self.debug_matches(negative, ' ##match: NEGATIVE subtracting #:', location, query_string)
            whole_query_run.subtract(neg.qspan)
        if TRACE_NEGATIVE:
            logger_debug(' #match: NEGATIVE found', negative)

    # exact matches over the whole query using the rules automaton
    if TRACE_EXACT:
        logger_debug('#match: EXACT')
    exact_matches = exact_match(self, whole_query_run, self.rules_automaton)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: EXACT matches#:', location, query_string)

    exact_matches, exact_discarded = refine_matches(exact_matches, self, query=qry)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: ===> exact matches refined')
    if TRACE_EXACT:
        self.debug_matches(exact_discarded, ' #match: ===> exact matches discarded')

    matches = exact_matches
    discarded = exact_discarded

    #######################################################################
    # Per query run matching.
    #######################################################################
    if TRACE:
        logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

    # check if we have some matchable left
    # collect qspans matched exactly e.g. with coverage 100%
    # this coverage check is because we have provision to match fragments (unused for now)
    matched_qspans = [m.qspan for m in exact_matches if m.coverage() == 100]

    # do not match further if we do not need to
    if whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):
        rules_subset = (self.regular_rids | self.small_rids)

        for qrnum, query_run in enumerate(qry.query_runs, 1):
            if TRACE_QUERY_RUN_SIMPLE:
                logger_debug('#match: ===> processing query run #:', qrnum)
                logger_debug(' #match:', query_run)

            if not query_run.is_matchable(include_low=True):
                if TRACE:
                    logger_debug('#match: query_run NOT MATCHABLE')
                continue

            # hash match
            #########################
            hash_matches = match_hash(self, query_run)
            if hash_matches:
                if TRACE:
                    self.debug_matches(hash_matches, ' #match Query run matches (hash)', location, query_string)
                matches.extend(hash_matches)
                continue

            # query run match proper using sequence matching
            #########################################
            if TRACE:
                logger_debug(' #match: Query run MATCHING proper....')

            run_matches = []
            candidates = compute_candidates(query_run, self, rules_subset=rules_subset, top=40)

            if TRACE_QUERY_RUN:
                logger_debug(' #match: query_run: number of candidates for seq match #', len(candidates))

            for candidate_num, candidate in enumerate(candidates):
                if TRACE_QUERY_RUN:
                    logger_debug(' #match: query_run: seq matching candidate#:', candidate_num, 'candidate:', candidate)
                # each candidate may match several times in the run; restart
                # the alignment after the end of each match group
                start_offset = 0
                while True:
                    rule_matches = match_sequence(self, candidate, query_run, start_offset=start_offset)
                    if TRACE_QUERY_RUN and rule_matches:
                        self.debug_matches(rule_matches, ' #match: query_run: seq matches for candidate')
                    if not rule_matches:
                        break
                    else:
                        matches_end = max(m.qend for m in rule_matches)
                        run_matches.extend(rule_matches)
                        if matches_end + 1 < query_run.end:
                            start_offset = matches_end + 1
                            continue
                        else:
                            break

            ############################################################################
            if TRACE_QUERY_RUN:
                self.debug_matches(run_matches, ' #match: ===> Query run matches', location, query_string, with_text=True)

            # merge contiguous/overlapping matches for this run before collecting
            run_matches = merge_matches(run_matches, max_dist=MAX_DIST)
            matches.extend(run_matches)
            if TRACE:
                self.debug_matches(run_matches, ' #match: Query run matches merged', location, query_string)

    # final matching merge, refinement and filtering
    ################################################
    if matches:
        logger_debug()
        logger_debug('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

        matches, whole_discarded = refine_matches(matches, idx=self, query=qry,
                                                  min_score=min_score, max_dist=MAX_DIST // 2)
        if TRACE_MATCHES_DISCARD:
            discarded.extend(whole_discarded)
        matches.sort()
        set_lines(matches, qry.line_by_pos)
        self.debug_matches(matches, '#match: FINAL MERGED', location, query_string)

    if TRACE_MATCHES_DISCARD:
        self.debug_matches(discarded, '#match: FINAL DISCARDED', location, query_string)
    self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True)

    return matches
def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.
    `detect_negative` is for testing purpose only: when False, negative rules
    are not subtracted before exact matching.
    """
    assert 0 <= min_score <= 100

    if TRACE:
        print()
        logger_debug('match start....')

    # nothing to match
    if not location and not query_string:
        return []

    qry = query.build_query(location, query_string, self)
    if not qry:
        logger_debug('#match: No query returned for:', location)
        return []

    #######################################################################
    # Whole file matching: hash and exact matching
    #######################################################################
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        logger_debug('#match: whole query not matchable')
        return []

    # hash: an exact whole-file hit means we are done -- return immediately
    hash_matches = match_hash.hash_match(self, whole_query_run)
    if hash_matches:
        if TRACE:
            self.debug_matches(hash_matches, '#match FINAL Hash matched', location, query_string)
        match.set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    # negative rules exact matching: subtract matched spans so they cannot
    # produce false-positive matches later
    negative_matches = []
    # note: detect_negative is false only to test negative rules detection proper
    if detect_negative and self.negative_rids:
        if TRACE:
            logger_debug('#match: NEGATIVE')
        negative_matches = self.negative_match(whole_query_run)
        for neg in negative_matches:
            whole_query_run.subtract(neg.qspan)

    # exact matches over the whole query using the rules automaton
    if TRACE_EXACT:
        logger_debug('#match: EXACT')
    exact_matches = match_aho.exact_match(self, whole_query_run, self.rules_automaton)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: EXACT matches#:', location, query_string)

    exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry,
                                                          filter_false_positive=False, merge=False)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: ===> exact matches refined')
    if TRACE_EXACT:
        self.debug_matches(exact_discarded, ' #match: ===> exact matches discarded')

    matches = exact_matches
    discarded = exact_discarded

    #######################################################################
    # Per query run matching.
    #######################################################################
    if TRACE:
        logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

    # check if we have some matchable left
    # collect qspans matched exactly e.g. with coverage 100%
    # this coverage check is because we have provision to match fragments (unused for now)
    matched_qspans = [m.qspan for m in exact_matches if m.coverage() == 100]

    # do not match further if we do not need to
    if whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):
        # FIXME: we should exclude small and "weak" rules from the subset entirely
        # they are unlikely to be matchable with a seq match
        rules_subset = (self.regular_rids | self.small_rids)

        for qrnum, query_run in enumerate(qry.query_runs, 1):
            if TRACE_QUERY_RUN_SIMPLE:
                logger_debug('#match: ===> processing query run #:', qrnum)
                logger_debug(' #match:query_run:', query_run)

            if not query_run.is_matchable(include_low=True):
                if TRACE:
                    logger_debug('#match: query_run NOT MATCHABLE')
                continue

            # hash match
            #########################
            hash_matches = match_hash.hash_match(self, query_run)
            if hash_matches:
                if TRACE:
                    self.debug_matches(hash_matches, ' #match Query run matches (hash)', location, query_string)
                matches.extend(hash_matches)
                continue

            # FIXME: why do not we aho match again here? This would avoid
            # going into the costly set and seq re-match that may not be needed at all
            # alternatively we should consider aho matches to exclude them from candidates

            # query run match proper using sequence matching
            #########################################
            if TRACE:
                logger_debug(' #match: Query run MATCHING proper....')

            run_matches = []
            candidates = match_set.compute_candidates(query_run, self, rules_subset=rules_subset, top=40)

            if TRACE_CANDIDATES:
                logger_debug(' #match: query_run: number of candidates for seq match #', len(candidates))

            for candidate_num, candidate in enumerate(candidates):
                if TRACE_QUERY_RUN:
                    # candidate appears to be a (score, rule, intersection)-style
                    # tuple here; only the rule is logged -- TODO confirm shape
                    _, canrule, _ = candidate
                    logger_debug(' #match: query_run: seq matching candidate#:', candidate_num, 'candidate:', canrule)
                # each candidate may match several times in the run; restart
                # the alignment after the end of each match group
                start_offset = 0
                while True:
                    rule_matches = match_seq.match_sequence(self, candidate, query_run, start_offset=start_offset)
                    if TRACE_QUERY_RUN and rule_matches:
                        self.debug_matches(rule_matches, ' #match: query_run: seq matches for candidate', with_text=True, query=qry)
                    if not rule_matches:
                        break
                    else:
                        matches_end = max(m.qend for m in rule_matches)
                        run_matches.extend(rule_matches)
                        if matches_end + 1 < query_run.end:
                            start_offset = matches_end + 1
                            continue
                        else:
                            break

            ############################################################################
            if TRACE_QUERY_RUN:
                self.debug_matches(run_matches, ' #match: ===> Query run matches', location, query_string, with_text=True, query=qry)

            # merge contiguous/overlapping matches for this run before collecting
            run_matches = match.merge_matches(run_matches, max_dist=MAX_DIST)
            matches.extend(run_matches)
            if TRACE:
                self.debug_matches(run_matches, ' #match: Query run matches merged', location, query_string)

    # final matching merge, refinement and filtering
    ################################################
    if matches:
        logger_debug()
        logger_debug('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

        matches, whole_discarded = match.refine_matches(matches, idx=self,
                                                        query=qry, min_score=min_score,
                                                        max_dist=MAX_DIST // 2, filter_false_positive=True)
        if TRACE_MATCHES_DISCARD:
            discarded.extend(whole_discarded)
        matches.sort()
        match.set_lines(matches, qry.line_by_pos)
        self.debug_matches(matches, '#match: FINAL MERGED', location, query_string)

    if TRACE_MATCHES_DISCARD:
        self.debug_matches(discarded, '#match: FINAL DISCARDED', location, query_string)
    self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True, query=qry)

    return matches