def get_approximate_matches(self, query):
    """
    Return a list of LicenseMatch for ``query`` found with an approximate
    strategy over query_runs: per-run hash matching first, then candidate
    selection and multiple local sequence alignments (aka. diff).
    """
    all_matches = []
    # small and "weak" rules are excluded from the subset entirely: they
    # are unlikely to be matchable with a sequence match
    seq_rids = (self.regular_rids | self.small_rids).difference(self.weak_rids)

    for query_run in query.query_runs:
        if not query_run.is_matchable(include_low=True):
            continue

        # per query run hash matching just in case we are lucky
        run_hash_matches = match_hash.hash_match(self, query_run)
        if run_hash_matches:
            all_matches.extend(run_hash_matches)
            continue

        # inverted index match and ranking, query run-level
        # FIXME: we should consider aho matches to excludes them from candidates
        # FIXME: also exclude from candidates any rule that is only aho-matchable
        MAX_CANDIDATES = 50
        candidates = match_set.compute_candidates(
            query_run, self, rules_subset=seq_rids, top=MAX_CANDIDATES)

        # multiple sequence matching/alignment, query run-level
        seq_matches = []
        for candidate in candidates:
            start_offset = 0
            while True:
                rule_matches = match_seq.match_sequence(
                    self, candidate, query_run, start_offset=start_offset)
                if not rule_matches:
                    break
                seq_matches.extend(rule_matches)
                matches_end = max(m.qend for m in rule_matches)
                if matches_end + 1 >= query_run.end:
                    break
                # rematch the same candidate past the matched region
                start_offset = matches_end + 1

        all_matches.extend(match.merge_matches(seq_matches, max_dist=MAX_DIST))

    return all_matches
def get_approximate_matches(self, query, matched_qspans=None, **kwargs):
    """
    Return a list of approximate LicenseMatch for ``query``, breaking the
    query in query_runs and using exact matching then multiple local
    sequence alignments (aka. diff).

    ``matched_qspans`` is an optional sequence of qspans already matched;
    positions they cover are not considered matchable again.
    """
    approx_matches = []
    matchable_rids = self.approx_matchable_rules_subset

    for query_run in query.query_runs:
        # skip runs with nothing significant left to match
        if not query_run.is_matchable(include_low=False, qspans=matched_qspans):
            continue

        # inverted index match and ranking, query run-level
        # FIXME: we should consider aho matches to excludes them from candidates
        # FIXME: also exclude from candidates any rule that is only aho-matchable
        MAX_CANDIDATES = 50
        candidates = match_set.compute_candidates(
            query_run, self, rules_subset=matchable_rids, top=MAX_CANDIDATES)

        # multiple sequence matching/alignment, query run-level
        run_matches = []
        for candidate in candidates:
            offset = 0
            while True:
                rule_matches = match_seq.match_sequence(
                    self, candidate, query_run, start_offset=offset)
                if not rule_matches:
                    break
                run_matches.extend(rule_matches)
                last_qend = max(m.qend for m in rule_matches)
                if last_qend + 1 >= query_run.end:
                    break
                # rematch the same candidate past the matched region
                offset = last_qend + 1

        approx_matches.extend(match.merge_matches(run_matches, max_dist=MAX_DIST))

    return approx_matches
def get_query_run_approximate_matches(
    self,
    query_run,
    candidates,
    matched_qspans,
    deadline=sys.maxsize,
    **kwargs,
):
    """
    Return a list of approximate matches for a single ``query_run``.

    ``candidates`` is a sequence of (score_vectors, rid, candidate_rule,
    high_intersection) tuples as produced by candidate selection.
    ``matched_qspans`` are qspans already matched; positions they cover are
    not matchable again. Stop matching candidates once ``deadline`` (an
    epoch timestamp) has passed.
    """
    matches = []
    # we cannot do a sequence match in query run without some high token left
    if not query_run.is_matchable(include_low=False, qspans=matched_qspans):
        if TRACE_APPROX:
            logger_debug(
                'get_query_run_approximate_matches: query_run not matchable:',
                query_run)
        return matches

    # Perform multiple sequence matching/alignment for each candidate,
    # query run-level for as long as we have more non-overlapping
    # matches returned
    for _score_vecs, rid, candidate_rule, high_intersection in candidates:
        if USE_DMP:
            # Myers diff works best when the differences are small, otherwise
            # it performs rather poorly as it is not aware of legalese
            match_blocks = match_blocks_dmp
            high_postings = None
        else:
            # we prefer to use the high-token-aware seq matching only
            # when the matches are not clear. it works best when things
            # are farther apart
            match_blocks = match_blocks_seq
            high_postings = self.high_postings_by_rid[rid]
            # keep only postings for high tokens shared with this candidate
            high_postings = {
                tid: postings for tid, postings in high_postings.items()
                if tid in high_intersection}

        start_offset = 0
        while True:
            rule_matches = match_seq.match_sequence(
                self, candidate_rule, query_run,
                high_postings=high_postings,
                start_offset=start_offset,
                match_blocks=match_blocks,
            )
            if TRACE_APPROX_MATCHES:
                self.debug_matches(
                    matches=rule_matches,
                    message='get_query_run_approximate_matches: rule_matches:',
                    with_text=True,
                    qry=query_run.query,
                )
            if not rule_matches:
                break
            matches_end = max(m.qend for m in rule_matches)
            matches.extend(rule_matches)
            if matches_end + 1 < query_run.end:
                # rematch the same rule in the remainder of the run
                start_offset = matches_end + 1
                continue
            else:
                break
            # NOTE(review): a second deadline check used to sit here but was
            # unreachable (both branches above continue or break); it has
            # been removed. The check below the loop is the effective one.

        # break if deadline has passed
        if time() > deadline:
            break

    # FIXME: is this really needed here?
    matches = match.merge_matches(matches)
    return matches
def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.
    `detect_negative` is for testing purpose only.
    """
    assert 0 <= min_score <= 100
    if TRACE:
        print()
        logger_debug('match start....')
    if not location and not query_string:
        return []
    qry = query.build_query(location, query_string, self)
    if not qry:
        logger_debug('#match: No query returned for:', location)
        return []

    #######################################################################
    # Whole file matching: hash, negative and exact matching
    #######################################################################
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        logger_debug('#match: whole query not matchable')
        return []

    # hash: a whole-file hash match is final and returns early
    hash_matches = match_hash(self, whole_query_run)
    if hash_matches:
        self.debug_matches(hash_matches, '#match FINAL Hash matched', location, query_string)
        set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    # negative rules exact matching: matched positions are subtracted from
    # the whole query run so they cannot match again below
    negative = []
    # note: detect_negative is false only to test negative rules detection proper
    if detect_negative and self.negative_rids:
        if TRACE:
            logger_debug('#match: NEGATIVE')
        negative = self.negative_match(whole_query_run)
        for neg in negative:
            if TRACE_NEGATIVE:
                # NOTE(review): passes the whole `negative` list here, not
                # `neg` — presumably intentional for tracing; confirm
                self.debug_matches(negative, ' ##match: NEGATIVE subtracting #:', location, query_string)
            whole_query_run.subtract(neg.qspan)
        if TRACE_NEGATIVE:
            logger_debug(' #match: NEGATIVE found', negative)

    # exact matches against the rules automaton
    if TRACE_EXACT:
        logger_debug('#match: EXACT')
    exact_matches = exact_match(self, whole_query_run, self.rules_automaton)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: EXACT matches#:', location, query_string)
    exact_matches, exact_discarded = refine_matches(exact_matches, self, query=qry)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: ===> exact matches refined')
    if TRACE_EXACT:
        self.debug_matches(exact_discarded, ' #match: ===> exact matches discarded')

    matches = exact_matches
    discarded = exact_discarded

    #######################################################################
    # Per query run matching.
    #######################################################################
    if TRACE:
        logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

    # check if we have some matchable left
    # collect qspans matched exactly e.g. with coverage 100%
    # this coverage check is because we have provision to match fragments (unused for now)
    matched_qspans = [
        m.qspan for m in exact_matches if m.coverage() == 100
    ]
    # do not match further if we do not need to
    if whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):
        rules_subset = (self.regular_rids | self.small_rids)

        for qrnum, query_run in enumerate(qry.query_runs, 1):
            if TRACE_QUERY_RUN_SIMPLE:
                logger_debug('#match: ===> processing query run #:', qrnum)
                logger_debug(' #match:', query_run)
            if not query_run.is_matchable(include_low=True):
                if TRACE:
                    logger_debug('#match: query_run NOT MATCHABLE')
                continue

            # hash match: exact per-run match skips the costly seq matching
            #########################
            hash_matches = match_hash(self, query_run)
            if hash_matches:
                if TRACE:
                    self.debug_matches(
                        hash_matches, ' #match Query run matches (hash)',
                        location, query_string)
                matches.extend(hash_matches)
                continue

            # query run match proper using sequence matching
            #########################################
            if TRACE:
                logger_debug(' #match: Query run MATCHING proper....')
            run_matches = []
            candidates = compute_candidates(query_run, self, rules_subset=rules_subset, top=40)
            if TRACE_QUERY_RUN:
                logger_debug(
                    ' #match: query_run: number of candidates for seq match #',
                    len(candidates))
            for candidate_num, candidate in enumerate(candidates):
                if TRACE_QUERY_RUN:
                    logger_debug(
                        ' #match: query_run: seq matching candidate#:',
                        candidate_num, 'candidate:', candidate)
                start_offset = 0
                # rematch the same candidate repeatedly, each time starting
                # past the end of the previous matches, until nothing new
                # is found
                while True:
                    rule_matches = match_sequence(
                        self, candidate, query_run, start_offset=start_offset)
                    if TRACE_QUERY_RUN and rule_matches:
                        self.debug_matches(
                            rule_matches,
                            ' #match: query_run: seq matches for candidate'
                        )
                    if not rule_matches:
                        break
                    else:
                        matches_end = max(m.qend for m in rule_matches)
                        run_matches.extend(rule_matches)
                        if matches_end + 1 < query_run.end:
                            start_offset = matches_end + 1
                            continue
                        else:
                            break
            ############################################################################
            if TRACE_QUERY_RUN:
                self.debug_matches(run_matches, ' #match: ===> Query run matches',
                                   location, query_string, with_text=True)
            run_matches = merge_matches(run_matches, max_dist=MAX_DIST)
            matches.extend(run_matches)
            if TRACE:
                self.debug_matches(
                    run_matches, ' #match: Query run matches merged',
                    location, query_string)

    # final matching merge, refinement and filtering
    ################################################
    if matches:
        logger_debug()
        logger_debug(
            '!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        self.debug_matches(matches, '#match: ALL matches from all query runs',
                           location, query_string)

        matches, whole_discarded = refine_matches(matches, idx=self, query=qry,
                                                  min_score=min_score, max_dist=MAX_DIST // 2)
        if TRACE_MATCHES_DISCARD:
            discarded.extend(whole_discarded)
        matches.sort()
        set_lines(matches, qry.line_by_pos)
        self.debug_matches(matches, '#match: FINAL MERGED', location, query_string)

    if TRACE_MATCHES_DISCARD:
        self.debug_matches(discarded, '#match: FINAL DISCARDED', location, query_string)
    self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True)
    return matches
def match(self, location=None, query_string=None, min_score=0, detect_negative=True):
    """
    Return a sequence of LicenseMatch by matching the file at `location` or
    the `query_string` text against the index. Only include matches with
    scores greater or equal to `min_score`.
    `detect_negative` is for testing purpose only.
    """
    assert 0 <= min_score <= 100
    if TRACE:
        print()
        logger_debug('match start....')
    if not location and not query_string:
        return []
    qry = query.build_query(location, query_string, self)
    if not qry:
        logger_debug('#match: No query returned for:', location)
        return []

    #######################################################################
    # Whole file matching: hash and exact matching
    #######################################################################
    whole_query_run = qry.whole_query_run()
    if not whole_query_run or not whole_query_run.matchables:
        logger_debug('#match: whole query not matchable')
        return []

    # hash: a whole-file hash match is final and returns early
    hash_matches = match_hash.hash_match(self, whole_query_run)
    if hash_matches:
        if TRACE:
            self.debug_matches(hash_matches, '#match FINAL Hash matched', location, query_string)
        match.set_lines(hash_matches, qry.line_by_pos)
        return hash_matches

    # negative rules exact matching: matched positions are subtracted from
    # the whole query run so they cannot match again below
    negative_matches = []
    # note: detect_negative is false only to test negative rules detection proper
    if detect_negative and self.negative_rids:
        if TRACE:
            logger_debug('#match: NEGATIVE')
        negative_matches = self.negative_match(whole_query_run)
        for neg in negative_matches:
            whole_query_run.subtract(neg.qspan)

    # exact matches against the rules automaton
    if TRACE_EXACT:
        logger_debug('#match: EXACT')
    exact_matches = match_aho.exact_match(self, whole_query_run, self.rules_automaton)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: EXACT matches#:', location, query_string)
    exact_matches, exact_discarded = match.refine_matches(exact_matches, self, query=qry, filter_false_positive=False, merge=False)
    if TRACE_EXACT:
        self.debug_matches(exact_matches, ' #match: ===> exact matches refined')
    if TRACE_EXACT:
        self.debug_matches(exact_discarded, ' #match: ===> exact matches discarded')

    matches = exact_matches
    discarded = exact_discarded

    #######################################################################
    # Per query run matching.
    #######################################################################
    if TRACE:
        logger_debug('#match: #QUERY RUNS:', len(qry.query_runs))

    # check if we have some matchable left
    # collect qspans matched exactly e.g. with coverage 100%
    # this coverage check is because we have provision to match fragments (unused for now)
    matched_qspans = [m.qspan for m in exact_matches if m.coverage() == 100]
    # do not match further if we do not need to
    if whole_query_run.is_matchable(include_low=True, qspans=matched_qspans):
        # FIXME: we should exclude small and "weak" rules from the subset entirely
        # they are unlikely to be matchable with a seq match
        rules_subset = (self.regular_rids | self.small_rids)

        for qrnum, query_run in enumerate(qry.query_runs, 1):
            if TRACE_QUERY_RUN_SIMPLE:
                logger_debug('#match: ===> processing query run #:', qrnum)
                logger_debug(' #match:query_run:', query_run)
            if not query_run.is_matchable(include_low=True):
                if TRACE:
                    logger_debug('#match: query_run NOT MATCHABLE')
                continue

            # hash match: exact per-run match skips the costly seq matching
            #########################
            hash_matches = match_hash.hash_match(self, query_run)
            if hash_matches:
                if TRACE:
                    self.debug_matches(hash_matches, ' #match Query run matches (hash)', location, query_string)
                matches.extend(hash_matches)
                continue

            # FIXME: why do not we aho match again here? This would avoid
            # going into the costly set and seq re-match that may not be
            # needed at all; alternatively we should consider aho matches
            # to excludes them from candidates

            # query run match proper using sequence matching
            #########################################
            if TRACE:
                logger_debug(' #match: Query run MATCHING proper....')
            run_matches = []
            candidates = match_set.compute_candidates(query_run, self, rules_subset=rules_subset, top=40)
            if TRACE_CANDIDATES:
                logger_debug(' #match: query_run: number of candidates for seq match #', len(candidates))
            for candidate_num, candidate in enumerate(candidates):
                if TRACE_QUERY_RUN:
                    # candidate is a 3-tuple; the middle element is the rule
                    _, canrule, _ = candidate
                    logger_debug(' #match: query_run: seq matching candidate#:', candidate_num, 'candidate:', canrule)
                start_offset = 0
                # rematch the same candidate repeatedly, each time starting
                # past the end of the previous matches, until nothing new
                # is found
                while True:
                    rule_matches = match_seq.match_sequence(self, candidate, query_run, start_offset=start_offset)
                    if TRACE_QUERY_RUN and rule_matches:
                        self.debug_matches(rule_matches, ' #match: query_run: seq matches for candidate', with_text=True, query=qry)
                    if not rule_matches:
                        break
                    else:
                        matches_end = max(m.qend for m in rule_matches)
                        run_matches.extend(rule_matches)
                        if matches_end + 1 < query_run.end:
                            start_offset = matches_end + 1
                            continue
                        else:
                            break
            ############################################################################
            if TRACE_QUERY_RUN:
                self.debug_matches(run_matches, ' #match: ===> Query run matches', location, query_string, with_text=True, query=qry)
            run_matches = match.merge_matches(run_matches, max_dist=MAX_DIST)
            matches.extend(run_matches)
            if TRACE:
                self.debug_matches(run_matches, ' #match: Query run matches merged', location, query_string)

    # final matching merge, refinement and filtering
    ################################################
    if matches:
        logger_debug()
        logger_debug('!!!!!!!!!!!!!!!!!!!!REFINING!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        self.debug_matches(matches, '#match: ALL matches from all query runs', location, query_string)

        matches, whole_discarded = match.refine_matches(matches, idx=self,
            query=qry, min_score=min_score, max_dist=MAX_DIST // 2, filter_false_positive=True)
        if TRACE_MATCHES_DISCARD:
            discarded.extend(whole_discarded)
        matches.sort()
        match.set_lines(matches, qry.line_by_pos)
        self.debug_matches(matches, '#match: FINAL MERGED', location, query_string)

    if TRACE_MATCHES_DISCARD:
        self.debug_matches(discarded, '#match: FINAL DISCARDED', location, query_string)
    self.debug_matches(matches, '#match: FINAL MATCHES', location, query_string, with_text=True, query=qry)
    return matches