def generate_chunks(self, keywords):
    """
    Build per-result-type lists of fields matched by combinations of
    nearby keywords.

    params: a tokenized list of keywords (e.g. ["a b c", 'a', 'b'])
    returns: a list of fields matching a combination of nearby keywords

    .. doctest::

        {
            '[result_type]': [matched_field, ...]
        }
    """
    if not get_setting('SERVICE_RESULT_FIELDS'):
        return {}

    matches = self.get_phrase_matches(keywords)
    self.append_subquery_matches(keywords, matches)

    # return the matches in sorted order (per result type)
    for entity, m_list in matches.items():
        for match in m_list:
            last_token = match['tokens_required'][-1]
            tokens_used = match['tokens_required']
            match['predicate'] = get_operator_and_param(last_token)
            match['field_name'] = match['field']['name']
            match['tokens_required_non_stopw'] = \
                filter_stopwords(tokens_used)
            match['tokens_required_set'] = set(tokens_used)
        m_list.sort(key=lambda f: f['score'], reverse=True)

        # as IR based matching is fairly dumb now,
        # prune out the useless matches.
        # We collect object ids rather than the match dicts themselves:
        # (a) membership testing below becomes O(1) instead of an O(n)
        #     list scan with full dict comparison, and
        # (b) an unflagged match that merely compares *equal* to a
        #     flagged one is no longer accidentally removed as well.
        purge = set()
        for m1 in m_list:
            for m2 in m_list:
                tokens1 = m1['tokens_required_set']
                tokens2 = m2['tokens_required_set']
                if (m2 != m1 and
                        m1['field_name'] == m2['field_name'] and
                        tokens1.issubset(tokens2) and
                        m1['score'] + 0.01 >= m2['score']):
                    # m2 needs a superset of m1's tokens without scoring
                    # meaningfully better -> mark it for deletion
                    purge.add(id(m2))
        matches[entity] = [match for match in m_list
                           if id(match) not in purge]

    normalize_scores(matches)

    # if enabled, prune low scoring chunks
    # (read the setting once instead of twice)
    cutoff = get_setting('RESULT_FIELD_CHUNKER_PRUNE_LOW_TERMS')
    if cutoff:
        for key in matches:
            matches[key] = [match for match in matches[key]
                            if match['score'] > cutoff]

    print_debug(matches)
    return matches
def generate_chunks(self, keywords):
    """
    params: a tokenized list of keywords (e.g. ["a b c", 'a', 'b'])
    returns: a list of fields matching a combination of nearby keywords

    .. doctest::

        {
            '[result_type]': [matched_field, ...]
        }

    NOTE(review): this is a near-duplicate of an earlier definition of
    the same method in this file; when both live in one class, this
    later definition shadows the earlier one — confirm which is wanted.
    """
    if not get_setting('SERVICE_RESULT_FIELDS'):
        return {}

    matches = self.get_phrase_matches(keywords)
    self.append_subquery_matches(keywords, matches)

    # annotate every candidate, then sort each result type by score
    for entity, candidates in matches.items():
        for candidate in candidates:
            required = candidate['tokens_required']
            candidate['predicate'] = get_operator_and_param(required[-1])
            candidate['field_name'] = candidate['field']['name']
            candidate['tokens_required_non_stopw'] = filter_stopwords(required)
            candidate['tokens_required_set'] = set(required)
        candidates.sort(key=lambda c: c['score'], reverse=True)

        # IR based matching is fairly dumb for now, so drop any match
        # that needs a superset of another match's tokens without
        # scoring meaningfully better on the same field
        doomed = []
        for keeper in candidates:
            for other in candidates:
                if (other != keeper
                        and keeper['field_name'] == other['field_name']
                        and keeper['tokens_required_set'].issubset(
                            other['tokens_required_set'])
                        and keeper['score'] + 0.01 >= other['score']):
                    # flag the dominated match for removal
                    doomed.append(other)
        matches[entity] = [c for c in candidates if c not in doomed]

    normalize_scores(matches)

    # optionally drop chunks scoring below the configured cutoff
    if get_setting('RESULT_FIELD_CHUNKER_PRUNE_LOW_TERMS'):
        cutoff = get_setting('RESULT_FIELD_CHUNKER_PRUNE_LOW_TERMS')
        for entity in matches:
            matches[entity] = [
                c for c in matches[entity] if c['score'] > cutoff
            ]

    print_debug(matches)
    return matches
def string_distance(keyword, match_to, allow_low_scores=False):
    """
    Score the similarity of two words.

    Basic string-edit distance metrics do not perform well here: they
    either introduce too many false positives (file as site), or fail
    to recognize fairly similar words, such as 'config' vs
    'configuration'. Therefore, to minimize false positives (which
    have a direct effect on ranking), a combination of more trustful
    metrics is used, listed in order of decreasing score:

    * full match
    * lemma match (e.g. only the word number differs)
    * stem match
    * stem match within a small edit distance (returning a low usable
      score), e.g. 1-2 characters differing, maximum 1 mutation
    """
    # exact match short-circuits everything else
    if keyword == match_to:
        return 1.0

    # lemma comparison (e.g. singular vs plural forms)
    if lemmatize(keyword) == lemmatize(match_to):
        return 0.9

    if not get_setting('STRING_DIST_ENABLE_NLTK_STEM'):
        # stemming disabled: compare the raw words below
        stem_a, stem_b = keyword, match_to
    else:
        stem_a, stem_b = getstem(keyword), getstem(match_to)
        if stem_a == stem_b:
            return 0.7

    # fall back to a scaled edit distance over the stems,
    # clamped to zero below the applicable threshold
    score = 0.7 * levenshtein_norm(stem_a, stem_b, subcost=2, maxcost=3)
    threshold = 0.1 if allow_low_scores else 0.35
    return score if score > threshold else 0.0