def append_subquery_matches(self, keywords, matches):
    """
    Find IR-index matches for individual keywords and for short runs of
    adjacent keywords (quoted phrases are handled separately).

    :param keywords: ordered list of keyword tokens; a token may carry a
        comparison operator (only allowed on the last token of a chunk)
    :param matches: mapping of entity -> list of match dicts; extended in
        place (also via add_full_fieldmatch for exact attribute names)
    """
    # check for full-name matches to an attribute, e.g. dataset.nevents
    for kwd in keywords:
        add_full_fieldmatch(kwd, matches)

    fields_by_entity = get_schema().list_result_fields()
    str_len = len(keywords)
    # cap the window size; reuse str_len instead of recomputing len()
    max_len = min(str_len, MAX_TOKEN_COMBINATION_LEN)
    # slide a window of every length from 1 up to max_len over keywords
    for length in xrange(1, max_len + 1):
        for start in xrange(0, str_len - length + 1):
            chunk = keywords[start:start + length]
            # exclude phrases with "a b c" (as these were processed earlier)
            if any(' ' in c for c in chunk):
                continue
            # only the last term in the chunk is allowed to contain operator
            if any(test_operator_containment(kw) for kw in chunk[:-1]):
                continue
            if DEBUG:
                print('chunk:', chunk)
                print('len=', length, '; start=', start, 'chunk:', chunk)
            s_chunk = ' '.join(get_keyword_without_operator(kw)
                               for kw in chunk)
            results = self.fields_idx.search_index(
                kwds=s_chunk, limit=CHUNK_N_TOKEN_COMBINATION_RESULTS)
            # results come back ranked, so the first score is the maximum
            # (falsy when results is empty, but then the loop body is skipped)
            max_score = results and results[0]['score']
            for result in results:
                result['len'] = len(result['keywords_matched'])
                entity = result['result_type']
                if not check_validity(result, fields_by_entity):
                    continue
                result['field'] = fields_by_entity[entity][result['field']]
                result['tokens_required'] = chunk
                if USE_IR_SCORE_NORMALIZATION_LOCAL:
                    result['score'] /= max_score
                matches[entity].append(result)
def append_subquery_matches(self, keywords, matches):
    """
    Collect IR-index matches for single keywords and for runs of adjacent
    non-phrase keywords, appending each scored match to *matches* keyed by
    its result entity.
    """
    # exact attribute-name matches first, e.g. dataset.nevents
    for keyword in keywords:
        add_full_fieldmatch(keyword, matches)

    schema_fields = get_schema().list_result_fields()
    n_kwds = len(keywords)
    longest = min(len(keywords), MAX_TOKEN_COMBINATION_LEN)
    for window in xrange(1, longest + 1):
        for begin in xrange(0, n_kwds - window + 1):
            group = keywords[begin:begin + window]
            # quoted phrases ("a b c") were already handled elsewhere
            if any(token for token in group if ' ' in token):
                continue
            # an operator is permitted only on the final term of the group
            if any(test_operator_containment(t) for t in group[:-1]):
                continue
            if DEBUG:
                print('chunk:', group)
                print('len=', window, '; start=', begin, 'chunk:', group)
            stripped = [get_keyword_without_operator(t) for t in group]
            query = ' '.join(stripped)
            hits = self.fields_idx.search_index(
                kwds=query, limit=CHUNK_N_TOKEN_COMBINATION_RESULTS)
            # ranked results: top score sits at index 0
            top_score = hits and hits[0]['score']
            for hit in hits:
                hit['len'] = len(hit['keywords_matched'])
                kind = hit['result_type']
                if not check_validity(hit, schema_fields):
                    continue
                hit['field'] = schema_fields[kind][hit['field']]
                hit['tokens_required'] = group
                if USE_IR_SCORE_NORMALIZATION_LOCAL:
                    hit['score'] /= top_score
                matches[kind].append(hit)
def get_phrase_matches(self, keywords):
    """
    Look up phrase keywords (tokens containing spaces) in the IR index.

    :param keywords: list of keyword tokens; only entries with a space
        (i.e. quoted phrases) are searched here
    :return: defaultdict mapping entity -> list of scored match dicts
    """
    fields_by_entity = get_schema().list_result_fields()
    # first filter out the phrases (we wont combine them with anything)
    phrase_kwds = [kw for kw in keywords if ' ' in kw]
    matches = defaultdict(list)
    for kwd in phrase_kwds:
        # remove operators, e.g. "number of events">10 => number of events
        phrase = get_keyword_without_operator(kwd)
        # get ranked list of matches
        results = self.fields_idx.search_index(
            kwds=phrase, limit=CHUNK_N_PHRASE_RESULTS)
        # ranked results: first entry carries the maximum score
        # (falsy when results is empty, but then the loop body is skipped)
        max_score = results and results[0]['score']
        for result in results:
            result['len'] = len(result['keywords_matched'])
            entity = result['result_type']
            if not check_validity(result, fields_by_entity):
                continue
            # TODO: this shall be done in presentation level
            result['field'] = fields_by_entity[entity][result['field']]
            result['tokens_required'] = [kwd]
            # penalize terms that have multiple matches
            result['score'] *= W_PHRASE
            if USE_IR_SCORE_NORMALIZATION_LOCAL:
                result['score'] /= max_score
            matches[entity].append(result)
    return matches
def get_phrase_matches(self, keywords):
    """
    Search the IR index for every phrase keyword (a token containing a
    space) and group the scored matches by result entity.
    """
    schema_fields = get_schema().list_result_fields()
    grouped = defaultdict(list)
    # phrases are never combined with other tokens; handle each alone
    for raw_kwd in (kw for kw in keywords if ' ' in kw):
        # strip any operator, e.g. "number of events">10 => number of events
        bare_phrase = get_keyword_without_operator(raw_kwd)
        # ranked candidate matches for this phrase
        hits = self.fields_idx.search_index(
            kwds=bare_phrase, limit=CHUNK_N_PHRASE_RESULTS)
        best = hits and hits[0]['score']
        for hit in hits:
            hit['len'] = len(hit['keywords_matched'])
            kind = hit['result_type']
            if not check_validity(hit, schema_fields):
                continue
            # TODO: this shall be done in presentation level
            hit['field'] = schema_fields[kind][hit['field']]
            hit['tokens_required'] = [raw_kwd]
            # penalize terms that have multiple matches
            hit['score'] *= W_PHRASE
            if USE_IR_SCORE_NORMALIZATION_LOCAL:
                hit['score'] /= best
            grouped[kind].append(hit)
    return grouped