def match_dataset(kwd, cur_inst):
    """
    Check whether *kwd* matches a dataset in the current DBS instance.

    Returns the adjusted keyword (or *kwd* itself if no adjustment was
    recorded) on a match, otherwise None.
    """
    # keywords shorter than 3 characters are too ambiguous to match
    if len(kwd) < 3:
        return None
    score, match_info = match_value_dataset(kwd, cur_inst)
    if not score:
        return None
    return match_info.get('adjusted_keyword', kwd)
def match_dataset_all_inst(kwd, cur_inst):
    """
    List matching dataset patterns across all DBS instances.

    Each result dict is augmented with 'inst', 'match' and 'score' keys;
    the list is sorted by score, highest first.
    """
    # keywords shorter than 3 characters are too ambiguous to match
    if len(kwd) < 3:
        return []
    results = []
    for instance in list_dbs_instances():
        score, entry = match_value_dataset(kwd, instance)
        if score:
            entry['inst'] = instance
            entry['match'] = entry.get('adjusted_keyword', kwd)
            # score matches in other DBS instances lower
            if instance != cur_inst:
                score -= 0.15
            entry['score'] = score
            results.append(entry)
    results.sort(key=lambda entry: entry['score'], reverse=True)
    return results
def keyword_value_weights(keyword):
    """
    For each attribute, calculate the likelihood that the given keyword
    is a value of that attribute (we are mostly interested in API
    parameters, but other result fields may be matched as well).

    Returns a list of (score, field-or-data) tuples sorted by score,
    highest first.
    """
    # to minimize false positives, we exclude the fields from regexp matching
    # for which we have a list of possible values (the quite static ones)
    fields_tracked = input_values_tracker.get_fields_tracked(only_stable=True)
    scores_dict = _select_best_scores(
        (score, field)
        for score, field in keyword_regexp_weights(keyword)
        if field not in fields_tracked)
    # check for matching of existing datasets, and override regexp based score
    dataset_score, data = match_value_dataset(keyword)
    if dataset_score:
        scores_dict['dataset.name'] = (dataset_score, data)
    # check for matching fields those values are fairly static (site, release..)
    scores_dict.update(input_values_tracker.input_value_matches(keyword))
    # .values() instead of the Python-2-only .itervalues(): sorted()
    # materializes the sequence either way, so behavior is unchanged
    # while remaining Python 3 compatible
    return sorted(scores_dict.values(),
                  key=lambda item: item[0],
                  reverse=True)