def match_value_dataset(kwd, dbs_inst=None):
    """
    Return keyword matches to dataset values in dbsmanager.

    :param kwd: keyword to look up; a plain ``str`` is converted to unicode
    :param dbs_inst: DBS instance to search in; when falsy the one stored
        on ``request`` is used
    :return: ``(None, None)`` when no DBS instance is available; otherwise
        ``(score, info)``, where ``score`` stays ``None`` if no score was
        assigned and ``info`` is a dict with the keys ``map_to`` and
        ``adjusted_keyword``
    """
    # if no specific dbs_inst passed, get the current one from request
    if not dbs_inst:
        if not hasattr(request, 'dbs_inst'):
            return None, None
        dbs_inst = request.dbs_inst

    dataset_score = None

    # make sure the kwd is unicode
    if not isinstance(kwd, unicode) and isinstance(kwd, str):
        kwd = unicode(kwd)

    upd_kwd = kwd

    # dbsmgr.find returns a generator, check if it's non empty
    match = find_datasets(kwd, dbs_inst, limit=1)
    if next(match, False):
        if DEBUG:
            print('Dataset matched by keyword %s' % kwd)

        has_star = '*' in kwd
        has_slash = '/' in kwd
        # if kw contains wildcards the score shall be a bit lower
        if has_star:
            dataset_score = 0.9 if has_slash else 0.8
        elif has_slash:
            dataset_score = 1.0
        else:
            wildcarded = '*%s*' % kwd
            if next(find_datasets(wildcarded, dbs_inst, limit=1), False):
                dataset_score = 0.7
                upd_kwd = wildcarded

        # prevent number-only-keywords to be matched into datasets;
        # the score may still be None here (wildcarded lookup found
        # nothing), in which case there is nothing to lower
        if dataset_score is not None and kwd.isnumeric():
            dataset_score -= 0.3

        # add extra wildcard to make sure the query will work...
        if not RE_3SLASHES.match(upd_kwd):
            before_prefix = upd_kwd
            if not upd_kwd.startswith(('*', '/')):
                upd_kwd = '*' + upd_kwd
            # a keyword without any '*' never ends with '*', so a single
            # endswith() test covers both of the original conditions
            if not before_prefix.endswith('*'):
                upd_kwd += '*'

    return dataset_score, {'map_to': 'dataset.name',
                           'adjusted_keyword': upd_kwd}
def hint_dataset_in_other_insts(query, cur_inst):
    """
    Find datasets in other DBS instances.

    Hints are shown only if there are no matches in the current instance.

    :param query: the query the dataset token is taken from
    :param cur_inst: the current DBS instance
    :return: ``{}`` when the query has no dataset token, ``None`` when the
        current instance has a match, otherwise a dict with ``title`` and
        ``results``
    """
    token = get_dataset_token(query)
    if not token:
        return {}

    found = match_dataset_all_inst(token, cur_inst)

    # for now, display hints ONLY on no matches in the current instance
    for entry in found:
        if entry['inst'] == cur_inst:
            return None

    hints = []
    for entry in found:
        if entry['inst'] == cur_inst:
            continue
        # NOTE(review): 'match' is filled with the instance name rather
        # than entry['match'] -- confirm this is intended
        hints.append({
            'inst': entry['inst'],
            'match': entry['inst'],
            'query': (repl_dataset_val(query, entry['match'])
                      + ' instance=' + entry['inst']),
            'examples': list(find_datasets(entry['match'], entry['inst'])),
        })

    return {'title': 'Matching datasets in other DBS instances',
            'results': hints}
def match_value_dataset(kwd, dbs_inst=None):
    """
    Return keyword matches to dataset values in dbsmanager.

    :param kwd: keyword to look up; a plain ``str`` is converted to unicode
    :param dbs_inst: DBS instance to search in; when falsy the one stored
        on ``request`` is used
    :return: ``(None, None)`` when no DBS instance is available; otherwise
        ``(score, info)``, where ``score`` stays ``None`` if no score was
        assigned and ``info`` is a dict with the keys ``map_to`` and
        ``adjusted_keyword``
    """
    # if no specific dbs_inst passed, get the current one from request
    if not dbs_inst:
        if not hasattr(request, 'dbs_inst'):
            return None, None
        dbs_inst = request.dbs_inst

    dataset_score = None

    # make sure the kwd is unicode
    if not isinstance(kwd, unicode) and isinstance(kwd, str):
        kwd = unicode(kwd)

    upd_kwd = kwd

    # dbsmgr.find returns a generator, check if it's non empty
    match = find_datasets(kwd, dbs_inst, limit=1)
    if next(match, False):
        if DEBUG:
            # single parenthesised argument: prints the same text whether
            # print is a statement or a function
            print('Dataset matched by keyword %s' % kwd)

        has_star = '*' in kwd
        has_slash = '/' in kwd
        # if kw contains wildcards the score shall be a bit lower
        if has_star and not has_slash:
            dataset_score = 0.8
        elif has_star and has_slash:
            dataset_score = 0.9
        elif not has_slash:
            wildcarded = '*%s*' % kwd
            if next(find_datasets(wildcarded, dbs_inst, limit=1), False):
                dataset_score = 0.7
                upd_kwd = wildcarded
        else:
            dataset_score = 1.0

        # prevent number-only-keywords to be matched into datasets;
        # skip the penalty when no score was assigned, since subtracting
        # from None would raise a TypeError
        if dataset_score is not None and kwd.isnumeric():
            dataset_score -= 0.3

        # add extra wildcard to make sure the query will work...
        # NOTE(review): the constant is spelled RE_3SLAHES here -- confirm
        # it matches the module-level definition
        if not RE_3SLAHES.match(upd_kwd):
            upd_kwd0 = upd_kwd
            if not upd_kwd.startswith(('*', '/')):
                upd_kwd = '*' + upd_kwd
            # "'*' not in s" implies "not s.endswith('*')", so the single
            # endswith() test is equivalent to the original pair
            if not upd_kwd0.endswith('*'):
                upd_kwd += '*'

    return dataset_score, {'map_to': 'dataset.name',
                           'adjusted_keyword': upd_kwd}
def extract_wildcard_patterns(dbs_inst, pattern):
    """
    Work out how many slashes each wildcard of a dataset query absorbs.

    Given a wildcard query, the matching datasets are inspected to see how
    many slashes are matched by each wildcard (because the slashes have to
    be included in the result). Counts are returned per combination of
    patterns. E.g. for ``*Zmm*`` the regexp ``(.*)Zmm(.*)`` is used, and
    the match ``/RelValZmm/CMSSW.../tier`` yields this combination:

        query   match               transformed into pattern
        *       '/RelVal'           */*
        Zmm     (query)
        *       '/CMSSW.../tier'    */*/*

    :param dbs_inst: DBS instance passed on to ``find_datasets``
    :param pattern: dataset query in which ``*`` is the wildcard
    :return: ``(counts, interpretations)``; both are dicts keyed by the
        tuple of per-wildcard patterns. ``counts`` holds the number of
        datasets producing that tuple, ``interpretations`` the list of
        matched group tuples.
    """
    # get matching datasets from our cache (through dbs manager instance)
    dataset_matches = find_datasets(pattern, dbs_inst, limit=-1)

    # every '*' becomes a capturing group; the literal pieces in between
    # are escaped so that regex metacharacters in the query (e.g. '.',
    # '+', '(') are matched literally instead of being interpreted
    literal_pieces = [re.escape(piece) for piece in pattern.split('*')]
    pat_re = re.compile('^' + '(.*)'.join(literal_pieces) + '$',
                        re.IGNORECASE)

    # a group may contain more than one slash; any other number of
    # slashes maps to a plain '*'
    by_slash_count = {1: '*/*', 2: '*/*/*', 3: '*/*/*/*'}

    # now match the positions of slash
    counts = {}
    interpretations = {}
    for item in dataset_matches:
        match = pat_re.match(item)
        # just in case the pat_re regexp was more restrictive than db
        # filtering
        if not match:
            continue
        groups = match.groups()
        if DEBUG:
            # one pre-formatted argument prints the same text whether
            # print is a statement or a function
            print("matched groups %s" % (groups,))

        replacements = tuple(by_slash_count.get(group.count('/'), '*')
                             for group in groups)
        counts[replacements] = counts.get(replacements, 0) + 1
        # add this into list of possible options
        interpretations.setdefault(replacements, []).append(groups)

    return counts, interpretations
def hint_dataset_case_insensitive(query, cur_inst):
    """
    Suggest case-insensitive dataset matches.

    Intended to be shown only if the current query returns no results.

    :param query: the query the dataset token is taken from
    :param cur_inst: DBS instance to search in
    :return: ``{}`` when the query has no dataset token, ``None`` when a
        wildcard token already has a case-sensitive match, otherwise a
        dict with ``title``, ``descr`` and ``results``
    """
    token = get_dataset_token(query)
    if not token:
        return {}

    if '*' in token:
        # the mongo query is quite slow
        # we shall care only if case sensitive search return no results
        case_sensitive_hits = find_datasets(token, cur_inst,
                                            ignorecase=False)
        if next(case_sensitive_hits, False):
            return None

    suggestions = []
    for name in find_datasets(token, cur_inst):
        # a suggestion identical to the token adds nothing
        if name != token:
            suggestions.append({'match': name,
                                'query': repl_dataset_val(query, name)})

    return {'title': 'Case-insensitive dataset matches (NEW)',
            'descr': '(dataset selection in DBS3 is now case-sensitive)',
            'results': suggestions}
def hint_dataset_case_insensitive(query, cur_inst):
    """
    Build case-insensitive dataset suggestions for a query.

    Meant for the case where the current query returns no results.

    :param query: the query the dataset token is taken from
    :param cur_inst: DBS instance to search in
    :return: ``{}`` if the query carries no dataset token, ``None`` if a
        wildcard token has a case-sensitive match, else a dict with the
        keys ``title``, ``descr`` and ``results``
    """
    pat = get_dataset_token(query)
    if not pat:
        return {}

    # the mongo query is quite slow, so for wildcard tokens we only care
    # if the case sensitive search returns no results
    has_wildcard = '*' in pat
    if has_wildcard and next(
            find_datasets(pat, cur_inst, ignorecase=False), False):
        return None

    candidates = find_datasets(pat, cur_inst)
    # names equal to the token itself are left out
    results = [
        {'match': found, 'query': repl_dataset_val(query, found)}
        for found in candidates
        if found != pat
    ]

    hint = {}
    hint['title'] = 'Case-insensitive dataset matches (NEW)'
    hint['descr'] = '(dataset selection in DBS3 is now case-sensitive)'
    hint['results'] = results
    return hint
def extract_wildcard_patterns(dbs_inst, pattern, ignorecase=False):
    """
    Work out how many slashes each wildcard of a dataset query absorbs.

    Given a wildcard query, the matching datasets are inspected to see how
    many slashes are matched by each wildcard (because the slashes have to
    be included in the result). Counts are returned per combination of
    patterns. E.g. for ``*Zmm*`` the regexp ``(.*)Zmm(.*)`` is used, and
    the match ``/RelValZmm/CMSSW.../tier`` yields this combination:

        query   match               transformed into pattern
        *       '/RelVal'           */*
        Zmm     (query)
        *       '/CMSSW.../tier'    */*/*

    :param dbs_inst: DBS instance passed on to ``find_datasets``
    :param pattern: dataset query in which ``*`` is the wildcard
    :param ignorecase: forwarded to ``find_datasets``; also makes the
        local regexp case-insensitive
    :return: ``(counts, interpretations)``; both are defaultdicts keyed by
        the tuple of per-wildcard patterns. ``counts`` holds the number of
        datasets producing that tuple, ``interpretations`` the list of
        matched group tuples.
    """
    # get matching datasets from our cache (through dbs manager instance)
    dataset_matches = find_datasets(pattern, dbs_inst, limit=-1,
                                    ignorecase=ignorecase)

    # every '*' becomes a capturing group; the literal pieces in between
    # are escaped so that regex metacharacters in the query (e.g. '.',
    # '+', '(') are matched literally instead of being interpreted
    literal_pieces = [re.escape(piece) for piece in pattern.split('*')]
    pat_re = re.compile('^' + '(.*)'.join(literal_pieces) + '$',
                        re.IGNORECASE if ignorecase else 0)

    # a group may contain more than one slash; any other number of
    # slashes maps to a plain '*'
    by_slash_count = {1: '*/*', 2: '*/*/*', 3: '*/*/*/*'}

    # now match the positions of slash
    counts = defaultdict(int)
    interpretations = defaultdict(list)
    for item in dataset_matches:
        match = pat_re.match(item)
        # just in case the pat_re regexp was more restrictive than db
        # filtering
        if not match:
            continue
        groups = match.groups()
        if DEBUG:
            print("matched groups", groups)

        replacements = tuple(by_slash_count.get(group.count('/'), '*')
                             for group in groups)
        counts[replacements] += 1
        # add this into list of possible options
        interpretations[replacements].append(groups)

    return counts, interpretations
def hint_dataset_in_other_insts(query, cur_inst):
    """
    Hint at datasets that exist in other DBS instances.

    Shown only if there are no matches in the current instance.

    :param query: the query the dataset token is taken from
    :param cur_inst: the current DBS instance
    :return: ``{}`` if the query carries no dataset token, ``None`` if the
        current instance has a match, else a dict with the keys ``title``
        and ``results``
    """
    pat = get_dataset_token(query)
    if not pat:
        return {}

    def as_hint(found):
        """Turn one per-instance match into a hint entry."""
        # NOTE(review): 'match' receives the instance name, not
        # found['match'] -- confirm this is intended
        hint = {}
        hint['inst'] = found['inst']
        hint['match'] = found['inst']
        rewritten = repl_dataset_val(query, found['match'])
        hint['query'] = rewritten + ' instance=' + found['inst']
        hint['examples'] = list(find_datasets(found['match'],
                                              found['inst']))
        return hint

    all_matches = match_dataset_all_inst(pat, cur_inst)

    # for now, display hints ONLY on no matches in the current instance
    for found in all_matches:
        if found['inst'] == cur_inst:
            return None

    elsewhere = [as_hint(found) for found in all_matches
                 if found['inst'] != cur_inst]

    return {'title': 'Matching datasets in other DBS instances',
            'results': elsewhere}