def match_value_dataset(kwd, dbs_inst=None):
    """
    Match *kwd* against dataset values known to the DBS manager.

    :param kwd: keyword to match (str/unicode); may contain ``*`` wildcards
                and ``/`` path separators
    :param dbs_inst: DBS instance name; when omitted, taken from the current
                     ``request`` object (returns ``(None, None)`` if absent)
    :returns: tuple ``(score, info)`` where ``score`` is a float confidence
              in [0, 1] (or ``None`` when nothing matched) and ``info`` is a
              dict with ``map_to`` and the wildcard-``adjusted_keyword``.
    """
    # if no specific dbs_inst passed, get the current one from the request
    if not dbs_inst:
        if not hasattr(request, 'dbs_inst'):
            return None, None
        dbs_inst = request.dbs_inst
    dataset_score = None

    # make sure the kwd is unicode (Python 2 str/unicode handling)
    if not isinstance(kwd, unicode) and isinstance(kwd, str):
        kwd = unicode(kwd)

    upd_kwd = kwd

    # find_datasets returns a generator; a first item means it is non-empty
    match = find_datasets(kwd, dbs_inst, limit=1)
    if next(match, False):
        if DEBUG:
            print('Dataset matched by keyword %s' % kwd)
        # if kw contains wildcards the score shall be a bit lower
        if '*' in kwd and '/' not in kwd:
            dataset_score = 0.8
        elif '*' in kwd and '/' in kwd:
            dataset_score = 0.9
        elif '*' not in kwd and '/' not in kwd:
            # plain keyword: accept only if its wildcarded form also matches
            if next(find_datasets('*%s*' % kwd, dbs_inst, limit=1), False):
                dataset_score = 0.7
                upd_kwd = '*%s*' % kwd
        else:
            # full dataset path without wildcards matched exactly
            dataset_score = 1.0

        # prevent number-only keywords from being matched into datasets;
        # the guard avoids a TypeError (None - 0.3) on the branch above
        # that can leave dataset_score unset
        if dataset_score is not None and kwd.isnumeric():
            dataset_score -= 0.3

    # add extra wildcards to make sure the resulting query will work
    if not RE_3SLASHES.match(upd_kwd):
        if not upd_kwd.startswith(('*', '/')):
            upd_kwd = '*' + upd_kwd
        # NB: endswith('*') implies '*' in the string, so one check suffices
        # (prepending '*' above does not change the string's ending)
        if not upd_kwd.endswith('*'):
            upd_kwd += '*'

    return dataset_score, {'map_to': 'dataset.name',
                           'adjusted_keyword': upd_kwd}
def match_value_dataset(kwd, dbs_inst=None):
    """ return keyword matches to dataset values in dbsmanager """
    # fall back to the DBS instance attached to the current request
    if not dbs_inst:
        if not hasattr(request, 'dbs_inst'):
            return None, None
        dbs_inst = request.dbs_inst

    score = None

    # normalize the keyword to unicode before matching
    if isinstance(kwd, str) and not isinstance(kwd, unicode):
        kwd = unicode(kwd)

    adjusted = kwd
    has_star = '*' in kwd
    has_slash = '/' in kwd

    # find_datasets yields a generator; pulling one item tests non-emptiness
    if next(find_datasets(kwd, dbs_inst, limit=1), False):
        if DEBUG:
            print('Dataset matched by keyword %s' % kwd)
        # wildcard keywords get a slightly lower confidence
        if has_star:
            score = 0.9 if has_slash else 0.8
        elif has_slash:
            # full dataset path, no wildcards
            score = 1.0
        else:
            # bare keyword: try its wildcarded form as well
            if next(find_datasets('*%s*' % kwd, dbs_inst, limit=1), False):
                score = 0.7
                adjusted = '*%s*' % kwd
        # demote keywords consisting solely of digits
        if kwd.isnumeric():
            score -= 0.3

    # pad the keyword with wildcards so the downstream query will work
    if not RE_3SLASHES.match(adjusted):
        before = adjusted
        if not adjusted.startswith('*') and not adjusted.startswith('/'):
            adjusted = '*' + adjusted
        if not before.endswith('*') or '*' not in before:
            adjusted += '*'

    return score, {'map_to': 'dataset.name', 'adjusted_keyword': adjusted}
def __init__(self, query, **flags):
    """
    Accepts general form of DAS query, supported formats are
    DAS input query, DAS mongo query, DAS storage query. The
    supplied flags can carry any query attributes, e.g.
    filters, aggregators, system, instance, etc.

    :param query: a string (DAS input query), a dict (mongo query when it
                  carries a dict 'spec', otherwise storage query), or
                  another DASQuery instance (copy-like construction)
    :param flags: arbitrary query attributes; each flag is mirrored onto
                  a ``_<name>`` instance attribute
    """
    check_query(query)
    # parser handle; presumably created lazily by the `mongoparser`
    # property used below — TODO confirm
    self._mongoparser = None
    self._params = {}
    self._service_apis_map = {}
    self._str = ''
    self._query = ''
    self._query_pat = ''
    self._query_full = ''
    self._storage_query = {}
    self._mongo_query = {}
    self._qhash = None
    self._hashes = None
    self._system = None
    self._instance = None
    self._loose_query = None
    self._pattern_query = None
    self._sortkeys = []
    self._filters = {}
    self._mapreduce = []
    self._aggregators = []
    self._qcache = 0
    self._flags = flags
    # non-empty on parse failure or unsupported input; checked below
    self._error = ''
    # loop over flags and set available attributes
    # (e.g. instance=... becomes self._instance)
    for key, val in flags.items():
        setattr(self, '_%s' % key, val)
    # test data type of input query and apply appropriate initialization
    if isinstance(query, basestring):
        self._query = query
        try:
            self._mongo_query = self.mongoparser.parse(query)
            # propagate cacheable flags into the mongo query itself,
            # without overriding keys the parser already produced
            for key, val in flags.items():
                if key in self.NON_CACHEABLE_FLAGS:
                    continue
                if key not in self._mongo_query:
                    self._mongo_query[key] = val
        except Exception as exp:
            # record the failure instead of raising, so callers can
            # inspect self._error / the error-shaped mongo query
            msg = "Fail to parse DAS query='%s', %s" % (query, str(exp))
            print_exc(msg, print_traceback=True)
            self._mongo_query = {'error': msg, 'spec': {}, 'fields': []}
            self._storage_query = {'error': msg}
            self._error = msg
            # raise exp
    elif isinstance(query, dict):
        # shallow-copy the input dict so the caller's object is not shared
        newquery = {}
        for key, val in query.items():
            newquery[key] = val
        if isinstance(newquery.get('spec'), dict):  # mongo query
            self._mongo_query = newquery
        else:  # storage query
            self._storage_query = newquery
    elif isinstance(query, object) and hasattr(query, '__class__')\
            and query.__class__.__name__ == 'DASQuery':
        # copy construction from another DASQuery (name-based check
        # presumably avoids an import cycle — TODO confirm)
        self._query = query.query
        self._query_pat = query.query_pat
        self._hashes = query.hashes
        self._mongo_query = query.mongo_query
        self._storage_query = query.storage_query
    else:
        # raise Exception('Unsupported data type of DAS query')
        self._error = 'Unsupported data type of DAS query'
    # on any recorded error skip derived-attribute setup entirely
    if self._error:
        return
    self.update_attr()

    # check dataset wild-cards
    for key, val in self._mongo_query['spec'].items():
        if key == 'dataset.name':
            if isinstance(val, dict):  # we get {'$in':[a,b]}
                continue
            # only match dataset.name but do not primary_dataset.name
            if not RE_3SLASHES.match(val):
                # TODO: we currently do not support wildcard matching
                # from command line interface
                if not self._instance:
                    continue
                # apply 3 slash pattern look-up, continuing only if one
                # interpretation exists here, ticket #3071
                self._handle_dataset_slashes(key, val)
def process_dataset_wildcards(pattern, dbs_inst, ignorecase=False): """ The current algorithm is simple 1) Fetch all the matching data-sets (regexp from MongoDB) 2) for each of them check if the wildcard (*) matched has a slash - if so: we will replace * in initial pattern with '*/*' otherwise: we leave it as it was track all these possible replacements and their counts, and apply them possible tune ups: if all matches for a certain replacement option contain the same string: replace it by that string, simplifying the query for the providers e.g. for '*Zmm*special*RECO*' would give: /RelValZmm/*/*special*RECO* while '*Zmm*' would still give: ['/*/*Zmm*/*', '/*Zmm*/*/*'] Tests: >>> dbs_inst='prod/global' # TODO: case sensitive wildcard suggestions, e.g. # *Zmm*CMSSW*RECO* --> /RelValZMM*/CMSSW*/*RECO >>> process_dataset_wildcards('*Zmm*CMSSW*RECO*', dbs_inst) [] >>> process_dataset_wildcards('*ZMM*CMSSW*RECO*', dbs_inst) [u'/RelValZMM*/CMSSW*/*RECO'] #>>> process_dataset_wildcards('*Zmm*', dbs_inst) #['/*/*Zmm*/*'] #>>> process_dataset_wildcards('*Zmm*', dbs_inst, ignorecase=True) #['/*/*Zmm*/*', '/*ZMM*/*/*'] >>> process_dataset_wildcards('*herwig*/AODSIM', dbs_inst) ['/*herwig*/*/AODSIM'] >>> process_dataset_wildcards('*Zjkjmm*', dbs_inst) [] >>> process_dataset_wildcards('*RelValPyquen_ZeemumuJets_pt10_2760GeV*', dbs_inst) [u'/RelValPyquen_ZeemumuJets_pt10_2760GeV/*/*'] An example of input which is NOT currently converted into a wildcard one (but may be done later) >>> process_dataset_wildcards('RelValPyquen_ZeemumuJets_pt10_2760GeV', dbs_inst) [] (giving [], instead of: [u'/RelValPyquen_ZeemumuJets_pt10_2760GeV/*/*']) >>> process_dataset_wildcards('/ZMM/*', dbs_inst) [u'/ZMM/Summer11-DESIGN42_V11_428_SLHC1-v1/GEN-SIM'] TODO: Other tests, e.g. 
*/4C_TuneZ2_7TeV-alpgen-pythia6/Summer11-PU_S4_START42_V11-v1/AODSIM* *SingleMu* /QCD*/Summer11-START311_V2-v1/GEN-SIM /RelVal*CMSSW_5_0_0_pre7*RECO* /EG/Run2010A*/AOD /*/*2011*/*/* """ # TODO: it is quite probable that people writing Zmm actually mean *Zmm* if RE_3SLASHES.match(pattern): return [pattern] # clean up any not allowed symbols in pattern that could mess up our regexps pattern = re.sub(DATASET_FORBIDDEN_SYMBOLS, '', pattern) # first load matching data-sets from cache # when group then by different cases (by how many '/' is a '*' matched) options, dataset_matches = extract_wildcard_patterns(dbs_inst, pattern) # process each different pattern results = [] #TODO: use the counts, e.g. display number of datasets for each pattern for input_interpretation, count in options.items(): if DEBUG: print('option', input_interpretation, count) subs = [] # we check if all groups are the same, if so replace by a string the_matches = dataset_matches.get(input_interpretation) #print my_matches for index, group in enumerate(input_interpretation): if REPLACE_IF_STRINGS_SAME: group = simplify_wildcard_matches(group, index, the_matches) subs.append(group) result = substitute_multiple(pattern, to_replace='*', replacements=subs) # the pattern should always start with / if result.startswith('*/*'): result = result.replace('*/*', '/*', 1) if DEBUG: print('result', result) results.append(result) return sorted(results)