Example #1
def match_value_dataset(kwd, dbs_inst=None):
    """ return keyword matches to dataset values in dbsmanager """
    # if no specific dbs_inst passed, get the current one from request
    if not dbs_inst:
        if not hasattr(request, 'dbs_inst'):
            return None, None
        dbs_inst = request.dbs_inst

    dataset_score = None

    # make sure the kwd is unicode
    if isinstance(kwd, str):
        kwd = unicode(kwd)

    upd_kwd = kwd

    # dbsmgr.find returns a generator, check if it's non-empty
    match = find_datasets(kwd, dbs_inst, limit=1)
    if next(match, False):
        if DEBUG:
            print('Dataset matched by keyword %s' % kwd)
        # if the keyword contains wildcards, the score shall be a bit lower
        if '*' in kwd and '/' not in kwd:
            dataset_score = 0.8
        elif '*' in kwd and '/' in kwd:
            dataset_score = 0.9
        elif '*' not in kwd and '/' not in kwd:
            # plain keyword that matched: prefer the wildcard-padded form
            if next(find_datasets('*%s*' % kwd, dbs_inst, limit=1), False):
                dataset_score = 0.7
                upd_kwd = '*%s*' % kwd
        else:
            # a full dataset path without wildcards: exact match
            dataset_score = 1.0

        # prevent number-only keywords from being matched into datasets
        if dataset_score is not None and kwd.isnumeric():
            dataset_score -= 0.3

    # add extra wildcard to make sure the query will work...
    if not RE_3SLASHES.match(upd_kwd):
        upd_kwd0 = upd_kwd
        if not upd_kwd.startswith('*') and not upd_kwd.startswith('/'):
            upd_kwd = '*' + upd_kwd
        if not upd_kwd0.endswith('*'):
            upd_kwd += '*'

    return dataset_score, {
        'map_to': 'dataset.name',
        'adjusted_keyword': upd_kwd
    }
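
For context, a minimal sketch of how a caller might consume the (score, adjustment) pair returned above; rank_keywords is a hypothetical helper, not part of the original module:

def rank_keywords(keywords, dbs_inst=None):
    """ order keywords by how confidently they match a dataset name """
    ranked = []
    for kwd in keywords:
        score, info = match_value_dataset(kwd, dbs_inst)
        # (None, None) is returned when no dbs instance is available
        if score is not None:
            ranked.append((score, info['adjusted_keyword'], info['map_to']))
    # highest-confidence dataset matches first
    return sorted(ranked, reverse=True)
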
Example #2
    def __init__(self, query, **flags):
        """
        Accepts a general form of DAS query; supported formats are the
        DAS input query, the DAS mongo query, and the DAS storage query.
        The supplied flags can carry any query attributes, e.g.
        filters, aggregators, system, instance, etc.
        """
        check_query(query)
        self._mongoparser   = None
        self._params        = {}
        self._service_apis_map = {}
        self._str           = ''
        self._query         = ''
        self._query_pat     = ''
        self._query_full    = ''
        self._storage_query = {}
        self._mongo_query   = {}
        self._qhash         = None
        self._hashes        = None
        self._system        = None
        self._instance      = None
        self._loose_query   = None
        self._pattern_query = None
        self._sortkeys      = []
        self._filters       = {}
        self._mapreduce     = []
        self._aggregators   = []
        self._qcache        = 0
        self._flags         = flags
        self._error         = ''

        # loop over flags and set available attributes
        for key, val in flags.items():
            setattr(self, '_%s' % key, val)

        # test data type of input query and apply appropriate initialization
        if isinstance(query, basestring):
            self._query = query
            try:
                self._mongo_query = self.mongoparser.parse(query)
                for key, val in flags.items():
                    if key in self.NON_CACHEABLE_FLAGS:
                        continue
                    if key not in self._mongo_query:
                        self._mongo_query[key] = val
            except Exception as exp:
                msg = "Fail to parse DAS query='%s', %s" % (query, str(exp))
                print_exc(msg, print_traceback=True)
                self._mongo_query = {'error': msg, 'spec': {}, 'fields': []}
                self._storage_query = {'error': msg}
                self._error = msg
#                 raise exp
        elif isinstance(query, dict):
            newquery = dict(query)  # shallow copy of the input dict
            if isinstance(newquery.get('spec'), dict):  # mongo query
                self._mongo_query = newquery
            else:  # storage query
                self._storage_query = newquery
        elif query.__class__.__name__ == 'DASQuery':
            # copy-construct from another DASQuery instance
            self._query = query.query
            self._query_pat = query.query_pat
            self._hashes = query.hashes
            self._mongo_query = query.mongo_query
            self._storage_query = query.storage_query
        else:
#             raise Exception('Unsupported data type of DAS query')
            self._error = 'Unsupported data type of DAS query'
        if self._error:
            return
        self.update_attr()

        # check dataset wild-cards
        for key, val in self._mongo_query['spec'].items():
            if key == 'dataset.name':
                if isinstance(val, dict):  # we get {'$in': [a, b]}
                    continue
                # match only dataset.name, not primary_dataset.name
                if not RE_3SLASHES.match(val):
                    # TODO: we currently do not support wildcard matching
                    #       from the command line interface
                    if not self._instance:
                        continue

                    # apply the 3-slash pattern look-up, continuing only if
                    # one interpretation exists here, ticket #3071
                    self._handle_dataset_slashes(key, val)
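
For reference, an illustrative sketch of the four input forms the constructor above dispatches on; the query text, fields, and instance name below are made-up examples:

# illustrative only; the values are invented for demonstration
q1 = DASQuery('dataset=/ZMM*/*/*', instance='prod/global')  # DAS input query (string)
q2 = DASQuery({'fields': ['dataset'],
               'spec': {'dataset.name': '/ZMM*/*/*'}})      # mongo query ('spec' is a dict)
q3 = DASQuery({'fields': ['dataset'], 'spec': []})          # storage query (no dict under 'spec')
q4 = DASQuery(q1)                                           # copy from an existing DASQuery
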
Example #3
def process_dataset_wildcards(pattern, dbs_inst, ignorecase=False):
    """
    The current algorithm is simple:
    1) fetch all the matching datasets (regexp query to MongoDB)
    2) for each of them, check whether the text matched by each wildcard (*)
       contains a slash:
        - if so, replace that * in the initial pattern with '*/*'
        - otherwise, leave it as it was

        track all these possible replacements and their counts, and apply them

        possible tune-up: if all matches for a certain replacement option
        contain the same string, replace the wildcard by that string,
        simplifying the query for the providers

        e.g. '*Zmm*special*RECO*' would give: /RelValZmm/*/*special*RECO*
        while '*Zmm*' would still give: ['/*/*Zmm*/*', '/*Zmm*/*/*']

    Tests:
    >>> dbs_inst='prod/global'

    # TODO: case sensitive wildcard suggestions, e.g.
    # *Zmm*CMSSW*RECO* --> /RelValZMM*/CMSSW*/*RECO
    >>> process_dataset_wildcards('*Zmm*CMSSW*RECO*', dbs_inst)
    []

    >>> process_dataset_wildcards('*ZMM*CMSSW*RECO*', dbs_inst)
    [u'/RelValZMM*/CMSSW*/*RECO']

    #>>> process_dataset_wildcards('*Zmm*', dbs_inst)
    #['/*/*Zmm*/*']

    #>>> process_dataset_wildcards('*Zmm*', dbs_inst, ignorecase=True)
    #['/*/*Zmm*/*', '/*ZMM*/*/*']

    >>> process_dataset_wildcards('*herwig*/AODSIM', dbs_inst)
    ['/*herwig*/*/AODSIM']

    >>> process_dataset_wildcards('*Zjkjmm*', dbs_inst)
    []

    >>> process_dataset_wildcards('*RelValPyquen_ZeemumuJets_pt10_2760GeV*', dbs_inst)
    [u'/RelValPyquen_ZeemumuJets_pt10_2760GeV/*/*']

    An example of input which is NOT currently converted into a wildcard one
    (but may be done later)
    >>> process_dataset_wildcards('RelValPyquen_ZeemumuJets_pt10_2760GeV', dbs_inst)
    []

    (giving [], instead of: [u'/RelValPyquen_ZeemumuJets_pt10_2760GeV/*/*'])

    >>> process_dataset_wildcards('/ZMM/*', dbs_inst)
    [u'/ZMM/Summer11-DESIGN42_V11_428_SLHC1-v1/GEN-SIM']

    TODO: Other tests, e.g.
    */4C_TuneZ2_7TeV-alpgen-pythia6/Summer11-PU_S4_START42_V11-v1/AODSIM*
    *SingleMu*
    /QCD*/Summer11-START311_V2-v1/GEN-SIM
    /RelVal*CMSSW_5_0_0_pre7*RECO*
    /EG/Run2010A*/AOD
    /*/*2011*/*/*
    """

    # TODO: it is quite probable that people writing Zmm actually mean *Zmm*

    if RE_3SLASHES.match(pattern):
        return [pattern]

    # strip any disallowed symbols from the pattern that could mess up our regexps
    pattern = re.sub(DATASET_FORBIDDEN_SYMBOLS, '', pattern)

    # first load matching datasets from the cache,
    # then group them by how many '/' each '*' matched
    options, dataset_matches = extract_wildcard_patterns(dbs_inst, pattern)

    # process each different pattern
    results = []

    # TODO: use the counts, e.g. display the number of datasets for each pattern
    for input_interpretation, count in options.items():
        if DEBUG:
            print('option', input_interpretation, count)

        subs = []
        # if all matches within a group are the same string, replace the
        # wildcard by that string (the tune-up described in the docstring)
        the_matches = dataset_matches.get(input_interpretation)
        for index, group in enumerate(input_interpretation):
            if REPLACE_IF_STRINGS_SAME:
                group = simplify_wildcard_matches(group, index, the_matches)
            subs.append(group)

        result = substitute_multiple(pattern, to_replace='*', replacements=subs)

        # the pattern should always start with /
        if result.startswith('*/*'):
            result = result.replace('*/*', '/*', 1)

        if DEBUG:
            print('result', result)
        results.append(result)

    return sorted(results)
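
substitute_multiple is defined elsewhere in the module; below is a minimal sketch of the interface assumed above (a guess, not the DAS implementation), replacing the i-th occurrence of to_replace with replacements[i]:

def substitute_multiple(pattern, to_replace='*', replacements=()):
    """ replace the i-th occurrence of to_replace with replacements[i] """
    parts = pattern.split(to_replace)
    # one replacement slot sits between each pair of adjacent parts
    assert len(parts) - 1 == len(replacements)
    out = [parts[0]]
    for sub, part in zip(replacements, parts[1:]):
        out.append(sub + part)
    return ''.join(out)

# e.g. substitute_multiple('*Zmm*', '*', ['*/*', '*/*']) yields '*/*Zmm*/*',
# which the caller above then normalizes to '/*Zmm*/*'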