def set_misses(self, dasquery, api, genrows):
    """
    Check and adjust DAS records wrt input query. If some of the DAS
    keys are missing, add it with its value to the DAS record.
    """
    # look-up primary key
    prim_key = self.dasmapping.primary_key(self.name, api)
    # Scan all docs and store those whose size is above the MongoDB limit
    # into GridFS
    map_key = self.dasmapping.primary_mapkey(self.name, api)
    genrows = parse2gridfs(self.gfs, map_key, genrows, self.logger)
    spec = dasquery.mongo_query['spec']
    row = next(genrows)
    ddict = DotDict(row)
    keys2adjust = []
    for key in spec.keys():
        val = ddict.get(key)
        if spec[key] != val and key not in keys2adjust:
            keys2adjust.append(key)
    msg = "adjust keys %s" % keys2adjust
    self.logger.debug(msg)
    count = 0
    if keys2adjust:
        # adjust the rows
        for row in yield_rows(row, genrows):
            ddict = DotDict(row)
            pval = ddict.get(map_key)
            if isinstance(pval, dict) and 'error' in pval:
                ddict[map_key] = ''
                ddict.update({prim_key: pval})
            for key in keys2adjust:
                value = spec[key]
                existing_value = ddict.get(key)
                # the way to deal with proximity/pattern/condition results
                if (isinstance(value, str) or isinstance(value, unicode)) \
                        and value.find('*') != -1:
                    # we got a pattern
                    if existing_value:
                        value = existing_value
                elif isinstance(value, dict) or isinstance(value, list):
                    # we got a condition
                    if existing_value:
                        value = existing_value
                    elif isinstance(value, dict) and '$in' in value:
                        # we got a range {'$in': []}
                        value = value['$in']
                    elif isinstance(value, dict) and \
                            '$lte' in value and '$gte' in value:
                        # we got a between range
                        value = [value['$gte'], value['$lte']]
                    else:
                        value = json.dumps(value)
                elif existing_value and value != existing_value:
                    # we got proximity results
                    if 'proximity' in ddict:
                        proximity = DotDict({key: existing_value})
                        ddict['proximity'].update(proximity)
                    else:
                        proximity = DotDict({})
                        proximity[key] = existing_value
                        ddict['proximity'] = proximity
                else:
                    if existing_value:
                        value = existing_value
                ddict[key] = value
            yield ddict
            count += 1
    else:
        yield row
        for row in genrows:
            yield row
            count += 1
    msg = "yield %s rows" % count
    self.logger.debug(msg)
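# A minimal sketch (an assumption, not the original DAS helper) of the
# behavior set_misses relies on from yield_rows: re-chain the first row,
# which was already consumed above to compute keys2adjust, with the rest
# of the incoming generator so that no record is dropped.
def _yield_rows_sketch(first_row, genrows):
    """Yield the already-fetched first row, then every remaining row."""
    yield first_row
    for row in genrows:
        yield row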
def merge_records(self, dasquery):
    """
    Merge DAS records for provided query. We perform the following steps:
    1. get all queries from das.cache by ordering them by primary key
    2. run aggregator function to merge neighbors
    3. insert records into das.merge
    """
    self.logger.debug(dasquery)
    id_list = []
    expire = 9999999999  # future
    # get all API records for given DAS query
    spec = {'qhash': dasquery.qhash, 'query': {'$exists': True}}
    records = self.col.find(spec)
    for row in records:
        # find smallest expire timestamp to be used by aggregator
        if row['das']['expire'] < expire:
            expire = row['das']['expire']
        if row['_id'] not in id_list:
            id_list.append(row['_id'])
    inserted = 0
    lookup_keys = set()
    fields = dasquery.mongo_query.get('fields')
    if not fields:  # Mongo
        fields = []
    for key in fields:
        for pkey in self.mapping.mapkeys(key):
            lookup_keys.add(pkey)
    for pkey in lookup_keys:
        skey = [(pkey, DESCENDING)]
        # lookup all service records
        spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
        if self.verbose:
            nrec = self.col.find(spec).sort(skey).count()
            msg = "merging %s records, for %s key" % (nrec, pkey)
        else:
            msg = "merging records, for %s key" % pkey
        self.logger.debug(msg)
        records = self.col.find(spec).sort(skey)
        # aggregate all records
        agen = aggregator(dasquery, records, expire)
        # diff aggregated records
        gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
        # insert all records into das.merge using bulk insert
        size = self.cache_size
        try:
            while True:
                nres = self.merge.insert(
                    itertools.islice(gen, size), safe=True)
                if nres and isinstance(nres, list):
                    inserted += len(nres)
                else:
                    break
        except InvalidDocument as exp:
            msg = "Caught bson error: " + str(exp)
            self.logger.info(msg)
            records = self.col.find(spec).sort(skey)
            gen = aggregator(dasquery, records, expire)
            genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
            das_dict = {'das': {'expire': expire, 'empty_record': 0,
                                'primary_key': [k for k in lookup_keys],
                                'system': ['gridfs']},
                        'qhash': dasquery.qhash,
                        'cache_id': [], 'das_id': id_list}
            for row in genrows:
                row.update(das_dict)
                self.merge.insert(row, safe=True)
        except InvalidOperation:
            pass
    if inserted:
        self.logdb.insert('merge', {'insert': inserted})
    elif not lookup_keys:  # we get query w/o fields
        pass
    else:  # we didn't merge anything, it is a DB look-up failure
        empty_expire = time.time() + 20  # secs, short enough to expire
        empty_record = {'das': {'expire': empty_expire,
                                'primary_key': list(lookup_keys),
                                'empty_record': 1},
                        'cache_id': [], 'das_id': id_list}
        for key, val in dasquery.mongo_query['spec'].iteritems():
            if key.find('.') == -1:
                empty_record[key] = []
            else:  # it is a compound key, e.g. site.name
                newkey, newval = convert_dot_notation(key, val)
                empty_record[newkey] = adjust_mongo_keyvalue(newval)
        self.merge.insert(empty_record, safe=True)
        # update DAS records (both meta and data ones, by using qhash)
        nval = {'$set': {'das.expire': empty_expire}}
        spec = {'qhash': dasquery.qhash}
        self.col.update(spec, nval, multi=True, safe=True)
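# A hedged sketch of the chunked bulk-insert idiom used above, expressed
# with the pymongo 3 API (insert_many instead of insert(..., safe=True)).
# `coll`, `gen` and `size` are placeholder names; this helper is not part
# of the original module.
import itertools

def _bulk_insert_sketch(coll, gen, size):
    """Insert documents from generator `gen` into `coll` in chunks of `size`."""
    inserted = 0
    while True:
        chunk = list(itertools.islice(gen, size))
        if not chunk:
            break
        res = coll.insert_many(chunk)
        inserted += len(res.inserted_ids)
    return inserted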
def merge_records(self, dasquery, attempt=0):
    """
    Merge DAS records for provided query. We perform the following steps:
    1. get all queries from das.cache by ordering them by primary key
    2. run aggregator function to merge neighbors
    3. insert records into das.merge
    """
    ### TMP for asyncio
    # time.sleep(attempt+3) # pymongo 3.2 don't yet flush in time
    # remove any entries in merge collection for this query
    self.merge.delete_many({'qhash': dasquery.qhash})
    # proceed
    self.logger.debug(dasquery)
    id_list = []
    expire = 9999999999  # future
    # get all API records for given DAS query
    spec = {'qhash': dasquery.qhash,
            'das.expire': {'$gt': time.time()},
            'das.record': record_codes('query_record')}
    records = self.col.find(spec, **PYMONGO_OPTS)
    for row in records:
        # find smallest expire timestamp to be used by aggregator
        rexpire = row.get('das', {}).get('expire', expire)
        if rexpire < expire:
            expire = rexpire
        if row['_id'] not in id_list:
            id_list.append(row['_id'])
    inserted = 0
    lookup_keys = set()
    fields = dasquery.mongo_query.get('fields')
    if not fields:  # Mongo
        fields = []
    for key in fields:
        for pkey in self.mapping.mapkeys(key):
            lookup_keys.add(pkey)
    for pkey in lookup_keys:
        skey = [(pkey, DESCENDING)]
        # lookup all service records
        spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
        if self.verbose:
            nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count()
            msg = "merging %s records, for %s key" % (nrec, pkey)
        else:
            msg = "merging records, for %s key" % pkey
        self.logger.debug(msg)
        # use exhaust=False since we process all records in aggregator
        # and there can be a delay in processing
        records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey)
        # aggregate all records
        agen = aggregator(dasquery, records, expire)
        # diff aggregated records
        gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
        # insert all records into das.merge using bulk insert
        size = self.cache_size
        try:
            res = self.merge.insert_many(gen)
            inserted += len(res.inserted_ids)
        except InvalidDocument as exp:
            print(dastimestamp('DAS WARNING'),
                  'InvalidDocument during merge', str(exp))
            msg = "Caught bson error: " + str(exp)
            self.logger.info(msg)
            records = self.col.find(spec, **PYMONGO_OPTS).sort(skey)
            gen = aggregator(dasquery, records, expire)
            genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
            das_dict = {'das': {'expire': expire,
                                'das.record': record_codes('gridfs_record'),
                                'primary_key': [k for k in lookup_keys],
                                'system': ['gridfs']},
                        'qhash': dasquery.qhash,
                        'cache_id': [], 'das_id': id_list}
            for row in genrows:
                row.update(das_dict)
                self.merge.insert(row)
        except InvalidOperation as exp:
            pass
        except DuplicateKeyError as err:
            print(dastimestamp('DAS WARNING'), 'DuplicateKeyError during merge')
            if not isinstance(gen, list):
                raise err
    status = 'fail'
    if inserted:
        status = 'ok'
    elif not lookup_keys:  # we get query w/o fields
        msg = 'qhash %s, no lookup_keys' % dasquery.qhash
        print(dastimestamp('DAS WARNING'), msg)
        status = 'ok'
    else:  # we didn't merge anything, it is a DB look-up failure
        msg = 'qhash %s, did not insert into das.merge, attempt %s' \
                % (dasquery.qhash, attempt)
        print(dastimestamp('DAS WARNING'), msg)
        empty_expire = etstamp()
        lkeys = list(lookup_keys)
        das = dict(expire=empty_expire, primary_key=lkeys[0],
                   condition_keys=lkeys,
                   instance=dasquery.instance,
                   system=['das'], services=dasquery.services,
                   record=record_codes('empty_record'),
                   ts=time.time(), api=[])
        empty_record = {'das': das, 'qhash': dasquery.qhash,
                        'cache_id': [], 'das_id': id_list}
        for key in lkeys:
            empty_record.update({key.split('.')[0]: []})
        for key, val in dasquery.mongo_query['spec'].items():
            if key.find('.') == -1:
                empty_record[key] = []
            else:  # it is a compound key, e.g. site.name
                newkey, newval = convert_dot_notation(key, val)
                empty_record[newkey] = adjust_mongo_keyvalue(newval)
        self.merge.insert(empty_record)
        # update DAS records (both meta and data ones, by using qhash)
        nval = {'$set': {'das.expire': empty_expire}}
        spec = {'qhash': dasquery.qhash}
        self.col.update_many(spec, nval)
    return status
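# Illustrative, hedged usage sketch of the newer merge_records above:
# `cache` stands for the object that owns these methods and `dasquery`
# for a parsed DAS query; both names are assumptions made for the example.
def _merge_and_fetch_sketch(cache, dasquery):
    """Merge cached API records and return the merged documents, if any."""
    status = cache.merge_records(dasquery)
    if status != 'ok':
        return []
    return list(cache.merge.find({'qhash': dasquery.qhash}))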