Example #1
    def test_aggregator(self):
        """Test aggregator function"""
        # 1 row in results
        dasquery = DASQuery(dict(fields=None, spec={'dataset':'/a/b/c'}))
        qhash = dasquery.qhash
        das  = {'expire': 10, 'primary_key':'vk', 'record': 1,
                'api':'api', 'system':['foo'], 'services':[],
                'condition_keys':['run'], 'instance':None}
        row  = {'run':10, 'das':das, '_id':1, 'das_id':1}
        rows = (row for i in range(0,1))
        result = [r for r in aggregator(dasquery, rows, das['expire'])]
        del result[0]['das']['ts'] # we don't need record timestamp
        expect = [{'run': 10, 'das':das, 'cache_id': [1], 'das_id': [1], 'qhash':qhash}]
        self.assertEqual(result, expect)

        # 2 rows with different values for common key
        rows = []
        row  = {'run':1, 'das':das, '_id':1, 'das_id':1}
        rows.append(row)
        row  = {'run':2, 'das':das, '_id':1, 'das_id':1}
        rows.append(row)
        res  = (r for r in rows)
        result = [r for r in aggregator(dasquery, res, das['expire'])]
        for r in result:
            del r['das']['ts'] # we don't need record timestamp
        expect = [{'run': 1, 'das':das, 'das_id': [1], 'cache_id': [1], 'qhash':qhash}, 
                  {'run': 2, 'das':das, 'das_id': [1], 'cache_id': [1], 'qhash':qhash}]
        self.assertEqual(result, expect)

        # 2 rows with common value for common key
        das  = {'expire': 10, 'primary_key':'run.a', 'record': 1,
                'api': ['api'], 'system':['foo'], 'services':[],
                'condition_keys':['run'], 'instance':None}
        rows = []
        row  = {'run':{'a':1,'b':1}, 'das':das, '_id':1, 'das_id':[1]}
        rows.append(row)
        row  = {'run':{'a':1,'b':2}, 'das':das, '_id':1, 'das_id':[1]}
        rows.append(row)
        res  = (r for r in rows)
        result = [r for r in aggregator(dasquery, res, das['expire'])]
        for r in result:
            del r['das']['ts'] # we don't need record timestamp
        expect = [{'run': [{'a': 1, 'b': 1}, {'a': 1, 'b': 2}], 'das':das,
                   'das_id': [1], 'cache_id': [1], 'qhash':qhash}]
        self.assertEqual(result, expect)
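
The grouping these assertions rely on can be summarized as: walk the (sorted) rows and, whenever consecutive rows carry the same value under the dotted primary key, fold them into one record whose cache_id/das_id become lists. The snippet below is only a minimal sketch of that idea, not the DAS aggregator; simple_aggregate and get_dotted are hypothetical names, and unlike the real function it keeps just the first payload rather than also accumulating the differing sub-documents into a list (as the run.a case above expects) or flattening das_id values that are already lists.

# Minimal sketch of primary-key grouping, NOT the DAS aggregator.
# `simple_aggregate` and `get_dotted` are hypothetical illustration helpers.
def get_dotted(record, path):
    """Follow a dotted path, e.g. 'run.a' -> record['run']['a']."""
    value = record
    for part in path.split('.'):
        value = value.get(part) if isinstance(value, dict) else None
    return value

def simple_aggregate(qhash, rows, expire):
    """Fold consecutive rows that share the primary-key value into one record."""
    merged, last_val = [], None
    for row in rows:
        das = dict(row['das'], expire=expire)          # propagate the common expire
        pval = get_dotted(row, das['primary_key'])     # grouping value, e.g. run.a
        if merged and pval is not None and pval == last_val:
            merged[-1]['cache_id'].append(row['_id'])  # same value: collect ids only
            merged[-1]['das_id'].append(row['das_id'])
        else:
            payload = {k: v for k, v in row.items()
                       if k not in ('das', '_id', 'das_id')}
            merged.append(dict(payload, das=das, qhash=qhash,
                               cache_id=[row['_id']], das_id=[row['das_id']]))
        last_val = pval
    return merged
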
Example #2
 def test_aggregator_duplicates(self):
     """Test aggregator function"""
     dasquery = DASQuery(dict(fields=None, spec={'dataset':'/a/b/c'}))
     qhash = dasquery.qhash
     das  = {'expire': 10, 'primary_key':'run.a', 'empty_record': 0,
             'system':['foo'], 'condition_keys':['run'], 'instance':None}
     rows = []
     row  = {'run':{'a':1,'b':1}, 'das':das, '_id':1, 'das_id':1}
     rows.append(row)
     row  = {'run':{'a':1,'b':1}, 'das':das, '_id':2, 'das_id':2}
     rows.append(row)
     res  = (r for r in rows)
     result = [r for r in aggregator(dasquery, res, das['expire'])]
     for r in result:
         del r['das']['ts'] # we don't need record timestamp
     expect = [{'run': [{'a': 1, 'b': 1}], 'das':das, 'qhash':qhash,
                'das_id': [1, 2], 'cache_id': [1, 2]}]
     self.assertEqual(result, expect)
Example #3
 def test_aggregator_duplicates(self):
     """Test aggregator function"""
     dasquery = DASQuery(dict(fields=None, spec={'dataset': '/a/b/c'}))
     qhash = dasquery.qhash
     das = {
         'expire': 10,
         'primary_key': 'run.a',
         'record': 1,
         'api': ['api'],
         'system': ['foo'],
         'services': [],
         'condition_keys': ['run'],
         'instance': None
     }
     rows = []
     row = {'run': {'a': 1, 'b': 1}, 'das': das, '_id': 1, 'das_id': [1]}
     rows.append(row)
     row = {'run': {'a': 1, 'b': 1}, 'das': das, '_id': 2, 'das_id': [2]}
     rows.append(row)
     res = (r for r in rows)
     result = [r for r in aggregator(dasquery, res, das['expire'])]
     for r in result:
         del r['das']['ts']  # we don't need record timestamp
     expect = [{
         'run': [{
             'a': 1,
             'b': 1
         }, {
             'a': 1,
             'b': 1
         }],
         'das': das,
         'qhash': qhash,
         'das_id': [1, 2],
         'cache_id': [1, 2]
     }]
     self.assertEqual(result, expect)
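
Examples #2 and #3 feed the aggregator two records with identical payloads under the same run.a value but expect different results: #2 collapses them into a single {'a': 1, 'b': 1} entry, while #3 keeps both copies (the das_id/cache_id lists record both source records either way), presumably reflecting different versions of the aggregator. The toggle below is only a hypothetical illustration of that distinction, not DAS code; accumulate is an invented name.

# Hypothetical illustration of value-level de-duplication while merging;
# not taken from DAS. `accumulate` is an invented helper name.
def accumulate(values, new_value, drop_duplicates=True):
    """Append new_value to the list of already-merged payloads."""
    if drop_duplicates and new_value in values:   # dict equality, e.g. {'a': 1, 'b': 1}
        return values                             # identical payload: keep ids only
    return values + [new_value]

With drop_duplicates=True the second identical payload is dropped (Example #2's expectation); with False both copies survive (Example #3's expectation).
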
Example #4
 def merge_records(self, dasquery):
     """
     Merge DAS records for provided query. We perform the following
     steps:
     1. get all queries from das.cache by ordering them by primary key
     2. run the aggregator function to merge neighbors
     3. insert records into das.merge
     """
     self.logger.debug(dasquery)
     id_list = []
     expire  = 9999999999 # future
     # get all API records for given DAS query
     spec    = {'qhash':dasquery.qhash, 'query':{'$exists':True}}
     records = self.col.find(spec)
     for row in records:
         # find smallest expire timestamp to be used by aggregator
         if  row['das']['expire'] < expire:
             expire = row['das']['expire']
         if  row['_id'] not in id_list:
             id_list.append(row['_id'])
     inserted = 0
     lookup_keys = set()
     fields = dasquery.mongo_query.get('fields')
     if  not fields: # Mongo query w/o fields
         fields = []
     for key in fields:
         for pkey in self.mapping.mapkeys(key):
             lookup_keys.add(pkey)
     for pkey in lookup_keys:
         skey = [(pkey, DESCENDING)]
         # lookup all service records
         spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
         if  self.verbose:
             nrec = self.col.find(spec).sort(skey).count()
             msg  = "merging %s records, for %s key" % (nrec, pkey) 
         else:
             msg  = "merging records, for %s key" % pkey
         self.logger.debug(msg)
         records = self.col.find(spec).sort(skey)
         # aggregate all records
         agen = aggregator(dasquery, records, expire)
         # diff aggregated records
         gen  = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
         # insert all records into das.merge using bulk insert
         size = self.cache_size
         try:
             while True:
                 nres = self.merge.insert(\
                     itertools.islice(gen, size), safe=True)
                 if  nres and isinstance(nres, list):
                     inserted += len(nres)
                 else:
                     break
         except InvalidDocument as exp:
             msg = "Caught bson error: " + str(exp)
             self.logger.info(msg)
             records = self.col.find(spec).sort(skey)
             gen = aggregator(dasquery, records, expire)
             genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
             das_dict = {'das':{'expire':expire, 'empty_record': 0,
                     'primary_key':[k for k in lookup_keys],
                     'system': ['gridfs']}, 'qhash':dasquery.qhash,
                     'cache_id':[], 'das_id': id_list}
             for row in genrows:
                 row.update(das_dict)
                 self.merge.insert(row, safe=True)
         except InvalidOperation:
             pass
     if  inserted:
         self.logdb.insert('merge', {'insert': inserted})
     elif  not lookup_keys: # we got a query w/o fields
         pass
     else: # we didn't merge anything, it is a DB look-up failure
         empty_expire = time.time() + 20 # secs, short enough to expire
         empty_record = {'das':{'expire':empty_expire,
                                'primary_key':list(lookup_keys),
                                'empty_record': 1},
                         'cache_id':[], 'das_id': id_list}
         for key, val in dasquery.mongo_query['spec'].iteritems():
             if  key.find('.') == -1:
                 empty_record[key] = []
         else: # it is a compound key, e.g. site.name
                 newkey, newval = convert_dot_notation(key, val)
                 empty_record[newkey] = adjust_mongo_keyvalue(newval)
         self.merge.insert(empty_record, safe=True)
         # update DAS records (both meta and data ones, by using qhash)
         nval = {'$set': {'das.expire':empty_expire}}
         spec = {'qhash':dasquery.qhash}
         self.col.update(spec, nval, multi=True, safe=True)
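
The while loop around itertools.islice above is a chunked bulk insert: the generator gen is consumed self.cache_size records at a time until it is exhausted, at which point old pymongo either returns a non-list result or raises the InvalidOperation that the bare except InvalidOperation: pass handler presumably swallows. A generic, store-agnostic sketch of the same pattern is shown below; bulk_insert and insert_chunk are hypothetical names, with insert_chunk standing in for whatever bulk-insert call the backend offers.

import itertools

# Generic sketch of the chunked bulk-insert pattern; `insert_chunk` is a
# hypothetical stand-in for the backend's bulk-insert call.
def bulk_insert(gen, insert_chunk, size):
    """Consume a generator in fixed-size slices and bulk-insert each slice."""
    inserted = 0
    while True:
        chunk = list(itertools.islice(gen, size))   # take up to `size` records
        if not chunk:                               # generator exhausted
            break
        insert_chunk(chunk)
        inserted += len(chunk)
    return inserted

Materializing each slice with list() makes the termination condition explicit, whereas the original relies on the behaviour of pymongo's insert when handed an empty iterable.
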
Example #5
    def test_aggregator(self):
        """Test aggregator function"""
        # 1 row in results
        dasquery = DASQuery(dict(fields=None, spec={'dataset': '/a/b/c'}))
        qhash = dasquery.qhash
        das = {
            'expire': 10,
            'primary_key': 'vk',
            'record': 1,
            'api': 'api',
            'system': ['foo'],
            'services': [],
            'condition_keys': ['run'],
            'instance': None
        }
        row = {'run': 10, 'das': das, '_id': 1, 'das_id': 1}
        rows = (row for i in range(0, 1))
        result = [r for r in aggregator(dasquery, rows, das['expire'])]
        del result[0]['das']['ts']  # we don't need record timestamp
        expect = [{
            'run': 10,
            'das': das,
            'cache_id': [1],
            'das_id': [1],
            'qhash': qhash
        }]
        self.assertEqual(result, expect)

        # 2 rows with different values for common key
        rows = []
        row = {'run': 1, 'das': das, '_id': 1, 'das_id': 1}
        rows.append(row)
        row = {'run': 2, 'das': das, '_id': 1, 'das_id': 1}
        rows.append(row)
        res = (r for r in rows)
        result = [r for r in aggregator(dasquery, res, das['expire'])]
        for r in result:
            del r['das']['ts']  # we don't need record timestamp
        expect = [{
            'run': 1,
            'das': das,
            'das_id': [1],
            'cache_id': [1],
            'qhash': qhash
        }, {
            'run': 2,
            'das': das,
            'das_id': [1],
            'cache_id': [1],
            'qhash': qhash
        }]
        self.assertEqual(result, expect)

        # 2 rows with common value for common key
        das = {
            'expire': 10,
            'primary_key': 'run.a',
            'record': 1,
            'api': ['api'],
            'system': ['foo'],
            'services': [],
            'condition_keys': ['run'],
            'instance': None
        }
        rows = []
        row = {'run': {'a': 1, 'b': 1}, 'das': das, '_id': 1, 'das_id': [1]}
        rows.append(row)
        row = {'run': {'a': 1, 'b': 2}, 'das': das, '_id': 1, 'das_id': [1]}
        rows.append(row)
        res = (r for r in rows)
        result = [r for r in aggregator(dasquery, res, das['expire'])]
        for r in result:
            del r['das']['ts']  # we don't need record timestamp
        expect = [{
            'run': [{
                'a': 1,
                'b': 1
            }, {
                'a': 1,
                'b': 2
            }],
            'das': das,
            'das_id': [1],
            'cache_id': [1],
            'qhash': qhash
        }]
        self.assertEqual(result, expect)
Example #6
    def merge_records(self, dasquery, attempt=0):
        """
        Merge DAS records for provided query. We perform the following
        steps:
        1. get all queries from das.cache by ordering them by primary key
        2. run the aggregator function to merge neighbors
        3. insert records into das.merge
        """
        ### TMP for asyncio
#         time.sleep(attempt+3) # pymongo 3.2 doesn't yet flush in time

        # remove any entries in merge collection for this query
        self.merge.delete_many({'qhash':dasquery.qhash})
        # proceed
        self.logger.debug(dasquery)
        id_list = []
        expire  = 9999999999 # future
        # get all API records for given DAS query
        spec    = {'qhash':dasquery.qhash,
                   'das.expire':{'$gt':time.time()},
                   'das.record':record_codes('query_record')}
        records = self.col.find(spec, **PYMONGO_OPTS)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            rexpire = row.get('das', {}).get('expire', expire)
            if  rexpire < expire:
                expire = rexpire
            if  row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if  not fields: # Mongo query w/o fields
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if  self.verbose:
                nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count()
                msg  = "merging %s records, for %s key" % (nrec, pkey)
            else:
                msg  = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            # use exhaust=False since we process all records in aggregator
            # and there can be a delay in processing
            records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            size = self.cache_size
            try:
                res = self.merge.insert_many(gen)
                inserted += len(res.inserted_ids)
            except InvalidDocument as exp:
                print(dastimestamp('DAS WARNING'), 'InvalidDocument during merge', str(exp))
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
                records = self.col.find(spec, **PYMONGO_OPTS).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {'das':{'expire':expire,
                        'das.record': record_codes('gridfs_record'),
                        'primary_key':[k for k in lookup_keys],
                        'system': ['gridfs']}, 'qhash':dasquery.qhash,
                        'cache_id':[], 'das_id': id_list}
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert(row)
            except InvalidOperation as exp:
                pass
            except DuplicateKeyError as err:
                print(dastimestamp('DAS WARNING'), 'DuplicateKeyError during merge')
                if  not isinstance(gen, list):
                    raise err
        status = 'fail'
        if  inserted:
            status = 'ok'
        elif  not lookup_keys: # we got a query w/o fields
            msg = 'qhash %s, no lookup_keys' % dasquery.qhash
            print(dastimestamp('DAS WARNING'), msg)
            status = 'ok'
        else: # we didn't merge anything, it is a DB look-up failure
            msg  = 'qhash %s, did not insert into das.merge, attempt %s' \
                    % (dasquery.qhash, attempt)
            print(dastimestamp('DAS WARNING'), msg)
            empty_expire = etstamp()
            lkeys = list(lookup_keys)
            das = dict(expire=empty_expire, primary_key=lkeys[0],
                       condition_keys=lkeys,
                       instance=dasquery.instance,
                       system=['das'], services=dasquery.services,
                       record=record_codes('empty_record'),
                       ts=time.time(), api=[])
            empty_record = {'das':das, 'qhash': dasquery.qhash,
                            'cache_id':[], 'das_id': id_list}
            for key in lkeys:
                empty_record.update({key.split('.')[0]:[]})
            for key, val in dasquery.mongo_query['spec'].items():
                if  key.find('.') == -1:
                    empty_record[key] = []
                else: # it is a compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert(empty_record)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire':empty_expire}}
            spec = {'qhash':dasquery.qhash}
            self.col.update_many(spec, nval)
        return status
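
In the fallback branch above, compound spec keys such as site.name are expanded into nested placeholder documents via convert_dot_notation and adjust_mongo_keyvalue. The toy helper below only illustrates the shape of that expansion; expand_dot_key is a hypothetical stand-in, not the DAS implementation, and the real convert_dot_notation/adjust_mongo_keyvalue pair may differ in detail (for instance in how MongoDB operator values are normalized).

# Toy illustration of dot-notation expansion; `expand_dot_key` is a
# hypothetical stand-in for convert_dot_notation, not the DAS helper.
def expand_dot_key(key, value):
    """Turn ('site.name', 'T1_CH_CERN') into ('site', {'name': 'T1_CH_CERN'})."""
    head, _, rest = key.partition('.')
    nested = value
    for part in reversed(rest.split('.')):
        nested = {part: nested}          # wrap the value level by level, innermost first
    return head, nested

# expand_dot_key('site.name', 'T1_CH_CERN') -> ('site', {'name': 'T1_CH_CERN'})
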
Example #7
    def merge_records(self, dasquery, attempt=0):
        """
        Merge DAS records for provided query. We perform the following
        steps:
        1. get all queries from das.cache by ordering them by primary key
        2. run the aggregator function to merge neighbors
        3. insert records into das.merge
        """
        ### TMP for asyncio
        #         time.sleep(attempt+3) # pymongo 3.2 doesn't yet flush in time

        # remove any entries in merge collection for this query
        self.merge.delete_many({'qhash': dasquery.qhash})
        # proceed
        self.logger.debug(dasquery)
        id_list = []
        expire = 9999999999  # future
        # get all API records for given DAS query
        spec = {
            'qhash': dasquery.qhash,
            'das.expire': {
                '$gt': time.time()
            },
            'das.record': record_codes('query_record')
        }
        records = self.col.find(spec, **PYMONGO_OPTS)
        for row in records:
            # find smallest expire timestamp to be used by aggregator
            rexpire = row.get('das', {}).get('expire', expire)
            if rexpire < expire:
                expire = rexpire
            if row['_id'] not in id_list:
                id_list.append(row['_id'])
        inserted = 0
        lookup_keys = set()
        fields = dasquery.mongo_query.get('fields')
        if not fields:  # Mongo query w/o fields
            fields = []
        for key in fields:
            for pkey in self.mapping.mapkeys(key):
                lookup_keys.add(pkey)
        for pkey in lookup_keys:
            skey = [(pkey, DESCENDING)]
            # lookup all service records
            spec = {'das_id': {'$in': id_list}, 'das.primary_key': pkey}
            if self.verbose:
                nrec = self.col.find(spec, **PYMONGO_OPTS).sort(skey).count()
                msg = "merging %s records, for %s key" % (nrec, pkey)
            else:
                msg = "merging records, for %s key" % pkey
            self.logger.debug(msg)
            # use exhaust=False since we process all records in aggregator
            # and there can be a delay in processing
            records = self.col.find(spec, **PYMONGO_NOEXHAUST).sort(skey)
            # aggregate all records
            agen = aggregator(dasquery, records, expire)
            # diff aggregated records
            gen = das_diff(agen, self.mapping.diff_keys(pkey.split('.')[0]))
            # insert all records into das.merge using bulk insert
            size = self.cache_size
            try:
                res = self.merge.insert_many(gen)
                inserted += len(res.inserted_ids)
            except InvalidDocument as exp:
                print(dastimestamp('DAS WARNING'),
                      'InvalidDocument during merge', str(exp))
                msg = "Caught bson error: " + str(exp)
                self.logger.info(msg)
                records = self.col.find(spec, **PYMONGO_OPTS).sort(skey)
                gen = aggregator(dasquery, records, expire)
                genrows = parse2gridfs(self.gfs, pkey, gen, self.logger)
                das_dict = {
                    'das': {
                        'expire': expire,
                        'das.record': record_codes('gridfs_record'),
                        'primary_key': [k for k in lookup_keys],
                        'system': ['gridfs']
                    },
                    'qhash': dasquery.qhash,
                    'cache_id': [],
                    'das_id': id_list
                }
                for row in genrows:
                    row.update(das_dict)
                    self.merge.insert(row)
            except InvalidOperation as exp:
                pass
            except DuplicateKeyError as err:
                print(dastimestamp('DAS WARNING'),
                      'DuplicateKeyError during merge')
                if not isinstance(gen, list):
                    raise err
        status = 'fail'
        if inserted:
            status = 'ok'
        elif not lookup_keys:  # we got a query w/o fields
            msg = 'qhash %s, no lookup_keys' % dasquery.qhash
            print(dastimestamp('DAS WARNING'), msg)
            status = 'ok'
        else:  # we didn't merge anything, it is a DB look-up failure
            msg  = 'qhash %s, did not insert into das.merge, attempt %s' \
                    % (dasquery.qhash, attempt)
            print(dastimestamp('DAS WARNING'), msg)
            empty_expire = etstamp()
            lkeys = list(lookup_keys)
            das = dict(expire=empty_expire,
                       primary_key=lkeys[0],
                       condition_keys=lkeys,
                       instance=dasquery.instance,
                       system=['das'],
                       services=dasquery.services,
                       record=record_codes('empty_record'),
                       ts=time.time(),
                       api=[])
            empty_record = {
                'das': das,
                'qhash': dasquery.qhash,
                'cache_id': [],
                'das_id': id_list
            }
            for key in lkeys:
                empty_record.update({key.split('.')[0]: []})
            for key, val in dasquery.mongo_query['spec'].items():
                if key.find('.') == -1:
                    empty_record[key] = []
                else:  # it is a compound key, e.g. site.name
                    newkey, newval = convert_dot_notation(key, val)
                    empty_record[newkey] = adjust_mongo_keyvalue(newval)
            self.merge.insert(empty_record)
            # update DAS records (both meta and data ones, by using qhash)
            nval = {'$set': {'das.expire': empty_expire}}
            spec = {'qhash': dasquery.qhash}
            self.col.update_many(spec, nval)
        return status