# Assumed imports for this module (Python 2). The standard-library modules
# below are certain; the project-local names used throughout (opt_path,
# get_config, StorageManager, PhEDExService, DBSService, PopDBService,
# IntelROCCSService, CRABService, get_json, datetime_day, datetime_to_string,
# timestamp_to_datetime, daterange) are assumed to come from the surrounding
# package and are left to its own import statements.
import datetime
import logging
import threading
import unittest
import Queue
from math import log

import numpy as np


class StorageTests(unittest.TestCase):
    """
    A test class for the service classes
    """
    def setUp(self):
        "Set up for test"
        self.config = get_config(path=opt_path, file_name='test.cfg')
        self.storage = StorageManager(config=self.config)
        self.storage.drop_db()

    def tearDown(self):
        "Clean up"
        coll = 'test'
        query = dict()
        self.storage.delete_data(coll=coll, query=query)
        pipeline = list()
        match = {'$match': {}}
        pipeline.append(match)
        expected = list()
        result = self.storage.get_data(coll=coll, pipeline=pipeline)
        self.assertEqual(result, expected)
        self.storage.drop_db()

    #@unittest.skip("Skip Test")
    def test_cache(self):
        "Test storage cache"
        print ""
        phedex = PhEDExService(config=self.config)
        api = 'data'
        params = {'level': 'block',
                  'dataset': '/DoubleElectron/Run2012D-22Jan2013-v1/AOD'}
        expected = '/DoubleElectron/Run2012D-22Jan2013-v1/AOD'
        phedex.fetch(api=api, params=params, cache_only=True, force_cache=True)
        cache_data = self.storage.get_cache(coll='phedex', api=api, params=params)
        try:
            result = cache_data['phedex']['dbs'][0]['dataset'][0]['name']
        except KeyError:
            self.fail('cached PhEDEx document is missing the expected fields')
        else:
            self.assertEqual(result, expected)

    #@unittest.skip("Skip Test")
    def test_data(self):
        "Test general collection manipulation functions"
        coll = 'test'
        # insert
        data = [{'foo': 'bar_1'}, {'foo': 'bar_2'}]
        self.storage.insert_data(coll=coll, data=data)
        # get
        pipeline = list()
        match = {'$match': {'foo': 'bar_2'}}
        pipeline.append(match)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        expected = 'bar_2'
        result = data[0]['foo']
        self.assertEqual(result, expected)
        # update
        query = {'foo': 'bar_1'}
        data = {'$set': {'foo': 'bar_3'}}
        self.storage.update_data(coll=coll, query=query, data=data)
        pipeline = list()
        match = {'$match': {'foo': 'bar_3'}}
        pipeline.append(match)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        expected = 'bar_3'
        result = data[0]['foo']
        self.assertEqual(result, expected)
        # last insert timestamp
        data = [{'foo': 'bar_4'}]
        datetime_1 = datetime.datetime.utcnow().replace(microsecond=0)
        self.storage.insert_data(coll=coll, data=data)
        datetime_2 = self.storage.get_last_insert_time(coll)
        self.assertTrue(datetime_1 <= datetime_2)

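# The tests above and the managers below all query MongoDB the same way: build
# an aggregation pipeline ($match, then $group/$project stages) and hand it to
# StorageManager.get_data(). A minimal sketch of the pattern, using the 'test'
# collection and 'foo' field from test_data(); the helper name is illustrative.
def _all_foo_values(storage, coll='test'):
    """Return the 'foo' value of every document in the collection."""
    pipeline = list()
    pipeline.append({'$match': {}})                       # select all documents
    pipeline.append({'$project': {'foo': 1, '_id': 0}})   # keep only 'foo'
    return [doc['foo'] for doc in storage.get_data(coll=coll, pipeline=pipeline)]
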
class SiteManager(object):
    """
    Keep track of site data
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.intelroccs = IntelROCCSService(self.config)
        self.crab = CRABService(self.config)
        self.storage = StorageManager(self.config)
        self.soft_limit = float(self.config['rocker_board']['soft_limit'])
        self.hard_limit = float(self.config['rocker_board']['hard_limit'])

    def initiate_db(self):
        """
        Initiate site database
        Does exactly the same as update_db
        """
        self.update_db()

    def update_db(self):
        """
        Update site data in database
        Get general data about all sites
        """
        api = 'Detox'
        file_ = 'SitesInfo.txt'
        intelroccs_data = self.intelroccs.fetch(api=api, params=file_, secure=False)
        for site_data in get_json(intelroccs_data, 'data'):
            self.insert_site_data(site_data)

    def insert_site_data(self, site_data):
        """
        Insert site into database
        """
        coll = 'site_data'
        site_name = str(site_data[4])
        site_status = int(site_data[0])
        site_quota = int(site_data[1])*10**3  # quota is given in TB, stored in GB
        query = {'name': site_name}
        data = {'$set': {'name': site_name, 'status': site_status, 'quota_gb': site_quota}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=True)

    def update_cpu(self):
        """
        Update maximum CPU capacity for sites
        """
        active_sites = self.get_active_sites()
        for site_name in active_sites:
            # remove values older than 30 days
            date = datetime.datetime.utcnow() - datetime.timedelta(days=30)
            coll = 'site_data'
            query = {'name': site_name}
            data = {'$pull': {'cpu_data': {'date': {'$lt': date}}}}
            self.storage.update_data(coll=coll, query=query, data=data)
            # get CRAB data about the site
            query = 'GLIDEIN_CMSSite =?= "%s" && CPUs > 0' % (site_name)
            attributes = ['GLIDEIN_CMSSite', 'CPUs']
            ads = self.crab.fetch_cluster_ads(query, attributes=attributes)
            cpus = 0
            for ad in ads:
                cpus += ad['CPUs']
            # insert new data point
            date = datetime.datetime.utcnow()
            query = {'name': site_name}
            data = {'$push': {'cpu_data': {'date': date, 'cpus': cpus}}}
            self.storage.update_data(coll=coll, query=query, data=data)

    def get_active_sites(self):
        """
        Get all sites which are active; this includes sites which are
        not available for replication
        """
        coll = 'site_data'
        pipeline = list()
        match = {'$match': {'status': {'$in': [1, 2]}}}
        pipeline.append(match)
        project = {'$project': {'name': 1, '_id': 0}}
        pipeline.append(project)
        sites_data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return [site_data['name'] for site_data in sites_data]

    def get_available_sites(self):
        """
        Get all sites which are available for replication
        """
        coll = 'site_data'
        pipeline = list()
        match = {'$match': {'status': 1}}
        pipeline.append(match)
        project = {'$project': {'name': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return [site['name'] for site in data]

    def get_performance(self, site_name):
        """
        Maximum number of CPUs divided by the quota in TB
        """
        max_cpus = self.get_max_cpu(site_name)
        quota_gb = self.get_quota(site_name)
        try:
            # convert to float before dividing to avoid Python 2 integer division
            performance = float(max_cpus)/(float(quota_gb)/10**3)
        except ZeroDivisionError:
            performance = 0.0
        return performance

    def get_available_storage(self, site_name):
        """
        Get total AnalysisOps storage (GB) available at the site
        """
        size_gb = self.get_data(site_name)
        quota_gb = self.get_quota(site_name)
        available_gb = max(0, (self.hard_limit*quota_gb) - size_gb)
        return available_gb

    def get_all_available_storage(self):
        """
        Get available storage for all sites
        """
        available_storage = dict()
        available_sites = self.get_available_sites()
        for site_name in available_sites:
            available_storage[site_name] = self.get_available_storage(site_name)
        return available_storage

    def get_over_soft_limit(self, site_name):
        """
        Get the number of GB a site is over the soft limit, i.e. the lower
        limit; negative if the site is below it
        """
        size_gb = self.get_data(site_name)
        quota_gb = self.get_quota(site_name)
        over_gb = size_gb - (self.soft_limit*quota_gb)
        return over_gb

    def get_data(self, site_name):
        """
        Get the amount of data (GB) at the site
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'replicas': site_name}}
        pipeline.append(match)
        group = {'$group': {'_id': None, 'size_bytes': {'$sum': '$size_bytes'}}}
        pipeline.append(group)
        project = {'$project': {'size_bytes': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            size_gb = float(data[0]['size_bytes'])/10**9
        except (IndexError, KeyError):
            # no datasets at the site
            return 0
        return size_gb

    def get_quota(self, site_name):
        """
        Get the AnalysisOps quota (GB) for the site
        """
        coll = 'site_data'
        pipeline = list()
        match = {'$match': {'name': site_name}}
        pipeline.append(match)
        project = {'$project': {'quota_gb': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            quota_gb = data[0]['quota_gb']
        except (IndexError, KeyError):
            quota_gb = 0
        return quota_gb

    def get_max_cpu(self, site_name):
        """
        Get the maximum number of CPUs in the last 30 days at the site
        """
        coll = 'site_data'
        pipeline = list()
        match = {'$match': {'name': site_name}}
        pipeline.append(match)
        unwind = {'$unwind': '$cpu_data'}
        pipeline.append(unwind)
        group = {'$group': {'_id': '$name', 'max_cpus': {'$max': '$cpu_data.cpus'}}}
        pipeline.append(group)
        project = {'$project': {'max_cpus': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            max_cpus = data[0]['max_cpus']
        except (IndexError, KeyError):
            self.logger.warning('Could not get max CPUs for %s', site_name)
            return 0
        return max_cpus

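# A minimal usage sketch, not part of the class: pull the site list, refresh
# the CPU samples, and print the quantities the ranker consumes. It assumes a
# config dict with the 'rocker_board' soft_limit/hard_limit entries used
# above; the helper name is illustrative.
def _site_summary(config):
    """Print performance and available storage for every available site."""
    sites = SiteManager(config=config)
    sites.update_db()   # site names, status and quota from IntelROCCS
    sites.update_cpu()  # 30-day CPU samples from CRAB
    for site_name in sites.get_available_sites():
        print site_name, sites.get_performance(site_name), \
            sites.get_available_storage(site_name)
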
class DatasetManager(object):
    """
    Handle all dataset related data
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.phedex = PhEDExService(self.config)
        self.dbs = DBSService(self.config)
        self.storage = StorageManager(self.config)
        self.sites = SiteManager(self.config)
        self.valid_tiers = config['tools']['valid_tiers'].split(',')
        self.MAX_THREADS = int(config['threading']['max_threads'])

    def initiate_db(self):
        """
        Initiate dataset data in database
        Get general data and popularity data from the beginning
        """
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_dataset_data, args=(i, q))
            worker.daemon = True
            worker.start()
        active_sites = self.sites.get_active_sites()
        api = 'blockreplicas'
        params = [('node', active_sites), ('create_since', 0.0),
                  ('complete', 'y'), ('dist_complete', 'y'),
                  ('group', 'AnalysisOps'), ('show_dataset', 'y')]
        t1 = datetime.datetime.utcnow()
        phedex_data = self.phedex.fetch(api=api, params=params)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Call to PhEDEx took %s', str(td))
        count = 1
        t1 = datetime.datetime.utcnow()
        for dataset_data in get_json(get_json(phedex_data, 'phedex'), 'dataset'):
            q.put((dataset_data, count))
            count += 1
        q.join()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Inserting dataset data took %s', str(td))
        self.logger.info('Done inserting datasets into DB')

    def update_db(self):
        """
        Get datasets currently in AnalysisOps and compare to database
        Deactivate removed datasets and insert new ones
        Update replicas
        """
        # get all datasets in database
        dataset_names = self.get_db_datasets()
        dataset_names = set(dataset_names)
        # get all active sites, only fetch replicas from these
        active_sites = self.sites.get_active_sites()
        api = 'blockreplicas'
        params = [('node', active_sites), ('create_since', 0.0),
                  ('complete', 'y'), ('group', 'AnalysisOps'),
                  ('show_dataset', 'y')]
        t1 = datetime.datetime.utcnow()
        phedex_data = self.phedex.fetch(api=api, params=params)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Call to PhEDEx took %s', str(td))
        current_datasets = set()
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_dataset_data, args=(i, q))
            worker.daemon = True
            worker.start()
        count = 1
        t1 = datetime.datetime.utcnow()
        for dataset_data in get_json(get_json(phedex_data, 'phedex'), 'dataset'):
            dataset_name = get_json(dataset_data, 'name')
            current_datasets.add(dataset_name)
            if dataset_name not in dataset_names:
                # this is a new dataset which needs to be inserted into the database
                q.put((dataset_data, count))
                count += 1
            else:
                # update replicas
                replicas = self.get_replicas(dataset_data)
                coll = 'dataset_data'
                query = {'name': dataset_name}
                data = {'$set': {'replicas': replicas}}
                self.storage.update_data(coll=coll, query=query, data=data)
        q.join()
        deprecated_datasets = dataset_names - current_datasets
        for dataset_name in deprecated_datasets:
            self.remove_dataset(dataset_name)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Updating dataset data took %s', str(td))
        self.logger.info('Done updating datasets in DB')

    def insert_dataset_data(self, i, q):
        """
        Worker: insert a new dataset into the database and initiate all data
        """
        while True:
            data = q.get()
            dataset_data = data[0]
            count = data[1]
            self.logger.debug('Inserting dataset number %d', count)
            dataset_name = get_json(dataset_data, 'name')
            coll = 'dataset_data'
            query = {'name': dataset_name}
            data = {'$set': {'name': dataset_name}}
            self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
            try:
                self.insert_phedex_data(dataset_name)
                self.insert_dbs_data(dataset_name)
                replicas = self.get_replicas(dataset_data)
                query = {'name': dataset_name}
                data = {'$set': {'name': dataset_name, 'replicas': replicas}}
                self.storage.update_data(coll=coll, query=query, data=data)
            except Exception:
                # incomplete PhEDEx/DBS data, drop the half-inserted document
                query = {'name': dataset_name}
                self.storage.delete_data(coll=coll, query=query)
            q.task_done()

    def insert_phedex_data(self, dataset_name):
        """
        Fetch PhEDEx data about the dataset and insert it into the database
        """
        api = 'data'
        params = {'dataset': dataset_name, 'level': 'block', 'create_since': 0.0}
        phedex_data = self.phedex.fetch(api=api, params=params)
        size_bytes = 0
        n_files = 0
        dataset_data = get_json(get_json(get_json(phedex_data, 'phedex'), 'dbs')[0], 'dataset')[0]
        for block_data in get_json(dataset_data, 'block'):
            size_bytes += get_json(block_data, 'bytes')
            n_files += get_json(block_data, 'files')
        coll = 'dataset_data'
        query = {'name': dataset_name}
        data = {'$set': {'size_bytes': size_bytes, 'n_files': n_files}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=False)

    def insert_dbs_data(self, dataset_name):
        """
        Fetch DBS data about the dataset and insert it into the database
        """
        api = 'datasets'
        params = {'dataset': dataset_name, 'detail': True, 'dataset_access_type': '*'}
        dbs_data = self.dbs.fetch(api=api, params=params)
        dataset_data = get_json(dbs_data, 'data')[0]
        ds_name = get_json(dataset_data, 'primary_ds_name')
        physics_group = get_json(dataset_data, 'physics_group_name')
        data_tier = get_json(dataset_data, 'data_tier_name')
        creation_date = datetime_day(timestamp_to_datetime(get_json(dataset_data, 'creation_date')))
        ds_type = get_json(dataset_data, 'primary_ds_type')
        coll = 'dataset_data'
        query = {'name': dataset_name}
        data = {'$set': {'ds_name': ds_name, 'physics_group': physics_group,
                         'data_tier': data_tier, 'creation_date': creation_date,
                         'ds_type': ds_type}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=False)

    def get_replicas(self, dataset_data):
        """
        Get all complete replicas of a dataset: a site counts only if the
        files it holds across all blocks add up to the dataset's file count
        """
        replicas_check = dict()
        dataset_name = get_json(dataset_data, 'name')
        for block_data in get_json(dataset_data, 'block'):
            for replica_data in get_json(block_data, 'replica'):
                node = get_json(replica_data, 'node')
                files = get_json(replica_data, 'files')
                replicas_check[node] = replicas_check.get(node, 0) + files
        replicas = list()
        n_files = self.get_n_files(dataset_name)
        for site, site_files in replicas_check.items():
            if site_files == n_files:
                replicas.append(site)
        return replicas

    def get_db_datasets(self):
        """
        Get all datasets currently in the database
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'data_tier': {'$in': self.valid_tiers}}}
        pipeline.append(match)
        project = {'$project': {'name': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        dataset_names = [dataset_data['name'] for dataset_data in data]
        return dataset_names

    def get_removed_db_datasets(self):
        """
        Get all datasets currently in the database with more than one replica
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'data_tier': {'$in': self.valid_tiers}}}
        pipeline.append(match)
        group = {'$group': {'_id': '$name', 'n_replicas': {'$first': {'$size': '$replicas'}}}}
        pipeline.append(group)
        project = {'$project': {'name': '$_id', 'n_replicas': 1, '_id': 0}}
        pipeline.append(project)
        match = {'$match': {'n_replicas': {'$gt': 1}}}
        pipeline.append(match)
        project = {'$project': {'name': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        dataset_names = [dataset_data['name'] for dataset_data in data]
        return dataset_names

    def remove_dataset(self, dataset_name):
        """
        Remove dataset from database
        """
        coll = 'dataset_data'
        query = {'name': dataset_name}
        self.storage.delete_data(coll=coll, query=query)

    def get_dataset_features(self, dataset_name):
        """
        Get dataset features for the dataset from the database
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name': dataset_name}}
        pipeline.append(match)
        project = {'$project': {'dataset_name': '$name',
                                'size_gb': {'$multiply': ['$size_bytes', 0.000000001]},
                                'n_files': 1, 'physics_group': 1, 'ds_type': 1,
                                'data_tier': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return data[0]

    def get_n_files(self, dataset_name):
        """
        Get the number of files in the dataset
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name': dataset_name}}
        pipeline.append(match)
        project = {'$project': {'n_files': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return data[0]['n_files']

    def get_data_tiers(self, dataset_names):
        """
        Get the data tiers of all datasets
        """
        dataset_tiers = dict()
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name': {'$in': dataset_names}}}
        pipeline.append(match)
        project = {'$project': {'data_tier': 1, 'name': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        for dataset in data:
            dataset_tiers[dataset['name']] = dataset['data_tier']
        return dataset_tiers

    def get_sites(self, dataset_name):
        """
        Get all sites with a replica of the dataset
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name': dataset_name}}
        pipeline.append(match)
        project = {'$project': {'replicas': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        site_names = data[0]['replicas']
        return site_names

    def get_size(self, dataset_name):
        """
        Get size in GB of the dataset
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name': dataset_name}}
        pipeline.append(match)
        project = {'$project': {'size_bytes': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        size_gb = float(data[0]['size_bytes'])/10**9
        return size_gb

    def get_current_num_replicas(self):
        """
        Get the current number of replicas for all datasets
        """
        datasets = self.get_db_datasets()
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name': {'$in': datasets}}}
        pipeline.append(match)
        group = {'$group': {'_id': '$name', 'n_replicas': {'$first': {'$size': '$replicas'}}}}
        pipeline.append(group)
        project = {'$project': {'name': '$_id', 'n_replicas': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return data

    def get_num_replicas(self, dataset_name):
        """
        Get the current number of replicas for one dataset
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name': dataset_name}}
        pipeline.append(match)
        group = {'$group': {'_id': '$name', 'n_replicas': {'$first': {'$size': '$replicas'}}}}
        pipeline.append(group)
        project = {'$project': {'n_replicas': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return data[0]['n_replicas']

    def update_replicas(self, subscriptions, deletions):
        """
        Manually update the replicas based on subscriptions and deletions
        """
        coll = 'dataset_data'
        for subscription in subscriptions:
            dataset_name = subscription[0]
            site_name = subscription[1]
            query = {'name': dataset_name}
            data = {'$push': {'replicas': site_name}}
            self.storage.update_data(coll=coll, query=query, data=data)
        for deletion in deletions:
            dataset_name = deletion[0]
            site_name = deletion[1]
            query = {'name': dataset_name}
            data = {'$pull': {'replicas': site_name}}
            self.storage.update_data(coll=coll, query=query, data=data)

    def get_total_size(self, dataset_name):
        """
        Get the total size (GB) of the dataset summed over all its replicas
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name': dataset_name}}
        pipeline.append(match)
        group = {'$group': {'_id': '$name', 'n_replicas': {'$first': {'$size': '$replicas'}}}}
        pipeline.append(group)
        project = {'$project': {'n_replicas': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        n_replicas = data[0]['n_replicas']
        size_gb = self.get_size(dataset_name)
        total_size = n_replicas*size_gb
        return total_size

    def get_all_dataset_size(self, dataset_names):
        """
        Get the total size (TB) of each dataset summed over all its replicas
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name': {'$in': dataset_names}}}
        pipeline.append(match)
        group = {'$group': {'_id': '$name', 'size_bytes': {'$sum': '$size_bytes'},
                            'n_replicas': {'$first': {'$size': '$replicas'}}}}
        pipeline.append(group)
        project = {'$project': {'size_bytes': 1, 'n_replicas': 1, '_id': 1}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        sizes = dict()
        for dataset in data:
            sizes[dataset['_id']] = (float(dataset['size_bytes'])/10**12)*dataset['n_replicas']
        return sizes

    def get_all_site_size(self, site_names):
        """
        Get the total storage (TB) used at each site
        """
        sites_sizes = dict()
        for site_name in site_names:
            # get all datasets with a replica at the site
            coll = 'dataset_data'
            pipeline = list()
            match = {'$match': {'replicas': site_name}}
            pipeline.append(match)
            project = {'$project': {'name': 1, '_id': 0}}
            pipeline.append(project)
            data = self.storage.get_data(coll=coll, pipeline=pipeline)
            dataset_names = [dataset_data['name'] for dataset_data in data]
            # sum the size of those datasets weighted by number of replicas
            coll = 'dataset_data'
            pipeline = list()
            match = {'$match': {'name': {'$in': dataset_names}}}
            pipeline.append(match)
            group = {'$group': {'_id': '$name', 'size_bytes': {'$sum': '$size_bytes'},
                                'n_replicas': {'$first': {'$size': '$replicas'}}}}
            pipeline.append(group)
            project = {'$project': {'size_bytes': 1, 'n_replicas': 1, '_id': 1}}
            pipeline.append(project)
            data = self.storage.get_data(coll=coll, pipeline=pipeline)
            size = 0.0
            for dataset in data:
                size += (float(dataset['size_bytes'])/10**12)*dataset['n_replicas']
            sites_sizes[site_name] = size
        return sites_sizes

    def get_total_storage(self):
        """
        Get the total storage (GB) used in the system
        """
        datasets = self.get_removed_db_datasets()
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name': {'$in': datasets}}}
        pipeline.append(match)
        group = {'$group': {'_id': '$name', 'size_bytes': {'$sum': '$size_bytes'},
                            'n_replicas': {'$first': {'$size': '$replicas'}}}}
        pipeline.append(group)
        project = {'$project': {'name': '$_id', 'size_bytes': 1, 'n_replicas': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        total_storage = 0.0
        for dataset in data:
            total_storage += (float(dataset['size_bytes'])/10**9)*dataset['n_replicas']
        return total_storage

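# A pure-function restatement of the completeness rule in get_replicas()
# above: a site counts as holding a replica only when the files it stores
# across all blocks add up to the dataset's total file count. The helper name
# and sample data are illustrative.
def _complete_replicas(block_replicas, n_files):
    """block_replicas: iterable of (site, files) pairs, one per block replica."""
    files_per_site = dict()
    for site, files in block_replicas:
        files_per_site[site] = files_per_site.get(site, 0) + files
    return [site for site, total in files_per_site.items() if total == n_files]

# Example: site A holds both blocks (3 + 2 files), site B only the first one,
# so _complete_replicas([('A', 3), ('A', 2), ('B', 3)], 5) returns ['A'].
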
class PopularityManager(object):
    """
    Generate popularity metrics for datasets and sites
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.pop_db = PopDBService(self.config)
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.storage = StorageManager(self.config)
        self.MAX_THREADS = int(config['threading']['max_threads'])

    def initiate_db(self):
        """
        Collect popularity data for the last 90 days
        """
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_popularity_data, args=(i, q))
            worker.daemon = True
            worker.start()
        start_date = datetime_day(datetime.datetime.utcnow() - datetime.timedelta(days=90))
        end_date = datetime_day(datetime.datetime.utcnow())
        # fetch popularity data
        t1 = datetime.datetime.utcnow()
        for date in daterange(start_date, end_date):
            q.put(date)
        q.join()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Inserting Pop DB data took %s', str(td))

    def update_db(self):
        """
        Fetch the latest popularity data not yet in the database
        """
        # get the most recent date in the database
        coll = 'dataset_popularity'
        pipeline = list()
        sort = {'$sort': {'date': -1}}
        pipeline.append(sort)
        limit = {'$limit': 1}
        pipeline.append(limit)
        project = {'$project': {'date': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            start_date = data[0]['date']
        except (IndexError, KeyError):
            self.logger.warning('Popularity needs to be initiated')
            self.initiate_db()
            return
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_popularity_data, args=(i, q))
            worker.daemon = True
            worker.start()
        start_date = start_date + datetime.timedelta(days=1)
        end_date = datetime_day(datetime.datetime.utcnow())
        # fetch popularity data
        t1 = datetime.datetime.utcnow()
        for date in daterange(start_date, end_date):
            q.put(date)
        q.join()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Updating Pop DB data took %s', str(td))

    def insert_popularity_data(self, i, q):
        """
        Worker: insert popularity data for one day into the database
        """
        coll = 'dataset_popularity'
        while True:
            date = q.get()
            self.logger.info('Inserting date %s', datetime_to_string(date))
            api = 'DSStatInTimeWindow/'
            tstart = datetime_to_string(date)
            tstop = tstart
            params = {'sitename': 'summary', 'tstart': tstart, 'tstop': tstop}
            json_data = self.pop_db.fetch(api=api, params=params)
            # store one document per dataset and day for easy fetching
            for dataset in json_data['DATA']:
                dataset_name = dataset['COLLNAME']
                popularity_data = {'name': dataset_name, 'date': date}
                popularity_data['n_accesses'] = dataset['NACC']
                popularity_data['n_cpus'] = dataset['TOTCPU']
                popularity_data['n_users'] = dataset['NUSERS']
                query = {'name': dataset_name, 'date': date}
                data = {'$set': popularity_data}
                self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
            q.task_done()

    def get_average_popularity(self, dataset_name, date):
        """
        Get the average popularity of a dataset over the week before date
        """
        start_date = date - datetime.timedelta(days=7)
        end_date = date - datetime.timedelta(days=1)
        coll = 'dataset_popularity'
        pipeline = list()
        match = {'$match': {'name': dataset_name}}
        pipeline.append(match)
        match = {'$match': {'date': {'$gte': start_date, '$lte': end_date}}}
        pipeline.append(match)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        pops = list()
        for i in range(0, 7):
            try:
                pops.append(log(float(data[i]['n_accesses']*data[i]['n_cpus'])))
            except (IndexError, KeyError, ValueError):
                # day missing from Pop DB, or zero accesses/CPU hours
                pops.append(0.0)
        avg = np.mean(pops)
        return avg

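# The 7-day average in get_average_popularity() reduces to this pure function:
# each day contributes log(n_accesses * n_cpus), and a day that is missing
# from Pop DB or has zero activity contributes 0.0. The helper name and the
# sample numbers are illustrative.
def _average_log_popularity(daily_counts):
    """daily_counts: up to seven (n_accesses, n_cpus) tuples, one per day."""
    pops = list()
    for i in range(7):
        try:
            n_accesses, n_cpus = daily_counts[i]
            pops.append(log(float(n_accesses*n_cpus)))
        except (IndexError, ValueError):
            pops.append(0.0)  # missing day, or log(0) for zero activity
    return np.mean(pops)

# Example: two active days out of seven
# _average_log_popularity([(10, 100), (5, 20)])
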
class Ranker(object):
    """
    Generic ranking class
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.popularity = PopularityManager(self.config)
        self.storage = StorageManager(self.config)
        self.max_replicas = int(config['rocker_board']['max_replicas'])
        self.MAX_THREADS = int(config['threading']['max_threads'])
        self.dataset_popularity = dict()

    def get_dataset_rankings(self, date=None):
        """
        Generate dataset rankings
        """
        # a default argument would be evaluated only once, at import time,
        # so today's date is resolved inside the call instead
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        self.dataset_popularity = dict()
        dataset_names = self.datasets.get_db_datasets()
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.get_dataset_popularity, args=(q,))
            worker.daemon = True
            worker.start()
        for dataset_name in dataset_names:
            q.put((dataset_name, date))
        q.join()
        dataset_rankings = self.normalize_popularity(date)
        return dataset_rankings

    def get_site_rankings(self, date=None):
        """
        Generate site rankings
        """
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        # get all sites which can be replicated to
        site_names = self.sites.get_available_sites()
        site_rankings = dict()
        for site_name in site_names:
            # get popularity
            popularity = self.get_site_popularity(site_name, date)
            # get cpu and storage (performance)
            performance = self.sites.get_performance(site_name)
            # available storage reduced to a binary has-space flag
            available_storage_tb = self.sites.get_available_storage(site_name)/10**3
            if available_storage_tb <= 0:
                available_storage_tb = 0
            else:
                available_storage_tb = 1
            # calculate rank
            try:
                rank = (performance*available_storage_tb)/popularity
            except ZeroDivisionError:
                rank = 0.0
            # store into dict
            site_rankings[site_name] = rank
            # insert into database
            coll = 'site_rankings'
            query = {'name': site_name, 'date': date}
            data = {'$set': {'name': site_name, 'date': date,
                             'rank': rank, 'popularity': popularity}}
            self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
        return site_rankings

    def get_dataset_popularity(self, q):
        """
        Worker: get the estimated popularity for a dataset
        """
        while True:
            data = q.get()
            dataset_name = data[0]
            date = data[1]
            # average popularity over the previous week
            popularity = self.popularity.get_average_popularity(dataset_name, date)
            self.dataset_popularity[dataset_name] = popularity
            q.task_done()

    def get_site_popularity(self, site_name, date=None):
        """
        Get popularity for a site
        """
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        # get all datasets with a replica at the site
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'replicas': site_name}}
        pipeline.append(match)
        project = {'$project': {'name': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        dataset_names = [dataset_data['name'] for dataset_data in data]
        # sum the popularity of those datasets for the given date
        coll = 'dataset_rankings'
        pipeline = list()
        match = {'$match': {'date': date}}
        pipeline.append(match)
        match = {'$match': {'name': {'$in': dataset_names}}}
        pipeline.append(match)
        group = {'$group': {'_id': '$date', 'total_popularity': {'$sum': '$popularity'}}}
        pipeline.append(group)
        project = {'$project': {'total_popularity': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            popularity = data[0]['total_popularity']
        except (IndexError, KeyError):
            popularity = 0.0
        return popularity

    def get_site_storage_rankings(self, subscriptions):
        """
        Return how far over the soft limit each site is, including new
        subscriptions; sites not over the limit are removed from the result
        """
        site_rankings = dict()
        available_sites = self.sites.get_available_sites()
        for site_name in available_sites:
            site_rankings[site_name] = self.sites.get_over_soft_limit(site_name)
        for subscription in subscriptions:
            site_rankings[subscription[1]] += self.datasets.get_size(subscription[0])
        for site_name in available_sites:
            if site_rankings[site_name] < 0:
                del site_rankings[site_name]
        return site_rankings

    def normalize_popularity(self, date):
        """
        Normalize popularity values to be between 1 and max_replicas
        """
        dataset_rankings = dict()
        max_pop = max(self.dataset_popularity.itervalues())
        min_pop = min(self.dataset_popularity.itervalues())
        # linear map sending min_pop to 1 and max_pop to (roughly) max_replicas
        n = float(min_pop + (self.max_replicas - 1))/max_pop
        m = 1 - n*min_pop
        for dataset_name, popularity in self.dataset_popularity.items():
            rank = int(n*popularity + m)
            dataset_rankings[dataset_name] = rank
            coll = 'dataset_rankings'
            query = {'name': dataset_name, 'date': date}
            data = {'$set': {'name': dataset_name, 'date': date,
                             'rank': rank, 'popularity': popularity}}
            self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
        return dataset_rankings

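# A quick check of the linear map used in normalize_popularity() above:
# rank = n*popularity + m, with n = (min_pop + max_replicas - 1)/max_pop and
# m = 1 - n*min_pop, sends min_pop to exactly 1 and max_pop to
# max_replicas + min_pop*(1 - n), i.e. to exactly max_replicas when min_pop
# is 0. The numbers below are illustrative.
def _check_popularity_normalization():
    min_pop, max_pop, max_replicas = 0.0, 8.0, 4
    n = float(min_pop + (max_replicas - 1))/max_pop
    m = 1 - n*min_pop
    assert int(n*min_pop + m) == 1             # least popular -> 1 replica
    assert int(n*max_pop + m) == max_replicas  # most popular -> max_replicas
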