class DataAnalysis(object):
    """
    Data Analysis collects data and exports it for use by visualization
    software to better understand access patterns
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.datasets = DatasetManager(self.config)
        self.sites = SiteManager(self.config)
        self.storage = StorageManager(self.config)
        self.popularity = PopularityManager(self.config)

    def start(self):
        """
        Begin Data Analysis
        """
        t1 = datetime.datetime.utcnow()
        dataset_name = '/PAHighPt/HIRun2013-PromptReco-v1/RECO'
        self.initiate_data(dataset_name)
        self.export_data(dataset_name)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Data Analysis took %s', str(td))

    def initiate_data(self, dataset_name):
        """
        Initiate data about dataset(s)
        """
        coll = 'dataset_data'
        query = {'name':dataset_name}
        data = {'$set':{'name':dataset_name}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
        self.datasets.insert_phedex_data(dataset_name)
        self.datasets.insert_dbs_data(dataset_name)
        self.popularity.insert_dataset(dataset_name)

    def export_data(self, dataset_name):
        """
        Get data from DB and export to file for usage in visualization
        """
        # get data from DB
        coll = 'dataset_popularity'
        pipeline = list()
        match = {'$match':{'name':dataset_name}}
        pipeline.append(match)
        db_data = self.storage.get_data(coll=coll, pipeline=pipeline)
        headers = ('dataset_name', 'date', 'popularity')
        data = list()
        for data_entry in db_data:
            # build (name, date, popularity) rows; a tuple literal is used here,
            # tuple() with three arguments would raise a TypeError
            data.append((data_entry['name'], data_entry['date'], data_entry['n_accesses']*data_entry['n_cpus']*data_entry['n_users']))
        export_csv(headers=headers, data=data, file_name='single_dataset')
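# export_data() relies on an export_csv helper that is defined elsewhere in the
# project. The sketch below shows one plausible implementation; the output
# location, file suffix and binary write mode are assumptions, not the
# project's actual behaviour.
import csv

def export_csv(headers=tuple(), data=list(), file_name='data'):
    """Write rows of tuples to <file_name>.csv with a header row"""
    with open('%s.csv' % file_name, 'wb') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(headers)
        for row in data:
            writer.writerow(row)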
class GenericService(object):
    """
    Generic cuadrnt service class
    Shared properties between services:
        Contact a web service using a base url and some key:value parameters
        Services require a valid cert and key
        Want to cache results in a document-oriented database
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.storage = StorageManager(self.config)
        self.SERVICE = 'generic'
        self.TARGET_URL = ''

    def fetch(self, api, params=dict(), method='get', secure=True, cache=True, cache_only=False, force_cache=False):
        """
        Get data from url using parameters params
        If param cache is true update cache on cache miss
        If param cache_only is true just update the cache, don't return any data.
        Use this parameter to spawn external thread to update cache in background
        """
        if cache:
            json_data = dict()
            if not force_cache:
                json_data = self.storage.get_cache(self.SERVICE, api, params)
            if not json_data:
                if secure:
                    json_data = get_secure_data(target_url=self.TARGET_URL, api=api, params=params, method=method)
                else:
                    json_data = get_data(target_url=self.TARGET_URL, api=api, file_=params)
                if type(json_data) is not dict:
                    json_data = {'data':json_data}
                self.storage.insert_cache(self.SERVICE, api, params, json_data)
            if not cache_only:
                return json_data
        else:
            if secure:
                json_data = get_secure_data(target_url=self.TARGET_URL, api=api, params=params, method=method)
            else:
                json_data = get_data(target_url=self.TARGET_URL, api=api, file_=params)
            if type(json_data) is not dict:
                json_data = {'data':json_data}
            return json_data
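# GenericService is meant to be subclassed; concrete services only override
# SERVICE and TARGET_URL in __init__ and inherit the cached fetch(). A minimal
# sketch of such a subclass is shown below -- the class name, config section
# and URL key are illustrative assumptions, not the project's actual services.
class ExampleService(GenericService):
    """
    Illustrative service subclass built on GenericService
    """
    def __init__(self, config=dict()):
        super(ExampleService, self).__init__(config)
        self.SERVICE = 'example'
        # target url read from the config file, e.g. [example] target_url=...
        self.TARGET_URL = str(self.config['example']['target_url'])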
class MiniAODAnalysis(object):
    """
    MiniAOD Analysis collects data and prints it to be used by visualization
    software to better understand access patterns
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.storage = StorageManager(self.config)
        #self.phedex = PhEDExService(self.config)

    def start(self):
        """
        Begin MiniAOD Analysis
        """
        t1 = datetime.datetime.utcnow()
        n_datasets, size_all = self.get_n_datasets()
        print n_datasets
        print size_all
        #n_multiple, size_multiple = self.get_multiple_sites()
        #n_replicas = self.get_n_replicas()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('MINIAOD Analysis took %s', str(td))

    def get_n_datasets(self):
        """
        Collect the total number and size of MINIAOD[SIM] datasets
        """
        regex = re.compile(".*MINIAOD(SIM)?$")
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'name':{'$regex':regex}}}
        pipeline.append(match)
        group = {'$group':{'_id':None, 'count':{'$sum':1}, 'size_bytes':{'$sum':'$size_bytes'}}}
        pipeline.append(group)
        data = self.storage.get_data(coll, pipeline)
        n_datasets = data[0]['count']
        size_gb = data[0]['size_bytes']/10**9
        return n_datasets, size_gb
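# The regex in get_n_datasets() selects dataset names whose data tier is
# MINIAOD or MINIAODSIM, e.g. a name ending in '/MINIAOD' matches while one
# ending in '/AOD' does not (the example names are illustrative only).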
class StorageTests(unittest.TestCase):
    """
    A test class for service classes
    """
    def setUp(self):
        "Set up for test"
        self.config = get_config(path=opt_path, file_name='test.cfg')
        self.storage = StorageManager(config=self.config)
        self.storage.drop_db()

    def tearDown(self):
        "Clean up"
        coll = 'test'
        query = dict()
        self.storage.delete_data(coll=coll, query=query)
        pipeline = list()
        match = {'$match':{}}
        pipeline.append(match)
        expected = list()
        result = self.storage.get_data(coll=coll, pipeline=pipeline)
        self.assertEqual(result, expected)
        self.storage.drop_db()

    #@unittest.skip("Skip Test")
    def test_cache(self):
        "Test storage cache"
        print ""
        phedex = PhEDExService(config=self.config)
        api = 'data'
        params = {'level':'block', 'dataset':'/DoubleElectron/Run2012D-22Jan2013-v1/AOD'}
        expected = '/DoubleElectron/Run2012D-22Jan2013-v1/AOD'
        phedex.fetch(api=api, params=params, cache_only=True, force_cache=True)
        cache_data = self.storage.get_cache(coll='phedex', api=api, params=params)
        try:
            result = cache_data['phedex']['dbs'][0]['dataset'][0]['name']
        except KeyError:
            self.assertTrue(False)
        else:
            self.assertEqual(result, expected)

    #@unittest.skip("Skip Test")
    def test_data(self):
        "Test general collection manipulation functions"
        coll = 'test'
        # insert
        data = [{'foo':'bar_1'}, {'foo':'bar_2'}]
        self.storage.insert_data(coll=coll, data=data)
        # get
        pipeline = list()
        match = {'$match':{'foo':'bar_2'}}
        pipeline.append(match)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        expected = 'bar_2'
        result = data[0]['foo']
        self.assertEqual(result, expected)
        # update
        query = {'foo':'bar_1'}
        data = {'$set':{'foo':'bar_3'}}
        self.storage.update_data(coll=coll, query=query, data=data)
        pipeline = list()
        match = {'$match':{'foo':'bar_3'}}
        pipeline.append(match)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        expected = 'bar_3'
        result = data[0]['foo']
        self.assertEqual(result, expected)
        # last insert timestamp
        data = [{'foo':'bar_4'}]
        datetime_1 = datetime.utcnow().replace(microsecond=0)
        self.storage.insert_data(coll=coll, data=data)
        datetime_2 = self.storage.get_last_insert_time(coll)
        self.assertTrue(datetime_1 <= datetime_2)
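# setUp() relies on a get_config utility that loads test.cfg from opt_path.
# A hedged sketch is given below, assuming a plain ConfigParser file of
# [section] key=value pairs; the real project may type-convert values or use
# different defaults.
import os
import ConfigParser

def get_config(path, file_name):
    """Read a config file into a nested dict of sections and options"""
    config = dict()
    parser = ConfigParser.SafeConfigParser()
    parser.read(os.path.join(path, file_name))
    for section in parser.sections():
        config[section] = dict(parser.items(section))
    return config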
class RockerBoard(object):
    """
    RockerBoard is a system balancing algorithm using popularity metrics
    to predict popularity and make appropriate replications to keep the
    system balanced
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.phedex = PhEDExService(self.config)
        self.mit_db = MITDBService(self.config)
        self.datasets = DatasetManager(self.config)
        self.sites = SiteManager(self.config)
        self.storage = StorageManager(self.config)
        self.rankings = DeltaRanking(self.config)
        self.max_gb = int(self.config['rocker_board']['max_gb'])
        self.min_rank = float(self.config['rocker_board']['min_rank'])

    def start(self):
        """
        Begin Rocker Board Algorithm
        """
        t1 = datetime.datetime.utcnow()
        subscriptions = self.balance()
        for subscription in subscriptions:
            self.logger.info('site: %s\tdataset: %s', subscription[1], subscription[0])
        self.subscribe(subscriptions)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Rocker Board took %s', str(td))

    def balance(self):
        """
        Balance system by creating new replicas based on popularity
        """
        subscriptions = list()
        dataset_rankings = self.rankings.dataset_rankings()
        site_rankings = self.rankings.site_rankings()
        subscribed_gb = 0
        while subscribed_gb < self.max_gb:
            # work on a copy so the deletions below do not alter site_rankings
            # between iterations
            tmp_site_rankings = dict(site_rankings)
            dataset_name = weighted_choice(dataset_rankings)
            if (not dataset_name) or (dataset_rankings[dataset_name] < self.min_rank):
                break
            size_gb = self.datasets.get_size(dataset_name)
            unavailable_sites = set(self.datasets.get_sites(dataset_name))
            for site_name in tmp_site_rankings.keys():
                if (self.sites.get_available_storage(site_name) < size_gb) or (tmp_site_rankings[site_name] <= 0):
                    unavailable_sites.add(site_name)
            for site_name in unavailable_sites:
                try:
                    del tmp_site_rankings[site_name]
                except:
                    continue
            if not tmp_site_rankings:
                break
            site_name = weighted_choice(tmp_site_rankings)
            subscription = (dataset_name, site_name)
            subscriptions.append(subscription)
            subscribed_gb += size_gb
            avail_storage = self.sites.get_available_storage(site_name)
            self.logger.info('rank: %s\tsize: %.2f\tdataset: %s', dataset_rankings[dataset_name], size_gb, dataset_name)
            self.logger.info('rank: %s\tstorage: %d\tsite: %s', site_rankings[site_name], avail_storage, site_name)
            new_avail_storage = avail_storage - self.datasets.get_size(dataset_name)
            if new_avail_storage > 0:
                new_rank = 0.0
            else:
                new_rank = (site_rankings[site_name]/avail_storage)*new_avail_storage
            site_rankings[site_name] = new_rank
            del dataset_rankings[dataset_name]
        self.logger.info('Subscribed %dGB', subscribed_gb)
        return subscriptions

    def subscribe(self, subscriptions):
        """
        Make subscriptions to phedex
        subscriptions = [(dataset_name, site_name), ...]
        """
        new_subscriptions = dict()
        for subscription in subscriptions:
            dataset_name = subscription[0]
            site_name = subscription[1]
            try:
                new_subscriptions[site_name].append(dataset_name)
            except:
                new_subscriptions[site_name] = list()
                new_subscriptions[site_name].append(dataset_name)
        for site_name, dataset_names in new_subscriptions.items():
            data = self.phedex.generate_xml(dataset_names)
            comments = 'This dataset is predicted to become popular and has therefore been automatically replicated by cuadrnt'
            api = 'subscribe'
            params = [('node', site_name), ('data', data), ('level', 'dataset'), ('move', 'n'), ('custodial', 'n'), ('group', 'AnalysisOps'), ('request_only', 'n'), ('no_mail', 'n'), ('comments', comments)]
            json_data = self.phedex.fetch(api=api, params=params, method='post')
            # insert into db
            group_name = 'AnalysisOps'
            request_id = 0
            request_type = 0
            try:
                request = json_data['phedex']
                request_id = request['request_created'][0]['id']
                request_created = timestamp_to_datetime(request['request_timestamp'])
            except:
                self.logger.warning('Subscription did not succeed\n\tSite:%s\n\tDatasets: %s', str(site_name), str(dataset_names))
                continue
            for dataset_name in dataset_names:
                coll = 'dataset_popularity'
                date = datetime_day(datetime.datetime.utcnow())
                pipeline = list()
                match = {'$match':{'name':dataset_name, 'date':date}}
                pipeline.append(match)
                project = {'$project':{'delta_popularity':1, '_id':0}}
                pipeline.append(project)
                data = self.storage.get_data(coll=coll, pipeline=pipeline)
                dataset_rank = data[0]['delta_popularity']
                query = "INSERT INTO Requests(RequestId, RequestType, DatasetId, SiteId, GroupId, Rank, Date) SELECT %s, %s, Datasets.DatasetId, Sites.SiteId, Groups.GroupId, %s, %s FROM Datasets, Sites, Groups WHERE Datasets.DatasetName=%s AND Sites.SiteName=%s AND Groups.GroupName=%s"
                values = (request_id, request_type, dataset_rank, request_created, dataset_name, site_name, group_name)
                self.mit_db.query(query=query, values=values, cache=False)
class DatasetManager(object):
    """
    Handle all dataset related data
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.phedex = PhEDExService(self.config)
        self.dbs = DBSService(self.config)
        self.storage = StorageManager(self.config)
        self.sites = SiteManager(self.config)
        self.MAX_THREADS = int(config['threading']['max_threads'])

    def initiate_db(self):
        """
        Initiate dataset data in database
        Get general data and popularity data from beginning
        """
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_dataset_data, args=(i, q))
            worker.daemon = True
            worker.start()
        active_sites = self.sites.get_active_sites()
        api = 'blockreplicas'
        params = [('node', active_sites), ('create_since', 0.0), ('complete', 'y'), ('dist_complete', 'y'), ('group', 'AnalysisOps'), ('show_dataset', 'y')]
        t1 = datetime.datetime.utcnow()
        phedex_data = self.phedex.fetch(api=api, params=params)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Call to PhEDEx took %s', str(td))
        count = 1
        t1 = datetime.datetime.utcnow()
        for dataset_data in get_json(get_json(phedex_data, 'phedex'), 'dataset'):
            q.put((dataset_data, count))
            count += 1
        q.join()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Inserting PhEDEx data took %s', str(td))
        self.logger.info('Done inserting datasets into DB')

    def update_db(self):
        """
        Get datasets currently in AnalysisOps and compare to database
        Deactivate removed datasets and insert new
        Update replicas
        """
        # get all datasets in database
        dataset_names = self.get_db_datasets()
        dataset_names = set(dataset_names)
        # get all active sites, only fetch replicas from these
        active_sites = self.sites.get_active_sites()
        api = 'blockreplicas'
        params = [('node', active_sites), ('create_since', 0.0), ('complete', 'y'), ('group', 'AnalysisOps'), ('show_dataset', 'y')]
        phedex_data = self.phedex.fetch(api=api, params=params)
        current_datasets = set()
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_dataset_data, args=(i, q))
            worker.daemon = True
            worker.start()
        count = 1
        for dataset_data in get_json(get_json(phedex_data, 'phedex'), 'dataset'):
            dataset_name = get_json(dataset_data, 'name')
            current_datasets.add(dataset_name)
            if dataset_name not in dataset_names:
                # this is a new dataset which needs to be inserted into the database
                q.put((dataset_data, count))
                count += 1
            else:
                # update replicas
                replicas = self.get_replicas(dataset_data)
                coll = 'dataset_data'
                query = {'name':dataset_name}
                data = {'$set':{'replicas':replicas}}
                data = self.storage.update_data(coll=coll, query=query, data=data, upsert=False)
        q.join()
        deprecated_datasets = dataset_names - current_datasets
        for dataset_name in deprecated_datasets:
            self.remove_dataset(dataset_name)

    def insert_dataset_data(self, i, q):
        """
        Insert a new dataset into the database and initiate all data
        """
        while True:
            data = q.get()
            dataset_data = data[0]
            count = data[1]
            self.logger.debug('Inserting dataset number %d', count)
            dataset_name = get_json(dataset_data, 'name')
            replicas = self.get_replicas(dataset_data)
            coll = 'dataset_data'
            query = {'name':dataset_name}
            data = {'$set':{'name':dataset_name, 'replicas':replicas}}
            data = self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
            self.insert_phedex_data(dataset_name)
            self.insert_dbs_data(dataset_name)
            q.task_done()

    def insert_phedex_data(self, dataset_name):
        """
        Fetch phedex data about dataset and insert into database
        """
        api = 'data'
        params = {'dataset':dataset_name, 'level':'block', 'create_since':0.0}
        phedex_data = self.phedex.fetch(api=api, params=params)
        size_bytes = 0
        n_files = 0
        try:
            dataset_data = get_json(get_json(get_json(phedex_data, 'phedex'), 'dbs')[0], 'dataset')[0]
        except:
            coll = 'dataset_data'
            query = {'name':dataset_name}
            self.storage.delete_data(coll=coll, query=query)
            return
        for block_data in get_json(dataset_data, 'block'):
            size_bytes += get_json(block_data, 'bytes')
            n_files += get_json(block_data, 'files')
        coll = 'dataset_data'
        query = {'name':dataset_name}
        data = {'$set':{'size_bytes':size_bytes, 'n_files':n_files}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=False)

    def insert_dbs_data(self, dataset_name):
        """
        Fetch dbs data about dataset and insert into database
        """
        api = 'datasets'
        params = {'dataset':dataset_name, 'detail':True, 'dataset_access_type':'*'}
        dbs_data = self.dbs.fetch(api=api, params=params)
        try:
            dataset_data = get_json(dbs_data, 'data')[0]
        except:
            coll = 'dataset_data'
            query = {'name':dataset_name}
            self.storage.delete_data(coll=coll, query=query)
            return
        ds_name = get_json(dataset_data, 'primary_ds_name')
        physics_group = get_json(dataset_data, 'physics_group_name')
        data_tier = get_json(dataset_data, 'data_tier_name')
        creation_date = datetime_day(timestamp_to_datetime(get_json(dataset_data, 'creation_date')))
        ds_type = get_json(dataset_data, 'primary_ds_type')
        coll = 'dataset_data'
        query = {'name':dataset_name}
        data = {'$set':{'ds_name':ds_name, 'physics_group':physics_group, 'data_tier':data_tier, 'creation_date':creation_date, 'ds_type':ds_type}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=False)

    def get_replicas(self, dataset_data):
        """
        Get all replicas of a dataset
        """
        replicas = list()
        for block_data in get_json(dataset_data, 'block'):
            for replica_data in get_json(block_data, 'replica'):
                if get_json(replica_data, 'files') > 0:
                    replicas.append(get_json(replica_data, 'node'))
        return replicas

    def get_db_datasets(self):
        """
        Get all datasets currently in database
        """
        coll = 'dataset_data'
        pipeline = list()
        project = {'$project':{'name':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        dataset_names = [dataset_data['name'] for dataset_data in data]
        self.logger.info('%d datasets present in database', len(dataset_names))
        return dataset_names

    def remove_dataset(self, dataset_name):
        """
        Remove dataset from database
        """
        coll = 'dataset_data'
        query = {'name':dataset_name}
        self.storage.delete_data(coll=coll, query=query)

    def get_sites(self, dataset_name):
        """
        Get all sites with a replica of the dataset
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'name':dataset_name}}
        pipeline.append(match)
        project = {'$project':{'replicas':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        site_names = data[0]['replicas']
        return site_names

    def get_size(self, dataset_name):
        """
        Get size in GB of dataset
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'name':dataset_name}}
        pipeline.append(match)
        project = {'$project':{'size_bytes':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        size_gb = float(data[0]['size_bytes'])/10**9
        return size_gb
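# insert_dbs_data() and RockerBoard.subscribe() use two small date utilities,
# timestamp_to_datetime and datetime_day, defined elsewhere in the project.
# Hedged sketches of what they plausibly do are shown below; the real helpers
# may handle time zones or input types differently.
import datetime

def timestamp_to_datetime(timestamp):
    """Convert a unix timestamp in seconds to a UTC datetime"""
    return datetime.datetime.utcfromtimestamp(float(timestamp))

def datetime_day(datetime_):
    """Truncate a datetime to midnight of the same day"""
    return datetime_.replace(hour=0, minute=0, second=0, microsecond=0)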
class PopularityManager(object):
    """
    Generate popularity metrics for datasets and sites
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.pop_db = PopDBService(self.config)
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.storage = StorageManager(self.config)
        self.MAX_THREADS = int(config['threading']['max_threads'])

    def initiate_db(self):
        """
        Collect popularity data
        """
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_popularity_data, args=(i, q))
            worker.daemon = True
            worker.start()
        start_date = datetime_day(datetime.datetime.utcnow() - datetime.timedelta(days=90))
        end_date = datetime_day(datetime.datetime.utcnow())
        # fetch popularity data
        t1 = datetime.datetime.utcnow()
        for date in daterange(start_date, end_date):
            q.put(date)
        q.join()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Inserting Pop DB data took %s', str(td))

    def insert_popularity_data(self, i, q):
        """
        Insert popularity data for one day into db
        """
        coll = 'dataset_popularity'
        while True:
            date = q.get()
            api = 'DSStatInTimeWindow/'
            tstart = datetime_to_string(date)
            tstop = tstart
            params = {'sitename':'summary', 'tstart':tstart, 'tstop':tstop}
            json_data = self.pop_db.fetch(api=api, params=params)
            # sort it in dictionary for easy fetching
            for dataset in json_data['DATA']:
                dataset_name = dataset['COLLNAME']
                popularity_data = {'name':dataset_name, 'date':date}
                popularity_data['n_accesses'] = dataset['NACC']
                popularity_data['n_cpus'] = dataset['TOTCPU']
                popularity_data['n_users'] = dataset['NUSERS']
                query = {'name':dataset_name, 'date':date}
                data = {'$set':popularity_data}
                self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
            q.task_done()

    def update_db(self):
        """
        Fetch latest popularity data not in database
        """
        # get dates
        coll = 'dataset_popularity'
        pipeline = list()
        sort = {'$sort':{'date':-1}}
        pipeline.append(sort)
        limit = {'$limit':1}
        pipeline.append(limit)
        project = {'$project':{'date':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            start_date = data[0]['date']
        except:
            self.logger.warning('Popularity needs to be initiated')
            self.initiate_db()
            return
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_popularity_data, args=(i, q))
            worker.daemon = True
            worker.start()
        end_date = datetime_day(datetime.datetime.utcnow())
        # fetch popularity data
        t1 = datetime.datetime.utcnow()
        for date in daterange(start_date, end_date):
            q.put(date)
        # wait for the worker threads to finish before reporting the timing
        q.join()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Updating Pop DB data took %s', str(td))

    def insert_dataset(self, dataset_name):
        """
        Fetch all popularity data for dataset
        """
        api = 'getSingleDSstat'
        sitename = 'summary'
        name = dataset_name
        aggr = 'day'
        orderbys = ['totcpu', 'naccess']
        coll = 'dataset_popularity'
        for orderby in orderbys:
            params = {'sitename':sitename, 'name':name, 'aggr':aggr, 'orderby':orderby}
            json_data = self.pop_db.fetch(api=api, params=params)
            data = get_json(json_data, 'data')
            for pop_data in get_json(data, 'data'):
                date = pop_db_timestamp_to_datetime(pop_data[0])
                query = {'name':dataset_name, 'date':date}
                popularity_data = {'name':dataset_name, 'date':date}
                popularity_data[orderby] = pop_data[1]
                data = {'$set':popularity_data}
                self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
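# Both initiate_db() and update_db() iterate over days with a daterange
# utility that is not shown in this listing. A minimal sketch is given below,
# assuming it yields one datetime per day from start_date up to but not
# including end_date; the real helper may include the end date as well.
import datetime

def daterange(start_date, end_date):
    """Yield one datetime per day in [start_date, end_date)"""
    days = (end_date - start_date).days
    for offset in range(days):
        yield start_date + datetime.timedelta(days=offset)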
class SiteManager(object):
    """
    Keep track of site data
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.intelroccs = IntelROCCSService(self.config)
        self.crab = CRABService(self.config)
        self.storage = StorageManager(self.config)

    def initiate_db(self):
        """
        Initiate Site database
        Does exactly the same as update_db
        """
        self.update_db()

    def update_db(self):
        """
        Initiate site data in database
        Get general data about all sites
        """
        api = 'Detox'
        file_ = 'SitesInfo.txt'
        intelroccs_data = self.intelroccs.fetch(api=api, params=file_, secure=False)
        for site_data in get_json(intelroccs_data, 'data'):
            self.insert_site_data(site_data)

    def insert_site_data(self, site_data):
        """
        Insert site into database
        """
        coll = 'site_data'
        site_name = str(site_data[4])
        site_status = int(site_data[0])
        site_quota = int(site_data[1])*10**3
        query = {'name':site_name}
        data = {'$set':{'name':site_name, 'status':site_status, 'quota_gb':site_quota}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=True)

    def update_cpu(self):
        """
        Update maximum CPU capacity for site
        """
        active_sites = self.get_active_sites()
        for site_name in active_sites:
            # remove older values
            date = datetime.datetime.utcnow() - datetime.timedelta(days=30)
            coll = 'site_data'
            query = {'name':site_name}
            data = {'$pull':{'cpu_data':{'date':{'$lt':date}}}}
            self.storage.update_data(coll=coll, query=query, data=data)
            # get CRAB data about site
            query = 'GLIDEIN_CMSSite =?= "%s" && CPUs > 0' % (site_name)
            attributes = ['GLIDEIN_CMSSite', 'CPUs']
            ads = self.crab.fetch_cluster_ads(query, attributes=attributes)
            cpus = 0
            for ad in ads:
                cpus += ad['CPUs']
            # insert new data
            date = datetime.datetime.utcnow()
            query = {'name':site_name}
            data = {'$push':{'cpu_data':{'date':date, 'cpus':cpus}}}
            self.storage.update_data(coll=coll, query=query, data=data)

    def get_active_sites(self):
        """
        Get all sites which are active, includes sites which are not available for replication
        """
        coll = 'site_data'
        pipeline = list()
        match = {'$match':{'status':{'$in':[1, 2]}}}
        pipeline.append(match)
        project = {'$project':{'name':1, '_id':0}}
        pipeline.append(project)
        sites_data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return [site_data['name'] for site_data in sites_data]

    def get_available_sites(self):
        """
        Get all sites which are available for replication
        """
        coll = 'site_data'
        pipeline = list()
        match = {'$match':{'status':1}}
        pipeline.append(match)
        project = {'$project':{'name':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return [site['name'] for site in data]

    def get_available_storage(self, site_name):
        """
        Get total AnalysisOps storage available at the site
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'replicas':site_name}}
        pipeline.append(match)
        group = {'$group':{'_id':None, 'size_bytes':{'$sum':'$size_bytes'}}}
        pipeline.append(group)
        project = {'$project':{'size_bytes':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            size = data[0]['size_bytes']/10**9
        except:
            return 0
        coll = 'site_data'
        pipeline = list()
        match = {'$match':{'name':site_name}}
        pipeline.append(match)
        project = {'$project':{'quota_gb':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        quota = data[0]['quota_gb']
        available_gb = (0.95*quota) - size
        return available_gb

    def get_performance(self, site_name):
        """
        Get site performance: the maximum number of CPUs in the last 30 days
        per TB of quota
        """
        # get maximum number of CPUs and quota
        coll = 'site_data'
        pipeline = list()
        match = {'$match':{'name':site_name}}
        pipeline.append(match)
        unwind = {'$unwind':'$cpu_data'}
        pipeline.append(unwind)
        group = {'$group':{'_id':'$name', 'quota_gb':{'$max':'$quota_gb'}, 'max_cpus':{'$max':'$cpu_data.cpus'}}}
        pipeline.append(group)
        project = {'$project':{'quota_gb':1, 'max_cpus':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            max_cpus = data[0]['max_cpus']
            quota = float(data[0]['quota_gb'])/10**3
        except:
            self.logger.warning('Could not get site performance for %s', site_name)
            max_cpus = 0
            quota = 0
        try:
            performance = float(max_cpus)/float(quota)
        except:
            performance = 0.0
        if not (performance > 0):
            performance = 0.0
        return performance
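# A short usage sketch for SiteManager, assuming a config dict with the
# sections the managers above expect (the call sequence is illustrative, not
# the project's actual driver script):
#
#     sites = SiteManager(config)
#     sites.initiate_db()      # load site status and quotas from IntelROCCS
#     sites.update_cpu()       # record current CPU counts from CRAB ads
#     for site_name in sites.get_available_sites():
#         print site_name, sites.get_performance(site_name)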