Example #1
0
 def test_phedex_memory(self):
     "Test phedex data memory usage"
     print ""
     phedex = PhEDExService(self.config)
     api = 'blockreplicas'
     params = [('node', 'T2_*'), ('create_since', 0.0), ('complete', 'y'), ('group', 'AnalysisOps'), ('show_dataset', 'y')]
     phedex_data = phedex.fetch(api=api, params=params, cache=False)
     total_size = total_size_of(phedex_data)
     logger.info('Total size of PhEDEx data in memory is %d bytes (%dMB)', total_size, total_size/10**6)
Example #2
0
 def test_phedex(self):
     "Test phedex functions"
     print ""
     phedex = PhEDExService(config=self.config)
     api = 'data'
     params = {'level':'block', 'dataset':'/DoubleElectron/Run2012D-22Jan2013-v1/AOD'}
     expected = '/DoubleElectron/Run2012D-22Jan2013-v1/AOD'
     json_data = phedex.fetch(api=api, params=params, cache=False)
     result = json_data['phedex']['dbs'][0]['dataset'][0]['name']
     self.assertEqual(result, expected)
Example #3
0
 def test_cache(self):
     "Test storage cache"
     print ""
     phedex = PhEDExService(config=self.config)
     api = 'data'
     params = {'level':'block', 'dataset':'/DoubleElectron/Run2012D-22Jan2013-v1/AOD'}
     expected = '/DoubleElectron/Run2012D-22Jan2013-v1/AOD'
     phedex.fetch(api=api, params=params, cache_only=True, force_cache=True)
     cache_data = self.storage.get_cache(coll='phedex', api=api, params=params)
     try:
         result = cache_data['phedex']['dbs'][0]['dataset'][0]['name']
     except KeyError:
         self.assertTrue(False)
     else:
         self.assertEqual(result, expected)
Example #4
0
 def __init__(self, config=dict()):
     self.logger = logging.getLogger(__name__)
     self.config = config
     self.phedex = PhEDExService(self.config)
     self.dbs = DBSService(self.config)
     self.storage = StorageManager(self.config)
     self.sites = SiteManager(self.config)
     self.MAX_THREADS = int(config['threading']['max_threads'])
Example #5
0
 def __init__(self, config=dict()):
     self.logger = logging.getLogger(__name__)
     self.config = config
     self.phedex = PhEDExService(self.config)
     self.mit_db = MITDBService(self.config)
     self.datasets = DatasetManager(self.config)
     self.sites = SiteManager(self.config)
     self.storage = StorageManager(self.config)
     self.rankings = DeltaRanking(self.config)
     self.max_gb = int(self.config['rocker_board']['max_gb'])
     self.min_rank = float(self.config['rocker_board']['min_rank'])
Example #6
0
class RockerBoard(object):
    """
    RockerBoard is a system balancing algorithm using popularity metrics to predict popularity
    and make appropriate replications to keep the system balanced
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.phedex = PhEDExService(self.config)
        self.mit_db = MITDBService(self.config)
        self.datasets = DatasetManager(self.config)
        self.sites = SiteManager(self.config)
        self.storage = StorageManager(self.config)
        self.rankings = DeltaRanking(self.config)
        self.max_gb = int(self.config['rocker_board']['max_gb'])
        self.min_rank = float(self.config['rocker_board']['min_rank'])

    def start(self):
        """
        Begin Rocker Board Algorithm
        """
        t1 = datetime.datetime.utcnow()
        subscriptions = self.balance()
        for subscription in subscriptions:
            self.logger.info('site: %s\tdataset: %s', subscription[1], subscription[0])
        self.subscribe(subscriptions)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Rocker Board took %s', str(td))

    def balance(self):
        """
        Balance system by creating new replicas based on popularity
        """
        subscriptions = list()
        dataset_rankings = self.rankings.dataset_rankings()
        site_rankings = self.rankings.site_rankings()
        subscribed_gb = 0
        while subscribed_gb < self.max_gb:
            tmp_site_rankings = site_rankings
            dataset_name = weighted_choice(dataset_rankings)
            if (not dataset_name) or (dataset_rankings[dataset_name] < self.min_rank):
                break
            size_gb = self.datasets.get_size(dataset_name)
            unavailable_sites = set(self.datasets.get_sites(dataset_name))
            for site_name in tmp_site_rankings.keys():
                if (self.sites.get_available_storage(site_name) < size_gb) or (tmp_site_rankings[site_name] <= 0):
                    unavailable_sites.add(site_name)
            for site_name in unavailable_sites:
                try:
                    del tmp_site_rankings[site_name]
                except:
                    continue
            if not tmp_site_rankings:
                break
            site_name = weighted_choice(tmp_site_rankings)
            subscription = (dataset_name, site_name)
            subscriptions.append(subscription)
            subscribed_gb += size_gb
            avail_storage = self.sites.get_available_storage(site_name)
            self.logger.info('rank: %s\tsize: %.2f\tdataset: %s', dataset_rankings[dataset_name], size_gb, dataset_name)
            self.logger.info('rank: %s\tstorage: %d\site: %s', site_rankings[site_name], avail_storage, site_name)
            new_avail_storage = avail_storage - self.datasets.get_size(dataset_name)
            if new_avail_storage > 0:
                new_rank = 0.0
            else:
                new_rank = (site_rankings[site_name]/avail_storage)*new_avail_storage
            site_rankings[site_name] = new_rank
            del dataset_rankings[dataset_name]
        self.logger.info('Subscribed %dGB', subscribed_gb)
        return subscriptions

    def subscribe(self, subscriptions):
        """
        Make subscriptions to phedex
        subscriptions = [(dataset_name, site_name), ...]
        """
        new_subscriptions = dict()
        for subscription in subscriptions:
            dataset_name = subscription[0]
            site_name = subscription[1]
            try:
                new_subscriptions[site_name].append(dataset_name)
            except:
                new_subscriptions[site_name] = list()
                new_subscriptions[site_name].append(dataset_name)
        for site_name, dataset_names in new_subscriptions.items():
            data = self.phedex.generate_xml(dataset_names)
            comments = 'This dataset is predicted to become popular and has therefore been automatically replicated by cuadrnt'
            api = 'subscribe'
            params = [('node', site_name), ('data', data), ('level','dataset'), ('move', 'n'), ('custodial', 'n'), ('group', 'AnalysisOps'), ('request_only', 'n'), ('no_mail', 'n'), ('comments', comments)]
            json_data = self.phedex.fetch(api=api, params=params, method='post')
            # insert into db
            group_name = 'AnalysisOps'
            request_id = 0
            request_type = 0
            try:
                request = json_data['phedex']
                request_id = request['request_created'][0]['id']
                request_created = timestamp_to_datetime(request['request_timestamp'])
            except:
                self.logger.warning('Subscription did not succeed\n\tSite:%s\n\tDatasets: %s', str(site_name), str(dataset_names))
                continue
            for dataset_name in dataset_names:
                coll = 'dataset_popularity'
                date = datetime_day(datetime.datetime.utcnow())
                pipeline = list()
                match = {'$match':{'name':dataset_name, 'date':date}}
                pipeline.append(match)
                project = {'$project':{'delta_popularity':1, '_id':0}}
                pipeline.append(project)
                data = self.storage.get_data(coll=coll, pipeline=pipeline)
                dataset_rank = data[0]['delta_popularity']
                query = "INSERT INTO Requests(RequestId, RequestType, DatasetId, SiteId, GroupId, Rank, Date) SELECT %s, %s, Datasets.DatasetId, Sites.SiteId, Groups.GroupId, %s, %s FROM Datasets, Sites, Groups WHERE Datasets.DatasetName=%s AND Sites.SiteName=%s AND Groups.GroupName=%s"
                values = (request_id, request_type, dataset_rank, request_created, dataset_name, site_name, group_name)
                self.mit_db.query(query=query, values=values, cache=False)
Example #7
0
class DatasetManager(object):
    """
    Handle all dataset related data
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.phedex = PhEDExService(self.config)
        self.dbs = DBSService(self.config)
        self.storage = StorageManager(self.config)
        self.sites = SiteManager(self.config)
        self.MAX_THREADS = int(config['threading']['max_threads'])

    def initiate_db(self):
        """
        Initiate dataset data in database
        Get general data and popularity data from beginning
        """
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_dataset_data, args=(i, q))
            worker.daemon = True
            worker.start()
        active_sites = self.sites.get_active_sites()
        api = 'blockreplicas'
        params = [('node', active_sites), ('create_since', 0.0), ('complete', 'y'), ('dist_complete', 'y'), ('group', 'AnalysisOps'), ('show_dataset', 'y')]
        t1 = datetime.datetime.utcnow()
        phedex_data = self.phedex.fetch(api=api, params=params)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Call to PhEDEx took %s', str(td))
        count = 1
        t1 = datetime.datetime.utcnow()
        for dataset_data in get_json(get_json(phedex_data, 'phedex'), 'dataset'):
            q.put((dataset_data, count))
            count += 1
        q.join()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Inserting PhEDEx data took %s', str(td))
        self.logger.info('Done inserting datasets into DB')

    def update_db(self):
        """
        Get datasets currently in AnalysisOps and compare to database
        Deactivate removed datasets and insert new
        Update replicas
        """
        # get all datasets in database
        dataset_names = self.get_db_datasets()
        dataset_names = set(dataset_names)
        # get all active sites, only fetch replicas from these
        active_sites = self.sites.get_active_sites()
        api = 'blockreplicas'
        params = [('node', active_sites), ('create_since', 0.0), ('complete', 'y'), ('group', 'AnalysisOps'), ('show_dataset', 'y')]
        phedex_data = self.phedex.fetch(api=api, params=params)
        current_datasets = set()
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_dataset_data, args=(i, q))
            worker.daemon = True
            worker.start()
        count = 1
        for dataset_data in get_json(get_json(phedex_data, 'phedex'), 'dataset'):
            dataset_name = get_json(dataset_data, 'name')
            current_datasets.add(dataset_name)
            if dataset_name not in dataset_names:
                # this is a new dataset which need to be inserted into the database
                q.put((dataset_data, count))
                count += 1
            else:
                # update replicas
                replicas = self.get_replicas(dataset_data)
                coll = 'dataset_data'
                query = {'name':dataset_name}
                data = {'$set':{'replicas':replicas}}
                data = self.storage.update_data(coll=coll, query=query, data=data, upsert=False)
        q.join()
        deprecated_datasets = dataset_names - current_datasets
        for dataset_name in deprecated_datasets:
            self.remove_dataset(dataset_name)

    def insert_dataset_data(self, i, q):
        """
        Insert a new dataset into the database and initiate all data
        """
        while True:
            data = q.get()
            dataset_data = data[0]
            count = data[1]
            self.logger.debug('Inserting dataset number %d', count)
            dataset_name = get_json(dataset_data, 'name')
            replicas = self.get_replicas(dataset_data)
            coll = 'dataset_data'
            query = {'name':dataset_name}
            data = {'$set':{'name':dataset_name, 'replicas':replicas}}
            data = self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
            self.insert_phedex_data(dataset_name)
            self.insert_dbs_data(dataset_name)
            q.task_done()

    def insert_phedex_data(self, dataset_name):
        """
        Fetch phedex data about dataset and insert into database
        """
        api = 'data'
        params = {'dataset':dataset_name, 'level':'block', 'create_since':0.0}
        phedex_data = self.phedex.fetch(api=api, params=params)
        size_bytes = 0
        n_files = 0
        try:
            dataset_data = get_json(get_json(get_json(phedex_data, 'phedex'), 'dbs')[0],'dataset')[0]
        except:
            coll = 'dataset_data'
            query = {'name':dataset_name}
            self.storage.delete_data(coll=coll, query=query)
            return
        for block_data in get_json(dataset_data, 'block'):
            size_bytes += get_json(block_data, 'bytes')
            n_files += get_json(block_data, 'files')
        coll = 'dataset_data'
        query = {'name':dataset_name}
        data = {'$set':{'size_bytes':size_bytes, 'n_files':n_files}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=False)

    def insert_dbs_data(self, dataset_name):
        """
        Fetch dbs data about dataset and insert into database
        """
        api = 'datasets'
        params = {'dataset':dataset_name, 'detail':True, 'dataset_access_type':'*'}
        dbs_data = self.dbs.fetch(api=api, params=params)
        try:
            dataset_data = get_json(dbs_data, 'data')[0]
        except:
            coll = 'dataset_data'
            query = {'name':dataset_name}
            self.storage.delete_data(coll=coll, query=query)
            return
        ds_name = get_json(dataset_data, 'primary_ds_name')
        physics_group = get_json(dataset_data, 'physics_group_name')
        data_tier = get_json(dataset_data, 'data_tier_name')
        creation_date = datetime_day(timestamp_to_datetime(get_json(dataset_data, 'creation_date')))
        ds_type = get_json(dataset_data, 'primary_ds_type')
        coll = 'dataset_data'
        query = {'name':dataset_name}
        data = {'$set':{'ds_name':ds_name, 'physics_group':physics_group, 'data_tier':data_tier, 'creation_date':creation_date, 'ds_type':ds_type}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=False)

    def get_replicas(self, dataset_data):
        """
        Generator function to get all replicas of a dataset
        """
        replicas = list()
        for block_data in get_json(dataset_data, 'block'):
            for replica_data in get_json(block_data, 'replica'):
                if get_json(replica_data, 'files') > 0:
                    replicas.append(get_json(replica_data, 'node'))
        return replicas

    def get_db_datasets(self):
        """
        Get all datasets currently in database
        """
        coll = 'dataset_data'
        pipeline = list()
        project = {'$project':{'name':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        dataset_names = [dataset_data['name'] for dataset_data in data]
        self.logger.info('%d datasets present in database', len(dataset_names))
        return dataset_names

    def remove_dataset(self, dataset_name):
        """
        Remove dataset from database
        """
        coll = 'dataset_data'
        query = {'name':dataset_name}
        self.storage.delete_data(coll=coll, query=query)

    def get_sites(self, dataset_name):
        """
        Get all sites with a replica of the dataset
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'name':dataset_name}}
        pipeline.append(match)
        project = {'$project':{'replicas':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        site_names = data[0]['replicas']
        return site_names

    def get_size(self, dataset_name):
        """
        Get size in GB of dataset
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'name':dataset_name}}
        pipeline.append(match)
        project = {'$project':{'size_bytes':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        size_gb = float(data[0]['size_bytes'])/10**9
        return size_gb