Example #1
0
 def __init__(self, config=None):
     """
     Wire up the services this component depends on.

     :param config: parsed configuration mapping; defaults to a fresh
         empty dict (avoids the shared mutable-default-argument pitfall)
     """
     self.logger = logging.getLogger(__name__)
     self.config = config if config is not None else dict()
     self.pop_db = PopDBService(self.config)
     self.sites = SiteManager(self.config)
     self.datasets = DatasetManager(self.config)
     self.storage = StorageManager(self.config)
     # worker-thread cap for parallel fetches
     self.MAX_THREADS = int(self.config['threading']['max_threads'])
Example #2
0
 def __init__(self, config=None):
     """
     Wire up the services this component depends on.

     :param config: parsed configuration mapping; defaults to a fresh
         empty dict (avoids the shared mutable-default-argument pitfall)
     """
     self.logger = logging.getLogger(__name__)
     self.config = config if config is not None else dict()
     self.intelroccs = IntelROCCSService(self.config)
     self.crab = CRABService(self.config)
     self.storage = StorageManager(self.config)
     # fractions of a site's quota: soft_limit is the fill target,
     # hard_limit the ceiling for new replicas
     self.soft_limit = float(self.config['rocker_board']['soft_limit'])
     self.hard_limit = float(self.config['rocker_board']['hard_limit'])
Example #3
0
 def __init__(self, config=None):
     """
     Wire up the services this component depends on.

     :param config: parsed configuration mapping; defaults to a fresh
         empty dict (avoids the shared mutable-default-argument pitfall)
     """
     self.logger = logging.getLogger(__name__)
     self.config = config if config is not None else dict()
     self.sites = SiteManager(self.config)
     self.datasets = DatasetManager(self.config)
     self.popularity = PopularityManager(self.config)
     self.storage = StorageManager(self.config)
     self.max_replicas = int(self.config['rocker_board']['max_replicas'])
     self.MAX_THREADS = int(self.config['threading']['max_threads'])
     # dataset name -> raw popularity estimate, filled during ranking runs
     self.dataset_popularity = dict()
Example #4
0
 def __init__(self, config=None):
     """
     Wire up the services this component depends on.

     :param config: parsed configuration mapping; defaults to a fresh
         empty dict (avoids the shared mutable-default-argument pitfall)
     """
     self.logger = logging.getLogger(__name__)
     self.config = config if config is not None else dict()
     self.phedex = PhEDExService(self.config)
     self.mit_db = MITDBService(self.config)
     self.datasets = DatasetManager(self.config)
     self.sites = SiteManager(self.config)
     self.popularity = PopularityManager(self.config)
     self.storage = StorageManager(self.config)
     self.rankings = Ranker(self.config)
     # cap on data volume (GB) subscribed per balancing round
     self.max_gb = int(self.config['rocker_board']['max_gb'])
     self.csv_data = list()
Example #5
0
 def __init__(self, config=None):
     """
     Wire up the services this component depends on.

     :param config: parsed configuration mapping; defaults to a fresh
         empty dict (avoids the shared mutable-default-argument pitfall)
     """
     self.logger = logging.getLogger(__name__)
     self.config = config if config is not None else dict()
     self.storage = StorageManager(self.config)
     self.sites = SiteManager(self.config)
     self.datasets = DatasetManager(self.config)
     self.popularity = PopularityManager(self.config)
Example #6
0
 def __init__(self, config=None):
     """
     Wire up the services this component depends on.

     :param config: parsed configuration mapping; defaults to a fresh
         empty dict (avoids the shared mutable-default-argument pitfall)
     """
     self.logger = logging.getLogger(__name__)
     self.config = config if config is not None else dict()
     self.intelroccs = IntelROCCSService(self.config)
     self.crab = CRABService(self.config)
     self.storage = StorageManager(self.config)
     # fractions of a site's quota: soft_limit is the fill target,
     # hard_limit the ceiling for new replicas
     self.soft_limit = float(self.config['rocker_board']['soft_limit'])
     self.hard_limit = float(self.config['rocker_board']['hard_limit'])
Example #7
0
class GenericService(object):
    """
    Generic cuadrnt service class
    Shared properties between services:
        Contact a web service using a base url and some key:value parameters
        Services require a valid cert and key
        Want to cache results in a document-oriented database
    """

    def __init__(self, config=None):
        """
        :param config: parsed configuration mapping; defaults to a fresh
            empty dict (avoids the shared mutable-default-argument pitfall)
        """
        self.logger = logging.getLogger(__name__)
        self.config = config if config is not None else dict()
        self.storage = StorageManager(self.config)
        # subclasses override these to identify the concrete service
        self.SERVICE = "generic"
        self.TARGET_URL = ""

    def _request(self, api, params, method, secure):
        """
        Fetch raw data from the remote service and normalize it to a dict.
        Non-dict payloads (lists, strings) are wrapped as {"data": payload}
        so callers and the cache always see a dict.
        """
        if secure:
            json_data = get_secure_data(target_url=self.TARGET_URL, api=api, params=params, method=method)
        else:
            json_data = get_data(target_url=self.TARGET_URL, api=api, file_=params)
        if not isinstance(json_data, dict):
            json_data = {"data": json_data}
        return json_data

    def fetch(self, api, params=None, method="get", secure=True, cache=True, cache_only=False, force_cache=False):
        """
        Get data from url using parameters params
        If param cache is true update cache on cache miss
        If param cache_only is true just update the cache, don't return any data.
            Use this parameter to spawn external thread to update cache in background
        If param force_cache is true skip the cache lookup and refetch
        """
        if params is None:
            params = dict()
        if not cache:
            return self._request(api, params, method, secure)
        json_data = dict()
        if not force_cache:
            json_data = self.storage.get_cache(self.SERVICE, api, params)
        if not json_data:
            # cache miss (or forced refresh): hit the service and store the result
            json_data = self._request(api, params, method, secure)
            self.storage.insert_cache(self.SERVICE, api, params, json_data)
        if not cache_only:
            return json_data
Example #8
0
 def __init__(self, config=None):
     """
     Wire up the services this component depends on.

     :param config: parsed configuration mapping; defaults to a fresh
         empty dict (avoids the shared mutable-default-argument pitfall)
     """
     self.logger = logging.getLogger(__name__)
     self.config = config if config is not None else dict()
     self.phedex = PhEDExService(self.config)
     self.dbs = DBSService(self.config)
     self.storage = StorageManager(self.config)
     self.sites = SiteManager(self.config)
     # comma-separated list in the config, e.g. "AOD,MINIAOD"
     self.valid_tiers = self.config['tools']['valid_tiers'].split(',')
     self.MAX_THREADS = int(self.config['threading']['max_threads'])
Example #9
0
 def __init__(self, config=None):
     """
     Wire up the services this component depends on.

     :param config: parsed configuration mapping; defaults to a fresh
         empty dict (avoids the shared mutable-default-argument pitfall)
     """
     self.logger = logging.getLogger(__name__)
     self.config = config if config is not None else dict()
     self.sites = SiteManager(self.config)
     self.datasets = DatasetManager(self.config)
     self.popularity = PopularityManager(self.config)
     self.storage = StorageManager(self.config)
     self.max_replicas = int(self.config['rocker_board']['max_replicas'])
     self.MAX_THREADS = int(self.config['threading']['max_threads'])
     # dataset name -> raw popularity estimate, filled during ranking runs
     self.dataset_popularity = dict()
Example #10
0
 def __init__(self, config=None):
     """
     Wire up the services this component depends on.

     :param config: parsed configuration mapping; defaults to a fresh
         empty dict (avoids the shared mutable-default-argument pitfall)
     """
     self.logger = logging.getLogger(__name__)
     self.config = config if config is not None else dict()
     self.phedex = PhEDExService(self.config)
     self.mit_db = MITDBService(self.config)
     self.datasets = DatasetManager(self.config)
     self.sites = SiteManager(self.config)
     self.popularity = PopularityManager(self.config)
     self.storage = StorageManager(self.config)
     self.rankings = Ranker(self.config)
     # cap on data volume (GB) subscribed per balancing round
     self.max_gb = int(self.config["rocker_board"]["max_gb"])
     self.csv_data = list()
Example #11
0
 def __init__(self, config=None):
     """
     Wire up the services this component depends on.

     :param config: parsed configuration mapping; defaults to a fresh
         empty dict (avoids the shared mutable-default-argument pitfall)
     """
     self.logger = logging.getLogger(__name__)
     self.config = config if config is not None else dict()
     self.sites = SiteManager(self.config)
     self.datasets = DatasetManager(self.config)
     self.popularity = PopularityManager(self.config)
     self.storage = StorageManager(self.config)
     self.max_replicas = int(self.config['rocker_board']['max_replicas'])
     self.name = 'generic'
     self.data_path = self.config['paths']['data']
     # comma-separated list in the config, e.g. "AOD,MINIAOD"
     self.data_tiers = self.config['tools']['valid_tiers'].split(',')
     self.preprocessed_data = dict()
     # per-tier classifiers, populated later
     self.clf_trend = dict()
     self.clf_avg = dict()
Example #12
0
class SiteManager(object):
    """
    Keep track of site data (status, quota, CPU capacity, used storage)
    stored in the document database.
    """
    def __init__(self, config=None):
        """
        :param config: parsed configuration mapping; defaults to a fresh
            empty dict (avoids the shared mutable-default-argument pitfall)
        """
        self.logger = logging.getLogger(__name__)
        self.config = config if config is not None else dict()
        self.intelroccs = IntelROCCSService(self.config)
        self.crab = CRABService(self.config)
        self.storage = StorageManager(self.config)
        # fractions of a site's quota: soft_limit is the fill target,
        # hard_limit the ceiling for new replicas
        self.soft_limit = float(self.config['rocker_board']['soft_limit'])
        self.hard_limit = float(self.config['rocker_board']['hard_limit'])

    def initiate_db(self):
        """
        Initiate Site database
        Does exactly the same as update_db
        """
        self.update_db()

    def update_db(self):
        """
        Refresh site data in the database from the IntelROCCS
        Detox SitesInfo.txt dump.
        """
        api = 'Detox'
        file_ = 'SitesInfo.txt'
        intelroccs_data = self.intelroccs.fetch(api=api,
                                                params=file_,
                                                secure=False)
        for site_data in get_json(intelroccs_data, 'data'):
            self.insert_site_data(site_data)

    def insert_site_data(self, site_data):
        """
        Upsert one site record.

        :param site_data: sequence where index 4 is the site name,
            index 0 the status flag and index 1 the quota
            (multiplied by 10**3 here -- presumably TB to GB, TODO confirm)
        """
        coll = 'site_data'
        site_name = str(site_data[4])
        site_status = int(site_data[0])
        site_quota = int(site_data[1]) * 10**3
        query = {'name': site_name}
        data = {
            '$set': {
                'name': site_name,
                'status': site_status,
                'quota_gb': site_quota
            }
        }
        self.storage.update_data(coll=coll,
                                 query=query,
                                 data=data,
                                 upsert=True)

    def update_cpu(self):
        """
        Update maximum CPU capacity for each active site:
        drop measurements older than 30 days, then push a fresh
        measurement summed from CRAB cluster ads.
        """
        coll = 'site_data'
        active_sites = self.get_active_sites()
        for site_name in active_sites:
            # prune values older than 30 days
            date = datetime.datetime.utcnow() - datetime.timedelta(days=30)
            query = {'name': site_name}
            data = {'$pull': {'cpu_data': {'date': {'$lt': date}}}}
            self.storage.update_data(coll=coll, query=query, data=data)
            # sum CPUs over all matching CRAB cluster ads for the site
            query = 'GLIDEIN_CMSSite =?= "%s" && CPUs > 0' % (site_name)
            attributes = ['GLIDEIN_CMSSite', 'CPUs']
            ads = self.crab.fetch_cluster_ads(query, attributes=attributes)
            cpus = sum(ad['CPUs'] for ad in ads)
            # append the new measurement
            date = datetime.datetime.utcnow()
            query = {'name': site_name}
            data = {'$push': {'cpu_data': {'date': date, 'cpus': cpus}}}
            self.storage.update_data(coll=coll, query=query, data=data)

    def get_active_sites(self):
        """
        Get all sites which are active (status 1 or 2), includes sites
        which are not available for replication.
        """
        coll = 'site_data'
        pipeline = [
            {'$match': {'status': {'$in': [1, 2]}}},
            {'$project': {'name': 1, '_id': 0}},
        ]
        sites_data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return [site_data['name'] for site_data in sites_data]

    def get_available_sites(self):
        """
        Get all sites which are available for replication (status 1).
        """
        coll = 'site_data'
        pipeline = [
            {'$match': {'status': 1}},
            {'$project': {'name': 1, '_id': 0}},
        ]
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return [site['name'] for site in data]

    def get_performance(self, site_name):
        """
        Maximum number of CPUs divided by quota (in TB); 0.0 when the
        site has no quota.
        """
        max_cpus = self.get_max_cpu(site_name)
        quota_gb = self.get_quota(site_name)
        try:
            # NOTE: quota_gb / 10**3 is integer division under Python 2
            performance = float(max_cpus) / float(quota_gb / 10**3)
        except ZeroDivisionError:
            performance = 0.0
        return performance

    def get_available_storage(self, site_name):
        """
        Get total AnalysisOps storage (GB) still available at the site,
        i.e. hard limit minus current usage, floored at 0.
        """
        size_gb = self.get_data(site_name)
        quota_gb = self.get_quota(site_name)
        available_gb = max(0, (self.hard_limit * quota_gb) - size_gb)
        return available_gb

    def get_all_available_storage(self):
        """
        Get available storage for all replication-eligible sites
        as {site_name: available_gb}.
        """
        available_storage = dict()
        available_sites = self.get_available_sites()
        for site_name in available_sites:
            available_storage[site_name] = self.get_available_storage(
                site_name)
        return available_storage

    def get_over_soft_limit(self, site_name):
        """
        Get the amount of GB a site is over the soft limit (lower limit).
        Negative when the site is under the limit.
        """
        size_gb = self.get_data(site_name)
        quota_gb = self.get_quota(site_name)
        over_gb = size_gb - (self.soft_limit * quota_gb)
        return over_gb

    def get_data(self, site_name):
        """
        Get the amount of data (GB) replicated at the site; 0 when the
        site hosts nothing.
        """
        coll = 'dataset_data'
        pipeline = [
            {'$match': {'replicas': site_name}},
            {
                '$group': {
                    '_id': None,
                    'size_bytes': {
                        '$sum': '$size_bytes'
                    }
                }
            },
            {'$project': {'size_bytes': 1, '_id': 0}},
        ]
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            size_gb = data[0]['size_bytes'] / 10**9
        except (IndexError, KeyError):
            # no datasets matched -- empty aggregation result
            return 0
        return size_gb

    def get_quota(self, site_name):
        """
        Get the AnalysisOps quota (GB) for the site; 0 for unknown sites.
        """
        coll = 'site_data'
        pipeline = [
            {'$match': {'name': site_name}},
            {'$project': {'quota_gb': 1, '_id': 0}},
        ]
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            quota_gb = data[0]['quota_gb']
        except (IndexError, KeyError):
            quota_gb = 0
        return quota_gb

    def get_max_cpu(self, site_name):
        """
        Get the maximum number of CPUs recorded in the last 30 days at
        the site; 0 (with a warning) when no measurement exists.
        """
        coll = 'site_data'
        pipeline = [
            {'$match': {'name': site_name}},
            {'$unwind': '$cpu_data'},
            {
                '$group': {
                    '_id': '$name',
                    'max_cpus': {
                        '$max': '$cpu_data.cpus'
                    }
                }
            },
            {'$project': {'max_cpus': 1, '_id': 0}},
        ]
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            max_cpus = data[0]['max_cpus']
        except (IndexError, KeyError):
            self.logger.warning('Could not get site performance for %s',
                                site_name)
            return 0
        return max_cpus
Example #13
0
 def __init__(self, config=None):
     """
     Wire up the services this component depends on.

     :param config: parsed configuration mapping; defaults to a fresh
         empty dict (avoids the shared mutable-default-argument pitfall)
     """
     self.logger = logging.getLogger(__name__)
     self.config = config if config is not None else dict()
     self.storage = StorageManager(self.config)
     # subclasses override these to identify the concrete service
     self.SERVICE = "generic"
     self.TARGET_URL = ""
Example #14
0
class Ranker(object):
    """
    Generic Ranking class: estimates dataset and site popularity and
    turns the estimates into replication ranks.
    """
    def __init__(self, config=None):
        """
        :param config: parsed configuration mapping; defaults to a fresh
            empty dict (avoids the shared mutable-default-argument pitfall)
        """
        self.logger = logging.getLogger(__name__)
        self.config = config if config is not None else dict()
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.popularity = PopularityManager(self.config)
        self.storage = StorageManager(self.config)
        self.max_replicas = int(self.config['rocker_board']['max_replicas'])
        self.MAX_THREADS = int(self.config['threading']['max_threads'])
        # dataset name -> raw popularity, filled by get_dataset_rankings
        self.dataset_popularity = dict()

    def get_dataset_rankings(self, date=None):
        """
        Generate dataset rankings for *date*.

        :param date: day to rank; defaults to today. Computed at call
            time -- the old default expression was evaluated once at
            import, going stale in long-running processes.
        """
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        self.dataset_popularity = dict()
        dataset_names = self.datasets.get_db_datasets()
        q = Queue.Queue()
        # NOTE(review): these daemon workers loop forever on q.get(), so
        # every call leaks MAX_THREADS threads -- consider a worker pool
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.get_dataset_popularity, args=(q,))
            worker.daemon = True
            worker.start()
        for dataset_name in dataset_names:
            q.put((dataset_name, date))
        q.join()
        dataset_rankings = self.normalize_popularity(date)
        return dataset_rankings

    def get_site_rankings(self, date=None):
        """
        Generate site rankings for *date* (defaults to today, computed
        at call time) and persist them in the site_rankings collection.
        """
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        # get all sites which can be replicated to
        site_names = self.sites.get_available_sites()
        site_rankings = dict()
        for site_name in site_names:
            popularity = self.get_site_popularity(site_name, date)
            # CPU per quota ratio
            performance = self.sites.get_performance(site_name)
            # collapse free storage to a 0/1 factor: any free space counts the same
            available_storage_tb = self.sites.get_available_storage(site_name)/10**3
            available_storage_tb = 1 if available_storage_tb > 0 else 0
            try:
                rank = (performance*available_storage_tb)/popularity
            except ZeroDivisionError:
                # site has no recorded popularity
                rank = 0.0
            site_rankings[site_name] = rank
            # persist for later inspection
            coll = 'site_rankings'
            query = {'name':site_name, 'date':date}
            data = {'$set':{'name':site_name, 'date':date, 'rank':rank, 'popularity':popularity}}
            self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
        return site_rankings

    def get_dataset_popularity(self, q):
        """
        Worker loop: pull (dataset_name, date) tuples from *q* and store
        the average popularity in self.dataset_popularity.
        """
        while True:
            dataset_name, date = q.get()
            popularity = self.popularity.get_average_popularity(dataset_name, date)
            self.dataset_popularity[dataset_name] = popularity
            q.task_done()

    def get_site_popularity(self, site_name, date=None):
        """
        Get the summed popularity of all datasets replicated at the site
        for *date* (defaults to today, computed at call time).
        """
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        # all datasets with a replica at the site
        coll = 'dataset_data'
        pipeline = [
            {'$match':{'replicas':site_name}},
            {'$project':{'name':1, '_id':0}},
        ]
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        dataset_names = [dataset_data['name'] for dataset_data in data]
        # sum the popularity of those datasets for the requested date
        coll = 'dataset_rankings'
        pipeline = [
            {'$match':{'date':date}},
            {'$match':{'name':{'$in':dataset_names}}},
            {'$group':{'_id':'$date', 'total_popularity':{'$sum':'$popularity'}}},
            {'$project':{'total_popularity':1, '_id':0}},
        ]
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            popularity = data[0]['total_popularity']
        except (IndexError, KeyError):
            # no rankings recorded for this date/site combination
            popularity = 0.0
        return popularity

    def get_site_storage_rankings(self, subscriptions):
        """
        Return the amount (GB) each site is over the soft limit,
        including pending *subscriptions*; sites under the limit are
        dropped from the result.
        """
        site_rankings = dict()
        available_sites = self.sites.get_available_sites()
        for site_name in available_sites:
            site_rankings[site_name] = self.sites.get_over_soft_limit(site_name)
        for subscription in subscriptions:
            site_rankings[subscription[1]] += self.datasets.get_size(subscription[0])
        for site_name in available_sites:
            if site_rankings[site_name] < 0:
                del site_rankings[site_name]
        return site_rankings

    def normalize_popularity(self, date):
        """
        Linearly map raw popularity values into ranks between 1 and
        max_replicas and persist them in dataset_rankings.
        """
        dataset_rankings = dict()
        # NOTE(review): raises ValueError on an empty popularity dict and
        # ZeroDivisionError when max_pop is 0 -- preserved from original
        max_pop = max(self.dataset_popularity.values())
        min_pop = min(self.dataset_popularity.values())
        # n/m chosen so that min_pop -> 1 and max_pop -> ~max_replicas
        n = float(min_pop + (self.max_replicas - 1))/max_pop
        m = 1 - n*min_pop
        for dataset_name, popularity in self.dataset_popularity.items():
            rank = int(n*popularity + m)
            dataset_rankings[dataset_name] = rank
            coll = 'dataset_rankings'
            # fixed: was "query = data = {...}", a dead double assignment
            # since data was rebuilt on the next line
            query = {'name':dataset_name, 'date':date}
            data = {'$set':{'name':dataset_name, 'date':date, 'rank':rank, 'popularity':popularity}}
            self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
        return dataset_rankings
Example #15
0
class RockerBoard(object):
    """
    RockerBoard is a system balancing algorithm using popularity metrics to predict popularity
    and make appropriate replications to keep the system balanced
    """

    def __init__(self, config=None):
        """
        :param config: parsed configuration mapping; defaults to a fresh
            empty dict (avoids the shared mutable-default-argument pitfall)
        """
        self.logger = logging.getLogger(__name__)
        self.config = config if config is not None else dict()
        self.phedex = PhEDExService(self.config)
        self.mit_db = MITDBService(self.config)
        self.datasets = DatasetManager(self.config)
        self.sites = SiteManager(self.config)
        self.popularity = PopularityManager(self.config)
        self.storage = StorageManager(self.config)
        self.rankings = Ranker(self.config)
        # cap on data volume (GB) subscribed per balancing round
        self.max_gb = int(self.config["rocker_board"]["max_gb"])
        self.csv_data = list()

    def start(self, date=None):
        """
        Begin Rocker Board Algorithm.

        :param date: day to balance for; defaults to today, computed at
            call time (the old default was evaluated once at import).
        """
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        t1 = datetime.datetime.utcnow()
        # rank datasets and sites, then subscribe the top candidates
        dataset_rankings = self.rankings.get_dataset_rankings(date)
        site_rankings = self.rankings.get_site_rankings(date)
        self.change_dataset_rankings(dataset_rankings)
        subscriptions = self.replicate(dataset_rankings, site_rankings)
        self.logger.info("SUBSCRIPTIONS")
        for subscription in subscriptions:
            self.logger.info("site: %s\tdataset: %s", subscription[1], subscription[0])
        # deletion pipeline is currently disabled
        # site_storage = self.rankings.get_site_storage_rankings(subscriptions)
        # deletions = self.clean(dataset_rankings, site_storage)
        # self.logger.info('DELETIONS')
        # for deletion in deletions:
        #     self.logger.info('site: %s\tdataset: %s', deletion[1], deletion[0])
        # self.delete(deletions)
        self.subscribe(subscriptions)
        # self.datasets.update_replicas(subscriptions, deletions)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info("Rocker Board took %s", str(td))

    def change_dataset_rankings(self, dataset_rankings):
        """
        Change the ranks from being the target number of replicas to being the
        change in number of replicas required to reach the goal
        """
        current_replicas = self.datasets.get_current_num_replicas()
        for dataset in current_replicas:
            dataset_rankings[dataset["name"]] -= dataset["n_replicas"]

    def replicate(self, dataset_rankings, site_rankings):
        """
        Balance system by creating new replicas based on popularity.
        Returns a list of (dataset_name, site_name) subscriptions.
        """
        subscriptions = list()
        subscribed_gb = 0
        sites_available_storage_gb = self.sites.get_all_available_storage()
        while (subscribed_gb < self.max_gb) and site_rankings:
            # work on a copy so per-dataset exclusions don't leak between rounds
            tmp_site_rankings = dict(site_rankings)
            dataset = max(dataset_rankings.iteritems(), key=operator.itemgetter(1))
            dataset_name = dataset[0]
            dataset_rank = dataset[1]
            if (not dataset_name) or (dataset_rank < 1):
                break
            size_gb = self.datasets.get_size(dataset_name)
            # exclude sites that already host the dataset, lack space, or rank <= 0
            unavailable_sites = set(self.datasets.get_sites(dataset_name))
            for site_name in tmp_site_rankings.keys():
                if (self.sites.get_available_storage(site_name) < size_gb) or (tmp_site_rankings[site_name] <= 0):
                    unavailable_sites.add(site_name)
            for site_name in unavailable_sites:
                tmp_site_rankings.pop(site_name, None)
            if not tmp_site_rankings:
                # nowhere to place this dataset -- drop it from consideration
                del dataset_rankings[dataset_name]
                continue
            site_name = weighted_choice(tmp_site_rankings)
            subscription = (dataset_name, site_name)
            subscriptions.append(subscription)
            subscribed_gb += size_gb
            sites_available_storage_gb[site_name] -= size_gb
            self.logger.info("%s : added", dataset_name)
            if sites_available_storage_gb[site_name] <= 0:
                # site is full -- stop considering it entirely
                del site_rankings[site_name]
            dataset_rankings[dataset_name] -= 1
        self.logger.info("Subscribed %dGB", subscribed_gb)
        return subscriptions

    def clean(self, dataset_rankings, site_rankings):
        """
        Suggest deletions based on dataset and site rankings.
        Returns a list of (dataset_name, site_name) deletions.
        """
        deletions = list()
        deleted_gb = 0
        while site_rankings:
            tmp_site_rankings = dict()
            dataset = min(dataset_rankings.iteritems(), key=operator.itemgetter(1))
            dataset_name = dataset[0]
            size_gb = self.datasets.get_size(dataset_name)
            # only sites that both host the dataset and are over the limit
            available_sites = set(self.datasets.get_sites(dataset_name))
            for site_name in available_sites:
                try:
                    tmp_site_rankings[site_name] = site_rankings[site_name]
                except KeyError:
                    continue
            if not tmp_site_rankings:
                del dataset_rankings[dataset_name]
                continue
            site_name = weighted_choice(tmp_site_rankings)
            deletion = (dataset_name, site_name)
            deletions.append(deletion)
            deleted_gb += size_gb
            site_rankings[site_name] -= size_gb
            dataset_rankings[dataset_name] += 1
            if site_rankings[site_name] <= 0:
                del site_rankings[site_name]
        self.logger.info("Deleted %dGB", deleted_gb)
        return deletions

    def subscribe(self, subscriptions):
        """
        Make subscriptions to phedex
        subscriptions = [(dataset_name, site_name), ...]
        """
        # group dataset names by destination site
        new_subscriptions = dict()
        for subscription in subscriptions:
            dataset_name = subscription[0]
            site_name = subscription[1]
            new_subscriptions.setdefault(site_name, list()).append(dataset_name)
        for site_name, dataset_names in new_subscriptions.items():
            data = self.phedex.generate_xml(dataset_names)
            comments = (
                "This dataset is predicted to become popular and has therefore been automatically replicated by cuadrnt"
            )
            api = "subscribe"
            params = [
                ("node", site_name),
                ("data", data),
                ("level", "dataset"),
                ("move", "n"),
                ("custodial", "n"),
                ("group", "AnalysisOps"),
                ("request_only", "n"),
                ("no_mail", "n"),
                ("comments", comments),
            ]
            json_data = self.phedex.fetch(api=api, params=params, method="post")
            # record the request in the bookkeeping database
            group_name = "AnalysisOps"
            request_id = 0
            request_type = 0
            try:
                request = json_data["phedex"]
                request_id = request["request_created"][0]["id"]
                request_created = timestamp_to_datetime(request["request_timestamp"])
            except Exception:
                # malformed/missing response -- best effort, skip this site
                self.logger.warning(
                    "Subscription did not succeed\n\tSite:%s\n\tDatasets: %s", str(site_name), str(dataset_names)
                )
                continue
            for dataset_name in dataset_names:
                coll = "dataset_rankings"
                date = datetime_day(datetime.datetime.utcnow())
                pipeline = list()
                match = {"$match": {"name": dataset_name, "date": date}}
                pipeline.append(match)
                project = {"$project": {"delta_rank": 1, "_id": 0}}
                pipeline.append(project)
                data = self.storage.get_data(coll=coll, pipeline=pipeline)
                dataset_rank = data[0]["delta_rank"]
                query = "INSERT INTO Requests(RequestId, RequestType, DatasetId, SiteId, GroupId, Rank, Date) SELECT %s, %s, Datasets.DatasetId, Sites.SiteId, Groups.GroupId, %s, %s FROM Datasets, Sites, Groups WHERE Datasets.DatasetName=%s AND Sites.SiteName=%s AND Groups.GroupName=%s"
                values = (request_id, request_type, dataset_rank, request_created, dataset_name, site_name, group_name)
                self.mit_db.query(query=query, values=values, cache=False)

    def delete(self, deletions):
        """
        Make deletions to phedex
        deletions = [(dataset_name, site_name), ...]
        """
        # group dataset names by site
        new_deletions = dict()
        for deletion in deletions:
            dataset_name = deletion[0]
            site_name = deletion[1]
            new_deletions.setdefault(site_name, list()).append(dataset_name)
        for site_name, dataset_names in new_deletions.items():
            data = self.phedex.generate_xml(dataset_names)
            comments = "This dataset is predicted to become less popular and has therefore been automatically deleted by cuadrnt"
            api = "delete"
            params = [
                ("node", site_name),
                ("data", data),
                ("level", "dataset"),
                ("rm_subscriptions", "y"),
                ("comments", comments),
            ]
            json_data = self.phedex.fetch(api=api, params=params, method="post")
            # record the request in the bookkeeping database
            group_name = "AnalysisOps"
            request_id = 0
            request_type = 1
            try:
                request = json_data["phedex"]
                request_id = request["request_created"][0]["id"]
                request_created = timestamp_to_datetime(request["request_timestamp"])
            except Exception:
                # malformed/missing response -- best effort, skip this site
                self.logger.warning(
                    "Deletion did not succeed\n\tSite:%s\n\tDatasets: %s", str(site_name), str(dataset_names)
                )
                continue
            for dataset_name in dataset_names:
                coll = "dataset_rankings"
                date = datetime_day(datetime.datetime.utcnow())
                pipeline = list()
                match = {"$match": {"name": dataset_name, "date": date}}
                pipeline.append(match)
                project = {"$project": {"delta_rank": 1, "_id": 0}}
                pipeline.append(project)
                data = self.storage.get_data(coll=coll, pipeline=pipeline)
                dataset_rank = data[0]["delta_rank"]
                query = "INSERT INTO Requests(RequestId, RequestType, DatasetId, SiteId, GroupId, Rank, Date) SELECT %s, %s, Datasets.DatasetId, Sites.SiteId, Groups.GroupId, %s, %s FROM Datasets, Sites, Groups WHERE Datasets.DatasetName=%s AND Sites.SiteName=%s AND Groups.GroupName=%s"
                values = (request_id, request_type, dataset_rank, request_created, dataset_name, site_name, group_name)
                self.mit_db.query(query=query, values=values, cache=False)
Example #16
0
 def setUp(self):
     "Set up for test"
     # load the test configuration and start each test from an empty database
     self.config = get_config(path=opt_path, file_name='test.cfg')
     self.storage = StorageManager(config=self.config)
     self.storage.drop_db()
Example #17
0
class StorageTests(unittest.TestCase):
    """
    A test class for service classes
    """
    def setUp(self):
        "Set up for test"
        self.config = get_config(path=opt_path, file_name='test.cfg')
        self.storage = StorageManager(config=self.config)
        self.storage.drop_db()

    def tearDown(self):
        "Clean up"
        coll = 'test'
        query = dict()
        self.storage.delete_data(coll=coll, query=query)
        pipeline = list()
        match = {'$match':{}}
        pipeline.append(match)
        expected = list()
        result = self.storage.get_data(coll=coll, pipeline=pipeline)
        self.assertEqual(result, expected)
        self.storage.drop_db()

    #@unittest.skip("Skip Test")
    def test_cache(self):
        "Test storage cache"
        print ""
        phedex = PhEDExService(config=self.config)
        api = 'data'
        params = {'level':'block', 'dataset':'/DoubleElectron/Run2012D-22Jan2013-v1/AOD'}
        expected = '/DoubleElectron/Run2012D-22Jan2013-v1/AOD'
        phedex.fetch(api=api, params=params, cache_only=True, force_cache=True)
        cache_data = self.storage.get_cache(coll='phedex', api=api, params=params)
        try:
            result = cache_data['phedex']['dbs'][0]['dataset'][0]['name']
        except KeyError:
            self.assertTrue(False)
        else:
            self.assertEqual(result, expected)

    #@unittest.skip("Skip Test")
    def test_data(self):
        "Test general collection manipulation functions"
        coll = 'test'
        # insert
        data = [{'foo':'bar_1'}, {'foo':'bar_2'}]
        self.storage.insert_data(coll=coll, data=data)
        # get
        pipeline = list()
        match = {'$match':{'foo':'bar_2'}}
        pipeline.append(match)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        expected = 'bar_2'
        result = data[0]['foo']
        self.assertEqual(result, expected)
        # update
        query = {'foo':'bar_1'}
        data = {'$set':{'foo':'bar_3'}}
        self.storage.update_data(coll=coll, query=query, data=data)
        pipeline = list()
        match = {'$match':{'foo':'bar_3'}}
        pipeline.append(match)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        expected = 'bar_3'
        result = data[0]['foo']
        self.assertEqual(result, expected)
        # last insert timestamp
        data = [{'foo':'bar_4'}]
        datetime_1 = datetime.utcnow().replace(microsecond=0)
        self.storage.insert_data(coll=coll, data=data)
        datetime_2 = self.storage.get_last_insert_time(coll)
        self.assertTrue(datetime_1 <= datetime_2)
Example #18
0
class GenericService(object):
    """
    Generic cuadrnt service class
    Shared properties between services:
        Contact a web service using a base url and some key:value parameters
        Services require a valid cert and key
        Want to cache results in a document-oriented database
    """
    def __init__(self, config=None):
        """
        :param config: parsed configuration dictionary; defaults to empty
        """
        self.logger = logging.getLogger(__name__)
        # None sentinel instead of `config=dict()`: a mutable default argument
        # would be shared by every instance constructed without a config
        self.config = config if config is not None else dict()
        self.storage = StorageManager(self.config)
        self.SERVICE = 'generic'    # overridden by concrete subclasses
        self.TARGET_URL = ''        # overridden by concrete subclasses

    def fetch(self,
              api,
              params=None,
              method='get',
              secure=True,
              cache=True,
              cache_only=False,
              force_cache=False):
        """
        Get data from url using parameters params
        If param cache is true update cache on cache miss
        If param cache_only is true just update the cache, don't return any data.
            Use this parameter to spawn external thread to update cache in background

        :param api: API endpoint name appended to TARGET_URL
        :param params: request parameters; defaults to empty (None sentinel
            avoids a shared mutable default)
        :param method: HTTP method for secure requests
        :param secure: use cert/key authenticated transport when True
        :param cache: consult/update the document cache when True
        :param cache_only: only refresh the cache, return nothing
        :param force_cache: skip the cache lookup and refetch unconditionally
        :returns: dict of JSON data, or None when cache_only is True
        """
        params = params if params is not None else dict()
        if not cache:
            return self._fetch_remote(api, params, method, secure)
        json_data = dict()
        if not force_cache:
            json_data = self.storage.get_cache(self.SERVICE, api, params)
        if not json_data:
            # cache miss (or forced refresh): go to the service and store result
            json_data = self._fetch_remote(api, params, method, secure)
            self.storage.insert_cache(self.SERVICE, api, params, json_data)
        if not cache_only:
            return json_data

    def _fetch_remote(self, api, params, method, secure):
        """Contact the web service directly and normalize the reply to a dict."""
        if secure:
            json_data = get_secure_data(target_url=self.TARGET_URL,
                                        api=api,
                                        params=params,
                                        method=method)
        else:
            json_data = get_data(target_url=self.TARGET_URL,
                                 api=api,
                                 file_=params)
        if not isinstance(json_data, dict):
            # some endpoints return bare lists/strings; wrap for uniform caching
            json_data = {'data': json_data}
        return json_data
Example #19
0
 def __init__(self, config=None):
     """
     Initialize the generic service.

     :param config: parsed configuration dictionary; defaults to empty
     """
     self.logger = logging.getLogger(__name__)
     # None sentinel instead of `config=dict()`: a mutable default argument
     # would be shared by every call that omits the parameter
     self.config = config if config is not None else dict()
     self.storage = StorageManager(self.config)
     self.SERVICE = 'generic'    # overridden by concrete subclasses
     self.TARGET_URL = ''        # overridden by concrete subclasses
Example #20
0
class RockerBoard(object):
    """
    RockerBoard is a system balancing algorithm using popularity metrics to predict popularity
    and make appropriate replications to keep the system balanced
    """
    def __init__(self, config=None):
        """
        :param config: parsed configuration dictionary; defaults to empty
        """
        self.logger = logging.getLogger(__name__)
        # None sentinel instead of a mutable default argument
        self.config = config if config is not None else dict()
        self.phedex = PhEDExService(self.config)
        self.mit_db = MITDBService(self.config)
        self.datasets = DatasetManager(self.config)
        self.sites = SiteManager(self.config)
        self.popularity = PopularityManager(self.config)
        self.storage = StorageManager(self.config)
        self.rankings = Ranker(self.config)
        self.max_gb = int(self.config['rocker_board']['max_gb'])
        self.csv_data = list()

    def start(self, date=None):
        """
        Begin Rocker Board Algorithm

        :param date: day to balance for; defaults to the current UTC day at
            call time. The previous default `datetime_day(datetime.datetime.utcnow())`
            was evaluated once at import, freezing the date in long-running processes.
        """
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        t1 = datetime.datetime.utcnow()
        # Get goals
        dataset_rankings = self.rankings.get_dataset_rankings(date)
        site_rankings = self.rankings.get_site_rankings(date)
        self.change_dataset_rankings(dataset_rankings)
        subscriptions = self.replicate(dataset_rankings, site_rankings)
        self.logger.info('SUBSCRIPTIONS')
        for subscription in subscriptions:
            self.logger.info('site: %s\tdataset: %s', subscription[1], subscription[0])
        # self.subscribe(subscriptions)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Rocker Board took %s', str(td))

    def change_dataset_rankings(self, dataset_rankings):
        """
        Change the ranks from being the target number of replicas to being the
        change in number of replicas required to reach the goal

        :param dataset_rankings: dict dataset name -> target replicas, mutated in place
        """
        current_replicas = self.datasets.get_current_num_replicas()
        for dataset in current_replicas:
            dataset_rankings[dataset['name']] -= dataset['n_replicas']

    def replicate(self, dataset_rankings, site_rankings):
        """
        Balance system by creating new replicas based on popularity

        :param dataset_rankings: dict dataset name -> desired replica delta (mutated)
        :param site_rankings: dict site name -> ranking weight (mutated)
        :returns: list of (dataset_name, site_name) subscription tuples
        """
        subscriptions = list()
        subscribed_gb = 0
        sites_available_storage_gb = self.sites.get_all_available_storage()
        while (subscribed_gb < self.max_gb) and site_rankings:
            if not dataset_rankings:
                # nothing left to rank; max() would raise on an empty dict
                break
            # per-dataset working copy so sites can be excluded for this dataset only
            tmp_site_rankings = dict(site_rankings)
            dataset_name, dataset_rank = max(dataset_rankings.items(), key=operator.itemgetter(1))
            if (not dataset_name) or (dataset_rank < 1):
                break
            size_gb = self.datasets.get_size(dataset_name)
            # exclude sites that already host the dataset, lack space, or rank <= 0
            unavailable_sites = set(self.datasets.get_sites(dataset_name))
            for site_name in tmp_site_rankings.keys():
                if (self.sites.get_available_storage(site_name) < size_gb) or (tmp_site_rankings[site_name] <= 0):
                    unavailable_sites.add(site_name)
            for site_name in unavailable_sites:
                # pop with default replaces the old bare `except:` around del
                tmp_site_rankings.pop(site_name, None)
            if not tmp_site_rankings:
                # no eligible site for this dataset; drop it from consideration
                del dataset_rankings[dataset_name]
                continue
            site_name = weighted_choice(tmp_site_rankings)
            subscriptions.append((dataset_name, site_name))
            subscribed_gb += size_gb
            sites_available_storage_gb[site_name] -= size_gb
            self.logger.info('%s : added', dataset_name)
            if sites_available_storage_gb[site_name] <= 0:
                del site_rankings[site_name]
            dataset_rankings[dataset_name] -= 1
        self.logger.info('Subscribed %dGB', subscribed_gb)
        return subscriptions

    def subscribe(self, subscriptions):
        """
        Make subscriptions to phedex
        subscriptions = [(dataset_name, site_name), ...]
        """
        # group dataset names by destination site
        new_subscriptions = dict()
        for dataset_name, site_name in subscriptions:
            new_subscriptions.setdefault(site_name, list()).append(dataset_name)
        for site_name, dataset_names in new_subscriptions.items():
            data = self.phedex.generate_xml(dataset_names)
            comments = 'This dataset is predicted to become popular and has therefore been automatically replicated by cuadrnt'
            api = 'subscribe'
            params = [('node', site_name), ('data', data), ('level','dataset'), ('move', 'n'), ('custodial', 'n'), ('group', 'AnalysisOps'), ('request_only', 'n'), ('no_mail', 'n'), ('comments', comments)]
            json_data = self.phedex.fetch(api=api, params=params, method='post')
            # insert into db
            group_name = 'AnalysisOps'
            request_id = 0
            request_type = 0    # 0 == subscription request
            try:
                request = json_data['phedex']
                request_id = request['request_created'][0]['id']
                request_created = timestamp_to_datetime(request['request_timestamp'])
            except (KeyError, IndexError, TypeError):
                # response did not contain a created request; skip bookkeeping
                self.logger.warning('Subscription did not succeed\n\tSite:%s\n\tDatasets: %s', str(site_name), str(dataset_names))
                continue
            for dataset_name in dataset_names:
                coll = 'dataset_rankings'
                date = datetime_day(datetime.datetime.utcnow())
                pipeline = list()
                match = {'$match':{'name':dataset_name, 'date':date}}
                pipeline.append(match)
                project = {'$project':{'delta_rank':1, '_id':0}}
                pipeline.append(project)
                data = self.storage.get_data(coll=coll, pipeline=pipeline)
                dataset_rank = data[0]['delta_rank']
                query = "INSERT INTO Requests(RequestId, RequestType, DatasetId, SiteId, GroupId, Rank, Date) SELECT %s, %s, Datasets.DatasetId, Sites.SiteId, Groups.GroupId, %s, %s FROM Datasets, Sites, Groups WHERE Datasets.DatasetName=%s AND Sites.SiteName=%s AND Groups.GroupName=%s"
                values = (request_id, request_type, dataset_rank, request_created, dataset_name, site_name, group_name)
                self.mit_db.query(query=query, values=values, cache=False)
Example #21
0
class DatasetManager(object):
    """
    Handle all dataset related data
    Keeps the `dataset_data` collection in sync with PhEDEx/DBS and answers
    size/replica/feature queries about datasets
    """
    def __init__(self, config=None):
        """
        :param config: parsed configuration dictionary; defaults to empty
        """
        self.logger = logging.getLogger(__name__)
        # None sentinel instead of `config=dict()`: a mutable default argument
        # would be shared by every call that omits the parameter
        self.config = config if config is not None else dict()
        self.phedex = PhEDExService(self.config)
        self.dbs = DBSService(self.config)
        self.storage = StorageManager(self.config)
        self.sites = SiteManager(self.config)
        self.valid_tiers = self.config['tools']['valid_tiers'].split(',')
        self.MAX_THREADS = int(self.config['threading']['max_threads'])

    def initiate_db(self):
        """
        Initiate dataset data in database
        Get general data and popularity data from beginning
        """
        # worker threads consume (dataset_data, count) tuples from the queue
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_dataset_data, args=(i, q))
            worker.daemon = True
            worker.start()
        active_sites = self.sites.get_active_sites()
        api = 'blockreplicas'
        params = [('node', active_sites), ('create_since', 0.0), ('complete', 'y'), ('dist_complete', 'y'), ('group', 'AnalysisOps'), ('show_dataset', 'y')]
        t1 = datetime.datetime.utcnow()
        phedex_data = self.phedex.fetch(api=api, params=params)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Call to PhEDEx took %s', str(td))
        count = 1
        t1 = datetime.datetime.utcnow()
        for dataset_data in get_json(get_json(phedex_data, 'phedex'), 'dataset'):
            q.put((dataset_data, count))
            count += 1
        q.join()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Inserting dataset data took %s', str(td))
        self.logger.info('Done inserting datasets into DB')

    def update_db(self):
        """
        Get datasets currently in AnalysisOps and compare to database
        Deactivate removed datasets and insert new
        Update replicas
        """
        # get all datasets in database
        dataset_names = self.get_db_datasets()
        dataset_names = set(dataset_names)
        # get all active sites, only fetch replicas from these
        active_sites = self.sites.get_active_sites()
        api = 'blockreplicas'
        params = [('node', active_sites), ('create_since', 0.0), ('complete', 'y'), ('group', 'AnalysisOps'), ('show_dataset', 'y')]
        t1 = datetime.datetime.utcnow()
        phedex_data = self.phedex.fetch(api=api, params=params)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Call to PhEDEx took %s', str(td))
        current_datasets = set()
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_dataset_data, args=(i, q))
            worker.daemon = True
            worker.start()
        count = 1
        t1 = datetime.datetime.utcnow()
        for dataset_data in get_json(get_json(phedex_data, 'phedex'), 'dataset'):
            dataset_name = get_json(dataset_data, 'name')
            current_datasets.add(dataset_name)
            if dataset_name not in dataset_names:
                # this is a new dataset which need to be inserted into the database
                q.put((dataset_data, count))
                count += 1
            else:
                # update replicas
                replicas = self.get_replicas(dataset_data)
                coll = 'dataset_data'
                query = {'name':dataset_name}
                data = {'$set':{'replicas':replicas}}
                self.storage.update_data(coll=coll, query=query, data=data)
        q.join()
        # anything in the DB but no longer reported by PhEDEx is removed
        deprecated_datasets = dataset_names - current_datasets
        for dataset_name in deprecated_datasets:
            self.remove_dataset(dataset_name)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Updating dataset data took %s', str(td))
        self.logger.info('Done updating datasets in DB')

    def insert_dataset_data(self, i, q):
        """
        Worker loop: insert a new dataset into the database and initiate all data

        :param i: worker index, for identification only
        :param q: queue of (dataset_data, count) tuples
        """
        while True:
            dataset_data, count = q.get()
            self.logger.debug('Inserting dataset number %d', count)
            dataset_name = get_json(dataset_data, 'name')
            coll = 'dataset_data'
            query = {'name':dataset_name}
            data = {'$set':{'name':dataset_name}}
            self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
            try:
                self.insert_phedex_data(dataset_name)
                self.insert_dbs_data(dataset_name)
                replicas = self.get_replicas(dataset_data)
                data = {'$set':{'name':dataset_name, 'replicas':replicas}}
                self.storage.update_data(coll=coll, query=query, data=data)
            except Exception:
                # roll back the partial insert; the old bare `except:` would also
                # have swallowed KeyboardInterrupt/SystemExit and hid the reason
                self.logger.warning('Failed to initiate dataset %s, removing partial data', dataset_name)
                self.storage.delete_data(coll=coll, query=query)
            q.task_done()

    def insert_phedex_data(self, dataset_name):
        """
        Fetch phedex data about dataset and insert into database
        """
        api = 'data'
        params = {'dataset':dataset_name, 'level':'block', 'create_since':0.0}
        phedex_data = self.phedex.fetch(api=api, params=params)
        size_bytes = 0
        n_files = 0
        dataset_data = get_json(get_json(get_json(phedex_data, 'phedex'), 'dbs')[0],'dataset')[0]
        # dataset size/file count is the sum over all of its blocks
        for block_data in get_json(dataset_data, 'block'):
            size_bytes += get_json(block_data, 'bytes')
            n_files += get_json(block_data, 'files')
        coll = 'dataset_data'
        query = {'name':dataset_name}
        data = {'$set':{'size_bytes':size_bytes, 'n_files':n_files}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=False)

    def insert_dbs_data(self, dataset_name):
        """
        Fetch dbs data about dataset and insert into database
        """
        api = 'datasets'
        params = {'dataset':dataset_name, 'detail':True, 'dataset_access_type':'*'}
        dbs_data = self.dbs.fetch(api=api, params=params)
        dataset_data = get_json(dbs_data, 'data')[0]
        ds_name = get_json(dataset_data, 'primary_ds_name')
        physics_group = get_json(dataset_data, 'physics_group_name')
        data_tier = get_json(dataset_data, 'data_tier_name')
        creation_date = datetime_day(timestamp_to_datetime(get_json(dataset_data, 'creation_date')))
        ds_type = get_json(dataset_data, 'primary_ds_type')
        coll = 'dataset_data'
        query = {'name':dataset_name}
        data = {'$set':{'ds_name':ds_name, 'physics_group':physics_group, 'data_tier':data_tier, 'creation_date':creation_date, 'ds_type':ds_type}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=False)

    def get_replicas(self, dataset_data):
        """
        Get all sites holding a complete replica of the dataset

        :param dataset_data: PhEDEx dataset record including block/replica info
        :returns: list of site names where every file of the dataset is present
        """
        replicas_check = dict()
        dataset_name = get_json(dataset_data, 'name')
        for block_data in get_json(dataset_data, 'block'):
            for replica_data in get_json(block_data, 'replica'):
                node = get_json(replica_data, 'node')
                # accumulate per-site file counts; dict.get replaces the old
                # bare try/except around `+=`
                replicas_check[node] = replicas_check.get(node, 0) + get_json(replica_data, 'files')
        n_files = self.get_n_files(dataset_name)
        # a site only counts as a replica if it holds all files of the dataset
        return [site for site, site_files in replicas_check.items() if site_files == n_files]

    def get_db_datasets(self):
        """
        Get all datasets currently in database
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'data_tier': {'$in':self.valid_tiers}}}
        pipeline.append(match)
        project = {'$project':{'name':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        dataset_names = [dataset_data['name'] for dataset_data in data]
        #self.logger.info('%d datasets present in database', len(dataset_names))
        return dataset_names

    def get_removed_db_datasets(self):
        """
        Get all datasets currently in database with more than one replica
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'data_tier': {'$in':self.valid_tiers}}}
        pipeline.append(match)
        group = {'$group':{'_id':'$name', 'n_replicas':{'$first':{'$size':'$replicas'}}}}
        pipeline.append(group)
        project = {'$project':{'name':'$_id', 'n_replicas':1, '_id':0}}
        pipeline.append(project)
        match = {'$match': {'n_replicas':{'$gt':1}}}
        pipeline.append(match)
        project = {'$project':{'name':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        dataset_names = [dataset_data['name'] for dataset_data in data]
        #self.logger.info('%d valid datasets present in database', len(dataset_names))
        return dataset_names

    def remove_dataset(self, dataset_name):
        """
        Remove dataset from database
        """
        coll = 'dataset_data'
        query = {'name':dataset_name}
        self.storage.delete_data(coll=coll, query=query)

    def get_dataset_features(self, dataset_name):
        """
        Get dataset features for dataset from db

        :returns: dict with dataset_name, size_gb, n_files, physics_group,
            ds_type and data_tier fields
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'name':dataset_name}}
        pipeline.append(match)
        project = {'$project':{'dataset_name':'$name', 'size_gb':{'$multiply':['$size_bytes', 0.000000001]}, 'n_files':1, 'physics_group':1, 'ds_type':1, 'data_tier':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return data[0]

    def get_n_files(self, dataset_name):
        """
        Get the number of files in the dataset
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'name':dataset_name}}
        pipeline.append(match)
        project = {'$project':{'n_files':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return data[0]['n_files']

    def get_data_tiers(self, dataset_names):
        """
        Get the data tiers of all datasets

        :returns: dict dataset name -> data tier
        """
        dataset_tiers = dict()
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'name':{'$in':dataset_names}}}
        pipeline.append(match)
        project = {'$project':{'data_tier':1, 'name':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        for dataset in data:
            dataset_tiers[dataset['name']] = dataset['data_tier']
        return dataset_tiers

    def get_sites(self, dataset_name):
        """
        Get all sites with a replica of the dataset
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'name':dataset_name}}
        pipeline.append(match)
        project = {'$project':{'replicas':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        site_names = data[0]['replicas']
        return site_names

    def get_size(self, dataset_name):
        """
        Get size in GB of dataset
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'name':dataset_name}}
        pipeline.append(match)
        project = {'$project':{'size_bytes':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        size_gb = float(data[0]['size_bytes'])/10**9
        return size_gb

    def get_current_num_replicas(self):
        """
        Get the current number of replicas for all datasets

        :returns: list of {'name': ..., 'n_replicas': ...} dicts
        """
        datasets = self.get_db_datasets()
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name': {'$in':datasets}}}
        pipeline.append(match)
        group = {'$group':{'_id':'$name', 'n_replicas':{'$first':{'$size':'$replicas'}}}}
        pipeline.append(group)
        project = {'$project':{'name':'$_id', 'n_replicas':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return data

    def get_num_replicas(self, dataset_name):
        """
        Get the current number of replicas for one dataset
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name': dataset_name}}
        pipeline.append(match)
        group = {'$group':{'_id':'$name', 'n_replicas':{'$first':{'$size':'$replicas'}}}}
        pipeline.append(group)
        project = {'$project':{'n_replicas':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return data[0]['n_replicas']

    def update_replicas(self, subscriptions, deletions):
        """
        Manually update the replicas based on subscriptions and deletions

        :param subscriptions: iterable of (dataset_name, site_name) additions
        :param deletions: iterable of (dataset_name, site_name) removals
        """
        coll = 'dataset_data'
        for subscription in subscriptions:
            dataset_name = subscription[0]
            site_name = subscription[1]
            query = {'name':dataset_name}
            data = {'$push':{'replicas':site_name}}
            self.storage.update_data(coll=coll, query=query, data=data)

        for deletion in deletions:
            dataset_name = deletion[0]
            site_name = deletion[1]
            query = {'name':dataset_name}
            data = {'$pull':{'replicas':site_name}}
            self.storage.update_data(coll=coll, query=query, data=data)

    def get_total_size(self, dataset_name):
        """
        Get the total size in GB of all replicas of one dataset
        (single-replica size multiplied by the replica count)
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name':dataset_name}}
        pipeline.append(match)
        group = {'$group':{'_id':'$name', 'n_replicas':{'$first':{'$size':'$replicas'}}}}
        pipeline.append(group)
        project = {'$project':{'n_replicas':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        n_replicas = data[0]['n_replicas']
        size_gb = self.get_size(dataset_name)
        total_size = n_replicas * size_gb
        return total_size

    def get_all_dataset_size(self, dataset_names):
        """
        Get the replica-weighted size in TB of each given dataset

        :returns: dict dataset name -> size_tb * n_replicas
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name':{'$in':dataset_names}}}
        pipeline.append(match)
        group = {'$group':{'_id':'$name', 'size_bytes':{'$sum':'$size_bytes'}, 'n_replicas':{'$first':{'$size':'$replicas'}}}}
        pipeline.append(group)
        project = {'$project':{'size_bytes':1, 'n_replicas':1, '_id':1}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        sizes = dict()
        for dataset in data:
            # 10**12 bytes per TB
            sizes[dataset['_id']] = (float(dataset['size_bytes'])/10**12)*dataset['n_replicas']
        return sizes

    def get_all_site_size(self, site_names):
        """
        Get the replica-weighted size in TB of data hosted at each site

        :returns: dict site name -> total size (TB)
        """
        sites_sizes = dict()
        for site_name in site_names:
            # get all datasets with a replica at the site and how many replicas it has
            coll = 'dataset_data'
            pipeline = list()
            match = {'$match':{'replicas':site_name}}
            pipeline.append(match)
            project = {'$project':{'name':1, '_id':0}}
            pipeline.append(project)
            data = self.storage.get_data(coll=coll, pipeline=pipeline)
            dataset_names = [dataset_data['name'] for dataset_data in data]
            # NOTE(review): each dataset is weighted by its *global* replica
            # count, not by 1 for this site — confirm this is intended
            coll = 'dataset_data'
            pipeline = list()
            match = {'$match': {'name':{'$in':dataset_names}}}
            pipeline.append(match)
            group = {'$group':{'_id':'$name', 'size_bytes':{'$sum':'$size_bytes'}, 'n_replicas':{'$first':{'$size':'$replicas'}}}}
            pipeline.append(group)
            project = {'$project':{'size_bytes':1, 'n_replicas':1, '_id':1}}
            pipeline.append(project)
            data = self.storage.get_data(coll=coll, pipeline=pipeline)
            size = 0.0
            for dataset in data:
                size += (float(dataset['size_bytes'])/10**12)*dataset['n_replicas']
            sites_sizes[site_name] = size
        return sites_sizes

    def get_total_storage(self):
        """
        Get the total storage in GB used by multi-replica datasets in the system
        """
        datasets = self.get_removed_db_datasets()
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'name': {'$in':datasets}}}
        pipeline.append(match)
        group = {'$group':{'_id':'$name', 'size_bytes':{'$sum':'$size_bytes'}, 'n_replicas':{'$first':{'$size':'$replicas'}}}}
        pipeline.append(group)
        project = {'$project':{'name':'$_id', 'size_bytes':1, 'n_replicas':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        total_storage = 0.0
        for dataset in data:
            total_storage += (float(dataset['size_bytes'])/10**9)*dataset['n_replicas']
        return total_storage
Example #22
0
class Ranker(object):
    """
    Generic Ranking class

    Computes popularity-based rankings for datasets (integer ranks between
    1 and max_replicas) and for sites (attractiveness as replication
    targets), persisting results to the dataset_rankings and site_rankings
    collections.
    """
    def __init__(self, config=dict()):
        """
        Set up services and storage used for ranking.

        :param config: parsed configuration; must contain
            config['rocker_board']['max_replicas'] and
            config['threading']['max_threads']
        """
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.popularity = PopularityManager(self.config)
        self.storage = StorageManager(self.config)
        # upper bound on the rank assigned to any dataset
        self.max_replicas = int(config['rocker_board']['max_replicas'])
        self.MAX_THREADS = int(config['threading']['max_threads'])
        # dataset name -> raw popularity, filled by worker threads
        self.dataset_popularity = dict()

    def get_dataset_rankings(self,
                             date=datetime_day(datetime.datetime.utcnow())):
        """
        Generate dataset rankings for all datasets in the database.

        Popularity is collected in parallel by MAX_THREADS daemon worker
        threads, then normalized into integer ranks.

        :param date: day to rank for (NOTE: default is evaluated once, at
            import time)
        :returns: dict mapping dataset name -> integer rank
        """
        self.dataset_popularity = dict()
        dataset_names = self.datasets.get_db_datasets()
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.get_dataset_popularity,
                                      args=(q, ))
            worker.daemon = True
            worker.start()
        for dataset_name in dataset_names:
            q.put((dataset_name, date))
        q.join()
        dataset_rankings = self.normalize_popularity(date)
        return dataset_rankings

    def get_site_rankings(self, date=datetime_day(datetime.datetime.utcnow())):
        """
        Generate site rankings for all sites available for replication.

        rank = performance * has_free_storage / popularity, where
        has_free_storage is 1 when the site has available storage and 0
        otherwise; zero popularity yields rank 0. Results are upserted
        into the site_rankings collection.

        :param date: day to rank for
        :returns: dict mapping site name -> rank (float)
        """
        # get all sites which can be replicated to
        site_names = self.sites.get_available_sites()
        site_rankings = dict()
        for site_name in site_names:
            popularity = self.get_site_popularity(site_name, date)
            performance = self.sites.get_performance(site_name)
            # collapse available storage to a 0/1 gate: sites with no free
            # space get rank 0, all others compete on performance/popularity
            available_storage_tb = self.sites.get_available_storage(
                site_name) / 10**3
            if available_storage_tb <= 0:
                available_storage_tb = 0
            else:
                available_storage_tb = 1
            # calculate rank; a site with zero popularity gets rank 0
            try:
                rank = (performance * available_storage_tb) / popularity
            except ZeroDivisionError:
                rank = 0.0
            site_rankings[site_name] = rank
            # persist for later inspection
            coll = 'site_rankings'
            query = {'name': site_name, 'date': date}
            data = {
                '$set': {
                    'name': site_name,
                    'date': date,
                    'rank': rank,
                    'popularity': popularity
                }
            }
            self.storage.update_data(coll=coll,
                                     query=query,
                                     data=data,
                                     upsert=True)
        return site_rankings

    def get_dataset_popularity(self, q):
        """
        Worker loop: pull (dataset_name, date) tuples off the queue and
        record each dataset's average popularity in self.dataset_popularity.
        Runs forever; intended for daemon threads only.

        :param q: Queue of (dataset_name, date) work items
        """
        while True:
            dataset_name, date = q.get()
            popularity = self.popularity.get_average_popularity(
                dataset_name, date)
            self.dataset_popularity[dataset_name] = popularity
            q.task_done()

    def get_site_popularity(self,
                            site_name,
                            date=datetime_day(datetime.datetime.utcnow())):
        """
        Sum the stored popularity of all datasets with a replica at the site.

        :param site_name: site to evaluate
        :param date: day whose dataset_rankings entries to sum
        :returns: total popularity, 0.0 when no rankings match
        """
        # get all datasets with a replica at the site
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match': {'replicas': site_name}}
        pipeline.append(match)
        project = {'$project': {'name': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        dataset_names = [dataset_data['name'] for dataset_data in data]
        # sum the stored popularity of those datasets for the given date
        coll = 'dataset_rankings'
        pipeline = list()
        match = {'$match': {'date': date}}
        pipeline.append(match)
        match = {'$match': {'name': {'$in': dataset_names}}}
        pipeline.append(match)
        group = {
            '$group': {
                '_id': '$date',
                'total_popularity': {
                    '$sum': '$popularity'
                }
            }
        }
        pipeline.append(group)
        project = {'$project': {'total_popularity': 1, '_id': 0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            popularity = data[0]['total_popularity']
        except (IndexError, KeyError, TypeError):
            # no rankings stored for this date / dataset set
            popularity = 0.0
        return popularity

    def get_site_storage_rankings(self, subscriptions):
        """
        Return, per site, the amount (GB) the site would be over the soft
        limit after the proposed subscriptions; sites that stay under the
        limit are removed from the result.

        :param subscriptions: iterable of (dataset_name, site_name) pairs
        :returns: dict mapping site name -> GB over the soft limit
        """
        site_rankings = dict()
        available_sites = self.sites.get_available_sites()
        for site_name in available_sites:
            site_rankings[site_name] = self.sites.get_over_soft_limit(
                site_name)
        # account for the data the new subscriptions would add
        for subscription in subscriptions:
            site_rankings[subscription[1]] += self.datasets.get_size(
                subscription[0])
        for site_name in available_sites:
            if site_rankings[site_name] < 0:
                del site_rankings[site_name]
        return site_rankings

    def normalize_popularity(self, date):
        """
        Linearly rescale raw popularity values into integer ranks between
        1 and max_replicas and persist them to dataset_rankings.

        The map n*x + m sends min_pop to exactly 1.
        NOTE(review): the maximum only maps to max_replicas when
        min_pop == 0 -- confirm this is the intended scaling.

        :param date: day the rankings are stored under
        :returns: dict mapping dataset name -> integer rank
        """
        dataset_rankings = dict()
        # nothing collected means nothing to rank (previously crashed on
        # max()/min() of an empty dict)
        if not self.dataset_popularity:
            return dataset_rankings
        max_pop = max(self.dataset_popularity.values())
        min_pop = min(self.dataset_popularity.values())
        if max_pop == 0:
            # degenerate case: every dataset has zero popularity; assign the
            # minimum rank to all instead of dividing by zero
            n = 0.0
            m = 1
        else:
            n = float(min_pop + (self.max_replicas - 1)) / max_pop
            m = 1 - n * min_pop
        for dataset_name, popularity in self.dataset_popularity.items():
            rank = int(n * popularity + m)
            dataset_rankings[dataset_name] = rank
            coll = 'dataset_rankings'
            # fixed: original read "query = data = {...}", binding both names
            query = {'name': dataset_name, 'date': date}
            data = {
                '$set': {
                    'name': dataset_name,
                    'date': date,
                    'rank': rank,
                    'popularity': popularity
                }
            }
            self.storage.update_data(coll=coll,
                                     query=query,
                                     data=data,
                                     upsert=True)
        return dataset_rankings
Example #23
0
class SiteManager(object):
    """
    Keep track of site data

    Maintains the site_data collection: site status and quota fetched from
    IntelROCCS, CPU capacity sampled from CRAB, and derived storage and
    performance metrics.
    """
    def __init__(self, config=dict()):
        """
        Set up services and limits.

        :param config: parsed configuration; must contain
            config['rocker_board']['soft_limit'] and ['hard_limit']
        """
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.intelroccs = IntelROCCSService(self.config)
        self.crab = CRABService(self.config)
        self.storage = StorageManager(self.config)
        # fractions of a site's quota: soft_limit is the lower threshold,
        # hard_limit the ceiling used for available-storage computation
        self.soft_limit = float(self.config['rocker_board']['soft_limit'])
        self.hard_limit = float(self.config['rocker_board']['hard_limit'])

    def initiate_db(self):
        """
        Initiate Site database
        Does exactly the same as update_db
        """
        self.update_db()

    def update_db(self):
        """
        Fetch general data about all sites from the IntelROCCS Detox
        SitesInfo.txt endpoint and upsert each row into the database.
        """
        api = 'Detox'
        file_ = 'SitesInfo.txt'
        intelroccs_data = self.intelroccs.fetch(api=api, params=file_, secure=False)
        for site_data in get_json(intelroccs_data, 'data'):
            self.insert_site_data(site_data)

    def insert_site_data(self, site_data):
        """
        Upsert one site's status and quota into the site_data collection.

        :param site_data: IntelROCCS row; index 0 is status, index 1 the
            quota in TB, index 4 the site name -- TODO confirm layout
            against SitesInfo.txt
        """
        coll = 'site_data'
        site_name = str(site_data[4])
        site_status = int(site_data[0])
        # quota arrives in TB; stored in GB
        site_quota = int(site_data[1])*10**3
        query = {'name':site_name}
        data = {'$set':{'name':site_name, 'status':site_status, 'quota_gb':site_quota}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=True)

    def update_cpu(self):
        """
        Sample the current CPU count per active site from CRAB and append
        it to the site's cpu_data time series, pruning samples older than
        30 days.
        """
        active_sites = self.get_active_sites()
        for site_name in active_sites:
            # drop samples older than 30 days
            date = datetime.datetime.utcnow() - datetime.timedelta(days=30)
            coll = 'site_data'
            query = {'name':site_name}
            data = {'$pull':{'cpu_data':{'date':{'$lt':date}}}}
            self.storage.update_data(coll=coll, query=query, data=data)
            # get CRAB data about the site's running slots
            query = 'GLIDEIN_CMSSite =?= "%s" && CPUs > 0' % (site_name)
            attributes = ['GLIDEIN_CMSSite', 'CPUs']
            ads = self.crab.fetch_cluster_ads(query, attributes=attributes)
            cpus = 0
            for ad in ads:
                cpus += ad['CPUs']
            # append the new sample
            date = datetime.datetime.utcnow()
            query = {'name':site_name}
            data = {'$push':{'cpu_data':{'date':date, 'cpus':cpus}}}
            self.storage.update_data(coll=coll, query=query, data=data)

    def get_active_sites(self):
        """
        Get all sites which are active (status 1 or 2), including sites
        which are not available for replication.

        :returns: list of site names
        """
        coll = 'site_data'
        pipeline = list()
        match = {'$match':{'status':{'$in':[1, 2]}}}
        pipeline.append(match)
        project = {'$project':{'name':1, '_id':0}}
        pipeline.append(project)
        sites_data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return [site_data['name'] for site_data in sites_data]

    def get_available_sites(self):
        """
        Get all sites which are available for replication (status 1).

        :returns: list of site names
        """
        coll = 'site_data'
        pipeline = list()
        match = {'$match':{'status':1}}
        pipeline.append(match)
        project = {'$project':{'name':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        return [site['name'] for site in data]

    def get_performance(self, site_name):
        """
        Performance metric: maximum CPUs over the last 30 days divided by
        the quota in TB; 0.0 when the quota is unknown or zero.

        NOTE(review): under Python 2, quota_gb/10**3 truncates for quotas
        below 1 TB; quotas written by insert_site_data are whole TB so
        this should not trigger -- confirm no other writer exists.
        """
        max_cpus = self.get_max_cpu(site_name)
        quota_gb = self.get_quota(site_name)
        try:
            performance = float(max_cpus)/float(quota_gb/10**3)
        except ZeroDivisionError:
            performance = 0.0
        return performance

    def get_available_storage(self, site_name):
        """
        Get total AnalysisOps storage (GB) still available at the site,
        i.e. hard_limit * quota minus current usage, floored at 0.
        """
        size_gb = self.get_data(site_name)
        quota_gb = self.get_quota(site_name)
        available_gb = max(0, (self.hard_limit*quota_gb) - size_gb)
        return available_gb

    def get_all_available_storage(self):
        """
        Get available storage for all sites available for replication.

        :returns: dict mapping site name -> available GB
        """
        available_storage = dict()
        available_sites = self.get_available_sites()
        for site_name in available_sites:
            available_storage[site_name] = self.get_available_storage(site_name)
        return available_storage

    def get_over_soft_limit(self, site_name):
        """
        Get the amount of GB a site is over the soft limit (lower limit).
        Negative when the site is under the limit.
        """
        size_gb = self.get_data(site_name)
        quota_gb = self.get_quota(site_name)
        over_gb = size_gb - (self.soft_limit*quota_gb)
        return over_gb

    def get_data(self, site_name):
        """
        Get the amount of data (GB) currently replicated at the site.

        :returns: size in GB, 0 when the site holds no datasets
        """
        coll = 'dataset_data'
        pipeline = list()
        match = {'$match':{'replicas':site_name}}
        pipeline.append(match)
        group = {'$group':{'_id':None, 'size_bytes':{'$sum':'$size_bytes'}}}
        pipeline.append(group)
        project = {'$project':{'size_bytes':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            size_gb = data[0]['size_bytes']/10**9
        except (IndexError, KeyError, TypeError):
            # no datasets at this site
            return 0
        return size_gb

    def get_quota(self, site_name):
        """
        Get the AnalysisOps quota (GB) for the site; 0 when unknown.
        """
        coll = 'site_data'
        pipeline = list()
        match = {'$match':{'name':site_name}}
        pipeline.append(match)
        project = {'$project':{'quota_gb':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            quota_gb = data[0]['quota_gb']
        except (IndexError, KeyError, TypeError):
            quota_gb = 0
        return quota_gb

    def get_max_cpu(self, site_name):
        """
        Get the maximum number of CPUs seen in the last 30 days at the
        site; 0 (with a warning) when no cpu_data is available.
        """
        coll = 'site_data'
        pipeline = list()
        match = {'$match':{'name':site_name}}
        pipeline.append(match)
        unwind = {'$unwind':'$cpu_data'}
        pipeline.append(unwind)
        group = {'$group':{'_id':'$name', 'max_cpus':{'$max':'$cpu_data.cpus'}}}
        pipeline.append(group)
        project = {'$project':{'max_cpus':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            max_cpus = data[0]['max_cpus']
        except (IndexError, KeyError, TypeError):
            self.logger.warning('Could not get site performance for %s', site_name)
            return 0
        return max_cpus
Example #24
0
class PopularityManager(object):
    """
    Generate popularity metrics for datasets and sites
    """
    def __init__(self, config=dict()):
        """
        Set up the services and storage used to collect popularity data.

        :param config: parsed configuration; must contain
            config['threading']['max_threads']
        """
        self.logger = logging.getLogger(__name__)
        self.config = config
        # external popularity database (Pop DB) service
        self.pop_db = PopDBService(self.config)
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.storage = StorageManager(self.config)
        # worker-thread count used when fetching per-date popularity data
        self.MAX_THREADS = int(config['threading']['max_threads'])

    def initiate_db(self):
        """
        Populate the popularity collection with the last 90 days of
        Pop DB data, fanning dates out to MAX_THREADS daemon workers.
        """
        work = Queue.Queue()
        for thread_id in range(self.MAX_THREADS):
            consumer = threading.Thread(target=self.insert_popularity_data,
                                        args=(thread_id, work))
            consumer.daemon = True
            consumer.start()
        start_date = datetime_day(datetime.datetime.utcnow() - datetime.timedelta(days=90))
        end_date = datetime_day(datetime.datetime.utcnow())
        # enqueue every day in the window and wait for the workers
        began = datetime.datetime.utcnow()
        for day in daterange(start_date, end_date):
            work.put(day)
        work.join()
        elapsed = datetime.datetime.utcnow() - began
        self.logger.info('Inserting Pop DB data took %s', str(elapsed))

    def update_db(self):
        """
        Fetch popularity data for all days newer than the latest entry in
        the database; falls back to initiate_db() when the collection is
        still empty.
        """
        # find the most recent date already stored
        coll = 'dataset_popularity'
        pipeline = list()
        sort = {'$sort':{'date':-1}}
        pipeline.append(sort)
        limit = {'$limit':1}
        pipeline.append(limit)
        project = {'$project':{'date':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            start_date = data[0]['date']
        except (IndexError, KeyError, TypeError):
            # empty collection: nothing to resume from, do a full load
            self.logger.warning('Popularity needs to be initiated')
            self.initiate_db()
            return
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_popularity_data, args=(i, q))
            worker.daemon = True
            worker.start()
        # resume from the day after the last stored date
        start_date = start_date + datetime.timedelta(days=1)
        end_date = datetime_day(datetime.datetime.utcnow())
        # fan the missing dates out to the worker threads
        t1 = datetime.datetime.utcnow()
        for date in daterange(start_date, end_date):
            q.put(date)
        q.join()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Updating Pop DB data took %s', str(td))

    def insert_popularity_data(self, i, q):
        """
        Worker loop: pull dates off the queue and upsert that day's
        per-dataset popularity numbers into the database.
        Runs forever; intended for daemon threads only.

        :param i: worker index (unused, kept for the thread signature)
        :param q: Queue of datetime work items
        """
        coll = 'dataset_popularity'
        while True:
            date = q.get()
            self.logger.info('Inserting date %s', datetime_to_string(date))
            # Pop DB summary for a single-day window
            day_str = datetime_to_string(date)
            params = {'sitename':'summary', 'tstart':day_str, 'tstop':day_str}
            json_data = self.pop_db.fetch(api='DSStatInTimeWindow/', params=params)
            for record in json_data['DATA']:
                name = record['COLLNAME']
                doc = {'name':name,
                       'date':date,
                       'n_accesses':record['NACC'],
                       'n_cpus':record['TOTCPU'],
                       'n_users':record['NUSERS']}
                self.storage.update_data(coll=coll,
                                         query={'name':name, 'date':date},
                                         data={'$set':doc},
                                         upsert=True)
            q.task_done()

    def get_average_popularity(self, dataset_name, date):
        """
        Average the log-scaled daily popularity of a dataset over the
        seven days preceding *date*.

        A day's popularity is log(n_accesses * n_cpus); days with no
        record, a malformed record, or a non-positive product (for which
        log is undefined) contribute 0.0.

        :param dataset_name: dataset to look up
        :param date: reference day; window is [date-7d, date-1d]
        :returns: mean of the seven daily values (numpy float)
        """
        start_date = date - datetime.timedelta(days=7)
        end_date = date - datetime.timedelta(days=1)
        coll = 'dataset_popularity'
        pipeline = list()
        match = {'$match':{'name':dataset_name}}
        pipeline.append(match)
        match = {'$match':{'date':{'$gte':start_date, '$lte':end_date}}}
        pipeline.append(match)
        # NOTE(review): no $sort stage -- result order is storage-defined,
        # so data[i] is not guaranteed to be day i of the window; confirm
        # whether ordering matters here
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        pops = list()
        for i in range(0, 7):
            try:
                pops.append(log(float(data[i]['n_accesses']*data[i]['n_cpus'])))
            except (IndexError, KeyError, TypeError, ValueError):
                # missing day, malformed record, or log of non-positive value
                pops.append(0.0)
        avg = np.mean(pops)
        return avg