Example #1
0
 def test_mit_db(self):
     "Test mit_db functions"
     # Call form of print: emits the same blank line under Python 2 and,
     # unlike the statement form, also parses under Python 3.
     print("")
     mit_db = MITDBService(config=self.config)
     # Parameterized lookup of a single site by name.
     query = "SELECT SiteName FROM Sites WHERE SiteName=%s"
     values = ['T2_US_Nebraska']
     expected = 'T2_US_Nebraska'
     json_data = mit_db.fetch(query=query, values=values, cache=False)
     # Fail with a readable message instead of an IndexError when no row comes back.
     self.assertTrue(json_data['data'], "query returned no rows for %s" % expected)
     # First column of the first returned row.
     result = json_data['data'][0][0]
     self.assertEqual(result, expected)
Example #2
0
 def test_mit_db(self):
     "Test mit_db functions"
     # Call form of print: emits the same blank line under Python 2 and,
     # unlike the statement form, also parses under Python 3.
     print("")
     mit_db = MITDBService(config=self.config)
     # Parameterized lookup of a single site by name.
     query = "SELECT SiteName FROM Sites WHERE SiteName=%s"
     values = ['T2_US_Nebraska']
     expected = 'T2_US_Nebraska'
     json_data = mit_db.fetch(query=query, values=values, cache=False)
     # Fail with a readable message instead of an IndexError when no row comes back.
     self.assertTrue(json_data['data'], "query returned no rows for %s" % expected)
     # First column of the first returned row.
     result = json_data['data'][0][0]
     self.assertEqual(result, expected)
Example #3
0
 def __init__(self, config=None):
     """
     Build the services and managers used by the algorithm from config.

     config: mapping passed unchanged to every service/manager. It must
     provide config['rocker_board']['max_gb'] (KeyError otherwise), which
     has to be convertible with int().
     """
     # A dict() default would be created once and shared by every call that
     # omits config; create a fresh one per instance instead.
     if config is None:
         config = dict()
     self.logger = logging.getLogger(__name__)
     self.config = config
     self.phedex = PhEDExService(self.config)
     self.mit_db = MITDBService(self.config)
     self.datasets = DatasetManager(self.config)
     self.sites = SiteManager(self.config)
     self.popularity = PopularityManager(self.config)
     self.storage = StorageManager(self.config)
     self.rankings = Ranker(self.config)
     self.max_gb = int(self.config['rocker_board']['max_gb'])
     self.csv_data = list()
Example #4
0
 def __init__(self, config=None):
     """
     Build the services and managers used by the algorithm from config.

     config: mapping passed unchanged to every service/manager. It must
     provide config["rocker_board"]["max_gb"] (KeyError otherwise), which
     has to be convertible with int().
     """
     # A dict() default would be created once and shared by every call that
     # omits config; create a fresh one per instance instead.
     if config is None:
         config = dict()
     self.logger = logging.getLogger(__name__)
     self.config = config
     self.phedex = PhEDExService(self.config)
     self.mit_db = MITDBService(self.config)
     self.datasets = DatasetManager(self.config)
     self.sites = SiteManager(self.config)
     self.popularity = PopularityManager(self.config)
     self.storage = StorageManager(self.config)
     self.rankings = Ranker(self.config)
     self.max_gb = int(self.config["rocker_board"]["max_gb"])
     self.csv_data = list()
Example #5
0
class RockerBoard(object):
    """
    RockerBoard is a system balancing algorithm using popularity metrics to predict popularity
    and make appropriate replications to keep the system balanced
    """

    def __init__(self, config=None):
        """
        Build the services and managers used by the algorithm from config.

        config: mapping passed unchanged to every service/manager. It must
        provide config["rocker_board"]["max_gb"] (KeyError otherwise), which
        has to be convertible with int().
        """
        # A dict() default would be created once and shared by every call that
        # omits config; create a fresh one per instance instead.
        if config is None:
            config = dict()
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.phedex = PhEDExService(self.config)
        self.mit_db = MITDBService(self.config)
        self.datasets = DatasetManager(self.config)
        self.sites = SiteManager(self.config)
        self.popularity = PopularityManager(self.config)
        self.storage = StorageManager(self.config)
        self.rankings = Ranker(self.config)
        self.max_gb = int(self.config["rocker_board"]["max_gb"])
        self.csv_data = list()

    def start(self, date=None):
        """
        Begin Rocker Board Algorithm

        date: day for which rankings are fetched. When omitted, the current
        UTC day is computed at call time (a default expression in the
        signature would be evaluated only once, when the class is defined).
        """
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        t1 = datetime.datetime.utcnow()
        # Get goals
        dataset_rankings = self.rankings.get_dataset_rankings(date)
        site_rankings = self.rankings.get_site_rankings(date)
        self.change_dataset_rankings(dataset_rankings)
        subscriptions = self.replicate(dataset_rankings, site_rankings)
        self.logger.info("SUBSCRIPTIONS")
        for subscription in subscriptions:
            self.logger.info("site: %s\tdataset: %s", subscription[1], subscription[0])
        # The deletion pass is currently disabled. When enabled it would run,
        # before subscribing:
        #     site_storage = self.rankings.get_site_storage_rankings(subscriptions)
        #     deletions = self.clean(dataset_rankings, site_storage)
        #     (log each deletion), self.delete(deletions)
        # and afterwards: self.datasets.update_replicas(subscriptions, deletions)
        self.subscribe(subscriptions)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info("Rocker Board took %s", str(td))

    def change_dataset_rankings(self, dataset_rankings):
        """
        Change the ranks from being the target number of replicas to being the
        change in number of replicas required to reach the goal

        dataset_rankings is modified in place. Raises KeyError if a dataset
        reported by get_current_num_replicas() has no entry in it.
        """
        current_replicas = self.datasets.get_current_num_replicas()
        for dataset in current_replicas:
            dataset_rankings[dataset["name"]] -= dataset["n_replicas"]

    def replicate(self, dataset_rankings, site_rankings):
        """
        Balance system by creating new replicas based on popularity

        Repeatedly takes the highest ranked dataset and picks a destination
        with weighted_choice() among the sites that may receive it, until
        max_gb has been reached, no site is left, no dataset is left, or the
        best remaining rank is below 1.

        Both arguments are modified in place: ranks are decremented, datasets
        without any eligible site are removed, and sites whose tracked
        storage drops to 0 or below are removed.

        Returns a list of (dataset_name, site_name) tuples.
        """
        subscriptions = list()
        subscribed_gb = 0
        sites_available_storage_gb = self.sites.get_all_available_storage()
        # Sites already chosen per dataset during this run, so the same
        # (dataset, site) pair is never emitted twice.
        planned_sites = dict()
        # dataset_rankings can be emptied by the `del` below; max() on an
        # empty sequence raises ValueError, so it is part of the loop guard.
        while (subscribed_gb < self.max_gb) and site_rankings and dataset_rankings:
            dataset_name, dataset_rank = max(dataset_rankings.items(), key=operator.itemgetter(1))
            if (not dataset_name) or (dataset_rank < 1):
                break
            size_gb = self.datasets.get_size(dataset_name)
            unavailable_sites = set(self.datasets.get_sites(dataset_name))
            unavailable_sites.update(planned_sites.get(dataset_name, ()))
            # NOTE(review): the fit check asks self.sites for the available
            # storage, while sites_available_storage_gb (the running total
            # updated below) is only used to retire full sites - confirm that
            # get_available_storage() reflects space claimed earlier in this run.
            for site_name, site_rank in site_rankings.items():
                if (self.sites.get_available_storage(site_name) < size_gb) or (site_rank <= 0):
                    unavailable_sites.add(site_name)
            tmp_site_rankings = dict(
                (site_name, site_rank)
                for site_name, site_rank in site_rankings.items()
                if site_name not in unavailable_sites
            )
            if not tmp_site_rankings:
                # Nowhere to put this dataset; drop it so the next best one is tried
                del dataset_rankings[dataset_name]
                continue
            site_name = weighted_choice(tmp_site_rankings)
            subscriptions.append((dataset_name, site_name))
            planned_sites.setdefault(dataset_name, set()).add(site_name)
            subscribed_gb += size_gb
            sites_available_storage_gb[site_name] -= size_gb
            self.logger.info("%s : added", dataset_name)
            if sites_available_storage_gb[site_name] <= 0:
                del site_rankings[site_name]
            dataset_rankings[dataset_name] -= 1
        self.logger.info("Subscribed %dGB", subscribed_gb)
        return subscriptions

    def clean(self, dataset_rankings, site_rankings):
        """
        Suggest deletions based on dataset and site rankings

        Repeatedly takes the lowest ranked dataset and picks, with
        weighted_choice(), one of the ranked sites that hold it. The chosen
        site's ranking is reduced by the dataset size and the site is removed
        once it reaches 0 or below; the dataset's rank is increased by 1.
        Both arguments are modified in place.

        Returns a list of (dataset_name, site_name) tuples.
        """
        deletions = list()
        deleted_gb = 0
        # min() on an empty sequence raises ValueError, so stop as soon as
        # every dataset has been dropped.
        while site_rankings and dataset_rankings:
            dataset_name = min(dataset_rankings.items(), key=operator.itemgetter(1))[0]
            size_gb = self.datasets.get_size(dataset_name)
            # Only sites that both hold the dataset and are still ranked qualify
            tmp_site_rankings = dict()
            for site_name in set(self.datasets.get_sites(dataset_name)):
                if site_name in site_rankings:
                    tmp_site_rankings[site_name] = site_rankings[site_name]
            if not tmp_site_rankings:
                del dataset_rankings[dataset_name]
                continue
            site_name = weighted_choice(tmp_site_rankings)
            deletions.append((dataset_name, site_name))
            deleted_gb += size_gb
            site_rankings[site_name] -= size_gb
            dataset_rankings[dataset_name] += 1
            if site_rankings[site_name] <= 0:
                del site_rankings[site_name]
        self.logger.info("Deleted %dGB", deleted_gb)
        return deletions

    def subscribe(self, subscriptions):
        """
        Make subscriptions to phedex
        subscriptions = [(dataset_name, site_name), ...]

        One "subscribe" request is posted per site; each dataset of an
        accepted request is then recorded in the Requests table with
        request type 0.
        """
        for site_name, dataset_names in self._group_by_site(subscriptions).items():
            data = self.phedex.generate_xml(dataset_names)
            comments = (
                "This dataset is predicted to become popular and has therefore been automatically replicated by cuadrnt"
            )
            params = [
                ("node", site_name),
                ("data", data),
                ("level", "dataset"),
                ("move", "n"),
                ("custodial", "n"),
                ("group", "AnalysisOps"),
                ("request_only", "n"),
                ("no_mail", "n"),
                ("comments", comments),
            ]
            json_data = self.phedex.fetch(api="subscribe", params=params, method="post")
            self._record_requests(
                json_data, 0, site_name, dataset_names, "Subscription did not succeed\n\tSite:%s\n\tDatasets: %s"
            )

    def delete(self, deletions):
        """
        Make deletions to phedex
        deletions = [(dataset_name, site_name), ...]

        One "delete" request is posted per site; each dataset of an accepted
        request is then recorded in the Requests table with request type 1.
        """
        for site_name, dataset_names in self._group_by_site(deletions).items():
            data = self.phedex.generate_xml(dataset_names)
            comments = "This dataset is predicted to become less popular and has therefore been automatically deleted by cuadrnt"
            params = [
                ("node", site_name),
                ("data", data),
                ("level", "dataset"),
                ("rm_subscriptions", "y"),
                ("comments", comments),
            ]
            json_data = self.phedex.fetch(api="delete", params=params, method="post")
            self._record_requests(
                json_data, 1, site_name, dataset_names, "Deletion did not succeed\n\tSite:%s\n\tDatasets: %s"
            )

    @staticmethod
    def _group_by_site(pairs):
        """
        Turn [(dataset_name, site_name), ...] into {site_name: [dataset_name, ...]}
        """
        grouped = dict()
        for pair in pairs:
            grouped.setdefault(pair[1], list()).append(pair[0])
        return grouped

    def _record_requests(self, json_data, request_type, site_name, dataset_names, failure_message):
        """
        Insert one Requests row per dataset for the request described by json_data

        If the request id or timestamp cannot be read from json_data, a
        warning built from failure_message (a %-format taking the site and the
        dataset list) is logged and nothing is inserted.
        Raises IndexError if no ranking document exists for a dataset today.
        """
        try:
            request = json_data["phedex"]
            request_id = request["request_created"][0]["id"]
            request_created = timestamp_to_datetime(request["request_timestamp"])
        except Exception:
            # Any response without a usable request is treated as a failed
            # request; unlike a bare except this lets KeyboardInterrupt and
            # SystemExit propagate.
            self.logger.warning(failure_message, str(site_name), str(dataset_names))
            return
        group_name = "AnalysisOps"
        query = "INSERT INTO Requests(RequestId, RequestType, DatasetId, SiteId, GroupId, Rank, Date) SELECT %s, %s, Datasets.DatasetId, Sites.SiteId, Groups.GroupId, %s, %s FROM Datasets, Sites, Groups WHERE Datasets.DatasetName=%s AND Sites.SiteName=%s AND Groups.GroupName=%s"
        for dataset_name in dataset_names:
            # Look up today's delta_rank of the dataset to store with the request
            date = datetime_day(datetime.datetime.utcnow())
            pipeline = [
                {"$match": {"name": dataset_name, "date": date}},
                {"$project": {"delta_rank": 1, "_id": 0}},
            ]
            rank_docs = self.storage.get_data(coll="dataset_rankings", pipeline=pipeline)
            dataset_rank = rank_docs[0]["delta_rank"]
            values = (request_id, request_type, dataset_rank, request_created, dataset_name, site_name, group_name)
            self.mit_db.query(query=query, values=values, cache=False)
Example #6
0
class RockerBoard(object):
    """
    RockerBoard is a system balancing algorithm using popularity metrics to predict popularity
    and make appropriate replications to keep the system balanced
    """
    def __init__(self, config=None):
        """
        Build the services and managers used by the algorithm from config.

        config: mapping passed unchanged to every service/manager. It must
        provide config['rocker_board']['max_gb'] (KeyError otherwise), which
        has to be convertible with int().
        """
        # A dict() default would be created once and shared by every call that
        # omits config; create a fresh one per instance instead.
        if config is None:
            config = dict()
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.phedex = PhEDExService(self.config)
        self.mit_db = MITDBService(self.config)
        self.datasets = DatasetManager(self.config)
        self.sites = SiteManager(self.config)
        self.popularity = PopularityManager(self.config)
        self.storage = StorageManager(self.config)
        self.rankings = Ranker(self.config)
        self.max_gb = int(self.config['rocker_board']['max_gb'])
        self.csv_data = list()

    def start(self, date=None):
        """
        Begin Rocker Board Algorithm

        date: day for which rankings are fetched. When omitted, the current
        UTC day is computed at call time (a default expression in the
        signature would be evaluated only once, when the class is defined).

        The chosen subscriptions are only logged here; the call that would
        submit them is commented out below.
        """
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        t1 = datetime.datetime.utcnow()
        # Get goals
        dataset_rankings = self.rankings.get_dataset_rankings(date)
        site_rankings = self.rankings.get_site_rankings(date)
        self.change_dataset_rankings(dataset_rankings)
        subscriptions = self.replicate(dataset_rankings, site_rankings)
        self.logger.info('SUBSCRIPTIONS')
        for subscription in subscriptions:
            self.logger.info('site: %s\tdataset: %s', subscription[1], subscription[0])
        # self.subscribe(subscriptions)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Rocker Board took %s', str(td))

    def change_dataset_rankings(self, dataset_rankings):
        """
        Change the ranks from being the target number of replicas to being the
        change in number of replicas required to reach the goal

        dataset_rankings is modified in place. Raises KeyError if a dataset
        reported by get_current_num_replicas() has no entry in it.
        """
        current_replicas = self.datasets.get_current_num_replicas()
        for dataset in current_replicas:
            dataset_rankings[dataset['name']] -= dataset['n_replicas']

    def replicate(self, dataset_rankings, site_rankings):
        """
        Balance system by creating new replicas based on popularity

        Repeatedly takes the highest ranked dataset and picks a destination
        with weighted_choice() among the sites that may receive it, until
        max_gb has been reached, no site is left, no dataset is left, or the
        best remaining rank is below 1.

        Both arguments are modified in place: ranks are decremented, datasets
        without any eligible site are removed, and sites whose tracked
        storage drops to 0 or below are removed.

        Returns a list of (dataset_name, site_name) tuples.
        """
        subscriptions = list()
        subscribed_gb = 0
        sites_available_storage_gb = self.sites.get_all_available_storage()
        # Sites already chosen per dataset during this run, so the same
        # (dataset, site) pair is never emitted twice.
        planned_sites = dict()
        # dataset_rankings can be emptied by the `del` below; max() on an
        # empty sequence raises ValueError, so it is part of the loop guard.
        while (subscribed_gb < self.max_gb) and site_rankings and dataset_rankings:
            dataset_name, dataset_rank = max(dataset_rankings.items(), key=operator.itemgetter(1))
            if (not dataset_name) or (dataset_rank < 1):
                break
            size_gb = self.datasets.get_size(dataset_name)
            unavailable_sites = set(self.datasets.get_sites(dataset_name))
            unavailable_sites.update(planned_sites.get(dataset_name, ()))
            # NOTE(review): the fit check asks self.sites for the available
            # storage, while sites_available_storage_gb (the running total
            # updated below) is only used to retire full sites - confirm that
            # get_available_storage() reflects space claimed earlier in this run.
            for site_name, site_rank in site_rankings.items():
                if (self.sites.get_available_storage(site_name) < size_gb) or (site_rank <= 0):
                    unavailable_sites.add(site_name)
            tmp_site_rankings = dict(
                (site_name, site_rank)
                for site_name, site_rank in site_rankings.items()
                if site_name not in unavailable_sites
            )
            if not tmp_site_rankings:
                # Nowhere to put this dataset; drop it so the next best one is tried
                del dataset_rankings[dataset_name]
                continue
            site_name = weighted_choice(tmp_site_rankings)
            subscriptions.append((dataset_name, site_name))
            planned_sites.setdefault(dataset_name, set()).add(site_name)
            subscribed_gb += size_gb
            sites_available_storage_gb[site_name] -= size_gb
            self.logger.info('%s : added', dataset_name)
            if sites_available_storage_gb[site_name] <= 0:
                del site_rankings[site_name]
            dataset_rankings[dataset_name] -= 1
        self.logger.info('Subscribed %dGB', subscribed_gb)
        return subscriptions

    def subscribe(self, subscriptions):
        """
        Make subscriptions to phedex
        subscriptions = [(dataset_name, site_name), ...]

        One 'subscribe' request is posted per site. When the request id or
        timestamp cannot be read from the response, a warning is logged and
        the site is skipped; otherwise each dataset is recorded in the
        Requests table with request type 0.
        Raises IndexError if no ranking document exists for a dataset today.
        """
        # Group the datasets by destination site
        new_subscriptions = dict()
        for subscription in subscriptions:
            new_subscriptions.setdefault(subscription[1], list()).append(subscription[0])
        group_name = 'AnalysisOps'
        request_type = 0
        query = "INSERT INTO Requests(RequestId, RequestType, DatasetId, SiteId, GroupId, Rank, Date) SELECT %s, %s, Datasets.DatasetId, Sites.SiteId, Groups.GroupId, %s, %s FROM Datasets, Sites, Groups WHERE Datasets.DatasetName=%s AND Sites.SiteName=%s AND Groups.GroupName=%s"
        for site_name, dataset_names in new_subscriptions.items():
            data = self.phedex.generate_xml(dataset_names)
            comments = 'This dataset is predicted to become popular and has therefore been automatically replicated by cuadrnt'
            params = [
                ('node', site_name),
                ('data', data),
                ('level', 'dataset'),
                ('move', 'n'),
                ('custodial', 'n'),
                ('group', 'AnalysisOps'),
                ('request_only', 'n'),
                ('no_mail', 'n'),
                ('comments', comments),
            ]
            json_data = self.phedex.fetch(api='subscribe', params=params, method='post')
            # insert into db
            try:
                request = json_data['phedex']
                request_id = request['request_created'][0]['id']
                request_created = timestamp_to_datetime(request['request_timestamp'])
            except Exception:
                # Any response without a usable request is treated as a failed
                # subscription; unlike a bare except this lets
                # KeyboardInterrupt and SystemExit propagate.
                self.logger.warning('Subscription did not succeed\n\tSite:%s\n\tDatasets: %s', str(site_name), str(dataset_names))
                continue
            for dataset_name in dataset_names:
                # Look up today's delta_rank of the dataset to store with the request
                date = datetime_day(datetime.datetime.utcnow())
                pipeline = [
                    {'$match': {'name': dataset_name, 'date': date}},
                    {'$project': {'delta_rank': 1, '_id': 0}},
                ]
                rank_docs = self.storage.get_data(coll='dataset_rankings', pipeline=pipeline)
                dataset_rank = rank_docs[0]['delta_rank']
                values = (request_id, request_type, dataset_rank, request_created, dataset_name, site_name, group_name)
                self.mit_db.query(query=query, values=values, cache=False)