Example #1
 def initiate_db(self):
     """
     Initiate dataset data in database
     Get general data and popularity data from the beginning
     """
     q = Queue.Queue()
     for i in range(self.MAX_THREADS):
         worker = threading.Thread(target=self.insert_dataset_data, args=(i, q))
         worker.daemon = True
         worker.start()
     active_sites = self.sites.get_active_sites()
     api = 'blockreplicas'
     params = [('node', active_sites), ('create_since', 0.0), ('complete', 'y'), ('dist_complete', 'y'), ('group', 'AnalysisOps'), ('show_dataset', 'y')]
     t1 = datetime.datetime.utcnow()
     phedex_data = self.phedex.fetch(api=api, params=params)
     t2 = datetime.datetime.utcnow()
     td = t2 - t1
     self.logger.info('Call to PhEDEx took %s', str(td))
     count = 1
     t1 = datetime.datetime.utcnow()
     for dataset_data in get_json(get_json(phedex_data, 'phedex'), 'dataset'):
         q.put((dataset_data, count))
         count += 1
     q.join()
     t2 = datetime.datetime.utcnow()
     td = t2 - t1
     self.logger.info('Inserting dataset data took %s', str(td))
     self.logger.info('Done inserting datasets into DB')
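Example #1 (and update_db in Example #4 below) uses the standard producer/consumer pattern: daemon worker threads block on a shared queue, the producer enqueues items, and q.join() waits until every item has been acknowledged with task_done(). A minimal self-contained sketch of that pattern, written for Python 3 (the examples above target Python 2, where the queue module is named Queue):

import queue
import threading

MAX_THREADS = 4

def worker(q):
    # daemon threads loop forever; they die when the main thread exits
    while True:
        item = q.get()
        print('processing %s' % item)
        q.task_done()  # one task_done() per get(), or q.join() never returns

q = queue.Queue()
for _ in range(MAX_THREADS):
    t = threading.Thread(target=worker, args=(q,))
    t.daemon = True
    t.start()

for item in ('a', 'b', 'c'):
    q.put(item)
q.join()  # blocks until all queued items have been processed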
Example #2
 def test_get_json(self):
     "Test get_json function"
     json_data = {'foo': [{'bar': 1}, {'bar': 2}]}
     field = 'foobar'
     expected = list()
     result = get_json(json_data, field)
     self.assertEqual(result, expected)
     field = 'foo'
     expected = [{'bar': 1}, {'bar': 2}]
     result = get_json(json_data, field)
     self.assertEqual(result, expected)
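The two assertions pin down get_json's contract: a missing field yields an empty list, a present field yields the stored value. A minimal sketch of an implementation satisfying this test (the project's actual helper may handle more cases):

def get_json(json_data, field):
    """Return json_data[field], or an empty list if the field is missing."""
    try:
        return json_data[field]
    except (KeyError, TypeError):
        return list()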
Example #3
 def get_replicas(self, dataset_data):
     """
     Get all replicas (sites) of a dataset
     """
     replicas = list()
     for block_data in get_json(dataset_data, 'block'):
         for replica_data in get_json(block_data, 'replica'):
             if get_json(replica_data, 'files') > 0:
                 replicas.append(get_json(replica_data, 'node'))
     return replicas
Example #4
 def update_db(self):
     """
     Get datasets currently in AnalysisOps and compare to database
     Deactivate removed datasets and insert new ones
     Update replicas
     """
     # get all datasets in database
     dataset_names = self.get_db_datasets()
     dataset_names = set(dataset_names)
     # get all active sites, only fetch replicas from these
     active_sites = self.sites.get_active_sites()
     api = 'blockreplicas'
     params = [('node', active_sites), ('create_since', 0.0), ('complete', 'y'), ('group', 'AnalysisOps'), ('show_dataset', 'y')]
     t1 = datetime.datetime.utcnow()
     phedex_data = self.phedex.fetch(api=api, params=params)
     t2 = datetime.datetime.utcnow()
     td = t2 - t1
     self.logger.info('Call to PhEDEx took %s', str(td))
     current_datasets = set()
     q = Queue.Queue()
     for i in range(self.MAX_THREADS):
         worker = threading.Thread(target=self.insert_dataset_data, args=(i, q))
         worker.daemon = True
         worker.start()
     count = 1
     t1 = datetime.datetime.utcnow()
     for dataset_data in get_json(get_json(phedex_data, 'phedex'), 'dataset'):
         dataset_name = get_json(dataset_data, 'name')
         current_datasets.add(dataset_name)
         if dataset_name not in dataset_names:
             # this is a new dataset which needs to be inserted into the database
             q.put((dataset_data, count))
             count += 1
         else:
             # update replicas
             replicas = self.get_replicas(dataset_data)
             coll = 'dataset_data'
             query = {'name':dataset_name}
             data = {'$set':{'replicas':replicas}}
             data = self.storage.update_data(coll=coll, query=query, data=data)
     q.join()
     deprecated_datasets = dataset_names - current_datasets
     for dataset_name in deprecated_datasets:
         self.remove_dataset(dataset_name)
     t2 = datetime.datetime.utcnow()
     td = t2 - t1
     self.logger.info('Updating dataset data took %s', str(td))
     self.logger.info('Done updating datasets in DB')
Example #5
 def insert_dataset_data(self, i, q):
     """
     Insert a new dataset into the database and initiate all data
     """
     while True:
         data = q.get()
         dataset_data = data[0]
         count = data[1]
         self.logger.debug('Inserting dataset number %d', count)
         dataset_name = get_json(dataset_data, 'name')
         coll = 'dataset_data'
         query = {'name':dataset_name}
         data = {'$set':{'name':dataset_name}}
         data = self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
         try:
             self.insert_phedex_data(dataset_name)
             self.insert_dbs_data(dataset_name)
             replicas = self.get_replicas(dataset_data)
             query = {'name':dataset_name}
             data = {'$set':{'name':dataset_name, 'replicas':replicas}}
             data = self.storage.update_data(coll=coll, query=query, data=data)
         except Exception:
             # clean up the partially inserted dataset on failure
             coll = 'dataset_data'
             query = {'name':dataset_name}
             self.storage.delete_data(coll=coll, query=query)
         q.task_done()
Example #6
 def update_db(self):
     """
     Update site data in database
     Get general data about all sites
     """
     api = 'Detox'
     file_ = 'SitesInfo.txt'
     intelroccs_data = self.intelroccs.fetch(api=api, params=file_, secure=False)
     for site_data in get_json(intelroccs_data, 'data'):
         self.insert_site_data(site_data)
Example #7
 def insert_dataset(self, dataset_name):
     """
     Fetch all popularity data for dataset
     """
     api = 'getSingleDSstat'
     sitename = 'summary'
     name = dataset_name
     aggr = 'day'
     orderbys = ['totcpu', 'naccess']
     coll = 'dataset_popularity'
     for orderby in orderbys:
         params = {'sitename':sitename, 'name':name, 'aggr':aggr, 'orderby':orderby}
         json_data = self.pop_db.fetch(api=api, params=params)
         data = get_json(json_data, 'data')
         for pop_data in get_json(data, 'data'):
             date = pop_db_timestamp_to_datetime(pop_data[0])
             query = {'name':dataset_name, 'date':date}
             popularity_data = {'name':dataset_name, 'date':date}
             popularity_data[orderby] = pop_data[1]
             data = {'$set':popularity_data}
             self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
Example #8
 def get_replicas(self, dataset_data):
     """
     Get all complete replicas of a dataset
     """
     replicas_check = dict()
     dataset_name = get_json(dataset_data, 'name')
     for block_data in get_json(dataset_data, 'block'):
         for replica_data in get_json(block_data, 'replica'):
             try:
                 replicas_check[get_json(replica_data, 'node')] += get_json(replica_data, 'files')
             except KeyError:
                 replicas_check[get_json(replica_data, 'node')] = get_json(replica_data, 'files')
     replicas = list()
     n_files = self.get_n_files(dataset_name)
     for site, site_files in replicas_check.items():
         if site_files == n_files:
             replicas.append(site)
     return replicas
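The per-site tally is what makes this version stricter than Example #3: a site only counts as a replica when its file counts summed over all blocks equal the dataset's total, so partial block replicas are excluded. The same bookkeeping with a defaultdict, run against a made-up PhEDEx-style payload (field names follow the examples above; the data itself is hypothetical):

from collections import defaultdict

# hypothetical payload: site T2_B is missing one file of the dataset
dataset_data = {
    'name': '/Prim/Proc/TIER',
    'block': [
        {'replica': [{'node': 'T2_A', 'files': 2}, {'node': 'T2_B', 'files': 2}]},
        {'replica': [{'node': 'T2_A', 'files': 3}, {'node': 'T2_B', 'files': 2}]},
    ],
}

counts = defaultdict(int)
for block_data in dataset_data['block']:
    for replica_data in block_data['replica']:
        counts[replica_data['node']] += replica_data['files']

n_files = 5  # what get_n_files() would report for this dataset
replicas = [site for site, files in counts.items() if files == n_files]
print(replicas)  # ['T2_A']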
Example #9
 def insert_dbs_data(self, dataset_name):
     """
     Fetch dbs data about dataset and insert into database
     """
     api = 'datasets'
     params = {'dataset':dataset_name, 'detail':True, 'dataset_access_type':'*'}
     dbs_data = self.dbs.fetch(api=api, params=params)
     dataset_data = get_json(dbs_data, 'data')[0]
     ds_name = get_json(dataset_data, 'primary_ds_name')
     physics_group = get_json(dataset_data, 'physics_group_name')
     data_tier = get_json(dataset_data, 'data_tier_name')
     creation_date = datetime_day(timestamp_to_datetime(get_json(dataset_data, 'creation_date')))
     ds_type = get_json(dataset_data, 'primary_ds_type')
     coll = 'dataset_data'
     query = {'name':dataset_name}
     data = {'$set':{'ds_name':ds_name, 'physics_group':physics_group, 'data_tier':data_tier, 'creation_date':creation_date, 'ds_type':ds_type}}
     self.storage.update_data(coll=coll, query=query, data=data, upsert=False)
Example #10
 def insert_phedex_data(self, dataset_name):
     """
     Fetch phedex data about dataset and insert into database
     """
     api = 'data'
     params = {'dataset':dataset_name, 'level':'block', 'create_since':0.0}
     phedex_data = self.phedex.fetch(api=api, params=params)
     size_bytes = 0
     n_files = 0
     dataset_data = get_json(get_json(get_json(phedex_data, 'phedex'), 'dbs')[0], 'dataset')[0]
     for block_data in get_json(dataset_data, 'block'):
         size_bytes += get_json(block_data, 'bytes')
         n_files += get_json(block_data, 'files')
     coll = 'dataset_data'
     query = {'name':dataset_name}
     data = {'$set':{'size_bytes':size_bytes, 'n_files':n_files}}
     self.storage.update_data(coll=coll, query=query, data=data, upsert=False)
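All of these examples funnel their writes through self.storage.update_data(coll=..., query=..., data=..., upsert=...), passing a MongoDB-style filter plus a $set document. The storage class itself is not shown; assuming it wraps pymongo (an assumption, as are the class and database names below), the call would reduce to something like:

from pymongo import MongoClient

class Storage(object):
    """Hypothetical stand-in for the storage wrapper used in the examples."""
    def __init__(self, db_name='cuadrnt'):
        self.db = MongoClient()[db_name]

    def update_data(self, coll, query, data, upsert=False):
        # $set updates only the named fields; upsert=True inserts on no match
        return self.db[coll].update_one(query, data, upsert=upsert)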