Example 1
 def __init__(self, config=None):
     # Avoid a shared mutable default argument; fall back to an empty dict
     self.logger = logging.getLogger(__name__)
     self.config = config if config is not None else {}
     self.pop_db = PopDBService(self.config)
     self.sites = SiteManager(self.config)
     self.datasets = DatasetManager(self.config)
     self.storage = StorageManager(self.config)
     self.MAX_THREADS = int(self.config['threading']['max_threads'])
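This is the PopularityManager constructor, shown in full as Example 4 below. It only reads threading.max_threads directly; everything else is handed through to the service objects. A minimal sketch of a configuration that would satisfy it, with placeholder values (the sections consumed by PopDBService, SiteManager, DatasetManager, and StorageManager are project-specific and not documented in these examples):

# Hypothetical minimal config; only the 'threading' section is read by
# __init__ itself, the rest is passed through to the service objects.
config = {
    'threading': {'max_threads': 4},
    # ...plus whatever sections PopDBService, StorageManager, etc. expect
}
manager = PopularityManager(config)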
Example 2
 def test_pop_db(self):
     """Test pop_db functions"""
     print('')
     pop_db = PopDBService(config=self.config)
     api = 'DSStatInTimeWindow/'
     params = {
         'tstart': '2015-04-18',
         'tstop': '2015-04-18',
         'sitename': 'T2_US_Nebraska'
     }
     expected = 'T2_US_Nebraska'
     json_data = pop_db.fetch(api=api, params=params, cache=False)
     result = json_data['SITENAME']
     self.assertEqual(result, expected)
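This test reads SITENAME from the response, and insert_popularity_data in Example 4 iterates over a DATA list of per-dataset records. The shape sketched below is reconstructed from those accesses alone, not from Pop DB documentation, so treat it as an assumption:

# Assumed (unverified) shape of a DSStatInTimeWindow/ response,
# reconstructed from the fields the surrounding code reads.
json_data = {
    'SITENAME': 'T2_US_Nebraska',
    'DATA': [
        {'COLLNAME': '/Some/Dataset/NAME', 'NACC': 12,
         'TOTCPU': 3400, 'NUSERS': 2},
    ],
}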
Example 3
 def test_pop_db_memory(self):
     """Test pop_db data memory usage"""
     print('')
     pop_db = PopDBService(self.config)
     api = 'getDSdata'
     sitename = 'summary'
     aggr = 'day'
     n = 200000
     orderby = 'totcpu'
     tstart = datetime_to_string(
         datetime_day(datetime.datetime.utcnow() - datetime.timedelta(days=10)))
     tstop = datetime_to_string(datetime_day(datetime.datetime.utcnow()))
     params = {
         'sitename': sitename,
         'tstart': tstart,
         'tstop': tstop,
         'aggr': aggr,
         'n': n,
         'orderby': orderby
     }
     pop_db_data = pop_db.fetch(api=api, params=params, cache=False)
     total_size = total_size_of(pop_db_data)
     logger.info('Total size of Pop DB data in memory is %d bytes (%d MB)',
                 total_size, total_size // 10**6)
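total_size_of is a project helper that is not shown in these examples. A minimal sketch of what such a helper might look like, assuming it follows the standard recursive sys.getsizeof recipe that walks containers and counts each object at most once:

import sys

def total_size_of(obj, seen=None):
    """Hypothetical stand-in for the project's total_size_of helper:
    recursively sum sys.getsizeof over containers, visiting each
    object at most once."""
    seen = seen if seen is not None else set()
    if id(obj) in seen:
        return 0
    seen.add(id(obj))
    size = sys.getsizeof(obj)
    if isinstance(obj, dict):
        size += sum(total_size_of(k, seen) + total_size_of(v, seen)
                    for k, v in obj.items())
    elif isinstance(obj, (list, tuple, set, frozenset)):
        size += sum(total_size_of(item, seen) for item in obj)
    return size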
Example 4
import datetime
import logging
import queue
import threading
from math import log

import numpy as np

# PopDBService, SiteManager, DatasetManager, StorageManager and the
# datetime_day, datetime_to_string and daterange helpers are assumed to
# come from the surrounding project package.

class PopularityManager:
    """
    Generate popularity metrics for datasets and sites
    """
    def __init__(self, config=None):
        # Avoid a shared mutable default argument; fall back to an empty dict
        self.logger = logging.getLogger(__name__)
        self.config = config if config is not None else {}
        self.pop_db = PopDBService(self.config)
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.storage = StorageManager(self.config)
        self.MAX_THREADS = int(self.config['threading']['max_threads'])

    def initiate_db(self):
        """
        Collect popularity data
        """
        q = queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_popularity_data, args=(i, q))
            worker.daemon = True
            worker.start()
        start_date = datetime_day(datetime.datetime.utcnow() - datetime.timedelta(days=90))
        end_date = datetime_day(datetime.datetime.utcnow())
        # fetch popularity data
        t1 = datetime.datetime.utcnow()
        for date in daterange(start_date, end_date):
            q.put(date)
        q.join()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Inserting Pop DB data took %s', str(td))

    def update_db(self):
        """
        Fetch latest popularity data not in database
        """
        # get the date of the most recent popularity record
        coll = 'dataset_popularity'
        pipeline = [
            {'$sort': {'date': -1}},
            {'$limit': 1},
            {'$project': {'date': 1, '_id': 0}},
        ]
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            start_date = data[0]['date']
        except (IndexError, KeyError):
            self.logger.warning('Popularity needs to be initiated')
            self.initiate_db()
            return
        q = queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_popularity_data, args=(i, q))
            worker.daemon = True
            worker.start()
        start_date = start_date + datetime.timedelta(days=1)
        end_date = datetime_day(datetime.datetime.utcnow())
        # fetch popularity data
        t1 = datetime.datetime.utcnow()
        for date in daterange(start_date, end_date):
            q.put(date)
        q.join()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Updating Pop DB data took %s', str(td))

    def insert_popularity_data(self, i, q):
        """
        Worker thread: insert popularity data for one day into the db
        """
        coll = 'dataset_popularity'
        while True:
            date = q.get()
            self.logger.info('Inserting date %s', datetime_to_string(date))
            api = 'DSStatInTimeWindow/'
            tstart = datetime_to_string(date)
            tstop = tstart
            params = {'sitename': 'summary', 'tstart': tstart, 'tstop': tstop}
            json_data = self.pop_db.fetch(api=api, params=params)
            # reshape each record into a document and upsert it by (name, date)
            for dataset in json_data['DATA']:
                dataset_name = dataset['COLLNAME']
                popularity_data = {
                    'name': dataset_name,
                    'date': date,
                    'n_accesses': dataset['NACC'],
                    'n_cpus': dataset['TOTCPU'],
                    'n_users': dataset['NUSERS'],
                }
                query = {'name': dataset_name, 'date': date}
                data = {'$set': popularity_data}
                self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
            q.task_done()

    def get_average_popularity(self, dataset_name, date):
        """
        Average log popularity of a dataset over the 7 days before date
        """
        start_date = date - datetime.timedelta(days=7)
        end_date = date - datetime.timedelta(days=1)
        coll = 'dataset_popularity'
        pipeline = [
            {'$match': {'name': dataset_name}},
            {'$match': {'date': {'$gte': start_date, '$lte': end_date}}},
        ]
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        pops = list()
        for i in range(7):
            try:
                pops.append(log(float(data[i]['n_accesses'] * data[i]['n_cpus'])))
            except (IndexError, KeyError, ValueError):
                # a missing day or zero accesses/CPU time counts as no popularity
                pops.append(0.0)
        avg = np.mean(pops)
        return avg
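A hedged sketch of how the class might be driven end to end. The config layout follows the hypothetical one in Example 1 and the call sequence follows the methods above, but the script itself is illustrative: update_db resumes from the most recent stored date (falling back to initiate_db on an empty collection), and get_average_popularity returns the mean of log(n_accesses * n_cpus) over the 7 days before the given date, counting missing days as 0.0.

# Illustrative driver, assuming the hypothetical config sketched in Example 1.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    config = {'threading': {'max_threads': 4}}  # plus service-specific sections
    manager = PopularityManager(config)
    manager.update_db()
    today = datetime_day(datetime.datetime.utcnow())
    score = manager.get_average_popularity('/Some/Dataset/NAME', today)
    manager.logger.info('Average weekly popularity: %f', score)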