Example #1
def __init__(self, configfile, verbose=0):
    "Main module"
    # assumes: import multiprocessing as mp, plus parse_config,
    # StorageManager and the *Service classes from the DCAF package
    self.config = parse_config(configfile)
    self.storage = StorageManager(self.config)
    self.sitedb = SiteDBService(self.config, verbose)
    self.dbs = DBSService(self.config, verbose)
    self.popdb = PopDBService(self.config, verbose)
    self.phedex = PhedexService(self.config, verbose)
    self.dashboard = DashboardService(self.config, verbose)
    self.salt = self.config.get('core', {}).get('salt', 'secret sauce')
    self.verbose = verbose
    self.multitask = self.config.get('core', {}).get('multitask', False)
    self.queue = mp.Queue()
    if verbose:
        print("DCAF multitask", self.multitask)
Example #2
class GenericService(object):
    "Generic DCAF service class"
    # assumes the genkey and getdata helpers plus StorageManager from the
    # DCAF package, and the standard time module for the retry loop

    def __init__(self, config=None, verbose=0):
        if not config:
            config = {}
        self.name = 'generic'
        self.verbose = verbose
        self.storage = StorageManager(config)

    def fetch(self, url, params, cache=True):
        "Fetch data for given api"
        debug = 0
        data = "[]"
        if cache:
            docid = genkey("url=%s params=%s" % (url, params))
            res = self.storage.fetch_one('cache', {'_id': docid})
            if res and 'data' in res:
                if self.verbose:
                    print("%s::fetch url=%s, params=%s, docid=%s" \
                            % (self.name, url, params, docid))
                return res['data']
        if self.verbose:
            print("%s::fetch url=%s, params=%s" % (self.name, url, params))
            debug = self.verbose - 1
        try:
            data = getdata(url, params, debug=debug)
        except Exception as exc:
            print(str(exc))
            for attempt in range(3):
                time.sleep(0.1)
                print("Attempt %s" % attempt)
                try:
                    data = getdata(url, params, debug=debug)
                    break
                except Exception as err:
                    print(str(err))
        if cache:
            self.storage.insert('cache', {
                '_id': docid,
                'data': data,
                'url': url,
                'params': params
            })
        return data
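The method above combines a deterministic cache key (genkey over url+params), a cache lookup, and a retry loop. A self-contained sketch of the same pattern, with hashlib standing in for genkey and a hypothetical fetch_remote standing in for getdata:

import hashlib
import time

CACHE = {}  # in-memory stand-in for StorageManager's 'cache' collection

def fetch_remote(url, params):
    "Hypothetical stand-in for DCAF's getdata helper"
    raise IOError("transient failure")

def cached_fetch(url, params, retries=3):
    "Cache-or-fetch with retry, mirroring GenericService.fetch above"
    docid = hashlib.md5(("url=%s params=%s" % (url, params)).encode()).hexdigest()
    if docid in CACHE:
        return CACHE[docid]
    data = "[]"
    for attempt in range(retries):
        try:
            data = fetch_remote(url, params)
            break
        except Exception as exc:
            print("Attempt %s failed: %s" % (attempt, exc))
            time.sleep(0.1)
    CACHE[docid] = data
    return data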
Example #3
class DCAF(object):
    def __init__(self, configfile, verbose=0):
        "Main module"
        self.config = parse_config(configfile)
        self.storage = StorageManager(self.config)
        self.sitedb = SiteDBService(self.config, verbose)
        self.dbs = DBSService(self.config, verbose)
        self.popdb = PopDBService(self.config, verbose)
        self.phedex = PhedexService(self.config, verbose)
        self.dashboard = DashboardService(self.config, verbose)
        self.salt = self.config.get('core', {}).get('salt', 'secret sauce')
        self.verbose = verbose
        self.multitask = self.config.get('core', {}).get('multitask', False)
        self.queue = mp.Queue()
        if  verbose:
            print("DCAF multitask", self.multitask)

    def fetch(self, doc):
        """
        Fetch method retrieves data from a data provider and stores it in the
        internal data storage.
        """
        source = doc.get('source', None)
        params = doc.get('params', {})
        if  self.verbose:
            print("Fetch data from %s(%s)" % (source, params))
        if  source == 'sitedb':
            for item in self.sitedb.fetch('people', {}):
                print(item)
                break

    def data_types(self):
        """Return list of data types dicts:

            - dtypes is data types, e.g. mc/data
            - stypes is site types, dict of Tier sites
            - rtypes is release types
              {'series':{'major':{'minor':}}
        """
        dtypes = ['mc', 'data'] # data types, should be small list
        tiers = self.dbs.data_tiers()
        stypes = {'s_%s'%TIER0:0, 's_%s'%TIER1:0, 's_%s'%TIER2:0,
                  's_%s'%TIER3:0, 's_%s'%TIER_NA:0} # site types
        rtypes = {} # release types
        releases = self.dbs.releases()
        series = set()
        majors = set()
        minors = set()
        for row in releases:
            rel = row['release']
            sval, major, minor = rel_ver(rel)
            if  not cmssw_test(sval, major, minor):
                continue
            series.add(sval)
            majors.add(major)
            minors.add(minor)
        serdict = {}
        for val in series:
            serdict['rel1_%s'%val] = 0
        majdict = {}
        for val in majors:
            majdict['rel2_%s'%val] = 0
        mindict = {}
        for val in minors:
            mindict['rel3_%s'%val] = 0
        # release types as defined in rel_type function
        typdict = {'relt_%s'%RFULL:0, 'relt_%s'%RPRE:0, 'relt_%s'%RPATCH:0}
        rtypes = {'series': serdict, 'majors': majdict, 'minors': mindict, 'rtypes': typdict}
        return dtypes, stypes, rtypes, tiers
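        # Illustrative shapes of the returned structures (actual keys depend
        # on the TIER*/RFULL/RPRE/RPATCH constants and on releases in DBS):
        #   dtypes = ['mc', 'data']
        #   stypes = {'s_0': 0, 's_1': 0, ...}
        #   rtypes = {'series': {'rel1_7': 0, ...}, 'majors': {'rel2_0': 0, ...},
        #             'minors': {'rel3_1': 0, ...}, 'rtypes': {'relt_full': 0, ...}}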

    def dataset_info_all(self, dataset, dbsinst, timeframe):
        "Concurrently obtain information about dataset."
        # NOTE: this function may need expansion if we'll need to obtain more information
        #       about given dataset/timeframe. To extend, please add new local
        #       function similar to procN
        # Each procN function should have (pos, out, args) input arguments
        # the pos is a position in queue, out is an output queue and args
        # are arguments required to pass to internal function

        # local functions to fetch info about dataset from different subsystems
        def proc1(pos, out, dataset, inst):
            try:
                res = [rname for rname in \
                        self.dbs.dataset_release_versions(dataset, inst)]
            except Exception:
                res = []
            out.put((pos, res))
        def proc2(pos, out, dataset, _inst):
            try:
                res = [sname for sname in self.phedex.sites(dataset)]
            except Exception:
                res = []
            out.put((pos, res))
        def proc3(pos, out, dataset, inst):
            try:
                res = [r for r in self.dbs.dataset_parents(dataset, inst)]
            except Exception:
                res = []
            out.put((pos, res))
        def proc4(pos, out, dataset, inst):
            try:
                res = self.dbs.dataset_summary(dataset, inst)
            except Exception:
                res = dict()
            out.put((pos, res))
        def proc5(pos, out, dataset, tframe):
            try:
                res = self.dashboard.dataset_info(dataset, tframe[0], tframe[1])
            except Exception:
                res = dict()
            out.put((pos, res))
        # concurrent processes to run, each args contains
        # a position value, output queue and args for internal function call
        processes = [
            mp.Process(target=proc1, args=(1, self.queue, dataset, dbsinst)),
            mp.Process(target=proc2, args=(2, self.queue, dataset, dbsinst)),
            mp.Process(target=proc3, args=(3, self.queue, dataset, dbsinst)),
            mp.Process(target=proc4, args=(4, self.queue, dataset, dbsinst)),
            mp.Process(target=proc5, args=(5, self.queue, dataset, timeframe))
        ]
        # Run processes
        for proc in processes:
            proc.start()

        # Exit the completed processes
        for proc in processes:
            proc.join()

        # Get process results from the self.queue queue
        results = [self.queue.get() for _ in processes]
        results.sort() # sort by position
        results = [v for _, v in results] # extract values
        return results
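        # Note on the pattern above: each worker tags its result with a queue
        # position, so sorting the (pos, result) tuples after the join restores
        # submission order. Per the multiprocessing docs, joining a process
        # before draining a queue it has fed can deadlock for large payloads;
        # the per-dataset results here are small.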

    def dataset_info(self, timeframe, dataset, dtypes, stypes, rtypes, tiers, dformat, target=0):
        "Return common dataset info in specified data format"
        dbsinst = self.dbs.dataset_dbsinst(dataset)
        if  not dbsinst:
            return
        row = self.dbs.dataset_info(dataset, dbsinst)
        if  row:
            if  self.multitask:
                releases, sites, parents, summary, dashboard = \
                        self.dataset_info_all(dataset, dbsinst, timeframe)
            else:
                releases = [rname for rname in self.dbs.dataset_release_versions(dataset, dbsinst)]
                sites = [sname for sname in self.phedex.sites(dataset)]
                parents = [r for r in self.dbs.dataset_parents(dataset, dbsinst)]
                summary = self.dbs.dataset_summary(dataset, dbsinst)
                dashboard = self.dashboard.dataset_info(dataset, timeframe[0], timeframe[1])
            nrels = len(releases)
            series = rtypes['series']
            majors = rtypes['majors']
            minors = rtypes['minors']
            relclf = rtypes['rtypes']
            for rel in releases:
                rserie, rmajor, rminor = rel_ver(rel)
                if  not cmssw_test(rserie, rmajor, rminor):
                    continue
                rtype = rel_type(rel)
                for dct, key in ((series, 'rel1_%s'%rserie),
                                 (majors, 'rel2_%s'%rmajor),
                                 (minors, 'rel3_%s'%rminor),
                                 (relclf, 'relt_%s'%rtype)):
                    if key in dct: # skip releases outside the known key sets
                        dct[key] += 1
            nsites = len(sites)
            for site in sites:
                stier = site_tier(site)
                stypes['s_%s'%stier] += 1
            dataset_id = row['rid']
            era = genkey(row['acquisition_era_name'], self.salt, 5)
            create_dn = self.sitedb.dnid(row['create_by'])
            dbsinstid = row['dbs_instance']
            dtype = row['primary_ds_type']
            # number of data types should be small and simple
            # list look-up shouldn't be a problem
            if  dtype not in dtypes:
                dtypes.append(dtype)
            dtype = dtypes.index(dtype)
            _, prim, proc, tier = dataset.split('/')
            prim = genkey(prim, self.salt, 5)
            proc = genkey(proc, self.salt, 5)
            if  tier not in tiers:
                tiers.append(tier)
            tier = genkey(tier, self.salt, 5)
            parent = parents[0] if len(parents) else 0
            uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id)
            size_norm = 2**30 # normalization factor for file size
            rec = dict(id=uid, dataset=dataset_id, primds=prim, procds=proc, tier=tier,
                    dtype=dtype, creator=create_dn, nrel=nrels, nsites=nsites,
                    parent=parent, era=era, dbs=dbsinstid,
                    nfiles=summary.get('num_file', 0),
                    nlumis=summary.get('num_lumi', 0),
                    nblk=summary.get('num_block', 0),
                    nevt=summary.get('num_event', 0),
                    size=summary.get('file_size', 0)/size_norm,
                    cpu=dashboard.get('cpu', 0),
                    wct=dashboard.get('wct', 0),
                    proc_evts=dashboard.get('nevt', 0))
            if  isinstance(target, dict):
                rec.update(target)
            for dct in (series, majors, minors, relclf, stypes):
                rec.update(dct)
            headers = sorted(rec.keys())
            headers.remove('id')
            headers = ['id'] + headers # let dataset id be the first column
            if  dformat == 'headers':
                yield headers
            elif  dformat == 'csv':
                res = [str(rec[h]) for h in headers]
                yield ','.join(res)
            elif dformat == 'vw':
                target_str = target.get('rnaccess')
                vals = ' '.join([str(rec[h]) for h in headers])
                uid = genkey(vals, self.salt, 5) # unique row identifier
                vwrow = "%s '%s |f %s" % (target_str, uid, vals)
                yield vwrow
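            # A 'vw' row follows Vowpal Wabbit's plain-text input format,
            #     <label> '<tag> |f <feature values>
            # with rnaccess as the label; e.g. (illustrative values only):
            #     0.35 'a1b2c |f 123 0 7 ...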

    def update(self):
        "Get fresh copies of the hashed DBS/SiteDB data"
        self.dbs.update('datasets')
        self.dbs.update('releases')
        self.sitedb.update('people')
        self.sitedb.update('sites')

    def cleanup(self, cname):
        "Clean-up given collection"
        self.storage.cleanup(cname)

    def remove(self, cname, docid):
        "Remove given docid from given collection"
        spec = {"_id": docid}
        self.storage.cleanup(cname, spec)

    def dataframe(self, timeframe, seed, dformat, dbs_extra, newdata=None):
        """Form a dataframe from various CMS data-providers"""
        dtypes, stypes, rtypes, tiers = self.data_types()
        pop_datasets = 0
        dbs_datasets = 0
        popdb_results = [r for r in self.popdb.dataset_stat(timeframe[0], timeframe[1])]
        if  dformat == 'csv':
            row = popdb_results[0]
            dataset = row['dataset']
            target = dict(naccess=row['naccess'],nusers=row['nusers'],totcpu=row['totcpu'],
                    rnaccess=row['rnaccess'],rnusers=row['rnusers'],rtotcpu=row['rtotcpu'])
            # seed dataset to determine headers of the dataframe
            rows = self.dataset_info(timeframe, seed, dtypes, stypes, rtypes,
                    tiers, 'headers', target)
            headers = [r for r in rows][0]
            yield ','.join(headers)
        tstamp = time.strftime("%Y-%m-%d %H:%M:%S GMT", time.gmtime())
        if  newdata: # request new dataset
            if  self.verbose:
                print("Generate dataframe for new datasets", tstamp)
            n_days = 7
            if  timeframe:
                n_days = ndays(yyyymmdd(timeframe[0]), yyyymmdd(timeframe[1]))
            new_datasets = self.dbs.new_datasets(n_days)
            target = dict(naccess=0,nusers=0,totcpu=0,
                    rnaccess=0,rnusers=0,rtotcpu=0)
            for row in new_datasets:
                dataset = row['dataset']
                rows = self.dataset_info(timeframe, dataset, dtypes, stypes, \
                        rtypes, tiers, dformat, target)
                for row in rows:
                    yield row
            return
        # get list of popular datasets in certain time frame
        popdb_datasets = {}
        for row in popdb_results:
            dataset = row['dataset']
            if  not DATASET_PAT.match(dataset):
                continue
            if  self.verbose:
                print("Generate dataframe for %s, timeframe: %s, %s" \
                        % (dataset, timeframe, tstamp))
            target = dict(naccess=row['naccess'],nusers=row['nusers'],totcpu=row['totcpu'],
                    rnaccess=row['rnaccess'],rnusers=row['rnusers'],rtotcpu=row['rtotcpu'])
            rows = self.dataset_info(timeframe, dataset, dtypes, stypes, \
                    rtypes, tiers, dformat, target)
            popdb_datasets[dataset] = row
            for row in rows:
                yield row
                pop_datasets += 1

        # get list of datasets from DBS and discard from this list
        # those who were presented in popdb
        all_dbs_datasets = self.dbs.datasets()
        dbsdatasets = [d for d in all_dbs_datasets if d not in popdb_datasets]
        target = dict(naccess=0,nusers=0,totcpu=0,
                rnaccess=0,rnusers=0,rtotcpu=0)
        for dataset in random.sample(dbsdatasets, dbs_extra):
            rows = self.dataset_info(timeframe, dataset, dtypes, stypes, \
                    rtypes, tiers, dformat, target)
            for row in rows:
                yield row
                dbs_datasets += 1
        if  self.verbose:
            print("DBS datasets  : %s" % dbs_datasets)
            print("PopDB datasets: %s out of %s" % (pop_datasets, len(popdb_results)))

    def export(self, dformat):
        "Export analytics dataframe into provided data format"
        print("Export dataframe into %s format" % dformat.lower())
        if  dformat.lower() == 'csv':
            print("Do CSV export")
            headers = []
            # NOTE: dataframe() requires timeframe/seed/dformat/dbs_extra
            # arguments; calling it bare as below raises TypeError
            for row in self.dataframe():
                if  not headers:
                    headers = row
                    yield ','.join(headers)
                    continue
                yield ','.join(row)
        elif  dformat.lower() == 'vw':
            print("Do vw export")
        else:
            raise NotImplementedError
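A hedged driver sketch for the class above; the config file name, timeframe values, and seed dataset are assumptions (the real project wires this up through its own CLI):

# Hypothetical driver -- argument values and timeframe format are assumptions.
dcaf = DCAF('dcaf.cfg', verbose=1)
timeframe = [20160101, 20160107]  # passed through yyyymmdd()/ndays() helpers
for line in dcaf.dataframe(timeframe, seed='/Prim/Proc/TIER',
                           dformat='csv', dbs_extra=10):
    print(line)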
Example #4
def __init__(self, config=None, verbose=0):
    if not config:
        config = {}
    self.name = 'generic'
    self.verbose = verbose
    self.storage = StorageManager(config)
Example #5
class DCAF(object):
    def __init__(self, configfile, verbose=0):
        "Main module"
        self.config = parse_config(configfile)
        self.storage = StorageManager(self.config)
        self.sitedb = SiteDBService(self.config, verbose)
        self.dbs = DBSService(self.config, verbose)
        self.popdb = PopDBService(self.config, verbose)
        self.phedex = PhedexService(self.config, verbose)
        self.dashboard = DashboardService(self.config, verbose)
        self.salt = self.config.get('core', {}).get('salt', 'secret sauce')
        self.verbose = verbose
        self.multitask = self.config.get('core', {}).get('multitask', False)
        self.queue = mp.Queue()
        if verbose:
            print("DCAF multitask", self.multitask)

    def fetch(self, doc):
        """
        Fetch method retrieves data from a data provider and stores it in the
        internal data storage.
        """
        source = doc.get('source', None)
        params = doc.get('params', {})
        if self.verbose:
            print("Fetch data from %s(%s)" % (source, params))
        if source == 'sitedb':
            for item in self.sitedb.fetch('people', {}):
                print(item)
                break

    def data_types(self):
        """Return list of data types dicts:

            - dtypes is data types, e.g. mc/data
            - stypes is site types, dict of Tier sites
            - rtypes is release types
              {'series':{'major':{'minor':}}
        """
        dtypes = ['mc', 'data']  # data types, should be small list
        tiers = self.dbs.data_tiers()
        stypes = {
            's_%s' % TIER0: 0,
            's_%s' % TIER1: 0,
            's_%s' % TIER2: 0,
            's_%s' % TIER3: 0,
            's_%s' % TIER_NA: 0
        }  # site types
        rtypes = {}  # release types
        releases = self.dbs.releases()
        series = set()
        majors = set()
        minors = set()
        for row in releases:
            rel = row['release']
            sval, major, minor = rel_ver(rel)
            if not cmssw_test(sval, major, minor):
                continue
            series.add(sval)
            majors.add(major)
            minors.add(minor)
        serdict = {}
        for val in series:
            serdict['rel1_%s' % val] = 0
        majdict = {}
        for val in majors:
            majdict['rel2_%s' % val] = 0
        mindict = {}
        for val in minors:
            mindict['rel3_%s' % val] = 0
        # release types as defined in rel_type function
        typdict = {
            'relt_%s' % RFULL: 0,
            'relt_%s' % RPRE: 0,
            'relt_%s' % RPATCH: 0
        }
        rtypes = {
            'series': serdict,
            'majors': majdict,
            'minors': mindict,
            'rtypes': typdict
        }
        return dtypes, stypes, rtypes, tiers

    def dataset_info_all(self, dataset, dbsinst, timeframe):
        "Concurrently obtain information about dataset."

        # NOTE: this function may need expansion if we'll need to obtain more information
        #       about given dataset/timeframe. To extend, please add new local
        #       function similar to procN
        # Each procN function should have (pos, out, args) input arguments
        # the pos is a position in queue, out is an output queue and args
        # are arguments required to pass to internal function

        # local functions to fetch info about dataset from different subsystems
        def proc1(pos, out, dataset, inst):
            try:
                res = [rname for rname in \
                        self.dbs.dataset_release_versions(dataset, inst)]
            except Exception:
                res = []
            out.put((pos, res))

        def proc2(pos, out, dataset, _inst):
            try:
                res = [sname for sname in self.phedex.sites(dataset)]
            except Exception:
                res = []
            out.put((pos, res))

        def proc3(pos, out, dataset, inst):
            try:
                res = [r for r in self.dbs.dataset_parents(dataset, inst)]
            except Exception:
                res = []
            out.put((pos, res))

        def proc4(pos, out, dataset, inst):
            try:
                res = self.dbs.dataset_summary(dataset, inst)
            except Exception:
                res = dict()
            out.put((pos, res))

        def proc5(pos, out, dataset, tframe):
            try:
                res = self.dashboard.dataset_info(dataset, tframe[0],
                                                  tframe[1])
            except Exception:
                res = dict()
            out.put((pos, res))

        # concurrent processes to run, each args contains
        # a position value, output queue and args for internal function call
        processes = [
            mp.Process(target=proc1, args=(1, self.queue, dataset, dbsinst)),
            mp.Process(target=proc2, args=(2, self.queue, dataset, dbsinst)),
            mp.Process(target=proc3, args=(3, self.queue, dataset, dbsinst)),
            mp.Process(target=proc4, args=(4, self.queue, dataset, dbsinst)),
            mp.Process(target=proc5, args=(5, self.queue, dataset, timeframe))
        ]
        # Run processes
        for proc in processes:
            proc.start()

        # Exit the completed processes
        for proc in processes:
            proc.join()

        # Get process results from the self.queue queue
        results = [self.queue.get() for _ in processes]
        results.sort()  # sort by position
        results = [v for _, v in results]  # extract values
        return results

    def dataset_info(self,
                     timeframe,
                     dataset,
                     dtypes,
                     stypes,
                     rtypes,
                     tiers,
                     dformat,
                     target=0):
        "Return common dataset info in specified data format"
        dbsinst = self.dbs.dataset_dbsinst(dataset)
        if not dbsinst:
            return
        row = self.dbs.dataset_info(dataset, dbsinst)
        if row:
            if self.multitask:
                releases, sites, parents, summary, dashboard = \
                        self.dataset_info_all(dataset, dbsinst, timeframe)
            else:
                releases = [
                    rname for rname in self.dbs.dataset_release_versions(
                        dataset, dbsinst)
                ]
                sites = [sname for sname in self.phedex.sites(dataset)]
                parents = [
                    r for r in self.dbs.dataset_parents(dataset, dbsinst)
                ]
                summary = self.dbs.dataset_summary(dataset, dbsinst)
                dashboard = self.dashboard.dataset_info(
                    dataset, timeframe[0], timeframe[1])
            nrels = len(releases)
            series = {}
            for k in rtypes['series'].keys():
                series[k] = 0
            majors = {}
            for k in rtypes['majors'].keys():
                majors[k] = 0
            minors = {}
            for k in rtypes['minors'].keys():
                minors[k] = 0
            relclf = {}
            for k in rtypes['rtypes'].keys():
                relclf[k] = 0
            for rel in releases:
                rserie, rmajor, rminor = rel_ver(rel)
                if not cmssw_test(rserie, rmajor, rminor):
                    continue
                rtype = rel_type(rel)
                for dct, key in ((series, 'rel1_%s' % rserie),
                                 (majors, 'rel2_%s' % rmajor),
                                 (minors, 'rel3_%s' % rminor),
                                 (relclf, 'relt_%s' % rtype)):
                    if key in dct:  # skip releases outside the known key sets
                        dct[key] += 1
            nsites = len(sites)
            for site in sites:
                stier = site_tier(site)
                stypes['s_%s' % stier] += 1
            dataset_id = row['rid']
            era = genkey(row['acquisition_era_name'], self.salt, 5)
            create_dn = self.sitedb.dnid(row['create_by'])
            dbsinstid = row['dbs_instance']
            dtype = row['primary_ds_type']
            # number of data types should be small and simple
            # list look-up shouldn't be a problem
            if dtype not in dtypes:
                dtypes.append(dtype)
            dtype = dtypes.index(dtype)
            _, prim, proc, tier = dataset.split('/')
            prim = genkey(prim, self.salt, 5)
            proc = genkey(proc, self.salt, 5)
            if tier not in tiers:
                tiers.append(tier)
            tier = genkey(tier, self.salt, 5)
            parent = parents[0] if len(parents) else 0
            uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id)
            size_norm = 2**30  # normalization factor for file size
            if not summary:
                summary = {}  # we need a dict type
            rec = dict(id=uid,
                       dataset=dataset_id,
                       primds=prim,
                       procds=proc,
                       tier=tier,
                       dtype=dtype,
                       creator=create_dn,
                       nrel=nrels,
                       nsites=nsites,
                       parent=parent,
                       era=era,
                       dbs=dbsinstid,
                       nfiles=summary.get('num_file', 0),
                       nlumis=summary.get('num_lumi', 0),
                       nblk=summary.get('num_block', 0),
                       nevt=summary.get('num_event', 0),
                       size=summary.get('file_size', 0) / size_norm,
                       cpu=dashboard.get('cpu', 0),
                       wct=dashboard.get('wct', 0),
                       proc_evts=dashboard.get('nevt', 0))
            if isinstance(target, dict):
                rec.update(target)
            for dct in (series, majors, minors, relclf, stypes):
                rec.update(dct)
            headers = sorted(rec.keys())
            headers.remove('id')
            headers = ['id'] + headers  # let dataset id be the first column
            if dformat == 'headers':
                yield headers
            elif dformat == 'csv':
                res = [str(rec[h]) for h in headers]
                yield ','.join(res)
            elif dformat == 'vw':
                target_str = target.get('rnaccess')
                vals = ' '.join([str(rec[h]) for h in headers])
                uid = genkey(vals, self.salt, 5)  # unique row identifier
                vwrow = "%s '%s |f %s" % (target_str, uid, vals)
                yield vwrow

    def update(self):
        "Get fresh copies of the hashed DBS/SiteDB data"
        self.dbs.update('datasets')
        self.dbs.update('releases')
        self.sitedb.update('people')
        self.sitedb.update('sites')

    def cleanup(self, cname):
        "Clean-up given collection"
        self.storage.cleanup(cname)

    def remove(self, cname, docid):
        "Remove given docid from given collection"
        spec = {"_id": docid}
        self.storage.cleanup(cname, spec)

    def dataframe(self, timeframe, seed, dformat, dbs_extra, newdata=None):
        """Form a dataframe from various CMS data-providers"""
        dtypes, stypes, rtypes, tiers = self.data_types()
        pop_datasets = 0
        dbs_datasets = 0
        popdb_results = [
            r for r in self.popdb.dataset_stat(timeframe[0], timeframe[1])
        ]
        if dformat == 'csv':
            dataset = None
            for row in popdb_results:
                if len(row['dataset'].split('/')) == 4:  # dataset with 3 slashes
                    dataset = row['dataset']
                    break
            if not dataset:
                raise Exception("Unable to find valid dataset name in popdb output")
            target = dict(naccess=row['naccess'],
                          nusers=row['nusers'],
                          totcpu=row['totcpu'],
                          rnaccess=row['rnaccess'],
                          rnusers=row['rnusers'],
                          rtotcpu=row['rtotcpu'])
            # seed dataset to determine headers of the dataframe
            rows = self.dataset_info(timeframe, seed, dtypes, stypes, rtypes,
                                     tiers, 'headers', target)
            headers = [r for r in rows][0]
            yield ','.join(headers)
        tstamp = time.strftime("%Y-%m-%d %H:%M:%S GMT", time.gmtime())
        if newdata:  # request new dataset
            if self.verbose:
                print("Generate dataframe for new datasets", tstamp)
            n_days = 7
            if timeframe:
                n_days = ndays(yyyymmdd(timeframe[0]), yyyymmdd(timeframe[1]))
            new_datasets = self.dbs.new_datasets(n_days)
            target = dict(naccess=0,
                          nusers=0,
                          totcpu=0,
                          rnaccess=0,
                          rnusers=0,
                          rtotcpu=0)
            for row in new_datasets:
                dataset = row['dataset']
                rows = self.dataset_info(timeframe, dataset, dtypes, stypes, \
                        rtypes, tiers, dformat, target)
                for row in rows:
                    yield row
            return
        # get list of popular datasets in certain time frame
        popdb_datasets = {}
        for row in popdb_results:
            dataset = row['dataset']
            if not DATASET_PAT.match(dataset):
                continue
            if self.verbose:
                print("Generate dataframe for %s, timeframe: %s, %s" \
                        % (dataset, timeframe, tstamp))
            target = dict(naccess=row['naccess'],
                          nusers=row['nusers'],
                          totcpu=row['totcpu'],
                          rnaccess=row['rnaccess'],
                          rnusers=row['rnusers'],
                          rtotcpu=row['rtotcpu'])
            rows = self.dataset_info(timeframe, dataset, dtypes, stypes, \
                    rtypes, tiers, dformat, target)
            popdb_datasets[dataset] = row
            for row in rows:
                yield row
                pop_datasets += 1

        # get list of datasets from DBS and discard from this list
        # those who were presented in popdb
        all_dbs_datasets = self.dbs.datasets()
        dbsdatasets = [d for d in all_dbs_datasets if d not in popdb_datasets]
        target = dict(naccess=0,
                      nusers=0,
                      totcpu=0,
                      rnaccess=0,
                      rnusers=0,
                      rtotcpu=0)
        for dataset in random.sample(dbsdatasets, dbs_extra):
            rows = self.dataset_info(timeframe, dataset, dtypes, stypes, \
                    rtypes, tiers, dformat, target)
            for row in rows:
                yield row
                dbs_datasets += 1
        if self.verbose:
            print("DBS datasets  : %s" % dbs_datasets)
            print("PopDB datasets: %s out of %s" %
                  (pop_datasets, len(popdb_results)))

    def export(self, dformat):
        "Export analytics dataframe into provided data format"
        print("Export dataframe into %s format" % dformat.lower())
        if dformat.lower() == 'csv':
            print("Do CSV export")
            headers = []
            # NOTE: dataframe() requires timeframe/seed/dformat/dbs_extra
            # arguments; calling it bare as below raises TypeError
            for row in self.dataframe():
                if not headers:
                    headers = row
                    yield ','.join(headers)
                    continue
                yield ','.join(row)
        elif dformat.lower() == 'vw':
            print("Do vw export")
        else:
            raise NotImplementedError