Example #1
def main():
#  args=sys.argv[1:]
#  data=args[0]

  sample_group = 'signal' # signal, background, data, all
  sample_list = get_sample_list(sample_group)
  sample_list.sort()

  url="https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
  api=DbsApi(url=url)

  for samp in sample_list:
    outputDataSets = ''
    #print('Checking {0}'.format(samp.DAS))
    outputDataSets = api.listDatasets(dataset=samp.DAS, detail = True, dataset_access_type='VALID')
 
    if outputDataSets:
      for ds in outputDataSets:
       #print('{0}'.format(ds['dataset']))
       #print('{0}'.format(ds['primary_ds_name']))
       #print('{0}'.format(ds['xtcrosssection']))
       nevents = api.listBlockSummaries(dataset=ds['dataset'])
       #print(nevents[0]['num_event'])    
       # this to create a table for the paper with dataset name and number of events 
       print('verb@ {0} @ & {1:.2e} & XX \\\\ '.format(ds['primary_ds_name'],nevents[0]['num_event'])) 
  sys.exit(0)
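The snippets on this page are excerpts and omit their imports. Below is a minimal, self-contained sketch of the setup they assume; the import path is the standard DBS3 client, a valid grid proxy is required, and dbs3_url mirrors the module-level constant that many of the later examples reference.

import sys

from dbs.apis.dbsClient import DbsApi

dbs3_url = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'

if __name__ == '__main__':
    api = DbsApi(url=dbs3_url)
    # smoke test: list a few datasets matching a wildcard pattern
    for ds in api.listDatasets(dataset='/ZMM*/*/*', detail=True)[:3]:
        print(ds['dataset'], ds['dataset_access_type'])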
Example #2
def getFilenames(txtFile):
    # DBS3 client python API
    dbs = DbsApi('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    global_director = "root://cmsxrootd.fnal.gov/"

    # Read out input files containing data set names
    with open(txtFile) as f:
        datasets = [
            dataset for dataset in f.read().splitlines() if dataset != ""
        ]

    # Fill file lists using the DBS3 client API
    filelist = {}

    for setname in datasets:
        if "mc" in setname:
            filelist[setname.split("/")[1]] = [
                global_director + filename['logical_file_name']
                for filename in dbs.listFiles(dataset=setname, detail=1)
            ]

        elif "user" in setname:
            filelist[setname.split("/")[5]] = [global_director + setname]

        else:
            filelist[setname.split("/")[1] + "-" + setname.split("/")[2]] = [
                global_director + filename['logical_file_name']
                for filename in dbs.listFiles(dataset=setname, detail=1)
            ]

    # print filelist

    return filelist
Example #3
def getDatasetSize(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve block summaries aggregated over the whole dataset
    reply = dbsapi.listBlockSummaries(dataset=dataset)
    return reply[0]['file_size']
Example #4
def dbs3_get_data(dataset, timestamps=1):

    #q = "/afs/cern.ch/user/s/spinoso/public/dbs3wrapper.sh /afs/cern.ch/user/c/cmst2/mc/scripts/datasetinfo.py --dataset %s --json" % dataset
    #output=os.popen(q).read()
    #s = json.loads(output)
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    try:
        reply = dbsapi.listDatasets(dataset=dataset,
                                    dataset_access_type='*',
                                    detail=True)
        #print(reply)
        if len(reply):
            status = reply[0]['dataset_access_type']
            reply = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
            cnt = 0
            for block in reply:
                cnt += int(block['num_event'])
            return [cnt, status, int(cnt / 100.)]
        else:
            print(dataset, "not existing")
            return [0, '', 0]

    except Exception:
        print("crash dbs3")
        return [0, '', 0]
Example #5
def getLumiListInValidFiles(dataset, dbsurl = 'phys03'):
    """
    Get the runs/lumis in the valid files of a given dataset.

    dataset: the dataset name as published in DBS
    dbsurl: the DBS URL or DBS prod instance

    Returns a LumiList object.
    """
    dbsurl = DBSURLS['reader'].get(dbsurl, dbsurl)
    dbs3api = DbsApi(url=dbsurl)
    try:
        files = dbs3api.listFileArray(dataset=dataset, validFileOnly=0, detail=True)
    except Exception as ex:
        msg  = "Got DBS client error requesting details of dataset '%s' on DBS URL '%s': %s" % (dataset, dbsurl, ex)
        msg += "\n%s" % (traceback.format_exc())
        raise ClientException(msg)
    if not files:
        msg = "Dataset '%s' not found in DBS URL '%s'." % (dataset, dbsurl)
        raise ClientException(msg)
    validFiles = [f['logical_file_name'] for f in files if f['is_file_valid']]
    blocks = set([f['block_name'] for f in files])
    runLumiPairs = []
    for blockName in blocks:
        fileLumis = dbs3api.listFileLumis(block_name=blockName)
        for f in fileLumis:
            if f['logical_file_name'] in validFiles:
                run = f['run_num']
                lumis = f['lumi_section_num']
                for lumi in lumis:
                    runLumiPairs.append((run,lumi))
    lumiList = LumiList(lumis=runLumiPairs)

    return lumiList
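A hypothetical call of the function above; the dataset name is a placeholder, and getCompactList/writeJSON are methods the returned WMCore LumiList object should expose.

lumis = getLumiListInValidFiles('/SomePD/SomeEra-v1/MINIAOD', dbsurl='global')
print(lumis.getCompactList())        # {'run': [[first, last], ...], ...}
lumis.writeJSON('valid_lumis.json')  # persist the run/lumi mask for reuse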
Example #7
def getDatasetFileLumis(dataset):
    url="https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
    api=DbsApi(url=url)
    dRunLumis = {}

    files = api.listFiles(dataset=dataset)
    files = [f.get('logical_file_name','') for f in files]

    # chunk into size less than 1000 or else DBS complains
    fileChunks = getChunks(files)

    for fileChunk in fileChunks:

        info = api.listFileLumiArray(logical_file_name=fileChunk)

        for f in info:
            fname = f['logical_file_name']
            dRunLumis[fname] = {}
            run, lumis = str(f['run_num']), f['lumi_section_num']
            if run not in dRunLumis[fname]: dRunLumis[fname][run] = []
            dRunLumis[fname][run].extend(lumis)

    for fname in dRunLumis.keys():
        for run in dRunLumis[fname].keys():
            dRunLumis[fname][run] = listToRanges(dRunLumis[fname][run])

    return dRunLumis
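getChunks and listToRanges come from elsewhere in that project; minimal sketches consistent with how they are used above (the 1000-item limit is taken from the comment in the code) might look like this.

def getChunks(items, chunkSize=1000):
    # yield successive slices of at most chunkSize items
    for i in range(0, len(items), chunkSize):
        yield items[i:i + chunkSize]

def listToRanges(nums):
    # collapse e.g. [1, 2, 3, 7, 8] into [[1, 3], [7, 8]]
    ranges = []
    for n in sorted(set(nums)):
        if ranges and n == ranges[-1][1] + 1:
            ranges[-1][1] = n
        else:
            ranges.append([n, n])
    return ranges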
Example #8
def uploadWorker(input, results, dbsUrl):
    """
    _uploadWorker_

    Put JSONized blocks in the input
    Get confirmation in the output
    """

    # Init DBS Stuff
    logging.debug("Creating dbsAPI with address %s" % dbsUrl)
    dbsApi = DbsApi(url = dbsUrl)


    while True:

        try:
            work = input.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logging.error(crashMessage)
            break

        if work == 'STOP':
            # Then halt the process
            break

        name  = work.get('name', None)
        block = work.get('block', None)

        # Do stuff with DBS
        try:
            logging.debug("About to call insert block with block: %s" % block)
            dbsApi.insertBulkBlock(blockDump = block)
            results.put({'name': name, 'success': "uploaded"})
        except Exception as ex:
            exString = str(ex)
            if 'Block %s already exists' % name in exString:
                # Then this is probably a duplicate
                # Ignore this for now
                logging.error("Had duplicate entry for block %s. Ignoring for now." % name)
                logging.debug("Exception: %s" % exString)
                logging.debug("Traceback: %s" % str(traceback.format_exc()))
                results.put({'name': name, 'success': "uploaded"})
            elif 'Proxy Error' in exString:
                # This is probably a successful insertion that went bad.
                # Put it on the check list
                msg = "Got a proxy error for block (%s)." % name
                logging.error(msg)
                logging.error(str(traceback.format_exc()))
                results.put({'name': name, 'success': "check"})
            else:
                msg =  "Error trying to process block %s through DBS.\n" % name
                msg += exString
                logging.error(msg)
                logging.error(str(traceback.format_exc()))
                logging.debug("block: %s \n" % block)
                results.put({'name': name, 'success': "error", 'error': msg})

    return
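A hedged sketch of how such a worker is usually driven: blocks go onto a multiprocessing queue, one 'STOP' sentinel per worker shuts them down, and confirmations come back on a results queue. The runUploads name and the blocks dict are illustrative.

import multiprocessing

def runUploads(blocks, dbsUrl, nProc=4):
    # blocks: dict mapping block name -> JSONized block dump
    workInput = multiprocessing.Queue()
    results = multiprocessing.Queue()
    workers = [multiprocessing.Process(target=uploadWorker,
                                       args=(workInput, results, dbsUrl))
               for _ in range(nProc)]
    for proc in workers:
        proc.start()
    for name, block in blocks.items():
        workInput.put({'name': name, 'block': block})
    for _ in workers:
        workInput.put('STOP')  # one sentinel per worker
    outcome = [results.get() for _ in blocks]
    for proc in workers:
        proc.join()
    return outcome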
Example #10
def getDatasetStatus(dataset):
    """
    Gets the dataset status (access type): VALID, INVALID, PRODUCTION, DEPRECATED
    """
    dbsapi = DbsApi(url=dbs3_url)
    reply = dbsapi.listDatasets(dataset=dataset,dataset_access_type='*',detail=True)
    return reply[0]['dataset_access_type']
Example #11
def duplicateLumi(dataset, verbose=False, skipInvalid=False):
    """
    checks if output dataset has duplicate lumis
    returns true if at least one duplicate lumi was found
    Verbose: if true prints details
    skipInvalid: if True, skip invalid files; defaults to False because that is faster
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    duplicated = False
    lumisChecked = {}
    # retrieve files
    reply = dbsapi.listFiles(dataset=dataset, detail=skipInvalid)
    for f in reply:
        logical_file_name = f['logical_file_name']
        #skip invalid files
        if skipInvalid and f['is_file_valid'] != 1:
            continue
        reply2 = dbsapi.listFileLumis(logical_file_name=logical_file_name)
        #retrieve lumis for each file
        lumis = reply2[0]['lumi_section_num']
        #check that each lumi is only in one file
        for lumi in lumis:
            if lumi in lumisChecked:
                #if verbose print results, if not end quickly
                if verbose:
                    print('Lumi', lumi, 'is in these files')
                    print(logical_file_name)
                    print(lumisChecked[lumi])
                    duplicated = True
                else:
                    return True
            else:
                lumisChecked[lumi] = logical_file_name
    return duplicated
Example #12
def crabConfig(dataSet, setName, outDir, systematics, channels, era):
    isSignal = "HPlus" in setName
    isData = "Single" in setName or "JetHT" in setName or "EGamma" in setName

    outFiles = []

    for systematic in systematics:
        if systematic == "":
            outFiles.append("{}.root".format(setName))
            continue

        if isData:
            break

        for shift in ["Up", "Down"]:
            outFiles.append("{}_{}{}.root".format(setName, systematic, shift))

    # Calculate number of files per job
    url = "https://cmsweb.cern.ch/dbs/prod/{}/DBSReader".format(
        "global")  # if not isSignal else "phys03")
    api = DbsApi(url=url)
    files = api.listFiles(dataset=dataSet, detail=1)

    eventsPerFile = sum(f["event_count"] for f in files) / len(files)
    filesPerJob = int(math.ceil(300000. / eventsPerFile))

    ##Crab config
    crabConf = config()

    crabConf.General.requestName = "Skim_{}".format(era)
    crabConf.General.workArea = outDir
    crabConf.General.transferOutputs = True
    crabConf.General.transferLogs = False

    crabConf.JobType.pluginName = "Analysis"
    crabConf.JobType.psetName = "{}/src/ChargedSkimming/Skimming/python/miniskimmer.py".format(
        os.environ["CMSSW_BASE"])
    crabConf.JobType.pyCfgParams = [
        "outname={}.root".format(setName),
        "channel={}".format(",".join(channels)), "era={}".format(era)
    ]
    crabConf.JobType.outputFiles = outFiles
    crabConf.JobType.maxJobRuntimeMin = 1440
    crabConf.JobType.maxMemoryMB = 2500
    crabConf.JobType.allowUndistributedCMSSW = True

    crabConf.Data.inputDataset = dataSet
    crabConf.Data.inputDBS = "global"  # if not isSignal else "phys03"
    crabConf.Data.splitting = "FileBased"
    crabConf.Data.unitsPerJob = filesPerJob
    crabConf.Data.outLFNDirBase = "/store/user/dbrunner/skim/{}/{}".format(
        "_".join([
            str(getattr(time.localtime(), "tm_" + t))
            for t in ["mday", "mon", "year"]
        ]), era)

    crabConf.Site.storageSite = "T2_DE_DESY"
    crabConf.User.voGroup = "dcms"

    return crabConf
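Presumably the returned config is then submitted through the standard CRAB3 client API; the argument values below are illustrative.

from CRABAPI.RawCommand import crabCommand

conf = crabConfig(dataSet='/SomePD/SomeCampaign-v1/MINIAODSIM',
                  setName='TT', outDir='crab_skims', systematics=[''],
                  channels=['mu4j'], era='2017')
crabCommand('submit', config=conf)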
Example #14
def das_files(dataset):
    dataset_split = dataset.split('/')
    dataset_split[2] = 'RunIIFall17NanoAODv4*'
    datasetv4 = '/'.join(dataset_split)

    dbs = DbsApi('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    return dbs.listDatasets(dataset=datasetv4)
Example #15
def duplicateLumi(dataset, verbose=False):
    """
    checks if output dataset has duplicate lumis
    returns true if at least one duplicate lumi was found
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    duplicated = False
    lumisChecked={}
    # retrieve files
    reply = dbsapi.listFiles(dataset=dataset)
    for f in reply:
        logical_file_name = f['logical_file_name']
        reply2 = dbsapi.listFileLumis(logical_file_name=logical_file_name)
        #retrieve lumis for each file
        lumis = reply2[0]['lumi_section_num']
        #check that each lumi is only in one file
        for lumi in lumis:
            if lumi in lumisChecked:
                #if verbose print results, if not end quickly
                if verbose:
                    print('Lumi', lumi, 'is in these files')
                    print(logical_file_name)
                    print(lumisChecked[lumi])
                    duplicated = True
                else:
                    return True
            else:
                lumisChecked[lumi] = logical_file_name
    return duplicated
Example #16
def uploadWorker(workInput, results, dbsUrl):
    """
    _uploadWorker_

    Put JSONized blocks in the workInput
    Get confirmation in the output
    """

    # Init DBS Stuff
    logging.debug("Creating dbsAPI with address %s", dbsUrl)
    dbsApi = DbsApi(url = dbsUrl)


    while True:

        try:
            work = workInput.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logging.error(crashMessage)
            break

        if work == 'STOP':
            # Then halt the process
            break

        name  = work.get('name', None)
        block = work.get('block', None)

        # Do stuff with DBS
        try:
            logging.debug("About to call insert block with block: %s", block)
            dbsApi.insertBulkBlock(blockDump = block)
            results.put({'name': name, 'success': "uploaded"})
        except Exception as ex:
            exString = str(ex)
            if 'Block %s already exists' % name in exString:
                # Then this is probably a duplicate
                # Ignore this for now
                logging.error("Had duplicate entry for block %s. Ignoring for now.", name)
                logging.debug("Exception: %s", exString)
                logging.debug("Traceback: %s", str(traceback.format_exc()))
                results.put({'name': name, 'success': "uploaded"})
            elif 'Proxy Error' in exString:
                # This is probably a successful insertion that went bad.
                # Put it on the check list
                msg = "Got a proxy error for block (%s)." % name
                logging.error(msg)
                logging.error(str(traceback.format_exc()))
                results.put({'name': name, 'success': "check"})
            else:
                msg =  "Error trying to process block %s through DBS.\n" % name
                msg += exString
                logging.error(msg)
                logging.error(str(traceback.format_exc()))
                logging.debug("block: %s \n", block)
                results.put({'name': name, 'success': "error", 'error': msg})

    return
Example #17
def getDatasetStatus(dataset):
    "Return dataset status"
    dbsapi = DbsApi(url=DBS3, verifypeer=False)
    reply = dbsapi.listDatasets(dataset=dataset,
                                dataset_access_type='*',
                                detail=True)
    return reply[0]['dataset_access_type']
Example #18
def duplicateLumi(dataset, verbose=False, skipInvalid=False):
    """
    checks if output dataset has duplicate lumis
    returns true if at least one duplicate lumi was found
    Verbose: if true prints details
    skipInvalid: if True, skip invalid files; defaults to False because that is faster
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    duplicated = False
    lumisChecked={}
    # retrieve files
    reply = dbsapi.listFiles(dataset=dataset, detail=skipInvalid)
    for f in reply:
        logical_file_name = f['logical_file_name']
        #skip invalid files
        if skipInvalid and f['is_file_valid'] != 1 :
            continue
        reply2 = dbsapi.listFileLumis(logical_file_name=logical_file_name)
        #retrieve lumis for each file
        lumis = reply2[0]['lumi_section_num']
        #check that each lumi is only in one file
        for lumi in lumis:
            if lumi in lumisChecked:
                #if verbose print results, if not end quickly
                if verbose:
                    print('Lumi', lumi, 'is in these files')
                    print(logical_file_name)
                    print(lumisChecked[lumi])
                    duplicated = True
                else:
                    return True
            else:
                lumisChecked[lumi] = logical_file_name
    return duplicated
Example #19
    def getBlockSitesFromLocalDBS3(self,dbs_url):

        ## find the location for each block in the list
        from dbs.apis.dbsClient import DbsApi
        api = DbsApi(dbs_url)

        from NodeNameUtils import getMapOfSEHostName2PhedexNodeNameFromPhEDEx

        se2pnn = getMapOfSEHostName2PhedexNodeNameFromPhEDEx()

        blockSites = {}
        for block in self.Listfileblocks:
            blockInfo=api.listBlocks(block_name=block,detail=True)
            location=blockInfo[0]['origin_site_name']
            if location == 'UNKNOWN':
                blockSites[block] = []
            else:
                #if locationIsValidPNN:
                if location.startswith('T2_') or location.startswith('T3_'):
                    blockSites[block] = [location]
                else:
                    if location in se2pnn.keys():
                        blockSites[block] = [se2pnn[location]]
                    else:
                        msg = "ERROR: unknown location for block: %s. Skip this block" % location
                        common.logger.info(msg)
                        blockSites[block] = []

        return blockSites
Example #20
class MigrationToGlobal:
    
    def __init__(self):
        
        # Initialize dbs API
        dbsUrl = 'https://cmsweb.cern.ch/dbs/prod/global/DBSMigrate/'
        #dbsUrl = 'https://cmsweb-testbed.cern.ch/dbs/int/global/DBSMigrate/'
        self.dbsApi = DbsApi(url = dbsUrl)
        
        # Timing variable
        self.isOver = False
        
        # Timeout of the script
        self.time_out = 600 #10 min
        
        # Migration Requests
        self.migrationRequests = {}
        
    def updateRequest(self, over):
        """
        This updates the migration requests status
        First calls DBS3 for the migration status. If it is the last loop (over = True),
        removes submitted requests and handles the incomplete ones
        """
        for task in self.migrationRequests.keys():
            # Loop over all the submitted migration requests
            if self.migrationRequests[task]['status'] == 'submitted':
                request_status = self.dbsApi.statusMigration(migration_rqst_id = self.migrationRequests[task]['id'])
                status = request_status[0]['migration_status']
                if status == 2: # Migration completed
                    self.migrationRequests[task]['status'] = 'successful'
                    print('Migration to global succeeded: ' + self.migrationRequests[task]['dataset'])
                elif status == 9: # Migration failed
                    self.migrationRequests[task]['status'] = 'migration failed'
                    print('Migration to global failed: ' + self.migrationRequests[task]['dataset'])
                elif status == 3 and over: # Migration failed, no more retries due to script timeout
                    self.removeRequest(self.migrationRequests[task])
                    self.migrationRequests[task]['status'] = 'migration failed'
                    print('Migration to global failed: ' + self.migrationRequests[task]['dataset'])
                elif status == 0 and over: # Migration is still pending, remove it
                    self.removeRequest(self.migrationRequests[task])
                    self.migrationRequests[task]['status'] = 'migration not processed'
                    print('Migration to global not processed: ' + self.migrationRequests[task]['dataset'])
                elif status == 1 and over: # Migration still in progress
                    self.migrationRequests[task]['status'] = 'still processing'
                    print('DBS3 is still processing migration %s' % self.migrationRequests[task]['id'])
            
    def removeRequest(self, migration):
        """
        Remove a migration request
        This only works if the status from DBS is 0 (pending) or 3 (failed)
        """
        try:
            toDelete = {'migration_rqst_id': migration['id']}
            self.dbsApi.removeMigration(toDelete)
        except Exception as ex:
            print('There was something wrong when migrating %s' % migration['dataset'])
            print('Exception: ' + str(ex) + '\n')
            print('Traceback: ' + str(traceback.format_exc()))
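The class polls and removes requests but never shows how one is created. Below is a sketch of a matching submit method, assuming the standard DbsApi.submitMigration call; the response layout used here is an assumption.

    def submitRequest(self, dataset, sourceUrl):
        """
        Submit a migration of dataset from sourceUrl into global DBS
        and record it in self.migrationRequests.
        """
        data = {'migration_url': sourceUrl, 'migration_input': dataset}
        result = self.dbsApi.submitMigration(data)
        # assumed response layout: {'migration_details': {'migration_request_id': ...}}
        reqId = result['migration_details']['migration_request_id']
        self.migrationRequests[dataset] = {'id': reqId,
                                           'dataset': dataset,
                                           'status': 'submitted'}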
Example #21
def uploadWorker(workInput, results, dbsUrl):
    """
    _uploadWorker_

    Put JSONized blocks in the workInput
    Get confirmation in the output
    """

    # Init DBS Stuff
    logging.debug("Creating dbsAPI with address %s", dbsUrl)
    dbsApi = DbsApi(url=dbsUrl)

    while True:

        try:
            work = workInput.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logging.error(crashMessage)
            break

        if work == 'STOP':
            # Then halt the process
            break

        name = work.get('name', None)  # this is the block name
        block = work.get('block', None)  # this is the block data structure

        # Do stuff with DBS
        try:
            logging.debug("About to call insert block with block: %s", block)
            dbsApi.insertBulkBlock(blockDump=block)
            results.put({'name': name, 'success': "uploaded"})
        except Exception as ex:
            exString = str(ex)
            if 'Block %s already exists' % name in exString:
                # Then this is probably a duplicate
                # Ignore this for now
                logging.warning("Block %s already exists. Marking it as uploaded.", name)
                logging.debug("Exception: %s", exString)
                results.put({'name': name, 'success': "uploaded"})
            elif 'Proxy Error' in exString:
                # This is probably a successful insertion that went bad.
                # Put it on the check list
                msg = "Got a proxy error for block %s." % name
                logging.warning(msg)
                results.put({'name': name, 'success': "check"})
            elif 'Missing data when inserting to dataset_parents' in exString:
                msg = "Parent dataset is not inserted yet for block %s." % name
                logging.warning(msg)
                results.put({'name': name, 'success': "error", 'error': msg})
            else:
                msg = "Error trying to process block %s through DBS. Error: %s" % (name, exString)
                logging.exception(msg)
                logging.debug("block info: %s \n", block)
                results.put({'name': name, 'success': "error", 'error': msg})

    return
Example #22
def get_filenames(bkgTXT, dataTXT, sigTXT):
    ## DBS3 client python API
    dbs = DbsApi('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    global_director = "root://cmsxrootd.fnal.gov/"

    ##Read out input files containing data set names
    if bkgTXT:
        with open(bkgTXT) as f:
            background = [
                background for background in f.read().splitlines()
                if background != ""
            ]

    else:
        background = []

    if dataTXT:
        with open(dataTXT) as f:
            data = [data for data in f.read().splitlines() if data != ""]

    else:
        data = []

    if sigTXT:
        with open(sigTXT) as f:
            signal = [
                signal for signal in f.read().splitlines() if signal != ""
            ]

    else:
        signal = []

    ## Fill file lists using the DBS3 client API
    filelist = {}

    for setname in background + data:
        if "mc" in setname:
            key = setname.split("/")[1]

        else:
            key = setname.split("/")[1] + "-" + setname.split("/")[2]

        filelist[key] = [
            global_director + filename['logical_file_name']
            for filename in dbs.listFiles(dataset=setname, detail=1)
        ]

    ##Read out signal files with gfal-ls command
    for SEpath in signal:
        key = SEpath.split("/")[-2]
        signalFiles = subprocess.check_output(["gfal-ls",
                                               SEpath]).split("\n")[:-1]

        filelist[key] = [
            global_director + SEpath[74:] + "/" + signalFile
            for signalFile in signalFiles
        ]

    return filelist
Example #23
def getDatasetEventsPerLumi(dataset):
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    all_files = dbsapi.listFileSummaries(dataset=dataset, validFileOnly=1)
    try:
        average = sum([f['num_event']/float(f['num_lumi']) for f in all_files]) / float(len(all_files))
    except Exception:
        average = 100
    return average
Example #24
def getDataTiers(dbsUrl):
    """
    Function to retrieve all the datatiers from DBS.
    NOTE: to be used with some caching (MemoryCacheStruct)
    :param dbsUrl: the DBS URL string
    :return: a list of strings/datatiers
    """
    dbs = DbsApi(dbsUrl)
    return [tier['data_tier_name'] for tier in dbs.listDataTiers()]
Example #26
def getDatasetStatus(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    # retrieve dataset summary
    reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*', detail=True)
    if len(reply):
        return reply[0]['dataset_access_type']
    else:
        return None
Example #27
def getFileCount(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
    cnt = 0
    for block in reply:
        cnt += int(block['num_file'])
    return cnt
Example #28
def getSize(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlocks(dataset=dataset, detail=True)
    sum = 0
    for block in reply:
        sum = sum + block['block_size']
    return sum
Example #29
def getLumiCountDataSet(dataset):
    """
    Get the number of unique lumis in a dataset
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listFileSummaries(dataset=dataset)
    return reply[0]['num_lumi']
Example #30
    def _setDatatiersCache(ts, dbsUrl):
        """
        Set a timestamp and update the list of datatiers cached in
        the class property
        """
        dbs = DbsApi(dbsUrl)
        DBS3Reader._datatiers['ts'] = ts
        DBS3Reader._datatiers['tiers'] = [tier['data_tier_name'] for tier in dbs.listDataTiers()]

        return
Example #31
def getNumberofFilesPerRun(das_url, dataset, run):
    """
    Count number of files
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)

    # retrieve file list
    reply = dbsapi.listFiles(dataset=dataset)
    return len(reply)
Example #32
def getFileCountDataset(dataset):
    """
    Returns the number of files registered in DBS3
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)

    # retrieve file list
    reply = dbsapi.listFiles(dataset=dataset)
    return len(reply)
Example #34
def getEventCountBlock(block):
    """
    Returns the number of events in a block using DBS3
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlockSummaries(block_name=block)
    return reply[0]['num_event']
Example #35
def getDatasetEventsPerLumi(dataset):
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    all_files = dbsapi.listFileSummaries(dataset=dataset, validFileOnly=1)
    try:
        average = sum(
            [f['num_event'] / float(f['num_lumi'])
             for f in all_files]) / float(len(all_files))
    except Exception:
        average = 100
    return average
Example #38
    def __init__(self, config):
        self.br = Browser()

        self.config = config
        
        # Initialise connections
        self.mySiteDB = SiteDBJSON()
        self.dbsPhys01 = DbsApi(url = dbs_base_url+"phys01/DBSReader/")
        self.dbsPhys02 = DbsApi(url = dbs_base_url+"phys02/DBSReader/")
        self.dbsPhys03 = DbsApi(url = dbs_base_url+"phys03/DBSReader/")
Example #39
def getEventCountDataSetBlockList(dataset, blockList):
    """
    Counts and adds all the events for a given list of
    blocks inside a dataset
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    reply = dbsapi.listBlockSummaries(block_name=blockList)
    return reply[0]['num_event']
Example #40
def getEventCountDataSet(dataset):
    """
    Returns the number of events in a dataset using DBS3

    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlockSummaries(dataset=dataset)
    return reply[0]['num_event']
Example #41
    def __init__(self, config):
        self.br = Browser()

        self.config = config
        
        # Initialise connections
        self.phedex = PhEDEx({"endpoint":"https://cmsweb.cern.ch/phedex/datasvc/json/prod/"}, "json")
        self.dbsPhys01 = DbsApi(url = dbs_base_url+"phys01/DBSReader/")
        self.dbsPhys02 = DbsApi(url = dbs_base_url+"phys02/DBSReader/")
        self.dbsPhys03 = DbsApi(url = dbs_base_url+"phys03/DBSReader/")
Example #42
def duplicateRunLumi(dataset, verbose=False, skipInvalid=False):
    """
    checks if output dataset has duplicate lumis
    for every run.
    returns true if at least one duplicate lumi was found
    That is if there is the same lumi in the same run and
    two different files
    This can be used on datasets that have separate
    runs.
    Verbose: if true prints details
    skipInvalid: if true skips invalid files, by default is False because is faster
    """
    dbsapi = DbsApi(url=dbs3_url)
    duplicated = False
    #check each run
    runs = getRunsDataset(dataset)
    #if only one run in the list
    if len(runs) == 1:
        if verbose:
            print "only one run:", runs
        return duplicateLumi(dataset, verbose, skipInvalid)
    #else manually
    for run in runs:
        #create a set
        lumisChecked = {}
        # retrieve files for that run
        reply = dbsapi.listFiles(dataset=dataset, detail=skipInvalid)
        for f in reply:
            #skip invalid files
            if skipInvalid and f['is_file_valid'] != 1:
                continue
            logical_file_name = f['logical_file_name']
            reply2 = dbsapi.listFileLumis(logical_file_name=logical_file_name,
                                          run_num=run)
            #retrieve lumis for each file
            if reply2:
                lumis = reply2[0]['lumi_section_num']
            else:
                continue
            #check that each lumi is only in one file
            for lumi in lumis:
                if lumi in lumisChecked:
                    #if verbose print results, if not end quickly
                    if verbose:
                        print('Lumi', lumi, 'in run', run, 'is in these files')
                        print(logical_file_name)
                        print(lumisChecked[lumi])
                        duplicated = True
                    else:
                        return True
                else:
                    lumisChecked[lumi] = logical_file_name

    return duplicated
Example #43
def getDatasetStatus(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    # retrieve dataset summary
    reply = dbsapi.listDatasets(dataset=dataset,
                                dataset_access_type='*',
                                detail=True)
    if len(reply):
        return reply[0]['dataset_access_type']
    else:
        return None
Example #44
    def __init__(self, url, logger=None, **contact):

        # instantiate dbs api object
        try:
            self.dbsURL = url.replace("cmsweb.cern.ch", "cmsweb-prod.cern.ch")
            self.dbs = DbsApi(self.dbsURL, **contact)
            self.logger = logger or logging.getLogger(self.__class__.__name__)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
Example #45
def getLumiCountDataSet(dataset):
    """
    Get the number of unique lumis in a dataset
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listFileSummaries(dataset=dataset)
    if not reply:
        return 0
    return reply[0]['num_lumi']
Example #46
    def __init__(self, url, **contact):

        # instantiate dbs api object
        try:
            self.dbs = DbsApi(url, **contact)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # connection to PhEDEx (Use default endpoint url)
        self.phedex = PhEDEx(responseType="json")
Example #47
    def __init__(self, config):
        """
        Initialise class members
        """
        logging.info("Running __init__ for DBS3 Uploader")
        BaseWorkerThread.__init__(self)
        self.config = config

        # This is slightly dangerous, but DBSUpload depends
        # on DBSInterface anyway
        self.dbsUrl = self.config.DBS3Upload.dbsUrl

        self.dbsUtil = DBSBufferUtil()

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.pool = []
        self.blocksToCheck = []
        self.workInput = None
        self.workResult = None
        self.nProc = getattr(self.config.DBS3Upload, 'nProcesses', 4)
        self.wait = getattr(self.config.DBS3Upload, 'dbsWaitTime', 2)
        self.nTries = getattr(self.config.DBS3Upload, 'dbsNTries', 300)
        self.physicsGroup = getattr(self.config.DBS3Upload, "physicsGroup",
                                    "NoGroup")
        self.datasetType = getattr(self.config.DBS3Upload, "datasetType",
                                   "PRODUCTION")
        self.primaryDatasetType = getattr(self.config.DBS3Upload,
                                          "primaryDatasetType", "mc")
        self.blockCount = 0
        self.dbsApi = DbsApi(url=self.dbsUrl)

        # List of blocks currently in processing
        self.queuedBlocks = []

        # Set up the pool of worker processes
        self.setupPool()

        # Setting up any cache objects
        self.blockCache = {}

        self.filesToUpdate = []

        self.produceCopy = getattr(self.config.DBS3Upload, 'copyBlock', False)
        self.copyPath = getattr(self.config.DBS3Upload, 'copyBlockPath',
                                '/data/mnorman/block.json')

        self.timeoutWaiver = 1

        return
Example #48
def duplicateRunLumi(dataset, verbose=False, skipInvalid=False):
    """
    checks if output dataset has duplicate lumis
    for every run.
    returns true if at least one duplicate lumi was found
    That is if there is the same lumi in the same run and
    two different files
    This can be used on datasets that have separate
    runs.
    Verbose: if true prints details
    skipInvalid: if true skips invalid files, by default is False because is faster
    """  
    dbsapi = DbsApi(url=dbs3_url)
    duplicated = False
    #check each run
    runs = getRunsDataset(dataset)
    #if only one run in the list
    if len(runs) == 1:
        if verbose:
            print "only one run:",runs
        return duplicateLumi(dataset, verbose, skipInvalid)
    #else manually
    for run in runs:
        #create a set
        lumisChecked={}
        # retrieve files for that run
        reply = dbsapi.listFiles(dataset=dataset, detail=skipInvalid)
        for f in reply:
            #skip invalid files
            if skipInvalid and f['is_file_valid'] != 1 :
                continue
            logical_file_name = f['logical_file_name']
            reply2 = dbsapi.listFileLumis(logical_file_name=logical_file_name, run_num=run)
            #retrieve lumis for each file
            if reply2:
                lumis = reply2[0]['lumi_section_num']
            else:
                continue
            #check that each lumi is only in one file
            for lumi in lumis:
                if lumi in lumisChecked:
                    #if verbose print results, if not end quickly
                    if verbose:
                        print('Lumi', lumi, 'in run', run, 'is in these files')
                        print(logical_file_name)
                        print(lumisChecked[lumi])
                        duplicated = True
                    else:
                        return True
                else:
                    lumisChecked[lumi] = logical_file_name

    return duplicated
Example #49
def getDatasetEventsAndLumis(dataset, blocks=None):
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    all_files = []
    if blocks:
        for b in blocks:
            all_files.extend(dbsapi.listFileSummaries(block_name=b, validFileOnly=1))
    else:
        all_files = dbsapi.listFileSummaries(dataset=dataset, validFileOnly=1)

    all_events = sum([f['num_event'] for f in all_files])
    all_lumis = sum([f['num_lumi'] for f in all_files])
    return all_events,all_lumis
Example #50
def uploadWorker(input, results, dbsUrl):
    """
    _uploadWorker_

    Put JSONized blocks in the input
    Get confirmation in the output
    """

    # Init DBS Stuff
    logging.debug("Creating dbsAPI with address %s" % dbsUrl)
    dbsApi = DbsApi(url = dbsUrl)


    while True:

        try:
            work = input.get()
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logging.error(crashMessage)
            break

        if work == 'STOP':
            # Then halt the process
            break

        name  = work.get('name', None)
        block = work.get('block', None)


        # Do stuff with DBS
        try:
            logging.debug("About to call insert block with block: %s" % block)
            dbsApi.insertBulkBlock(blockDump = block)
            results.put({'name': name, 'success': True})
        except Exception as ex:
            exString = str(ex)
            if 'Duplicate entry' in exString:
                # Then this is probably a duplicate
                # Ignore this for now
                logging.error("Had duplicate entry for block %s\n" % name)
                logging.error("Ignoring for now.\n")
                logging.error("Exception: %s\n" % exString)
                logging.error("Traceback: %s\n" % str(traceback.format_exc()))
                results.put({'name': name, 'success': True})
            else:
                msg =  "Error trying to process block %s through DBS.\n" % name
                msg += exString
                logging.error(msg)
                logging.error(str(traceback.format_exc()))
                results.put({'name': name, 'success': False, 'error': msg})
Example #51
    def getBlockSitesFromLocalDBS3(self,dbs_url):

        ## find the location for each block in the list
        from dbs.apis.dbsClient import DbsApi
        api = DbsApi(dbs_url)

        blockSites = {}
        for block in self.Listfileblocks:
            blockInfo=api.listBlocks(block_name=block,detail=True)
            location=blockInfo[0]['origin_site_name']
            blockSites[block] = [location]

        return blockSites
Example #52
def lookup_summary(ds):
    """lookup basic information of a dataset from DBS3 API

    :param ds: dataset name
    :type ds: str
    :return: infodict
    :rtype: dict
    """
    dbs3api = DbsApi("https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
    res = dbs3api.listFileSummaries(dataset=ds)
    # [{'num_file': 1599, 'file_size': 10341982224399, 'num_event': 28159421,
    # 'num_lumi': 198621, 'num_block': 234}]
    return res[0]
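An illustrative call; the dataset name is a placeholder.

info = lookup_summary('/SomePD/SomeEra-v1/NANOAODSIM')
print('{num_file} files, {num_block} blocks, {num_event} events, '
      '{num_lumi} lumis, {file_size} bytes'.format(**info))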
Example #53
def getLFNbase(url, dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve one file
    reply = dbsapi.listFiles(dataset=dataset)
    lfn_file = reply[0]['logical_file_name']
    # determine the LFN base
    lfn = '/store/mc'
    if '/store/himc' in lfn_file:
        lfn = '/store/himc'
    if '/store/data' in lfn_file:
        lfn = '/store/data'
    return lfn
Example #54
def hasAllBlocksClosed(dataset):
    """
    checks if a given dataset has all blocks closed
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlocks(dataset=dataset, detail=True)
    for block in reply:
        #print block['block_name']
        #print block['open_for_writing']
        if block['open_for_writing']:
            return False
    return True
Example #55
def getLumiCountDataSet(dataset, skipInvalid=False):
    """
    Get the number of unique lumis in a dataset
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    if not skipInvalid:
        reply = dbsapi.listFileSummaries(dataset=dataset)
    else:
        reply = dbsapi.listFileSummaries(dataset=dataset, validFileOnly=1)
    if not reply or not reply[0]:
        return 0
    return reply[0]['num_lumi']
Example #56
def getFileCountDataset(dataset, skipInvalid=False, onlyInvalid=False):
    """
    Returns the number of files registered in DBS3
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)

    # retrieve file list
    reply = dbsapi.listFiles(dataset=dataset, detail=(skipInvalid or onlyInvalid))
    if skipInvalid:
        reply = [f for f in reply if f['is_file_valid'] == 1]
    elif onlyInvalid:
        reply = [f for f in reply if f['is_file_valid'] == 0]
    return len(reply)