Example #1
def dbs3_get_data(dataset, timestamps=1):

    #q = "/afs/cern.ch/user/s/spinoso/public/dbs3wrapper.sh /afs/cern.ch/user/c/cmst2/mc/scripts/datasetinfo.py --dataset %s --json" % dataset
    #output=os.popen(q).read()
    #s = json.loads(output)
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    try:
        reply = dbsapi.listDatasets(dataset=dataset,
                                    dataset_access_type='*',
                                    detail=True)
        #print reply
        if len(reply):
            status = reply[0]['dataset_access_type']
            reply = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
            cnt = 0
            for block in reply:
                cnt += int(block['num_event'])
            return [cnt, status, int(cnt / 100.)]
        else:
            print dataset, "does not exist"
            return [0, '', 0]

    except Exception:
        print "DBS3 query failed for", dataset
        return [0, '', 0]
def getDatasetStatus(dataset):
    """
    Gets the dataset status (access type): VALID, INVALID, PRODUCTION, DEPRECATED
    """
    dbsapi = DbsApi(url=dbs3_url)
    reply = dbsapi.listDatasets(dataset=dataset,dataset_access_type='*',detail=True)
    return reply[0]['dataset_access_type']
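Both helpers above assume a module-level dbs3_url that the snippets never define; a minimal usage sketch under that assumption (the DbsApi import and the dataset path below are placeholders, not part of the original examples):

from dbs.apis.dbsClient import DbsApi

# assumed: the snippets above read a module-level dbs3_url
dbs3_url = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'

# placeholder dataset path, replace with a real one
dataset = '/Primary/Processed-v1/AODSIM'

nevents, status, _ = dbs3_get_data(dataset)
print('%s: %d events, status %s' % (dataset, nevents, status))
print('access type: %s' % getDatasetStatus(dataset))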
Example #3
def getDatasetStatus(dataset):
    """
    Gets the dataset status (access type): VALID, INVALID, PRODUCTION, DEPRECATED
    """
    dbsapi = DbsApi(url=dbs3_url)
    reply = dbsapi.listDatasets(dataset=dataset,dataset_access_type='*',detail=True)
    return reply[0]['dataset_access_type']
Example #4
def main():
#  args=sys.argv[1:]
#  data=args[0]

  sample_group = 'signal' # signal, background, data, all
  sample_list = get_sample_list(sample_group)
  sample_list.sort()

  url="https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
  api=DbsApi(url=url)

  for samp in sample_list:
    outputDataSets = ''
    #print('Checking {1}'.format(samp.DAS))
    outputDataSets = api.listDatasets(dataset=samp.DAS, detail = True, dataset_access_type='VALID')
 
    if outputDataSets:
      for ds in outputDataSets:
       #print('{0}'.format(ds['dataset']))
       #print('{0}'.format(ds['primary_ds_name']))
       #print('{0}'.format(ds['xtcrosssection']))
       nevents = api.listBlockSummaries(dataset=ds['dataset'])
       #print(nevents[0]['num_event'])    
       # this to create a table for the paper with dataset name and number of events 
       print('verb@ {0} @ & {1:.2e} & XX \\\\ '.format(ds['primary_ds_name'],nevents[0]['num_event'])) 
  sys.exit(0)
Example #5
def getDatasetStatus(dataset):
    "Return dataset status"
    dbsapi = DbsApi(url=DBS3, verifypeer=False)
    reply = dbsapi.listDatasets(dataset=dataset,
                                dataset_access_type='*',
                                detail=True)
    return reply[0]['dataset_access_type']
Example #6
def das_files(dataset):
    dataset_split = dataset.split('/')
    dataset_split[2] = 'RunIIFall17NanoAODv4*'
    datasetv4 = '/'.join(dataset_split)

    dbs = DbsApi('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    return dbs.listDatasets(dataset=datasetv4)
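A hedged usage sketch for das_files above: it swaps the processed-dataset field (index 2 of the '/'-split path) for a RunIIFall17NanoAODv4 wildcard before querying DBS. The input path below is only a placeholder:

# placeholder MiniAOD path; das_files rewrites the processed-dataset field to
# 'RunIIFall17NanoAODv4*' and lists the matching NanoAOD datasets
for entry in das_files('/MyPrimary/RunIIFall17MiniAODv2-SomeConditions-v1/MINIAODSIM'):
    print(entry['dataset'])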
Example #7
def getDatasetStatus(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    # retrieve dataset summary
    reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*', detail=True)
    if len(reply):
        return reply[0]['dataset_access_type']
    else:
        return None
Example #8
def dataset_client(args):

    if not dbsLoaded:
        logging.error(
            'You must source a crab environment to use DBS API.\nsource /cvmfs/cms.cern.ch/crab3/crab.sh'
        )
        return

    url = 'https://cmsweb.cern.ch/dbs/prod/{0}/DBSReader'.format(args.instance)
    dbsclient = DbsApi(url)

    sortType = {
        'name': 'dataset',
        'time': 'last_modification_date',
    }

    if args.datasets:
        datasets = []
        for dataset in args.datasets:
            datasets += dbsclient.listDatasets(dataset=dataset, detail=True)
        for d in sorted(datasets, key=lambda k: k[sortType[args.sortOrder]]):
            print(d['dataset'])

    if args.primaryDatasets or args.acquisitionEras or args.processNames or args.dataTiers:
        pds = args.primaryDatasets if args.primaryDatasets else ['']
        aes = args.acquisitionEras if args.acquisitionEras else ['']
        dts = args.dataTiers if args.dataTiers else ['']
        datasets = []
        for pd, ae, dt in itertools.product(pds, aes, dts):
            kwargs = {}
            if pd: kwargs['primary_ds_name'] = pd
            if ae: kwargs['acquisition_era_name'] = ae
            if dt: kwargs['data_tier_name'] = dt
            kwargs['detail'] = True
            datasets += dbsclient.listDatasets(**kwargs)
        for d in sorted(datasets, key=lambda k: k[sortType[args.sortOrder]]):
            if args.processNames:
                if not any([
                        fnmatch.fnmatch(d['processed_ds_name'], pn)
                        for pn in args.processNames
                ]):
                    continue
            print(d['dataset'])
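dataset_client above reads an argparse-style namespace and relies on module-level itertools, fnmatch, and logging imports plus a dbsLoaded flag set when the DBS client import succeeds; a minimal driver sketch under those assumptions (the attribute names follow the code above, the dataset pattern is a placeholder):

import argparse

# hypothetical namespace mirroring the attributes dataset_client reads
args = argparse.Namespace(
    instance='global',                   # fills .../dbs/prod/{0}/DBSReader
    datasets=['/SingleMuon/*/MINIAOD'],  # placeholder dataset pattern
    sortOrder='name',                    # 'name' or 'time', per sortType above
    primaryDatasets=None,
    acquisitionEras=None,
    processNames=None,
    dataTiers=None,
)
dataset_client(args)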
Example #9
def getDatasetStatus(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    # retrieve dataset summary
    reply = dbsapi.listDatasets(dataset=dataset,
                                dataset_access_type='*',
                                detail=True)
    if len(reply):
        return reply[0]['dataset_access_type']
    else:
        return None
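Unlike the earlier variants that index reply[0] directly, this getDatasetStatus returns None for an unknown dataset; a one-line usage sketch (the path is a placeholder):

status = getDatasetStatus('/Primary/Processed-v1/AOD')  # placeholder path
print('unknown dataset' if status is None else status)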
Example #10
def main():
    #  args=sys.argv[1:]
    #  data=args[0]
    sample_list = []
    sample_list = [o for o in gc.get_objects() if isinstance(o, sample)]

    url = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
    api = DbsApi(url=url)

    for samp in sample_list:
        outputDataSets = ''
        #print('Checking {1}'.format(samp.DAS))
        outputDataSets = api.listDatasets(dataset=samp.DAS,
                                          dataset_access_type='VALID')
        if not outputDataSets:
            print('{0} does not correspond to any VALID DAS sample.'.format(
                samp.DAS))
            prodOutputDataset = api.listDatasets(
                dataset=samp.DAS, dataset_access_type='PRODUCTION')
            if (prodOutputDataset):
                print('Dataset {0} is in PRODUCTION state.'.format(
                    prodOutputDataset))
                continue
            print('Possible alternatives: ')
            altsampDAS = samp.DAS
            altsampDAS = re.sub(r'v[0-9]*', 'v*', samp.DAS)
            altsampDAS = re.sub(r'13TeV[a-zA-Z0-9_-]*/', '*/', altsampDAS)
            altsampDAS = re.sub(r'4F_TuneCP5_', '*', altsampDAS)
            altsampDAS = re.sub(r'_PSweights_', '*', altsampDAS)
            print(altsampDAS)
            matchedAltSamples = api.listDatasets(dataset=altsampDAS,
                                                 dataset_access_type='*')
            print(matchedAltSamples)


#    for dataset in outputDataSets:
#        inp=dataset['dataset']
#        print inp
#        reply= api.listBlockSummaries(dataset=inp)
#        print reply[0]['num_event']
    sys.exit(0)
def getEventsDetails(acquisitionEra, dataTierName, searchStr):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listDatasets(data_tier_name=dataTierName, dataset_access_type='VALID')

    sumValid = 0
    sumValidProd = 0

    for dataset in reply:
        # searchStr == 'NA' means "no extra substring filter"
        if acquisitionEra in dataset['dataset'] and (searchStr == 'NA' or searchStr in dataset['dataset']):
            events = getEventCountDataSet(dataset['dataset'])
            sumValid = sumValid + events
            sumValidProd = sumValidProd + events

    reply = dbsapi.listDatasets(data_tier_name=dataTierName, dataset_access_type='PRODUCTION')

    for dataset in reply:
        if acquisitionEra in dataset['dataset'] and (searchStr == 'NA' or searchStr in dataset['dataset']):
            events = getEventCountDataSet(dataset['dataset'])
            sumValidProd = sumValidProd + events

    return [sumValid, sumValidProd]
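getEventsDetails calls a getEventCountDataSet helper that is not part of this snippet; a plausible sketch of it, summing num_event over block summaries the way the dbs3_get_data examples do (it assumes the same dbs3_url):

from dbs.apis.dbsClient import DbsApi

def getEventCountDataSet(dataset):
    # sum the per-block event counts reported by DBS for the dataset
    dbsapi = DbsApi(url=dbs3_url)
    blocks = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
    return sum(int(block['num_event']) for block in blocks)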
def dataset_client(args):

    if not dbsLoaded:
        logging.error('You must source a crab environment to use DBS API.\nsource /cvmfs/cms.cern.ch/crab3/crab.sh')
        return

    url = 'https://cmsweb.cern.ch/dbs/prod/{0}/DBSReader'.format(args.instance)
    dbsclient = DbsApi(url)

    sortType = {
        'name': 'dataset',
        'time': 'last_modification_date',
    }

    if args.datasets:
        datasets = []
        for dataset in args.datasets:
            datasets += dbsclient.listDatasets(dataset=dataset,detail=True)
        for d in sorted(datasets, key=lambda k: k[sortType[args.sortOrder]]):
            print(d['dataset'])

    if args.primaryDatasets or args.acquisitionEras or args.processNames or args.dataTiers:
        pds = args.primaryDatasets if args.primaryDatasets else ['']
        aes = args.acquisitionEras if args.acquisitionEras else ['']
        dts = args.dataTiers if args.dataTiers else ['']
        datasets = []
        for pd, ae, dt in itertools.product(pds, aes, dts):
            kwargs = {}
            if pd: kwargs['primary_ds_name'] = pd
            if ae: kwargs['acquisition_era_name'] = ae
            if dt: kwargs['data_tier_name'] = dt
            kwargs['detail'] = True
            datasets += dbsclient.listDatasets(**kwargs)
        for d in sorted(datasets, key=lambda k: k[sortType[args.sortOrder]]):
            if args.processNames:
                if not any([fnmatch.fnmatch(d['processed_ds_name'],pn) for pn in args.processNames]): continue
            print(d['dataset'])
Example #13
def dataset_event_count_2(dataset):
    # cmd(". /cvmfs/cms.cern.ch/crab3/crab-env-bootstrap.sh >& /dev/null")
    from dbs.apis.dbsClient import DbsApi
    url="https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
    api=DbsApi(url=url)
    output = api.listDatasets(dataset=dataset)

    if(len(output)==1):
        inp = output[0]['dataset']
        info = api.listFileSummaries(dataset=inp)[0]
        filesize = info['file_size']
        nevents = info['num_event']
        nlumis = info['num_lumi']
        files = api.listFiles(dataset=dataset, detail=1, validFileOnly=1)
        return {"nevents": nevents, "filesize": filesize, "nfiles": len(files), "nlumis": nlumis}

    return {}
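A short usage sketch for dataset_event_count_2 above; the path is a placeholder, and the empty dict signals that DBS found no matching dataset:

info = dataset_event_count_2('/Primary/Processed-v1/MINIAODSIM')  # placeholder path
if info:
    print('%(nevents)s events, %(nfiles)s files, %(nlumis)s lumis, %(filesize)s bytes' % info)
else:
    print('dataset not found in DBS')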
Example #14
def dataset_event_count_2(dataset):
    # alternative to dataset_event_count() (this uses dbs client as opposed to rolling our own)
    # cmd(". /cvmfs/cms.cern.ch/crab3/crab-env-bootstrap.sh >& /dev/null")
    from dbs.apis.dbsClient import DbsApi
    url = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
    api = DbsApi(url=url)
    output = api.listDatasets(dataset=dataset)

    if (len(output) == 1):
        inp = output[0]['dataset']
        info = api.listFileSummaries(dataset=inp)[0]
        filesize = info['file_size']
        nevents = info['num_event']
        nlumis = info['num_lumi']
        files = api.listFiles(dataset=dataset, detail=1, validFileOnly=1)
        return {
            "nevents": nevents,
            "filesize": filesize,
            "nfiles": len(files),
            "nlumis": nlumis
        }

    return {}
Example #15
def dbs3_get_data(dataset, timestamps=1):

    #q = "/afs/cern.ch/user/s/spinoso/public/dbs3wrapper.sh /afs/cern.ch/user/c/cmst2/mc/scripts/datasetinfo.py --dataset %s --json" % dataset
    #output=os.popen(q).read()
    #s = json.loads(output)
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    try:
        reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*', detail=True)
        #print reply
        if len(reply):
            status = reply[0]['dataset_access_type']
            reply = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
            cnt = 0
            for block in reply:
                cnt += int(block['num_event'])
            return [cnt, status, int(cnt / 100.)]
        else:
            print dataset, "does not exist"
            return [0, '', 0]

    except Exception:
        print "DBS3 query failed for", dataset
        return [0, '', 0]
    parser = OptionParser(usage='%prog --query=</specify/dataset/query>')
    parser.add_option("-q", "--query", dest="query", help="Query", metavar="/specify/dataset/query")
    parser.add_option("-d", "--dbs" , dest="dbs" , help="DBS Version" , type="int" , default=3)
    (options, args) = parser.parse_args()

    if not (options.query):
        parser.print_help()
        parser.error('Mandatory option is --query')

    url = "https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter"
    if options.dbs == 2:
        url = "https://cmsweb.cern.ch/dbs/prod/phys02/DBSReader"
    
    api = DbsApi(url)

    datasets = api.listDatasets(dataset=options.query)

    for dataset in datasets:
        files_ = api.listFiles(dataset=dataset['dataset'])
        files = []
        for fff in files_:
            files.append( fff['logical_file_name'] )
        sss = dataset['dataset'].split("/")

        oldfiles = []
        outfile = None
        totallen = len(files)

        outfilename = sss[1]

        while os.path.isfile(outfilename):
Example #17
if instance=='int':
    host = 'cmsweb-testbed.cern.ch'
if instance=='prod':
    host = 'cmsweb.cern.ch'

#globUrl = 'https://%s/dbs/%s/global/DBSReader' % (host, instance)
globUrl = 'https://%s/dbs/%s/phys02/DBSReader' % (host, instance)

phy3Url = 'https://%s/dbs/%s/phys03/DBSReader' % (host, instance)
migUrl  = 'https://%s/dbs/%s/phys03/DBSMigrate' % (host, instance)

apiG   = DbsApi(url=globUrl)
apiP3  = DbsApi(url=phy3Url)
apiMig = DbsApi(url=migUrl)

dsg = apiG.listDatasets(dataset=dataset)
print "in Global: ", dsg

ds3 = apiP3.listDatasets(dataset=dataset)
print "in Phys03 : ", ds3

data = {'migration_url': globUrl, 'migration_input': dataset}
result = apiMig.submitMigration(data)

print result

id = result.get("migration_details", {}).get("migration_request_id")
report = result.get("migration_report")

print "migration id, report: ", id, report
print '\n DATASET: ' + dataset
f = dataset.split("/")
if len(f) > 3:
    tier = f[3]
    if 'MINIAOD' in tier:
        print ' MINIAOD* identified, consider extra T2_CH_CERN copy.'
        isMiniAod = True

# size of provided dataset
#-------------------------

# instantiate an API
dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')

# first test whether dataset is valid
dbsList = dbsapi.listDatasets(dataset = dataset, dataset_access_type = 'VALID')
datasetInvalid = False
if dbsList == []:
    datasetInvalid = True
    print ' Dataset does not exist or is invalid. Exit now!\n'
    sys.exit(1)

# determine size and number of files
size = str(sum([block['file_size'] for block in dbsapi.listBlockSummaries(dataset = dataset)]))+'UB'
sizeGb = convertSizeToGb(size)

# in case this is an open subscription we need to adjust sizeGb to the expected size
if expectedSizeGb > 0:
    sizeGb = expectedSizeGb

print ' SIZE:    %.1f GB'%(sizeGb)
Example #19
def getDatasets(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    # retrieve dataset summary
    reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*')
    return reply
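A short usage sketch for getDatasets above: without the detail flag each entry in the reply carries the dataset name, so the result can be iterated directly (the wildcard pattern is a placeholder):

for entry in getDatasets('/ZeroBias/*/RAW'):  # placeholder wildcard pattern
    print(entry['dataset'])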
Example #20
#fname="delete_this_1000.txt" #relval files
fname = "delete_this_999.txt"  #other files

inputfile = open(fname, 'r')

dbsApi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')

dsets_fraction_lost = {}

access_type = "VALID"

for line in inputfile:
    lfn = line.rstrip('\n')
    #print lfn

    if dbsApi.listDatasets(logical_file_name=lfn,
                           dataset_access_type=access_type) == []:
        print lfn
        continue

    #print dbsApi.listDatasets(logical_file_name = "/store/mc/RunIIWinter15GS/WToENu_M-500_TuneCUETP8M1_13TeV-pythia8/GEN-SIM/MCRUN2_71_V1-v3/00000/4AF070C6-DFE5-E411-A0D6-0025905B8572.root")

    dset_name = dbsApi.listDatasets(
        logical_file_name=lfn, dataset_access_type=access_type)[0]['dataset']

    status = dbsApi.listDatasets(
        dataset=dset_name, detail=1,
        dataset_access_type=access_type)[0]['dataset_access_type']

    if status != "VALID" and status != "PRODUCTION":
        print "found dataset that does not have VALID or PRODUCTION status, exiting"
        sys.exit(1)
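The loop above issues up to three listDatasets calls per LFN; a sketch of the same logic that reuses the first reply, under the same assumptions (inputfile, dbsApi, access_type and the sys import from the snippet above):

# variant of the loop above: one LFN lookup per line, reply reused
for line in inputfile:
    lfn = line.rstrip('\n')

    matches = dbsApi.listDatasets(logical_file_name=lfn,
                                  dataset_access_type=access_type)
    if not matches:
        print(lfn)
        continue

    dset_name = matches[0]['dataset']
    status = dbsApi.listDatasets(dataset=dset_name, detail=1,
                                 dataset_access_type=access_type)[0]['dataset_access_type']

    if status not in ("VALID", "PRODUCTION"):
        print("found dataset that does not have VALID or PRODUCTION status, exiting")
        sys.exit(1)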
Example #21
    def testA_basicFunction(self):
        """
        _basicFunction_

        See if I can make the damn thing work.
        """
        return
        myThread = threading.currentThread()

        config = self.getConfig()

        dbsUploader = DBSUploadPoller(config = config)
        dbsUtil     = DBSBufferUtil()
        dbsApi      = DbsApi(url = config.DBSUpload.dbsUrl)

        # This should do nothing
        # Just making sure we don't crash
        try:
            dbsUploader.algorithm()
        except:
            dbsUploader.close()
            raise


        name = "ThisIsATest_%s" % (makeUUID())
        tier = "RECO"
        nFiles = 12
        files = self.getFiles(name = name, tier = tier, nFiles = nFiles)
        datasetPath = '/%s/%s/%s' % (name, name, tier)
        shortPath   = '/%s/%s' % (name, name)


        try:
            dbsUploader.algorithm()
        except:
            dbsUploader.close()
            raise

        time.sleep(3)

        # Now look in DBS
        try:
            result = dbsApi.listPrimaryDatasets(dataset = name)
            self.assertEqual(len(result), 1)
            self.assertEqual(result[0]['primary_ds_name'], name)
            result = dbsApi.listDatasets(dataset = datasetPath, detail = True,
                                         dataset_access_type = 'PROCESSING')
            self.assertEqual(len(result), 1)
            self.assertEqual(result[0]['data_tier_name'], u'RECO')
            self.assertEqual(result[0]['processing_version'], u'V0')
            self.assertEqual(result[0]['acquisition_era_name'], u"DBS3TEST")
            result = dbsApi.listFiles(dataset=datasetPath)
            self.assertEqual(len(result), 11)
        except:
            dbsUploader.close()
            raise

        # All the blocks except for the last one should
        # now be there
        result = myThread.dbi.processData("SELECT id FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 12)

        # The last block should still be open
        self.assertEqual(len(dbsUtil.findOpenBlocks()), 1)

        try:
            dbsUploader.algorithm()
        except:
            raise
        finally:
            dbsUploader.close()

        # All files should now be available
        result = dbsApi.listFiles(dataset=datasetPath)
        self.assertEqual(len(result), 12)

        
        # The last block should now be closed
        self.assertEqual(len(dbsUtil.findOpenBlocks()), 0)

        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        for res in result:
            self.assertEqual(res.values()[0], 'InDBS')

        return
def main():
    url = 'cmsweb.cern.ch'
    # Example: python assignWorkflow.py -w amaltaro_RVZTT_120404_163607_6269
    # -t testbed-relval -s T1_US_FNAL -e CMSSW_6_0_0_pre1_FS_TEST_WMA -p v1 -a
    # relval -l /store/backfill/1
    parser = optparse.OptionParser()
    parser.add_option(
        '-w', '--workflow', help='Workflow Name', dest='workflow')
    parser.add_option('-t', '--team', help='Type of Requests', dest='team')
    parser.add_option('-s', '--site', help='Site', dest='site')
    parser.add_option('-p', '--procversion',
                      help='Processing Version', dest='procversion')
    parser.add_option('-a', '--activity',
                      help='Dashboard Activity', dest='activity')
    parser.add_option('-l', '--lfn', help='Merged LFN base', dest='lfn')
    parser.add_option('--special',
                      help='Use it for special workflows. You also have to change the code according to the type of WF', dest='special')
    parser.add_option('-r', '--replica', action='store_true', dest='replica', default=False,
                      help='Adds a _Disk Non-Custodial Replica parameter')
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False,
                      help="Prints all query information.")
    parser.add_option('-x', '--xrootd', help='Assign with trustSiteLocation=True (allows xrootd capabilities)',
                      action='store_true', default=False, dest='xrootd')
    parser.add_option("--acqera", dest="acqera",
                      help="Overrides Acquisition Era with a single string")
    parser.add_option("--procstr", dest="procstring",
                      help="Overrides Processing String with a single string")

    parser.add_option('--test', action="store_true",
                      help='Nothing is injected, only print infomation about workflow and AcqEra', dest='test')
    parser.add_option('--pu', action="store_true",
                      help='Use it to inject PileUp workflows only', dest='pu')
    # --file is read below as options.file, so it must be declared here
    parser.add_option('-f', '--file', dest='file',
                      help='Text file with a list of workflow names, one per line')
    (options, args) = parser.parse_args()

    if not options.workflow:
        if args:
            workflows = args
        elif options.file:
            workflows = [l.strip() for l in open(options.file) if l.strip()]
        else:
            parser.error("Input a workflow name or a file to read them")
            sys.exit(0)
    else:
        workflows = [options.workflow]
        
    team = 'production'
    site = GOOD_SITES
    procversion = 1
    activity = 'production'
    lfn = '/store/mc'
    acqera = {}
    procstring = {}
    specialStr = ''
    replica = False
    
    for workflow in workflows:
        # Getting the original dictionary
        schema = getRequestDict(url, workflow)
    
        # Setting the AcqEra and ProcStr values per Task
        for key, value in schema.items():
            if type(value) is dict and key.startswith("Task"):
                try:
                    procstring[value['TaskName']] = value[
                        'ProcessingString'].replace("-", "_")
                    acqera[value['TaskName']] = value['AcquisitionEra']
                except KeyError:
                    print "This request has no AcquisitionEra or ProcessingString defined into the Tasks, aborting..."
                    sys.exit(1)
    
        # Adding the special string - in case it was provided in the command line
        if options.special:
            #specialStr = '_03Jan2013'
            specialStr = '_' + str(options.special)
            for key, value in procstring.items():
                procstring[key] = value + specialStr
    
        # Handling the parameters given in the command line
        if options.team:
            team = options.team
        if options.site:
            site = options.site
            if site == "all":
                site = ALL_SITES
            elif site == "t1":
                site = T1S
            #parse sites separated by commas
            elif "," in site:
                site = site.split(",")  
        if options.procversion:
            procversion = int(options.procversion)
        if options.activity:
            activity = options.activity
        if options.lfn:
            lfn = options.lfn
        if options.replica:
            replica = True
        # Override if new values were given on the command line
        if options.acqera:
            acqera = options.acqera
        if options.procstring:
            procstring = options.procstring
    
        # check output dataset existence, and abort if they already do!
        datasets = schema["OutputDatasets"]
        i = 0
        if 'ACDC' not in options.workflow:
            exist = False
            maxv = 1
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    dbsapi = DbsApi(url=dbs3_url)
    
                    # list all datasets with same name but different version
                    # numbers
                    datasets = dbsapi.listDatasets(acquisition_era_name=value['AcquisitionEra'],
                                                   primary_ds_name=value['PrimaryDataset'], detail=True, dataset_access_type='*')
                    processedName = value['AcquisitionEra'] + \
                        '-' + value['ProcessingString'] + "-v\\d+"
                    # see if any of the dataset names is a match
                    for ds in datasets:
                        if re.match(processedName, ds['processed_ds_name']):
                            print "Existing dset:", ds['dataset'], "(%s)" % ds['dataset_access_type']
                            maxv = max(maxv, ds['processing_version'])
                            exist = True
                        else:
                            pass
                    i += 1
            # suggest max version
            if exist and procversion <= maxv:
                print "Some output datasets exist, its advised to assign with v ==", maxv + 1
                sys.exit(0)
    
        # If the --test argument was provided, then just print the information
        # gathered so far and abort the assignment
        if options.test:
            print "%s \tAcqEra: %s \tProcStr: %s \tProcVer: %s" % (workflow, acqera, procstring, procversion)
            # print workflow, '\tAcqEra:', acqera, '\tProcStr:', procstring,
            # '\tProcVer:', procversion
            print "LFN: %s \tTeam: %s \tSite: %s" % (lfn, team, site)
            # print '\tTeam:',team,  '\tSite:', site
            sys.exit(0)
    
        # Really assigning the workflow now
        # TODO use values when assigning merge jobs
        print workflow, '\tAcqEra:', acqera, '\tProcStr:', procstring, '\tProcVer:', procversion, '\tTeam:', team, '\tSite:', site
        assignRequest(url, workflow, team, site, acqera,
                      procstring, procversion, activity, lfn, replica, options.verbose, options.xrootd)
    sys.exit(0)
Example #23
    def testA_basicFunction(self):
        """
        _basicFunction_

        See if I can make the damn thing work.
        """
        myThread = threading.currentThread()

        config = self.getConfig()

        from WMComponent.DBS3Buffer.DBSUploadPoller import DBSUploadPoller
        dbsUploader = DBSUploadPoller(config = config)
        dbsUtil     = DBSBufferUtil()
        from dbs.apis.dbsClient import DbsApi
        dbsApi      = DbsApi(url = config.DBSUpload.dbsUrl)

        # This should do nothing
        # Just making sure we don't crash
        try:
            dbsUploader.algorithm()
        except:
            dbsUploader.close()
            raise

        name = "ThisIsATest%s" % (int(time.time()))
        tier = "RECO"
        nFiles = 12
        name = name.replace('-', '_')
        name = '%s-v0' % name
        files = self.getFiles(name = name, tier = tier, nFiles = nFiles)
        datasetPath = "/Cosmics/%s/%s" % (name, tier)

        try:
            dbsUploader.algorithm()
        except:
            dbsUploader.close()
            raise

        time.sleep(5)

        # Now look in DBS
        try:
            result = dbsApi.listDatasets(dataset = datasetPath, detail = True,
                                         dataset_access_type = 'PRODUCTION')
            self.assertEqual(len(result), 1)
            self.assertEqual(result[0]['data_tier_name'], 'RECO')
            self.assertEqual(result[0]['processing_version'], 0)
            self.assertEqual(result[0]['acquisition_era_name'], name.split('-')[0])

            result = dbsApi.listFiles(dataset=datasetPath)
            self.assertEqual(len(result), 11)
        except:
            dbsUploader.close()
            raise

        # All the blocks except for the last one should
        # now be there
        result = myThread.dbi.processData("SELECT id FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 12)

        # The last block should still be open
        self.assertEqual(len(dbsUtil.findOpenBlocks()), 1)

        try:
            dbsUploader.algorithm()
        except:
            raise
        finally:
            dbsUploader.close()

        # All files should now be available
        result = dbsApi.listFiles(dataset=datasetPath)
        self.assertEqual(len(result), 12)


        # The last block should now be closed
        self.assertEqual(len(dbsUtil.findOpenBlocks()), 0)

        result = myThread.dbi.processData("SELECT status FROM dbsbuffer_block")[0].fetchall()
        for res in result:
            self.assertEqual(res.values()[0], 'InDBS')

        return
Example #24
class DBS3Reader(object):
    """
    _DBSReader_

    General API for reading data from DBS
    """
    # cache all the datatiers known by DBS
    _datatiers = {}

    def __init__(self, url, **contact):

        # instantiate dbs api object
        try:
            self.dbsURL = url
            self.dbs = DbsApi(url, **contact)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # connection to PhEDEx (Use default endpoint url)
        self.phedex = PhEDEx(responseType="json")

    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        currently only takes one lfn, but the dbs api needs to be updated
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName,
                                                   validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(
                        self.dbs.listFileLumiArray(logical_file_name=slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            if lumisItem.get('event_count', None) is not None:
                item['EventCount'] = lumisItem['event_count']
            lumiDict[lumisItem['logical_file_name']].append(item)
            # TODO: add key for lumi and event pair.
        return lumiDict

    def checkDBSServer(self):
        """
        check whether dbs server is up and running
        returns {"dbs_instance": "prod/global", "dbs_version": "3.3.144"}
        """
        try:
            return self.dbs.serverinfo()
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBS server is not up: %s" % self.dbsURL
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listPrimaryDatasets(self, match='*'):
        """
        _listPrimaryDatasets_

        return a list of primary datasets. The full dataset name must be provided;
        pattern-based matching is no longer supported.
        If no expression is provided, all datasets are returned
        """
        try:
            result = self.dbs.listPrimaryDatasets(primary_ds_name=match)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % match
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['primary_ds_name'] for x in result]
        return result

    def matchProcessedDatasets(self, primary, tier, process):
        """
        _matchProcessedDatasets_

        return a list of Processed datasets
        """
        result = []
        try:
            datasets = self.dbs.listDatasets(primary_ds_name=primary,
                                             data_tier_name=tier,
                                             detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for dataset in datasets:
            dataset = remapDBS3Keys(dataset, processed_ds_name='Name')
            dataset['PathList'] = [dataset['dataset']]
            if dataset['Name'] == process:
                result.append(dataset)
        return result

    def listRuns(self, dataset=None, block=None):
        """
        It gets a list of DbsRun objects, but for our purpose
        only the list of run numbers is collected.
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        runs = []
        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        [runs.extend(x['run_num']) for x in results]
        return runs

    def listRunLumis(self, dataset=None, block=None):
        """
        It gets a list of DBSRun objects and returns the number of lumisections per run
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        # Pointless code in python3
        if isinstance(block, str):
            block = unicode(block)
        if isinstance(dataset, str):
            dataset = unicode(dataset)

        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # send runDict format as result, this format is for sync with dbs2 call
        # which has {run_number: num_lumis} but dbs3 call doesn't return num Lumis
        # So it returns {run_number: None}
        # TODO: After DBS2 is completely removed change the return format more sensible one

        runDict = {}
        for x in results:
            for runNumber in x["run_num"]:
                runDict[runNumber] = None
        return runDict

    def listProcessedDatasets(self, primary, dataTier='*'):
        """
        _listProcessedDatasets_

        return a list of Processed datasets for the primary and optional
        data tier value

        """
        try:
            result = self.dbs.listDatasets(primary_ds_name=primary,
                                           data_tier_name=dataTier)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['dataset'].split('/')[2] for x in result]
        return result

    def listDatasetFiles(self, datasetPath):
        """
        _listDatasetFiles_

        Get list of files for dataset

        """
        return [
            x['logical_file_name']
            for x in self.dbs.listFileArray(dataset=datasetPath)
        ]

    @staticmethod
    def listDatatiers(dbsUrl=None):
        """
        _listDatatiers_

        Get a list of datatiers known by DBS.
        """
        if dbsUrl is None:
            msg = "Error in DBSReader.listDatatiers(). DBS Url not set."
            raise DBSReaderError(msg)

        timenow = int(time.time())
        if DBS3Reader._datatiers and timenow - 7200 < DBS3Reader._datatiers[
                'ts']:
            return DBS3Reader._datatiers['tiers']

        try:
            DBS3Reader._setDatatiersCache(timenow, dbsUrl)
        except Exception as ex:
            if not DBS3Reader._datatiers:
                msg = "Error in DBSReader.listDatatiers\n%s" % formatEx3(ex)
                raise DBSReaderError(msg)
        return DBS3Reader._datatiers['tiers']

    @staticmethod
    def _setDatatiersCache(ts, dbsUrl):
        """
        Set a timestamp and update the list of datatiers cached in
        the class property
        """
        dbs = DbsApi(dbsUrl)
        DBS3Reader._datatiers['ts'] = ts
        DBS3Reader._datatiers['tiers'] = [
            tier['data_tier_name'] for tier in dbs.listDataTiers()
        ]

        return

    def listDatasetFileDetails(self,
                               datasetPath,
                               getParents=False,
                               validFileOnly=1):
        """
        TODO: This is completely wrong and needs to be redone, or be removed - getting the whole
        dataset at once might be too costly

        _listDatasetFileDetails_

        Get list of lumis, events, and parents for each file in a dataset
        Return a dict where the keys are the files, and for each file we have something like:
            { 'NumberOfEvents': 545,
              'BlockName': '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace',
              'Lumis': {173658: [8, 12, 9, 14, 19, 109, 105]},
              'Parents': [],
              'Checksum': '22218315',
              'Adler32': 'a41a1446',
              'FileSize': 286021145,
              'ValidFile': 1
            }

        """
        fileDetails = self.getFileListByDataset(dataset=datasetPath,
                                                validFileOnly=validFileOnly,
                                                detail=True)
        blocks = set()  # the set of blocks of the dataset
        # Iterate over the files and prepare the set of blocks and a dict where the keys are the files
        files = {}
        for f in fileDetails:
            blocks.add(f['block_name'])
            files[f['logical_file_name']] = remapDBS3Keys(f, stringify=True)
            files[f['logical_file_name']]['ValidFile'] = f['is_file_valid']
            files[f['logical_file_name']]['Lumis'] = {}
            files[f['logical_file_name']]['Parents'] = []

        # Iterate over the blocks and get parents and lumis
        for blockName in blocks:
            # get the parents
            if getParents:
                parents = self.dbs.listFileParents(block_name=blockName)
                for p in parents:
                    if p['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        files[p['logical_file_name']]['Parents'].extend(
                            p['parent_logical_file_name'])
            # get the lumis
            file_lumis = self.dbs.listFileLumis(block_name=blockName)
            for f in file_lumis:
                if f['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                    if f['run_num'] in files[f['logical_file_name']]['Lumis']:
                        files[f['logical_file_name']]['Lumis'][
                            f['run_num']].extend(f['lumi_section_num'])
                    else:
                        files[f['logical_file_name']]['Lumis'][
                            f['run_num']] = f['lumi_section_num']

        return files

    def crossCheck(self, datasetPath, *lfns):
        """
        _crossCheck_

        For the dataset provided, check that the lfns listed all exist
        in the dataset.

        Return the list of lfns that are in the dataset

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath,
                                         validFileOnly=1,
                                         detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        return list(setOfAllLfns.intersection(setOfKnownLfns))

    def crossCheckMissing(self, datasetPath, *lfns):
        """
        _crossCheckMissing_

        As cross check, but return value is a list of files that
        are *not* known by DBS

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath,
                                         validFileOnly=1,
                                         detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        knownFiles = setOfAllLfns.intersection(setOfKnownLfns)
        unknownFiles = setOfKnownLfns.difference(knownFiles)
        return list(unknownFiles)

    def getDBSSummaryInfo(self, dataset=None, block=None):
        """
        Get dataset summary includes # of files, events, blocks and total size
        """
        # FIXME: Doesn't raise exceptions on missing data as the old api did
        if dataset:
            self.checkDatasetPath(dataset)
        try:
            if block:
                summary = self.dbs.listFileSummaries(block_name=block,
                                                     validFileOnly=1)
            else:  # dataset case dataset shouldn't be None
                summary = self.dbs.listFileSummaries(dataset=dataset,
                                                     validFileOnly=1)
        except Exception as ex:
            msg = "Error in DBSReader.getDBSSummaryInfo(%s, %s)\n" % (dataset,
                                                                      block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        if not summary or summary[0].get(
                'file_size') is None:  # appears to indicate missing dataset
            msg = "DBSReader.listDatasetSummary(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, block))
        result = remapDBS3Keys(summary[0], stringify=True)
        result['path'] = dataset if dataset else ''
        result['block'] = block if block else ''
        return result

    def getFileBlocksInfo(self,
                          dataset,
                          onlyClosedBlocks=False,
                          blockName=None,
                          locations=True):
        """
        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': True}
        if blockName:
            args['block_name'] = blockName
        try:
            blocks = self.dbs.listBlocks(**args)
        except Exception as ex:
            msg = "Error in DBSReader.getFileBlocksInfo(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        blocks = [
            remapDBS3Keys(block, stringify=True, block_name='Name')
            for block in blocks
        ]
        # only raise if blockName not specified - mimic dbs2 error handling
        if not blocks and not blockName:
            msg = "DBSReader.getFileBlocksInfo(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, blockName))
        if locations:
            for block in blocks:
                block['PhEDExNodeList'] = [{
                    'Name': x
                } for x in self.listFileBlockLocation(block['Name'])]

        if onlyClosedBlocks:
            return [x for x in blocks if str(x['OpenForWriting']) != "1"]

        return blocks

    def listFileBlocks(self, dataset, onlyClosedBlocks=False, blockName=None):
        """
        _listFileBlocks_

        Retrieve a list of fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': False}
        if blockName:
            args['block_name'] = blockName
        if onlyClosedBlocks:
            args['detail'] = True
        try:
            blocks = self.dbs.listBlocks(**args)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if onlyClosedBlocks:
            result = [
                x['block_name'] for x in blocks
                if str(x['open_for_writing']) != "1"
            ]

        else:
            result = [x['block_name'] for x in blocks]

        return result

    def listOpenFileBlocks(self, dataset):
        """
        _listOpenFileBlocks_

        Retrieve a list of open fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        try:
            blocks = self.dbs.listBlocks(dataset=dataset, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [
            x['block_name'] for x in blocks
            if str(x['open_for_writing']) == "1"
        ]

        return result

    def blockExists(self, fileBlockName):
        """
        _blockExists_

        Check to see if block with name provided exists in the DBS
        Instance.

        Return True if exists, False if not

        """
        self.checkBlockName(fileBlockName)
        try:

            blocks = self.dbs.listBlocks(block_name=fileBlockName)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockExists(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if len(blocks) == 0:
            return False
        return True

    def listFilesInBlock(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlock_

        Get a list of files in the named fileblock
        TODO: lumis can be false when lumi splitting is not required
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it will always be true.
        We need to clean the code up when dbs2 is completely deprecated;
        calling lumis for the run number is expensive.
        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            files = self.dbs.listFileArray(block_name=fileBlockName,
                                           validFileOnly=validFileOnly,
                                           detail=True)
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if lumis:
            lumiDict = self._getLumiList(blockName=fileBlockName,
                                         validFileOnly=validFileOnly)

        result = []
        for fileInfo in files:
            if lumis:
                fileInfo["LumiList"] = lumiDict[fileInfo['logical_file_name']]
            result.append(remapDBS3Keys(fileInfo, stringify=True))
        return result

    def listFilesInBlockWithParents(self,
                                    fileBlockName,
                                    lumis=True,
                                    validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of that file.
        TODO: lumis can be false when lumi splitting is not required
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it will always be true.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only valid blocks for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis,
                                                validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName, )
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])

        parentsLFNs = childByParents.keys()

        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slicing parentsLFNs until the DBS api can handle large lists.
        # Remove the slicing once the DBS api handles it
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(
                self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[
                fileInfo['logical_file_name']]

        return fileDetails

    def lfnsInBlock(self, fileBlockName):
        """
        _lfnsInBlock_

        LFN list only for block, details = False => faster query

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.lfnsInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            lfns = self.dbs.listFileArray(block_name=fileBlockName,
                                          validFileOnly=1,
                                          detail=False)
            return lfns
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listFileBlockLocation(self, fileBlockNames, dbsOnly=False):
        """
        _listFileBlockLocation_

        Get origin_site_name of a block

        """

        singleBlockName = None
        if isinstance(fileBlockNames, basestring):
            singleBlockName = fileBlockNames
            fileBlockNames = [fileBlockNames]

        for block in fileBlockNames:
            self.checkBlockName(block)

        locations = {}
        node_filter = set(['UNKNOWN', None])

        if dbsOnly:
            blocksInfo = {}
            try:
                for block in fileBlockNames:
                    blocksInfo.setdefault(block, [])
                    # there should be only one element with a single origin site string ...
                    for blockInfo in self.dbs.listBlockOrigin(
                            block_name=block):
                        blocksInfo[block].append(blockInfo['origin_site_name'])
            except dbsClientException as ex:
                msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(
                    block=fileBlockNames, complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % str(ex)
                raise Exception(msg)

        for block in fileBlockNames:
            valid_nodes = set(blocksInfo.get(block, [])) - node_filter
            locations[block] = list(valid_nodes)

        # returning single list if a single block is passed
        if singleBlockName:
            return locations[singleBlockName]

        return locations

    def getFileBlock(self, fileBlockName, dbsOnly=False):
        """
        _getFileBlock_

        dbsOnly flag is mostly meant for StoreResults, since there is no
        data in TMDB.

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : { LFN : Events },
             }
        }


        """
        # Pointless code in python3
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {
            fileBlockName: {
                "PhEDExNodeNames":
                self.listFileBlockLocation(fileBlockName, dbsOnly),
                "Files":
                self.listFilesInBlock(fileBlockName),
                "IsOpen":
                self.blockIsOpen(fileBlockName)
            }
        }
        return result

    def getFileBlockWithParents(self, fileBlockName):
        """
        _getFileBlockWithParents_

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : dictionaries representing each file
             }
        }

        files

        """
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)

        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {
            fileBlockName: {
                "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName),
                "Files": self.listFilesInBlockWithParents(fileBlockName),
                "IsOpen": self.blockIsOpen(fileBlockName)
            }
        }
        return result

    def getFiles(self, dataset, onlyClosedBlocks=False):
        """
        _getFiles_

        Returns a dictionary of block names for the dataset where
        each block consists of a dictionary containing the PhEDExNodeNames
        for that block and the files in that block by LFN mapped to NEvents

        """
        result = {}
        blocks = self.listFileBlocks(dataset, onlyClosedBlocks)

        [result.update(self.getFileBlock(x)) for x in blocks]

        return result

    def listBlockParents(self, blockName):
        """Get parent blocks for block"""
        result = []
        self.checkBlockName(blockName)
        blocks = self.dbs.listBlockParents(block_name=blockName)
        for block in blocks:
            toreturn = {'Name': block['parent_block_name']}
            toreturn['PhEDExNodeList'] = self.listFileBlockLocation(
                toreturn['Name'])
            result.append(toreturn)
        return result

    def blockIsOpen(self, blockName):
        """
        _blockIsOpen_

        Return True if the named block is open, False if not, or if the block
        doesn't exist

        """
        self.checkBlockName(blockName)
        blockInstance = self.dbs.listBlocks(block_name=blockName, detail=True)
        if len(blockInstance) == 0:
            return False
        blockInstance = blockInstance[0]
        isOpen = blockInstance.get('open_for_writing', 1)
        if isOpen == 0:
            return False
        return True

    def blockToDatasetPath(self, blockName):
        """
        _blockToDatasetPath_

        Given a block name, get the dataset Path associated with that
        Block.

        Returns the dataset path, or None if not found

        """
        self.checkBlockName(blockName)
        try:
            blocks = self.dbs.listBlocks(block_name=blockName, detail=True)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockToDatasetPath(%s)\n" % blockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if blocks == []:
            return None

        pathname = blocks[-1].get('dataset', None)
        return pathname

    def listDatasetLocation(self, datasetName, dbsOnly=False):
        """
        _listDatasetLocation_

        List the origin SEs where there is at least a block of the given
        dataset.
        """
        self.checkDatasetPath(datasetName)

        locations = set()

        if dbsOnly:
            try:
                blocksInfo = self.dbs.listBlockOrigin(dataset=datasetName)
            except dbsClientException as ex:
                msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)

            if not blocksInfo:  # no data location from dbs
                return list()

            for blockInfo in blocksInfo:
                locations.update(blockInfo['origin_site_name'])

            locations.difference_update(
                ['UNKNOWN', None])  # remove entry when SE name is 'UNKNOWN'
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(
                    dataset=[datasetName], complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for dataset=%s)\n" % datasetName
                msg += "%s\n" % str(ex)
                raise Exception(msg)

            if blocksInfo:
                for blockSites in blocksInfo.values():
                    locations.update(blockSites)

        return list(locations)

    def checkDatasetPath(self, pathName):
        """
         _checkDatasetPath_
        """
        if pathName in ("", None):
            raise DBSReaderError("Invalid Dataset Path name: => %s <=" %
                                 pathName)
        else:
            try:
                result = self.dbs.listDatasets(dataset=pathName,
                                               dataset_access_type='*')
                if len(result) == 0:
                    raise DBSReaderError("Dataset %s doesn't exist in DBS %s" %
                                         (pathName, self.dbsURL))
            except (dbsClientException, HTTPError) as ex:
                msg = "Error in "
                msg += "DBSReader.checkDatasetPath(%s)\n" % pathName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        return

    def checkBlockName(self, blockName):
        """
         _checkBlockName_
        """
        if blockName in ("", "*", None):
            raise DBSReaderError("Invalid Block name: => %s <=" % blockName)

    def getFileListByDataset(self, dataset, validFileOnly=1, detail=True):
        """
        _getFileListByDataset_

        Given a dataset, retrieves all blocks, lfns and number of events (among other
        not really important info).
        Returns a list of dict.
        """

        try:
            fileList = self.dbs.listFileArray(dataset=dataset,
                                              validFileOnly=validFileOnly,
                                              detail=detail)
            return fileList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.getFileListByDataset(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listDatasetParents(self, childDataset):
        """
        list the parent dataset paths for the given childDataset
        """
        try:
            parentList = self.dbs.listDatasetParents(dataset=childDataset)
            return parentList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listDatasetParents(%s)\n" % childDataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
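
A minimal usage sketch (not part of the original listing): it assumes the reader class above is the DBS3Reader defined in the next example, instantiated with a DBS URL, and uses a hypothetical dataset path.

reader = DBS3Reader("https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
dataset = "/SingleMuon/Run2016D-23Sep2016-v1/AOD"  # hypothetical dataset path

# PhEDEx nodes hosting at least one block of the dataset
print(reader.listDatasetLocation(dataset))

# listFileBlockLocation() returns a plain list for a single block name and a
# {block: [nodes]} dict when given a list of block names
blocks = reader.listFileBlocks(dataset, onlyClosedBlocks=True)
for block, nodes in reader.listFileBlockLocation(blocks).items():
    print(block, nodes)
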
Exemple #25
0
class DBS3Reader(object):
    """
    _DBSReader_

    General API for reading data from DBS
    """
    # cache all the datatiers known by DBS
    _datatiers = {}

    def __init__(self, url, logger=None, **contact):

        # instantiate dbs api object
        try:
            self.dbsURL = url
            self.dbs = DbsApi(url, **contact)
            self.logger = logger or logging.getLogger(self.__class__.__name__)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # connection to PhEDEx (Use default endpoint url)
        self.phedex = PhEDEx(responseType="json", dbsUrl=self.dbsURL)

    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        currently only take one lfn but dbs api need be updated
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName, validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(self.dbs.listFileLumiArray(logical_file_name=slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            if lumisItem.get('event_count', None) is not None:
                item['EventCount'] = lumisItem['event_count']
            lumiDict[lumisItem['logical_file_name']].append(item)
            # TODO: add key for lumi and event pair.
        return lumiDict

    def checkDBSServer(self):
        """
        check whether dbs server is up and running
        returns {"dbs_instance": "prod/global", "dbs_version": "3.3.144"}
        """
        try:
            return self.dbs.serverinfo()
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBS server is not up: %s" % self.dbsURL
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listPrimaryDatasets(self, match='*'):
        """
        _listPrimaryDatasets_

        return a list of primary datasets. The full primary dataset name must be
        provided; pattern-based matching is no longer supported.
        If no name is provided, all primary datasets are returned
        """
        try:
            result = self.dbs.listPrimaryDatasets(primary_ds_name=match)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % match
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['primary_ds_name'] for x in result]
        return result

    def matchProcessedDatasets(self, primary, tier, process):
        """
        _matchProcessedDatasets_

        return a list of Processed datasets
        """
        result = []
        try:
            datasets = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=tier, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for dataset in datasets:
            dataset = remapDBS3Keys(dataset, processed_ds_name='Name')
            dataset['PathList'] = [dataset['dataset']]
            if dataset['Name'] == process:
                result.append(dataset)
        return result

    def listRuns(self, dataset=None, block=None):
        """
        It gets a list of DbsRun objects, but for our purpose
        only the list of run numbers is collected.
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        runs = []
        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        for x in results:
            runs.extend(x['run_num'])
        return runs

    def listRunLumis(self, dataset=None, block=None):
        """
        It gets a list of DBSRun objects and returns the number of lumisections per run
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        # Pointless code in python3
        if isinstance(block, str):
            block = unicode(block)
        if isinstance(dataset, str):
            dataset = unicode(dataset)

        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # Return the runDict format; this format is kept in sync with the dbs2 call,
        # which had {run_number: num_lumis}, but the dbs3 call doesn't return the number
        # of lumis, so it returns {run_number: None}
        # TODO: After DBS2 is completely removed, change the return format to a more sensible one

        runDict = {}
        for x in results:
            for runNumber in x["run_num"]:
                runDict[runNumber] = None
        return runDict

    def listProcessedDatasets(self, primary, dataTier='*'):
        """
        _listProcessedDatasets_

        return a list of Processed datasets for the primary and optional
        data tier value

        """
        try:
            result = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=dataTier)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['dataset'].split('/')[2] for x in result]
        return result

    def listDatasetFiles(self, datasetPath):
        """
        _listDatasetFiles_

        Get list of files for dataset

        """
        return [x['logical_file_name'] for x in self.dbs.listFileArray(dataset=datasetPath)]

    @staticmethod
    def listDatatiers(dbsUrl=None):
        """
        _listDatatiers_

        Get a list of datatiers known by DBS.
        """
        if dbsUrl is None:
            msg = "Error in DBSReader.listDatatiers(). DBS Url not set."
            raise DBSReaderError(msg)

        timenow = int(time.time())
        if DBS3Reader._datatiers and timenow - 7200 < DBS3Reader._datatiers['ts']:
            return DBS3Reader._datatiers['tiers']

        try:
            DBS3Reader._setDatatiersCache(timenow, dbsUrl)
        except Exception as ex:
            if not DBS3Reader._datatiers:
                msg = "Error in DBSReader.listDatatiers\n%s" % formatEx3(ex)
                raise DBSReaderError(msg)
        return DBS3Reader._datatiers['tiers']

    @staticmethod
    def _setDatatiersCache(ts, dbsUrl):
        """
        Set a timestamp and update the list of datatiers cached in
        the class property
        """
        dbs = DbsApi(dbsUrl)
        DBS3Reader._datatiers['ts'] = ts
        DBS3Reader._datatiers['tiers'] = [tier['data_tier_name'] for tier in dbs.listDataTiers()]

        return

    def listDatasetFileDetails(self, datasetPath, getParents=False, getLumis=True, validFileOnly=1):
        """
        TODO: This is completely wrong and needs to be redone, or be removed - getting the whole
        dataset at once might be too costly

        _listDatasetFileDetails_

        Get list of lumis, events, and parents for each file in a dataset
        Return a dict where the keys are the files, and for each file we have something like:
            { 'NumberOfEvents': 545,
              'BlockName': '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace',
              'Lumis': {173658: [8, 12, 9, 14, 19, 109, 105]},
              'Parents': [],
              'Checksum': '22218315',
              'Adler32': 'a41a1446',
              'FileSize': 286021145,
              'ValidFile': 1
            }

        """
        fileDetails = self.getFileListByDataset(dataset=datasetPath, validFileOnly=validFileOnly, detail=True)
        blocks = set()  # the set of blocks of the dataset
        # Iterate over the files and prepare the set of blocks and a dict where the keys are the files
        files = {}
        for f in fileDetails:
            blocks.add(f['block_name'])
            files[f['logical_file_name']] = remapDBS3Keys(f, stringify=True)
            files[f['logical_file_name']]['ValidFile'] = f['is_file_valid']
            files[f['logical_file_name']]['Lumis'] = {}
            files[f['logical_file_name']]['Parents'] = []

        # Iterate over the blocks and get parents and lumis
        for blockName in blocks:
            # get the parents
            if getParents:
                parents = self.dbs.listFileParents(block_name=blockName)
                for p in parents:
                    if p['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        files[p['logical_file_name']]['Parents'].extend(p['parent_logical_file_name'])

            if getLumis:
                # get the lumis
                file_lumis = self.dbs.listFileLumis(block_name=blockName)
                for f in file_lumis:
                    if f['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        if f['run_num'] in files[f['logical_file_name']]['Lumis']:
                            files[f['logical_file_name']]['Lumis'][f['run_num']].extend(f['lumi_section_num'])
                        else:
                            files[f['logical_file_name']]['Lumis'][f['run_num']] = f['lumi_section_num']

        return files

    def crossCheck(self, datasetPath, *lfns):
        """
        _crossCheck_

        For the dataset provided, check that the lfns listed all exist
        in the dataset.

        Return the list of lfns that are in the dataset

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        return list(setOfAllLfns.intersection(setOfKnownLfns))

    def crossCheckMissing(self, datasetPath, *lfns):
        """
        _crossCheckMissing_

        As cross check, but return value is a list of files that
        are *not* known by DBS

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        knownFiles = setOfAllLfns.intersection(setOfKnownLfns)
        unknownFiles = setOfKnownLfns.difference(knownFiles)
        return list(unknownFiles)

    def getDBSSummaryInfo(self, dataset=None, block=None):
        """
        Get the dataset summary, including the number of files, events, blocks and total size
        """
        if dataset:
            self.checkDatasetPath(dataset)
        try:
            if block:
                summary = self.dbs.listFileSummaries(block_name=block, validFileOnly=1)
            else:
                summary = self.dbs.listFileSummaries(dataset=dataset, validFileOnly=1)
        except Exception as ex:
            msg = "Error in DBSReader.getDBSSummaryInfo(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if not summary:  # missing data or all files invalid
            return {}

        result = remapDBS3Keys(summary[0], stringify=True)
        result['path'] = dataset if dataset else ''
        result['block'] = block if block else ''
        return result

    def getFileBlocksInfo(self, dataset, onlyClosedBlocks=False,
                          blockName=None, locations=True):
        """
        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': True}
        if blockName:
            args['block_name'] = blockName
        try:
            blocks = self.dbs.listBlocks(**args)
        except Exception as ex:
            msg = "Error in DBSReader.getFileBlocksInfo(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        blocks = [remapDBS3Keys(block, stringify=True, block_name='Name') for block in blocks]
        # only raise if blockName not specified - mimic dbs2 error handling
        if not blocks and not blockName:
            msg = "DBSReader.getFileBlocksInfo(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, blockName))
        if locations:
            for block in blocks:
                block['PhEDExNodeList'] = [{'Name': x} for x in self.listFileBlockLocation(block['Name'])]

        if onlyClosedBlocks:
            return [x for x in blocks if str(x['OpenForWriting']) != "1"]

        return blocks

    def listFileBlocks(self, dataset, onlyClosedBlocks=False, blockName=None):
        """
        _listFileBlocks_

        Retrieve a list of fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': False}
        if blockName:
            args['block_name'] = blockName
        if onlyClosedBlocks:
            args['detail'] = True
        try:
            blocks = self.dbs.listBlocks(**args)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if onlyClosedBlocks:
            result = [x['block_name'] for x in blocks if str(x['open_for_writing']) != "1"]

        else:
            result = [x['block_name'] for x in blocks]

        return result

    def listOpenFileBlocks(self, dataset):
        """
        _listOpenFileBlocks_

        Retrieve a list of open fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        try:
            blocks = self.dbs.listBlocks(dataset=dataset, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['block_name'] for x in blocks if str(x['open_for_writing']) == "1"]

        return result

    def blockExists(self, fileBlockName):
        """
        _blockExists_

        Check to see if block with name provided exists in the DBS
        Instance.

        Return True if exists, False if not

        """
        self.checkBlockName(fileBlockName)
        try:
            blocks = self.dbs.listBlocks(block_name=fileBlockName)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockExists(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if len(blocks) == 0:
            return False
        return True

    def listFilesInBlock(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlock_

        Get a list of files in the named fileblock
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it is always True.
        We need to clean this code up once DBS2 is completely deprecated;
        calling lumis just for the run number is expensive.
        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            files = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=validFileOnly, detail=True)
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if lumis:
            lumiDict = self._getLumiList(blockName=fileBlockName, validFileOnly=validFileOnly)

        result = []
        for fileInfo in files:
            if lumis:
                fileInfo["LumiList"] = lumiDict[fileInfo['logical_file_name']]
            result.append(remapDBS3Keys(fileInfo, stringify=True))
        return result

    def listFilesInBlockWithParents(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of that file.
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it is always True.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only valid blocks for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis, validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName,)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])

        parentsLFNs = childByParents.keys()

        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slicing parentLFNs until the DBS api handles that.
        # Remove the slicing once the DBS api handles it
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[fileInfo['logical_file_name']]

        return fileDetails

    def lfnsInBlock(self, fileBlockName):
        """
        _lfnsInBlock_

        LFN list only for block, details = False => faster query

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.lfnsInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            lfns = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=1, detail=False)
            return lfns
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listFileBlockLocation(self, fileBlockNames, dbsOnly=False):
        """
        _listFileBlockLocation_

        Get origin_site_name of a block

        """

        singleBlockName = None
        if isinstance(fileBlockNames, basestring):
            singleBlockName = fileBlockNames
            fileBlockNames = [fileBlockNames]

        for block in fileBlockNames:
            self.checkBlockName(block)

        locations = {}
        node_filter = set(['UNKNOWN', None])

        if dbsOnly:
            blocksInfo = {}
            try:
                for block in fileBlockNames:
                    blocksInfo.setdefault(block, [])
                    # there should be only one element with a single origin site string ...
                    for blockInfo in self.dbs.listBlockOrigin(block_name=block):
                        blocksInfo[block].append(blockInfo['origin_site_name'])
            except dbsClientException as ex:
                msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(block=fileBlockNames, complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % str(ex)
                raise Exception(msg)

        for block in fileBlockNames:
            valid_nodes = set(blocksInfo.get(block, [])) - node_filter
            locations[block] = list(valid_nodes)

        # returning single list if a single block is passed
        if singleBlockName:
            return locations[singleBlockName]

        return locations

    def getFileBlock(self, fileBlockName, dbsOnly=False):
        """
        _getFileBlock_

        dbsOnly flag is mostly meant for StoreResults, since there is no
        data in TMDB.

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : { LFN : Events },
             }
        }


        """
        # Pointless code in python3
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName, dbsOnly),
            "Files": self.listFilesInBlock(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
        }
        }
        return result

    def getFileBlockWithParents(self, fileBlockName):
        """
        _getFileBlockWithParents_

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : dictionaries representing each file
             }
        }

        """
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)

        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName),
            "Files": self.listFilesInBlockWithParents(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
        }
        }
        return result

    def getFiles(self, dataset, onlyClosedBlocks=False):
        """
        _getFiles_

        Returns a dictionary of block names for the dataset where
        each block consists of a dictionary containing the PhEDExNodeNames
        for that block and the files in that block by LFN mapped to NEvents

        """
        result = {}
        blocks = self.listFileBlocks(dataset, onlyClosedBlocks)

        for x in blocks:
            result.update(self.getFileBlock(x))

        return result

    def listBlockParents(self, blockName):
        """Get parent blocks for block"""
        result = []
        self.checkBlockName(blockName)
        blocks = self.dbs.listBlockParents(block_name=blockName)
        for block in blocks:
            toreturn = {'Name': block['parent_block_name']}
            toreturn['PhEDExNodeList'] = self.listFileBlockLocation(toreturn['Name'])
            result.append(toreturn)
        return result

    def blockIsOpen(self, blockName):
        """
        _blockIsOpen_

        Return True if the named block is open, False if not, or if the block
        doesn't exist

        """
        self.checkBlockName(blockName)
        blockInstance = self.dbs.listBlocks(block_name=blockName, detail=True)
        if len(blockInstance) == 0:
            return False
        blockInstance = blockInstance[0]
        isOpen = blockInstance.get('open_for_writing', 1)
        if isOpen == 0:
            return False
        return True

    def blockToDatasetPath(self, blockName):
        """
        _blockToDatasetPath_

        Given a block name, get the dataset Path associated with that
        Block.

        Returns the dataset path, or None if not found

        """
        self.checkBlockName(blockName)
        try:
            blocks = self.dbs.listBlocks(block_name=blockName, detail=True)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockToDatasetPath(%s)\n" % blockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if blocks == []:
            return None

        pathname = blocks[-1].get('dataset', None)
        return pathname

    def listDatasetLocation(self, datasetName, dbsOnly=False):
        """
        _listDatasetLocation_

        List the origin SEs where there is at least a block of the given
        dataset.
        """
        self.checkDatasetPath(datasetName)

        locations = set()

        if dbsOnly:
            try:
                blocksInfo = self.dbs.listBlockOrigin(dataset=datasetName)
            except dbsClientException as ex:
                msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)

            if not blocksInfo:  # no data location from dbs
                return list()

            for blockInfo in blocksInfo:
                locations.update(blockInfo['origin_site_name'])

            locations.difference_update(['UNKNOWN', None])  # remove entry when SE name is 'UNKNOWN'
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(dataset=[datasetName], complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for dataset=%s)\n" % datasetName
                msg += "%s\n" % str(ex)
                raise Exception(msg)

            if blocksInfo:
                for blockSites in blocksInfo.values():
                    locations.update(blockSites)

        return list(locations)

    def checkDatasetPath(self, pathName):
        """
         _checkDatasetPath_
        """
        if pathName in ("", None):
            raise DBSReaderError("Invalid Dataset Path name: => %s <=" % pathName)
        else:
            try:
                result = self.dbs.listDatasets(dataset=pathName, dataset_access_type='*')
                if len(result) == 0:
                    raise DBSReaderError("Dataset %s doesn't exist in DBS %s" % (pathName, self.dbsURL))
            except (dbsClientException, HTTPError) as ex:
                msg = "Error in "
                msg += "DBSReader.checkDatasetPath(%s)\n" % pathName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        return

    def checkBlockName(self, blockName):
        """
         _checkBlockName_
        """
        if blockName in ("", "*", None):
            raise DBSReaderError("Invalid Block name: => %s <=" % blockName)

    def getFileListByDataset(self, dataset, validFileOnly=1, detail=True):
        """
        _getFileListByDataset_

        Given a dataset, retrieves all blocks, lfns and number of events (among other
        not really important info).
        Returns a list of dict.
        """

        try:
            fileList = self.dbs.listFileArray(dataset=dataset, validFileOnly=validFileOnly, detail=detail)
            return fileList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.getFileListByDataset(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listDatasetParents(self, childDataset):
        """
        list the parent dataset paths for the given childDataset
        """
        try:
            parentList = self.dbs.listDatasetParents(dataset=childDataset)
            return parentList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listDatasetParents(%s)\n" % childDataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    # def getListFilesByLumiAndDataset(self, dataset, files):
    #     "Unsing pycurl to get all the child parents pair for given dataset"
    #
    #     urls = ['%s/data/dbs/fileparentbylumis?block_name=%s' % (
    #              self.dbsURL, b["block_name"]) for b in self.dbs.listBlocks(dataset=dataset)]
    #
    #     data = multi_getdata(urls, ckey(), cert())
    #     rdict = {}
    #     for row in data:
    #         try:
    #             data = json.loads(row['data'])
    #             rdict[req] = data['result'][0]  # we get back {'result': [workflow]} dict
    #         except Exception as exp:
    #             print("ERROR: fail to load data as json record, error=%s" % str(exp))
    #             print(row)
    #     return rdict

    def getParentFilesGivenParentDataset(self, parentDataset, childLFNs):
        """
        returns parent files for the given child LFNs when DBS doesn't have a direct parent-child relationship in the DB.
        Only use this for finding missing parents

        :param parentDataset: parent dataset of the child LFNs
        :param childLFNs: files in the child dataset
        :return: dict mapping each child LFN to the set of its parent files
        """
        fInfo = self.dbs.listFileLumiArray(logical_file_name=childLFNs)
        parentFiles = defaultdict(set)
        for f in fInfo:
            pFileList = self.dbs.listFiles(dataset=parentDataset, run_num=f['run_num'], lumi_list=f['lumi_section_num'])
            pFiles = set([x['logical_file_name'] for x in pFileList])
            parentFiles[f['logical_file_name']] = parentFiles[f['logical_file_name']].union(pFiles)
        return parentFiles

    def getParentFilesByLumi(self, childLFN):
        """
        Get the parent files' LFNs by lumi (this might not be the actual parentage relation in DBS, just parentage by lumis).
        Use this only for a specific LFN, for validation purposes; for the parentage fix use findAndInsertMissingParentage.
        :param childLFN: child logical file name
        :return: list of dictionaries with the parent files for the given child LFN and parent dataset, e.g.
        [{"ParentDataset": "/abc/bad/ddd", "ParentFiles": ["alf", "baf", ...]}]
        """
        childDatasets = self.dbs.listDatasets(logical_file_name=childLFN)
        result = []
        for i in childDatasets:
            parents = self.dbs.listDatasetParents(dataset=i["dataset"])
            for parent in parents:
                parentFiles = self.getParentFilesGivenParentDataset(parent['parent_dataset'], childLFN)
                result.append({"ParentDataset": parent['parent_dataset'], "ParentFiles": list(parentFiles)})
        return result

    def listParentsByLumi(self, childBlockName, childLFNs=None):
        """
        :param childBlockName: child block name
        :param childLFNs: list of child LFNs; if not specified, all the files in the block are used.
               If specified, DBS validates the child LFNs against the childBlockName.
        :return: list of [child, parent] id pairs, e.g. [[1, 2], [3, 4], ...]
        """
        childLFNs = childLFNs or []
        return self.dbs.listFileParentsByLumi(block_name=childBlockName, logical_file_name=childLFNs)

    def insertFileParents(self, childBlockName, childParentsIDPairs):
        """
        :param childBlockName: child block name
        :param childParentsIDPairs: list of [child, parent] file id pairs, i.e. [[1, 2], [3, 4], ...];
                DBS validates the child ids against the childBlockName
        :return: None
        """
        return self.dbs.insertFileParents({"block_name": childBlockName, "child_parent_id_list": childParentsIDPairs})

    def findAndInsertMissingParentage(self, childBlockName, childLFNs=None, insertFlag=True):
        """
        :param childBlockName: child block name
        :param childLFNs: list of child LFNs; if not specified, all the files in the block are used.
               If specified, DBS validates the child LFNs against the childBlockName.
        :return: number of file parentage pairs inserted
        """
        childLFNs = childLFNs or []
        fileParents = self.dbs.listFileParentsByLumi(block_name=childBlockName, logical_file_name=childLFNs)
        childParentsIDPairs = fileParents[0]["child_parent_id_list"]

        if insertFlag:
            self.dbs.insertFileParents({"block_name": childBlockName, "child_parent_id_list": childParentsIDPairs})
        return len(childParentsIDPairs)

    def listBlocksWithNoParents(self, childDataset):
        """
        :param childDataset: child dataset to check
        :return: set of child blocks with no parent block
        """
        allBlocks = self.dbs.listBlocks(dataset=childDataset)
        blockNames = []
        for block in allBlocks:
            blockNames.append(block['block_name'])
        parentBlocks = self.dbs.listBlockParents(block_name=blockNames)

        cblock = set()
        for pblock in parentBlocks:
            cblock.add(pblock['this_block_name'])

        noParentBlocks = set(blockNames) - cblock
        return noParentBlocks

    def listFilesWithNoParents(self, childBlockName):
        """
        :param childBlockName: child block name
        :return: list of LFNs in the block that have no parent files
        """
        allFiles = self.dbs.listFiles(block_name=childBlockName)
        parentFiles = self.dbs.listFileParents(block_name=childBlockName)

        allFileNames = set()
        for fInfo in allFiles:
            allFileNames.add(fInfo['logical_file_name'])

        cfile = set()
        for pFile in parentFiles:
            cfile.add(pFile['logical_file_name'])

        noParentFiles = allFileNames - cfile
        return list(noParentFiles)

    def fixMissingParentageDatasets(self, childDataset, insertFlag=True):
        """
        :param childDataset: child dataset whose parentage needs to be set correctly.
        :return: blocks which failed the parentage insert, for retrying
        """
        pDatasets = self.listDatasetParents(childDataset)
        # print("parent datasets %s\n" % pDatasets)
        # pDatasets format is
        # [{'this_dataset': '/SingleMuon/Run2016D-03Feb2017-v1/MINIAOD', 'parent_dataset_id': 13265209, 'parent_dataset': '/SingleMuon/Run2016D-23Sep2016-v1/AOD'}]
        if not pDatasets:
            self.logger.warning("No parent dataset found for child dataset %s", childDataset)
            return {}

        blocks = self.listBlocksWithNoParents(childDataset)
        failedBlocks = []
        for blockName in blocks:
            try:
                numFiles = self.findAndInsertMissingParentage(blockName, insertFlag=insertFlag)
                self.logger.debug("%s file parentage added for block %s" % (numFiles, blockName))
            except Exception as ex:
                self.logger.exception("Parentage updated failed for block %s", blockName)
                failedBlocks.append(blockName)

        return failedBlocks

    def insertMissingParentageForAllFiles(self, childDataset, filterFilesWithParents=True, insertFlag=False):
        """
        :param childDataset: child dataset whose parentage needs to be set correctly.
        :param filterFilesWithParents: if True, only select files without parents; if False, all the files in the dataset
        :param insertFlag: if True, insert into DBS; if False, just get the list of the file parentage without inserting
        :return: blocks which failed the parentage insert; should be used for retrying
        """
        blocks = [b['block_name'] for b in self.dbs.listBlocks(dataset=childDataset)]
        failedBlocks = []
        print("Handling %d blocks" % len(blocks))
        totalFiles = 0
        for blockName in blocks:
            try:
                if filterFilesWithParents:
                    childLFNs = self.listFilesWithNoParents(blockName)
                    if len(childLFNs) == 0:
                        continue
                else:
                    childLFNs = []

                numFiles = self.findAndInsertMissingParentage(blockName, childLFNs=childLFNs, insertFlag=insertFlag)
                print("%s file parentage added for block %s" % (numFiles, blockName))
                totalFiles += numFiles
            except Exception as e:
                print(traceback.format_exc())
                failedBlocks.append(blockName)
        print("Total pairs: ", totalFiles)
        return failedBlocks
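
A short driver sketch (not from the original source) for the parentage helpers in the class above; the dataset path is the one quoted in the fixMissingParentageDatasets comment, and insertFlag=False keeps the run read-only.

reader = DBS3Reader("https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
child = "/SingleMuon/Run2016D-03Feb2017-v1/MINIAOD"

print(reader.getDBSSummaryInfo(dataset=child))   # files/events/blocks/size summary
print(reader.listDatasetParents(child))          # declared parent datasets
print(reader.listBlocksWithNoParents(child))     # blocks missing block-level parentage

# Dry run of the parentage fix: with insertFlag=False nothing is written to DBS
failedBlocks = reader.fixMissingParentageDatasets(child, insertFlag=False)
print("Blocks that failed:", failedBlocks)
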
Exemple #26
0
class RequestQuery:

    def __init__(self,config):
        self.br=Browser()

        self.config = config
        
        # Initialise connections
        self.mySiteDB = SiteDBJSON()
        self.dbsPhys01 = DbsApi(url = dbs_base_url+"phys01/DBSReader/")
        self.dbsPhys02 = DbsApi(url = dbs_base_url+"phys02/DBSReader/")
        self.dbsPhys03 = DbsApi(url = dbs_base_url+"phys03/DBSReader/")
        
    def __del__(self):
        self.br.close()

    def getScramArchByCMSSW(self):
        """
        Get from the list of available CMSSW releases
        return a dictionary of ScramArchitecture by CMSSW
        """
        
        # Set a temporary connection to the server and get the response from cmstags
        url = 'https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML'
        br = Browser()
        br.set_handle_robots(False)
        response=br.open(url)
        soup = BeautifulSoup(response.read())
        
        # Dictionary form
        # {'CMSSW_X_X_X':[slc5_amd64_gcc472], ... }
        archByCmssw={}
        
        # Fill the dictionary
        for arch in soup.find_all('architecture'): 
            for cmssw in arch.find_all('project'): 
                # CMSSW release
                cmsswLabel = cmssw.get('label').encode('ascii', 'ignore')
                if cmsswLabel not in archByCmssw:
                    archByCmssw[cmsswLabel]=[]
                # ScramArch related to this CMSSW release
                archName = arch.get('name').encode('ascii', 'ignore')
                archByCmssw[cmsswLabel].append(archName)
        
        return archByCmssw
      
    def getDatasetOriginSites(self, dbs_url, data):
        """
        Get the origin sites for each block of the dataset.
        Return the list of PSNs and the list of block origin sites (PNNs).
        """
        
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listBlocks(detail=True,dataset=data)
        
        pnnList = set()
        for block in response:
            pnnList.add(block['origin_site_name'])
        psnList = self.mySiteDB.PNNstoPSNs(pnnList)
        
        return psnList, list(pnnList)
    
    def setGlobalTagFromOrigin(self, dbs_url,input_dataset):
        """
        Get the global tag of the dataset from the source dbs url. If it is not set, then set global tag to 'UNKNOWN'
        """
        
        globalTag = ""
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listOutputConfigs(dataset=input_dataset)
        
        globalTag = response[0]['global_tag']
        # GlobalTag cannot be empty
        if globalTag == '':
            globalTag = 'UNKNOWN'
            
        return globalTag
    
    def isDataAtUrl(self, dbs_url,input_dataset):
        """
        Returns True if the dataset is at the given DBS URL, otherwise False
        """
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listDatasets(dataset=input_dataset)
        # This means that the dataset is not at the url
        if not response:
            return False
        else:
            return True
         
    def getLabelByValueDict(self, control):
        """
        From control items, create a dictionary by values
        """   
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[value] = label
                
        return d
    
    def getValueByLabelDict(self, control):
        """
        From control items, create a dictionary by labels
        """
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[label] = value

        return d
    
    def createRequestJSON(self, ticket, input_dataset, dbs_url, cmssw_release, group_name, version = 1):
        """
        Creates a JSON file 'Ticket_#TICKET.json' with the information
        needed for creating a request on ReqMgr.
        Input:
            - ticket: the ticket #, for instance 110773 on https://ggus.eu/?mode=ticket_info&ticket_id=110773
            - input_dataset
            - dbs_url: only the instance name, for example "phys01" for
             https://cmsweb.cern.ch/dbs/prod/phys01/DBSReader
            - cmssw_release
            - group_name: the physics group name
            - version: the dataset version, 1 by default.
        It returns a dictionary that contains the request information.
        """

        scramArchByCMSSW = self.getScramArchByCMSSW()
        self.nodeMappings = self.phedex.getNodeMap()
        task = ticket
        print("Processing ticket: %s" % task)
        
        #splitting input dataset       
        input_primary_dataset = input_dataset.split('/')[1].replace(' ','')
        input_processed_dataset = input_dataset.split('/')[2].replace(' ','')
        data_tier = input_dataset.split('/')[3].replace(' ','')
                
        # Transform input value to a valid DBS url
        #dbs_url = "https://cmsweb.cern.ch/dbs/prod/"+dbs_url+"/DBSReader"
        dbs_url = dbs_base_url+dbs_url+"/DBSReader"
        release_id = cmssw_release
                
        # check if deprecated release was used
        release = cmssw_release
        # check if the release has no matching ScramArch
        if release not in scramArchByCMSSW:
            raise Exception("Error on ticket %s due to ScramArch mismatch" % task)
        else:
            scram_arch = scramArchByCMSSW[release][-1]

        # check if dataset is not at dbs url
        try:
            data_at_url = self.isDataAtUrl(dbs_url,input_dataset)
        except:
            raise Exception('Error on ticket %s, dataset %s not available at %s' %(task, input_dataset,dbs_url))

        if not data_at_url:
            raise Exception('Error on ticket %s, dataset %s not available at %s' %(task, input_dataset,dbs_url))
                    
        ## Get Physics Group
        group_squad = 'cms-storeresults-'+group_name.replace("-","_").lower()

        ## Get Dataset Version
        dataset_version = str(version)

        # Set default Acquisition Era for StoreResults
        acquisitionEra = "StoreResults"

        ## Construction of the new dataset name (ProcessingString)
        ## remove leading hypernews or physics group name and StoreResults+Version
        if input_processed_dataset.find(group_name)==0:
            new_dataset = input_processed_dataset.replace(group_name,"",1)
        else:
            stripped_dataset = input_processed_dataset.split("-")[1:]
            new_dataset = '_'.join(stripped_dataset)
                        
        # Get dataset site info:
        psnList, pnnList = self.getDatasetOriginSites(dbs_url,input_dataset)

        infoDict = {}
        # Build store results json
        # First add all the defaults values
        infoDict["RequestType"] = "StoreResults"
        infoDict["UnmergedLFNBase"] = "/store/unmerged" 
        infoDict["MergedLFNBase"] = "/store/results/" + group_name.replace("-","_").lower()
        infoDict["MinMergeSize"] = 1500000000
        infoDict["MaxMergeSize"] = 5000000000
        infoDict["MaxMergeEvents"] = 100000
        infoDict["TimePerEvent"] = 40
        infoDict["SizePerEvent"] = 512.0
        infoDict["Memory"] = 2394
        infoDict["CmsPath"] = "/uscmst1/prod/sw/cms"                                        
        infoDict["Group"] = "DATAOPS"
        infoDict["DbsUrl"] = dbs_url
        
        # Add all the information pulled from Savannah
        infoDict["AcquisitionEra"] = acquisitionEra
        infoDict["GlobalTag"] = self.setGlobalTagFromOrigin(dbs_url, input_dataset)
        infoDict["DataTier"] = data_tier
        infoDict["InputDataset"] = input_dataset
        infoDict["ProcessingString"] = new_dataset
        infoDict["CMSSWVersion"] = release
        infoDict["ScramArch"] = scram_arch
        infoDict["ProcessingVersion"] = dataset_version                    
        infoDict["SiteWhitelist"] = psnList
        
        # Create report for Migration2Global
        report = {}
         
        #Fill json file, if status is done
        self.writeJSONFile(task, infoDict)
        report["json"] = 'y'
        report["task"] = int(task)
        report["InputDataset"] = input_dataset
        report["ProcessingString"] = new_dataset
        report["localUrl"] = dbs_url
        report["sites"] = psnList
        report["pnns"] = pnnList

        return report

    def writeJSONFile(self, task, infoDict):
        """
        This writes a JSON file at ComponentDir
        """
        ##check if file already exists
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'
        if not os.access(filename,os.F_OK):
            jsonfile = open(filename,'w')
            request = {'createRequest':infoDict} ## CHECK THIS BEFORE FINISHING
            jsonfile.write(json.dumps(request,sort_keys=True, indent=4))
            jsonfile.close()

        return

    def removeJSONFile(self,task):
        """
        This removes the JSON file at ComponentDir if it was created
        """
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'

        if os.access(filename,os.F_OK):
            os.remove(filename)
        return

    def printReport(self, report):
        """
        Print out a report
        """
        print("%20s %5s %10s %50s %50s" %( 'Ticket','json','local DBS','Sites','pnns')) 
        print("%20s %5s %10s %50s %50s" %( '-'*20,'-'*5,'-'*10,'-'*50,'-'*50 ))
        
        json = report["json"]
        ticket = report["task"]
        #status = report["ticketStatus"]
        localUrl = report["localUrl"].split('/')[5]
        site = ', '.join(report["sites"])
        pnns = ', '.join(report["pnns"])
        print("%20s %5s %10s %50s %50s" %(ticket,json,localUrl,site,pnns))  
def main():
    url = 'cmsweb.cern.ch'
    # Example: python assignWorkflow.py -w amaltaro_RVZTT_120404_163607_6269
    # -t testbed-relval -s T1_US_FNAL -e CMSSW_6_0_0_pre1_FS_TEST_WMA -p v1 -a
    # relval -l /store/backfill/1
    parser = optparse.OptionParser()
    parser.add_option( '-w', '--workflow', help='Workflow Name', dest='workflow')
    parser.add_option('-t', '--team', help='Type of Requests', dest='team')
    parser.add_option('-s', '--site', help='Site', dest='site')
    parser.add_option('-p', '--procversion',help='Processing Version', dest='procversion')
    parser.add_option('-a', '--activity',help='Dashboard Activity', dest='activity')
    parser.add_option('-f', '--file',help='File with workflows', dest='file')
    parser.add_option('-l', '--lfn', help='Merged LFN base', dest='lfn')
    parser.add_option('--special',
                      help='Use it for special workflows. You also have to change the code according to the type of WF', dest='special')
    parser.add_option('-r', '--replica', action='store_true', dest='replica', default=False,
                      help='Adds a _Disk Non-Custodial Replica parameter')
    parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False,
                      help="Prints all query information.")
    parser.add_option('-x', '--xrootd', help='Assign with trustSiteLocation=True (allows xrootd capabilities)',
                      action='store_true', default=False, dest='xrootd')
    parser.add_option("--acqera", dest="acqera",help="Overrides Acquisition Era with a single string")
    parser.add_option("--procstr", dest="procstring",help="Overrides Processing String with a single string")
    parser.add_option('--test', action="store_true",
                      help='Nothing is injected, only print information about workflow and AcqEra', dest='test')
    parser.add_option('--pu', action="store_true",help='Use it to inject PileUp workflows only', dest='pu')
    (options, args) = parser.parse_args()

    # Handling the workflow to assign
    if not options.workflow:
        if args:
            workflows = args
        elif options.file:
            workflows = [l.strip() for l in open(options.file) if l.strip()]
        else:
            parser.error("Input a workflow name or a file to read them")
            sys.exit(0)
    else:
        workflows = [options.workflow]

    # Handling the parameters given in the command line
    team = 'production'
    site = GOOD_SITES
    procversion = 1
    activity = 'production'
    lfn = '/store/mc'
    acqera = {}
    procstring = {}
    specialStr = ''
    replica = False

    if options.team:
        team = options.team
    if options.site:
        site = options.site
        if site == "all":
            site = ALL_SITES
        elif site == "t1":
            site = T1S
        # parse sites separated by commas
        elif "," in site:
            site = site.split(",")
    if options.activity:
        activity = options.activity
    if options.lfn:
        lfn = options.lfn
    if options.replica:
        replica = True
    if options.acqera:
        acqera = options.acqera
    if options.procstring:
        procstring = options.procstring

    # Iterating over the set of Workflows
    for workflow in workflows:
        # Getting the original dictionary
        schema = getRequestDict(url, workflow)

        # Checking is the WF is in assignment-approved, it is mandatory to be assigned
        if (schema["RequestStatus"] != "assignment-approved"):
            print("The workflow '" + workflow + "' you are trying to assign is not in assignment-approved")
            sys.exit(1)

        # Dealing with the processing version
        wfInfo = reqMgr.Workflow(workflow, url=url)
        if options.procversion:
            procversion = int(options.procversion)
        else:
            procversion = wfInfo.info["ProcessingVersion"]

        # Setting the AcqEra and ProcStr values per Task
        for key, value in schema.items():
            if type(value) is dict and key.startswith("Task"):
                try:
                    procstring[value['TaskName']] = value[
                        'ProcessingString'].replace("-", "_")
                    acqera[value['TaskName']] = value['AcquisitionEra']
                except KeyError:
                    print("This request has no AcquisitionEra or ProcessingString defined into the Tasks, aborting...")
                    sys.exit(1)

        # Adding the special string - in case it was provided in the command line
        if options.special:
            #specialStr = '_03Jan2013'
            specialStr = '_' + str(options.special)
            for key, value in procstring.items():
                procstring[key] = value + specialStr


        # Check output dataset existence, and abort if they already do!
        datasets = schema["OutputDatasets"]
        i = 0
        if schema["RequestType"] == "TaskChain":
            exist = False
            maxv = 1
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    dbsapi = DbsApi(url=dbs3_url)

                    # list all datasets with same name but different version
                    # numbers
                    datasets = dbsapi.listDatasets(acquisition_era_name=value['AcquisitionEra'],
                                                   primary_ds_name=value['PrimaryDataset'], detail=True, dataset_access_type='*')
                    processedName = value['AcquisitionEra'] + \
                        '-' + value['ProcessingString'] + "-v\\d+"
                    # see if any of the dataset names is a match
                    for ds in datasets:
                        if re.match(processedName, ds['processed_ds_name']):
                            print "Existing dset:", ds['dataset'], "(%s)" % ds['dataset_access_type']
                            maxv = max(maxv, ds['processing_version'])
                            exist = True
                        else:
                            pass
                    i += 1
            # suggest max version
            if exist and procversion <= maxv:
                print("Some output datasets exist, its advised to assign with v =="+ maxv + 1)
                sys.exit(0)
        #Checking if we are dealing with a TaskChain resubmission
        elif schema["RequestType"] == "Resubmission" and wfInfo.info["PrepID"].startswith("task"):
            procstring = wfInfo.info["ProcessingString"]
            acqera = wfInfo.info["AcquisitionEra"]
        else:
            print("The workflow '" + workflow + "' you are trying to assign is not a TaskChain, please use another resource.")
            sys.exit(1)

        # If the --test argument was provided, then just print the information
        # gathered so far and abort the assignment
        if options.test:
            print "%s \tAcqEra: %s \tProcStr: %s \tProcVer: %s" % (workflow, acqera, procstring, procversion)
            # print workflow, '\tAcqEra:', acqera, '\tProcStr:', procstring,
            # '\tProcVer:', procversion
            print "LFN: %s \tTeam: %s \tSite: %s" % (lfn, team, site)
            # print '\tTeam:',team,  '\tSite:', site
            sys.exit(0)

        # Really assigning the workflow now
        print workflow, '\tAcqEra:', acqera, '\tProcStr:', procstring, '\tProcVer:', procversion, '\tTeam:', team, '\tSite:', site
        assignRequest(url, workflow, team, site, acqera, procstring, procversion, activity, lfn, replica, options.verbose, options.xrootd)
    sys.exit(0)
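The output-dataset check above (list every dataset with the same AcquisitionEra and PrimaryDataset, then match the processed name against "AcqEra-ProcString-v\d+") can be condensed into a small helper. This is only a sketch of the pattern used in the loop above; dbs_url is passed in explicitly and is assumed to point at a DBSReader instance:

import re
from dbs.apis.dbsClient import DbsApi

def highest_existing_version(acq_era, primary_ds, proc_string, dbs_url):
    """Return the highest processing version already in DBS for this output, or 0 if none."""
    dbsapi = DbsApi(url=dbs_url)
    candidates = dbsapi.listDatasets(acquisition_era_name=acq_era,
                                     primary_ds_name=primary_ds,
                                     detail=True, dataset_access_type='*')
    pattern = acq_era + '-' + proc_string + r'-v\d+'
    versions = [int(ds['processing_version']) for ds in candidates
                if re.match(pattern, ds['processed_ds_name'])]
    return max(versions) if versions else 0

Inside the Task loop this would be called as highest_existing_version(value['AcquisitionEra'], value['PrimaryDataset'], value['ProcessingString'], dbs3_url), and the result compared against procversion before assigning.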
#fname="delete_this_1000.txt" #relval files
fname="delete_this_999.txt" #other files

inputfile = open(fname,'r')

dbsApi = DbsApi(url = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader')

dsets_fraction_lost = {}

access_type = "VALID"

for line in inputfile:
    lfn= line.rstrip('\n')
    #print lfn

    matching_datasets = dbsApi.listDatasets(logical_file_name = lfn, dataset_access_type = access_type)
    if matching_datasets == []:
        print lfn
        continue

    #print dbsApi.listDatasets(logical_file_name = "/store/mc/RunIIWinter15GS/WToENu_M-500_TuneCUETP8M1_13TeV-pythia8/GEN-SIM/MCRUN2_71_V1-v3/00000/4AF070C6-DFE5-E411-A0D6-0025905B8572.root")

    dset_name = matching_datasets[0]['dataset']

    status = dbsApi.listDatasets(dataset = dset_name, detail=1, dataset_access_type = access_type)[0]['dataset_access_type']

    if status != "VALID" and status != "PRODUCTION":
        print "found dataset that does not have VALID or PRODUCTION status, exiting"
        sys.exit(1)

    if dset_name not in dsets_fraction_lost:
        dsets_fraction_lost[dset_name] = {
            'n_total_events': dbsApi.listFileSummaries(dataset = dset_name)[0]['num_event'],
            'n_events_lost': dbsApi.listFiles(logical_file_name = lfn, detail = 1)[0]['event_count'],
        }
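    else:
        # sketch of a plausible continuation -- the original snippet ends just above;
        # accumulate the lost events for datasets that were already seen
        dsets_fraction_lost[dset_name]['n_events_lost'] += dbsApi.listFiles(logical_file_name = lfn, detail = 1)[0]['event_count']

# print the fraction of events lost per dataset (also part of the sketch, not the original)
for dset_name, counts in dsets_fraction_lost.items():
    lost = counts['n_events_lost']
    total = counts['n_total_events']
    fraction = float(lost) / total if total else 0.
    print "%s : %d of %d events lost (%.1f%%)" % (dset_name, lost, total, 100. * fraction)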
Exemple #29
0
def main():
    url = 'cmsweb.cern.ch'
    ### Example: python assignWorkflow.py -w amaltaro_RVZTT_120404_163607_6269 -t testbed-relval -s T1_US_FNAL -e CMSSW_6_0_0_pre1_FS_TEST_WMA -p v1 -a relval -l /store/backfill/1
    parser = optparse.OptionParser()
    parser.add_option('-w',
                      '--workflow',
                      help='Workflow Name',
                      dest='workflow')
    parser.add_option('-t', '--team', help='Type of Requests', dest='team')
    parser.add_option('-s', '--site', help='Site', dest='site')
    parser.add_option('-p',
                      '--procversion',
                      help='Processing Version',
                      dest='procversion')
    parser.add_option('-a',
                      '--activity',
                      help='Dashboard Activity',
                      dest='activity')
    parser.add_option('-l', '--lfn', help='Merged LFN base', dest='lfn')

    parser.add_option(
        '--special',
        help=
        'Use it for special workflows. You also have to change the code according to the type of WF',
        dest='special')
    parser.add_option(
        '--test',
        action="store_true",
        help=
        'Nothing is injected, only print information about workflow and AcqEra',
        dest='test')
    parser.add_option('--pu',
                      action="store_true",
                      help='Use it to inject PileUp workflows only',
                      dest='pu')
    (options, args) = parser.parse_args()

    if not options.workflow:
        print "The workflow name is mandatory!"
        print "Usage: python assignProdTaskChain.py -w <requestName>"
        sys.exit(0)

    workflow = options.workflow
    team = 'production'
    site = [
        "T1_DE_KIT",
        "T1_ES_PIC",
        "T1_FR_CCIN2P3",
        "T1_IT_CNAF",
        "T1_RU_JINR",
        "T1_UK_RAL",
        "T1_US_FNAL",
        #"T2_CH_CSCS",
        "T2_CH_CERN",
        #"T2_CN_Beijing",
        "T2_DE_DESY",
        "T2_DE_RWTH",
        "T2_ES_CIEMAT",
        #"T2_FR_CCIN2P3",
        "T2_FR_IPHC",
        "T2_IT_Bari",
        "T2_IT_Legnaro",
        "T2_IT_Pisa",
        "T2_IT_Rome",
        #"T2_UA_KIPT",
        "T2_UK_London_Brunel",
        #"T2_UK_London_IC",
        "T2_US_Caltech",
        "T2_US_Florida",
        "T2_US_MIT",
        "T2_US_Nebraska",
        "T2_US_Purdue",
        "T2_US_UCSD",
        "T2_US_Wisconsin"
    ]
    procversion = 1
    activity = 'production'
    lfn = '/store/mc'
    acqera = {}
    procstring = {}
    specialStr = ''

    ### Getting the original dictionary
    schema = getRequestDict(url, workflow)

    # Setting the AcqEra and ProcStr values per Task
    for key, value in schema.items():
        if type(value) is dict and key.startswith("Task"):
            try:
                procstring[
                    value['TaskName']] = value['ProcessingString'].replace(
                        "-", "_")
                acqera[value['TaskName']] = value['AcquisitionEra']
            except KeyError:
                print "This request has no AcquisitionEra or ProcessingString defined into the Tasks, aborting..."
                sys.exit(1)

    # Adding the special string - in case it was provided in the command line
    if options.special:
        #specialStr = '_03Jan2013'
        specialStr = '_' + str(options.special)
        for key, value in procstring.items():
            procstring[key] = value + specialStr

    # Handling the parameters given in the command line
    if options.team:
        team = options.team
    if options.site:
        site = options.site
    if options.procversion:
        procversion = int(options.procversion)
    if options.activity:
        activity = options.activity
    if options.lfn:
        lfn = options.lfn

    #TODO check output dataset existence, and abort if they already do!
    datasets = schema["OutputDatasets"]
    i = 0
    if 'ACDC' not in options.workflow:
        exist = False
        maxv = 1
        for key, value in schema.items():
            if type(value) is dict and key.startswith("Task"):
                dbsapi = DbsApi(url=dbs3_url)

                #list all datasets with same name but different version numbers
                datasets = dbsapi.listDatasets(
                    acquisition_era_name=value['AcquisitionEra'],
                    primary_ds_name=value['PrimaryDataset'],
                    detail=True,
                    dataset_access_type='*')
                processedName = value['AcquisitionEra'] + '-' + value[
                    'ProcessingString'] + "-v\\d+"
                #see if any of the dataset names is a match
                for ds in datasets:
                    if re.match(processedName, ds['processed_ds_name']):
                        print "Existing dset:", ds[
                            'dataset'], "(%s)" % ds['dataset_access_type']
                        maxv = max(maxv, ds['processing_version'])
                        exist = True
                    else:
                        pass
                i += 1
        #suggest max version
        if exist and procversion <= maxv:
            print "Some output datasets exist, its advised to assign with v ==", maxv + 1
            sys.exit(0)

    # If the --test argument was provided, then just print the information gathered so far and abort the assignment
    if options.test:
        print "%s \tAcqEra: %s \tProcStr: %s \tProcVer: %s" % (
            workflow, acqera, procstring, procversion)
        #print workflow, '\tAcqEra:', acqera, '\tProcStr:', procstring, '\tProcVer:', procversion
        print "LFN: %s \tTeam: %s \tSite: %s" % (lfn, team, site)
        #print '\tTeam:',team,  '\tSite:', site
        sys.exit(0)

    # Really assigning the workflow now
    print workflow, '\tAcqEra:', acqera, '\tProcStr:', procstring, '\tProcVer:', procversion, '\tTeam:', team, '\tSite:', site
    assignRequest(url, workflow, team, site, acqera, procstring, procversion,
                  activity, lfn)
    sys.exit(0)
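The assignment scripts above call a getRequestDict(url, workflow) helper that is not shown in these examples. Below is a minimal sketch of what such a helper might look like; the ReqMgr2 endpoint (/reqmgr2/data/request?name=...), the proxy-based authentication and the shape of the JSON reply are all assumptions, not taken from the original code:

import json
import os
import httplib

def getRequestDict(url, workflow):
    """Sketch: fetch the request schema of a single workflow from ReqMgr2."""
    proxy = os.getenv('X509_USER_PROXY')
    conn = httplib.HTTPSConnection(url, cert_file=proxy, key_file=proxy)
    conn.request("GET", "/reqmgr2/data/request?name=" + workflow,
                 headers={"Accept": "application/json"})
    data = json.loads(conn.getresponse().read())
    conn.close()
    # assumed reply format: {"result": [{<workflow name>: <schema dict>}]}
    return data['result'][0][workflow]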
Exemple #30
0
def main():
    url = 'cmsweb.cern.ch'
    url_tb = 'cmsweb-testbed.cern.ch'
    
    # Example: python assign.py -w amaltaro_RVZTT_120404_163607_6269
    # -t testbed-relval -s T1_US_FNAL -e CMSSW_6_0_0_pre1_FS_TEST_WMA -p v1 -a
    # relval -l /store/backfill/1
    usage = "usage: %prog [options] [WORKFLOW]"
    
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-t', '--team', help='Type of Requests', dest='team', default='production')
    parser.add_option('-s', '--sites', help=' "t1" for Tier-1\'s and "t2" for Tier-2\'s', dest='sites')
    parser.add_option('--special',  help='Use it for special workflows. You also have to change the code according to the type of WF', dest='special')
    parser.add_option('-r', '--replica', action='store_true', dest='replica', default=False, help='Adds a _Disk Non-Custodial Replica parameter')
    parser.add_option('-p', '--procversion', help='Processing Version, if empty it will leave the processing version that comes by default in the request', dest='procversion')
    parser.add_option('-a', '--activity', help='Dashboard Activity (reprocessing, production or test), if empty will set reprocessing as default', dest='activity')
    parser.add_option( '--xrootd', help='Assign with TrustSitelists=True (allows xrootd capabilities)',
                      action='store_true', dest='xrootd')
    parser.add_option('--no_xrootd', help='Assign with TrustSitelists=False',
                      action='store_false', dest='xrootd')
    parser.add_option('--secondary_xrootd', help='Assign with TrustPUSitelists=True (allows xrootd capabilities)',
                      action='store_true', dest='secondary_xrootd')
    parser.add_option('--no_secondary_xrootd', help='Assign with TrustPUSitelists=False',
                      action='store_false', dest='secondary_xrootd')
    parser.add_option('-l', '--lfn', help='Merged LFN base', dest='lfn')
    parser.add_option('-v', '--verbose', help='Verbose', action='store_true', default=False, dest='verbose')
    parser.add_option('--testbed', help='Assign in testbed', action='store_true', default=False, dest='testbed')
    parser.add_option('--test', action="store_true",help='Nothing is injected, only print infomation about workflow and Era', dest='test')
    parser.add_option('-f', '--file', help='Text file with a list of wokflows. If this option is used, the same settings will be applied to all workflows', dest='file')
    parser.add_option('-w', '--workflow', help='Workflow Name, or coma sperated list', dest='workflow')
    parser.add_option('-m', '--memory', help='Set the Memory parameter to the workflow', dest='memory', default=None)
    parser.add_option('--lumisperjob',help='Set the number of lumis per job', default=None, type=int)
    parser.add_option('--maxmergeevents',help='Set the number of event to merge at max', default=None, type=int)
    parser.add_option('-c', '--multicore', help='Set the multicore parameter to the workflow', dest='multicore', default=None)
    parser.add_option('-e', '--era', help='Acquisition era', dest='era')
    parser.add_option("--procstr", dest="procstring", help="Overrides Processing String with a single string")
    parser.add_option('--checksite', default=False,action='store_true')
    (options, args) = parser.parse_args()
    
    if options.testbed:
        url = url_tb

    # parse input workflows and files. If both -w and -f options are used, then only the -w inputs are considered.
    if not options.workflow:
        if args:
            wfs = args
        elif options.file:
            wfs = [l.strip() for l in open(options.file) if l.strip()]
        else:
            parser.error("Input a workflow name or a file to read them")
            sys.exit(0)
    else:
        wfs = options.workflow.split(',')

    #Default values
    era = {}
    procversion = 1
    procstring = {}
    memory = None
    multicore = None
    replica = False
    sites = []
    specialStr = ''
    taskchain = False
    xrootd= False
    secondary_xrootd= False

    SI = siteInfo()
    getRandomDiskSite.T1 = SI.sites_T1s
    # Handling the parameters given in the command line
    # parse site list
    if options.sites:
        if options.sites.lower() == "t1":
            sites = SI.sites_T1s
        elif options.sites.lower() == "t2":
            sites = SI.sites_T2s
        elif options.sites.lower() in ["all","t1+t2","t2+t1"] :
            sites = SI.sites_T2s+SI.sites_T1s
        elif options.sites.lower() == "mcore":
            sites = SI.sites_mcore_ready
        elif hasattr(SI,options.sites):
            sites = getattr(SI,options.sites)
        #elif options.sites.lower() == 'acdc':
        #    sites = []
        else: 
            sites = [site for site in options.sites.split(',')]
    else: 
        sites = SI.sites_T1s + SI.sites_T2s

    if options.replica:
        replica = True

    for wfn in wfs:
        # Getting the original dictionary
        wfi = workflowInfo( url, wfn )
        schema = wfi.request
        if 'OriginalRequestName' in schema:
            print "Original workflow is:",schema['OriginalRequestName']
            original_wf = workflowInfo(url, schema['OriginalRequestName'])            
            ancestor_wf = workflowInfo(url, schema['OriginalRequestName'])
            ## go back up the chain as far as possible
            while ancestor_wf.request['RequestType'] == 'Resubmission':
                if 'OriginalRequestName' not in ancestor_wf.request:
                    ancestor_wf = None
                    break
                ancestor_wf = workflowInfo(url, ancestor_wf.request['OriginalRequestName'])
        else:
            original_wf = None
            ancestor_wf = None

        is_resubmission = (schema['RequestType'] == 'Resubmission')

        if options.sites and options.sites.lower() == 'original' and original_wf:
            sites = original_wf.request['SiteWhitelist']
            print "Using",sorted(sites),"from the original request",original_wf.request['RequestName']

        #print json.dumps( schema, indent=2 )
        wf_name = wfn
        wf_info = schema

        # WF must be in assignment-approved in order to be assigned
        if (schema["RequestStatus"] != "assignment-approved") and not options.test:
            print("The workflow '" + wf_name + "' you are trying to assign is not in assignment-approved")
            sys.exit(1)

        #Check to see if the workflow is a task chain or an ACDC of a taskchain
        taskchain = (schema["RequestType"] == "TaskChain") or (ancestor_wf and ancestor_wf.request["RequestType"] == "TaskChain")

        # Adding the special string - in case it was provided in the command line
        if options.special:
            specialStr = '_' + str(options.special)
            for key, value in procstring.items():
                procstring[key] = value + specialStr

        # Override if a value is given using the procstring command
        if options.procstring:
            procstring = options.procstring
        elif is_resubmission:
            procstring = ancestor_wf.processingString()
        else:
            procstring = wfi.processingString()

        if options.era:
            era = options.era
        elif is_resubmission:
            era = ancestor_wf.acquisitionEra()
        else:
            era = wfi.acquisitionEra()
        #Dealing with era and proc string
        if (not era or not procstring) or (taskchain and (type(era)!=dict or type(procstring)!=dict)):
            print "We do not have a valid AcquisitionEra and ProcessingString"
            sys.exit(1)

        # Must use --lfn option, otherwise workflow won't be assigned
        if options.lfn:
            lfn = options.lfn
        elif "MergedLFNBase" in wf_info:
            lfn = wf_info['MergedLFNBase']
        elif ancestor_wf and "MergedLFNBase" in ancestor_wf.request:
            lfn = ancestor_wf.request['MergedLFNBase']
        else:
            print "Can't assign the workflow! Please include workflow lfn using --lfn option."
            sys.exit(0)
        # activity is 'production' by default for taskchains, 'reprocessing' by default for other workflows
        if options.activity:
            activity = options.activity
        elif taskchain:
            activity = 'production'
        else:
            activity = 'reprocessing'

        if options.memory:
            memory = options.memory

        if options.multicore:
            multicore = options.multicore

        # given or default processing version
        if options.procversion:
            procversion = int(options.procversion)
        else:
            if is_resubmission:
                procversion = ancestor_wf.request['ProcessingVersion']
            else:
                procversion = wf_info["ProcessingVersion"]

        # reading xrootd and secondary_xrootd values
        if options.xrootd is not None:
            xrootd = options.xrootd
        elif original_wf:
            xrootd= original_wf.request["TrustSitelists"]

        if options.secondary_xrootd is not None:
            secondary_xrootd = options.secondary_xrootd
        elif original_wf:
            secondary_xrootd= original_wf.request["TrustPUSitelists"]

        # Check for output dataset existence, and abort if output datasets already exist!
        # Don't perform this check for ACDC's
        datasets = schema["OutputDatasets"]
        i = 0
        if not is_resubmission:
            exist = False
            maxv = 1
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    dbsapi = DbsApi(url=dbs3_url)
                    
                    # list all datasets with same name but different version
                    # numbers
                    datasets = dbsapi.listDatasets(acquisition_era_name=value['AcquisitionEra'], primary_ds_name=value['PrimaryDataset'], detail=True, dataset_access_type='*')
                    processedName = value['AcquisitionEra'] + '-' + value['ProcessingString'] + "-v\\d+"
                    # see if any of the dataset names is a match
                    for ds in datasets:
                        if re.match(processedName, ds['processed_ds_name']):
                            print "Existing dset:", ds['dataset'], "(%s)" % ds['dataset_access_type']
                            maxv = max(maxv, ds['processing_version'])
                            exist = True
                        else:
                             pass
                    i += 1
            # suggest max version
            if exist and procversion <= maxv:
                print "Some output datasets exist, its advised to assign with v ==", maxv + 1
                sys.exit(0)
        else:
            ## this is a resubmission !
            print "The taks in resubmission is:",schema['InitialTaskPath']
            ## pick up the sites from acdc
            if options.sites and options.sites.lower() == 'acdc':
                where_to_run, _,_ =  original_wf.getRecoveryInfo()
                task = schema['InitialTaskPath']
                sites = list(set([SI.SE_to_CE(site) for site in where_to_run[task]]) & set(SI.all_sites))
                print "Found",sorted(sites),"as sites where to run the ACDC at, from the acdc doc of ",original_wf.request['RequestName']

        if options.checksite:
            ## check that the sites are all compatible and up
            check_mem = schema['Memory'] if not memory else memory
            ncores = wfi.getMulticore() if not multicore else multicore
            memory_allowed = SI.sitesByMemory( float(check_mem), maxCore=ncores)
            not_ready = sorted(set(sites) & set(SI.sites_not_ready))
            not_existing = sorted(set(sites) - set(SI.all_sites))
            not_matching = sorted((set(sites) - set(memory_allowed) - set(not_ready) - set(not_existing)))
            previously_used = []
            if schema['SiteWhitelist']: previously_used = schema['SiteWhitelist']
            if original_wf: previously_used = original_wf.request['SiteWhitelist']
            if previously_used: not_matching = sorted(set(not_matching) & set(previously_used))
            
            sites = sorted( set(sites) - set(not_matching) - set(not_existing))
            
            print sorted(memory_allowed),"to allow",check_mem,ncores
            if not_ready:
                print not_ready,"is/are not ready"
                sys.exit(0)
            if not_matching:
                print "The memory requirement",check_mem,"is too much for",not_matching
                sys.exit(0)


        ## need to play with memory setting
        if taskchain:
            if memory:
                ## transform into a dictionary
                increase = set_to = None
                tasks,set_to = memory.split(':') if ':' in memory else ("",memory)
                tasks = tasks.split(',') if tasks else []
                if set_to.startswith('+'):
                    increase = int(set_to[1:])
                else:
                    set_to = int(set_to)
                it = 1
                memory_dict = {}
                while True:
                    t = 'Task%d'%it
                    it += 1
                    if t in schema:
                        tname = schema[t]['TaskName']
                        if tasks and not tname in tasks:
                            print tname,"not concerned"
                            memory_dict[tname] = schema[t]['Memory']
                            continue
                        if set_to:
                            memory_dict[tname] = set_to
                        else:
                            memory_dict[tname] =schema[t]['Memory'] + increase
                    else:
                        break
                memory = memory_dict
                print memory_dict
            ## need to play with multicore setting
            if multicore:
                tasks,set_to = multicore.split(':') if ':' in multicore else ("",multicore)
                tasks = tasks.split(',') if tasks else []
                set_to = int(set_to)
                multicore_dict = {}
                timeperevent_dict = {}
                it=1
                while True:
                    t = 'Task%d'%it
                    it += 1
                    if t in schema:
                        tname = schema[t]['TaskName']
                        mcore = schema[t]['Multicore']
                        if tasks and not tname in tasks:
                            print tname,"not concerned"
                            multicore_dict[tname] = schema[t]['Multicore']
                            timeperevent_dict[tname] = schema[t]['TimePerEvent']
                            continue
                        if memory:
                            mem = memory[tname]
                            print mem, memory
                            factor = (set_to / float(mcore))
                            fraction_constant = 0.4
                            mem_per_core_c = int((1-fraction_constant) * mem / float(mcore))
                            print "mem per core", mem_per_core_c
                            print "base mem", mem
                            
                            memory[tname] = mem + (set_to-mcore)*mem_per_core_c
                            print "final mem",memory[tname]
                            timeperevent_dict[tname] = schema[t]['TimePerEvent']/factor
                        print "setting mcore",set_to
                        multicore_dict[tname] = set_to
                    else:
                        break
                multicore = multicore_dict
                print multicore
                print timeperevent_dict,"cannot be used yet."
        # If the --test argument was provided, then just print the information
        # gathered so far and abort the assignment
        print wf_name
        print "Era:",era
        print "ProcStr:",procstring
        print "ProcVer:",procversion
        print "LFN:",lfn
        print "Team:",options.team
        print "Site:",sites
        print "Taskchain? ", str(taskchain)
        print "Activity:", activity
        print "ACDC:", str(is_resubmission)
        print "Xrootd:", str(xrootd)
        print "Secondary_xrootd:", str(secondary_xrootd)
        #if options.test:            continue
        
        # Really assigning the workflow now
        #print wf_name, '\tEra:', era, '\tProcStr:', procstring, '\tProcVer:', procversion, '\tTeam:', team, '\tSite:', sites
        assignRequest(url, 
                      workflow = wf_name,
                      team = options.team,
                      sites = sites,
                      era = era, 
                      procversion = procversion,
                      activity = activity,
                      lfn = lfn,
                      procstring = procstring, 
                      trust_site = xrootd, 
                      replica = options.replica, 
                      verbose = options.test, 
                      taskchain = taskchain, 
                      trust_secondary_site = secondary_xrootd,
                      memory=memory,
                      multicore=multicore,
                      lumisperjob = options.lumisperjob,
                      maxmergeevents = options.maxmergeevents
                      )
    
    sys.exit(0)
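The multicore block above rescales the per-task memory by treating a fixed fraction of the footprint (fraction_constant = 0.4) as core-independent and scaling the rest linearly with the core count, while TimePerEvent is divided by the core ratio. A standalone sketch of that arithmetic, with illustrative example numbers:

def scale_memory_and_time(mem, time_per_event, old_cores, new_cores, fraction_constant=0.4):
    # only (1 - fraction_constant) of the memory scales with the core count
    mem_per_core = int((1 - fraction_constant) * mem / float(old_cores))
    new_mem = mem + (new_cores - old_cores) * mem_per_core
    new_tpe = time_per_event / (new_cores / float(old_cores))
    return new_mem, new_tpe

print scale_memory_and_time(2394, 40, 1, 4)   # -> (6702, 10.0)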
Exemple #31
0
def getDatasets(dataset_pattern):
    "Return list of dataset for given dataset pattern"
    dbsapi = DbsApi(url=DBS3, verifypeer=False)
    reply = dbsapi.listDatasets(dataset=dataset_pattern, dataset_access_type='*')
    return reply
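A short usage example for getDatasets above; DBS3 is assumed to point at a DBSReader URL, and the wildcard pattern is only illustrative:

# list every NANOAODSIM dataset of a given primary dataset, whatever its status
for ds in getDatasets('/DYJetsToLL_M-50*/*/NANOAODSIM'):
    print(ds['dataset'])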
Exemple #32
0
def main():
    url = 'cmsweb.cern.ch'
    url_tb = 'cmsweb-testbed.cern.ch'
    
    # Example: python assign.py -w amaltaro_RVZTT_120404_163607_6269
    # -t testbed-relval -s T1_US_FNAL -e CMSSW_6_0_0_pre1_FS_TEST_WMA -p v1 -a
    # relval -l /store/backfill/1
    usage = "usage: %prog [options] [WORKFLOW]"
    
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-t', '--team', help='Type of Requests', dest='team', default='production')
    parser.add_option('-s', '--sites', help=' "t1" for Tier-1\'s and "t2" for Tier-2\'s', dest='sites')
    parser.add_option('--special',  help='Use it for special workflows. You also have to change the code according to the type of WF', dest='special')
    parser.add_option('-r', '--replica', action='store_true', dest='replica', default=False, help='Adds a _Disk Non-Custodial Replica parameter')
    parser.add_option('-p', '--procversion', help='Processing Version, if empty it will leave the processing version that comes by default in the request', dest='procversion')
    parser.add_option('-a', '--activity', help='Dashboard Activity (reprocessing, production or test), if empty will set reprocessing as default', dest='activity')
    parser.add_option( '--xrootd', help='Assign with TrustSitelists=True (allows xrootd capabilities)',
                      action='store_true', dest='xrootd')
    parser.add_option('--no_xrootd', help='Assign with TrustSitelists=False',
                      action='store_false', dest='xrootd')
    parser.add_option('--secondary_xrootd', help='Assign with TrustPUSitelists=True (allows xrootd capabilities)',
                      action='store_true', dest='secondary_xrootd')
    parser.add_option('--no_secondary_xrootd', help='Assign with TrustPUSitelists=False',
                      action='store_false', dest='secondary_xrootd')
    parser.add_option('-l', '--lfn', help='Merged LFN base', dest='lfn')
    parser.add_option('-v', '--verbose', help='Verbose', action='store_true', default=False, dest='verbose')
    parser.add_option('--testbed', help='Assign in testbed', action='store_true', default=False, dest='testbed')
    parser.add_option('--test', action="store_true",help='Nothing is injected, only print infomation about workflow and Era', dest='test')
    parser.add_option('-f', '--file', help='Text file with a list of wokflows. If this option is used, the same settings will be applied to all workflows', dest='file')
    parser.add_option('-w', '--workflow', help='Workflow Name, or coma sperated list', dest='workflow')
    parser.add_option('-m', '--memory', help='Set the Memory parameter to the workflow', dest='memory', default=None)
    parser.add_option('--lumisperjob',help='Set the number of lumis per job', default=None, type=int)
    parser.add_option('--maxmergeevents',help='Set the number of event to merge at max', default=None, type=int)
    parser.add_option('-c', '--multicore', help='Set the multicore parameter to the workflow', dest='multicore', default=None)
    parser.add_option('-e', '--era', help='Acquisition era', dest='era')
    parser.add_option("--procstr", dest="procstring", help="Overrides Processing String with a single string")
    parser.add_option('--checksite', default=False,action='store_true')
    (options, args) = parser.parse_args()
    
    if options.testbed:
        url = url_tb

    # parse input workflows and files. If both -w and -f options are used, then only the -w inputs are considered.
    if not options.workflow:
        if args:
            wfs = args
        elif options.file:
            wfs = [l.strip() for l in open(options.file) if l.strip()]
        else:
            parser.error("Input a workflow name or a file to read them")
            sys.exit(0)
    else:
        wfs = options.workflow.split(',')

    #Default values
    era = {}
    procversion = 1
    procstring = {}
    memory = None
    multicore = None
    replica = False
    sites = []
    specialStr = ''
    taskchain = False
    xrootd= False
    secondary_xrootd= False

    SI = siteInfo()
    getRandomDiskSite.T1 = SI.sites_T1s
    # Handling the parameters given in the command line
    # parse site list
    if options.sites:
        if options.sites.lower() == "t1":
            sites = SI.sites_T1s
        elif options.sites.lower() == "t2":
            sites = SI.sites_T2s
        elif options.sites.lower() in ["all","t1+t2","t2+t1"] :
            sites = SI.sites_T2s+SI.sites_T1s
        elif options.sites.lower() == "mcore":
            sites = SI.sites_mcore_ready
        elif hasattr(SI,options.sites):
            sites = getattr(SI,options.sites)
        #elif options.sites.lower() == 'acdc':
        #    sites = []
        else: 
            sites = [site for site in options.sites.split(',')]
    else: 
        sites = SI.sites_T1s + SI.sites_T2s

    if options.replica:
        replica = True

    for wfn in wfs:
        # Getting the original dictionary
        wfi = workflowInfo( url, wfn )
        schema = wfi.request
        if 'OriginalRequestName' in schema:
            print "Original workflow is:",schema['OriginalRequestName']
            original_wf = workflowInfo(url, schema['OriginalRequestName'])            
            ancestor_wf = workflowInfo(url, schema['OriginalRequestName'])
            ## go back up the chain as far as possible
            while ancestor_wf.request['RequestType'] == 'Resubmission':
                if 'OriginalRequestName' not in ancestor_wf.request:
                    ancestor_wf = None
                    break
                ancestor_wf = workflowInfo(url, ancestor_wf.request['OriginalRequestName'])
        else:
            original_wf = None
            ancestor_wf = None

        is_resubmission = (schema['RequestType'] == 'Resubmission')

        if options.sites and options.sites.lower() == 'original' and original_wf:
            sites = original_wf.request['SiteWhitelist']
            print "Using",sorted(sites),"from the original request",original_wf.request['RequestName']

        #print json.dumps( schema, indent=2 )
        wf_name = wfn
        wf_info = schema

        # WF must be in assignment-approved in order to be assigned
        if (schema["RequestStatus"] != "assignment-approved") and not options.test:
            print("The workflow '" + wf_name + "' you are trying to assign is not in assignment-approved")
            sys.exit(1)

        #Check to see if the workflow is a task chain or an ACDC of a taskchain
        taskchain = (schema["RequestType"] == "TaskChain") or (ancestor_wf and ancestor_wf.request["RequestType"] == "TaskChain")

        # Adding the special string - in case it was provided in the command line
        if options.special:
            specialStr = '_' + str(options.special)
            for key, value in procstring.items():
                procstring[key] = value + specialStr

        # Override if a value is given using the procstring command
        if options.procstring:
            procstring = options.procstring
        elif is_resubmission:
            procstring = ancestor_wf.processingString()
        else:
            procstring = wfi.processingString()

        if options.era:
            era = options.era
        elif is_resubmission:
            era = ancestor_wf.acquisitionEra()
        else:
            era = wfi.acquisitionEra()
        #Dealing with era and proc string
        if (not era or not procstring) or (taskchain and (type(era)!=dict or type(procstring)!=dict)):
            print "We do not have a valid AcquisitionEra and ProcessingString"
            sys.exit(1)

        # Must use --lfn option, otherwise workflow won't be assigned
        if options.lfn:
            lfn = options.lfn
        elif "MergedLFNBase" in wf_info:
            lfn = wf_info['MergedLFNBase']
        elif ancestor_wf and "MergedLFNBase" in ancestor_wf.request:
            lfn = ancestor_wf.request['MergedLFNBase']
        else:
            print "Can't assign the workflow! Please include workflow lfn using --lfn option."
            sys.exit(0)
        # activity is 'production' by default for taskchains, 'reprocessing' by default for other workflows
        if options.activity:
            activity = options.activity
        elif taskchain:
            activity = 'production'
        else:
            activity = 'reprocessing'

        if options.memory:
            memory = options.memory

        if options.multicore:
            multicore = options.multicore

        # given or default processing version
        if options.procversion:
            procversion = int(options.procversion)
        else:
            if is_resubmission:
                procversion = ancestor_wf.request['ProcessingVersion']
            else:
                procversion = wf_info["ProcessingVersion"]

        # reading xrootd and secondary_xrootd values
        if options.xrootd is not None:
            xrootd = options.xrootd
        elif original_wf:
            xrootd= original_wf.request["TrustSitelists"]

        if options.secondary_xrootd is not None:
            secondary_xrootd = options.secondary_xrootd
        elif original_wf:
            secondary_xrootd= original_wf.request["TrustPUSitelists"]

        # Check for output dataset existence, and abort if output datasets already exist!
        # Don't perform this check for ACDC's
        datasets = schema["OutputDatasets"]
        i = 0
        if not is_resubmission:
            exist = False
            maxv = 1
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    dbsapi = DbsApi(url=dbs3_url)
                    
                    # list all datasets with same name but different version
                    # numbers
                    datasets = dbsapi.listDatasets(acquisition_era_name=value['AcquisitionEra'], primary_ds_name=value['PrimaryDataset'], detail=True, dataset_access_type='*')
                    processedName = value['AcquisitionEra'] + '-' + value['ProcessingString'] + "-v\\d+"
                    # see if any of the dataset names is a match
                    for ds in datasets:
                        if re.match(processedName, ds['processed_ds_name']):
                            print "Existing dset:", ds['dataset'], "(%s)" % ds['dataset_access_type']
                            maxv = max(maxv, ds['processing_version'])
                            exist = True
                        else:
                             pass
                    i += 1
            # suggest max version
            if exist and procversion <= maxv:
                print "Some output datasets exist, its advised to assign with v ==", maxv + 1
                sys.exit(0)
        else:
            ## this is a resubmission !
            print "The taks in resubmission is:",schema['InitialTaskPath']
            ## pick up the sites from acdc
            if options.sites and options.sites.lower() == 'acdc':
                where_to_run, _,_ =  original_wf.getRecoveryInfo()
                task = schema['InitialTaskPath']
                sites = list(set([SI.SE_to_CE(site) for site in where_to_run[task]]) & set(SI.all_sites))
                print "Found",sorted(sites),"as sites where to run the ACDC at, from the acdc doc of ",original_wf.request['RequestName']

        if options.checksite:
            ## check that the sites are all compatible and up
            check_mem = schema['Memory'] if not memory else memory
            ncores = wfi.getMulticore() if not multicore else multicore
            memory_allowed = SI.sitesByMemory( float(check_mem), maxCore=ncores)
            not_ready = sorted(set(sites) & set(SI.sites_not_ready))
            not_existing = sorted(set(sites) - set(SI.all_sites))
            not_matching = sorted((set(sites) - set(memory_allowed) - set(not_ready) - set(not_existing)))
            previously_used = []
            if schema['SiteWhitelist']: previously_used = schema['SiteWhitelist']
            if original_wf: previously_used = original_wf.request['SiteWhitelist']
            if previously_used: not_matching = sorted(set(not_matching) & set(previously_used))
            
            sites = sorted( set(sites) - set(not_matching) - set(not_existing))
            
            print sorted(memory_allowed),"to allow",check_mem,ncores
            if not_ready:
                print not_ready,"is/are not ready"
                sys.exit(0)
            if not_matching:
                print "The memory requirement",check_mem,"is too much for",not_matching
                sys.exit(0)


        ## need to play with memory setting
        if taskchain:
            if memory:
                ## transform into a dictionary
                increase = set_to = None
                tasks,set_to = memory.split(':') if ':' in memory else ("",memory)
                tasks = tasks.split(',') if tasks else []
                if set_to.startswith('+'):
                    increase = int(set_to[1:])
                else:
                    set_to = int(set_to)
                it = 1
                memory_dict = {}
                while True:
                    t = 'Task%d'%it
                    it += 1
                    if t in schema:
                        tname = schema[t]['TaskName']
                        if tasks and not tname in tasks:
                            print tname,"not concerned"
                            memory_dict[tname] = schema[t]['Memory']
                            continue
                        if set_to:
                            memory_dict[tname] = set_to
                        else:
                            memory_dict[tname] =schema[t]['Memory'] + increase
                    else:
                        break
                memory = memory_dict
                print memory_dict
            ## need to play with multicore setting
            if multicore:
                tasks,set_to = multicore.split(':') if ':' in multicore else ("",multicore)
                tasks = tasks.split(',') if tasks else []
                set_to = int(set_to)
                multicore_dict = {}
                timeperevent_dict = {}
                it=1
                while True:
                    t = 'Task%d'%it
                    it += 1
                    if t in schema:
                        tname = schema[t]['TaskName']
                        mcore = schema[t]['Multicore']
                        if tasks and not tname in tasks:
                            print tname,"not concerned"
                            multicore_dict[tname] = schema[t]['Multicore']
                            timeperevent_dict[tname] = schema[t]['TimePerEvent']
                            continue
                        mem = memory[tname]
                        factor = (set_to / float(mcore))
                        fraction_constant = 0.4
                        mem_per_core_c = int((1-fraction_constant) * mem / float(mcore))
                        print "mem per core", mem_per_core_c
                        print "base mem", mem
                        ## need to adjust the memory at the same time
                        ## will crash if --mem was not set as an argument :FINE
                        memory[tname] = mem + (set_to-mcore)*mem_per_core_c
                        print "final mem",memory[tname]
                        timeperevent_dict[tname] = schema[t]['TimePerEvent']/factor
                        print "setting mcore",set_to
                        multicore_dict[tname] = set_to
                    else:
                        break
                multicore = multicore_dict
                print multicore
                print timeperevent_dict,"cannot be used yet."
        # If the --test argument was provided, then just print the information
        # gathered so far and abort the assignment
        print wf_name
        print "Era:",era
        print "ProcStr:",procstring
        print "ProcVer:",procversion
        print "LFN:",lfn
        print "Team:",options.team
        print "Site:",sites
        print "Taskchain? ", str(taskchain)
        print "Activity:", activity
        print "ACDC:", str(is_resubmission)
        print "Xrootd:", str(xrootd)
        print "Secondary_xrootd:", str(secondary_xrootd)
        #if options.test:            continue
        
        # Really assigning the workflow now
        #print wf_name, '\tEra:', era, '\tProcStr:', procstring, '\tProcVer:', procversion, '\tTeam:', team, '\tSite:', sites
        assignRequest(url, 
                      workflow = wf_name,
                      team = options.team,
                      sites = sites,
                      era = era, 
                      procversion = procversion,
                      activity = activity,
                      lfn = lfn,
                      procstring = procstring, 
                      trust_site = xrootd, 
                      replica = options.replica, 
                      verbose = options.test, 
                      taskchain = taskchain, 
                      trust_secondary_site = secondary_xrootd,
                      memory=memory,
                      multicore=multicore,
                      lumisperjob = options.lumisperjob,
                      maxmergeevents = options.maxmergeevents
                      )
    
    sys.exit(0)
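As the comment in the multicore block above notes, the bare memory[tname] lookup crashes when --memory was not given on the command line. A hedged sketch of a defensive lookup that falls back to the Memory value already present in the task schema:

def task_memory(memory, tname, task_schema):
    # use the per-task override when --memory was given and covers this task,
    # otherwise fall back to the Memory already stored in the request schema
    if isinstance(memory, dict) and tname in memory:
        return memory[tname]
    return task_schema['Memory']

Inside the loop this would replace memory[tname] with task_memory(memory, tname, schema[t]).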
Exemple #33
0
class RequestQuery:

    def __init__(self,config):
        self.br=Browser()

        self.config = config
        
        # Initialise connections
        self.phedex = PhEDEx({"endpoint":"https://cmsweb.cern.ch/phedex/datasvc/json/prod/"}, "json")
        self.dbsPhys01 = DbsApi(url = dbs_base_url+"phys01/DBSReader/")
        self.dbsPhys02 = DbsApi(url = dbs_base_url+"phys02/DBSReader/")
        self.dbsPhys03 = DbsApi(url = dbs_base_url+"phys03/DBSReader/")
        
    def __del__(self):
        self.br.close()

    def getScramArchByCMSSW(self):
        """
        From the list of available CMSSW releases,
        return a dictionary of ScramArch values keyed by CMSSW release
        """
        
        # Set a temporary connection to the server and get the response from cmstags
        url = 'https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML'
        br = Browser()
        br.set_handle_robots(False)
        response=br.open(url)
        soup = BeautifulSoup(response.read())
        
        # Dictionary form
        # {'CMSSW_X_X_X':[slc5_amd64_gcc472], ... }
        archByCmssw={}
        
        # Fill the dictionary
        for arch in soup.find_all('architecture'): 
            for cmssw in arch.find_all('project'): 
                # CMSSW release
                cmsswLabel = cmssw.get('label').encode('ascii', 'ignore')
                if cmsswLabel not in archByCmssw:
                    archByCmssw[cmsswLabel]=[]
                # ScramArch related to this CMSSW release
                archName = arch.get('name').encode('ascii', 'ignore')
                archByCmssw[cmsswLabel].append(archName)
        
        return archByCmssw
      
    def getDatasetOriginSites(self, dbs_url, data):
        """
        Get the origin sites for each block of the dataset.
        Return the lists of site names and SE names hosting the dataset blocks.
        """
        
        sites=[]
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listBlocks(detail=True,dataset=data)
        
        seList = []
        for block in response:
            if block['origin_site_name'] not in seList:
                seList.append(block['origin_site_name'])
        
        siteNames = []
        for node in self.nodeMappings['phedex']['node']:
            if node['se'] in seList:
                siteNames.append(node['name']) 
        
        return siteNames, seList
    
    def phEDExNodetocmsName(self, nodeList):
        """
        Convert a list of PhEDEx node names to a list of CMS site names
        """
        names = []
        for node in nodeList:
            name = node.replace('_MSS',
                                '').replace('_Disk',
                                    '').replace('_Buffer',
                                        '').replace('_Export', '')
            if name not in names:
                names.append(name)
        return names
    
    def setGlobalTagFromOrigin(self, dbs_url,input_dataset):
        """
        Get the global tag of the dataset from the source dbs url. If it is not set, then set global tag to 'UNKNOWN'
        """
        
        globalTag = ""
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listOutputConfigs(dataset=input_dataset)
        
        globalTag = response[0]['global_tag']
        # GlobalTag cannot be empty
        if globalTag == '':
            globalTag = 'UNKNOWN'
            
        return globalTag
    
    def isDataAtUrl(self, dbs_url,input_dataset):
        """
        Returns True if the dataset is at the dbs url, False otherwise
        """
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listDatasets(dataset=input_dataset)
        # This means that the dataset is not at the url
        if not response:
            return False
        else:
            return True
         
    def getLabelByValueDict(self, control):
        """
        From control items, create a dictionary by values
        """   
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[value] = label
                
        return d
    
    def getValueByLabelDict(self, control):
        """
        From control items, create a dictionary by labels
        """
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[label] = value

        return d
    
    def createRequestJSON(self, ticket, input_dataset, dbs_url, cmssw_release, group_name, version = 1):
        """
        Creates a JSON file 'Ticket_#TICKET.json' with the needed
        information for creating a request on ReqMgr.
        Input:
            - ticket: the ticket #, for instance 110773 on https://ggus.eu/?mode=ticket_info&ticket_id=110773
            - input_dataset
            - dbs_url: only the instance name, for example "phys01" for
             https://cmsweb.cern.ch/dbs/prod/phys01/DBSReader
            - cmssw_release
            - group_name: the physics group name
            - version: the dataset version, 1 by default.
        It returns a dictionary that contains the request information.
        """

        scramArchByCMSSW = self.getScramArchByCMSSW()
        self.nodeMappings = self.phedex.getNodeMap()
        task = ticket
        print "Processing ticket: %s" % task
        
        #splitting input dataset       
        input_primary_dataset = input_dataset.split('/')[1].replace(' ','')
        input_processed_dataset = input_dataset.split('/')[2].replace(' ','')
        data_tier = input_dataset.split('/')[3].replace(' ','')
                
        # Transform input value to a valid DBS url
        #dbs_url = "https://cmsweb.cern.ch/dbs/prod/"+dbs_url+"/DBSReader"
        dbs_url = dbs_base_url+dbs_url+"/DBSReader"
        release_id = cmssw_release
                
        # check if deprecated release was used
        release = cmssw_release
        # check if release has not ScramArch match
        if release not in scramArchByCMSSW:
            raise Exception("Error on ticket %s due to ScramArch mismatch" % task)
        else:
            scram_arch = scramArchByCMSSW[release][-1]

        # check if dataset is not at dbs url
        try:
            data_at_url = self.isDataAtUrl(dbs_url,input_dataset)
        except:
            raise Exception('Error on ticket %s, dataset %s not available at %s' %(task, input_dataset,dbs_url))

        if not data_at_url:
            raise Exception('Error on ticket %s, dataset %s not available at %s' %(task, input_dataset,dbs_url))
                    
        ## Get Physics Group
        group_squad = 'cms-storeresults-'+group_name.replace("-","_").lower()

        ## Get Dataset Version
        dataset_version = str(version)

        # Set default Acquisition Era for StoreResults
        acquisitionEra = "StoreResults"

        ## Construction of the new dataset name (ProcessingString)
        ## remove leading hypernews or physics group name and StoreResults+Version
        if input_processed_dataset.find(group_name)==0:
            new_dataset = input_processed_dataset.replace(group_name,"",1)
        else:
            stripped_dataset = input_processed_dataset.split("-")[1:]
            new_dataset = '_'.join(stripped_dataset)
                        
        # Get dataset site info:
        phedex_map, se_names = self.getDatasetOriginSites(dbs_url,input_dataset)
        sites = self.phEDExNodetocmsName(phedex_map)
        
        infoDict = {}
        # Build store results json
        # First add all the defaults values
        infoDict["RequestType"] = "StoreResults"
        infoDict["UnmergedLFNBase"] = "/store/unmerged" 
        infoDict["MergedLFNBase"] = "/store/results/" + group_name.replace("-","_").lower()
        infoDict["MinMergeSize"] = 1500000000
        infoDict["MaxMergeSize"] = 5000000000
        infoDict["MaxMergeEvents"] = 100000
        infoDict["TimePerEvent"] = 40
        infoDict["SizePerEvent"] = 512.0
        infoDict["Memory"] = 2394
        infoDict["CmsPath"] = "/uscmst1/prod/sw/cms"                                        
        infoDict["Group"] = "DATAOPS"
        infoDict["DbsUrl"] = dbs_url
        
        # Add all the information pulled from Savannah
        infoDict["AcquisitionEra"] = acquisitionEra
        infoDict["GlobalTag"] = self.setGlobalTagFromOrigin(dbs_url, input_dataset)
        infoDict["DataTier"] = data_tier
        infoDict["InputDataset"] = input_dataset
        infoDict["ProcessingString"] = new_dataset
        infoDict["CMSSWVersion"] = release
        infoDict["ScramArch"] = scram_arch
        infoDict["ProcessingVersion"] = dataset_version                    
        infoDict["SiteWhitelist"] = list(sites)
        
        # Create report for Migration2Global
        report = {}
         
        #Fill json file, if status is done
        self.writeJSONFile(task, infoDict)
        report["json"] = 'y'
        report["task"] = int(task)
        report["InputDataset"] = input_dataset
        report["ProcessingString"] = new_dataset
        report["localUrl"] = dbs_url
        report["sites"] = list(sites)
        report["se_names"] = list(se_names)

        return report

    def writeJSONFile(self, task, infoDict):
        """
        This writes a JSON file at ComponentDir
        """
        ##check if file already exists
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'
        if not os.access(filename,os.F_OK):
            jsonfile = open(filename,'w')
            request = {'createRequest':infoDict} ## CHECK THIS BEFORE FINISHING
            jsonfile.write(json.dumps(request,sort_keys=True, indent=4))
            jsonfile.close()

        return

    def removeJSONFile(self,task):
        """
        This removes the JSON file at ComponentDir if it was created
        """
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'

        if os.access(filename,os.F_OK):
            os.remove(filename)
        return

    def printReport(self, report):
        """
        Print out a report
        """
        print "%20s %5s %10s %50s %50s" %( 'Ticket','json','local DBS','Sites','se_names') 
        print "%20s %5s %10s %50s %50s" %( '-'*20,'-'*5,'-'*10,'-'*50,'-'*50 )
        
        json = report["json"]
        ticket = report["task"]
        #status = report["ticketStatus"]
        localUrl = report["localUrl"].split('/')[5]
        site = ', '.join(report["sites"])
        se_names = ', '.join(report["se_names"])
        print "%20s %5s %10s %50s %50s" %(ticket,json,localUrl,site,se_names)  
Exemple #34
0
def getDatasets(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    # retrieve dataset summary
    reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*')
    return reply
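A possible way to use the helper above (a sketch; the wildcard pattern is an arbitrary example and getDatasets is assumed to be importable):

# Sketch: print every dataset matching a wildcard pattern via getDatasets above.
for ds in getDatasets('/Cosmics/Run2015*/*'):
    print(ds['dataset'])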
Exemple #35
0
def getDatasetStatus(dataset):
        # initialize API to DBS3
        dbsapi = DbsApi(url=dbs3_url)
        # retrieve dataset summary
        reply = dbsapi.listDatasets(dataset=dataset,dataset_access_type='*',detail=True)
        return reply[0]['dataset_access_type']
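The helper above assumes the dataset exists and raises an IndexError on an empty reply; a slightly more defensive variant (a sketch, not from the original) could return None instead:

def getDatasetStatusSafe(dataset):
    """Like getDatasetStatus above, but returns None for unknown datasets."""
    dbsapi = DbsApi(url=dbs3_url)
    reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*', detail=True)
    if not reply:
        return None
    return reply[0]['dataset_access_type']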
#!/usr/bin/env python
# https://twiki.cern.ch/twiki/bin/view/CMS/DBS3APIInstructions
import sys
import os
from dbs.apis.dbsClient import DbsApi

if (len(sys.argv) < 2):
    print "Give me an arg!"
    sys.exit()

# dataset='/DYJetsToLL_M-50_Zpt-150toInf_TuneCUETP8M1_13TeV-madgraphMLM-pythia8/RunIISpring15DR74-Asympt25ns_MCRUN2_74_V9-v1/MINIAODSIM'
dataset = sys.argv[-1]
evt_per_job = 30000
url = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
api = DbsApi(url=url)
output = api.listDatasets(dataset=dataset)

if (output == []):
    os.system("./FindLumisPerJob.sh " + dataset)
    sys.exit()

if (len(output) == 1):
    inp = output[0]['dataset']
    info = api.listFileSummaries(dataset=inp)[0]

    nevents = info['num_event']
    nlumis = info['num_lumi']
    evt_per_lumi = 1.0 * nevents / nlumis
    lumi_per_job = evt_per_job / evt_per_lumi

    dump = api.listFiles(dataset=dataset, detail=1, validFileOnly=1)
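The snippet stops after fetching the file list; a rough continuation (an assumption, not part of the original script) would round the lumi estimate before using it as a LumiBased unitsPerJob value:

# Sketch: round the estimate to at least one lumi section per job.
units_per_job = max(1, int(round(lumi_per_job)))
print("Dataset: %s" % dataset)
print("Events: %d, lumis: %d (%.1f events/lumi)" % (nevents, nlumis, evt_per_lumi))
print("Suggested LumiBased unitsPerJob for ~%d events/job: %d" % (evt_per_job, units_per_job))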
Exemple #37
0
            config.Data.splitting = 'LumiBased'
            config.Data.unitsPerJob = myJob.get('unitsPerJob', 25)
            if year == '2016':
                config.Data.lumiMask = 'https://cms-service-dqm.web.cern.ch/cms-service-dqm/CAF/certification//Collisions16/13TeV/ReReco/Final/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt'
            if year == '2017':
                config.Data.lumiMask = 'https://cms-service-dqm.web.cern.ch/cms-service-dqm/CAF/certification/Collisions17/13TeV/Final/Cert_294927-306462_13TeV_PromptReco_Collisions17_JSON.txt'
            if year == '2018' or year== '2018D':
                config.Data.lumiMask = 'https://cms-service-dqm.web.cern.ch/cms-service-dqm/CAF/certification/Collisions18/13TeV/PromptReco/Cert_314472-325175_13TeV_PromptReco_Collisions18_JSON.txt' 

            # Recovery task!
            #config.Data.lumiMask = '{}.json'.format(jobName) 

        else:
            config.JobType.pyCfgParams.append("isData=False")
            # automate
            dbs.listDatasets(dataset=config.Data.inputDataset)
            filesummary=dbs.listFileSummaries(dataset=config.Data.inputDataset, validFileOnly=1)
            num_file = filesummary[0]['num_file']
            num_event = filesummary[0]['num_event']
            myJob['events'] = num_event
            num_event_per_file_desired = 100000
            num_event_per_file = num_event/num_file
            num_jobs = max(1, num_event_per_file_desired / num_event_per_file)
            print("Mean number of events per file", num_event_per_file)
            print("Optimal number of files per job", num_jobs)
            config.Data.unitsPerJob = myJob.get('unitsPerJob', num_jobs)

            config.Data.splitting = 'FileBased'
            config.JobType.maxJobRuntimeMin= 16*60

        if "params" in myJob:
Exemple #38
0
class DBS3Reader(object):
    """
    _DBSReader_

    General API for reading data from DBS
    """
    # cache all the datatiers known by DBS
    _datatiers = {}

    def __init__(self, url, **contact):

        # instantiate dbs api object
        try:
            self.dbsURL = url
            self.dbs = DbsApi(url, **contact)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # connection to PhEDEx (Use default endpoint url)
        self.phedex = PhEDEx(responseType="json")

    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        currently only takes one lfn, but the dbs api needs to be updated
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName, validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(self.dbs.listFileLumiArray(logical_file_name = slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            lumiDict[lumisItem['logical_file_name']].append(item)
        return lumiDict

    def checkDBSServer(self):
        """
        check whether dbs server is up and running
        returns {"dbs_instance": "prod/global", "dbs_version": "3.3.144"}
        """
        try:
            return self.dbs.serverinfo()
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBS server is not up: %s" % self.dbsURL
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listPrimaryDatasets(self, match='*'):
        """
        _listPrimaryDatasets_

        return a list of primary datasets. The full primary dataset name must be
        provided; pattern-based matching is no longer supported.
        If no expression is provided, all datasets are returned
        """
        try:
            result = self.dbs.listPrimaryDatasets(primary_ds_name=match)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % match
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['primary_ds_name'] for x in result]
        return result

    def matchProcessedDatasets(self, primary, tier, process):
        """
        _matchProcessedDatasets_

        return a list of Processed datasets
        """
        result = []
        try:
            datasets = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=tier, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for dataset in datasets:
            dataset = remapDBS3Keys(dataset, processed_ds_name='Name')
            dataset['PathList'] = [dataset['dataset']]
            if dataset['Name'] == process:
                result.append(dataset)
        return result

    def listRuns(self, dataset=None, block=None):
        """
        It gets a list of DbsRun objects, but for our purposes
        only the list of run numbers is collected.
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        runs = []
        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        for x in results:
            runs.extend(x['run_num'])
        return runs

    def listRunLumis(self, dataset=None, block=None):
        """
        It gets a list of DBSRun objects and returns the number of lumisections per run
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        # Pointless code in python3
        if isinstance(block, str):
            block = unicode(block)
        if isinstance(dataset, str):
            dataset = unicode(dataset)

        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # Return the result in runDict format; this keeps compatibility with the DBS2
        # call, which returned {run_number: num_lumis}. The DBS3 call does not return
        # the number of lumis, so it returns {run_number: None} instead.
        # TODO: once DBS2 is completely removed, change the return format to a more sensible one

        runDict = {}
        for x in results:
            for runNumber in x["run_num"]:
                runDict[runNumber] = None
        return runDict

    def listProcessedDatasets(self, primary, dataTier='*'):
        """
        _listProcessedDatasets_

        return a list of Processed datasets for the primary and optional
        data tier value

        """
        try:
            result = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=dataTier)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['dataset'].split('/')[2] for x in result]
        return result

    def listDatasetFiles(self, datasetPath):
        """
        _listDatasetFiles_

        Get list of files for dataset

        """
        return [x['logical_file_name'] for x in self.dbs.listFileArray(dataset=datasetPath)]

    @staticmethod
    def listDatatiers(dbsUrl=None):
        """
        _listDatatiers_

        Get a list of datatiers known by DBS.
        """
        if dbsUrl is None:
            msg = "Error in DBSReader.listDatatiers(). DBS Url not set."
            raise DBSReaderError(msg)

        timenow = int(time.time())
        if DBS3Reader._datatiers and timenow - 7200 < DBS3Reader._datatiers['ts']:
            return DBS3Reader._datatiers['tiers']

        try:
            DBS3Reader._setDatatiersCache(timenow, dbsUrl)
        except Exception as ex:
            if not DBS3Reader._datatiers:
                msg = "Error in DBSReader.listDatatiers\n%s" % formatEx3(ex)
                raise DBSReaderError(msg)
        return DBS3Reader._datatiers['tiers']

    @staticmethod
    def _setDatatiersCache(ts, dbsUrl):
        """
        Set a timestamp and update the list of datatiers cached in
        the class property
        """
        dbs = DbsApi(dbsUrl)
        DBS3Reader._datatiers['ts'] = ts
        DBS3Reader._datatiers['tiers'] = [tier['data_tier_name'] for tier in dbs.listDataTiers()]

        return

    def listDatasetFileDetails(self, datasetPath, getParents=False, validFileOnly=1):
        """
        TODO: This is completely wrong and needs to be redone (or removed) - getting the
        whole dataset at once might be too costly.

        _listDatasetFileDetails_

        Get list of lumis, events, and parents for each file in a dataset
        Return a dict where the keys are the files, and for each file we have something like:
            { 'NumberOfEvents': 545,
              'BlockName': '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace',
              'Lumis': {173658: [8, 12, 9, 14, 19, 109, 105]},
              'Parents': [],
              'Checksum': '22218315',
              'Adler32': 'a41a1446',
              'FileSize': 286021145,
              'ValidFile': 1
            }

        """
        fileDetails = self.getFileListByDataset(dataset=datasetPath, validFileOnly=validFileOnly, detail=True)
        blocks = set()  # the set of blocks of the dataset
        # Iterate over the files and prepare the set of blocks and a dict where the keys are the files
        files = {}
        for f in fileDetails:
            blocks.add(f['block_name'])
            files[f['logical_file_name']] = remapDBS3Keys(f, stringify=True)
            files[f['logical_file_name']]['ValidFile'] = f['is_file_valid']
            files[f['logical_file_name']]['Lumis'] = {}
            files[f['logical_file_name']]['Parents'] = []

        # Iterate over the blocks and get parents and lumis
        for blockName in blocks:
            # get the parents
            if getParents:
                parents = self.dbs.listFileParents(block_name=blockName)
                for p in parents:
                    if p['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        files[p['logical_file_name']]['Parents'].extend(p['parent_logical_file_name'])
            # get the lumis
            file_lumis = self.dbs.listFileLumis(block_name=blockName)
            for f in file_lumis:
                if f['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                    if f['run_num'] in files[f['logical_file_name']]['Lumis']:
                        files[f['logical_file_name']]['Lumis'][f['run_num']].extend(f['lumi_section_num'])
                    else:
                        files[f['logical_file_name']]['Lumis'][f['run_num']] = f['lumi_section_num']

        return files

    def crossCheck(self, datasetPath, *lfns):
        """
        _crossCheck_

        For the dataset provided, check that the lfns listed all exist
        in the dataset.

        Return the list of lfns that are in the dataset

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        return list(setOfAllLfns.intersection(setOfKnownLfns))

    def crossCheckMissing(self, datasetPath, *lfns):
        """
        _crossCheckMissing_

        As cross check, but return value is a list of files that
        are *not* known by DBS

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        knownFiles = setOfAllLfns.intersection(setOfKnownLfns)
        unknownFiles = setOfKnownLfns.difference(knownFiles)
        return list(unknownFiles)

    def getDBSSummaryInfo(self, dataset=None, block=None):
        """
        Get dataset summary includes # of files, events, blocks and total size
        """
        # FIXME: Doesn't raise exceptions on missing data as the old API did
        if dataset:
            self.checkDatasetPath(dataset)
        try:
            if block:
                summary = self.dbs.listFileSummaries(block_name=block, validFileOnly=1)
            else:  # dataset case dataset shouldn't be None
                summary = self.dbs.listFileSummaries(dataset=dataset, validFileOnly=1)
        except Exception as ex:
            msg = "Error in DBSReader.getDBSSummaryInfo(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        if not summary or summary[0].get('file_size') is None:  # appears to indicate missing dataset
            msg = "DBSReader.listDatasetSummary(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, block))
        result = remapDBS3Keys(summary[0], stringify=True)
        result['path'] = dataset if dataset else ''
        result['block'] = block if block else ''
        return result

    def getFileBlocksInfo(self, dataset, onlyClosedBlocks=False,
                          blockName=None, locations=True):
        """
        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': True}
        if blockName:
            args['block_name'] = blockName
        try:
            blocks = self.dbs.listBlocks(**args)
        except Exception as ex:
            msg = "Error in DBSReader.getFileBlocksInfo(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        blocks = [remapDBS3Keys(block, stringify=True, block_name='Name') for block in blocks]
        # only raise if blockName not specified - mimic dbs2 error handling
        if not blocks and not blockName:
            msg = "DBSReader.getFileBlocksInfo(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, blockName))
        if locations:
            for block in blocks:
                block['PhEDExNodeList'] = [{'Name': x} for x in self.listFileBlockLocation(block['Name'])]

        if onlyClosedBlocks:
            return [x for x in blocks if str(x['OpenForWriting']) != "1"]

        return blocks

    def listFileBlocks(self, dataset, onlyClosedBlocks=False, blockName=None):
        """
        _listFileBlocks_

        Retrieve a list of fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': False}
        if blockName:
            args['block_name'] = blockName
        if onlyClosedBlocks:
            args['detail'] = True
        try:
            blocks = self.dbs.listBlocks(**args)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if onlyClosedBlocks:
            result = [x['block_name'] for x in blocks if str(x['open_for_writing']) != "1"]

        else:
            result = [x['block_name'] for x in blocks]

        return result

    def listOpenFileBlocks(self, dataset):
        """
        _listOpenFileBlocks_

        Retrieve a list of open fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        try:
            blocks = self.dbs.listBlocks(dataset=dataset, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['block_name'] for x in blocks if str(x['open_for_writing']) == "1"]

        return result

    def blockExists(self, fileBlockName):
        """
        _blockExists_

        Check to see if block with name provided exists in the DBS
        Instance.

        Return True if exists, False if not

        """
        self.checkBlockName(fileBlockName)
        try:

            blocks = self.dbs.listBlocks(block_name=fileBlockName)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockExists(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if len(blocks) == 0:
            return False
        return True

    def listFilesInBlock(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlock_

        Get a list of files in the named fileblock
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it is always True.
        We need to clean the code up once dbs2 is completely deprecated;
        calling lumis just for the run number is expensive.
        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            files = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=validFileOnly, detail=True)
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if lumis:
            lumiDict = self._getLumiList(blockName=fileBlockName, validFileOnly=validFileOnly)

        result = []
        for fileInfo in files:
            if lumis:
                fileInfo["LumiList"] = lumiDict[fileInfo['logical_file_name']]
            result.append(remapDBS3Keys(fileInfo, stringify=True))
        return result

    def listFilesInBlockWithParents(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of that file.
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it is always True.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only valid blocks for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis, validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName,)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])
        
        parentsLFNs = childByParents.keys()
        
        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slice parentLFNs until the DBS API can handle large lists itself.
        # Remove the slicing once the DBS API handles it.
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[fileInfo['logical_file_name']]

        return fileDetails

    def lfnsInBlock(self, fileBlockName):
        """
        _lfnsInBlock_

        LFN list only for block, details = False => faster query

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.lfnsInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            lfns = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=1, detail=False)
            return lfns
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listFileBlockLocation(self, fileBlockNames, dbsOnly=False):
        """
        _listFileBlockLocation_

        Get origin_site_name of a block

        """

        singleBlockName = None
        if isinstance(fileBlockNames, basestring):
            singleBlockName = fileBlockNames
            fileBlockNames = [fileBlockNames]

        for block in fileBlockNames:
            self.checkBlockName(block)

        locations = {}
        node_filter = set(['UNKNOWN', None])

        if dbsOnly:
            blocksInfo = {}
            try:
                for block in fileBlockNames:
                    blocksInfo.setdefault(block, [])
                    # there should be only one element with a single origin site string ...
                    for blockInfo in self.dbs.listBlockOrigin(block_name=block):
                        blocksInfo[block].append(blockInfo['origin_site_name'])
            except dbsClientException as ex:
                msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(block=fileBlockNames, complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % str(ex)
                raise Exception(msg)

        for block in fileBlockNames:
            valid_nodes = set(blocksInfo.get(block, [])) - node_filter
            locations[block] = list(valid_nodes)

        # returning single list if a single block is passed
        if singleBlockName:
            return locations[singleBlockName]

        return locations

    def getFileBlock(self, fileBlockName, dbsOnly=False):
        """
        _getFileBlock_

        dbsOnly flag is mostly meant for StoreResults, since there is no
        data in TMDB.

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : { LFN : Events },
             }
        }


        """
        # Pointless code in python3
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName, dbsOnly),
            "Files": self.listFilesInBlock(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
            }
        }
        return result

    def getFileBlockWithParents(self, fileBlockName):
        """
        _getFileBlockWithParents_

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : dictionaries representing each file
             }
        }

        """
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)

        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName),
            "Files": self.listFilesInBlockWithParents(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
            }
        }
        return result

    def getFiles(self, dataset, onlyClosedBlocks=False):
        """
        _getFiles_

        Returns a dictionary of block names for the dataset where
        each block consists of a dictionary containing the PhEDExNodeNames
        for that block and the list of files in that block

        """
        result = {}
        blocks = self.listFileBlocks(dataset, onlyClosedBlocks)

        for x in blocks:
            result.update(self.getFileBlock(x))

        return result

    def listBlockParents(self, blockName):
        """Get parent blocks for block"""
        result = []
        self.checkBlockName(blockName)
        blocks = self.dbs.listBlockParents(block_name=blockName)
        for block in blocks:
            toreturn = {'Name': block['parent_block_name']}
            toreturn['PhEDExNodeList'] = self.listFileBlockLocation(toreturn['Name'])
            result.append(toreturn)
        return result

    def blockIsOpen(self, blockName):
        """
        _blockIsOpen_

        Return True if the named block is open, False if not or if the block
        doesn't exist

        """
        self.checkBlockName(blockName)
        blockInstance = self.dbs.listBlocks(block_name=blockName, detail=True)
        if len(blockInstance) == 0:
            return False
        blockInstance = blockInstance[0]
        isOpen = blockInstance.get('open_for_writing', 1)
        if isOpen == 0:
            return False
        return True

    def blockToDatasetPath(self, blockName):
        """
        _blockToDatasetPath_

        Given a block name, get the dataset Path associated with that
        Block.

        Returns the dataset path, or None if not found

        """
        self.checkBlockName(blockName)
        try:
            blocks = self.dbs.listBlocks(block_name=blockName, detail=True)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockToDatasetPath(%s)\n" % blockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if blocks == []:
            return None

        pathname = blocks[-1].get('dataset', None)
        return pathname

    def listDatasetLocation(self, datasetName, dbsOnly=False):
        """
        _listDatasetLocation_

        List the origin SEs where there is at least one block of the given
        dataset.
        """
        self.checkDatasetPath(datasetName)

        locations = set()

        if dbsOnly:
            try:
                blocksInfo = self.dbs.listBlockOrigin(dataset=datasetName)
            except dbsClientException as ex:
                msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)

            if not blocksInfo:  # no data location from dbs
                return list()

            for blockInfo in blocksInfo:
                locations.update(blockInfo['origin_site_name'])

            locations.difference_update(['UNKNOWN', None])  # remove entry when SE name is 'UNKNOWN'
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(dataset=[datasetName], complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for dataset=%s)\n" % datasetName
                msg += "%s\n" % str(ex)
                raise Exception(msg)

            if blocksInfo:
                for blockSites in blocksInfo.values():
                    locations.update(blockSites)

        return list(locations)

    def checkDatasetPath(self, pathName):
        """
         _checkDatasetPath_
        """
        if pathName in ("", None):
            raise DBSReaderError("Invalid Dataset Path name: => %s <=" % pathName)
        else:
            try:
                result = self.dbs.listDatasets(dataset=pathName, dataset_access_type='*')
                if len(result) == 0:
                    raise DBSReaderError("Dataset %s doesn't exist in DBS %s" % (pathName, self.dbsURL))
            except (dbsClientException, HTTPError) as ex:
                msg = "Error in "
                msg += "DBSReader.checkDatasetPath(%s)\n" % pathName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        return

    def checkBlockName(self, blockName):
        """
         _checkBlockName_
        """
        if blockName in ("", "*", None):
            raise DBSReaderError("Invalid Block name: => %s <=" % blockName)

    def getFileListByDataset(self, dataset, validFileOnly=1, detail=True):

        """
        _getFileListByDataset_

        Given a dataset, retrieves all blocks, lfns and number of events (among other
        not really important info).
        Returns a list of dicts.
        """

        try:
            fileList = self.dbs.listFileArray(dataset=dataset, validFileOnly=validFileOnly, detail=detail)
            return fileList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.getFileListByDataset(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listDatasetParents(self, childDataset):
        """
        List the parent dataset paths for the given childDataset
        """
        try:
            parentList = self.dbs.listDatasetParents(dataset=childDataset)
            return parentList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listDatasetParents(%s)\n" % childDataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
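A minimal usage sketch for the DBS3Reader class above (assuming the class and its dependencies are importable; the dataset name is an arbitrary example):

# Sketch: summarise a dataset and list its blocks through DBS3Reader.
reader = DBS3Reader("https://cmsweb.cern.ch/dbs/prod/global/DBSReader")

summary = reader.getDBSSummaryInfo(dataset="/Cosmics/Run2015B-PromptReco-v1/RECO")
print(summary)

for block in reader.listFileBlocks("/Cosmics/Run2015B-PromptReco-v1/RECO"):
    print(block)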
dbsApi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
#dbsApi = DbsApi(url = 'https://cmsweb-testbed.cern.ch/dbs/int/global/DBSReader/')

dsets_fraction_lost = {}

access_type = "VALID"

for line in inputfile:
    lfn = line.rstrip('\n')

    if dbsApi.listFiles(logical_file_name=lfn) == []:
        print lfn
        continue

    continue

    #print "lfn = "+lfn
    access_types = ["VALID", "PRODUCTION", "INVALID", "DELETED", "DEPRECATED"]
    if all(dbsApi.listDatasets(logical_file_name=lfn,
                               dataset_access_type=atype) == []
           for atype in access_types):
        print lfn
        continue
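The loop above assumes an already-open inputfile; the same check for a single LFN can be sketched in a self-contained way (the LFN is a made-up example and the DbsApi setup repeats the one above):

from dbs.apis.dbsClient import DbsApi

# Sketch: report a single LFN that DBS does not know about.
dbsApi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
lfn = '/store/data/Run2015B/Cosmics/RECO/PromptReco-v1/000/251/000/00000/ABCD1234.root'
if dbsApi.listFiles(logical_file_name=lfn) == []:
    print("unknown to DBS: %s" % lfn)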
Exemple #40
0
def getDatasets(dataset):
        # initialize API to DBS3
        dbsapi = DbsApi(url=dbs3_url)
        # retrieve dataset summary
        reply = dbsapi.listDatasets(dataset=dataset,dataset_access_type='*')
        return reply
Exemple #41
0
    def testA_basicFunction(self):
        """
        _basicFunction_

        See if I can make the damn thing work.
        """
        myThread = threading.currentThread()

        config = self.getConfig()

        from WMComponent.DBS3Buffer.DBSUploadPoller import DBSUploadPoller
        dbsUploader = DBSUploadPoller(config=config)
        dbsUtil = DBSBufferUtil()
        from dbs.apis.dbsClient import DbsApi
        dbsApi = DbsApi(url=config.DBSUpload.dbsUrl)

        # This should do nothing
        # Just making sure we don't crash
        try:
            dbsUploader.algorithm()
        except:
            dbsUploader.close()
            raise

        name = "ThisIsATest%s" % (int(time.time()))
        tier = "RECO"
        nFiles = 12
        name = name.replace('-', '_')
        name = '%s-v0' % name
        files = self.getFiles(name=name, tier=tier, nFiles=nFiles)
        datasetPath = "/Cosmics/%s/%s" % (name, tier)

        try:
            dbsUploader.algorithm()
        except:
            dbsUploader.close()
            raise

        time.sleep(5)

        # Now look in DBS
        try:
            result = dbsApi.listDatasets(dataset=datasetPath,
                                         detail=True,
                                         dataset_access_type='PRODUCTION')
            self.assertEqual(len(result), 1)
            self.assertEqual(result[0]['data_tier_name'], 'RECO')
            self.assertEqual(result[0]['processing_version'], 0)
            self.assertEqual(result[0]['acquisition_era_name'],
                             name.split('-')[0])

            result = dbsApi.listFiles(dataset=datasetPath)
            self.assertEqual(len(result), 11)
        except:
            dbsUploader.close()
            raise

        # All the blocks except for the last one should
        # now be there
        result = myThread.dbi.processData(
            "SELECT id FROM dbsbuffer_block")[0].fetchall()
        self.assertEqual(len(result), 12)

        # The last block should still be open
        self.assertEqual(len(dbsUtil.findOpenBlocks()), 1)

        try:
            dbsUploader.algorithm()
        except:
            raise
        finally:
            dbsUploader.close()

        # All files should now be available
        result = dbsApi.listFiles(dataset=datasetPath)
        self.assertEqual(len(result), 12)

        # The last block should now be closed
        self.assertEqual(len(dbsUtil.findOpenBlocks()), 0)

        result = myThread.dbi.processData(
            "SELECT status FROM dbsbuffer_block")[0].fetchall()
        for res in result:
            self.assertEqual(res.values()[0], 'InDBS')

        return
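Outside the unit-test harness, the DBS-side check the test performs can be sketched on its own (the URL and dataset path are example values):

from dbs.apis.dbsClient import DbsApi

# Sketch of the DBS check done at the end of the test above.
dbsApi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
datasetPath = '/Cosmics/ThisIsATest_1400000000-v0/RECO'

datasets = dbsApi.listDatasets(dataset=datasetPath, detail=True,
                               dataset_access_type='PRODUCTION')
files = dbsApi.listFiles(dataset=datasetPath)
print("datasets found: %d, files found: %d" % (len(datasets), len(files)))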
def run_command(command):
     # flush stdout to keep the order of messages right
     sys.stdout.flush()
     exit_code = subprocess.call(command, shell=True)
     return exit_code
     

# DBS reader
url = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
api = DbsApi(url=url)

datasets = []

if options.run:
     datasets = api.listDatasets(run_num=options.run, detail=True)
if options.era:
     datasets = api.listDatasets(acquisition_era_name=options.era, detail=True)

nDatasetsToCheck = 0
for ds in datasets:
     if datatiers and not ds['data_tier_name'] in datatiers:
          continue
     nDatasetsToCheck += 1
print >>log, "Number of datasets to check: %d" % nDatasetsToCheck
print "Number of datasets to check: %d" % nDatasetsToCheck

for ds in datasets:
     if datatiers and not ds['data_tier_name'] in datatiers:
          continue
     print >>log, "\nDatset:",ds['dataset'],
Exemple #43
0
def main():
    url = 'cmsweb.cern.ch'
    url_tb = 'cmsweb-testbed.cern.ch'

    # Example: python assign.py -w amaltaro_RVZTT_120404_163607_6269
    # -t testbed-relval -s T1_US_FNAL -e CMSSW_6_0_0_pre1_FS_TEST_WMA -p v1 -a
    # relval -l /store/backfill/1
    usage = "usage: %prog [options] [WORKFLOW]"

    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-t', '--team', help='Type of Requests', dest='team')
    parser.add_option('-s',
                      '--sites',
                      help=' "t1" for Tier-1\'s and "t2" for Tier-2\'s',
                      dest='sites')
    parser.add_option(
        '--special',
        help=
        'Use it for special workflows. You also have to change the code according to the type of WF',
        dest='special')
    parser.add_option('-r',
                      '--replica',
                      action='store_true',
                      dest='replica',
                      default=False,
                      help='Adds a _Disk Non-Custodial Replica parameter')
    parser.add_option(
        '-p',
        '--procversion',
        help=
        'Processing Version, if empty it will leave the processing version that comes by default in the request',
        dest='procversion')
    parser.add_option(
        '-a',
        '--activity',
        help=
        'Dashboard Activity (reprocessing, production or test), if empty will set reprocessing as default',
        dest='activity')
    parser.add_option(
        '-x',
        '--xrootd',
        help='Assign with trustSiteLocation=True (allows xrootd capabilities)',
        action='store_true',
        default=False,
        dest='xrootd')
    parser.add_option('-l', '--lfn', help='Merged LFN base', dest='lfn')
    parser.add_option('-v',
                      '--verbose',
                      help='Verbose',
                      action='store_true',
                      default=False,
                      dest='verbose')
    parser.add_option('--testbed',
                      help='Assign in testbed',
                      action='store_true',
                      default=False,
                      dest='testbed')
    parser.add_option(
        '--test',
        action="store_true",
        help=
        'Nothing is injected, only print information about workflow and Era',
        dest='test')
    parser.add_option(
        '-f',
        '--file',
        help=
        'Text file with a list of workflows. If this option is used, the same settings will be applied to all workflows',
        dest='file')
    parser.add_option('-w',
                      '--workflow',
                      help='Workflow Name',
                      dest='workflow')
    parser.add_option('-e', '--era', help='Acquisition era', dest='era')
    parser.add_option("--procstr",
                      dest="procstring",
                      help="Overrides Processing String with a single string")

    (options, args) = parser.parse_args()

    if options.testbed:
        url = url_tb

    # parse input workflows and files. If both -w and -f options are used, then only the -w inputs are considered.
    if not options.workflow:
        if args:
            wfs = args
        elif options.file:
            wfs = [l.strip() for l in open(options.file) if l.strip()]
        else:
            parser.error("Input a workflow name or a file to read them")
            sys.exit(0)
    else:
        wfs = [options.workflow]

    #Default values
    era = {}
    procversion = 1
    procstring = {}
    replica = False
    sites = ALL_SITES
    specialStr = ''
    taskchain = False
    team = 'production'
    trust_site = False

    SI = siteInfo()
    # Handling the parameters given in the command line
    # parse site list
    if options.sites:
        if options.sites == "t1":
            sites = SI.sites_T1s
        elif options.sites == "t2":
            sites = SI.sites_T2s
        else:
            sites = [site for site in options.sites.split(',')]
    else:
        sites = SI.sites_T1s + SI.sites_T2s
    if options.team:
        team = options.team

    if options.xrootd:
        trust_site = True

    if options.replica:
        replica = True

    for wf in wfs:
        # Getting the original dictionary
        schema = getRequestDict(url, wf)
        wf = reqMgr.Workflow(wf, url=url)

        # WF must be in assignment-approved in order to be assigned
        if (schema["RequestStatus"] != "assignment-approved"):
            print("The workflow '" + wf.name +
                  "' you are trying to assign is not in assignment-approved")
            sys.exit(1)

        #Check to see if the workflow is a task chain or an ACDC of a taskchain
        taskchain = (schema["RequestType"] == "TaskChain") or (
            (schema["RequestType"] == "Resubmission")
            and "task" in schema["InitialTaskPath"].split("/")[1])

        #Dealing with era and proc string
        if taskchain:
            # Setting the Era and ProcStr values per Task
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    try:
                        if 'ProcessingString' in value:
                            procstring[
                                value['TaskName']] = value['ProcessingString']
                        else:
                            procstring[
                                value['TaskName']] = schema['ProcessingString']
                        if 'AcquisitionEra' in value:
                            era[value['TaskName']] = value['AcquisitionEra']
                        else:
                            era[value['TaskName']] = schema['AcquisitionEra']
                    except KeyError:
                        print(
                            "This taskchain request has no AcquisitionEra or ProcessingString defined into the Tasks, aborting..."
                        )
                        sys.exit(1)
        # Adding the special string - in case it was provided in the command line
        if options.special:
            specialStr = '_' + str(options.special)
            for key, value in procstring.items():
                procstring[key] = value + specialStr
        # Override if a value is given using the procstring command
        if options.procstring:
            procstring = options.procstring
        elif not taskchain:
            procstring = wf.info['ProcessingString']
        if options.era:
            era = options.era
        elif not taskchain:
            era = wf.info['AcquisitionEra']
        #Set era and procstring to none for merge ACDCs inside a task chain
        if schema["RequestType"] == "Resubmission" and wf.info[
                "PrepID"].startswith("task") and "Merge" in schema[
                    "InitialTaskPath"].split("/")[-1]:
            era = None
            procstring = None

        # Must use --lfn option, otherwise workflow won't be assigned
        if options.lfn:
            lfn = options.lfn
        elif "MergedLFNBase" in wf.info:
            lfn = wf.info['MergedLFNBase']
        else:
            print "Can't assign the workflow! Please include workflow lfn using --lfn option."
            sys.exit(0)
        # activity production by default for taskchains, reprocessing for default by workflows
        if options.activity:
            activity = options.activity
        elif taskchain:
            activity = 'production'
        else:
            activity = 'reprocessing'

        # given or default processing version
        if options.procversion:
            procversion = int(options.procversion)
        else:
            procversion = wf.info["ProcessingVersion"]

        # Check for output dataset existence, and abort if output datasets already exist!
        # Don't perform this check for ACDC's
        datasets = schema["OutputDatasets"]
        i = 0
        if not (schema["RequestType"] == "Resubmission"):
            exist = False
            maxv = 1
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    dbsapi = DbsApi(url=dbs3_url)

                    # list all datasets with same name but different version
                    # numbers
                    datasets = dbsapi.listDatasets(
                        acquisition_era_name=value['AcquisitionEra'],
                        primary_ds_name=value['PrimaryDataset'],
                        detail=True,
                        dataset_access_type='*')
                    processedName = value['AcquisitionEra'] + '-' + value[
                        'ProcessingString'] + "-v\\d+"
                    # see if any of the dataset names is a match
                    for ds in datasets:
                        if re.match(processedName, ds['processed_ds_name']):
                            print "Existing dset:", ds[
                                'dataset'], "(%s)" % ds['dataset_access_type']
                            maxv = max(maxv, ds['processing_version'])
                            exist = True
                        else:
                            pass
                    i += 1
            # suggest max version
            if exist and procversion <= maxv:
                print "Some output datasets exist, its advised to assign with v ==", maxv + 1
                sys.exit(0)

    # If the --test argument was provided, then just print the information
    # gathered so far and abort the assignment
        if options.test:
            print "%s \tEra: %s \tProcStr: %s \tProcVer: %s" % (
                wf.name, era, procstring, procversion)
            print "LFN: %s \tTeam: %s \tSite: %s" % (lfn, team, sites)
            print "Taskchain? " + str(taskchain)
            print "Activity:" + activity
            sys.exit(0)

        # Really assigning the workflow now
        print wf.name, '\tEra:', era, '\tProcStr:', procstring, '\tProcVer:', procversion, '\tTeam:', team, '\tSite:', sites
        assignRequest(url, wf.name, team, sites, era, procversion, activity,
                      lfn, procstring, trust_site, options.replica,
                      options.verbose, taskchain)

    sys.exit(0)
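The output-dataset version check inside main() can be illustrated in isolation; a minimal sketch (the DBS URL and the era/primary-dataset/processing-string values are invented examples):

import re
from dbs.apis.dbsClient import DbsApi

# Sketch of the output-dataset version check done in main() above.
dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
era, primary, procstr = 'Run2015B', 'Cosmics', 'PromptReco'

maxv, exist = 1, False
processedName = era + '-' + procstr + "-v\\d+"
for ds in dbsapi.listDatasets(acquisition_era_name=era, primary_ds_name=primary,
                              detail=True, dataset_access_type='*'):
    if re.match(processedName, ds['processed_ds_name']):
        maxv = max(maxv, ds['processing_version'])
        exist = True
if exist:
    print("highest existing processing version: v%d" % maxv)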
jsonFile = '/afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/Collisions15/13TeV/DCSOnly/json_DCSONLY.txt'

with open(jsonFile) as data_file:    
    data = json.load(data_file)
#print data.keys()
AlltheRuns = map(int,data.keys())
#print sorted(AlltheRuns)
#print "# of runs in the JSON: ", len(AlltheRuns)

# select runs in Run2015D
theRuns = filter(lambda x: x >= 256630, AlltheRuns)


# The RAW datasets to process must have AOD in Prompt
theDatasets = api.listDatasets( dataset='/*/Run2015D*/RAW' )
theDatasetsAOD = api.listDatasets( dataset='/*/*Run2015D*PromptReco*/AOD' )

#print theDatasets
#print theDatasetsAOD
theDatasetsToProcessAOD = []
theDatasetsToProcess = []

for mydataset in theDatasetsAOD:
    tempdataset = mydataset['dataset']
    theDatasetsToProcessAOD.append((tempdataset.split('/'))[1])
print theDatasetsToProcessAOD

nd = 0

# Write DEFAULT section of the conf file
def getDatasets(dataset_pattern):
    "Return list of dataset for given dataset pattern"
    dbsapi = DbsApi(url=DBS3, verifypeer=False)
    reply = dbsapi.listDatasets(dataset=dataset_pattern, dataset_access_type='*')
    return reply
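A short sketch of using the pattern helper above, in the spirit of the Run2015D selection earlier in the snippet (the pattern is an arbitrary example):

# Sketch: list RAW datasets for one era via getDatasets above.
for ds in getDatasets('/*/Run2015D*/RAW'):
    print(ds['dataset'])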
Exemple #46
0
def main():
    url = 'cmsweb.cern.ch'
    url_tb = 'cmsweb-testbed.cern.ch'
    
    # Example: python assign.py -w amaltaro_RVZTT_120404_163607_6269
    # -t testbed-relval -s T1_US_FNAL -e CMSSW_6_0_0_pre1_FS_TEST_WMA -p v1 -a
    # relval -l /store/backfill/1
    usage = "usage: %prog [options] [WORKFLOW]"
    
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-t', '--team', help='Type of Requests', dest='team')
    parser.add_option('-s', '--sites', help=' "t1" for Tier-1\'s and "t2" for Tier-2\'s', dest='sites')
    parser.add_option('--special',  help='Use it for special workflows. You also have to change the code according to the type of WF', dest='special')
    parser.add_option('-r', '--replica', action='store_true', dest='replica', default=False, help='Adds a _Disk Non-Custodial Replica parameter')
    parser.add_option('-p', '--procversion', help='Processing Version, if empty it will leave the processing version that comes by default in the request', dest='procversion')
    parser.add_option('-a', '--activity', help='Dashboard Activity (reprocessing, production or test), if empty will set reprocessing as default', dest='activity')
    parser.add_option('-x', '--xrootd', help='Assign with trustSiteLocation=True (allows xrootd capabilities)',
                                        action='store_true', default=False, dest='xrootd')
    parser.add_option('-l', '--lfn', help='Merged LFN base', dest='lfn')
    parser.add_option('-v', '--verbose', help='Verbose', action='store_true', default=False, dest='verbose')
    parser.add_option('--testbed', help='Assign in testbed', action='store_true', default=False, dest='testbed')
    parser.add_option('--test', action="store_true", help='Nothing is injected, only print information about workflow and Era', dest='test')
    parser.add_option('-f', '--file', help='Text file with a list of workflows. If this option is used, the same settings will be applied to all workflows', dest='file')
    parser.add_option('-w', '--workflow', help='Workflow Name', dest='workflow')
    parser.add_option('-e', '--era', help='Acquisition era', dest='era')
    parser.add_option("--procstr", dest="procstring", help="Overrides Processing String with a single string")

    (options, args) = parser.parse_args()
    
    if options.testbed:
        url = url_tb

    # parse input workflows and files. If both -w and -f options are used, then only the -w inputs are considered.
    if not options.workflow:
        if args:
            wfs = args
        elif options.file:
            wfs = [l.strip() for l in open(options.file) if l.strip()]
        else:
            parser.error("Input a workflow name or a file to read them")
            sys.exit(0)
    else:
        wfs = [options.workflow]

    #Default values
    era = {}
    procversion = 1
    procstring = {}
    replica = False
    sites = ALL_SITES
    specialStr = ''
    taskchain = False
    team = 'production'
    trust_site = False

    SI = siteInfo()
    # Handling the parameters given in the command line
    # parse site list
    if options.sites:
        if options.sites == "t1":
            sites = SI.sites_T1s
        elif options.sites == "t2":
            sites = SI.sites_T2s
        else: 
            sites = [site for site in options.sites.split(',')]
    else: 
        sites = SI.sites_T1s + SI.sites_T2s
    if options.team:
        team = options.team

    if options.xrootd:
        trust_site = True

    if options.replica:
        replica = True

    for wf in wfs:
        # Getting the original dictionary
        schema = getRequestDict(url, wf)
        wf = reqMgr.Workflow(wf, url=url)

        # WF must be in assignment-approved in order to be assigned
        if (schema["RequestStatus"] != "assignment-approved"):
            print("The workflow '" + wf.name + "' you are trying to assign is not in assignment-approved")
            sys.exit(1)

        #Check to see if the workflow is a task chain or an ACDC of a taskchain
        taskchain = (schema["RequestType"] == "TaskChain") or ((schema["RequestType"] == "Resubmission") and "task" in schema["InitialTaskPath"].split("/")[1])

        #Dealing with era and proc string
        if taskchain:
            # Setting the Era and ProcStr values per Task
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    try:
                        if 'ProcessingString' in value:
                            procstring[value['TaskName']] = value['ProcessingString']
                        else:
                            procstring[value['TaskName']] = schema['ProcessingString']
                        if 'AcquisitionEra' in value:
                            era[value['TaskName']] = value['AcquisitionEra']
                        else:
                            era[value['TaskName']] = schema['AcquisitionEra']
                    except KeyError:
                        print("This taskchain request has no AcquisitionEra or ProcessingString defined into the Tasks, aborting...")
                        sys.exit(1)
        # Adding the special string - in case it was provided in the command line
        if options.special:
            specialStr = '_' + str(options.special)
            for key, value in procstring.items():
                procstring[key] = value + specialStr
        # Override if a value is given using the procstring command
        if options.procstring:
            procstring = options.procstring
        elif not taskchain:
            procstring = wf.info['ProcessingString']
        if options.era:
            era = options.era
        elif not taskchain:
            era = wf.info['AcquisitionEra']
        #Set era and procstring to none for merge ACDCs inside a task chain
        if schema["RequestType"] == "Resubmission" and wf.info["PrepID"].startswith("task") and "Merge" in schema["InitialTaskPath"].split("/")[-2]:
            era = None
            procstring = None

        # Must use --lfn option, otherwise workflow won't be assigned
        if options.lfn:
            lfn = options.lfn
        elif "MergedLFNBase" in wf.info:
            lfn = wf.info['MergedLFNBase']
        else:
            print "Can't assign the workflow! Please include workflow lfn using --lfn option."
            sys.exit(0)
        # activity is 'production' by default for taskchains, 'reprocessing' by default for other workflows
        if options.activity:
            activity = options.activity
        elif taskchain:
            activity = 'production'
        else:
            activity = 'reprocessing'

        # given or default processing version
        if options.procversion:
            procversion = int(options.procversion)
        else:
            procversion = wf.info["ProcessingVersion"]

        # Check for output dataset existence, and abort if output datasets already exist!
        # Don't perform this check for ACDC's
        datasets = schema["OutputDatasets"]
        i = 0
        if not (schema["RequestType"] == "Resubmission" ):
            exist = False
            maxv = 1
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    dbsapi = DbsApi(url=dbs3_url)
                    
                    # list all datasets with same name but different version
                    # numbers
                    datasets = dbsapi.listDatasets(acquisition_era_name=value['AcquisitionEra'], primary_ds_name=value['PrimaryDataset'], detail=True, dataset_access_type='*')
                    processedName = value['AcquisitionEra'] + '-' + value['ProcessingString'] + "-v\\d+"
                    # see if any of the dataset names is a match
                    for ds in datasets:
                        if re.match(processedName, ds['processed_ds_name']):
                            print "Existing dset:", ds['dataset'], "(%s)" % ds['dataset_access_type']
                            maxv = max(maxv, ds['processing_version'])
                            exist = True
                        else:
                             pass
                    i += 1
            # suggest max version
            if exist and procversion <= maxv:
                print "Some output datasets exist, its advised to assign with v ==", maxv + 1
                sys.exit(0)

    # If the --test argument was provided, then just print the information
    # gathered so far and abort the assignment
        if options.test:
            print "%s \tEra: %s \tProcStr: %s \tProcVer: %s" % (wf.name, era, procstring, procversion)
            print "LFN: %s \tTeam: %s \tSite: %s" % (lfn, team, sites)
            print "Taskchain? " + str(taskchain)
            print "Activity:" + activity
            sys.exit(0)
        
        # Really assigning the workflow now
        print wf.name, '\tEra:', era, '\tProcStr:', procstring, '\tProcVer:', procversion, '\tTeam:', team, '\tSite:', sites
        assignRequest(url, wf.name, team, sites, era, procversion, activity, lfn, procstring, trust_site, options.replica, options.verbose, taskchain)
    
    sys.exit(0)
def getDatasetStatus(dataset):
    "Return dataset status"
    dbsapi = DbsApi(url=DBS3, verifypeer=False)
    reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*', detail=True)
    return reply[0]['dataset_access_type']
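A minimal standalone sketch (not one of the collected snippets) of the processing-version check performed in the assignment script earlier in this example: query DBS for datasets that share the acquisition era and primary dataset, match the processed dataset name against an "<era>-<procstring>-vN" pattern, and suggest the next free version. The dataset parameters in the commented call are hypothetical.

import re
from dbs.apis.dbsClient import DbsApi

dbs3_url = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'

def suggest_processing_version(acq_era, primary_ds, proc_string):
    """Return 1 + the highest processing version already in DBS, or 1 if none exist."""
    dbsapi = DbsApi(url=dbs3_url)
    existing = dbsapi.listDatasets(acquisition_era_name=acq_era,
                                   primary_ds_name=primary_ds,
                                   detail=True, dataset_access_type='*')
    pattern = acq_era + '-' + proc_string + "-v\\d+"
    maxv = 0
    for ds in existing:
        if re.match(pattern, ds['processed_ds_name']):
            maxv = max(maxv, ds['processing_version'])
    return maxv + 1

# hypothetical parameters, for illustration only:
# suggest_processing_version('RunIISummer16', 'TTJets_TuneCUETP8M1_13TeV', 'PUMoriond17_80X')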
Exemple #48
0
def main():

	usage="%prog <options>"

	parser = OptionParser(usage=usage)
	parser.add_option("-u", "--url", dest="url", help="DBS Instance url. default is https://cmsweb.cern.ch/dbs/prod/global/DBSReader", metavar="<url>")
	parser.add_option("-l", "--length", dest="length", help="Number of days for calculate the accumated events. It is Optional, default is 30 days.", metavar="<length>")
	parser.add_option("-d", "--dataset", dest="dataset", help="The dataset name for cacluate the events. Can be optional if datatier is used.", metavar="<dataset>")
	parser.add_option("-t", "--datatier", dest="datatier", help="The datatier name for cacluate the events. Can be optional if dataset is used. In this version datatier is not supported yet.", metavar="<data_tier_name>")
	parser.add_option("-a", "--access_type", dest="ds_access_type", help="Dataset access types: VALID, PRODUCTION or ALL(VALID+PRODUCTION). Default is ALL", metavar="<dataset_access_type>")
	parser.set_defaults(url="https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
	parser.set_defaults(length=30)
	parser.set_defaults(ds_access_type="ALL")

	(opts, args) = parser.parse_args()
	if not (opts.dataset or opts.datatier):
		parser.print_help()
		parser.error('either --dataset or --datatier is required')

	dataset	 = opts.dataset
	#seconds per day    
	sdays = 86400
	lenth = int(opts.length)
	now = time.time()
	#now = 1391353032
	then = now - sdays*lenth
	url = opts.url
	api=DbsApi(url=url)
	outputDataSets = []
    
	f = [0 for x in range(lenth)]
	min_cdate = int(then)
	max_cdate = int(now)
	if (opts.ds_access_type == "ALL"):
		outputDataSetsValid = api.listDatasets(dataset=dataset, min_cdate=min_cdate-30*sdays, 
                          max_cdate=max_cdate, dataset_access_type="VALID")
		outputDataSetsProd = api.listDatasets(dataset=dataset, min_cdate=min_cdate-30*sdays,
                          max_cdate=max_cdate, dataset_access_type="PRODUCTION")
		outputDataSets = outputDataSetsValid + outputDataSetsProd
	elif (opts.ds_access_type == "VALID"):
		outputDataSets = api.listDatasets(dataset=dataset, min_cdate=min_cdate-30*sdays,
                          max_cdate=max_cdate, dataset_access_type="VALID")
	elif (opts.ds_access_type == "PRODUCTION"):
		outputDataSets = api.listDatasets(dataset=dataset, min_cdate=min_cdate-30*sdays,
                          max_cdate=max_cdate, dataset_access_type="PRODUCTION")
	for dataset in outputDataSets:
		outputBlocks = api.listBlocks(dataset=dataset["dataset"], detail=1, min_cdate=min_cdate, max_cdate=max_cdate)
		blockList = []
		blockCdate = {}
		for block in outputBlocks:
			blockList.append(block["block_name"])
			blockCdate[block["block_name"]] = block["creation_date"]
		blockSum = []
		if blockList: 
			blockSum = api.listBlockSummaries(block_name=blockList, detail=1)
		for b in blockSum:
			cdate= blockCdate[b["block_name"]]
			day = int((now-cdate)/sdays)
			f[day] = f[day] + b["num_event"] 
	for i in range(lenth):
		#print (lenth-1)-i, ":  ", f[i], "  ", sum(item['all'] for item in f[i:lenth]) 
		print i, ": ", f[(lenth-1)-i], " ", sum(item for item in f[(lenth-1)-i:lenth])
	sys.exit(0);
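To make the day-binning in the loop above concrete: each block is assigned to a bucket by integer-dividing its age by the number of seconds per day, so a block created 2.4 days ago lands in bucket 2. A self-contained sketch with invented creation dates and event counts:

import time

SECONDS_PER_DAY = 86400
length = 30
now = time.time()

# hypothetical (creation_date, num_event) pairs, for illustration only
blocks = [(now - 0.5 * SECONDS_PER_DAY, 1000),    # half a day old -> bucket 0
          (now - 2.4 * SECONDS_PER_DAY, 2500),    # 2.4 days old   -> bucket 2
          (now - 29.9 * SECONDS_PER_DAY, 400)]    # ~30 days old   -> bucket 29

events_per_day = [0] * length
for cdate, num_event in blocks:
    day = int((now - cdate) / SECONDS_PER_DAY)
    events_per_day[day] += num_event

print events_per_day[0], events_per_day[2], events_per_day[29]   # 1000 2500 400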
def main():
    url='cmsweb.cern.ch'	
    ### Example: python assignWorkflow.py -w amaltaro_RVZTT_120404_163607_6269 -t testbed-relval -s T1_US_FNAL -e CMSSW_6_0_0_pre1_FS_TEST_WMA -p v1 -a relval -l /store/backfill/1
    parser = optparse.OptionParser()
    parser.add_option('-w', '--workflow', help='Workflow Name',dest='workflow')
    parser.add_option('-t', '--team', help='Type of Requests',dest='team')
    parser.add_option('-s', '--site', help='Site',dest='site')
    parser.add_option('-p', '--procversion', help='Processing Version',dest='procversion')
    parser.add_option('-a', '--activity', help='Dashboard Activity',dest='activity')
    parser.add_option('-l', '--lfn', help='Merged LFN base',dest='lfn')

    parser.add_option('--special', help='Use it for special workflows. You also have to change the code according to the type of WF',dest='special')
    parser.add_option('--test',action="store_true", help='Nothing is injected, only print information about the workflow and AcqEra',dest='test')
    parser.add_option('--pu',action="store_true", help='Use it to inject PileUp workflows only',dest='pu')
    (options,args) = parser.parse_args()

    if not options.workflow:
        print "The workflow name is mandatory!"
        print "Usage: python assignProdTaskChain.py -w <requestName>"
        sys.exit(0);

    workflow=options.workflow
    team='production'
    site=["T1_DE_KIT",
            "T1_ES_PIC",
            "T1_FR_CCIN2P3",
            "T1_IT_CNAF",
            "T1_RU_JINR",
            "T1_UK_RAL",
            "T1_US_FNAL",
            "T2_CH_CERN",
            "T2_DE_DESY",
            "T2_DE_RWTH",
            #"T2_ES_CIEMAT",
            "T2_FR_IPHC",
            "T2_IT_Bari",
            "T2_IT_Legnaro",
            "T2_IT_Pisa",
            #"T2_IT_Rome",
            "T2_UK_London_Brunel",
            "T2_UK_London_IC",
            "T2_US_Caltech",
            "T2_US_Florida",
            "T2_US_MIT",
            "T2_US_Nebraska",
            "T2_US_Purdue",
            "T2_US_UCSD",
            "T2_US_Wisconsin"
    ]
    procversion=1
    activity='production'
    lfn='/store/mc'
    acqera = {}
    procstring = {}
    specialStr = ''

    ### Getting the original dictionary
    schema = getRequestDict(url,workflow)

    # Setting the AcqEra and ProcStr values per Task 
    for key, value in schema.items():
        if type(value) is dict and key.startswith("Task"):
            try:
                procstring[value['TaskName']] = value['ProcessingString'].replace("-","_")
                acqera[value['TaskName']] = value['AcquisitionEra']
            except KeyError:
                print "This request has no AcquisitionEra or ProcessingString defined into the Tasks, aborting..."
                sys.exit(1)

    # Adding the special string - in case it was provided in the command line 
    if options.special:
        #specialStr = '_03Jan2013'
        specialStr = '_'+str(options.special)
        for key,value in procstring.items():
            procstring[key] = value+specialStr

    # Handling the parameters given in the command line
    if options.team:
        team=options.team
    if options.site:
        site=options.site
    if options.procversion:
        procversion=int(options.procversion)
    if options.activity:
        activity=options.activity
    if options.lfn:
        lfn=options.lfn

    
    #TODO check output dataset existence, and abort if they already do!
    datasets = schema["OutputDatasets"]
    i = 0
    if 'ACDC' not in options.workflow:
        exist = False
        maxv = 1
        for key, value in schema.items():
            if type(value) is dict and key.startswith("Task"):
                dbsapi = DbsApi(url=dbs3_url)

                #list all datasets with same name but different version numbers
                datasets = dbsapi.listDatasets(acquisition_era_name=value['AcquisitionEra'],
                             primary_ds_name= value['PrimaryDataset'], detail=True, dataset_access_type='*')
                processedName = value['AcquisitionEra']+'-'+value['ProcessingString']+"-v\\d+"
                #see if any of the dataset names is a match
                for ds in datasets:
                    if re.match(processedName,ds['processed_ds_name']):
                        print "Existing dset:", ds['dataset'], "(%s)"%ds['dataset_access_type']
                        maxv = max(maxv, ds['processing_version'])
                        exist = True
                    else:
                        pass
                i += 1
        #suggest max version
        if exist and procversion <= maxv:
            print "Some output datasets exist, its advised to assign with v ==", maxv + 1
            sys.exit(0)

    # If the --test argument was provided, then just print the information gathered so far and abort the assignment
    if options.test:
        print "%s \tAcqEra: %s \tProcStr: %s \tProcVer: %s" % (workflow, acqera, procstring, procversion)
        #print workflow, '\tAcqEra:', acqera, '\tProcStr:', procstring, '\tProcVer:', procversion
        print "LFN: %s \tTeam: %s \tSite: %s" % (lfn, team, site)
        #print '\tTeam:',team,  '\tSite:', site
        sys.exit(0);

    # Really assigning the workflow now
    print workflow, '\tAcqEra:', acqera, '\tProcStr:', procstring, '\tProcVer:', procversion, '\tTeam:',team, '\tSite:', site
    assignRequest(url,workflow,team,site,acqera,procstring,procversion,activity,lfn)
    sys.exit(0);
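A toy illustration (with invented field values) of the per-task AcqEra/ProcStr extraction done above: walk the request schema, pick every "Task*" sub-dictionary, and build the two dictionaries keyed by TaskName.

schema = {
    "RequestType": "TaskChain",
    "Task1": {"TaskName": "GENSIM",
              "AcquisitionEra": "RunIIFall17",
              "ProcessingString": "GS-step"},
    "Task2": {"TaskName": "DIGIRECO",
              "AcquisitionEra": "RunIIFall17",
              "ProcessingString": "DR-step"},
}

acqera, procstring = {}, {}
for key, value in schema.items():
    if type(value) is dict and key.startswith("Task"):
        procstring[value['TaskName']] = value['ProcessingString'].replace("-", "_")
        acqera[value['TaskName']] = value['AcquisitionEra']

print acqera       # {'GENSIM': 'RunIIFall17', 'DIGIRECO': 'RunIIFall17'} (key order may vary)
print procstring   # {'GENSIM': 'GS_step', 'DIGIRECO': 'DR_step'} (key order may vary)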
import json
import optparse
from dbs.apis.dbsClient import DbsApi

#fname="delete_this_1000.txt" #relval files
fname="relval_eos_file_names.txt" #other files
#fname="delete_this.txt" #other filse

inputfile = open(fname,'r')

dbsApi = DbsApi(url = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
#dbsApi = DbsApi(url = 'https://cmsweb-testbed.cern.ch/dbs/int/global/DBSReader/')

dsets_fraction_lost = {}

access_type = "VALID"

for line in inputfile:
    lfn= line.rstrip('\n')

    if dbsApi.listFiles(logical_file_name = lfn) == []:
        print lfn
        continue

    # NOTE: this unconditional continue short-circuits the rest of the loop body,
    # so the dataset-access-type checks below are never reached as written.
    continue

    #print "lfn = "+lfn
    if dbsApi.listDatasets(logical_file_name = lfn, dataset_access_type = "VALID") ==  [] and dbsApi.listDatasets(logical_file_name = lfn, dataset_access_type = "PRODUCTION") ==  [] and dbsApi.listDatasets(logical_file_name = lfn, dataset_access_type = "INVALID") ==  [] and dbsApi.listDatasets(logical_file_name = lfn, dataset_access_type = "DELETED") ==  [] and dbsApi.listDatasets(logical_file_name = lfn, dataset_access_type = "DEPRECATED") ==  []:
        print lfn
        continue

Exemple #51
0
#!/usr/bin/env python
# https://twiki.cern.ch/twiki/bin/view/CMS/DBS3APIInstructions
import  sys
import os
from dbs.apis.dbsClient import DbsApi

if(len(sys.argv) < 2):
    print "Give me an arg!"
    sys.exit()

# dataset='/DYJetsToLL_M-50_Zpt-150toInf_TuneCUETP8M1_13TeV-madgraphMLM-pythia8/RunIISpring15DR74-Asympt25ns_MCRUN2_74_V9-v1/MINIAODSIM'
dataset = sys.argv[-1]
evt_per_job = 30000
url="https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
api=DbsApi(url=url)
output = api.listDatasets(dataset=dataset)

if (output == []): 
  os.system("./FindLumisPerJob.sh " +  dataset)
  sys.exit()

if(len(output)==1):
    inp=output[0]['dataset']
    info = api.listFileSummaries(dataset=inp)[0]

    nevents = info['num_event']
    nlumis = info['num_lumi']
    evt_per_lumi = 1.0*nevents/nlumis
    lumi_per_job = evt_per_job/evt_per_lumi

    dump = api.listFiles(dataset=dataset, detail=1, validFileOnly=1)
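A worked example of the lumis-per-job arithmetic above, with invented numbers: a dataset with 6,000,000 events in 2,000 lumi sections averages 3,000 events per lumi, so a 30,000-event job should read about 10 lumi sections.

evt_per_job = 30000
nevents = 6000000                           # hypothetical
nlumis = 2000                               # hypothetical
evt_per_lumi = 1.0 * nevents / nlumis       # 3000.0
lumi_per_job = evt_per_job / evt_per_lumi   # 10.0
print lumi_per_job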
class RequestQuery:

    def __init__(self,config):
        self.br=Browser()

        self.config = config
        
        # Initialise connections
        self.mySiteDB = SiteDBJSON()
        self.phedex = PhEDEx({"endpoint":"https://cmsweb.cern.ch/phedex/datasvc/json/prod/"}, "json")
        self.dbsPhys01 = DbsApi(url = dbs_base_url+"phys01/DBSReader/")
        self.dbsPhys02 = DbsApi(url = dbs_base_url+"phys02/DBSReader/")
        self.dbsPhys03 = DbsApi(url = dbs_base_url+"phys03/DBSReader/")
        
    def __del__(self):
        self.br.close()

    def login2Savannah(self):
        """
        login2Savannah logs into Savannah with the given parameters in the config (username and password).
        The user must have admin privileges for StoreResults requests
        """
        login_page='https://savannah.cern.ch/account/login.php?uri=%2F'
        savannah_page='https://savannah.cern.ch/task/?group=cms-storeresults'
        
        self.br.open(login_page)

        ## 'Search' form is form 0
        ## login form is form 1
        self.br.select_form(nr=1)

        username = self.config["SavannahUser"]
    
        self.br['form_loginname']=username
        self.br['form_pw']=self.config["SavannahPasswd"]
        
        self.br.submit()
        
        response = self.br.open(savannah_page)
        
        # Check to see if login was successful
        if not re.search('Logged in as ' + username, response.read()):
            print('login unsuccessful, please check your username and password')
            return False
        else:
            return True
    
    def selectQueryForm(self,**kargs):       
        """
        selectQueryForm creates the browser view to get all the StoreResults tickets from Savannah
        """
        if self.isLoggedIn:
            self.br.select_form(name="bug_form")

            ## Use right query form labelled Test
            control = self.br.find_control("report_id",type="select")

            for item in control.items:
                if item.attrs['label'] == "Test":
                    control.value = [item.attrs['value']]
                    
            ##select number of entries displayed per page
            control = self.br.find_control("chunksz",type="text")
            control.value = "150"

            ##check additional searching parameter
            for arg in kargs:
                if arg == "approval_status":
                    control = self.br.find_control("resolution_id",type="select")
                    for item in control.items:
                        if item.attrs['label'] == kargs[arg].strip():
                            control.value = [item.attrs['value']]

                elif arg == "task_status":
                    control = self.br.find_control("status_id",type="select")
                    for item in control.items:
                        if item.attrs['label'] == kargs[arg].strip():
                            control.value = [item.attrs['value']]
                            
                elif arg == "team":
                    control = self.br.find_control("custom_sb5",type="select")
                    for item in control.items:
                        if item.attrs['label'] == kargs[arg].strip():
                            control.value = [item.attrs['value']]

            response = self.br.submit()
            response.read()

        return

    def getScramArchByCMSSW(self):
        """
        From the list of available CMSSW releases,
        return a dictionary of ScramArch values keyed by CMSSW release
        """
        
        # Set up a temporary connection to the server and get the response from cmstags
        url = 'https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML'
        br = Browser()
        br.set_handle_robots(False)
        response=br.open(url)
        soup = BeautifulSoup(response.read())
        
        # Dictionary form
        # {'CMSSW_X_X_X':[slc5_amd64_gcc472], ... }
        archByCmssw={}
        
        # Fill the dictionary
        for arch in soup.find_all('architecture'): 
            for cmssw in arch.find_all('project'): 
                # CMSSW release
                cmsswLabel = cmssw.get('label').encode('ascii', 'ignore')
                if cmsswLabel not in archByCmssw:
                    archByCmssw[cmsswLabel]=[]
                # ScramArch related to this CMSSW release
                archName = arch.get('name').encode('ascii', 'ignore')
                archByCmssw[cmsswLabel].append(archName)
        
        return archByCmssw
      
    def createValueDicts(self):       
        """
        Init dictionaries by value/label:
        - Releases by Value
        - Physics group by value
        - DBS url by value
        - DBS url by label
        - Status of savannah request by value 
        - Status of savannah ticket by value (Open/Closed/Any)
        """      
        if self.isLoggedIn:
            self.br.select_form(name="bug_form")
            
            control = self.br.find_control("custom_sb2",type="select")
            self.ReleaseByValueDict = self.getLabelByValueDict(control)

            control = self.br.find_control("custom_sb3",type="select")
            self.GroupByValueDict = self.getLabelByValueDict(control)

            control = self.br.find_control("custom_sb4",type="select")
            self.DBSByValueDict = self.getLabelByValueDict(control)
            self.DBSByLabelDict = self.getValueByLabelDict(control)

            control = self.br.find_control("resolution_id",type="select")
            self.StatusByValueDict = self.getLabelByValueDict(control)

            control = self.br.find_control("status_id",type="select")
            self.TicketStatusByLabelDict = self.getValueByLabelDict(control)

        return
    
    def getDatasetOriginSites(self, dbs_url, data):
        """
        Get the origin sites for each block of the dataset.
        Return a list of block origin sites.
        """
        
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listBlocks(detail=True,dataset=data)

        pnnList = set()
        for block in response:
            pnnList.add(block['origin_site_name'])
        psnList = self.mySiteDB.PNNstoPSNs(pnnList)
        
        return psnList, list(pnnList)

    def phEDExNodetocmsName(self, nodeList):
        """
        Convert a PhEDEx node name list to a CMS site names list
        """
        names = []
        for node in nodeList:
            name = node.replace('_MSS',
                                '').replace('_Disk',
                                    '').replace('_Buffer',
                                        '').replace('_Export', '')
            if name not in names:
                names.append(name)
        return names
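        # Illustrative mapping (node names are hypothetical): the suffix stripping
        # above turns ['T1_US_FNAL_Disk', 'T1_US_FNAL_MSS', 'T2_CH_CERN'] into the
        # de-duplicated CMS site list ['T1_US_FNAL', 'T2_CH_CERN'].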
    
    def setGlobalTagFromOrigin(self, dbs_url,input_dataset):
        """
        Get the global tag of the dataset from the source dbs url. If it is not set, then set global tag to 'UNKNOWN'
        """
        
        globalTag = ""
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listOutputConfigs(dataset=input_dataset)
        
        globalTag = response[0]['global_tag']
        # GlobalTag cannot be empty
        if globalTag == '':
            globalTag = 'UNKNOWN'
            
        return globalTag
    
    def isDataAtUrl(self, dbs_url,input_dataset):
        """
        Returns True if the dataset is at the dbs url, if not returns False
        """
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listDatasets(dataset=input_dataset)
        # This means that the dataset is not at the url
        if not response:
            return False
        else:
            return True
         
    def getLabelByValueDict(self, control):
        """
        From control items, create a dictionary by values
        """   
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[value] = label
                
        return d
    
    def getValueByLabelDict(self, control):
        """
        From control items, create a dictionary by labels
        """
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[label] = value

        return d
    
    def getRequests(self,**kargs):
        """
        getRequests goes through all the Savannah requests and creates JSON files if the
        ticket is not Closed and the status of the item is Done.
        It also reports back a summary of the requests in Savannah
        """
        requests = []
        
        # Open Browser and login into Savannah
        self.br=Browser()
        self.isLoggedIn = self.login2Savannah()
        
        if self.isLoggedIn:
            if not kargs:
                self.selectQueryForm(approval_status='1',task_status='0')
            else:
                self.selectQueryForm(**kargs)
            self.createValueDicts()
        
            self.br.select_form(name="bug_form")
            response = self.br.submit()

            html_ouput = response.read()
            
            scramArchByCMSSW = self.getScramArchByCMSSW()
            self.nodeMappings = self.phedex.getNodeMap()
            
            for link in self.br.links(text_regex="#[0-9]+"):
                response = self.br.follow_link(link)
                
                try:
                    ## Get Information
                    self.br.select_form(name="item_form")

                    ## remove leading &nbsp and # from task
                    task = link.text.replace('#','').decode('utf-8').strip()
                    print("Processing ticket: %s" % task)
                    
                    ## Get input dataset name
                    control = self.br.find_control("custom_tf1",type="text")
                    input_dataset = control.value
                    input_primary_dataset = input_dataset.split('/')[1].replace(' ','')
                    input_processed_dataset = input_dataset.split('/')[2].replace(' ','')
                    data_tier = input_dataset.split('/')[3].replace(' ','')
                    
                    ## Get DBS URL by Drop Down
                    control = self.br.find_control("custom_sb4",type="select")
                    dbs_url = self.DBSByValueDict[control.value[0]]

                    ## Get DBS URL by text field (for old entries)
                    if dbs_url=='None':
                        control = self.br.find_control("custom_tf4",type="text")
                        dbs_url = control.value.replace(' ','')
                    else: # Transform input value to a valid DBS url
                        #dbs_url = "https://cmsweb.cern.ch/dbs/prod/"+dbs_url+"/DBSReader"
                        dbs_url = dbs_base_url+dbs_url+"/DBSReader"
                        
                    ## Get Release
                    control = self.br.find_control("custom_sb2",type="select")
                    release_id = control.value
                    
                    ## Get current request status
                    control = self.br.find_control("status_id",type="select")
                    request_status_id = control.value
                    RequestStatusByValueDict = self.getLabelByValueDict(control)
                    
                    # close the request if deprecated release was used
                    try:
                        release = self.ReleaseByValueDict[release_id[0]]
                    except:
                        if len(self.ReleaseByValueDict)>0 and RequestStatusByValueDict[request_status_id[0]] != "Closed":
                            msg = "Your request is not valid anymore, since the given CMSSW release is deprecated. If your request should be still processed, please reopen the request and update the CMSSW release to a more recent *working* release.\n"
                            msg+= "\n"
                            msg+= "Thanks,\n"
                            msg+= "Your StoreResults team"
                            self.closeRequest(task,msg)
                            self.br.back()
                            print("I tried to Close ticket %s due to CMSSW not valid" % task)
                            continue
                    
                    # close the request if release has not ScramArch match
                    if release not in scramArchByCMSSW:
                        if len(self.ReleaseByValueDict)>0 and RequestStatusByValueDict[request_status_id[0]] != "Closed":
                            msg = "Your request is not valid, there is no ScramArch match for the given CMSSW release.\n"
                            msg+= "If your request should be still processed, please reopen the request and update the CMSSW release according to: https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML \n"
                            msg+= "\n"
                            msg+= "Thanks,\n"
                            msg+= "Your StoreResults team"
                            self.closeRequest(task,msg)
                            self.br.back()
                            print("I tried to Close ticket %s due to ScramArch mismatch" % task)
                            continue
                    else: 
                        index=len(scramArchByCMSSW[release])
                        scram_arch = scramArchByCMSSW[release][index-1]

                    # close the request if dataset is not at dbs url
                    try:
                        data_at_url = self.isDataAtUrl(dbs_url,input_dataset)
                    except:
                        print('I got an error trying to look for dataset %s at %s, please look at this ticket: %s' %(input_dataset,dbs_url,task))
                        continue
                    if not data_at_url:
                        msg = "Your request is not valid, I could not find the given dataset at %s\n" % dbs_url
                        msg+= "If your request should be still processed, please reopen the request and change DBS url properly \n"
                        msg+= "\n"
                        msg+= "Thanks,\n"
                        msg+= "Your StoreResults team"
                        self.closeRequest(task,msg)
                        self.br.back()
                        print("I tried to Close ticket %s, dataset is not at DBS url" % task)
                        continue
                        
                    # Avoid not approved Tickets
                    #if not RequestStatusByValueDict[request_status_id[0]] == "Done":
                    #    continue

                    ## Get Physics Group
                    control = self.br.find_control("custom_sb3",type="select")
                    group_id = control.value[0]
                    group_squad = 'cms-storeresults-'+self.GroupByValueDict[group_id].replace("-","_").lower()

                    ## Get Dataset Version
                    control = self.br.find_control("custom_tf3",type="text")
                    dataset_version = control.value.replace(' ','')
                    if dataset_version == "": dataset_version = '1'
                                        
                    ## Get current status
                    control = self.br.find_control("resolution_id",type="select")
                    status_id = control.value

                    ## Get assigned to
                    control = self.br.find_control("assigned_to",type="select")
                    AssignedToByValueDict = self.getLabelByValueDict(control)
                    assignedTo_id = control.value

                    ##Assign task to the physics group squad
                    if AssignedToByValueDict[assignedTo_id[0]]!=group_squad:
                        assignedTo_id = [self.getValueByLabelDict(control)[group_squad]]
                        control.value = assignedTo_id
                        self.br.submit()

                    # Set default Acquisition Era for StoreResults
                    acquisitionEra = "StoreResults"

                    ## Construction of the new dataset name (ProcessingString)
                    ## remove leading hypernews or physics group name and StoreResults+Version
                    if input_processed_dataset.find(self.GroupByValueDict[group_id])==0:
                        new_dataset = input_processed_dataset.replace(self.GroupByValueDict[group_id],"",1)
                    else:
                        stripped_dataset = input_processed_dataset.split("-")[1:]
                        new_dataset = '_'.join(stripped_dataset)
                    
                except Exception as ex:
                    self.br.back()
                    print("There is a problem with this ticket %s, please have a look to the error:" % task)
                    print(str(ex))
                    print(traceback.format_exc())
                    continue
                
                self.br.back()
                
                # Get dataset site info:
                psnList, pnnList = self.getDatasetOriginSites(dbs_url,input_dataset)
                
                infoDict = {}
                # Build store results json
                # First add all the defaults values
                infoDict["RequestType"] = "StoreResults"
                infoDict["UnmergedLFNBase"] = "/store/unmerged" 
                infoDict["MergedLFNBase"] = "/store/results/" + self.GroupByValueDict[group_id].replace("-","_").lower()
                infoDict["MinMergeSize"] = 1500000000
                infoDict["MaxMergeSize"] = 5000000000
                infoDict["MaxMergeEvents"] = 100000
                infoDict["TimePerEvent"] = 40
                infoDict["SizePerEvent"] = 512.0
                infoDict["Memory"] = 2394
                infoDict["CmsPath"] = "/uscmst1/prod/sw/cms"                                        
                infoDict["Group"] = "DATAOPS"
                infoDict["DbsUrl"] = dbs_url
                
                # Add all the information pulled from Savannah
                infoDict["AcquisitionEra"] = acquisitionEra
                infoDict["GlobalTag"] = self.setGlobalTagFromOrigin(dbs_url,input_dataset)
                infoDict["DataTier"] = data_tier
                infoDict["InputDataset"] = input_dataset
                infoDict["ProcessingString"] = new_dataset
                infoDict["CMSSWVersion"] = release
                infoDict["ScramArch"] = scram_arch
                infoDict["ProcessingVersion"] = dataset_version                    
                infoDict["SiteWhitelist"] = psnList
                
                # Create report for Migration2Global
                report = {}
                 
                #Fill json file, if status is done
                if self.StatusByValueDict[status_id[0]]=='Done' and RequestStatusByValueDict[request_status_id[0]] != "Closed":
                    self.writeJSONFile(task, infoDict)
                    report["json"] = 'y'
                else:
                    report["json"] = 'n'
                    
                report["task"] = int(task)
                report["InputDataset"] = input_dataset
                report["ProcessingString"] = new_dataset
                report["ticketStatus"] = self.StatusByValueDict[status_id[0]]
                report["assignedTo"] = AssignedToByValueDict[assignedTo_id[0]]
                report["localUrl"] = dbs_url
                report["sites"] = psnList
                report["pnns"] = pnnList

                # if the request is closed, change the item status to report to Closed
                if report["ticketStatus"] == "Done" and RequestStatusByValueDict[request_status_id[0]] == "Closed":
                    report["ticketStatus"] = "Closed"

                requests.append(report)
                    
            # Print out report
            self.printReport(requests)
        # Close connections
        self.br.close()
        
        return requests

    def closeRequest(self,task,msg):
        """
        This closes a specific Savannah ticket
        and inserts a message in the ticket
        """
        if self.isLoggedIn:
            #self.createValueDicts()
            
            response = self.br.open('https://savannah.cern.ch/task/?'+str(task))

            html = response.read()

            self.br.select_form(name="item_form")

            control = self.br.find_control("status_id",type="select")
            control.value = [self.TicketStatusByLabelDict["Closed"]]

            #Put reason to the comment field
            control = self.br.find_control("comment",type="textarea")
            control.value = msg
                        
            #DBS Drop Down is a mandatory field, if set to None (for old requests), it is not possible to close the request
            self.setDBSDropDown()
                        
            self.br.submit()

            #remove JSON ticket
            self.removeJSONFile(task)
            
            self.br.back()
        return

    def setDBSDropDown(self):
        ## Get DBS URL by Drop Down
        control = self.br.find_control("custom_sb4",type="select")
        dbs_url = self.DBSByValueDict[control.value[0]]

        ## Get DBS URL by text field (for old entries)
        if dbs_url=='None':
            tmp = self.br.find_control("custom_tf4",type="text")
            dbs_url = tmp.value.replace(' ','')

            if dbs_url.find("phys01")!=-1:
                control.value = [self.DBSByLabelDict["phys01"]]
            elif dbs_url.find("phys02")!=-1:
                control.value = [self.DBSByLabelDict["phys02"]]
            elif dbs_url.find("phys03")!=-1:
                control.value = [self.DBSByLabelDict["phys03"]]
            else:
                msg = 'DBS URL of the old request is neither phys01, phys02 nor phys03. Please, check!'
                print(msg)
                raise RuntimeError(msg)

        return

    def writeJSONFile(self, task, infoDict):
        """
        This writes a JSON file at ComponentDir
        """
        ##check if file already exists
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'
        if not os.access(filename,os.F_OK):
            jsonfile = open(filename,'w')
            request = {'createRequest':infoDict} ## CHECK THIS BEFORE FINISHING
            jsonfile.write(json.dumps(request,sort_keys=True, indent=4))
            jsonfile.close()

        return

    def removeJSONFile(self,task):
        """
        This removes the JSON file at ComponentDir if it was created
        """
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'

        if os.access(filename,os.F_OK):
            os.remove(filename)

        return

    def printReport(self, requests):
        """
        Print out a report
        """
        print("%20s %10s %5s %35s %10s %50s %50s" %( 'Savannah Ticket','Status','json','Assigned to','local DBS','Sites','pnns')) 
        print("%20s %10s %5s %35s %10s %50s %50s" %( '-'*20,'-'*10,'-'*5,'-'*35,'-'*10,'-'*50,'-'*50 ))
        
        for report in requests:
            
            json = report["json"]
            ticket = report["task"]
            status = report["ticketStatus"]
            assigned = report["assignedTo"]
            localUrl = report["localUrl"].split('/')[5]
            site = ', '.join(report["sites"])
            pnns = ', '.join(report["pnns"])
            print("%20s %10s %5s %35s %10s %50s %50s" %(ticket,status,json,assigned,localUrl,site,pnns))  
print '\n DATASET: ' + dataset
f = dataset.split("/")
if len(f) > 3:
    tier = f[3]
    if 'MINIAOD' in tier:
        print ' MINIAOD* identified, consider extra T2_CH_CERN copy.'
        isMiniAod = True

# size of provided dataset
#-------------------------

# instantiate an API
dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')

# first test whether dataset is valid
dbsList = dbsapi.listDatasets(dataset = dataset, dataset_access_type = 'VALID')
datasetInvalid = False
if dbsList == []:
    datasetInvalid = True
    print ' Dataset does not exist or is invalid. Exit now!\n'
    sys.exit(1)

# determine size and number of files
size = str(sum([block['file_size'] for block in dbsapi.listBlockSummaries(dataset = dataset)]))+'UB'
sizeGb = convertSizeToGb(size)

# in case this is an open subscription we need to adjust sizeGb to the expected size
if expectedSizeGb > 0:
    sizeGb = expectedSizeGb

print ' SIZE:    %.1f GB'%(sizeGb)
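The convertSizeToGb helper is not included in this fragment. A plausible minimal stand-in, under the assumption that the size string is a number followed by a two-character unit suffix as built above ('UB' meaning plain bytes), could look like this; it is a sketch, not the original implementation.

def convertSizeToGb(sizeString):
    # assumed unit suffixes: UB = bytes, plus KB, MB, GB, TB
    units = {'UB': 1.0 / 10**9, 'KB': 1.0 / 10**6,
             'MB': 1.0 / 10**3, 'GB': 1.0, 'TB': 1.0 * 10**3}
    number, unit = sizeString[:-2], sizeString[-2:]
    return float(number) * units[unit]

print convertSizeToGb('2000000000UB')   # 2.0 GB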
Exemple #54
0
class DBS3Reader(object):
    """
    _DBSReader_

    General API for reading data from DBS
    """
    def __init__(self, url, logger=None, **contact):

        # instantiate dbs api object
        try:
            self.dbsURL = url.replace("cmsweb.cern.ch", "cmsweb-prod.cern.ch")
            self.dbs = DbsApi(self.dbsURL, **contact)
            self.logger = logger or logging.getLogger(self.__class__.__name__)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        currently only takes one lfn, but the dbs api needs to be updated
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName,
                                                   validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(
                        self.dbs.listFileLumiArray(logical_file_name=slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            if lumisItem.get('event_count', None) is not None:
                item['EventCount'] = lumisItem['event_count']
            lumiDict[lumisItem['logical_file_name']].append(item)
            # TODO: add key for lumi and event pair.
        return lumiDict
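        # Shape of the returned dict (the values shown are illustrative only):
        #   {'/store/.../file.root': [{'RunNumber': 1, 'LumiSectionNumber': 27,
        #                              'EventCount': 545}, ...], ...}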

    def checkDBSServer(self):
        """
        check whether dbs server is up and running
        returns {"dbs_instance": "prod/global", "dbs_version": "3.3.144"}
        """
        try:
            return self.dbs.serverinfo()
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBS server is not up: %s" % self.dbsURL
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listPrimaryDatasets(self, match='*'):
        """
        _listPrimaryDatasets_

        return a list of primary datasets. The full primary dataset name must be provided;
        pattern-based matching is no longer supported.
        If no expression is provided, all datasets are returned
        """
        try:
            result = self.dbs.listPrimaryDatasets(primary_ds_name=match)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % match
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['primary_ds_name'] for x in result]
        return result

    def matchProcessedDatasets(self, primary, tier, process):
        """
        _matchProcessedDatasets_

        return a list of Processed datasets
        """
        result = []
        try:
            datasets = self.dbs.listDatasets(primary_ds_name=primary,
                                             data_tier_name=tier,
                                             detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for dataset in datasets:
            dataset = remapDBS3Keys(dataset, processed_ds_name='Name')
            dataset['PathList'] = [dataset['dataset']]
            if dataset['Name'] == process:
                result.append(dataset)
        return result

    def listRuns(self, dataset=None, block=None):
        """
        It gets a list of DbsRun objects, but for our purpose
        only the list of run numbers is collected.
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        runs = []
        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        for x in results:
            runs.extend(x['run_num'])
        return runs

    def listRunLumis(self, dataset=None, block=None):
        """
        It gets a list of DBSRun objects and returns the number of lumisections per run
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        # Pointless code in python3
        block = decodeBytesToUnicode(block)
        dataset = decodeBytesToUnicode(dataset)

        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # send runDict format as result, this format is for sync with dbs2 call
        # which has {run_number: num_lumis} but dbs3 call doesn't return num Lumis
        # So it returns {run_number: None}
        # TODO: After DBS2 is completely removed, change the return format to a more sensible one

        runDict = {}
        for x in results:
            for runNumber in x["run_num"]:
                runDict[runNumber] = None
        return runDict

    def listProcessedDatasets(self, primary, dataTier='*'):
        """
        _listProcessedDatasets_

        return a list of Processed datasets for the primary and optional
        data tier value

        """
        try:
            result = self.dbs.listDatasets(primary_ds_name=primary,
                                           data_tier_name=dataTier)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['dataset'].split('/')[2] for x in result]
        return result

    def listDatasetFiles(self, datasetPath):
        """
        _listDatasetFiles_

        Get list of files for dataset

        """
        return [
            x['logical_file_name']
            for x in self.dbs.listFileArray(dataset=datasetPath)
        ]

    def listDatatiers(self):
        """
        _listDatatiers_

        Get a list of datatiers known by DBS.
        """
        return [tier['data_tier_name'] for tier in self.dbs.listDataTiers()]

    def listDatasetFileDetails(self,
                               datasetPath,
                               getParents=False,
                               getLumis=True,
                               validFileOnly=1):
        """
        TODO: This is completely wrong and needs to be redone or removed - getting the whole dataset
        at once might be too costly

        _listDatasetFileDetails_

        Get list of lumis, events, and parents for each file in a dataset
        Return a dict where the keys are the files, and for each file we have something like:
            { 'NumberOfEvents': 545,
              'BlockName': '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace',
              'Lumis': {173658: [8, 12, 9, 14, 19, 109, 105]},
              'Parents': [],
              'Checksum': '22218315',
              'Adler32': 'a41a1446',
              'FileSize': 286021145,
              'ValidFile': 1
            }

        """
        fileDetails = self.getFileListByDataset(dataset=datasetPath,
                                                validFileOnly=validFileOnly,
                                                detail=True)
        blocks = set()  # the set of blocks of the dataset
        # Iterate over the files and prepare the set of blocks and a dict where the keys are the files
        files = {}
        for f in fileDetails:
            blocks.add(f['block_name'])
            files[f['logical_file_name']] = remapDBS3Keys(f, stringify=True)
            files[f['logical_file_name']]['ValidFile'] = f['is_file_valid']
            files[f['logical_file_name']]['Lumis'] = {}
            files[f['logical_file_name']]['Parents'] = []

        # Iterate over the blocks and get parents and lumis
        for blockName in blocks:
            # get the parents
            if getParents:
                parents = self.dbs.listFileParents(block_name=blockName)
                for p in parents:
                    if p['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        files[p['logical_file_name']]['Parents'].extend(
                            p['parent_logical_file_name'])

            if getLumis:
                # get the lumis
                file_lumis = self.dbs.listFileLumis(block_name=blockName)
                for f in file_lumis:
                    if f['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        if f['run_num'] in files[
                                f['logical_file_name']]['Lumis']:
                            files[f['logical_file_name']]['Lumis'][
                                f['run_num']].extend(f['lumi_section_num'])
                        else:
                            files[f['logical_file_name']]['Lumis'][
                                f['run_num']] = f['lumi_section_num']

        return files

    def crossCheck(self, datasetPath, *lfns):
        """
        _crossCheck_

        For the dataset provided, check that the lfns listed all exist
        in the dataset.

        Return the list of lfns that are in the dataset

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath,
                                         validFileOnly=1,
                                         detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        return list(setOfAllLfns.intersection(setOfKnownLfns))

    def crossCheckMissing(self, datasetPath, *lfns):
        """
        _crossCheckMissing_

        As cross check, but return value is a list of files that
        are *not* known by DBS

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath,
                                         validFileOnly=1,
                                         detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        knownFiles = setOfAllLfns.intersection(setOfKnownLfns)
        unknownFiles = setOfKnownLfns.difference(knownFiles)
        return list(unknownFiles)

    def getDBSSummaryInfo(self, dataset=None, block=None):
        """
        Get dataset summary includes # of files, events, blocks and total size
        """
        if dataset:
            self.checkDatasetPath(dataset)
        try:
            if block:
                summary = self.dbs.listFileSummaries(block_name=block,
                                                     validFileOnly=1)
            else:
                summary = self.dbs.listFileSummaries(dataset=dataset,
                                                     validFileOnly=1)
        except Exception as ex:
            msg = "Error in DBSReader.getDBSSummaryInfo(%s, %s)\n" % (dataset,
                                                                      block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if not summary:  # missing data or all files invalid
            return {}

        result = remapDBS3Keys(summary[0], stringify=True)
        result['path'] = dataset if dataset else ''
        result['block'] = block if block else ''
        return result

    def listFileBlocks(self, dataset, onlyClosedBlocks=False, blockName=None):
        """
        _listFileBlocks_

        Retrieve a list of fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': False}
        if blockName:
            args['block_name'] = blockName
        if onlyClosedBlocks:
            args['detail'] = True
        try:
            blocks = self.dbs.listBlocks(**args)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if onlyClosedBlocks:
            result = [
                x['block_name'] for x in blocks
                if str(x['open_for_writing']) != "1"
            ]

        else:
            result = [x['block_name'] for x in blocks]

        return result

    def listOpenFileBlocks(self, dataset):
        """
        _listOpenFileBlocks_

        Retrieve a list of open fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        try:
            blocks = self.dbs.listBlocks(dataset=dataset, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [
            x['block_name'] for x in blocks
            if str(x['open_for_writing']) == "1"
        ]

        return result

    def blockExists(self, fileBlockName):
        """
        _blockExists_

        Check to see if block with name provided exists in the DBS
        Instance.

        Return True if exists, False if not

        """
        self.checkBlockName(fileBlockName)
        try:

            blocks = self.dbs.listBlocks(block_name=fileBlockName)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockExists(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if len(blocks) == 0:
            return False
        return True

    def listFilesInBlock(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlock_

        Get a list of files in the named fileblock
        TODO: lumis can be false when lumi splitting is not required
        However WMBSHelper expects file['LumiList'] to get the run number,
        so for now it will always be true.
        We need to clean the code up when dbs2 is completely deprecated;
        calling lumis for the run number is expensive.
        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            files = self.dbs.listFileArray(block_name=fileBlockName,
                                           validFileOnly=validFileOnly,
                                           detail=True)
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if lumis:
            lumiDict = self._getLumiList(blockName=fileBlockName,
                                         validFileOnly=validFileOnly)

        result = []
        for fileInfo in files:
            if lumis:
                fileInfo["LumiList"] = lumiDict[fileInfo['logical_file_name']]
            result.append(remapDBS3Keys(fileInfo, stringify=True))
        return result

    def listFilesInBlockWithParents(self,
                                    fileBlockName,
                                    lumis=True,
                                    validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of that file.
        TODO: lumis could be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] in order to get the run
        number, so for now it is always True.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only valid blocks for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis,
                                                validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName, )
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])

        parentsLFNs = list(childByParents)

        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slice parentsLFNs until the DBS API can handle long lists itself.
        # Remove the slicing once the DBS API handles it.
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(
                self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[
                fileInfo['logical_file_name']]

        return fileDetails

    def lfnsInBlock(self, fileBlockName):
        """
        _lfnsInBlock_

        LFN list only for block, details = False => faster query

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.lfnsInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            lfns = self.dbs.listFileArray(block_name=fileBlockName,
                                          validFileOnly=1,
                                          detail=False)
            return lfns
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listFileBlockLocation(self, fileBlockNames):
        """
        _listFileBlockLocation_

        Get the origin_site_name for one block, or for each block in a list

        """

        singleBlockName = None
        if isinstance(fileBlockNames, (str, bytes)):
            singleBlockName = fileBlockNames
            fileBlockNames = [fileBlockNames]

        for block in fileBlockNames:
            self.checkBlockName(block)

        locations = {}
        node_filter = set(['UNKNOWN', None])

        blocksInfo = {}
        try:
            for block in fileBlockNames:
                blocksInfo.setdefault(block, [])
                # there should be only one element with a single origin site string ...
                for blockInfo in self.dbs.listBlockOrigin(block_name=block):
                    blocksInfo[block].append(blockInfo['origin_site_name'])
        except dbsClientException as ex:
            msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for block in fileBlockNames:
            valid_nodes = set(blocksInfo.get(block, [])) - node_filter
            locations[block] = list(valid_nodes)

        # returning single list if a single block is passed
        if singleBlockName:
            return locations[singleBlockName]

        return locations
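
    # A commented-out sketch of the two calling conventions handled above (block
    # names and the site name are illustrative; `reader` stands for an instance of
    # this class):
    #
    #     reader.listFileBlockLocation('/Prim/Proc-v1/TIER#block-1')
    #         # -> ['T1_US_FNAL_Disk']            single name in, flat list out
    #     reader.listFileBlockLocation(['/Prim/Proc-v1/TIER#block-1',
    #                                   '/Prim/Proc-v1/TIER#block-2'])
    #         # -> {'/Prim/Proc-v1/TIER#block-1': [...], '/Prim/Proc-v1/TIER#block-2': [...]}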

    def getFileBlock(self, fileBlockName):
        """
        Retrieve the list of files in the block and a flag saying whether
        the block is still open; "PhEDExNodeNames" is kept only for backwards
        compatibility with the time block locations were resolved via PhEDEx.

        :return: a dictionary in the format of:
            {"PhEDExNodeNames" : [],
             "Files" : [fileInfoDict, ...],
             "IsOpen" : True|False}
        """
        result = {
            "PhEDExNodeNames": [],  # FIXME: we better get rid of this line!
            "Files": self.listFilesInBlock(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
        }
        return result
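
    # Commented-out sketch of the structure returned by getFileBlock (the block name
    # is hypothetical; `reader` stands for an instance of this class). "Files" holds
    # the per-file dicts from listFilesInBlock, and "PhEDExNodeNames" stays empty:
    #
    #     blockInfo = reader.getFileBlock('/Prim/Proc-v1/TIER#some-block-uuid')
    #     print(blockInfo['IsOpen'], len(blockInfo['Files']))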

    def getFileBlockWithParents(self, fileBlockName):
        """
        Retrieve the list of files in the block, each with its parent files
        attached, and a flag saying whether the block is still open;
        "PhEDExNodeNames" is kept only for backwards compatibility with the
        time block locations were resolved via PhEDEx.

        :return: a dictionary in the format of:
            {"PhEDExNodeNames" : [],
             "Files" : [fileInfoDict, ...],
             "IsOpen" : True|False}
        """
        fileBlockName = decodeBytesToUnicode(fileBlockName)

        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {
            "PhEDExNodeNames": [],  # FIXME: we better get rid of this line!
            "Files": self.listFilesInBlockWithParents(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
        }
        return result

    def listBlockParents(self, blockName):
        """
        Return a list of parent blocks for a given child block name
        """
        # FIXME: note the different returned data structure
        self.checkBlockName(blockName)
        blocks = self.dbs.listBlockParents(block_name=blockName)
        result = [block['parent_block_name'] for block in blocks]
        return result

    def blockIsOpen(self, blockName):
        """
        _blockIsOpen_

        Return True if the named block is open, False if it is not or if the
        block doesn't exist

        """
        self.checkBlockName(blockName)
        blockInstance = self.dbs.listBlocks(block_name=blockName, detail=True)
        if len(blockInstance) == 0:
            return False
        blockInstance = blockInstance[0]
        isOpen = blockInstance.get('open_for_writing', 1)
        if isOpen == 0:
            return False
        return True

    def blockToDatasetPath(self, blockName):
        """
        _blockToDatasetPath_

        Given a block name, get the dataset Path associated with that
        Block.

        Returns the dataset path, or None if not found

        """
        self.checkBlockName(blockName)
        try:
            blocks = self.dbs.listBlocks(block_name=blockName, detail=True)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockToDatasetPath(%s)\n" % blockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if blocks == []:
            return None

        pathname = blocks[-1].get('dataset', None)
        return pathname

    def listDatasetLocation(self, datasetName):
        """
        _listDatasetLocation_

        List the origin sites hosting at least one block of the given
        dataset.
        """
        self.checkDatasetPath(datasetName)

        locations = set()
        try:
            blocksInfo = self.dbs.listBlockOrigin(dataset=datasetName)
        except dbsClientException as ex:
            msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if not blocksInfo:  # no data location from dbs
            return list()

        for blockInfo in blocksInfo:
            locations.add(blockInfo['origin_site_name'])

        locations.difference_update(
            ['UNKNOWN', None])  # remove entry when SE name is 'UNKNOWN'

        return list(locations)

    def checkDatasetPath(self, pathName):
        """
         _checkDatasetPath_
        """
        if pathName in ("", None):
            raise DBSReaderError("Invalid Dataset Path name: => %s <=" %
                                 pathName)
        else:
            try:
                result = self.dbs.listDatasets(dataset=pathName,
                                               dataset_access_type='*')
                if len(result) == 0:
                    raise DBSReaderError("Dataset %s doesn't exist in DBS %s" %
                                         (pathName, self.dbsURL))
            except (dbsClientException, HTTPError) as ex:
                msg = "Error in "
                msg += "DBSReader.checkDatasetPath(%s)\n" % pathName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        return

    def checkBlockName(self, blockName):
        """
         _checkBlockName_
        """
        if blockName in ("", "*", None):
            raise DBSReaderError("Invalid Block name: => %s <=" % blockName)

    def getFileListByDataset(self, dataset, validFileOnly=1, detail=True):
        """
        _getFileListByDataset_

        Given a dataset, retrieve all its files with their block name, LFN and
        number of events (among other details).
        Returns a list of dicts.
        """

        try:
            fileList = self.dbs.listFileArray(dataset=dataset,
                                              validFileOnly=validFileOnly,
                                              detail=detail)
            return fileList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.getFileListByDataset(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listDatasetParents(self, childDataset):
        """
        List the parent dataset paths for the given child dataset
        """
        try:
            parentList = self.dbs.listDatasetParents(dataset=childDataset)
            return parentList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listDatasetParents(%s)\n" % childDataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    # def getListFilesByLumiAndDataset(self, dataset, files):
    #     "Unsing pycurl to get all the child parents pair for given dataset"
    #
    #     urls = ['%s/data/dbs/fileparentbylumis?block_name=%s' % (
    #              self.dbsURL, b["block_name"]) for b in self.dbs.listBlocks(dataset=dataset)]
    #
    #     data = multi_getdata(urls, ckey(), cert())
    #     rdict = {}
    #     for row in data:
    #         try:
    #             data = json.loads(row['data'])
    #             rdict[req] = data['result'][0]  # we get back {'result': [workflow]} dict
    #         except Exception as exp:
    #             print("ERROR: fail to load data as json record, error=%s" % str(exp))
    #             print(row)
    #     return rdict

    def getParentFilesGivenParentDataset(self, parentDataset, childLFNs):
        """
        Returns the parent files for the given child LFNs when DBS does not have
        a direct parent-child relationship stored in the database.
        Only use this for finding missing parents.

        :param parentDataset: parent dataset of the child LFNs
        :param childLFNs: a list of LFNs in the child dataset
        :return: dict mapping each child LFN to a set of parent LFNs
        """
        fInfo = self.dbs.listFileLumiArray(logical_file_name=childLFNs)
        parentFiles = defaultdict(set)
        for f in fInfo:
            pFileList = self.dbs.listFiles(dataset=parentDataset,
                                           run_num=f['run_num'],
                                           lumi_list=f['lumi_section_num'])
            pFiles = set([x['logical_file_name'] for x in pFileList])
            parentFiles[f['logical_file_name']] = parentFiles[
                f['logical_file_name']].union(pFiles)
        return parentFiles
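
    # Commented-out sketch (dataset and LFN are hypothetical; `reader` stands for an
    # instance of this class): the return value maps each child LFN to the set of
    # parent LFNs sharing at least one run/lumi with it.
    #
    #     parents = reader.getParentFilesGivenParentDataset('/Prim/Proc-v1/AOD',
    #                                                       ['/store/mc/child_1.root'])
    #     for childLFN, parentLFNs in parents.items():
    #         print(childLFN, sorted(parentLFNs))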

    def getParentFilesByLumi(self, childLFN):
        """
        Get the parent files' LFNs by lumi (this might not be the actual parentage
        relation stored in DBS, just parentage by lumis).
        Use it only on specific LFNs for validation purposes; for the parentage fix,
        use findAndInsertMissingParentage.
        :param childLFN: LFN of a file in the child dataset
        :return: list of dictionaries with the parent dataset and its parent files
        for the given child LFN, e.g.
        [{"ParentDataset": "/abc/bad/ddd", "ParentFiles": [alf, baf, ...]}]
        """
        childDatasets = self.dbs.listDatasets(logical_file_name=childLFN)
        result = []
        for i in childDatasets:
            parents = self.dbs.listDatasetParents(dataset=i["dataset"])
            for parent in parents:
                parentFiles = self.getParentFilesGivenParentDataset(
                    parent['parent_dataset'], childLFN)
                result.append({
                    "ParentDataset": parent['parent_dataset'],
                    "ParentFiles": list(parentFiles)
                })
        return result
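
    # Commented-out sketch (the LFN is hypothetical; `reader` stands for an instance
    # of this class): each entry pairs a parent dataset with the parent LFNs matched
    # purely by run/lumi overlap, as described above.
    #
    #     for entry in reader.getParentFilesByLumi('/store/mc/child_1.root'):
    #         print(entry['ParentDataset'], len(entry['ParentFiles']))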

    def insertFileParents(self, childBlockName, childParentsIDPairs):
        """
        :param childBlockName: child block name
        :param childParentsIDPairs: list of [child_id, parent_id] file id pairs, i.e. [[1, 2], [3, 4], ...]
                DBS validates the child ids against the childBlockName
        :return: None
        """
        return self.dbs.insertFileParents({
            "block_name": childBlockName,
            "child_parent_id_list": childParentsIDPairs
        })
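
    # Commented-out sketch of the payload built above (block name and file ids are
    # hypothetical; `reader` stands for an instance of this class):
    #
    #     reader.insertFileParents('/Prim/Proc-v1/TIER#some-block-uuid',
    #                              [[101, 11], [101, 12], [102, 13]])
    #
    # which results in a single DBS call with
    #     {"block_name": "...", "child_parent_id_list": [[101, 11], [101, 12], [102, 13]]}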

    def findAndInsertMissingParentage(self,
                                      childBlockName,
                                      parentData,
                                      insertFlag=True):
        """
        :param childBlockName: child block name
        :param parentData: a dictionary with complete parent dataset file/run/lumi information
        :param insertFlag: boolean to allow parentage insertion into DBS or not
        :return: number of file parents pair inserted
        """
        # childBlockData format: {'fileid': [[run_num1, lumi1], [run_num1, lumi2], ...]}
        # e.g. {'554307997': [[1, 557179], [1, 557178], [1, 557181], ...]}
        childBlockData = self.dbs.listBlockTrio(block_name=childBlockName)

        # runs the actual mapping logic, like {"child_id": ["parent_id", "parent_id2", ...], etc
        mapChildParent = {}
        # there should be only 1 item, but we better be safe
        for item in childBlockData:
            for childFileID in item:
                for runLumiPair in item[childFileID]:
                    frozenKey = frozenset(runLumiPair)
                    parentId = parentData.get(frozenKey)
                    if parentId is None:
                        msg = "Child file id: %s, with run/lumi: %s, has no match in the parent dataset"
                        self.logger.warning(msg, childFileID, frozenKey)
                        continue
                    mapChildParent.setdefault(childFileID, set())
                    mapChildParent[childFileID].add(parentId)

        if insertFlag and mapChildParent:
            # convert dictionary to list of unique childID, parentID tuples
            listChildParent = []
            for childID in mapChildParent:
                for parentID in mapChildParent[childID]:
                    listChildParent.append([int(childID), int(parentID)])
            self.dbs.insertFileParents({
                "block_name": childBlockName,
                "child_parent_id_list": listChildParent
            })
        return len(mapChildParent)
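
    # A small, commented-out worked example of the mapping above (the child id and
    # run/lumi values reuse the sample from the comments; the parent id is
    # hypothetical). With
    #     childBlockData -> [{'554307997': [[1, 557179], [1, 557178]]}]
    #     parentData     -> {frozenset({1, 557179}): '554300001',
    #                        frozenset({1, 557178}): '554300001'}
    # the method builds mapChildParent = {'554307997': {'554300001'}} and, when
    # insertFlag is True, inserts the single pair [554307997, 554300001].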

    def listBlocksWithNoParents(self, childDataset):
        """
        :param childDataset: child dataset to inspect
        :return: set of child blocks with no parent block
        """
        allBlocks = self.dbs.listBlocks(dataset=childDataset)
        blockNames = []
        for block in allBlocks:
            blockNames.append(block['block_name'])
        parentBlocks = self.dbs.listBlockParents(block_name=blockNames)

        cblock = set()
        for pblock in parentBlocks:
            cblock.add(pblock['this_block_name'])

        noParentBlocks = set(blockNames) - cblock
        return noParentBlocks

    def listFilesWithNoParents(self, childBlockName):
        """
        :param childBlockName: name of the child block to inspect
        :return: list of LFNs in the block that have no parent files
        """
        allFiles = self.dbs.listFiles(block_name=childBlockName)
        parentFiles = self.dbs.listFileParents(block_name=childBlockName)

        allFileNames = set()
        for fInfo in allFiles:
            allFileNames.add(fInfo['logical_file_name'])

        cfile = set()
        for pFile in parentFiles:
            cfile.add(pFile['logical_file_name'])

        noParentFiles = allFileNames - cfile
        return list(noParentFiles)

    def fixMissingParentageDatasets(self, childDataset, insertFlag=True):
        """
        :param childDataset: child dataset whose parentage needs to be fixed
        :return: list of blocks for which the parentage insertion failed, for retry
        """
        pDatasets = self.listDatasetParents(childDataset)
        self.logger.info("Parent datasets for %s are: %s", childDataset,
                         pDatasets)
        # print("parent datasets %s\n" % pDatasets)
        # pDatasets format is
        # [{'this_dataset': '/SingleMuon/Run2016D-03Feb2017-v1/MINIAOD', 'parent_dataset_id': 13265209, 'parent_dataset': '/SingleMuon/Run2016D-23Sep2016-v1/AOD'}]
        if not pDatasets:
            self.logger.warning("No parent dataset found for child dataset %s",
                                childDataset)
            return []

        parentFullInfo = self.getParentDatasetTrio(childDataset)
        blocks = self.listBlocksWithNoParents(childDataset)
        failedBlocks = []
        self.logger.info("Found %d blocks without parentage information",
                         len(blocks))
        for blockName in blocks:
            try:
                self.logger.info("Fixing parentage for block: %s", blockName)
                numFiles = self.findAndInsertMissingParentage(
                    blockName, parentFullInfo, insertFlag=insertFlag)
                self.logger.debug("%s file parentage added for block %s",
                                  numFiles, blockName)
            except Exception as ex:
                self.logger.exception("Parentage updated failed for block %s",
                                      blockName)
                failedBlocks.append(blockName)

        return failedBlocks
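
    # Commented-out sketch of the overall fix flow driven by this method (`reader`
    # stands for an instance of this class; the dataset name reuses the sample from
    # the comment above). A dry run can be done first with insertFlag=False:
    #
    #     failed = reader.fixMissingParentageDatasets('/SingleMuon/Run2016D-03Feb2017-v1/MINIAOD',
    #                                                 insertFlag=False)
    #     if failed:
    #         print("blocks to retry:", failed)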

    def getParentDatasetTrio(self, childDataset):
        """
        Provided a dataset name, return all the parent dataset information, such as:
          - file ids, run number and lumi section
        NOTE: This API is meant to be used by the StepChainParentage thread only!!!
        :param childDataset: name of the child dataset
        :return: a dictionary keyed by a frozenset of (run, lumi), whose value is the parent file id
        """
        # this will return data in the format of:
        # {'554307997': [[1, 557179], [1, 557178], ...]}
        # such that the key is the file id and each inner list is [run_number, lumi_section_number].
        parentFullInfo = self.dbs.listParentDSTrio(dataset=childDataset)

        # runs the actual mapping logic, like {"child_id": ["parent_id", "parent_id2", ...], etc
        parentFrozenData = {}
        for item in parentFullInfo:
            for fileId in item:
                for runLumiPair in item[fileId]:
                    frozenKey = frozenset(runLumiPair)
                    parentFrozenData[frozenKey] = fileId
        return parentFrozenData
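
    # Commented-out sketch of the data this helper hands to findAndInsertMissingParentage
    # (the dataset name is hypothetical; the ids, runs and lumis reuse the sample values
    # from the comments above; `reader` stands for an instance of this class):
    #
    #     parentFrozenData = reader.getParentDatasetTrio('/Prim/Proc-v1/MINIAOD')
    #     # e.g. {frozenset({1, 557179}): '554307997', frozenset({1, 557178}): '554307997', ...}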
Exemple #55
0
import argparse
import os

from dbs.apis.dbsClient import DbsApi


def main():
    # get a valid file name from the command line arguments

    parser = argparse.ArgumentParser()
    parser.add_argument('--file',
                        help='file containing the dump of the block',
                        default=None,
                        required=True)
    args = parser.parse_args()
    fileName = args.file
    #fileName = 'failed-block-at-1611258668.34.txt' # just an example

    failedBlocksDir = '/data/srv/Publisher_files/FailedBlocks/'
    filePath = failedBlocksDir + fileName
    if not os.path.isfile(filePath):
        print("File %s not found in %s" % (fileName, failedBlocksDir))
        return

    # initialize DBS access
    # if X509 vars are not defined, use default Publisher location
    userProxy = os.getenv('X509_USER_PROXY')
    if userProxy:
        os.environ['X509_USER_CERT'] = userProxy
        os.environ['X509_USER_KEY'] = userProxy
    if not os.getenv('X509_USER_CERT'):
        os.environ['X509_USER_CERT'] = '/data/certs/servicecert.pem'
    if not os.getenv('X509_USER_KEY'):
        os.environ['X509_USER_KEY'] = '/data/certs/servicekey.pem'
    #migUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSMigrate'
    phy3Url = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader'
    #globUrl = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'
    destUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter'
    #apiG = DbsApi(url=globUrl)
    apiP3 = DbsApi(url=phy3Url)
    #apiMig = DbsApi(url=migUrl)
    apiDest = DbsApi(url=destUrl)

    with open(filePath) as fp:
        blockData = fp.read()
    # from pprint.pprint format to a dictionary (slow, unsafe, but handy)
    block = eval(blockData)  # pylint: disable=eval-used

    targetDataset = block['dataset']['dataset']

    print('Block is meant to be added to dataset\n%s' % targetDataset)

    # look for files already present in DBS phys03
    alreadyPresentFile = False
    lfns = [f['logical_file_name'] for f in block['files']]
    print('Block contains %d files' % len(lfns))
    numPresent = 0
    sameDSet = 0
    otherDSet = 0
    otherDSlist = set()
    for lfn in lfns:
        ret = apiP3.listFiles(logical_file_name=lfn)
        if ret:
            alreadyPresentFile = True
            numPresent += 1
            if numPresent < 5:
                print('file %s found in DBS' % lfn)
            if numPresent == 5:
                print('more files found ...')
            #details = apiP3.listFiles(logical_file_name=lfn, detail=True)
            #print(details)
            lfnDSet = apiP3.listDatasets(logical_file_name=lfn)[0]['dataset']
            if lfnDSet == targetDataset:
                sameDSet += 1
                if sameDSet < 5:
                    print('this lfn is already in target dataset')
            else:
                otherDSet += 1
                if otherDSet < 5:
                    print('this lfn belongs to another dataset:\n%s' % lfnDSet)
                otherDSlist.add(lfnDSet)

            #lfnBlock = apiP3.listBlocks(logical_file_name=lfn)
            #print('in block:\n%s' % lfnBlock[0]['block_name'])

    if alreadyPresentFile:
        print(
            '%d/%d file(s) from the input block are already in DBS/phys03. Publication will fail'
            % (numPresent, len(lfns)))
        print('files already present in target dataset: %d' % sameDSet)
        print('files present in DBS in another dataset: %d' % otherDSet)
        if otherDSet:
            print('other datasets containing files from this block:\n%s' %
                  otherDSlist)
        return

    print(
        'No obvious reason for the publication failure was found, trying to insert again')
    try:
        apiDest.insertBulkBlock(block)
    except Exception as ex:
        print("Publication failed with exception:\n%s" % str(ex))
        return
    print("Block publication done OK")

    return
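
# A minimal entry point, assumed here because the listing stops at main();
# the script name in the sample invocation below is hypothetical:
#
#   python repairFailedBlock.py --file failed-block-at-1611258668.34.txt
if __name__ == '__main__':
    main()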