Example #1
def getDatasetSize(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the aggregated block summary for the whole dataset
    reply = dbsapi.listBlockSummaries(dataset=dataset)
    return reply[0]['file_size']
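
All of these snippets assume the DBS3 Python client is installed and that DbsApi and a module-level dbs3_url are already defined. A minimal setup sketch for calling the function above; the dataset name is only a placeholder:

from dbs.apis.dbsClient import DbsApi

# DBS3 reader instance assumed by the examples on this page
dbs3_url = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'

# placeholder dataset name; replace with a real /primary/processed/TIER path
size_bytes = getDatasetSize('/SomePrimary/SomeProcessed-v1/AODSIM')
print size_bytes / 1e9, 'GB'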
Example #2
def dbs3_get_data(dataset, timestamps=1):

    #q = "/afs/cern.ch/user/s/spinoso/public/dbs3wrapper.sh /afs/cern.ch/user/c/cmst2/mc/scripts/datasetinfo.py --dataset %s --json" % dataset
    #output=os.popen(q).read()
    #s = json.loads(output)
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    try:
        reply = dbsapi.listDatasets(dataset=dataset,
                                    dataset_access_type='*',
                                    detail=True)
        #print reply
        if len(reply):
            status = reply[0]['dataset_access_type']
            reply = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
            cnt = 0
            for block in reply:
                cnt += int(block['num_event'])
            return [cnt, status, int(cnt / 100.)]
        else:
            print dataset, "not exsiting"
            return [0, '', 0]

    except:
        print "crash dbs3"
        return [0, '', 0]
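
A short usage sketch for dbs3_get_data above: it returns a three-element list of [event count, dataset status, event count divided by 100]. The dataset name is a placeholder, and DbsApi and dbs3_url are assumed to be set up as in the sketch after Example #1.

# placeholder dataset name
events, status, events_over_100 = dbs3_get_data('/SomePrimary/SomeProcessed-v1/MINIAODSIM')
print status, events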
Example #3
def main():
#  args=sys.argv[1:]
#  data=args[0]

  sample_group = 'signal' # signal, background, data, all
  sample_list = get_sample_list(sample_group)
  sample_list.sort()

  url="https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
  api=DbsApi(url=url)

  for samp in sample_list:
    outputDataSets = ''
    #print('Checking {0}'.format(samp.DAS))
    outputDataSets = api.listDatasets(dataset=samp.DAS, detail = True, dataset_access_type='VALID')
 
    if outputDataSets:
      for ds in outputDataSets:
       #print('{0}'.format(ds['dataset']))
       #print('{0}'.format(ds['primary_ds_name']))
       #print('{0}'.format(ds['xtcrosssection']))
       nevents = api.listBlockSummaries(dataset=ds['dataset'])
       #print(nevents[0]['num_event'])    
       # this to create a table for the paper with dataset name and number of events 
       print('verb@ {0} @ & {1:.2e} & XX \\\\ '.format(ds['primary_ds_name'],nevents[0]['num_event'])) 
  sys.exit(0);
def getDatasetSize(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the aggregated block summary for the whole dataset
    reply = dbsapi.listBlockSummaries(dataset=dataset)
    return reply[0]['file_size']
Example #5
def getFileCount(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve per-block summaries and sum the file counts
    reply = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
    cnt = 0
    for block in reply:
        cnt += int(block['num_file'])
    return cnt
def getEventCountDataSet(dataset):
    """
    Returns the number of events in a dataset using DBS3

    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlockSummaries(dataset=dataset)
    return reply[0]['num_event']
def getEventCountDataSetBlockList(dataset, blockList):
    """
    Counts and adds up all the events for a given list of
    blocks inside a dataset
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # aggregated summary over the whole block list
    reply = dbsapi.listBlockSummaries(block_name=blockList)
    return reply[0]['num_event']
def getEventCountBlock(block):
    """
    Returns the number of events in a dataset using DBS3

    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlockSummaries(block_name=block)
    return reply[0]["num_event"]
Example #9
def getEventCountBlock(block):
    """
    Returns the number of events in a dataset using DBS3

    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlockSummaries(block_name=block)
    return reply[0]['num_event']
Example #10
def getNumberOfEvents( dataset ):
    # legacy DBS command-line query; its output is not actually used below
    query = 'find sum(block.numevents) where dataset = ' + dataset
    dbs_cmd = [ 'dbs', 'search', '--query', query ]
    dbs_output = subprocess.Popen( dbs_cmd, stdout = subprocess.PIPE ).communicate()[0]

    from dbs.apis.dbsClient import DbsApi

    dbsUrl = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'
    dbsApi = DbsApi( url = dbsUrl )
    datasetBlocks = dbsApi.listBlockSummaries( dataset = dataset )

    numEvents = sum( [ block['num_event'] for block in datasetBlocks ] )

    return numEvents
Example #11
def getEventCountDataSetBlockList(dataset,blockList):
    """
    Counts and adds up all the events for a given list of
    blocks inside a dataset
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)    
    #transform from string to list
    if type(blockList) in (str, unicode):
        blockList = eval(blockList)
    total = 0
    #get the blocks one by one and add them up so the URI won't be too long
    for block in blockList:
        reply = dbsapi.listBlockSummaries(block_name=block)
        total += reply[0]['num_event']
    return total
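
The loop above queries one block per call to keep the request URI short. Since listBlockSummaries also accepts a list of block names (as other examples on this page show), a middle-ground sketch is to query the blocks in small chunks; the helper name and chunk size below are arbitrary:

def getEventCountBlockChunks(blockList, chunk_size=10):
    # sketch: sum num_event over all blocks, chunk_size block names per query
    dbsapi = DbsApi(url=dbs3_url)
    total = 0
    for i in range(0, len(blockList), chunk_size):
        reply = dbsapi.listBlockSummaries(block_name=blockList[i:i + chunk_size])
        total += reply[0]['num_event']
    return total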
Example #12
def getEventCountDataSetBlockList(dataset,blockList):
    """
    Counts and adds up all the events for a given list of
    blocks inside a dataset
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)    
    #transform from string to list
    if type(blockList) in (str, unicode):
        blockList = eval(blockList)
    total = 0
    #get the blocks one by one and add them up so the URI won't be too long
    for block in blockList:
        reply = dbsapi.listBlockSummaries(block_name=block)
        total += reply[0]['num_event']
    return total
Example #13
def getDatasetChops(dataset, chop_threshold=1000., talk=False):
    ## does a *flat* chopping of the input into chunks of size less than chop_threshold
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    blocks = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
    sum_all = 0

    ## put everything in terms of GB
    for block in blocks:
        block['file_size'] /= 1000000000.

    for block in blocks:
        sum_all += block['file_size']

    items = []
    if sum_all > chop_threshold:
        items.extend([[block['block_name']] for block in filter(
            lambda b: b['file_size'] > chop_threshold, blocks)])
        small_block = filter(lambda b: b['file_size'] <= chop_threshold,
                             blocks)
        small_block.sort(lambda b1, b2: cmp(b1['file_size'], b2['file_size']),
                         reverse=True)

        while len(small_block):
            first, small_block = small_block[0], small_block[1:]
            items.append([first['block_name']])
            size_chunk = first['file_size']
            while size_chunk < chop_threshold and small_block:
                last, small_block = small_block[-1], small_block[:-1]
                size_chunk += last['file_size']
                items[-1].append(last['block_name'])

            if talk:
                print len(items[-1]), "items below threshold", size_chunk
                print items[-1]
    else:
        if talk:
            print "one big", sum_all
        items = [[dataset]]
    if talk:
        print items
    ## a list of list of blocks or dataset
    print "Choped", dataset, "of size", sum_all, "GB (", chop_threshold, "GB) in", len(
        items), "pieces"
    return items
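
A usage sketch for the chopping helper above; the dataset name is a placeholder and the 500 GB threshold is arbitrary:

# placeholder dataset name, arbitrary 500 GB threshold
chunks = getDatasetChops('/SomePrimary/SomeProcessed-v1/AODSIM', chop_threshold=500.)
for i, chunk in enumerate(chunks):
    # each chunk is a list of block names (or the dataset itself if it is small)
    print i, len(chunk)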
Example #14
def getDatasetPresence( url, dataset, complete='y', only_blocks=None, group=None, vetoes=None):
    if vetoes==None:
        vetoes = ['MSS','Buffer','Export']
    #print "presence of",dataset
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    all_blocks = dbsapi.listBlockSummaries( dataset = dataset, detail=True)
    all_block_names=set([block['block_name'] for block in all_blocks])
    if only_blocks:
        all_block_names = filter( lambda b : b in only_blocks, all_block_names)
        full_size = sum([block['file_size'] for block in all_blocks if (block['block_name'] in only_blocks)])
        #print all_block_names
        #print [block['block_name'] for block in all_blocks if block['block_name'] in only_blocks]
    else:
        full_size = sum([block['file_size'] for block in all_blocks])
    if not full_size:
        print dataset,"is nowhere"
        return {}
    #print full_size
    conn  =  httplib.HTTPSConnection(url, cert_file = os.getenv('X509_USER_PROXY'), key_file = os.getenv('X509_USER_PROXY'))
    r1=conn.request("GET",'/phedex/datasvc/json/prod/blockreplicas?dataset=%s'%(dataset))
    r2=conn.getresponse()
    result = json.loads(r2.read())
    items=result['phedex']['block']


    locations=defaultdict(set)
    for item in items:
        for replica in item['replica']:
            if not any(replica['node'].endswith(v) for v in vetoes):
                if replica['group'] == None: replica['group']=""
                if complete and not replica['complete']==complete: continue
                #if group!=None and replica['group']==None: continue
                if group!=None and not replica['group'].lower()==group.lower(): continue 
                locations[replica['node']].add( item['name'] )

    presence={}
    for (site,blocks) in locations.items():
        site_size = sum([ block['file_size'] for block in all_blocks if (block['block_name'] in blocks and block['block_name'] in all_block_names)])
        #print site,blocks,all_block_names
        #presence[site] = (set(blocks).issubset(set(all_block_names)), site_size/float(full_size)*100.)
        presence[site] = (set(all_block_names).issubset(set(blocks)), site_size/float(full_size)*100.)
    #print json.dumps( presence , indent=2)
    return presence
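
In getDatasetPresence above, url is the host serving the PhEDEx data service, not a DBS endpoint. A hedged usage sketch, assuming a valid grid proxy is pointed to by X509_USER_PROXY and using a placeholder dataset name:

# 'cmsweb.cern.ch' hosts the /phedex/datasvc API queried inside the function
presence = getDatasetPresence('cmsweb.cern.ch', '/SomePrimary/SomeProcessed-v1/AODSIM')
for site, (complete, fraction) in presence.items():
    print site, complete, "%.1f%%" % fraction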
Example #15
def getEventCountDataSet(dataset, skipInvalid=False):
    """
    Returns the number of events in a dataset using DBS3
    If skipInvalid=True, it will count only valid files.
    This is slower (especially on larger datasets)
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary - faster
    if not skipInvalid:
        reply = dbsapi.listBlockSummaries(dataset=dataset)
        if not reply:
            return 0
        return reply[0]['num_event']
    #discard invalid files (only count valid ones) - slower
    else:
        # retrieve file list
        reply = dbsapi.listFiles(dataset=dataset, detail=True)
        #sum only valid
        total = sum(f['event_count'] for f in reply if f['is_file_valid']==1)
        return total
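
A short sketch contrasting the two paths of getEventCountDataSet above, with a placeholder dataset name: the fast path counts events in all files, the slow path only in files flagged as valid.

# placeholder dataset name
all_events = getEventCountDataSet('/SomePrimary/SomeProcessed-v1/AODSIM')
valid_events = getEventCountDataSet('/SomePrimary/SomeProcessed-v1/AODSIM', skipInvalid=True)
print all_events, valid_events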
Example #16
def getEventCountDataSet(dataset, skipInvalid=False):
    """
    Returns the number of events in a dataset using DBS3
    If skipInvalid=True, it will count only valid files.
    This is slower (especially on larger datasets)
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary - faster
    if not skipInvalid:
        reply = dbsapi.listBlockSummaries(dataset=dataset)
        if not reply:
            return 0
        return reply[0]['num_event']
    #discard invalid files (only count valid ones) - slower
    else:
        # retrieve file list
        reply = dbsapi.listFiles(dataset=dataset, detail=True)
        #sum only valid
        total = sum(f['event_count'] for f in reply if f['is_file_valid']==1)
        return total
Example #17
def getDatasetChops(dataset, chop_threshold =1000., talk=False):
    ## does a *flat* chopping of the input into chunks of size less than chop_threshold
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    blocks = dbsapi.listBlockSummaries( dataset = dataset, detail=True)
    sum_all = 0

    ## put everything in terms of GB
    for block in blocks:
        block['file_size'] /= 1000000000.

    for block in blocks:
        sum_all += block['file_size']

    items=[]
    if sum_all > chop_threshold:
        items.extend( [[block['block_name']] for block in filter(lambda b : b['file_size'] > chop_threshold, blocks)] )
        small_block = filter(lambda b : b['file_size'] <= chop_threshold, blocks)
        small_block.sort( lambda b1,b2 : cmp(b1['file_size'],b2['file_size']), reverse=True)

        while len(small_block):
            first,small_block = small_block[0],small_block[1:]
            items.append([first['block_name']])
            size_chunk = first['file_size']
            while size_chunk < chop_threshold and small_block:
                last,small_block = small_block[-1], small_block[:-1]
                size_chunk += last['file_size']
                items[-1].append( last['block_name'] )
                
            if talk:
                print len(items[-1]),"items below thresholds",size_chunk
                print items[-1]
    else:
        if talk:
            print "one big",sum_all
        items = [[dataset]] 
    if talk:
        print items
    ## a list of list of blocks or dataset
    print "Choped",dataset,"of size",sum_all,"GB (",chop_threshold,"GB) in",len(items),"pieces"
    return items
Example #18
def dbs3_get_data(dataset,timestamps=1):
    
    #q = "/afs/cern.ch/user/s/spinoso/public/dbs3wrapper.sh /afs/cern.ch/user/c/cmst2/mc/scripts/datasetinfo.py --dataset %s --json" % dataset
    #output=os.popen(q).read()
    #s = json.loads(output)
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    try:
        reply = dbsapi.listDatasets(dataset=dataset,dataset_access_type='*',detail=True)
        #print reply
        if len(reply):
            status=reply[0]['dataset_access_type']
            reply = dbsapi.listBlockSummaries(dataset=dataset,detail=True)
            cnt=0
            for block in reply:
                cnt += int(block['num_event'])
            return [cnt,status,int(cnt/100.)]
        else:
            print dataset,"not exsiting"
            return [0,'',0]

    except:
        print "crash dbs3"
        return [0,'',0]            
if read == None:
	for category in datatiers.keys():
		if category not in dbs_query_results.keys(): dbs_query_results[category] = {}
		for datatier in datatiers[category]:
			if datatier not in dbs_query_results[category].keys(): dbs_query_results[category][datatier] = {}
			blocks = api3.listBlocks(data_tier_name=datatier,min_cdate=startdate.strftime("%s"),max_cdate=enddate.strftime("%s"))
			for block in blocks:
				exclude = False
				for exclusion_string in exclusion_strings[category]:
					if exclusion_string.lower() in block['block_name'].lower():
						if verbose == True: print 'blockname was rejected:',block['block_name']
						exclude = True
						continue
				if exclude == True: continue
				if verbose == True: print 'Querying for the summary for block:',block['block_name'],'!'
				properties = api3.listBlockSummaries(block_name=block['block_name'])
				dbs_query_results[category][datatier][block['block_name']] = properties
	
	if persist != None:
		outputfile = open(persist,'w')
		json.dump(dbs_query_results,outputfile)
		outputfile.close()
else:
	dbs_query_results = json.load(open(read))

for category in datatiers.keys():
	if category not in results.keys(): results[category] = {}
	for datatier in datatiers[category]:
		if datatier not in results[category].keys(): results[category][datatier] = {}
		for blockname in dbs_query_results[category][datatier]:
			triggered_separation = False
Example #20
def main():

	usage="%prog <options>"

	parser = OptionParser(usage=usage)
	parser.add_option("-u", "--url", dest="url", help="DBS Instance url. default is https://cmsweb.cern.ch/dbs/prod/global/DBSReader", metavar="<url>")
	parser.add_option("-l", "--length", dest="length", help="Number of days for calculate the accumated events. It is Optional, default is 30 days.", metavar="<length>")
	parser.add_option("-d", "--dataset", dest="dataset", help="The dataset name for cacluate the events. Can be optional if datatier is used.", metavar="<dataset>")
	parser.add_option("-t", "--datatier", dest="datatier", help="The datatier name for cacluate the events. Can be optional if dataset is used. In this version datatier is not supported yet.", metavar="<data_tier_name>")
	parser.add_option("-a", "--access_type", dest="ds_access_type", help="Dataset access types: VALID, PRODUCTION or ALL(VALID+PRODUCTION). Default is ALL", metavar="<dataset_access_type>")
	parser.set_defaults(url="https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
	parser.set_defaults(length=30)
	parser.set_defaults(ds_access_type="ALL")

	(opts, args) = parser.parse_args()
	if not (opts.dataset or opts.datatier):
		parser.print_help()
		parser.error('either --dataset or --datatier is required')

	dataset	 = opts.dataset
	#seconds per day    
	sdays = 86400
	lenth = int(opts.length)
	now = time.time()
	#now = 1391353032
	then = now - sdays*lenth
	url = opts.url
	api=DbsApi(url=url)
	outputDataSets = []
    
	f = [0 for x in range(lenth)]
	min_cdate = int(then)
	max_cdate = int(now)
	if (opts.ds_access_type == "ALL"):
		outputDataSetsValid = api.listDatasets(dataset=dataset, min_cdate=min_cdate-30*sdays, 
                          max_cdate=max_cdate, dataset_access_type="VALID")
		outputDataSetsProd = api.listDatasets(dataset=dataset, min_cdate=min_cdate-30*sdays,
                          max_cdate=max_cdate, dataset_access_type="PRODUCTION")
		outputDataSets = outputDataSetsValid + outputDataSetsProd
	elif (opts.ds_access_type == "VALID"):
		outputDataSets = api.listDatasets(dataset=dataset, min_cdate=min_cdate-30*sdays,
                          max_cdate=max_cdate, dataset_access_type="VALID")
	elif (opts.ds_access_type == "PRODUCTION"):
		outputDataSets = api.listDatasets(dataset=dataset, min_cdate=min_cdate-30*sdays,
                          max_cdate=max_cdate, dataset_access_type="PRODUCTION")
	for dataset in outputDataSets:
		outputBlocks = api.listBlocks(dataset=dataset["dataset"], detail=1, min_cdate=min_cdate, max_cdate=max_cdate)
		blockList = []
		blockCdate = {}
		for block in outputBlocks:
			blockList.append(block["block_name"])
			blockCdate[block["block_name"]] = block["creation_date"]
		blockSum = []
		if blockList: 
			blockSum = api.listBlockSummaries(block_name=blockList, detail=1)
		for b in blockSum:
			cdate= blockCdate[b["block_name"]]
			day = int((now-cdate)/sdays)
			f[day] = f[day] + b["num_event"] 
	for i in range(lenth):
		#print (lenth-1)-i, ":  ", f[i], "  ", sum(item['all'] for item in f[i:lenth]) 
		print i, ": ", f[(lenth-1)-i], " ", sum(item for item in f[(lenth-1)-i:lenth])
	sys.exit(0);
# size of provided dataset
#-------------------------

# instantiate an API
dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')

# first test whether dataset is valid
dbsList = dbsapi.listDatasets(dataset = dataset, dataset_access_type = 'VALID')
datasetInvalid = False
if dbsList == []:
    datasetInvalid = True
    print ' Dataset does not exist or is invalid. Exit now!\n'
    sys.exit(1)

# determine size and number of files
size = str(sum([block['file_size'] for block in dbsapi.listBlockSummaries(dataset = dataset)]))+'UB'
sizeGb = convertSizeToGb(size)

# in case this is an open subscription we need to adjust sizeGb to the expected size
if expectedSizeGb > 0:
    sizeGb = expectedSizeGb

print ' SIZE:    %.1f GB'%(sizeGb)

# prepare subscription list
datasets = []
datasets.append(dataset)


# first make sure this dataset is not owned by DataOps group anymore at the Tier-1 site(s)
#-----------------------------------------------------------------------------------------
Example #22
def getDatasetSize(dataset):
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    blocks = dbsapi.listBlockSummaries( dataset = dataset, detail=True)
    ## put everything in terms of GB
    return sum([block['file_size'] / (1024.**3) for block in blocks])
Example #23
def QueryForRquestedEventsPerDay(dbsurl,couchurl,outputdict,data_regexp):
    #
    # query couch DB and extract list of requests per day

    # these status values are for rejected workflows
    rejected_status = ['rejected','rejected-archived']

    basenames_to_print = ['SUS-Spring14miniaod-00017_00029_v0_']

    # load requests from json
    header = {'Content-type': 'application/json', 'Accept': 'application/json'}
    conn = httplib.HTTPConnection(couchurl)
    conn.request("GET", '/latency_analytics/_design/latency/_view/maria', headers= header)
    response = conn.getresponse()
    data = response.read()
    conn.close()
    myString = data.decode('utf-8')
    workflows = json.loads(myString)['rows']
    
    # first extract workflows per workflow basename to identify the actual requests in case of clones etc.
    basenames = {}
    for entry in workflows:
        # extract information
        workflowname = entry['id']
        info = entry['value']
        workflow_dict = {
                          'Campaign' : info[0],
                          'Tier' : info[1],
                          'Task type' : info[2],
                          'Status' : info[3],
                          'Priority' : info[4],
                          'Requested events' : info[5],
                          '% Complete' : info[6],
                          'Completed events' : 0,
                          'Request date' : time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(info[7])),
                          'Processing dataset name' : '',
                          'Input Dataset' : info[8],
                          'Output Datasets' : info[9],
                          'Filter efficiency' : info[10],
                          'Run white list' : info[11],
                          }
        if workflowname == 'pdmvserv_SUS-Spring14miniaod-00016_00029_v0__140728_120018_4477':
            print workflowname,workflow_dict

        # filter for data_regexp
        match = False
        try:
          for output_dataset in workflow_dict['Output Datasets']:
              if re.compile(data_regexp).match(output_dataset) is not None:
                  match = True
                  break
        except:
          for output_dataset in workflow_dict['Output Datasets']:
              if re.compile(data_regexp).match(output_dataset[0]) is not None:
                  match = True
                  break

        if match == False: continue

        # extract workflow basename, split by '_', remove first field that is the username who injected the workflow, and the last 3 fields that are date, time and fractions of a second (?)
        workflowname_array = workflowname.split('_')
        basename_array = workflowname_array[1:-3]

        # continue if basename_array length == 0
        if len(basename_array) == 0: continue

        # filter out ACDC and tests
        if workflowname.lower().count('acdc') > 0: continue
        if workflowname.lower().count('test') > 0: continue

        # Jen's username is jen_a, so after the split above an 'a' field can remain at the front; remove it
        if basename_array[0].lower() == 'a':
            basename_array = basename_array[1:]

        # if extension, remove EXT from beginning of basename
        if basename_array[0].lower() == 'ext':
            basename_array = basename_array[1:]

        basename = '_'.join(basename_array)
        requestdatetime = int(workflowname_array[-1]) + int(workflowname_array[-2]) * 1E4 + int(workflowname_array[-3]) * 1E10
        if basename not in basenames.keys(): basenames[basename] = {}
        basenames[basename][requestdatetime] = [workflowname,workflow_dict]
    
    # select the original workflow removing clones, etc
    selected = {}
    rejected = {}
    for basename in basenames.keys():
        if basename in basenames_to_print:
            print 'selected basename:',basename
            for date in sorted(basenames[basename].keys()):
                print basenames[basename][date]

        if basename in selected.keys() or basename in rejected.keys(): continue

        # look at all the workflow names of a basename ordered by injection time

        # if the first workflow name of a basename ordered by injection time is not a rejected status, select it
        if basenames[basename][sorted(basenames[basename].keys())[0]][1]['Status'] not in rejected_status:
            selected[basename] = basenames[basename][sorted(basenames[basename].keys())[0]]
        else :
            # if the last workflow is not in rejected status (indication that the workflow never started to run), choose the first workflow as reference
            if basenames[basename][sorted(basenames[basename].keys())[-1]][1]['Status'] not in rejected_status:
                selected[basename] = basenames[basename][sorted(basenames[basename].keys())[0]]
            else :
                # if there is only one workflow for the basename and if the status is rejected
                if len(basenames[basename]) ==  1 and basenames[basename][basenames[basename].keys()[0]][1]['Status'] in rejected_status:
                    rejected[basename] = basenames[basename][basenames[basename].keys()[0]]
                else :
                    # go through workflowname per basename ordered by status, select the first status that is not a rejected status
                    firstvalidentry = None
                    for entry in sorted(basenames[basename].keys()):
                        if basenames[basename][entry][1]['Status'] not in rejected_status:
                            firstvalidentry = entry
                            break
                    if firstvalidentry != None:
                        selected[basename] = basenames[basename][firstvalidentry]
                    else:
                        # check if there are only workflownames per basename that are in a rejected status
                        nonrejectedstatus = False
                        for entry in basenames[basename].keys():
                            if basenames[basename][entry][1]['Status'] not in rejected_status:
                                nonrejectedstatus = True
                                break
                        if nonrejectedstatus == False :
                            # select last one
                            rejected[basename] = basenames[basename][sorted(basenames[basename].keys())[-1]]
                            
        if basename in selected.keys() or basename in rejected.keys(): continue
        print 'could not decide which workflow is the original workflow for basename:',basename
        for date in sorted(basenames[basename].keys()):
            print basenames[basename][date]
        sys.exit(1)
    
    # loop over selected workflows and fill requested events per day
    # only fill day if defined as key of outputdict
    api=DbsApi(url=dbsurl)
    for basename in selected.keys():
        print 'selected basename:',basename
        for date in sorted(basenames[basename].keys()):
            print basenames[basename][date]
        workflowname = selected[basename][0]
        workflow_dict = selected[basename][1]

        # extract unix time of start of day of request date
        request_date = datetime.datetime.strptime(workflow_dict['Request date'],"%Y-%m-%d %H:%M:%S")
        request_date = request_date.replace(tzinfo=pytz.timezone('UTC'))
        request_day = int(datetime.datetime(request_date.year, request_date.month, request_date.day,0,0,0,0, tzinfo=pytz.timezone('UTC')).strftime("%s"))
        if str(request_day) not in outputdict.keys(): continue
        if 'REQUESTED' not in outputdict[str(request_day)].keys(): outputdict[str(request_day)]['REQUESTED'] = 0
        if 'WORKFLOWS' not in outputdict[str(request_day)].keys(): outputdict[str(request_day)]['WORKFLOWS'] = []
        outputdict[str(request_day)]['WORKFLOWS'].append(workflowname)
        request_events = int(workflow_dict['Requested events'])
        if request_events == 0 and workflow_dict['Input Dataset'] != '':
            blocks = api.listBlocks(dataset=workflow_dict['Input Dataset'], detail=False)
            for block in blocks:
                reply= api.listBlockSummaries(block_name=block['block_name'])
                request_events += reply[0]['num_event']
        if workflow_dict['Filter efficiency'] == None :
            outputdict[str(request_day)]['REQUESTED'] += int(request_events)
        else:
            outputdict[str(request_day)]['REQUESTED'] += int(request_events) * float(workflow_dict['Filter efficiency'])
Example #24
def getDatasetSize(dataset):
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    blocks = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
    ## put everything in terms of GB
    return sum([block['file_size'] / (1024.**3) for block in blocks])
if options.era:
     datasets = api.listDatasets(acquisition_era_name=options.era, detail=True)

nDatasetsToCheck = 0
for ds in datasets:
     if datatiers and not ds['data_tier_name'] in datatiers:
          continue
     nDatasetsToCheck += 1
print >>log, "Number of datasets to check: %d" % nDatasetsToCheck
print "Number of datasets to check: %d" % nDatasetsToCheck

for ds in datasets:
     if datatiers and not ds['data_tier_name'] in datatiers:
          continue
     print >>log, "\nDatset:",ds['dataset'],
     blocks = api.listBlockSummaries(dataset = ds['dataset'])
     ds_size = blocks[0]['file_size']/pow(2,30)
     print >>log, " \t %0.0f GB" % (ds_size)

     report = get_subscription_information(ds['dataset'])
     if options.ignore and report['firstSubscription']!=None and (time.time()-report['firstSubscription'])<86400*options.ignore:
          print >>log, "Skip the dataset availability check since the first subscription is very recent"
          continue
     if report['nComplete']==0:
          summary["NoCompleteCopyAnywhere"].append(ds['dataset'])  
          if report['nIncomplete']==0:
               summary["Lost"].append(ds['dataset'])
     if report['nAnalysisOpsComplete']==0:
          summary["NoCompleteCopyAnalysisOps"].append(ds['dataset'])  

pprint.pprint(summary)
def getBlockSizeDataSet(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlockSummaries(dataset=dataset)
    return int(reply[0]['file_size'])/1000000000000.0
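
A one-line usage sketch for the helper above, with a placeholder dataset name; note that despite the name it returns the size of the whole dataset, expressed in TB:

# placeholder dataset name
print getBlockSizeDataSet('/SomePrimary/SomeProcessed-v1/AODSIM'), 'TB'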
Example #28
def getDatasetPresence(url,
                       dataset,
                       complete='y',
                       only_blocks=None,
                       group=None,
                       vetoes=None):
    if vetoes == None:
        vetoes = ['MSS', 'Buffer', 'Export']
    #print "presence of",dataset
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    all_blocks = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
    all_block_names = set([block['block_name'] for block in all_blocks])
    if only_blocks:
        all_block_names = filter(lambda b: b in only_blocks, all_block_names)
        full_size = sum([
            block['file_size'] for block in all_blocks
            if (block['block_name'] in only_blocks)
        ])
        #print all_block_names
        #print [block['block_name'] for block in all_blocks if block['block_name'] in only_blocks]
    else:
        full_size = sum([block['file_size'] for block in all_blocks])
    if not full_size:
        print dataset, "is nowhere"
        return {}
    #print full_size
    conn = httplib.HTTPSConnection(url,
                                   cert_file=os.getenv('X509_USER_PROXY'),
                                   key_file=os.getenv('X509_USER_PROXY'))
    r1 = conn.request(
        "GET",
        '/phedex/datasvc/json/prod/blockreplicas?dataset=%s' % (dataset))
    r2 = conn.getresponse()
    result = json.loads(r2.read())
    items = result['phedex']['block']

    locations = defaultdict(set)
    for item in items:
        for replica in item['replica']:
            if not any(replica['node'].endswith(v) for v in vetoes):
                if replica['group'] == None: replica['group'] = ""
                if complete and not replica['complete'] == complete: continue
                #if group!=None and replica['group']==None: continue
                if group != None and not replica['group'].lower() == group.lower():
                    continue
                locations[replica['node']].add(item['name'])

    presence = {}
    for (site, blocks) in locations.items():
        site_size = sum([
            block['file_size'] for block in all_blocks
            if (block['block_name'] in blocks
                and block['block_name'] in all_block_names)
        ])
        #print site,blocks,all_block_names
        #presence[site] = (set(blocks).issubset(set(all_block_names)), site_size/float(full_size)*100.)
        presence[site] = (set(all_block_names).issubset(set(blocks)),
                          site_size / float(full_size) * 100.)
    #print json.dumps( presence , indent=2)
    return presence