def testS3upload(s3tester, archiveName, hashkey, logger):
    cachename = "%s.tgz" % hashkey
    try:
        t1 = time.time()
        timestamp = time.strftime('%y%m%d_%H%M%S', time.gmtime())
        msecs = int((t1 - int(t1)) * 1000)
        timestamp += '.%03d' % msecs
        with tempSetLogLevel(logger=logger, level=1000):  # disable all logging for this call
            uploadToS3(crabserver=s3tester, objecttype='sandbox', filepath=archiveName,
                       tarballname=cachename, logger=logger)
        status = 'OK'
        logger.debug('Successfully uploaded tarball to S3 as well')
    except Exception as e:
        logger.debug('Tarball upload to S3 failed:\n%s', str(e))
        status = 'FAIL'
        reason = str(e)
    t2 = time.time()
    s3report = {'status': status}
    if status == 'FAIL':
        s3report['reason'] = reason
    thisSite = socket.gethostname()
    thisIP = socket.gethostbyname(thisSite)
    tarballKB = os.stat(archiveName).st_size // 1024
    s3report['timestamp'] = timestamp
    s3report['clienthost'] = thisSite
    s3report['clientip'] = thisIP
    s3report['KBytes'] = tarballKB
    s3report['seconds'] = int(t2 - t1)
    return s3report
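# Every snippet in this file quiets chatty WMCore/Rucio calls with
# ServerUtilities.tempSetLogLevel. Its real implementation is not shown here;
# the sketch below is only a plausible minimal version, assuming it saves and
# restores the logger's level around the 'with' body (level=1000, as used in
# testS3upload above, is higher than logging.CRITICAL, so it silences everything).
import contextlib
import logging

@contextlib.contextmanager
def tempSetLogLevelSketch(logger, level):
    """Temporarily set 'logger' to 'level', restoring the previous level on exit."""
    savedLevel = logger.level
    logger.setLevel(level)
    try:
        yield
    finally:
        logger.setLevel(savedLevel)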
def execute(self, *args, **kw):
    """ Main execute """
    self.task = kw['task']
    # Do not check it for HC.
    # ActivitiesToRunEverywhere is used mainly for HC and there is no need to check for it.
    if hasattr(self.config.TaskWorker, 'ActivitiesToRunEverywhere') and \
            self.task['tm_activity'] in self.config.TaskWorker.ActivitiesToRunEverywhere:
        self.logger.info("Will not check possibility to write to destination site because activity %s is in ActivitiesToRunEverywhere",
                         self.task['tm_activity'])
        return
    # If the user disabled both output and log transfers, there is also no need to check.
    if self.task['tm_save_logs'] == 'F' and self.task['tm_transfer_outputs'] == 'F':
        self.logger.info("Will not check possibility to write to destination site because the user specified not to transfer any output/log files.")
        return
    # No need to check for a dry run either.
    if self.task['tm_dry_run'] == 'T':
        self.logger.info("Will not check possibility to write to destination site. User specified the dryrun option.")
        return
    self.workflow = self.task['tm_taskname']
    self.proxy = self.task['user_proxy']
    cpCmd, rmCmd, append = getCheckWriteCommand(self.proxy, self.logger)
    if not cpCmd:
        self.logger.info("CRAB3 is not configured to check write permissions: neither GFAL2 nor LCG commands are installed. Continuing.")
        return
    # strip ':', '-' and '_' from the task name ('-' first so it is taken literally in the class)
    filename = re.sub("[-:_]", "", self.task['tm_taskname']) + '_crab3check.tmp'
    try:
        with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
            pfn = getPFN(self.proxy, self.task['tm_output_lfn'], filename,
                         self.task['tm_asyncdest'], self.logger)
        cpCmd += append + os.path.abspath(filename) + " " + pfn
        rmCmd += " " + pfn
        createDummyFile(filename, self.logger)
        self.logger.info("Executing cp command: %s", cpCmd)
        res = self.checkPermissions(cpCmd)
        if res == 0:
            self.logger.info("Executing rm command: %s", rmCmd)
            self.checkPermissions(rmCmd)
    except IOError as er:
        self.logger.info('IOError %s. CRAB3 backend disk is full. Please report to experts. Task will not be submitted', er)
        raise
    except HTTPException as er:
        self.logger.warning("CRAB3 is not able to get the PFN from PhEDEx. Error: %s", er)
    finally:
        removeDummyFile(filename, self.logger)
    return
def tryProxyLogon(self, proxycfg=None):
    """
    Utility function to allow trying with different myproxy configurations.
    It tries to retrieve a valid proxy from myproxy using the configuration
    passed as argument. See WMCore.Credential.Proxy for configuration details.
    If successful, returns the proxy file name and the list of VOMS groups
    for later addition via voms-proxy-init. If not, raises a TW exception.
    Note that logonRenewMyProxy() does not raise exceptions.
    """
    # WMCore proxy methods are awfully verbose, reduce logging level when using them
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        proxy = Proxy(proxycfg)
        userproxy = proxy.getProxyFilename(serverRenewer=True)  # this only returns a filename
        proxy.logonRenewMyProxy()  # this tries to create the proxy, but does not raise if it fails
        usergroups = set(proxy.getAllUserGroups(userproxy))  # get VOMS groups from the created proxy (if any)
        timeleft = proxy.getTimeLeft(userproxy)  # this is the way to tell if proxy creation succeeded
    errmsg = ''
    if timeleft is None or timeleft <= 0:
        errmsg = "Impossible to retrieve proxy from %s for %s." % (proxycfg['myProxySvr'], proxycfg['userDN'])
    if timeleft < (5 * 24 * 3600):
        errmsg = "Could not get a proxy valid for at least 5 days from %s for %s." % (proxycfg['myProxySvr'], proxycfg['userDN'])
    if errmsg:
        self.logger.error(errmsg)
        self.logger.error("Will try again in verbose mode")
        self.logger.error("===========PROXY ERROR START ==========================")
        with tempSetLogLevel(logger=self.logger, level=logging.DEBUG):
            proxy.logonRenewMyProxy()
        self.logger.error("===========PROXY ERROR END ==========================")
        raise TaskWorkerException(errmsg)
    hoursleft = timeleft / 3600
    minutesleft = (timeleft % 3600) / 60
    self.logger.info('retrieved proxy lifetime in h:m: %d:%d', hoursleft, minutesleft)
    return (userproxy, usergroups)
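# The docstring of tryProxyLogon says it exists so callers can attempt several
# myproxy configurations in turn. A hypothetical usage sketch (the method name
# and the idea of a 'fallback' configuration are illustrative, not from the
# original source; the config dicts would carry the keys shown in execute() below):
def getProxyWithFallback(self, primaryCfg, fallbackCfg):
    """Try the primary myproxy configuration first, then the fallback one."""
    try:
        return self.tryProxyLogon(proxycfg=primaryCfg)
    except TaskWorkerException:
        self.logger.warning("Proxy logon failed with the primary configuration, trying the fallback one")
        return self.tryProxyLogon(proxycfg=fallbackCfg)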
def execute(self, *args, **kwargs):
    result = None
    proxycfg = {'vo': kwargs['task']['tm_user_vo'],
                'logger': self.logger,
                'myProxySvr': self.config.Services.MyProxy,
                'proxyValidity': '144:0',
                'min_time_left': 36000,  ## do we need this ? or should we use self.myproxylen?
                'userDN': kwargs['task']['tm_user_dn'],
                'group': kwargs['task']['tm_user_group'] if kwargs['task']['tm_user_group'] else '',
                'role': kwargs['task']['tm_user_role'] if kwargs['task']['tm_user_role'] else '',
                'server_key': self.config.MyProxy.serverhostkey,
                'server_cert': self.config.MyProxy.serverhostcert,
                'serverDN': self.config.MyProxy.serverdn,
                'uisource': getattr(self.config.MyProxy, 'uisource', ''),
                'credServerPath': self.config.MyProxy.credpath,
                'myproxyAccount': self.server['host'],
                'cleanEnvironment': getattr(self.config.MyProxy, 'cleanEnvironment', False)}
    # WMCore proxy methods are awfully verbose, reduce logging level when using them
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        proxy = Proxy(proxycfg)
        userproxy = proxy.getProxyFilename(serverRenewer=True)
        proxy.logonRenewMyProxy()
        timeleft = proxy.getTimeLeft(userproxy)
        usergroups = set(proxy.getAllUserGroups(userproxy))
    if timeleft is None or timeleft <= 0:
        msg = "Impossible to retrieve proxy from %s for %s." % (proxycfg['myProxySvr'], proxycfg['userDN'])
        self.logger.error(msg)
        self.logger.error("\n Will try again in verbose mode")
        self.logger.error("===========PROXY ERROR START ==========================")
        with tempSetLogLevel(logger=self.logger, level=logging.DEBUG):
            userproxy = proxy.getProxyFilename(serverRenewer=True)
            proxy.logonRenewMyProxy()
            timeleft = proxy.getTimeLeft(userproxy)
            usergroups = set(proxy.getAllUserGroups(userproxy))
        self.logger.error("===========PROXY ERROR END ==========================")
        raise TaskWorkerException(msg)
    else:
        kwargs['task']['user_proxy'] = userproxy
        kwargs['task']['user_groups'] = usergroups
        self.logger.debug("Valid proxy for %s now in %s", proxycfg['userDN'], userproxy)
        result = Result(task=kwargs['task'], result='OK')
    return result
def executeInternal(self, *args, **kwargs):
    self.logger.info("Data discovery with DBS")  ## to be changed into debug
    dbsurl = self.config.Services.DBSUrl
    if kwargs['task']['tm_dbs_url']:
        dbsurl = kwargs['task']['tm_dbs_url']
    self.dbs = DBSReader(dbsurl)
    self.dbsInstance = self.dbs.dbs.serverinfo()["dbs_instance"]
    isUserDataset = self.dbsInstance.split('/')[1] != 'global'
    # where to look up locations in the pre-Rucio world
    PhEDExOrDBS = 'PhEDEx' if not isUserDataset else 'DBS origin site'
    taskName = kwargs['task']['tm_taskname']
    userProxy = kwargs['task']['user_proxy']
    self.logger.debug("Data discovery through %s for %s", self.dbs, taskName)
    inputDataset = kwargs['task']['tm_input_dataset']
    secondaryDataset = kwargs['task'].get('tm_secondary_input_dataset', None)

    self.checkDatasetStatus(inputDataset, kwargs)
    if secondaryDataset:
        self.checkDatasetStatus(secondaryDataset, kwargs)

    try:
        # Get the list of blocks for the locations.
        # The WMCore DBS3 implementation makes one call to DBS for each block
        # when using locations=True, so we use locations=False and look up locations later
        blocks = [x['Name'] for x in self.dbs.getFileBlocksInfo(inputDataset, locations=False)]
        if secondaryDataset:
            secondaryBlocks = [x['Name'] for x in self.dbs.getFileBlocksInfo(secondaryDataset, locations=False)]
    except DBSReaderError as dbsexc:
        # dataset not found in DBS is a known use case
        if 'No matching data' in str(dbsexc):
            raise TaskWorkerException("CRAB could not find dataset %s in this DBS instance: %s" % (inputDataset, dbsurl))
        raise

    ## Create a map of block locations: for each block get the list of locations.
    ## Note: listFileBlockLocation() gets the locations from PhEDEx first, and if no
    ## locations are found it gets the original locations from DBS. So it should
    ## never be the case at this point that some blocks have no locations.
    ## locationsMap is a dictionary, key=blockName, value=list of PhedexNodes, e.g.:
    ## {'/JetHT/Run2016B-PromptReco-v2/AOD#b10179dc-3723-11e6-9aa5-001e67abf228': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL'],
    ##  '/JetHT/Run2016B-PromptReco-v2/AOD#89b03ca6-1dc9-11e6-b567-001e67ac06a0': [u'T1_IT_CNAF_Buffer', u'T2_US_Wisconsin', u'T1_IT_CNAF_MSS', u'T2_BE_UCL']}

    # For now apply Rucio data location only to NANOAOD*;
    # in time useRucioForLocations may become a richer expression
    isNano = blocks[0].split("#")[0].split("/")[-1] in ["NANOAOD", "NANOAODSIM"]
    if isNano:
        self.logger.info("NANOAOD* dataset. Will use Rucio for data location")
    useRucioForLocations = isNano
    locationsFoundWithRucio = False
    if not useRucioForLocations:
        self.logger.info("Will not use Rucio for this dataset")

    # if locations should be in Rucio, try it first and fall back to the old ways if Rucio calls fail
    # or if they return no locations (possible Rucio teething pain). If Rucio returns a list, trust it.
    if useRucioForLocations:
        locationsMap = {}
        scope = "cms"
        # If the dataset is a USER one, use the Rucio user scope to find it
        # TODO: we need a way to enable users to indicate other users' scopes as source
        if isUserDataset:
            scope = "user.%s" % kwargs['task']['tm_username']
        rucio_config_dict = {
            "phedexCompatible": True,
            "auth_type": "x509",
            "ca_cert": self.config.Services.Rucio_caPath,
            "logger": self.logger,
            "creds": {"client_cert": self.config.TaskWorker.cmscert,
                      "client_key": self.config.TaskWorker.cmskey}
        }
        try:
            self.logger.info("Initializing Rucio client")
            # WMCore is awfully verbose
            with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
                rucioClient = Rucio(self.config.Services.Rucio_account,
                                    hostUrl=self.config.Services.Rucio_host,
                                    authUrl=self.config.Services.Rucio_authUrl,
                                    configDict=rucio_config_dict)
            rucioClient.whoAmI()
            self.logger.info("Looking up data location with Rucio in %s scope.", scope)
            with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
                locations = rucioClient.getReplicaInfoForBlocks(scope=scope, block=list(blocks))
        except Exception as exc:
            msg = "Rucio lookup failed with\n%s" % str(exc)
            # TODO when removing the fall-back to PhEDEx, this should be a fatal error
            # raise TaskWorkerException(msg)
            self.logger.warn(msg)
            locations = None
        # TODO when removing the fall-back to PhEDEx, the code above will raise if it fails,
        # therefore the following "if" must be removed and the code shifted left
        if locations:
            located_blocks = locations['phedex']['block']
            for element in located_blocks:
                if element['replica']:  # only fill the map for blocks which have at least one location
                    locationsMap.update({element['name']: [x['node'] for x in element['replica']]})
        if locationsMap:
            locationsFoundWithRucio = True
        else:
            msg = "No locations found with Rucio for this dataset"
            # since NANO* are not in PhEDEx, this should be a fatal error
            if isNano:
                raise TaskWorkerException(msg)
            else:
                # note it down and try with PhEDEx
                self.logger.warn(msg)

    if not locationsFoundWithRucio:
        # fall back to pre-Rucio methods
        try:
            self.logger.info("Looking up data locations using %s", PhEDExOrDBS)
            locationsMap = self.dbs.listFileBlockLocation(list(blocks), dbsOnly=isUserDataset)
        except Exception as ex:
            raise TaskWorkerException(
                "The CRAB3 server backend could not get the location of the files from dbs nor phedex nor rucio.\n" +
                "This could be a temporary phedex/rucio/dbs glitch, please try to submit a new task (resubmit will not work)" +
                " and contact the experts if the error persists.\nError reason: %s" % str(ex))
    # only keep blocks which have at least one location
    locationsMap = {key: value for key, value in locationsMap.iteritems() if value}

    if secondaryDataset:
        secondaryLocationsMap = {}
        # see https://github.com/dmwm/CRABServer/issues/6075#issuecomment-641569446
        self.logger.info("Trying data location of secondary blocks with Rucio")
        try:
            locations = rucioClient.getReplicaInfoForBlocks(scope=scope, block=list(secondaryBlocks))
        except Exception as exc:
            locations = None
            secondaryLocationsMap = {}
            self.logger.warn("Rucio lookup failed with %s", exc)
        if locations:
            located_blocks = locations['phedex']['block']
            for element in located_blocks:
                if element['replica']:  # only fill the map for blocks which have at least one location
                    secondaryLocationsMap.update({element['name']: [x['node'] for x in element['replica']]})
        if not secondaryLocationsMap:
            msg = "No locations found with Rucio for secondaryDataset."
            # TODO when removing the fall-back to PhEDEx, this should be a fatal error
            # raise TaskWorkerException(msg)
            self.logger.warn(msg)
            self.logger.info("Trying data location of secondary blocks with PhEDEx")
            try:
                secondaryLocationsMap = self.dbs.listFileBlockLocation(list(secondaryBlocks), dbsOnly=isUserDataset)
            except Exception as ex:
                raise TaskWorkerException(
                    "The CRAB3 server backend could not get the location of the secondary dataset files from dbs or phedex or rucio.\n" +
                    "This could be a temporary phedex/rucio/dbs glitch, please try to submit a new task (resubmit will not work)" +
                    " and contact the experts if the error persists.\nError reason: %s" % str(ex))
        # only keep blocks which have at least one location
        secondaryLocationsMap = {key: value for key, value in secondaryLocationsMap.iteritems() if value}

    # From now on the code does not depend on whether Rucio or PhEDEx was used
    blocksWithLocation = locationsMap.keys()
    if secondaryDataset:
        secondaryBlocksWithLocation = secondaryLocationsMap.keys()

    self.keepOnlyDisks(locationsMap)
    if not locationsMap:
        msg = "Task could not be submitted because there is no DISK replica for dataset %s" % inputDataset
        if self.tapeLocations:
            msg += "\nN.B.: the input dataset is stored at %s, but those are TAPE locations." % ', '.join(sorted(self.tapeLocations))
            # submit a request to DDM
            ddmRequest = None
            ddmServer = self.config.TaskWorker.DDMServer
            try:
                ddmRequest = blocksRequest(blocksWithLocation, ddmServer,
                                           self.config.TaskWorker.cmscert,
                                           self.config.TaskWorker.cmskey, verbose=False)
            except HTTPException as hte:
                self.logger.exception(hte)
                msg += "\nThe automatic stage-out failed, please try again later. If the error persists contact the experts and provide this error message:"
                msg += "\nHTTP Error while contacting the DDM server %s:\n%s" % (ddmServer, str(hte))
                msg += "\nHTTP Headers are: %s" % hte.headers
                msg += "\nYou might want to contact your physics group if you need a disk replica."
                raise TaskWorkerException(msg, retry=True)

            self.logger.info("Contacted %s using %s and %s, got:\n%s",
                             self.config.TaskWorker.DDMServer,
                             self.config.TaskWorker.cmscert,
                             self.config.TaskWorker.cmskey, ddmRequest)
            # The query above returns a JSON like:
            # {"result": "OK", "message": "Copy requested",
            #  "data": [{"request_id": 18, "site": <site>, "item": [<list of blocks>],
            #            "group": "AnalysisOps", "n": 1, "status": "new",
            #            "first_request": "2018-02-26 23:57:37",
            #            "last_request": "2018-02-26 23:57:37", "request_count": 1}]}
            if ddmRequest["result"] == "OK":
                # set status to TAPERECALL
                tapeRecallStatus = 'TAPERECALL'
                ddmReqId = ddmRequest["data"][0]["request_id"]
                configreq = {'workflow': taskName,
                             'taskstatus': tapeRecallStatus,
                             'ddmreqid': ddmReqId,
                             'subresource': 'addddmreqid'}
                try:
                    tapeRecallStatusSet = self.server.post(self.restURInoAPI + '/task', data=urllib.urlencode(configreq))
                except HTTPException as hte:
                    self.logger.exception(hte)
                    msg = "HTTP Error while contacting the REST Interface %s:\n%s" % (self.config.TaskWorker.restHost, str(hte))
                    msg += "\nSetting %s status and DDM request ID (%d) failed for task %s" % (tapeRecallStatus, ddmReqId, taskName)
                    msg += "\nHTTP Headers are: %s" % hte.headers
                    raise TaskWorkerException(msg, retry=True)
                msg += "\nA disk replica has been requested on %s to CMS DDM (request ID: %d)" % (ddmRequest["data"][0]["first_request"], ddmReqId)
                if tapeRecallStatusSet[2] == "OK":
                    self.logger.info("Status for task %s set to '%s'", taskName, tapeRecallStatus)
                    msg += "\nThis task will be automatically submitted as soon as the stage-out is completed."
                    self.uploadWarning(msg, userProxy, taskName)
                    raise TapeDatasetException(msg)
                else:
                    msg += ", please try again in two days."
            else:
                msg += "\nThe disk replica request failed with this error:\n %s" % ddmRequest["message"]
        msg += "\nPlease, check DAS (https://cmsweb.cern.ch/das) and make sure the dataset is accessible on DISK."
        raise TaskWorkerException(msg)

    # lumi info is not needed if the user asked for split by file with no run/lumi mask
    splitAlgo = kwargs['task']['tm_split_algo']
    lumiMask = kwargs['task']['tm_split_args']['lumis']
    runRange = kwargs['task']['tm_split_args']['runs']
    needLumiInfo = splitAlgo != 'FileBased' or lumiMask != [] or runRange != []
    # secondary dataset access relies on run/lumi info
    if secondaryDataset:
        needLumiInfo = True
    if needLumiInfo:
        # interested only in blocks with locations; 'blocks' may contain invalid ones and trigger an Exception
        self.checkBlocksSize(blocksWithLocation)
        if secondaryDataset:
            self.checkBlocksSize(secondaryBlocksWithLocation)
    try:
        filedetails = self.dbs.listDatasetFileDetails(inputDataset, getParents=True, getLumis=needLumiInfo, validFileOnly=0)
        if secondaryDataset:
            moredetails = self.dbs.listDatasetFileDetails(secondaryDataset, getParents=False, getLumis=needLumiInfo, validFileOnly=0)
            for secfilename, secinfos in moredetails.items():
                secinfos['lumiobj'] = LumiList(runsAndLumis=secinfos['Lumis'])
            self.logger.info("Beginning to match files from secondary dataset")
            for dummyFilename, infos in filedetails.items():
                infos['Parents'] = []
                lumis = LumiList(runsAndLumis=infos['Lumis'])
                for secfilename, secinfos in moredetails.items():
                    if lumis & secinfos['lumiobj']:
                        infos['Parents'].append(secfilename)
            self.logger.info("Done matching files from secondary dataset")
            kwargs['task']['tm_use_parent'] = 1
    except Exception as ex:  # TODO: should we catch HttpException instead?
        self.logger.exception(ex)
        raise TaskWorkerException(
            "The CRAB3 server backend could not contact DBS to get the file details (Lumis, events, etc.).\n" +
            "This could be a temporary DBS glitch. Please try to submit a new task (resubmit will not work)" +
            " and contact the experts if the error persists.\nError reason: %s" % str(ex))
    # TODO: add the PhEDEx nodes so the user can check themselves
    if not filedetails:
        raise TaskWorkerException(
            ("Cannot find any file inside the dataset. Please, check your dataset in DAS, %s.\n" +
             "Aborting submission. Resubmitting your task will not help.") %
            ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s" % (self.dbsInstance, inputDataset)))

    ## Format the output creating the data structures required by WMCore. Filter out invalid files,
    ## files whose block has no location, and figure out the PSN.
    result = self.formatOutput(task=kwargs['task'], requestname=taskName,
                               datasetfiles=filedetails, locations=locationsMap,
                               tempDir=kwargs['tempDir'])
    if not result.result:
        raise TaskWorkerException(
            ("Cannot find any valid file inside the dataset. Please, check your dataset in DAS, %s.\n" +
             "Aborting submission. Resubmitting your task will not help.") %
            ("https://cmsweb.cern.ch/das/request?instance=%s&input=dataset=%s" % (self.dbsInstance, inputDataset)))
    self.logger.debug("Got %s files", len(result.result.getFiles()))
    return result
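# The secondary-dataset matching in executeInternal above pairs each primary
# file with the secondary files whose lumi sections overlap, relying on
# WMCore's LumiList and its '&' (intersection) operator. A standalone sketch
# with made-up run/lumi numbers:
from WMCore.DataStructs.LumiList import LumiList

primaryLumis = LumiList(runsAndLumis={'297050': [1, 2, 3, 4]})
secondaryLumis = LumiList(runsAndLumis={'297050': [3, 4, 5, 6]})
# truthy when the two files share at least one (run, lumi) pair
if primaryLumis & secondaryLumis:
    print("the secondary file is a parent candidate for this primary file")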
def formatOutput(self, task, requestname, datasetfiles, locations, tempDir):
    """
    Receives as input the result of the data location discovery operations
    and fills up the WMCore objects.
    """
    self.logger.debug(" Formatting data discovery output ")
    wmfiles = []
    event_counter = 0
    lumi_counter = 0
    uniquelumis = set()
    datasetLumis = {}
    blocksWithNoLocations = set()
    ## Loop over the sorted list of files.
    configDict = {"cacheduration": 1, "pycurl": True}  # cache duration is in hours
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        resourceCatalog = CRIC(logger=self.logger, configDict=configDict)
    # can't afford one message from CRIC per file, unless critical!
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        for lfn, infos in datasetfiles.iteritems():
            ## Skip the file if it is not in VALID state.
            if not infos.get('ValidFile', True):
                self.logger.warning("Skipping invalid file %s", lfn)
                continue
            ## Skip the file if its block has not been found or has no locations.
            if not infos['BlockName'] in locations or not locations[infos['BlockName']]:
                self.logger.warning("Skipping %s because its block (%s) has no locations", lfn, infos['BlockName'])
                blocksWithNoLocations.add(infos['BlockName'])
                continue
            if task['tm_use_parent'] == 1 and len(infos['Parents']) == 0:
                self.logger.warning("Skipping %s because it has no parents", lfn)
                continue
            ## Create a WMCore File object.
            size = infos['FileSize']
            checksums = {'Checksum': infos['Checksum'], 'Adler32': infos['Adler32'], 'Md5': infos['Md5']}
            wmfile = File(lfn=lfn, events=infos['NumberOfEvents'], size=size,
                          checksums=checksums, parents=infos['Parents'])
            wmfile['block'] = infos['BlockName']
            try:
                wmfile['locations'] = resourceCatalog.PNNstoPSNs(locations[wmfile['block']])
            except Exception as ex:
                self.logger.error("Impossible to translate %s to a CMS name through the CMS Resource Catalog", locations[wmfile['block']])
                self.logger.error("got this exception:\n %s", ex)
                raise
            wmfile['workflow'] = requestname
            event_counter += infos['NumberOfEvents']
            for run, lumis in infos['Lumis'].iteritems():
                datasetLumis.setdefault(run, []).extend(lumis)
                wmfile.addRun(Run(run, *lumis))
                for lumi in lumis:
                    uniquelumis.add((run, lumi))
                lumi_counter += len(lumis)
            wmfiles.append(wmfile)
    if blocksWithNoLocations:
        msg = "%d blocks will be skipped because they are not completely replicated on DISK: %s" % (len(blocksWithNoLocations), list(blocksWithNoLocations))
        self.logger.warning(msg)
        self.uploadWarning(msg, task['user_proxy'], task['tm_taskname'])
    uniquelumis = len(uniquelumis)
    self.logger.debug('Tot events found: %d', event_counter)
    self.logger.debug('Tot lumis found: %d', uniquelumis)
    self.logger.debug('Duplicate lumis found: %d', (lumi_counter - uniquelumis))
    self.logger.debug('Tot files found: %d', len(wmfiles))
    self.logger.debug("Starting to create compact lumilists for input dataset")
    datasetLumiList = LumiList(runsAndLumis=datasetLumis)
    datasetLumis = datasetLumiList.getCompactList()
    datasetDuplicateLumis = datasetLumiList.getDuplicates().getCompactList()
    self.logger.debug("Finished creating compact lumilists for input dataset")
    with open(os.path.join(tempDir, "input_dataset_lumis.json"), "w") as fd:
        json.dump(datasetLumis, fd)
    with open(os.path.join(tempDir, "input_dataset_duplicate_lumis.json"), "w") as fd:
        json.dump(datasetDuplicateLumis, fd)
    return Result(task=task, result=Fileset(name='FilesToSplit', files=set(wmfiles)))
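# formatOutput above dumps the dataset's compact lumi list and its duplicates
# to JSON files in tempDir. A small sketch of how such a file could be read
# back; the path is illustrative, and it assumes WMCore's LumiList accepts a
# 'compactList' constructor argument (as suggested by getCompactList() above):
import json
import os
from WMCore.DataStructs.LumiList import LumiList

tempDir = '/tmp/task_scratch'  # hypothetical scratch directory
with open(os.path.join(tempDir, "input_dataset_lumis.json")) as fd:
    compactLumis = json.load(fd)  # {run: [[firstLumi, lastLumi], ...], ...}
lumiList = LumiList(compactList=compactLumis)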
def formatOutput(self, task, requestname, datasetfiles, locations, tempDir):
    """
    Receives as input the result of the data location discovery operations
    and fills up the WMCore objects.
    """
    self.logger.debug(" Formatting data discovery output ")
    wmfiles = []
    event_counter = 0
    lumi_counter = 0
    uniquelumis = set()
    datasetLumis = {}
    blocksWithNoLocations = set()
    ## Loop over the sorted list of files.
    configDict = {"cacheduration": 1, "pycurl": True}  # cache duration is in hours
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        resourceCatalog = CRIC(logger=self.logger, configDict=configDict)
    # can't afford one message from CRIC per file, unless critical!
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        for lfn, infos in datasetfiles.iteritems():
            ## Skip the file if it is not in VALID state.
            if not infos.get('ValidFile', True):
                self.logger.warning("Skipping invalid file %s", lfn)
                continue
            ## Skip the file if its block has not been found or has no locations.
            if not infos['BlockName'] in locations or not locations[infos['BlockName']]:
                self.logger.warning("Skipping %s because its block (%s) has no locations", lfn, infos['BlockName'])
                blocksWithNoLocations.add(infos['BlockName'])
                continue
            if task['tm_use_parent'] == 1 and len(infos['Parents']) == 0:
                raise TaskWorkerException(
                    "The CRAB3 server backend refuses to submit jobs to the Grid scheduler\n" +
                    "because you specified useParents=True but some of your files have no " +
                    "parents.\nExample: " + lfn)
            ## Create a WMCore File object.
            try:
                size = infos['FileSize']
                checksums = {'Checksum': infos['Checksum'], 'Adler32': infos['Adler32'], 'Md5': infos['Md5']}
            except KeyError:
                # This is so that the task worker does not crash if an old version of WMCore
                # is used (the interface of an API suddenly changed).
                # We may want to remove the try/except and the following two lines eventually,
                # but keep them for the moment so other devels won't be affected.
                # See this WMCore commit: https://github.com/dmwm/WMCore/commit/2afc01ae571390f5fa009dd258be757adac89c28#diff-374b7a6640288184175057234e393e1cL204
                size = infos['Size']
                checksums = infos['Checksums']
            wmfile = File(lfn=lfn, events=infos['NumberOfEvents'], size=size,
                          checksums=checksums, parents=infos['Parents'])
            wmfile['block'] = infos['BlockName']
            try:
                wmfile['locations'] = resourceCatalog.PNNstoPSNs(locations[wmfile['block']])
            except Exception as ex:
                self.logger.error("Impossible to translate %s to a CMS name through the CMS Resource Catalog", locations[wmfile['block']])
                self.logger.error("got this exception:\n %s", ex)
                raise
            wmfile['workflow'] = requestname
            event_counter += infos['NumberOfEvents']
            for run, lumis in infos['Lumis'].iteritems():
                datasetLumis.setdefault(run, []).extend(lumis)
                wmfile.addRun(Run(run, *lumis))
                for lumi in lumis:
                    uniquelumis.add((run, lumi))
                lumi_counter += len(lumis)
            wmfiles.append(wmfile)
    if blocksWithNoLocations:
        msg = "%d blocks will be skipped because they are not completely replicated on DISK: %s" % (len(blocksWithNoLocations), list(blocksWithNoLocations))
        self.logger.warning(msg)
        self.uploadWarning(msg, task['user_proxy'], task['tm_taskname'])
    uniquelumis = len(uniquelumis)
    self.logger.debug('Tot events found: %d', event_counter)
    self.logger.debug('Tot lumis found: %d', uniquelumis)
    self.logger.debug('Duplicate lumis found: %d', (lumi_counter - uniquelumis))
    self.logger.debug('Tot files found: %d', len(wmfiles))
    self.logger.debug("Starting to create compact lumilists for input dataset")
    datasetLumiList = LumiList(runsAndLumis=datasetLumis)
    datasetLumis = datasetLumiList.getCompactList()
    datasetDuplicateLumis = datasetLumiList.getDuplicates().getCompactList()
    self.logger.debug("Finished creating compact lumilists for input dataset")
    with open(os.path.join(tempDir, "input_dataset_lumis.json"), "w") as fd:
        json.dump(datasetLumis, fd)
    with open(os.path.join(tempDir, "input_dataset_duplicate_lumis.json"), "w") as fd:
        json.dump(datasetDuplicateLumis, fd)
    return Result(task=task, result=Fileset(name='FilesToSplit', files=set(wmfiles)))
def get_proxy_from_MyProxy(self, ad):
    vo = 'cms'
    group = ''
    role = ''
    if 'CRAB_UserVO' in ad and ad['CRAB_UserVO']:
        vo = ad['CRAB_UserVO']
    if 'CRAB_UserGroup' in ad and ad['CRAB_UserGroup'] and ad['CRAB_UserGroup'] != classad.Value.Undefined:
        group = ad['CRAB_UserGroup']
    if 'CRAB_UserRole' in ad and ad['CRAB_UserRole'] and ad['CRAB_UserRole'] != classad.Value.Undefined:
        role = ad['CRAB_UserRole']
    username = ad['CRAB_UserHN']
    proxycfg = {'vo': vo,
                'logger': self.logger,
                'myProxySvr': self.config.Services.MyProxy,
                'proxyValidity': '144:0',
                'min_time_left': MINPROXYLENGTH,  ## do we need this ? or should we use self.myproxylen?
                'userDN': ad['CRAB_UserDN'],
                'userName': username + '_CRAB',
                'group': group,
                'role': role,
                'server_key': self.config.MyProxy.serverhostkey,
                'server_cert': self.config.MyProxy.serverhostcert,
                'serverDN': 'dummy',  # this is only used inside WMCore/Proxy.py functions not used by CRAB
                'uisource': getattr(self.config.MyProxy, 'uisource', ''),
                'credServerPath': self.config.MyProxy.credpath,
                'cleanEnvironment': getattr(self.config.MyProxy, 'cleanEnvironment', False)}
    proxy = Proxy(proxycfg)
    userproxy = proxy.getProxyFilename(serverRenewer=True)
    # try first with the new username_CRAB
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        proxy.logonRenewMyProxy()
        timeleft = proxy.getTimeLeft(userproxy)
    if not timeleft or timeleft <= 0:
        # if that fails, try with the old-fashioned DN hash
        del proxycfg['userName']
        proxy = Proxy(proxycfg)
        with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
            proxy.logonRenewMyProxy()
            timeleft = proxy.getTimeLeft(userproxy)
    if timeleft is None or timeleft <= 0:
        self.logger.error("Impossible to retrieve proxy from %s for %s.",
                          proxycfg['myProxySvr'], proxycfg['userDN'])
        self.logger.error("Repeat the command in verbose mode")
        proxycfg['userName'] = username + '_CRAB'
        proxy = Proxy(proxycfg)
        proxy.logonRenewMyProxy()
        raise Exception("Failed to retrieve proxy.")
    return userproxy
def __init__(self, confFile=None, quiet=False, debug=True, testMode=False):
    """
    Initialise class members
    :arg WMCore.Configuration config: input Publisher configuration
    :arg bool quiet: it tells if a quiet logger is needed
    :arg bool debug: it tells if a verbose logger is needed
    :arg bool testMode: it tells whether to run in test (no subprocesses) mode
    """

    def createLogdir(dirname):
        """
        Create the directory dirname, ignoring errors in case it exists.
        Exit if the directory cannot be created.
        """
        try:
            os.makedirs(dirname)
        except OSError as ose:
            if ose.errno != 17:  # ignore the "Directory already exists" error
                print(str(ose))
                print("The Publisher Worker needs to access the '%s' directory" % dirname)
                sys.exit(1)

    def setRootLogger(logsDir, quiet=False, debug=True, console=False):
        """
        Sets the root logger with the desired verbosity level.
        The root logger logs to logs/log.txt and every single logging
        instruction is propagated to it (not really nice to read).
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if a verbose logger is needed
        :arg bool console: it tells whether to direct all printout to the console
                           rather than files, useful for debugging
        :return logger: a logger with the appropriate logger level.
        """
        createLogdir(logsDir)
        createLogdir(os.path.join(logsDir, 'processes'))
        createLogdir(os.path.join(logsDir, 'tasks'))
        if console:
            # if we are testing, logging to the console is easier
            logging.getLogger().addHandler(logging.StreamHandler())
        else:
            logHandler = MultiProcessingLog(os.path.join(logsDir, 'log.txt'), when='midnight')
            logFormatter = logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
            logHandler.setFormatter(logFormatter)
            logging.getLogger().addHandler(logHandler)
        loglevel = logging.INFO
        if quiet:
            loglevel = logging.WARNING
        if debug:
            loglevel = logging.DEBUG
        logging.getLogger().setLevel(loglevel)
        logger = setMasterLogger()
        logger.debug("PID %s.", os.getpid())
        logger.debug("Logging level initialized to %s.", loglevel)
        return logger

    def logVersionAndConfig(config=None, logger=None):
        """
        Log the version number and the main configuration parameters.
        :arg config: a configuration object loaded from file
        :arg logger: the logger instance to use
        """
        pubstartDict = {}
        pubstartDict['version'] = __version__
        pubstartDict['asoworker'] = config.General.asoworker
        pubstartDict['instance'] = config.General.instance
        if config.General.instance == 'other':
            pubstartDict['restHost'] = config.General.restHost
            pubstartDict['dbInstance'] = config.General.dbInstance
        pubstartDict['max_slaves'] = config.General.max_slaves
        pubstartDict['DBShost'] = config.TaskPublisher.DBShost
        pubstartDict['dryRun'] = config.TaskPublisher.dryRun
        # one line for automatic parsing
        logger.info('PUBSTART: %s', json.dumps(pubstartDict))
        # multiple lines for humans to read
        for k, v in pubstartDict.items():
            logger.info('%s: %s', k, v)
        return

    self.configurationFile = confFile  # remember this, will have to pass it to TaskPublish
    config = loadConfigurationFile(confFile)
    self.config = config.General
    self.TPconfig = config.TaskPublisher

    # these are used for talking to DBS
    os.putenv('X509_USER_CERT', self.config.serviceCert)
    os.putenv('X509_USER_KEY', self.config.serviceKey)

    self.block_publication_timeout = self.config.block_closure_timeout
    self.lfn_map = {}
    self.force_publication = False
    self.force_failure = False
    self.TestMode = testMode
    self.taskFilesDir = self.config.taskFilesDir
    createLogdir(self.taskFilesDir)
    createLogdir(os.path.join(self.taskFilesDir, 'FailedBlocks'))
    self.logger = setRootLogger(self.config.logsDir, quiet=quiet, debug=debug, console=self.TestMode)
    logVersionAndConfig(config, self.logger)

    from WMCore.Credential.Proxy import Proxy
    proxy = Proxy({'logger': self.logger})
    from ServerUtilities import tempSetLogLevel
    with tempSetLogLevel(self.logger, logging.ERROR):
        self.myDN = proxy.getSubjectFromCert(certFile=self.config.serviceCert)

    try:
        instance = self.config.instance
    except AttributeError:
        msg = "No instance provided: need to specify config.General.instance in the configuration"
        raise ConfigException(msg)
    if instance in SERVICE_INSTANCES:
        self.logger.info('Will connect to CRAB service: %s', instance)
        restHost = SERVICE_INSTANCES[instance]['restHost']
        dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
    else:
        msg = "Invalid instance value '%s'" % instance
        raise ConfigException(msg)
    if instance == 'other':
        self.logger.info('Will use restHost and dbInstance from the config file')
        try:
            restHost = self.config.restHost
            dbInstance = self.config.dbInstance
        except AttributeError:
            msg = "Need to specify config.General.restHost and dbInstance in the configuration"
            raise ConfigException(msg)
    self.logger.info('Will connect to CRAB Data Base %s instance via URL: https://%s', dbInstance, restHost)

    # CRAB REST API's
    self.max_files_per_block = self.config.max_files_per_block
    self.crabServer = CRABRest(hostname=restHost,
                               localcert=self.config.serviceCert,
                               localkey=self.config.serviceKey,
                               retry=3, userAgent='CRABPublisher')
    self.crabServer.setDbInstance(dbInstance=dbInstance)
    self.startTime = time.time()
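# The __init__ above reads its settings from a WMCore configuration file via
# loadConfigurationFile(). A hypothetical minimal Publisher configuration
# showing the attributes it accesses (all values are placeholders, not from
# the original source):
from WMCore.Configuration import Configuration

config = Configuration()
config.section_('General')
config.General.instance = 'preprod'            # a key of SERVICE_INSTANCES, or 'other'
config.General.asoworker = 'schedd'            # placeholder
config.General.serviceCert = '/path/to/servicecert.pem'
config.General.serviceKey = '/path/to/servicekey.pem'
config.General.block_closure_timeout = 9400    # seconds, placeholder
config.General.taskFilesDir = '/data/srv/Publisher_files'
config.General.logsDir = '/data/srv/Publisher/logs'
config.General.max_slaves = 5
config.General.max_files_per_block = 100
config.section_('TaskPublisher')
config.TaskPublisher.DBShost = 'cmsweb.cern.ch'  # placeholder
config.TaskPublisher.dryRun = False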
def formatOutput(self, task, requestname, datasetfiles, locations, tempDir):
    """
    Receives as input the result of the data location discovery operations
    and fills up the WMCore objects.
    """
    self.logger.debug(" Formatting data discovery output ")
    wmfiles = []
    event_counter = 0
    lumi_counter = 0
    uniquelumis = set()
    datasetLumis = {}
    blocksWithNoLocations = set()
    ## Loop over the sorted list of files.
    configDict = {"cacheduration": 1, "pycurl": True}  # cache duration is in hours
    resourceCatalog = CRIC(logger=self.logger, configDict=configDict)
    # can't afford one message from CRIC per file, unless critical!
    with tempSetLogLevel(logger=self.logger, level=logging.ERROR):
        for lfn, infos in datasetfiles.iteritems():
            ## Skip the file if it is not in VALID state.
            if not infos.get('ValidFile', True):
                self.logger.warning("Skipping invalid file %s", lfn)
                continue
            ## Skip the file if its block has not been found or has no locations.
            if not infos['BlockName'] in locations or not locations[infos['BlockName']]:
                self.logger.warning("Skipping %s because its block (%s) has no locations", lfn, infos['BlockName'])
                blocksWithNoLocations.add(infos['BlockName'])
                continue
            if task['tm_use_parent'] == 1 and len(infos['Parents']) == 0:
                raise TaskWorkerException(
                    "The CRAB3 server backend refuses to submit jobs to the Grid scheduler\n" +
                    "because you specified useParents=True but some of your files have no " +
                    "parents.\nExample: " + lfn)
            ## Create a WMCore File object.
            try:
                size = infos['FileSize']
                checksums = {'Checksum': infos['Checksum'], 'Adler32': infos['Adler32'], 'Md5': infos['Md5']}
            except KeyError:
                # This is so that the task worker does not crash if an old version of WMCore
                # is used (the interface of an API suddenly changed).
                # We may want to remove the try/except and the following two lines eventually,
                # but keep them for the moment so other devels won't be affected.
                # See this WMCore commit: https://github.com/dmwm/WMCore/commit/2afc01ae571390f5fa009dd258be757adac89c28#diff-374b7a6640288184175057234e393e1cL204
                size = infos['Size']
                checksums = infos['Checksums']
            wmfile = File(lfn=lfn, events=infos['NumberOfEvents'], size=size,
                          checksums=checksums, parents=infos['Parents'])
            wmfile['block'] = infos['BlockName']
            try:
                wmfile['locations'] = resourceCatalog.PNNstoPSNs(locations[wmfile['block']])
            except Exception as ex:
                self.logger.error("Impossible to translate %s to a CMS name through the CMS Resource Catalog", locations[wmfile['block']])
                self.logger.error("got this exception:\n %s", ex)
                raise
            wmfile['workflow'] = requestname
            event_counter += infos['NumberOfEvents']
            for run, lumis in infos['Lumis'].iteritems():
                datasetLumis.setdefault(run, []).extend(lumis)
                wmfile.addRun(Run(run, *lumis))
                for lumi in lumis:
                    uniquelumis.add((run, lumi))
                lumi_counter += len(lumis)
            wmfiles.append(wmfile)
    if blocksWithNoLocations:
        msg = "The locations of some blocks (%d) have not been found: %s" % (len(blocksWithNoLocations), list(blocksWithNoLocations))
        self.logger.warning(msg)
        self.uploadWarning(msg, task['user_proxy'], task['tm_taskname'])
    uniquelumis = len(uniquelumis)
    self.logger.debug('Tot events found: %d', event_counter)
    self.logger.debug('Tot lumis found: %d', uniquelumis)
    self.logger.debug('Duplicate lumis found: %d', (lumi_counter - uniquelumis))
    self.logger.debug('Tot files found: %d', len(wmfiles))
    self.logger.debug("Starting to create compact lumilists for input dataset")
    datasetLumiList = LumiList(runsAndLumis=datasetLumis)
    datasetLumis = datasetLumiList.getCompactList()
    datasetDuplicateLumis = datasetLumiList.getDuplicates().getCompactList()
    self.logger.debug("Finished creating compact lumilists for input dataset")
    with open(os.path.join(tempDir, "input_dataset_lumis.json"), "w") as fd:
        json.dump(datasetLumis, fd)
    with open(os.path.join(tempDir, "input_dataset_duplicate_lumis.json"), "w") as fd:
        json.dump(datasetDuplicateLumis, fd)
    return Result(task=task, result=Fileset(name='FilesToSplit', files=set(wmfiles)))
def __init__(self, confFile=None, quiet=False, debug=True, testMode=False):
    """
    Initialise class members
    :arg WMCore.Configuration config: input Publisher configuration
    :arg bool quiet: it tells if a quiet logger is needed
    :arg bool debug: it tells if a verbose logger is needed
    :arg bool testMode: it tells whether to run in test (no subprocesses) mode
    """

    def createLogdir(dirname):
        """
        Create the directory dirname, ignoring errors in case it exists.
        Exit if the directory cannot be created.
        """
        try:
            os.makedirs(dirname)
        except OSError as ose:
            if ose.errno != 17:  # ignore the "Directory already exists" error
                print(str(ose))
                print("The Publisher Worker needs to access the '%s' directory" % dirname)
                sys.exit(1)

    self.configurationFile = confFile  # remember this, will have to pass it to TaskPublish
    config = loadConfigurationFile(confFile)
    self.config = config.General
    self.TPconfig = config.TaskPublisher

    # these are used for talking to DBS
    os.putenv('X509_USER_CERT', self.config.serviceCert)
    os.putenv('X509_USER_KEY', self.config.serviceKey)

    self.block_publication_timeout = self.config.block_closure_timeout
    self.lfn_map = {}
    self.force_publication = False
    self.force_failure = False
    self.TestMode = testMode
    self.taskFilesDir = self.config.taskFilesDir
    createLogdir(self.taskFilesDir)

    def setRootLogger(logsDir, quiet=False, debug=True, console=False):
        """
        Sets the root logger with the desired verbosity level.
        The root logger logs to logs/twlog.txt and every single logging
        instruction is propagated to it (not really nice to read).
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if a verbose logger is needed
        :arg bool console: it tells whether to direct all printout to the console
                           rather than files, useful for debugging
        :return logger: a logger with the appropriate logger level.
        """
        createLogdir(logsDir)
        createLogdir(logsDir + '/processes')
        createLogdir(logsDir + '/tasks')
        if console:
            # if we are testing, logging to the console is easier
            logging.getLogger().addHandler(logging.StreamHandler())
        else:
            logHandler = MultiProcessingLog(logsDir + '/log.txt', when='midnight')
            logFormatter = logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
            logHandler.setFormatter(logFormatter)
            logging.getLogger().addHandler(logHandler)
        loglevel = logging.INFO
        if quiet:
            loglevel = logging.WARNING
        if debug:
            loglevel = logging.DEBUG
        logging.getLogger().setLevel(loglevel)
        logger = setMasterLogger()
        logger.debug("PID %s.", os.getpid())
        logger.debug("Logging level initialized to %s.", loglevel)
        return logger

    self.logger = setRootLogger(self.config.logsDir, quiet=quiet, debug=debug, console=self.TestMode)

    from WMCore.Credential.Proxy import Proxy
    proxy = Proxy({'logger': self.logger})
    from ServerUtilities import tempSetLogLevel
    with tempSetLogLevel(self.logger, logging.ERROR):
        self.myDN = proxy.getSubjectFromCert(certFile=self.config.serviceCert)

    # CRABServer REST API's (see CRABInterface)
    try:
        instance = self.config.instance
    except AttributeError:
        msg = "No instance provided: need to specify config.General.instance in the configuration"
        raise ConfigException(msg)
    if instance in SERVICE_INSTANCES:
        self.logger.info('Will connect to CRAB service: %s', instance)
        restHost = SERVICE_INSTANCES[instance]['restHost']
        dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
    else:
        msg = "Invalid instance value '%s'" % instance
        raise ConfigException(msg)
    if instance == 'other':
        self.logger.info('Will use restHost and dbInstance from the config file')
        try:
            restHost = self.config.restHost
            dbInstance = self.config.dbInstance
        except AttributeError:
            msg = "Need to specify config.General.restHost and dbInstance in the configuration"
            raise ConfigException(msg)

    restURInoAPI = '/crabserver/' + dbInstance
    self.logger.info('Will connect to CRAB Data Base via URL: https://%s/%s', restHost, restURInoAPI)

    # CRAB REST API's
    self.REST_filetransfers = restURInoAPI + '/filetransfers'
    # self.REST_usertransfers = restURInoAPI + '/fileusertransfers'
    self.REST_filemetadata = restURInoAPI + '/filemetadata'
    self.REST_workflow = restURInoAPI + '/workflow'
    self.REST_task = restURInoAPI + '/task'
    self.max_files_per_block = self.config.max_files_per_block
    self.crabServer = HTTPRequests(url=restHost,
                                   localcert=self.config.serviceCert,
                                   localkey=self.config.serviceKey,
                                   retry=3)