def getLumisToProcess(self, userWebDirURL, jobs, workflow):
    """
    What each job was requested to process

    Get the lumis to process by each job in the workflow.

    :param userWebDirURL: base URL of the task web directory on the schedd;
                          if falsy nothing is downloaded and {} is returned.
    :param jobs: iterable of job ids whose per-job lumi files are looked up
                 inside run_and_lumis.tar.gz.
    :param workflow: task name, used only in log messages.
    :return: dict mapping str(jobid) -> parsed JSON content of job_lumis_<jobid>.json.
    """
    res = {}
    if userWebDirURL:
        url = userWebDirURL + "/run_and_lumis.tar.gz"
        tarFilename = os.path.join(self.requestarea, 'results/run_and_lumis.tar.gz')
        try:
            getFileFromURL(url, tarFilename, self.proxyfilename)
            # Not using 'with tarfile.open(..) as t:' syntax because
            # the tarfile module only received context manager protocol support
            # in python 2.7, whereas CMSSW_5_* uses python 2.6 and breaks here.
            tarball = tarfile.open(tarFilename)
            try:
                for jobid in jobs:
                    filename = "job_lumis_%s.json" % (jobid)
                    try:
                        member = tarball.getmember(filename)
                    except KeyError:
                        # A missing member is tolerated: warn and move on to the next job.
                        self.logger.warning("File %s not found in run_and_lumis.tar.gz for task %s" % (filename, workflow))
                    else:
                        fd = tarball.extractfile(member)
                        try:
                            res[str(jobid)] = json.load(fd)
                        finally:
                            fd.close()
            finally:
                # Close the tarball even when extraction raises half-way
                # (previously it leaked on any exception inside the loop).
                tarball.close()
        except HTTPException as hte:
            # Message fixed: it used to wrongly say "input dataset duplicate lumis"
            # (copy-pasted from getInputDatasetLumis).
            self.logger.error("Failed to retrieve the file with the lumis to process.")
            logging.getLogger('CRAB3').exception(hte)
    return res
def getInputDatasetLumis(self, inputDataset, userWebDirURL):
    """
    What the input dataset had in DBS when the task was submitted

    Get the lumis (and the lumis split across files) in the input dataset.
    Files containing this information were created at data discovery time
    and then copied to the schedd.
    """
    res = {'inputDataset': {'lumis': {}, 'duplicateLumis': {}}}
    if not (inputDataset and userWebDirURL):
        return res
    ## Each entry: (file name in the webdir, key in the result dict,
    ## human readable description used in the error message).
    downloads = [("input_dataset_lumis.json", 'lumis', "input dataset lumis"),
                 ("input_dataset_duplicate_lumis.json", 'duplicateLumis', "input dataset duplicate lumis")]
    for remoteName, resKey, what in downloads:
        url = userWebDirURL + "/" + remoteName
        localName = os.path.join(self.requestarea, 'results/' + remoteName)
        try:
            ## Retrieve the JSON file from the schedd webdir and load it.
            getFileFromURL(url, localName, self.proxyfilename)
            with open(localName) as fd:
                res['inputDataset'][resKey] = json.load(fd)
        except HTTPException as hte:
            self.logger.error("Failed to retrieve %s." % what)
            logging.getLogger('CRAB3').exception(hte)
    return res
def retrieveShortLogs(self, webdir, proxyfilename):
    """
    Download the short job logs (job_out.<jobid>.<retry>.txt) from the schedd
    web directory for every job id selected with --jobids.

    Returns the pair (failed, success) of lists of file names.
    """
    self.logger.info("Retrieving...")
    success = []
    failed = []
    for _, jobid in self.options.jobids:
        ## We don't know a priori how many retries the job had, so fetch
        ## retry 0, 1, ... until a download fails, which is interpreted as
        ## having gone past the highest retry.
        retry = 0
        while True:
            filename = 'job_out.%s.%s.txt' % (jobid, retry)
            url = webdir + '/' + filename
            try:
                getFileFromURL(url, self.dest + '/' + filename, proxyfilename)
            except ClientException as ex:
                ## Ignore the exception if the HTTP status code is 404. Status 404 means file
                ## not found (see http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html). File
                ## not found error is expected, since we try all the job retries.
                if not hasattr(ex, "status") or ex.status != 404:
                    self.logger.debug(str(ex))
                    failed.append(filename)
                break
            self.logger.info('Retrieved %s' % (filename))
            success.append(filename)
            retry += 1  # To retrieve retried job log, if there is any.
    return failed, success
def retrieveShortLogs(self, webdir, proxyfilename):
    """
    Fetch the short versions of the job stdout files from the schedd webdir
    for each job id in self.options.jobids.

    Returns (failed, success): the names of files that could not / could be
    retrieved.
    """
    self.logger.info("Retrieving...")
    success = []
    failed = []
    for _, jobid in self.options.jobids:
        ## The number of retries per job is unknown up front: keep asking for
        ## retry 0, 1, 2, ... until a retrieval fails, meaning we passed the
        ## highest retry that exists.
        retryIndex = 0
        keepGoing = True
        while keepGoing:
            logName = 'job_out.%s.%s.txt' % (jobid, retryIndex)
            try:
                getFileFromURL(webdir + '/' + logName, self.dest + '/' + logName, proxyfilename)
                self.logger.info('Retrieved %s' % (logName))
                success.append(logName)
                retryIndex += 1  # To retrieve retried job log, if there is any.
            except ClientException as ex:
                keepGoing = False
                ## A 404 (file not found, RFC 2616 sec. 10) is expected while
                ## probing retries and is not counted as a failure.
                is404 = hasattr(ex, "status") and ex.status == 404
                if not is404:
                    self.logger.debug(str(ex))
                    failed.append(logName)
    return failed, success
def getLumisToProcess(self, userWebDirURL, numJobs, workflow):
    """
    What each job was requested to process

    Get the lumis to process by each job in the workflow.

    :param userWebDirURL: base URL of the task web directory on the schedd;
                          if falsy nothing is downloaded and {} is returned.
    :param numJobs: number of jobs; lumi files are looked up for job ids 1..numJobs.
    :param workflow: task name, used only in log messages.
    :return: dict mapping str(jobid) -> parsed JSON content of job_lumis_<jobid>.json.
    """
    res = {}
    if userWebDirURL:
        url = userWebDirURL + "/run_and_lumis.tar.gz"
        tarFilename = os.path.join(self.requestarea, 'results/run_and_lumis.tar.gz')
        try:
            getFileFromURL(url, tarFilename, self.proxyfilename)
            with tarfile.open(tarFilename) as tarball:
                for jobid in xrange(1, numJobs + 1):
                    filename = "job_lumis_%d.json" % (jobid)
                    try:
                        member = tarball.getmember(filename)
                    except KeyError:
                        # A missing member is tolerated: warn and continue with the next job.
                        self.logger.warning(
                            "File %s not found in run_and_lumis.tar.gz for task %s" % (filename, workflow))
                    else:
                        fd = tarball.extractfile(member)
                        try:
                            res[str(jobid)] = json.load(fd)
                        finally:
                            fd.close()
        except HTTPException as hte:
            # Message fixed: it used to wrongly say "input dataset duplicate lumis"
            # (copy-pasted from getInputDatasetLumis).
            self.logger.error("Failed to retrieve the file with the lumis to process.")
            logging.getLogger('CRAB3').exception(hte)
    return res
def getInputFiles(self):
    """
    Get the InputFiles.tar.gz and extract the necessary files.

    Depending on the task status, the input tarballs come either from the
    crabcache (status UPLOADED) or from the schedd webdir (status SUBMITTED);
    any other status raises ClientException. Side effect: extracts the
    tarballs into the current working directory and sets self.destination.
    """
    taskname = self.cachedinfo['RequestName']

    #Get task status from the task DB
    self.logger.debug("Getting status from the DB")  # typo fixed: was "from he DB"
    uri = self.getUrl(self.instance, resource = 'task')
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version = __version__)
    crabDBInfo, _, _ = server.get(uri, data = {'subresource': 'search', 'workflow': taskname})
    status = getColumn(crabDBInfo, 'tm_task_status')
    self.destination = getColumn(crabDBInfo, 'tm_asyncdest')

    inputsFilename = os.path.join(os.getcwd(), 'InputFiles.tar.gz')
    if status == 'UPLOADED':
        filecacheurl = getColumn(crabDBInfo, 'tm_cache_url')
        ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint' : filecacheurl, "pycurl": True})
        self.logger.debug("Downloading and extracting 'dry-run-sandbox.tar.gz' from %s" % filecacheurl)
        ufc.downloadLog('dry-run-sandbox.tar.gz', output=os.path.join(os.getcwd(), 'dry-run-sandbox.tar.gz'))
        # NOTE(review): extractall() trusts member paths in the downloaded
        # tarball; the archive comes from the CRAB cache, presumably trusted.
        with tarfile.open('dry-run-sandbox.tar.gz') as tf:
            tf.extractall()
    elif status == 'SUBMITTED':
        webdir = getProxiedWebDir(taskname, self.serverurl, uri, self.proxyfilename, self.logger.debug)
        if not webdir:
            # Fall back to the webdir location stored in the task DB.
            webdir = getColumn(crabDBInfo, 'tm_user_webdir')
        self.logger.debug("Downloading 'InputFiles.tar.gz' from %s" % webdir)
        getFileFromURL(webdir + '/InputFiles.tar.gz', inputsFilename, self.proxyfilename)
    else:
        raise ClientException('Can only execute jobs from tasks in status SUBMITTED or UPLOADED. Current status is %s' % status)

    # InputFiles.tar.gz is expected to exist at this point: it was either
    # downloaded above or unpacked from dry-run-sandbox.tar.gz.
    for name in [inputsFilename, 'CMSRunAnalysis.tar.gz', 'sandbox.tar.gz']:
        with tarfile.open(name) as tf:
            tf.extractall()
def __call__(self):
    """
    Entry point of the getlog command.

    With --short: asks the server for the task webdir, then downloads the
    short job_out.<jobid>.<retry>.txt files for the selected job ids directly
    from it. Without --short: delegates to getcommand.__call__ with
    subresource 'logs'. Returns a dict with 'success' and 'failed' file lists.
    """
    if self.options.short:
        # Ask the REST interface for the task's webdir URL.
        inputlist = {'subresource': 'webdir', 'workflow': self.cachedinfo['RequestName']}
        serverFactory = CRABClient.Emulator.getEmulator('rest')
        server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
        uri = self.getUrl(self.instance, resource = 'task')
        dictresult, status, reason = server.get(uri, data = inputlist)
        self.logger.info('Server result: %s' % dictresult['result'][0])
        dictresult = self.processServerResult(dictresult)
        if status != 200:
            msg = "Problem retrieving information from the server:\ninput:%s\noutput:%s\nreason:%s" % (str(inputlist), str(dictresult), str(reason))
            raise RESTCommunicationException(msg)
        self.setDestination()
        self.logger.info("Setting the destination to %s " % self.dest)
        self.logger.info("Retrieving...")
        success = []
        failed = []
        for item in self.options.jobids:
            jobid = str(item[1])
            # Always fetch retry 0 first.
            filename = 'job_out.'+jobid+'.0.txt'
            url = dictresult['result'][0]+'/'+filename
            try:
                # NOTE(review): getFileFromURL is called here without the
                # proxyfilename argument (other callers in this file pass it);
                # confirm the webdir is readable without a proxy.
                getFileFromURL(url, self.dest+'/'+filename)
                self.logger.info ('Retrieved %s' % (filename))
                success.append(filename)
                retry = 1 #To retrieve retried joblog, if there is any.
                # Probe successive retries with a HEAD-less GET: keep fetching
                # while the server answers 200 for the next retry's file.
                while urllib.urlopen(dictresult['result'][0]+'/'+'job_out.'+jobid+'.'+str(retry)+'.txt').getcode() == 200:
                    filename = 'job_out.'+jobid+'.'+str(retry)+'.txt'
                    url = dictresult['result'][0]+'/'+filename
                    getFileFromURL(url, self.dest+'/'+filename)
                    self.logger.info ('Retrieved %s' % (filename))
                    success.append(filename)
                    retry = retry + 1
            except ClientException as ex:
                # Any download error marks the current filename as failed.
                self.logger.debug(str(ex))
                failed.append(filename)
        if failed:
            msg = "%sError%s: Failed to retrieve the following files: %s" % (colors.RED,colors.NORMAL,failed)
            self.logger.info(msg)
        else:
            self.logger.info("%sSuccess%s: All files successfully retrieved." % (colors.GREEN,colors.NORMAL))
        returndict = {'success': success, 'failed': failed}
    else:
        # Standard path: retrieve the full logs from the storage element.
        returndict = getcommand.__call__(self, subresource = 'logs')
        if ('success' in returndict and not returndict['success']) or \
           ('failed' in returndict and returndict['failed']):
            msg = "You can use the --short option to retrieve a short version of the log files from the Grid scheduler."
            self.logger.info(msg)
    return returndict
def getLumisToProcess(self, userWebDirURL, jobs, workflow):
    """
    What each job was requested to process

    Get the lumis to process by each job in the workflow.

    :param userWebDirURL: base URL of the task web directory on the schedd;
                          if falsy nothing is downloaded and {} is returned.
    :param jobs: iterable of job ids whose per-job lumi files are looked up.
    :param workflow: task name, used only in log messages.
    :return: dict mapping str(jobid) -> parsed JSON content of job_lumis_<jobid>.json.
    """
    res = {}
    if userWebDirURL:
        url = userWebDirURL + "/run_and_lumis.tar.gz"
        tarFilename = os.path.join(self.requestarea, 'results/run_and_lumis.tar.gz')
        try:
            getFileFromURL(url, tarFilename, self.proxyfilename)
            # Not using 'with tarfile.open(..) as t:' syntax because
            # the tarfile module only received context manager protocol support
            # in python 2.7, whereas CMSSW_5_* uses python 2.6 and breaks here.
            tarball = tarfile.open(tarFilename)
            try:
                for jobid in jobs:
                    filename = "job_lumis_%s.json" % (jobid)
                    try:
                        member = tarball.getmember(filename)
                    except KeyError:
                        # Tolerate a missing member: warn and continue.
                        self.logger.warning(
                            "File %s not found in run_and_lumis.tar.gz for task %s" % (filename, workflow))
                    else:
                        fd = tarball.extractfile(member)
                        try:
                            res[str(jobid)] = json.load(fd)
                        finally:
                            fd.close()
            finally:
                # Ensure the tarball is closed even if extraction raises
                # (previously it leaked on any exception inside the loop).
                tarball.close()
        except HTTPException as hte:
            # Message fixed: it used to wrongly say "input dataset duplicate lumis"
            # (copy-pasted from getInputDatasetLumis).
            self.logger.error("Failed to retrieve the file with the lumis to process.")
            logging.getLogger('CRAB3').exception(hte)
    return res
def getInputDatasetLumis(self, inputDataset, userWebDirURL):
    """
    What the input dataset had in DBS when the task was submitted

    Get the lumis (and the lumis split across files) in the input dataset.
    Files containing this information were created at data discovery time
    and then copied to the schedd.
    """
    res = {}
    res['inputDataset'] = {'lumis': {}, 'duplicateLumis': {}}
    if inputDataset and userWebDirURL:

        def fetchJson(remoteFile, target, errorText):
            # Download one JSON file from the webdir and store the parsed
            # content under res['inputDataset'][target]; HTTP failures are
            # logged and leave the default empty dict in place.
            try:
                localPath = os.path.join(self.requestarea, 'results/' + remoteFile)
                getFileFromURL(userWebDirURL + "/" + remoteFile, localPath, self.proxyfilename)
                with open(localPath) as jsonFd:
                    res['inputDataset'][target] = json.load(jsonFd)
            except HTTPException as hte:
                self.logger.error(errorText)
                logging.getLogger('CRAB3').exception(hte)

        ## Retrieve the lumis in the input dataset.
        fetchJson("input_dataset_lumis.json", 'lumis',
                  "Failed to retrieve input dataset lumis.")
        ## Retrieve the lumis split across files in the input dataset.
        fetchJson("input_dataset_duplicate_lumis.json", 'duplicateLumis',
                  "Failed to retrieve input dataset duplicate lumis.")
    return res
def __call__(self):
    """
    Entry point of the status command.

    Queries the task DB via the REST interface, prints the task-level
    information, then (if available) downloads the status_cache file from the
    schedd webdir and prints the per-job status. Returns the pair
    (crabDBInfo, shortResult) — shortResult is None when job info is not
    available yet.
    """
    # Get all of the columns from the database for a certain task
    taskname = self.cachedinfo['RequestName']
    uri = self.getUrl(self.instance, resource = 'task')
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
    crabDBInfo, _, _ = server.get(uri, data = {'subresource': 'search', 'workflow': taskname})
    self.logger.debug("Got information from server oracle database: %s", crabDBInfo)

    user = self.getColumn(crabDBInfo, 'tm_username')
    webdir = self.getColumn(crabDBInfo, 'tm_user_webdir')
    rootDagId = self.getColumn(crabDBInfo, 'clusterid') #that's the condor id from the TW

    #Print information from the database
    self.printTaskInfo(crabDBInfo, user)

    if rootDagId and not webdir:
        # if the dag is submitted and the webdir is not there we have to wait that AdjustSites run
        # and upload the webdir location to the server
        # Bug fix: the '(ID: %s)' placeholder previously had no argument and
        # the literal '%s' was printed; pass rootDagId lazily to the logger.
        self.logger.info("The CRAB server submitted your task to the Grid scheduler (ID: %s)", rootDagId)
        self.logger.info("Waiting for the scheduler to report back the status of your task")
        return crabDBInfo, None

    self.logger.debug("Webdir is located at %s", webdir)
    # Download status_cache file
    self.logger.debug("Retrieving 'status_cache' file from webdir")
    url = webdir + '/' + "status_cache"

    statusCacheInfo = None
    statusCacheFilename = getFileFromURL(url, proxyfilename=self.proxyfilename)
    with open(statusCacheFilename) as fd:
        # Skip first line of the file (it contains info for the caching script) and load job_report summary
        fd.readline()
        statusCacheInfo = literal_eval(fd.readline())
    self.logger.debug("Got information from status cache file: %s", statusCacheInfo)

    self.printDAGStatus(statusCacheInfo)

    shortResult = self.printShort(statusCacheInfo)
    self.printErrors(statusCacheInfo)
    if self.options.summary:
        self.printSummary(statusCacheInfo)
    if self.options.long or self.options.sort:
        # printLong is also used just to build sortdict when only --sort is given.
        sortdict = self.printLong(statusCacheInfo, quiet = (not self.options.long))
        if self.options.sort:
            self.printSort(sortdict, self.options.sort)
    if self.options.json:
        self.logger.info(json.dumps(statusCacheInfo))

    return crabDBInfo, shortResult
def __call__(self):
    """
    Entry point of the status command.

    Queries the task DB via the REST interface, prints the task-level
    information, then — when the task has been submitted and the webdir is
    known — downloads the status_cache file from the schedd webdir and prints
    the per-job status. Returns (crabDBInfo, shortResult); shortResult is
    None whenever the job-level information is not available yet.
    """
    # Get all of the columns from the database for a certain task
    taskname = self.cachedinfo["RequestName"]
    uri = self.getUrl(self.instance, resource="task")
    serverFactory = CRABClient.Emulator.getEmulator("rest")
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
    crabDBInfo, _, _ = server.get(uri, data={"subresource": "search", "workflow": taskname})
    self.logger.debug("Got information from server oracle database: %s", crabDBInfo)

    user = self.getColumn(crabDBInfo, "tm_username")
    webdir = self.getColumn(crabDBInfo, "tm_user_webdir")
    rootDagId = self.getColumn(crabDBInfo, "clusterid")  # that's the condor id from the TW

    # Print information from the database
    self.printTaskInfo(crabDBInfo, user)

    # No cluster id yet: the TW has not handed the task to the scheduler.
    if not rootDagId:
        self.logger.debug(
            "The task has not been submitted to the Grid scheduler yet. Not printing job information."
        )
        return crabDBInfo, None

    self.logger.debug("The CRAB server submitted your task to the Grid scheduler (cluster ID: %s)" % rootDagId)

    if not webdir:
        # if the dag is submitted and the webdir is not there we have to wait that AdjustSites run
        # and upload the webdir location to the server
        self.logger.info("Waiting for the Grid scheduler to bootstrap your task")
        self.logger.debug("Schedd has not reported back the webdir (yet)")
        return crabDBInfo, None

    self.logger.debug("Webdir is located at %s", webdir)
    # Download status_cache file
    self.logger.debug("Retrieving 'status_cache' file from webdir")
    url = webdir + "/" + "status_cache"
    statusCacheInfo = None
    try:
        statusCacheFilename = getFileFromURL(url, proxyfilename=self.proxyfilename)
    except ClientException as ce:
        # The file is created by the task process on the schedd; its absence
        # is treated as "not ready yet" rather than an error.
        self.logger.info("Waiting for the Grid scheduler to report back the status of your task")
        self.logger.debug("Cannot retrieve the status_cache file. Maybe the task process has not run yet?")
        self.logger.debug("Got: %s" % ce)
        return crabDBInfo, None
    else:
        with open(statusCacheFilename) as fd:
            # Skip first line of the file (it contains info for the caching script) and load job_report summary
            fd.readline()
            statusCacheInfo = literal_eval(fd.readline())
    self.logger.debug("Got information from status cache file: %s", statusCacheInfo)

    self.printDAGStatus(crabDBInfo, statusCacheInfo)

    shortResult = self.printShort(statusCacheInfo)
    self.printErrors(statusCacheInfo)
    if self.options.summary:
        self.printSummary(statusCacheInfo)
    if self.options.long or self.options.sort:
        # printLong also builds the dict needed by printSort; run it quietly
        # when only --sort was requested.
        sortdict = self.printLong(statusCacheInfo, quiet=(not self.options.long))
        if self.options.sort:
            self.printSort(sortdict, self.options.sort)
    if self.options.json:
        self.logger.info(json.dumps(statusCacheInfo))

    return crabDBInfo, shortResult
def run(self, filecacheurl = None):
    """ Override run() for JobType

    Downloads the sandbox of an existing task from its schedd webdir,
    re-uploads it to the crabcache at `filecacheurl`, and builds the
    configArguments dict for the new submission (output files, software
    version, optional user input files, lumi mask / run range).

    :param filecacheurl: endpoint of the crabcache where the sandbox is uploaded.
    :return: (sandboxFilename, configArguments)
    :raises CachefileNotFoundException: if the sandbox upload returns no hashkey.
    :raises ConfigurationException: on an invalid lumi mask or run range.
    """
    taskDict, webdir = self.getTaskDict()
    # Output-file lists are stored in the DB as stringified Python literals.
    addoutputfiles = literal_eval(getColumn(taskDict, 'tm_outfiles'))
    tfileoutfiles = literal_eval(getColumn(taskDict, 'tm_tfile_outfiles'))
    edmoutfiles = literal_eval(getColumn(taskDict, 'tm_edm_outfiles'))
    jobarch = getColumn(taskDict, 'tm_job_arch')
    jobsw = getColumn(taskDict, 'tm_job_sw')
    sandboxFilename = os.path.join(self.workdir, 'sandbox.tar.gz')
    getFileFromURL(webdir + '/sandbox.tar.gz', sandboxFilename, self.proxyfilename)
    configArguments = {'addoutputfiles' : addoutputfiles,
                       'tfileoutfiles' : tfileoutfiles,
                       'edmoutfiles' : edmoutfiles,
                       'jobarch' : jobarch,
                       'jobsw' : jobsw,
                      }
    # Maybe the user wants to change the dataset
    if getattr(self.config.Data, 'inputDataset', None):
        configArguments['inputdata'] = self.config.Data.inputDataset
    # Re-upload the sandbox to the crabcache and record its hashkey.
    ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint' : filecacheurl, "pycurl": True})
    result = ufc.upload(sandboxFilename, excludeList = NEW_USER_SANDBOX_EXCLUSIONS)
    if 'hashkey' not in result:
        self.logger.error("Failed to upload source files: %s" % str(result))
        raise CachefileNotFoundException
    configArguments['cacheurl'] = filecacheurl
    configArguments['cachefilename'] = "%s.tar.gz" % str(result['hashkey'])

    # Upload list of user-defined input files to process as the primary input
    userFilesList = getattr(self.config.Data, 'userInputFiles', None)
    if userFilesList:
        self.logger.debug("Attaching list of user-specified primary input files.")
        # NOTE(review): map(string.strip, ...) is Python-2-only (string module
        # function); keep in mind for a py3 migration.
        userFilesList = map(string.strip, userFilesList)
        # Drop empty entries after stripping.
        userFilesList = [file for file in userFilesList if file]
        if len(userFilesList) != len(set(userFilesList)):
            msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
            msg += " CRAB configuration parameter Data.userInputFiles contains duplicated entries."
            msg += " Duplicated entries will be removed."
            self.logger.warning(msg)
        # Deduplicate by converting to a set.
        configArguments['userfiles'] = set(userFilesList)
        configArguments['primarydataset'] = getattr(self.config.Data, 'outputPrimaryDataset', 'CRAB_UserFiles')

    lumi_mask_name = getattr(self.config.Data, 'lumiMask', None)
    lumi_list = None
    if lumi_mask_name:
        self.logger.debug("Attaching lumi mask %s to the request" % (lumi_mask_name))
        try:
            lumi_list = getLumiList(lumi_mask_name, logger = self.logger)
        except ValueError as ex:
            msg = "%sError%s:" % (colors.RED, colors.NORMAL)
            msg += " Failed to load lumi mask %s : %s" % (lumi_mask_name, ex)
            raise ConfigurationException(msg)
    run_ranges = getattr(self.config.Data, 'runRange', None)
    if run_ranges:
        # Accept a comma separated list of integers and inclusive ranges,
        # e.g. '12345,99900-99910'.
        run_ranges_is_valid = re.match('^\d+((?!(-\d+-))(\,|\-)\d+)*$', run_ranges)
        if run_ranges_is_valid:
            run_list = getRunList(run_ranges)
            if lumi_list:
                # Restrict the lumi mask to the requested runs.
                lumi_list.selectRuns(run_list)
                if not lumi_list:
                    msg = "Invalid CRAB configuration: The intersection between the lumi mask and the run range is null."
                    raise ConfigurationException(msg)
            else:
                # Without a lumi mask, cap the run list size to keep the
                # request manageable.
                if len(run_list) > 50000:
                    msg = "CRAB configuration parameter Data.runRange includes %s runs." % str(len(run_list))
                    msg += " When Data.lumiMask is not specified, Data.runRange can not include more than 50000 runs."
                    raise ConfigurationException(msg)
                lumi_list = LumiList(runs = run_list)
        else:
            msg = "Invalid CRAB configuration: Parameter Data.runRange should be a comma separated list of integers or (inclusive) ranges. Example: '12345,99900-99910'"
            raise ConfigurationException(msg)
    if lumi_list:
        configArguments['runs'] = lumi_list.getRuns()
        ## For each run we encode the lumis as a string representing a list of integers: [[1,2],[5,5]] ==> '1,2,5,5'
        lumi_mask = lumi_list.getCompactList()
        configArguments['lumis'] = [str(reduce(lambda x,y: x+y, lumi_mask[run]))[1:-1].replace(' ','') for run in configArguments['runs']]

    configArguments['jobtype'] = 'Analysis'

    return sandboxFilename, configArguments
def __call__(self):
    """
    Entry point of the status command.

    Queries the task DB via the REST interface, prints the task-level
    information, then — when the task has been submitted and the webdir is
    known — downloads the status_cache file from the schedd webdir and prints
    the per-job status. Returns (crabDBInfo, shortResult); shortResult is
    None whenever the job-level information is not available yet.
    """
    # Get all of the columns from the database for a certain task
    taskname = self.cachedinfo['RequestName']
    uri = self.getUrl(self.instance, resource='task')
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
    crabDBInfo, _, _ = server.get(uri, data={'subresource': 'search', 'workflow': taskname})
    self.logger.debug("Got information from server oracle database: %s", crabDBInfo)

    user = self.getColumn(crabDBInfo, 'tm_username')
    webdir = self.getColumn(crabDBInfo, 'tm_user_webdir')
    rootDagId = self.getColumn(crabDBInfo, 'clusterid')  #that's the condor id from the TW

    #Print information from the database
    self.printTaskInfo(crabDBInfo, user)

    # No cluster id yet: the TW has not handed the task to the scheduler.
    if not rootDagId:
        self.logger.debug(
            "The task has not been submitted to the Grid scheduler yet. Not printing job information."
        )
        return crabDBInfo, None

    self.logger.debug(
        "The CRAB server submitted your task to the Grid scheduler (cluster ID: %s)"
        % rootDagId)

    if not webdir:
        # if the dag is submitted and the webdir is not there we have to wait that AdjustSites run
        # and upload the webdir location to the server
        self.logger.info(
            "Waiting for the Grid scheduler to bootstrap your task")
        self.logger.debug("Schedd has not reported back the webdir (yet)")
        return crabDBInfo, None

    self.logger.debug("Webdir is located at %s", webdir)
    # Download status_cache file
    self.logger.debug("Retrieving 'status_cache' file from webdir")
    url = webdir + '/' + "status_cache"
    statusCacheInfo = None
    try:
        statusCacheFilename = getFileFromURL(
            url, proxyfilename=self.proxyfilename)
    except ClientException as ce:
        # The status_cache file is produced by the task process on the
        # schedd; its absence is treated as "not ready yet", not an error.
        self.logger.info(
            "Waiting for the Grid scheduler to report back the status of your task"
        )
        self.logger.debug(
            "Cannot retrieve the status_cache file. Maybe the task process has not run yet?"
        )
        self.logger.debug("Got: %s" % ce)
        return crabDBInfo, None
    else:
        with open(statusCacheFilename) as fd:
            # Skip first line of the file (it contains info for the caching script) and load job_report summary
            fd.readline()
            statusCacheInfo = literal_eval(fd.readline())
    self.logger.debug("Got information from status cache file: %s", statusCacheInfo)

    self.printDAGStatus(crabDBInfo, statusCacheInfo)

    shortResult = self.printShort(statusCacheInfo)
    self.printErrors(statusCacheInfo)
    if self.options.summary:
        self.printSummary(statusCacheInfo)
    if self.options.long or self.options.sort:
        # printLong also builds the dict needed by printSort; run it quietly
        # when only --sort was requested.
        sortdict = self.printLong(statusCacheInfo, quiet=(not self.options.long))
        if self.options.sort:
            self.printSort(sortdict, self.options.sort)
    if self.options.json:
        self.logger.info(json.dumps(statusCacheInfo))

    return crabDBInfo, shortResult