Ejemplo n.º 1
0
    def getInputFiles(self):
        """ Get the InputFiles.tar.gz and extract the necessary files
        """
        taskname = self.cachedinfo['RequestName']

        #Get task status from the task DB
        self.logger.debug("Getting status from he DB")
        uri = self.getUrl(self.instance, resource = 'task')
        serverFactory = CRABClient.Emulator.getEmulator('rest')
        server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version = __version__)
        crabDBInfo, _, _ =  server.get(uri, data = {'subresource': 'search', 'workflow': taskname})
        status = getColumn(crabDBInfo, 'tm_task_status')
        self.destination = getColumn(crabDBInfo, 'tm_asyncdest')

        inputsFilename = os.path.join(os.getcwd(), 'InputFiles.tar.gz')
        if status == 'UPLOADED':
            filecacheurl = getColumn(crabDBInfo, 'tm_cache_url')
            ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint' : filecacheurl, "pycurl": True})
            self.logger.debug("Downloading and extracting 'dry-run-sandbox.tar.gz' from %s" % filecacheurl)
            ufc.downloadLog('dry-run-sandbox.tar.gz', output=os.path.join(os.getcwd(), 'dry-run-sandbox.tar.gz'))
            with tarfile.open('dry-run-sandbox.tar.gz') as tf:
                tf.extractall()
        elif status == 'SUBMITTED':
            webdir = getProxiedWebDir(taskname, self.serverurl, uri, self.proxyfilename, self.logger.debug)
            if not webdir:
                webdir = getColumn(crabDBInfo, 'tm_user_webdir')
            self.logger.debug("Downloading 'InputFiles.tar.gz' from %s" % webdir)
            getFileFromURL(webdir + '/InputFiles.tar.gz', inputsFilename, self.proxyfilename)
        else:
            raise ClientException('Can only execute jobs from tasks in status SUBMITTED or UPLOADED. Current status is %s' % status)

        for name in [inputsFilename, 'CMSRunAnalysis.tar.gz', 'sandbox.tar.gz']:
            with tarfile.open(name) as tf:
                tf.extractall()
Ejemplo n.º 2
0
    def collectReportData(self):
        """
        Gather information from the server, status2, DBS and files in the webdir that is needed for the report.
        """
        reportData = {}

        serverFactory = CRABClient.Emulator.getEmulator('rest')
        server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)

        self.logger.debug('Looking up report for task %s' % self.cachedinfo['RequestName'])

        # Query server for information from the taskdb, intput/output file metadata from metadatadb
        dictresult, status, reason = server.get(self.uri, data = {'workflow': self.cachedinfo['RequestName'], 'subresource': 'report2'})

        self.logger.debug("Result: %s" % dictresult)
        self.logger.info("Running crab status2 first to fetch necessary information.")
        # Get job statuses
        crabDBInfo, shortResult = getMutedStatusInfo(self.logger)

        if not shortResult:
            # No point in continuing if the job list is empty.
            # Can happen when the task is very new / old and the files necessary for status2
            # are unavailable.
            return None
        reportData['jobList'] = shortResult['jobList']

        reportData['runsAndLumis'] = {}

        # Transform status joblist (tuples of job status and job id) into a dictionary
        jobStatusDict = {}
        for status, jobId in reportData['jobList']:
            jobStatusDict[jobId] = status

        # Filter output/input file metadata by finished job state
        for jobId in jobStatusDict:
            if jobStatusDict.get(jobId) in ['finished']:
                reportData['runsAndLumis'][jobId] = dictresult['result'][0]['runsAndLumis'][jobId]

        reportData['publication'] = True if getColumn(crabDBInfo, 'tm_publication') == "T" else False
        userWebDirURL = getColumn(crabDBInfo, 'tm_user_webdir')
        numJobs = len(shortResult['jobList'])

        reportData['lumisToProcess'] = self.getLumisToProcess(userWebDirURL, numJobs, self.cachedinfo['RequestName'])
        reportData['inputDataset'] = getColumn(crabDBInfo, 'tm_input_dataset')

        inputDatasetInfo = self.getInputDatasetLumis(reportData['inputDataset'], userWebDirURL)['inputDataset']
        reportData['inputDatasetLumis'] = inputDatasetInfo['lumis']
        reportData['inputDatasetDuplicateLumis'] = inputDatasetInfo['duplicateLumis']
        reportData['outputDatasets'] = dictresult['result'][0]['taskDBInfo']['outputDatasets']

        if reportData['publication']:
            reportData['outputDatasetsInfo'] = self.getDBSPublicationInfo(reportData['outputDatasets'])

        return reportData
Ejemplo n.º 3
0
def checkTaskInfo(crabserver, ad):
    """
    Function checks that given task is registered in the database with status SUBMITTED and with the
    same clusterId and schedd name in the database as in the condor ads where it is currently running.
    In case above condition is not met, script immediately terminates
    """

    task = ad['CRAB_ReqName']
    clusterIdOnSchedd = ad['ClusterId']
    data = {'subresource': 'search', 'workflow': task}

    try:
        dictresult, _, _ = crabserver.get(api='task', data=data)
    except HTTPException as hte:
        printLog(traceback.format_exc())
        printLog(hte.headers)
        printLog(hte.result)
        sys.exit(2)

    taskStatusOnDB = getColumn(dictresult, 'tm_task_status')
    clusteridOnDB = getColumn(dictresult, 'clusterid')
    scheddOnDB = getColumn(dictresult, 'tm_schedd')

    scheddName = os.environ['schedd_name']

    printLog(
        'Task status on DB: %s, clusterID on DB: %s, schedd name on DB: %s; \nclusterID on condor ads: %s, schedd name on condor ads: %s '
        % (taskStatusOnDB, clusteridOnDB, scheddOnDB, clusterIdOnSchedd,
           scheddName))

    if not (taskStatusOnDB == 'SUBMITTED' and scheddOnDB == scheddName
            and clusteridOnDB == str(clusterIdOnSchedd)):
        printLog(
            'Exiting AdjustSites because this dagman does not match task information in TASKS DB'
        )
        sys.exit(3)
Ejemplo n.º 4
0
    def searchTask(self, workflow):
        """

        :param workflow:
        :return:
        """
        data = dict()
        data['workflow'] = workflow
        data['subresource'] = 'search'
        try:
            result = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'task'),
                                       data=encodeRequest(data))
            self.logger.debug("task: %s " % str(result[0]))
            self.logger.debug("task: %s " % getColumn(result[0], 'tm_last_publication'))
        except Exception as ex:
            self.logger.error("Error during task doc retrieving: %s" % ex)
            return {}

        return oracleOutputMapping(result)
Ejemplo n.º 5
0
    def startSlave(self, task):
        # TODO: lock task!
        # - process logger
        logger = setProcessLogger(str(task[0][3]))
        logger.info("Process %s is starting. PID %s", task[0][3], os.getpid())

        self.force_publication = False
        workflow = str(task[0][3])
        wfnamemsg = "%s: " % (workflow)

        if int(workflow[0:2]) < 18:
            msg = "Skipped. Ignore tasks created before 2018."
            logger.info(wfnamemsg+msg)
            return 0

        if len(task[1]) > self.max_files_per_block:
            self.force_publication = True
            msg = "All datasets have more than %s ready files." % (self.max_files_per_block)
            msg += " No need to retrieve task status nor last publication time."
            logger.info(wfnamemsg+msg)
        else:
            msg = "At least one dataset has less than %s ready files." % (self.max_files_per_block)
            logger.info(wfnamemsg+msg)
            # Retrieve the workflow status. If the status can not be retrieved, continue
            # with the next workflow.
            workflow_status = ''
            url = '/'.join(self.cache_area.split('/')[:-1]) #+ '/workflow'
            msg = "Retrieving status from %s" % (url)
            logger.info(wfnamemsg+msg)
            header = {"Content-Type":"application/json"}
            data = {'workflow': workflow, 'subresource': 'taskads'}
            url = 'cmsweb-testbed.cern.ch'
            connection = HTTPRequests(url,
                                     self.config.opsCert,
                                     self.config.opsKey)
            
            try:
                res = connection.get('/crabserver/preprod/workflow', data=encodeRequest(data))
            except Exception as ex:
                #logger.info(wfnamemsg+encodeRequest(data))
                logger.warn('Error retrieving status from cache for %s.' % workflow)
                return 0

            try:
                workflow_status = res[0]['result'][0]['status']
                msg = "Task status is %s." % workflow_status
                logger.info(wfnamemsg+msg)
            except ValueError:
                msg = "Workflow removed from WM."
                logger.error(wfnamemsg+msg)
                workflow_status = 'REMOVED'
            except Exception as ex:
                msg = "Error loading task status!"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logger.error(wfnamemsg+msg)
            # If the workflow status is terminal, go ahead and publish all the ready files
            # in the workflow.
            if workflow_status in ['COMPLETED', 'FAILED', 'KILLED', 'REMOVED']:
                self.force_publication = True
                if workflow_status in ['KILLED', 'REMOVED']:
                    self.force_failure = True
                msg = "Considering task status as terminal. Will force publication."
                logger.info(wfnamemsg+msg)
            # Otherwise...
            else:
                msg = "Task status is not considered terminal."
                logger.info(wfnamemsg+msg)
                msg = "Getting last publication time."
                logger.info(wfnamemsg+msg)
                # Get when was the last time a publication was done for this workflow (this
                # should be more or less independent of the output dataset in case there are
                # more than one).
                last_publication_time = None
                data = {}
                data['workflow'] = workflow
                data['subresource'] = 'search'
                try:
                    result = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'task'),
                                               data=encodeRequest(data))
                    logger.debug("task: %s " %  str(result[0]))
                    logger.debug("task: %s " %  getColumn(result[0], 'tm_last_publication'))
                except Exception as ex:
                    logger.error("Error during task doc retrieving: %s" %ex)
                if last_publication_time:
                    date = oracleOutputMapping(result)['last_publication']
                    seconds = datetime.strptime(date, "%Y-%m-%d %H:%M:%S.%f").timetuple()
                    last_publication_time = time.mktime(seconds)

                msg = "Last publication time: %s." % str(last_publication_time)
                logger.debug(wfnamemsg+msg)
                # If this is the first time a publication would be done for this workflow, go
                # ahead and publish.
                if not last_publication_time:
                    self.force_publication = True
                    msg = "There was no previous publication. Will force publication."
                    logger.info(wfnamemsg+msg)
                # Otherwise...
                else:
                    last = last_publication_time
                    msg = "Last published block: %s" % (last)
                    logger.debug(wfnamemsg+msg)
                    # If the last publication was long time ago (> our block publication timeout),
                    # go ahead and publish.
                    now = int(time.time()) - time.timezone
                    time_since_last_publication = now - last
                    hours = int(time_since_last_publication/60/60)
                    minutes = int((time_since_last_publication - hours*60*60)/60)
                    timeout_hours = int(self.block_publication_timeout/60/60)
                    timeout_minutes = int((self.block_publication_timeout - timeout_hours*60*60)/60)
                    msg = "Last publication was %sh:%sm ago" % (hours, minutes)
                    if time_since_last_publication > self.block_publication_timeout:
                        self.force_publication = True
                        msg += " (more than the timeout of %sh:%sm)." % (timeout_hours, timeout_minutes)
                        msg += " Will force publication."
                    else:
                        msg += " (less than the timeout of %sh:%sm)." % (timeout_hours, timeout_minutes)
                        msg += " Not enough to force publication."
                    logger.info(wfnamemsg+msg)

        #logger.info(task[1])
        try:
            if self.force_publication:
                # - get info
                active_ = [{'key': [x['username'],
                                    x['user_group'],
                                    x['user_role'],
                                    x['taskname']],
                            'value': [x['destination'],
                                      x['source_lfn'],
                                      x['destination_lfn'],
                                      x['input_dataset'],
                                      x['dbs_url'],
                                      x['last_update']
                                     ]}
                           for x in task[1] if x['transfer_state'] == 3 and x['publication_state'] not in [2, 3, 5]]

                lfn_ready = []
                wf_jobs_endtime = []
                pnn, input_dataset, input_dbs_url = "", "", ""
                for active_file in active_:
                    job_end_time = active_file['value'][5]
                    if job_end_time and self.config.isOracle:
                        wf_jobs_endtime.append(int(job_end_time) - time.timezone)
                    elif job_end_time:
                        wf_jobs_endtime.append(int(time.mktime(time.strptime(str(job_end_time), '%Y-%m-%d %H:%M:%S'))) - time.timezone)
                    source_lfn = active_file['value'][1]
                    dest_lfn = active_file['value'][2]
                    self.lfn_map[dest_lfn] = source_lfn
                    if not pnn or not input_dataset or not input_dbs_url:
                        pnn = str(active_file['value'][0])
                        input_dataset = str(active_file['value'][3])
                        input_dbs_url = str(active_file['value'][4])
                    lfn_ready.append(dest_lfn)

                userDN = ''
                username = task[0][0]
                user_group = ""
                if task[0][1]:
                    user_group = task[0][1]
                user_role = ""
                if task[0][2]:
                    user_role = task[0][2]
                logger.debug("Trying to get DN %s %s %s" % (username, user_group, user_role))

                try:
                    userDN = getDNFromUserName(username, logger)
                except Exception as ex:
                    msg = "Error retrieving the user DN"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    logger.error(msg)
                    return 1

                # Get metadata
                toPublish = []
                publDescFiles_list = self.getPublDescFiles(workflow, lfn_ready)
                for file_ in active_:
                    for _, doc in enumerate(publDescFiles_list):
                        #logger.info(type(doc))
                        #logger.info(doc)
                        if doc["lfn"] == file_["value"][2]:
                            doc["User"] = username
                            doc["Group"] = file_["key"][1]
                            doc["Role"] = file_["key"][2]
                            doc["UserDN"] = userDN
                            doc["Destination"] = file_["value"][0]
                            doc["SourceLFN"] = file_["value"][1]
                            toPublish.append(doc)
                with open("/tmp/publisher_files/"+workflow+'.json', 'w') as outfile:
                    json.dump(toPublish, outfile)
                logger.info(". publisher.py %s" % (workflow))

                # find the location in the current environment of the script we want to run
                import Publisher.TaskPublish as tp
                taskPublishScript = tp.__file__
                subprocess.call(["python", taskPublishScript, workflow])

        except:
            logger.exception("Exception!")


        return 0
Ejemplo n.º 6
0
    def __call__(self):

        self.logger.info('Getting the tarball hash key')
        inputlist = {'subresource': 'search', 'workflow': self.cachedinfo['RequestName']}
        serverFactory = CRABClient.Emulator.getEmulator('rest')
        server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
        uri = getUrl(self.instance, resource = 'task')
        dictresult, _, _ =  server.get(uri, data = inputlist)

        tm_user_sandbox = getColumn(dictresult, 'tm_user_sandbox')
        hashkey = tm_user_sandbox.replace(".tar.gz","")

        # Get the schedd address from the DB info and strip off the 'crab3@' prefix if it exists
        scheddaddress = getColumn(dictresult, 'tm_schedd')
        scheddaddress = scheddaddress.split('@')[1] if '@' in scheddaddress else scheddaddress

        self.logger.info('Checking task status')
        serverFactory = CRABClient.Emulator.getEmulator('rest')
        server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
        dictresult, _, _ = server.get(self.uri, data = {'workflow': self.cachedinfo['RequestName'], 'verbose': 0})

        dictresult = dictresult['result'][0] #take just the significant part

        self.logger.info('Task status: %s' % dictresult['status'])
        accepstate = ['SUBMITFAILED','KILLED','FINISHED','FAILED','KILLFAILED', 'COMPLETED']
        if dictresult['status'] not in accepstate:
            msg = ('%sError%s: Only tasks with these status can be purged: {0}'.format(accepstate) % (colors.RED, colors.NORMAL))
            raise ConfigurationException(msg)

        #getting the cache url
        cacheresult = {}
        scheddresult = {}
        gsisshdict = {}
        if not self.options.scheddonly:
            baseurl = getUrl(self.instance, resource='info')
            cacheurl = server_info('backendurls', self.serverurl, self.proxyfilename, baseurl)
            cacheurl = cacheurl['cacheSSL']
            cacheurldict = {'endpoint': cacheurl, 'pycurl': True}

            ufc = UserFileCache(cacheurldict)
            self.logger.info('Tarball hashkey: %s' %hashkey)
            self.logger.info('Attempting to remove task file from crab server cache')

            try:
                ufcresult = ufc.removeFile(hashkey)
            except HTTPException as re:
                if 'X-Error-Info' in re.headers and 'Not such file' in re.headers['X-Error-Info']:
                    self.logger.info('%sError%s: Failed to find task file in crab server cache; the file might have been already purged' % (colors.RED,colors.NORMAL))
                raise

            if ufcresult == '':
                self.logger.info('%sSuccess%s: Successfully removed task files from crab server cache' % (colors.GREEN, colors.NORMAL))
                cacheresult = 'SUCCESS'
            else:
                self.logger.info('%sError%s: Failed to remove task files from crab server cache' % (colors.RED, colors.NORMAL))
                cacheresult = 'FAILED'

        if not self.options.cacheonly:
            self.logger.debug('%sSuccess%s: Successfully got schedd address' % (colors.GREEN, colors.NORMAL))
            self.logger.debug('Schedd address: %s' % scheddaddress)
            self.logger.info('Attempting to remove task from schedd')

            gssishrm = 'gsissh -o ConnectTimeout=60 -o PasswordAuthentication=no ' + scheddaddress + ' rm -rf ' + self.cachedinfo['RequestName']
            self.logger.debug('gsissh command: %s' % gssishrm)

            delprocess=subprocess.Popen(gssishrm, stdout= subprocess.PIPE, stderr= subprocess.PIPE, shell=True)
            stdout, stderr = delprocess.communicate()
            exitcode = delprocess.returncode

            if exitcode == 0 :
                self.logger.info('%sSuccess%s: Successfully removed task from schedd' % (colors.GREEN, colors.NORMAL))
                scheddresult = 'SUCCESS'
                gsisshdict = {}
            else :
                self.logger.info('%sError%s: Failed to remove task from schedd' % (colors.RED, colors.NORMAL))
                scheddaddress = 'FAILED'
                self.logger.debug('gsissh stdout: %s\ngsissh stderr: %s\ngsissh exitcode: %s' % (stdout,stderr,exitcode))
                gsisshdict = {'stdout' : stdout, 'stderr' : stderr , 'exitcode' : exitcode}

            return {'cacheresult' : cacheresult , 'scheddresult' : scheddresult , 'gsiresult' : gsisshdict}
Ejemplo n.º 7
0
    def __call__(self):

        self.logger.info('Getting the tarball hash key')
        inputlist = {'subresource': 'search', 'workflow': self.cachedinfo['RequestName']}
        serverFactory = CRABClient.Emulator.getEmulator('rest')
        server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
        uri = self.getUrl(self.instance, resource = 'task')
        dictresult, _, _ =  server.get(uri, data = inputlist)

        tm_user_sandbox = getColumn(dictresult, 'tm_user_sandbox')
        hashkey = tm_user_sandbox.replace(".tar.gz","")

        # Get the schedd address from the DB info and strip off the 'crab3@' prefix if it exists
        scheddaddress = getColumn(dictresult, 'tm_schedd')
        scheddaddress = scheddaddress.split('@')[1] if '@' in scheddaddress else scheddaddress

        self.logger.info('Checking task status')
        serverFactory = CRABClient.Emulator.getEmulator('rest')
        server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
        dictresult, _, _ = server.get(self.uri, data = {'workflow': self.cachedinfo['RequestName'], 'verbose': 0})

        dictresult = dictresult['result'][0] #take just the significant part

        self.logger.info('Task status: %s' % dictresult['status'])
        accepstate = ['SUBMITFAILED','KILLED','FINISHED','FAILED','KILLFAILED', 'COMPLETED']
        if dictresult['status'] not in accepstate:
            msg = ('%sError%s: Only tasks with these status can be purged: {0}'.format(accepstate) % (colors.RED, colors.NORMAL))
            raise ConfigurationException(msg)

        #getting the cache url
        cacheresult = {}
        scheddresult = {}
        gsisshdict = {}
        if not self.options.scheddonly:
            baseurl = getUrl(self.instance, resource='info')
            cacheurl = server_info('backendurls', self.serverurl, self.proxyfilename, baseurl)
            cacheurl = cacheurl['cacheSSL']
            cacheurldict = {'endpoint': cacheurl, 'pycurl': True}

            ufc = UserFileCache(cacheurldict)
            self.logger.info('Tarball hashkey: %s' %hashkey)
            self.logger.info('Attempting to remove task file from crab server cache')

            try:
                ufcresult = ufc.removeFile(hashkey)
            except HTTPException as re:
                if 'X-Error-Info' in re.headers and 'Not such file' in re.headers['X-Error-Info']:
                    self.logger.info('%sError%s: Failed to find task file in crab server cache; the file might have been already purged' % (colors.RED,colors.NORMAL))
                raise

            if ufcresult == '':
                self.logger.info('%sSuccess%s: Successfully removed task files from crab server cache' % (colors.GREEN, colors.NORMAL))
                cacheresult = 'SUCCESS'
            else:
                self.logger.info('%sError%s: Failed to remove task files from crab server cache' % (colors.RED, colors.NORMAL))
                cacheresult = 'FAILED'

        if not self.options.cacheonly:
            self.logger.debug('%sSuccess%s: Successfully got schedd address' % (colors.GREEN, colors.NORMAL))
            self.logger.debug('Schedd address: %s' % scheddaddress)
            self.logger.info('Attempting to remove task from schedd')

            gssishrm = 'gsissh -o ConnectTimeout=60 -o PasswordAuthentication=no ' + scheddaddress + ' rm -rf ' + self.cachedinfo['RequestName']
            self.logger.debug('gsissh command: %s' % gssishrm)

            delprocess=subprocess.Popen(gssishrm, stdout= subprocess.PIPE, stderr= subprocess.PIPE, shell=True)
            stdout, stderr = delprocess.communicate()
            exitcode = delprocess.returncode

            if exitcode == 0 :
                self.logger.info('%sSuccess%s: Successfully removed task from schedd' % (colors.GREEN, colors.NORMAL))
                scheddresult = 'SUCCESS'
                gsisshdict = {}
            else :
                self.logger.info('%sError%s: Failed to remove task from schedd' % (colors.RED, colors.NORMAL))
                scheddaddress = 'FAILED'
                self.logger.debug('gsissh stdout: %s\ngsissh stderr: %s\ngsissh exitcode: %s' % (stdout,stderr,exitcode))
                gsisshdict = {'stdout' : stdout, 'stderr' : stderr , 'exitcode' : exitcode}

            return {'cacheresult' : cacheresult , 'scheddresult' : scheddresult , 'gsiresult' : gsisshdict}
Ejemplo n.º 8
0
    def startSlave(self, task):
        """
        start a slave process to deal with publication for a single task
        :param task: one tupla describing  a task as returned by  active_tasks()
        :return: 0  It will always terminate normally, if publication fails it will mark it in the DB
        """
        # TODO: lock task!
        # - process logger
        logger = setSlaveLogger(str(task[0][3]))
        logger.info("Process %s is starting. PID %s", task[0][3], os.getpid())

        self.force_publication = False
        workflow = str(task[0][3])

        if len(task[1]) > self.max_files_per_block:
            self.force_publication = True
            msg = "All datasets have more than %s ready files." % (
                self.max_files_per_block)
            msg += " No need to retrieve task status nor last publication time."
            logger.info(msg)
        else:
            msg = "At least one dataset has less than %s ready files." % (
                self.max_files_per_block)
            logger.info(msg)
            # Retrieve the workflow status. If the status can not be retrieved, continue
            # with the next workflow.
            workflow_status = ''
            msg = "Retrieving status"
            logger.info(msg)
            data = encodeRequest({'workflow': workflow})
            try:
                res = self.crabServer.get(api='workflow', data=data)
            except Exception as ex:
                logger.warn(
                    'Error retrieving status from crabserver for %s:\n%s',
                    workflow, str(ex))
                return 0

            try:
                workflow_status = res[0]['result'][0]['status']
                msg = "Task status is %s." % workflow_status
                logger.info(msg)
            except ValueError:
                msg = "Workflow removed from WM."
                logger.error(msg)
                workflow_status = 'REMOVED'
            except Exception as ex:
                msg = "Error loading task status!"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logger.error(msg)
            # If the workflow status is terminal, go ahead and publish all the ready files
            # in the workflow.
            if workflow_status in ['COMPLETED', 'FAILED', 'KILLED', 'REMOVED']:
                self.force_publication = True
                if workflow_status in ['KILLED', 'REMOVED']:
                    self.force_failure = True
                msg = "Considering task status as terminal. Will force publication."
                logger.info(msg)
            # Otherwise...
            else:  ## TODO put this else in a function like def checkForPublication()
                msg = "Task status is not considered terminal."
                logger.info(msg)
                msg = "Getting last publication time."
                logger.info(msg)
                # Get when was the last time a publication was done for this workflow (this
                # should be more or less independent of the output dataset in case there are
                # more than one).
                last_publication_time = None
                data = encodeRequest({
                    'workflow': workflow,
                    'subresource': 'search'
                })
                try:
                    result = self.crabServer.get(api='task', data=data)
                    logger.debug("task: %s ", str(result[0]))
                    last_publication_time = getColumn(result[0],
                                                      'tm_last_publication')
                except Exception as ex:
                    logger.error("Error during task doc retrieving:\n%s", ex)
                if last_publication_time:
                    date = last_publication_time  # datetime in Oracle format
                    timetuple = datetime.strptime(
                        date, "%Y-%m-%d %H:%M:%S.%f").timetuple(
                        )  # convert to time tuple
                    last_publication_time = time.mktime(
                        timetuple)  # convert to seconds since Epoch (float)

                msg = "Last publication time: %s." % str(last_publication_time)
                logger.debug(msg)
                # If this is the first time a publication would be done for this workflow, go
                # ahead and publish.
                if not last_publication_time:
                    self.force_publication = True
                    msg = "There was no previous publication. Will force publication."
                    logger.info(msg)
                # Otherwise...
                else:
                    last = last_publication_time
                    msg = "Last published block time: %s" % last
                    logger.debug(msg)
                    # If the last publication was long time ago (> our block publication timeout),
                    # go ahead and publish.
                    now = int(time.time()) - time.timezone
                    time_since_last_publication = now - last
                    hours = int(time_since_last_publication / 60 / 60)
                    minutes = int(
                        (time_since_last_publication - hours * 60 * 60) / 60)
                    timeout_hours = int(self.block_publication_timeout / 60 /
                                        60)
                    timeout_minutes = int((self.block_publication_timeout -
                                           timeout_hours * 60 * 60) / 60)
                    msg = "Last publication was %sh:%sm ago" % (hours, minutes)
                    if time_since_last_publication > self.block_publication_timeout:
                        self.force_publication = True
                        msg += " (more than the timeout of %sh:%sm)." % (
                            timeout_hours, timeout_minutes)
                        msg += " Will force publication."
                    else:
                        msg += " (less than the timeout of %sh:%sm)." % (
                            timeout_hours, timeout_minutes)
                        msg += " Not enough to force publication."
                    logger.info(msg)

        # logger.info(task[1])
        try:
            if self.force_publication:
                # - get info
                active_ = [{
                    'key': [
                        x['username'], x['user_group'], x['user_role'],
                        x['taskname']
                    ],
                    'value': [
                        x['destination'], x['source_lfn'],
                        x['destination_lfn'], x['input_dataset'], x['dbs_url'],
                        x['last_update']
                    ]
                } for x in task[1] if x['transfer_state'] == 3
                           and x['publication_state'] not in [2, 3, 5]]

                lfn_ready = []
                wf_jobs_endtime = []
                pnn, input_dataset, input_dbs_url = "", "", ""
                for active_file in active_:
                    job_end_time = active_file['value'][5]
                    if job_end_time:
                        wf_jobs_endtime.append(
                            int(job_end_time) - time.timezone)
                    source_lfn = active_file['value'][1]
                    dest_lfn = active_file['value'][2]
                    self.lfn_map[dest_lfn] = source_lfn
                    if not pnn or not input_dataset or not input_dbs_url:
                        pnn = str(active_file['value'][0])
                        input_dataset = str(active_file['value'][3])
                        input_dbs_url = str(active_file['value'][4])
                    lfn_ready.append(dest_lfn)

                username = task[0][0]

                # Get metadata
                toPublish = []
                toFail = []
                publDescFiles_list = self.getPublDescFiles(
                    workflow, lfn_ready, logger)
                for file_ in active_:
                    metadataFound = False
                    for doc in publDescFiles_list:
                        # logger.info(type(doc))
                        # logger.info(doc)
                        if doc["lfn"] == file_["value"][2]:
                            doc["User"] = username
                            doc["Group"] = file_["key"][1]
                            doc["Role"] = file_["key"][2]
                            doc["UserDN"] = self.myDN
                            doc["Destination"] = file_["value"][0]
                            doc["SourceLFN"] = file_["value"][1]
                            toPublish.append(doc)
                            metadataFound = True
                            break

                    # if we failed to find metadata mark publication as failed to avoid to keep looking
                    # at same files over and over
                    if not metadataFound:
                        toFail.append(file_["value"][1])
                with open(self.taskFilesDir + workflow + '.json',
                          'w') as outfile:
                    json.dump(toPublish, outfile)
                logger.debug(
                    'Unitarity check: active_:%d toPublish:%d toFail:%d',
                    len(active_), len(toPublish), len(toFail))
                if len(toPublish) + len(toFail) != len(active_):
                    logger.error("SOMETHING WRONG IN toPublish vs toFail !!")
                if toFail:
                    logger.info(
                        'Did not find useful metadata for %d files. Mark as failed',
                        len(toFail))
                    from ServerUtilities import getHashLfn
                    nMarked = 0
                    for lfn in toFail:
                        source_lfn = lfn
                        docId = getHashLfn(source_lfn)
                        data = dict()
                        data['asoworker'] = self.config.asoworker
                        data['subresource'] = 'updatePublication'
                        data['list_of_ids'] = docId
                        data['list_of_publication_state'] = 'FAILED'
                        data['list_of_retry_value'] = 1
                        data[
                            'list_of_failure_reason'] = 'File type not EDM or metadata not found'
                        try:
                            result = self.crabServer.post(
                                api='filetransfers', data=encodeRequest(data))
                            #logger.debug("updated DocumentId: %s lfn: %s Result %s", docId, source_lfn, result)
                        except Exception as ex:
                            logger.error(
                                "Error updating status for DocumentId: %s lfn: %s",
                                docId, source_lfn)
                            logger.error("Error reason: %s", ex)

                        nMarked += 1
                        #if nMarked % 10 == 0:
                    logger.info('marked %d files as Failed', nMarked)

                # find the location in the current environment of the script we want to run
                import Publisher.TaskPublish as tp
                taskPublishScript = tp.__file__
                cmd = "python %s " % taskPublishScript
                cmd += " --configFile=%s" % self.configurationFile
                cmd += " --taskname=%s" % workflow
                if self.TPconfig.dryRun:
                    cmd += " --dry"
                logger.info("Now execute: %s", cmd)
                stdout, stderr, exitcode = executeCommand(cmd)
                if exitcode != 0:
                    errorMsg = 'Failed to execute command: %s.\n StdErr: %s.' % (
                        cmd, stderr)
                    raise Exception(errorMsg)
                else:
                    logger.info('TaskPublishScript done : %s', stdout)

                jsonSummary = stdout.split()[-1]
                with open(jsonSummary, 'r') as fd:
                    summary = json.load(fd)
                result = summary['result']
                reason = summary['reason']

                taskname = summary['taskname']
                if result == 'OK':
                    if reason == 'NOTHING TO DO':
                        logger.info('Taskname %s is OK. Nothing to do',
                                    taskname)
                    else:
                        msg = 'Taskname %s is OK. Published %d files in %d blocks.' % \
                              (taskname, summary['publishedFiles'], summary['publishedBlocks'])
                        if summary['nextIterFiles']:
                            msg += ' %d files left for next iteration.' % summary[
                                'nextIterFiles']
                        logger.info(msg)
                if result == 'FAIL':
                    logger.error('Taskname %s : TaskPublish failed with: %s',
                                 taskname, reason)
                    if reason == 'DBS Publication Failure':
                        logger.error(
                            'Taskname %s : %d blocks failed for a total of %d files',
                            taskname, summary['failedBlocks'],
                            summary['failedFiles'])
                        logger.error(
                            'Taskname %s : Failed block(s) details have been saved in %s',
                            taskname, summary['failedBlockDumps'])
        except Exception as ex:
            logger.exception("Exception when calling TaskPublish!\n%s",
                             str(ex))

        return 0
Ejemplo n.º 9
0
    def collectReportData(self):
        """
        Gather information from the server, status2, DBS and files in the webdir that is needed for the report.
        """
        reportData = {}

        serverFactory = CRABClient.Emulator.getEmulator('rest')
        server = serverFactory(self.serverurl,
                               self.proxyfilename,
                               self.proxyfilename,
                               version=__version__)

        self.logger.debug('Looking up report for task %s' %
                          self.cachedinfo['RequestName'])

        # Query server for information from the taskdb, intput/output file metadata from metadatadb
        dictresult, status, reason = server.get(
            self.uri,
            data={
                'workflow': self.cachedinfo['RequestName'],
                'subresource': 'report2'
            })

        self.logger.debug("Result: %s" % dictresult)
        self.logger.info(
            "Running crab status2 first to fetch necessary information.")
        # Get job statuses
        crabDBInfo, shortResult = getMutedStatusInfo(self.logger)

        if not shortResult:
            # No point in continuing if the job list is empty.
            # Can happen when the task is very new / old and the files necessary for status2
            # are unavailable.
            return None
        reportData['jobList'] = shortResult['jobList']

        reportData['runsAndLumis'] = {}

        # Transform status joblist (tuples of job status and job id) into a dictionary
        jobStatusDict = {}
        for status, jobId in reportData['jobList']:
            jobStatusDict[jobId] = status

        # Filter output/input file metadata by finished job state
        for jobId in jobStatusDict:
            if jobStatusDict.get(jobId) in ['finished']:
                reportData['runsAndLumis'][jobId] = dictresult['result'][0][
                    'runsAndLumis'][jobId]

        reportData['publication'] = True if getColumn(
            crabDBInfo, 'tm_publication') == "T" else False
        userWebDirURL = getColumn(crabDBInfo, 'tm_user_webdir')
        numJobs = len(shortResult['jobList'])

        reportData['lumisToProcess'] = self.getLumisToProcess(
            userWebDirURL, numJobs, self.cachedinfo['RequestName'])
        reportData['inputDataset'] = getColumn(crabDBInfo, 'tm_input_dataset')

        inputDatasetInfo = self.getInputDatasetLumis(
            reportData['inputDataset'], userWebDirURL)['inputDataset']
        reportData['inputDatasetLumis'] = inputDatasetInfo['lumis']
        reportData['inputDatasetDuplicateLumis'] = inputDatasetInfo[
            'duplicateLumis']
        reportData['outputDatasets'] = dictresult['result'][0]['taskDBInfo'][
            'outputDatasets']

        if reportData['publication']:
            reportData['outputDatasetsInfo'] = self.getDBSPublicationInfo(
                reportData['outputDatasets'])

        return reportData
Ejemplo n.º 10
0
    def startSlave(self, task):
        """
        start a slave process to deal with publication for a single task
        :param task: one tupla describing  a task as returned by  active_tasks()
        :return: 0  It will always terminate normally, if publication fails it will mark it in the DB
        """
        # TODO: lock task!
        # - process logger
        logger = setSlaveLogger(str(task[0][3]))
        logger.info("Process %s is starting. PID %s", task[0][3], os.getpid())

        self.force_publication = False
        workflow = str(task[0][3])

        if int(workflow[0:2]) < 20:
            msg = "Skipped. Ignore tasks created before 2020."
            logger.info(msg)
            return 0

        if len(task[1]) > self.max_files_per_block:
            self.force_publication = True
            msg = "All datasets have more than %s ready files." % (
                self.max_files_per_block)
            msg += " No need to retrieve task status nor last publication time."
            logger.info(msg)
        else:
            msg = "At least one dataset has less than %s ready files." % (
                self.max_files_per_block)
            logger.info(msg)
            # Retrieve the workflow status. If the status can not be retrieved, continue
            # with the next workflow.
            workflow_status = ''
            msg = "Retrieving status"
            logger.info(msg)
            uri = self.REST_workflow
            data = encodeRequest({'workflow': workflow})
            try:
                res = self.crabServer.get(uri=uri, data=data)
            except Exception as ex:
                logger.warn('Error retrieving status from %s for %s.', uri,
                            workflow)
                return 0

            try:
                workflow_status = res[0]['result'][0]['status']
                msg = "Task status is %s." % workflow_status
                logger.info(msg)
            except ValueError:
                msg = "Workflow removed from WM."
                logger.error(msg)
                workflow_status = 'REMOVED'
            except Exception as ex:
                msg = "Error loading task status!"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logger.error(msg)
            # If the workflow status is terminal, go ahead and publish all the ready files
            # in the workflow.
            if workflow_status in ['COMPLETED', 'FAILED', 'KILLED', 'REMOVED']:
                self.force_publication = True
                if workflow_status in ['KILLED', 'REMOVED']:
                    self.force_failure = True
                msg = "Considering task status as terminal. Will force publication."
                logger.info(msg)
            # Otherwise...
            else:
                msg = "Task status is not considered terminal."
                logger.info(msg)
                msg = "Getting last publication time."
                logger.info(msg)
                # Get when was the last time a publication was done for this workflow (this
                # should be more or less independent of the output dataset in case there are
                # more than one).
                last_publication_time = None
                data = encodeRequest({
                    'workflow': workflow,
                    'subresource': 'search'
                })
                try:
                    result = self.crabServer.get(self.REST_task, data)
                    logger.debug("task: %s ", str(result[0]))
                    last_publication_time = getColumn(result[0],
                                                      'tm_last_publication')
                except Exception as ex:
                    logger.error("Error during task doc retrieving: %s", ex)
                if last_publication_time:
                    date = last_publication_time  # datetime in Oracle format
                    timetuple = datetime.strptime(
                        date, "%Y-%m-%d %H:%M:%S.%f").timetuple(
                        )  # convert to time tuple
                    last_publication_time = time.mktime(
                        timetuple)  # convert to seconds since Epoch (float)

                msg = "Last publication time: %s." % str(last_publication_time)
                logger.debug(msg)
                # If this is the first time a publication would be done for this workflow, go
                # ahead and publish.
                if not last_publication_time:
                    self.force_publication = True
                    msg = "There was no previous publication. Will force publication."
                    logger.info(msg)
                # Otherwise...
                else:
                    last = last_publication_time
                    msg = "Last published block time: %s" % last
                    logger.debug(msg)
                    # If the last publication was long time ago (> our block publication timeout),
                    # go ahead and publish.
                    now = int(time.time()) - time.timezone
                    time_since_last_publication = now - last
                    hours = int(time_since_last_publication / 60 / 60)
                    minutes = int(
                        (time_since_last_publication - hours * 60 * 60) / 60)
                    timeout_hours = int(self.block_publication_timeout / 60 /
                                        60)
                    timeout_minutes = int((self.block_publication_timeout -
                                           timeout_hours * 60 * 60) / 60)
                    msg = "Last publication was %sh:%sm ago" % (hours, minutes)
                    if time_since_last_publication > self.block_publication_timeout:
                        self.force_publication = True
                        msg += " (more than the timeout of %sh:%sm)." % (
                            timeout_hours, timeout_minutes)
                        msg += " Will force publication."
                    else:
                        msg += " (less than the timeout of %sh:%sm)." % (
                            timeout_hours, timeout_minutes)
                        msg += " Not enough to force publication."
                    logger.info(msg)

        # logger.info(task[1])
        try:
            if self.force_publication:
                # - get info
                active_ = [{
                    'key': [
                        x['username'], x['user_group'], x['user_role'],
                        x['taskname']
                    ],
                    'value': [
                        x['destination'], x['source_lfn'],
                        x['destination_lfn'], x['input_dataset'], x['dbs_url'],
                        x['last_update']
                    ]
                } for x in task[1] if x['transfer_state'] == 3
                           and x['publication_state'] not in [2, 3, 5]]

                lfn_ready = []
                wf_jobs_endtime = []
                pnn, input_dataset, input_dbs_url = "", "", ""
                for active_file in active_:
                    job_end_time = active_file['value'][5]
                    if job_end_time:
                        wf_jobs_endtime.append(
                            int(job_end_time) - time.timezone)
                    source_lfn = active_file['value'][1]
                    dest_lfn = active_file['value'][2]
                    self.lfn_map[dest_lfn] = source_lfn
                    if not pnn or not input_dataset or not input_dbs_url:
                        pnn = str(active_file['value'][0])
                        input_dataset = str(active_file['value'][3])
                        input_dbs_url = str(active_file['value'][4])
                    lfn_ready.append(dest_lfn)

                username = task[0][0]

                # Get metadata
                toPublish = []
                publDescFiles_list = self.getPublDescFiles(
                    workflow, lfn_ready, logger)
                for file_ in active_:
                    for _, doc in enumerate(publDescFiles_list):
                        # logger.info(type(doc))
                        # logger.info(doc)
                        if doc["lfn"] == file_["value"][2]:
                            doc["User"] = username
                            doc["Group"] = file_["key"][1]
                            doc["Role"] = file_["key"][2]
                            doc["UserDN"] = self.myDN
                            doc["Destination"] = file_["value"][0]
                            doc["SourceLFN"] = file_["value"][1]
                            toPublish.append(doc)
                with open(self.taskFilesDir + workflow + '.json',
                          'w') as outfile:
                    json.dump(toPublish, outfile)

                # find the location in the current environment of the script we want to run
                import Publisher.TaskPublish as tp
                taskPublishScript = tp.__file__
                cmd = "python %s " % taskPublishScript
                cmd += " --configFile=%s" % self.configurationFile
                cmd += " --taskname=%s" % workflow
                if self.TPconfig.dryRun:
                    cmd += " --dry"
                logger.info("Now execute: %s", cmd)
                subprocess.call(cmd, shell=True)
                logger.info('ALL DONE')

        except Exception:
            logger.exception("Exception when calling TaskPublish!")

        return 0