def checkPermissions(self, Cmd):
    """
    Run ``Cmd`` through executeCommand() and classify a non-zero exit.

    * exit code 0: return 0.
    * non-zero exit that isFailurePermanent() reports as permanent (judged on
      the command's stderr): log a warning and raise TaskWorkerException; the
      message carries the failure text plus the command's stdout and stderr.
    * any other non-zero exit: log it, send a warning to the user through
      uploadWarning() and return 1.
    """
    self.logger.info("Executing command: %s ", Cmd)
    stdout, stderr, status = executeCommand(Cmd)
    if status == 0:
        return 0

    permanent, reason, _unusedCode = isFailurePermanent(stderr)
    if permanent:
        errorText = "CRAB3 refuses to send jobs to grid scheduler for %s. Error message: %s" % (
            self.task['tm_taskname'], reason)
        errorText += "\n" + stdout
        errorText += "\n" + stderr
        self.logger.warning(errorText)
        raise TaskWorkerException(errorText)

    # Unknown error. Operators should check it from time to time and add failures if they are permanent.
    self.logger.warning(
        "CRAB3 was not able to identify if failure is permanent. Err: %s Out: %s ExitCode: %s",
        stderr, stdout, status)
    # Tell the user that the stageout check could not be completed.
    userWarning = "The CRAB3 server got a non-critical error while checking stageout permissions. Please use checkwrite to check if everything is fine."
    self.uploadWarning(userWarning, self.task['user_proxy'], self.task['tm_taskname'])
    self.logger.info("UNKNOWN ERROR. Operator should check if it is permanent, but for now we go ahead and submit a task.")
    return 1
Beispiel #2
0
 def checkPermissions(self, Cmd):
     """
     Run ``Cmd`` through executeCommand() and classify a non-zero exit.

     * exit code 0: return 0.
     * non-zero exit that isFailurePermanent() reports as permanent (judged
       on the command's stderr): log a warning and raise TaskWorkerException
       with the failure text.
     * any other non-zero exit: log it, send a warning to the user through
       uploadWarning() and return 1.
     """
     self.logger.info("Executing command: %s ", Cmd)
     stdout, stderr, status = executeCommand(Cmd)
     if status == 0:
         return 0

     permanent, reason, _unusedCode = isFailurePermanent(stderr)
     if permanent:
         taskName = self.task["tm_taskname"]
         errorText = "CRAB3 refuses to send jobs to grid scheduler for %s. Error message: %s" % (taskName, reason)
         self.logger.warning(errorText)
         raise TaskWorkerException(errorText)

     # Unknown error. Operators should check it from time to time and add failures if they are permanent.
     self.logger.warning(
         "CRAB3 was not able to identify if failure is permanent. Err: %s Out: %s ExitCode: %s",
         stderr, stdout, status)
     # Tell the user that the stageout check could not be completed.
     userWarning = "The CRAB3 server got a non-critical error while checking stageout permissions. Please use checkwrite to check if everything is fine."
     self.uploadWarning(userWarning, self.task["user_proxy"], self.task["tm_taskname"])
     self.logger.info("UNKNOWN ERROR. Operator should check if it is permanent, but for now we go ahead and submit a task.")
     return 1
Beispiel #3
0
def processWorkerLoop(inputs, results, resthost, resturi, procnum, logger, logsDir):
    """
    Worker main loop: fetch work items from ``inputs``, run them and push the
    outcome to ``results``.

    The loop ends when an item whose ``work`` field is the string 'STOP' is
    received, or when ``inputs.get()`` (or attaching the task log handler)
    raises EOFError/IOError.

    :param inputs: object whose ``get()`` returns a
                   (workid, work, task, failstatus, inputargs) tuple
    :param results: object whose ``put()`` receives {'workid': ..., 'out': ...}
    :param resthost: forwarded to ``work`` and to HTTPRequests when a failure is reported
    :param resturi: forwarded to ``work`` and to failTask
    :param procnum: used for the "Process-<procnum>" log label, forwarded to ``work``
    :param logger: logger to use; a per-task handler is attached while an item is processed
    :param logsDir: forwarded to addTaskLogHandler
    :return: None
    """
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            taskhandler = addTaskLogHandler(logger, task['tm_username'], task['tm_taskname'], logsDir)
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break

        outputs = None
        # msg stays None unless the failure has to be reported through failTask
        msg = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s", procName, str(work), task['tm_taskname'])
        try:
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum, inputargs)
        except TapeDatasetException as tde:
            # recorded in the result only: msg is left unset, so failTask is not called
            outputs = Result(task=task, err=str(tde))
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc: #pylint: disable=broad-except
            # unexpected failure: report it together with the traceback
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                server = HTTPRequests(resthost, WORKER_CONFIG.TaskWorker.cmscert, WORKER_CONFIG.TaskWorker.cmskey, retry = 20, logger = logger)
                failTask(task['tm_taskname'], server, resturi, msg, logger, failstatus)
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s", procName, task['tm_taskname'], t1-t0, outputs)

        # Best-effort memory report: awk sums column 6 of the `ps u` output and divides by 1024.
        try:
            out, _, _ = executeCommand("ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" % os.getpid())
            msg = "RSS after finishing %s: %s MB" % (task['tm_taskname'], out.strip())
            logger.debug(msg)
        except Exception: #pylint: disable=broad-except
            # Exception rather than a bare except, so that SystemExit and
            # KeyboardInterrupt are not swallowed by this diagnostic step.
            logger.exception("Problem getting worker RSS:")

        removeTaskLogHandler(logger, taskhandler)

        results.put({
                     'workid': workid,
                     'out' : outputs
                    })
Beispiel #4
0
def processWorkerLoop(inputs, results, resthost, resturi, procnum, logger,
                      logsDir):
    """
    Worker main loop: fetch work items from ``inputs``, run them and push the
    outcome to ``results``.

    The loop ends when an item whose ``work`` field is the string 'STOP' is
    received, or when ``inputs.get()`` (or attaching the task log handler)
    raises EOFError/IOError.

    :param inputs: object whose ``get()`` returns a
                   (workid, work, task, failstatus, inputargs) tuple
    :param results: object whose ``put()`` receives {'workid': ..., 'out': ...}
    :param resthost: forwarded to ``work`` and to HTTPRequests when a failure is reported
    :param resturi: forwarded to ``work`` and to failTask
    :param procnum: used for the "Process-<procnum>" log label, forwarded to ``work``
    :param logger: logger to use; a per-task handler is attached while an item is processed
    :param logsDir: forwarded to addTaskLogHandler
    :return: None
    """
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            taskhandler = addTaskLogHandler(logger, task['tm_username'],
                                            task['tm_taskname'], logsDir)
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break

        outputs = None
        # msg stays None unless the failure has to be reported through failTask
        msg = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s", procName, str(work),
                     task['tm_taskname'])
        try:
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum,
                           inputargs)
        except TapeDatasetException as tde:
            # recorded in the result only: msg is left unset, so failTask is not called
            outputs = Result(task=task, err=str(tde))
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc:  #pylint: disable=broad-except
            # unexpected failure: report it together with the traceback
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                server = HTTPRequests(resthost,
                                      WORKER_CONFIG.TaskWorker.cmscert,
                                      WORKER_CONFIG.TaskWorker.cmskey,
                                      retry=20,
                                      logger=logger)
                failTask(task['tm_taskname'], server, resturi, msg, logger,
                         failstatus)
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s", procName,
                     task['tm_taskname'], t1 - t0, outputs)

        # Best-effort memory report: awk sums column 6 of the `ps u` output and divides by 1024.
        try:
            out, _, _ = executeCommand(
                "ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" %
                os.getpid())
            msg = "RSS after finishing %s: %s MB" % (task['tm_taskname'],
                                                     out.strip())
            logger.debug(msg)
        except Exception:  #pylint: disable=broad-except
            # Exception rather than a bare except, so that SystemExit and
            # KeyboardInterrupt are not swallowed by this diagnostic step.
            logger.exception("Problem getting worker RSS:")

        removeTaskLogHandler(logger, taskhandler)

        results.put({'workid': workid, 'out': outputs})
Beispiel #5
0
def processWorkerLoop(inputs, results, resthost, resturi, procnum, logger):
    """
    Worker main loop: fetch work items from ``inputs``, run them and push the
    outcome to ``results``.

    The loop ends when an item whose ``work`` field is the string 'STOP' is
    received, or when ``inputs.get()`` (or attaching the task log handler)
    raises EOFError/IOError. When ``work`` fails with a reportable error the
    message is posted to ``resturi``; a failed upload is logged, not raised.

    :param inputs: object whose ``get()`` returns a
                   (workid, work, task, failstatus, inputargs) tuple
    :param results: object whose ``put()`` receives {'workid': ..., 'out': ...}
    :param resthost: forwarded to ``work`` and to HTTPRequests when a failure is reported
    :param resturi: forwarded to ``work``; target of the failure upload
    :param procnum: used for the "Process-<procnum>" log label, forwarded to ``work``
    :param logger: logger to use; a per-task handler is attached while an item is processed
    :return: None
    """
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            taskhandler = addTaskLogHandler(logger, task['tm_username'],
                                            task['tm_taskname'])
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break

        outputs = None
        # msg stays None unless the failure has to be uploaded to the REST
        msg = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s", procName, str(work),
                     task['tm_taskname'])
        try:
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum,
                           inputargs)
        except TapeDatasetException as tde:
            # recorded in the result only: msg is left unset, so nothing is uploaded
            outputs = Result(task=task, err=str(tde))
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc:  #pylint: disable=broad-except
            # unexpected failure: report it together with the traceback
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                try:
                    logger.info("Uploading error message to REST: %s", msg)
                    server = HTTPRequests(resthost,
                                          WORKER_CONFIG.TaskWorker.cmscert,
                                          WORKER_CONFIG.TaskWorker.cmskey,
                                          retry=20,
                                          logger=logger)
                    truncMsg = truncateError(msg)
                    configreq = {
                        'workflow': task['tm_taskname'],
                        'status': failstatus,
                        'subresource': 'failure',
                        #limit the message to 7500 chars, which means no more than 10000 once encoded. That's the limit in the REST
                        'failure': b64encode(truncMsg)
                    }
                    # NOTE(review): urllib.urlencode is the Python 2 location of this function
                    server.post(resturi, data=urllib.urlencode(configreq))
                    logger.info(
                        "Error message successfully uploaded to the REST")
                except HTTPException as hte:
                    logger.warning(
                        "Cannot upload failure message to the REST for workflow %s. HTTP headers follows:",
                        task['tm_taskname'])
                    logger.error(hte.headers)
                except Exception as exc:  #pylint: disable=broad-except
                    logger.warning(
                        "Cannot upload failure message to the REST for workflow %s.\nReason: %s",
                        task['tm_taskname'], exc)
                    logger.exception('Traceback follows:')
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s", procName,
                     task['tm_taskname'], t1 - t0, outputs)

        # Best-effort memory report: awk sums column 6 of the `ps u` output and divides by 1024.
        try:
            out, _, _ = executeCommand(
                "ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" %
                os.getpid())
            msg = "RSS after finishing %s: %s MB" % (task['tm_taskname'],
                                                     out.strip())
            logger.debug(msg)
        except Exception:  #pylint: disable=broad-except
            # Exception rather than a bare except, so that SystemExit and
            # KeyboardInterrupt are not swallowed by this diagnostic step.
            logger.exception("Problem getting worker RSS:")

        removeTaskLogHandler(logger, taskhandler)

        results.put({'workid': workid, 'out': outputs})
def getMemory():
    """
    Return, as a float, the number printed by a ps/awk pipeline run for the
    current process.

    awk sums column 6 of the ``ps u -p <pid>`` output (the RSS column, which
    ps reports in KiB) and divides the sum by 1024.
    """
    rssCommand = "ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" % os.getpid()
    rssText, _stderr, _exitcode = executeCommand(rssCommand)
    return float(rssText)
Beispiel #7
0
    def startSlave(self, task):
        """
        Deal with publication for a single task: decide whether publication
        has to be forced now and, if so, prepare the list of files to publish
        and run the TaskPublish script on it.

        What this method reads from ``task``:
          * task[0][3] : used as the workflow (task) name and for the logger
          * task[0][0] : used as the username stored in the metadata documents
          * task[1]    : iterable of per-file records, accessed by key
                         ('username', 'user_group', 'user_role', 'taskname',
                         'destination', 'source_lfn', 'destination_lfn',
                         'input_dataset', 'dbs_url', 'last_update',
                         'transfer_state', 'publication_state')

        Side effects visible here: sets self.force_publication (and possibly
        self.force_failure), fills self.lfn_map, writes
        <taskFilesDir><workflow>.json and may POST 'updatePublication'
        requests to the crabserver for files without metadata.

        :param task: one tuple describing a task as returned by active_tasks()
        :return: 0 on every return path of this method; exceptions raised
                 inside the publication step are logged, not propagated
        """
        # TODO: lock task!
        # - process logger
        logger = setSlaveLogger(str(task[0][3]))
        logger.info("Process %s is starting. PID %s", task[0][3], os.getpid())

        self.force_publication = False
        workflow = str(task[0][3])

        # Step 1: decide whether to publish now. Enough file records => publish
        # without asking the server for task status or last publication time.
        if len(task[1]) > self.max_files_per_block:
            self.force_publication = True
            msg = "All datasets have more than %s ready files." % (
                self.max_files_per_block)
            msg += " No need to retrieve task status nor last publication time."
            logger.info(msg)
        else:
            msg = "At least one dataset has less than %s ready files." % (
                self.max_files_per_block)
            logger.info(msg)
            # Retrieve the workflow status. If the status can not be retrieved, give up
            # on this workflow (return 0).
            workflow_status = ''
            msg = "Retrieving status"
            logger.info(msg)
            data = encodeRequest({'workflow': workflow})
            try:
                res = self.crabServer.get(api='workflow', data=data)
            except Exception as ex:
                # NOTE(review): Logger.warn is a deprecated alias of Logger.warning
                logger.warn(
                    'Error retrieving status from crabserver for %s:\n%s',
                    workflow, str(ex))
                return 0

            try:
                workflow_status = res[0]['result'][0]['status']
                msg = "Task status is %s." % workflow_status
                logger.info(msg)
            except ValueError:
                msg = "Workflow removed from WM."
                logger.error(msg)
                workflow_status = 'REMOVED'
            except Exception as ex:
                # any other problem: workflow_status stays '' and is treated as non-terminal below
                msg = "Error loading task status!"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logger.error(msg)
            # If the workflow status is terminal, go ahead and publish all the ready files
            # in the workflow.
            if workflow_status in ['COMPLETED', 'FAILED', 'KILLED', 'REMOVED']:
                self.force_publication = True
                if workflow_status in ['KILLED', 'REMOVED']:
                    # NOTE(review): force_failure is only assigned here in this method and
                    # never reset; presumably it is initialised and consumed elsewhere.
                    self.force_failure = True
                msg = "Considering task status as terminal. Will force publication."
                logger.info(msg)
            # Otherwise...
            else:  ## TODO put this else in a function like def checkForPublication()
                msg = "Task status is not considered terminal."
                logger.info(msg)
                msg = "Getting last publication time."
                logger.info(msg)
                # Get when was the last time a publication was done for this workflow (this
                # should be more or less independent of the output dataset in case there are
                # more than one).
                last_publication_time = None
                data = encodeRequest({
                    'workflow': workflow,
                    'subresource': 'search'
                })
                try:
                    result = self.crabServer.get(api='task', data=data)
                    logger.debug("task: %s ", str(result[0]))
                    last_publication_time = getColumn(result[0],
                                                      'tm_last_publication')
                except Exception as ex:
                    # last_publication_time stays None => handled as "never published" below
                    logger.error("Error during task doc retrieving:\n%s", ex)
                if last_publication_time:
                    date = last_publication_time  # datetime in Oracle format
                    timetuple = datetime.strptime(
                        date, "%Y-%m-%d %H:%M:%S.%f").timetuple(
                        )  # convert to time tuple
                    last_publication_time = time.mktime(
                        timetuple)  # convert to seconds since Epoch (float)

                msg = "Last publication time: %s." % str(last_publication_time)
                logger.debug(msg)
                # If this is the first time a publication would be done for this workflow, go
                # ahead and publish.
                if not last_publication_time:
                    self.force_publication = True
                    msg = "There was no previous publication. Will force publication."
                    logger.info(msg)
                # Otherwise...
                else:
                    last = last_publication_time
                    msg = "Last published block time: %s" % last
                    logger.debug(msg)
                    # If the last publication was long time ago (> our block publication timeout),
                    # go ahead and publish.
                    # NOTE(review): `last` comes from time.mktime() (local-time interpretation) while
                    # `now` is shifted by time.timezone; presumably this compensates for the DB
                    # timestamp being UTC - confirm, DST is not accounted for.
                    now = int(time.time()) - time.timezone
                    time_since_last_publication = now - last
                    hours = int(time_since_last_publication / 60 / 60)
                    minutes = int(
                        (time_since_last_publication - hours * 60 * 60) / 60)
                    timeout_hours = int(self.block_publication_timeout / 60 /
                                        60)
                    timeout_minutes = int((self.block_publication_timeout -
                                           timeout_hours * 60 * 60) / 60)
                    msg = "Last publication was %sh:%sm ago" % (hours, minutes)
                    if time_since_last_publication > self.block_publication_timeout:
                        self.force_publication = True
                        msg += " (more than the timeout of %sh:%sm)." % (
                            timeout_hours, timeout_minutes)
                        msg += " Will force publication."
                    else:
                        msg += " (less than the timeout of %sh:%sm)." % (
                            timeout_hours, timeout_minutes)
                        msg += " Not enough to force publication."
                    logger.info(msg)

        # Step 2: publication. Everything below runs only when force_publication is set;
        # any exception is caught by the outer except at the end and logged.
        # logger.info(task[1])
        try:
            if self.force_publication:
                # - get info
                # Keep only records with transfer_state == 3 whose publication_state is not
                # 2, 3 or 5 (the meaning of these numeric codes is not visible in this method).
                active_ = [{
                    'key': [
                        x['username'], x['user_group'], x['user_role'],
                        x['taskname']
                    ],
                    'value': [
                        x['destination'], x['source_lfn'],
                        x['destination_lfn'], x['input_dataset'], x['dbs_url'],
                        x['last_update']
                    ]
                } for x in task[1] if x['transfer_state'] == 3
                           and x['publication_state'] not in [2, 3, 5]]

                lfn_ready = []
                # NOTE(review): wf_jobs_endtime, pnn, input_dataset and input_dbs_url are
                # filled in below but not read again in this method.
                wf_jobs_endtime = []
                pnn, input_dataset, input_dbs_url = "", "", ""
                for active_file in active_:
                    job_end_time = active_file['value'][5]
                    if job_end_time:
                        wf_jobs_endtime.append(
                            int(job_end_time) - time.timezone)
                    source_lfn = active_file['value'][1]
                    dest_lfn = active_file['value'][2]
                    # remember which source LFN each destination LFN came from
                    self.lfn_map[dest_lfn] = source_lfn
                    if not pnn or not input_dataset or not input_dbs_url:
                        pnn = str(active_file['value'][0])
                        input_dataset = str(active_file['value'][3])
                        input_dbs_url = str(active_file['value'][4])
                    lfn_ready.append(dest_lfn)

                username = task[0][0]

                # Get metadata
                # Match every active file with its metadata document by destination LFN:
                # matched documents go to toPublish, unmatched source LFNs go to toFail.
                toPublish = []
                toFail = []
                publDescFiles_list = self.getPublDescFiles(
                    workflow, lfn_ready, logger)
                for file_ in active_:
                    metadataFound = False
                    for doc in publDescFiles_list:
                        # logger.info(type(doc))
                        # logger.info(doc)
                        if doc["lfn"] == file_["value"][2]:
                            doc["User"] = username
                            doc["Group"] = file_["key"][1]
                            doc["Role"] = file_["key"][2]
                            doc["UserDN"] = self.myDN
                            doc["Destination"] = file_["value"][0]
                            doc["SourceLFN"] = file_["value"][1]
                            toPublish.append(doc)
                            metadataFound = True
                            break

                    # if we failed to find metadata mark publication as failed to avoid to keep looking
                    # at same files over and over
                    if not metadataFound:
                        toFail.append(file_["value"][1])
                # The path is built by plain string concatenation, so self.taskFilesDir
                # presumably ends with a path separator.
                with open(self.taskFilesDir + workflow + '.json',
                          'w') as outfile:
                    json.dump(toPublish, outfile)
                logger.debug(
                    'Unitarity check: active_:%d toPublish:%d toFail:%d',
                    len(active_), len(toPublish), len(toFail))
                if len(toPublish) + len(toFail) != len(active_):
                    logger.error("SOMETHING WRONG IN toPublish vs toFail !!")
                if toFail:
                    logger.info(
                        'Did not find useful metadata for %d files. Mark as failed',
                        len(toFail))
                    from ServerUtilities import getHashLfn
                    nMarked = 0
                    # one 'updatePublication' request per file without metadata
                    for lfn in toFail:
                        source_lfn = lfn
                        docId = getHashLfn(source_lfn)
                        data = dict()
                        data['asoworker'] = self.config.asoworker
                        data['subresource'] = 'updatePublication'
                        data['list_of_ids'] = docId
                        data['list_of_publication_state'] = 'FAILED'
                        data['list_of_retry_value'] = 1
                        data[
                            'list_of_failure_reason'] = 'File type not EDM or metadata not found'
                        try:
                            result = self.crabServer.post(
                                api='filetransfers', data=encodeRequest(data))
                            #logger.debug("updated DocumentId: %s lfn: %s Result %s", docId, source_lfn, result)
                        except Exception as ex:
                            logger.error(
                                "Error updating status for DocumentId: %s lfn: %s",
                                docId, source_lfn)
                            logger.error("Error reason: %s", ex)

                        # NOTE(review): incremented even when the POST above failed, so the
                        # count logged after the loop is "attempted", not "successfully marked".
                        nMarked += 1
                        #if nMarked % 10 == 0:
                    logger.info('marked %d files as Failed', nMarked)

                # find the location in the current environment of the script we want to run
                import Publisher.TaskPublish as tp
                taskPublishScript = tp.__file__
                cmd = "python %s " % taskPublishScript
                cmd += " --configFile=%s" % self.configurationFile
                cmd += " --taskname=%s" % workflow
                if self.TPconfig.dryRun:
                    cmd += " --dry"
                logger.info("Now execute: %s", cmd)
                stdout, stderr, exitcode = executeCommand(cmd)
                if exitcode != 0:
                    errorMsg = 'Failed to execute command: %s.\n StdErr: %s.' % (
                        cmd, stderr)
                    raise Exception(errorMsg)
                else:
                    logger.info('TaskPublishScript done : %s', stdout)

                # The last whitespace-separated token of the script's stdout is used as
                # the path of a JSON summary file.
                jsonSummary = stdout.split()[-1]
                with open(jsonSummary, 'r') as fd:
                    summary = json.load(fd)
                result = summary['result']
                reason = summary['reason']

                taskname = summary['taskname']
                if result == 'OK':
                    if reason == 'NOTHING TO DO':
                        logger.info('Taskname %s is OK. Nothing to do',
                                    taskname)
                    else:
                        msg = 'Taskname %s is OK. Published %d files in %d blocks.' % \
                              (taskname, summary['publishedFiles'], summary['publishedBlocks'])
                        if summary['nextIterFiles']:
                            msg += ' %d files left for next iteration.' % summary[
                                'nextIterFiles']
                        logger.info(msg)
                if result == 'FAIL':
                    logger.error('Taskname %s : TaskPublish failed with: %s',
                                 taskname, reason)
                    if reason == 'DBS Publication Failure':
                        logger.error(
                            'Taskname %s : %d blocks failed for a total of %d files',
                            taskname, summary['failedBlocks'],
                            summary['failedFiles'])
                        logger.error(
                            'Taskname %s : Failed block(s) details have been saved in %s',
                            taskname, summary['failedBlockDumps'])
        except Exception as ex:
            # log with traceback and fall through to the normal return
            logger.exception("Exception when calling TaskPublish!\n%s",
                             str(ex))

        return 0
Beispiel #8
0
def processWorkerLoop(inputs, results, resthost, dbInstance, procnum, logger, logsDir):
    """
    Worker main loop: fetch work items from ``inputs``, run them and push the
    outcome to ``results``.

    The loop ends when an item whose ``work`` field is the string 'STOP' is
    received, or when ``inputs.get()`` (or attaching the task log handler)
    raises EOFError/IOError.

    :param inputs: object whose ``get()`` returns a
                   (workid, work, task, failstatus, inputargs) tuple
    :param results: object whose ``put()`` receives {'workid': ..., 'out': ...}
    :param resthost: forwarded to ``work`` and to CRABRest when a failure is reported
    :param dbInstance: forwarded to ``work`` and set on the CRABRest object
    :param procnum: used for the "Process-<procnum>" log label, forwarded to ``work``
    :param logger: logger to use; a per-task handler is attached while an item is processed
    :param logsDir: forwarded to addTaskLogHandler
    :return: None
    """
    workerLabel = "Process-%s" % procnum
    while True:
        try:
            ## Take (and remove) one item from the input queue, waiting if it is empty.
            ## The item is a tuple of:
            ##  workid : an integer assigned by the queue module
            ##  work   : a function handler to the needed action e.g. function handleNewTask
            ##  task   : a task dictionary
            ##  failstatus : the status to assign to the task if work fails (e.g. 'SUBMITFAILED')
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            perTaskHandler = addTaskLogHandler(logger, task['tm_username'], task['tm_taskname'], logsDir)
        except (EOFError, IOError):
            logger.error("Hit EOF/IO in getting new work\n"
                         "Assuming this is a graceful break attempt.\n")
            break

        outcome = None
        started = time.time()
        # this log line is parsed by logstash: changing it may require a logstash configuration update
        logger.debug("%s: Starting %s on %s", workerLabel, str(work), task['tm_taskname'])
        # failureText stays None unless the failure has to be reported through failTask
        failureText = None
        try:
            outcome = work(resthost, dbInstance, WORKER_CONFIG, task, procnum, inputargs)
        except TapeDatasetException as tapeError:
            # recorded in the result only, failTask is not called
            outcome = Result(task=task, err=str(tapeError))
        except WorkerHandlerException as handlerError:
            failureText = str(handlerError)
            outcome = Result(task=task, err=str(handlerError))
        except Exception as anyError: #pylint: disable=broad-except
            outcome = Result(task=task, err=str(anyError))
            failureText = "%s: I just had a failure for %s" % (workerLabel, str(anyError))
            failureText += "\n\tworkid=" + str(workid)
            failureText += "\n\ttask=" + str(task['tm_taskname'])
            failureText += "\n" + str(traceback.format_exc())
        finally:
            if failureText:
                twConfig = WORKER_CONFIG.TaskWorker
                crabserver = CRABRest(resthost, twConfig.cmscert, twConfig.cmskey,
                                      retry=20, logger=logger, userAgent='CRABTaskWorker')
                crabserver.setDbInstance(dbInstance)
                failTask(task['tm_taskname'], crabserver, failureText, logger, failstatus)
        elapsed = time.time() - started
        workType = task.get('tm_task_command', 'RECURRING')
        # this log line is parsed by logstash: changing it may require a logstash configuration update
        logger.debug("%s: %s work on %s completed in %d seconds: %s",
                     workerLabel, workType, task['tm_taskname'], elapsed, outcome)

        # Best-effort memory report: awk sums column 6 of the `ps u` output and divides by 1024.
        try:
            rssCommand = "ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" % os.getpid()
            rssText, _, _ = executeCommand(rssCommand)
            logger.debug("RSS after finishing %s: %s MB" % (task['tm_taskname'], rssText.strip()))
        except Exception:
            logger.exception("Problem getting worker RSS:")

        removeTaskLogHandler(logger, perTaskHandler)

        results.put({'workid': workid, 'out': outcome})
Beispiel #9
0
def processWorkerLoop(inputs, results, resthost, resturi, procnum, logger):
    """
    Worker main loop: fetch work items from ``inputs``, run them and push the
    outcome to ``results``.

    The loop ends when an item whose ``work`` field is the string 'STOP' is
    received, or when ``inputs.get()`` (or attaching the task log handler)
    raises EOFError/IOError. When ``work`` raises, the error message is
    posted to ``resturi``; a failed upload is logged, not raised.

    :param inputs: object whose ``get()`` returns a
                   (workid, work, task, failstatus, inputargs) tuple
    :param results: object whose ``put()`` receives {'workid': ..., 'out': ...}
    :param resthost: forwarded to ``work`` and to HTTPRequests when a failure is reported
    :param resturi: forwarded to ``work``; target of the failure upload
    :param procnum: used for the "Process-<procnum>" log label, forwarded to ``work``
    :param logger: logger to use; a per-task handler is attached while an item is processed
    :return: None
    """
    procName = "Process-%s" % procnum
    while True:
        try:
            ## Get (and remove) an item from the input queue. If the queue is empty, wait
            ## until an item is available.
            workid, work, task, failstatus, inputargs = inputs.get()
            if work == 'STOP':
                break
            taskhandler = addTaskLogHandler(logger, task['tm_username'], task['tm_taskname'])
        except (EOFError, IOError):
            crashMessage = "Hit EOF/IO in getting new work\n"
            crashMessage += "Assuming this is a graceful break attempt.\n"
            logger.error(crashMessage)
            break

        outputs = None
        # msg stays None unless the failure has to be uploaded to the REST
        msg = None
        t0 = time.time()
        logger.debug("%s: Starting %s on %s", procName, str(work), task['tm_taskname'])
        try:
            outputs = work(resthost, resturi, WORKER_CONFIG, task, procnum, inputargs)
        except WorkerHandlerException as we:
            outputs = Result(task=task, err=str(we))
            msg = str(we)
        except Exception as exc: #pylint: disable=broad-except
            # unexpected failure: report it together with the traceback
            outputs = Result(task=task, err=str(exc))
            msg = "%s: I just had a failure for %s" % (procName, str(exc))
            msg += "\n\tworkid=" + str(workid)
            msg += "\n\ttask=" + str(task['tm_taskname'])
            msg += "\n" + str(traceback.format_exc())
        finally:
            if msg:
                try:
                    logger.info("Uploading error message to REST: %s", msg)
                    server = HTTPRequests(resthost, WORKER_CONFIG.TaskWorker.cmscert, WORKER_CONFIG.TaskWorker.cmskey, retry = 20,
                                          logger = logger)
                    truncMsg = truncateError(msg)
                    configreq = {'workflow': task['tm_taskname'],
                                 'status': failstatus,
                                 'subresource': 'failure',
                                 #limit the message to 7500 chars, which means no more than 10000 once encoded. That's the limit in the REST
                                 'failure': b64encode(truncMsg)}
                    # NOTE(review): urllib.urlencode is the Python 2 location of this function
                    server.post(resturi, data = urllib.urlencode(configreq))
                    logger.info("Error message successfully uploaded to the REST")
                except HTTPException as hte:
                    logger.warning("Cannot upload failure message to the REST for workflow %s. HTTP headers follows:", task['tm_taskname'])
                    logger.error(hte.headers)
                except Exception as exc: #pylint: disable=broad-except
                    logger.warning("Cannot upload failure message to the REST for workflow %s.\nReason: %s", task['tm_taskname'], exc)
                    logger.exception('Traceback follows:')
        t1 = time.time()
        logger.debug("%s: ...work on %s completed in %d seconds: %s", procName, task['tm_taskname'], t1-t0, outputs)

        # Best-effort memory report: awk sums column 6 of the `ps u` output and divides by 1024.
        try:
            out, _, _ = executeCommand("ps u -p %s | awk '{sum=sum+$6}; END {print sum/1024}'" % os.getpid())
            msg = "RSS after finishing %s: %s MB" % (task['tm_taskname'], out.strip())
            logger.debug(msg)
        except Exception: #pylint: disable=broad-except
            # Exception rather than a bare except, so that SystemExit and
            # KeyboardInterrupt are not swallowed by this diagnostic step.
            logger.exception("Problem getting worker RSS:")

        removeTaskLogHandler(logger, taskhandler)

        results.put({
                     'workid': workid,
                     'out' : outputs
                    })