Code Example #1
def getProxiedWebDir(task, host, uri, cert, logFunction=print):
    """ The function simply queries the REST interface specified to get the proxied webdir to use
        for the task. Returns None in case the API could not find the url (either an error or the schedd
        is not configured)
    """
    #This import is here because ServerUtilities is also used on the worker nodes,
    #and I want to avoid the dependency on pycurl right now. We should actually add it one day
    #so that other code in cmscp that uses Requests.py from WMCore can be migrated to RESTInteractions
    from RESTInteractions import HTTPRequests
    data = {
        'subresource': 'webdirprx',
        'workflow': task,
    }
    res = None
    try:
        server = HTTPRequests(host, cert, cert, retry=2)
        dictresult, _, _ = server.get(
            uri, data=data)  #the second and third parameters are deprecated
        if dictresult.get('result'):
            res = dictresult['result'][0]
    except HTTPException as hte:
        logFunction(traceback.format_exc())
        logFunction(hte.headers)
        logFunction(hte.result)

    return res
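A minimal usage sketch of the helper above; the host, REST path, task name, and proxy file are illustrative assumptions, and the import assumes the function is exposed by a module such as ServerUtilities:

from ServerUtilities import getProxiedWebDir  # assumption: the function above is importable from here

webdir = getProxiedWebDir(
    task='230101_120000:jdoe_crab_myanalysis',  # hypothetical task name
    host='cmsweb.cern.ch',                      # hypothetical CRAB REST host
    uri='/crabserver/prod/task',                # hypothetical API path
    cert='/tmp/x509up_u12345',                  # hypothetical proxy file, used as both certificate and key
)
if webdir is None:
    print("No proxied webdir returned (error or schedd not configured)")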
Code Example #2
File: postmortem.py Project: Crabclient/CRABClient
    def __call__(self):
        if self.options.task is None:
            return CommandResult(2001, 'ERROR: Task option is required')

        server = HTTPRequests(self.cachedinfo['Server'] + ':' + str(self.cachedinfo['Port']))

        self.logger.debug('Looking up detailed postmortem of task %s' % self.cachedinfo['RequestName'])
        dictresult, postmortem, reason = server.get(self.uri + self.cachedinfo['RequestName'])

        if postmortem != 200:
            msg = "Problem retrieving postmortem:\ninput:%s\noutput:%s\nreason:%s" % (str(self.cachedinfo['RequestName']), str(dictresult), str(reason))
            return CommandResult(1, msg)

        for workflow in dictresult['errors']:
            self.logger.info("#%i %s" % (workflow['subOrder'], workflow['request']))
            if self.options.verbose or self.options.outputfile:
                self.printVerbose(workflow['details'], self.options.outputfile, os.path.join(self.requestarea, 'results', 'jobFailures.log'))
            else:
                self.logger.debug("   Aggregating job failures")
                groupederrs = self.aggregateFailures(workflow['details'])
                if not groupederrs:
                    self.logger.info("   No failures")
                    continue
                self.logger.info("   List of failures and jobs per each failure: (one job could have more then one failure, one per each step)")
                for hkey in groupederrs:
                    ## removing duplicates and sort
                    joberrs = list(set(groupederrs[hkey]['jobs']))
                    joberrs.sort()
                    self.logger.info('     %s jobs failed with error "%s"' %(len(joberrs), groupederrs[hkey]['error']))
                    self.logger.info('       (%s)'  %(', '.join([ str(jobid[0]) for jobid in joberrs])) )

        return CommandResult(0, None)
Code Example #3
File: status.py Project: Crabclient/CRABClient
    def __call__(self):
        server = HTTPRequests(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)

        self.logger.debug('Looking up detailed status of task %s' % self.cachedinfo['RequestName'])
        user = self.cachedinfo['RequestName'].split("_")[2].split(":")[-1]
        verbose = int(self.summary or self.long or self.json)
        if self.idle:
            verbose = 2
        dictresult, status, reason = server.get(self.uri, data = { 'workflow' : self.cachedinfo['RequestName'], 'verbose': verbose })
        dictresult = dictresult['result'][0] #take just the significant part

        if status != 200:
            msg = "Problem retrieving status:\ninput:%s\noutput:%s\nreason:%s" % (str(self.cachedinfo['RequestName']), str(dictresult), str(reason))
            raise RESTCommunicationException(msg)

        self.printShort(dictresult, user)
        self.printPublication(dictresult)

        if 'jobs' not in dictresult:
            self.logger.info("\nNo jobs created yet!")
        else:
            # Note several options could be combined
            if self.summary:
                self.printSummary(dictresult)
            if self.long:
                self.printLong(dictresult)
            if self.idle:
                self.printIdle(dictresult, user)
            if self.json:
                self.logger.info(dictresult['jobs'])
Code Example #4
 def get_backendurls(self):
     self.logger.info("Querying server %s for HTCondor schedds and pool names." % self.resturi)
     server = HTTPRequests(self.resthost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, retry = 2)
     result = server.get(self.resturi, data={'subresource':'backendurls'})[0]['result'][0]
     self.pool = str(result['htcondorPool'])
     self.schedds = [str(i) for i in result['htcondorSchedds']]
     self.logger.info("Resulting pool %s; schedds %s" % (self.pool, ",".join(self.schedds)))
Code Example #5
File: client_utilities.py Project: qunox/CRABClient
def server_info(subresource, server, proxyfilename, baseurl):
    """
    Get relevant information about the server
    """
    server = HTTPRequests(server, proxyfilename, proxyfilename, version=__version__)

    dictresult, status, reason = server.get(baseurl, {'subresource' : subresource})

    return dictresult['result'][0]
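A hedged usage sketch of server_info; the REST host, proxy path, and base URL below are assumptions for illustration only:

backends = server_info(
    subresource='backendurls',
    server='cmsweb.cern.ch',             # hypothetical CRAB REST host
    proxyfilename='/tmp/x509up_u12345',  # hypothetical proxy file, used as both certificate and key
    baseurl='/crabserver/prod/info',     # hypothetical 'info' API path
)
print(backends)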
Code Example #6
File: request_type.py Project: Crabclient/CRABClient
    def __call__(self):


        proxyfile = self.options.proxyfile if self.options.proxyfile else self.proxyfilename
        server = HTTPRequests(self.serverurl, proxyfile, proxyfile, version=__version__)

        self.logger.debug('Looking up type for task %s' % self.cachedinfo['RequestName'])
        dictresult, status, reason = server.get(self.uri, data = {'workflow': self.cachedinfo['RequestName'], 'subresource': 'type'})
        self.logger.debug('Task type %s' % dictresult['result'][0])
        return dictresult['result'][0]
Code Example #7
File: GenerateXMLForTW.py Project: jmarra13/crab3mon
 def getCountTasksByStatusAbs(self):
     try:
         resturi = "/crabserver/prod/task"
         configreq = { 'minutes': "1000000000", 'subresource': "counttasksbystatus" }
         server = HTTPRequests(self.resthost, "/data/certs/servicecert.pem", "/data/certs/servicekey.pem", retry = 2)
         result = server.get(resturi, data = configreq)
         return dict(result[0]['result'])
     except Exception, e:
         self.logger.debug("Error in getCountTasksByStatusAbs: %s"%str(e))
         return []
Code Example #8
File: CRABServer.py Project: akanevm/GangaCRAB3
    def getOutput(self, job):
        """Retrieve the output of the job."""
        """
        if not os.path.exists(job.inputdata.ui_working_dir):
            raise CRABServerError('Workdir "%s" not found.' %
                                  job.inputdata.ui_working_dir)

        cmd = 'crab -getoutput %d -c %s' % (int(job.id) + 1,
                                            job.inputdata.ui_working_dir)
        self._send_with_retry(cmd, 'getoutput', job.backend.crab_env)
        # Make output files coming from the WMS readable.
        for root, _, files in os.walk(os.path.join(job.inputdata.ui_working_dir,
                                                   'res')): # Just 'res'.
            for f in files:
                os.chmod(os.path.join(root, f), 0644)
        """
        logger.info('getting Output for job %s:%s' % (job.backend.taskname, job.backend.crabid)) 
        inputlist =  [  ('workflow', job.backend.taskname)]
        inputlist.extend([('subresource', 'logs')])
        inputlist.extend( [('jobids', job.backend.crabid)] )

        #srv='hammer-crab3.cern.ch'#  'cmsweb-testbed.cern.ch'
        #proxypath= '/afs/cern.ch/user/r/riahi/public/proxy'#'/afs/cern.ch/user/s/spiga/public/PerValentaina/proxy'
        #resource='/crabserver/dev/workflow'
        #server = HTTPRequests(srv, proxypath)

        server = HTTPRequests(job.backend.server_name, job.backend.userproxy)

        resource = job.backend.apiresource+'workflow'

        try:
            import sys, traceback
            dictresult, status, reason = server.get(resource, data = inputlist)
            input = dictresult['result']
            rcopy = remoteCopy(input, job.outputdir, logger)
            rcopy()
            logger.info("Task: %s - subjob: %s output copied" % (job.backend.taskname, job.backend.crabid))
            tfile = tarfile.open(os.path.join(job.outputdir, "cmsRun_%s.log.tar.gz" % job.backend.crabid))
            tfile.extractall(job.outputdir)

        except HTTPException, e:
            msg = str(type(e))
            msg += " " + str(dir(e))
            msg += " " + str(e.req_headers)
            msg += " " + str(e.req_data)
            msg += " " + str(e.reason)
            msg += " " + str(e.message)
            msg += " " + str(e.headers)
            msg += " " + str(e.result)
            msg += " " + str(e.status)
            msg += " " + str(e.url)
            msg += " " + str(e.args)
            logger.error(msg)
Code Example #9
def server_info(subresource, server, proxyfilename, baseurl):
    """
    Get relevant information about the CRAB REST server
    """
    server = HTTPRequests(url=server,
                          localcert=proxyfilename,
                          localkey=proxyfilename,
                          version='HC')

    dictresult, status, reason = server.get(baseurl,
                                            {'subresource': subresource})

    return dictresult['result'][0]
Code Example #10
    def __call__(self):
        server = HTTPRequests(self.serverurl,
                              self.proxyfilename,
                              self.proxyfilename,
                              version=__version__)
        dictresult, status, reason = server.get(self.uri,
                                                data={'timestamp': self.date})
        dictresult = dictresult['result']  #take just the significant part

        if status != 200:
            msg = "Problem retrieving tasks:\ninput:%s\noutput:%s\nreason:%s" % (
                str(self.date), str(dictresult), str(reason))
            raise RESTCommunicationException(msg)

        dictresult.sort()
        dictresult.reverse()

        if self.options.status:
            dictresult = [
                item for item in dictresult if item[1] == self.options.status
            ]

        result = [item[0:2] for item in dictresult]

        today = date.today()

        if not dictresult:
            msg = "No tasks found from %s until %s" % (self.date, today)
            if self.options.status:
                msg += " with status %s" % (self.options.status)
            self.logger.info(msg)
            return result

        msg = "\nList of tasks from %s until %s" % (self.date, today)
        if self.options.status:
            msg += " with status %s" % (self.options.status)
        self.logger.info(msg)
        msg = "Beware that STATUS here does not include information from grid jobs"
        self.logger.info(msg)
        self.logger.info('=' * 80)
        self.logger.info('NAME\t\t\t\t\t\t\t\tSTATUS')
        self.logger.info('=' * 80)
        for item in dictresult:
            name, status = item[0:2]
            self.logger.info('%s\n\t\t\t\t\t\t\t\t%s' % (name, status))
            self.logger.info('-' * 80)
        self.logger.info('\n')

        return result
Code Example #11
File: GenerateMONIT.py Project: vlimant/CRABServer
 def getCountTasksByStatusAbs(self):
     try:
         resturi = "/crabserver/prod/task"
         configreq = {'minutes': "1000000000", 'subresource': "counttasksbystatus"}
         server = HTTPRequests(self.resthost, "/data/certs/servicecert.pem", "/data/certs/servicekey.pem", retry=10)
         result = server.get(resturi, data=configreq)
         return dict(result[0]['result'])
     except Exception:
         e = sys.exc_info()
         if hasattr(e[1], "headers"):
             self.logger.error(str(e[1].headers))
         self.logger.exception("Error in getCountTasksByStatusAbs:")
         pprint(e[1])
         traceback.print_tb(e[2])
         return []
Code Example #12
File: CRABServer.py Project: spigad/GangaCRAB3
    def getOutput(self, job):
        """Retrieve the output of the job."""
        """
        if not os.path.exists(job.inputdata.ui_working_dir):
            raise CRABServerError('Workdir "%s" not found.' %
                                  job.inputdata.ui_working_dir)

        cmd = 'crab -getoutput %d -c %s' % (int(job.id) + 1,
                                            job.inputdata.ui_working_dir)
        self._send_with_retry(cmd, 'getoutput', job.backend.crab_env)
        # Make output files coming from the WMS readable.
        for root, _, files in os.walk(os.path.join(job.inputdata.ui_working_dir,
                                                   'res')): # Just 'res'.
            for f in files:
                os.chmod(os.path.join(root, f), 0644)
        """
        logger.info('getting Output for job %s:%s' %
                    (job.backend.taskname, job.backend.crabid))
        inputlist = [('workflow', job.backend.taskname)]
        inputlist.extend([('subresource', 'logs')])
        inputlist.extend([('jobids', job.backend.crabid)])

        #srv='hammer-crab3.cern.ch'#  'cmsweb-testbed.cern.ch'
        #proxypath= '/afs/cern.ch/user/r/riahi/public/proxy'#'/afs/cern.ch/user/s/spiga/public/PerValentaina/proxy'
        #resource='/crabserver/dev/workflow'
        #server = HTTPRequests(srv, proxypath)

        server = HTTPRequests(job.backend.server_name, job.backend.userproxy)

        resource = job.backend.apiresource + 'workflow'

        try:
            dictresult, status, reason = server.get(resource, data=inputlist)

        except HTTPException, e:
            print type(e)
            print dir(e)
            print e.req_headers
            print e.req_data
            print e.reason
            print e.message
            print e.headers
            print e.result
            print e.status
            print e.url
            print e.args
Code Example #13
File: CRABServer.py Project: spigad/GangaCRAB3
    def getOutput(self, job):
        """Retrieve the output of the job."""
        """
        if not os.path.exists(job.inputdata.ui_working_dir):
            raise CRABServerError('Workdir "%s" not found.' %
                                  job.inputdata.ui_working_dir)

        cmd = 'crab -getoutput %d -c %s' % (int(job.id) + 1,
                                            job.inputdata.ui_working_dir)
        self._send_with_retry(cmd, 'getoutput', job.backend.crab_env)
        # Make output files coming from the WMS readable.
        for root, _, files in os.walk(os.path.join(job.inputdata.ui_working_dir,
                                                   'res')): # Just 'res'.
            for f in files:
                os.chmod(os.path.join(root, f), 0644)
        """
        logger.info('getting Output for job %s:%s' % (job.backend.taskname, job.backend.crabid))
        inputlist =  [  ('workflow', job.backend.taskname)]
        inputlist.extend([('subresource', 'logs')])
        inputlist.extend( [('jobids', job.backend.crabid)] )

        #srv='hammer-crab3.cern.ch'#  'cmsweb-testbed.cern.ch'
        #proxypath= '/afs/cern.ch/user/r/riahi/public/proxy'#'/afs/cern.ch/user/s/spiga/public/PerValentaina/proxy'
        #resource='/crabserver/dev/workflow'
        #server = HTTPRequests(srv, proxypath)

        server = HTTPRequests(job.backend.server_name, job.backend.userproxy)

        resource = job.backend.apiresource+'workflow'

        try:
            dictresult, status, reason = server.get(resource, data = inputlist)

        except HTTPException, e:
            print type(e)
            print dir(e)
            print e.req_headers
            print e.req_data
            print e.reason
            print e.message
            print e.headers
            print e.result
            print e.status
            print e.url
            print e.args
Code Example #14
File: getcommand.py Project: spigad/CRABClient
    def __call__(self, **argv):
        #Setting default destination if -o is not provided
        if not self.dest:
            self.dest = os.path.join(self.requestarea, 'results')

        #Creating the destination directory if necessary
        if not os.path.exists( self.dest ):
            self.logger.debug("Creating directory %s " % self.dest)
            os.makedirs( self.dest )
        elif not os.path.isdir( self.dest ):
            raise ConfigurationException('Destination directory is a file')

        self.logger.info("Setting the destination directory to %s " % self.dest )

        #Retrieving output files location from the server
        self.logger.debug('Retrieving locations for task %s' % self.cachedinfo['RequestName'] )
        inputlist =  [ ('workflow', self.cachedinfo['RequestName']) ]
        inputlist.extend(list(argv.iteritems()))
        if getattr(self.options, 'quantity', None):
            self.logger.debug('Retrieving %s file locations' % self.options.quantity )
            inputlist.append( ('limit',self.options.quantity) )
        if getattr(self.options, 'jobids', None):
            self.logger.debug('Retrieving jobs %s' % self.options.jobids )
            inputlist.extend( self.options.jobids )
        server = HTTPRequests(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
        dictresult, status, reason = server.get(self.uri, data = inputlist)
        self.logger.debug('Server result: %s' % dictresult )
        dictresult = self.processServerResult(dictresult)

        if status != 200:
            msg = "Problem retrieving information from the server:\ninput:%s\noutput:%s\nreason:%s" % (str(inputlist), str(dictresult), str(reason))
            raise ConfigurationException(msg)

        totalfiles = len( dictresult['result'] )
        cpresults = []
#        for workflow in dictresult['result']: TODO re-enable this when we will have resubmissions
        workflow = dictresult['result']        #TODO assigning workflow to dictresult. for the moment we have only one wf
        arglist = ['-d', self.dest, '-i', workflow, '-t', self.options.task, '-p', self.proxyfilename]
        if len(workflow) > 0:
            self.logger.info("Retrieving %s files" % totalfiles )
            copyoutput = remote_copy( self.logger, arglist )
            copyoutput()

        if totalfiles == 0:
            self.logger.info("No files to retrieve")
Code Example #15
File: report.py Project: qunox/CRABClient
    def __call__(self):
        server = HTTPRequests(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)

        self.logger.debug('Looking up report for task %s' % self.cachedinfo['RequestName'])
        dictresult, status, reason = server.get(self.uri, data = {'workflow': self.cachedinfo['RequestName'], 'subresource': 'report'})

        self.logger.debug("Result: %s" % dictresult)

        if status != 200:
            msg = "Problem retrieving report:\ninput:%s\noutput:%s\nreason:%s" % (str(self.cachedinfo['RequestName']), str(dictresult), str(reason))
            raise RESTCommunicationException(msg)
        if not dictresult['result'][0]['runsAndLumis'] :
            self.logger.info('No jobs finished yet. Report is available when jobs complete')
            return

        runlumiLists = map(lambda x: literal_eval(x['runlumi']), dictresult['result'][0]['runsAndLumis'].values())
        #convert lumi lists from strings to integers
        for runlumi in runlumiLists:
            for run in runlumi:
                runlumi[run] = map(int, runlumi[run])
        analyzed, diff, doublelumis = BasicJobType.mergeLumis(runlumiLists, dictresult['result'][0]['lumiMask'])
        numFiles = len(reduce(set().union, map(lambda x: literal_eval(x['parents']), dictresult['result'][0]['runsAndLumis'].values())))
        self.logger.info("%d files have been read" % numFiles)
        self.logger.info("%d events have been read" % sum(map(lambda x: x['events'], dictresult['result'][0]['runsAndLumis'].values())))

        if self.outdir:
            jsonFileDir = self.outdir
        else:
            jsonFileDir = os.path.join(self.requestarea, 'results')
        if analyzed:
            with open(os.path.join(jsonFileDir, 'analyzed.json'), 'w') as jsonFile:
                json.dump(analyzed, jsonFile)
                jsonFile.write("\n")
                self.logger.info("Analyzed lumi written to %s/analyzed.json" % jsonFileDir)
        if diff:
            with open(os.path.join(jsonFileDir, 'diff.json'), 'w') as jsonFile:
                json.dump(diff, jsonFile)
                jsonFile.write("\n")
                self.logger.info("%sNot Analyzed lumi written to %s/diff.json%s" % (colors.RED, jsonFileDir, colors.NORMAL))
        if doublelumis:
            with open(os.path.join(jsonFileDir, 'double.json'), 'w') as jsonFile:
                json.dump(doublelumis, jsonFile)
                jsonFile.write("\n")
                self.logger.info("%sDouble lumis written to %s/double.json%s" % (colors.RED, jsonFileDir, colors.NORMAL))
Code Example #16
File: tasks.py Project: blinkseb/CRABClient
    def __call__(self):
        server = HTTPRequests(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
        dictresult, status, reason = server.get(self.uri, data={"timestamp": self.date})
        dictresult = dictresult["result"]  # take just the significant part

        if status != 200:
            msg = "Problem retrieving tasks:\ninput:%s\noutput:%s\nreason:%s" % (
                str(self.date),
                str(dictresult),
                str(reason),
            )
            raise RESTCommunicationException(msg)

        dictresult.sort()
        dictresult.reverse()

        if self.options.status:
            dictresult = [item for item in dictresult if item[1] == self.options.status]

        result = [item[0:2] for item in dictresult]

        today = date.today()

        if not dictresult:
            msg = "No tasks found from %s until %s" % (self.date, today)
            if self.options.status:
                msg += " with status %s" % (self.options.status)
            self.logger.info(msg)
            return result

        msg = "\nList of tasks from %s until %s" % (self.date, today)
        if self.options.status:
            msg += " with status %s" % (self.options.status)
        self.logger.info(msg)
        self.logger.info("=" * 80)
        self.logger.info("NAME\t\t\t\t\t\t\t\tSTATUS")
        self.logger.info("=" * 80)
        for item in dictresult:
            name, status = item[0:2]
            self.logger.info("%s\n\t\t\t\t\t\t\t\t%s" % (name, status))
            self.logger.info("-" * 80)
        self.logger.info("\n")

        return result
Code Example #17
File: status.py Project: spigad/CRABClient
    def __call__(self):
        server = HTTPRequests(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)

        self.logger.debug('Looking up detailed status of task %s' % self.cachedinfo['RequestName'])
        dictresult, status, reason = server.get(self.uri, data = { 'workflow' : self.cachedinfo['RequestName']})
        dictresult = dictresult['result'][0] #take just the significant part

        if status != 200:
            msg = "Problem retrieving status:\ninput:%s\noutput:%s\nreason:%s" % (str(self.cachedinfo['RequestName']), str(dictresult), str(reason))
            raise RESTCommunicationException(msg)

        self.logger.debug(dictresult) #should be something like {u'result': [[123, u'ciao'], [456, u'ciao']]}

        self.logger.info("Task name:\t\t\t%s" % self.cachedinfo['RequestName'])
        self.logger.info("Task status:\t\t\t%s" % dictresult['status'])

        def logJDefErr(jdef):
            """Printing job def failures if any"""
            if jdef['jobdefErrors']:
                self.logger.error("%sFailed to inject %s\t%s out of %s:" %(colors.RED, colors.NORMAL,\
                                                                           jdef['failedJobdefs'], jdef['totalJobdefs']))
                for error in jdef['jobdefErrors']:
                    self.logger.info("\t%s" % error)

        #Print the url of the panda monitor
        if dictresult['taskFailureMsg']:
            self.logger.error("%sError during task injection:%s\t%s" % (colors.RED,colors.NORMAL,dictresult['taskFailureMsg']))
            # We might also have more information in the job def errors
            logJDefErr(jdef=dictresult)
        elif dictresult['jobSetID']:
            username = urllib.quote(getUserName(self.logger))
            self.logger.info("Panda url:\t\t\thttp://pandamon-cms-dev.cern.ch/jobinfo?jobtype=*&jobsetID=%s&prodUserName=%s" % (dictresult['jobSetID'], username))
            # We have cases where the job def errors are there but we have a job def id
            logJDefErr(jdef=dictresult)

        #Print information about jobs
        states = dictresult['jobsPerStatus']
        total = sum( states[st] for st in states )
        frmt = ''
        for status in states:
            frmt += status + ' %s\t' % self._percentageString(states[status], total)
        if frmt:
            self.logger.info('Details:\t\t\t%s' % frmt)
Code Example #18
File: tasks.py Project: belforte/CRABClient
    def __call__(self):
        server = HTTPRequests(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
        dictresult, status, reason = server.get(self.uri, data = {'timestamp': self.date})
        dictresult = dictresult['result'] #take just the significant part

        if status != 200:
            msg = "Problem retrieving tasks:\ninput:%s\noutput:%s\nreason:%s" % (str(self.date), str(dictresult), str(reason))
            raise RESTCommunicationException(msg)

        dictresult.sort()
        dictresult.reverse()

        if self.options.status:
            dictresult = [item for item in dictresult if item[1] == self.options.status]

        result = [item[0:2] for item in dictresult]

        today = date.today()

        if not dictresult:
            msg = "No tasks found from %s until %s" % (self.date, today)
            if self.options.status:
                msg += " with status %s" % (self.options.status)
            self.logger.info(msg)
            return result

        msg = "\nList of tasks from %s until %s" % (self.date, today)
        if self.options.status:
            msg += " with status %s" % (self.options.status)
        self.logger.info(msg)
        msg = "Beware that STATUS here does not include information from grid jobs"
        self.logger.info(msg)
        self.logger.info('='*80)
        self.logger.info('NAME\t\t\t\t\t\t\t\tSTATUS')
        self.logger.info('='*80)
        for item in dictresult:
            name, status = item[0:2]
            self.logger.info('%s\n\t\t\t\t\t\t\t\t%s' % (name, status))
            self.logger.info('-'*80)
        self.logger.info('\n')

        return result
Code Example #19
File: tasks.py Project: annawoodard/CRABClient
    def __call__(self):
        server = HTTPRequests(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
        dictresult, status, reason = server.get(self.uri, data = { 'timestamp' : self.date })
        dictresult = dictresult['result'] #take just the significant part
        dictresult.sort()
        dictresult.reverse()

        today = date.today() 

        self.logger.info ('\n')
        self.logger.info ('The list of tasks from %s until %s' %(self.date, today))
        self.logger.info ('='*80)
        self.logger.info ('NAME\t\t\t\t\t\t\t\tSTATUS')
        self.logger.info ('='*80)

        for item in dictresult:
            name, status = item[0:2]
            self.logger.info ('%s\n\t\t\t\t\t\t\t\t%s' %(name, status))
            self.logger.info ('-'*80)
    	
        self.logger.info ('\n')
Code Example #20
File: CRABServer.py Project: spigad/GangaCRAB3
    def status(self, job):
        """Get the status of a jobset."""
        """
        if not os.path.exists(job.inputdata.ui_working_dir):
            raise CRABServerError('Workdir "%s" not found.' %
                                  job.inputdata.ui_working_dir)

        cmd = 'crab -status -c %s' % job.inputdata.ui_working_dir
        self._send_with_retry(cmd, 'status', job.backend.crab_env)
        return True
        """
        #from RESTInteractions import HTTPRequests

        logger.info('checking status')

        try:
            server = HTTPRequests(job.backend.server_name,
                                  job.backend.userproxy)
            resource = job.backend.apiresource + 'workflow'
            dictresult, status, reason = server.get(
                resource, data={'workflow': job.backend.taskname})
            logger.info("status %s, reason %s" % (status, reason))
            return dictresult, status, reason

        except HTTPException, e:
            print type(e)
            print dir(e)
            print e.req_headers
            print e.req_data
            print e.reason
            print e.message
            print e.headers
            print e.result
            print e.status
            print e.url
            print e.args

            raise e
Code Example #21
def getProxiedWebDir(task, host, uri, cert, logFunction=print):
    """ The function simply queries the REST interface specified to get the proxied webdir to use
        for the task. Returns None in case the API could not find the url (either an error or the schedd
        is not configured)
    """
    # This import is here because ServerUtilities is also used on the worker nodes,
    # and I want to avoid the dependency on pycurl right now. We should actually add it one day
    # so that other code in cmscp that uses Requests.py from WMCore can be migrated to RESTInteractions
    from RESTInteractions import HTTPRequests

    data = {"subresource": "webdirprx", "workflow": task}
    res = None
    try:
        server = HTTPRequests(host, cert, cert, retry=2)
        dictresult, _, _ = server.get(uri, data=data)  # the second and third parameters are deprecated
        if dictresult.get("result"):
            res = dictresult["result"][0]
    except HTTPException as hte:
        logFunction(traceback.format_exc())
        logFunction(hte.headers)
        logFunction(hte.result)

    return res
Code Example #22
File: tasks.py Project: matz-e/CRABClient
    def __call__(self):
        server = HTTPRequests(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
	
        dictresult, status, reason = server.get(self.uri, data = { 'timestamp' : self.date })
        dictresult = dictresult['result'] #take just the significant part

        today = date.today() 

        self.logger.info ('\n')

        self.logger.info ('The list of tasks from %s until %s' %(self.date, today))

        self.logger.info ('='*80)
	
        self.logger.info ('NAME\t\t\t\t\t\t\t\tSTATUS')
		
        self.logger.info ('='*80)

        for item in dictresult:
            name, status = item[0:2]
            self.logger.info ('%s\n\t\t\t\t\t\t\t\t%s' %(name, status))
            self.logger.info ('-'*80)
    	
        self.logger.info ('\n')
Code Example #23
File: CRABServer.py Project: akanevm/GangaCRAB3
    def status(self, job):
        """Get the status of a jobset."""
        """
        if not os.path.exists(job.inputdata.ui_working_dir):
            raise CRABServerError('Workdir "%s" not found.' %
                                  job.inputdata.ui_working_dir)

        cmd = 'crab -status -c %s' % job.inputdata.ui_working_dir
        self._send_with_retry(cmd, 'status', job.backend.crab_env)
        return True
        """
        #from RESTInteractions import HTTPRequests

        logger.info('checking status')

        try:
            server = HTTPRequests(job.backend.server_name, job.backend.userproxy)
            resource = job.backend.apiresource+'workflow'
            dictresult, status, reason = server.get(resource, data = { 'workflow' : job.backend.taskname})
            logger.info("status %s, reason %s" % (status, reason))
            return dictresult, status, reason

        except HTTPException, e:
            print type(e)
            print dir(e)
            print e.req_headers
            print e.req_data
            print e.reason
            print e.message
            print e.headers
            print e.result
            print e.status
            print e.url
            print e.args

            raise e 
Code Example #24
File: purge.py Project: Crabclient/CRABClient
    def __call__(self):

        self.logger.info('Getting the tarball hash key')

        tarballdir=glob.glob(self.requestarea+'/inputs/*.tgz')
        if len(tarballdir) != 1 :
            self.logger.info('%sError%s: Could not find tarball or there is more than one tarball'% (colors.RED, colors.NORMAL))
            raise ConfigurationException
        tarballdir=tarballdir[0]

        #checking task status

        self.logger.info('Checking task status')
        server = HTTPRequests(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
        dictresult, status, reason = server.get(self.uri, data = { 'workflow' : self.cachedinfo['RequestName'], 'verbose': 0 })

        dictresult = dictresult['result'][0] #take just the significant part

        if status != 200:
            msg = "Problem retrieving status:\ninput:%s\noutput:%s\nreason:%s" % (str(self.cachedinfo['RequestName']), str(dictresult), str(reason))
            raise RESTCommunicationException(msg)

        self.logger.info('Task status: %s' % dictresult['status'])
        accepstate = ['KILLED','FINISHED','FAILED','KILLFAILED', 'COMPLETED']
        if dictresult['status'] not in accepstate:
            msg = ('%sError%s: Only tasks with one of these statuses can be purged: {0}'.format(accepstate) % (colors.RED, colors.NORMAL))
            raise ConfigurationException(msg)

        #getting the cache url

        if not self.options.scheddonly:
            baseurl=self.getUrl(self.instance, resource='info')
            cacheurl=server_info('backendurls', self.serverurl, self.proxyfilename, baseurl)
            cacheurl=cacheurl['cacheSSL']
            cacheurldict={'endpoint' : cacheurl, 'pycurl': True}

            ufc = UserFileCache(cacheurldict)
            hashkey = ufc.checksum(tarballdir)
            self.logger.info('Tarball hashkey :%s' %hashkey)
            self.logger.info('Attempting to clean user file cache')
            ufcresult = ufc.removeFile(hashkey)
            if ufcresult == '' :
                self.logger.info('%sSuccess%s: Successfully removed file from cache' % (colors.GREEN, colors.NORMAL))
            else:
                self.logger.info('%sError%s: Failed to remove the file from cache' % (colors.RED, colors.NORMAL))

        if not self.options.cacheonly:
            self.logger.info('Getting the schedd address')
            baseurl=self.getUrl(self.instance, resource='info')
            try:
                sceddaddress = server_info('scheddaddress', self.serverurl, self.proxyfilename, baseurl, workflow = self.cachedinfo['RequestName'] )
            except HTTPException, he:
                self.logger.info('%sError%s: Failed to get the schedd address' % (colors.RED, colors.NORMAL))
                raise HTTPException,he
            self.logger.debug('%sSuccess%s: Successfully got the schedd address' % (colors.GREEN, colors.NORMAL))
            self.logger.debug('Schedd address: %s' % sceddaddress)
            self.logger.info('Attempting to clean user files from the schedd')

            gssishrm = 'gsissh -o ConnectTimeout=60 -o PasswordAuthentication=no ' + sceddaddress + ' rm -rf ' + self.cachedinfo['RequestName']
            self.logger.debug('gsissh command: %s' % gssishrm)

            delprocess=subprocess.Popen(gssishrm, stdout= subprocess.PIPE, stderr= subprocess.PIPE, shell=True)
            stdout, stderr = delprocess.communicate()
            exitcode = delprocess.returncode

            if exitcode == 0 :
                self.logger.info('%sSuccess%s: Successfully removed task from the schedd' % (colors.GREEN, colors.NORMAL))
            else :
                self.logger.info('%sError%s: Failed to remove task from schedd' % (colors.RED, colors.NORMAL))
                self.logger.debug('gsissh stdout: %s\ngsissh stderr: %s\ngsissh exitcode: %s' % (stdout,stderr,exitcode))
Code Example #25
class ReporterWorker:

    def __init__(self, user, config):
        """
        store the user and the TFC for the worker
        """
        self.user = user
        self.config = config
        self.dropbox_dir = '%s/dropbox/inputs' % self.config.componentDir
        logging.basicConfig(level=config.log_level)
        self.site_tfc_map = {}
        self.logger = logging.getLogger('AsyncTransfer-Reporter-%s' % self.user)
        formatter = getCommonLogFormatter(self.config)
        for handler in logging.getLogger().handlers:
            handler.setFormatter(formatter)
        self.uiSetupScript = getattr(self.config, 'UISetupScript', None)
        self.cleanEnvironment = ''
        self.userDN = ''
        self.init = True
        if getattr(self.config, 'cleanEnvironment', False):
            self.cleanEnvironment = 'unset LD_LIBRARY_PATH; unset X509_USER_CERT; unset X509_USER_KEY;'
        # TODO: improve how the worker gets a log
        self.logger.debug("Trying to get DN")
        try:
            self.userDN = getDNFromUserName(self.user, self.logger)
        except Exception as ex:
            msg = "Error retrieving the user DN"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
            self.init = False
            return
        if not self.userDN:
            self.init = False
            return
        defaultDelegation = {
                                  'logger': self.logger,
                                  'credServerPath': self.config.credentialDir,
                                  # It will be moved to be getfrom couchDB
                                  'myProxySvr': 'myproxy.cern.ch',
                                  'min_time_left' : getattr(self.config, 'minTimeLeft', 36000),
                                  'serverDN': self.config.serverDN,
                                  'uisource': self.uiSetupScript,
                                  'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)
                            }
        if hasattr(self.config, "cache_area"):
            try:
                defaultDelegation['myproxyAccount'] = re.compile('https?://([^/]*)/.*').findall(self.config.cache_area)[0]
            except IndexError:
                self.logger.error('MyproxyAccount parameter cannot be retrieved from %s' % self.config.cache_area)
                pass
        if getattr(self.config, 'serviceCert', None):
            defaultDelegation['server_cert'] = self.config.serviceCert
        if getattr(self.config, 'serviceKey', None):
            defaultDelegation['server_key'] = self.config.serviceKey

        self.valid = False
        try:
            self.valid, proxy = getProxy(self.userDN, "", "", defaultDelegation, self.logger)
        except Exception as ex:
            msg = "Error getting the user proxy"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)

        if self.valid:
            self.userProxy = proxy
        else:
            # Use the operator's proxy when the user proxy is invalid.
            # This will be moved soon
            self.logger.error('Did not get valid proxy. Setting proxy to ops proxy')
            self.userProxy = config.opsProxy

        if self.config.isOracle:
            try:
                self.oracleDB = HTTPRequests(self.config.oracleDB,
                                             config.opsProxy,
                                             config.opsProxy)
            except Exception:
                self.logger.exception('Failed to set up the connection to the Oracle REST interface')
                raise
        else:
            server = CouchServer(dburl=self.config.couch_instance, ckey=self.config.opsProxy, cert=self.config.opsProxy)
            self.db = server.connectDatabase(self.config.files_database)

        # Set up a factory for loading plugins
        self.factory = WMFactory(self.config.pluginDir, namespace = self.config.pluginDir)
        self.commandTimeout = 1200
        self.max_retry = config.max_retry
        # Proxy management in Couch
        os.environ['X509_USER_PROXY'] = self.userProxy
        try:
            self.phedex = PhEDEx(responseType='xml',
                                 dict={'key':self.config.opsProxy,
                                       'cert':self.config.opsProxy})
        except Exception as e:
            self.logger.exception('PhEDEx exception: %s' % e)

    def __call__(self):
        """
        a. makes the ftscp copyjob
        b. submits ftscp
        c. deletes successfully transferred files from the DB
        """
        self.logger.info("Retrieving files for %s" % self.user)
        files_to_update = self.files_for_update()
        self.logger.info("%s files to process" % len(files_to_update))
        self.logger.debug("%s files to process" % files_to_update)
        for input_file in files_to_update:
            remove_good = True
            remove_failed = True
            failed_lfns = []
            failure_reason = []
            good_lfns = []
            self.logger.info("Updating %s" % input_file)
            if os.path.basename(input_file).startswith('Reporter'):
                try:
                    json_data = json.loads(open(input_file).read())
                except ValueError as e:
                    self.logger.error("Error loading %s" % e)
                    self.logger.debug('Removing %s' % input_file)
                    os.unlink(input_file)
                    continue
                except Exception as e:
                    self.logger.error("Error loading %s" % e)
                    self.logger.debug('Removing %s' % input_file)
                    os.unlink(input_file)
                    continue
                if json_data:
                    self.logger.debug('Inputs: %s %s %s' % (json_data['LFNs'], json_data['transferStatus'], json_data['failure_reason']))

                    if any(state in json_data['transferStatus'] for state in ('FAILED', 'abandoned', 'CANCELED', 'lost')):
                        # Sort failed files
                        failed_indexes = [i for i, x in enumerate(json_data['transferStatus']) if x == 'FAILED' or x == 'CANCELED']
                        abandoned_indexes = [i for i, x in enumerate(json_data['transferStatus']) if x == 'abandoned']
                        failed_indexes.extend(abandoned_indexes)
                        self.logger.info('failed indexes %s' % len(failed_indexes))
                        self.logger.debug('failed indexes %s' % failed_indexes)
                        for i in failed_indexes:
                            failed_lfns.append(json_data['LFNs'][i])
                            failure_reason.append(json_data['failure_reason'][i])
                        self.logger.debug('Marking failed %s %s' %(failed_lfns, failure_reason))
                        updated_failed_lfns = self.mark_failed(failed_lfns, failure_reason)


                    if any(state in json_data['transferStatus'] for state in ('Done', 'FINISHED', 'Finishing')):
                        # Sort good files
                        good_indexes = [i for i, x in enumerate(json_data['transferStatus']) if (x == 'Done' or x == 'FINISHED' or x == 'Finishing') ]
                        self.logger.info('good indexes %s' % len(good_indexes))
                        self.logger.debug('good indexes %s' % good_indexes)
                        for i in good_indexes:
                            good_lfns.append(json_data['LFNs'][i])
                        self.logger.info('Marking good %s' %(good_lfns))
                        try:
                            updated_good_lfns = self.mark_good(good_lfns)
                        except:
                            self.logger.exception('Either no files to mark or failed to update state')

                    # Remove the json file
                    self.logger.debug('Removing %s' % input_file)
                    os.unlink( input_file )

                else:
                    self.logger.info('Empty file %s' % input_file)
                    continue
            else:
                self.logger.info('File not for the Reporter %s' % input_file)
                continue
        self.logger.info('Update completed')
        return

    def files_for_update(self):
        """
        Retrieve the list of files to update.
        """
        files_to_update = []
        user_dir = os.path.join(self.dropbox_dir, self.user)
        self.logger.info('Looking into %s' % user_dir)
        for user_file in os.listdir(user_dir):
            files_to_update.append(os.path.join(self.dropbox_dir, self.user, user_file))
        return files_to_update

    def mark_good(self, files):
        """
        Mark the list of files as transferred
        """
        updated_lfn = []
        good_ids = []
        if len(files) == 0:
            return updated_lfn
        for it, lfn in enumerate(files):
            hash_lfn = getHashLfn(lfn)
            self.logger.info("Marking good %s" % hash_lfn)
            self.logger.debug("Marking good %s" % lfn)
            if not self.config.isOracle:
                try:
                    document = self.db.document(hash_lfn)
                except Exception as ex:
                    msg = "Error loading document from couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
            self.logger.info("Doc %s Loaded" % hash_lfn)
            try:
                now = str(datetime.datetime.now())
                last_update = time.time()
                if self.config.isOracle:
                    docId = getHashLfn(lfn)
                    good_ids.append(docId)
                    updated_lfn.append(lfn)
                else:
                    if document['state'] != 'killed' and document['state'] != 'done' and document['state'] != 'failed':
                        outputLfn = document['lfn'].replace('store/temp', 'store', 1)
                        data = dict()
                        data['end_time'] = now
                        data['state'] = 'done'
                        data['lfn'] = outputLfn
                        data['last_update'] = last_update
                        updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + getHashLfn(lfn)
                        updateUri += "?" + urllib.urlencode(data)
                        self.db.makeRequest(uri = updateUri, type = "PUT", decode = False)
                        updated_lfn.append(lfn)
                        self.logger.debug("Marked good %s" % lfn)
                    else: 
                        updated_lfn.append(lfn)
                    try:
                        self.db.commit()
                    except Exception as ex:
                        msg = "Error commiting documents in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue   
            except Exception as ex:
                msg = "Error updating document"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                continue
        if self.config.isOracle:
            try:
                data = dict()
                data['asoworker'] = self.config.asoworker
                data['subresource'] = 'updateTransfers'
                data['list_of_ids'] = good_ids
                data['list_of_transfer_state'] = ["DONE" for x in good_ids]
                result = self.oracleDB.post(self.config.oracleFileTrans,
                                            data=encodeRequest(data))
                self.logger.debug("Marked good %s" % good_ids)
            except Exception:
                self.logger.exception('Error updating document')
                return {}
        
        self.logger.info("Transferred file %s updated, removing now source file" %docId)
        try:
            docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers','fileusertransfers'),
                                        data=encodeRequest({'subresource': 'getById', 'id': docId}))
            document = oracleOutputMapping(docbyId, None)[0]
        except Exception:
            msg = "Error getting file from source"
            self.logger.exception(msg)
            return {}

        if document["source"] not in self.site_tfc_map:
            self.logger.debug("site not found... gathering info from phedex")
            self.site_tfc_map[document["source"]] = self.get_tfc_rules(document["source"])
        pfn = self.apply_tfc_to_lfn( '%s:%s' %(document["source"], lfn))
        self.logger.debug("File has to be removed now from source site: %s" %pfn)
        self.remove_files(self.userProxy, pfn)
        self.logger.debug("Transferred file removed from source")
        return updated_lfn

    def remove_files(self, userProxy, pfn):

        command = 'env -i X509_USER_PROXY=%s gfal-rm -v -t 180 %s'  % \
                  (userProxy, pfn)
        logging.debug("Running remove command %s" % command)
        try:
            rc, stdout, stderr = execute_command(command, self.logger, 3600)
        except Exception as ex:
            self.logger.error(ex)
        if rc:
            logging.info("Deletion command failed with output %s and error %s" %(stdout, stderr))
        else:
            logging.info("File Deleted.")
        return 

    def get_tfc_rules(self, site):
        """
        Get the TFC regexp for a given site.
        """
        self.phedex.getNodeTFC(site)
        try:
            tfc_file = self.phedex.cacheFileName('tfc', inputdata={'node': site})
        except Exception:
            self.logger.exception('A problem occurred when getting the TFC regexp')
            return None
        return readTFC(tfc_file)

    def apply_tfc_to_lfn(self, file):
        """
        Take a CMS_NAME:lfn string and make a pfn.
        Update pfn_to_lfn_mapping dictionary.
        """
        try:
            site, lfn = tuple(file.split(':'))
        except Exception:
            self.logger.exception('It does not seem to be an lfn %s' %file.split(':'))
            return None
        if site in self.site_tfc_map:
            pfn = self.site_tfc_map[site].matchLFN('srmv2', lfn)
            # TODO: improve fix for wrong tfc on sites
            try:
                if pfn.find("\\") != -1: pfn = pfn.replace("\\","")
                if pfn.split(':')[0] != 'srm' and pfn.split(':')[0] != 'gsiftp' :
                    self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                    return None
            except IndexError:
                self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                return None
            except AttributeError:
                self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                return None
            return pfn
        else:
            self.logger.error('Wrong site %s!' % site)
            return None

    def mark_failed(self, files=[], failures_reasons=[], force_fail=False):
        """
        Something failed for these files so increment the retry count
        """
        updated_lfn = []
        for lfn in files:
            data = {}
            self.logger.debug("Document: %s" % lfn)
            if not isinstance(lfn, dict):
                if 'temp' not in lfn:
                    temp_lfn = lfn.replace('store', 'store/temp', 1)
                else:
                    temp_lfn = lfn
            else:
                if 'temp' not in lfn['value']:
                    temp_lfn = lfn['value'].replace('store', 'store/temp', 1)
                else:
                    temp_lfn = lfn['value']
            docId = getHashLfn(temp_lfn)
            # Load document to get the retry_count
            if self.config.isOracle:
                try:
                    self.logger.debug("Document: %s" %docId)
                    docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers',
                                                                                    'fileusertransfers'),
                                                data=encodeRequest({'subresource': 'getById', 'id': docId}))
                    document = oracleOutputMapping(docbyId)[0]
                    data = dict()
                    data['asoworker'] = self.config.asoworker
                    data['subresource'] = 'updateTransfers'
                    data['list_of_ids'] = docId

                    if force_fail or document['transfer_retry_count'] + 1 > self.max_retry:
                        data['list_of_transfer_state'] = 'FAILED'
                        data['list_of_retry_value'] = 0
                    else:
                        data['list_of_transfer_state'] = 'RETRY'
                        fatal_error = self.determine_fatal_error(failures_reasons[files.index(lfn)])
                        if fatal_error:
                            data['list_of_transfer_state'] = 'FAILED'
                        
                    data['list_of_failure_reason'] = failures_reasons[files.index(lfn)]
                    data['list_of_retry_value'] = 0

                    self.logger.debug("update: %s" % data)
                    result = self.oracleDB.post(self.config.oracleFileTrans,
                                                data=encodeRequest(data))
                    if not data['list_of_transfer_state'] == 'RETRY':  
                        updated_lfn.append(lfn)
                    self.logger.debug("Marked failed %s" % lfn)
                except Exception as ex:
                    self.logger.error("Error updating document status: %s" %ex)
                    continue
            else:
                try:
                    document = self.db.document( docId )
                except Exception as ex:
                    msg = "Error loading document from couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
                if document['state'] != 'killed' and document['state'] != 'done' and document['state'] != 'failed':
                    now = str(datetime.datetime.now())
                    last_update = time.time()
                    # Prepare data to update the document in couch
                    if force_fail or len(document['retry_count']) + 1 > self.max_retry:
                        data['state'] = 'failed'
                        data['end_time'] = now
                    else:
                        data['state'] = 'retry'
                        fatal_error = self.determine_fatal_error(failures_reasons[files.index(lfn)])
                        if fatal_error:
                            data['state'] = 'failed'
                            data['end_time'] = now

                    self.logger.debug("Failure list: %s" % failures_reasons)
                    self.logger.debug("Files: %s" % files)
                    self.logger.debug("LFN %s" % lfn)

                    data['failure_reason'] = failures_reasons[files.index(lfn)]
                    data['last_update'] = last_update
                    data['retry'] = now
                    # Update the document in couch
                    self.logger.debug("Marking failed %s" % docId)
                    try:
                        updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + docId
                        updateUri += "?" + urllib.urlencode(data)
                        self.db.makeRequest(uri = updateUri, type = "PUT", decode = False)
                        updated_lfn.append(docId)
                        self.logger.debug("Marked failed %s" % docId)
                    except Exception as ex:
                        msg = "Error in updating document in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
                    try:
                        self.db.commit()
                    except Exception as ex:
                        msg = "Error commiting documents in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
                else: updated_lfn.append(docId)
        self.logger.debug("failed file updated")
        return updated_lfn

    def determine_fatal_error(self, failure=""):
        """
        Determine if transfer error is fatal or not.
        """
        permanent_failure_reasons = [
            r".*canceled because it stayed in the queue for too long.*",
            r".*permission denied.*",
            r".*disk quota exceeded.*",
            r".*operation not permitted.*",
            r".*mkdir\(\) fail.*",
            r".*open/create error.*",
            r".*mkdir: cannot create directory.*",
            r".*does not have enough space.*",
        ]
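        # Illustrative inputs (hypothetical strings, not taken from real transfers):
        # "Permission denied while creating the destination directory" matches
        # ".*permission denied.*" after lower-casing and is treated as a fatal
        # error, whereas a transient "connection timed out" matches none of the
        # patterns above and the method returns False.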
        failure = str(failure).lower()
        for permanent_failure_reason in permanent_failure_reasons:
            if re.match(permanent_failure_reason, failure):
                return True
        return False

    def mark_incomplete(self, files=None):
        """
        Mark the list of files as acquired. This is a stub and should never be called.
        """
        self.logger.error('Something called mark_incomplete which should never be called')
Code example #27
0
File: TaskPublish.py Project: dciangot/CRABServer
def publishInDBS3(config, taskname, verbose):
    """
    Publish output from one task in DBS
    """
    def mark_good(files, crabServer, logger):
        """
        Mark the list of files as published
        """

        msg = "Marking %s file(s) as published." % len(files)
        logger.info(msg)
        nMarked = 0
        for lfn in files:
            data = {}
            source_lfn = lfn
            docId = getHashLfn(source_lfn)
            data['asoworker'] = config.General.asoworker
            data['subresource'] = 'updatePublication'
            data['list_of_ids'] = docId
            data['list_of_publication_state'] = 'DONE'
            data['list_of_retry_value'] = 1
            data['list_of_failure_reason'] = ''

            if dryRun:
                logger.info("DryRun: skip marking good file")
            else:
                try:
                    result = crabServer.post(REST_filetransfers,
                                             data=encodeRequest(data))
                    logger.debug("updated DocumentId: %s lfn: %s Result %s",
                                 docId, source_lfn, result)
                except Exception as ex:
                    logger.error(
                        "Error updating status for DocumentId: %s lfn: %s",
                        docId, source_lfn)
                    logger.error("Error reason: %s", ex)
            nMarked += 1
            if nMarked % 10 == 0:
                logger.info('marked %d files', nMarked)

    def mark_failed(files, crabServer, logger, failure_reason=""):
        """
        Something failed for these files so increment the retry count
        """
        msg = "Marking %s file(s) as failed" % len(files)
        logger.info(msg)
        nMarked = 0
        for lfn in files:
            source_lfn = lfn
            docId = getHashLfn(source_lfn)
            data = dict()
            data['asoworker'] = config.General.asoworker
            data['subresource'] = 'updatePublication'
            data['list_of_ids'] = docId
            data['list_of_publication_state'] = 'FAILED'
            data['list_of_retry_value'] = 1
            data['list_of_failure_reason'] = failure_reason

            logger.debug("data: %s ", data)
            if dryRun:
                logger.debug("DryRun: skip marking failes files")
            else:
                try:
                    result = crabServer.post(REST_filetransfers,
                                             data=encodeRequest(data))
                    logger.debug("updated DocumentId: %s lfn: %s Result %s",
                                 docId, source_lfn, result)
                except Exception as ex:
                    logger.error(
                        "Error updating status for DocumentId: %s lfn: %s",
                        docId, source_lfn)
                    logger.error("Error reason: %s", ex)
            nMarked += 1
            if nMarked % 10 == 0:
                logger.info('marked %d files', nMarked)

    def createLogdir(dirname):
        """
        Create the directory dirname ignoring errors in case it exists. Exit if
        the directory cannot be created.
        """
        try:
            os.mkdir(dirname)
        except OSError as ose:
            if ose.errno != 17:  #ignore the "Directory already exists error"
                print(str(ose))
                print("The task worker need to access the '%s' directory" %
                      dirname)
                sys.exit(1)

    taskFilesDir = config.General.taskFilesDir
    dryRun = config.TaskPublisher.dryRun
    username = taskname.split(':')[1].split('_')[0]
    logdir = config.General.logsDir + '/tasks/' + username
    logfile = logdir + '/' + taskname + '.log'
    createLogdir(logdir)
    logger = logging.getLogger(taskname)
    logging.basicConfig(filename=logfile,
                        level=logging.INFO,
                        format=config.TaskPublisher.logMsgFormat)
    if verbose:
        logger.setLevel(logging.DEBUG)

    logger.info("Getting files to publish")

    toPublish = []
    # TODO move from new to done when processed
    fname = taskFilesDir + taskname + ".json"
    with open(fname) as f:
        toPublish = json.load(f)

    if not toPublish:
        logger.info("Empty data file %s", fname)
        return "EMPTY"

    pnn = toPublish[0]["Destination"]

    # CRABServer REST API's (see CRABInterface)
    try:
        instance = config.General.instance
    except AttributeError:
        msg = "No instance provided: need to specify config.General.instance in the configuration"
        raise ConfigException(msg)

    if instance in SERVICE_INSTANCES:
        logger.info('Will connect to CRAB service: %s', instance)
        restHost = SERVICE_INSTANCES[instance]['restHost']
        dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
    else:
        msg = "Invalid instance value '%s'" % instance
        raise ConfigException(msg)
    if instance == 'other':
        logger.info('Will use restHost and dbInstance from config file')
        try:
            restHost = config.General.restHost
            dbInstance = config.General.dbInstance
        except AttributeError:
            msg = "Need to specify config.General.restHost and dbInstance in the configuration"
            raise ConfigException(msg)

    restURInoAPI = '/crabserver/' + dbInstance
    logger.info('Will connect to CRAB Data Base via URL: https://%s%s',
                restHost, restURInoAPI)

    # CRAB REST API's
    REST_filetransfers = restURInoAPI + '/filetransfers'
    REST_task = restURInoAPI + '/task'
    crabServer = HTTPRequests(url=restHost,
                              localcert=config.General.serviceCert,
                              localkey=config.General.serviceKey,
                              retry=3)

    data = dict()
    data['subresource'] = 'search'
    data['workflow'] = taskname

    try:
        results = crabServer.get(REST_task, data=encodeRequest(data))
    except Exception as ex:
        logger.error(
            "Failed to get acquired publications from oracleDB for %s: %s",
            taskname, ex)
        return "FAILED"

    if verbose:
        logger.info(results[0]['desc']['columns'])

    try:
        inputDatasetIndex = results[0]['desc']['columns'].index(
            "tm_input_dataset")
        inputDataset = results[0]['result'][inputDatasetIndex]
        sourceURLIndex = results[0]['desc']['columns'].index("tm_dbs_url")
        sourceURL = results[0]['result'][sourceURLIndex]
        publish_dbs_urlIndex = results[0]['desc']['columns'].index(
            "tm_publish_dbs_url")
        publish_dbs_url = results[0]['result'][publish_dbs_urlIndex]

        if not sourceURL.endswith("/DBSReader") and not sourceURL.endswith(
                "/DBSReader/"):
            sourceURL += "/DBSReader"
    except Exception:
        logger.exception("ERROR")
    # When looking up parents may need to look in global DBS as well.
    globalURL = sourceURL
    globalURL = globalURL.replace('phys01', 'global')
    globalURL = globalURL.replace('phys02', 'global')
    globalURL = globalURL.replace('phys03', 'global')
    globalURL = globalURL.replace('caf', 'global')
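    # e.g. a source such as https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader is
    # mapped to https://cmsweb.cern.ch/dbs/prod/global/DBSReader by the
    # substitutions above (URLs shown only as an illustration).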

    # DBS client relies on X509 env. vars
    os.environ['X509_USER_CERT'] = config.General.serviceCert
    os.environ['X509_USER_KEY'] = config.General.serviceKey

    logger.info("Source API URL: %s", sourceURL)
    sourceApi = dbsClient.DbsApi(url=sourceURL)
    logger.info("Global API URL: %s", globalURL)
    globalApi = dbsClient.DbsApi(url=globalURL)

    if publish_dbs_url.endswith('/DBSWriter'):
        publish_read_url = publish_dbs_url[:-len('/DBSWriter')] + '/DBSReader'
        publish_migrate_url = publish_dbs_url[:-len('/DBSWriter')] + '/DBSMigrate'
    else:
        publish_migrate_url = publish_dbs_url + '/DBSMigrate'
        publish_read_url = publish_dbs_url + '/DBSReader'
        publish_dbs_url += '/DBSWriter'
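    # For example (illustrative paths only): a writer URL ending in /DBSWriter,
    # such as .../dbs/prod/phys03/DBSWriter, yields the sibling .../DBSReader
    # and .../DBSMigrate endpoints; a URL without the suffix gets /DBSWriter
    # appended and the two read/migrate siblings are derived the same way.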

    try:
        logger.info("Destination API URL: %s", publish_dbs_url)
        destApi = dbsClient.DbsApi(url=publish_dbs_url)
        logger.info("Destination read API URL: %s", publish_read_url)
        destReadApi = dbsClient.DbsApi(url=publish_read_url)
        logger.info("Migration API URL: %s", publish_migrate_url)
        migrateApi = dbsClient.DbsApi(url=publish_migrate_url)
    except Exception:
        logger.exception('Wrong DBS URL %s', publish_dbs_url)
        return "FAILED"

    logger.info("inputDataset: %s", inputDataset)
    noInput = len(inputDataset.split("/")) <= 3
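    # e.g. an input dataset like /SomePrimary/SomeProcessed-v1/MINIAODSIM splits
    # into 4 fields (a leading empty string plus 3 names), so noInput is False;
    # an empty or placeholder value gives 3 fields or fewer and noInput is True,
    # i.e. the task is treated as private MC with no DBS parent. (The dataset
    # name is illustrative.)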

    # TODO: fix dbs dep
    if not noInput:
        try:
            existing_datasets = sourceApi.listDatasets(dataset=inputDataset,
                                                       detail=True,
                                                       dataset_access_type='*')
            primary_ds_type = existing_datasets[0]['primary_ds_type']
            # There's little chance this is correct, but it's our best guess for now.
            # CRAB2 uses 'crab2_tag' for all cases
            existing_output = destReadApi.listOutputConfigs(
                dataset=inputDataset)
        except Exception:
            logger.exception('Wrong DBS URL %s', publish_dbs_url)
            return "FAILED"
        if not existing_output:
            msg = "Unable to list output config for input dataset %s." % (
                inputDataset)
            logger.error(msg)
            global_tag = 'crab3_tag'
        else:
            global_tag = existing_output[0]['global_tag']
    else:
        msg = "This publication appears to be for private MC."
        logger.info(msg)
        primary_ds_type = 'mc'
        global_tag = 'crab3_tag'

    acquisition_era_name = "CRAB"
    processing_era_config = {
        'processing_version': 1,
        'description': 'CRAB3_processing_era'
    }

    appName = 'cmsRun'
    appVer = toPublish[0]["swversion"]
    pset_hash = toPublish[0]['publishname'].split("-")[-1]
    gtag = str(toPublish[0]['globaltag'])
    if gtag == "None":
        gtag = global_tag
    try:
        if toPublish[0]['acquisitionera'] and not toPublish[0][
                'acquisitionera'] in ["null"]:
            acquisitionera = str(toPublish[0]['acquisitionera'])
        else:
            acquisitionera = acquisition_era_name
    except Exception:
        acquisitionera = acquisition_era_name

    _, primName, procName, tier = toPublish[0]['outdataset'].split('/')

    primds_config = {
        'primary_ds_name': primName,
        'primary_ds_type': primary_ds_type
    }
    msg = "About to insert primary dataset: %s" % (str(primds_config))
    logger.debug(msg)
    if dryRun:
        logger.info("DryRun: skip insertPrimaryDataset")
    else:
        destApi.insertPrimaryDataset(primds_config)
        msg = "Successfully inserted primary dataset %s." % (primName)
        logger.info(msg)

    final = {}
    failed = []
    publish_in_next_iteration = []
    published = []

    dataset = toPublish[0]['outdataset']
    # Find all (valid) files already published in this dataset.
    try:
        existingDBSFiles = destReadApi.listFiles(dataset=dataset, detail=True)
        existingFiles = [f['logical_file_name'] for f in existingDBSFiles]
        existingFilesValid = [
            f['logical_file_name'] for f in existingDBSFiles
            if f['is_file_valid']
        ]
        msg = "Dataset %s already contains %d files" % (dataset,
                                                        len(existingFiles))
        msg += " (%d valid, %d invalid)." % (len(existingFilesValid),
                                             len(existingFiles) -
                                             len(existingFilesValid))
        logger.info(msg)
        final['existingFiles'] = len(existingFiles)
    except Exception as ex:
        msg = "Error when listing files in DBS: %s" % (str(ex))
        msg += "\n%s" % (str(traceback.format_exc()))
        logger.error(msg)
        return "FAILED"

    # check if actions are needed
    workToDo = False

    for fileTo in toPublish:
        #print(existingFilesValid)
        if fileTo['lfn'] not in existingFilesValid:
            workToDo = True
            break

    if not workToDo:
        msg = "Nothing uploaded, %s has these files already or not enough files." % (
            dataset)
        logger.info(msg)
        return "NOTHING TO DO"

    acquisition_era_config = {
        'acquisition_era_name': acquisitionera,
        'start_date': 0
    }

    output_config = {
        'release_version': appVer,
        'pset_hash': pset_hash,
        'app_name': appName,
        'output_module_label': 'o',
        'global_tag': global_tag,
    }
    msg = "Published output config."
    logger.info(msg)

    dataset_config = {
        'dataset': dataset,
        'processed_ds_name': procName,
        'data_tier_name': tier,
        'dataset_access_type': 'VALID',
        'physics_group_name': 'CRAB3',
        'last_modification_date': int(time.time()),
    }
    msg = "About to insert dataset: %s" % (str(dataset_config))
    logger.info(msg)

    # List of all files that must (and can) be published.
    dbsFiles = []
    dbsFiles_f = []
    # Set of all the parent files from all the files requested to be published.
    parentFiles = set()
    # Set of parent files for which the migration to the destination DBS instance
    # should be skipped (because they were not found in DBS).
    parentsToSkip = set()
    # Set of parent files to migrate from the source DBS instance
    # to the destination DBS instance.
    localParentBlocks = set()
    # Set of parent files to migrate from the global DBS instance
    # to the destination DBS instance.
    globalParentBlocks = set()

    # Loop over all files to publish.
    for file_ in toPublish:
        if verbose:
            logger.info(file_)
        # Check if this file was already published and if it is valid.
        if file_['lfn'] not in existingFilesValid:
            # We have a file to publish.
            # Get the parent files and for each parent file do the following:
            # 1) Add it to the list of parent files.
            # 2) Find the block to which it belongs and insert that block name in
            #    (one of) the set of blocks to be migrated to the destination DBS.
            for parentFile in list(file_['parents']):
                if parentFile not in parentFiles:
                    parentFiles.add(parentFile)
                    # Is this parent file already in the destination DBS instance?
                    # (If yes, then we don't have to migrate this block.)
                    blocksDict = destReadApi.listBlocks(
                        logical_file_name=parentFile)
                    if not blocksDict:
                        # No, this parent file is not in the destination DBS instance.
                        # Maybe it is in the same DBS instance as the input dataset?
                        blocksDict = sourceApi.listBlocks(
                            logical_file_name=parentFile)
                        if blocksDict:
                            # Yes, this parent file is in the same DBS instance as the input dataset.
                            # Add the corresponding block to the set of blocks from the source DBS
                            # instance that have to be migrated to the destination DBS.
                            localParentBlocks.add(blocksDict[0]['block_name'])
                        else:
                            # No, this parent file is not in the same DBS instance as input dataset.
                            # Maybe it is in global DBS instance?
                            blocksDict = globalApi.listBlocks(
                                logical_file_name=parentFile)
                            if blocksDict:
                                # Yes, this parent file is in global DBS instance.
                                # Add the corresponding block to the set of blocks from global DBS
                                # instance that have to be migrated to the destination DBS.
                                globalParentBlocks.add(
                                    blocksDict[0]['block_name'])
                    # If this parent file is not in the destination DBS instance, is not
                    # the source DBS instance, and is not in global DBS instance, then it
                    # means it is not known to DBS and therefore we can not migrate it.
                    # Put it in the set of parent files for which migration should be skipped.
                    if not blocksDict:
                        parentsToSkip.add(parentFile)
                # If this parent file should not be migrated because it is not known to DBS,
                # we remove it from the list of parents in the file-to-publish info dictionary
                # (so that when publishing, this "parent" file will not appear as a parent).
                if parentFile in parentsToSkip:
                    msg = "Skipping parent file %s, as it doesn't seem to be known to DBS." % (
                        parentFile)
                    logger.info(msg)
                    if parentFile in file_['parents']:
                        file_['parents'].remove(parentFile)
            # Add this file to the list of files to be published.
            dbsFiles.append(format_file_3(file_))
            dbsFiles_f.append(file_)
        #print file
        published.append(file_['SourceLFN'])
        #published.append(file_['lfn'].replace("/store","/store/temp"))

    # Print a message with the number of files to publish.
    msg = "Found %d files not already present in DBS which will be published." % (
        len(dbsFiles))
    logger.info(msg)

    # If there are no files to publish, continue with the next dataset.
    if not dbsFiles_f:
        msg = "Nothing to do for this dataset."
        logger.info(msg)
        return "NOTHING TO DO"

    # Migrate parent blocks before publishing.
    # First migrate the parent blocks that are in the same DBS instance
    # as the input dataset.
    if localParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (
            sourceApi.url, localParentBlocks)
        logger.info(msg)
        if dryRun:
            logger.info("DryRun: skipping migration request")
        else:
            statusCode, failureMsg = migrateByBlockDBS3(
                taskname, migrateApi, destReadApi, sourceApi, inputDataset,
                localParentBlocks, verbose)
            if statusCode:
                failureMsg += " Not publishing any files."
                logger.info(failureMsg)
                failed.extend([f['SourceLFN'] for f in dbsFiles_f])
                #failed.extend([f['lfn'].replace("/store","/store/temp") for f in dbsFiles_f])
                failure_reason = failureMsg
                published = [x for x in published if x not in failed]
                return "NOTHING TO DO"
    # Then migrate the parent blocks that are in the global DBS instance.
    if globalParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (
            globalApi.url, globalParentBlocks)
        logger.info(msg)
        if dryRun:
            logger.info("DryRun: skipping migration request")
        else:
            statusCode, failureMsg = migrateByBlockDBS3(
                taskname, migrateApi, destReadApi, globalApi, inputDataset,
                globalParentBlocks, verbose)
            if statusCode:
                failureMsg += " Not publishing any files."
                logger.info(failureMsg)
                failed.extend([f['SourceLFN'] for f in dbsFiles_f])
                #failed.extend([f['lfn'].replace("/store","/store/temp") for f in dbsFiles_f])
                failure_reason = failureMsg
                published = [x for x in published if x not in failed]
                return "NOTHING TO DO"
    # Publish the files in blocks. The blocks must have exactly max_files_per_block
    # files, unless there are less than max_files_per_block files to publish to
    # begin with. If there are more than max_files_per_block files to publish,
    # publish as many blocks as possible and leave the tail of files for the next
    # PublisherWorker call, unless forced to publish.
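    # A small worked example (hypothetical numbers): with max_files_per_block
    # set to 100 and 250 entries in dbsFiles, the loop below inserts two full
    # blocks of 100 files each; the remaining 50 files are appended to
    # publish_in_next_iteration and left for a later call.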
    block_count = 0
    count = 0
    max_files_per_block = config.General.max_files_per_block
    while True:
        block_name = "%s#%s" % (dataset, str(uuid.uuid4()))
        files_to_publish = dbsFiles[count:count + max_files_per_block]
        try:
            block_config = {
                'block_name': block_name,
                'origin_site_name': pnn,
                'open_for_writing': 0
            }
            if verbose:
                msg = "Inserting files %s into block %s." % (
                    [f['logical_file_name']
                     for f in files_to_publish], block_name)
                logger.info(msg)
            blockDump = createBulkBlock(output_config, processing_era_config,
                                        primds_config, dataset_config,
                                        acquisition_era_config, block_config,
                                        files_to_publish)
            #logger.debug("Block to insert: %s\n %s" % (blockDump, destApi.__dict__ ))

            if dryRun:
                logger.info("DryRun: skip insertBulkBlock")
            else:
                destApi.insertBulkBlock(blockDump)
            block_count += 1
        except Exception as ex:
            #logger.error("Error for files: %s" % [f['SourceLFN'] for f in toPublish])
            logger.error("Error for files: %s", [f['lfn'] for f in toPublish])
            failed.extend([f['SourceLFN'] for f in toPublish])
            #failed.extend([f['lfn'].replace("/store","/store/temp") for f in toPublish])
            msg = "Error when publishing (%s) " % ", ".join(failed)
            msg += str(ex)
            msg += str(traceback.format_exc())
            logger.error(msg)
            failure_reason = str(ex)
            fname = '/tmp/failed-block-at-%s.txt' % time.time()
            with open(fname, 'w') as fd:
                fd.write(str(blockDump))
            logger.error("FAILING BLOCK SAVED AS %s", fname)
        count += max_files_per_block
        files_to_publish_next = dbsFiles_f[count:count + max_files_per_block]
        if len(files_to_publish_next) < max_files_per_block:
            publish_in_next_iteration.extend(
                [f["SourceLFN"] for f in files_to_publish_next])
            #publish_in_next_iteration.extend([f["lfn"].replace("/store","/store/temp") for f in files_to_publish_next])
            break
    published = [
        x for x in published if x not in failed + publish_in_next_iteration
    ]
    # Fill number of files/blocks published for this dataset.
    final['files'] = len(dbsFiles) - len(failed) - len(
        publish_in_next_iteration)
    final['blocks'] = block_count
    # Print a publication status summary for this dataset.
    msg = "End of publication status for dataset %s:" % (dataset)
    msg += " failed %s" % len(failed)
    if verbose:
        msg += ": %s" % failed
    msg += ", published %s" % len(published)
    if verbose:
        msg += ": %s" % published
    msg += ", publish_in_next_iteration %s" % len(publish_in_next_iteration)
    if verbose:
        msg += ": %s" % publish_in_next_iteration
    msg += ", results %s" % (final)
    logger.info(msg)

    try:
        if published:
            mark_good(published, crabServer, logger)
            data['workflow'] = taskname
            data['subresource'] = 'updatepublicationtime'
            crabServer.post(REST_task, data=encodeRequest(data))
        if failed:
            logger.debug("Failed files: %s ", failed)
            mark_failed(failed, crabServer, logger, failure_reason)
    except Exception as ex:
        logger.exception("Status update failed: %s", ex)

    return 0
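
# Hypothetical usage sketch, not part of the original TaskPublish.py: the
# configuration file path and the WMCore loadConfigurationFile helper are
# assumptions, chosen only because publishInDBS3 needs a config object exposing
# config.General.* and config.TaskPublisher.* attributes plus a task name.
if __name__ == '__main__':
    import sys
    from WMCore.Configuration import loadConfigurationFile
    cfg = loadConfigurationFile('/path/to/PublisherConfig.py')  # assumed path
    print(publishInDBS3(cfg, sys.argv[1], verbose=True))
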
Code example #28
0
class Master(object):
    """I am the main daemon kicking off all Publisher work via slave Publishers"""

    
    def __init__(self, config, quiet=False, debug=True, testMode=False):
        """
        Initialise class members

        :arg WMCore.Configuration config: input Publisher configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if a verbose logger is needed
        :arg bool testMode: it tells whether to run in test (no subprocesses) mode.
        """
        self.config = config.General
        self.max_files_per_block = self.config.max_files_per_block
        #self.userCert = self.config.opsCert
        #self.userKey = self.config.opsKey
        self.block_publication_timeout = self.config.block_closure_timeout
        self.lfn_map = {}
        self.force_publication = False
        self.force_failure = False
        self.TestMode = testMode
        self.cache_area = self.config.cache_area

        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.makedirs(dirname)
            except OSError as ose:
                if ose.errno != 17: #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The Publisher Worker needs to access the '%s' directory" % dirname)
                    sys.exit(1)


        def setRootLogger(quiet=False, debug=True, console=False):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if a verbose logger is needed
            :arg bool console: it tells whether to direct all printout to the console rather than to files, useful for debugging
            :return logger: a logger with the appropriate logger level."""

            createLogdir('Publisher/logs')
            createLogdir('Publisher/logs/processes')
            createLogdir('Publisher/logs/tasks')

            if console:
                #if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/log.txt', when='midnight')
                logFormatter = logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        self.logger = setRootLogger(quiet, True, console=self.TestMode)

        try:
            self.oracleDB = HTTPRequests(self.config.oracleDB,
                                         self.config.opsCert,
                                         self.config.opsKey)
            self.logger.debug('Contacting OracleDB:' + self.config.oracleDB)
        except Exception:
            self.logger.exception('Failed when contacting Oracle')
            raise

        #try:
        #    self.connection = RequestHandler(config={'timeout': 900, 'connecttimeout' : 900})
        #except Exception as ex:
        #    msg = "Error initializing the connection handler"
        #    msg += str(ex)
        #    msg += str(traceback.format_exc())
        #    self.logger.debug(msg)


    def active_tasks(self, db):

        fileDoc = {}
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'acquirePublication'

        self.logger.debug("Retrieving publications from oracleDB")

        results = ''
        try:
            results = db.post(self.config.oracleFileTrans,
                              data=encodeRequest(fileDoc))
        except Exception as ex:
            self.logger.error("Failed to acquire publications \
                                from oracleDB: %s" %ex)
            return []

        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'acquiredPublication'
        fileDoc['grouping'] = 0
        fileDoc['limit'] = 100000

        self.logger.debug("Retrieving max.100000 acquired publications from oracleDB")

        result = []

        try:
            results = db.get(self.config.oracleFileTrans,
                             data=encodeRequest(fileDoc))
            result.extend(oracleOutputMapping(results))
        except Exception as ex:
            self.logger.error("Failed to acquire publications \
                                from oracleDB: %s" %ex)
            return []

        self.logger.debug("publen: %s" % len(result))

        self.logger.debug("%s acquired publications retrieved" % len(result))
        #TODO: join query for publisher (same of submitter)
        unique_tasks = [list(i) for i in set(tuple([x['username'],
                                                    x['user_group'],
                                                    x['user_role'],
                                                    x['taskname']]
                                                  ) for x in result if x['transfer_state'] == 3)]

        info = []
        for task in unique_tasks:
            info.append([x for x in result if x['taskname'] == task[3]])
        return list(zip(unique_tasks, info))  # list (not a one-shot iterator): callers iterate it more than once
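        # Illustrative shape of the return value: each element is a pair
        # ([username, user_group, user_role, taskname], [row, row, ...]) where
        # the rows are the acquired-publication documents of that task; this is
        # what startSlave below unpacks as task[0] and task[1].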

    def getPublDescFiles(self, workflow, lfn_ready):
        """
        Download and read the files describing
        what needs to be published
        """
        data = {}
        data['taskname'] = workflow
        data['filetype'] = 'EDM'

        out = []
        # divide lfn per chunks, avoiding URI-too long exception
        def chunks(l, n):
            """
            Yield successive n-sized chunks from l.
            :param l: list to split in chunks
            :param n: chunk size
            :return: yield the next list chunk
            """
            for i in range(0, len(l), n):
                yield l[i:i + n]
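        # e.g. chunks([1, 2, 3, 4, 5], 2) yields [1, 2], [3, 4], [5]; it is used
        # below to query filemetadata for at most 50 LFNs per request.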

        for lfn_ in chunks(lfn_ready, 50):
            data['lfn'] = lfn_

            try:
                res = self.oracleDB.get('/crabserver/preprod/filemetadata',
                                        data=encodeRequest(data, listParams=["lfn"]))
                res = res[0]
            except Exception as ex:
                self.logger.error("Error during metadata retrieving: %s" %ex)
                continue

            self.logger.debug("Retrieved %s filemetadata records" % len(res['result']))
            for obj in res['result']:
                if isinstance(obj, dict):
                    out.append(obj)
                else:
                    #print type(obj)
                    out.append(json.loads(str(obj)))

        return out

    def algorithm(self):
        """
        1. Get a list of tasks with files to publish from the Oracle database (via the CRAB REST interface)
        2. For each user get a suitably sized input for publish
        3. Submit the publish to a subprocess
        """
        tasks = self.active_tasks(self.oracleDB)

        # example code, uncomment to pick only one task
        #myTask='180912_142016:arizzi_crab_NanoDnRMXDYJetsToLL_M-105To160_TuneCUETP8M1_13TeV-amcaRunIISummer16MiniAODv2-PUMorio__heIV_v6-v22201'
        #tasksToDo=[]
        #for t in tasks:
        #  if t[0][3]==myTask:
        #  tasksToDo.append(t)
        #tasks = tasksToDo

        self.logger.debug('kicking off pool %s' % [x[0][3] for x in tasks])
        processes = []

        try:
            for task in tasks:
                if self.TestMode:
                    self.startSlave(task)   # sequentially do one task after another
                else:                       # deal with each task in a separate process
                    p = Process(target=self.startSlave, args=(task,))
                    p.start()
                    processes.append(p)

            for proc in processes:
                proc.join()
        except:
            self.logger.exception("Error during process mapping")


    def startSlave(self, task):
        # TODO: lock task!
        # - process logger
        logger = setProcessLogger(str(task[0][3]))
        logger.info("Process %s is starting. PID %s", task[0][3], os.getpid())

        self.force_publication = False
        workflow = str(task[0][3])
        wfnamemsg = "%s: " % (workflow)

        if int(workflow[0:2]) < 18:
            msg = "Skipped. Ignore tasks created before 2018."
            logger.info(wfnamemsg+msg)
            return 0

        if len(task[1]) > self.max_files_per_block:
            self.force_publication = True
            msg = "All datasets have more than %s ready files." % (self.max_files_per_block)
            msg += " No need to retrieve task status nor last publication time."
            logger.info(wfnamemsg+msg)
        else:
            msg = "At least one dataset has less than %s ready files." % (self.max_files_per_block)
            logger.info(wfnamemsg+msg)
            # Retrieve the workflow status. If the status can not be retrieved, continue
            # with the next workflow.
            workflow_status = ''
            url = '/'.join(self.cache_area.split('/')[:-1]) #+ '/workflow'
            msg = "Retrieving status from %s" % (url)
            logger.info(wfnamemsg+msg)
            header = {"Content-Type":"application/json"}
            data = {'workflow': workflow, 'subresource': 'taskads'}
            url = 'cmsweb-testbed.cern.ch'
            connection = HTTPRequests(url,
                                     self.config.opsCert,
                                     self.config.opsKey)
            
            try:
                res = connection.get('/crabserver/preprod/workflow', data=encodeRequest(data))
            except Exception as ex:
                #logger.info(wfnamemsg+encodeRequest(data))
                logger.warning('Error retrieving status from cache for %s: %s' % (workflow, ex))
                return 0

            try:
                workflow_status = res[0]['result'][0]['status']
                msg = "Task status is %s." % workflow_status
                logger.info(wfnamemsg+msg)
            except ValueError:
                msg = "Workflow removed from WM."
                logger.error(wfnamemsg+msg)
                workflow_status = 'REMOVED'
            except Exception as ex:
                msg = "Error loading task status!"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logger.error(wfnamemsg+msg)
            # If the workflow status is terminal, go ahead and publish all the ready files
            # in the workflow.
            if workflow_status in ['COMPLETED', 'FAILED', 'KILLED', 'REMOVED']:
                self.force_publication = True
                if workflow_status in ['KILLED', 'REMOVED']:
                    self.force_failure = True
                msg = "Considering task status as terminal. Will force publication."
                logger.info(wfnamemsg+msg)
            # Otherwise...
            else:
                msg = "Task status is not considered terminal."
                logger.info(wfnamemsg+msg)
                msg = "Getting last publication time."
                logger.info(wfnamemsg+msg)
                # Get when was the last time a publication was done for this workflow (this
                # should be more or less independent of the output dataset in case there are
                # more than one).
                last_publication_time = None
                data = {}
                data['workflow'] = workflow
                data['subresource'] = 'search'
                try:
                    result = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'task'),
                                               data=encodeRequest(data))
                    logger.debug("task: %s " %  str(result[0]))
                    logger.debug("task: %s " %  getColumn(result[0], 'tm_last_publication'))
                except Exception as ex:
                    logger.error("Error during task doc retrieving: %s" %ex)
                if last_publication_time:
                    date = oracleOutputMapping(result)['last_publication']
                    seconds = datetime.strptime(date, "%Y-%m-%d %H:%M:%S.%f").timetuple()
                    last_publication_time = time.mktime(seconds)

                msg = "Last publication time: %s." % str(last_publication_time)
                logger.debug(wfnamemsg+msg)
                # If this is the first time a publication would be done for this workflow, go
                # ahead and publish.
                if not last_publication_time:
                    self.force_publication = True
                    msg = "There was no previous publication. Will force publication."
                    logger.info(wfnamemsg+msg)
                # Otherwise...
                else:
                    last = last_publication_time
                    msg = "Last published block: %s" % (last)
                    logger.debug(wfnamemsg+msg)
                    # If the last publication was long time ago (> our block publication timeout),
                    # go ahead and publish.
                    now = int(time.time()) - time.timezone
                    time_since_last_publication = now - last
                    hours = int(time_since_last_publication/60/60)
                    minutes = int((time_since_last_publication - hours*60*60)/60)
                    timeout_hours = int(self.block_publication_timeout/60/60)
                    timeout_minutes = int((self.block_publication_timeout - timeout_hours*60*60)/60)
                    msg = "Last publication was %sh:%sm ago" % (hours, minutes)
                    if time_since_last_publication > self.block_publication_timeout:
                        self.force_publication = True
                        msg += " (more than the timeout of %sh:%sm)." % (timeout_hours, timeout_minutes)
                        msg += " Will force publication."
                    else:
                        msg += " (less than the timeout of %sh:%sm)." % (timeout_hours, timeout_minutes)
                        msg += " Not enough to force publication."
                    logger.info(wfnamemsg+msg)

        #logger.info(task[1])
        try:
            if self.force_publication:
                # - get info
                active_ = [{'key': [x['username'],
                                    x['user_group'],
                                    x['user_role'],
                                    x['taskname']],
                            'value': [x['destination'],
                                      x['source_lfn'],
                                      x['destination_lfn'],
                                      x['input_dataset'],
                                      x['dbs_url'],
                                      x['last_update']
                                     ]}
                           for x in task[1] if x['transfer_state'] == 3 and x['publication_state'] not in [2, 3, 5]]

                lfn_ready = []
                wf_jobs_endtime = []
                pnn, input_dataset, input_dbs_url = "", "", ""
                for active_file in active_:
                    job_end_time = active_file['value'][5]
                    if job_end_time and self.config.isOracle:
                        wf_jobs_endtime.append(int(job_end_time) - time.timezone)
                    elif job_end_time:
                        wf_jobs_endtime.append(int(time.mktime(time.strptime(str(job_end_time), '%Y-%m-%d %H:%M:%S'))) - time.timezone)
                    source_lfn = active_file['value'][1]
                    dest_lfn = active_file['value'][2]
                    self.lfn_map[dest_lfn] = source_lfn
                    if not pnn or not input_dataset or not input_dbs_url:
                        pnn = str(active_file['value'][0])
                        input_dataset = str(active_file['value'][3])
                        input_dbs_url = str(active_file['value'][4])
                    lfn_ready.append(dest_lfn)

                userDN = ''
                username = task[0][0]
                user_group = ""
                if task[0][1]:
                    user_group = task[0][1]
                user_role = ""
                if task[0][2]:
                    user_role = task[0][2]
                logger.debug("Trying to get DN %s %s %s" % (username, user_group, user_role))

                try:
                    userDN = getDNFromUserName(username, logger)
                except Exception as ex:
                    msg = "Error retrieving the user DN"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    logger.error(msg)
                    return 1

                # Get metadata
                toPublish = []
                publDescFiles_list = self.getPublDescFiles(workflow, lfn_ready)
                for file_ in active_:
                    for _, doc in enumerate(publDescFiles_list):
                        #logger.info(type(doc))
                        #logger.info(doc)
                        if doc["lfn"] == file_["value"][2]:
                            doc["User"] = username
                            doc["Group"] = file_["key"][1]
                            doc["Role"] = file_["key"][2]
                            doc["UserDN"] = userDN
                            doc["Destination"] = file_["value"][0]
                            doc["SourceLFN"] = file_["value"][1]
                            toPublish.append(doc)
                with open("/tmp/publisher_files/"+workflow+'.json', 'w') as outfile:
                    json.dump(toPublish, outfile)
                logger.info(". publisher.py %s" % (workflow))

                # find the location in the current environment of the script we want to run
                import Publisher.TaskPublish as tp
                taskPublishScript = tp.__file__
                subprocess.call(["python", taskPublishScript, workflow])

        except:
            logger.exception("Exception!")


        return 0
Code example #30
0
File: getcommand.py Project: qunox/CRABClient
    def __call__(self, **argv):
        # Setting default destination if -o is not provided
        if not self.dest:
            self.dest = os.path.join(self.requestarea, "results")

        # Destination is a URL.
        if re.match("^[a-z]+://", self.dest):
            if not self.dest.endswith("/"):
                self.dest += "/"
        # Creating the destination directory if necessary
        elif not os.path.exists(self.dest):
            self.logger.debug("Creating directory %s " % self.dest)
            os.makedirs(self.dest)
        elif not os.path.isdir(self.dest):
            raise ConfigurationException("Destination directory is a file")

        if self.options.dump or self.options.xroot:
            self.logger.debug("Getting url info")
        else:
            self.logger.info("Setting the destination to %s " % self.dest)

        # Retrieving output files location from the server
        self.logger.debug("Retrieving locations for task %s" % self.cachedinfo["RequestName"])
        inputlist = [("workflow", self.cachedinfo["RequestName"])]
        inputlist.extend(list(argv.iteritems()))
        if getattr(self.options, "quantity", None):
            self.logger.debug("Retrieving %s file locations" % self.options.quantity)
            inputlist.append(("limit", self.options.quantity))
        else:
            self.logger.debug("Retrieving all file locations")
            inputlist.append(("limit", -1))
        if getattr(self.options, "jobids", None):
            self.logger.debug("Retrieving jobs %s" % self.options.jobids)
            inputlist.extend(self.options.jobids)
        server = HTTPRequests(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
        dictresult, status, reason = server.get(self.uri, data=inputlist)
        self.logger.debug("Server result: %s" % dictresult)
        dictresult = self.processServerResult(dictresult)

        if status != 200:
            msg = "Problem retrieving information from the server:\ninput:%s\noutput:%s\nreason:%s" % (
                str(inputlist),
                str(dictresult),
                str(reason),
            )
            raise ConfigurationException(msg)

        totalfiles = len(dictresult["result"])
        cpresults = []
        #        for workflow in dictresult['result']: TODO re-enable this when we will have resubmissions
        workflow = dictresult["result"]  # TODO assigning workflow to dictresult. for the moment we have only one wf
        arglist = [
            "-d",
            self.dest,
            "-i",
            workflow,
            "-t",
            self.options.task,
            "-p",
            self.proxyfilename,
            "-l",
            self.options.nparallel,
            "-w",
            self.options.waittime,
        ]
        if len(workflow) > 0:
            if self.options.xroot:
                self.logger.debug("XRootD url is requested")
                for fileinfo in workflow:
                    self.logger.info("root://cms-xrd-global.cern.ch/%s" % fileinfo["lfn"])
            elif self.dump:
                for fileInfo in workflow:
                    self.logger.info(fileInfo["pfn"])
            else:
                self.logger.info("Retrieving %s files" % totalfiles)
                copyoutput = remote_copy(self.logger, arglist)
                copyoutput()

        if totalfiles == 0:
            self.logger.info("No files to retrieve")
Code example #31
0
    while maxi < 3:
        cmd = "curl -i -F file=@%s xsls.cern.ch"%xmllocation
        try:
            pu = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            break
        except Exception as e:
            logger.debug(str(e))
            maxi = maxi + 1
            continue

if __name__ == "__main__":
    server = HTTPRequests('cmsweb-testbed.cern.ch',
                          '/data/srv/asyncstageout/state/asyncstageout/creds/OpsProxy',
                          '/data/srv/asyncstageout/state/asyncstageout/creds/OpsProxy')

    result = server.get('/crabserver/preprod/filetransfers', 
                        data=encodeRequest({'subresource': 'groupedTransferStatistics', 'grouping': 0}))

    results = oracleOutputMapping(result)


    status = {'transfers':{}, 'publications':{}}
    tmp = {'transfers':{ 'DONE':0, 'ACQUIRED':0, 'SUBMITTED':0, 'FAILED':0, 'RETRY':0 }, 
           'publications':{'DONE':0, 'ACQUIRED':0, 'NEW':0, 'FAILED':0, 'RETRY':0}}

    #past = open("tmp_transfer")
    #tmp = json.load(past)

    for doc in results:
        if doc['aso_worker']=="asodciangot1":
            if 'transfers' in tmp and TRANSFERDB_STATES[doc['transfer_state']] in tmp['transfers']:
                status['transfers'][TRANSFERDB_STATES[doc['transfer_state']]] = doc['nt'] - tmp['transfers'][TRANSFERDB_STATES[doc['transfer_state']]]
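The fragment above is cut off after the first delta; below is a hedged sketch of how the full delta computation over the grouped statistics might look. The integer-to-name state mappings are assumptions (the real TRANSFERDB_STATES / PUBLICATIONDB_STATES live elsewhere in the codebase), and 'nt' is assumed to be the per-state count column returned by the grouped query for both transfers and publications.

# Assumed state mappings, for illustration only.
TRANSFERDB_STATES = {0: 'NEW', 1: 'ACQUIRED', 2: 'FAILED', 3: 'DONE', 4: 'RETRY', 5: 'SUBMITTED'}
PUBLICATIONDB_STATES = {0: 'NEW', 1: 'ACQUIRED', 2: 'FAILED', 3: 'DONE', 4: 'RETRY', 5: 'NOT_REQUIRED'}

def computeDeltas(results, previous, asoWorker="asodciangot1"):
    """Return per-state differences between the current counts and a previous snapshot."""
    status = {'transfers': {}, 'publications': {}}
    for doc in results:
        if doc.get('aso_worker') != asoWorker:
            continue
        if 'transfer_state' in doc:
            state = TRANSFERDB_STATES[doc['transfer_state']]
            if state in previous['transfers']:
                status['transfers'][state] = doc['nt'] - previous['transfers'][state]
        if 'publication_state' in doc:
            state = PUBLICATIONDB_STATES[doc['publication_state']]
            if state in previous['publications']:
                status['publications'][state] = doc['nt'] - previous['publications'][state]
    return status

previous = {'transfers': {'DONE': 0, 'ACQUIRED': 0, 'SUBMITTED': 0, 'FAILED': 0, 'RETRY': 0},
            'publications': {'DONE': 0, 'ACQUIRED': 0, 'NEW': 0, 'FAILED': 0, 'RETRY': 0}}
print(computeDeltas([{'aso_worker': 'asodciangot1', 'transfer_state': 3, 'nt': 42}], previous))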
Code example #32
0
class MasterWorker(object):
    """I am the master of the TaskWorker"""
    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg logging logger: the logger
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger."""
        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17:  #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" %
                          dirname)
                    sys.exit(1)

        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""

            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')

            if self.TEST:
                #if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/twlog.txt',
                                                when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s." % os.getpid())
            logger.debug("Logging level initialized to %s." % loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException(
                "No mode provided: need to specify config.TaskWorker.mode in the configuration"
            )
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[
                self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl  #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[
                self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException(
                "No correct mode provided: need to specify config.TaskWorker.mode in the configuration"
            )
        self.server = HTTPRequests(resthost,
                                   self.config.TaskWorker.cmscert,
                                   self.config.TaskWorker.cmskey,
                                   retry=2)
        self.logger.debug("Hostcert: %s, hostkey: %s" %
                          (str(self.config.TaskWorker.cmscert),
                           str(self.config.TaskWorker.cmskey)))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [
                retry * 20 * 2
                for retry in range(self.config.TaskWorker.max_retry)
            ]
        if not len(self.config.TaskWorker.retry_interval
                   ) == self.config.TaskWorker.max_retry:
            raise ConfigException(
                "No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry."
            )
        if self.TEST:
            self.slaves = TestWorker(self.config, resthost,
                                     self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost,
                                 self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker,
                                        'recurringActions', [])
        self.recurringActions = [
            self.getRecurringActionInst(name) for name in recurringActionsNames
        ]

    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName,
                         fromlist=actionName)
        return getattr(mod, actionName)()

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not avaialable.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {
            'subresource': 'process',
            'workername': self.config.TaskWorker.name,
            'getstatus': getstatus,
            'limit': limit,
            'status': setstatus
        }
        try:
            self.server.post(self.restURInoAPI + '/workflowdb',
                             data=urllib.urlencode(configreq))
        except HTTPException as hte:
            #Using a msg variable and only one self.logger.error so that messages do not get shuffled
            msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (
                str(hte), configreq)
            if not hte.headers.get('X-Error-Detail', '') == 'Required object is missing' or \
               not hte.headers.get('X-Error-Http', -1) == '400':
                msg += "Task Worker could not update work to the server: \n" +\
                                  "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown')) +\
                                  "\treason: %s\n" %(hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "Probably no task to be updated\n"
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                msg += "TW could not update work to the server:\n"
                msg += "%s \n" % (str(traceback.format_exc()))
                msg += "\turl: %s\n" % (getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" % (getattr(hte, 'result', 'unknown'))
            self.logger.error(msg)
        except Exception as exc:
            msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (
                str(exc), configreq)
            self.logger.error(msg + traceback.format_exc())

        return True

    def _getWork(self, limit, getstatus):
        configreq = {
            'limit': limit,
            'workername': self.config.TaskWorker.name,
            'getstatus': getstatus
        }
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb',
                                          data=configreq)[0]['result']
        except HTTPException as hte:
            self.logger.error("HTTP Error during _getWork: %s" % str(hte))
            self.logger.error("Could not get any work from the server: \n" +
                              "\tstatus: %s\n" %
                              (hte.headers.get('X-Error-Http', 'unknown')) +
                              "\treason: %s" %
                              (hte.headers.get('X-Error-Detail', 'unknown')))
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                self.logger.error(
                    "Server could not acquire any work from the server:")
                self.logger.error("%s " % (str(traceback.format_exc())))
                self.logger.error("\turl: %s\n" %
                                  (getattr(hte, 'url', 'unknown')))
                self.logger.error("\tresult: %s\n" %
                                  (getattr(hte, 'result', 'unknown')))
        except Exception as exc:
            self.logger.error("Server could not process the request: %s" %
                              (str(exc)))
            self.logger.error(traceback.format_exc())
        return pendingwork

    def quit(self, code, traceback_):
        self.logger.info("Received kill request. Waiting for the workers...")
        self.STOP = True

    def updateWork(self, taskname, status):
        configreq = {
            'workflow': taskname,
            'status': status,
            'subresource': 'state'
        }
        retry = True
        while retry:
            try:
                self.server.post(self.restURInoAPI + '/workflowdb',
                                 data=urllib.urlencode(configreq))
                retry = False
            except HTTPException as hte:
                #Using a msg variable and only one self.logger.error so that messages do not get shuffled
                msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (
                    str(hte), configreq)
                msg += "\tstatus: %s\n" % (hte.headers.get(
                    'X-Error-Http', 'unknown'))
                msg += "\treason: %s\n" % (hte.headers.get(
                    'X-Error-Detail', 'unknown'))
                msg += "\turl: %s\n" % (getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" % (getattr(hte, 'result', 'unknown'))
                msg += "%s \n" % (str(traceback.format_exc()))
                self.logger.error(msg)
                retry = False
                if int(hte.headers.get('X-Error-Http', '0')) == 503:
                    #503 - Database/Service unavailable. Maybe Intervention of CMSWEB ongoing?
                    retry = True
                    time_sleep = 30 + random.randint(10, 30)
                    self.logger.info(
                        "Sleeping %s seconds and will try to update again." %
                        str(time_sleep))
                    time.sleep(time_sleep)
            except Exception as exc:
                msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (
                    str(exc), configreq)
                self.logger.error(msg + traceback.format_exc())
                retry = False

    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""

        self.logger.debug("Starting")
        while (not self.STOP):
            for status, worktype, failstatus in states():
                limit = self.slaves.queueableTasks()
                if not self._lockWork(
                        limit=limit, getstatus=status, setstatus='HOLDING'):
                    continue
                ## Warning: If we fail to retrieve tasks on HOLDING (e.g. because cmsweb is down)
                ## we may end up executing the wrong worktype later on. A solution would be to
                ## save the previous task state in a new column of the TaskDB.
                pendingwork = self._getWork(limit=limit, getstatus='HOLDING')
                self.logger.info("Retrieved a total of %d %s works" %
                                 (len(pendingwork), worktype))
                self.logger.debug("Retrieved the following works: \n%s" %
                                  (str(pendingwork)))
                self.slaves.injectWorks([(worktype, task, failstatus, None)
                                         for task in pendingwork])
                for task in pendingwork:
                    self.updateWork(task['tm_taskname'], 'QUEUED')

            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s" %
                                      (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {
                        'tm_taskname': action.__module__
                    }, 'FAILED', action.__module__)])

            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d' % self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d' %
                             self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d' %
                             self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)

            finished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle")
Code example #33
0
class FileTransfersTest(unittest.TestCase):
    """
    _FileTransfersTest_

    Unit tests for the FileTransfers API
    """
    def setUp(self):
        """
        Setup for unit tests
        """
        self.server = HTTPRequests(os.environ['SERVER_HOST'],
                                   os.environ['X509_USER_PROXY'],
                                   os.environ['X509_USER_PROXY'])
        self.lfnBase = '/store/temp/user/%s/my_cool_dataset-%s/file-%s-%s.root'
        self.fileDoc = {
            'id': 'OVERWRITE',
            'username': '******',
            'taskname': 'OVERWRITE',
            'start_time': 0,
            'destination': 'T2_CH_CERN',
            'destination_lfn': 'OVERWRITE',
            'source': 'T2_US_Caltech',
            'source_lfn': 'OVERWRITE',
            'filesize': random.randint(1, 9999),
            'publish': 1,
            'transfer_state': 'OVERWRITE',
            'publication_state': 'OVERWRITE',
            'job_id': 1,
            'job_retry_count': 0,
            'type': 'log',
            'rest_host': 'cmsweb.cern.ch',
            'rest_uri': '/crabserver/prod/'
        }
        self.ids = []
        self.users = [
            'jbalcas', 'mmascher', 'dciangot', 'riahi', 'erupeika', 'sbelforte'
        ]  # just random users for tests
        self.tasks = {}
        self.totalFiles = 10

    def testFileTransferPUT(self):
        """
        _testFileTransferPUT_

        Just test simple testFileTransferPUT with fake data
        """
        # We just send fake data which is not monitored by dashboard.
        # Also, only the first file of each user decides whether publication is ON or NOT.
        for user in self.users:
            timestamp = time.strftime('%y%m%d_%H%M%S', time.gmtime())
            for i in range(self.totalFiles):
                now = int(time.time())
                # Generate a taskname
                workflowName = ""
                taskname = ""
                if user not in self.tasks:
                    workflowName = "".join([
                        random.choice(string.ascii_lowercase)
                        for _ in range(20)
                    ]) + "_" + str(now)
                    publicationState = random.choice(['NEW', 'NOT_REQUIRED'])
                else:
                    workflowName = self.tasks[user]['workflowName']
                    publicationState = self.tasks[user]['publication']
                transferState = random.choice(['NEW', 'DONE'])
                taskname = generateTaskName(user, workflowName, timestamp)
                finalLfn = self.lfnBase % (user, workflowName, i,
                                           random.randint(1, 9999))
                idHash = getHashLfn(finalLfn)
                self.fileDoc['id'] = idHash
                self.fileDoc['job_id'] = i
                self.fileDoc['username'] = user
                self.fileDoc['taskname'] = taskname
                self.fileDoc['start_time'] = int(time.time())
                self.fileDoc['source_lfn'] = finalLfn
                self.fileDoc['destination_lfn'] = finalLfn
                self.fileDoc['transfer_state'] = transferState
                self.fileDoc['publication_state'] = publicationState
                print(self.fileDoc)
                self.server.put('/crabserver/dev/fileusertransfers',
                                data=encodeRequest(self.fileDoc))
                # if I will put the same doc twice, it should raise an error.
                # self.server.put('/crabserver/dev/fileusertransfers', data=urllib.urlencode(self.fileDoc))
                # These tasks are kept for the following calls
                if user not in self.tasks:
                    self.tasks[user] = {
                        'workflowName': workflowName,
                        'taskname': taskname,
                        'listOfIds': [],
                        'publication': publicationState,
                        'toTransfer': 0,
                        'toPublish': 0,
                        'total': self.totalFiles
                    }
                if self.tasks[user]['publication'] == 'NEW':
                    self.tasks[user]['toPublish'] += 1
                if transferState == 'NEW':
                    self.tasks[user]['toTransfer'] += 1
                self.tasks[user]['listOfIds'].append(idHash)
        # This should raise an error
        for username in self.tasks:
            taskname = self.tasks[username]['taskname']
            for query in ['getTransferStatus', 'getPublicationStatus']:
                result = self.server.get('/crabserver/dev/fileusertransfers',
                                         data=encodeRequest({
                                             'subresource':
                                             query,
                                             'username':
                                             username,
                                             'taskname':
                                             taskname
                                         }))
                print(result)
                print(result[0]['result'])
                taskInfoDict = oracleOutputMapping(result, 'id')
                print(taskInfoDict)
                for key, docDict in taskInfoDict.items():
                    result = self.server.get(
                        '/crabserver/dev/fileusertransfers',
                        data=encodeRequest({
                            'subresource': 'getById',
                            'id': key
                        }))

        randomUsers = random.sample(
            set(self.users), 3
        )  # Take half of the users and kill their transfers for specific task
        for username in randomUsers:
            taskname = self.tasks[username]['taskname']
            result = self.server.post('/crabserver/dev/fileusertransfers',
                                      data=encodeRequest({
                                          'subresource': 'killTransfers',
                                          'username': username,
                                          'taskname': taskname
                                      }))
            print(result)
        # oneUser is left for killing a list of IDs
        # leftUsers will be killing transfers one by one for specific id.
        leftUsers = list(set(self.users) - set(randomUsers))
        oneUser = random.sample(set(leftUsers), 1)
        leftUsers = list(set(leftUsers) - set(oneUser))
        for username in leftUsers:
            # First get all remaining ids for this user
            result = self.server.get('/crabserver/dev/fileusertransfers',
                                     data=encodeRequest({
                                         'subresource':
                                         'getTransferStatus',
                                         'username':
                                         username,
                                         'taskname':
                                         self.tasks[username]['taskname']
                                     }))
            resultOut = oracleOutputMapping(result, None)
            print("**" * 50)
            for outDict in resultOut:
                print(outDict)
                result = self.server.post('/crabserver/dev/fileusertransfers',
                                          data=encodeRequest({
                                              'subresource':
                                              'killTransfersById',
                                              'username':
                                              username,
                                              'listOfIds':
                                              outDict['id']
                                          }))
                print(result)
            print(resultOut)
            print(result)
        for username in oneUser:
            result = self.server.post(
                '/crabserver/dev/fileusertransfers',
                data=encodeRequest(
                    {
                        'subresource': 'killTransfersById',
                        'username': username,
                        'listOfIds': self.tasks[username]['listOfIds']
                    }, ['listOfIds']))
            # As it asks to kill all transfers which are in NEW, we need to double-check what we submitted before and whether the kill output is correct
            print(result)
            print(self.tasks[username])
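Condensed, hedged sketch of the request-building pattern used in the test above: a document id derived from the LFN, a task name built from user/workflow/timestamp, and encodeRequest called with an extra argument when a key carries a list (as in the killTransfersById call). The import location is an assumption; the call signatures mirror the ones in the test.

from ServerUtilities import encodeRequest, getHashLfn, generateTaskName  # assumed module path

user = "someuser"
workflowName = "my_fake_workflow_1700000000"
taskname = generateTaskName(user, workflowName, "200101_120000")  # timestamp in %y%m%d_%H%M%S format
lfn = "/store/temp/user/%s/my_cool_dataset-%s/file-0-1234.root" % (user, workflowName)

doc = {"id": getHashLfn(lfn), "username": user, "taskname": taskname,
       "source_lfn": lfn, "destination_lfn": lfn}
payload = encodeRequest(doc)                      # scalar values only

killDoc = {"subresource": "killTransfersById", "username": user,
           "listOfIds": [getHashLfn(lfn)]}
payload = encodeRequest(killDoc, ["listOfIds"])   # declare the list-valued key, as the test does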
Code example #34
0
def publishInDBS3(taskname):
    """

    """
    def createLogdir(dirname):
        """
        Create the directory dirname ignoring errors in case it exists. Exit if
        the directory cannot be created.
        """
        try:
            os.mkdir(dirname)
        except OSError as ose:
            if ose.errno != 17: #ignore the "Directory already exists error"
                print(str(ose))
                print("The task worker need to access the '%s' directory" % dirname)
                sys.exit(1)
    createLogdir('taskLogs')

    logger = logging.getLogger(taskname)
    logging.basicConfig(filename='taskLogs/'+taskname+'.log', level=logging.INFO, format=config.General.logMsgFormat)

    logger.info("Getting files to publish")

    toPublish = []
    # TODO move from new to done when processed
    with open("/tmp/"+taskname+".json") as f:
        toPublish = json.load(f)

    workflow = taskname

    if len(toPublish) == 0:
        return "EMPTY"

    if not workflow:
        logger.info("NO TASKNAME: %s" % toPublish[0])
    for k, v in toPublish[0].iteritems():
        if k == 'taskname':
            logger.info("Starting: %s: %s" % (k, v))
    wfnamemsg = "%s: " % (workflow)

    user = toPublish[0]["User"]
    try:
        group = toPublish[0]["Group"]
        role = toPublish[0]["Role"]
    except:
        group = ""
        role = ""

    if not group or group in ['null']:
        group = ""
    if not role or role in ['null']:
        role = ""

    userDN = toPublish[0]["UserDN"]
    pnn = toPublish[0]["Destination"]
    logger.info(wfnamemsg+" "+user)

    READ_PATH = "/DBSReader"
    READ_PATH_1 = "/DBSReader/"


    # TODO: get user role and group
    try:
        proxy = Proxy(userDN, group, role, logger)
    except:
        logger.exception("Failed to retrieve user proxy")
        return "FAILED"

    oracleInstance = config.General.oracleDB
    oracleDB = HTTPRequests(oracleInstance,
                            proxy,
                            proxy)

    fileDoc = dict()
    fileDoc['subresource'] = 'search'
    fileDoc['workflow'] = workflow

    try:
        results = oracleDB.get(task_path,
                               data=encodeRequest(fileDoc))
    except Exception as ex:
        logger.error("Failed to get acquired publications from oracleDB for %s: %s" % (workflow, ex))
        return "FAILED"

    logger.info(results[0]['desc']['columns'])

    try:
        inputDatasetIndex = results[0]['desc']['columns'].index("tm_input_dataset")
        inputDataset = results[0]['result'][inputDatasetIndex]
        sourceURLIndex = results[0]['desc']['columns'].index("tm_dbs_url")
        sourceURL = results[0]['result'][sourceURLIndex]
        publish_dbs_urlIndex = results[0]['desc']['columns'].index("tm_publish_dbs_url")
        publish_dbs_url = results[0]['result'][publish_dbs_urlIndex]

        #sourceURL = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
        if not sourceURL.endswith(READ_PATH) and not sourceURL.endswith(READ_PATH_1):
            sourceURL += READ_PATH


    except Exception:
        logger.exception("ERROR")
    ## When looking up parents may need to look in global DBS as well.
    globalURL = sourceURL
    globalURL = globalURL.replace('phys01', 'global')
    globalURL = globalURL.replace('phys02', 'global')
    globalURL = globalURL.replace('phys03', 'global')
    globalURL = globalURL.replace('caf', 'global')

    pr = os.environ.get("SOCKS5_PROXY")
    logger.info(wfnamemsg+"Source API URL: %s" % sourceURL)
    sourceApi = dbsClient.DbsApi(url=sourceURL, proxy=pr)
    logger.info(wfnamemsg+"Global API URL: %s" % globalURL)
    globalApi = dbsClient.DbsApi(url=globalURL, proxy=pr)

    WRITE_PATH = "/DBSWriter"
    MIGRATE_PATH = "/DBSMigrate"
    READ_PATH = "/DBSReader"

    if publish_dbs_url.endswith(WRITE_PATH):
        publish_read_url = publish_dbs_url[:-len(WRITE_PATH)] + READ_PATH
        publish_migrate_url = publish_dbs_url[:-len(WRITE_PATH)] + MIGRATE_PATH
    else:
        publish_migrate_url = publish_dbs_url + MIGRATE_PATH
        publish_read_url = publish_dbs_url + READ_PATH
        publish_dbs_url += WRITE_PATH

    try:
        logger.debug(wfnamemsg+"Destination API URL: %s" % publish_dbs_url)
        destApi = dbsClient.DbsApi(url=publish_dbs_url, proxy=pr)
        logger.debug(wfnamemsg+"Destination read API URL: %s" % publish_read_url)
        destReadApi = dbsClient.DbsApi(url=publish_read_url, proxy=pr)
        logger.debug(wfnamemsg+"Migration API URL: %s" % publish_migrate_url)
        migrateApi = dbsClient.DbsApi(url=publish_migrate_url, proxy=pr)
    except:
        logger.exception('Wrong DBS URL %s' % publish_dbs_url)
        return "FAILED"

    logger.info("inputDataset: %s" % inputDataset)
    noInput = len(inputDataset.split("/")) <= 3

    # TODO: fix dbs dep
    if not noInput:
        try:
            existing_datasets = sourceApi.listDatasets(dataset=inputDataset, detail=True, dataset_access_type='*')
            primary_ds_type = existing_datasets[0]['primary_ds_type']
            # There's little chance this is correct, but it's our best guess for now.
            # CRAB2 uses 'crab2_tag' for all cases
            existing_output = destReadApi.listOutputConfigs(dataset=inputDataset)
        except:
            logger.exception('Wrong DBS URL %s' % publish_dbs_url)
            return "FAILED"
        if not existing_output:
            msg = "Unable to list output config for input dataset %s." % (inputDataset)
            logger.error(wfnamemsg+msg)
            global_tag = 'crab3_tag'
        else:
            global_tag = existing_output[0]['global_tag']
    else:
        msg = "This publication appears to be for private MC."
        logger.info(wfnamemsg+msg)
        primary_ds_type = 'mc'
        global_tag = 'crab3_tag'

    acquisition_era_name = "CRAB"
    processing_era_config = {'processing_version': 1, 'description': 'CRAB3_processing_era'}

    appName = 'cmsRun'
    appVer = toPublish[0]["swversion"]
    pset_hash = toPublish[0]['publishname'].split("-")[-1]
    gtag = str(toPublish[0]['globaltag'])
    if gtag == "None":
        gtag = global_tag
    try:
        if toPublish[0]['acquisitionera'] and not toPublish[0]['acquisitionera'] in ["null"]:
            acquisitionera = str(toPublish[0]['acquisitionera'])
        else:
            acquisitionera = acquisition_era_name
    except:
        acquisitionera = acquisition_era_name

    _, primName, procName, tier = toPublish[0]['outdataset'].split('/')

    primds_config = {'primary_ds_name': primName, 'primary_ds_type': primary_ds_type}
    msg = "About to insert primary dataset: %s" % (str(primds_config))
    logger.debug(wfnamemsg+msg)
    destApi.insertPrimaryDataset(primds_config)
    msg = "Successfully inserted primary dataset %s." % (primName)
    logger.debug(wfnamemsg+msg)

    final = {}
    failed = []
    publish_in_next_iteration = []
    published = []

    dataset = toPublish[0]['outdataset']
    # Find all (valid) files already published in this dataset.
    try:
        existingDBSFiles = destReadApi.listFiles(dataset=dataset, detail=True)
        existingFiles = [f['logical_file_name'] for f in existingDBSFiles]
        existingFilesValid = [f['logical_file_name'] for f in existingDBSFiles if f['is_file_valid']]
        msg = "Dataset %s already contains %d files" % (dataset, len(existingFiles))
        msg += " (%d valid, %d invalid)." % (len(existingFilesValid), len(existingFiles) - len(existingFilesValid))
        logger.info(wfnamemsg+msg)
        final['existingFiles'] = len(existingFiles)
    except Exception as ex:
        msg = "Error when listing files in DBS: %s" % (str(ex))
        msg += "\n%s" % (str(traceback.format_exc()))
        logger.error(wfnamemsg+msg)
        return "FAILED"

    # check if actions are needed
    workToDo = False

    for fileTo in toPublish:
        if fileTo['lfn'] not in existingFilesValid:
            workToDo = True

    if not workToDo:
        msg = "Nothing uploaded, %s has these files already or not enough files." % (dataset)
        logger.info(wfnamemsg+msg)
        return "NOTHING TO DO"

    acquisition_era_config = {'acquisition_era_name': acquisitionera, 'start_date': 0}

    output_config = {'release_version': appVer,
                     'pset_hash': pset_hash,
                     'app_name': appName,
                     'output_module_label': 'o',
                     'global_tag': global_tag,
                    }
    msg = "Published output config."
    logger.debug(wfnamemsg+msg)

    dataset_config = {'dataset': dataset,
                      'processed_ds_name': procName,
                      'data_tier_name': tier,
                      'acquisition_era_name': acquisitionera,
                      'dataset_access_type': 'VALID',
                      'physics_group_name': 'CRAB3',
                      'last_modification_date': int(time.time()),
                     }
    msg = "About to insert dataset: %s" % (str(dataset_config))
    logger.info(wfnamemsg+msg)
    del dataset_config['acquisition_era_name']

    # List of all files that must (and can) be published.
    dbsFiles = []
    dbsFiles_f = []
    # Set of all the parent files from all the files requested to be published.
    parentFiles = set()
    # Set of parent files for which the migration to the destination DBS instance
    # should be skipped (because they were not found in DBS).
    parentsToSkip = set()
    # Set of parent files to migrate from the source DBS instance
    # to the destination DBS instance.
    localParentBlocks = set()
    # Set of parent files to migrate from the global DBS instance
    # to the destination DBS instance.
    globalParentBlocks = set()

    # Loop over all files to publish.
    for file_ in toPublish:
        logger.info(file_)
        # Check if this file was already published and if it is valid.
        if file_['lfn'] not in existingFilesValid:
            # We have a file to publish.
            # Get the parent files and for each parent file do the following:
            # 1) Add it to the list of parent files.
            # 2) Find the block to which it belongs and insert that block name in
            #    (one of) the set of blocks to be migrated to the destination DBS.
            for parentFile in list(file_['parents']):
                if parentFile not in parentFiles:
                    parentFiles.add(parentFile)
                    # Is this parent file already in the destination DBS instance?
                    # (If yes, then we don't have to migrate this block.)
                    blocksDict = destReadApi.listBlocks(logical_file_name=parentFile)
                    if not blocksDict:
                        # No, this parent file is not in the destination DBS instance.
                        # Maybe it is in the same DBS instance as the input dataset?
                        blocksDict = sourceApi.listBlocks(logical_file_name=parentFile)
                        if blocksDict:
                            # Yes, this parent file is in the same DBS instance as the input dataset.
                            # Add the corresponding block to the set of blocks from the source DBS
                            # instance that have to be migrated to the destination DBS.
                            localParentBlocks.add(blocksDict[0]['block_name'])
                        else:
                            # No, this parent file is not in the same DBS instance as input dataset.
                            # Maybe it is in global DBS instance?
                            blocksDict = globalApi.listBlocks(logical_file_name=parentFile)
                            if blocksDict:
                                # Yes, this parent file is in global DBS instance.
                                # Add the corresponding block to the set of blocks from global DBS
                                # instance that have to be migrated to the destination DBS.
                                globalParentBlocks.add(blocksDict[0]['block_name'])
                    # If this parent file is not in the destination DBS instance, is not
                    # the source DBS instance, and is not in global DBS instance, then it
                    # means it is not known to DBS and therefore we can not migrate it.
                    # Put it in the set of parent files for which migration should be skipped.
                    if not blocksDict:
                        parentsToSkip.add(parentFile)
                # If this parent file should not be migrated because it is not known to DBS,
                # we remove it from the list of parents in the file-to-publish info dictionary
                # (so that when publishing, this "parent" file will not appear as a parent).
                if parentFile in parentsToSkip:
                    msg = "Skipping parent file %s, as it doesn't seem to be known to DBS." % (parentFile)
                    logger.info(wfnamemsg+msg)
                    if parentFile in file_['parents']:
                        file_['parents'].remove(parentFile)
            # Add this file to the list of files to be published.
            dbsFiles.append(format_file_3(file_))
            dbsFiles_f.append(file_)
        #print file
        published.append(file_['SourceLFN'])
    # Print a message with the number of files to publish.
    msg = "Found %d files not already present in DBS which will be published." % (len(dbsFiles))
    logger.info(wfnamemsg+msg)

    # If there are no files to publish, continue with the next dataset.
    if len(dbsFiles_f) == 0:
        msg = "Nothing to do for this dataset."
        logger.info(wfnamemsg+msg)
        return "NOTHING TO DO"

    # Migrate parent blocks before publishing.
    # First migrate the parent blocks that are in the same DBS instance
    # as the input dataset.
    if localParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (sourceApi.url, localParentBlocks)
        logger.info(wfnamemsg+msg)
        statusCode, failureMsg = migrateByBlockDBS3(workflow,
                                                    migrateApi,
                                                    destReadApi,
                                                    sourceApi,
                                                    inputDataset,
                                                    localParentBlocks
                                                   )
        if statusCode:
            failureMsg += " Not publishing any files."
            logger.info(wfnamemsg+failureMsg)
            failed.extend([f['SourceLFN'] for f in dbsFiles_f])
            failure_reason = failureMsg
            published = [x for x in published if x not in failed]
            return "NOTHING TO DO"
    # Then migrate the parent blocks that are in the global DBS instance.
    if globalParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (globalApi.url, globalParentBlocks)
        logger.info(wfnamemsg+msg)
        statusCode, failureMsg = migrateByBlockDBS3(workflow, migrateApi, destReadApi, globalApi, inputDataset, globalParentBlocks)
        if statusCode:
            failureMsg += " Not publishing any files."
            logger.info(wfnamemsg+failureMsg)
            failed.extend([f['SourceLFN'] for f in dbsFiles_f])
            failure_reason = failureMsg
            published = [x for x in published if x not in failed]
            return "NOTHING TO DO"
    # Publish the files in blocks. The blocks must have exactly max_files_per_block
    # files, unless there are fewer than max_files_per_block files to publish to
    # begin with. If there are more than max_files_per_block files to publish,
    # publish as many blocks as possible and leave the tail of files for the next
    # PublisherWorker call, unless forced to publish.
    block_count = 0
    count = 0
    max_files_per_block = config.General.max_files_per_block
    while True:
        block_name = "%s#%s" % (dataset, str(uuid.uuid4()))
        files_to_publish = dbsFiles[count:count+max_files_per_block]
        try:
            block_config = {'block_name': block_name, 'origin_site_name': pnn, 'open_for_writing': 0}
            msg = "Inserting files %s into block %s." % ([f['logical_file_name']
                                                          for f in files_to_publish], block_name)
            logger.info(wfnamemsg+msg)
            blockDump = createBulkBlock(output_config, processing_era_config,
                                        primds_config, dataset_config,
                                        acquisition_era_config, block_config, files_to_publish)
            #logger.debug(wfnamemsg+"Block to insert: %s\n %s" % (blockDump, destApi.__dict__ ))

            destApi.insertBulkBlock(blockDump)
            block_count += 1
        except Exception as ex:
            logger.error("Error for files: %s" % [f['SourceLFN'] for f in toPublish])
            failed.extend([f['SourceLFN'] for f in toPublish])
            msg = "Error when publishing (%s) " % ", ".join(failed)
            msg += str(ex)
            msg += str(traceback.format_exc())
            logger.error(wfnamemsg+msg)
            failure_reason = str(ex)
        count += max_files_per_block
        files_to_publish_next = dbsFiles_f[count:count+max_files_per_block]
        if len(files_to_publish_next) < max_files_per_block:
            publish_in_next_iteration.extend([f["SourceLFN"] for f in files_to_publish_next])
            break
    published = [x for x in published if x not in failed + publish_in_next_iteration]
    # Fill number of files/blocks published for this dataset.
    final['files'] = len(dbsFiles) - len(failed) - len(publish_in_next_iteration)
    final['blocks'] = block_count
    # Print a publication status summary for this dataset.
    msg = "End of publication status for dataset %s:" % (dataset)
    msg += " failed (%s) %s" % (len(failed), failed)
    msg += ", published (%s) %s" % (len(published), published)
    msg += ", publish_in_next_iteration (%s) %s" % (len(publish_in_next_iteration),
                                                    publish_in_next_iteration)
    msg += ", results %s" % (final)
    logger.info(wfnamemsg+msg)

    try:
        if published:
            mark_good(workflow, published, oracleDB, logger)
        if failed:
            logger.debug("Failed files: %s " % failed)
            mark_failed(failed, oracleDB, logger, failure_reason)
    except:
        logger.exception("Status update failed")

    return 0
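Standalone sketch of the chunking policy described in the comments above: publish only full blocks of max_files_per_block files and defer a trailing partial block to the next Publisher iteration, unless there were fewer than max_files_per_block files to begin with or one is forced to publish. Purely illustrative; the real loop also builds and inserts the DBS block dumps.

def splitIntoBlocks(files, max_files_per_block, force=False):
    """Return (blocksToPublish, leftForNextIteration) following the policy above (sketch)."""
    full = [files[i:i + max_files_per_block]
            for i in range(0, len(files), max_files_per_block)
            if len(files[i:i + max_files_per_block]) == max_files_per_block]
    tail = files[len(full) * max_files_per_block:]
    if tail and (force or not full):
        full.append(tail)   # fewer than max_files_per_block files in total, or forced: publish them anyway
        tail = []
    return full, tail

blocks, leftForNext = splitIntoBlocks(list(range(23)), 10)
print([len(b) for b in blocks], len(leftForNext))   # [10, 10] 3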
Code example #36
0
class Master(object):
    """I am the main daemon kicking off all Publisher work via slave Publishers"""
    def __init__(self, confFile=None, quiet=False, debug=True, testMode=False):
        """
        Initialise class members

        :arg WMCore.Configuration config: input Publisher configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger
        :arg bool testMode: it tells if to run in test (no subprocesses) mode.
        """
        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.makedirs(dirname)
            except OSError as ose:
                if ose.errno != 17:  #ignore the "Directory already exists error"
                    print(str(ose))
                    print(
                        "The Publisher Worker needs to access the '%s' directory"
                        % dirname)
                    sys.exit(1)

        self.configurationFile = confFile  # remember this, will have to pass it to TaskPublish
        config = loadConfigurationFile(confFile)
        self.config = config.General
        self.TPconfig = config.TaskPublisher

        # these are used for talking to DBS
        os.putenv('X509_USER_CERT', self.config.serviceCert)
        os.putenv('X509_USER_KEY', self.config.serviceKey)
        self.block_publication_timeout = self.config.block_closure_timeout
        self.lfn_map = {}
        self.force_publication = False
        self.force_failure = False
        self.TestMode = testMode
        self.taskFilesDir = self.config.taskFilesDir
        createLogdir(self.taskFilesDir)

        def setRootLogger(logsDir, quiet=False, debug=True, console=False):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :arg bool console: it tells whether to direct all printout to the console rather than files, useful for debugging
            :return logger: a logger with the appropriate logger level."""

            createLogdir(logsDir)
            createLogdir(logsDir + '/processes')
            createLogdir(logsDir + '/tasks')

            if console:
                # if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog(logsDir + '/log.txt',
                                                when='midnight')
                logFormatter = logging.Formatter(
                    "%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s"
                )
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setMasterLogger()
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        self.logger = setRootLogger(self.config.logsDir,
                                    quiet=quiet,
                                    debug=debug,
                                    console=self.TestMode)

        from WMCore.Credential.Proxy import Proxy
        proxy = Proxy({'logger': self.logger})
        from ServerUtilities import tempSetLogLevel
        with tempSetLogLevel(self.logger, logging.ERROR):
            self.myDN = proxy.getSubjectFromCert(
                certFile=self.config.serviceCert)

        # CRABServer REST API's (see CRABInterface)
        try:
            instance = self.config.instance
        except:
            msg = "No instance provided: need to specify config.General.instance in the configuration"
            raise ConfigException(msg)

        if instance in SERVICE_INSTANCES:
            self.logger.info('Will connect to CRAB service: %s', instance)
            restHost = SERVICE_INSTANCES[instance]['restHost']
            dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
        else:
            msg = "Invalid instance value '%s'" % instance
            raise ConfigException(msg)
        if instance == 'other':
            self.logger.info(
                'Will use restHost and dbInstance from config file')
            try:
                restHost = self.config.restHost
                dbInstance = self.config.dbInstance
            except:
                msg = "Need to specify config.General.restHost and dbInstance in the configuration"
                raise ConfigException(msg)

        restURInoAPI = '/crabserver/' + dbInstance
        self.logger.info(
            'Will connect to CRAB Data Base via URL: https://%s%s', restHost,
            restURInoAPI)

        # CRAB REST API's
        self.REST_filetransfers = restURInoAPI + '/filetransfers'
        #self.REST_usertransfers = restURInoAPI +  '/fileusertransfers'
        self.REST_filemetadata = restURInoAPI + '/filemetadata'
        self.REST_workflow = restURInoAPI + '/workflow'
        self.REST_task = restURInoAPI + '/task'
        self.max_files_per_block = self.config.max_files_per_block
        self.crabServer = HTTPRequests(url=restHost,
                                       localcert=self.config.serviceCert,
                                       localkey=self.config.serviceKey,
                                       retry=3)

        #try:
        #    self.connection = RequestHandler(config={'timeout': 900, 'connecttimeout' : 900})
        #except Exception as ex:
        #    msg = "Error initializing the connection handler"
        #    msg += str(ex)
        #    msg += str(traceback.format_exc())
        #    self.logger.debug(msg)

    def active_tasks(self, db):
        """
        :param db: HTTPRequests object used to access the proper REST interface, as created in the __init__ method
        :return: a list of tuples [(task, info)], one element per task which has files to be published.
                 task is a list of 4 strings: [username, usergroup, userrole, taskname]
                 (a subset of the information present in each element of the next list);
                 info is a list of dictionaries [filedic1, filedic2, ...], one per file, with keys:
                   'username', 'cache_url', 'source_lfn', 'publication_state', 'destination',
                   'user_role', 'last_update', 'input_dataset', 'dbs_url', 'aso_worker',
                   'user_group', 'taskname', 'transfer_state', 'destination_lfn'
        """

        self.logger.debug("Retrieving publications from oracleDB")
        uri = self.REST_filetransfers
        fileDoc = {}
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'acquirePublication'
        data = encodeRequest(fileDoc)
        results = ''
        try:
            results = db.post(uri=uri, data=data)
        except Exception as ex:
            self.logger.error("Failed to acquire publications from %s: %s",
                              uri, ex)
            return []

        self.logger.debug(
            "Retrieving max.100000 acquired publications from oracleDB")
        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'acquiredPublication'
        fileDoc['grouping'] = 0
        fileDoc['limit'] = 100000
        result = []
        uri = self.REST_filetransfers
        data = encodeRequest(fileDoc)

        try:
            results = db.get(uri=uri, data=data)
            result.extend(oracleOutputMapping(results))
        except Exception as ex:
            self.logger.error("Failed to acquire publications from %s: %s",
                              uri, ex)
            return []

        self.logger.debug("publen: %s", len(result))

        self.logger.debug("%s acquired publications retrieved", len(result))

        # TODO: join query for publisher (same as submitter)
        unique_tasks = [
            list(i) for i in set(
                tuple([
                    x['username'], x['user_group'], x['user_role'],
                    x['taskname']
                ]) for x in result if x['transfer_state'] == 3)
        ]

        info = []
        for task in unique_tasks:
            info.append([x for x in result if x['taskname'] == task[3]])
        return list(zip(unique_tasks, info))
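
    # A minimal usage sketch of the structure returned above (field values and
    # names in the list comprehension are purely illustrative):
    #
    #   for task, info in self.active_tasks(self.crabServer):
    #       username, usergroup, userrole, taskname = task
    #       readyLfns = [f['destination_lfn'] for f in info if f['transfer_state'] == 3]
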

    def getPublDescFiles(self, workflow, lfn_ready, logger):
        """
        Download and read the files describing
        what needs to be published
        """
        dataDict = {}
        dataDict['taskname'] = workflow
        dataDict['filetype'] = 'EDM'

        out = []

        # divide lfn per chunks, avoiding URI-too long exception
        def chunks(l, n):
            """
            Yield successive n-sized chunks from l.
            :param l: list to split into chunks
            :param n: chunk size
            :return: yield the next list chunk
            """
            for i in range(0, len(l), n):
                yield l[i:i + n]

        chunkSize = 10
        nIter = 0
        if len(lfn_ready) > chunkSize:
            logger.info(
                "retrieving input file metadata for %s files in chunks of %s",
                len(lfn_ready), chunkSize)
        for lfn_ in chunks(lfn_ready, chunkSize):
            nIter += 1
            dataDict['lfn'] = lfn_
            data = encodeRequest(dataDict, listParams=["lfn"])
            uri = self.REST_filemetadata

            try:
                #res = self.crabServer.get(uri=uri, data=encodeRequest(data, listParams=["lfn"]))
                res = self.crabServer.get(uri=uri, data=data)
                res = res[0]
            except Exception as ex:
                logger.error("Error during metadata retrieving from %s: %s",
                             uri, ex)
                continue

            # print(len(res['result']))
            for obj in res['result']:
                if isinstance(obj, dict):
                    out.append(obj)
                else:
                    # print type(obj)
                    out.append(json.loads(str(obj)))
            if nIter % 10 == 0:
                logger.info("... retrieved %s metadata", len(out))

        return out
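
    # A quick illustration of what the chunks() helper above yields (toy input,
    # not real LFNs):
    #
    #   list(chunks(['lfn1', 'lfn2', 'lfn3', 'lfn4', 'lfn5'], 2))
    #   # -> [['lfn1', 'lfn2'], ['lfn3', 'lfn4'], ['lfn5']]
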

    def algorithm(self):
        """
        1. Get from the REST a list of tasks which have files to publish
        2. For each task prepare a suitably sized input for publication
        3. Hand each task to a slave subprocess which runs TaskPublish
        """
        tasks = self.active_tasks(self.crabServer)

        # example code, uncomment to pick only one task
        # myTask = '180912_142016:arizzi_crab_NanoDnRMXDYJetsToLL_M-105To160_TuneCUETP8M1_13TeV-amcaRunIISummer16MiniAODv2-PUMorio__heIV_v6-v22201'
        # tasksToDo = []
        # for t in tasks:
        #     if t[0][3] == myTask:
        #         tasksToDo.append(t)
        # tasks = tasksToDo

        maxSlaves = self.config.max_slaves
        self.logger.info('kicking off pool for %s tasks in batches of %s',
                         len(tasks), maxSlaves)
        self.logger.debug('list of tasks %s', [x[0][3] for x in tasks])
        processes = []

        try:
            for task in tasks:
                taskname = str(task[0][3])
                if int(taskname[0:4]) < 2007:
                    self.logger.info(
                        "Skipped %s. Ignore tasks created before July 2020.",
                        taskname)
                    continue
                if self.TestMode:
                    self.startSlave(
                        task)  # sequentially do one task after another
                else:  # deal with each task in a separate process
                    p = Process(target=self.startSlave, args=(task, ))
                    p.start()
                    self.logger.info('Starting process %s  pid=%s', p, p.pid)
                    processes.append(p)
                if len(processes) == maxSlaves:
                    # wait until a batch of maxSlaves processes have completed before starting more
                    for proc in processes:
                        proc.join()
                    self.logger.info('All processes in the batch have ended')
                    time.sleep(
                        10)  # take a breath, mostly useful for debugging
                    processes = []

        except Exception:
            self.logger.exception("Error during process mapping")

    def startSlave(self, task):
        """
        start a slave process to deal with publication for a single task
        :param task: one tuple describing a task, as returned by active_tasks()
        :return: 0. It always terminates normally; if publication fails, this is marked in the DB
        """
        # TODO: lock task!
        # - process logger
        logger = setSlaveLogger(str(task[0][3]))
        logger.info("Process %s is starting. PID %s", task[0][3], os.getpid())

        self.force_publication = False
        workflow = str(task[0][3])

        if int(workflow[0:2]) < 20:
            msg = "Skipped. Ignore tasks created before 2020."
            logger.info(msg)
            return 0

        if len(task[1]) > self.max_files_per_block:
            self.force_publication = True
            msg = "All datasets have more than %s ready files." % (
                self.max_files_per_block)
            msg += " No need to retrieve task status nor last publication time."
            logger.info(msg)
        else:
            msg = "At least one dataset has less than %s ready files." % (
                self.max_files_per_block)
            logger.info(msg)
            # Retrieve the workflow status. If the status can not be retrieved, continue
            # with the next workflow.
            workflow_status = ''
            msg = "Retrieving status"
            logger.info(msg)
            uri = self.REST_workflow
            data = encodeRequest({'workflow': workflow})
            try:
                res = self.crabServer.get(uri=uri, data=data)
            except Exception as ex:
                logger.warning('Error retrieving status from %s for %s: %s', uri,
                               workflow, ex)
                return 0

            try:
                workflow_status = res[0]['result'][0]['status']
                msg = "Task status is %s." % workflow_status
                logger.info(msg)
            except ValueError:
                msg = "Workflow removed from WM."
                logger.error(msg)
                workflow_status = 'REMOVED'
            except Exception as ex:
                msg = "Error loading task status!"
                msg += str(ex)
                msg += str(traceback.format_exc())
                logger.error(msg)
            # If the workflow status is terminal, go ahead and publish all the ready files
            # in the workflow.
            if workflow_status in ['COMPLETED', 'FAILED', 'KILLED', 'REMOVED']:
                self.force_publication = True
                if workflow_status in ['KILLED', 'REMOVED']:
                    self.force_failure = True
                msg = "Considering task status as terminal. Will force publication."
                logger.info(msg)
            # Otherwise...
            else:
                msg = "Task status is not considered terminal."
                logger.info(msg)
                msg = "Getting last publication time."
                logger.info(msg)
                # Get when was the last time a publication was done for this workflow (this
                # should be more or less independent of the output dataset in case there are
                # more than one).
                last_publication_time = None
                data = encodeRequest({
                    'workflow': workflow,
                    'subresource': 'search'
                })
                try:
                    result = self.crabServer.get(self.REST_task, data)
                    logger.debug("task: %s ", str(result[0]))
                    last_publication_time = getColumn(result[0],
                                                      'tm_last_publication')
                except Exception as ex:
                    logger.error("Error during task doc retrieving: %s", ex)
                if last_publication_time:
                    date = last_publication_time  # datetime in Oracle format
                    timetuple = datetime.strptime(
                        date, "%Y-%m-%d %H:%M:%S.%f").timetuple(
                        )  # convert to time tuple
                    last_publication_time = time.mktime(
                        timetuple)  # convert to seconds since Epoch (float)

                msg = "Last publication time: %s." % str(last_publication_time)
                logger.debug(msg)
                # If this is the first time a publication would be done for this workflow, go
                # ahead and publish.
                if not last_publication_time:
                    self.force_publication = True
                    msg = "There was no previous publication. Will force publication."
                    logger.info(msg)
                # Otherwise...
                else:
                    last = last_publication_time
                    msg = "Last published block time: %s" % last
                    logger.debug(msg)
                    # If the last publication was long time ago (> our block publication timeout),
                    # go ahead and publish.
                    now = int(time.time()) - time.timezone
                    time_since_last_publication = now - last
                    hours = int(time_since_last_publication / 60 / 60)
                    minutes = int(
                        (time_since_last_publication - hours * 60 * 60) / 60)
                    timeout_hours = int(self.block_publication_timeout / 60 /
                                        60)
                    timeout_minutes = int((self.block_publication_timeout -
                                           timeout_hours * 60 * 60) / 60)
                    msg = "Last publication was %sh:%sm ago" % (hours, minutes)
                    if time_since_last_publication > self.block_publication_timeout:
                        self.force_publication = True
                        msg += " (more than the timeout of %sh:%sm)." % (
                            timeout_hours, timeout_minutes)
                        msg += " Will force publication."
                    else:
                        msg += " (less than the timeout of %sh:%sm)." % (
                            timeout_hours, timeout_minutes)
                        msg += " Not enough to force publication."
                    logger.info(msg)

        # logger.info(task[1])
        try:
            if self.force_publication:
                # - get info
                active_ = [{
                    'key': [
                        x['username'], x['user_group'], x['user_role'],
                        x['taskname']
                    ],
                    'value': [
                        x['destination'], x['source_lfn'],
                        x['destination_lfn'], x['input_dataset'], x['dbs_url'],
                        x['last_update']
                    ]
                } for x in task[1] if x['transfer_state'] == 3
                           and x['publication_state'] not in [2, 3, 5]]

                lfn_ready = []
                wf_jobs_endtime = []
                pnn, input_dataset, input_dbs_url = "", "", ""
                for active_file in active_:
                    job_end_time = active_file['value'][5]
                    if job_end_time:
                        wf_jobs_endtime.append(
                            int(job_end_time) - time.timezone)
                    source_lfn = active_file['value'][1]
                    dest_lfn = active_file['value'][2]
                    self.lfn_map[dest_lfn] = source_lfn
                    if not pnn or not input_dataset or not input_dbs_url:
                        pnn = str(active_file['value'][0])
                        input_dataset = str(active_file['value'][3])
                        input_dbs_url = str(active_file['value'][4])
                    lfn_ready.append(dest_lfn)

                username = task[0][0]

                # Get metadata
                toPublish = []
                publDescFiles_list = self.getPublDescFiles(
                    workflow, lfn_ready, logger)
                for file_ in active_:
                    for _, doc in enumerate(publDescFiles_list):
                        # logger.info(type(doc))
                        # logger.info(doc)
                        if doc["lfn"] == file_["value"][2]:
                            doc["User"] = username
                            doc["Group"] = file_["key"][1]
                            doc["Role"] = file_["key"][2]
                            doc["UserDN"] = self.myDN
                            doc["Destination"] = file_["value"][0]
                            doc["SourceLFN"] = file_["value"][1]
                            toPublish.append(doc)
                with open(self.taskFilesDir + workflow + '.json',
                          'w') as outfile:
                    json.dump(toPublish, outfile)

                # find the location in the current environment of the script we want to run
                import Publisher.TaskPublish as tp
                taskPublishScript = tp.__file__
                cmd = "python %s " % taskPublishScript
                cmd += " --configFile=%s" % self.configurationFile
                cmd += " --taskname=%s" % workflow
                if self.TPconfig.dryRun:
                    cmd += " --dry"
                logger.info("Now execute: %s", cmd)
                subprocess.call(cmd, shell=True)
                logger.info('ALL DONE')

        except Exception:
            logger.exception("Exception when calling TaskPublish!")

        return 0

    def pollInterval(self):
        """
        :return: the poll interval from this master's configuration
        """
        return self.config.pollInterval
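
The force-publication decision in startSlave() above boils down to converting the Oracle-style tm_last_publication timestamp to seconds since the epoch and comparing the elapsed time with block_publication_timeout. A small stand-alone sketch of that conversion (the timestamp and the timeout value are made-up examples):

from datetime import datetime
import time

last_publication = "2020-08-01 12:00:00.000000"  # Oracle-style timestamp (example value)
block_publication_timeout = 4 * 3600             # example timeout: 4 hours

# convert the timestamp to seconds since the epoch, as startSlave() does
timetuple = datetime.strptime(last_publication, "%Y-%m-%d %H:%M:%S.%f").timetuple()
last_epoch = time.mktime(timetuple)
now = int(time.time()) - time.timezone           # same convention used in startSlave()
force_publication = (now - last_epoch) > block_publication_timeout
print(force_publication)
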
コード例 #37
0
ファイル: TaskPublish.py プロジェクト: belforte/CRABServer
def publishInDBS3(taskname):
    """

    """
    def createLogdir(dirname):
        """
        Create the directory dirname, ignoring errors in case it exists. Exit if
        the directory cannot be created.
        """
        try:
            os.mkdir(dirname)
        except OSError as ose:
            if ose.errno != 17:  # ignore the "directory already exists" error
                print(str(ose))
                print("The task worker needs to access the '%s' directory" % dirname)
                sys.exit(1)

    createLogdir('taskLogs')
    logger = logging.getLogger(taskname)
    logging.basicConfig(filename='taskLogs/'+taskname+'.log', level=logging.INFO, format=config.General.logMsgFormat)

    logger.info("Getting files to publish")

    toPublish = []
    # TODO move from new to done when processed
    with open("/tmp/publisher_files/"+taskname+".json") as f:
        toPublish = json.load(f)

    workflow = taskname

    if len(toPublish) == 0:
        return "EMPTY"

    if not workflow:
        logger.info("NO TASKNAME: %s" % toPublish[0])
    for k, v in toPublish[0].items():
        if k == 'taskname':
            logger.info("Starting: %s: %s" % (k, v))
    wfnamemsg = "%s: " % (workflow)

    user = toPublish[0]["User"]
    try:
        group = toPublish[0]["Group"]
        role = toPublish[0]["Role"]
    except KeyError:
        group = ""
        role = ""

    if not group or group in ['null']:
        group = ""
    if not role or role in ['null']:
        role = ""

    userDN = toPublish[0]["UserDN"]
    pnn = toPublish[0]["Destination"]
    logger.info(wfnamemsg+" "+user)

    READ_PATH = "/DBSReader"
    READ_PATH_1 = "/DBSReader/"

    # TODO: get user role and group
    try:
        proxy = Proxy(userDN, group, role, logger)
    except Exception:
        logger.exception("Failed to retrieve user proxy")
        return "FAILED"

    oracleInstance = config.General.oracleDB
    oracleDB = HTTPRequests(oracleInstance,
                            proxy,
                            proxy)

    fileDoc = dict()
    fileDoc['subresource'] = 'search'
    fileDoc['workflow'] = workflow

    try:
        results = oracleDB.get(task_path,
                               data=encodeRequest(fileDoc))
    except Exception as ex:
        logger.error("Failed to get acquired publications from oracleDB for %s: %s" % (workflow, ex))
        return "FAILED"

    logger.info(results[0]['desc']['columns'])

    try:
        inputDatasetIndex = results[0]['desc']['columns'].index("tm_input_dataset")
        inputDataset = results[0]['result'][inputDatasetIndex]
        sourceURLIndex = results[0]['desc']['columns'].index("tm_dbs_url")
        sourceURL = results[0]['result'][sourceURLIndex]
        publish_dbs_urlIndex = results[0]['desc']['columns'].index("tm_publish_dbs_url")
        publish_dbs_url = results[0]['result'][publish_dbs_urlIndex]

        #sourceURL = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
        if not sourceURL.endswith(READ_PATH) and not sourceURL.endswith(READ_PATH_1):
            sourceURL += READ_PATH
    except Exception:
        logger.exception("ERROR")
    # When looking up parents may need to look in global DBS as well.
    globalURL = sourceURL
    globalURL = globalURL.replace('phys01', 'global')
    globalURL = globalURL.replace('phys02', 'global')
    globalURL = globalURL.replace('phys03', 'global')
    globalURL = globalURL.replace('caf', 'global')

    pr = os.environ.get("SOCKS5_PROXY")
    logger.info(wfnamemsg+"Source API URL: %s" % sourceURL)
    sourceApi = dbsClient.DbsApi(url=sourceURL, proxy=pr)
    logger.info(wfnamemsg+"Global API URL: %s" % globalURL)
    globalApi = dbsClient.DbsApi(url=globalURL, proxy=pr)

    WRITE_PATH = "/DBSWriter"
    MIGRATE_PATH = "/DBSMigrate"
    READ_PATH = "/DBSReader"

    if publish_dbs_url.endswith(WRITE_PATH):
        publish_read_url = publish_dbs_url[:-len(WRITE_PATH)] + READ_PATH
        publish_migrate_url = publish_dbs_url[:-len(WRITE_PATH)] + MIGRATE_PATH
    else:
        publish_migrate_url = publish_dbs_url + MIGRATE_PATH
        publish_read_url = publish_dbs_url + READ_PATH
        publish_dbs_url += WRITE_PATH

    try:
        logger.debug(wfnamemsg+"Destination API URL: %s" % publish_dbs_url)
        destApi = dbsClient.DbsApi(url=publish_dbs_url, proxy=pr)
        logger.debug(wfnamemsg+"Destination read API URL: %s" % publish_read_url)
        destReadApi = dbsClient.DbsApi(url=publish_read_url, proxy=pr)
        logger.debug(wfnamemsg+"Migration API URL: %s" % publish_migrate_url)
        migrateApi = dbsClient.DbsApi(url=publish_migrate_url, proxy=pr)
    except Exception:
        logger.exception('Wrong DBS URL %s' % publish_dbs_url)
        return "FAILED"

    logger.info("inputDataset: %s" % inputDataset)
    noInput = len(inputDataset.split("/")) <= 3

    # TODO: fix dbs dep
    if not noInput:
        try:
            existing_datasets = sourceApi.listDatasets(dataset=inputDataset, detail=True, dataset_access_type='*')
            primary_ds_type = existing_datasets[0]['primary_ds_type']
            # There's little chance this is correct, but it's our best guess for now.
            # CRAB2 uses 'crab2_tag' for all cases
            existing_output = destReadApi.listOutputConfigs(dataset=inputDataset)
        except Exception:
            logger.exception('Failed to get information about input dataset %s from DBS' % inputDataset)
            return "FAILED"
        if not existing_output:
            msg = "Unable to list output config for input dataset %s." % (inputDataset)
            logger.error(wfnamemsg+msg)
            global_tag = 'crab3_tag'
        else:
            global_tag = existing_output[0]['global_tag']
    else:
        msg = "This publication appears to be for private MC."
        logger.info(wfnamemsg+msg)
        primary_ds_type = 'mc'
        global_tag = 'crab3_tag'

    acquisition_era_name = "CRAB"
    processing_era_config = {'processing_version': 1, 'description': 'CRAB3_processing_era'}

    appName = 'cmsRun'
    appVer = toPublish[0]["swversion"]
    pset_hash = toPublish[0]['publishname'].split("-")[-1]
    gtag = str(toPublish[0]['globaltag'])
    if gtag == "None":
        gtag = global_tag
    try:
        if toPublish[0]['acquisitionera'] and not toPublish[0]['acquisitionera'] in ["null"]:
            acquisitionera = str(toPublish[0]['acquisitionera'])
        else:
            acquisitionera = acquisition_era_name
    except Exception:
        acquisitionera = acquisition_era_name

    _, primName, procName, tier = toPublish[0]['outdataset'].split('/')

    primds_config = {'primary_ds_name': primName, 'primary_ds_type': primary_ds_type}
    msg = "About to insert primary dataset: %s" % (str(primds_config))
    logger.debug(wfnamemsg+msg)
    destApi.insertPrimaryDataset(primds_config)
    msg = "Successfully inserted primary dataset %s." % (primName)
    logger.debug(wfnamemsg+msg)

    final = {}
    failed = []
    publish_in_next_iteration = []
    published = []

    dataset = toPublish[0]['outdataset']
    # Find all (valid) files already published in this dataset.
    try:
        existingDBSFiles = destReadApi.listFiles(dataset=dataset, detail=True)
        existingFiles = [f['logical_file_name'] for f in existingDBSFiles]
        existingFilesValid = [f['logical_file_name'] for f in existingDBSFiles if f['is_file_valid']]
        msg = "Dataset %s already contains %d files" % (dataset, len(existingFiles))
        msg += " (%d valid, %d invalid)." % (len(existingFilesValid), len(existingFiles) - len(existingFilesValid))
        logger.info(wfnamemsg+msg)
        final['existingFiles'] = len(existingFiles)
    except Exception as ex:
        msg = "Error when listing files in DBS: %s" % (str(ex))
        msg += "\n%s" % (str(traceback.format_exc()))
        logger.error(wfnamemsg+msg)
        return "FAILED"

    # check if actions are needed
    workToDo = False

    for fileTo in toPublish:
        #print(existingFilesValid)
        if fileTo['lfn'] not in existingFilesValid:
            workToDo = True
            break

    if not workToDo:
        msg = "Nothing uploaded, %s has these files already or not enough files." % (dataset)
        logger.info(wfnamemsg+msg)
        return "NOTHING TO DO"

    acquisition_era_config = {'acquisition_era_name': acquisitionera, 'start_date': 0}

    output_config = {'release_version': appVer,
                     'pset_hash': pset_hash,
                     'app_name': appName,
                     'output_module_label': 'o',
                     'global_tag': global_tag,
                    }
    msg = "Published output config."
    logger.debug(wfnamemsg+msg)

    dataset_config = {'dataset': dataset,
                      'processed_ds_name': procName,
                      'data_tier_name': tier,
                      'acquisition_era_name': acquisitionera,
                      'dataset_access_type': 'VALID',
                      'physics_group_name': 'CRAB3',
                      'last_modification_date': int(time.time()),
                     }
    msg = "About to insert dataset: %s" % (str(dataset_config))
    logger.info(wfnamemsg+msg)
    del dataset_config['acquisition_era_name']

    # List of all files that must (and can) be published.
    dbsFiles = []
    dbsFiles_f = []
    # Set of all the parent files from all the files requested to be published.
    parentFiles = set()
    # Set of parent files for which the migration to the destination DBS instance
    # should be skipped (because they were not found in DBS).
    parentsToSkip = set()
    # Set of parent files to migrate from the source DBS instance
    # to the destination DBS instance.
    localParentBlocks = set()
    # Set of parent files to migrate from the global DBS instance
    # to the destination DBS instance.
    globalParentBlocks = set()

    # Loop over all files to publish.
    for file_ in toPublish:
        logger.info(file_)
        # Check if this file was already published and if it is valid.
        if file_['lfn'] not in existingFilesValid:
            # We have a file to publish.
            # Get the parent files and for each parent file do the following:
            # 1) Add it to the list of parent files.
            # 2) Find the block to which it belongs and insert that block name in
            #    (one of) the set of blocks to be migrated to the destination DBS.
            for parentFile in list(file_['parents']):
                if parentFile not in parentFiles:
                    parentFiles.add(parentFile)
                    # Is this parent file already in the destination DBS instance?
                    # (If yes, then we don't have to migrate this block.)
                    blocksDict = destReadApi.listBlocks(logical_file_name=parentFile)
                    if not blocksDict:
                        # No, this parent file is not in the destination DBS instance.
                        # Maybe it is in the same DBS instance as the input dataset?
                        blocksDict = sourceApi.listBlocks(logical_file_name=parentFile)
                        if blocksDict:
                            # Yes, this parent file is in the same DBS instance as the input dataset.
                            # Add the corresponding block to the set of blocks from the source DBS
                            # instance that have to be migrated to the destination DBS.
                            localParentBlocks.add(blocksDict[0]['block_name'])
                        else:
                            # No, this parent file is not in the same DBS instance as input dataset.
                            # Maybe it is in global DBS instance?
                            blocksDict = globalApi.listBlocks(logical_file_name=parentFile)
                            if blocksDict:
                                # Yes, this parent file is in global DBS instance.
                                # Add the corresponding block to the set of blocks from global DBS
                                # instance that have to be migrated to the destination DBS.
                                globalParentBlocks.add(blocksDict[0]['block_name'])
                    # If this parent file is not in the destination DBS instance, is not
                    # the source DBS instance, and is not in global DBS instance, then it
                    # means it is not known to DBS and therefore we can not migrate it.
                    # Put it in the set of parent files for which migration should be skipped.
                    if not blocksDict:
                        parentsToSkip.add(parentFile)
                # If this parent file should not be migrated because it is not known to DBS,
                # we remove it from the list of parents in the file-to-publish info dictionary
                # (so that when publishing, this "parent" file will not appear as a parent).
                if parentFile in parentsToSkip:
                    msg = "Skipping parent file %s, as it doesn't seem to be known to DBS." % (parentFile)
                    logger.info(wfnamemsg+msg)
                    if parentFile in file_['parents']:
                        file_['parents'].remove(parentFile)
            # Add this file to the list of files to be published.
            dbsFiles.append(format_file_3(file_))
            dbsFiles_f.append(file_)
        #print file
        published.append(file_['SourceLFN'])
        #published.append(file_['lfn'].replace("/store","/store/temp"))

    # Print a message with the number of files to publish.
    msg = "Found %d files not already present in DBS which will be published." % (len(dbsFiles))
    logger.info(wfnamemsg+msg)

    # If there are no files to publish, continue with the next dataset.
    if len(dbsFiles_f) == 0:
        msg = "Nothing to do for this dataset."
        logger.info(wfnamemsg+msg)
        return "NOTHING TO DO"

    # Migrate parent blocks before publishing.
    # First migrate the parent blocks that are in the same DBS instance
    # as the input dataset.
    if localParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (sourceApi.url, localParentBlocks)
        logger.info(wfnamemsg+msg)
        statusCode, failureMsg = migrateByBlockDBS3(workflow,
                                                    migrateApi,
                                                    destReadApi,
                                                    sourceApi,
                                                    inputDataset,
                                                    localParentBlocks
                                                   )
        if statusCode:
            failureMsg += " Not publishing any files."
            logger.info(wfnamemsg+failureMsg)
            failed.extend([f['SourceLFN'] for f in dbsFiles_f])
            #failed.extend([f['lfn'].replace("/store","/store/temp") for f in dbsFiles_f])
            failure_reason = failureMsg
            published = [x for x in published if x not in failed]
            return "NOTHING TO DO"
    # Then migrate the parent blocks that are in the global DBS instance.
    if globalParentBlocks:
        msg = "List of parent blocks that need to be migrated from %s:\n%s" % (globalApi.url, globalParentBlocks)
        logger.info(wfnamemsg+msg)
        statusCode, failureMsg = migrateByBlockDBS3(workflow, migrateApi, destReadApi, globalApi, inputDataset, globalParentBlocks)
        if statusCode:
            failureMsg += " Not publishing any files."
            logger.info(wfnamemsg+failureMsg)
            failed.extend([f['SourceLFN'] for f in dbsFiles_f])
            #failed.extend([f['lfn'].replace("/store","/store/temp") for f in dbsFiles_f])
            failure_reason = failureMsg
            published = [x for x in published if x not in failed]
            return "NOTHING TO DO"
    # Publish the files in blocks. The blocks must have exactly max_files_per_block
    # files, unless there are less than max_files_per_block files to publish to
    # begin with. If there are more than max_files_per_block files to publish,
    # publish as many blocks as possible and leave the tail of files for the next
    # PublisherWorker call, unless forced to publish.
    block_count = 0
    count = 0
    max_files_per_block = config.General.max_files_per_block
    while True:
        block_name = "%s#%s" % (dataset, str(uuid.uuid4()))
        files_to_publish = dbsFiles[count:count+max_files_per_block]
        try:
            block_config = {'block_name': block_name, 'origin_site_name': pnn, 'open_for_writing': 0}
            msg = "Inserting files %s into block %s." % ([f['logical_file_name']
                                                          for f in files_to_publish], block_name)
            logger.info(wfnamemsg+msg)
            blockDump = createBulkBlock(output_config, processing_era_config,
                                        primds_config, dataset_config,
                                        acquisition_era_config, block_config, files_to_publish)
            #logger.debug(wfnamemsg+"Block to insert: %s\n %s" % (blockDump, destApi.__dict__ ))

            destApi.insertBulkBlock(blockDump)
            block_count += 1
        except Exception as ex:
            #logger.error("Error for files: %s" % [f['SourceLFN'] for f in toPublish])
            logger.error("Error for files: %s" % [f['lfn'] for f in toPublish])
            failed.extend([f['SourceLFN'] for f in toPublish])
            #failed.extend([f['lfn'].replace("/store","/store/temp") for f in toPublish])
            msg = "Error when publishing (%s) " % ", ".join(failed)
            msg += str(ex)
            msg += str(traceback.format_exc())
            logger.error(wfnamemsg+msg)
            failure_reason = str(ex)
            dumpFile = '/tmp/failed-block-at-%s.txt' % time.time()
            with open(dumpFile, 'w') as fd:
                json.dump(blockDump, fd)
            logger.error("FAILING BLOCK SAVED AS %s" % dumpFile)
        count += max_files_per_block
        files_to_publish_next = dbsFiles_f[count:count+max_files_per_block]
        if len(files_to_publish_next) < max_files_per_block:
            publish_in_next_iteration.extend([f["SourceLFN"] for f in files_to_publish_next])
            #publish_in_next_iteration.extend([f["lfn"].replace("/store","/store/temp") for f in files_to_publish_next])
            break
    published = [x for x in published if x not in failed + publish_in_next_iteration]
    # Fill number of files/blocks published for this dataset.
    final['files'] = len(dbsFiles) - len(failed) - len(publish_in_next_iteration)
    final['blocks'] = block_count
    # Print a publication status summary for this dataset.
    msg = "End of publication status for dataset %s:" % (dataset)
    msg += " failed (%s) %s" % (len(failed), failed)
    msg += ", published (%s) %s" % (len(published), published)
    msg += ", publish_in_next_iteration (%s) %s" % (len(publish_in_next_iteration),
                                                    publish_in_next_iteration)
    msg += ", results %s" % (final)
    logger.info(wfnamemsg+msg)

    try:
        if published:
            mark_good(workflow, published, oracleDB, logger)
        if failed:
            logger.debug("Failed files: %s " % failed)
            mark_failed(failed, oracleDB, logger, failure_reason)
    except Exception:
        logger.exception("Status update failed")

    return 0
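
The DBS endpoint handling above derives the reader and migration URLs from the writer URL by swapping the path suffix. The same derivation as a stand-alone sketch (the instance URL is an illustrative example):

WRITE_PATH = "/DBSWriter"
MIGRATE_PATH = "/DBSMigrate"
READ_PATH = "/DBSReader"

publish_dbs_url = "https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter"  # example value

if publish_dbs_url.endswith(WRITE_PATH):
    publish_read_url = publish_dbs_url[:-len(WRITE_PATH)] + READ_PATH
    publish_migrate_url = publish_dbs_url[:-len(WRITE_PATH)] + MIGRATE_PATH
else:
    publish_migrate_url = publish_dbs_url + MIGRATE_PATH
    publish_read_url = publish_dbs_url + READ_PATH
    publish_dbs_url += WRITE_PATH

print(publish_read_url)     # .../prod/phys03/DBSReader
print(publish_migrate_url)  # .../prod/phys03/DBSMigrate
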
コード例 #38
0
class TransferWorker:
    """
    Submit user transfers to FTS
    """
    def __init__(self, user, tfc_map, config):
        """
        store the user transfer info and retrieve user proxy.
        """
        self.user = user[0]
        self.group = user[1]
        self.role = user[2]
        self.tfc_map = tfc_map
        self.config = config
        self.dropbox_dir = '%s/dropbox/outputs' % self.config.componentDir
        logging.basicConfig(level=config.log_level)
        self.logger = logging.getLogger('AsyncTransfer-Worker-%s' % self.user)
        formatter = getCommonLogFormatter(self.config)
        for handler in logging.getLogger().handlers:
            handler.setFormatter(formatter)
        self.pfn_to_lfn_mapping = {}
        self.max_retry = config.max_retry
        self.uiSetupScript = getattr(self.config, 'UISetupScript', None)
        self.submission_command = getattr(self.config, 'submission_command', 'glite-transfer-submit')
        self.cleanEnvironment = ''
        self.userDN = ''
        self.init = True
        if getattr(self.config, 'cleanEnvironment', False):
            self.cleanEnvironment = 'unset LD_LIBRARY_PATH; unset X509_USER_CERT; unset X509_USER_KEY;'
        self.logger.debug("Trying to get DN for %s" % self.user)
        try:
            self.userDN = getDNFromUserName(self.user, self.logger, ckey=self.config.opsProxy, cert=self.config.opsProxy)
        except Exception as ex:
            msg = "Error retrieving the user DN"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
            self.init = False
            return
        if not self.userDN:
            self.init = False
            return
        defaultDelegation = {'logger': self.logger,
                             'credServerPath' : self.config.credentialDir,
                             # It will be moved to be retrieved from couchDB
                             'myProxySvr': 'myproxy.cern.ch',
                             'min_time_left' : getattr(self.config, 'minTimeLeft', 36000),
                             'serverDN' : self.config.serverDN,
                             'uisource' : self.uiSetupScript,
                             'cleanEnvironment' : getattr(self.config, 'cleanEnvironment', False)}

        # Set up a factory for loading plugins
        self.factory = WMFactory(self.config.pluginDir, namespace=self.config.pluginDir)
        self.commandTimeout = 1200
        try:
            if self.config.isOracle:
                self.oracleDB = HTTPRequests(self.config.oracleDB,
                                             self.config.opsProxy,
                                             self.config.opsProxy)
            else:
                server = CouchServer(dburl=self.config.couch_instance, ckey=self.config.opsProxy, cert=self.config.opsProxy)
                self.db = server.connectDatabase(self.config.files_database)
            config_server = CouchServer(dburl=self.config.config_couch_instance, ckey=self.config.opsProxy, cert=self.config.opsProxy)
            self.config_db = config_server.connectDatabase(self.config.config_database)
            self.fts_server_for_transfer = getFTServer("T1_UK_RAL", 'getRunningFTSserver', self.config_db, self.logger)
        except Exception:
            self.logger.exception('Failed to contact DB')

        self.cache_area = ""
        if hasattr(self.config, "cache_area"):
            self.cache_area = self.config.cache_area

        if not self.config.isOracle:
            query = {'key': self.user}

            try:
                self.user_cache_area = self.db.loadView('DBSPublisher', 'cache_area', query)['rows']
                self.cache_area = "https://" + self.user_cache_area[0]['value'][0] + \
                                  self.user_cache_area[0]['value'][1] + "/filemetadata"
            except Exception:
                self.logger.exception("Error getting user cache_area.")
                pass
        try:
            defaultDelegation['myproxyAccount'] = re.compile('https?://([^/]*)/.*').findall(self.cache_area)[0]
        except IndexError:
            self.logger.error('MyproxyAccount parameter cannot be retrieved from %s' % self.cache_area)
        if getattr(self.config, 'serviceCert', None):
            defaultDelegation['server_cert'] = self.config.serviceCert
        if getattr(self.config, 'serviceKey', None):
            defaultDelegation['server_key'] = self.config.serviceKey
        self.valid_proxy = False
        self.user_proxy = self.config.opsProxy
        try:
            defaultDelegation['userDN'] = self.userDN
            defaultDelegation['group'] = self.group
            defaultDelegation['role'] = self.role
            self.logger.debug('delegation: %s' % defaultDelegation )
            self.valid_proxy, self.user_proxy = getProxy(defaultDelegation, self.logger)
        except Exception as ex:
            msg = "Error getting the user proxy"
            msg += str(ex)
            msg += str(traceback.format_exc())
            self.logger.error(msg)
        self.context = dict()

    def __call__(self):
        """
        a. build the REST FTS copyjob
        b. submit it to FTS
        c. update the status and create the dropbox json
        """
        stdout, stderr, rc = None, None, 99999
        #fts_url_delegation = self.fts_server_for_transfer.replace('8446', '8443')
        if self.user_proxy:
            jobs, jobs_lfn, jobs_pfn, jobs_report = {}, {}, {}, {}
            try:
                self.context = fts3.Context(self.fts_server_for_transfer, self.user_proxy, self.user_proxy, verify=True)
                self.logger.debug(fts3.delegate(self.context, lifetime=timedelta(hours=48), force=False))
                init_time = str(time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime()))
                jobs, jobs_lfn, jobs_pfn, jobs_report = self.files_for_transfer()
            except Exception:
                self.logger.exception('delegation failed')
            self.logger.debug("Processing files for %s " % self.user_proxy)
            if jobs:
                self.command(jobs, jobs_lfn, jobs_pfn, jobs_report)
        else:
            self.logger.debug("User proxy of %s could not be delagated! Trying next time." % self.user)
        self.logger.info('Transfers completed')
        return

    def source_destinations_by_user(self):
        """
        Get all the destinations for a user
        """
        if self.config.isOracle:
            self.logger.debug('Running acquiredTransfers query... ' + self.user)
            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'acquiredTransfers'
            fileDoc['grouping'] = 1
            fileDoc['username'] = self.user
            group = self.group if self.group != '' else None
            role = self.role if self.role != '' else None
            fileDoc['vogroup'] = group
            fileDoc['vorole'] = role
            fileDoc['limit'] =  self.config.max_files_per_transfer
            result = []

            self.logger.debug('Request: ' + str(fileDoc))
            try:
                results = self.oracleDB.get(self.config.oracleFileTrans,
                                            data=encodeRequest(fileDoc))
                result = oracleOutputMapping(results)
                res = [[x['source'], x['destination']] for x in result]
                res.sort()
                res = list(k for k, _ in itertools.groupby(res))
            except Exception as ex:
                self.logger.error("Failed to get acquired transfers \
                                  from oracleDB: %s" %ex)
                return [], {}
            return res, result
        else:
            query = {'group': True,
                     'startkey':[self.user, self.group, self.role], 'endkey':[self.user, self.group, self.role, {}, {}]}
            try:
                sites = self.db.loadView(self.config.ftscp_design, 'ftscp_all', query)
            except Exception:
                return []
            return [[x[4], x[3]] for x in sites['rows']]

    def files_for_transfer(self):
        """
        Process a queue of work per transfer source:destination for a user. Return one
        job per source:destination.
        """
        if self.config.isOracle:
            source_dests, docs = self.source_destinations_by_user()
        else:
            source_dests = self.source_destinations_by_user()
        jobs = {}
        jobs_lfn = {}
        jobs_pfn = {}
        jobs_report = {}
        self.logger.info('%s has %s links to transfer on: %s' % (self.user, len(source_dests), str(source_dests)))
        try:
            count = 0
            for (source, destination) in source_dests:
                count += 1
                if self.config.isOracle:
                    self.logger.info('dest1: %s source: %s' % (docs[0]['destination'], source))
                    if self.group == '':
                        group = None
                    else:
                        group = self.group
                    if self.role == '':
                        role = None
                    else:
                        role = self.role
                    active_docs = [x for x in docs
                                   if x['destination'] == destination
                                   and x['source'] == source
                                   and x['username'] == self.user
                                   and x['user_group'] == group
                                   and x['user_role'] == role
                                  ]
                    # self.logger.info('%s' % active_docs)

                    def map_active(inputdoc):
                        """
                        map active_users
                        """
                        outDict = dict()
                        outDict['key'] = [inputdoc['username'],
                                          inputdoc['user_group'],
                                          inputdoc['user_role'],
                                          inputdoc['destination'],
                                          inputdoc['source'],
                                          inputdoc['id']]
                        outDict['value'] = [inputdoc['source_lfn'], inputdoc['destination_lfn']]
                        return outDict
                    active_files = [map_active(x) for x in active_docs]
                    #active_files = active_files[:1000]
                    self.logger.debug('%s - %s has %s files to transfer from %s to %s' %
                                      (count, self.user, len(active_files), source, destination))
                else:
                    query = {'reduce': False,
                             'limit': self.config.max_files_per_transfer,
                             'key': [self.user, self.group,
                                     self.role, destination, source],
                             'stale': 'ok'}
                    try:
                        active_files = self.db.loadView(self.config.ftscp_design, 'ftscp_all', query)['rows']
                    except:
                        continue
                    self.logger.debug('%s has %s files to transfer from %s to %s' % (self.user, len(active_files),
                                                                                     source, destination))
                new_job = []
                lfn_list = []
                pfn_list = []
                dash_report = []

          
                # take these active files and make a copyjob entry
                def tfc_map(item):
                    self.logger.debug('Preparing PFNs...')
                    source_pfn = self.apply_tfc_to_lfn('%s:%s' % (source, item['value'][0]))
                    destination_pfn = self.apply_tfc_to_lfn('%s:%s' % (destination, item['value'][1]))
                    self.logger.debug('PFNs prepared... %s %s' %(destination_pfn,source_pfn))
                    if source_pfn and destination_pfn and self.valid_proxy:
                        try:
                            acquired_file, dashboard_report = self.mark_acquired([item])
                            self.logger.debug('Files have been marked acquired')
                        except Exception as ex:
                            self.logger.error("%s" % ex)
                            raise
                        if acquired_file:
                            self.logger.debug('Starting FTS Job creation...')
                            # Prepare Monitor metadata
                            lfn_list.append(item['value'][0])
                            pfn_list.append(source_pfn)
                            # Prepare FTS Dashboard metadata
                            dash_report.append(dashboard_report)
                            new_job.append('%s %s' % (source_pfn, destination_pfn))
                            self.logger.debug('FTS job created...')
                        else:
                            pass
                    else:
                        self.mark_failed([item])
                #self.logger.debug('Preparing job... %s' % (active_files))
                for item in active_files:
                    tfc_map(item)
                self.logger.debug('Job prepared...')
                if new_job:
                    jobs[(source, destination)] = new_job
                    jobs_lfn[(source, destination)] = lfn_list
                    jobs_pfn[(source, destination)] = pfn_list
                    jobs_report[(source, destination)] = dash_report
                    self.logger.debug('FTS job ready for submission over  %s ---> %s ...going to next job' % (source, destination))

            self.logger.debug('ftscp input created for %s (%s jobs)' % (self.user, len(jobs.keys())))
            return jobs, jobs_lfn, jobs_pfn, jobs_report
        except Exception:
            self.logger.exception("fail")
            return jobs, jobs_lfn, jobs_pfn, jobs_report

    def apply_tfc_to_lfn(self, file):
        """
        Take a CMS_NAME:lfn string and make a pfn.
        Update pfn_to_lfn_mapping dictionary.
        """
        try:
            site, lfn = tuple(file.split(':'))
        except ValueError:
            self.logger.error('Not a valid "site:lfn" string: %s' % file)
            return None
        if site in self.tfc_map:
            pfn = self.tfc_map[site].matchLFN('srmv2', lfn)
            # TODO: improve fix for wrong tfc on sites
            try:
                if pfn.find("\\") != -1: pfn = pfn.replace("\\", "")
                if len(pfn.split(':')) == 1:
                    self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                    return None
            except IndexError:
                self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                return None
            except AttributeError:
                self.logger.error('Broken tfc for file %s at site %s' % (lfn, site))
                return None
            # Add the pfn key into pfn-to-lfn mapping
            if pfn not in self.pfn_to_lfn_mapping:
                self.pfn_to_lfn_mapping[pfn] = lfn
            return pfn
        else:
            self.logger.error('Wrong site %s!' % site)
            return None
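
    # A usage sketch for apply_tfc_to_lfn() above (the site name, LFN and the
    # resulting PFN are illustrative; the real mapping comes from self.tfc_map):
    #
    #   pfn = self.apply_tfc_to_lfn('T2_IT_Pisa:/store/user/someuser/file.root')
    #   # -> e.g. 'srm://se.example.org:8443/srm/managerv2?SFN=/store/user/someuser/file.root'
    #   # the pfn -> lfn mapping is also recorded in self.pfn_to_lfn_mapping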

    def command(self, jobs, jobs_lfn, jobs_pfn, jobs_report):
        """
        For each job the worker has to complete:
        Delete files that have failed previously
        Create a temporary copyjob file
        Submit the copyjob to the appropriate FTS server
        Parse the output of the FTS transfer and return complete and failed files for recording
        """
        # Output: {"userProxyPath":"/path/to/proxy","LFNs":["lfn1","lfn2","lfn3"],"PFNs":["pfn1","pfn2","pfn3"],
        #          "FTSJobid":'id-of-fts-job', "username": '******'}
        # Loop through all the jobs for the links we have
        failure_reasons = []
        for link, copyjob in jobs.items():
            submission_error = False
            fts_job = {}
            # Validate copyjob file before doing anything
            self.logger.debug("Valid %s" % self.validate_copyjob(copyjob))
            if not self.validate_copyjob(copyjob): continue

            rest_copyjob = {
                "params":{
                    "bring_online": None,
                    "verify_checksum": False,
                    "copy_pin_lifetime": -1,
                    "max_time_in_queue": self.config.max_h_in_queue,
                    "job_metadata": {"issuer": "ASO"},
                    "spacetoken": None,
                    "source_spacetoken": None,
                    "fail_nearline": False,
                    "overwrite": True,
                    "gridftp": None
                    },
                "files": []
                }
            transfers = list()
            #for SrcDest in copyjob:
            #    self.logger.debug("Creating FTS job...")
            #    self.logger.debug("%s -> %s" % (SrcDest.split(" ")[0], SrcDest.split(" ")[1]))
                #    transfers.append(fts3.new_transfer(SrcDest.split(" ")[0],
                #                                       SrcDest.split(" ")[1])
                #                     )
                #except:
                #    self.logger.exception("Failure during new_transfer")

            for SrcDest in copyjob:
                tempDict = {"sources": [], "metadata": None, "destinations": []}

                tempDict["sources"].append(SrcDest.split(" ")[0])
                tempDict["destinations"].append(SrcDest.split(" ")[1])
                rest_copyjob["files"].append(tempDict)            

            #self.logger.debug("FTS job Created with %s files..." % (transfers))

            self.logger.debug("Subbmitting this REST copyjob %s" % rest_copyjob)
            url = self.fts_server_for_transfer + '/jobs'
            self.logger.debug("Running FTS submission command")
            self.logger.debug("FTS server: %s" % self.fts_server_for_transfer)
            self.logger.debug("link: %s -> %s" % link)
            heade = {"Content-Type ": "application/json"}
            
            """
            try:
                job = fts3.new_job(transfers,
                                   overwrite=True,
                                   verify_checksum=False,
                                   metadata={"issuer": "ASO"},
                                   copy_pin_lifetime=-1,
                                   bring_online=None,
                                   source_spacetoken=None,
                                   spacetoken=None
                                   # TODO: check why not on fts3 (clone repo maybe?)
                                   # max_time_in_queue=6
                                   )

                jobid = fts3.submit(self.context, job)
                self.logger.info("Monitor link: https://fts3-pilot.cern.ch:8449/fts3/ftsmon/#/job/"+jobid)
            except Exception as ex:
                msg = "Error submitting to FTS" 
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.debug(msg)
                failure_reasons.append(msg)
                submission_error = True

            """
            buf = StringIO.StringIO()
            try:
                connection = RequestHandler(config={'timeout': 300, 'connecttimeout': 300})
            except Exception as ex:
                msg = str(ex)
                msg += str(traceback.format_exc())
                self.logger.debug(msg)
            try:
                response, datares = connection.request(url, rest_copyjob, heade, verb='POST',
                                                       doseq=True, ckey=self.user_proxy,
                                                       cert=self.user_proxy, capath='/etc/grid-security/certificates',
                                                       cainfo=self.user_proxy, verbose=True)
                self.logger.debug("Submission done")
                self.logger.debug('Submission header status: %s' % response.status)
                self.logger.debug('Submission header reason: %s' % response.reason)
                self.logger.debug('Submission result %s' %  datares)
            except Exception as ex:
                msg = "Error submitting to FTS: %s " % url
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.debug(msg)
                failure_reasons.append(msg)
                submission_error = True
            buf.close()
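            # On a successful POST, FTS replies with a JSON acknowledgment carrying the job id;
            # the per-file ids are then fetched from /jobs/<job_id>/files below so that each
            # transfer can be tracked individually.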
            
            if not submission_error:
                res = {}
                try:
                    res = json.loads(datares)
                except Exception as ex:
                    msg = "Couldn't load submission acknowledgment from FTS"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.debug(msg)
                    submission_error = True
                    failure_reasons.append(msg)
                if 'job_id' in res:
                    fileId_list = []
                    files_res = []
                    files_ = {}
                    job_id = res['job_id']
                    file_url = self.fts_server_for_transfer + '/jobs/' + job_id +'/files'
                    self.logger.debug("Submitting to %s" % file_url)
                    file_buf = StringIO.StringIO()
                    try:
                        response, files_ = connection.request(file_url, {}, heade, doseq=True, ckey=self.user_proxy,
                                                              cert=self.user_proxy,
                                                              capath='/etc/grid-security/certificates',
                                                              cainfo=self.user_proxy, verbose=True)
                        files_res = json.loads(files_)
                    except Exception as ex:
                        msg = "Error contacting FTS to retrieve file: %s " % file_url
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.debug(msg)
                        submission_error = True
                        failure_reasons.append(msg)
                    #self.logger.debug("List files in job %s" % files_)
                    file_buf.close()
                    for file_in_job in files_res:
                        if 'file_id' in file_in_job:
                            fileId_list.append(file_in_job['file_id'])
                        else:
                            msg = "Could not load submitted file %s from FTS" % file_url
                            self.logger.debug(msg)
                            submission_error = True
                            failure_reasons.append(msg)
                    self.logger.debug("File id list %s" % fileId_list)
            if submission_error:
                self.logger.debug("Submission failed")
                self.logger.info("Mark failed %s files" % len(jobs_lfn[link]))
                self.logger.debug("Mark failed %s files" % jobs_lfn[link])
                failed_files = self.mark_failed(jobs_lfn[link], force_fail=False, submission_error=True)
                self.logger.info("Marked failed %s" % len(failed_files))
                continue
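            # Persist the submitted job as Monitor.<FTSJobid>.json in the dropbox directory;
            # presumably a separate monitoring component picks these files up to follow the job.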
            fts_job['userProxyPath'] = self.user_proxy
            fts_job['LFNs'] = jobs_lfn[link]
            fts_job['PFNs'] = jobs_pfn[link]
            fts_job['FTSJobid'] = job_id
            fts_job['files_id'] = fileId_list
            fts_job['username'] = self.user
            self.logger.debug("Creating json file %s in %s" % (fts_job, self.dropbox_dir))
            ftsjob_file = open('%s/Monitor.%s.json' % (self.dropbox_dir, fts_job['FTSJobid']), 'w')
            jsondata = json.dumps(fts_job)
            ftsjob_file.write(jsondata)
            ftsjob_file.close()
            self.logger.debug("%s ready." % fts_job)
            # Prepare Dashboard report
            for lfn in fts_job['LFNs']:
                lfn_report = dict()
                lfn_report['FTSJobid'] = fts_job['FTSJobid']
                index = fts_job['LFNs'].index(lfn)
                lfn_report['PFN'] = fts_job['PFNs'][index]
                lfn_report['FTSFileid'] = fts_job['files_id'][index]
                lfn_report['Workflow'] = jobs_report[link][index][2]
                lfn_report['JobVersion'] = jobs_report[link][index][1]
                job_id = '%d_https://glidein.cern.ch/%d/%s_%s' % (int(jobs_report[link][index][0]),
                                                                  int(jobs_report[link][index][0]),
                                                                  lfn_report['Workflow'].replace("_", ":"),
                                                                  lfn_report['JobVersion'])
                lfn_report['JobId'] = job_id
                lfn_report['URL'] = self.fts_server_for_transfer
                self.logger.debug("Creating json file %s in %s for FTS3 Dashboard" % (lfn_report, self.dropbox_dir))
                dash_job_file = open('/tmp/DashboardReport/Dashboard.%s.json' % getHashLfn(lfn_report['PFN']), 'w')
                jsondata = json.dumps(lfn_report)
                dash_job_file.write(jsondata)
                dash_job_file.close()
                self.logger.debug("%s ready for FTS Dashboard report." % lfn_report)
        return

    def validate_copyjob(self, copyjob):
        """
        The copyjob is valid when neither the source PFN nor the destination PFN is 'None'.
        """
        for task in copyjob:
            if task.split()[0] == 'None' or task.split()[1] == 'None': return False
        return True

    def mark_acquired(self, files=[]):
        """
        Mark the list of files as acquired (transfer submitted)
        """
        lfn_in_transfer = []
        dash_rep = ()
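        # Two backends are supported: the Oracle-based REST interface (self.oracleDB) and
        # CouchDB (self.db); the document layout differs, hence the two branches below.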
        if self.config.isOracle:
            toUpdate = list()
            for lfn in files:
                if lfn['value'][0].find('temp') == 7:
                    self.logger.debug("Marking acquired %s" % lfn)
                    docId = lfn['key'][5]
                    self.logger.debug("Marking acquired %s" % docId)
                    toUpdate.append(docId)
                    try:
                        docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers','fileusertransfers'),
                                                    data=encodeRequest({'subresource': 'getById', 'id': docId}))
                        document = oracleOutputMapping(docbyId, None)[0]
                    except Exception as ex:
                        self.logger.error("Error during dashboard report update: %s" % ex)
                        continue

                    lfn_in_transfer.append(lfn)
                    dash_rep = (document['jobid'], document['job_retry_count'], document['taskname'])

            try:
                fileDoc = dict()
                fileDoc['asoworker'] = self.config.asoworker
                fileDoc['subresource'] = 'updateTransfers'
                fileDoc['list_of_ids'] = files[0]['key'][5] 
                fileDoc['list_of_transfer_state'] = "SUBMITTED"

                self.logger.debug("Marking acquired %s" % (fileDoc))

                result = self.oracleDB.post(self.config.oracleFileTrans,
                                                    data=encodeRequest(fileDoc))
                self.logger.debug("Marked acquired %s of %s" % (fileDoc, result))
            except Exception as ex:
                self.logger.error("Error during status update: %s" %ex)

            # TODO: no need to mark good here, right? The postjob should update the status in case of direct stageout.
            return lfn_in_transfer, dash_rep
        else:
            for lfn in files:
                if lfn['value'][0].find('temp') == 7:
                    docId = getHashLfn(lfn['value'][0])
                    self.logger.debug("Marking acquired %s" % docId)
                    # Load document to get the retry_count
                    try:
                        document = self.db.document(docId)
                    except Exception as ex:
                        msg = "Error loading document from couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
                    if document['state'] == 'new' or document['state'] == 'retry':
                        data = dict()
                        data['state'] = 'acquired'
                        data['last_update'] = time.time()
                        updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + docId
                        updateUri += "?" + urllib.urlencode(data)
                        try:
                            self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                        except Exception as ex:
                            msg = "Error updating document in couch"
                            msg += str(ex)
                            msg += str(traceback.format_exc())
                            self.logger.error(msg)
                            continue
                        self.logger.debug("Marked acquired %s of %s" % (docId, lfn))
                        lfn_in_transfer.append(lfn)
                        dash_rep = (document['jobid'], document['job_retry_count'], document['workflow'])
                    else:
                        continue
                else:
                    good_lfn = lfn['value'][0].replace('store', 'store/temp', 1)
                    self.mark_good([good_lfn])
            return lfn_in_transfer, dash_rep

    def mark_good(self, files=[]):
        """
        Mark the list of files as transferred (done)
        """
        for lfn in files:
            try:
                document = self.db.document(getHashLfn(lfn))
            except Exception as ex:
                msg = "Error loading document from couch"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                continue
            if document['state'] != 'killed' and document['state'] != 'done' and document['state'] != 'failed':
                outputLfn = document['lfn'].replace('store/temp', 'store', 1)
                try:
                    now = str(datetime.datetime.now())
                    last_update = time.time()
                    data = dict()
                    data['end_time'] = now
                    data['state'] = 'done'
                    data['lfn'] = outputLfn
                    data['last_update'] = last_update
                    updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + getHashLfn(lfn)
                    updateUri += "?" + urllib.urlencode(data)
                    self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                except Exception as ex:
                    msg = "Error updating document in couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
                try:
                    self.db.commit()
                except Exception as ex:
                    msg = "Error commiting documents in couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
        self.logger.debug("transferred file updated")

    def mark_failed(self, files=[], force_fail=False, submission_error=False):
        """
        Something failed for these files so increment the retry count
        """
        updated_lfn = []
        for lfn in files:
            data = {}
            if not isinstance(lfn, dict):
                if 'temp' not in lfn:
                    temp_lfn = lfn.replace('store', 'store/temp', 1)
                else:
                    temp_lfn = lfn
            else:
                if 'temp' not in lfn['value'][0]:
                    temp_lfn = lfn['value'][0].replace('store', 'store/temp', 1)
                else:
                    temp_lfn = lfn['value'][0]

            # Load document and get the retry_count
            if self.config.isOracle:
                docId = getHashLfn(temp_lfn)
                self.logger.debug("Marking failed %s" % docId)
                try:
                    docbyId = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers',
                                                                                    'fileusertransfers'),
                                                data=encodeRequest({'subresource': 'getById', 'id': docId}))
                except Exception as ex:
                    self.logger.error("Error updating failed docs: %s" %ex)
                    continue
                document = oracleOutputMapping(docbyId, None)[0]
                self.logger.debug("Document: %s" % document)

                fileDoc = dict()
                fileDoc['asoworker'] = self.config.asoworker
                fileDoc['subresource'] = 'updateTransfers'
                fileDoc['list_of_ids'] = docId 

                if force_fail or document['transfer_retry_count'] + 1 > self.max_retry:
                    fileDoc['list_of_transfer_state'] = 'FAILED'
                    fileDoc['list_of_retry_value'] = 1
                else:
                    fileDoc['list_of_transfer_state'] = 'RETRY'
                if submission_error:
                    fileDoc['list_of_failure_reason'] = "Job could not be submitted to FTS: temporary problem of FTS"
                    fileDoc['list_of_retry_value'] = 1
                elif not self.valid_proxy:
                    fileDoc['list_of_failure_reason'] = "Job could not be submitted to FTS: user's proxy expired"
                    fileDoc['list_of_retry_value'] = 1
                else:
                    fileDoc['list_of_failure_reason'] = "Site config problem."
                    fileDoc['list_of_retry_value'] = 1

                self.logger.debug("update: %s" % fileDoc)
                try:
                    updated_lfn.append(docId)
                    result = self.oracleDB.post(self.config.oracleFileTrans,
                                         data=encodeRequest(fileDoc))
                except Exception as ex:
                    msg = "Error updating document"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue

            else:
                docId = getHashLfn(temp_lfn)
                try:
                    document = self.db.document(docId)
                except Exception as ex:
                    msg = "Error loading document from couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
                if document['state'] != 'killed' and document['state'] != 'done' and document['state'] != 'failed':
                    now = str(datetime.datetime.now())
                    last_update = time.time()
                    # Prepare data to update the document in couch
                    if force_fail or len(document['retry_count']) + 1 > self.max_retry:
                        data['state'] = 'failed'
                    else:
                        data['state'] = 'retry'
                    if submission_error:
                        data['failure_reason'] = "Job could not be submitted to FTS: temporary problem of FTS"
                    elif not self.valid_proxy:
                        data['failure_reason'] = "Job could not be submitted to FTS: user's proxy expired"
                    else:
                        data['failure_reason'] = "Site config problem."
                    data['last_update'] = last_update
                    data['retry'] = now

                    # Update the document in couch
                    self.logger.debug("Marking failed %s" % docId)
                    try:
                        updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + docId
                        updateUri += "?" + urllib.urlencode(data)
                        self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                        updated_lfn.append(docId)
                        self.logger.debug("Marked failed %s" % docId)
                    except Exception as ex:
                        msg = "Error in updating document in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
                    try:
                        self.db.commit()
                    except Exception as ex:
                        msg = "Error commiting documents in couch"
                        msg += str(ex)
                        msg += str(traceback.format_exc())
                        self.logger.error(msg)
                        continue
            self.logger.debug("failed file updated")
            return updated_lfn

    def mark_incomplete(self):
        """
        Mark the list of files as acquired
        """
        self.logger('Something called mark_incomplete which should never be called')
コード例 #39
0
def algorithm():
    """

    script algorithm
    - create fts REST HTTPRequest
    - delegate user proxy to fts if needed
    - check for fts jobs to monitor and update states in oracle
    - get last line from last_transfer.txt
    - gather list of file to transfers
        + group by source
        + submit ftsjob and save fts jobid
        + update info in oracle
    - append new fts job ids to fts_jobids.txt
    """

    # TODO: pass by configuration
    fts = HTTPRequests(hostname=FTS_ENDPOINT.split("https://")[1],
                       localcert=proxy, localkey=proxy)

    logging.info("using user's proxy from %s", proxy)
    ftsContext = fts3.Context(FTS_ENDPOINT, proxy, proxy, verify=True)
    logging.info("Delegating proxy to FTS...")
    delegationId = fts3.delegate(ftsContext, lifetime=timedelta(hours=48), delegate_when_lifetime_lt=timedelta(hours=24), force=False)
    delegationStatus = fts.get("delegation/"+delegationId)
    logging.info("Delegated proxy valid until %s", delegationStatus[0]['termination_time'])

    # instantiate an object to talk with CRAB REST server

    try:
        crabserver = CRABRest(restInfo['host'], localcert=proxy, localkey=proxy,
                              userAgent='CRABSchedd')
        crabserver.setDbInstance(restInfo['dbInstance'])
    except Exception:
        logging.exception("Failed to set connection to crabserver")
        return

    with open("task_process/transfers.txt") as _list:
        _data = _list.readlines()[0]
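        # the first line of task_process/transfers.txt is expected to be a JSON document
        # carrying at least "username", "taskname" and "destination" for this task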
        try:
            doc = json.loads(_data)
            username = doc["username"]
            taskname = doc["taskname"]
            destination = doc["destination"]
        except Exception as ex:
            msg = "Username gathering failed with\n%s" % str(ex)
            logging.warn(msg)
            raise ex

    try:
        os.environ["X509_USER_PROXY"] = proxy
        logging.info("Initializing Rucio client for %s", taskname)
        rucioClient = CRABDataInjector(taskname,
                                       destination,
                                       account=username,
                                       scope="user."+username,
                                       auth_type='x509_proxy')
    except Exception as exc:
        msg = "Rucio initialization failed with\n%s" % str(exc)
        logging.warn(msg)
        raise exc

    jobs_ongoing = state_manager(fts, crabserver)
    new_jobs = submission_manager(rucioClient, ftsContext, crabserver)

    logging.info("Transfer jobs ongoing: %s, new: %s ", jobs_ongoing, new_jobs)

    return
コード例 #40
0
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg logging logger: the logger
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger."""


        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17: #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" % dirname)
                    sys.exit(1)


        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""

            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')

            if self.TEST:
                #if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
        #Let's increase the server's retries for recoverable errors in the MasterWorker
        #retry=20 means we'll keep retrying for about 1 hour (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
        self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, retry = 20,
                                   logger = self.logger)
        self.logger.debug("Hostcert: %s, hostkey: %s", str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if not len(self.config.TaskWorker.retry_interval) == self.config.TaskWorker.max_retry:
            raise ConfigException("No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry.")
        if self.TEST:
            self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]


    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()


    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not available.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""

        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during _lockWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
            return False
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the _lockWork request (prameters are %s)", configreq)
            return False

        return True


    def _getWork(self, limit, getstatus):
        configreq = {'limit': limit, 'workername': self.config.TaskWorker.name, 'getstatus': getstatus}
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data = configreq)[0]['result']
        except HTTPException as hte:
            msg = "HTTP Error during _getWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the _getWork request (prameters are %s)", configreq)
        return pendingwork


    def quit_(self, dummyCode, dummyTraceback):
        self.logger.info("Received kill request. Setting STOP flag in the master process...")
        self.STOP = True


    def updateWork(self, taskname, command, status):
        configreq = {'workflow': taskname, 'command': command, 'status': status, 'subresource': 'state'}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during updateWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the updateWork request (prameters are %s)", configreq)


    def failQueuedTasks(self):
        """ This method is used at the TW startup and it fails QUEUED tasks that supposedly
            could not communicate with the REST and update their status. The method puts those
            tasks to SUBMITFAILED, KILLFAILED or RESUBMITFAILED depending on the value of
            the command field.
        """
        limit = self.slaves.nworkers * 2
        total = 0
        while True:
            pendingwork = self._getWork(limit=limit, getstatus='QUEUED')
            for task in pendingwork:
                self.logger.debug("Failing QUEUED task %s", task['tm_taskname'])
                dummyWorktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                self.updateWork(task['tm_taskname'], task['tm_task_command'], failstatus)
            if not len(pendingwork):
                self.logger.info("Finished failing QUEUED tasks (total %s)", total)
                break #too bad "do..while" does not exist in python...
            else:
                total += len(pendingwork)
                self.logger.info("Failed %s tasks (limit %s), getting next chunk of tasks", len(pendingwork), limit)


    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""

        self.logger.debug("Failing QUEUED tasks before startup.")
        self.failQueuedTasks()
        self.logger.debug("Starting main loop.")
        while(not self.STOP):
            limit = self.slaves.queueableTasks()
            if not self._lockWork(limit=limit, getstatus='NEW', setstatus='HOLDING'):
                continue

            pendingwork = self._getWork(limit=limit, getstatus='HOLDING')

            if len(pendingwork) > 0:
                self.logger.info("Retrieved a total of %d works", len(pendingwork))
                self.logger.debug("Retrieved the following works: \n%s", str(pendingwork))

            toInject = []
            for task in pendingwork:
                worktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                toInject.append((worktype, task, failstatus, None))

            for task in pendingwork:
                self.updateWork(task['tm_taskname'], task['tm_task_command'], 'QUEUED')

            self.slaves.injectWorks(toInject)

            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s", (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {'tm_taskname' : action.__module__}, 'FAILED', action.__module__)])

            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d', self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d', self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d', self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)

            dummyFinished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle")
コード例 #41
0
class RetryManagerDaemon(BaseDaemon):
    """
    _RetryManagerPoller_

    Polls for Files in CoolOff State and attempts to retry them
    based on the requirements in the selected plugin
    """
    def __init__(self, config):
        """
        Initialise class members
        """
        BaseDaemon.__init__(self, config, 'RetryManager')

        if self.config.isOracle:
            try:
                self.oracleDB = HTTPRequests(self.config.oracleDB,
                                             self.config.opsProxy,
                                             self.config.opsProxy)
            except:
                self.logger.exception('Failed to connect to Oracle')
        else:
            try:
                server = CouchServer(dburl=self.config.couch_instance,
                                     ckey=self.config.opsProxy,
                                     cert=self.config.opsProxy)
                self.db = server.connectDatabase(self.config.files_database)
            except Exception as e:
                self.logger.exception('A problem occurred when connecting to couchDB: %s' % e)
                raise
            self.logger.debug('Connected to files DB')

            # Set up a factory for loading plugins
        self.factory = WMFactory(self.config.retryAlgoDir, namespace=self.config.retryAlgoDir)
        try:
            self.plugin = self.factory.loadObject(self.config.algoName, self.config,
                                                  getFromCache=False, listFlag=True)
        except Exception as ex:
            msg = "Error loading plugin %s on path %s\n" % (self.config.algoName,
                                                            self.config.retryAlgoDir)
            msg += str(ex)
            self.logger.error(msg)
            raise RetryManagerException(msg)
        self.cooloffTime = self.config.cooloffTime

    def terminate(self, params):
        """
        Run one more time through, then terminate

        """
        logging.debug("Terminating. doing one more pass before we die")
        self.algorithm(params)

    def algorithm(self, parameters=None):
        """
        Performs the doRetries method, loading the appropriate
        plugin for each job and handling it.
        """
        logging.debug("Running retryManager algorithm")
        if self.config.isOracle:
            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'retryTransfers'
            fileDoc['time_to'] = self.cooloffTime
            self.logger.debug('fileDoc: %s' % fileDoc)
            try:
                results = self.oracleDB.post(self.config.oracleFileTrans,
                                             data=encodeRequest(fileDoc))
            except Exception:
                self.logger.exception("Failed to get retry transfers in oracleDB: %s")
                return
            logging.info("Retried files in cooloff: %s,\n now getting transfers to kill" % str(results))

            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'getTransfersToKill'
            fileDoc['grouping'] = 0
            try:
                results = self.oracleDB.get(self.config.oracleFileTrans,
                                            data=encodeRequest(fileDoc))
                result = oracleOutputMapping(results)
            except Exception as ex:
                self.logger.error("Failed to get killed transfers \
                                   from oracleDB: %s" % ex)
                return

            usersToKill = list(set([(x['username'], x['user_group'], x['user_role']) for x in result]))

            self.logger.debug("Users with transfers to kill: %s" % usersToKill)
            transfers = Queue()

            for i in range(self.config.kill_threads):
                worker = Thread(target=self.killThread, args=(i, transfers,))
                worker.setDaemon(True)
                worker.start()

            for user in usersToKill:
                user_trans = [x for x in result if (x['username'], x['user_group'], x['user_role']) == user]
                self.logger.info("Inserting %s transfers of user %s in the killing queue" % (len(user_trans), user))
                transfers.put(user_trans)

            transfers.join()
            self.logger.info("Transfers killed.")
        else:
            self.doRetries()

    def killThread(self, thread_id, transfers):
        """This is the worker thread function for kill command.
        """
        while True:
            transfer_list = transfers.get()
            self.logger.info("Starting thread %s" % (thread_id))
            user = transfer_list[0]['username']
            group = transfer_list[0]['user_group']
            role = transfer_list[0]['user_role']

            uiSetupScript = getattr(self.config, 'UISetupScript', None)

            self.logger.debug("Trying to get DN for %s %s %s %s" % (user, self.logger, self.config.opsProxy, self.config.opsProxy))
            try:
                userDN = getDNFromUserName(user, self.logger, ckey=self.config.opsProxy, cert=self.config.opsProxy)
            except Exception as ex:
                msg = "Error retrieving the user DN"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                continue
            if not userDN:
                transfers.task_done()
                time.sleep(1)
                continue
            self.logger.debug("user DN: %s" % userDN)

            try:
                defaultDelegation = {'logger': self.logger,
                                     'credServerPath': self.config.credentialDir,
                                     'myProxySvr': 'myproxy.cern.ch',
                                     'min_time_left': getattr(self.config, 'minTimeLeft', 36000),
                                     'serverDN': self.config.serverDN,
                                     'uisource': uiSetupScript,
                                     'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)}
                if hasattr(self.config, "cache_area"):
                    cache_area = self.config.cache_area
                    defaultDelegation['myproxyAccount'] = re.compile('https?://([^/]*)/.*').findall(cache_area)[0]
            except IndexError:
                self.logger.error('MyproxyAccount parameter cannot be retrieved from %s . ' % self.config.cache_area)
                transfers.task_done()
                time.sleep(1)
                continue
            if getattr(self.config, 'serviceCert', None):
                defaultDelegation['server_cert'] = self.config.serviceCert
            if getattr(self.config, 'serviceKey', None):
                defaultDelegation['server_key'] = self.config.serviceKey
            try:
                defaultDelegation['userDN'] = userDN
                defaultDelegation['group'] = group if group else ''
                defaultDelegation['role'] = role if role else ''
                self.logger.debug('delegation: %s' % defaultDelegation)
                valid_proxy, user_proxy = getProxy(defaultDelegation, self.logger)
            except Exception as ex:
                msg = "Error getting the user proxy"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                transfers.task_done()
                time.sleep(1)
                continue

            # TODO: take server from db, right now, take only the first of the list and assuming it valid for all
            try:
                # TODO: debug u added during info upload. To be fixed soon! For now worked around
                fts_server = transfer_list[0]['fts_instance'].split('u')[1]
                self.logger.info("Delegating proxy to %s" % fts_server)
                context = fts3.Context(fts_server, user_proxy, user_proxy, verify=True)
                self.logger.debug(fts3.delegate(context, lifetime=timedelta(hours=48), force=False))

                self.logger.info("Proxy delegated. Grouping files by jobId")
                jobs = {}
                for fileToKill in transfer_list:
                    # TODO: debug u added during info upload. To be fixed soon! For now worked around
                    jid = str(fileToKill['fts_id']).split('u')[1]
                    if jid not in jobs:
                        jobs[jid] = []
                    jobs[jid].append(fileToKill)

                self.logger.info("Found %s jobIds", len(jobs.keys()))
                self.logger.debug("jobIds: %s", jobs.keys)

                # lists for files that were killed and for those it was too late to kill
                killed = []
                too_late = []
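                # cancel job by job: map each destination LFN back to its FTS file id via the
                # job's file listing, then issue fts3.cancel for those ids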

                for ftsJobId, files in jobs.iteritems():
                    self.logger.info("Cancelling tranfers in %s" % ftsJobId)

                    ref_lfns = [str(x['destination_lfn'].split('/store/')[1]) for x in files]
                    source_lfns = [x['source_lfn'] for x in files]

                    job_list = fts3.get_job_status(context, ftsJobId, list_files=True)
                    tx = job_list['files']

                    # TODO: this workaround is needed to get FTS file id, we may want to add a column in the db?
                    idListToKill = [x['file_id'] for x in tx
                                    if x['dest_surl'].split('/cms/store/')[1] in ref_lfns]

                    # needed for the state update
                    lfnListToKill = [ref_lfns.index(str(x['dest_surl'].split('/cms/store/')[1])) for x in tx
                                       if x['dest_surl'].split('/cms/store/')[1] in ref_lfns]

                    self.logger.debug("List of ids to cancel for job %s: %s" % (ftsJobId, idListToKill))
                    res = fts3.cancel(context, ftsJobId, idListToKill)
                    self.logger.debug('Kill command result: %s' % json.dumps(res))

                    if not isinstance(res, list):
                        res = [res]

                    # Verify if the kill command succeeded
                    for k, kill_res in enumerate(res):
                        indexToUpdate = lfnListToKill[k]
                        if kill_res in ("FINISHEDDIRTY", "FINISHED", "FAILED"):
                            self.logger.debug(source_lfns[indexToUpdate])
                            too_late.append(getHashLfn(source_lfns[indexToUpdate]))
                        else:
                            killed.append(getHashLfn(source_lfns[indexToUpdate]))

                # TODO: decide how to update status for too_late files
                killed += too_late
                self.logger.debug('Updating status of killed files: %s' % killed)

                if len(killed) > 0:
                    data = dict()
                    data['asoworker'] = self.config.asoworker
                    data['subresource'] = 'updateTransfers'
                    data['list_of_ids'] = killed
                    data['list_of_transfer_state'] = ["KILLED" for _ in killed]
                    self.oracleDB.post(self.config.oracleFileTrans,
                                       data=encodeRequest(data))
                    self.logger.debug("Marked killed %s" % killed)
            except:
                # TODO: split and improve try/except
                self.logger.exception('Kill command failed')

            transfers.task_done()

    def processRetries(self, files):
        """
        _processRetries_

        Actually does the dirty work of figuring out what to do with jobs
        """
        if len(files) < 1:
            # We got no files?
            return

        propList = []
        fileList = self.loadFilesFromList(recList=files)
        logging.debug("Files in cooloff %s" % fileList)
        # Now we should have the files
        propList = self.selectFilesToRetry(fileList)
        logging.debug("Files to retry %s" % propList)
        now = str(datetime.datetime.now())
        for file in propList:
            # update couch
            self.logger.debug("Trying to resubmit %s" % file['id'])
            try:
                document = self.db.document(file['id'])
            except Exception as ex:
                msg = "Error loading document from couch"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                continue
            if document['state'] != 'killed':
                data = dict()
                data['state'] = 'new'
                data['last_update'] = time.time()
                data['retry'] = now
                updateUri = "/" + self.db.name + "/_design/AsyncTransfer/_update/updateJobs/" + file['id']
                updateUri += "?" + urllib.urlencode(data)
                try:
                    self.db.makeRequest(uri=updateUri, type="PUT", decode=False)
                except Exception as ex:
                    msg = "Error updating document in couch"
                    msg += str(ex)
                    msg += str(traceback.format_exc())
                    self.logger.error(msg)
                    continue
                self.logger.debug("%s resubmitted" % file['id'])
            else:
                continue
        return

    def loadFilesFromList(self, recList):
        """
        _loadFilesFromList_

        Load jobs in bulk
        """
        all_files = []
        index = 0
        for record in recList:
            all_files.append({})
            all_files[index]['id'] = record['key']
            all_files[index]['state_time'] = record['value']
            index += 1
        return all_files

    def selectFilesToRetry(self, fileList):
        """
        _selectFilesToRetry_

        Select files to retry
        """
        result = []

        if len(fileList) == 0:
            return result
        for file in fileList:
            logging.debug("Current file %s" %file)
            try:
                if self.plugin.isReady(file=file, cooloffTime=self.cooloffTime):
                    result.append(file)
            except Exception as ex:
                msg = "Exception while checking for cooloff timeout for file %s\n" % file
                msg += str(ex)
                logging.error(msg)
                logging.debug("File: %s\n" % file)
                raise RetryManagerException(msg)

        return result

    def doRetries(self):
        """
        Queries DB for all watched filesets, if matching filesets become
        available, create the subscriptions
        """
        # Discover files that are in cooloff
        query = {'stale': 'ok'}
        try:
            files = self.db.loadView('AsyncTransfer', 'getFilesToRetry', query)['rows']
        except Exception as e:
            self.logger.exception('A problem occurred when contacting couchDB to retrieve LFNs: %s' % e)
            return
        logging.info("Found %s files in cooloff" % len(files))
        self.processRetries(files)
コード例 #42
0
class MasterWorker(object):
    """I am the master of the TaskWorker"""
    def __init__(self,
                 config,
                 logWarning,
                 logDebug,
                 sequential=False,
                 console=False,
                 name='master'):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool logWarning: it tells if a quiet logger is needed
        :arg bool logDebug: it tells if needs a verbose logger
        :arg bool sequential: it tells if to run in sequential (no subprocesses) mode.
        :arg bool console: it tells if to log to console.
        :arg string name: defines a name for the log of this master process"""
        def createLogdir(dirname):
            """ Create the directory dirname ignoring errors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17:  #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" %
                          dirname)
                    sys.exit(1)

        def createAndCleanLogDirectories(logsDir):
            # it can be named with the time stamp a TW started
            createLogdir(logsDir)
            createLogdir(logsDir + '/tasks')
            currentProcessesDir = logsDir + '/processes/'
            createLogdir(currentProcessesDir)
            # when running inside a container process logs will start with same
            # process numbers, i.e. same name, at any container restart.
            # to avoid clashes and confusion, we will put away all previous processes
            # logs when a TW instance starts. To this goal each TW which runs
            # creates a directory where new containers will move its logs, so
            # identify LastLogs_timestamp directory
            latestLogDir = None  # the logs directory could be empty
            files = os.listdir(currentProcessesDir)
            files.sort(
                reverse=True
            )  # if there are multiple Latest*, will hit the latest first
            for f in files:
                if f.startswith('Latest'):
                    latestLogDir = currentProcessesDir + f
                    break
            if files and latestLogDir:
                # rename from Latest to Old
                oldLogsDir = latestLogDir.replace('Latest', 'Old')
                shutil.move(latestLogDir, oldLogsDir)
            else:
                print(
                    "LatestLogDir not found in logs/processes, create a dummy dir to store old files"
                )
                oldLogsDir = currentProcessesDir + 'OldLog-Unknown'
                createLogdir(oldLogsDir)
            # move process logs for latest TW run to old directory
            for f in files:
                if f.startswith('proc.c3id'):
                    shutil.move(currentProcessesDir + f, oldLogsDir)

            # create a new LatestLogs directory to store logs from this TaskWorker
            YYMMDD_HHMMSS = time.strftime('%y%m%d_%H%M%S', time.localtime())
            myDir = currentProcessesDir + 'LatestLogs-' + YYMMDD_HHMMSS
            createLogdir(myDir)

        def setRootLogger(logWarning, logDebug, console, name):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logsDir/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool logWarning: it tells if a quiet logger is needed
            :arg bool logDebug: it tells if needs a verbose logger
            :arg bool console: it tells if to log to console
            :arg string name: define a name for the log file of this master process
            :return logger: a logger with the appropriate logger level."""

            # this must only be done for the real Master, not when it is used by TapeRecallStatus
            logsDir = config.TaskWorker.logsDir
            if name == 'master':
                createAndCleanLogDirectories(logsDir)

            if console:
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog(logsDir + '/twlog.txt',
                                                when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if logWarning:
                loglevel = logging.WARNING
            if logDebug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger(name, logsDir)
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger

        self.STOP = False
        self.TEST = sequential
        self.logger = setRootLogger(logWarning, logDebug, console, name)
        self.config = config
        self.restHost = None
        dbInstance = None

        try:
            instance = self.config.TaskWorker.instance
        except:
            msg = "No instance provided: need to specify config.TaskWorker.instance in the configuration"
            raise ConfigException(msg)

        if instance in SERVICE_INSTANCES:
            self.logger.info('Will connect to CRAB service: %s', instance)
            self.restHost = SERVICE_INSTANCES[instance]['restHost']
            dbInstance = SERVICE_INSTANCES[instance]['dbInstance']
        else:
            msg = "Invalid instance value '%s'" % instance
            raise ConfigException(msg)
        if instance == 'other':
            self.logger.info(
                'Will use restHost and dbInstance from config file')
            try:
                self.restHost = self.config.TaskWorker.restHost
                dbInstance = self.config.TaskWorker.dbInstance
            except:
                msg = "Need to specify config.TaskWorker.restHost and dbInstance in the configuration"
                raise ConfigException(msg)
        self.restURInoAPI = '/crabserver/' + dbInstance

        self.logger.info('Will connect via URL: https://%s/%s', self.restHost,
                         self.restURInoAPI)

        #Let's increase the server's retries for recoverable errors in the MasterWorker
        #retry=20 means we'll keep retrying for about 1 hour (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
        self.server = HTTPRequests(self.restHost,
                                   self.config.TaskWorker.cmscert,
                                   self.config.TaskWorker.cmskey,
                                   retry=20,
                                   logger=self.logger)
        self.logger.debug("Hostcert: %s, hostkey: %s",
                          str(self.config.TaskWorker.cmscert),
                          str(self.config.TaskWorker.cmskey))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [
                retry * 20 * 2
                for retry in range(self.config.TaskWorker.max_retry)
            ]
        if not len(self.config.TaskWorker.retry_interval
                   ) == self.config.TaskWorker.max_retry:
            raise ConfigException(
                "No correct max_retry and retry_interval specified; len of retry_interval must be equal to max_retry."
            )
        # use the config to pass some useful global stuff to all workers
        # will use TaskWorker.cmscert/key to talk with CMSWEB
        self.config.TaskWorker.envForCMSWEB = newX509env(
            X509_USER_CERT=self.config.TaskWorker.cmscert,
            X509_USER_KEY=self.config.TaskWorker.cmskey)

        if self.TEST:
            self.slaves = TestWorker(self.config, self.restHost,
                                     self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, self.restHost,
                                 self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker,
                                        'recurringActions', [])
        self.recurringActions = [
            self.getRecurringActionInst(name) for name in recurringActionsNames
        ]

    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName,
                         fromlist=actionName)
        return getattr(mod, actionName)(self.config.TaskWorker.logsDir)

    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not available.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""

        configreq = {
            'subresource': 'process',
            'workername': self.config.TaskWorker.name,
            'getstatus': getstatus,
            'limit': limit,
            'status': setstatus
        }
        try:
            self.server.post(self.restURInoAPI + '/workflowdb',
                             data=urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during _lockWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
            return False
        except Exception:  #pylint: disable=broad-except
            self.logger.exception(
                "Server could not process the _lockWork request (prameters are %s)",
                configreq)
            return False

        return True

    def getWork(self, limit, getstatus, ignoreTWName=False):
        """ Query the workflowdb for at most `limit` tasks in status `getstatus`; when
            ignoreTWName is True the worker-name filter is set to the wildcard '%' so that
            tasks of any TaskWorker are returned.
        """
        configreq = {
            'limit': limit,
            'workername': self.config.TaskWorker.name,
            'getstatus': getstatus
        }
        if ignoreTWName:
            configreq['workername'] = '%'

        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb',
                                          data=configreq)[0]['result']
        except HTTPException as hte:
            msg = "HTTP Error during getWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception:  #pylint: disable=broad-except
            self.logger.exception(
                "Server could not process the getWork request (prameters are %s)",
                configreq)
        return pendingwork

    def quit_(self, dummyCode, dummyTraceback):
        self.logger.info(
            "Received kill request. Setting STOP flag in the master process..."
        )
        self.STOP = True

    def updateWork(self, taskname, command, status):
        """ Update taskname setting the status and the command for it
            Return True if the change succeeded, False otherwise
        """

        configreq = {
            'workflow': taskname,
            'command': command,
            'status': status,
            'subresource': 'state'
        }
        try:
            self.server.post(self.restURInoAPI + '/workflowdb',
                             data=urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during updateWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception:  #pylint: disable=broad-except
            self.logger.exception(
                "Server could not process the updateWork request (prameters are %s)",
                configreq)
        else:
            return True  #success
        return False  #failure

    def failQueuedTasks(self):
        """ This method is used at the TW startup and it fails QUEUED tasks that supposedly
            could not communicate with the REST and update their status. The method put those
            task to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending on the value of
            the command field.
        """
        limit = self.slaves.nworkers * 2
        total = 0
        while True:
            pendingwork = self.getWork(limit=limit, getstatus='QUEUED')
            for task in pendingwork:
                self.logger.debug("Failing QUEUED task %s",
                                  task['tm_taskname'])
                if task['tm_task_command']:
                    dummyWorktype, failstatus = STATE_ACTIONS_MAP[
                        task['tm_task_command']]
                else:
                    failstatus = 'FAILED'
                self.updateWork(task['tm_taskname'], task['tm_task_command'],
                                failstatus)
            if not len(pendingwork):
                self.logger.info("Finished failing QUEUED tasks (total %s)",
                                 total)
                break  #too bad "do..while" does not exist in python...
            else:
                total += len(pendingwork)
                self.logger.info(
                    "Failed %s tasks (limit %s), getting next chunk of tasks",
                    len(pendingwork), limit)

    def failBannedTask(self, task):
        """ This method is used at the TW startup and it fails NEW tasks which I do not like
        The method put those task to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending on the value of
         the command field.
        Initial implementation bans based on a list of usernames, other task attributes can
          be checked if needed by adding a bit of code
        Returns:
            True : if the task was declared bad and was failed
            False: for normal (good) tasks
        """
        bannedUsernames = getattr(self.config.TaskWorker, 'bannedUsernames',
                                  [])
        if task['tm_username'] in bannedUsernames:
            self.logger.debug("Forcefully failing task %s",
                              task['tm_taskname'])
            if task['tm_task_command']:
                dummyWorktype, failstatus = STATE_ACTIONS_MAP[
                    task['tm_task_command']]
            else:
                failstatus = 'FAILED'
            self.updateWork(task['tm_taskname'], task['tm_task_command'],
                            failstatus)
            # TODO look into logging a message for the user
            return True
        return False

    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""

        self.logger.debug("Failing QUEUED tasks before startup.")
        self.failQueuedTasks()
        self.logger.debug("Master Worker Starting Main Cycle.")
        while not self.STOP:
            limit = self.slaves.queueableTasks()
            if not self._lockWork(
                    limit=limit, getstatus='NEW', setstatus='HOLDING'):
                time.sleep(self.config.TaskWorker.polling)
                continue

            pendingwork = self.getWork(limit=limit, getstatus='HOLDING')

            if len(pendingwork) > 0:
                self.logger.info("Retrieved a total of %d works",
                                 len(pendingwork))
                self.logger.debug("Retrieved the following works: \n%s",
                                  str(pendingwork))

            toInject = []
            for task in pendingwork:
                if self.failBannedTask(task):
                    continue
                if self.updateWork(task['tm_taskname'],
                                   task['tm_task_command'], 'QUEUED'):
                    worktype, failstatus = STATE_ACTIONS_MAP[
                        task['tm_task_command']]
                    toInject.append((worktype, task, failstatus, None))
                else:
                    #The task stays in HOLDING and will be acquired again later
                    self.logger.info(
                        "Skipping %s since it could not be updated to QUEUED. Will be retried in the next iteration",
                        task['tm_taskname'])

            self.slaves.injectWorks(toInject)

            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s",
                                      (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {
                        'tm_username': '******',
                        'tm_taskname': action.__module__
                    }, 'FAILED', action.__module__)])

            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d', self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d',
                             self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d',
                             self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)

            dummyFinished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle.")
コード例 #43
0
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger
        :arg bool test: it tells if to run in test (no subprocesses) mode."""


        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17: #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" % dirname)
                    sys.exit(1)


        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""

            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')

            if self.TEST:
                #if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s,%(lineno)d:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s.", os.getpid())
            logger.debug("Logging level initialized to %s.", loglevel)
            return logger


        self.STOP = False
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
        #Let's increase the server's retries for recoverable errors in the MasterWorker
        #60 means we'll keep retrying for 1 hour basically (we retry at 20*NUMRETRY seconds, so at: 20s, 60s, 120s, 200s, 300s ...)
        self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, retry = 20,
                                   logger = self.logger)
        self.logger.debug("Hostcert: %s, hostkey: %s", str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if len(self.config.TaskWorker.retry_interval) != self.config.TaskWorker.max_retry:
            raise ConfigException("Inconsistent max_retry and retry_interval: len(retry_interval) must equal max_retry.")
        # use the config to pass some useful global stuff to all workers
        # will use TaskWorker.cmscert/key to talk with CMSWEB
        self.config.TaskWorker.envForCMSWEB = newX509env(X509_USER_CERT = self.config.TaskWorker.cmscert,
                                                         X509_USER_KEY  = self.config.TaskWorker.cmskey)

        if self.TEST:
            self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]


    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()


    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not avaialable.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""

        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during _lockWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
            return False
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the _lockWork request (prameters are %s)", configreq)
            return False

        return True


    def getWork(self, limit, getstatus):
        configreq = {'limit': limit, 'workername': self.config.TaskWorker.name, 'getstatus': getstatus}
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data = configreq)[0]['result']
        except HTTPException as hte:
            msg = "HTTP Error during getWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the getWork request (prameters are %s)", configreq)
        return pendingwork


    def quit_(self, dummyCode, dummyTraceback):
        self.logger.info("Received kill request. Setting STOP flag in the master process...")
        self.STOP = True


    def updateWork(self, taskname, command, status):
        """ Update taskname setting the status and the command for it
            Return True if the change succeeded, False otherwise
        """

        configreq = {'workflow': taskname, 'command': command, 'status': status, 'subresource': 'state'}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            msg = "HTTP Error during updateWork: %s\n" % str(hte)
            msg += "HTTP Headers are %s: " % hte.headers
            self.logger.error(msg)
        except Exception: #pylint: disable=broad-except
            self.logger.exception("Server could not process the updateWork request (prameters are %s)", configreq)
        else:
            return True #success
        return False #failure


    def failQueuedTasks(self):
        """ This method is used at the TW startup and it fails QUEUED tasks that supposedly
            could not communicate with the REST and update their status. The method put those
            task to SUBMITFAILED, KILLFAILED, RESUBMITFAILED depending on the value of
            the command field.
        """
        limit = self.slaves.nworkers * 2
        total = 0
        while True:
            pendingwork = self.getWork(limit=limit, getstatus='QUEUED')
            for task in pendingwork:
                self.logger.debug("Failing QUEUED task %s", task['tm_taskname'])
                if task['tm_task_command']:
                    dummyWorktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                else:
                    failstatus = 'FAILED'
                self.updateWork(task['tm_taskname'], task['tm_task_command'], failstatus)
            if not len(pendingwork):
                self.logger.info("Finished failing QUEUED tasks (total %s)", total)
                break #too bad "do..while" does not exist in python...
            else:
                total += len(pendingwork)
                self.logger.info("Failed %s tasks (limit %s), getting next chunk of tasks", len(pendingwork), limit)


    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distributing it to the slave processes."""

        self.logger.debug("Failing QUEUED tasks before startup.")
        self.failQueuedTasks()
        self.logger.debug("Master Worker Starting Main Cycle.")
        while(not self.STOP):
            limit = self.slaves.queueableTasks()
            if not self._lockWork(limit=limit, getstatus='NEW', setstatus='HOLDING'):
                time.sleep(self.config.TaskWorker.polling)
                continue

            pendingwork = self.getWork(limit=limit, getstatus='HOLDING')

            if len(pendingwork) > 0:
                self.logger.info("Retrieved a total of %d works", len(pendingwork))
                self.logger.debug("Retrieved the following works: \n%s", str(pendingwork))

            toInject = []
            for task in pendingwork:
                if self.updateWork(task['tm_taskname'], task['tm_task_command'], 'QUEUED'):
                    worktype, failstatus = STATE_ACTIONS_MAP[task['tm_task_command']]
                    toInject.append((worktype, task, failstatus, None))
                else:
                    #The task stays in HOLDING and will be acquired again later
                    self.logger.info("Skipping %s since it could not be updated to QUEUED. Will be retried in the next iteration", task['tm_taskname'])

            self.slaves.injectWorks(toInject)

            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s", (str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {'tm_username': '******', 'tm_taskname' : action.__module__}, 'FAILED', action.__module__)])

            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d', self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d', self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d', self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)

            dummyFinished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle.")
コード例 #44
0
class update(object):

    def __init__(self, logger, config):
        """
        Initialize connection to the db and logging/config

        :param logger: pass the logging
        :param config: refer to the configuration file
        """
        self.oracleDB = HTTPRequests(config.oracleDB,
                                     config.opsProxy,
                                     config.opsProxy)

        self.config = config
        self.logger = logger

    def retry(self):
        """
        Retry documents older than self.config.cooloffTime
        :return:
        """
        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'retryTransfers'
        fileDoc['time_to'] = self.config.cooloffTime
        self.logger.debug('fileDoc: %s' % fileDoc)

        results = dict()
        try:
            results = self.oracleDB.post(self.config.oracleFileTrans,
                                         data=encodeRequest(fileDoc))
        except Exception:
            self.logger.exception("Failed to get retry transfers in oracleDB: %s")
        self.logger.info("Retried files in cooloff: %s" % str(results))

        return 0

    def acquire(self):
        """
        Get a number (1k for current oracle rest) of documents and bind them to this aso
        NEW -> ACQUIRED (asoworker NULL -> config.asoworker)
        :return:
        """

        self.logger.info('Retrieving users...')
        fileDoc = dict()
        fileDoc['subresource'] = 'activeUsers'
        fileDoc['grouping'] = 0
        fileDoc['asoworker'] = self.config.asoworker

        try:
            result = self.oracleDB.get(self.config.oracleFileTrans,
                                       data=encodeRequest(fileDoc))
        except Exception as ex:
            self.logger.error("Failed to acquire transfers \
                              from oracleDB: %s" % ex)
            return 1

        users = list()
        try:
            docs = oracleOutputMapping(result)
            users = [[x['username'], x['user_group'], x['user_role']] for x in docs]
            self.logger.info('Users to process: %s' % str(users))
        except Exception:  #pylint: disable=broad-except
            self.logger.exception('User data is malformed.')

        for user in users:
            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'acquireTransfers'
            fileDoc['username'] = user[0]

            self.logger.debug("Retrieving transfers from oracleDB for user: %s " % user)

            try:
                self.oracleDB.post(self.config.oracleFileTrans,
                                   data=encodeRequest(fileDoc))
            except Exception as ex:
                self.logger.error("Failed to acquire transfers \
                                  from oracleDB: %s" % ex)

        return users

    def getAcquired(self, users):
        """
        Get a number of documents to be submitted (in ACQUIRED status) and return results of the query for logs
        :return:
        """
        documents = list()

        for user in users:
            username = user[0]
            group = user[1]
            role = user[2]

            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'acquiredTransfers'
            fileDoc['grouping'] = 1
            fileDoc['username'] = username
            if group == '':
                group = None
            if role == '':
                role = None
            fileDoc['vogroup'] = group
            fileDoc['vorole'] = role

            self.logger.debug("Retrieving users from oracleDB")

            try:
                results = self.oracleDB.get(self.config.oracleFileTrans,
                                            data=encodeRequest(fileDoc))
                documents += oracleOutputMapping(results)
            except Exception as ex:
                self.logger.error("Failed to get acquired transfers \
                                  from oracleDB: %s" % ex)

        return documents

    def submitted(self, files):
        """
        Mark the list of files as submitted once the FTS submission succeeded
        ACQUIRED -> SUBMITTED
        Return the lfns updated successfully and report data for dashboard
        :param files: tuple (source_lfn, dest_lfn)
        :return:
        """
        lfn_in_transfer = []
        dash_rep = ()
        id_list = list()
        docId = ''
        for lfn in files:
            lfn = lfn[0]
            if lfn.find('temp') == 7:
                self.logger.debug("Marking acquired %s" % lfn)
                docId = getHashLfn(lfn)
                self.logger.debug("Marking acquired %s" % docId)
                try:
                    id_list.append(docId)
                    lfn_in_transfer.append(lfn)
                except Exception as ex:
                    self.logger.error("Error getting id: %s" % ex)
                    raise

            lfn_in_transfer.append(lfn)
            # TODO: add dashboard stuff
            # dash_rep = (document['jobid'], document['job_retry_count'], document['taskname'])
        try:
            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'updateTransfers'
            fileDoc['list_of_ids'] = id_list
            fileDoc['list_of_transfer_state'] = ["SUBMITTED" for x in id_list]

            self.oracleDB.post(self.config.oracleFileTrans,
                               data=encodeRequest(fileDoc))
            self.logger.debug("Marked acquired %s" % (id_list))
        except Exception as ex:
            self.logger.error("Error during status update: %s" % ex)
        return lfn_in_transfer, dash_rep

    def transferred(self, files):
        """
        Mark the list of files as transferred
        """
        good_ids = list()
        updated_lfn = list()
        try:
            for lfn in files:
                lfn = lfn[0]
                if lfn.find('temp') == 7:
                    docId = getHashLfn(lfn)
                    good_ids.append(docId)
                    updated_lfn.append(lfn)
                    self.logger.debug("Marking done %s" % lfn)
                    self.logger.debug("Marking done %s" % docId)

            data = dict()
            data['asoworker'] = self.config.asoworker
            data['subresource'] = 'updateTransfers'
            data['list_of_ids'] = good_ids
            data['list_of_transfer_state'] = ["DONE" for x in good_ids]
            self.oracleDB.post(self.config.oracleFileTrans,
                               data=encodeRequest(data))
            self.logger.debug("Marked good %s" % good_ids)
        except Exception:
            self.logger.exception("Error updating documents")
            return 1
        return 0

    def failed(self, files, failures_reasons=[], max_retry=3, force_fail=False, submission_error=False):
        """

        :param files: tuple (source_lfn, dest_lfn)
        :param failures_reasons: list(str) with reasons of failure
        :param max_retry: number of retry before giving up
        :param force_fail: flag for triggering failure without retry
        :param submission_error: error during fts submission
        :return:
        """
        updated_lfn = []
        for Lfn in files:
            lfn = Lfn[0]
            # Load document and get the retry_count
            docId = getHashLfn(lfn)
            self.logger.debug("Marking failed %s" % docId)
            try:
                docbyId = self.oracleDB.get(self.config.oracleUserFileTrans.replace('filetransfer','fileusertransfers'),
                                            data=encodeRequest({'subresource': 'getById', 'id': docId}))
                document = oracleOutputMapping(docbyId, None)[0]
                self.logger.debug("Document: %s" % document)
            except Exception as ex:
                self.logger.error("Error updating failed docs: %s" % ex)
                return 1

            fileDoc = dict()
            fileDoc['asoworker'] = self.config.asoworker
            fileDoc['subresource'] = 'updateTransfers'
            fileDoc['list_of_ids'] = docId
            if not len(failures_reasons) == 0:
                try:
                    fileDoc['list_of_failure_reason'] = failures_reasons[files.index(Lfn)]
                except Exception:
                    fileDoc['list_of_failure_reason'] = "unexpected error, missing reasons"
                    self.logger.exception("missing reasons")

            if force_fail or document['transfer_retry_count'] + 1 > max_retry:
                fileDoc['list_of_transfer_state'] = 'FAILED'
                fileDoc['list_of_retry_value'] = 1
            else:
                fileDoc['list_of_transfer_state'] = 'RETRY'

            if submission_error:
                fileDoc['list_of_failure_reason'] = "Job could not be submitted to FTS: temporary problem of FTS"
                fileDoc['list_of_retry_value'] = 1
            else:
                fileDoc['list_of_retry_value'] = 1

            self.logger.debug("update: %s" % fileDoc)
            try:
                updated_lfn.append(docId)
                self.oracleDB.post(self.config.oracleFileTrans,
                                   data=encodeRequest(fileDoc))
            except Exception:
                self.logger.exception('ERROR updating failed documents')
                return 1
        self.logger.debug("failed file updated")
        return 0

    def acquirePub(self):
        """

        :return:
        """
        fileDoc = dict()
        fileDoc['asoworker'] = self.config.asoworker
        fileDoc['subresource'] = 'acquirePublication'

        self.logger.debug("Retrieving publications from oracleDB")

        try:
            self.oracleDB.post(self.config.oracleFileTrans,
                    data=encodeRequest(fileDoc))
        except Exception as ex:
            self.logger.error("Failed to acquire publications \
                              from oracleDB: %s" % ex)

    def getPub(self):
        """

        :return:
        """
        to_pub_docs = list()
        filedoc = dict()
        filedoc['asoworker'] = self.config.asoworker
        filedoc['subresource'] = 'acquiredPublication'
        filedoc['grouping'] = 0

        try:
            results = self.oracleDB.get(self.config.oracleFileTrans,
                                        data=encodeRequest(filedoc))
            to_pub_docs = oracleOutputMapping(results)
        except Exception as ex:
            self.logger.error("Failed to get acquired publications \
                              from oracleDB: %s" % ex)
            return to_pub_docs

        return to_pub_docs

    def pubDone(self, workflow, files):
        """

        :param files:
        :param workflow:
        :return:
        """
        wfnamemsg = "%s: " % workflow
        data = dict()
        id_list = list()
        for lfn in files:
            source_lfn = lfn
            docId = getHashLfn(source_lfn)
            id_list.append(docId)
            msg = "Marking file %s as published." % lfn
            msg += " Document id: %s (source LFN: %s)." % (docId, source_lfn)
            self.logger.info(wfnamemsg + msg)
        data['asoworker'] = self.config.asoworker
        data['subresource'] = 'updatePublication'
        data['list_of_ids'] = id_list
        data['list_of_publication_state'] = ['DONE' for x in id_list]
        try:
            self.oracleDB.post(self.config.oracleFileTrans,
                               data=encodeRequest(data))
            self.logger.debug("updated done: %s " % id_list)
        except Exception as ex:
            self.logger.error("Error during status update for published docs: %s" % ex)

    def pubFailed(self, task, files, failure_reasons=list(), force_failure=False):
        """

        :param files:
        :param failure_reasons:
        :return:
        """
        id_list = list()
        for Lfn in files:
            source_lfn = Lfn[0]
            docId = getHashLfn(source_lfn)
            id_list.append(docId)
            self.logger.debug("Marking failed %s" % docId)

        fileDoc = dict()
        fileDoc['asoworker'] = 'asodciangot1'
        fileDoc['subresource'] = 'updatePublication'
        fileDoc['list_of_ids'] = id_list
        fileDoc['list_of_publication_state'] = ['FAILED' for x in id_list]


        # TODO: implement retry, publish_retry_count missing from input?

        fileDoc['list_of_retry_value'] = [1 for x in id_list]
        fileDoc['list_of_failure_reason'] = failure_reasons

        try:
            self.oracleDB.post(self.config.oracleFileTrans,
                                data=encodeRequest(fileDoc))
            self.logger.debug("updated failed: %s " % id_list)
        except Exception:
            msg = "Error updating failed documents"
            self.logger.exception(msg)

    def lastPubTime(self, workflow):
        """

        :param workflow:
        :return:
        """
        data = dict()
        data['workflow'] = workflow
        data['subresource'] = 'updatepublicationtime'
        try:
            result = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'task'),
                                       data=encodeRequest(data))
            self.logger.debug("%s last publication type update: %s " % (workflow, str(result)))
        except Exception:
            msg = "Error updating last publication time"
            self.logger.exception(msg)

    def searchTask(self, workflow):
        """

        :param workflow:
        :return:
        """
        data = dict()
        data['workflow'] = workflow
        data['subresource'] = 'search'
        try:
            result = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'task'),
                                       data=encodeRequest(data))
            self.logger.debug("task: %s " % str(result[0]))
            self.logger.debug("task: %s " % getColumn(result[0], 'tm_last_publication'))
        except Exception as ex:
            self.logger.error("Error during task doc retrieving: %s" % ex)
            return {}

        return oracleOutputMapping(result)
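
Several methods of the update class above (submitted, transferred) gate their work on lfn.find('temp') == 7. Assuming the usual /store/temp/... layout for not-yet-merged user files, this simply tests whether the LFN starts with that temporary prefix, since 'temp' begins at character index 7 of '/store/temp/'. A tiny illustration with made-up LFNs:

lfns = [
    '/store/temp/user/jdoe/output_1.root',  # 'temp' starts at index 7 -> treated as a temp LFN
    '/store/user/jdoe/output_1.root',       # no 'temp' at index 7 -> skipped
]
for lfn in lfns:
    print('%s -> %s' % (lfn, lfn.find('temp') == 7))
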
コード例 #45
0
class MasterWorker(object):
    """I am the master of the TaskWorker"""

    def __init__(self, config, quiet, debug, test=False):
        """Initializer

        :arg WMCore.Configuration config: input TaskWorker configuration
        :arg logging logger: the logger
        :arg bool quiet: it tells if a quiet logger is needed
        :arg bool debug: it tells if needs a verbose logger."""


        def createLogdir(dirname):
            """ Create the directory dirname ignoring erors in case it exists. Exit if
                the directory cannot be created.
            """
            try:
                os.mkdir(dirname)
            except OSError as ose:
                if ose.errno != 17: #ignore the "Directory already exists error"
                    print(str(ose))
                    print("The task worker need to access the '%s' directory" % dirname)
                    sys.exit(1)


        def setRootLogger(quiet, debug):
            """Sets the root logger with the desired verbosity level
               The root logger logs to logs/twlog.txt and every single
               logging instruction is propagated to it (not really nice
               to read)

            :arg bool quiet: it tells if a quiet logger is needed
            :arg bool debug: it tells if needs a verbose logger
            :return logger: a logger with the appropriate logger level."""

            createLogdir('logs')
            createLogdir('logs/processes')
            createLogdir('logs/tasks')

            if self.TEST:
                #if we are testing log to the console is easier
                logging.getLogger().addHandler(logging.StreamHandler())
            else:
                logHandler = MultiProcessingLog('logs/twlog.txt', when='midnight')
                logFormatter = \
                    logging.Formatter("%(asctime)s:%(levelname)s:%(module)s:%(message)s")
                logHandler.setFormatter(logFormatter)
                logging.getLogger().addHandler(logHandler)
            loglevel = logging.INFO
            if quiet:
                loglevel = logging.WARNING
            if debug:
                loglevel = logging.DEBUG
            logging.getLogger().setLevel(loglevel)
            logger = setProcessLogger("master")
            logger.debug("PID %s." % os.getpid())
            logger.debug("Logging level initialized to %s." % loglevel)
            return logger

        self.STOP = False
        self.TEST = test
        self.logger = setRootLogger(quiet, debug)
        self.config = config
        resthost = None
        self.restURInoAPI = None
        if not self.config.TaskWorker.mode in MODEURL.keys():
            raise ConfigException("No mode provided: need to specify config.TaskWorker.mode in the configuration")
        elif MODEURL[self.config.TaskWorker.mode]['host'] is not None:
            resthost = MODEURL[self.config.TaskWorker.mode]['host']
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        else:
            resthost = self.config.TaskWorker.resturl #this should be called resthost in the TaskWorkerConfig -_-
            self.restURInoAPI = '/crabserver/' + MODEURL[self.config.TaskWorker.mode]['instance']
        if resthost is None:
            raise ConfigException("No correct mode provided: need to specify config.TaskWorker.mode in the configuration")
        self.server = HTTPRequests(resthost, self.config.TaskWorker.cmscert, self.config.TaskWorker.cmskey, retry = 2)
        self.logger.debug("Hostcert: %s, hostkey: %s" %(str(self.config.TaskWorker.cmscert), str(self.config.TaskWorker.cmskey)))
        # Retries for any failures
        if not hasattr(self.config.TaskWorker, 'max_retry'):
            self.config.TaskWorker.max_retry = 0
        if not hasattr(self.config.TaskWorker, 'retry_interval'):
            self.config.TaskWorker.retry_interval = [retry*20*2 for retry in range(self.config.TaskWorker.max_retry)]
        if len(self.config.TaskWorker.retry_interval) != self.config.TaskWorker.max_retry:
            raise ConfigException("Inconsistent max_retry and retry_interval: len(retry_interval) must equal max_retry.")
        if self.TEST:
            self.slaves = TestWorker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        else:
            self.slaves = Worker(self.config, resthost, self.restURInoAPI + '/workflowdb')
        self.slaves.begin()
        recurringActionsNames = getattr(self.config.TaskWorker, 'recurringActions', [])
        self.recurringActions = [self.getRecurringActionInst(name) for name in recurringActionsNames]


    def getRecurringActionInst(self, actionName):
        mod = __import__('TaskWorker.Actions.Recurring.%s' % actionName, fromlist=actionName)
        return getattr(mod, actionName)()


    def _lockWork(self, limit, getstatus, setstatus):
        """Today this is always returning true, because we do not want the worker to die if
           the server endpoint is not avaialable.
           Prints a log entry if answer is greater than 400:
            * the server call succeeded or
            * the server could not find anything to update or
            * the server has an internal error"""
        configreq = {'subresource': 'process', 'workername': self.config.TaskWorker.name, 'getstatus': getstatus, 'limit': limit, 'status': setstatus}
        try:
            self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
        except HTTPException as hte:
            #Using a msg variable and only one self.logger.error so that messages do not get shuffled
            msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (str(hte), configreq)
            if not hte.headers.get('X-Error-Detail', '') == 'Required object is missing' or \
               not hte.headers.get('X-Error-Http', -1) == '400':
                msg += "Task Worker could not update work to the server: \n" +\
                                  "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown')) +\
                                  "\treason: %s\n" %(hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "Probably no task to be updated\n"
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                msg += "TW could not update work to the server:\n"
                msg += "%s \n" %(str(traceback.format_exc()))
                msg += "\turl: %s\n" %(getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" %(getattr(hte, 'result', 'unknown'))
            self.logger.error(msg)
        except Exception as exc:
            msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (str(exc), configreq)
            self.logger.error(msg + traceback.format_exc())

        return True


    def _getWork(self, limit, getstatus):
        configreq = {'limit': limit, 'workername': self.config.TaskWorker.name, 'getstatus': getstatus}
        pendingwork = []
        try:
            pendingwork = self.server.get(self.restURInoAPI + '/workflowdb', data = configreq)[0]['result']
        except HTTPException as hte:
            self.logger.error("HTTP Error during _getWork: %s" % str(hte))
            self.logger.error("Could not get any work from the server: \n" +
                              "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown')) +
                              "\treason: %s" %(hte.headers.get('X-Error-Detail', 'unknown')))
            if hte.headers.get('X-Error-Http', 'unknown') in ['unknown']:
                self.logger.error("Server could not acquire any work from the server:")
                self.logger.error("%s " %(str(traceback.format_exc())))
                self.logger.error("\turl: %s\n" %(getattr(hte, 'url', 'unknown')))
                self.logger.error("\tresult: %s\n" %(getattr(hte, 'result', 'unknown')))
        except Exception as exc:
            self.logger.error("Server could not process the request: %s" %(str(exc)))
            self.logger.error(traceback.format_exc())
        return pendingwork


    def quit(self, code, traceback_):
        self.logger.info("Received kill request. Waiting for the workers...")
        self.STOP = True


    def updateWork(self, taskname, status):
        configreq = {'workflow': taskname, 'status': status, 'subresource': 'state'}
        retry = True
        while retry:
            try:
                self.server.post(self.restURInoAPI + '/workflowdb', data = urllib.urlencode(configreq))
                retry = False
            except HTTPException as hte:
                #Using a msg variable and only one self.logger.error so that messages do not get shuffled
                msg = "Task Worker could not update a task status (HTTPException): %s\nConfiguration parameters=%s\n" % (str(hte), configreq)
                msg += "\tstatus: %s\n" %(hte.headers.get('X-Error-Http', 'unknown'))
                msg += "\treason: %s\n" %(hte.headers.get('X-Error-Detail', 'unknown'))
                msg += "\turl: %s\n" %(getattr(hte, 'url', 'unknown'))
                msg += "\tresult: %s\n" %(getattr(hte, 'result', 'unknown'))
                msg += "%s \n" %(str(traceback.format_exc()))
                self.logger.error(msg)
                retry = False
                if int(hte.headers.get('X-Error-Http', '0')) == 503:
                    #503 - Database/Service unavailable. Maybe Intervention of CMSWEB ongoing?
                    retry = True
                    time_sleep = 30 + random.randint(10, 30)
                    self.logger.info("Sleeping %s seconds and will try to update again." % str(time_sleep))
                    time.sleep(time_sleep)
            except Exception as exc:
                msg = "Task Worker could not update a task status: %s\nConfiguration parameters=%s\n" % (str(exc), configreq)
                self.logger.error(msg + traceback.format_exc())
                retry = False


    def algorithm(self):
        """I'm the intelligent guy taking care of getting the work
           and distribuiting it to the slave processes."""

        self.logger.debug("Starting")
        while(not self.STOP):
            for status, worktype, failstatus in states():
                limit = self.slaves.queueableTasks()
                if not self._lockWork(limit=limit, getstatus=status, setstatus='HOLDING'):
                    continue
                ## Warning: If we fail to retrieve tasks on HOLDING (e.g. because cmsweb is down)
                ## we may end up executing the wrong worktype later on. A solution would be to
                ## save the previous task state in a new column of the TaskDB.
                pendingwork = self._getWork(limit=limit, getstatus='HOLDING')
                self.logger.info("Retrieved a total of %d %s works" %(len(pendingwork), worktype))
                self.logger.debug("Retrieved the following works: \n%s" %(str(pendingwork)))
                self.slaves.injectWorks([(worktype, task, failstatus, None) for task in pendingwork])
                for task in pendingwork:
                    self.updateWork(task['tm_taskname'], 'QUEUED')

            for action in self.recurringActions:
                if action.isTimeToGo():
                    #Maybe we should use new slaves and not reuse the ones used for the tasks
                    self.logger.debug("Injecting recurring action: \n%s" %(str(action.__module__)))
                    self.slaves.injectWorks([(handleRecurring, {'tm_taskname' : action.__module__}, 'FAILED', action.__module__)])

            self.logger.info('Master Worker status:')
            self.logger.info(' - free slaves: %d' % self.slaves.freeSlaves())
            self.logger.info(' - acquired tasks: %d' % self.slaves.queuedTasks())
            self.logger.info(' - tasks pending in queue: %d' % self.slaves.pendingTasks())

            time.sleep(self.config.TaskWorker.polling)

            finished = self.slaves.checkFinished()

        self.logger.debug("Master Worker Exiting Main Cycle")