Example 1
    def _getARCJob(self, jobID):
        """Create an ARC Job with all the needed / possible parameters defined.
        By the time we come here, the environment variable X509_USER_PROXY should already be set
        """
        j = arc.Job()
        j.JobID = str(jobID)
        j.IDFromEndpoint = os.path.basename(j.JobID)

        if self.endpointType == "Gridftp":
            statURL = "ldap://%s:2135/Mds-Vo-Name=local,o=grid??sub?(nordugrid-job-globalid=%s)" % (
                self.ceHost, jobID)
            j.JobStatusURL = arc.URL(str(statURL))
            j.JobStatusInterfaceName = "org.nordugrid.ldapng"

            mangURL = "gsiftp://%s:2811/jobs/" % (self.ceHost)
            j.JobManagementURL = arc.URL(str(mangURL))
            j.JobManagementInterfaceName = "org.nordugrid.gridftpjob"

            j.ServiceInformationURL = j.JobManagementURL
            j.ServiceInformationInterfaceName = "org.nordugrid.ldapng"
        else:
            commonURL = "https://%s:8443/arex" % self.ceHost
            j.JobStatusURL = arc.URL(str(commonURL))
            j.JobStatusInterfaceName = "org.ogf.glue.emies.activitymanagement"

            j.JobManagementURL = arc.URL(str(commonURL))
            j.JobManagementInterfaceName = "org.ogf.glue.emies.activitymanagement"

            j.ServiceInformationURL = arc.URL(str(commonURL))
            j.ServiceInformationInterfaceName = "org.ogf.glue.emies.resourceinfo"

        j.PrepareHandler(self.usercfg)
        return j
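A minimal usage sketch (hypothetical, not part of the original class: it assumes an instance with ceHost and usercfg already configured) showing how a job built by _getARCJob can then be refreshed and inspected:

    # Hypothetical illustration: refresh and inspect a job returned by _getARCJob.
    job = self._getARCJob(jobID)
    job.Update()                            # query the CE for the current job information
    print(job.State.GetGeneralState())      # e.g. "Running" or "Finished"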
Example 2
import arc
import os
import sys


def example():
    # Creating a UserConfig object with the user's proxy
    # and the path of the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Create a new job object with a given JobID
    job = arc.Job()
    job.JobID = "https://piff.hep.lu.se:443/arex/w7LNDmSkEiun1ZPzno6AuCjpABFKDmABFKDmZ9LKDmUBFKDmXugZwm"
    job.IDFromEndpoint = "w7LNDmSkEiun1ZPzno6AuCjpABFKDmABFKDmZ9LKDmUBFKDmXugZwm"
    job.JobManagementURL = arc.URL("https://piff.hep.lu.se:443/arex")
    job.JobStatusURL = arc.URL("https://piff.hep.lu.se:443/arex")
    job.JobStatusInterfaceName = 'org.ogf.glue.emies.activitymanagement'
    job.JobManagementInterfaceName = 'org.ogf.glue.emies.activitymanagement'

    sys.stdout.write("Job object before update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)

    job_supervisor = arc.JobSupervisor(uc, [job])

    # Update the states of jobs within this JobSupervisor
    job_supervisor.Update()

    # Get our updated job from the JobSupervisor
    jobs = job_supervisor.GetAllJobs()
    if not jobs:
        sys.stdout.write("No jobs found\n")
        return

    job = jobs[0]

    sys.stdout.write("Job object after update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)
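A possible follow-up (a sketch only; the call mirrors the state inspection used in Example 9 below): print the refreshed state once the supervisor has updated the job.

    # Hypothetical follow-up: inspect the refreshed state of the updated job.
    sys.stdout.write("Job state: %s\n" % job.State.GetGeneralState())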
Example 3
import arc
import os
import sys


def example():
    # Creating a UserConfig object with the user's proxy
    # and the path of the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Create a new job object with a given JobID
    job = arc.Job()
    job.JobID = "https://piff.hep.lu.se:443/arex/1QuMDmRwvUfn5h5iWqkutBwoABFKDmABFKDmIpHKDmXBFKDmIuAean"
    job.Flavour = "ARC1"
    job.JobManagementURL = arc.URL("https://piff.hep.lu.se:443/arex")
    job.JobStatusURL = arc.URL("https://piff.hep.lu.se:443/arex")

    sys.stdout.write("Job object before update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)

    job_supervisor = arc.JobSupervisor(uc, [job])

    # Update the states of jobs within this JobSupervisor
    job_supervisor.Update()

    # Get our updated job from the JobSupervisor
    jobs = job_supervisor.GetAllJobs()
    job = jobs[0]

    sys.stdout.write("Job object after update:\n")
    job.SaveToStream(arc.CPyOstream(sys.stdout), True)
Example 4
    def _getARCJob(self, jobID):
        """Create an ARC Job with all the needed / possible parameters defined.
        By the time we come here, the environment variable X509_USER_PROXY should already be set
        """
        j = arc.Job()
        j.JobID = str(jobID)
        j.IDFromEndpoint = os.path.basename(j.JobID)

        # Determine the endpoint type from the job ID scheme (gsiftp -> GridFTP, otherwise AREX/REST)
        endpointType = j.JobID.split(":")[0]
        if endpointType == "gsiftp":
            statURL = "ldap://%s:2135/Mds-Vo-Name=local,o=grid??sub?(nordugrid-job-globalid=%s)" % (self.ceHost, jobID)
            j.JobStatusURL = arc.URL(str(statURL))
            j.JobStatusInterfaceName = "org.nordugrid.ldapng"

            mangURL = os.path.dirname(j.JobID)
            j.JobManagementURL = arc.URL(str(mangURL))
            j.JobManagementInterfaceName = "org.nordugrid.gridftpjob"

            j.ServiceInformationURL = j.JobManagementURL
            j.ServiceInformationInterfaceName = "org.nordugrid.ldapng"
        else:
            commonURL = "/".join(j.JobID.split("/")[0:4])
            j.JobStatusURL = arc.URL(str(commonURL))
            j.JobStatusInterfaceName = "org.nordugrid.arcrest"

            j.JobManagementURL = arc.URL(str(commonURL))
            j.JobManagementInterfaceName = "org.nordugrid.arcrest"

            j.ServiceInformationURL = arc.URL(str(commonURL))
            j.ServiceInformationInterfaceName = "org.nordugrid.arcrest"

        j.PrepareHandler(self.usercfg)
        return j
Example 5
import arc
import os
import sys


def example():
    # Creating a UserConfig object with the user's proxy
    # and the path of the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Create a new job object with a given JobID
    job = arc.Job()
    job.JobID = "https://piff.hep.lu.se:443/arex/hYDLDmyxvUfn5h5iWqkutBwoABFKDmABFKDmIpHKDmYBFKDmtRy9En"
    job.Flavour = "ARC1"
    job.ServiceInformationURL = job.JobStatusURL = job.JobManagementURL = arc.URL("https://piff.hep.lu.se:443/arex")

    sys.stdout.write("Get job information from the computing element...\n")
    # Put the job into a JobSupervisor and update its information
    job_supervisor = arc.JobSupervisor(uc, [job])
    job_supervisor.Update()

    sys.stdout.write("Downloading results...\n")
    # Prepare a list for storing the directories of the downloaded job results (in case there is more than one job)
    downloadeddirectories = arc.StringList()
    # Start retrieving results of all the selected jobs
    #   into the "/tmp" directory (first argument)
    #   using the jobid and not the jobname as the name of the subdirectory (second argument, usejobname = False)
    #   do not overwrite existing directories with the same name (third argument: force = False)
    #   collect the downloaded directories into the variable "downloadeddirectories" (fourth argument)
    success = job_supervisor.Retrieve("/tmp", False, False, downloadeddirectories)
    if not success:
        sys.stdout.write("Downloading results failed.\n")
    for downloadeddirectory in downloadeddirectories:
        sys.stdout.write("Job results were downloaded to %s\n"%str(downloadeddirectory))
        sys.stdout.write("Contents of the directory:\n")
        for filename in os.listdir(downloadeddirectory):
            sys.stdout.write("   %s\n"%filename)
Example 6
def test():
    from pandaharvester.harvestercore.work_spec import WorkSpec
    wspec = WorkSpec()
    jobid = "gsiftp://pcoslo5.cern.ch:2811/jobs/XkNNDmultdtn1ZPzno6AuCjpABFKDmABFKDmwqyLDmABFKDm8dOcOn"
    wspec.batchID = jobid
    workAttributes = {"arcjob": {}}
    workAttributes["arcjob"]["JobID"] = wspec.batchID
    workAttributes["arcjob"][
        "JobStatusURL"] = "ldap://{0}:2135/mds-vo-name=local,o=grid??sub?(nordugrid-job-globalid={1})".format(
            urlparse(jobid).netloc, jobid)
    workAttributes["arcjob"]["JobStatusInterfaceName"] = "org.nordugrid.ldapng"
    jobmanagementurl = arc.URL(wspec.batchID)
    jobmanagementurl.ChangePath("/jobs")
    workAttributes["arcjob"]["JobManagementURL"] = jobmanagementurl.str()
    workAttributes["arcjob"][
        "JobManagementInterfaceName"] = "org.nordugrid.gridftpjob"
    workAttributes["proxyrole"] = 'production'

    wspec.workAttributes = workAttributes
    wspec.accessPoint = '/tmp'
    wspec.mapType = WorkSpec.MT_OneToOne
    wspec.pandaid_list = [1234]
    print(wspec.workAttributes)

    messenger = ARCMessenger()
    print(messenger.events_requested(wspec))
    print(messenger.feed_events(wspec, {'event': 1234}))
    print(messenger.events_to_update(wspec))
    messenger.acknowledge_events_files(wspec)
Example 7
    def save_job_outputs(self, job_id):
        """
        Retrieve output files from a job and save them to a temp directory. The file/directory
        specified in `OUTPUT_FILE` will be downloaded, and ``stdout`` and ``stderr`` outputs are
        saved as ``stdout.txt`` and ``stderr.txt`` respectively.

        :param job_id:            ID of the job as returned by `submit_job`
        :raises JobNotFoundError: if no job with the given ID could be found

        :return: Path to the directory the output files were saved in, or ``None`` if no files
                 were saved
        """
        job = self.get_job(job_id)
        user_config = self.get_user_config()
        temp_dir = tempfile.mkdtemp()
        # Last argument is 'force' - whether to continue if destination directory already exists
        success = job.Retrieve(user_config,
                               arc.URL("file://{}".format(temp_dir)), True)

        # Remove temp dir and fail if no files were downloaded
        if not os.listdir(temp_dir):
            success = False
            os.rmdir(temp_dir)

        return temp_dir if success else None
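A usage sketch (hypothetical: client stands for an instance of the class this method belongs to, and job_id is a value returned by its submit_job) that reads back the saved stdout:

    # Hypothetical illustration: download the job outputs and read the saved stdout.
    output_dir = client.save_job_outputs(job_id)
    if output_dir is not None:
        with open(os.path.join(output_dir, "stdout.txt")) as stdout_file:
            print(stdout_file.read())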
Example 8
 def __getARCJob( self, jobID ):
   """ Create an ARC Job with all the needed / possible parameters defined.
       By the time we come here, the environment variable X509_USER_PROXY should already be set
   """
   j = arc.Job()
   j.JobID = jobID
   statURL = "ldap://%s:2135/Mds-Vo-Name=local,o=grid??sub?(nordugrid-job-globalid=%s)" % ( self.ceHost, jobID )
   j.JobStatusURL = arc.URL( statURL )
   j.JobStatusInterfaceName = "org.nordugrid.ldapng"
   mangURL = "gsiftp://%s:2811/jobs/" % ( self.ceHost )
   j.JobManagementURL = arc.URL( mangURL )
   j.JobManagementInterfaceName = "org.nordugrid.gridftpjob"
   j.ServiceInformationURL = j.JobManagementURL
   j.ServiceInformationInterfaceName = "org.nordugrid.ldapng"
   j.PrepareHandler( self.usercfg )
   return j
Example 9
    def getJobOutput(self, jobID, localDir=None):
        """ Get the specified job standard output and error files. If the localDir is provided,
        the output is returned as file in this directory. Otherwise, the output is returned 
        as strings. 
    """
        result = self._prepareProxy()
        self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])
        if not result['OK']:
            gLogger.error('ARCComputingElement: failed to set up proxy',
                          result['Message'])
            return result

        if jobID.find(':::') != -1:
            pilotRef, stamp = jobID.split(':::')
        else:
            pilotRef = jobID
            stamp = ''
        if not stamp:
            return S_ERROR('Pilot stamp not defined for %s' % pilotRef)

        job = self.__getARCJob(pilotRef)

        arcID = os.path.basename(pilotRef)
        gLogger.debug("Retrieving pilot logs for %s" % pilotRef)
        if "WorkingDirectory" in self.ceParameters:
            workingDirectory = os.path.join(
                self.ceParameters['WorkingDirectory'], arcID)
        else:
            workingDirectory = arcID
        outFileName = os.path.join(workingDirectory, '%s.out' % stamp)
        errFileName = os.path.join(workingDirectory, '%s.err' % stamp)
        gLogger.debug("Working directory for pilot output %s" %
                      workingDirectory)

        isItOkay = job.Retrieve(self.usercfg, arc.URL(workingDirectory), False)
        if (isItOkay):
            outFile = open(outFileName, 'r')
            output = outFile.read()
            outFile.close()
            os.unlink(outFileName)
            errFile = open(errFileName, 'r')
            error = errFile.read()
            errFile.close()
            os.unlink(errFileName)
            gLogger.debug("Pilot output = %s" % output)
            gLogger.debug("Pilot error = %s" % error)
        else:
            job.Update()
            arcState = job.State.GetGeneralState()
            if (arcState != "Undefined"):
                return S_ERROR(
                    'Failed to retrieve output for %s as job is not finished (maybe not started yet)'
                    % jobID)
            gLogger.debug(
                "Could not retrieve pilot output for %s - either permission / proxy error or could not connect to CE"
                % pilotRef)
            return S_ERROR('Failed to retrieve output for %s' % jobID)

        return S_OK((output, error))
Example 10
 def create_test_job(self,
                     job_id = "http://test.nordugrid.org/testid",
                     cluster = "http://test.nordugrid.org",
                     state = arc.JobState.RUNNING,
                     state_text = None,
                     job_description = "non-empty"):
     job = arc.Job()
     job.JobID = job_id
     job.ServiceInformationInterfaceName = job.JobStatusInterfaceName = job.JobManagementInterfaceName = "org.nordugrid.test"
     job.ServiceInformationURL = job.JobStatusURL = job.JobManagementURL = arc.URL(cluster)
     if state_text is None:
         job.State = arc.JobStateTEST(state)
     else:
         job.State = arc.JobStateTEST(state, state_text)
     job.JobDescriptionDocument = job_description
     return job
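For example (a sketch, assuming the test class this helper belongs to), a test job with an explicit state label can be created like this:

    # Hypothetical illustration: build a test job with a custom state text.
    job = self.create_test_job(job_id="http://test.nordugrid.org/testid/1",
                               state=arc.JobState.RUNNING,
                               state_text="Executing")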
Example 11
    def extractOutputFilesFromMetadata(self, arcjobid):
        aj = self.dbarc.getArcJobInfo(arcjobid, columns=["JobID", "appjobid"])
        if not aj or 'JobID' not in aj or not aj['JobID']:
            self.log.error("failed to find arcjobid %s in database" % arcjobid)
            return {}

        jobid = aj['JobID']
        sessionid = jobid[jobid.rfind('/') + 1:]
        try:
            jobinfo = aCTPandaJob(filename=os.path.join(
                self.tmpdir, sessionid, 'heartbeat.json'))
            metadata = getattr(jobinfo,
                               'xml')  # travis doesn't like jobinfo.xml
        except Exception as x:
            self.log.error("%s: failed to extract metadata for arcjob %s: %s" %
                           (aj['appjobid'], sessionid, x))
            return {}

        try:
            outputfiles = json.loads(metadata)
        except Exception as e:
            self.log.error(
                "%s: failed to load output file info for arcjob %s: %s" %
                (aj['appjobid'], sessionid, str(e)))
            return {}

        surls = {}
        for attrs in outputfiles.values():
            try:
                size = attrs['fsize']
                adler32 = attrs['adler32']
                surl = attrs['surl']
                se = arc.URL(str(surl)).Host()
            except Exception as x:
                self.log.error('%s: %s' % (aj['appjobid'], x))
            else:
                checksum = "adler32:" + adler32
                if se not in surls:
                    surls[se] = []
                surls[se] += [{
                    "surl": surl,
                    "fsize": size,
                    "checksum": checksum,
                    "arcjobid": arcjobid
                }]

        return surls
Example 12
    def rename(self, pfn, new_pfn):
        """ Allows to rename a file stored inside the connected RSE.

            :param pfn      Current physical file name
            :param new_pfn  New physical file name

            :raises DestinationNotAccessible, ServiceUnavailable, SourceNotFound
        """
        dp = DataPoint(str(pfn), self.cfg)
        if dp.h is None:
            raise ServiceUnavailable("Can't handle pfn %s" % pfn)

        url = arc.URL(str(new_pfn))
        if not url:
            raise ServiceUnavailable("Can't handle new pfn %s" % new_pfn)

        status = dp.h.Rename(url)
        if not status:
            if status.GetErrno() == errno.ENOENT:
                raise SourceNotFound()
            raise ServiceUnavailable(str(status))
Esempio n. 13
0
def test(jobid):
    '''Test checking status'''
    from pandaharvester.harvestercore.work_spec import WorkSpec
    wspec = WorkSpec()
    wspec.batchID = jobid  #"gsiftp://pikolit.ijs.si:2811/jobs/HtgKDmtCe7qn4J8tmqCBXHLnABFKDmABFKDmBcGKDmABFKDm4NCTCn"
    workAttributes = {"arcjob": {}}
    workAttributes["arcjob"]["JobID"] = wspec.batchID
    workAttributes["arcjob"][
        "JobStatusURL"] = "ldap://{0}:2135/mds-vo-name=local,o=grid??sub?(nordugrid-job-globalid={1})".format(
            urlparse.urlparse(jobid).netloc, jobid)
    workAttributes["arcjob"]["JobStatusInterfaceName"] = "org.nordugrid.ldapng"
    jobmanagementurl = arc.URL(wspec.batchID)
    jobmanagementurl.ChangePath("/jobs")
    workAttributes["arcjob"]["JobManagementURL"] = jobmanagementurl.str()
    workAttributes["arcjob"][
        "JobManagementInterfaceName"] = "org.nordugrid.gridftpjob"

    wspec.workAttributes = workAttributes
    print(wspec.workAttributes)

    monitor = ARCMonitor()
    print(monitor.check_workers([wspec]))
Example 14
    def copyFinishedFiles(self, arcjobid, extractmetadata):
        """
        - if extractmetadata: (normal arc jobs, not true pilot jobs) 
           - extract panda_node_struct.pickle from jobSmallFiles.tgz and store it under tmp/pickle
           - extract metadata-surl.xml and update pickle. store xml under tmp/xml
        - copy .job.log file to jobs/date/cluster/jobid
        - copy gmlog dir to jobs/date/cluster/jobid
        """

        columns = [
            'JobID', 'appjobid', 'cluster', 'UsedTotalWallTime', 'EndTime',
            'ExecutionNode', 'stdout'
        ]
        aj = self.dbarc.getArcJobInfo(arcjobid, columns=columns)
        if 'JobID' not in aj or not aj['JobID']:
            self.log.error('No JobID in arcjob %s: %s' %
                           (str(arcjobid), str(aj)))
            return False
        jobid = aj['JobID']
        sessionid = jobid[jobid.rfind('/') + 1:]
        date = time.strftime('%Y%m%d')
        cluster = arc.URL(str(jobid)).Host()
        if extractmetadata:
            try:
                pandapickle = self._extractFromSmallFiles(
                    aj, "panda_node_struct.pickle")
            except Exception as x:
                self.log.error(
                    "%s: failed to extract pickle for arcjob %s: %s" %
                    (aj['appjobid'], sessionid, x))
                pandapickle = None
            try:
                metadata = self._extractFromSmallFiles(aj, "metadata-surl.xml")
            except Exception as x:
                self.log.error(
                    "%s: failed to extract metadata-surl.xml for arcjob %s: %s"
                    % (aj['appjobid'], sessionid, x))
                metadata = None
Example 15
def test(jobid):
    '''Kill a job'''
    from pandaharvester.harvestercore.work_spec import WorkSpec
    import json
    wspec = WorkSpec()
    wspec.batchID = jobid
    workAttributes = {"arcjob": {}}
    workAttributes["arcjob"]["JobID"] = wspec.batchID
    workAttributes["arcjob"][
        "JobStatusURL"] = "ldap://{0}:2135/mds-vo-name=local,o=grid??sub?(nordugrid-job-globalid={1})".format(
            urlparse.urlparse(jobid).netloc, wspec.batchID)
    workAttributes["arcjob"]["JobStatusInterfaceName"] = "org.nordugrid.ldapng"
    jobmanagementurl = arc.URL(wspec.batchID)
    jobmanagementurl.ChangePath("/jobs")
    workAttributes["arcjob"]["JobManagementURL"] = jobmanagementurl.str()
    workAttributes["arcjob"][
        "JobManagementInterfaceName"] = "org.nordugrid.gridftpjob"

    wspec.workAttributes = workAttributes
    print(wspec.workAttributes)

    sweeper = ARCSweeper()
    print(sweeper.kill_worker(wspec))
Example 16
 def infinite(self, url):
     logger.msg(arc.INFO, "EchoService (python) thread test starting")
     i = 0
     while True:
         try:
             i += 1
             cfg = arc.MCCConfig()
             s = arc.ClientSOAP(cfg, arc.URL(url))
             ns = arc.NS('echo', echo_ns)
             outpayload = arc.PayloadSOAP(ns)
             outpayload.NewChild('echo:echo').NewChild('echo:say').Set(
                 'hi!')
             resp, status = s.process(outpayload)
             logger.msg(
                 arc.INFO,
                 "EchoService (python) thread test, iteration %(iteration)s %(status)s"
                 % {
                     'iteration': i,
                     'status': status
                 })
             time.sleep(3)
         except Exception as e:
             import traceback
             logger.msg(arc.DEBUG, traceback.format_exc())
Example 17
    def submit(self):
        """
        Main function to submit jobs.
        """

        global queuelist

        # check for stopsubmission flag
        if self.conf.get(['downtime', 'stopsubmission']) == "true":
            self.log.info('Submission suspended due to downtime')
            return

        # check for any site-specific limits or status
        clusterstatus = self.conf.getCond(["sites", "site"],
                                          f"endpoint={self.cluster}",
                                          ["status"]) or 'online'
        if clusterstatus == 'offline':
            self.log.info('Site status is offline')
            return

        clustermaxjobs = int(
            self.conf.getCond(["sites", "site"], f"endpoint={self.cluster}",
                              ["maxjobs"]) or 999999)
        nsubmitted = self.db.getNArcJobs(f"cluster='{self.cluster}'")
        if nsubmitted >= clustermaxjobs:
            self.log.info(
                f'{nsubmitted} submitted jobs is greater than or equal to max jobs {clustermaxjobs}'
            )
            return

        # Get cluster host and queue: cluster/queue
        clusterhost = clusterqueue = None
        if self.cluster:
            cluster = self.cluster
            if cluster.find('://') == -1:
                cluster = 'gsiftp://' + cluster
            clusterurl = arc.URL(cluster)
            clusterhost = clusterurl.Host()
            clusterqueue = clusterurl.Path()[1:]  # strip off leading slash

        # Apply fair-share
        if self.cluster:
            fairshares = self.db.getArcJobsInfo(
                "arcstate='tosubmit' and clusterlist like '%" + self.cluster +
                "%'", ['fairshare', 'proxyid'])
        else:
            fairshares = self.db.getArcJobsInfo(
                "arcstate='tosubmit' and clusterlist=''",
                ['fairshare', 'proxyid'])

        if not fairshares:
            self.log.info('Nothing to submit')
            return

        # split by proxy for GU queues
        fairshares = list(
            set([(p['fairshare'], p['proxyid']) for p in fairshares]))
        # For proxy bug - see below
        shuffle(fairshares)

        for fairshare, proxyid in fairshares:

            # apply maxjobs limit (check above should make sure greater than zero)
            # Note: relies on exit after first loop
            limit = min(clustermaxjobs - nsubmitted, 10)
            try:
                # catch any exceptions here to avoid leaving lock
                if self.cluster:
                    # Lock row for update in case multiple clusters are specified
                    #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare),
                    jobs = self.db.getArcJobsInfo(
                        "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' and proxyid='{2}' limit {3}"
                        .format(self.cluster, fairshare, proxyid, limit),
                        columns=[
                            "id", "jobdesc", "appjobid", "priority", "proxyid",
                            "clusterlist"
                        ],
                        lock=True)
                    if jobs:
                        self.log.debug("started lock for writing %d jobs" %
                                       len(jobs))
                else:
                    jobs = self.db.getArcJobsInfo(
                        "arcstate='tosubmit' and clusterlist='' and fairshare='{0} and proxyid={1}' limit {2}"
                        .format(fairshare, proxyid, limit),
                        columns=[
                            "id", "jobdesc", "appjobid", "priority", "proxyid",
                            "clusterlist"
                        ])
                # mark submitting in db
                jobs_taken = []
                for j in jobs:
                    jd = {
                        'cluster': self.cluster,
                        'arcstate': 'submitting',
                        'tarcstate': self.db.getTimeStamp()
                    }
                    self.db.updateArcJobLazy(j['id'], jd)
                    jobs_taken.append(j)
                jobs = jobs_taken

            finally:
                if self.cluster:
                    try:
                        self.db.Commit(lock=True)
                        self.log.debug("ended lock")
                    except:
                        self.log.warning("Failed to release DB lock")
                else:
                    self.db.Commit()

            if len(jobs) == 0:
                #self.log.debug("No jobs to submit")
                continue
            self.log.info(
                "Submitting %d jobs for fairshare %s and proxyid %d" %
                (len(jobs), fairshare, proxyid))

            # max waiting priority
            try:
                maxpriowaiting = max(jobs,
                                     key=lambda x: x['priority'])['priority']
            except:
                maxpriowaiting = 0
            self.log.info("Maximum priority of waiting jobs: %d" %
                          maxpriowaiting)

            # Query infosys - either local or index
            if self.cluster:
                if self.cluster.find('://') != -1:
                    aris = arc.URL(self.cluster)
                else:
                    aris = arc.URL('gsiftp://%s' % self.cluster)
                if aris.Protocol() == 'https':
                    aris.ChangePath('/arex')
                    infoendpoints = [
                        arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                     'org.ogf.glue.emies.resourceinfo')
                    ]
                elif aris.Protocol() == 'local':
                    infoendpoints = [
                        arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                     'org.nordugrid.local')
                    ]
                else:
                    aris = 'ldap://' + aris.Host(
                    ) + '/mds-vo-name=local,o=grid'
                    infoendpoints = [
                        arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO,
                                     'org.nordugrid.ldapng')
                    ]
            else:
                giises = self.conf.getList(['atlasgiis', 'item'])
                infoendpoints = []
                for g in giises:
                    # Specify explicitly EGIIS
                    infoendpoints.append(
                        arc.Endpoint(str(g), arc.Endpoint.REGISTRY,
                                     "org.nordugrid.ldapegiis"))

            # Set UserConfig credential for querying infosys
            proxystring = str(self.db.getProxy(proxyid))
            self.uc.CredentialString(proxystring)
            global usercred
            usercred = self.uc
            # retriever contains a list of CE endpoints
            retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints)
            retriever.wait()
            # targets is the list of queues
            # parse target.ComputingService.ID for the CE hostname
            # target.ComputingShare.Name is the queue name
            targets = retriever.GetExecutionTargets()

            # Filter only sites for this process
            queuelist = []
            for target in targets:
                if not target.ComputingService.ID:
                    self.log.info(
                        "Target %s does not have ComputingService ID defined, skipping"
                        % target.ComputingService.Name)
                    continue
                # If EMI-ES infoendpoint, force EMI-ES submission
                if infoendpoints[
                        0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                    self.log.debug(
                        "Rejecting target interface %s because not EMI-ES" %
                        target.ComputingEndpoint.InterfaceName)
                    continue
                # Check for matching host and queue
                targethost = re.sub(
                    ':arex$', '',
                    re.sub('urn:ogf:ComputingService:', '',
                           target.ComputingService.ID))
                targetqueue = target.ComputingShare.Name
                if clusterhost and targethost != clusterhost:
                    self.log.debug(
                        'Rejecting target host %s as it does not match %s' %
                        (targethost, clusterhost))
                    continue
                if clusterqueue and targetqueue != clusterqueue:
                    self.log.debug(
                        'Rejecting target queue %s as it does not match %s' %
                        (targetqueue, clusterqueue))
                    continue
                if targetqueue in self.conf.getList(['queuesreject', 'item']):
                    self.log.debug(
                        'Rejecting target queue %s in queuesreject list' %
                        targetqueue)
                    continue
                elif targethost in self.conf.getList(
                    ['clustersreject', 'item']):
                    self.log.debug(
                        'Rejecting target host %s in clustersreject list' %
                        targethost)
                    continue
                else:
                    # tmp hack
                    target.ComputingShare.LocalWaitingJobs = 0
                    target.ComputingShare.PreLRMSWaitingJobs = 0
                    target.ExecutionEnvironment.CPUClockSpeed = 2000
                    qjobs = self.db.getArcJobsInfo(
                        "cluster='" + str(self.cluster) +
                        "' and  arcstate='submitted' and fairshare='%s'" %
                        fairshare, ['id', 'priority'])
                    rjobs = self.db.getArcJobsInfo(
                        "cluster='" + str(self.cluster) +
                        "' and  arcstate='running' and fairshare='%s'" %
                        fairshare, ['id'])

                    # max queued priority
                    try:
                        maxprioqueued = max(
                            qjobs, key=lambda x: x['priority'])['priority']
                    except:
                        maxprioqueued = 0
                    self.log.info("Max priority queued: %d" % maxprioqueued)

                    # Limit number of submitted jobs using configuration or default (0.15 + 100/num of shares)
                    # Note: assumes only a few shares are used
                    qfraction = float(self.conf.get([
                        'jobs', 'queuefraction'
                    ])) if self.conf.get(['jobs', 'queuefraction']) else 0.15
                    qoffset = int(self.conf.get([
                        'jobs', 'queueoffset'
                    ])) if self.conf.get(['jobs', 'queueoffset']) else 100
                    jlimit = len(rjobs) * qfraction + qoffset / len(fairshares)
                    self.log.debug("running %d, queued %d, queue limit %d" %
                                   (len(rjobs), len(qjobs), jlimit))
                    if str(self.cluster).find('arc-boinc-0') != -1:
                        jlimit = len(rjobs) * 0.15 + 400
                    if str(self.cluster).find('XXXpikolit') != -1:
                        jlimit = len(rjobs) * 0.15 + 100
                    if str(self.cluster).find('arc05.lcg') != -1:
                        jlimit = len(rjobs) * 0.15 + 400
                    target.ComputingShare.PreLRMSWaitingJobs = len(qjobs)
                    if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued)
                                               and (maxpriowaiting > 10)):
                        if maxpriowaiting > maxprioqueued:
                            self.log.info(
                                "Overriding limit, maxpriowaiting: %d > maxprioqueued: %d"
                                % (maxpriowaiting, maxprioqueued))
                        queuelist.append(target)
                        self.log.debug("Adding target %s:%s" %
                                       (targethost, targetqueue))
                    else:
                        self.log.info(
                            "%s/%s already at limit of submitted jobs for fairshare %s"
                            % (targethost, targetqueue, fairshare))

            # check if any queues are available, if not leave and try again next time
            if not queuelist:
                self.log.info("No free queues available")
                self.db.Commit()
                continue

            self.log.info("start submitting")

            # Just run one thread for each job in sequence. Strange things happen
            # when trying to create a new UserConfig object for each thread.
            tasks = []
            for j in jobs:
                self.log.debug("%s: preparing submission" % j['appjobid'])
                jobdescstr = str(
                    self.db.getArcJobDescription(str(j['jobdesc'])))
                jobdescs = arc.JobDescriptionList()
                if not jobdescstr or not arc.JobDescription_Parse(
                        jobdescstr, jobdescs):
                    self.log.error("%s: Failed to prepare job description" %
                                   j['appjobid'])
                    continue
                tasks.append((j['id'], j['appjobid'], jobdescstr, proxystring,
                              int(self.conf.get(['atlasgiis', 'timeout']))))

            npools = 1
            if any(s in self.cluster
                   for s in self.conf.getList(['parallelsubmit', 'item'])):
                npools = int(self.conf.get(['parallelsubmit', 'npools']))
            self.log.debug("Starting submitters: %s" % npools)

            pool = multiprocessing.Pool(npools)
            #results = []
            #for task in tasks:
            #    result = pool.apply_async(Submit,(task))
            #    results.append(result)
            # Submit in workers
            results = [pool.apply_async(Submit, t) for t in tasks]

            # timeout per submission
            timeout = 60
            stopflag = False
            for result, task in zip(results, tasks):
                try:
                    jdb = result.get(timeout)
                    jconv = JobConv()
                    job = jconv.db2job(jdb)
                except multiprocessing.TimeoutError:
                    self.log.error(
                        "%s: submission timeout: exit and try again" % task[1])
                    # abort submission if Submit process is stuck
                    #pool.terminate()
                    KillPool(pool)
                    pool.join()
                    stopflag = True
                    # reduce timeout to finish quickly
                    timeout = 0.1
                    continue
                if job is None:
                    self.log.error("%s: no job defined for %d" %
                                   (task[1], task[0]))
                    continue
                jd = {}
                jd['arcstate'] = 'submitted'
                # initial offset to 1 minute to force first status check
                jd['tarcstate'] = self.db.getTimeStamp(
                    time.time() -
                    int(self.conf.get(['jobs', 'checkinterval'])) + 120)
                jd['tstate'] = self.db.getTimeStamp()
                # extract hostname of cluster (depends on JobID being a URL)
                self.log.info("%s: job id %s" % (task[1], job.JobID))
                jd['cluster'] = self.cluster
                self.db.updateArcJobLazy(task[0], jd, job)
            if not stopflag:
                pool.terminate()
                pool.join()
            else:
                # stop submitting, gsiftp connection problem likely
                raise ExceptInterrupt(15)

            self.log.info("threads finished")
            # commit transaction to release row locks
            self.db.Commit()

            # still proxy bug - exit if there are multiple proxies
            if len(self.db.getProxiesInfo('TRUE', ['id'])) > 1:
                raise ExceptInterrupt(15)

        self.log.info("end submitting")

        return
Example 18
    def updatePandaHeartbeat(self,pstatus):
        """
        Heartbeat status updates.
        """
        nthreads=int(self.conf.get(["panda","threads"]))
        columns = ['pandaid', 'siteName', 'startTime', 'computingElement', 'node', 'corecount', 'eventranges']
        jobs=self.dbpanda.getJobs("pandastatus='"+pstatus+"' and sendhb=1 and ("+self.dbpanda.timeStampLessThan("theartbeat", self.conf.get(['panda','heartbeattime']))+" or modified > theartbeat) limit 1000", columns)
        if not jobs:
            return

        self.log.info("Update heartbeat for %d jobs in state %s (%s)" % (len(jobs), pstatus, ','.join([str(j['pandaid']) for j in jobs])))

        changed_pstatus = False
        if pstatus == 'sent':
            pstatus = 'starting'
            changed_pstatus = True

        tlist=[]
        for j in jobs:
            # Don't send transferring heartbeat for ES jobs, they must be in running while events are updated
            if pstatus == 'transferring' and j['eventranges']:
                pstatus = 'running'
            jd = {}
            if pstatus != 'starting':
                jd['startTime'] = j['startTime']
            if j['computingElement']:
                if j['computingElement'].find('://') != -1: # this check is only needed during the transition period
                    jd['computingElement'] = arc.URL(str(j['computingElement'])).Host()
                else:
                    jd['computingElement'] = j['computingElement']
            jd['node'] = j['node']
            jd['siteName'] = j['siteName']
            # For starting truepilot jobs send pilotID with expected log
            # location so logs are available in case of lost heartbeat
            if pstatus == 'starting' and not changed_pstatus and self.sites[j['siteName']]['truepilot']:
                date = time.strftime('%Y-%m-%d', time.gmtime())
                logurl = '/'.join([self.conf.get(["joblog","urlprefix"]), date, j['siteName'], '%s.out' % j['pandaid']])
                jd['pilotID'] = '%s|Unknown|Unknown|Unknown|Unknown' % logurl
            try:
                jd['jobMetrics']="coreCount=%s" % (j['corecount'] if j['corecount'] > 0 else self.sites[j['siteName']]['corecount'])
            except:
                pass
            t=PandaThr(self.getPanda(j['siteName']).updateStatus,j['pandaid'],pstatus,jd)
            tlist.append(t)
        aCTUtils.RunThreadsSplit(tlist,nthreads)

        for t in tlist:
            if t.result is None or 'StatusCode' not in t.result:
                # Strange response from panda, try later
                continue
            if t.result['StatusCode'] and t.result['StatusCode'][0] == '60':
                self.log.error('Failed to contact Panda, proxy may have expired')
                continue
            #self.log.debug('%s: %s' % (t.id, t.result))
            if 'command' in t.result  and t.result['command'][0] != "NULL":
                self.log.info("%s: response: %s" % (t.id,t.result) )
            jd={}
            if changed_pstatus:
                jd['pandastatus']=pstatus
            # Make sure heartbeat is ahead of modified time so it is not picked up again
            if self.sites[t.args['siteName']]['truepilot'] and pstatus == 'starting':
                # Set theartbeat 1h in the future to allow job to start
                # running and avoid race conditions with heartbeats
                # Now heartbeat timeout is 2h so we remove the offset
                #jd['theartbeat'] = self.dbpanda.getTimeStamp(time.time()+3600)
                jd['theartbeat'] = self.dbpanda.getTimeStamp(time.time()+1)
            else:
                jd['theartbeat'] = self.dbpanda.getTimeStamp(time.time()+1)
            # If panda tells us to kill the job, set actpandastatus to tobekilled
            # and remove from heartbeats
            if 'command' in t.result and ( ("tobekilled" in t.result['command'][0]) or ("badattemptnr" in t.result['command'][0]) ):
                self.log.info('%s: cancelled by panda' % t.id)
                jd['actpandastatus']="tobekilled"
                jd['pandastatus']=None
            self.dbpanda.updateJob(t.id,jd)

        self.log.info("Threads finished")
Example 19
    def _arc_submit(self, xrsl, arcces, userconfig, log):
        '''Check the available CEs and submit'''

        queuelist = []

        for arcce in arcces:
            (ce_endpoint, ce_queue) = arcce
            aris = arc.URL(str(ce_endpoint))
            ce_host = aris.Host()
            if aris.Protocol() == 'https':
                aris.ChangePath('/arex')
                infoendpoints = [
                    arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                 'org.ogf.glue.emies.resourceinfo')
                ]
            else:
                aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
                infoendpoints = [
                    arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO,
                                 'org.nordugrid.ldapng')
                ]

            # retriever contains a list of CE endpoints
            retriever = arc.ComputingServiceRetriever(userconfig,
                                                      infoendpoints)
            retriever.wait()
            # targets is the list of queues
            # parse target.ComputingService.ID for the CE hostname
            # target.ComputingShare.Name is the queue name
            targets = retriever.GetExecutionTargets()

            # Filter only sites for this process
            for target in targets:
                if not target.ComputingService.ID:
                    log.info(
                        "Target {0} does not have ComputingService ID defined, skipping"
                        .format(target.ComputingService.Name))
                    continue
                # If EMI-ES infoendpoint, force EMI-ES submission
                if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' \
                  and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                    log.debug(
                        "Rejecting target interface {0} because not EMI-ES".
                        format(target.ComputingEndpoint.InterfaceName))
                    continue
                # Check for matching host and queue
                targethost = re.sub(
                    ':arex$', '',
                    re.sub('urn:ogf:ComputingService:', '',
                           target.ComputingService.ID))
                targetqueue = target.ComputingShare.Name
                if targethost != ce_host:
                    log.debug(
                        'Rejecting target host {0} as it does not match {1}'.
                        format(targethost, ce_host))
                    continue
                if targetqueue != ce_queue:
                    log.debug(
                        'Rejecting target queue {0} as it does not match {1}'.
                        format(targetqueue, ce_queue))
                    continue

                queuelist.append(target)
                log.debug("Adding target {0}:{1}".format(
                    targethost, targetqueue))

        # check if any queues are available, if not leave and try again next time
        if not queuelist:
            raise Exception("No free queues available")

        log.debug("preparing submission")
        jobdescs = arc.JobDescriptionList()
        if not arc.JobDescription_Parse(str(xrsl), jobdescs):
            raise Exception("Failed to prepare job description")

        # Run the submission in a separate thread
        thr = SubmitThr(queuelist, jobdescs, userconfig)
        return self._run_submit(thr)
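A hedged usage sketch (the CE endpoint, queue name, and xRSL string below are placeholders, not values from the original code) of how _arc_submit might be called from the surrounding plugin:

    # Hypothetical illustration: submit a trivial xRSL job to a single CE/queue pair.
    xrsl = '&(executable="/bin/true")(jobname="probe")'
    arcces = [("gsiftp://ce.example.org:2811/jobs", "default")]
    result = self._arc_submit(xrsl, arcces, userconfig, log)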
Example 20
    def submit(self):
        """
        Main function to submit jobs.
        """

        global queuelist

        # check for stopsubmission flag
        if self.conf.get(['downtime', 'stopsubmission']) == "true":
            self.log.info('Submission suspended due to downtime')
            return 0

        # Get cluster host and queue: cluster/queue
        clusterhost = clusterqueue = None
        if self.cluster:
            cluster = self.cluster
            if cluster.find('://') == -1:
                cluster = 'gsiftp://' + cluster
            clusterurl = arc.URL(cluster)
            clusterhost = clusterurl.Host()
            clusterqueue = clusterurl.Path()[1:]  # strip off leading slash

        # Apply fair-share
        if self.cluster:
            fairshares = self.db.getArcJobsInfo(
                "arcstate='tosubmit' and clusterlist like '%" + self.cluster +
                "%'", ['fairshare'])
        else:
            fairshares = self.db.getArcJobsInfo(
                "arcstate='tosubmit' and clusterlist=''", ['fairshare'])

        if not fairshares:
            self.log.info('Nothing to submit')
            return 0

        fairshares = list(set([p['fairshare'] for p in fairshares]))
        # For EMI-ES proxy bug - see below
        shuffle(fairshares)
        count = 0

        for fairshare in fairshares:

            try:
                # catch any exceptions here to avoid leaving lock
                if self.cluster:
                    # Lock row for update in case multiple clusters are specified
                    #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare),
                    jobs = self.db.getArcJobsInfo(
                        "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' limit 10"
                        .format(self.cluster, fairshare),
                        columns=[
                            "id", "jobdesc", "appjobid", "priority", "proxyid"
                        ],
                        lock=True)
                    if jobs:
                        self.log.debug("started lock for writing %d jobs" %
                                       len(jobs))
                else:
                    jobs = self.db.getArcJobsInfo(
                        "arcstate='tosubmit' and clusterlist='' and fairshare='{0}' limit 10"
                        .format(fairshare),
                        columns=["id", "jobdesc", "appjobid", "priority"])
                # mark submitting in db
                jobs_taken = []
                for j in jobs:
                    jd = {
                        'cluster': self.cluster,
                        'arcstate': 'submitting',
                        'tarcstate': self.db.getTimeStamp()
                    }
                    self.db.updateArcJobLazy(j['id'], jd)
                    jobs_taken.append(j)
                jobs = jobs_taken

            finally:
                if self.cluster:
                    try:
                        self.db.Commit(lock=True)
                        self.log.debug("ended lock")
                    except:
                        self.log.warning("Failed to release DB lock")
                else:
                    self.db.Commit()

            if len(jobs) == 0:
                #self.log.debug("No jobs to submit")
                continue
            self.log.info("Submitting %d jobs for fairshare %s" %
                          (len(jobs), fairshare))

            # max waiting priority
            try:
                maxpriowaiting = max(jobs,
                                     key=lambda x: x['priority'])['priority']
            except:
                maxpriowaiting = 0
            self.log.info("Maximum priority of waiting jobs: %d" %
                          maxpriowaiting)

            # Query infosys - either local or index
            if self.cluster:
                if self.cluster.find('://') != -1:
                    aris = arc.URL(self.cluster)
                else:
                    aris = arc.URL('gsiftp://%s' % self.cluster)
                if aris.Protocol() == 'https':
                    aris.ChangePath('/arex')
                    infoendpoints = [
                        arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                     'org.ogf.glue.emies.resourceinfo')
                    ]
                elif aris.Protocol() == 'local':
                    infoendpoints = [
                        arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                     'org.nordugrid.local')
                    ]
                else:
                    aris = 'ldap://' + aris.Host(
                    ) + '/mds-vo-name=local,o=grid'
                    infoendpoints = [
                        arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO,
                                     'org.nordugrid.ldapng')
                    ]
            else:
                giises = self.conf.getList(['atlasgiis', 'item'])
                infoendpoints = []
                for g in giises:
                    # Specify explicitly EGIIS
                    infoendpoints.append(
                        arc.Endpoint(str(g), arc.Endpoint.REGISTRY,
                                     "org.nordugrid.ldapegiis"))

            # Set UserConfig credential for each proxy. Assumes that any proxy
            # in the fairshare can query the CE infosys
            self.uc.CredentialString(self.db.getProxy(jobs[0]['proxyid']))
            # retriever contains a list of CE endpoints
            retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints)
            retriever.wait()
            # targets is the list of queues
            # parse target.ComputingService.ID for the CE hostname
            # target.ComputingShare.Name is the queue name
            targets = retriever.GetExecutionTargets()

            # Filter only sites for this process
            queuelist = []
            for target in targets:
                if not target.ComputingService.ID:
                    self.log.info(
                        "Target %s does not have ComputingService ID defined, skipping"
                        % target.ComputingService.Name)
                    continue
                # If EMI-ES infoendpoint, force EMI-ES submission
                if infoendpoints[
                        0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                    self.log.debug(
                        "Rejecting target interface %s because not EMI-ES" %
                        target.ComputingEndpoint.InterfaceName)
                    continue
                # Check for matching host and queue
                targethost = re.sub(
                    ':arex$', '',
                    re.sub('urn:ogf:ComputingService:', '',
                           target.ComputingService.ID))
                targetqueue = target.ComputingShare.Name
                if clusterhost and targethost != clusterhost:
                    self.log.debug(
                        'Rejecting target host %s as it does not match %s' %
                        (targethost, clusterhost))
                    continue
                if clusterqueue and targetqueue != clusterqueue:
                    self.log.debug(
                        'Rejecting target queue %s as it does not match %s' %
                        (targetqueue, clusterqueue))
                    continue
                if targetqueue in self.conf.getList(['queuesreject', 'item']):
                    self.log.debug(
                        'Rejecting target queue %s in queuesreject list' %
                        targetqueue)
                    continue
                elif targethost in self.conf.getList(
                    ['clustersreject', 'item']):
                    self.log.debug(
                        'Rejecting target host %s in clustersreject list' %
                        targethost)
                    continue
                else:
                    # tmp hack
                    target.ComputingShare.LocalWaitingJobs = 0
                    target.ComputingShare.PreLRMSWaitingJobs = 0
                    target.ExecutionEnvironment.CPUClockSpeed = 2000
                    qjobs = self.db.getArcJobsInfo(
                        "cluster='" + str(self.cluster) +
                        "' and  arcstate='submitted' and fairshare='%s'" %
                        fairshare, ['id', 'priority'])
                    rjobs = self.db.getArcJobsInfo(
                        "cluster='" + str(self.cluster) +
                        "' and  arcstate='running' and fairshare='%s'" %
                        fairshare, ['id'])

                    # max queued priority
                    try:
                        maxprioqueued = max(
                            qjobs, key=lambda x: x['priority'])['priority']
                    except:
                        maxprioqueued = 0
                    self.log.info("Max priority queued: %d" % maxprioqueued)

                    # Set number of submitted jobs to running * 0.15 + 100/num of shares
                    # Note: assumes only a few shares are used
                    jlimit = len(rjobs) * 0.15 + 100 / len(fairshares)
                    if str(self.cluster).find('arc-boinc-0') != -1:
                        jlimit = len(rjobs) * 0.15 + 400
                    if str(self.cluster).find('XXXpikolit') != -1:
                        jlimit = len(rjobs) * 0.15 + 100
                    if str(self.cluster).find('arc05.lcg') != -1:
                        jlimit = len(rjobs) * 0.15 + 400
                    target.ComputingShare.PreLRMSWaitingJobs = len(qjobs)
                    if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued)
                                               and (maxpriowaiting > 10)):
                        if maxpriowaiting > maxprioqueued:
                            self.log.info(
                                "Overriding limit, maxpriowaiting: %d > maxprioqueued: %d"
                                % (maxpriowaiting, maxprioqueued))
                        queuelist.append(target)
                        self.log.debug("Adding target %s:%s" %
                                       (targethost, targetqueue))
                    else:
                        self.log.info(
                            "%s/%s already at limit of submitted jobs for fairshare %s"
                            % (targethost, targetqueue, fairshare))

            # check if any queues are available, if not leave and try again next time
            if not queuelist:
                self.log.info("No free queues available")
                self.db.Commit()
                # EMI-ES proxy problem - see bug 3685
                if self.cluster and self.cluster.startswith('https://'):
                    raise ExceptInterrupt(15)
                continue

            self.log.info("start submitting")

            # Just run one thread for each job in sequence. Strange things happen
            # when trying to create a new UserConfig object for each thread.
            for j in jobs:
                self.log.debug("%s: preparing submission" % j['appjobid'])
                jobdescstr = str(
                    self.db.getArcJobDescription(str(j['jobdesc'])))
                jobdescs = arc.JobDescriptionList()
                if not jobdescstr or not arc.JobDescription_Parse(
                        jobdescstr, jobdescs):
                    self.log.error("%s: Failed to prepare job description" %
                                   j['appjobid'])
                    continue
                # TODO: might not work if proxies are different within a share
                # since same uc object is shared among threads
                self.uc.CredentialString(self.db.getProxy(j['proxyid']))
                t = SubmitThr(Submit, j['id'], j['appjobid'], jobdescs,
                              self.uc, self.log)
                self.RunThreadsSplit([t], 1)
                count = count + 1

            self.log.info("threads finished")
            # commit transaction to release row locks
            self.db.Commit()

            # EMI-ES proxy problem - see bug 3685
            if self.cluster and self.cluster.startswith('https://'):
                raise ExceptInterrupt(15)

        self.log.info("end submitting")

        return count
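The throttling above keeps per-share submissions proportional to the number of running jobs and lets a sufficiently high-priority waiting job override the cap. A standalone sketch of that decision, with invented helper and variable names rather than actual aCT code:

def allow_submission(qjobs, rjobs, fairshares, maxpriowaiting):
    # Sketch only: qjobs/rjobs are lists of dicts as returned by getArcJobsInfo
    maxprioqueued = max((j['priority'] for j in qjobs), default=0)
    # the submission limit grows with running jobs; the base quota is split across shares
    jlimit = len(rjobs) * 0.15 + 100 / len(fairshares)
    # a waiting job with priority above 10 and above anything queued overrides the limit
    override = maxpriowaiting > maxprioqueued and maxpriowaiting > 10
    return len(qjobs) < jlimit or override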
Esempio n. 21
0
    def updatePandaHeartbeatBulk(self,pstatus):
        """
        Heartbeat status updates in bulk.
        """
        columns = ['pandaid', 'siteName', 'startTime', 'computingElement', 'node', 'corecount', 'eventranges']
        jobs=self.dbpanda.getJobs("pandastatus='"+pstatus+"' and sendhb=1 and ("+self.dbpanda.timeStampLessThan("theartbeat", self.conf.get(['panda','heartbeattime']))+" or modified > theartbeat) limit 1000", columns)
        #jobs=self.dbpanda.getJobs("pandastatus='"+pstatus+"' and sendhb=1 and ("+self.dbpanda.timeStampLessThan("theartbeat", 60)+" or modified > theartbeat) limit 1000", columns)
        if not jobs:
            return

        self.log.info("Update heartbeat for %d jobs in state %s (%s)" % (len(jobs), pstatus, ','.join([str(j['pandaid']) for j in jobs])))

        changed_pstatus = False
        if pstatus == 'sent':
            pstatus = 'starting'
            changed_pstatus = True

        tlist=[]
        jobsbyproxy = {}
        for j in jobs:
            # Don't send transferring heartbeat for ES jobs, they must be in running while events are updated.
            # Use a per-job state so that one ES job does not change the state reported for the rest of the batch.
            jobstate = pstatus
            if pstatus == 'transferring' and j['eventranges']:
                jobstate = 'running'
            jd = {'jobId': j['pandaid'], 'state': jobstate}
            if jobstate != 'starting':
                jd['startTime'] = j['startTime']
            if j['computingElement']:
                if j['computingElement'].find('://') != -1: # this if is only needed during transition period
                    jd['computingElement'] = arc.URL(str(j['computingElement'])).Host()
                else:
                    jd['computingElement'] = j['computingElement']
            jd['node'] = j['node']
            jd['siteName'] = j['siteName']
            # For starting truepilot jobs send pilotID with expected log
            # location so logs are available in case of lost heartbeat
            if pstatus == 'starting' and not changed_pstatus and self.sites[j['siteName']]['truepilot']:
                date = time.strftime('%Y-%m-%d', time.gmtime())
                logurl = '/'.join([self.conf.get(["joblog","urlprefix"]), date, j['siteName'], '%s.out' % j['pandaid']])
                jd['pilotID'] = '%s|Unknown|Unknown|Unknown|Unknown' % logurl
            try:
                corecount = int(j['corecount']) if j['corecount'] > 0 else self.sites[j['siteName']]['corecount']
                jd['jobMetrics'] = "coreCount=%d" % corecount
                jd['coreCount'] = corecount
            except:
                self.log.warning('%s: no corecount available' % j['pandaid'])

            jobsbyproxy.setdefault(self.sites[j['siteName']]['type'], []).append(jd)

        for sitetype, jobs in jobsbyproxy.items():
            t = PandaBulkThr(self.pandas.get(sitetype, self.pandas.get('production')).updateStatuses, [j['jobId'] for j in jobs], jobs)
            tlist.append(t)
        aCTUtils.RunThreadsSplit(tlist, self.nthreads)

        for t in tlist:
            if not t or not t.result or not t.result[0]:
                # Strange response from panda, try later
                continue

            for pandaid, response in zip(t.ids, t.result[1]):
                try:
                    result = cgi.parse_qs(response)
                except Exception:
                    self.log.error('Could not parse result from panda: %s' % response)
                    continue

                if not result.get('StatusCode'):
                    # Strange response from panda, try later
                    continue
                if result['StatusCode'][0] == '60':
                    self.log.error('Failed to contact Panda, proxy may have expired')
                    continue
                if result.get('command', [''])[0] not in ['', "NULL"]:
                    self.log.info("%s: response: %s" % (pandaid, result))
                jd = {}
                if changed_pstatus:
                    jd['pandastatus'] = pstatus
                # Make sure heartbeat is ahead of modified time so it is not picked up again
                jd['theartbeat'] = self.dbpanda.getTimeStamp(time.time()+1)
                # If panda tells us to kill the job, set actpandastatus to tobekilled
                # and remove from heartbeats
                if result.get('command', [''])[0] in ["tobekilled", "badattemptnr", "alreadydone"]:
                    self.log.info('%s: cancelled by panda' % pandaid)
                    jd['actpandastatus'] = "tobekilled"
                    jd['pandastatus'] = None
                self.dbpanda.updateJob(pandaid, jd)

        self.log.info("Threads finished")
Esempio n. 22
0
    def _download_outputs(self, files, logdir, jobid, pandaid, userconfig,
                          log):
        '''Download the output files specified in downloadfiles'''

        # construct datapoint object, initialising connection. Use the same
        # object until base URL changes. TODO group by base URL.

        datapoint = arc_utils.DataPoint(str(jobid), userconfig)
        dp = datapoint.h
        dm = arc.DataMover()
        dm.retry(False)
        dm.passive(True)
        dm.secure(False)
        fetched = []
        notfetched = []
        notfetchedretry = []

        # create required local log dirs
        try:
            os.makedirs(logdir, 0o755)
        except OSError as e:
            if e.errno != errno.EEXIST or not os.path.isdir(logdir):
                log.warning('Failed to create directory {0}: {1}'.format(
                    logdir, os.strerror(e.errno)))
                notfetched.append(jobid)
                return (fetched, notfetched, notfetchedretry)

        tmpdldir = os.path.join(self.tmpdir, pandaid)
        try:
            os.makedirs(tmpdldir, 0o755)
        except OSError as e:
            if e.errno != errno.EEXIST or not os.path.isdir(tmpdldir):
                log.warning('Failed to create directory {0}: {1}'.format(
                    tmpdldir, os.strerror(e.errno)))
                notfetched.append(jobid)
                return (fetched, notfetched, notfetchedretry)

        filelist = files.split(';')
        if re.search(r'[\*\[\]\?]', files):
            # found wildcard, need to get sessiondir list
            remotefiles = self.listUrlRecursive(jobid, log)
            expandedfiles = []
            for wcf in filelist:
                if re.search(r'[\*\[\]\?]', wcf):
                    # only match wildcards in matching dirs
                    expandedfiles += [
                        rf for rf in remotefiles if fnmatch.fnmatch(rf, wcf)
                        and os.path.dirname(rf) == os.path.dirname(wcf)
                    ]
                else:
                    expandedfiles.append(wcf)
            # remove duplicates from wildcard matching through set
            filelist = list(set(expandedfiles))

        for f in filelist:
            if f == 'gmlog/errors':
                localfile = os.path.join(logdir, '%s.log' % pandaid)
            elif f.find('.log') != -1:
                localfile = os.path.join(logdir, '%s.out' % pandaid)
            else:
                localfile = os.path.join(tmpdldir, f)

            remotefile = arc.URL(str(jobid + '/' + f))
            dp.SetURL(remotefile)
            localdp = arc_utils.DataPoint(str(localfile), userconfig)
            # do the copy
            status = dm.Transfer(dp, localdp.h, arc.FileCache(), arc.URLMap())
            if not status and str(status).find(
                    'File unavailable'
            ) == -1:  # tmp fix for globus error which is always retried
                if status.Retryable():
                    log.warning(
                        'Failed to download but will retry {0}: {1}'.format(
                            dp.GetURL().str(), str(status)))
                    notfetchedretry.append(jobid)
                else:
                    log.error(
                        'Failed to download with permanent failure {0}: {1}'.
                        format(dp.GetURL().str(), str(status)))
                    notfetched.append(jobid)
            else:
                os.chmod(localfile, 0o644)
                log.info('Downloaded {0}'.format(dp.GetURL().str()))

        if jobid not in notfetched and jobid not in notfetchedretry:
            fetched.append(jobid)

        return (fetched, notfetched, notfetchedretry)
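The wildcard handling above only expands a pattern against remote files that sit in the same directory as the pattern itself. The same filter in isolation, using stdlib fnmatch with an invented file listing:

import fnmatch
import os

remotefiles = ['log/payload.log', 'log/pilot.log', 'output/data.root']   # invented listing
pattern = 'log/*.log'

matches = [rf for rf in remotefiles
           if fnmatch.fnmatch(rf, pattern)
           and os.path.dirname(rf) == os.path.dirname(pattern)]
# matches == ['log/payload.log', 'log/pilot.log']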
Esempio n. 23
0
    def copyFinishedFiles(self, arcjobid, extractmetadata):
        """
        - if extractmetadata: (normal arc jobs, not true pilot jobs)
           - store heartbeat file under tmp/pickle or under harvester access
             point if specified
        - copy .job.log file to jobs/date/pandaqueue/pandaid.out
        - copy gmlog errors to jobs/date/pandaqueue/pandaid.log
        """

        columns = ['JobID', 'appjobid', 'cluster', 'UsedTotalWallTime', 'arcjobs.EndTime',
                   'ExecutionNode', 'stdout', 'fairshare', 'pandajobs.created', 'metadata']
        select = "arcjobs.id=%d AND arcjobs.id=pandajobs.arcjobid" % arcjobid
        aj = self.dbarc.getArcJobsInfo(select, columns=columns, tables='arcjobs,pandajobs')
        if not aj or 'JobID' not in aj[0] or not aj[0]['JobID']:
            self.log.error('No JobID in arcjob %s: %s'%(str(arcjobid), str(aj)))
            return False
        aj = aj[0]
        jobid = aj['JobID']
        sessionid = jobid[jobid.rfind('/')+1:]
        date = aj['created'].strftime('%Y-%m-%d')
        if extractmetadata:
            try:
                jobinfo = aCTPandaJob(filename=os.path.join(self.tmpdir, sessionid, 'heartbeat.json'))
            except Exception as x:
                self.log.error("%s: failed to load heartbeat file for arcjob %s: %s" %(aj['appjobid'], jobid, x))
                jobinfo = aCTPandaJob(jobinfo={'jobId': aj['appjobid'], 'state': 'finished'})

            # update heartbeat and dump to tmp/heartbeats
            jobinfo.computingElement = arc.URL(str(aj['cluster'])).Host()
            if hasattr(jobinfo, 'startTime') and hasattr(jobinfo, 'endTime'):
                # take values from the pilot
                jobinfo.startTime = datetime.datetime.utcfromtimestamp(jobinfo.startTime).strftime('%Y-%m-%d %H:%M:%S')
                jobinfo.endTime = datetime.datetime.utcfromtimestamp(jobinfo.endTime).strftime('%Y-%m-%d %H:%M:%S')
            else:
                # Use ARC values
                if aj['EndTime']:
                    # datetime cannot be serialised to json so use string (for harvester)
                    jobinfo.startTime = (aj['EndTime'] - datetime.timedelta(0, aj['UsedTotalWallTime'])).strftime('%Y-%m-%d %H:%M:%S')
                    jobinfo.endTime = aj['EndTime'].strftime('%Y-%m-%d %H:%M:%S')
                    # Sanity check for efficiency > 100%
                    cputimepercore = getattr(jobinfo, 'cpuConsumptionTime', 0) / getattr(jobinfo, 'coreCount', 1)
                    if aj['UsedTotalWallTime'] < cputimepercore:
                        self.log.warning('%s: Adjusting reported walltime %d to CPU time %d' %
                                          (aj['appjobid'], aj['UsedTotalWallTime'], cputimepercore))
                        jobinfo.startTime = (aj['EndTime'] - datetime.timedelta(0, cputimepercore)).strftime('%Y-%m-%d %H:%M:%S')
                else:
                    self.log.warning('%s: no endtime found' % aj['appjobid'])
            if len(aj["ExecutionNode"]) > 255:
                jobinfo.node = aj["ExecutionNode"][:254]
                self.log.warning("%s: Truncating wn hostname from %s to %s" % (aj['appjobid'], aj['ExecutionNode'], jobinfo.node))
            else:
                jobinfo.node = aj["ExecutionNode"]

            try:
                smeta = json.loads(aj['metadata'].decode())
            except:
                smeta = None

            if smeta and smeta.get('harvesteraccesspoint'):
                # de-serialise the pilot's metaData string back into a JSON object
                try:
                    jobinfo.metaData = json.loads(jobinfo.metaData)
                except Exception as e:
                    self.log.warning("%s: no metaData in pilot metadata: %s" % (aj['appjobid'], str(e)))
                jobinfo.writeToFile(os.path.join(smeta['harvesteraccesspoint'], 'jobReport.json'))
            else:
                jobinfo.writeToFile(os.path.join(self.tmpdir, "heartbeats", "%s.json" % aj['appjobid']))

        # copy to joblog dir files downloaded for the job: gmlog errors and pilot log
        outd = os.path.join(self.conf.get(['joblog','dir']), date, aj['fairshare'])
        try:
            os.makedirs(outd, 0o755)
        except:
            pass

        localdir = os.path.join(self.tmpdir, sessionid)
        gmlogerrors = os.path.join(localdir, "gmlog", "errors")
        arcjoblog = os.path.join(outd, "%s.log" % aj['appjobid'])
        if not os.path.exists(arcjoblog):
            try:
                shutil.move(gmlogerrors, arcjoblog)
                os.chmod(arcjoblog, 0o644)
            except:
                self.log.error("Failed to copy %s" % gmlogerrors)

        pilotlog = aj['stdout']
        if not pilotlog and os.path.exists(localdir):
            pilotlogs = os.listdir(localdir)
            for f in pilotlogs:
                if f.find('.log') != -1:
                    pilotlog = f
        if pilotlog:
            try:
                shutil.move(os.path.join(localdir, pilotlog),
                            os.path.join(outd, '%s.out' % aj['appjobid']))
                os.chmod(os.path.join(outd, '%s.out' % aj['appjobid']), 0o644)
            except Exception as e:
                self.log.error("Failed to copy file %s: %s" % (os.path.join(localdir,pilotlog), str(e)))
                return False

        return True
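When the pilot heartbeat carries no timestamps, the code above reconstructs startTime from the ARC EndTime and walltime, and never reports a walltime shorter than the CPU time per core. A small sketch of that arithmetic with invented numbers:

import datetime

endtime = datetime.datetime(2021, 6, 1, 12, 0, 0)   # invented ARC EndTime
walltime = 3600                                      # seconds, as in UsedTotalWallTime
cputime, cores = 7200, 2                             # invented pilot-reported values

cputimepercore = cputime / cores
# efficiency above 100% is unphysical, so stretch the walltime if needed
effective = max(walltime, cputimepercore)
starttime = (endtime - datetime.timedelta(seconds=effective)).strftime('%Y-%m-%d %H:%M:%S')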
Esempio n. 24
0
 def process(self, inmsg, outmsg):
     logger.msg(arc.DEBUG, "EchoService (python) 'Process' called")
     # time.sleep(10)
     # get the payload from the message
     inpayload = inmsg.Payload()
     logger.msg(
         arc.VERBOSE, 'inmsg.Auth().Export(arc.SecAttr.ARCAuth) = %s' %
         inmsg.Auth().Export(arc.SecAttr.ARCAuth).GetXML())
     logger.msg(
         arc.VERBOSE,
         'inmsg.Attributes().getAll() = %s ' % inmsg.Attributes().getAll())
     logger.msg(arc.INFO,
                "EchoService (python) got: %s " % inpayload.GetXML())
     # the first child of the payload should be the name of the request
     request_node = inpayload.Child()
     # get the namespace
     request_namespace = request_node.Namespace()
     logger.msg(
         arc.DEBUG,
         "EchoService (python) request_namespace: %s" % request_namespace)
     if request_namespace != echo_ns:
         if request_namespace == wsrf_rp_ns:
             outpayload = arc.PayloadSOAP(arc.NS({'wsrf-rp': wsrf_rp_ns}))
             outpayload.NewChild(
                 'wsrf-rp:GetResourcePropertyDocumentResponse').NewChild(
                     self.GetLocalInformation())
             outmsg.Payload(outpayload)
             logger.msg(arc.DEBUG, "outpayload %s" % outpayload.GetXML())
             return arc.MCC_Status(arc.STATUS_OK)
         raise Exception('wrong namespace. expected: %s' % echo_ns)
     # get the name of the request without the namespace prefix
     # this is the name of the Body node's first child
     request_name = request_node.Name()
     # create an answer payload
     ns = arc.NS({'echo': echo_ns})
     outpayload = arc.PayloadSOAP(ns)
     # here we defined that 'echo' prefix will be the namespace prefix of 'http://www.nordugrid.org/schemas/echo'
     # get the message
     say = str(request_node.Get('say'))
     # put it between the response-prefix and the response-suffix
     hear = self.prefix + say + self.suffix
     if request_name == 'double':
         # if the name of the request is 'double'
         # we create a new echo message which we send to http://localhost:60000/Echo using the ClientSOAP object
         cfg = arc.MCCConfig()
         ssl = False
         if self.ssl_config:
             cfg.AddCertificate(self.ssl_config.get('cert_file', None))
             cfg.AddPrivateKey(self.ssl_config.get('key_file', None))
             if 'ca_file' in self.ssl_config:
                 cfg.AddCAFile(self.ssl_config.get('ca_file', None))
             else:
                 cfg.AddCADir(self.ssl_config.get('ca_dir', None))
             ssl = True
         if ssl:
             url = arc.URL('https://localhost:60000/Echo')
             logger.msg(
                 arc.DEBUG,
                 'Calling https://localhost:60000/Echo using ClientSOAP')
         else:
             url = arc.URL('http://localhost:60000/Echo')
             logger.msg(
                 arc.DEBUG,
                 'Calling http://localhost:60000/Echo using ClientSOAP')
         # creating the ClientSOAP object
         s = arc.ClientSOAP(cfg, url)
         new_payload = arc.PayloadSOAP(ns)
         # creating the message
         new_payload.NewChild('echo:echo').NewChild('echo:say').Set(hear)
         logger.msg(arc.DEBUG, 'new_payload %s' % new_payload.GetXML())
         # sending the message
         resp, status = s.process(new_payload)
         # get the response
         hear = str(resp.Get('echoResponse').Get('hear'))
     elif request_name == 'httplib':
         # if the name of the request is 'httplib'
         # we create a new echo message which we send to http://localhost:60000/echo using python's built-in http client
         try:
             import http.client as httplib
         except ImportError:
             import httplib
         logger.msg(arc.DEBUG,
                    'Calling http://localhost:60000/Echo using httplib')
         # create the connection
         h = httplib.HTTPConnection('localhost', 60000)
         new_payload = arc.PayloadSOAP(ns)
         # create the message
         new_payload.NewChild('echo:echo').NewChild('echo:say').Set(hear)
         logger.msg(arc.DEBUG, 'new_payload %s' % new_payload.GetXML())
         # send the message
         h.request('POST', '/Echo', new_payload.GetXML())
         r = h.getresponse()
         response = r.read()
         logger.msg(arc.DEBUG, response)
         resp = arc.XMLNode(response)
         # get the response
         hear = str(resp.Child().Get('echoResponse').Get('hear'))
     elif request_name == 'wait':
         logger.msg(arc.DEBUG, 'Start waiting 10 sec...')
         time.sleep(10)
         logger.msg(arc.DEBUG, 'Waiting ends.')
     # we create a node at '/echo:echoResponse/echo:hear' and put the string in it
     outpayload.NewChild('echo:echoResponse').NewChild('echo:hear').Set(
         hear)
     outmsg.Payload(outpayload)
     logger.msg(arc.DEBUG, "outpayload %s" % outpayload.GetXML())
     # return with STATUS_OK
     return arc.MCC_Status(arc.STATUS_OK)
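The 'httplib' branch above sends the SOAP envelope with Python's built-in HTTP client instead of arc.ClientSOAP. The same round trip stripped down (host, port and envelope are placeholders, not a valid request):

import http.client

conn = http.client.HTTPConnection('localhost', 60000)     # assumed local Echo service
envelope = '<soap-env:Envelope xmlns:soap-env="http://schemas.xmlsoap.org/soap/envelope/">...</soap-env:Envelope>'
conn.request('POST', '/Echo', envelope)                    # placeholder body for illustration
print(conn.getresponse().read())
conn.close()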
Esempio n. 25
0
    def getJobOutput(self, jobID, workingDirectory=None):
        """Get the specified job standard output and error files.
        Standard output and error are returned as strings.
        If further outputs are retrieved, they are stored in workingDirectory.
        """
        result = self._prepareProxy()
        if not result["OK"]:
            self.log.error("ARCComputingElement: failed to set up proxy",
                           result["Message"])
            return result
        self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"])

        if jobID.find(":::") != -1:
            pilotRef, stamp = jobID.split(":::")
        else:
            pilotRef = jobID
            stamp = ""
        if not stamp:
            return S_ERROR("Pilot stamp not defined for %s" % pilotRef)

        job = self._getARCJob(pilotRef)

        arcID = os.path.basename(pilotRef)
        self.log.debug("Retrieving pilot logs for %s" % pilotRef)
        if not workingDirectory:
            if "WorkingDirectory" in self.ceParameters:
                workingDirectory = os.path.join(
                    self.ceParameters["WorkingDirectory"], arcID)
            else:
                workingDirectory = arcID
        outFileName = os.path.join(workingDirectory, "%s.out" % stamp)
        errFileName = os.path.join(workingDirectory, "%s.err" % stamp)
        self.log.debug("Working directory for pilot output %s" %
                       workingDirectory)

        # Retrieve the job output:
        # last parameter allows downloading the outputs even if workingDirectory already exists
        isItOkay = job.Retrieve(self.usercfg, arc.URL(str(workingDirectory)),
                                True)
        if isItOkay:
            output = None
            error = None
            try:
                with open(outFileName, "r") as outFile:
                    output = outFile.read()
                os.unlink(outFileName)
                with open(errFileName, "r") as errFile:
                    error = errFile.read()
                os.unlink(errFileName)
            except IOError as e:
                self.log.error("Error downloading outputs",
                               repr(e).replace(",)", ")"))
                return S_ERROR("Error downloading outputs")
            self.log.debug("Pilot output = %s" % output)
            self.log.debug("Pilot error = %s" % error)
        else:
            job.Update()
            arcState = job.State.GetGeneralState()
            if arcState != "Undefined":
                return S_ERROR(
                    "Failed to retrieve output for %s as job is not finished (maybe not started yet)"
                    % jobID)
            self.log.debug(
                "Could not retrieve pilot output for %s - either permission / proxy error or could not connect to CE"
                % pilotRef)
            return S_ERROR("Failed to retrieve output for %s" % jobID)

        return S_OK((output, error))
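DIRAC packs the pilot reference and a stamp into a single jobID separated by ':::'; the stamp then names the stdout/stderr files fetched into the working directory. A sketch of that convention with invented values:

import os

jobID = 'https://ce.example.org:443/arex/abc123:::stamp01'   # invented job reference
pilotRef, _, stamp = jobID.partition(':::')                  # stamp stays '' if no ':::'
workingDirectory = os.path.basename(pilotRef)                # here 'abc123'
outFileName = os.path.join(workingDirectory, '%s.out' % stamp)
errFileName = os.path.join(workingDirectory, '%s.err' % stamp)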
Esempio n. 26
0
#! /usr/bin/env python

from __future__ import print_function

import arc
import sys
root_logger = arc.Logger_getRootLogger()
root_logger.addDestination(arc.LogStream(sys.stdout))
root_logger.setThreshold(arc.ERROR)
if len(sys.argv) < 2:
    print("Usage: echo_client.py URL [message]")
    print(
        "  echo_client gets the credentials from the default user config file")
    sys.exit(-1)
url = arc.URL(sys.argv[1])
try:
    message = sys.argv[2]
except IndexError:
    message = 'hi!'
cfg = arc.MCCConfig()
uc = arc.UserConfig('')
uc.ApplyToConfig(cfg)
s = arc.ClientSOAP(cfg, url)
outpayload = arc.PayloadSOAP(
    arc.NS('echo', 'http://www.nordugrid.org/schemas/echo'))
outpayload.NewChild('echo:echo').NewChild('echo:say').Set(message)
resp, status = s.process(outpayload)
print(resp.GetXML(True))
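The client picks up credentials from the default user config file; to point it at a specific proxy instead, one could for example set the proxy path on the UserConfig before applying it to the MCCConfig (path is an example, adjust to your environment):

uc = arc.UserConfig('')
uc.ProxyPath('/tmp/x509up_u1000')   # example proxy location
uc.ApplyToConfig(cfg)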
Esempio n. 27
0
    def fetchSome(self, jobs, downloadfiles):

        # Get specified files for the jobs in downloadfiles
        # jobs: id: Job object
        # downloadfiles: id: list of files relative to session dir, with wildcards
        if not jobs or not downloadfiles:
            return ([], [], [])

        # construct datapoint object, initialising connection. Use the same
        # object until base URL changes. TODO group by base URL.
        datapoint = aCTUtils.DataPoint(next(iter(jobs.values())).JobID, self.uc)
        dp = datapoint.h
        dm = arc.DataMover()
        dm.retry(False)
        dm.passive(True)
        dm.secure(False)
        fetched = []
        notfetched = []
        notfetchedretry = []

        for (id, job) in jobs.items():
            if id not in downloadfiles:
                continue
            jobid = job.JobID

            # If connection URL is different reconnect
            if arc.URL(jobid).ConnectionURL() != dp:
                datapoint = aCTUtils.DataPoint(jobid, self.uc)
                dp = datapoint.h
            localdir = str(self.conf.get(['tmp', 'dir'
                                          ])) + jobid[jobid.rfind('/'):] + '/'

            files = downloadfiles[id].split(';')
            if re.search(r'[\*\[\]\?]', downloadfiles[id]):
                # found wildcard, need to get sessiondir list
                remotefiles = self.listUrlRecursive(jobid)
                expandedfiles = []
                for wcf in files:
                    if re.search(r'[\*\[\]\?]', wcf):
                        # only match wildcards in matching dirs
                        expandedfiles += [
                            rf for rf in remotefiles
                            if fnmatch.fnmatch(rf, wcf)
                            and os.path.dirname(rf) == os.path.dirname(wcf)
                        ]
                    else:
                        expandedfiles.append(wcf)
                # remove duplicates from wildcard matching through set
                files = list(set(expandedfiles))

            for f in files:
                localfile = str(localdir + f)
                localfiledir = localfile[:localfile.rfind('/')]
                # create required local dirs
                try:
                    os.makedirs(localfiledir, 0o755)
                except OSError as e:
                    if e.errno != errno.EEXIST or not os.path.isdir(
                            localfiledir):
                        self.log.warning('Failed to create directory %s: %s',
                                         localfiledir, os.strerror(e.errno))
                        notfetched.append(jobid)
                        break
                remotefile = arc.URL(str(jobid + '/' + f))
                dp.SetURL(remotefile)
                localdp = aCTUtils.DataPoint(localfile, self.uc)
                # do the copy
                status = dm.Transfer(dp, localdp.h, arc.FileCache(),
                                     arc.URLMap())
                if not status and str(status).find(
                        'File unavailable'
                ) == -1:  # tmp fix for globus error which is always retried
                    if status.Retryable():
                        self.log.warning(
                            'Failed to download but will retry %s: %s',
                            dp.GetURL().str(), str(status))
                        notfetchedretry.append(jobid)
                    else:
                        self.log.error(
                            'Failed to download with permanent failure %s: %s',
                            dp.GetURL().str(), str(status))
                        notfetched.append(jobid)
                    break
                self.log.info('Downloaded %s', dp.GetURL().str())
            if jobid not in notfetched and jobid not in notfetchedretry:
                fetched.append(jobid)
        return (fetched, notfetched, notfetchedretry)
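Each file copy above reduces to a single DataMover.Transfer between a remote and a local DataPoint. A minimal sketch of one such copy, reusing the aCTUtils.DataPoint wrapper from this module (the URLs and the uc UserConfig object are placeholders):

dm = arc.DataMover()
dm.retry(False)     # no automatic retries, the caller decides
dm.passive(True)    # passive mode for data transfers
dm.secure(False)    # no data-channel encryption

src = aCTUtils.DataPoint('gsiftp://ce.example.org:2811/jobs/abc123/pilot.log', uc)   # placeholder remote URL
dst = aCTUtils.DataPoint('/tmp/abc123/pilot.log', uc)                                # placeholder local path
status = dm.Transfer(src.h, dst.h, arc.FileCache(), arc.URLMap())
if not status and status.Retryable():
    print('transient failure, will retry: %s' % str(status))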
Esempio n. 28
0
    def processFailed(self, arcjobs):
        """
        process jobs failed for other reasons than athena (log_extracts was not created by pilot)
        """
        if not arcjobs:
            return

        self.log.info("processing %d failed jobs" % len(arcjobs))
        for aj in arcjobs:
            jobid = aj['JobID']
            if not jobid:
                # Job was not even submitted, there is no more information
                self.log.warning(
                    "%s: Job has not been submitted yet so no information to report",
                    aj['appjobid'])
                continue

            cluster = arc.URL(str(jobid)).Host()
            sessionid = jobid[jobid.rfind('/') + 1:]
            date = time.strftime('%Y%m%d')
            outd = os.path.join(self.conf.get(['joblog', 'dir']), date,
                                cluster, sessionid)
            # Make sure the path up to outd exists
            try:
                os.makedirs(os.path.dirname(outd), 0o755)
            except:
                pass
            try:
                shutil.rmtree(outd)
            except:
                pass
            # copy from tmp to outd. tmp dir will be cleaned in validator
            localdir = os.path.join(self.arcconf.get(['tmp', 'dir']),
                                    sessionid)
            try:
                shutil.copytree(localdir, outd)
            except (OSError, shutil.Error) as e:
                self.log.warning("%s: Failed to copy job output for %s: %s" %
                                 (aj['appjobid'], jobid, str(e)))
                # Sometimes fetcher fails to get output, so just make empty dir
                try:
                    os.makedirs(outd, 0o755)
                except OSError as e:
                    self.log.warning(
                        "%s: Failed to create %s: %s. Job logs will be missing"
                        % (aj['appjobid'], outd, str(e)))

            # set right permissions
            aCTUtils.setFilePermissionsRecursive(outd)

            # set update, pickle from pilot is not available
            # some values might not be properly set
            # TODO synchronize error codes with the rest of production
            pupdate = aCTPandaJob()
            pupdate.jobId = aj['appjobid']
            pupdate.state = 'failed'
            pupdate.siteName = aj['siteName']
            pupdate.computingElement = cluster
            pupdate.schedulerID = self.conf.get(['panda', 'schedulerid'])
            pupdate.pilotID = self.conf.get(
                ["joblog", "urlprefix"]
            ) + "/" + date + "/" + cluster + '/' + sessionid + "|Unknown|Unknown|Unknown|Unknown"
            if len(aj["ExecutionNode"]) > 255:
                pupdate.node = aj["ExecutionNode"][:254]
                self.log.warning(
                    "%s: Truncating wn hostname from %s to %s" %
                    (aj['pandaid'], aj['ExecutionNode'], pupdate.node))
            else:
                pupdate.node = aj["ExecutionNode"]
            pupdate.pilotLog = self.createPilotLog(outd, aj['pandaid'])
            pupdate.cpuConsumptionTime = aj['UsedTotalCPUTime']
            pupdate.cpuConsumptionUnit = 'seconds'
            pupdate.cpuConversionFactor = 1
            pupdate.pilotTiming = "0|0|%s|0" % aj['UsedTotalWallTime']
            pupdate.exeErrorCode = aj['ExitCode']
            pupdate.exeErrorDiag = aj['Error']
            pupdate.pilotErrorCode = 1008
            codes = []
            codes.append("Job timeout")
            codes.append("qmaster enforced h_rt limit")
            codes.append("job killed: wall")
            codes.append("Job exceeded time limit")
            if [
                    errcode for errcode in codes
                    if re.search(errcode, aj['Error'])
            ]:
                pupdate.pilotErrorCode = 1213
            codes = []
            codes.append("Job probably exceeded memory limit")
            codes.append("job killed: vmem")
            codes.append("pvmem exceeded")
            if [
                    errcode for errcode in codes
                    if re.search(errcode, aj['Error'])
            ]:
                pupdate.pilotErrorCode = 1212
            pupdate.pilotErrorDiag = aj['Error']
            # set start/endtime
            pupdate.startTime = self.getStartTime(aj['EndTime'],
                                                  aj['UsedTotalWallTime'])
            pupdate.endTime = aj['EndTime']
            # save the pickle file to be used by aCTAutopilot panda update
            try:
                picklefile = os.path.join(self.arcconf.get(['tmp', 'dir']),
                                          "pickle",
                                          str(aj['pandaid']) + ".pickle")
                pupdate.writeToFile(picklefile)
            except Exception as e:
                self.log.warning("%s: Failed to write file %s: %s" %
                                 (aj['appjobid'], picklefile, str(e)))
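The error mapping above assigns 1213 to walltime-related and 1212 to memory-related LRMS messages, falling back to the generic 1008. The same classification condensed into a helper (a sketch, not part of aCT; memory patterns take precedence, as in the code above):

import re

def pilot_error_code(error):
    memorycodes = ["Job probably exceeded memory limit", "job killed: vmem", "pvmem exceeded"]
    walltimecodes = ["Job timeout", "qmaster enforced h_rt limit",
                     "job killed: wall", "Job exceeded time limit"]
    if any(re.search(c, error) for c in memorycodes):
        return 1212
    if any(re.search(c, error) for c in walltimecodes):
        return 1213
    return 1008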
Esempio n. 29
0
                #lfn = f.getElementsByTagName("logical")[0].getElementsByTagName("lfn")[0].getAttribute("name")
                #guid = str(file.getAttribute('ID'))
                size = ""
                adler32 = ""
                surl = ""
                se = ""
                for m in f.getElementsByTagName("metadata"):
                    v = m.getAttribute("att_value")
                    if m.getAttribute("att_name") == "fsize":
                        size = v
                    if m.getAttribute("att_name") == "adler32":
                        adler32 = v
                    # rewrite surl in xml
                    if m.getAttribute("att_name") == "surl":
                        surl = v
                        se = arc.URL(str(surl)).Host()
            except Exception as x:
                self.log.error('%s: %s' % (aj['appjobid'], x))
                outp = False

            if outp:
                checksum = "adler32:" + adler32
                if se not in surls:
                    surls[se] = []
                surls[se] += [{
                    "surl": surl,
                    "fsize": size,
                    "checksum": checksum,
                    "arcjobid": arcjobid
                }]
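The fragment above walks the <metadata> children of a PoolFileCatalog-style record to pull out the file size, adler32 checksum and SURL. A self-contained sketch of that extraction with xml.dom.minidom (the XML record is invented):

from xml.dom import minidom

xmlstr = '''<file>
  <metadata att_name="fsize" att_value="1024"/>
  <metadata att_name="adler32" att_value="0a1b2c3d"/>
  <metadata att_name="surl" att_value="srm://se.example.org/path/file.root"/>
</file>'''                                   # invented example record

f = minidom.parseString(xmlstr).documentElement
size = adler32 = surl = ""
for m in f.getElementsByTagName("metadata"):
    v = m.getAttribute("att_value")
    if m.getAttribute("att_name") == "fsize":
        size = v
    elif m.getAttribute("att_name") == "adler32":
        adler32 = v
    elif m.getAttribute("att_name") == "surl":
        surl = v
checksum = "adler32:" + adler32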