Example #1
    def test_adding_endpoints(self):
        container = arc.EndpointContainer()
        endpoint1 = arc.Endpoint()
        endpoint2 = arc.Endpoint()
        container.addEntity(endpoint1)
        container.addEntity(endpoint2)
        self.expect(container).to_have(2).endpoints()
    def test_getting_the_endpoints(self):
        arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back(
            [arc.Endpoint()])
        arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))

        self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)
        self.expect(container).to_be_empty()
        registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                                "org.nordugrid.sertest")
        self.retriever.addEndpoint(registry)
        self.retriever.wait()
        self.expect(container).to_have(1).endpoint()
    def test_filtering(self):
        arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back([
            arc.Endpoint("test1.nordugrid.org", ["cap1", "cap2"]),
            arc.Endpoint("test2.nordugrid.org", ["cap3", "cap4"]),
            arc.Endpoint("test3.nordugrid.org", ["cap1", "cap3"])
        ])
        arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back([
            arc.Endpoint("test1.nordugrid.org", ["cap1", "cap2"]),
            arc.Endpoint("test2.nordugrid.org", ["cap3", "cap4"]),
            arc.Endpoint("test3.nordugrid.org", ["cap1", "cap3"])
        ])
        arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back([
            arc.Endpoint("test1.nordugrid.org", ["cap1", "cap2"]),
            arc.Endpoint("test2.nordugrid.org", ["cap3", "cap4"]),
            arc.Endpoint("test3.nordugrid.org", ["cap1", "cap3"])
        ])
        arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))
        arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))
        arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))
        registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                                "org.nordugrid.sertest")

        options = arc.ServiceEndpointQueryOptions(False, ["cap1"])
        self.retriever = arc.ServiceEndpointRetriever(self.usercfg, options)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)
        self.retriever.addEndpoint(registry)
        self.retriever.wait()
        self.expect(container).to_have(2).endpoints()

        options = arc.ServiceEndpointQueryOptions(False, ["cap2"])
        self.retriever = arc.ServiceEndpointRetriever(self.usercfg, options)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)
        self.retriever.addEndpoint(registry)
        self.retriever.wait()
        self.expect(container).to_have(1).endpoint()

        options = arc.ServiceEndpointQueryOptions(False, ["cap5"])
        self.retriever = arc.ServiceEndpointRetriever(self.usercfg, options)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)
        self.retriever.addEndpoint(registry)
        self.retriever.wait()
        self.expect(container).to_have(0).endpoints()
    def test_getting_status(self):
        arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back(
            [arc.Endpoint()])
        arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.FAILED))

        self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)
        registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                                "org.nordugrid.sertest")
        self.retriever.addEndpoint(registry)
        self.retriever.wait()
        status = self.retriever.getStatusOfEndpoint(registry)
        self.expect(status).to_be_an_instance_of(arc.EndpointQueryingStatus)
        self.expect(status).to_be(arc.EndpointQueryingStatus.FAILED)
Example #5
def example():
    uc = arc.UserConfig()

    # Create a JobSupervisor to handle all the jobs
    job_supervisor = arc.JobSupervisor(uc)

    # Retrieve all the jobs from this computing element
    endpoint = arc.Endpoint("https://piff.hep.lu.se:443/arex",
                            arc.Endpoint.JOBLIST)
    sys.stdout.write("Querying %s for jobs...\n" % endpoint.str())
    retriever = arc.JobListRetriever(uc)
    retriever.addConsumer(job_supervisor)
    retriever.addEndpoint(endpoint)
    retriever.wait()

    sys.stdout.write("%s jobs found\n" % len(job_supervisor.GetAllJobs()))

    sys.stdout.write("Getting job states...\n")
    # Update the states of the jobs
    job_supervisor.Update()

    # Print state of updated jobs
    sys.stdout.write("The jobs have the following states: %s\n" % (", ".join(
        [job.State.GetGeneralState() for job in job_supervisor.GetAllJobs()])))

    # Select failed jobs
    job_supervisor.SelectByStatus(["Failed"])
    failed_jobs = job_supervisor.GetSelectedJobs()

    sys.stdout.write("The failed jobs:\n")
    for job in failed_jobs:
        job.SaveToStream(arc.CPyOstream(sys.stdout), True)
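The standalone example() snippets in this collection omit their module-level setup. A minimal preamble they roughly assume is sketched below; the logger name and threshold are illustrative choices, not taken from the original sources.

import os
import sys
import arc  # NorduGrid ARC client library Python bindings

# Make ARC library messages visible on stderr while experimenting
logger = arc.Logger(arc.Logger_getRootLogger(), "examples")
arc.Logger_getRootLogger().addDestination(arc.LogStream(sys.stderr))
arc.Logger_getRootLogger().setThreshold(arc.WARNING)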
Example #6
    def test_resubmit(self):
        self.usercfg.Broker("TEST")

        arc.TargetInformationRetrieverPluginTESTControl.targets = [
            self.create_test_target("http://test2.nordugrid.org")
        ]
        arc.TargetInformationRetrieverPluginTESTControl.status = arc.EndpointQueryingStatus(
            arc.EndpointQueryingStatus.SUCCESSFUL)

        js = arc.JobSupervisor(self.usercfg, [
            self.create_test_job(
                job_id="http://test.nordugrid.org/1234567890test1",
                state=arc.JobState.FAILED),
            self.create_test_job(
                job_id="http://test.nordugrid.org/1234567890test2",
                state=arc.JobState.RUNNING)
        ])

        self.expect(js.GetAllJobs()).to_have(2).jobs()

        endpoints = [
            arc.Endpoint("http://test2.nordugrid.org",
                         arc.Endpoint.COMPUTINGINFO, "org.nordugrid.tirtest")
        ]
        resubmitted = arc.JobList()
        result = js.Resubmit(0, endpoints, resubmitted)
Example #7
    def get_job(self, job_id):
        """
        Return an instance of ``arc.Job`` representing the job with the given ID

        :param job_id:            ID of the job as returned by `submit_job`
        :raises JobNotFoundError: if no job with the given ID could be found
        :return:                  Instance of ``arc.Job`` representing the job
        """
        user_config = self.get_user_config()

        # Create a JobSupervisor to handle all the jobs
        job_supervisor = arc.JobSupervisor(user_config)

        # Retrieve all the jobs from this computing element
        endpoint = arc.Endpoint(self.config.ARC_SERVER, arc.Endpoint.JOBLIST)
        retriever = arc.JobListRetriever(user_config)
        retriever.addConsumer(job_supervisor)
        retriever.addEndpoint(endpoint)
        retriever.wait()

        # Update the states of the jobs
        job_supervisor.Update()

        # Get all jobs and find job by ID
        jobs = job_supervisor.GetAllJobs()

        for job in jobs:
            if job.JobID == job_id:
                return job

        raise JobNotFoundError(
            "Could not find a job with ID '{}'".format(job_id))
Example #8
def example():
    # Creating a UserConfig object with the user's proxy
    # and the path of the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Creating an endpoint for a Computing Element
    endpoint = arc.Endpoint("piff.hep.lu.se:443/arex", arc.Endpoint.COMPUTINGINFO)

    # Creating a container which will store the retrieved jobs
    jobs = arc.JobContainer()

    # Create a job list retriever
    retriever = arc.JobListRetriever(uc)
    # Add our container as the consumer of this retriever, so it will get the results
    retriever.addConsumer(jobs)

    # Add our endpoint to the retriever, which starts querying it
    retriever.addEndpoint(endpoint)

    # Wait until it finishes
    retriever.wait()

    # Get the status of the retrieval
    sys.stdout.write("%s\n"%retriever.getStatusOfEndpoint(endpoint).str())

    sys.stdout.write("Number of jobs found: %d\n"%len(jobs))
    for job in jobs:
        job.SaveToStream(arc.CPyOstream(sys.stdout), True)
    def test_removing_consumer(self):
        self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)
        self.retriever.removeConsumer(container)
        registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                                "org.nordugrid.sertest")
        self.retriever.addEndpoint(registry)
        self.retriever.wait()
        self.expect(container).to_have(0).endpoints()
    def test_rejected_services(self):
        rejected = "http://test.nordugrid.org"
        not_rejected = "http://test2.nordugrid.org"
        arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back(
            [arc.Endpoint(rejected),
             arc.Endpoint(not_rejected)])
        arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))

        options = arc.ServiceEndpointQueryOptions(False, [], [rejected])
        self.retriever = arc.ServiceEndpointRetriever(self.usercfg, options)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)

        registry = arc.Endpoint("registry.nordugrid.org",
                                arc.Endpoint.REGISTRY)
        self.retriever.addEndpoint(registry)
        self.retriever.wait()
        self.expect(container).to_have(1).endpoint()
        self.expect(container[0].URLString).to_be(not_rejected)
    def test_the_status_is_started_first(self):
        arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back(
            [arc.Endpoint()])
        arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))
        self.condition = arc.SimpleCondition()
        arc.ServiceEndpointRetrieverPluginTESTControl.condition.push_back(
            self.condition)

        self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)
        registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                                "org.nordugrid.sertest")
        self.retriever.addEndpoint(registry)
        status = self.retriever.getStatusOfEndpoint(registry)
        self.expect(status).to_be(arc.EndpointQueryingStatus.STARTED)
        self.condition.signal()
        self.retriever.wait()
        status = self.retriever.getStatusOfEndpoint(registry)
        self.expect(status).to_be(arc.EndpointQueryingStatus.SUCCESSFUL)
Example #12
def example():
    # Creating a UserConfig object with the user's proxy
    # and the path of the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Query two registries (index servers) for Computing Services
    registries = [
        # for the index1, we specify that it is an EGIIS service
        arc.Endpoint("index1.nordugrid.org:2135/Mds-Vo-name=NorduGrid,o=grid",
                     arc.Endpoint.REGISTRY, "org.nordugrid.ldapegiis"),
        # for nordugrid.org we explicitly specify that it is an ARCHERY
        # registry (the org.nordugrid.archery InterfaceName)
        arc.Endpoint("nordugrid.org", arc.Endpoint.REGISTRY,
                     "org.nordugrid.archery")
    ]

    retriever = retrieve(uc, registries)

    # The retriever acts as a list containing all the discovered ComputingServices:
    sys.stdout.write("Discovered ComputingServices: %s\n" %
                     (", ".join([service.Name for service in retriever])))

    # Get all the ExecutionTargets on these ComputingServices
    targets = retriever.GetExecutionTargets()
    sys.stdout.write(
        "Number of ExecutionTargets on these ComputingServices: %d\n" %
        len(targets))

    # Query the local infosys (COMPUTINGINFO) of computing elements
    computing_elements = [
        # for piff, we specify that we want to query the LDAP GLUE2 tree
        arc.Endpoint("piff.hep.lu.se", arc.Endpoint.COMPUTINGINFO,
                     "org.nordugrid.ldapglue2"),
        # for pgs03, we don't specify the interface, we let the system try all possibilities
        arc.Endpoint("pgs03.grid.upjs.sk", arc.Endpoint.COMPUTINGINFO)
    ]

    retriever2 = retrieve(uc, computing_elements)

    # Get all the ExecutionTargets on these ComputingServices
    targets2 = retriever2.GetExecutionTargets()

    sys.stdout.write("The discovered ExecutionTargets:\n")
    for target in targets2:
        sys.stdout.write("%s\n" % str(target))

    # Query both registries and computing elements at the same time:
    endpoints = [
        arc.Endpoint("arc-emi.grid.upjs.sk/O=Grid/Mds-Vo-Name=ARC-EMI",
                     arc.Endpoint.REGISTRY),
        arc.Endpoint("piff.hep.lu.se", arc.Endpoint.COMPUTINGINFO,
                     "org.nordugrid.ldapglue2")
    ]

    retriever3 = retrieve(uc, endpoints)

    sys.stdout.write("Discovered ComputingServices: %s\n" %
                     (", ".join([service.Name for service in retriever3])))
Example #13
  def getCEStatus(self):
    """ Method to return information on running and pending jobs.
        We hope to satisfy both instances that use robot proxies and those which use proper configurations.
    """

    result = self._prepareProxy()
    if not result['OK']:
      gLogger.error('ARCComputingElement: failed to set up proxy', result['Message'])
      return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    # Try to find out which VO we are running for.
    vo = ''
    res = getVOfromProxyGroup()
    if res['OK']:
      vo = res['Value']

    result = S_OK()
    result['SubmittedJobs'] = 0
    if not vo:
      # Presumably the really proper way forward once the infosys-discuss WG comes up with a solution
      # and it is implemented. Needed for DIRAC instances which use robot certificates for pilots.
      endpoints = [arc.Endpoint("ldap://" + self.ceHost + "/MDS-Vo-name=local,o=grid",
                                arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.ldapng')]
      retriever = arc.ComputingServiceRetriever(self.usercfg, endpoints)
      retriever.wait()  # Takes a bit of time to get and parse the ldap information
      targets = retriever.GetExecutionTargets()
      ceStats = targets[0].ComputingShare
      gLogger.debug("Running jobs for CE %s : %s" % (self.ceHost, ceStats.RunningJobs))
      gLogger.debug("Waiting jobs for CE %s : %s" % (self.ceHost, ceStats.WaitingJobs))
      result['RunningJobs'] = ceStats.RunningJobs
      result['WaitingJobs'] = ceStats.WaitingJobs
    else:
      # The system which works properly at present for ARC CEs that are configured correctly.
      # But for this we need the VO to be known - ask me (Raja) for the whole story if interested.
      cmd = 'ldapsearch -x -LLL -H ldap://%s:2135 -b mds-vo-name=resource,o=grid "(GlueVOViewLocalID=%s)"' % (
          self.ceHost, vo.lower())
      res = shellCall(0, cmd)
      if not res['OK']:
        gLogger.debug("Could not query CE %s - is it down?" % self.ceHost)
        return res
      try:
        ldapValues = res['Value'][1].split("\n")
        running = [lValue for lValue in ldapValues if 'GlueCEStateRunningJobs' in lValue]
        waiting = [lValue for lValue in ldapValues if 'GlueCEStateWaitingJobs' in lValue]
        result['RunningJobs'] = int(running[0].split(":")[1])
        result['WaitingJobs'] = int(waiting[0].split(":")[1])
      except IndexError:
        res = S_ERROR('Unknown ldap failure for site %s' % self.ceHost)
        return res

    return result
Example #14
    def setUp(self):
        self.usercfg = arc.UserConfig(
            arc.initializeCredentialsType(
                arc.initializeCredentialsType.SkipCredentials))
        self.ce = arc.Endpoint()
        self.ce.URLString = "test.nordugrid.org"
        self.ce.InterfaceName = "org.nordugrid.tirtest"
        arc.TargetInformationRetrieverPluginTESTControl.delay = 0
        arc.TargetInformationRetrieverPluginTESTControl.targets = [
            arc.ComputingServiceType()
        ]
        arc.TargetInformationRetrieverPluginTESTControl.status = arc.EndpointQueryingStatus(
            arc.EndpointQueryingStatus.SUCCESSFUL)
    def test_constructor_returns_immediately(self):
        arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back(
            [arc.Endpoint()])
        arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))
        self.condition = arc.SimpleCondition()
        arc.ServiceEndpointRetrieverPluginTESTControl.condition.push_back(
            self.condition)

        self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)
        registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                                "org.nordugrid.sertest")
        self.retriever.addEndpoint(registry)
        # the endpoint should not arrive yet
        self.expect(container).to_have(0).endpoints()
        self.condition.signal()
        # we are not interested in it anymore
        self.retriever.removeConsumer(container)
        # we must wait until self.retriever is done otherwise 'condition' will go out of scope while being used.
        self.retriever.wait()
    def test_recursivity_with_filtering(self):
        arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back([
            arc.Endpoint("emir.nordugrid.org", arc.Endpoint.REGISTRY,
                         "org.nordugrid.sertest"),
            arc.Endpoint("ce.nordugrid.org", arc.Endpoint.COMPUTINGINFO,
                         "org.ogf.glue.emies.resourceinfo"),
        ])
        arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))
        arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back([
            arc.Endpoint("emir.nordugrid.org", arc.Endpoint.REGISTRY,
                         "org.nordugrid.sertest"),
            arc.Endpoint("ce.nordugrid.org", arc.Endpoint.COMPUTINGINFO,
                         "org.ogf.glue.emies.resourceinfo"),
        ])
        arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))

        options = arc.ServiceEndpointQueryOptions(
            True, ["information.discovery.resource"])
        self.retriever = arc.ServiceEndpointRetriever(self.usercfg, options)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)
        registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                                "org.nordugrid.sertest")
        self.retriever.addEndpoint(registry)
        self.retriever.wait()
        # expect to only get the ce.nordugrid.org, but that will be there twice
        # once from test.nordugrid.org, once from emir.nordugrid.org
        self.expect(container).to_have(2).endpoints()
        emirs = [
            endpoint for endpoint in container if "emir" in endpoint.URLString
        ]
        ces = [
            endpoint for endpoint in container if "ce" in endpoint.URLString
        ]
        self.expect(emirs).to_have(0).endpoints()
        self.expect(ces).to_have(2).endpoints()
    def test_status_of_typeless_registry(self):
        arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back(
            arc.EndpointList())
        arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))

        self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)
        registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY)
        self.retriever.addEndpoint(registry)
        self.retriever.wait()
        status = self.retriever.getStatusOfEndpoint(registry)
        self.expect(status).to_be(arc.EndpointQueryingStatus.SUCCESSFUL)
    def test_deleting_the_consumer_before_the_retriever(self):
        arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back(
            arc.EndpointList())
        arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))

        self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)
        registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY,
                                "org.nordugrid.sertest")
        self.retriever.addEndpoint(registry)
        self.retriever.removeConsumer(container)
        del container
        self.retriever.wait()
    def test_empty_registry_type(self):
        arc.ServiceEndpointRetrieverPluginTESTControl.endpoints.push_back(
            arc.EndpointList())
        arc.ServiceEndpointRetrieverPluginTESTControl.status.push_back(
            arc.EndpointQueryingStatus(arc.EndpointQueryingStatus.SUCCESSFUL))

        self.retriever = arc.ServiceEndpointRetriever(self.usercfg)
        container = arc.EndpointContainer()
        self.retriever.addConsumer(container)
        registry = arc.Endpoint("test.nordugrid.org", arc.Endpoint.REGISTRY)
        self.retriever.addEndpoint(registry)
        self.retriever.wait()
        # it should fill the empty type with the available plugins:
        # among them the TEST plugin which doesn't return any endpoint
        self.expect(container).to_have(0).endpoint()
    def setUp(self):
        self.usercfg = arc.UserConfig(
            arc.initializeCredentialsType(
                arc.initializeCredentialsType.SkipCredentials))
        self.ce = arc.Endpoint()
        self.ce.URLString = "test.nordugrid.org"
        self.ce.InterfaceName = "org.nordugrid.tirtest"
        self.ce.Capability.append(
            arc.Endpoint_GetStringForCapability(arc.Endpoint.COMPUTINGINFO))
        arc.TargetInformationRetrieverPluginTESTControl.delay = 0
        arc.TargetInformationRetrieverPluginTESTControl.targets = [
            self.create_test_target()
        ]
        arc.TargetInformationRetrieverPluginTESTControl.status = arc.EndpointQueryingStatus(
            arc.EndpointQueryingStatus.SUCCESSFUL)
Example #21
def example():
    # Creating a UserConfig object with the user's proxy
    # and the path of the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Creating an endpoint for a Computing Element
    endpoint = arc.Endpoint("piff.hep.lu.se", arc.Endpoint.COMPUTINGINFO,
                            "org.nordugrid.ldapglue2")

    # Get the ExecutionTargets of this ComputingElement
    retriever = arc.ComputingServiceRetriever(uc, [endpoint])
    retriever.wait()
    targets = retriever.GetExecutionTargets()

    # Shuffle the targets to simulate a random broker
    targets = list(targets)
    random.shuffle(targets)

    # Create a JobDescription
    jobdesc = arc.JobDescription()
    jobdesc.Application.Executable.Path = "/bin/hostname"
    jobdesc.Application.Output = "stdout.txt"

    # create an empty job object which will contain our submitted job
    job = arc.Job()
    success = False
    # Submit job directly to the execution targets, without a broker
    for target in targets:
        sys.stdout.write("Trying to submit to %s (%s) ... " %
                         (target.ComputingEndpoint.URLString,
                          target.ComputingEndpoint.InterfaceName))
        sys.stdout.flush()
        success = target.Submit(uc, jobdesc, job)
        if success:
            sys.stdout.write("succeeded!\n")
            break
        else:
            sys.stdout.write("failed!\n")
    if success:
        sys.stdout.write("Job was submitted:\n")
        job.SaveToStream(arc.CPyOstream(sys.stdout), False)
    else:
        sys.stdout.write("Job submission failed\n")
Example #22
    def getCEStatus(self):
        """Method to return information on running and pending jobs.
        We hope to satisfy both instances that use robot proxies and those which use proper configurations.
        """

        result = self._prepareProxy()
        if not result["OK"]:
            self.log.error("ARCComputingElement: failed to set up proxy", result["Message"])
            return result
        self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"])

        # Creating an endpoint
        endpoint = arc.Endpoint(self.ceHost, arc.Endpoint.COMPUTINGINFO, "org.nordugrid.ldapglue2")

        # Get the ExecutionTargets of the ComputingElement (Can be REST, EMI-ES or GRIDFTP)
        retriever = arc.ComputingServiceRetriever(self.usercfg, [endpoint])
        retriever.wait()  # Takes a bit of time to get and parse the ldap information
        targetsWithQueues = retriever.GetExecutionTargets()

        # Targets also include queues
        # Some of them might be used by different VOs
        targets = []
        for target in targetsWithQueues:
            if target.ComputingShare.Name == self.arcQueue:
                self.log.debug(
                    "Adding target:",
                    "%s (%s)" % (target.ComputingEndpoint.URLString, target.ComputingEndpoint.InterfaceName),
                )
                targets.append(target)

        # We extract stat from the AREX service (targets[0])
        ceStats = targets[0].ComputingShare
        self.log.debug("Running jobs for CE %s : %s" % (self.ceHost, ceStats.RunningJobs))
        self.log.debug("Waiting jobs for CE %s : %s" % (self.ceHost, ceStats.WaitingJobs))

        result = S_OK()
        result["SubmittedJobs"] = 0
        result["RunningJobs"] = ceStats.RunningJobs
        result["WaitingJobs"] = ceStats.WaitingJobs

        return result
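Note that the method above dereferences targets[0] without checking whether any target matched self.arcQueue. A defensive variant is sketched below, reusing the DIRAC-style S_ERROR convention already present in these snippets; this guard is an assumption, not part of the original code.

        # Sketch: fail gracefully if no target matched the configured queue,
        # instead of letting targets[0] raise an IndexError.
        if not targets:
            return S_ERROR("Queue %s not found on CE %s" % (self.arcQueue, self.ceHost))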
Example #23
usercfg = arc.UserConfig("", "")

# Two simple job descriptions which output hostname to stdout
jobdescstring = "+(&(executable=/bin/hostname)(stdout=stdout))(&(executable=/bin/hostname)(stdout=stdout))"

# Parse job description
jobdescs = arc.JobDescriptionList()
if not arc.JobDescription_Parse(jobdescstring, jobdescs):
    logger.msg(arc.ERROR, "Invalid job description")
    sys.exit(1)

# Use 'arc.JobDescription_ParseFromFile("helloworld.xrsl", jobdescs)'
# to parse job description from file.

# Use top-level NorduGrid information index to find resources
index = arc.Endpoint("nordugrid.org", arc.Endpoint.REGISTRY,
                     "org.nordugrid.archery")
services = arc.EndpointList(1, index)

# Do the submission
jobs = arc.JobList()
submitter = arc.Submitter(usercfg)
if submitter.BrokeredSubmit(services, jobdescs,
                            jobs) != arc.SubmissionStatus.NONE:
    logger.msg(arc.ERROR, "Failed to submit job")
    sys.exit(1)

# Write information on submitted job to local job list (~/.arc/jobs.xml)
jobList = arc.JobInformationStorageSQLite(usercfg.JobListFile())
if not jobList.Write(jobs):
    logger.msg(arc.WARNING,
               "Failed to write to local job list %s" % usercfg.JobListFile())
Example #24
    def _arc_submit(self, xrsl, arcces, userconfig, log):
        '''Check the available CEs and submit'''

        queuelist = []

        for arcce in arcces:
            (ce_endpoint, ce_queue) = arcce
            aris = arc.URL(str(ce_endpoint))
            ce_host = aris.Host()
            if aris.Protocol() == 'https':
                aris.ChangePath('/arex')
                infoendpoints = [
                    arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                 'org.ogf.glue.emies.resourceinfo')
                ]
            else:
                aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
                infoendpoints = [
                    arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO,
                                 'org.nordugrid.ldapng')
                ]

            # retriever contains a list of CE endpoints
            retriever = arc.ComputingServiceRetriever(userconfig,
                                                      infoendpoints)
            retriever.wait()
            # targets is the list of queues
            # parse target.ComputingService.ID for the CE hostname
            # target.ComputingShare.Name is the queue name
            targets = retriever.GetExecutionTargets()

            # Filter only sites for this process
            for target in targets:
                if not target.ComputingService.ID:
                    log.info(
                        "Target {0} does not have ComputingService ID defined, skipping"
                        .format(target.ComputingService.Name))
                    continue
                # If EMI-ES infoendpoint, force EMI-ES submission
                if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' \
                  and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                    log.debug(
                        "Rejecting target interface {0} because not EMI-ES".
                        format(target.ComputingEndpoint.InterfaceName))
                    continue
                # Check for matching host and queue
                targethost = re.sub(
                    ':arex$', '',
                    re.sub('urn:ogf:ComputingService:', '',
                           target.ComputingService.ID))
                targetqueue = target.ComputingShare.Name
                if targethost != ce_host:
                    log.debug(
                        'Rejecting target host {0} as it does not match {1}'.
                        format(targethost, ce_host))
                    continue
                if targetqueue != ce_queue:
                    log.debug(
                        'Rejecting target queue {0} as it does not match {1}'.
                        format(targetqueue, ce_queue))
                    continue

                queuelist.append(target)
                log.debug("Adding target {0}:{1}".format(
                    targethost, targetqueue))

        # check if any queues are available, if not leave and try again next time
        if not queuelist:
            raise Exception("No free queues available")

        log.debug("preparing submission")
        jobdescs = arc.JobDescriptionList()
        if not arc.JobDescription_Parse(str(xrsl), jobdescs):
            raise Exception("Failed to prepare job description")

        # Run the submission in a separate thread
        thr = SubmitThr(queuelist, jobdescs, userconfig)
        return self._run_submit(thr)
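The SubmitThr class and self._run_submit() used above are not included in the excerpt. A hypothetical minimal SubmitThr, assuming it submits the parsed description to the discovered queues the same way Example #21 submits directly to ExecutionTargets, might look like this:

import threading

class SubmitThr(threading.Thread):
    """Sketch (assumption): submit one job in a worker thread."""

    def __init__(self, queuelist, jobdescs, userconfig):
        threading.Thread.__init__(self)
        self.queuelist = queuelist
        self.jobdescs = jobdescs
        self.userconfig = userconfig
        self.job = None

    def run(self):
        job = arc.Job()
        # Try each matching ExecutionTarget until one accepts the description
        for target in self.queuelist:
            if target.Submit(self.userconfig, self.jobdescs[0], job):
                self.job = job
                break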
  def submitJob( self, executableFile, proxy, numberOfJobs = 1 ):
    """ Method to submit job
    """

    # Assume that the ARC queues are always of the format nordugrid-<batchSystem>-<queue>
    # And none of our supported batch systems have a "-" in their name
    self.arcQueue = self.queue.split("-",2)[2]
    result = self._prepareProxy()
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])
    if not result['OK']:
      gLogger.error( 'ARCComputingElement: failed to set up proxy', result['Message'] )
      return result

    gLogger.verbose( "Executable file path: %s" % executableFile )
    if not os.access( executableFile, 5 ):
      os.chmod( executableFile, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH )

    batchIDList = []
    stampDict = {}

    endpoint = arc.Endpoint( self.ceHost + ":2811/jobs", arc.Endpoint.JOBSUBMIT,
                            "org.nordugrid.gridftpjob")

    # Submit jobs iteratively for now. Tentatively easier than mucking around with the JobSupervisor class
    for __i in range(numberOfJobs):
      # The basic job description
      jobdescs = arc.JobDescriptionList()
      # Get the job into the ARC way
      xrslString, diracStamp = self.__writeXRSL( executableFile )
      gLogger.debug("XRSL string submitted : %s" %xrslString)
      gLogger.debug("DIRAC stamp for job : %s" %diracStamp)
      if not arc.JobDescription_Parse(xrslString, jobdescs):
        gLogger.error("Invalid job description")
        break
      # Submit the job
      jobs = arc.JobList() # filled by the submit process
      submitter = arc.Submitter(self.usercfg)
      result = submitter.Submit(endpoint, jobdescs, jobs)
      # Save info on success, otherwise report why the submission failed
      if ( result == arc.SubmissionStatus.NONE ):
        # Job successfully submitted
        pilotJobReference = jobs[0].JobID
        batchIDList.append( pilotJobReference )
        stampDict[pilotJobReference] = diracStamp
        gLogger.debug("Successfully submitted job %s to CE %s" % (pilotJobReference, self.ceHost))
      else:
        message = "Failed to submit job because "
        if (result.isSet(arc.SubmissionStatus.NOT_IMPLEMENTED) ):
          gLogger.warn( "%s feature not implemented on CE? (weird I know - complain to site admins" % message )
        if ( result.isSet(arc.SubmissionStatus.NO_SERVICES) ):
          gLogger.warn( "%s no services are running on CE? (open GGUS ticket to site admins" % message )
        if ( result.isSet(arc.SubmissionStatus.ENDPOINT_NOT_QUERIED) ):
          gLogger.warn( "%s endpoint was not even queried. (network ..?)" % message )
        if ( result.isSet(arc.SubmissionStatus.BROKER_PLUGIN_NOT_LOADED) ):
          gLogger.warn( "%s BROKER_PLUGIN_NOT_LOADED : ARC library installation problem?" % message )
        if ( result.isSet(arc.SubmissionStatus.DESCRIPTION_NOT_SUBMITTED) ):
          gLogger.warn( "%s Job not submitted - incorrect job description? (missing field in XRSL string?)" % message )
        if ( result.isSet(arc.SubmissionStatus.SUBMITTER_PLUGIN_NOT_LOADED) ):
          gLogger.warn( "%s SUBMITTER_PLUGIN_NOT_LOADED : ARC library installation problem?" % message )
        if ( result.isSet(arc.SubmissionStatus.AUTHENTICATION_ERROR) ):
          gLogger.warn( "%s authentication error - screwed up / expired proxy? Renew / upload pilot proxy on machine?" % message )
        if ( result.isSet(arc.SubmissionStatus.ERROR_FROM_ENDPOINT) ):
          gLogger.warn( "%s some error from the CE - possibly CE problems?" % message )
        gLogger.warn( "%s ... maybe above messages will give a hint." % message )
        break # Boo hoo *sniff*

    if batchIDList:
      result = S_OK( batchIDList )
      result['PilotStampDict'] = stampDict
    else:
      result = S_ERROR('No pilot references obtained from the ARC job submission')
    return result
Example #26
    def submitJob(self,
                  executableFile,
                  proxy,
                  numberOfJobs=1,
                  inputs=None,
                  outputs=None):
        """Method to submit job"""

        # Assume that the ARC queues are always of the format nordugrid-<batchSystem>-<queue>
        # And none of our supported batch systems have a "-" in their name
        self.arcQueue = self.queue.split("-", 2)[2]
        result = self._prepareProxy()
        if not result["OK"]:
            self.log.error("ARCComputingElement: failed to set up proxy",
                           result["Message"])
            return result
        self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"])

        self.log.verbose("Executable file path: %s" % executableFile)
        if not os.access(executableFile, 5):
            os.chmod(
                executableFile, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP
                | stat.S_IROTH | stat.S_IXOTH)

        executables = None
        if self.preamble:
            executables = [executableFile]
            executableFile = self._bundlePreamble(executableFile)

        batchIDList = []
        stampDict = {}

        if self.endpointType == "Gridftp":
            endpoint = arc.Endpoint(str(self.ceHost + ":2811/jobs"),
                                    arc.Endpoint.JOBSUBMIT,
                                    "org.nordugrid.gridftpjob")
        else:
            endpoint = arc.Endpoint(
                str("https://" + self.ceHost + ":8443/arex"),
                arc.Endpoint.JOBSUBMIT,
                "org.ogf.glue.emies.activitycreation",
            )

        # Submit jobs iteratively for now. Tentatively easier than mucking around with the JobSupervisor class
        for __i in range(numberOfJobs):
            # The basic job description
            jobdescs = arc.JobDescriptionList()
            # Get the job into the ARC way
            xrslString, diracStamp = self._writeXRSL(executableFile, inputs,
                                                     outputs, executables)
            self.log.debug("XRSL string submitted : %s" % xrslString)
            self.log.debug("DIRAC stamp for job : %s" % diracStamp)
            # The arc bindings don't accept unicode objects in Python 2 so xrslString must be explicitly cast
            result = arc.JobDescription_Parse(str(xrslString), jobdescs)
            if not result:
                self.log.error("Invalid job description",
                               "%r, message=%s" % (xrslString, result.str()))
                break
            # Submit the job
            jobs = arc.JobList()  # filled by the submit process
            submitter = arc.Submitter(self.usercfg)
            result = submitter.Submit(endpoint, jobdescs, jobs)
            # Save info on success, otherwise report why the submission failed
            if result == arc.SubmissionStatus.NONE:
                # Job successfully submitted
                pilotJobReference = jobs[0].JobID
                batchIDList.append(pilotJobReference)
                stampDict[pilotJobReference] = diracStamp
                self.log.debug("Successfully submitted job %s to CE %s" %
                               (pilotJobReference, self.ceHost))
            else:
                self._analyzeSubmissionError(result)
                break  # Boo hoo *sniff*

        if self.preamble:
            os.unlink(executableFile)

        if batchIDList:
            result = S_OK(batchIDList)
            result["PilotStampDict"] = stampDict
        else:
            result = S_ERROR(
                "No pilot references obtained from the ARC job submission")
        return result
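Example #26 delegates failure reporting to self._analyzeSubmissionError(), which is not part of the excerpt. A minimal sketch of such a helper, mirroring the inline isSet() checks in the older submitJob() shown earlier, could be:

    def _analyzeSubmissionError(self, result):
        """Sketch (assumption): map SubmissionStatus bits to log messages."""
        message = "Failed to submit job because "
        if result.isSet(arc.SubmissionStatus.NOT_IMPLEMENTED):
            self.log.warn("%s feature not implemented on CE?" % message)
        if result.isSet(arc.SubmissionStatus.AUTHENTICATION_ERROR):
            self.log.warn("%s authentication error - expired proxy?" % message)
        if result.isSet(arc.SubmissionStatus.ERROR_FROM_ENDPOINT):
            self.log.warn("%s some error from the CE - possibly CE problems?" % message)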
Example #27
    def submit(self):
        """
        Main function to submit jobs.
        """

        global queuelist

        # check for stopsubmission flag
        if self.conf.get(['downtime', 'stopsubmission']) == "true":
            self.log.info('Submission suspended due to downtime')
            return

        # check for any site-specific limits or status
        clusterstatus = self.conf.getCond(["sites", "site"],
                                          f"endpoint={self.cluster}",
                                          ["status"]) or 'online'
        if clusterstatus == 'offline':
            self.log.info('Site status is offline')
            return

        clustermaxjobs = int(
            self.conf.getCond(["sites", "site"], f"endpoint={self.cluster}",
                              ["maxjobs"]) or 999999)
        nsubmitted = self.db.getNArcJobs(f"cluster='{self.cluster}'")
        if nsubmitted >= clustermaxjobs:
            self.log.info(
                f'{nsubmitted} submitted jobs is greater than or equal to max jobs {clustermaxjobs}'
            )
            return

        # Get cluster host and queue: cluster/queue
        clusterhost = clusterqueue = None
        if self.cluster:
            cluster = self.cluster
            if cluster.find('://') == -1:
                cluster = 'gsiftp://' + cluster
            clusterurl = arc.URL(cluster)
            clusterhost = clusterurl.Host()
            clusterqueue = clusterurl.Path()[1:]  # strip off leading slash

        # Apply fair-share
        if self.cluster:
            fairshares = self.db.getArcJobsInfo(
                "arcstate='tosubmit' and clusterlist like '%" + self.cluster +
                "%'", ['fairshare', 'proxyid'])
        else:
            fairshares = self.db.getArcJobsInfo(
                "arcstate='tosubmit' and clusterlist=''",
                ['fairshare', 'proxyid'])

        if not fairshares:
            self.log.info('Nothing to submit')
            return

        # split by proxy for GU queues
        fairshares = list(
            set([(p['fairshare'], p['proxyid']) for p in fairshares]))
        # For proxy bug - see below
        shuffle(fairshares)

        for fairshare, proxyid in fairshares:

            # apply maxjobs limit (check above should make sure greater than zero)
            # Note: relies on exit after first loop
            limit = min(clustermaxjobs - nsubmitted, 10)
            try:
                # catch any exceptions here to avoid leaving lock
                if self.cluster:
                    # Lock row for update in case multiple clusters are specified
                    #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare),
                    jobs = self.db.getArcJobsInfo(
                        "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' and proxyid='{2}' limit {3}"
                        .format(self.cluster, fairshare, proxyid, limit),
                        columns=[
                            "id", "jobdesc", "appjobid", "priority", "proxyid",
                            "clusterlist"
                        ],
                        lock=True)
                    if jobs:
                        self.log.debug("started lock for writing %d jobs" %
                                       len(jobs))
                else:
                    jobs = self.db.getArcJobsInfo(
                        "arcstate='tosubmit' and clusterlist='' and fairshare='{0} and proxyid={1}' limit {2}"
                        .format(fairshare, proxyid, limit),
                        columns=[
                            "id", "jobdesc", "appjobid", "priority", "proxyid",
                            "clusterlist"
                        ])
                # mark submitting in db
                jobs_taken = []
                for j in jobs:
                    jd = {
                        'cluster': self.cluster,
                        'arcstate': 'submitting',
                        'tarcstate': self.db.getTimeStamp()
                    }
                    self.db.updateArcJobLazy(j['id'], jd)
                    jobs_taken.append(j)
                jobs = jobs_taken

            finally:
                if self.cluster:
                    try:
                        self.db.Commit(lock=True)
                        self.log.debug("ended lock")
                    except:
                        self.log.warning("Failed to release DB lock")
                else:
                    self.db.Commit()

            if len(jobs) == 0:
                #self.log.debug("No jobs to submit")
                continue
            self.log.info(
                "Submitting %d jobs for fairshare %s and proxyid %d" %
                (len(jobs), fairshare, proxyid))

            # max waiting priority
            try:
                maxpriowaiting = max(jobs,
                                     key=lambda x: x['priority'])['priority']
            except:
                maxpriowaiting = 0
            self.log.info("Maximum priority of waiting jobs: %d" %
                          maxpriowaiting)

            # Query infosys - either local or index
            if self.cluster:
                if self.cluster.find('://') != -1:
                    aris = arc.URL(self.cluster)
                else:
                    aris = arc.URL('gsiftp://%s' % self.cluster)
                if aris.Protocol() == 'https':
                    aris.ChangePath('/arex')
                    infoendpoints = [
                        arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                     'org.ogf.glue.emies.resourceinfo')
                    ]
                elif aris.Protocol() == 'local':
                    infoendpoints = [
                        arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                     'org.nordugrid.local')
                    ]
                else:
                    aris = 'ldap://' + aris.Host(
                    ) + '/mds-vo-name=local,o=grid'
                    infoendpoints = [
                        arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO,
                                     'org.nordugrid.ldapng')
                    ]
            else:
                giises = self.conf.getList(['atlasgiis', 'item'])
                infoendpoints = []
                for g in giises:
                    # Specify explicitly EGIIS
                    infoendpoints.append(
                        arc.Endpoint(str(g), arc.Endpoint.REGISTRY,
                                     "org.nordugrid.ldapegiis"))

            # Set UserConfig credential for querying infosys
            proxystring = str(self.db.getProxy(proxyid))
            self.uc.CredentialString(proxystring)
            global usercred
            usercred = self.uc
            # retriever contains a list of CE endpoints
            retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints)
            retriever.wait()
            # targets is the list of queues
            # parse target.ComputingService.ID for the CE hostname
            # target.ComputingShare.Name is the queue name
            targets = retriever.GetExecutionTargets()

            # Filter only sites for this process
            queuelist = []
            for target in targets:
                if not target.ComputingService.ID:
                    self.log.info(
                        "Target %s does not have ComputingService ID defined, skipping"
                        % target.ComputingService.Name)
                    continue
                # If EMI-ES infoendpoint, force EMI-ES submission
                if infoendpoints[
                        0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                    self.log.debug(
                        "Rejecting target interface %s because not EMI-ES" %
                        target.ComputingEndpoint.InterfaceName)
                    continue
                # Check for matching host and queue
                targethost = re.sub(
                    ':arex$', '',
                    re.sub('urn:ogf:ComputingService:', '',
                           target.ComputingService.ID))
                targetqueue = target.ComputingShare.Name
                if clusterhost and targethost != clusterhost:
                    self.log.debug(
                        'Rejecting target host %s as it does not match %s' %
                        (targethost, clusterhost))
                    continue
                if clusterqueue and targetqueue != clusterqueue:
                    self.log.debug(
                        'Rejecting target queue %s as it does not match %s' %
                        (targetqueue, clusterqueue))
                    continue
                if targetqueue in self.conf.getList(['queuesreject', 'item']):
                    self.log.debug(
                        'Rejecting target queue %s in queuesreject list' %
                        targetqueue)
                    continue
                elif targethost in self.conf.getList(
                    ['clustersreject', 'item']):
                    self.log.debug(
                        'Rejecting target host %s in clustersreject list' %
                        targethost)
                    continue
                else:
                    # tmp hack
                    target.ComputingShare.LocalWaitingJobs = 0
                    target.ComputingShare.PreLRMSWaitingJobs = 0
                    target.ExecutionEnvironment.CPUClockSpeed = 2000
                    qjobs = self.db.getArcJobsInfo(
                        "cluster='" + str(self.cluster) +
                        "' and  arcstate='submitted' and fairshare='%s'" %
                        fairshare, ['id', 'priority'])
                    rjobs = self.db.getArcJobsInfo(
                        "cluster='" + str(self.cluster) +
                        "' and  arcstate='running' and fairshare='%s'" %
                        fairshare, ['id'])

                    # max queued priority
                    try:
                        maxprioqueued = max(
                            qjobs, key=lambda x: x['priority'])['priority']
                    except:
                        maxprioqueued = 0
                    self.log.info("Max priority queued: %d" % maxprioqueued)

                    # Limit number of submitted jobs using configuration or default (0.15 + 100/num of shares)
                    # Note: assumes only a few shares are used
                    qfraction = float(self.conf.get([
                        'jobs', 'queuefraction'
                    ])) if self.conf.get(['jobs', 'queuefraction']) else 0.15
                    qoffset = int(self.conf.get([
                        'jobs', 'queueoffset'
                    ])) if self.conf.get(['jobs', 'queueoffset']) else 100
                    jlimit = len(rjobs) * qfraction + qoffset / len(fairshares)
                    self.log.debug("running %d, queued %d, queue limit %d" %
                                   (len(rjobs), len(qjobs), jlimit))
                    if str(self.cluster).find('arc-boinc-0') != -1:
                        jlimit = len(rjobs) * 0.15 + 400
                    if str(self.cluster).find('XXXpikolit') != -1:
                        jlimit = len(rjobs) * 0.15 + 100
                    if str(self.cluster).find('arc05.lcg') != -1:
                        jlimit = len(rjobs) * 0.15 + 400
                    target.ComputingShare.PreLRMSWaitingJobs = len(qjobs)
                    if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued)
                                               and (maxpriowaiting > 10)):
                        if maxpriowaiting > maxprioqueued:
                            self.log.info(
                                "Overriding limit, maxpriowaiting: %d > maxprioqueued: %d"
                                % (maxpriowaiting, maxprioqueued))
                        queuelist.append(target)
                        self.log.debug("Adding target %s:%s" %
                                       (targethost, targetqueue))
                    else:
                        self.log.info(
                            "%s/%s already at limit of submitted jobs for fairshare %s"
                            % (targethost, targetqueue, fairshare))

            # check if any queues are available, if not leave and try again next time
            if not queuelist:
                self.log.info("No free queues available")
                self.db.Commit()
                continue

            self.log.info("start submitting")

            # Just run one thread for each job in sequence. Strange things happen
            # when trying to create a new UserConfig object for each thread.
            tasks = []
            for j in jobs:
                self.log.debug("%s: preparing submission" % j['appjobid'])
                jobdescstr = str(
                    self.db.getArcJobDescription(str(j['jobdesc'])))
                jobdescs = arc.JobDescriptionList()
                if not jobdescstr or not arc.JobDescription_Parse(
                        jobdescstr, jobdescs):
                    self.log.error("%s: Failed to prepare job description" %
                                   j['appjobid'])
                    continue
                tasks.append((j['id'], j['appjobid'], jobdescstr, proxystring,
                              int(self.conf.get(['atlasgiis', 'timeout']))))

            npools = 1
            if any(s in self.cluster
                   for s in self.conf.getList(['parallelsubmit', 'item'])):
                npools = int(self.conf.get(['parallelsubmit', 'npools']))
            self.log.debug("Starting submitters: %s" % npools)

            pool = multiprocessing.Pool(npools)
            #results = []
            #for task in tasks:
            #    result = pool.apply_async(Submit,(task))
            #    results.append(result)
            # Submit in workers
            results = [pool.apply_async(Submit, (t)) for t in tasks]

            # timeout per submission
            timeout = 60
            stopflag = False
            for result, task in zip(results, tasks):
                try:
                    jdb = result.get(timeout)
                    jconv = JobConv()
                    job = jconv.db2job(jdb)
                except multiprocessing.TimeoutError:
                    self.log.error(
                        "%s: submission timeout: exit and try again" % task[1])
                    # abort submission if Submit process is stuck
                    #pool.terminate()
                    KillPool(pool)
                    pool.join()
                    stopflag = True
                    # reduce timeout to finish quickly
                    timeout = 0.1
                    continue
                if job is None:
                    self.log.error("%s: no job defined for %d" %
                                   (task[1], task[0]))
                    continue
                jd = {}
                jd['arcstate'] = 'submitted'
                # initial offset to 1 minute to force first status check
                jd['tarcstate'] = self.db.getTimeStamp(
                    time.time() -
                    int(self.conf.get(['jobs', 'checkinterval'])) + 120)
                jd['tstate'] = self.db.getTimeStamp()
                # extract hostname of cluster (depends on JobID being a URL)
                self.log.info("%s: job id %s" % (task[1], job.JobID))
                jd['cluster'] = self.cluster
                self.db.updateArcJobLazy(task[0], jd, job)
            if not stopflag:
                pool.terminate()
                pool.join()
            else:
                # stop submitting, gsiftp connection problem likely
                raise ExceptInterrupt(15)

            self.log.info("threads finished")
            # commit transaction to release row locks
            self.db.Commit()

            # still proxy bug - exit if there are multiple proxies
            if len(self.db.getProxiesInfo('TRUE', ['id'])) > 1:
                raise ExceptInterrupt(15)

        self.log.info("end submitting")

        return
Example #28
# Simple job description which outputs hostname to stdout
jobdescstring = "&(executable=/bin/hostname)(stdout=stdout)"

# Parse job description
jobdescs = arc.JobDescriptionList()
if not arc.JobDescription_Parse(jobdescstring, jobdescs):
    logger.msg(arc.ERROR, "Invalid job description")
    sys.exit(1)

# Use 'arc.JobDescription_ParseFromFile("helloworld.xrsl", jobdescs)'
# to parse job description from file.

# Use top-level NorduGrid information index to find resources
index = arc.Endpoint(
    "ldap://index1.nordugrid.org:2135/Mds-Vo-name=NorduGrid,o=grid",
    arc.Endpoint.REGISTRY, "org.nordugrid.ldapegiis")
services = arc.EndpointList(1, index)

# Do the submission
jobs = arc.JobList()
submitter = arc.Submitter(usercfg)
if submitter.BrokeredSubmit(services, jobdescs,
                            jobs) != arc.SubmissionStatus.NONE:
    logger.msg(arc.ERROR, "Failed to submit job")
    sys.exit(1)

# Write information on submitted job to local job list (~/.arc/jobs.xml)
jobList = arc.JobInformationStorageXML(usercfg.JobListFile())
if not jobList.Write(jobs):
    logger.msg(arc.WARNING, "Failed to write to local job list %s",
               usercfg.JobListFile())
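As a follow-up, the submitted jobs can be reported back to the user; a minimal sketch appended here for illustration (not part of the original excerpt), using the same jobs list populated by BrokeredSubmit:

# Report the IDs of the jobs that were just submitted
for job in jobs:
    print("Job submitted with job id %s" % str(job.JobID))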
Example #29
0
  def getCEStatus(self):
    """ Method to return information on running and pending jobs.
        We hope to satisfy both instances that use robot proxies and those which use proper configurations.
    """

    result = self._prepareProxy()
    if not result['OK']:
      self.log.error('ARCComputingElement: failed to set up proxy', result['Message'])
      return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    # Try to find out which VO we are running for.
    vo = ''
    res = getVOfromProxyGroup()
    if res['OK']:
      vo = res['Value']

    result = S_OK()
    result['SubmittedJobs'] = 0
    if not vo:
      # Presumably the proper way forward once the infosys-discuss WG agrees on a solution
      # and it is implemented. Needed for DIRAC instances which use robot certificates for pilots.
      endpoints = [arc.Endpoint(str("ldap://" + self.ceHost + "/MDS-Vo-name=local,o=grid"),
                                arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.ldapng')]
      retriever = arc.ComputingServiceRetriever(self.usercfg, endpoints)
      retriever.wait()  # Takes a bit of time to get and parse the ldap information
      targets = retriever.GetExecutionTargets()
      ceStats = targets[0].ComputingShare
      self.log.debug("Running jobs for CE %s : %s" % (self.ceHost, ceStats.RunningJobs))
      self.log.debug("Waiting jobs for CE %s : %s" % (self.ceHost, ceStats.WaitingJobs))
      result['RunningJobs'] = ceStats.RunningJobs
      result['WaitingJobs'] = ceStats.WaitingJobs
    else:
      # The system which works properly at present for ARC CEs that are configured correctly.
      # But for this we need the VO to be known - ask me (Raja) for the whole story if interested.
      # cmd = 'ldapsearch -x -LLL -H ldap://%s:2135 -b mds-vo-name=resource,o=grid "(GlueVOViewLocalID=%s)"' % (
      #     self.ceHost, vo.lower())
      if not self.queue:
        self.log.error('ARCComputingElement: No queue ...')
        res = S_ERROR('Unknown queue (%s) failure for site %s' % (self.queue, self.ceHost))
        return res
      cmd1 = "ldapsearch -x -o ldif-wrap=no -LLL -h %s:2135  -b \'o=glue\' " % self.ceHost
      cmd2 = '"(&(objectClass=GLUE2MappingPolicy)(GLUE2PolicyRule=vo:%s))"' % vo.lower()
      cmd3 = ' | grep GLUE2MappingPolicyShareForeignKey | grep %s' % (self.queue.split("-")[-1])
      cmd4 = ' | sed \'s/GLUE2MappingPolicyShareForeignKey: /GLUE2ShareID=/\' '
      cmd5 = ' | xargs -L1 ldapsearch -x -o ldif-wrap=no -LLL -h %s:2135 -b \'o=glue\' ' % self.ceHost
      cmd6 = ' | egrep \'(ShareWaiting|ShareRunning)\''
      res = shellCall(0, cmd1 + cmd2 + cmd3 + cmd4 + cmd5 + cmd6)
      if not res['OK']:
        self.log.debug("Could not query CE %s - is it down?" % self.ceHost)
        return res
      try:
        ldapValues = res['Value'][1].split("\n")
        running = [lValue for lValue in ldapValues if 'GLUE2ComputingShareRunningJobs' in lValue]
        waiting = [lValue for lValue in ldapValues if 'GLUE2ComputingShareWaitingJobs' in lValue]
        result['RunningJobs'] = int(running[0].split(":")[1])
        result['WaitingJobs'] = int(waiting[0].split(":")[1])
      except IndexError:
        res = S_ERROR('Unknown ldap failure for site %s' % self.ceHost)
        return res

    return result
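The try/except above picks the running and waiting counts out of the raw ldif text returned by the ldapsearch pipeline; below is a minimal stand-alone sketch of that parsing step, using made-up output and plain Python instead of DIRAC's shellCall/S_OK helpers.

# Made-up ldif-style output, as the GLUE2 query above would return it
ldif_output = ("GLUE2ComputingShareRunningJobs: 42\n"
               "GLUE2ComputingShareWaitingJobs: 7")

def parse_share_counts(text):
    lines = text.split("\n")
    running = [l for l in lines if 'GLUE2ComputingShareRunningJobs' in l]
    waiting = [l for l in lines if 'GLUE2ComputingShareWaitingJobs' in l]
    try:
        # first matching attribute wins, the value follows the colon
        return {'RunningJobs': int(running[0].split(":")[1]),
                'WaitingJobs': int(waiting[0].split(":")[1])}
    except IndexError:
        return None  # attribute missing: treated as an ldap failure above

print(parse_share_counts(ldif_output))  # {'RunningJobs': 42, 'WaitingJobs': 7}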
Example #30
0
    def submit(self):
        """
        Main function to submit jobs.
        """

        global queuelist

        # check for stopsubmission flag
        if self.conf.get(['downtime', 'stopsubmission']) == "true":
            self.log.info('Submission suspended due to downtime')
            return 0

        # Get cluster host and queue: cluster/queue
        clusterhost = clusterqueue = None
        if self.cluster:
            cluster = self.cluster
            if cluster.find('://') == -1:
                cluster = 'gsiftp://' + cluster
            clusterurl = arc.URL(cluster)
            clusterhost = clusterurl.Host()
            clusterqueue = clusterurl.Path()[1:]  # strip off leading slash

        # Apply fair-share
        if self.cluster:
            fairshares = self.db.getArcJobsInfo(
                "arcstate='tosubmit' and clusterlist like '%" + self.cluster +
                "%'", ['fairshare'])
        else:
            fairshares = self.db.getArcJobsInfo(
                "arcstate='tosubmit' and clusterlist=''", ['fairshare'])

        if not fairshares:
            self.log.info('Nothing to submit')
            return 0

        fairshares = list(set([p['fairshare'] for p in fairshares]))
        # For EMI-ES proxy bug - see below
        shuffle(fairshares)
        count = 0

        for fairshare in fairshares:

            try:
                # catch any exceptions here to avoid leaving lock
                if self.cluster:
                    # Lock row for update in case multiple clusters are specified
                    #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare),
                    jobs = self.db.getArcJobsInfo(
                        "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' limit 10"
                        .format(self.cluster, fairshare),
                        columns=[
                            "id", "jobdesc", "appjobid", "priority", "proxyid"
                        ],
                        lock=True)
                    if jobs:
                        self.log.debug("started lock for writing %d jobs" %
                                       len(jobs))
                else:
                    jobs = self.db.getArcJobsInfo(
                        "arcstate='tosubmit' and clusterlist='' and fairshare='{0}' limit 10"
                        .format(fairshare),
                        columns=["id", "jobdesc", "appjobid", "priority"])
                # mark submitting in db
                jobs_taken = []
                for j in jobs:
                    jd = {
                        'cluster': self.cluster,
                        'arcstate': 'submitting',
                        'tarcstate': self.db.getTimeStamp()
                    }
                    self.db.updateArcJobLazy(j['id'], jd)
                    jobs_taken.append(j)
                jobs = jobs_taken

            finally:
                if self.cluster:
                    try:
                        self.db.Commit(lock=True)
                        self.log.debug("ended lock")
                    except:
                        self.log.warning("Failed to release DB lock")
                else:
                    self.db.Commit()

            if len(jobs) == 0:
                #self.log.debug("No jobs to submit")
                continue
            self.log.info("Submitting %d jobs for fairshare %s" %
                          (len(jobs), fairshare))

            # max waiting priority
            try:
                maxpriowaiting = max(jobs,
                                     key=lambda x: x['priority'])['priority']
            except:
                maxpriowaiting = 0
            self.log.info("Maximum priority of waiting jobs: %d" %
                          maxpriowaiting)

            # Query infosys - either local or index
            if self.cluster:
                if self.cluster.find('://') != -1:
                    aris = arc.URL(self.cluster)
                else:
                    aris = arc.URL('gsiftp://%s' % self.cluster)
                if aris.Protocol() == 'https':
                    aris.ChangePath('/arex')
                    infoendpoints = [
                        arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                     'org.ogf.glue.emies.resourceinfo')
                    ]
                elif aris.Protocol() == 'local':
                    infoendpoints = [
                        arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                     'org.nordugrid.local')
                    ]
                else:
                    aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
                    infoendpoints = [
                        arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO,
                                     'org.nordugrid.ldapng')
                    ]
            else:
                giises = self.conf.getList(['atlasgiis', 'item'])
                infoendpoints = []
                for g in giises:
                    # Specify explicitly EGIIS
                    infoendpoints.append(
                        arc.Endpoint(str(g), arc.Endpoint.REGISTRY,
                                     "org.nordugrid.ldapegiis"))

            # Set UserConfig credential for each proxy. Assumes that any proxy
            # in the fairshare can query the CE infosys
            self.uc.CredentialString(self.db.getProxy(jobs[0]['proxyid']))
            # retriever contains a list of CE endpoints
            retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints)
            retriever.wait()
            # targets is the list of queues
            # parse target.ComputingService.ID for the CE hostname
            # target.ComputingShare.Name is the queue name
            targets = retriever.GetExecutionTargets()

            # Filter only sites for this process
            queuelist = []
            for target in targets:
                if not target.ComputingService.ID:
                    self.log.info(
                        "Target %s does not have ComputingService ID defined, skipping"
                        % target.ComputingService.Name)
                    continue
                # If EMI-ES infoendpoint, force EMI-ES submission
                if (infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo'
                        and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation'):
                    self.log.debug(
                        "Rejecting target interface %s because not EMI-ES" %
                        target.ComputingEndpoint.InterfaceName)
                    continue
                # Check for matching host and queue
                targethost = re.sub(
                    ':arex$', '',
                    re.sub('urn:ogf:ComputingService:', '',
                           target.ComputingService.ID))
                targetqueue = target.ComputingShare.Name
                if clusterhost and targethost != clusterhost:
                    self.log.debug(
                        'Rejecting target host %s as it does not match %s' %
                        (targethost, clusterhost))
                    continue
                if clusterqueue and targetqueue != clusterqueue:
                    self.log.debug(
                        'Rejecting target queue %s as it does not match %s' %
                        (targetqueue, clusterqueue))
                    continue
                if targetqueue in self.conf.getList(['queuesreject', 'item']):
                    self.log.debug(
                        'Rejecting target queue %s in queuesreject list' %
                        targetqueue)
                    continue
                elif targethost in self.conf.getList(
                    ['clustersreject', 'item']):
                    self.log.debug(
                        'Rejecting target host %s in clustersreject list' %
                        targethost)
                    continue
                else:
                    # tmp hack
                    target.ComputingShare.LocalWaitingJobs = 0
                    target.ComputingShare.PreLRMSWaitingJobs = 0
                    target.ExecutionEnvironment.CPUClockSpeed = 2000
                    qjobs = self.db.getArcJobsInfo(
                        "cluster='" + str(self.cluster) +
                        "' and  arcstate='submitted' and fairshare='%s'" %
                        fairshare, ['id', 'priority'])
                    rjobs = self.db.getArcJobsInfo(
                        "cluster='" + str(self.cluster) +
                        "' and  arcstate='running' and fairshare='%s'" %
                        fairshare, ['id'])

                    # max queued priority
                    try:
                        maxprioqueued = max(
                            qjobs, key=lambda x: x['priority'])['priority']
                    except:
                        maxprioqueued = 0
                    self.log.info("Max priority queued: %d" % maxprioqueued)

                    # Limit queued jobs to running * 0.15 + 100/num of shares (raised for specific clusters below)
                    # Note: assumes only a few shares are used
                    jlimit = len(rjobs) * 0.15 + 100 / len(fairshares)
                    if str(self.cluster).find('arc-boinc-0') != -1:
                        jlimit = len(rjobs) * 0.15 + 400
                    if str(self.cluster).find('XXXpikolit') != -1:
                        jlimit = len(rjobs) * 0.15 + 100
                    if str(self.cluster).find('arc05.lcg') != -1:
                        jlimit = len(rjobs) * 0.15 + 400
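                    # Worked example (illustrative numbers only): with 200 running
                    # jobs and 2 fairshares the default limit is
                    #   200 * 0.15 + 100 / 2 = 30 + 50 = 80 queued jobs,
                    # while on an arc-boinc-0 cluster it would be
                    #   200 * 0.15 + 400 = 430.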
                    target.ComputingShare.PreLRMSWaitingJobs = len(qjobs)
                    if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued)
                                               and (maxpriowaiting > 10)):
                        if maxpriowaiting > maxprioqueued:
                            self.log.info(
                                "Overriding limit, maxpriowaiting: %d > maxprioqueued: %d"
                                % (maxpriowaiting, maxprioqueued))
                        queuelist.append(target)
                        self.log.debug("Adding target %s:%s" %
                                       (targethost, targetqueue))
                    else:
                        self.log.info(
                            "%s/%s already at limit of submitted jobs for fairshare %s"
                            % (targethost, targetqueue, fairshare))

            # check if any queues are available, if not leave and try again next time
            if not queuelist:
                self.log.info("No free queues available")
                self.db.Commit()
                # EMI-ES proxy problem - see bug 3685
                if self.cluster and self.cluster.startswith('https://'):
                    raise ExceptInterrupt(15)
                continue

            self.log.info("start submitting")

            # Just run one thread for each job in sequence. Strange things happen
            # when trying to create a new UserConfig object for each thread.
            for j in jobs:
                self.log.debug("%s: preparing submission" % j['appjobid'])
                jobdescstr = str(
                    self.db.getArcJobDescription(str(j['jobdesc'])))
                jobdescs = arc.JobDescriptionList()
                if not jobdescstr or not arc.JobDescription_Parse(
                        jobdescstr, jobdescs):
                    self.log.error("%s: Failed to prepare job description" %
                                   j['appjobid'])
                    continue
                # TODO: might not work if proxies are different within a share
                # since same uc object is shared among threads
                self.uc.CredentialString(self.db.getProxy(j['proxyid']))
                t = SubmitThr(Submit, j['id'], j['appjobid'], jobdescs,
                              self.uc, self.log)
                self.RunThreadsSplit([t], 1)
                count = count + 1

            self.log.info("threads finished")
            # commit transaction to release row locks
            self.db.Commit()

            # EMI-ES proxy problem - see bug 3685
            if self.cluster and self.cluster.startswith('https://'):
                raise ExceptInterrupt(15)

        self.log.info("end submitting")

        return count
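For reference, the host/queue split done at the top of submit() relies on arc.URL; below is a minimal sketch of that step, assuming a hypothetical cluster string of 'ce.example.org/gridlong'.

import arc

cluster = 'ce.example.org/gridlong'   # hypothetical cluster/queue string
if cluster.find('://') == -1:
    cluster = 'gsiftp://' + cluster   # default to the gsiftp interface, as above
clusterurl = arc.URL(cluster)
print(clusterurl.Protocol())          # 'gsiftp' -> the ldapng infosys branch above
print(clusterurl.Host())              # 'ce.example.org'
print(clusterurl.Path()[1:])          # 'gridlong' (leading slash stripped)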