Example #1
    def __mt_bulk_submit__(self, node_jdls):
        '''submitting jobs in multiple threads'''

        job = self.getJobObject()

        logger.warning(
            'submitting %d subjobs ... it may take a while' % len(node_jdls))

        # the algorithm for submitting a single bulk job
        class MyAlgorithm(Algorithm):

            def __init__(self, cred_req, masterInputWorkspace, ce, delid):
                Algorithm.__init__(self)
                self.inpw = masterInputWorkspace
                self.cred_req = cred_req
                self.ce = ce
                self.delid = delid

            def process(self, jdl_info):
                my_sj_id = jdl_info[0]
                my_sj_jdl = jdl_info[1]

                my_sj_jid = Grid.cream_submit(my_sj_jdl, self.ce, self.delid, self.cred_req)

                if not my_sj_jid:
                    return False
                else:
                    self.__appendResult__(my_sj_id, my_sj_jid)
                    return True

        mt_data = []
        for id, jdl in node_jdls.items():
            mt_data.append((id, jdl))

        myAlg = MyAlgorithm(cred_req=self.credential_requirements,
                            masterInputWorkspace=job.getInputWorkspace(),
                            ce=self.CE,
                            delid=self.delegation_id)
        myData = Data(collection=mt_data)

        runner = MTRunner(name='cream_jsubmit', algorithm=myAlg,
                          data=myData, numThread=config['SubmissionThread'])
        runner.start()
        runner.join(timeout=-1)

        if len(runner.getDoneList()) < len(mt_data):
            # not all bulk jobs are successfully submitted. canceling the
            # submitted jobs on WMS immediately
            logger.error(
                'some bulk jobs not successfully (re)submitted, canceling submitted jobs on WMS')
            Grid.cancel_multiple(runner.getResults().values())
            return None
        else:
            return runner.getResults()
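
A minimal sketch of the data shapes involved, with made-up ids, paths and URLs: node_jdls maps subjob ids to prepared JDL files, and on success the method returns a dict mapping the same ids to the grid job identifiers collected via __appendResult__.

# Hypothetical illustration only; ids, paths and URLs are invented.
node_jdls = {
    0: '/path/to/job/0/input/__jdlfile__',
    1: '/path/to/job/1/input/__jdlfile__',
}

# Expected shape of the value returned by __mt_bulk_submit__ on success:
# one CREAM job URL per submitted subjob id.
results = {
    0: 'https://cream-ce.example.org:8443/CREAM123456789',
    1: 'https://cream-ce.example.org:8443/CREAM987654321',
}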
Example #2
    def master_bulk_kill(self):
        '''ARC bulk job cancellation'''

        job = self.getJobObject()

        # killing the individually re-submitted subjobs
        logger.debug('cancelling running/submitted subjobs.')

        # 1. collect job ids
        ids = []
        for sj in job.subjobs:
            if sj.status in ['submitted', 'running'] and sj.backend.id:
                ids.append(sj.backend.id)

        # 2. cancel the collected jobs
        ck = Grid.arc_cancel_multiple(ids, self.credential_requirements)
        if not ck:
            logger.warning('Job cancellation failed')
            return False
        else:
            for sj in job.subjobs:
                if sj.backend.id in ids:
                    sj.updateStatus('killed')

            return True
Example #3
    def __setup_sandboxcache__(self, job):
        '''Sets up the sandbox cache object to adopt the runtime configuration of the LCG backend'''

        re_token = re.compile('^token:(.*):(.*)$')

        self.sandboxcache.timeout = config['SandboxTransferTimeout']

        if self.sandboxcache._name == 'LCGSandboxCache':
            if not self.sandboxcache.lfc_host:
                self.sandboxcache.lfc_host = Grid.__get_lfc_host__()

            if not self.sandboxcache.se:

                token = ''
                se_host = config['DefaultSE']
                m = re_token.match(se_host)
                if m:
                    token = m.group(1)
                    se_host = m.group(2)

                self.sandboxcache.se = se_host

                if token:
                    self.sandboxcache.srm_token = token

            if (self.sandboxcache.se_type in ['srmv2']) and (not self.sandboxcache.srm_token):
                self.sandboxcache.srm_token = config['DefaultSRMToken']

        return True
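
The 'token:<space_token>:<se_host>' form accepted in config['DefaultSE'] is split by the re_token pattern above. A small self-contained illustration, with an invented token and SE host:

import re

re_token = re.compile('^token:(.*):(.*)$')

# Hypothetical DefaultSE value combining a space token and an SE host.
se_host = 'token:MYTOKEN:se01.example.org'

m = re_token.match(se_host)
if m:
    token, se_host = m.group(1), m.group(2)
    print(token)    # MYTOKEN
    print(se_host)  # se01.example.org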
Example #4
            def process(self, jdl_info):
                my_sj_id = jdl_info[0]
                my_sj_jdl = jdl_info[1]

                my_sj_jid = Grid.arc_submit(my_sj_jdl, self.ce, self.arcverbose, self.cred_req)

                if not my_sj_jid:
                    return False
                else:
                    self.__appendResult__(my_sj_id, my_sj_jid)
                    return True
Example #5
    def kill(self):
        '''Kill the job'''
        job = self.getJobObject()

        logger.info('Killing job %s' % job.getFQID('.'))

        if not self.id:
            logger.warning('Job %s is not running.' % job.getFQID('.'))
            return False

        return Grid.cream_cancelMultiple([self.id])
Example #6
    def kill(self):
        '''Kill the job'''
        job = self.getJobObject()

        logger.info('Killing job %s' % job.getFQID('.'))

        if not self.id:
            logger.warning('Job %s is not running.' % job.getFQID('.'))
            return False

        return Grid.arc_cancel([self.id], self.credential_requirements)
Example #7
            def process(self, jdl_info):
                my_sj_id = jdl_info[0]
                my_sj_jdl = jdl_info[1]

                my_sj_jid = Grid.cream_submit(my_sj_jdl, self.ce, self.delid)

                if not my_sj_jid:
                    return False
                else:
                    self.__appendResult__(my_sj_id, my_sj_jid)
                    return True
Example #8
    def __setup_sandboxcache__(self, job):
        '''Sets up the sandbox cache object to adopt the runtime configuration of the LCG backend'''

        re_token = re.compile('^token:(.*):(.*)$')

        self.sandboxcache.vo = config['VirtualOrganisation']
        self.sandboxcache.timeout = config['SandboxTransferTimeout']

        if self.sandboxcache._name == 'LCGSandboxCache':
            if not self.sandboxcache.lfc_host:
                self.sandboxcache.lfc_host = Grid.__get_lfc_host__()

            if not self.sandboxcache.se:

                token = ''
                se_host = config['DefaultSE']
                m = re_token.match(se_host)
                if m:
                    token = m.group(1)
                    se_host = m.group(2)

                self.sandboxcache.se = se_host

                if token:
                    self.sandboxcache.srm_token = token

            if (self.sandboxcache.se_type in ['srmv2']) and (not self.sandboxcache.srm_token):
                self.sandboxcache.srm_token = config['DefaultSRMToken']

        elif self.sandboxcache._name == 'DQ2SandboxCache':

            # generate a new dataset name if not given
            if not self.sandboxcache.dataset_name:
                from GangaAtlas.Lib.ATLASDataset.DQ2Dataset import dq2outputdatasetname
                self.sandboxcache.dataset_name, unused = dq2outputdatasetname(
                    "%s.input" % get_uuid(), 0, False, '')

            # subjobs inherits the dataset name from the master job
            for sj in job.subjobs:
                sj.backend.sandboxcache.dataset_name = self.sandboxcache.dataset_name

        elif self.sandboxcache._name == 'GridftpSandboxCache':
            if config['CreamInputSandboxBaseURI']:
                self.sandboxcache.baseURI = config['CreamInputSandboxBaseURI']
            elif self.CE:
                ce_host = re.sub(r'\:[0-9]+', '', self.CE.split('/cream')[0])
                self.sandboxcache.baseURI = 'gsiftp://%s/opt/glite/var/cream_sandbox/%s' % (
                    ce_host, self.sandboxcache.vo)
            else:
                logger.error('baseURI not available for GridftpSandboxCache')
                return False

        return True
Example #9
def test_submit_no_proxy(mocker):
    """
    Test that the lack of a proxy object causes the submit to fail
    """
    check_proxy = mocker.patch('Ganga.Lib.LCG.Grid.check_proxy', return_value=False)

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path')

    assert check_proxy.call_count == 1

    assert job_url is None
Example #10
    def submit(self, subjobconfig, master_job_sandbox):
        '''Submit the job to the grid'''

        ick = False

        jdlpath = self.preparejob(subjobconfig, master_job_sandbox)

        if jdlpath:
            self.id = Grid.cream_submit(jdlpath, self.CE, self.delegation_id)

            if self.id:
                self.actualCE = self.CE
                ick = True

        return ick
Example #11
def test_submit_bad_output(mocker):
    """
    Test that the external command returning bad data causes the job to fail
    """
    __set_submit_option__ = mocker.patch('Ganga.Lib.LCG.Grid.__set_submit_option__', return_value='  ')
    mocker.patch('Ganga.Lib.LCG.Grid.getShell', return_value=Shell)
    cmd1 = mocker.patch('Ganga.Utility.GridShell.Shell.cmd1', return_value=(0, 'some bad output', False))

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path', cred_req=FakeCred())

    assert __set_submit_option__.call_count == 1
    assert cmd1.call_count == 1

    assert job_url is None
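
FakeCred is referenced but not defined in these snippets; it stands in for a credential-requirement object handed to Grid.submit as cred_req. Since the shell layer is mocked, a purely hypothetical stand-in could be as simple as:

class FakeCred(object):
    """Hypothetical placeholder for a credential requirement used in the tests.

    The mocked Grid.submit path only needs some object to pass around,
    so no real attributes are required here.
    """
    pass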
Example #12
    def submit(self, subjobconfig, master_job_sandbox):
        '''Submit the job to the grid'''

        ick = False

        xrslpath = self.preparejob(subjobconfig, master_job_sandbox)

        if xrslpath:
            self.id = Grid.arc_submit(xrslpath, self.CE, self.verbose, self.credential_requirements)

            if self.id:
                self.actualCE = self.CE
                ick = True

        return ick
Example #13
def test_submit_expired_proxy(mocker):
    """
    Test that an invalid proxy object causes the submit to fail
    """
    check_proxy = mocker.patch('Ganga.Lib.LCG.Grid.check_proxy', return_value=True)
    credential = mocker.patch('Ganga.Lib.LCG.Grid.credential', return_value=mocker.MagicMock())
    credential.return_value.isValid.return_value = False

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path')

    assert check_proxy.call_count == 1
    assert credential.call_count == 1

    assert job_url is None
Example #14
def test_submit(mocker):
    """
    Test that a job submit succeeds with valid input
    """
    __set_submit_option__ = mocker.patch('Ganga.Lib.LCG.Grid.__set_submit_option__', return_value='  ')
    mocker.patch('Ganga.Lib.LCG.Grid.getShell', return_value=Shell)
    cmd1 = mocker.patch('Ganga.Utility.GridShell.Shell.cmd1', return_value=(0, 'https://example.com:9000/some_url', False))

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path', cred_req=FakeCred())

    assert __set_submit_option__.call_count == 1
    assert cmd1.call_count == 1

    assert '/some/path' in cmd1.call_args[0][0], 'JDL path was not passed correctly'
    assert job_url == 'https://example.com:9000/some_url'
Example #15
    def process(self, item):
        """
        downloads output of one LCG job 
        """

        pps_check = (True, None)

        job = item.jobObj

        # it is very likely that the job's downloading task has been
        # created and assigned in a previous monitoring loop;
        # ignore such cases
        if job.status in ['completing', 'completed', 'failed']:
            return True

        # it can also happen that the job was killed/removed by the user between
        # the downloading task being created in the queue and it being taken by
        # one of the downloading threads. Ignore such cases
        if job.status in ['removed', 'killed']:
            return True

        job.updateStatus('completing')
        outw = job.getOutputWorkspace()

        pps_check = Grid.get_output(job.backend.id, outw.getPath(),
                                    job.backend.credential_requirements)

        if pps_check[0]:
            job.updateStatus('completed')
            job.backend.exitcode = 0
        else:
            job.updateStatus('failed')
            # update the backend's reason if the failure was detected in
            # Ganga's post-processing step (pps)
            if pps_check[1] != 0:
                job.backend.reason = 'non-zero app. exit code: %s' % pps_check[1]
                job.backend.exitcode = pps_check[1]

        # needs to update the master job's status to give an up-to-date status
        # of the whole job
        if job.master:
            job.master.updateMasterJobStatus()

        self.__appendResult__(job.getFQID('.'), True)

        return True
Example #16
    def process(self, item):
        """
        downloads output of one LCG job 
        """

        pps_check = (True, None)

        job = item.jobObj

        # it is very likely that the job's downloading task has been
        # created and assigned in a previous monitoring loop;
        # ignore such cases
        if job.status in ['completing', 'completed', 'failed']:
            return True

        # it can also happen that the job was killed/removed by the user between
        # the downloading task being created in the queue and it being taken by
        # one of the downloading threads. Ignore such cases
        if job.status in ['removed', 'killed']:
            return True

        job.updateStatus('completing')
        outw = job.getOutputWorkspace()

        pps_check = Grid.get_output(job.backend.id, outw.getPath(), job.backend.credential_requirements)

        if pps_check[0]:
            job.updateStatus('completed')
            job.backend.exitcode = 0
        else:
            job.updateStatus('failed')
            # update the backend's reason if the failure was detected in
            # Ganga's post-processing step (pps)
            if pps_check[1] != 0:
                job.backend.reason = 'non-zero app. exit code: %s' % pps_check[1]
                job.backend.exitcode = pps_check[1]

        # needs to update the master job's status to give an up-to-date status
        # of the whole job
        if job.master:
            job.master.updateMasterJobStatus()

        self.__appendResult__(job.getFQID('.'), True)

        return True
Example #17
def test_submit_bad_output(mocker):
    """
    Test that the external command returning bad data causes the job to fail
    """
    __set_submit_option__ = mocker.patch(
        'Ganga.Lib.LCG.Grid.__set_submit_option__', return_value='  ')
    mocker.patch('Ganga.Lib.LCG.Grid.getShell', return_value=Shell)
    cmd1 = mocker.patch('Ganga.Utility.GridShell.Shell.cmd1',
                        return_value=(0, 'some bad output', False))

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path', cred_req=FakeCred())

    assert __set_submit_option__.call_count == 1
    assert cmd1.call_count == 1

    assert job_url is None
Example #18
def test_submit_expired_proxy(mocker):
    """
    Test that an invalid proxy object causes the submit to fail
    """
    check_proxy = mocker.patch('Ganga.Lib.LCG.Grid.check_proxy',
                               return_value=True)
    credential = mocker.patch('Ganga.Lib.LCG.Grid.credential',
                              return_value=mocker.MagicMock())
    credential.return_value.isValid.return_value = False

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path')

    assert check_proxy.call_count == 1
    assert credential.call_count == 1

    assert job_url is None
Example #19
    def master_submit(self, rjobs, subjobconfigs, masterjobconfig):
        '''Submit the master job to the grid'''

        profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
        profiler.start()

        job = self.getJobObject()

        # finding ARC CE endpoint for job submission
        #allowed_celist = []
        # try:
        #    allowed_celist = self.requirements.getce()
        #    if not self.CE and allowed_celist:
        #        self.CE = allowed_celist[0]
        # except:
        #    logger.warning('ARC CE assigment from ARCRequirements failed.')

        # if self.CE and allowed_celist:
        #    if self.CE not in allowed_celist:
        #        logger.warning('submission to CE not allowed: %s, use %s instead' % ( self.CE, allowed_celist[0] ) )
        #        self.CE = allowed_celist[0]

        # use arc info to check for any endpoints recorded in the config file
        rc, output = Grid.arc_info()

        if not self.CE and rc != 0:
            raise GangaException(
                "ARC CE endpoint not set and no default settings in '%s'. " %
                config['ArcConfigFile'])
        elif self.CE:
            logger.info('ARC CE endpoint set to: ' + str(self.CE))
        else:
            logger.info("Using ARC CE endpoints defined in '%s'" %
                        config['ArcConfigFile'])

        # doing massive job preparation
        if len(job.subjobs) == 0:
            ick = IBackend.master_submit(self, rjobs, subjobconfigs,
                                         masterjobconfig)
        else:
            ick = self.master_bulk_submit(rjobs, subjobconfigs,
                                          masterjobconfig)

        profiler.check('==> master_submit() elapsed time')

        return ick
Example #20
    def master_submit(self, rjobs, subjobconfigs, masterjobconfig):
        '''Submit the master job to the grid'''

        profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
        profiler.start()

        job = self.getJobObject()

        # finding CREAM CE endpoint for job submission
        allowed_celist = []
        try:
            allowed_celist = self.requirements.getce()
            if not self.CE and allowed_celist:
                self.CE = allowed_celist[0]
        except:
            logger.warning(
                'CREAM CE assignment from AtlasCREAMRequirements failed.')

        if self.CE and allowed_celist:
            if self.CE not in allowed_celist:
                logger.warning(
                    'submission to CE not allowed: %s, use %s instead' %
                    (self.CE, allowed_celist[0]))
                self.CE = allowed_celist[0]

        if not self.CE:
            raise GangaException('CREAM CE endpoint not set')

        # delegate proxy to CREAM CE
        self.delegation_id = Grid.cream_proxy_delegation(
            self.CE, self.delegation_id, self.credential_requirements)
        if not self.delegation_id:
            logger.warning('proxy delegation to %s failed' % self.CE)

        # doing massive job preparation
        if len(job.subjobs) == 0:
            ick = IBackend.master_submit(self, rjobs, subjobconfigs,
                                         masterjobconfig)
        else:
            ick = self.master_bulk_submit(rjobs, subjobconfigs,
                                          masterjobconfig)

        profiler.check('==> master_submit() elapsed time')

        return ick
Example #21
    def master_submit(self, rjobs, subjobconfigs, masterjobconfig):
        '''Submit the master job to the grid'''

        profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
        profiler.start()

        job = self.getJobObject()

        # finding ARC CE endpoint for job submission
        #allowed_celist = []
        # try:
        #    allowed_celist = self.requirements.getce()
        #    if not self.CE and allowed_celist:
        #        self.CE = allowed_celist[0]
        # except:
        #    logger.warning('ARC CE assigment from ARCRequirements failed.')

        # if self.CE and allowed_celist:
        #    if self.CE not in allowed_celist:
        #        logger.warning('submission to CE not allowed: %s, use %s instead' % ( self.CE, allowed_celist[0] ) )
        #        self.CE = allowed_celist[0]

        # use arc info to check for any endpoints recorded in the config file
        rc, output = Grid.arc_info(self.credential_requirements)

        if not self.CE and rc != 0:
            raise GangaException(
                "ARC CE endpoint not set and no default settings in '%s'. " % config['ArcConfigFile'])
        elif self.CE:
            logger.info('ARC CE endpoint set to: ' + str(self.CE))
        else:
            logger.info("Using ARC CE endpoints defined in '%s'" %
                        config['ArcConfigFile'])

        # doing massive job preparation
        if len(job.subjobs) == 0:
            ick = IBackend.master_submit(
                self, rjobs, subjobconfigs, masterjobconfig)
        else:
            ick = self.master_bulk_submit(
                rjobs, subjobconfigs, masterjobconfig)

        profiler.check('==> master_submit() elapsed time')

        return ick
Example #22
    def resubmit(self):
        '''Resubmit the job'''

        ick = False

        job = self.getJobObject()

        jdlpath = job.getInputWorkspace().getPath("__jdlfile__")

        if jdlpath:
            self.id = Grid.arc_submit(jdlpath, self.CE, self.verbose, self.credential_requirements)

            if self.id:
                # refresh the lcg job information
                self.__refresh_jobinfo__(job)
                self.actualCE = self.CE
                ick = True

        return ick
Example #23
    def master_submit(self, rjobs, subjobconfigs, masterjobconfig):
        '''Submit the master job to the grid'''

        profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
        profiler.start()

        job = self.getJobObject()

        # finding CREAM CE endpoint for job submission
        allowed_celist = []
        try:
            allowed_celist = self.requirements.getce()
            if not self.CE and allowed_celist:
                self.CE = allowed_celist[0]
        except:
            logger.warning(
                'CREAM CE assignment from AtlasCREAMRequirements failed.')

        if self.CE and allowed_celist:
            if self.CE not in allowed_celist:
                logger.warning('submission to CE not allowed: %s, use %s instead' % (
                    self.CE, allowed_celist[0]))
                self.CE = allowed_celist[0]

        if not self.CE:
            raise GangaException('CREAM CE endpoint not set')

        # delegate proxy to CREAM CE
        self.delegation_id = Grid.cream_proxy_delegation(self.CE, self.delegation_id)
        if not self.delegation_id:
            logger.warning('proxy delegation to %s failed' % self.CE)

        # doing massive job preparation
        if len(job.subjobs) == 0:
            ick = IBackend.master_submit(
                self, rjobs, subjobconfigs, masterjobconfig)
        else:
            ick = self.master_bulk_submit(
                rjobs, subjobconfigs, masterjobconfig)

        profiler.check('==> master_submit() elapsed time')

        return ick
Example #24
    def resubmit(self):
        '''Resubmit the job'''

        ick = False

        job = self.getJobObject()

        jdlpath = job.getInputWorkspace().getPath("__jdlfile__")

        if jdlpath:
            self.id = Grid.arc_submit(jdlpath, self.CE, self.verbose)

            if self.id:
                # refresh the lcg job information
                self.__refresh_jobinfo__(job)
                self.actualCE = self.CE
                ick = True

        return ick
Example #25
    def resubmit(self):
        '''Resubmit the job'''

        ick = False

        job = self.getJobObject()

        jdlpath = job.getInputWorkspace().getPath("__jdlfile__")

        if jdlpath:
            self.id = Grid.cream_submit(jdlpath, self.CE, self.delegation_id)

            if self.id:
                # refresh the lcg job information
                self.__refresh_jobinfo__(job)
                self.actualCE = self.CE
                ick = True

        return ick
Example #26
    def __setup_sandboxcache__(self, job):
        '''Sets up the sandbox cache object to adopt the runtime configuration of the LCG backend'''

        re_token = re.compile('^token:(.*):(.*)$')

        self.sandboxcache.vo = config['VirtualOrganisation']
        self.sandboxcache.timeout = config['SandboxTransferTimeout']

        if self.sandboxcache._name == 'LCGSandboxCache':
            if not self.sandboxcache.lfc_host:
                self.sandboxcache.lfc_host = Grid.__get_lfc_host__()

            if not self.sandboxcache.se:

                token = ''
                se_host = config['DefaultSE']
                m = re_token.match(se_host)
                if m:
                    token = m.group(1)
                    se_host = m.group(2)

                self.sandboxcache.se = se_host

                if token:
                    self.sandboxcache.srm_token = token

            if (self.sandboxcache.se_type
                    in ['srmv2']) and (not self.sandboxcache.srm_token):
                self.sandboxcache.srm_token = config['DefaultSRMToken']

        elif self.sandboxcache._name == 'GridftpSandboxCache':
            if config['CreamInputSandboxBaseURI']:
                self.sandboxcache.baseURI = config['CreamInputSandboxBaseURI']
            elif self.CE:
                ce_host = re.sub(r'\:[0-9]+', '', self.CE.split('/cream')[0])
                self.sandboxcache.baseURI = 'gsiftp://%s/opt/glite/var/cream_sandbox/%s' % (
                    ce_host, self.sandboxcache.vo)
            else:
                logger.error('baseURI not available for GridftpSandboxCache')
                return False

        return True
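
The GridftpSandboxCache branch above derives the gsiftp baseURI from the CREAM CE endpoint by dropping the port and the '/cream...' suffix. A self-contained illustration with an invented endpoint and VO name:

import re

ce = 'cream-ce01.example.org:8443/cream-pbs-long'  # hypothetical CE endpoint
vo = 'myvo'                                        # hypothetical VO

ce_host = re.sub(r'\:[0-9]+', '', ce.split('/cream')[0])
base_uri = 'gsiftp://%s/opt/glite/var/cream_sandbox/%s' % (ce_host, vo)

print(ce_host)   # cream-ce01.example.org
print(base_uri)  # gsiftp://cream-ce01.example.org/opt/glite/var/cream_sandbox/myvo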
Example #27
    def resubmit(self):
        '''Resubmit the job'''

        ick = False

        job = self.getJobObject()

        jdlpath = job.getInputWorkspace().getPath("__jdlfile__")

        if jdlpath:
            self.id = Grid.cream_submit(jdlpath, self.CE, self.delegation_id,
                                        self.credential_requirements)

            if self.id:
                # refresh the lcg job information
                self.__refresh_jobinfo__(job)
                self.actualCE = self.CE
                ick = True

        return ick
Example #28
def test_submit(mocker):
    """
    Test that a job submit succeeds with valid input
    """
    __set_submit_option__ = mocker.patch(
        'Ganga.Lib.LCG.Grid.__set_submit_option__', return_value='  ')
    mocker.patch('Ganga.Lib.LCG.Grid.getShell', return_value=Shell)
    cmd1 = mocker.patch('Ganga.Utility.GridShell.Shell.cmd1',
                        return_value=(0, 'https://example.com:9000/some_url',
                                      False))

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path', cred_req=FakeCred())

    assert __set_submit_option__.call_count == 1
    assert cmd1.call_count == 1

    assert '/some/path' in cmd1.call_args[0][0], 'JDL path was not passed correctly'
    assert job_url == 'https://example.com:9000/some_url'
Example #29
def test_submit_bad_output(mocker):
    """
    Test that the external command returning bad data causes the job to fail
    """
    check_proxy = mocker.patch('Ganga.Lib.LCG.Grid.check_proxy', return_value=True)
    credential = mocker.patch('Ganga.Lib.LCG.Grid.credential', return_value=mocker.MagicMock())
    credential.return_value.isValid.return_value = True
    __set_submit_option__ = mocker.patch('Ganga.Lib.LCG.Grid.__set_submit_option__', return_value='  ')
    mocker.patch('Ganga.Lib.LCG.Grid.getShell', return_value=Shell)
    cmd1 = mocker.patch('Ganga.Utility.GridShell.Shell.cmd1', return_value=(0, 'some bad output', False))

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path')

    assert check_proxy.call_count == 1
    assert credential.call_count == 1
    assert __set_submit_option__.call_count == 1
    assert cmd1.call_count == 1

    assert job_url is None
Example #30
def test_submit(mocker):
    """
    Test that a job submit succeeds with valid input
    """
    check_proxy = mocker.patch('Ganga.Lib.LCG.Grid.check_proxy', return_value=True)
    credential = mocker.patch('Ganga.Lib.LCG.Grid.credential', return_value=mocker.MagicMock())
    credential.return_value.isValid.return_value = True
    __set_submit_option__ = mocker.patch('Ganga.Lib.LCG.Grid.__set_submit_option__', return_value='  ')
    mocker.patch('Ganga.Lib.LCG.Grid.getShell', return_value=Shell)
    cmd1 = mocker.patch('Ganga.Utility.GridShell.Shell.cmd1', return_value=(0, 'https://example.com:9000/some_url', False))

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path')

    assert check_proxy.call_count == 1
    assert credential.call_count == 1
    assert __set_submit_option__.call_count == 1
    assert cmd1.call_count == 1

    assert '/some/path' in cmd1.call_args[0][0], 'JDL path was not passed correctly'
    assert job_url == 'https://example.com:9000/some_url'
Example #31
    def master_resubmit(self, rjobs):
        '''Resubmit the master job to the grid'''

        profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
        profiler.start()

        job = self.getJobObject()

        ick = False

        # delegate proxy to CREAM CE
        self.delegation_id = Grid.cream_proxy_delegation(
            self.CE, self.delegation_id, self.credential_requirements)
        if not self.delegation_id:
            logger.warning('proxy delegation to %s failed' % self.CE)

        if not job.master and len(job.subjobs) == 0:
            # case 1: master job normal resubmission
            logger.debug('rjobs: %s' % str(rjobs))
            logger.debug('mode: master job normal resubmission')
            ick = IBackend.master_resubmit(self, rjobs)

        elif job.master:
            # case 2: individual subjob resubmission
            logger.debug('mode: individual subjob resubmission')
            ick = IBackend.master_resubmit(self, rjobs)

        else:
            # case 3: master job bulk resubmission
            logger.debug('mode: master job resubmission')

            ick = self.master_bulk_resubmit(rjobs)
            if not ick:
                raise GangaException('CREAM bulk submission failure')

        profiler.check('job re-submission elapsed time')

        return ick
Example #32
    def master_resubmit(self, rjobs):
        '''Resubmit the master job to the grid'''

        profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
        profiler.start()

        job = self.getJobObject()

        ick = False

        # delegate proxy to CREAM CE
        self.delegation_id = Grid.cream_proxy_delegation(self.CE, self.delegation_id)
        if not self.delegation_id:
            logger.warning('proxy delegation to %s failed' % self.CE)

        if not job.master and len(job.subjobs) == 0:
            # case 1: master job normal resubmission
            logger.debug('rjobs: %s' % str(rjobs))
            logger.debug('mode: master job normal resubmission')
            ick = IBackend.master_resubmit(self, rjobs)

        elif job.master:
            # case 2: individual subjob resubmission
            logger.debug('mode: individual subjob resubmission')
            ick = IBackend.master_resubmit(self, rjobs)

        else:
            # case 3: master job bulk resubmission
            logger.debug('mode: master job resubmission')

            ick = self.master_bulk_resubmit(rjobs)
            if not ick:
                raise GangaException('CREAM bulk submission failure')

        profiler.check('job re-submission elapsed time')

        return ick
Example #33
def test_submit_bad_output(mocker):
    """
    Test that the external command returning bad data causes the job to fail
    """
    check_proxy = mocker.patch('Ganga.Lib.LCG.Grid.check_proxy',
                               return_value=True)
    credential = mocker.patch('Ganga.Lib.LCG.Grid.credential',
                              return_value=mocker.MagicMock())
    credential.return_value.isValid.return_value = True
    __set_submit_option__ = mocker.patch(
        'Ganga.Lib.LCG.Grid.__set_submit_option__', return_value='  ')
    mocker.patch('Ganga.Lib.LCG.Grid.getShell', return_value=Shell)
    cmd1 = mocker.patch('Ganga.Utility.GridShell.Shell.cmd1',
                        return_value=(0, 'some bad output', False))

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path')

    assert check_proxy.call_count == 1
    assert credential.call_count == 1
    assert __set_submit_option__.call_count == 1
    assert cmd1.call_count == 1

    assert job_url is None
Example #34
    def preparejob(self, jobconfig, master_job_sandbox):
        '''Prepare the JDL'''

        script = self.__jobWrapperTemplate__()

        job = self.getJobObject()
        inpw = job.getInputWorkspace()

        wrapperlog = '__jobscript__.log'

        import Ganga.Core.Sandbox as Sandbox

        # FIXME: check what happens if 'stdout','stderr' are specified here
        script = script.replace('###OUTPUTSANDBOX###',
                                repr(jobconfig.outputbox))

        script = script.replace('###APPLICATION_NAME###',
                                getName(job.application))
        script = script.replace('###APPLICATIONEXEC###',
                                repr(jobconfig.getExeString()))
        script = script.replace('###APPLICATIONARGS###',
                                repr(jobconfig.getArguments()))

        from Ganga.GPIDev.Lib.File.OutputFileManager import getWNCodeForOutputPostprocessing, getWNCodeForDownloadingInputFiles

        script = script.replace('###OUTPUTUPLOADSPOSTPROCESSING###',
                                getWNCodeForOutputPostprocessing(job, '    '))

        script = script.replace('###DOWNLOADINPUTFILES###',
                                getWNCodeForDownloadingInputFiles(job, '    '))

        if jobconfig.env:
            script = script.replace('###APPLICATIONENVS###',
                                    repr(jobconfig.env))
        else:
            script = script.replace('###APPLICATIONENVS###', repr({}))

        script = script.replace('###WRAPPERLOG###', repr(wrapperlog))
        import inspect
        script = script.replace('###INLINEMODULES###',
                                inspect.getsource(Sandbox.WNSandbox))

        mon = job.getMonitoringService()

        self.monInfo = None

        # set the monitoring file by default to the stdout
        if isinstance(self.monInfo, dict):
            self.monInfo['remotefile'] = 'stdout'

        # try to print out the monitoring service information in debug mode
        try:
            logger.debug('job info of monitoring service: %s' %
                         str(self.monInfo))
        except:
            pass

#       prepare input/output sandboxes
        import Ganga.Utility.files
        from Ganga.GPIDev.Lib.File import File
        from Ganga.Core.Sandbox.WNSandbox import PYTHON_DIR
        import inspect

        fileutils = File(inspect.getsourcefile(Ganga.Utility.files),
                         subdir=PYTHON_DIR)
        packed_files = jobconfig.getSandboxFiles() + [fileutils]
        sandbox_files = job.createPackedInputSandbox(packed_files)

        # sandbox of child jobs should include master's sandbox
        sandbox_files.extend(master_job_sandbox)

        # check the input file size and pre-upload larger inputs to the iocache
        lfc_host = ''

        input_sandbox_uris = []
        input_sandbox_names = []

        ick = True

        max_prestaged_fsize = 0
        for f in sandbox_files:

            idx = self.__check_and_prestage_inputfile__(f)

            if not idx:
                logger.error('input sandbox preparation failed: %s' % f)
                ick = False
                break
            else:

                if idx['lfc_host']:
                    lfc_host = idx['lfc_host']

                if idx['remote']:
                    abspath = os.path.abspath(f)
                    fsize = os.path.getsize(abspath)

                    if fsize > max_prestaged_fsize:
                        max_prestaged_fsize = fsize

                    input_sandbox_uris.append(
                        idx['remote'][os.path.basename(f)])

                    input_sandbox_names.append(os.path.basename(
                        urlparse(f)[2]))

                if idx['local']:
                    input_sandbox_uris += idx['local']
                    input_sandbox_names.append(os.path.basename(f))

        if not ick:
            logger.error('stop job submission')
            return None

        # determine the lcg-cp timeout according to the max_prestaged_fsize
        # - using the assumption of 1 MB/sec.
        max_prestaged_fsize = 0
        lfc_host = ''
        transfer_timeout = config['SandboxTransferTimeout']
        predict_timeout = int(math.ceil(max_prestaged_fsize / 1000000.0))

        if predict_timeout > transfer_timeout:
            transfer_timeout = predict_timeout

        if transfer_timeout < 60:
            transfer_timeout = 60

        script = script.replace('###TRANSFERTIMEOUT###',
                                '%d' % transfer_timeout)

        # update the job wrapper with the inputsandbox list
        script = script.replace(
            '###INPUTSANDBOX###',
            repr({
                'remote': {},
                'local': input_sandbox_names
            }))

        # write out the job wrapper and put job wrapper into job's inputsandbox
        scriptPath = inpw.writefile(FileBuffer(
            '__jobscript_%s__' % job.getFQID('.'), script),
                                    executable=1)
        input_sandbox = input_sandbox_uris + [scriptPath]

        for isb in input_sandbox:
            logger.debug('ISB URI: %s' % isb)

        # compose output sandbox to include by default the following files:
        # - gzipped stdout (transferred only when the JobLogHandler is WMS)
        # - gzipped stderr (transferred only when the JobLogHandler is WMS)
        # - __jobscript__.log (job wrapper's log)
        output_sandbox = [wrapperlog]

        from Ganga.GPIDev.Lib.File.OutputFileManager import getOutputSandboxPatterns
        for outputSandboxPattern in getOutputSandboxPatterns(job):
            output_sandbox.append(outputSandboxPattern)

        if config['JobLogHandler'] in ['WMS']:
            output_sandbox += ['stdout.gz', 'stderr.gz']

        if len(jobconfig.outputbox):
            output_sandbox += [Sandbox.OUTPUT_TARBALL_NAME]

        # compose ARC XRSL
        xrsl = {
            #'VirtualOrganisation' : config['VirtualOrganisation'],
            'executable': os.path.basename(scriptPath),
            'environment': {
                'GANGA_LCG_VO': config['VirtualOrganisation'],
                'GANGA_LOG_HANDLER': config['JobLogHandler'],
                'LFC_HOST': lfc_host
            },
            #'stdout'                : 'stdout',
            #'stderr'                : 'stderr',
            'inputFiles': input_sandbox,
            'outputFiles': output_sandbox,
            #'OutputSandboxBaseDestURI': 'gsiftp://localhost'
        }

        xrsl['environment'].update({'GANGA_LCG_CE': self.CE})
        #xrsl['Requirements'] = self.requirements.merge(jobconfig.requirements).convert()

        # if self.jobtype.upper() in ['NORMAL','MPICH']:
        #xrsl['JobType'] = self.jobtype.upper()
        # if self.jobtype.upper() == 'MPICH':
        #xrsl['Requirements'].append('(other.GlueCEInfoTotalCPUs >= NodeNumber)')
        # xrsl['Requirements'].append('Member("MPICH",other.GlueHostApplicationSoftwareRunTimeEnvironment)')
        #xrsl['NodeNumber'] = self.requirements.nodenumber
        # else:
        #    logger.warning('JobType "%s" not supported' % self.jobtype)
        #    return

        #       additional settings from the job
        if jobconfig.env:
            xrsl['environment'].update(jobconfig.env)

        xrslText = Grid.expandxrsl(xrsl)

        # append any additional requirements from the requirements object
        xrslText += '\n'.join(self.requirements.other)

        logger.debug('subjob XRSL: %s' % xrslText)
        return inpw.writefile(FileBuffer('__xrslfile__', xrslText))
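
preparejob fills the job-wrapper template by replacing ###MARKER### placeholders with repr()'d Python values, so the resulting wrapper script contains literal Python values. A reduced sketch of that mechanism (the template text here is invented):

# Minimal illustration of the ###MARKER### substitution pattern.
template = """
output_sandbox = ###OUTPUTSANDBOX###
app_args = ###APPLICATIONARGS###
"""

script = template
script = script.replace('###OUTPUTSANDBOX###', repr(['stdout', 'stderr']))
script = script.replace('###APPLICATIONARGS###', repr(['-n', '100']))

print(script)
# output_sandbox = ['stdout', 'stderr']
# app_args = ['-n', '100']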
Example #35
    def __check_and_prestage_inputfile__(self, file):
        '''Checks the size of the given input file and, if it is
           over "BoundSandboxLimit", prestages it to a grid SE.

           The argument is a path of the local file.

           It returns a dictionary containing information to refer to the file:

               idx = {'lfc_host': lfc_host,
                      'local': [the local file paths],
                      'remote': {'fname1': 'remote index1', 'fname2': 'remote index2', ... }
                     }

           If prestaging failed, None is returned.

           If the file has been previously uploaded (according to md5sum),
           the prestaging is ignored and index to the previously uploaded file
           is returned.
           '''

        idx = {'lfc_host': '', 'local': [], 'remote': {}}

        job = self.getJobObject()

        # read-in the previously uploaded files
        uploadedFiles = []

        # getting the uploaded file list from the master job
        if job.master:
            uploadedFiles += job.master.backend.sandboxcache.get_cached_files()

        # set and get the $LFC_HOST for uploading oversized sandbox
        self.__setup_sandboxcache__(job)

        uploadedFiles += self.sandboxcache.get_cached_files()

        lfc_host = None

        # for LCGSandboxCache, take the one specified in the sandboxcache object.
        # the value is exactly the same as the one from the local grid shell env. if
        # it is not specified explicitly.
        if self.sandboxcache._name == 'LCGSandboxCache':
            lfc_host = self.sandboxcache.lfc_host

        # or in general, query it from the Grid object
        if not lfc_host:
            lfc_host = Grid.__get_lfc_host__()

        idx['lfc_host'] = lfc_host

        abspath = os.path.abspath(file)
        fsize = os.path.getsize(abspath)

        if fsize > config['BoundSandboxLimit']:

            md5sum = get_md5sum(abspath, ignoreGzipTimestamp=True)

            doUpload = True
            for uf in uploadedFiles:
                if uf.md5sum == md5sum:
                    # the same file has been uploaded to the iocache
                    idx['remote'][os.path.basename(file)] = uf.id
                    doUpload = False
                    break

            if doUpload:

                logger.warning(
                    'The size of %s is larger than the sandbox limit (%d bytes). Please wait while pre-staging ...'
                    % (file, config['BoundSandboxLimit']))

                if self.sandboxcache.upload([abspath]):
                    remote_sandbox = self.sandboxcache.get_cached_files()[-1]
                    idx['remote'][remote_sandbox.name] = remote_sandbox.id
                else:
                    logger.error(
                        'Oversized sandbox not successfully pre-staged')
                    return None
        else:
            idx['local'].append(abspath)

        return idx
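
The oversized-sandbox branch above avoids re-uploading a file that is already cached by comparing md5sums. A stripped-down sketch of that idea using only the standard library (the cache structure is hypothetical, and Ganga's get_md5sum additionally knows how to ignore gzip timestamps):

import hashlib
import os

def md5sum(path):
    """Plain md5 of a file's contents (no gzip-timestamp handling)."""
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            h.update(chunk)
    return h.hexdigest()

# Hypothetical record of already-uploaded files: md5sum -> remote id.
uploaded = {'d41d8cd98f00b204e9800998ecf8427e': 'guid:already-there'}

def needs_upload(path):
    return md5sum(os.path.abspath(path)) not in uploaded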
Example #36
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs'''

        import datetime

        backenddict = {}
        jobdict = {}
        for j in jobs:
            if j.backend.id and (
                (datetime.datetime.utcnow() - j.time.timestamps["submitted"]
                 ).seconds > config["ArcWaitTimeBeforeStartingMonitoring"]):
                jobdict[j.backend.id] = j
                backenddict[j.backend.actualCE] = j

        if len(jobdict.keys()) == 0:
            return

        jobInfoDict = Grid.arc_status(jobdict.keys(), backenddict.keys())
        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for id, info in jobInfoDict.items():

            if info:

                job = jobdict[id]

                if job.backend.actualCE != urlparse(id)[1].split(":")[0]:
                    job.backend.actualCE = urlparse(id)[1].split(":")[0]

                if job.backend.status != info['State']:

                    doStatusUpdate = True

                    # no need to update Ganga job status if backend status is
                    # not changed
                    if info['State'] == job.backend.status:
                        doStatusUpdate = False

                    # download output sandboxes if final status is reached
                    elif info['State'] in [
                            'Finished', '(FINISHED)', 'Finished (FINISHED)'
                    ]:

                        # grab output sandbox
                        if Grid.arc_get_output(
                                job.backend.id,
                                job.getOutputWorkspace(create=True).getPath()):
                            (ick, app_exitcode) = Grid.__get_app_exitcode__(
                                job.getOutputWorkspace(create=True).getPath())
                            job.backend.exitcode = app_exitcode

                            jidListForPurge.append(job.backend.id)

                        else:
                            logger.error('fail to download job output: %s' %
                                         jobdict[id].getFQID('.'))

                    if doStatusUpdate:
                        job.backend.status = info['State']
                        if 'Exit Code' in info:
                            try:
                                job.backend.exitcode_arc = int(
                                    info['Exit Code'])
                            except:
                                job.backend.exitcode_arc = 1

                        if 'Job Error' in info:
                            try:
                                job.backend.reason = info['Job Error']
                            except:
                                pass

                        job.backend.updateGangaJobStatus()
            else:
                logger.warning('failed to retrieve job information: %s' %
                               jobdict[id].getFQID('.'))

        # purging the jobs whose output has been fetched locally
        if jidListForPurge:
            if not Grid.arc_purgeMultiple(jidListForPurge):
                logger.warning("Failed to purge all ARC jobs.")
Example #37
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs'''

        jobdict = dict([[job.backend.id, job]
                        for job in jobs if job.backend.id])

        jobInfoDict = Grid.cream_status(jobdict.keys())

        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for id, info in jobInfoDict.items():

            if info:

                job = jobdict[id]

                if job.backend.status != info['Current Status'] and ('ExitCode' not in info or ('ExitCode' in info and info['ExitCode'].isdigit())):

                    if 'Worker Node' in info:
                        job.backend.workernode = info['Worker Node']

                    if 'CREAM ISB URI' in info:
                        job.backend.isbURI = info['CREAM ISB URI']

                    if 'CREAM OSB URI' in info:
                        job.backend.osbURI = info['CREAM OSB URI']

                    doStatusUpdate = True

                    # no need to update Ganga job status if backend status is
                    # not changed
                    if info['Current Status'] == job.backend.status:
                        doStatusUpdate = False

                    # download output sandboxes if final status is reached
                    elif info['Current Status'] in ['DONE-OK', 'DONE-FAILED']:

                        # resolve output sandbox URIs based on the JDL
                        # information
                        osbURIList = __cream_resolveOSBList__(job, info['JDL'])

                        logger.debug('OSB list:')
                        for f in osbURIList:
                            logger.debug(f)

                        if osbURIList:

                            if Grid.cream_get_output(osbURIList, job.getOutputWorkspace(create=True).getPath() ):
                                (ick, app_exitcode) = Grid.__get_app_exitcode__(
                                    job.getOutputWorkspace(create=True).getPath() )
                                job.backend.exitcode = app_exitcode

                                jidListForPurge.append(job.backend.id)

                            else:
                                logger.error(
                                    'fail to download job output: %s' % jobdict[id].getFQID('.'))

                    if doStatusUpdate:
                        job.backend.status = info['Current Status']
                        if 'ExitCode' in info and info['ExitCode'] != "W":
                            try:
                                job.backend.exitcode_cream = int(
                                    info['ExitCode'])
                            except:
                                job.backend.exitcode_cream = 1

                        if 'FailureReason' in info:
                            try:
                                job.backend.reason = info['FailureReason']
                            except:
                                pass

                        job.backend.updateGangaJobStatus()
            else:
                logger.warning(
                    'failed to retrieve job information: %s' % jobdict[id].getFQID('.'))

            # purging the jobs whose output has been fetched locally
            if jidListForPurge:
                Grid.cream_purgeMultiple(jidListForPurge)
Example #38
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs'''

        import datetime

        backenddict = {}
        jobdict = {}
        for j in jobs:
            if j.backend.id and ((datetime.datetime.utcnow() - j.time.timestamps["submitted"]).seconds > config["ArcWaitTimeBeforeStartingMonitoring"]):
                jobdict[j.backend.id] = j
                backenddict[j.backend.actualCE] = j

        if len(jobdict.keys()) == 0:
            return

        jobInfoDict = Grid.arc_status(
            jobdict.keys(), backenddict.keys())
        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for id, info in jobInfoDict.items():

            if info:

                job = jobdict[id]

                if job.backend.actualCE != urlparse(id)[1].split(":")[0]:
                    job.backend.actualCE = urlparse(id)[1].split(":")[0]

                if job.backend.status != info['State']:

                    doStatusUpdate = True

                    # no need to update Ganga job status if backend status is
                    # not changed
                    if info['State'] == job.backend.status:
                        doStatusUpdate = False

                    # download output sandboxes if final status is reached
                    elif info['State'] in ['Finished', '(FINISHED)', 'Finished (FINISHED)']:

                        # grab output sandbox
                        if Grid.arc_get_output(job.backend.id, job.getOutputWorkspace(create=True).getPath()):
                            (ick, app_exitcode) = Grid.__get_app_exitcode__(
                                job.getOutputWorkspace(create=True).getPath())
                            job.backend.exitcode = app_exitcode

                            jidListForPurge.append(job.backend.id)

                        else:
                            logger.error(
                                'fail to download job output: %s' % jobdict[id].getFQID('.'))

                    if doStatusUpdate:
                        job.backend.status = info['State']
                        if 'Exit Code' in info:
                            try:
                                job.backend.exitcode_arc = int(
                                    info['Exit Code'])
                            except:
                                job.backend.exitcode_arc = 1

                        if 'Job Error' in info:
                            try:
                                job.backend.reason = info['Job Error']
                            except:
                                pass

                        job.backend.updateGangaJobStatus()
            else:
                logger.warning(
                    'failed to retrieve job information: %s' % jobdict[id].getFQID('.'))

        # purging the jobs whose output has been fetched locally
        if jidListForPurge:
            if not Grid.arc_purgeMultiple(jidListForPurge):
                logger.warning("Failed to purge all ARC jobs.")
Example #39
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs'''

        import datetime

        ce_list = []  # type: List[str]
        jobdict = {}  # type: Mapping[str, Job]
        for j in jobs:
            if j.backend.id and ((datetime.datetime.utcnow() - j.time.timestamps["submitted"]).seconds > config["ArcWaitTimeBeforeStartingMonitoring"]):
                jobdict[j.backend.id] = j
                ce_list.append(j.backend.actualCE)

        if len(jobdict.keys()) == 0:
            return

        # Group jobs by the backend's credential requirements
        cred_to_backend_id_list = defaultdict(list)  # type: Mapping[ICredentialRequirement, List[str]]
        for jid, job in jobdict.items():
            cred_to_backend_id_list[job.backend.credential_requirements].append(jid)

        # Batch the status requests by credential requirement
        jobInfoDict = {}
        for cred_req, job_ids in cred_to_backend_id_list.items():
            # If the credential is not valid or doesn't exist then skip it
            cred = credential_store.get(cred_req)
            if not cred or not cred.is_valid():
                needed_credentials.add(cred_req)
                continue
            # Create a ``Grid`` for each credential requirement and request the relevant jobs through it
            info = Grid.arc_status(job_ids, ce_list, cred_req)
            jobInfoDict.update(info)

        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for id, info in jobInfoDict.items():

            if info:

                job = jobdict[id]

                if job.backend.actualCE != urlparse(id)[1].split(":")[0]:
                    job.backend.actualCE = urlparse(id)[1].split(":")[0]

                if job.backend.status != info['State']:

                    doStatusUpdate = True

                    # no need to update Ganga job status if backend status is
                    # not changed
                    if info['State'] == job.backend.status:
                        doStatusUpdate = False

                    # download output sandboxes if final status is reached
                    elif info['State'] in ['Finished', '(FINISHED)', 'Finished (FINISHED)']:

                        # grab output sandbox
                        if Grid.arc_get_output(job.backend.id, job.getOutputWorkspace(create=True).getPath(), job.backend.credential_requirements):
                            (ick, app_exitcode) = Grid.__get_app_exitcode__(
                                job.getOutputWorkspace(create=True).getPath())
                            job.backend.exitcode = app_exitcode

                            jidListForPurge.append(job.backend.id)

                        else:
                            logger.error(
                                'fail to download job output: %s' % jobdict[id].getFQID('.'))

                    if doStatusUpdate:
                        job.backend.status = info['State']
                        if 'Exit Code' in info:
                            try:
                                job.backend.exitcode_arc = int(
                                    info['Exit Code'])
                            except:
                                job.backend.exitcode_arc = 1

                        if 'Job Error' in info:
                            try:
                                job.backend.reason = info['Job Error']
                            except:
                                pass

                        job.backend.updateGangaJobStatus()
            else:
                logger.warning(
                    'failed to retrieve job information: %s' % jobdict[id].getFQID('.'))

        # purging the jobs whose output has been fetched locally
        if jidListForPurge:
            for cred_req, job_ids in cred_to_backend_id_list.items():
                if not Grid.arc_purge_multiple(set(job_ids) & set(jidListForPurge), cred_req):
                    logger.warning("Failed to purge all ARC jobs.")
Example #40
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs'''

        jobdict = dict([(job.backend.id, job) for job in jobs
                        if job.backend.id])

        # Group jobs by the backend's credential requirements
        cred_to_backend_id_list = defaultdict(list)
        for job in jobs:
            cred_to_backend_id_list[
                job.backend.credential_requirements].append(job.backend.id)

        # Batch the status requests by credential requirement
        jobInfoDict = {}
        for cred_req, job_ids in cred_to_backend_id_list.items():
            # If the credential is not valid or doesn't exist then skip it
            cred = credential_store.get(cred_req)
            if not cred or not cred.is_valid():
                needed_credentials.add(cred_req)
                continue
            # Create a ``Grid`` for each credential requirement and request the relevant jobs through it
            info = Grid.cream_status(job_ids, cred_req)
            jobInfoDict.update(info)

        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for id, info in jobInfoDict.items():

            if info:

                job = jobdict[id]

                if job.backend.status != info['Current Status'] and (
                        'ExitCode' not in info or
                    ('ExitCode' in info and info['ExitCode'].isdigit())):

                    if 'Worker Node' in info:
                        job.backend.workernode = info['Worker Node']

                    if 'CREAM ISB URI' in info:
                        job.backend.isbURI = info['CREAM ISB URI']

                    if 'CREAM OSB URI' in info:
                        job.backend.osbURI = info['CREAM OSB URI']

                    doStatusUpdate = True

                    # no need to update Ganga job status if backend status is
                    # not changed
                    if info['Current Status'] == job.backend.status:
                        doStatusUpdate = False

                    # download output sandboxes if final status is reached
                    elif info['Current Status'] in ['DONE-OK', 'DONE-FAILED']:

                        # resolve output sandbox URIs based on the JDL
                        # information
                        osbURIList = __cream_resolveOSBList__(job, info['JDL'])

                        logger.debug('OSB list:')
                        for f in osbURIList:
                            logger.debug(f)

                        if osbURIList:

                            if Grid.cream_get_output(
                                    osbURIList,
                                    job.getOutputWorkspace(
                                        create=True).getPath(),
                                    job.backend.credential_requirements):
                                (ick,
                                 app_exitcode) = Grid.__get_app_exitcode__(
                                     job.getOutputWorkspace(
                                         create=True).getPath())
                                job.backend.exitcode = app_exitcode

                                jidListForPurge.append(job.backend.id)

                            else:
                                logger.error(
                                    'failed to download job output: %s' %
                                    jobdict[id].getFQID('.'))

                    if doStatusUpdate:
                        job.backend.status = info['Current Status']
                        if 'ExitCode' in info and info['ExitCode'] != "W":
                            try:
                                job.backend.exitcode_cream = int(
                                    info['ExitCode'])
                            except:
                                job.backend.exitcode_cream = 1

                        if 'FailureReason' in info:
                            try:
                                job.backend.reason = info['FailureReason']
                            except:
                                pass

                        job.backend.updateGangaJobStatus()
            else:
                logger.warning('failed to retrieve job information: %s' %
                               jobdict[id].getFQID('.'))

        # purge the jobs whose output has been fetched locally
        if jidListForPurge:
            for cred_req, job_ids in cred_to_backend_id_list.items():
                Grid.cream_purge_multiple(
                    set(job_ids) & set(jidListForPurge), cred_req)
Ejemplo n.º 41
0
    def __check_and_prestage_inputfile__(self, file):
        '''Checks the size of the given input file and, if it exceeds
           "BoundSandboxLimit", prestages it to a grid SE.

           The argument is the path of the local file.

           It returns a dictionary containing information to refer to the file:

               idx = {'lfc_host': lfc_host,
                      'local': [the local file paths],
                      'remote': {'fname1': 'remote index1', 'fname2': 'remote index2', ... }
                     }

           If prestaging fails, None is returned.

           If the file has been previously uploaded (according to its md5sum),
           the prestaging is skipped and the index of the previously uploaded
           file is returned.
           '''

        idx = {'lfc_host': '', 'local': [], 'remote': {}}

        job = self.getJobObject()

        # read in the previously uploaded files
        uploadedFiles = []

        # getting the uploaded file list from the master job
        if job.master:
            uploadedFiles += job.master.backend.sandboxcache.get_cached_files()

        # set and get the $LFC_HOST for uploading oversized sandbox
        self.__setup_sandboxcache__(job)

        uploadedFiles += self.sandboxcache.get_cached_files()

        lfc_host = None

        # for LCGSandboxCache, take the LFC host specified in the sandboxcache object;
        # if it is not explicitly set there, the value is the same as the one from the
        # local grid shell environment.
        if self.sandboxcache._name == 'LCGSandboxCache':
            lfc_host = self.sandboxcache.lfc_host

        # or in general, query it from the Grid object
        if not lfc_host:
            lfc_host = Grid.__get_lfc_host__()

        idx['lfc_host'] = lfc_host

        abspath = os.path.abspath(file)
        fsize = os.path.getsize(abspath)

        if fsize > config['BoundSandboxLimit']:

            md5sum = get_md5sum(abspath, ignoreGzipTimestamp=True)

            doUpload = True
            for uf in uploadedFiles:
                if uf.md5sum == md5sum:
                    # the same file has been uploaded to the iocache
                    idx['remote'][os.path.basename(file)] = uf.id
                    doUpload = False
                    break

            if doUpload:

                logger.warning(
                    'The size of %s is larger than the sandbox limit (%d bytes). Please wait while pre-staging ...' % (file, config['BoundSandboxLimit']))

                if self.sandboxcache.upload([abspath]):
                    remote_sandbox = self.sandboxcache.get_cached_files()[-1]
                    idx['remote'][remote_sandbox.name] = remote_sandbox.id
                else:
                    logger.error(
                        'Oversized sandbox not successfully pre-staged')
                    return None
        else:
            idx['local'].append(abspath)

        return idx
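The docstring above describes the index dictionary returned by __check_and_prestage_inputfile__. The two possible shapes are illustrated below; the host name, paths and remote identifier are made-up values for illustration only.

# Small file (size <= config['BoundSandboxLimit']): shipped with the local sandbox.
idx_small = {'lfc_host': 'lfc.example.org',
             'local': ['/home/user/gangadir/input/small_input.txt'],
             'remote': {}}

# Oversized file: uploaded to the sandbox cache (or reused via an md5sum match)
# and referenced by the identifier returned by the cache, keyed by its basename.
idx_big = {'lfc_host': 'lfc.example.org',
           'local': [],
           'remote': {'big_input.tar.gz': 'remote-index-for-big_input.tar.gz'}}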
Ejemplo n.º 42
0
    def preparejob(self, jobconfig, master_job_sandbox):
        '''Prepare the JDL'''

        script = self.__jobWrapperTemplate__()

        job = self.getJobObject()
        inpw = job.getInputWorkspace()

        wrapperlog = '__jobscript__.log'

        import Ganga.Core.Sandbox as Sandbox

        # FIXME: check what happens if 'stdout','stderr' are specified here
        script = script.replace(
            '###OUTPUTSANDBOX###', repr(jobconfig.outputbox))

        script = script.replace(
            '###APPLICATION_NAME###', getName(job.application))
        script = script.replace(
            '###APPLICATIONEXEC###', repr(jobconfig.getExeString()))
        script = script.replace(
            '###APPLICATIONARGS###', repr(jobconfig.getArguments()))

        from Ganga.GPIDev.Lib.File.OutputFileManager import getWNCodeForOutputPostprocessing, getWNCodeForDownloadingInputFiles

        script = script.replace(
            '###OUTPUTUPLOADSPOSTPROCESSING###', getWNCodeForOutputPostprocessing(job, '    '))

        script = script.replace(
            '###DOWNLOADINPUTFILES###', getWNCodeForDownloadingInputFiles(job, '    '))

        if jobconfig.env:
            script = script.replace(
                '###APPLICATIONENVS###', repr(jobconfig.env))
        else:
            script = script.replace('###APPLICATIONENVS###', repr({}))

        script = script.replace('###WRAPPERLOG###', repr(wrapperlog))
        import inspect
        script = script.replace(
            '###INLINEMODULES###', inspect.getsource(Sandbox.WNSandbox))

        mon = job.getMonitoringService()

        self.monInfo = None

        # set the monitoring file by default to the stdout
        if isinstance(self.monInfo, dict):
            self.monInfo['remotefile'] = 'stdout'

        # try to print out the monitoring service information in debug mode
        try:
            logger.debug('job info of monitoring service: %s' %
                         str(self.monInfo))
        except:
            pass

#       prepare input/output sandboxes
        import Ganga.Utility.files
        from Ganga.GPIDev.Lib.File import File
        from Ganga.Core.Sandbox.WNSandbox import PYTHON_DIR
        import inspect

        fileutils = File( inspect.getsourcefile(Ganga.Utility.files), subdir=PYTHON_DIR )
        packed_files = jobconfig.getSandboxFiles() + [ fileutils ]
        sandbox_files = job.createPackedInputSandbox(packed_files)

        # sandbox of child jobs should include master's sandbox
        sandbox_files.extend(master_job_sandbox)

        # check the input file size and pre-upload larger inputs to the iocache
        lfc_host = ''

        input_sandbox_uris = []
        input_sandbox_names = []

        ick = True

        max_prestaged_fsize = 0
        for f in sandbox_files:

            idx = self.__check_and_prestage_inputfile__(f)

            if not idx:
                logger.error('input sandbox preparation failed: %s' % f)
                ick = False
                break
            else:

                if idx['lfc_host']:
                    lfc_host = idx['lfc_host']

                if idx['remote']:
                    abspath = os.path.abspath(f)
                    fsize = os.path.getsize(abspath)

                    if fsize > max_prestaged_fsize:
                        max_prestaged_fsize = fsize

                    input_sandbox_uris.append(
                        idx['remote'][os.path.basename(f)])

                    input_sandbox_names.append(
                        os.path.basename(urlparse(f)[2]))

                if idx['local']:
                    input_sandbox_uris += idx['local']
                    input_sandbox_names.append(os.path.basename(f))

        if not ick:
            logger.error('stop job submission')
            return None

        # determine the lcg-cp timeout according to the max_prestaged_fsize
        # - using the assumption of 1 MB/sec.
        transfer_timeout = config['SandboxTransferTimeout']
        predict_timeout = int(math.ceil(max_prestaged_fsize / 1000000.0))

        if predict_timeout > transfer_timeout:
            transfer_timeout = predict_timeout

        if transfer_timeout < 60:
            transfer_timeout = 60
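        # Worked example (illustrative numbers): a 500 MB prestaged sandbox gives
        # predict_timeout = ceil(500e6 / 1e6) = 500 s, which replaces a smaller
        # configured SandboxTransferTimeout; anything below 60 s is raised to 60 s.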

        script = script.replace(
            '###TRANSFERTIMEOUT###', '%d' % transfer_timeout)

        # update the job wrapper with the inputsandbox list
        script = script.replace(
            '###INPUTSANDBOX###', repr({'remote': {}, 'local': input_sandbox_names}))

        # write out the job wrapper and put job wrapper into job's inputsandbox
        scriptPath = inpw.writefile(
            FileBuffer('__jobscript_%s__' % job.getFQID('.'), script), executable=1)
        input_sandbox = input_sandbox_uris + [scriptPath]

        for isb in input_sandbox:
            logger.debug('ISB URI: %s' % isb)

        # compose output sandbox to include by default the following files:
        # - gzipped stdout (transferred only when the JobLogHandler is WMS)
        # - gzipped stderr (transferred only when the JobLogHandler is WMS)
        # - __jobscript__.log (job wrapper's log)
        output_sandbox = [wrapperlog]

        from Ganga.GPIDev.Lib.File.OutputFileManager import getOutputSandboxPatterns
        for outputSandboxPattern in getOutputSandboxPatterns(job):
            output_sandbox.append(outputSandboxPattern)

        if config['JobLogHandler'] in ['WMS']:
            output_sandbox += ['stdout.gz', 'stderr.gz']

        if len(jobconfig.outputbox):
            output_sandbox += [Sandbox.OUTPUT_TARBALL_NAME]

        # compose ARC XRSL
        xrsl = {
            #'VirtualOrganisation' : config['VirtualOrganisation'],
            'executable': os.path.basename(scriptPath),
            'environment': {'GANGA_LCG_VO': config['VirtualOrganisation'], 'GANGA_LOG_HANDLER': config['JobLogHandler'], 'LFC_HOST': lfc_host},
            #'stdout'                : 'stdout',
            #'stderr'                : 'stderr',
            'inputFiles': input_sandbox,
            'outputFiles': output_sandbox,
            #'OutputSandboxBaseDestURI': 'gsiftp://localhost'
        }

        xrsl['environment'].update({'GANGA_LCG_CE': self.CE})
        #xrsl['Requirements'] = self.requirements.merge(jobconfig.requirements).convert()

        # if self.jobtype.upper() in ['NORMAL','MPICH']:
        #xrsl['JobType'] = self.jobtype.upper()
        # if self.jobtype.upper() == 'MPICH':
        #xrsl['Requirements'].append('(other.GlueCEInfoTotalCPUs >= NodeNumber)')
        # xrsl['Requirements'].append('Member("MPICH",other.GlueHostApplicationSoftwareRunTimeEnvironment)')
        #xrsl['NodeNumber'] = self.requirements.nodenumber
        # else:
        #    logger.warning('JobType "%s" not supported' % self.jobtype)
        #    return

#       additional settings from the job
        if jobconfig.env:
            xrsl['environment'].update(jobconfig.env)

        xrslText = Grid.expandxrsl(xrsl)

        # append any additional requirements from the requirements object
        xrslText += '\n'.join(self.requirements.other)

        logger.debug('subjob XRSL: %s' % xrslText)
        return inpw.writefile(FileBuffer('__xrslfile__', xrslText))
Ejemplo n.º 43
0
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs'''

        import datetime

        ce_list = []  # type: List[str]
        jobdict = {}  # type: Mapping[str, Job]
        for j in jobs:
            if j.backend.id and (
                (datetime.datetime.utcnow() - j.time.timestamps["submitted"]
                 ).seconds > config["ArcWaitTimeBeforeStartingMonitoring"]):
                jobdict[j.backend.id] = j
                ce_list.append(j.backend.actualCE)

        if len(jobdict.keys()) == 0:
            return

        # Group jobs by the backend's credential requirements
        cred_to_backend_id_list = defaultdict(
            list)  # type: Mapping[ICredentialRequirement, List[str]]
        for jid, job in jobdict.items():
            cred_to_backend_id_list[
                job.backend.credential_requirements].append(jid)

        # Batch the status requests by credential requirement
        jobInfoDict = {}
        for cred_req, job_ids in cred_to_backend_id_list.items():
            # If the credential is not valid or doesn't exist then skip it
            cred = credential_store.get(cred_req)
            if not cred or not cred.is_valid():
                needed_credentials.add(cred_req)
                continue
            # Create a ``Grid`` for each credential requirement and request the relevant jobs through it
            info = Grid.arc_status(job_ids, ce_list, cred_req)
            jobInfoDict.update(info)

        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for id, info in jobInfoDict.items():

            if info:

                job = jobdict[id]

                if job.backend.actualCE != urlparse(id)[1].split(":")[0]:
                    job.backend.actualCE = urlparse(id)[1].split(":")[0]

                if job.backend.status != info['State']:

                    doStatusUpdate = True

                    # no need to update Ganga job status if backend status is
                    # not changed
                    if info['State'] == job.backend.status:
                        doStatusUpdate = False

                    # download output sandboxes if final status is reached
                    elif info['State'] in [
                            'Finished', '(FINISHED)', 'Finished (FINISHED)'
                    ]:

                        # grab output sandbox
                        if Grid.arc_get_output(
                                job.backend.id,
                                job.getOutputWorkspace(create=True).getPath(),
                                job.backend.credential_requirements):
                            (ick, app_exitcode) = Grid.__get_app_exitcode__(
                                job.getOutputWorkspace(create=True).getPath())
                            job.backend.exitcode = app_exitcode

                            jidListForPurge.append(job.backend.id)

                        else:
                            logger.error('failed to download job output: %s' %
                                         jobdict[id].getFQID('.'))

                    if doStatusUpdate:
                        job.backend.status = info['State']
                        if 'Exit Code' in info:
                            try:
                                job.backend.exitcode_arc = int(
                                    info['Exit Code'])
                            except:
                                job.backend.exitcode_arc = 1

                        if 'Job Error' in info:
                            try:
                                job.backend.reason = info['Job Error']
                            except:
                                pass

                        job.backend.updateGangaJobStatus()
            else:
                logger.warning('failed to retrieve job information: %s' %
                               jobdict[id].getFQID('.'))

        # purge the jobs whose output has been fetched locally
        if jidListForPurge:
            for cred_req, job_ids in cred_to_backend_id_list.items():
                if not Grid.arc_purge_multiple(
                        set(job_ids) & set(jidListForPurge), cred_req):
                    logger.warning("Failed to purge all ARC jobs.")
Ejemplo n.º 44
0
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs'''

        jobdict = dict([(job.backend.id, job) for job in jobs if job.backend.id])

        # Group jobs by the backend's credential requirements
        cred_to_backend_id_list = defaultdict(list)
        for job in jobs:
            cred_to_backend_id_list[job.backend.credential_requirements].append(job.backend.id)

        # Batch the status requests by credential requirement
        jobInfoDict = {}
        for cred_req, job_ids in cred_to_backend_id_list.items():
            # If the credential is not valid or doesn't exist then skip it
            cred = credential_store.get(cred_req)
            if not cred or not cred.is_valid():
                needed_credentials.add(cred_req)
                continue
            # Create a ``Grid`` for each credential requirement and request the relevant jobs through it
            info = Grid.cream_status(job_ids, cred_req)
            jobInfoDict.update(info)

        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for id, info in jobInfoDict.items():

            if info:

                job = jobdict[id]

                if job.backend.status != info['Current Status'] and ('ExitCode' not in info or ('ExitCode' in info and info['ExitCode'].isdigit())):

                    if 'Worker Node' in info:
                        job.backend.workernode = info['Worker Node']

                    if 'CREAM ISB URI' in info:
                        job.backend.isbURI = info['CREAM ISB URI']

                    if 'CREAM OSB URI' in info:
                        job.backend.osbURI = info['CREAM OSB URI']

                    doStatusUpdate = True

                    # no need to update Ganga job status if backend status is
                    # not changed
                    if info['Current Status'] == job.backend.status:
                        doStatusUpdate = False

                    # download output sandboxes if final status is reached
                    elif info['Current Status'] in ['DONE-OK', 'DONE-FAILED']:

                        # resolve output sandbox URIs based on the JDL
                        # information
                        osbURIList = __cream_resolveOSBList__(job, info['JDL'])

                        logger.debug('OSB list:')
                        for f in osbURIList:
                            logger.debug(f)

                        if osbURIList:

                            if Grid.cream_get_output(osbURIList, job.getOutputWorkspace(create=True).getPath(), job.backend.credential_requirements):
                                (ick, app_exitcode) = Grid.__get_app_exitcode__(
                                    job.getOutputWorkspace(create=True).getPath())
                                job.backend.exitcode = app_exitcode

                                jidListForPurge.append(job.backend.id)

                            else:
                                logger.error(
                                    'failed to download job output: %s' % jobdict[id].getFQID('.'))

                    if doStatusUpdate:
                        job.backend.status = info['Current Status']
                        if 'ExitCode' in info and info['ExitCode'] != "W":
                            try:
                                job.backend.exitcode_cream = int(
                                    info['ExitCode'])
                            except:
                                job.backend.exitcode_cream = 1

                        if 'FailureReason' in info:
                            try:
                                job.backend.reason = info['FailureReason']
                            except:
                                pass

                        job.backend.updateGangaJobStatus()
            else:
                logger.warning(
                    'failed to retrieve job information: %s' % jobdict[id].getFQID('.'))

        # purge the jobs whose output has been fetched locally
        if jidListForPurge:
            for cred_req, job_ids in cred_to_backend_id_list.items():
                Grid.cream_purge_multiple(set(job_ids) & set(jidListForPurge), cred_req)
Ejemplo n.º 45
0
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs'''

        jobdict = dict([[job.backend.id, job]
                        for job in jobs if job.backend.id])

        jobInfoDict = Grid.cream_status(jobdict.keys())

        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for id, info in jobInfoDict.items():

            if info:

                job = jobdict[id]

                if job.backend.status != info['Current Status'] and ('ExitCode' not in info or ('ExitCode' in info and info['ExitCode'].isdigit())):

                    if 'Worker Node' in info:
                        job.backend.workernode = info['Worker Node']

                    if 'CREAM ISB URI' in info:
                        job.backend.isbURI = info['CREAM ISB URI']

                    if 'CREAM OSB URI' in info:
                        job.backend.osbURI = info['CREAM OSB URI']

                    doStatusUpdate = True

                    # no need to update Ganga job status if backend status is
                    # not changed
                    if info['Current Status'] == job.backend.status:
                        doStatusUpdate = False

                    # download output sandboxes if final status is reached
                    elif info['Current Status'] in ['DONE-OK', 'DONE-FAILED']:

                        # resolve output sandbox URIs based on the JDL
                        # information
                        osbURIList = __cream_resolveOSBList__(job, info['JDL'])

                        logger.debug('OSB list:')
                        for f in osbURIList:
                            logger.debug(f)

                        if osbURIList:

                            if Grid.cream_get_output(osbURIList, job.getOutputWorkspace(create=True).getPath() ):
                                (ick, app_exitcode) = Grid.__get_app_exitcode__(
                                    job.getOutputWorkspace(create=True).getPath() )
                                job.backend.exitcode = app_exitcode

                                jidListForPurge.append(job.backend.id)

                            else:
                                logger.error(
                                    'failed to download job output: %s' % jobdict[id].getFQID('.'))

                    if doStatusUpdate:
                        job.backend.status = info['Current Status']
                        if 'ExitCode' in info and info['ExitCode'] != "W":
                            try:
                                job.backend.exitcode_cream = int(
                                    info['ExitCode'])
                            except:
                                job.backend.exitcode_cream = 1

                        if 'FailureReason' in info:
                            try:
                                job.backend.reason = info['FailureReason']
                            except:
                                pass

                        job.backend.updateGangaJobStatus()
            else:
                logger.warning(
                    'failed to retrieve job information: %s' % jobdict[id].getFQID('.'))

        # purge the jobs whose output has been fetched locally
        if jidListForPurge:
            Grid.cream_purgeMultiple(jidListForPurge)