Example #1
0
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs.

        Queries the CREAM status of the given jobs in batches (one batch per
        credential requirement), copies the reported details onto each job's
        backend, fetches the output sandbox of jobs that reached ``DONE-OK`` or
        ``DONE-FAILED`` and finally purges the jobs whose output was fetched.

        Args:
            jobs: iterable of job objects. Jobs whose ``backend.id`` is empty
                are ignored since there is nothing to query for them.

        Returns:
            None. Results are written to ``job.backend``; credential
            requirements lacking a valid credential are added to
            ``needed_credentials``.
        '''

        # Only jobs that already have a backend id can be queried.
        jobdict = {job.backend.id: job for job in jobs if job.backend.id}

        # Group jobs by the backend's credential requirements. Built from
        # ``jobdict`` so that jobs without a backend id never end up in a
        # status request.
        cred_to_backend_id_list = defaultdict(list)
        for backend_id, job in jobdict.items():
            cred_to_backend_id_list[
                job.backend.credential_requirements].append(backend_id)

        # Batch the status requests by credential requirement
        jobInfoDict = {}
        for cred_req, job_ids in cred_to_backend_id_list.items():
            # If the credential is not valid or doesn't exist then skip it
            cred = credential_store.get(cred_req)
            if not cred or not cred.is_valid():
                needed_credentials.add(cred_req)
                continue
            # Request the status of the relevant jobs with this credential
            jobInfoDict.update(Grid.cream_status(job_ids, cred_req))

        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for job_id, info in jobInfoDict.items():

            job = jobdict.get(job_id)
            if job is None:
                # The status call reported an id we did not ask about; there
                # is no job object to update, so just report it.
                logger.warning(
                    'ignoring status of unknown job id: %s' % job_id)
                continue

            if not info:
                logger.warning('fail to retrieve job informaton: %s' %
                               job.getFQID('.'))
                continue

            new_status = info['Current Status']

            # no need to update anything if backend status is not changed
            if new_status == job.backend.status:
                continue

            # An exit code that is present but not numeric is not processed;
            # the job is looked at again on the next monitoring pass.
            if 'ExitCode' in info and not info['ExitCode'].isdigit():
                continue

            if 'Worker Node' in info:
                job.backend.workernode = info['Worker Node']

            if 'CREAM ISB URI' in info:
                job.backend.isbURI = info['CREAM ISB URI']

            if 'CREAM OSB URI' in info:
                job.backend.osbURI = info['CREAM OSB URI']

            # download output sandboxes if final status is reached
            if new_status in ('DONE-OK', 'DONE-FAILED'):

                # resolve output sandbox URIs based on the JDL information
                osbURIList = __cream_resolveOSBList__(job, info['JDL'])

                logger.debug('OSB list:')
                for uri in osbURIList:
                    logger.debug(uri)

                if osbURIList:
                    output_dir = job.getOutputWorkspace(create=True).getPath()

                    if Grid.cream_get_output(
                            osbURIList, output_dir,
                            job.backend.credential_requirements):
                        (ick, app_exitcode) = Grid.__get_app_exitcode__(
                            job.getOutputWorkspace(create=True).getPath())
                        job.backend.exitcode = app_exitcode

                        jidListForPurge.append(job.backend.id)

                    else:
                        logger.error('fail to download job output: %s' %
                                     job.getFQID('.'))

            job.backend.status = new_status

            if 'ExitCode' in info and info['ExitCode'] != "W":
                try:
                    job.backend.exitcode_cream = int(info['ExitCode'])
                except Exception:
                    # Best effort: fall back to a generic failure code
                    job.backend.exitcode_cream = 1

            if 'FailureReason' in info:
                try:
                    job.backend.reason = info['FailureReason']
                except Exception as err:
                    # Best effort: the reason is informational only, so a
                    # failure to store it must not stop the status update.
                    logger.debug('cannot store failure reason of %s: %s' %
                                 (job.getFQID('.'), err))

            job.backend.updateGangaJobStatus()

        # purging the jobs the output has been fetched locally
        if jidListForPurge:
            purge_ids = set(jidListForPurge)
            for cred_req, job_ids in cred_to_backend_id_list.items():
                ids_for_cred = purge_ids.intersection(job_ids)
                # Skip credential groups with nothing to purge
                if ids_for_cred:
                    Grid.cream_purge_multiple(ids_for_cred, cred_req)
Example #2
0
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs.

        Queries the ARC status of the given jobs in batches (one batch per
        credential requirement), copies the reported details onto each job's
        backend, fetches the output of jobs that reached a finished state and
        finally purges the jobs whose output was fetched.

        Args:
            jobs: iterable of job objects. A job is only monitored when it has
                a ``backend.id`` and more than
                ``config["ArcWaitTimeBeforeStartingMonitoring"]`` seconds have
                passed since its ``"submitted"`` timestamp.

        Returns:
            None. Results are written to ``job.backend``; credential
            requirements lacking a valid credential are added to
            ``needed_credentials``.
        '''

        import datetime

        ce_list = []  # type: List[str]
        jobdict = {}  # type: Mapping[str, Job]
        now = datetime.datetime.utcnow()
        for j in jobs:
            if not j.backend.id:
                continue
            # total_seconds() is required here: ``timedelta.seconds`` drops
            # the ``days`` part, so a job submitted over a day ago could look
            # freshly submitted and would wrongly be skipped.
            elapsed = (now - j.time.timestamps["submitted"]).total_seconds()
            if elapsed > config["ArcWaitTimeBeforeStartingMonitoring"]:
                jobdict[j.backend.id] = j
                ce_list.append(j.backend.actualCE)

        if not jobdict:
            return

        # Group jobs by the backend's credential requirements
        cred_to_backend_id_list = defaultdict(list)  # type: Mapping[ICredentialRequirement, List[str]]
        for jid, job in jobdict.items():
            cred_to_backend_id_list[job.backend.credential_requirements].append(jid)

        # Batch the status requests by credential requirement
        jobInfoDict = {}
        for cred_req, job_ids in cred_to_backend_id_list.items():
            # If the credential is not valid or doesn't exist then skip it
            cred = credential_store.get(cred_req)
            if not cred or not cred.is_valid():
                needed_credentials.add(cred_req)
                continue
            # Request the status of the relevant jobs with this credential
            jobInfoDict.update(Grid.arc_status(job_ids, ce_list, cred_req))

        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for job_id, info in jobInfoDict.items():

            job = jobdict.get(job_id)
            if job is None:
                # The status call reported an id we did not ask about; there
                # is no job object to update, so just report it.
                logger.warning('ignoring status of unknown job id: %s' % job_id)
                continue

            if not info:
                logger.warning(
                    'fail to retrieve job informaton: %s' % job.getFQID('.'))
                continue

            # Keep actualCE in sync with the host part of the job id URL
            ce_host = urlparse(job_id)[1].split(":")[0]
            if job.backend.actualCE != ce_host:
                job.backend.actualCE = ce_host

            new_state = info['State']

            # no need to update Ganga job status if backend status is
            # not changed
            if new_state == job.backend.status:
                continue

            # download output sandboxes if final status is reached
            if new_state in ('Finished', '(FINISHED)', 'Finished (FINISHED)'):

                # grab output sandbox
                if Grid.arc_get_output(job.backend.id, job.getOutputWorkspace(create=True).getPath(), job.backend.credential_requirements):
                    (ick, app_exitcode) = Grid.__get_app_exitcode__(
                        job.getOutputWorkspace(create=True).getPath())
                    job.backend.exitcode = app_exitcode

                    jidListForPurge.append(job.backend.id)

                else:
                    logger.error(
                        'fail to download job output: %s' % job.getFQID('.'))

            job.backend.status = new_state

            if 'Exit Code' in info:
                try:
                    job.backend.exitcode_arc = int(info['Exit Code'])
                except Exception:
                    # Best effort: fall back to a generic failure code
                    job.backend.exitcode_arc = 1

            if 'Job Error' in info:
                try:
                    job.backend.reason = info['Job Error']
                except Exception as err:
                    # Best effort: the reason is informational only, so a
                    # failure to store it must not stop the status update.
                    logger.debug('cannot store job error of %s: %s' % (job.getFQID('.'), err))

            job.backend.updateGangaJobStatus()

        # purging the jobs the output has been fetched locally
        if jidListForPurge:
            purge_ids = set(jidListForPurge)
            for cred_req, job_ids in cred_to_backend_id_list.items():
                ids_for_cred = purge_ids.intersection(job_ids)
                # Skip credential groups with nothing to purge; this also
                # avoids a misleading failure warning for an empty request.
                if not ids_for_cred:
                    continue
                if not Grid.arc_purge_multiple(ids_for_cred, cred_req):
                    logger.warning("Failed to purge all ARC jobs.")
Example #3
0
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs.

        Polls ARC for the state of the given jobs, one status request per
        credential requirement. For every job whose state changed, the backend
        attributes are refreshed, the output is downloaded once a finished
        state is reported, and jobs with downloaded output are purged at the
        end.

        Args:
            jobs: iterable of job objects. A job is only monitored when it has
                a ``backend.id`` and more than
                ``config["ArcWaitTimeBeforeStartingMonitoring"]`` seconds have
                passed since its ``"submitted"`` timestamp.

        Returns:
            None. Results are written to ``job.backend``; credential
            requirements lacking a valid credential are added to
            ``needed_credentials``.
        '''

        import datetime

        ce_list = []  # type: List[str]
        jobdict = {}  # type: Mapping[str, Job]
        now = datetime.datetime.utcnow()
        for j in jobs:
            if not j.backend.id:
                continue
            # Use total_seconds(): ``timedelta.seconds`` only holds the
            # sub-day remainder, which would make jobs older than a day
            # appear to be only seconds old and exclude them from monitoring.
            age = (now - j.time.timestamps["submitted"]).total_seconds()
            if age > config["ArcWaitTimeBeforeStartingMonitoring"]:
                jobdict[j.backend.id] = j
                ce_list.append(j.backend.actualCE)

        if not jobdict:
            return

        # Group jobs by the backend's credential requirements
        cred_to_backend_id_list = defaultdict(
            list)  # type: Mapping[ICredentialRequirement, List[str]]
        for jid, job in jobdict.items():
            cred_to_backend_id_list[
                job.backend.credential_requirements].append(jid)

        # Batch the status requests by credential requirement
        jobInfoDict = {}
        for cred_req, job_ids in cred_to_backend_id_list.items():
            # If the credential is not valid or doesn't exist then skip it
            cred = credential_store.get(cred_req)
            if not cred or not cred.is_valid():
                needed_credentials.add(cred_req)
                continue
            # Request the status of the relevant jobs with this credential
            jobInfoDict.update(Grid.arc_status(job_ids, ce_list, cred_req))

        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for job_id, info in jobInfoDict.items():

            job = jobdict.get(job_id)
            if job is None:
                # A status came back for an id that was not requested; no
                # job object exists for it, so report and move on.
                logger.warning('ignoring status of unknown job id: %s' %
                               job_id)
                continue

            if not info:
                logger.warning('fail to retrieve job informaton: %s' %
                               job.getFQID('.'))
                continue

            # Keep actualCE in sync with the host part of the job id URL
            ce_host = urlparse(job_id)[1].split(":")[0]
            if job.backend.actualCE != ce_host:
                job.backend.actualCE = ce_host

            new_state = info['State']

            # no need to update Ganga job status if backend status is
            # not changed
            if new_state == job.backend.status:
                continue

            # download output sandboxes if final status is reached
            if new_state in ('Finished', '(FINISHED)',
                             'Finished (FINISHED)'):

                # grab output sandbox
                if Grid.arc_get_output(
                        job.backend.id,
                        job.getOutputWorkspace(create=True).getPath(),
                        job.backend.credential_requirements):
                    (ick, app_exitcode) = Grid.__get_app_exitcode__(
                        job.getOutputWorkspace(create=True).getPath())
                    job.backend.exitcode = app_exitcode

                    jidListForPurge.append(job.backend.id)

                else:
                    logger.error('fail to download job output: %s' %
                                 job.getFQID('.'))

            job.backend.status = new_state

            if 'Exit Code' in info:
                try:
                    job.backend.exitcode_arc = int(info['Exit Code'])
                except Exception:
                    # Best effort: fall back to a generic failure code
                    job.backend.exitcode_arc = 1

            if 'Job Error' in info:
                try:
                    job.backend.reason = info['Job Error']
                except Exception as err:
                    # Best effort: the reason is informational only, so a
                    # failure to store it must not stop the status update.
                    logger.debug('cannot store job error of %s: %s' %
                                 (job.getFQID('.'), err))

            job.backend.updateGangaJobStatus()

        # purging the jobs the output has been fetched locally
        if jidListForPurge:
            purge_ids = set(jidListForPurge)
            for cred_req, job_ids in cred_to_backend_id_list.items():
                ids_for_cred = purge_ids.intersection(job_ids)
                # Skip credential groups with nothing to purge; this also
                # avoids a misleading failure warning for an empty request.
                if not ids_for_cred:
                    continue
                if not Grid.arc_purge_multiple(ids_for_cred, cred_req):
                    logger.warning("Failed to purge all ARC jobs.")
Example #4
0
    def updateMonitoringInformation(jobs):
        '''Monitoring loop for normal jobs.

        Polls CREAM for the status of the given jobs, one status request per
        credential requirement. For every job whose status changed, the
        backend attributes are refreshed, the output sandbox is downloaded
        once ``DONE-OK`` or ``DONE-FAILED`` is reported, and jobs with
        downloaded output are purged at the end.

        Args:
            jobs: iterable of job objects. Jobs whose ``backend.id`` is empty
                are ignored since there is nothing to query for them.

        Returns:
            None. Results are written to ``job.backend``; credential
            requirements lacking a valid credential are added to
            ``needed_credentials``.
        '''

        # Only jobs that already have a backend id can be queried.
        jobdict = {job.backend.id: job for job in jobs if job.backend.id}

        # Group jobs by the backend's credential requirements. Built from
        # ``jobdict`` so that jobs without a backend id never end up in a
        # status request.
        cred_to_backend_id_list = defaultdict(list)
        for backend_id, job in jobdict.items():
            cred_to_backend_id_list[job.backend.credential_requirements].append(backend_id)

        # Batch the status requests by credential requirement
        jobInfoDict = {}
        for cred_req, job_ids in cred_to_backend_id_list.items():
            # If the credential is not valid or doesn't exist then skip it
            cred = credential_store.get(cred_req)
            if not cred or not cred.is_valid():
                needed_credentials.add(cred_req)
                continue
            # Request the status of the relevant jobs with this credential
            jobInfoDict.update(Grid.cream_status(job_ids, cred_req))

        jidListForPurge = []

        # update job information for those available in jobInfoDict
        for job_id, info in jobInfoDict.items():

            job = jobdict.get(job_id)
            if job is None:
                # A status came back for an id that was not requested; no
                # job object exists for it, so report and move on.
                logger.warning('ignoring status of unknown job id: %s' % job_id)
                continue

            if not info:
                logger.warning(
                    'fail to retrieve job informaton: %s' % job.getFQID('.'))
                continue

            new_status = info['Current Status']

            # no need to update anything if backend status is not changed
            if new_status == job.backend.status:
                continue

            # An exit code that is present but not numeric is not processed;
            # the job is looked at again on the next monitoring pass.
            if 'ExitCode' in info and not info['ExitCode'].isdigit():
                continue

            if 'Worker Node' in info:
                job.backend.workernode = info['Worker Node']

            if 'CREAM ISB URI' in info:
                job.backend.isbURI = info['CREAM ISB URI']

            if 'CREAM OSB URI' in info:
                job.backend.osbURI = info['CREAM OSB URI']

            # download output sandboxes if final status is reached
            if new_status in ('DONE-OK', 'DONE-FAILED'):

                # resolve output sandbox URIs based on the JDL information
                osbURIList = __cream_resolveOSBList__(job, info['JDL'])

                logger.debug('OSB list:')
                for uri in osbURIList:
                    logger.debug(uri)

                if osbURIList:

                    if Grid.cream_get_output(osbURIList, job.getOutputWorkspace(create=True).getPath(), job.backend.credential_requirements):
                        (ick, app_exitcode) = Grid.__get_app_exitcode__(
                            job.getOutputWorkspace(create=True).getPath())
                        job.backend.exitcode = app_exitcode

                        jidListForPurge.append(job.backend.id)

                    else:
                        logger.error(
                            'fail to download job output: %s' % job.getFQID('.'))

            job.backend.status = new_status

            if 'ExitCode' in info and info['ExitCode'] != "W":
                try:
                    job.backend.exitcode_cream = int(info['ExitCode'])
                except Exception:
                    # Best effort: fall back to a generic failure code
                    job.backend.exitcode_cream = 1

            if 'FailureReason' in info:
                try:
                    job.backend.reason = info['FailureReason']
                except Exception as err:
                    # Best effort: the reason is informational only, so a
                    # failure to store it must not stop the status update.
                    logger.debug('cannot store failure reason of %s: %s' % (job.getFQID('.'), err))

            job.backend.updateGangaJobStatus()

        # purging the jobs the output has been fetched locally
        if jidListForPurge:
            purge_ids = set(jidListForPurge)
            for cred_req, job_ids in cred_to_backend_id_list.items():
                ids_for_cred = purge_ids.intersection(job_ids)
                # Skip credential groups with nothing to purge
                if ids_for_cred:
                    Grid.cream_purge_multiple(ids_for_cred, cred_req)