Example #1
def kill_looping_job(job):
    """
    Kill the looping process.
    TODO: add allow_looping_job() exp. spec?

    :param job: job object.
    :return: None (the job object is updated in place).
    """

    # the child process is looping, kill it
    diagnostics = "pilot has decided to kill looping job %s at %s" % (
        job.jobid, time_stamp())
    logger.fatal(diagnostics)

    cmd = 'ps -fwu %s' % whoami()
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'ls -ltr %s' % job.workdir
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami()
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    cmd = 'pstree -g -a'
    exit_code, stdout, stderr = execute(cmd, mute=True)
    logger.info("%s: %s", cmd + '\n', stdout)

    # set the relevant error code
    if job.state == 'stagein':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.STAGEINTIMEOUT)
    elif job.state == 'stageout':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.STAGEOUTTIMEOUT)
    else:
        # most likely in the 'running' state, but use the catch-all 'else'
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            errors.LOOPINGJOB)
    set_pilot_state(job=job, state="failed")

    # remove any lingering input files from the work dir
    lfns, guids = job.get_lfns_and_guids()
    if lfns:
        ec = remove_files(job.workdir, lfns)
        if ec != 0:
            logger.warning('failed to remove all files')

    kill_processes(job.pid)
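A minimal usage sketch (not part of the source above): a monitoring loop could call kill_looping_job() once the job's work directory has gone untouched for longer than a limit. The modification-time check and the two-hour limit below are illustrative stand-ins for the pilot's real looping detection.

import os
import time

def check_looping(job, looping_limit=2 * 3600):
    # illustrative heuristic: treat a work dir with no recent updates as looping
    idle_seconds = time.time() - os.path.getmtime(job.workdir)
    if idle_seconds > looping_limit:
        kill_looping_job(job)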
Example #2
def publish_work_report(work_report=None,
                        worker_attributes_file="worker_attributes.json"):
    """
    Publish the work report to file.
    The work report dictionary should contain the fields defined in get_initial_work_report().

    :param work_report: work report dictionary.
    :param worker_attributes_file: name of the JSON file to write the report to.
    :return: None.
    """

    if work_report:
        work_report['timestamp'] = time_stamp()
        if "outputfiles" in work_report:
            del (work_report["outputfiles"])
        if "inputfiles" in work_report:
            del (work_report["inputfiles"])
        if "xml" in work_report:
            del (work_report["xml"])
        if write_json(worker_attributes_file, work_report):
            logger.info("work report published: {0}".format(work_report))
Example #3
def get_initial_work_report():
    """
    Prepare the work report dictionary.
    Note: the work_report should also contain all fields defined in parse_jobreport_data().

    :return: work report dictionary.
    """

    work_report = {
        'jobStatus': 'starting',
        'messageLevel': logging.getLevelName(logger.getEffectiveLevel()),
        'cpuConversionFactor': 1.0,
        'cpuConsumptionTime': '',
        'node': gethostname(),
        'workdir': '',
        'timestamp': time_stamp(),
        'endTime': '',
        'transExitCode': 0,
        'pilotErrorCode': 0,  # only add this in case of failure?
    }

    return work_report
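A short usage sketch (assumption, not from the source): the report is built once at startup and individual fields are refreshed as the payload progresses; the values written below are illustrative.

work_report = get_initial_work_report()
# illustrative updates made while the payload is running
work_report['jobStatus'] = 'running'
work_report['workdir'] = '/path/to/job/workdir'
work_report['cpuConsumptionTime'] = '120'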
Example #4
def publish_work_report(work_report=None,
                        worker_attributes_file="worker_attributes.json"):
    """
    Publish the work report to file.
    The work report dictionary should contain the fields defined in get_initial_work_report().

    :param work_report: work report dictionary.
    :param worker_attributes_file: name of the JSON file to write the report to.
    :raises FileHandlingFailure: in case of IOError.
    :return: True if the report was written, False otherwise.
    """

    if work_report:
        try:
            work_report['timestamp'] = time_stamp()
            if "outputfiles" in work_report:
                del (work_report["outputfiles"])
            if "inputfiles" in work_report:
                del (work_report["inputfiles"])
            if "xml" in work_report:
                del (work_report["xml"])
            if write_json(worker_attributes_file, work_report):
                logger.info("work report published: {0}".format(work_report))
                return True
            else:
                logger.error(
                    "work report publish failed: {0}".format(work_report))
                return False
        except IOError:
            logger.error("job report copy failed")
            return False
        except Exception as e:
            logger.error("write json file failed: {0}".format(e))
            return False
    else:
        # no work_report was provided, nothing to publish
        return False
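A hedged usage sketch for this variant, which signals success through its boolean return value; the final status and the warning handling are illustrative, not taken from the pilot code.

work_report = get_initial_work_report()
work_report['jobStatus'] = 'finished'  # illustrative final status
if not publish_work_report(work_report):
    # illustrative handling: the caller decides whether to retry or give up
    logger.warning('work report could not be published')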
Example #5
    def test_communicator_manager(self):
        """
        Make sure that es communicator manager thread works as expected.
        """
        communicator_manager = None
        try:
            args = {
                'workflow': 'eventservice_hpc',
                'queue': 'BNL_CLOUD_MCORE',
                'site': 'BNL_CLOUD_MCORE',
                'port': 25443,
                'url': 'https://aipanda007.cern.ch',
                'job_label': 'ptest',
                'pilot_user': '******',
                'node': socket.getfqdn(),
                'mem': 16000,
                'disk_space': 160000,
                'working_group': '',
                'cpu': 2601.0,
                'info': None
            }

            communicator_manager = CommunicationManager()
            communicator_manager.start()
            self.assertTrue(communicator_manager.is_alive())

            jobs = communicator_manager.get_jobs(njobs=2, args=args)
            self.assertEqual(len(jobs), 2)

            jobs = communicator_manager.get_jobs(njobs=1, args=args)
            self.assertEqual(len(jobs), 1)

            job_list = []
            for job in jobs:
                job_data = {
                    'node': socket.getfqdn(),
                    'pilotErrorCode': 0,
                    'startTime': time.time(),
                    'jobMetrics': 'coreCount=8',
                    'schedulerID': 'unknown',
                    'timestamp': time_stamp(),
                    'exeErrorCode': 0,
                    'pilotID': 'unknown|PR|2.0.0 (80)',
                    'transExitCode': 0,
                    'pilotErrorDiag': '',
                    'exeErrorDiag': ''
                }
                job_data['jobId'] = job['PandaID']
                job_data['siteName'] = 'BNL_CLOUD_MCORE'
                job_data['state'] = 'running'
                job_data['attemptNr'] = job['attemptNr'] + 1
                job_list.append(job_data)
            status = communicator_manager.update_jobs(jobs=job_list)
            self.assertEqual(status[0], True)

            events = communicator_manager.get_event_ranges(num_event_ranges=1,
                                                           job=jobs[0])
            self.assertEqual(len(events), 1)

            for event in events:
                event_range_status = {
                    "errorCode": 1220,
                    "eventRangeID": event['eventRangeID'],
                    "eventStatus": 'failed'
                }
                event_range_message = {
                    'version': 0,
                    'eventRanges': json.dumps(event_range_status)
                }
                res = communicator_manager.update_events(
                    update_events=event_range_message)
                self.assertEqual(res['StatusCode'], 0)

            events = communicator_manager.get_event_ranges(num_event_ranges=2,
                                                           job=jobs[0])
            self.assertEqual(len(events), 2)

            update_events = []
            for event in events:
                event_range = {
                    "eventRangeID": event['eventRangeID'],
                    "eventStatus": 'finished'
                }
                update_events.append(event_range)
            event_range_status = [{
                "zipFile": {
                    "numEvents": len(update_events),
                    "objstoreID": 1318,
                    "adler32": '000000',
                    "lfn": 'test_file',
                    "fsize": 100,
                    "pathConvention": 1000
                },
                "eventRanges": update_events
            }]

            event_range_message = {
                'version': 1,
                'eventRanges': json.dumps(event_range_status)
            }
            res = communicator_manager.update_events(
                update_events=event_range_message)
            self.assertEqual(res['StatusCode'], 0)

            communicator_manager.stop()
            time.sleep(2)
            self.assertFalse(communicator_manager.is_alive())
        except Exception:
            if communicator_manager:
                communicator_manager.stop()
            raise
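A minimal sketch of how a test like this is typically wired up, assuming the method belongs to a unittest.TestCase subclass; the class name below is illustrative.

import unittest


class TestESCommunicationManager(unittest.TestCase):  # illustrative name
    # the test_communicator_manager() method shown above goes here
    pass


if __name__ == '__main__':
    unittest.main()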