def kill_looping_job(job):
    """
    Kill the looping process.

    TODO: add allow_looping_job() exp. spec?

    :param job: job object.
    :return: (updated job object.)
    """
    # the child process is looping, kill it
    diagnostics = "pilot has decided to kill looping job %s at %s" % (job.jobid, time_stamp())
    logger.fatal(diagnostics)

    # dump process table and work dir listing for post-mortem debugging;
    # the four commands share one execute-and-log pattern, so run them in a loop
    # (exit code and stderr are deliberately ignored - this is best-effort logging)
    cmds = ['ps -fwu %s' % whoami(),
            'ls -ltr %s' % (job.workdir),
            'ps -o pid,ppid,sid,pgid,tpgid,stat,comm -u %s' % whoami(),
            'pstree -g -a']
    for cmd in cmds:
        _, stdout, _ = execute(cmd, mute=True)
        logger.info("%s: %s", cmd + '\n', stdout)

    # set the relevant error code for the state the job was caught looping in
    if job.state == 'stagein':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINTIMEOUT)
    elif job.state == 'stageout':
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTTIMEOUT)
    else:
        # most likely in the 'running' state, but use the catch-all 'else'
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOOPINGJOB)
    set_pilot_state(job=job, state="failed")

    # remove any lingering input files from the work dir
    lfns, guids = job.get_lfns_and_guids()
    if lfns:
        ec = remove_files(job.workdir, lfns)
        if ec != 0:
            logger.warning('failed to remove all files')

    kill_processes(job.pid)
def publish_work_report(work_report=None, worker_attributes_file="worker_attributes.json"):
    """
    Publishing of work report to file.
    The work report dictionary should contain the fields defined in get_initial_work_report().

    :param work_report: work report dictionary.
    :param worker_attributes_file:
    :return:
    """
    if not work_report:
        return

    work_report['timestamp'] = time_stamp()

    # these payload fields must not be published in the attributes file
    for key in ("outputfiles", "inputfiles", "xml"):
        if key in work_report:
            del work_report[key]

    if write_json(worker_attributes_file, work_report):
        logger.info("work report published: {0}".format(work_report))
def get_initial_work_report():
    """
    Prepare the work report dictionary.

    Note: the work_report should also contain all fields defined in
    parse_jobreport_data().

    :return: work report dictionary.
    """
    # seed every field the report consumers expect; most values are
    # placeholders that get filled in as the job progresses
    return {
        'jobStatus': 'starting',
        'messageLevel': logging.getLevelName(logger.getEffectiveLevel()),
        'cpuConversionFactor': 1.0,
        'cpuConsumptionTime': '',
        'node': gethostname(),
        'workdir': '',
        'timestamp': time_stamp(),
        'endTime': '',
        'transExitCode': 0,
        'pilotErrorCode': 0,  # only add this in case of failure?
    }
def publish_work_report(work_report=None, worker_attributes_file="worker_attributes.json"):
    """
    Publishing of work report to file.
    The work report dictionary should contain the fields defined in get_initial_work_report().

    Note: all failures (including IOError) are caught, logged and reported via
    the return value; this function does not raise.

    :param work_report: work report dictionary.
    :param worker_attributes_file: output file name for the published report.
    :return: True if the report was written, False otherwise (empty report,
             write failure or unexpected error).
    """
    # no report -> nothing to publish
    if not work_report:
        return False

    try:
        work_report['timestamp'] = time_stamp()

        # strip payload fields that must not be published
        for key in ("outputfiles", "inputfiles", "xml"):
            work_report.pop(key, None)

        if write_json(worker_attributes_file, work_report):
            logger.info("work report published: {0}".format(work_report))
            return True

        logger.error("work report publish failed: {0}".format(work_report))
        return False
    except IOError:
        logger.error("job report copy failed")
        return False
    except Exception as e:
        logger.error("write json file failed: {0}".format(e))
        return False
def test_communicator_manager(self):
    """
    Make sure that es communicator manager thread works as expected.
    """
    # NOTE(review): this is an integration test - it talks to a live PanDA
    # endpoint (aipanda007.cern.ch) via CommunicationManager; it will fail
    # without network access and valid credentials.
    communicator_manager = None
    try:
        # minimal pilot/queue configuration needed by the communicator
        args = {'workflow': 'eventservice_hpc',
                'queue': 'BNL_CLOUD_MCORE',
                'site': 'BNL_CLOUD_MCORE',
                'port': 25443,
                'url': 'https://aipanda007.cern.ch',
                'job_label': 'ptest',
                'pilot_user': '******',
                'node': socket.getfqdn(),
                'mem': 16000,
                'disk_space': 160000,
                'working_group': '',
                'cpu': 2601.0,
                'info': None}

        # the manager runs as a thread; it must be alive after start()
        communicator_manager = CommunicationManager()
        communicator_manager.start()
        self.assertTrue(communicator_manager.is_alive())

        # job retrieval honours the requested job count
        jobs = communicator_manager.get_jobs(njobs=2, args=args)
        self.assertEqual(len(jobs), 2)

        jobs = communicator_manager.get_jobs(njobs=1, args=args)
        self.assertEqual(len(jobs), 1)

        # build a heartbeat/status update for each fetched job
        job_list = []
        for job in jobs:
            job_data = {'node': socket.getfqdn(),
                        'pilotErrorCode': 0,
                        'startTime': time.time(),
                        'jobMetrics': 'coreCount=8',
                        'schedulerID': 'unknown',
                        'timestamp': time_stamp(),
                        'exeErrorCode': 0,
                        'pilotID': 'unknown|PR|2.0.0 (80)',
                        'transExitCode': 0,
                        'pilotErrorDiag': '',
                        'exeErrorDiag': ''}
            job_data['jobId'] = job['PandaID']
            job_data['siteName'] = 'BNL_CLOUD_MCORE'
            job_data['state'] = 'running'
            job_data['attemptNr'] = job['attemptNr'] + 1
            job_list.append(job_data)
        status = communicator_manager.update_jobs(jobs=job_list)
        self.assertEqual(status[0], True)

        # fetch a single event range and report it back as failed (version 0
        # of the update-events message format)
        events = communicator_manager.get_event_ranges(num_event_ranges=1, job=jobs[0])
        self.assertEqual(len(events), 1)

        for event in events:
            event_range_status = {"errorCode": 1220, "eventRangeID": event['eventRangeID'], "eventStatus": 'failed'}
            event_range_message = {'version': 0, 'eventRanges': json.dumps(event_range_status)}
            res = communicator_manager.update_events(update_events=event_range_message)
            self.assertEqual(res['StatusCode'], 0)

        # fetch two more ranges and report them finished, bundled in a
        # zipFile record (version 1 of the message format)
        events = communicator_manager.get_event_ranges(num_event_ranges=2, job=jobs[0])
        self.assertEqual(len(events), 2)

        update_events = []
        for event in events:
            event_range = {"eventRangeID": event['eventRangeID'], "eventStatus": 'finished'}
            update_events.append(event_range)
        event_range_status = [{"zipFile": {"numEvents": len(update_events),
                                           "objstoreID": 1318,
                                           "adler32": '000000',
                                           "lfn": 'test_file',
                                           "fsize": 100,
                                           "pathConvention": 1000},
                               "eventRanges": update_events}]
        event_range_message = {'version': 1, 'eventRanges': json.dumps(event_range_status)}
        res = communicator_manager.update_events(update_events=event_range_message)
        self.assertEqual(res['StatusCode'], 0)

        # stop the manager and give the thread time to wind down before
        # asserting it is no longer alive
        communicator_manager.stop()
        time.sleep(2)
        self.assertFalse(communicator_manager.is_alive())
    except Exception as ex:
        # make sure the background thread is stopped even on failure,
        # then re-raise so the test still fails
        if communicator_manager:
            communicator_manager.stop()
        raise ex