def _send_zmq_msg(job_id, command, data, address):
    """
    Send one request to the JobMonitor and return its decoded reply.

    A fresh ZMQ context and REQ socket are created for the call and are
    always released before returning, whether the exchange succeeds or
    raises.

    :param job_id: ID of the job the message is sent on behalf of.
    :param command: Command string the receiver dispatches on.
    :param data: Payload that accompanies the command.
    :param address: ZMQ address of the JobMonitor to connect to.

    :returns: The reply received from the JobMonitor, decoded with
              ``zloads``.
    """
    logger = logging.getLogger(__name__)
    context = zmq.Context()
    zsocket = context.socket(zmq.REQ)
    try:
        logger.debug('Connecting to JobMonitor (%s)', address)
        zsocket.connect(address)

        host_name = socket.gethostname()
        ip_address = socket.gethostbyname(host_name)

        msg_container = {
            "job_id": job_id,
            "host_name": host_name,
            "ip_address": ip_address,
            "command": command,
            "data": data,
        }

        # Send request
        logger.debug('Sending message: %s', msg_container)
        zsocket.send(zdumps(msg_container))

        # Get reply (blocks until the JobMonitor answers)
        return zloads(zsocket.recv())
    finally:
        # Release the socket and context explicitly instead of leaving it
        # to garbage collection. linger=0 drops any unsent message so that
        # context.term() cannot block after a failed exchange.
        zsocket.close(linger=0)
        context.term()
def check(self, session_id, jobs):
    """
    Serve input and output data to the given jobs until all are done.

    Runs a request/reply loop on ``self.socket``: every received message
    is decoded with ``zloads``, handled according to its ``"job_id"`` and
    ``"command"`` fields, and answered with exactly one reply encoded with
    ``zdumps``. The loop ends once ``self.all_jobs_done()`` returns true.

    :param session_id: DRMAA session id, stored on the monitor for
                       resubmissions.
    :param jobs: Jobs to monitor; each must expose an ``id`` attribute,
                 which is used to match incoming messages to jobs.
    """
    # save list of jobs
    self.jobs = jobs
    self.id_to_job = {job.id: job for job in self.jobs}

    # keep track of DRMAA session_id (for resubmissions)
    self.session_id = session_id

    # A child process running _heart_beat with job id -1 acts as the
    # "local beat"; CHECK_FREQUENCY determines in which interval to check
    # if jobs are alive.
    self.logger.debug('Starting local hearbeat')
    local_heart = multiprocessing.Process(target=_heart_beat,
                                          args=(-1, self.home_address, -1,
                                                "", CHECK_FREQUENCY))
    local_heart.start()
    try:
        self.logger.debug("Starting ZMQ event loop")
        # main loop
        while not self.all_jobs_done():
            self.logger.debug('Waiting for message')
            msg = zloads(self.socket.recv())
            self.logger.debug('Received message: %s', msg)
            return_msg = ""

            job_id = msg["job_id"]

            if job_id == -1:
                # local beat: run the liveness check
                self.check_if_alive()

                if msg["command"] == "get_jobs":
                    # serve list of jobs for display
                    return_msg = self.jobs
            elif job_id in self.id_to_job:
                # message is from a valid job, so process it
                return_msg = self._handle_job_message(
                    self.id_to_job[job_id], job_id, msg)
            else:
                # unknown job: report it and reply
                self.logger.error(('Received message from unknown job' +
                                   ' with ID %s. Known job IDs are: ' +
                                   '%s'),
                                  job_id,
                                  list(self.id_to_job.keys()))
                return_msg = 'thanks, but no thanks'

            # send back compressed response
            self.logger.debug('Sending reply: %s', return_msg)
            self.socket.send(zdumps(return_msg))
    finally:
        # Kill child processes that we don't need anymore, then reap the
        # child so it does not linger as a zombie. The timeout keeps
        # shutdown bounded should the child not exit promptly.
        local_heart.terminate()
        local_heart.join(timeout=5)


def _handle_job_message(self, job, job_id, msg):
    """
    Apply a message from a known job to ``job`` and return the reply.

    :param job: Job object registered under ``job_id``.
    :param job_id: ID the message was sent with (used for logging).
    :param msg: Decoded message dict; ``"command"`` selects the action.

    :returns: The object to send back to the requester.
    """
    command = msg["command"]
    return_msg = ""

    if command == "fetch_input":
        return_msg = job
        job.timestamp = datetime.now()
        self.logger.debug("Received input request from %s", job_id)
    elif command == "store_output":
        # be nice
        return_msg = "thanks"
        self._store_job_output(job, job_id, msg)
        job.timestamp = datetime.now()
    elif command == "heart_beat":
        job.heart_beat = msg["data"]

        # keep track of mem and cpu; read both values before appending so
        # the two tracks always stay the same length
        try:
            memory = job.heart_beat["memory"]
            cpu_load = job.heart_beat["cpu_load"]
        except (KeyError, ValueError, TypeError):
            # a malformed heart-beat must not bring down the monitor loop
            self.logger.error("Error decoding heart-beat", exc_info=True)
        else:
            job.track_mem.append(memory)
            job.track_cpu.append(cpu_load)
        return_msg = "all good"
        job.timestamp = datetime.now()

    if command == "get_job":
        # serve job for display
        return_msg = job
    else:
        # every other command updates the host name
        job.host_name = msg["host_name"]

    return return_msg


def _store_job_output(self, job, job_id, msg):
    """
    Copy the result carried by a ``store_output`` message onto ``job``.

    :param job: Job object whose ``ret`` (and possibly ``traceback``) is
                updated.
    :param job_id: ID the message was sent with (used for logging).
    :param msg: Decoded message dict; its ``"data"`` entry is the result.
    """
    data = msg["data"]

    if isinstance(data, Job):
        # a tmp job object was returned: copy relevant fields
        job.ret = data.ret
        job.traceback = data.traceback
        self.logger.info("Received output from %s", job_id)
    elif isinstance(data, tuple):
        # Returned exception instead of job, so store that
        job.ret, job.traceback = data
        self.logger.info("Received exception from %s", job_id)
    else:
        self.logger.error("Received message with invalid data: %s", msg)
        job.ret = data