def killtms():
    """Send the termination message to every known task manager.

    The task managers are obtained from ``load_tm_list()``. For each one the
    function opens the connection, sends this job's identifier, reads (and
    discards) the identifier sent back, and finally writes
    ``messaging.msg_terminate``.

    A failure on one task manager is logged and does not prevent the
    remaining ones from being contacted. The endpoint is closed whether or
    not the exchange succeeded.
    """
    logging.info('Killing task managers...')

    # Load the list of nodes to connect to
    tmlist = load_tm_list()

    for _name, tm in tmlist.items():
        try:
            logging.debug('Connecting to %s:%d...', tm.address, tm.port)
            tm.Open(jm_conn_timeout)

            # Send the job identifier
            tm.WriteString(jm_jobid)

            # Read back the job id of the answer (the value is not checked)
            tm.ReadString(jm_recv_timeout)

            tm.WriteInt64(messaging.msg_terminate)
        except Exception:
            # Problem talking to the task manager
            logging.warning('Error connecting to task manager at %s:%d!',
                tm.address, tm.port)
            log_lines(traceback.format_exc(), logging.debug)
        finally:
            # Close on the failure path too, so that a task manager that
            # stopped answering half-way does not leave the endpoint open.
            tm.Close()
def stats():
    """Log one memory statistics line tagged with the caller's location.

    Does nothing while the module-level ``_memstat_enabled`` flag is false.
    When ``_can_enable`` is false a warning is logged and the feature is
    switched off. Otherwise the line reports the caller's file name and line
    number, ``ru_maxrss`` from ``resource.getrusage`` (or ``my_getrss()``
    when that value is 0) and the three counters from ``gc.get_count()``.

    Any failure while collecting the data is logged and disables the
    feature permanently by clearing both flags; it is never propagated.
    """
    global _memstat_enabled, _can_enable

    if not _memstat_enabled:
        return

    if not _can_enable:
        logging.warning('could not enable memstat')
        _memstat_enabled = False
        return

    # Bind both names up front: the cleanup in ``finally`` must be able to
    # run even when the failure happens before a frame was obtained,
    # otherwise ``del`` raises UnboundLocalError and escapes to the caller.
    _frame = None
    frame = None
    try:
        s0, s1, s2 = gc.get_count()

        usage = resource.getrusage(resource.RUSAGE_SELF)
        kb = usage.ru_maxrss
        if kb == 0:
            kb = my_getrss()

        _frame = inspect.currentframe()
        frame = _frame.f_back
        fname = frame.f_code.co_filename
        fnum = frame.f_lineno

        logging.info('memstat:%s:%d: rss_kb: %d gb_stages: %d %d %d',
            fname, fnum, kb, s0, s1, s2)
    except Exception:
        log_lines(sys.exc_info(), logging.debug)
        log_lines(traceback.format_exc(), logging.debug)
        logging.warning('something went wrong with memstat, disabling')
        _can_enable = False
        _memstat_enabled = False
    finally:
        # Necessary to avoid cyclic references and leak memory!
        del frame
        del _frame
def listener(self):
    """Accept connections forever and hand each one to ``self.callback``.

    Every accepted connection is wrapped in a ``ClientEndpoint`` and
    processed in its own thread, which is started with the arguments
    ``(endpoint, addr, port) + self.user_args``. For UDS listeners the
    address is reported as ``'uds'`` and the port as ``0``.

    This method never returns. Errors raised while accepting or dispatching
    a connection are logged at debug level and followed by a 10 second
    pause before the next attempt.
    """
    if self.mode == config.mode_tcp:
        logging.info('Listening to network at %s:%d...',
            self.addr, self.port)
    elif self.mode == config.mode_uds:
        logging.info('Listening to file at %s...', self.addr)

    while True:
        try:
            conn, peer = self.socket.accept()

            # Assign the address from the connection
            if self.mode == config.mode_tcp:
                # TCP: IPv4 peers are (host, port) while IPv6 peers are
                # (host, port, flowinfo, scope_id); index instead of
                # unpacking so that both shapes are accepted.
                addr, port = peer[0], peer[1]
            elif self.mode == config.mode_uds:
                # UDS
                addr = 'uds'
                port = 0
            # Any other mode leaves addr/port unset; the resulting error is
            # reported by the handler below.

            # Create the endpoint and send to a thread to
            # process the request
            endpoint = ClientEndpoint(addr, port, conn)
            threading.Thread(target=self.callback,
                args=((endpoint, addr, port) + self.user_args)).start()
        except Exception:
            log_lines(sys.exc_info(), logging.debug)
            log_lines(traceback.format_exc(), logging.debug)
            # Back off so a persistent failure does not spin the loop
            time.sleep(10)
def setup_endpoint_for_pushing(e):
    """Open endpoint ``e`` and ask the task manager whether it accepts tasks.

    After connecting, the job identifier and ``messaging.msg_send_task``
    are written, the identifier sent back is compared with ``jm_jobid`` and
    the task manager's response is read.

    Returns:
        True when the response is ``messaging.msg_send_more``; the endpoint
        is left open so the caller can start pushing tasks.
        False when the job identifiers differ, the task manager is full,
        the response is unknown, or the exchange raised an error.
        None when the connection itself could not be opened.
        The endpoint is closed in every case except the True one.
    """
    try:
        # Try to connect to a task manager
        e.Open(jm_conn_timeout)
    except Exception:
        # Problem connecting to the task manager
        # Because this is a connection event,
        # make it a debug rather than a warning
        logging.debug('Error connecting to task manager at %s:%d!',
            e.address, e.port)
        log_lines(traceback.format_exc(), logging.debug)
        e.Close()
        # NOTE(review): this path yields None while every other failure
        # yields False. Kept as is because callers are not visible here.
        return

    try:
        # Send the job identifier
        e.WriteString(jm_jobid)

        # Ask if it is possible to send tasks
        e.WriteInt64(messaging.msg_send_task)

        # Verify job id of the answer
        jobid = e.ReadString(jm_recv_timeout)

        if jm_jobid != jobid:
            logging.error(
                'Job Id mismatch from %s:%d! Self: %s, task manager: %s!',
                e.address, e.port, jm_jobid, jobid)
            e.Close()
            return False

        # Wait for a response
        response = e.ReadInt64(jm_recv_timeout)

        if response == messaging.msg_send_full:
            # Task manager is full
            logging.debug('Task manager at %s:%d is full.',
                e.address, e.port)
        elif response == messaging.msg_send_more:
            # Continue to the task pushing loop
            return True
        else:
            # The task manager is not replying as expected
            logging.error('Unknown response from the task manager!')
    except Exception:
        # Problem talking to the task manager
        logging.warning('Error connecting to task manager at %s:%d!',
            e.address, e.port)
        log_lines(traceback.format_exc(), logging.debug)

    e.Close()
    return False
def heartbeat(finished):
    """Periodically send a heartbeat to every task manager.

    Iterates over ``infinite_tmlist_generator()``. For each task manager the
    connection is opened, the job identifier is exchanged and verified, and
    ``messaging.msg_send_heart`` is written. When the generator signals the
    end of a pass over the list (``isEnd``), the thread sleeps for whatever
    is left of ``jm_heartbeat_interval``.

    Args:
        finished: A sequence whose first element is checked at the start of
            every iteration; a true value makes the function return.
    """
    # time.clock() no longer exists since Python 3.8, and on Unix it
    # measured CPU time rather than elapsed time. Use a wall clock instead,
    # preferring the monotonic one where the interpreter provides it.
    now = getattr(time, 'monotonic', time.time)

    t_last = now()
    for isEnd, _name, tm in infinite_tmlist_generator():
        if finished[0]:
            logging.debug('Stopping heartbeat thread...')
            return

        if isEnd:
            elapsed = now() - t_last
            time.sleep(max(jm_heartbeat_interval - elapsed, 0))
            # Restart the measurement after the pause, so the time spent
            # sleeping is not counted as work in the next round.
            t_last = now()
            continue

        try:
            tm.Open(jm_heart_timeout)
        except Exception:
            # Problem connecting to the task manager
            # Because this is a connection event,
            # make it a debug rather than a warning
            logging.debug('Error connecting to task manager at %s:%d!',
                tm.address, tm.port)
            log_lines(traceback.format_exc(), logging.debug)
            tm.Close()
            continue

        try:
            # Send the job identifier
            tm.WriteString(jm_jobid)

            # Verify job id of the answer
            jobid = tm.ReadString(jm_recv_timeout)

            if jm_jobid != jobid:
                logging.error(
                    'Job Id mismatch from %s:%d! Self: %s, task manager: %s!',
                    tm.address, tm.port, jm_jobid, jobid)
                # The endpoint is closed by the finally clause below
                continue

            # Send the heartbeat
            tm.WriteInt64(messaging.msg_send_heart)
        except Exception:
            logging.warning('Error connecting to task manager at %s:%d!',
                tm.address, tm.port)
            log_lines(traceback.format_exc(), logging.debug)
        finally:
            tm.Close()
def setup_endpoint_for_pulling(e):
    """Open endpoint ``e`` and request the results of completed tasks.

    After connecting, the job identifier and ``messaging.msg_read_result``
    are written and the identifier sent back is compared with ``jm_jobid``.

    Returns:
        True when the identifiers match; the endpoint is left open so the
        caller can read the results.
        False when the identifiers differ or the exchange raised an error.
        None when the connection itself could not be opened.
        The endpoint is closed in every case except the True one.
    """
    try:
        # Try to connect to a task manager
        e.Open(jm_conn_timeout)
    except Exception:
        # Problem connecting to the task manager
        # Because this is a connection event,
        # make it a debug rather than a warning
        logging.debug('Error connecting to task manager at %s:%d!',
            e.address, e.port)
        log_lines(traceback.format_exc(), logging.debug)
        e.Close()
        # NOTE(review): this path yields None while every other failure
        # yields False. Kept as is because callers are not visible here.
        return

    try:
        # Send the job identifier
        e.WriteString(jm_jobid)

        # Ask for the results of the completed tasks
        e.WriteInt64(messaging.msg_read_result)

        # Verify job id of the answer
        jobid = e.ReadString(jm_recv_timeout)

        if jm_jobid != jobid:
            logging.error(
                'Job Id mismatch from %s:%d! Self: %s, task manager: %s!',
                e.address, e.port, jm_jobid, jobid)
            e.Close()
            return False

        return True
    except Exception:
        # Problem talking to the task manager
        logging.warning('Error connecting to task manager at %s:%d!',
            e.address, e.port)
        log_lines(traceback.format_exc(), logging.debug)

    e.Close()
    return False
def push_tasks(job, runid, jm, tm, taskid, task, tasklist, completed):
    """Push tasks to one task manager until it is full or generation ends.

    Args:
        job: Object providing ``spits_job_manager_next_task(jm, taskid)``,
            which returns ``(r1, newtask, ctx)``. ``r1 == 0`` means there
            are no more tasks, ``newtask[0]`` is the payload and ``ctx`` is
            expected to equal the requested task id.
        runid: Run identifier written along with every task.
        jm: Job manager handle forwarded to ``job``.
        tm: Open endpoint of the task manager receiving the tasks.
        taskid: Identifier of the last generated task.
        task: Payload still waiting to be sent, or None to generate a new
            one.
        tasklist: Mapping updated in place with ``taskid -> (0, payload)``
            for every generated task.
        completed: True when task generation has already finished.

    Returns:
        A tuple ``(finished, taskid, task, sent)``. ``sent`` lists the
        ``(taskid, payload)`` pairs acknowledged by the task manager during
        this call. When there is nothing left to generate the result is
        ``(True, 0, None, sent)``. Otherwise ``finished`` is False and
        ``taskid``/``task`` describe the last task; ``task`` is None when
        that task was acknowledged.
    """
    # Keep pushing until finished or the task manager is full
    sent = []

    while True:
        if task is None:
            # Avoid calling next_task after it's finished
            if completed:
                logging.debug('There are no new tasks to generate.')
                return (True, 0, None, sent)

            # Only get a task if the last one was already sent
            newtaskid = taskid + 1
            r1, newtask, ctx = job.spits_job_manager_next_task(jm, newtaskid)

            # Exit if done
            if r1 == 0:
                return (True, 0, None, sent)

            if newtask is None:
                logging.error('Task %d was not pushed!', newtaskid)
                return (False, taskid, task, sent)

            if ctx != newtaskid:
                logging.error('Context verification failed for task %d!',
                    newtaskid)
                return (False, taskid, task, sent)

            # Add the generated task to the tasklist
            taskid = newtaskid
            task = newtask[0]
            tasklist[taskid] = (0, task)

            logging.debug('Generated task %d with payload size of %d bytes.',
                taskid, len(task) if task is not None else 0)

        try:
            logging.debug('Pushing %d...', taskid)

            # Push the task to the active task manager
            tm.WriteInt64(taskid)
            tm.WriteInt64(runid)
            if task is None:
                # A generated task may carry no payload: send size 0 only
                tm.WriteInt64(0)
            else:
                tm.WriteInt64(len(task))
                tm.Write(task)

            # Wait for a response
            response = tm.ReadInt64(jm_recv_timeout)

            if response == messaging.msg_send_full:
                # Task was sent, but the task manager is now full
                sent.append((taskid, task))
                task = None
                break
            elif response == messaging.msg_send_more:
                # Continue pushing tasks
                sent.append((taskid, task))
                task = None
            elif response == messaging.msg_send_rjct:
                # Task was rejected by the task manager, this is not
                # predicted for a model where just one task manager
                # pushes tasks, exit the task loop
                logging.warning('Task manager at %s:%d rejected task %d',
                    tm.address, tm.port, taskid)
                break
            else:
                # The task manager is not replying as expected
                logging.error('Unknown response from the task manager!')
                break
        except Exception:
            # Something went wrong with the connection,
            # try with another task manager
            logging.error('Error pushing tasks to task manager!')
            log_lines(traceback.format_exc(), logging.debug)
            break

    return (False, taskid, task, sent)
def server_callback(conn, addr, port, job, tpool, cqueue, timeout):
    """Serve one incoming connection on the task manager side.

    The job identifier is exchanged and verified first, then one message
    type is read and handled:

    * ``msg_terminate``: the process exits immediately via ``os._exit(0)``.
    * ``msg_send_heart``: the heartbeat is only logged.
    * ``msg_send_task``: tasks are received and stored in ``tpool`` until
      it reports being full.
    * ``msg_read_result``: completed tasks are popped from ``cqueue`` and
      sent until the queue is empty.

    Args:
        conn: Endpoint of the accepted connection.
        addr: Peer address, used for logging.
        port: Peer port, used for logging.
        job: Not used by this function.
        tpool: Task pool providing ``Full()`` and ``Put(taskid, runid, task)``.
        cqueue: Queue of ``(taskid, runid, r, res)`` tuples for completed
            tasks.
        timeout: Object whose ``reset()`` is called once a message type has
            been read.

    Returns:
        False when the job identifiers do not match, None otherwise.
    """
    logging.debug('Connected to %s:%d.', addr, port)

    try:
        # Send the job identifier
        conn.WriteString(tm_jobid)

        # Verify job id of the answer
        jobid = conn.ReadString(tm_recv_timeout)

        if tm_jobid != jobid:
            logging.error('Job Id mismatch from %s:%d! Self: %s, task manager: %s!',
                conn.address, conn.port, tm_jobid, jobid)
            # The connection is closed by the finally clause below
            return False

        # Read the type of message
        mtype = conn.ReadInt64(tm_recv_timeout)

        timeout.reset()

        # Termination signal
        if mtype == messaging.msg_terminate:
            logging.info('Received a kill signal from %s:%d.',
                addr, port)
            # Hard exit: no cleanup handlers (including the finally clause
            # below) run after this call.
            os._exit(0)

        # Job manager is sending heartbeats
        if mtype == messaging.msg_send_heart:
            logging.debug('Received heartbeat from %s:%d', addr, port)

        # Job manager is trying to send tasks to the task manager
        elif mtype == messaging.msg_send_task:
            # Two phase pull: test-try-pull
            while not tpool.Full():
                # Task pool is not full, start asking for data
                conn.WriteInt64(messaging.msg_send_more)
                taskid = conn.ReadInt64(tm_recv_timeout)
                runid = conn.ReadInt64(tm_recv_timeout)
                tasksz = conn.ReadInt64(tm_recv_timeout)
                task = conn.Read(tasksz, tm_recv_timeout)
                logging.info('Received task %d from %s:%d.',
                    taskid, addr, port)

                # Try enqueue the received task
                if not tpool.Put(taskid, runid, task):
                    # For some reason the pool got full in between
                    # (shouldn't happen)
                    logging.warning('Rejecting task %d because ' +
                        'the pool is ful!', taskid)
                    conn.WriteInt64(messaging.msg_send_rjct)
                    # NOTE(review): the loop keeps going after a rejection
                    # and msg_send_full is still written afterwards; confirm
                    # that the sending side expects this.

            # Task pool is full, stop receiving tasks
            conn.WriteInt64(messaging.msg_send_full)

        # Job manager is querying the results of the completed tasks
        elif mtype == messaging.msg_read_result:
            taskid = None
            try:
                # Dequeue completed tasks until cqueue fires
                # an Empty exception
                while True:
                    # Pop the task
                    taskid, runid, r, res = cqueue.get_nowait()

                    logging.info('Sending task %d to committer %s:%d...',
                        taskid, addr, port)

                    # Send the task
                    conn.WriteInt64(taskid)
                    conn.WriteInt64(runid)
                    conn.WriteInt64(r)
                    if res is None:
                        conn.WriteInt64(0)
                    else:
                        conn.WriteInt64(len(res))
                        conn.Write(res)

                    # Wait for the confirmation that the task has
                    # been received by the other side. The argument is the
                    # receive timeout, as in every other read here; the
                    # message code is what the answer is compared against.
                    ans = conn.ReadInt64(tm_recv_timeout)
                    if ans != messaging.msg_read_result:
                        logging.warning('Unknown response received from ' +
                            '%s:%d while committing task!', addr, port)
                        raise messaging.MessagingError()

                    # Confirmed: nothing to put back if a later step fails
                    taskid = None

            except queue.Empty:
                # Finish the response
                conn.WriteInt64(messaging.msg_read_empty)

            except Exception:
                # Something went wrong while sending, put
                # the last task back in the queue
                if taskid is not None:
                    cqueue.put((taskid, runid, r, res))
                    logging.info('Task %d put back in the queue.', taskid)
                # Keep a trace of the failure instead of dropping it
                log_lines(traceback.format_exc(), logging.debug)

        # Unknown message received or a wrong sized packet could be trashing
        # the buffer, don't do anything
        else:
            logging.warning('Unknown message received \'%d\'!', mtype)

    except messaging.SocketClosed:
        logging.debug('Connection to %s:%d closed from the other side.',
            addr, port)

    except socket.timeout:
        logging.warning('Connection to %s:%d timed out!', addr, port)

    except Exception:
        logging.warning('Error occurred while reading request from %s:%d!',
            addr, port)
        log_lines(traceback.format_exc(), logging.debug)

    finally:
        conn.Close()
        logging.debug('Connection to %s:%d closed.', addr, port)