Example #1
0
def killtms():
    """Ask every known task manager to terminate.

    The list of task managers comes from ``load_tm_list()``. For each entry
    the endpoint is opened, the job identifier is exchanged and
    ``messaging.msg_terminate`` is sent. A failure with one task manager is
    logged and does not prevent the remaining ones from being contacted.
    """
    logging.info('Killing task managers...')

    # Load the list of nodes to connect to
    tmlist = load_tm_list()

    for _name, tm in tmlist.items():
        try:
            logging.debug('Connecting to %s:%d...', tm.address, tm.port)

            tm.Open(jm_conn_timeout)

            # Send the job identifier
            tm.WriteString(jm_jobid)

            # Read back the job id of the answer (the value itself is not
            # checked here, it is only consumed from the stream)
            tm.ReadString(jm_recv_timeout)

            tm.WriteInt64(messaging.msg_terminate)
        except Exception:
            # Problem connecting to the task manager
            logging.warning('Error connecting to task manager at %s:%d!',
                            tm.address, tm.port)
            log_lines(traceback.format_exc(), logging.debug)
        finally:
            # Release the endpoint on every path; previously it was only
            # closed when the whole exchange succeeded. The other connection
            # helpers in this file also call Close() after a failed Open().
            tm.Close()
Example #2
0
def stats():
    """Log the process RSS and the garbage collector counters.

    The log line is tagged with the file name and line number of the caller
    of this function. Nothing happens when memstat is disabled. If collecting
    the figures fails, the failure is logged and memstat disables itself via
    the module-level ``_memstat_enabled`` / ``_can_enable`` flags.
    """
    global _memstat_enabled, _can_enable
    if not _memstat_enabled:
        return
    if not _can_enable:
        logging.warning('could not enable memstat')
        _memstat_enabled = False
        return

    # Bind both names before entering the try block: the cleanup in
    # ``finally`` deletes them, and would raise UnboundLocalError (escaping
    # the error handling below) if an earlier statement failed first.
    frame = _frame = None
    try:
        s0, s1, s2 = gc.get_count()
        usage = resource.getrusage(resource.RUSAGE_SELF)
        kb = usage.ru_maxrss
        if kb == 0:
            # Fall back to the project helper when getrusage reports zero
            kb = my_getrss()
        _frame = inspect.currentframe()
        frame = _frame.f_back
        fname = frame.f_code.co_filename
        fnum = frame.f_lineno
        # Formatted eagerly on purpose: a formatting error must land in the
        # except clause below so that memstat gets disabled.
        logging.info('memstat:%s:%d: rss_kb: %d gb_stages: %d %d %d' %
                     (fname, fnum, kb, s0, s1, s2))
    except Exception:
        log_lines(sys.exc_info(), logging.debug)
        log_lines(traceback.format_exc(), logging.debug)
        logging.warning('something went wrong with memstat, disabling')
        _can_enable = False
        _memstat_enabled = False
    finally:
        # Necessary to avoid cyclic references and leak memory!
        del frame
        del _frame
Example #3
0
    def listener(self):
        """Accept connections forever, handing each one to a new thread.

        Every accepted connection is wrapped in a ``ClientEndpoint`` and
        ``self.callback`` is started in its own thread with the arguments
        ``(endpoint, addr, port)`` followed by ``self.user_args``. For UDS
        mode the address is reported as ``'uds'`` and the port as ``0``.

        Errors are logged at debug level and the loop resumes after a
        10 second pause. This method never returns normally.
        """
        if self.mode == config.mode_tcp:
            logging.info('Listening to network at %s:%d...',
                self.addr, self.port)
        elif self.mode == config.mode_uds:
            logging.info('Listening to file at %s...',
                self.addr)
        while True:
            # Tracks a connection that was accepted but not yet handed over
            conn = None
            try:
                conn, addr = self.socket.accept()

                # Assign the address from the connection
                if self.mode == config.mode_tcp:
                    # TCP
                    addr, port = addr
                elif self.mode == config.mode_uds:
                    # UDS
                    addr = 'uds'
                    port = 0

                # Create the endpoint and send to a thread to
                # process the request
                endpoint = ClientEndpoint(addr, port, conn)
                threading.Thread(target=self.callback,
                    args=((endpoint, addr, port) + self.user_args)).start()
            except Exception:
                # Only ordinary errors are absorbed; KeyboardInterrupt and
                # SystemExit are allowed to stop the loop.
                log_lines(sys.exc_info(), logging.debug)
                log_lines(traceback.format_exc(), logging.debug)
                if conn is not None:
                    # No worker thread owns this connection, so nobody else
                    # would ever close it.
                    # NOTE(review): assumes the accepted object exposes
                    # close() like a standard socket - confirm.
                    try:
                        conn.close()
                    except Exception:
                        log_lines(traceback.format_exc(), logging.debug)
                time.sleep(10)
Example #4
0
def setup_endpoint_for_pushing(e):
    """Open endpoint `e` and ask the task manager whether it accepts tasks.

    The job identifier is exchanged and ``messaging.msg_send_task`` is sent.

    Returns:
        True when the task manager answered ``messaging.msg_send_more``; the
        endpoint is left open so the caller can start pushing tasks.
        False in every other case (connection failure, job id mismatch, task
        manager full, unexpected answer, I/O error); the endpoint is closed.
    """
    try:
        # Try to connect to a task manager
        e.Open(jm_conn_timeout)
    except Exception:
        # Problem connecting to the task manager
        # Because this is a connection event,
        # make it a debug rather than a warning
        logging.debug('Error connecting to task manager at %s:%d!', e.address,
                      e.port)
        log_lines(traceback.format_exc(), logging.debug)
        e.Close()
        # Explicit False, consistent with every other failure path (this
        # branch used to fall through and return None)
        return False

    try:
        # Send the job identifier
        e.WriteString(jm_jobid)

        # Ask if it is possible to send tasks
        e.WriteInt64(messaging.msg_send_task)

        # Verify job id of the answer
        jobid = e.ReadString(jm_recv_timeout)

        if jm_jobid != jobid:
            logging.error(
                'Job Id mismatch from %s:%d! Self: %s, task manager: %s!',
                e.address, e.port, jm_jobid, jobid)
        else:
            # Wait for a response
            response = e.ReadInt64(jm_recv_timeout)

            if response == messaging.msg_send_more:
                # Continue to the task pushing loop, keeping `e` open
                return True

            if response == messaging.msg_send_full:
                # Task manager is full
                logging.debug('Task manager at %s:%d is full.',
                              e.address, e.port)
            else:
                # The task manager is not replying as expected
                logging.error('Unknown response from the task manager!')

    except Exception:
        # Problem communicating with the task manager
        logging.warning('Error connecting to task manager at %s:%d!',
                        e.address, e.port)
        log_lines(traceback.format_exc(), logging.debug)

    # Every path that reaches this point is a failure
    e.Close()
    return False
Example #5
0
def heartbeat(finished):
    """Send heartbeat messages to all task managers until told to stop.

    Iterates over ``infinite_tmlist_generator()``. For each task manager the
    endpoint is opened, the job identifier is exchanged and
    ``messaging.msg_send_heart`` is sent. When the generator signals the end
    of a pass (``isEnd``), the function sleeps for the remainder of
    ``jm_heartbeat_interval``.

    Args:
        finished: indexable object whose element 0 is checked on every
            iteration; the function returns as soon as it is truthy.
    """
    # time.clock() was removed in Python 3.8 and measured CPU time on Unix;
    # a monotonic wall clock is what pacing the passes requires.
    t_last = time.monotonic()
    for isEnd, _name, tm in infinite_tmlist_generator():
        if finished[0]:
            logging.debug('Stopping heartbeat thread...')
            return

        if isEnd:
            elapsed = time.monotonic() - t_last
            time.sleep(max(jm_heartbeat_interval - elapsed, 0))
            # Restart the measurement after the sleep so that the sleep
            # itself is not counted as work done in the next pass.
            t_last = time.monotonic()
            continue

        try:
            tm.Open(jm_heart_timeout)
        except Exception:
            # Problem connecting to the task manager
            # Because this is a connection event,
            # make it a debug rather than a warning
            logging.debug('Error connecting to task manager at %s:%d!',
                          tm.address, tm.port)
            log_lines(traceback.format_exc(), logging.debug)
            tm.Close()
            continue

        try:
            # Send the job identifier
            tm.WriteString(jm_jobid)

            # Verify job id of the answer
            jobid = tm.ReadString(jm_recv_timeout)

            if jm_jobid != jobid:
                logging.error(
                    'Job Id mismatch from %s:%d! Self: %s, task manager: %s!',
                    tm.address, tm.port, jm_jobid, jobid)
                # The finally clause closes the endpoint exactly once
                continue

            # Send the heartbeat
            tm.WriteInt64(messaging.msg_send_heart)
        except Exception:
            logging.warning('Error connecting to task manager at %s:%d!',
                            tm.address, tm.port)
            log_lines(traceback.format_exc(), logging.debug)
        finally:
            tm.Close()
Example #6
0
def setup_endpoint_for_pulling(e):
    """Open endpoint `e` and request the results held by a task manager.

    The job identifier is exchanged and ``messaging.msg_read_result`` is
    sent.

    Returns:
        True when the job identifier sent back by the task manager matches
        ``jm_jobid``; the endpoint is left open for the caller.
        False in every other case (connection failure, job id mismatch, I/O
        error); the endpoint is closed.
    """
    try:
        # Try to connect to a task manager
        e.Open(jm_conn_timeout)
    except Exception:
        # Problem connecting to the task manager
        # Because this is a connection event,
        # make it a debug rather than a warning
        logging.debug('Error connecting to task manager at %s:%d!', e.address,
                      e.port)
        log_lines(traceback.format_exc(), logging.debug)
        e.Close()
        # Explicit False, consistent with every other failure path (this
        # branch used to fall through and return None)
        return False

    try:
        # Send the job identifier
        e.WriteString(jm_jobid)

        # Ask for the results of the completed tasks
        e.WriteInt64(messaging.msg_read_result)

        # Verify job id of the answer
        jobid = e.ReadString(jm_recv_timeout)

        if jm_jobid == jobid:
            # Ready to pull, keeping `e` open
            return True

        logging.error(
            'Job Id mismatch from %s:%d! Self: %s, task manager: %s!',
            e.address, e.port, jm_jobid, jobid)

    except Exception:
        # Problem communicating with the task manager
        logging.warning('Error connecting to task manager at %s:%d!',
                        e.address, e.port)
        log_lines(traceback.format_exc(), logging.debug)

    # Every path that reaches this point is a failure
    e.Close()
    return False
Example #7
0
def push_tasks(job, runid, jm, tm, taskid, task, tasklist, completed):
    """Push tasks to the task manager `tm` until it is full or tasks run out.

    Args:
        job: object providing ``spits_job_manager_next_task(jm, taskid)``,
            which is unpacked here as ``(r1, newtask, ctx)``.
        runid: run identifier written along with every task.
        jm: job manager handle passed through to ``job``.
        tm: open endpoint of the task manager.
        taskid: identifier of the last generated task.
        task: payload generated earlier but not yet sent, or None.
        tasklist: dict updated in place with ``taskid -> (0, payload)`` for
            every task generated here.
        completed: when true, no new tasks are requested from ``job``.

    Returns:
        A tuple ``(finished, taskid, task, sent)``. ``finished`` is True only
        when there are no more tasks to generate, in which case ``taskid`` is
        0 and ``task`` is None. Otherwise ``taskid`` / ``task`` describe the
        last generated task, with ``task`` set to None once it has been
        accepted. ``sent`` lists the ``(taskid, payload)`` pairs accepted by
        the task manager during this call.
    """
    # Keep pushing until finished or the task manager is full
    sent = []
    while True:
        if task is None:

            # Avoid calling next_task after it's finished
            if completed:
                logging.debug('There are no new tasks to generate.')
                return (True, 0, None, sent)

            # Only get a task if the last one was already sent
            newtaskid = taskid + 1
            r1, newtask, ctx = job.spits_job_manager_next_task(jm, newtaskid)

            # Exit if done
            if r1 == 0:
                return (True, 0, None, sent)

            if newtask is None:
                logging.error('Task %d was not pushed!', newtaskid)
                return (False, taskid, task, sent)

            if ctx != newtaskid:
                logging.error('Context verification failed for task %d!',
                              newtaskid)
                return (False, taskid, task, sent)

            # Add the generated task to the tasklist
            taskid = newtaskid
            task = newtask[0]
            tasklist[taskid] = (0, task)

            logging.debug('Generated task %d with payload size of %d bytes.',
                          taskid,
                          len(task) if task is not None else 0)

        try:
            logging.debug('Pushing %d...', taskid)

            # Push the task to the active task manager
            tm.WriteInt64(taskid)
            tm.WriteInt64(runid)
            if task is None:
                tm.WriteInt64(0)
            else:
                tm.WriteInt64(len(task))
                tm.Write(task)

            # Wait for a response
            response = tm.ReadInt64(jm_recv_timeout)

            if response == messaging.msg_send_full:
                # Task was sent, but the task manager is now full
                sent.append((taskid, task))
                task = None
                break

            elif response == messaging.msg_send_more:
                # Task was sent, continue pushing tasks
                sent.append((taskid, task))
                task = None

            elif response == messaging.msg_send_rjct:
                # Task was rejected by the task manager, this is not
                # predicted for a model where just one task manager
                # pushes tasks, exit the task loop. The task is kept so
                # the caller gets it back.
                logging.warning('Task manager at %s:%d rejected task %d',
                                tm.address, tm.port, taskid)
                break

            else:
                # The task manager is not replying as expected
                logging.error('Unknown response from the task manager!')
                break
        except Exception:
            # Something went wrong with the connection,
            # try with another task manager
            logging.error('Error pushing tasks to task manager!')
            log_lines(traceback.format_exc(), logging.debug)
            break

    return (False, taskid, task, sent)
Example #8
0
def server_callback(conn, addr, port, job, tpool, cqueue, timeout):
    """Handle one incoming connection on the task manager side.

    After exchanging job identifiers, a single message type is read and
    served:

    * ``messaging.msg_terminate``: the process exits through ``os._exit(0)``.
    * ``messaging.msg_send_heart``: the heartbeat is only logged.
    * ``messaging.msg_send_task``: tasks are received and stored in `tpool`
      until the pool reports it is full.
    * ``messaging.msg_read_result``: completed tasks are popped from
      `cqueue` and sent back until the queue is empty.

    Args:
        conn: endpoint of the accepted connection; always closed on exit.
        addr: remote address, used in log messages.
        port: remote port, used in log messages.
        job: not used by this function.
        tpool: task pool providing ``Full()`` and ``Put(taskid, runid, task)``.
        cqueue: queue of ``(taskid, runid, r, res)`` tuples of completed
            tasks.
        timeout: object whose ``reset()`` is called once a message type has
            been read.

    Returns:
        False when the job identifier of the peer does not match, otherwise
        None.
    """
    logging.debug('Connected to %s:%d.', addr, port)

    try:
        # Send the job identifier
        conn.WriteString(tm_jobid)

        # Verify job id of the answer
        jobid = conn.ReadString(tm_recv_timeout)

        if tm_jobid != jobid:
            logging.error('Job Id mismatch from %s:%d! Self: %s, task manager: %s!',
                conn.address, conn.port, tm_jobid, jobid)
            # The finally clause below closes the connection
            return False

        # Read the type of message
        mtype = conn.ReadInt64(tm_recv_timeout)
        timeout.reset()

        # Termination signal
        if mtype == messaging.msg_terminate:
            logging.info('Received a kill signal from %s:%d.',
                addr, port)
            os._exit(0)

        # Job manager is sending heartbeats
        if mtype == messaging.msg_send_heart:
            logging.debug('Received heartbeat from %s:%d', addr, port)

        # Job manager is trying to send tasks to the task manager
        elif mtype == messaging.msg_send_task:
            # Two phase pull: test-try-pull
            while not tpool.Full():
                # Task pool is not full, start asking for data
                conn.WriteInt64(messaging.msg_send_more)
                taskid = conn.ReadInt64(tm_recv_timeout)
                runid = conn.ReadInt64(tm_recv_timeout)
                tasksz = conn.ReadInt64(tm_recv_timeout)
                task = conn.Read(tasksz, tm_recv_timeout)
                logging.info('Received task %d from %s:%d.',
                    taskid, addr, port)

                # Try enqueue the received task
                if not tpool.Put(taskid, runid, task):
                    # For some reason the pool got full in between
                    # (shouldn't happen)
                    logging.warning('Rejecting task %d because ' +
                        'the pool is ful!', taskid)
                    conn.WriteInt64(messaging.msg_send_rjct)

            # Task pool is full, stop receiving tasks
            conn.WriteInt64(messaging.msg_send_full)

        # Job manager is querying the results of the completed tasks
        elif mtype == messaging.msg_read_result:
            # Identifier of the task currently in flight; None while no
            # dequeued task is waiting for its confirmation
            taskid = None
            try:
                # Dequeue completed tasks until cqueue fires
                # an Empty exception
                while True:
                    # Pop the task
                    taskid, runid, r, res = cqueue.get_nowait()

                    logging.info('Sending task %d to committer %s:%d...',
                        taskid, addr, port)

                    # Send the task
                    conn.WriteInt64(taskid)
                    conn.WriteInt64(runid)
                    conn.WriteInt64(r)
                    if res is None:
                        conn.WriteInt64(0)
                    else:
                        conn.WriteInt64(len(res))
                        conn.Write(res)

                    # Wait for the confirmation that the task has
                    # been received by the other side. The argument is the
                    # receive timeout; a message code used to be passed
                    # here by mistake.
                    ans = conn.ReadInt64(tm_recv_timeout)
                    if ans != messaging.msg_read_result:
                        logging.warning('Unknown response received from '+
                            '%s:%d while committing task!', addr, port)
                        raise messaging.MessagingError()

                    taskid = None

            except queue.Empty:
                # Finish the response
                conn.WriteInt64(messaging.msg_read_empty)

            except BaseException:
                # Something went wrong while sending, put
                # the last task back in the queue so its result is not lost
                if taskid is not None:
                    cqueue.put((taskid, runid, r, res))
                    logging.info('Task %d put back in the queue.', taskid)
                # Let the handlers below report the failure instead of
                # silently discarding it
                raise

        # Unknown message received or a wrong sized packet could be trashing
        # the buffer, don't do anything
        else:
            logging.warning('Unknown message received \'%d\'!', mtype)

    except messaging.SocketClosed:
        logging.debug('Connection to %s:%d closed from the other side.',
            addr, port)

    except socket.timeout:
        logging.warning('Connection to %s:%d timed out!', addr, port)

    except Exception:
        logging.warning('Error occurred while reading request from %s:%d!',
            addr, port)
        log_lines(traceback.format_exc(), logging.debug)

    finally:
        # Release the connection on every exit path
        conn.Close()

    logging.debug('Connection to %s:%d closed.', addr, port)