Beispiel #1
0
    def submit_job(self, max_memory):
        """Send a job to the Master node.

        When a `@parl.remote_class` object is created, the global client
        sends a job to the master node. Then the master node will allocate
        a vacant job from its job pool to the remote object.

        Args:
            max_memory (float): Maximum memory (MB) that can be used by each
                                remote instance; the unit is MB and the
                                default value is none (unlimited). It is
                                forwarded to `_check_and_monitor_job`.

        Returns:
            job_address(str): IP address of the job. None if there is no
                              available CPU in the cluster.

        Raises:
            Exception: if `self.master_is_alive` is false.
            NotImplementedError: if the master replies with an unknown tag.
        """
        if not self.master_is_alive:
            raise Exception("Client can not submit job to the master, "
                            "please check if master is connected.")

        while True:
            # A lock to prevent multiple actors from submitting job at the
            # same time. `with` guarantees the lock is released even when
            # the send/recv pair raises (e.g. a receive timeout).
            with self.lock:
                self.submit_job_socket.send_multipart([
                    remote_constants.CLIENT_SUBMIT_TAG,
                    to_byte(self.reply_master_heartbeat_address),
                    to_byte(self.client_id),
                ])
                message = self.submit_job_socket.recv_multipart()

            tag = message[0]

            if tag == remote_constants.NORMAL_TAG:
                job_address = to_str(message[1])
                job_heartbeat_address = to_str(message[2])
                ping_heartbeat_address = to_str(message[3])

                check_result = self._check_and_monitor_job(
                    job_heartbeat_address, ping_heartbeat_address,
                    max_memory)
                if check_result:
                    with self.lock:
                        self.actor_num += 1
                    return job_address
                # The check failed: loop and ask the master for another job.

            # no vacant CPU resources, cannot submit a new job
            elif tag == remote_constants.CPU_TAG:
                # wait 1 second to avoid requesting in a high frequency.
                time.sleep(1)
                return None
            else:
                raise NotImplementedError
Beispiel #2
0
    def _check_and_monitor_job(self, job_heartbeat_address,
                               ping_heartbeat_address, max_memory):
        """Ping a job once and, if it answers, start monitoring it.

        Sometimes the client may receive a job that is dead, thus
        we have to check if this job is still alive before adding it to the
        `actor_num`.

        Args:
            job_heartbeat_address (str): "host:port" the heartbeat socket is
                connected to after the ping succeeded.
            ping_heartbeat_address (str): "host:port" used for the initial
                ping.
            max_memory: memory limit sent to the job (as a string) together
                with the ping.

        Returns:
            bool: True if the job answered the ping and a monitor thread was
            started, False if the ping timed out.
        """
        # job_heartbeat_socket: sends heartbeat signal to job
        job_heartbeat_socket = self.ctx.socket(zmq.REQ)
        job_heartbeat_socket.linger = 0
        # The ping uses a 0.9 second receive timeout.
        job_heartbeat_socket.setsockopt(zmq.RCVTIMEO, int(0.9 * 1000))
        job_heartbeat_socket.connect("tcp://" + ping_heartbeat_address)
        try:
            job_heartbeat_socket.send_multipart(
                [remote_constants.HEARTBEAT_TAG,
                 to_byte(str(max_memory))])
            job_heartbeat_socket.recv_multipart()
        except zmq.error.Again:
            job_heartbeat_socket.close(0)
            logger.error(
                "[Client] connects to a finished job, will try again, ping_heartbeat_address:{}"
                .format(ping_heartbeat_address))
            return False

        # Re-point the same socket at the job's heartbeat address and switch
        # to the regular heartbeat timeout.
        job_heartbeat_socket.disconnect("tcp://" + ping_heartbeat_address)
        job_heartbeat_socket.connect("tcp://" + job_heartbeat_address)
        job_heartbeat_socket.setsockopt(
            zmq.RCVTIMEO, remote_constants.HEARTBEAT_TIMEOUT_S * 1000)

        # a thread for sending heartbeat signals to job
        thread = threading.Thread(target=self._create_job_monitor,
                                  args=(job_heartbeat_socket, ))
        # `Thread.setDaemon()` is deprecated; set the attribute instead.
        thread.daemon = True
        thread.start()
        return True
Beispiel #3
0
    def _kill_job(self, job_address):
        """Kill a job process and update worker information.

        Removes `job_address` from `self.worker_status`. If that succeeds, a
        live replacement job is taken from `self.job_buffer`, registered in
        `self.worker_status`, and reported to the master together with the
        address of the removed job.

        Args:
            job_address (str): address of the job to remove.
        """
        success = self.worker_status.remove_job(job_address)
        if success:
            # Keep pulling from the buffer until a live job is found.
            while True:
                initialized_job = self.job_buffer.get()
                initialized_job.worker_address = self.master_heartbeat_address
                if initialized_job.is_alive:
                    self.worker_status.add_job(initialized_job)
                    if not initialized_job.is_alive:  # make sure that the job is still alive.
                        self.worker_status.remove_job(
                            initialized_job.job_address)
                        continue
                else:
                    logger.warning(
                        "[Worker] a dead job found. The job buffer will not accept this one."
                    )
                if initialized_job.is_alive:
                    break

            # `with` releases the lock even if the request to the master
            # raises, so other users of `self.lock` cannot deadlock.
            with self.lock:
                self.request_master_socket.send_multipart([
                    remote_constants.NEW_JOB_TAG,
                    cloudpickle.dumps(initialized_job),
                    to_byte(job_address)
                ])
                self.request_master_socket.recv_multipart()
Beispiel #4
0
    def _reply_heartbeat(self, target):
        """Answer heartbeat requests until the master stops sending them.

        Binds a REP socket on a random port, publishes its address in
        `self.master_heartbeat_address` and replies to every request with
        the current worker status. When a receive times out (or the zmq
        context is terminated) the loop ends, the worker status is cleared,
        the log server process is stopped and the worker exits.

        Args:
            target: not used by this method body.
        """
        reply_socket = self.ctx.socket(zmq.REP)
        reply_socket.linger = 0
        rcv_timeout_ms = remote_constants.HEARTBEAT_RCVTIMEO_S * 1000
        reply_socket.setsockopt(zmq.RCVTIMEO, rcv_timeout_ms)
        bound_port = reply_socket.bind_to_random_port("tcp://*")
        self.master_heartbeat_address = "{}:{}".format(
            self.worker_ip, bound_port)

        # Worker logs go to a directory named after the heartbeat address.
        address_tag = self.master_heartbeat_address.replace(':', '_')
        log_dir = os.path.expanduser(
            '~/.parl_data/worker/{}'.format(address_tag))
        logger.set_dir(log_dir)

        # Signal that `master_heartbeat_address` is now available.
        self.heartbeat_socket_initialized.set()
        logger.info("[Worker] Connect to the master node successfully. "
                    "({} CPUs)".format(self.cpu_num))

        while self.master_is_alive and self.worker_is_alive:
            try:
                reply_socket.recv_multipart()
                status = self._get_worker_status()
                reply = [
                    remote_constants.HEARTBEAT_TAG,
                    to_byte(str(status[0])),
                    to_byte(str(status[1])),
                    to_byte(status[2]),
                    to_byte(str(status[3]))
                ]
                reply_socket.send_multipart(reply)
            except zmq.error.Again:
                # No heartbeat arrived within the receive timeout.
                self.master_is_alive = False
            except zmq.error.ContextTerminated:
                break

        reply_socket.close(0)
        logger.warning(
            "[Worker] lost connection with the master, will exit reply heartbeat for master."
        )
        self.worker_status.clear()
        self.log_server_proc.kill()
        self.log_server_proc.wait()
        # exit the worker
        self.worker_is_alive = False
        self.exit()
Beispiel #5
0
    def _create_sockets(self, master_address):
        """Create the client's initial socket and register with the master.

        Each client has 1 socket at start:

        (1) submit_job_socket: submits jobs to master node.

        A background thread running `_reply_heartbeat` is started as well;
        this method waits until that thread has set
        `heartbeat_socket_initialized` before contacting the master.

        Args:
            master_address (str): "host:port" of the master node.

        Raises:
            Exception: if the master does not answer the connection request
                within the socket's receive timeout.
        """

        # submit_job_socket: submits job to master
        self.submit_job_socket = self.ctx.socket(zmq.REQ)
        self.submit_job_socket.linger = 0
        self.submit_job_socket.setsockopt(
            zmq.RCVTIMEO, remote_constants.HEARTBEAT_TIMEOUT_S * 1000)
        self.submit_job_socket.connect("tcp://{}".format(master_address))
        self.start_time = time.time()
        thread = threading.Thread(target=self._reply_heartbeat)
        # `Thread.setDaemon()` is deprecated; set the attribute instead.
        thread.daemon = True
        thread.start()
        self.heartbeat_socket_initialized.wait()

        self.client_id = self.reply_master_heartbeat_address.replace(':', '_') + \
                            '_' + str(int(time.time()))

        # check if the master is connected properly
        try:
            self.submit_job_socket.send_multipart([
                remote_constants.CLIENT_CONNECT_TAG,
                to_byte(self.reply_master_heartbeat_address),
                to_byte(socket.gethostname()),
                to_byte(self.client_id),
            ])
            message = self.submit_job_socket.recv_multipart()
            self.log_monitor_url = to_str(message[1])
        except zmq.error.Again:
            logger.warning("[Client] Can not connect to the master, please "
                           "check if master is started and ensure the input "
                           "address {} is correct.".format(master_address))
            self.master_is_alive = False
            raise Exception("Client can not connect to the master, please "
                            "check if master is started and ensure the input "
                            "address {} is correct.".format(master_address))
Beispiel #6
0
    def _reply_heartbeat(self):
        """Reply heartbeat signals to the master node.

        Binds a REP socket on a random port, stores its address in
        `self.reply_master_heartbeat_address`, sets
        `heartbeat_socket_initialized`, and then answers every heartbeat
        request until a receive times out or the client is no longer alive.
        """

        socket = self.ctx.socket(zmq.REP)
        socket.linger = 0
        socket.setsockopt(zmq.RCVTIMEO,
                          remote_constants.HEARTBEAT_RCVTIMEO_S * 1000)
        reply_master_heartbeat_port =\
            socket.bind_to_random_port(addr="tcp://*")
        self.reply_master_heartbeat_address = "{}:{}".format(
            get_ip_address(), reply_master_heartbeat_port)
        self.heartbeat_socket_initialized.set()
        # Becomes True once at least one heartbeat request was received, so
        # the timeout warning can tell "never reachable" from "went away".
        connected = False
        while self.client_is_alive and self.master_is_alive:
            try:
                socket.recv_multipart()
                connected = True
                elapsed_time = datetime.timedelta(seconds=int(time.time() -
                                                              self.start_time))
                socket.send_multipart([
                    remote_constants.HEARTBEAT_TAG,
                    to_byte(self.executable_path),
                    to_byte(str(self.actor_num)),
                    to_byte(str(elapsed_time)),
                    to_byte(str(self.log_monitor_url)),
                ])  # TODO: remove additional information
            except zmq.error.Again:
                if connected:
                    logger.warning("[Client] Cannot connect to the master."
                                   "Please check if it is still alive.")
                else:
                    logger.warning(
                        "[Client] Cannot connect to the master."
                        "Please check the firewall between client and master.(e.g., ping the master IP)"
                    )
                self.master_is_alive = False
        socket.close(0)
        logger.warning("Client exit replying heartbeat for master.")
Beispiel #7
0
 def _reply_client_heartbeat(self, socket):
     """Reply to heartbeat requests coming from the client.

     Each reply carries the result of `_check_used_memory()` and the job
     address. If the memory check reports True the process exits after a
     short delay. If a receive times out, the worker is notified through
     `kill_job_socket` and the process exits.

     Args:
         socket: socket on which heartbeat requests are received and
             answered.
     """
     while True:
         try:
             socket.recv_multipart()
             over_memory_limit = self._check_used_memory()
             reply = [
                 remote_constants.HEARTBEAT_TAG,
                 to_byte(str(over_memory_limit)),
                 to_byte(self.job_address)
             ]
             socket.send_multipart(reply)
             # Equality comparison kept as in the original check.
             if over_memory_limit == True:
                 logger.error(
                     "Memory used by this job exceeds {}. This job will exist."
                     .format(self.max_memory))
                 time.sleep(5)
                 socket.close(0)
                 os._exit(1)
         except zmq.error.Again:
             logger.warning(
                 "[Job] Cannot connect to the client. This job will exit and inform the worker."
             )
             break

     socket.close(0)
     with self.lock:
         kill_request = [
             remote_constants.KILLJOB_TAG,
             to_byte(self.job_address)
         ]
         self.kill_job_socket.send_multipart(kill_request)
         try:
             self.kill_job_socket.recv_multipart()
         except zmq.error.Again:
             # No acknowledgement arrived in time; exit regardless.
             pass
     logger.warning("[Job]lost connection with the client, will exit")
     os._exit(1)
Beispiel #8
0
    def wait_for_connection(self, reply_socket):
        """Block until the remote object sends its class definition.

        The first message is expected to carry the class information and
        the initialization arguments of the remote object; both are used to
        build a local instance inside the job process.

        Args:
            reply_socket (socket): main socket to accept commands of remote
                object.

        Returns:
            A local instance of the remote class object, or None when
            loading or constructing the class raised an exception (the error
            is reported back over `reply_socket`).

        Raises:
            NotImplementedError: if the first message has an unexpected tag.
        """
        message = reply_socket.recv_multipart()

        # Guard clause: anything other than an init request is a protocol
        # error.
        if message[0] != remote_constants.INIT_OBJECT_TAG:
            logger.error("Message from job {}".format(message))
            reply_socket.send_multipart([
                remote_constants.EXCEPTION_TAG,
                b"[job]Unkonwn tag when tried to receive the class definition"
            ])
            raise NotImplementedError

        try:
            class_info = cloudpickle.loads(message[1])
            file_name, class_name, end_of_file = class_info
            # Keep only the last path component,
            # e.g. /home/nlp-ol/Firework/baidu/nlp/evokit/python_api/es_agent -> es_agent
            file_name = file_name.split(os.sep)[-1]
            cls = load_remote_class(file_name, class_name, end_of_file)
            args, kwargs = cloudpickle.loads(message[2])
            stdout_log = os.path.join(self.log_dir, 'stdout.log')
            with redirect_stdout_to_file(stdout_log):
                instance = cls(*args, **kwargs)
        except Exception as e:
            traceback_str = str(traceback.format_exc())
            error_str = str(e)
            logger.error("traceback:\n{}".format(traceback_str))
            payload = to_byte(error_str + "\ntraceback:\n" + traceback_str)
            reply_socket.send_multipart(
                [remote_constants.EXCEPTION_TAG, payload])
            return None

        reply_socket.send_multipart([remote_constants.NORMAL_TAG])
        return instance
Beispiel #9
0
    def __init__(self, worker_address, log_server_address):
        """Start the job's child process and wait for it to finish.

        Args:
            worker_address(str): worker_address for sending job information(e.g, pid)
            log_server_address: stored on the instance as
                `log_server_address`.

        Attributes:
            pid (int): Job process ID.
            max_memory (float): Maximum memory (MB) can be used by each remote instance.
        """
        self.max_memory = None

        # Receiving ends are kept on the instance; sending ends are handed
        # to the child process.
        self.job_address_receiver, address_sender = Pipe()
        self.job_id_receiver, id_sender = Pipe()

        self.worker_address = worker_address
        self.log_server_address = log_server_address
        self.job_ip = get_ip_address()
        self.pid = os.getpid()

        self.run_job_process = Process(
            target=self.run, args=(address_sender, id_sender))
        self.run_job_process.start()

        # NOTE:
        #     In Windows, it will raise errors when creating threading.Lock
        #     before starting multiprocess.Process.
        self.lock = threading.Lock()
        self._create_sockets()

        # First field of memory_info(), converted from bytes to MB.
        own_process = psutil.Process(self.pid)
        initial_bytes = float(own_process.memory_info()[0])
        self.init_memory = initial_bytes / (1024 * 1024)

        self.run_job_process.join()

        # The child process has finished: notify over `kill_job_socket` and
        # terminate this process.
        with self.lock:
            kill_request = [
                remote_constants.KILLJOB_TAG,
                to_byte(self.job_address)
            ]
            self.kill_job_socket.send_multipart(kill_request)
            try:
                self.kill_job_socket.recv_multipart()
            except zmq.error.Again:
                pass
            os._exit(0)
Beispiel #10
0
                def wrapper(*args, **kwargs):
                    """Call `attr` on the remote job and return its result.

                    Raises:
                        RemoteError: if the job was already marked as shut
                            down, or the job replied with EXCEPTION_TAG.
                        RemoteAttributeError, RemoteSerializeError,
                        RemoteDeserializeError: for the matching reply tags.
                        NotImplementedError: for an unknown reply tag.
                    """
                    if self.job_shutdown:
                        raise RemoteError(
                            attr, "This actor losts connection with the job.")

                    # `with` releases the lock on every path, including the
                    # raising ones; releasing it manually only on success
                    # left the lock held after any remote error.
                    with self.internal_lock:
                        data = dumps_argument(*args, **kwargs)

                        self.job_socket.send_multipart(
                            [remote_constants.CALL_TAG,
                             to_byte(attr), data])

                        message = self.job_socket.recv_multipart()
                        tag = message[0]

                        if tag == remote_constants.NORMAL_TAG:
                            return loads_return(message[1])

                        elif tag == remote_constants.EXCEPTION_TAG:
                            error_str = to_str(message[1])
                            self.job_shutdown = True
                            raise RemoteError(attr, error_str)

                        elif tag == remote_constants.ATTRIBUTE_EXCEPTION_TAG:
                            error_str = to_str(message[1])
                            self.job_shutdown = True
                            raise RemoteAttributeError(attr, error_str)

                        elif tag == remote_constants.SERIALIZE_EXCEPTION_TAG:
                            error_str = to_str(message[1])
                            self.job_shutdown = True
                            raise RemoteSerializeError(attr, error_str)

                        elif tag == remote_constants.DESERIALIZE_EXCEPTION_TAG:
                            error_str = to_str(message[1])
                            self.job_shutdown = True
                            raise RemoteDeserializeError(attr, error_str)

                        else:
                            self.job_shutdown = True
                            raise NotImplementedError()
Beispiel #11
0
 def __setattr__(self, attr, value):
     """Set an attribute locally or on the remote job.

     Names that are not in the instance `__dict__` of a fresh `cls()` are
     stored on this object. All other names are sent to the job with a
     SET_ATTRIBUTE request.

     Raises:
         NotImplementedError: if the job does not reply with NORMAL_TAG.
     """
     if attr not in cls().__dict__:
         super().__setattr__(attr, value)
         return

     # `with` releases the lock even if the send/recv pair raises.
     with self.internal_lock:
         self.job_socket.send_multipart([
             remote_constants.SET_ATTRIBUTE,
             to_byte(attr),
             dumps_return(value)
         ])
         message = self.job_socket.recv_multipart()

     tag = message[0]
     if tag != remote_constants.NORMAL_TAG:
         self.job_shutdown = True
         raise NotImplementedError()
Beispiel #12
0
    def _init_jobs(self, job_num):
        """Create jobs.

        Launches `job_num` subprocesses running `job.py`, then receives one
        registration message per job on `reply_job_socket`, answers it with
        the kill-job address and starts a monitor thread for each job.

        Args:
            job_num(int): the number of jobs to create.

        Returns:
            list: the initialized job objects unpickled from the
            registration messages.
        """
        job_file = __file__.replace('worker.pyc', 'job.py')
        job_file = job_file.replace('worker.py', 'job.py')
        command = [
            sys.executable, job_file, "--worker_address",
            self.reply_job_address, "--log_server_address",
            self.log_server_address
        ]

        if sys.version_info.major == 3:
            warnings.simplefilter("ignore", ResourceWarning)

        new_jobs = []
        # avoid that many jobs are killed and restarted at the same time.
        # `with` releases the lock even if spawning or registration raises.
        with self.lock:
            # Redirect the output to DEVNULL; the handle is closed as soon
            # as all subprocesses have been spawned.
            with open(os.devnull, 'w') as devnull:
                for _ in range(job_num):
                    subprocess.Popen(
                        command, stdout=devnull, stderr=subprocess.STDOUT)

            for _ in range(job_num):
                job_message = self.reply_job_socket.recv_multipart()
                self.reply_job_socket.send_multipart(
                    [remote_constants.NORMAL_TAG,
                     to_byte(self.kill_job_address)])
                initialized_job = cloudpickle.loads(job_message[1])
                new_jobs.append(initialized_job)

                # a thread for sending heartbeat signals to job
                thread = threading.Thread(target=self._create_job_monitor,
                                          args=(initialized_job, ))
                # `Thread.setDaemon()` is deprecated; set the attribute.
                thread.daemon = True
                thread.start()
        assert len(new_jobs) > 0, "init jobs failed"
        return new_jobs
Beispiel #13
0
    def _receive_message(self):
        """Master node will receive various types of message: (1) worker
        connection; (2) worker update; (3) client connection; (4) job
        submittion; (5) reset job.
        """
        message = self.client_socket.recv_multipart()
        tag = message[0]

        # a new worker connects to the master
        if tag == remote_constants.WORKER_CONNECT_TAG:
            self.client_socket.send_multipart([remote_constants.NORMAL_TAG])

        elif tag == remote_constants.MONITOR_TAG:
            status = self._get_status()
            self.client_socket.send_multipart(
                [remote_constants.NORMAL_TAG, status])

        # `xparl status` command line API
        elif tag == remote_constants.STATUS_TAG:
            status_info = self.cluster_monitor.get_status_info()
            self.client_socket.send_multipart(
                [remote_constants.NORMAL_TAG,
                 to_byte(status_info)])

        elif tag == remote_constants.WORKER_INITIALIZED_TAG:
            initialized_worker = cloudpickle.loads(message[1])
            worker_address = initialized_worker.worker_address
            self.job_center.add_worker(initialized_worker)
            hostname = self.job_center.get_hostname(worker_address)
            self.cluster_monitor.add_worker_status(worker_address, hostname)
            logger.info("A new worker {} is added, ".format(worker_address) +
                        "the cluster has {} CPUs.\n".format(self.cpu_num))

            # a thread for sending heartbeat signals to `worker.address`
            thread = threading.Thread(
                target=self._create_worker_monitor,
                args=(initialized_worker.worker_address, ))
            thread.start()

            self.client_socket.send_multipart([remote_constants.NORMAL_TAG])

        # a client connects to the master
        elif tag == remote_constants.CLIENT_CONNECT_TAG:
            # `client_heartbeat_address` is the
            #      `reply_master_heartbeat_address` of the client
            client_heartbeat_address = to_str(message[1])
            client_hostname = to_str(message[2])
            client_id = to_str(message[3])
            self.client_hostname[client_heartbeat_address] = client_hostname
            logger.info(
                "Client {} is connected.".format(client_heartbeat_address))

            thread = threading.Thread(target=self._create_client_monitor,
                                      args=(client_heartbeat_address, ))
            thread.start()
            log_monitor_address = "{}/logs?client_id={}".format(
                self.monitor_url, client_id)
            self.client_socket.send_multipart(
                [remote_constants.NORMAL_TAG,
                 to_byte(log_monitor_address)])

        # a client submits a job to the master
        elif tag == remote_constants.CLIENT_SUBMIT_TAG:
            # check available CPU resources
            if self.cpu_num:
                logger.info("Submitting job...")
                job = self.job_center.request_job()
                self.client_socket.send_multipart([
                    remote_constants.NORMAL_TAG,
                    to_byte(job.job_address),
                    to_byte(job.client_heartbeat_address),
                    to_byte(job.ping_heartbeat_address),
                ])
                client_id = to_str(message[2])
                job_info = {job.job_id: job.log_server_address}
                self.cluster_monitor.add_client_job(client_id, job_info)
                self._print_workers()
            else:
                self.client_socket.send_multipart([remote_constants.CPU_TAG])

        # a worker updates
        elif tag == remote_constants.NEW_JOB_TAG:
            initialized_job = cloudpickle.loads(message[1])
            last_job_address = to_str(message[2])

            self.client_socket.send_multipart([remote_constants.NORMAL_TAG])
            self.job_center.update_job(last_job_address, initialized_job,
                                       initialized_job.worker_address)
            logger.info("A worker updated. cpu_num:{}".format(self.cpu_num))

            self._print_workers()

        # check before start a worker
        elif tag == remote_constants.NORMAL_TAG:
            self.client_socket.send_multipart([remote_constants.NORMAL_TAG])

        else:
            raise NotImplementedError()
Beispiel #14
0
            def __getattr__(self, attr):
                """Fetch a remote attribute, or return a remote-call wrapper.

                If `attr` is in the instance `__dict__` of a fresh `cls()`,
                its value is requested from the job with GET_ATTRIBUTE and
                returned. Otherwise a function is returned that, when
                called, runs `attr` on the job with CALL_TAG.

                Raises:
                    RemoteError, RemoteAttributeError, RemoteSerializeError,
                    RemoteDeserializeError: for the matching reply tags.
                    NotImplementedError: for an unknown reply tag.
                """
                #check if attr is a function or not
                if attr in cls().__dict__:
                    # `with` releases the lock on every path; releasing it
                    # manually only on success left it held after any error
                    # reply.
                    with self.internal_lock:
                        self.job_socket.send_multipart(
                            [remote_constants.GET_ATTRIBUTE,
                             to_byte(attr)])
                        message = self.job_socket.recv_multipart()
                        tag = message[0]

                        if tag == remote_constants.NORMAL_TAG:
                            return loads_return(message[1])

                        elif tag == remote_constants.EXCEPTION_TAG:
                            error_str = to_str(message[1])
                            self.job_shutdown = True
                            raise RemoteError(attr, error_str)

                        elif tag == remote_constants.ATTRIBUTE_EXCEPTION_TAG:
                            error_str = to_str(message[1])
                            self.job_shutdown = True
                            raise RemoteAttributeError(attr, error_str)

                        elif tag == remote_constants.SERIALIZE_EXCEPTION_TAG:
                            error_str = to_str(message[1])
                            self.job_shutdown = True
                            raise RemoteSerializeError(attr, error_str)

                        elif tag == remote_constants.DESERIALIZE_EXCEPTION_TAG:
                            error_str = to_str(message[1])
                            self.job_shutdown = True
                            raise RemoteDeserializeError(attr, error_str)

                        else:
                            self.job_shutdown = True
                            raise NotImplementedError()

                def wrapper(*args, **kwargs):
                    """Call `attr` on the remote job and return its result."""
                    if self.job_shutdown:
                        raise RemoteError(
                            attr, "This actor losts connection with the job.")

                    # Same reasoning as above: the lock must be released
                    # even when the job replies with an error tag.
                    with self.internal_lock:
                        data = dumps_argument(*args, **kwargs)

                        self.job_socket.send_multipart(
                            [remote_constants.CALL_TAG,
                             to_byte(attr), data])

                        message = self.job_socket.recv_multipart()
                        tag = message[0]

                        if tag == remote_constants.NORMAL_TAG:
                            return loads_return(message[1])

                        elif tag == remote_constants.EXCEPTION_TAG:
                            error_str = to_str(message[1])
                            self.job_shutdown = True
                            raise RemoteError(attr, error_str)

                        elif tag == remote_constants.ATTRIBUTE_EXCEPTION_TAG:
                            error_str = to_str(message[1])
                            self.job_shutdown = True
                            raise RemoteAttributeError(attr, error_str)

                        elif tag == remote_constants.SERIALIZE_EXCEPTION_TAG:
                            error_str = to_str(message[1])
                            self.job_shutdown = True
                            raise RemoteSerializeError(attr, error_str)

                        elif tag == remote_constants.DESERIALIZE_EXCEPTION_TAG:
                            error_str = to_str(message[1])
                            self.job_shutdown = True
                            raise RemoteDeserializeError(attr, error_str)

                        else:
                            self.job_shutdown = True
                            raise NotImplementedError()

                return wrapper
Beispiel #15
0
    def single_task(self, obj, reply_socket, job_address):
        """An infinite loop waiting for commands from the remote object.

        Each job will receive these kinds of message from the remote object:

        1. When the remote object calls a function, or gets or sets an
           attribute, the job will run the operation on the local instance
           and return the results to the remote object.
        2. When the remote object is deleted, the job will quit and release
           related computation resources.

        Args:
            obj: local instance on which calls and attribute accesses are
                executed.
            reply_socket (socket): main socket to accept commands of remote
                object.
            job_address (String): address of reply_socket.

        Raises:
            AttributeError, SerializeError, DeserializeError: re-raised after
                the matching error tag has been sent to the remote object.
            NotImplementedError: if an unknown tag is received.
        """

        while True:
            message = reply_socket.recv_multipart()

            tag = message[0]

            if tag in [
                    remote_constants.CALL_TAG, remote_constants.GET_ATTRIBUTE,
                    remote_constants.SET_ATTRIBUTE
            ]:
                try:
                    if tag == remote_constants.CALL_TAG:
                        function_name = to_str(message[1])
                        data = message[2]
                        args, kwargs = loads_argument(data)

                        # Redirect stdout to stdout.log temporarily
                        logfile_path = os.path.join(self.log_dir, 'stdout.log')
                        with redirect_stdout_to_file(logfile_path):
                            ret = getattr(obj, function_name)(*args, **kwargs)

                        ret = dumps_return(ret)

                        reply_socket.send_multipart(
                            [remote_constants.NORMAL_TAG, ret])

                    elif tag == remote_constants.GET_ATTRIBUTE:
                        attribute_name = to_str(message[1])
                        logfile_path = os.path.join(self.log_dir, 'stdout.log')
                        with redirect_stdout_to_file(logfile_path):
                            ret = getattr(obj, attribute_name)
                        ret = dumps_return(ret)
                        reply_socket.send_multipart(
                            [remote_constants.NORMAL_TAG, ret])
                    else:
                        # SET_ATTRIBUTE
                        attribute_name = to_str(message[1])
                        attribute_value = loads_return(message[2])
                        logfile_path = os.path.join(self.log_dir, 'stdout.log')
                        with redirect_stdout_to_file(logfile_path):
                            setattr(obj, attribute_name, attribute_value)
                        reply_socket.send_multipart(
                            [remote_constants.NORMAL_TAG])

                except Exception as e:
                    # reset the job

                    error_str = str(e)
                    logger.error(error_str)

                    # Exact type matches are used, so subclasses of these
                    # exceptions fall through to the generic branch below.
                    # A bare `raise` re-raises the caught exception with its
                    # original message and traceback instead of a new, empty
                    # exception of the same type.
                    if type(e) is AttributeError:
                        reply_socket.send_multipart([
                            remote_constants.ATTRIBUTE_EXCEPTION_TAG,
                            to_byte(error_str)
                        ])
                        raise

                    elif type(e) is SerializeError:
                        reply_socket.send_multipart([
                            remote_constants.SERIALIZE_EXCEPTION_TAG,
                            to_byte(error_str)
                        ])
                        raise

                    elif type(e) is DeserializeError:
                        reply_socket.send_multipart([
                            remote_constants.DESERIALIZE_EXCEPTION_TAG,
                            to_byte(error_str)
                        ])
                        raise

                    else:
                        traceback_str = str(traceback.format_exc())
                        logger.error("traceback:\n{}".format(traceback_str))
                        reply_socket.send_multipart([
                            remote_constants.EXCEPTION_TAG,
                            to_byte(error_str + "\ntraceback:\n" +
                                    traceback_str)
                        ])
                        break

            # receive KILLJOB_TAG from the actor: acknowledge and leave the
            # loop
            elif tag == remote_constants.KILLJOB_TAG:
                reply_socket.send_multipart([remote_constants.NORMAL_TAG])
                logger.warning(
                    "An actor exits and this job {} will exit.".format(
                        job_address))
                break
            else:
                logger.error(
                    "The job receives an unknown message: {}".format(message))
                raise NotImplementedError