Esempio n. 1
0
    def _check_and_monitor_job(self, job_heartbeat_address,
                               ping_heartbeat_address, max_memory):
        """ Sometimes the client may receive a job that is dead, thus 
        we have to check if this job is still alive before adding it to the `actor_num`.
        """
        # job_heartbeat_socket: sends heartbeat signal to job
        job_heartbeat_socket = self.ctx.socket(zmq.REQ)
        job_heartbeat_socket.linger = 0
        job_heartbeat_socket.setsockopt(zmq.RCVTIMEO, int(0.9 * 1000))
        job_heartbeat_socket.connect("tcp://" + ping_heartbeat_address)
        try:
            job_heartbeat_socket.send_multipart(
                [remote_constants.HEARTBEAT_TAG,
                 to_byte(str(max_memory))])
            job_heartbeat_socket.recv_multipart()
        except zmq.error.Again:
            job_heartbeat_socket.close(0)
            logger.error(
                "[Client] connects to a finished job, will try again, ping_heartbeat_address:{}"
                .format(ping_heartbeat_address))
            return False
        job_heartbeat_socket.disconnect("tcp://" + ping_heartbeat_address)
        job_heartbeat_socket.connect("tcp://" + job_heartbeat_address)
        job_heartbeat_socket.setsockopt(
            zmq.RCVTIMEO, remote_constants.HEARTBEAT_TIMEOUT_S * 1000)

        # a thread for sending heartbeat signals to job
        thread = threading.Thread(target=self._create_job_monitor,
                                  args=(job_heartbeat_socket, ))
        thread.setDaemon(True)
        thread.start()
        return True
Esempio n. 2
0
 def send_file(self, socket):
     try:
         socket.send_multipart([
             remote_constants.SEND_FILE_TAG,
             self.GLOBAL_CLIENT.pyfiles
         ])
         _ = socket.recv_multipart()
     except zmq.error.Again as e:
         logger.error("Send python files failed.")
Esempio n. 3
0
    def wait_for_connection(self, reply_socket):
        """Wait for connection from the remote object.

        The remote object will send its class information and initialization
        arguments to the job, these parameters are then used to create a
        local instance in the job process.

        Args:
            reply_socket (sockert): main socket to accept commands of remote object.

        Returns:
            A local instance of the remote class object.
        """

        message = reply_socket.recv_multipart()
        tag = message[0]
        obj = None

        if tag == remote_constants.INIT_OBJECT_TAG:
            try:
                file_name, class_name, end_of_file = cloudpickle.loads(
                    message[1])
                #/home/nlp-ol/Firework/baidu/nlp/evokit/python_api/es_agent -> es_agent
                file_name = file_name.split(os.sep)[-1]
                cls = load_remote_class(file_name, class_name, end_of_file)
                args, kwargs = cloudpickle.loads(message[2])
                logfile_path = os.path.join(self.log_dir, 'stdout.log')
                with redirect_stdout_to_file(logfile_path):
                    obj = cls(*args, **kwargs)
            except Exception as e:
                traceback_str = str(traceback.format_exc())
                error_str = str(e)
                logger.error("traceback:\n{}".format(traceback_str))
                reply_socket.send_multipart([
                    remote_constants.EXCEPTION_TAG,
                    to_byte(error_str + "\ntraceback:\n" + traceback_str)
                ])
                return None
            reply_socket.send_multipart([remote_constants.NORMAL_TAG])
        else:
            logger.error("Message from job {}".format(message))
            reply_socket.send_multipart([
                remote_constants.EXCEPTION_TAG,
                b"[job]Unkonwn tag when tried to receive the class definition"
            ])
            raise NotImplementedError

        return obj
Esempio n. 4
0
def test():
    # 创建环境
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env.getGameState())
    act_dim = len(env.getActionSet())
    print('action set:', env.getActionSet())
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # 创建经验池
    rpm = ReplayMemory(MEMORY_SIZE)  # DQN的经验回放池

    # 根据parl框架构建agent
    model = Model(act_dim=act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm,
                  obs_dim=obs_dim,
                  act_dim=act_dim,
                  e_greed=0.3,
                  e_greed_decrement=1e-6)

    # 加载模型
    save_path = './DQN/checkpoints/episode_V14600.ckpt'
    print('checkpoints:', save_path)
    if os.path.exists(save_path):
        logger.info('load ckpt success!')
        agent.restore(save_path)
    else:
        logger.error('load ckpt error!')

    action_set = env.getActionSet()
    env.init()
    episode_reward = 0
    steps = 0
    while not env.game_over():
        steps += 1
        if (steps == 1):
            continue
        obs = list(env.getGameState().values())
        action_idx = agent.predict(obs)  # 预测动作,只选最优动作
        act = action_set[action_idx]
        reward = env.act(act)
        episode_reward += reward
        reward_str = str(int(episode_reward))
        drawText(env.game.screen, reward_str, 288, 0, 48, (255, 0, 0),
                 (255, 255, 255))
    env.reset_game()
    logger.info('[Test] steps:{}, reward:{}'.format(steps, episode_reward))
Esempio n. 5
0
    def wait_for_files(self, reply_socket, job_address):
        """Wait for python files from remote object.

        When a remote object receives the allocated job address, it will send
        the python files to the job. Later, the job will save these files to a
        temporary directory and add the temporary diretory to Python's working
        directory.

        Args:
            reply_socket (sockert): main socket to accept commands of remote object.
            job_address (String): address of reply_socket.

        Returns:
            A temporary directory containing the python files.
        """

        message = reply_socket.recv_multipart()
        tag = message[0]
        if tag == remote_constants.SEND_FILE_TAG:
            pyfiles = pickle.loads(message[1])
            # save python files to temporary directory
            envdir = tempfile.mkdtemp()
            for file, code in pyfiles['python_files'].items():
                file = os.path.join(envdir, file)
                with open(file, 'wb') as code_file:
                    code_file.write(code)

            # save other files to current directory
            for file, content in pyfiles['other_files'].items():
                # create directory (i.e. ./rom_files/)
                if '/' in file:
                    try:
                        sep = os.sep
                        recursive_dirs = os.path.join(*(file.split(sep)[:-1]))
                        recursive_dirs = os.path.join(envdir, recursive_dirs)
                        os.makedirs(recursive_dirs)
                    except OSError as e:
                        pass
                file = os.path.join(envdir, file)
                with open(file, 'wb') as f:
                    f.write(content)
            reply_socket.send_multipart([remote_constants.NORMAL_TAG])
            return envdir
        else:
            logger.error("NotImplementedError:{}, received tag:{}".format(
                job_address, ))
            raise NotImplementedError
Esempio n. 6
0
    def run(self, job_address_sender, job_id_sender):
        """An infinite loop waiting for a new task.

        Args:
            job_address_sender(sending end of multiprocessing.Pipe): send job address of reply_socket to main process.
        """
        ctx = zmq.Context()

        # create the reply_socket
        reply_socket = ctx.socket(zmq.REP)
        job_port = reply_socket.bind_to_random_port(addr="tcp://*")
        reply_socket.linger = 0
        job_ip = get_ip_address()
        job_address = "{}:{}".format(job_ip, job_port)

        job_id = job_address.replace(':', '_') + '_' + str(int(time.time()))
        self.log_dir = os.path.expanduser('~/.parl_data/job/{}'.format(job_id))
        logger.set_dir(self.log_dir)
        logger.info(
            "[Job] Job {} initialized. Reply heartbeat socket Address: {}.".
            format(job_id, job_address))

        job_address_sender.send(job_address)
        job_id_sender.send(job_id)

        try:
            # receive source code from the actor and append them to the environment variables.
            envdir = self.wait_for_files(reply_socket, job_address)
            sys.path.insert(0, envdir)
            os.chdir(envdir)

            obj = self.wait_for_connection(reply_socket)
            assert obj is not None
            self.single_task(obj, reply_socket, job_address)
        except Exception as e:
            logger.error(
                "Error occurs when running a single task. We will reset this job. \nReason:{}"
                .format(e))
            traceback_str = str(traceback.format_exc())
            logger.error("traceback:\n{}".format(traceback_str))
Esempio n. 7
0
    def playGame(self, verbose=False):
        """
        Executes one episode of a game.

        Returns:
            either
                winner: player who won the game (1 if player1, -1 if player2)
            or
                draw result returned from the game that is neither 1, -1, nor 0.
        """
        players = [self.player2, None, self.player1]
        curPlayer = 1
        board = self.game.getInitBoard()
        it = 0
        while self.game.getGameEnded(board, curPlayer) == 0:
            it += 1
            if verbose:
                assert self.display
                print("Turn ", str(it), "Player ", str(curPlayer))
                self.display(board)
            action = players[curPlayer + 1](self.game.getCanonicalForm(
                board, curPlayer))

            valids = self.game.getValidMoves(
                self.game.getCanonicalForm(board, curPlayer), 1)

            if valids[action] == 0:
                logger.error('Action {} is not valid!'.format(action))
                logger.debug('valids = {}'.format(valids))
                assert valids[action] > 0
            board, curPlayer = self.game.getNextState(board, curPlayer, action)
        if verbose:
            assert self.display
            print("Game over: Turn ", str(it), "Result ",
                  str(self.game.getGameEnded(board, 1)))
            self.display(board)
        return curPlayer * self.game.getGameEnded(board, curPlayer)
Esempio n. 8
0
 def _reply_client_heartbeat(self, socket):
     """Create a socket that replies heartbeat signals from the client.
     If the job losts connection with the client, it will exit too.
     """
     while True:
         try:
             message = socket.recv_multipart()
             stop_job = self._check_used_memory()
             socket.send_multipart([
                 remote_constants.HEARTBEAT_TAG,
                 to_byte(str(stop_job)),
                 to_byte(self.job_address)
             ])
             if stop_job == True:
                 logger.error(
                     "Memory used by this job exceeds {}. This job will exist."
                     .format(self.max_memory))
                 time.sleep(5)
                 socket.close(0)
                 os._exit(1)
         except zmq.error.Again as e:
             logger.warning(
                 "[Job] Cannot connect to the client. This job will exit and inform the worker."
             )
             break
     socket.close(0)
     with self.lock:
         self.kill_job_socket.send_multipart(
             [remote_constants.KILLJOB_TAG,
              to_byte(self.job_address)])
         try:
             _ = self.kill_job_socket.recv_multipart()
         except zmq.error.Again as e:
             pass
     logger.warning("[Job]lost connection with the client, will exit")
     os._exit(1)
Esempio n. 9
0
    def _create_job_monitor(self, job_heartbeat_socket):
        """Send heartbeat signals to check target's status"""

        job_is_alive = True
        while job_is_alive and self.client_is_alive:
            try:
                job_heartbeat_socket.send_multipart(
                    [remote_constants.HEARTBEAT_TAG])
                job_message = job_heartbeat_socket.recv_multipart()
                stop_job = to_str(job_message[1])
                job_address = to_str(job_message[2])

                if stop_job == 'True':
                    logger.error(
                        'Job {} exceeds max memory usage, will stop this job.'.
                        format(job_address))
                    self.lock.acquire()
                    self.actor_num -= 1
                    self.lock.release()
                    job_is_alive = False
                else:
                    time.sleep(remote_constants.HEARTBEAT_INTERVAL_S)

            except zmq.error.Again as e:
                job_is_alive = False
                self.lock.acquire()
                self.actor_num -= 1
                logger.error(
                    '[xparl] lost connection with a job, current actor num: {}'
                    .format(self.actor_num))
                self.lock.release()

            except zmq.error.ZMQError as e:
                break

        job_heartbeat_socket.close(0)
Esempio n. 10
0
    def _create_jobs(self):
        """Create jobs and send a instance of ``InitializedWorker`` that contains the worker information to the master."""
        try:
            self.request_master_socket.send_multipart(
                [remote_constants.WORKER_CONNECT_TAG])
            _ = self.request_master_socket.recv_multipart()
        except zmq.error.Again as e:
            logger.error("Can not connect to the master, "
                         "please check if master is started.")
            self.master_is_alive = False
            return

        initialized_jobs = self._init_jobs(job_num=self.cpu_num)
        self.request_master_socket.setsockopt(
            zmq.RCVTIMEO, remote_constants.HEARTBEAT_TIMEOUT_S * 1000)

        self.reply_master_hearbeat_thread = threading.Thread(
            target=self._reply_heartbeat,
            args=("master {}".format(self.master_address), ))
        self.reply_master_hearbeat_thread.start()
        self.heartbeat_socket_initialized.wait()

        for job in initialized_jobs:
            job.worker_address = self.master_heartbeat_address

        initialized_worker = InitializedWorker(self.master_heartbeat_address,
                                               initialized_jobs, self.cpu_num,
                                               socket.gethostname())
        self.request_master_socket.send_multipart([
            remote_constants.WORKER_INITIALIZED_TAG,
            cloudpickle.dumps(initialized_worker)
        ])

        _ = self.request_master_socket.recv_multipart()
        self.worker_status = WorkerStatus(self.master_heartbeat_address,
                                          initialized_jobs, self.cpu_num)
Esempio n. 11
0
    def single_task(self, obj, reply_socket, job_address):
        """An infinite loop waiting for commands from the remote object.

        Each job will receive two kinds of message from the remote object:

        1. When the remote object calls a function, job will run the
           function on the local instance and return the results to the
           remote object.
        2. When the remote object is deleted, the job will quit and release
           related computation resources.

        Args:
            reply_socket (sockert): main socket to accept commands of remote object.
            job_address (String): address of reply_socket.
        """

        while True:
            message = reply_socket.recv_multipart()

            tag = message[0]

            if tag in [
                    remote_constants.CALL_TAG, remote_constants.GET_ATTRIBUTE,
                    remote_constants.SET_ATTRIBUTE
            ]:
                # if tag == remote_constants.CALL_TAG:
                try:
                    if tag == remote_constants.CALL_TAG:
                        function_name = to_str(message[1])
                        data = message[2]
                        args, kwargs = loads_argument(data)

                        # Redirect stdout to stdout.log temporarily
                        logfile_path = os.path.join(self.log_dir, 'stdout.log')
                        with redirect_stdout_to_file(logfile_path):
                            ret = getattr(obj, function_name)(*args, **kwargs)

                        ret = dumps_return(ret)

                        reply_socket.send_multipart(
                            [remote_constants.NORMAL_TAG, ret])

                    elif tag == remote_constants.GET_ATTRIBUTE:
                        attribute_name = to_str(message[1])
                        logfile_path = os.path.join(self.log_dir, 'stdout.log')
                        with redirect_stdout_to_file(logfile_path):
                            ret = getattr(obj, attribute_name)
                        ret = dumps_return(ret)
                        reply_socket.send_multipart(
                            [remote_constants.NORMAL_TAG, ret])
                    else:
                        attribute_name = to_str(message[1])
                        attribute_value = loads_return(message[2])
                        logfile_path = os.path.join(self.log_dir, 'stdout.log')
                        with redirect_stdout_to_file(logfile_path):
                            setattr(obj, attribute_name, attribute_value)
                        reply_socket.send_multipart(
                            [remote_constants.NORMAL_TAG])

                except Exception as e:
                    # reset the job

                    error_str = str(e)
                    logger.error(error_str)

                    if type(e) == AttributeError:
                        reply_socket.send_multipart([
                            remote_constants.ATTRIBUTE_EXCEPTION_TAG,
                            to_byte(error_str)
                        ])
                        raise AttributeError

                    elif type(e) == SerializeError:
                        reply_socket.send_multipart([
                            remote_constants.SERIALIZE_EXCEPTION_TAG,
                            to_byte(error_str)
                        ])
                        raise SerializeError

                    elif type(e) == DeserializeError:
                        reply_socket.send_multipart([
                            remote_constants.DESERIALIZE_EXCEPTION_TAG,
                            to_byte(error_str)
                        ])
                        raise DeserializeError

                    else:
                        traceback_str = str(traceback.format_exc())
                        logger.error("traceback:\n{}".format(traceback_str))
                        reply_socket.send_multipart([
                            remote_constants.EXCEPTION_TAG,
                            to_byte(error_str + "\ntraceback:\n" +
                                    traceback_str)
                        ])
                        break

            # receive DELETE_TAG from actor, and stop replying worker heartbeat
            elif tag == remote_constants.KILLJOB_TAG:
                reply_socket.send_multipart([remote_constants.NORMAL_TAG])
                logger.warning(
                    "An actor exits and this job {} will exit.".format(
                        job_address))
                break
            else:
                logger.error(
                    "The job receives an unknown message: {}".format(message))
                raise NotImplementedError