def start(self):
    """ run evaluator """
    _ag = AgentGroup(self.env_para, self.alg_para, self.agent_para)

    while True:
        recv_data = self.recv_broker.get()
        cmd = get_msg_info(recv_data, "cmd")
        logging.debug("evaluator get msg: {}".format(recv_data))
        if cmd not in ["eval"]:
            continue

        model_name = get_msg_data(recv_data)
        _ag.restore(model_name)  # fixme: load weight 'file' from the disk

        # returns the rewards collected by each agent
        eval_data = _ag.evaluate(self.bm_eval.get("episodes_per_eval", 1))

        record_item = tuple([eval_data, model_name])
        print_immediately("collect eval results: {}".format(record_item))

        record_item = message(
            record_item,
            cmd="eval_result",
            broker_id=self.broker_id,
            test_id=self.test_id,
        )
        self.send_broker.send(record_item)
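# The loop above relies on the helpers `message`, `get_msg_info` and
# `get_msg_data`. A minimal sketch of the envelope they are assumed to
# build: a plain dict with a control header and a data payload. The field
# names ("ctr_info", "data") are an assumption for illustration only.
def _sketch_message(data, cmd=None, broker_id=None, test_id=None):
    """Build a hypothetical message envelope."""
    return {
        "ctr_info": {"cmd": cmd, "broker_id": broker_id, "test_id": test_id},
        "data": data,
    }

def _sketch_get_msg_info(msg, key):
    """Read one control field from the hypothetical envelope."""
    return msg["ctr_info"].get(key)

def _sketch_get_msg_data(msg):
    """Read the payload from the hypothetical envelope."""
    return msg["data"]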
def infer_action(self, state, use_explore):
    """
    Infer an action from the `state`.

    :param state: current environment state
    :param use_explore: True during training, False during evaluation
    :return: action value
    """
    # explore: with probability epsilon take a random action
    if use_explore and random.random() < self.epsilon:
        action = np.random.randint(0, self.alg.action_dim)
    elif use_explore:
        # explore with remote predict:
        # get the Q values for each action from the remote predictor.
        send_data = message(state, cmd="predict")
        self.send_explorer.send(send_data)
        action = self.recv_explorer.recv()
    else:
        # don't explore, used in evaluation
        action = self.alg.predict(state)

    # anneal epsilon on every training step
    if use_explore:
        self.epsilon -= 1.0 / self.episode_count
        self.epsilon = max(0.01, self.epsilon)

    # update transition data
    self.transition_data.update(
        {"cur_state": state, "action": action}
    )

    return action
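# Illustration only (not part of the agent): the epsilon annealing used in
# `infer_action` above, assuming epsilon starts at 1.0. Epsilon decreases
# by 1.0 / episode_count on every training step and is clamped at 0.01.
episode_count = 100
epsilon = 1.0
schedule = []
for _ in range(episode_count):
    epsilon = max(0.01, epsilon - 1.0 / episode_count)
    schedule.append(epsilon)
print(schedule[0], schedule[49], schedule[-1])  # ~0.99, ~0.50, 0.01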
def _dist_model(self, dist_model_name=("none", "none"), save_index=-1):
    """dist model tool"""
    ctr_info = self.alg.dist_model_policy.get_dist_info(save_index)

    # an empty list means there is no target to distribute the model to
    if isinstance(ctr_info, list):
        for _ctr in ctr_info:
            to_send_data = message(dist_model_name, cmd="dist_model", **_ctr)
            self.model_q.send(to_send_data)
    else:
        to_send_data = message(dist_model_name, cmd="dist_model", **ctr_info)
        self.model_q.send(to_send_data)
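# Illustration only: `get_dist_info` may return either a single dict of
# control fields or a list of such dicts (one per target); an empty list
# means nothing is distributed. The keys shown here ("broker_id",
# "explorer_id") are hypothetical examples, not taken from the policy.
single_target = {"broker_id": 0, "explorer_id": 1}
multi_target = [{"broker_id": 0, "explorer_id": i} for i in range(3)]
empty_target = []

for ctr_info in (single_target, multi_target, empty_target):
    targets = ctr_info if isinstance(ctr_info, list) else [ctr_info]
    print("{} message(s) would be sent".format(len(targets)))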
def put_test_model(self, model_name):
    """ send test model """
    key = self.get_avail_node()
    ctr_info = {"cmd": "eval", "broker_id": key[0], "test_id": key[1]}

    eval_cmd = message(model_name, **ctr_info)
    self.send_broker.send(eval_cmd)
    logging.debug("put evaluate model: {}".format(model_name))

    self.used_node[key] += 1
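# `get_avail_node` is not shown above. A minimal sketch of one plausible
# implementation, assuming `self.used_node` maps (broker_id, test_id) keys
# to counts of evaluations already dispatched: pick the least-loaded node.
# This helper is hypothetical, hence the `_sketch_` prefix.
def _sketch_get_avail_node(used_node):
    return min(used_node, key=used_node.get)

print(_sketch_get_avail_node({(0, 0): 3, (0, 1): 1, (1, 0): 2}))  # (0, 1)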
def create_evaluator(self, broker_id, test_id):
    """ create evaluator """
    config = deepcopy(self.config_info)
    config.update({"test_id": test_id})

    create_cmd = message(config, cmd="create_evaluator", broker_id=broker_id)
    self.send_broker.send(create_cmd)
def get_trajectory(self):
    for env_id in range(self.vector_env_size):
        for _data_key in ("cur_state", "logit", "action", "reward", "done", "info"):
            self.trajectory[_data_key].extend(
                self.sample_vector[env_id][_data_key])

    # merge data into env_num * seq_len
    for _data_key in self.trajectory:
        self.trajectory[_data_key] = np.stack(self.trajectory[_data_key])

    # astype returns a new array, so the result must be assigned back
    self.trajectory["action"] = self.trajectory["action"].astype(np.int32)

    trajectory = message(self.trajectory.copy())
    set_msg_info(trajectory, agent_id=self.id)
    return trajectory
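# Illustration only: the merge above concatenates per-env sequences of
# length seq_len over `vector_env_size` environments, then `np.stack`
# turns each field into a (vector_env_size * seq_len, ...) array.
# The numbers below are made up for the example.
import numpy as np

vector_env_size, seq_len = 2, 3
rewards = []
for env_id in range(vector_env_size):
    rewards.extend([float(env_id)] * seq_len)  # one reward list per env
merged = np.stack(rewards)
print(merged.shape)  # (6,) == vector_env_size * seq_len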
def handle_env_feedback(self, next_raw_state, reward, done, info, use_explore):
    self.transition_data.update({
        "next_state": next_raw_state,
        # clip the reward to {-1, 0, 1} during training
        "reward": np.sign(reward) if use_explore else reward,
        "done": done,
        "info": info
    })

    # deliver this transition to the learner and trigger the train process
    if use_explore:
        train_data = {k: [v] for k, v in self.transition_data.items()}
        train_data = message(train_data, agent_id=self.id)
        self.send_explorer.send(train_data)

    return self.transition_data
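# Illustration only: during training the reward is clipped to {-1, 0, 1}
# via `np.sign`, a common way to keep value scales comparable across
# environments; during evaluation the raw reward is kept.
import numpy as np
print(np.sign([-3.5, 0.0, 120.0]))  # [-1.  0.  1.]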
def explore(self, episode_count):
    """
    Explore the environment for `episode_count` episodes.

    The number of agents determines which interaction api is used:
        == 1: standalone api, `run_one_episode`;
        >= 2 and env.api_type == "standalone": agent.run_one_episode;
        >= 2 and env.api_type == "unified": agent.do_one_interaction.

    :param episode_count: number of episodes to run
    :return:
    """
    _start0 = time()
    model_name = self.agents[0].sync_model()  # fixme: async alg dummy
    self.ag_stats.wait_model_time = time() - _start0
    logging.debug("get sync model: {}".format(model_name))

    if isinstance(model_name, dict) or \
            (isinstance(model_name, list) and "none" not in model_name):
        _start1 = time()
        self.restore(model_name)
        self.ag_stats.restore_model_time = time() - _start1

    # single agent always uses the `run_one_episode` api;
    # multi-agent with the `standalone` api_type also uses `run_one_episode`.
    if self.env_info["api_type"] == "standalone":
        # (use_explore, collect)
        _paras = [
            (True, False if _ag.alg.async_flag else True) for _ag in self.agents
        ]
        job_funcs = [agent.run_one_episode for agent in self.agents]

        for _epi_index in range(episode_count):
            _start2 = time()
            self.env.reset()
            for agent in self.agents:
                agent.reset()

            trajectory_list = self.bot.do_multi_job(job_funcs, _paras)

            for _ag, trajectory in zip(self.agents, trajectory_list):
                if not _ag.alg.async_flag:
                    self.trajectories.append(trajectory)
                    self.send_explorer.send(trajectory)

            self._post_processes()
            self.ag_stats.explore_time_in_epi = time() - _start2

            if _epi_index == episode_count - 1:
                self.ag_stats.update_with_agent_stats(
                    [_a.get_perf_stats() for _a in self.agents]
                )

    elif self.env_info["api_type"] == "unified":
        for _ in range(episode_count):
            _start2 = time()
            trajectories = self._run_one_unified_episode(
                use_explore=True, collect=True
            )

            for _ag, trajectory in zip(self.agents, trajectories):
                if not _ag.alg.async_flag:
                    self.trajectories.append(trajectory)
                    self.send_explorer.send(trajectory)

            self._post_processes()
            self.ag_stats.explore_time_in_epi = time() - _start2
    else:
        raise ValueError(
            "invalid 'api_type':{} from environment".format(self.env_info)
        )

    stats_info = self.ag_stats.get()
    stats_msg = message(stats_info, cmd="stats_msg")
    self.send_explorer.send(stats_msg)
def setup_explorer(broker_master, config_info, env_id):
    config = deepcopy(config_info)
    config["env_para"].update({"env_id": env_id})

    msg = message(config, cmd="create_explorer")
    broker_master.recv_local_q.send(msg)
def stop(self):
    """ stop the whole system """
    close_cmd = message(None, cmd="close")
    self.recv_local_q.send(close_cmd)
def get_trajectory(self):
    trajectory = message(self.trajectory.copy())
    set_msg_info(trajectory, agent_id=self.id)
    return trajectory