def __init__(self, example, size, B, initial_replay_buffer_dict, discount=1,
        n_step_return=1, fix_ratio=0.1):
    self.T = T = math.ceil(size / B)
    self.B = B
    self.size = T * B
    self.discount = discount
    self.n_step_return = n_step_return
    self.t = 0  # Cursor (in T dimension).
    if initial_replay_buffer_dict is None:
        self.samples = buffer_from_example(example, (T, B),
            share_memory=self.async_)
    else:
        self.samples = initial_replay_buffer_dict['buffer']
    if n_step_return > 1:  # Right now n_step_return = 1.
        self.samples_return_ = buffer_from_example(example.reward, (T, B),
            share_memory=self.async_)
        self.samples_done_n = buffer_from_example(example.done, (T, B),
            share_memory=self.async_)
    else:
        self.samples_return_ = self.samples.reward
        self.samples_done_n = self.samples.done
    self._buffer_full = False
    self.off_backward = n_step_return  # Current invalid samples.
    self.off_forward = 1  # i.e. current cursor, prev_action overwritten.
    #! self.fix_T = math.ceil(self.size * fix_ratio / B)
def collect_evaluation(self, itr):
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    completed_traj_infos = list()
    observations = list()
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, o in enumerate(observations):  # Copy reset obs into the buffer.
        observation[b] = o
    action = buffer_from_example(
        self.envs[0].action_space.sample(null=True), len(self.envs))
    reward = np.zeros(len(self.envs), dtype="float32")
    obs_pyt, act_pyt, rew_pyt = torchify_buffer(
        (observation, action, reward))
    self.agent.reset()
    for t in range(self.max_T):
        act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
        action = numpify_buffer(act_pyt)
        for b, env in enumerate(self.envs):
            o, r, d, env_info = env.step(action[b])
            traj_infos[b].step(observation[b], action[b], r, d,
                agent_info[b], env_info)
            if getattr(env_info, "traj_done", d):
                completed_traj_infos.append(traj_infos[b].terminate(o))
                traj_infos[b] = self.TrajInfoCls()
                o = env.reset()
            if d:
                action[b] = 0  # Prev_action for next step.
                r = 0
                self.agent.reset_one(idx=b)
            observation[b] = o
            reward[b] = r
        if (self.max_trajectories is not None and
                len(completed_traj_infos) >= self.max_trajectories):
            break
    return completed_traj_infos
def collect_evaluation(self, itr):
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    observations = list()
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, o in enumerate(observations):
        observation[b] = o
    action = buffer_from_example(self.envs[0].action_space.null_value(),
        len(self.envs))
    reward = np.zeros(len(self.envs), dtype="float32")
    obs_pyt, act_pyt, rew_pyt = torchify_buffer(
        (observation, action, reward))
    self.agent.reset()
    self.agent.eval_mode(itr)
    for t in range(self.max_T):
        act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
        action = numpify_buffer(act_pyt)
        for b, env in enumerate(self.envs):
            o, r, d, env_info = env.step(action[b])
            traj_infos[b].step(observation[b], action[b], r, d,
                agent_info[b], env_info)
            if getattr(env_info, "traj_done", d):
                self.traj_infos_queue.put(traj_infos[b].terminate(o))
                traj_infos[b] = self.TrajInfoCls()
                o = env.reset()
            if d:
                action[b] = 0  # Next prev_action.
                r = 0
                self.agent.reset_one(idx=b)
            observation[b] = o
            reward[b] = r
        if self.sync.stop_eval.value:
            break
    self.traj_infos_queue.put(None)  # End sentinel.
def __init__(self, example, sampler_B, optim_B, batch_T, discount=1,
        n_step_return=1, T_target=100):
    super().__init__()
    field_names = [f for f in example._fields if f != "prev_rnn_state"]
    global SamplesToBuffer
    self.SamplesToBuffer = namedarraytuple("SamplesToBuffer", field_names)
    buffer_example = self.SamplesToBuffer(*(v for k, v in example.items()
        if k != "prev_rnn_state"))
    self.buffer_size = optim_B * T_target
    # self.buffer_size = sampler_B * (T_target * optim_B // sampler_B)
    self.samples = buffer_from_example(buffer_example,
        (batch_T, self.buffer_size), share_memory=self.async_)
    self.samples_prev_rnn_state = buffer_from_example(
        example.prev_rnn_state, (self.buffer_size,),
        share_memory=self.async_)
    self.sleep_length = 0.01
    self.T_target = T_target
    self.t = 0
    self.optim_batch_B = optim_B
def build_step_buffer(examples, B):
    bufs = tuple(buffer_from_example(examples[k], B, shared_memory=True)
        for k in ["observation", "action", "reward", "done", "agent_info"])
    need_reset = buffer_from_example(examples["done"], B, shared_memory=True)
    step_buffer_np = StepBuffer(*bufs, need_reset)
    step_buffer_pyt = torchify_buffer(step_buffer_np)
    return step_buffer_pyt, step_buffer_np
def collect_evaluation(self, itr):
    assert self.max_trajectories == len(self.envs)
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    completed_traj_infos = list()
    observations = list()
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, o in enumerate(observations):
        observation[b] = o
    action = buffer_from_example(self.envs[0].action_space.null_value(),
        len(self.envs))
    reward = np.zeros(len(self.envs), dtype="float32")
    obs_pyt, act_pyt, rew_pyt = torchify_buffer(
        (observation, action, reward))
    self.agent.reset()
    self.agent.eval_mode(itr)
    live_envs = list(range(len(self.envs)))
    for t in range(self.max_T):
        act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
        action = numpify_buffer(act_pyt)
        b = 0
        # Don't want to do a for loop since live_envs changes over time.
        while b < len(live_envs):
            env_id = live_envs[b]
            o, r, d, env_info = self.envs[env_id].step(action[b])
            traj_infos[env_id].step(observation[b], action[b], r, d,
                agent_info[b], env_info)
            if getattr(env_info, "traj_done", d):
                completed_traj_infos.append(traj_infos[env_id].terminate(o))
                observation = delete_ind_from_array(observation, b)
                reward = delete_ind_from_array(reward, b)
                action = delete_ind_from_array(action, b)
                obs_pyt, act_pyt, rew_pyt = torchify_buffer(
                    (observation, action, reward))
                del live_envs[b]
                b -= 1  # live_envs[b] is now the next env, so go back one.
            else:
                observation[b] = o
                reward[b] = r
            b += 1
        if (self.max_trajectories is not None and
                len(completed_traj_infos) >= self.max_trajectories):
            logger.log("Evaluation reached max num trajectories "
                f"({self.max_trajectories}).")
            return completed_traj_infos
    if t == self.max_T - 1:
        logger.log("Evaluation reached max num time steps "
            f"({self.max_T}).")
    return completed_traj_infos
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
        agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)"""
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)
    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B),
        agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B),
        agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        if agent.dual_model:
            bv = buffer_from_example(examples["agent_info"].value, (1, B),
                agent_shared)
            int_bv = buffer_from_example(examples["agent_info"].value, (1, B),
                agent_shared)
            agent_buffer = AgentSamplesBsvTwin(*agent_buffer,
                bootstrap_value=bv, int_bootstrap_value=int_bv)
        else:
            bv = buffer_from_example(examples["agent_info"].value, (1, B),
                agent_shared)
            agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)
    # All-zero arrays (except the 0th index should equal o_reset):
    observation = buffer_from_example(examples["observation"], (T, B),
        env_shared)
    next_observation = buffer_from_example(examples["observation"], (T, B),
        env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B),
        env_shared)  # All-zero values.
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        next_observation=next_observation,
        prev_reward=prev_reward,
        reward=reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    # torchify_buffer links the two: changes to samples_np reflect in samples_pyt.
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
def collect_evaluation(self, itr, include_observations=False):
    traj_infos = [self.TrajInfoCls(include_observations=include_observations)
        for _ in range(len(self.envs))]
    completed_traj_infos = list()
    observations = list()
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, o in enumerate(observations):  # Copy reset obs into the buffer.
        observation[b] = o
    action = buffer_from_example(self.envs[0].action_space.null_value(),
        len(self.envs))
    reward = np.zeros(len(self.envs), dtype="float32")
    obs_pyt, act_pyt, rew_pyt = torchify_buffer(
        (observation, action, reward))
    self.agent.reset()
    self.agent.eval_mode(itr)
    for t in range(self.max_T):
        act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
        action = numpify_buffer(act_pyt)
        for b, env in enumerate(self.envs):
            o, r, d, env_info = env.step(action[b])
            if include_observations:
                traj_infos[b].step(env.render(), action[b], r, d,
                    agent_info[b], env_info)
            else:
                traj_infos[b].step(observation[b], action[b], r, d,
                    agent_info[b], env_info)
            if getattr(env_info, "traj_done", d):
                completed_traj_infos.append(traj_infos[b].terminate(o))
                traj_infos[b] = self.TrajInfoCls(
                    include_observations=include_observations)
                o = env.reset()
            if d:
                action[b] = 0  # Prev_action for next step.
                r = 0
                self.agent.reset_one(idx=b)
            observation[b] = o
            reward[b] = r
        if (self.max_trajectories is not None and
                len(completed_traj_infos) >= self.max_trajectories):
            logger.log("Evaluation reached max num trajectories "
                f"({self.max_trajectories}).")
            break
    if t == self.max_T - 1:
        logger.log("Evaluation reached max num time steps "
            f"({self.max_T}).")
    return completed_traj_infos
def start_envs(self, max_decorrelation_steps=0):
    """Calls reset() on every env and returns agent_inputs buffer."""
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    observations = list()
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, obs in enumerate(observations):
        observation[b] = obs  # numpy array or namedarraytuple
    prev_action = self.envs[0].action_space.sample(len(self.envs), null=True)
    prev_reward = np.zeros(len(self.envs), dtype="float32")
    if self.rank == 0:
        logger.log("Sampler decorrelating envs, max steps: "
            f"{max_decorrelation_steps}")
    if max_decorrelation_steps == 0:
        return AgentInputs(observation, prev_action, prev_reward), traj_infos
    for b, env in enumerate(self.envs):
        n_steps = 1 + int(np.random.rand() * max_decorrelation_steps)
        env_actions = env.action_space.sample(n_steps)
        for a in env_actions:
            o, r, d, info = env.step(a)
            traj_infos[b].step(o, a, r, d, None, info)
            if getattr(info, "traj_done", d):
                o = env.reset()
                traj_infos[b] = self.TrajInfoCls()
            if d:
                a = env.action_space.sample(null=True)
                r = 0
        observation[b] = o
        prev_action[b] = a
        prev_reward[b] = r
    return AgentInputs(observation, prev_action, prev_reward), traj_infos
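# Standalone illustration of the getattr(info, "traj_done", d) fallback used
# in the collectors above: if the env's info namedtuple defines traj_done,
# that flag decides trajectory termination; otherwise the plain done flag d
# is used. InfoA and InfoB are made-up types, just for this demo.
from collections import namedtuple

InfoA = namedtuple("InfoA", ["traj_done"])
InfoB = namedtuple("InfoB", [])
d = True
assert getattr(InfoA(traj_done=False), "traj_done", d) is False  # Uses traj_done.
assert getattr(InfoB(), "traj_done", d) is True  # Falls back to d.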
def __init__(self, example, size, B, rnn_state_interval, batch_T=None,
        **kwargs):
    self.rnn_state_interval = rnn_state_interval
    self.batch_T = batch_T  # Maybe required fixed depending on replay type.
    if rnn_state_interval <= 1:  # Store no rnn state or every rnn state.
        buffer_example = example
    else:
        # Store some of the rnn states; remove them from samples.
        field_names = [f for f in example._fields if f != "prev_rnn_state"]
        global SamplesToBuffer
        SamplesToBuffer = namedarraytuple("SamplesToBuffer", field_names)
        buffer_example = SamplesToBuffer(*(v for k, v in example.items()
            if k != "prev_rnn_state"))
        size = (B * rnn_state_interval *
            math.ceil(  # T as multiple of interval.
                math.ceil(size / B) / rnn_state_interval))
        self.samples_prev_rnn_state = buffer_from_example(
            example.prev_rnn_state,
            (size // (B * rnn_state_interval), B),
            share_memory=self.async_,
        )
    super().__init__(example=buffer_example, size=size, B=B, **kwargs)
    if rnn_state_interval > 1:
        assert self.T % rnn_state_interval == 0
        self.rnn_T = self.T // rnn_state_interval
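# Quick arithmetic check (illustrative values only) of the size rounding
# above: T is forced up to a multiple of rnn_state_interval so that exactly
# one rnn state row is stored per interval, giving rnn_T = T // interval.
import math

size, B, interval = 1000, 16, 8
T = interval * math.ceil(math.ceil(size / B) / interval)  # 63 rounds to 64.
size = B * T  # Final capacity: 1024 samples instead of 1008.
assert T % interval == 0 and T // interval == 8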
def __init__(self, example, size, B, replay_T):
    self.T = T = math.ceil(size / B)
    self.B = B
    self.size = T * B
    self.t = 0  # Cursor.
    self.replay_T = replay_T
    self.samples = buffer_from_example(example, (T, B),
        share_memory=self.async_)
    self._buffer_full = False
def build_step_buffer(examples, B):
    step_bufs = {k: buffer_from_example(examples[k], B, share_memory=True)
        for k in ["observation", "action", "reward", "done", "agent_info"]}
    step_buffer_np = StepBuffer(**step_bufs)
    step_buffer_pyt = torchify_buffer(step_buffer_np)
    return step_buffer_pyt, step_buffer_np
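# A minimal numpy/torch sketch (separate from the function above) of the
# property the paired step buffers rely on: torch.from_numpy() shares memory
# with the source array, so env workers writing through step_buffer_np are
# immediately visible through step_buffer_pyt on the agent side.
import numpy as np
import torch

step_np = np.zeros(4, dtype="float32")  # Stand-in for one leaf, e.g. reward.
step_pyt = torch.from_numpy(step_np)    # Zero-copy view over the same memory.
step_np[2] = 1.5                        # Worker-side write...
assert step_pyt[2].item() == 1.5        # ...seen by the agent-side tensor.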
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    if self.n_step_return > 1:
        self.samples_timeout_n = buffer_from_example(
            self.samples.timeout[0, 0], (self.T, self.B),
            share_memory=self.async_)
    else:
        self.samples_timeout_n = self.samples.timeout
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
        agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)"""
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)
    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B),
        agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B),
        agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        bv = buffer_from_example(examples["agent_info"].value, (1, B),
            agent_shared)
        agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)
    observation = buffer_from_example(examples["observation"], (T, B),
        env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B),
        env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        reward=reward,
        prev_reward=prev_reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
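# A small numpy-only sketch of the aliasing trick used above for
# action/prev_action (and reward/prev_reward): both are views into one
# (T + 1, B) array offset by one time step, so writing action at index t is
# the same memory as prev_action at index t + 1. Values are illustrative.
import numpy as np

all_action = np.zeros((5 + 1, 2))  # T = 5, B = 2.
action, prev_action = all_action[1:], all_action[:-1]
action[0] = 7.0                       # Write at t = 0...
assert (prev_action[1] == 7.0).all()  # ...aliases prev_action at t = 1.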
def simulate_policy(env, agent):
    # snapshot = torch.load(path_to_params, map_location=torch.device('cpu'))
    # agent_state_dict = snapshot['agent_state_dict']
    # env = GymEnvWrapper(gym.make(env_id, render=True))
    # env = gym.make('HopperPyBulletEnv-v0')
    # env.render(mode='human')
    # env = GymEnvWrapper(env)
    # agent_kwargs = dict(ModelCls=PiMcpVisionModel, QModelCls=QofMcpVisionModel)
    # agent = SacAgent(**agent_kwargs)
    # agent = SacAgent(model_kwargs=dict(hidden_sizes=[512, 256, 256]),
    #     q_model_kwargs=dict(hidden_sizes=[512, 256, 256]))
    # agent = MujocoFfAgent(ModelCls=PPOMcpModel)
    # agent.initialize(env_spaces=env.spaces)
    # agent.load_state_dict(agent_state_dict)
    # agent.eval_mode(0)
    obs = env.reset()
    observation = buffer_from_example(obs, 1)
    loop_time = 0.04
    while True:
        observation[0] = env.reset()
        action = buffer_from_example(env.action_space.null_value(), 1)
        reward = np.zeros(1, dtype="float32")
        obs_pyt, act_pyt, rew_pyt = torchify_buffer(
            (observation, action, reward))
        done = False
        step = 0
        reward_sum = 0
        env.render()
        # time.sleep(5)
        while not done:
            loop_start = time.time()
            step += 1
            act_pyt, agent_info = agent.step(obs_pyt, act_pyt, rew_pyt)
            action = numpify_buffer(act_pyt)
            obs, reward, done, info = env.step(action[0])
            reward_sum += reward
            observation[0] = obs
            rew_pyt[0] = reward
            sleep_time = loop_time - (time.time() - loop_start)
            sleep_time = 0 if (sleep_time < 0) else sleep_time
            time.sleep(sleep_time)
            env.render(mode='human')
        print('return: ' + str(reward_sum) + ' num_steps: ' + str(step))
def __init__(self, example, size, B, discount=1, n_step_return=1):
    self.T = T = math.ceil(size / B)
    self.B = B
    self.size = T * B
    self.discount = discount
    self.n_step_return = n_step_return
    self.t = 0  # Cursor (in T dimension).
    self.samples = buffer_from_example(example, (T, B),
        share_memory=self.async_)
    if n_step_return > 1:
        self.samples_return_ = buffer_from_example(example.reward, (T, B),
            share_memory=self.async_)
        self.samples_done_n = buffer_from_example(example.done, (T, B),
            share_memory=self.async_)
    else:
        self.samples_return_ = self.samples.reward
        self.samples_done_n = self.samples.done
    self._buffer_full = False
    self.off_backward = n_step_return  # Current invalid samples.
    self.off_forward = 1  # i.e. current cursor, prev_action overwritten.
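# Hedged sketch (illustrative values; the actual validity checks live in the
# sampling code, which is not shown) of the cursor bookkeeping above: size
# rounds up to a full (T, B) rectangle, and off_backward / off_forward mark
# time steps near the cursor that are invalid to sample -- incomplete n-step
# returns behind it, and the overwritten prev_action just ahead of it.
import math

size, B, n_step_return = 1000, 16, 5
T = math.ceil(size / B)        # 63 rows; actual capacity T * B = 1008.
off_backward = n_step_return   # Steps behind cursor still accumulating returns.
off_forward = 1                # Step at the cursor being overwritten.
valid_T = T - off_backward - off_forward  # Samplable rows once buffer is full.
assert (T, T * B, valid_T) == (63, 1008, 57)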
def __init__(self, total_n_samples, example_samples):
    self.total_n_samples = total_n_samples
    replay_samples = DiscrimReplaySamples(
        all_observation=example_samples.env.observation,
        all_action=example_samples.agent.action)
    T, B = get_leading_dims(replay_samples, n_dim=2)
    assert total_n_samples >= T * B > 0, (total_n_samples, T * B)
    self.circ_buf = buffer_from_example(replay_samples[0, 0],
        (total_n_samples,))
    self.samples_in_buffer = 0
    self.ptr = 0
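# Hedged sketch of how a circular cursor like self.ptr typically advances;
# the class's actual append method is not shown here, so advance_ptr is a
# hypothetical helper, not part of the original code.
def advance_ptr(ptr, n, capacity):
    """Return the slot indices for n appends and the new cursor position."""
    idxs = [(ptr + i) % capacity for i in range(n)]
    return idxs, (ptr + n) % capacity

idxs, new_ptr = advance_ptr(ptr=8, n=4, capacity=10)
assert idxs == [8, 9, 0, 1] and new_ptr == 2  # Wraps past the end.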
def initialize(
        self,
        agent,
        affinity=None,
        seed=None,
        bootstrap_value=False,
        traj_info_kwargs=None,
        rank=0,
        world_size=1,
):
    # World size is used in async samplers; not relevant for this class.
    assert world_size == 1
    T, B = self.batch_spec
    self.agent = agent
    self.env = self.EnvCls(batch_T=T, batch_B=B, **self.env_kwargs)
    env_ranks = list(range(rank * B, (rank + 1) * B))
    agent.initialize(self.env.spaces, share_memory=False, global_B=B,
        env_ranks=env_ranks)
    self.samples_pyt, self.samples_np, examples = build_samples_buffer(
        agent, self.env, self.batch_spec, bootstrap_value,
        agent_shared=False, env_shared=False, subprocess=False,
        examples=self._get_example_outputs())
    self.samples_np.env.done[:-1, :] = False
    self.samples_np.env.done[-1, :] = True
    self.traj_info_kwargs = traj_info_kwargs
    self.agent_inputs = AgentInputs(
        buffer_from_example(examples["observation"], (B,)),
        buffer_from_example(examples["action"], (B,)),
        buffer_from_example(examples["reward"], (B,)))
    self._start_agent(B, env_ranks)
    logger.log("BatchedEpisodicSampler initialized.")
    return examples
def __init__(self, example, size, B, replay_T, discount, n_step_return,
        alpha, beta):
    self.T = T = math.ceil(size / B)
    self.B = B
    self.size = T * B
    self.t = 0  # Cursor.
    self.replay_T = replay_T
    self.discount = discount
    self.n_step_return = n_step_return
    self.alpha = alpha
    self.beta = beta
    self.samples = buffer_from_example(example, (T, B),
        share_memory=self.async_)
    if n_step_return > 1:
        self.samples_return_ = buffer_from_example(example.reward, (T, B))
        self.samples_done_n = buffer_from_example(example.done, (T, B))
    else:
        self.samples_return_ = self.samples.reward
        self.samples_done_n = self.samples.done
    self._buffer_full = False
    self.init_priority_tree()
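# Hedged sketch of what alpha and beta conventionally control in prioritized
# replay (Schaul et al., 2016); init_priority_tree() is not shown above, so
# this is the standard formulation, not necessarily this class's exact one.
import numpy as np

priorities = np.array([1.0, 0.5, 2.0])
alpha, beta = 0.6, 0.4
probs = priorities ** alpha
probs /= probs.sum()                     # P(i) = p_i^alpha / sum_j p_j^alpha.
weights = (len(probs) * probs) ** -beta  # Importance-sampling correction.
weights /= weights.max()                 # Normalize weights for stability.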
def start_envs(self, max_decorrelation_steps=0):
    """Calls ``reset()`` on every environment instance, then steps each one
    through a random number of random actions, and returns the resulting
    agent_inputs buffer (`observation`, `prev_action`, `prev_reward`)."""
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    prev_action = np.stack(
        [env.action_space.null_value() for env in self.envs])  # Noop.
    prev_reward = np.zeros(len(self.envs),
        dtype="float32")  # Total reward (extrinsic + intrinsic).
    observations = list()
    for env in self.envs:
        o = env.reset()
        observations.append(deepcopy(o))  # Emulates stepping with noop.
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, obs in enumerate(observations):
        observation[b] = obs
    if self.rank == 0:
        logger.log("Sampler decorrelating envs, max steps: "
            f"{max_decorrelation_steps}")
    if max_decorrelation_steps != 0:
        for b, env in enumerate(self.envs):
            n_steps = 1 + int(np.random.rand() * max_decorrelation_steps)
            for _ in range(n_steps):
                a = env.action_space.sample()
                # 'a' gets stored as-is, but a 0-d array like array(3) must
                # be passed to the env as int(3).
                if a.shape == ():
                    action = int(a)
                else:
                    action = a
                o, r, d, info = env.step(action)
                traj_infos[b].step(o, a, r, d, None, info)
                if getattr(info, "traj_done", d):
                    o = env.reset()
                    traj_infos[b] = self.TrajInfoCls()
                if d:
                    a = env.action_space.null_value()
                    r = 0
            observation[b] = o
            prev_action[b] = a
            prev_reward[b] = r
    # For action-server samplers.
    if hasattr(self, "step_buffer_np") and self.step_buffer_np is not None:
        self.step_buffer_np.prev_action[:] = prev_action
        self.step_buffer_np.prev_reward[:] = prev_reward
        self.step_buffer_np.observation[:] = observation
    return AgentInputs(observation, prev_action, prev_reward), traj_infos
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
        agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)

    :param agent: An Agent instance.
    :param env: An environment instance.
    :param batch_spec: A BatchSpec instance.
    """
    if examples is None:
        if subprocess:  # Create a subprocess.
            mgr = mp.Manager()  # Manager provides state shared across processes.
            # Examples pickled back to master; this dict is shared with the
            # subprocess.
            examples = mgr.dict()
            # Spawn a worker process that runs the function given by
            # `target` with the arguments given by `args`.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            # `examples` is updated in place inside get_example_outputs(),
            # so there is no return value.
            get_example_outputs(agent, env, examples)
    T, B = batch_spec  # Number of time steps; number of environment instances.
    all_action = buffer_from_example(examples["action"], (T + 1, B),
        agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B),
        agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        bv = buffer_from_example(examples["agent_info"].value, (1, B),
            agent_shared)
        agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)
    observation = buffer_from_example(examples["observation"], (T, B),
        env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B),
        env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        reward=reward,
        prev_reward=prev_reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
def simulate_policy(env, agent, render):
    static_decoder_path = './qec/referee_decoders/nn_d5_DP_p5'
    static_decoder = load_model(static_decoder_path, compile=True)
    obs = env.reset()
    observation = buffer_from_example(obs, 1)
    loop_time = 0.01
    returns = []
    mses = []
    lifetimes = []
    while True:
        observation[0] = env.reset()
        action = buffer_from_example(env.action_space.null_value(), 1)
        reward = np.zeros(1, dtype="float32")
        obs_pyt, act_pyt, rew_pyt = torchify_buffer(
            (observation, action, reward))
        agent.reset()
        done = False
        step = 0
        reward_sum = 0
        while not done:
            loop_start = time.time()
            step += 1
            act_pyt, agent_info = agent.step(obs_pyt, act_pyt, rew_pyt)
            action = numpify_buffer(act_pyt)[0]
            obs, reward, done, info = env.step(action)
            # done = np.argmax(static_decoder(info.static_decoder_input)[0]) != info.correct_label
            reward_sum += reward
            observation[0] = obs
            rew_pyt[0] = float(reward)
        returns.append(reward_sum)
        lifetimes.append(info.lifetime)
        print('avg return: ' + str(sum(returns) / len(returns)) +
            ' return: ' + str(reward_sum) + ' num_steps: ' + str(step))
        print(f'average lifetime: {sum(lifetimes) / len(lifetimes)}'
            f' lifetime: {info.lifetime}')
def __init__(self, example, shared_memory=False, **kwargs):
    field_names = [f for f in example._fields if f != "observation"]
    global BufferSamples
    BufferSamples = namedarraytuple("BufferSamples", field_names)
    buffer_example = BufferSamples(*(v for k, v in example.items()
        if k != "observation"))
    super().__init__(example=buffer_example, shared_memory=shared_memory,
        **kwargs)
    # Equivalent to image.shape[0] if observation is image array (C, H, W):
    self.n_frames = n_frames = get_leading_dims(example.observation,
        n_dim=1)[0]
    logger.log(f"Frame-based buffer using {n_frames}-frame sequences.")
    # frames: oldest stored at t; duplicate n_frames - 1 beginning & end.
    self.samples_frames = buffer_from_example(example.observation[0],
        (self.T + n_frames - 1, self.B),
        shared_memory=shared_memory)  # [T+n_frames-1, B, H, W]
    # new_frames: shifted so newest stored at t; no duplication.
    self.samples_new_frames = self.samples_frames[n_frames - 1:]  # [T, B, H, W]
    self.samples_n_blanks = buffer_from_example(np.zeros(1, dtype="uint8"),
        (self.T, self.B), shared_memory=shared_memory)
    self.off_forward = max(self.off_forward, n_frames - 1)
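# A numpy-only sketch (separate from the class above) of the frame-overlap
# scheme: samples_new_frames is a view into samples_frames shifted by
# n_frames - 1, so index t of the "new" view is index t + n_frames - 1 of
# the full array, and consecutive observations can share stored frames.
import numpy as np

T, n_frames = 6, 4
samples_frames = np.arange(T + n_frames - 1)        # Stand-in for [T+3, B, H, W].
samples_new_frames = samples_frames[n_frames - 1:]  # Length-T view.
assert samples_new_frames[0] == samples_frames[3]   # Newest frame at t = 0.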
def start_envs(self, max_decorrelation_steps=0):
    """Calls ``reset()`` on every environment instance, then steps each one
    through a random number of random actions, and returns the resulting
    agent_inputs buffer (`observation`, `prev_action`, `prev_reward`)."""
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    observations = list()
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, obs in enumerate(observations):
        observation[b] = obs  # numpy array or namedarraytuple
    prev_action = np.stack([env.action_space.null_value()
        for env in self.envs])
    prev_reward = np.zeros(len(self.envs), dtype="float32")
    if self.rank == 0:
        logger.log("Sampler decorrelating envs, max steps: "
            f"{max_decorrelation_steps}")
    if max_decorrelation_steps != 0:
        for b, env in enumerate(self.envs):
            n_steps = 1 + int(np.random.rand() * max_decorrelation_steps)
            for _ in range(n_steps):
                a = env.action_space.sample()
                o, r, d, info = env.step(a)
                traj_infos[b].step(o, a, r, d, None, info)
                if getattr(info, "traj_done", d):
                    o = env.reset()
                    traj_infos[b] = self.TrajInfoCls()
                if ((type(d) is np.ndarray and d.any()) or
                        (type(d) is bool and d)):
                    a = env.action_space.null_value()
                    r = 0
            observation[b] = o
            prev_action[b] = a
            prev_reward[b] = r
    # For action-server samplers.
    if hasattr(self, "step_buffer_np") and self.step_buffer_np is not None:
        self.step_buffer_np.observation[:] = observation
        self.step_buffer_np.action[:] = prev_action
        self.step_buffer_np.reward[:] = prev_reward
    return AgentInputs(observation, prev_action, prev_reward), traj_infos
def _decorrelate_envs(self):
    """Return agent_inputs and traj_infos at the end of decorrelation using
    random actions (cf. collector.start_envs)."""
    o = self.env.reset()
    prev_observation = buffer_from_example(o[0], self.batch_spec.B)
    prev_reward = np.zeros(self.batch_spec.B, dtype="float32")
    prev_action = np.zeros(self.batch_spec.B, dtype=int)
    traj_infos = self.TrajInfoCls(B=self.batch_spec.B)
    for _ in range(self.decorrelation_steps):
        # Sample a random action for each env, then take a batched step.
        prev_action[:] = self.env.action_space.sample()
        prev_observation[:], prev_reward[:], d, info = self.env.step(
            prev_action)
        # Update trajectory statistics.
        traj_infos.step(prev_observation, prev_action, prev_reward, d, None,
            info, reset_dones=True)
    return AgentInputs(prev_observation, prev_action,
        prev_reward), traj_infos
def build_intrinsic_samples_buffer(agent, env, batch_spec,
        bootstrap_value=False, next_obs=False, agent_shared=True,
        env_shared=True, subprocess=True, examples=None):
    """Replaces ``build_samples_buffer`` to add additional buffer space for
    intrinsic bonus agents. If bootstrap_value=True, also adds space for
    int_bootstrap_value from the intrinsic value head. If next_obs=True,
    also adds space for next observations. (NOTE: this is memory intensive
    with raw pixel states, as it doubles the space used to store images;
    keep it False unless the algorithm needs it.)"""
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)
    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B),
        agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B),
        agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        # Added buffer space for the intrinsic bootstrap value.
        bv = buffer_from_example(examples["agent_info"].ext_value, (1, B),
            agent_shared)
        int_bv = buffer_from_example(examples["agent_info"].int_value, (1, B),
            agent_shared)
        agent_buffer = IntAgentSamplesBsv(*agent_buffer, bootstrap_value=bv,
            int_bootstrap_value=int_bv)
    observation = buffer_from_example(examples["observation"], (T, B),
        env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B),
        env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    if next_obs:  # Add buffer space for next obs, if specified.
        next_observation = buffer_from_example(examples["observation"],
            (T, B), env_shared)
        env_buffer = EnvSamplesPlus(
            observation=observation,
            next_observation=next_observation,
            reward=reward,
            prev_reward=prev_reward,
            done=done,
            env_info=env_info,
        )
    else:
        env_buffer = EnvSamples(
            observation=observation,
            reward=reward,
            prev_reward=prev_reward,
            done=done,
            env_info=env_info,
        )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
def start_envs(self, max_decorrelation_steps=0):
    """Calls reset() on every env and returns agent_inputs buffer.

    This function is called from initialize() of a Sampler class (e.g.
    SerialSampler), to do work such as collecting (sampling) the first
    batch of data.

    :param max_decorrelation_steps: Maximum number of decorrelation steps.
    :return: A namedarraytuple of three elements (observation, action,
        reward), each a per-env collection, plus trajectory statistics
        (a list of TrajInfo objects).
    """
    # One TrajInfo object per environment.
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    observations = list()
    # self.envs is a list of environments, instantiated inside the sampler
    # class (e.g. SerialSampler).
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, obs in enumerate(observations):
        observation[b] = obs  # numpy array or namedarraytuple
    prev_action = np.stack(
        [env.action_space.null_value() for env in self.envs])
    prev_reward = np.zeros(len(self.envs), dtype="float32")
    if self.rank == 0:
        logger.log("Sampler decorrelating envs, max steps: "
            f"{max_decorrelation_steps}")
    # Sample a batch of data from every environment in turn. As I understand
    # it, the decorrelation logic is: pick a step count (e.g. 100), then
    # step each environment that many times; if an environment terminates
    # before reaching the count, no problem -- reset it and keep stepping
    # until the full count is reached. Data from all environments is mixed
    # together on return, which indeed has a decorrelating effect.
    if max_decorrelation_steps != 0:
        # Iterate over all environments; b is the 0-based index, env the
        # environment instance.
        for b, env in enumerate(self.envs):
            # The +1 prevents a value of 0, which would break the logic.
            n_steps = 1 + int(np.random.rand() * max_decorrelation_steps)
            for _ in range(n_steps):
                # Regarding env.action_space, see the member variable
                # Env._action_space. For AtariEnv, env.action_space.sample()
                # computes IntBox.sample(), i.e. it picks a random index
                # into the action space (not an actual action); env.step(a)
                # then looks up the action from the index. A random index is
                # used because we are in the Collector's start_envs(), i.e.
                # data collection is just starting and there is no basis yet
                # for choosing an action (unlike later, when a network can
                # compute an action from the preceding observation), so a
                # random index suffices.
                a = env.action_space.sample()
                # Execute the action; get observation, reward, done flag,
                # and info (statistics).
                o, r, d, info = env.step(a)
                traj_infos[b].step(o, a, r, d, None, info)  # Update stats.
                # info is a namedtuple; its traj_done attribute is a bool
                # indicating game over (for Atari games). Without a
                # game-over signal, fall back on the done flag (e.g. the
                # game was completed), hence d is the default for getattr().
                if getattr(info, "traj_done", d):
                    o = env.reset()  # Reset env to its initial state.
                    traj_infos[b] = self.TrajInfoCls()  # Fresh TrajInfo.
                if d:  # Done (e.g. the game was completed).
                    a = env.action_space.null_value()
                    r = 0
            observation[b] = o
            prev_action[b] = a
            prev_reward[b] = r
    # For action-server samplers: rlpyt's Parallel-GPU mode uses an action
    # server (see the rlpyt paper).
    if hasattr(self, "step_buffer_np") and self.step_buffer_np is not None:
        self.step_buffer_np.observation[:] = observation
        self.step_buffer_np.action[:] = prev_action
        self.step_buffer_np.reward[:] = prev_reward
    return AgentInputs(observation, prev_action, prev_reward), traj_infos
def collect_evaluation(self, itr):
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    observations = list()
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, o in enumerate(observations):
        observation[b] = o
    action = buffer_from_example(self.envs[0].action_space.null_value(),
        len(self.envs))
    reward = np.zeros(len(self.envs), dtype="float32")
    obs_pyt, act_pyt, rew_pyt = torchify_buffer(
        (observation, action, reward))
    self.agent.reset()
    self.agent.eval_mode(itr)
    #* Modified eval logic: always return a trajectory for each env of a
    #* worker. Shapes: obs_pyt: num_eval_env_per x obs_dim (3);
    #* act_pyt: num_eval_env_per x act_dim (1); rew_pyt: num_eval_env_per.
    envs_done_flag = np.zeros((len(self.envs)))
    for t in range(self.max_T):  # max_T = 100, not eval_max_steps.
        act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
        action = numpify_buffer(act_pyt)
        # Go through each env in a worker.
        for b, env in enumerate(self.envs):
            o, r, d, env_info = env.step(action[b])
            traj_infos[b].step(observation[b], action[b], r, d,
                agent_info[b], env_info)
            # Currently never triggered, since our custom env (pendulum)
            # does not return any info at each step.
            if getattr(env_info, "traj_done", d):
                self.traj_infos_queue.put(traj_infos[b].terminate(o))
                traj_infos[b] = self.TrajInfoCls()
                o = env.reset()
                envs_done_flag[b] = 1
            # Currently never triggered, since our custom env (pendulum)
            # does not signal done.
            if d:
                action[b] = 0  # Next prev_action.
                r = 0
                self.agent.reset_one(idx=b)  # Currently a no-op.
                envs_done_flag[b] = 1
            # Save saliency.
            if t == 10 and b == 0 and self.agent.saliency_dir is not None:
                saliency(img=o, model=self.agent.model,
                    save_path=self.agent.saliency_dir + str(itr) + '.png')
            observation[b] = o
            reward[b] = r
        if self.sync.stop_eval.value:
            break
    # Regardless, add remaining trajectories to the queue, using each env's
    # own last observation. TODO: need to tell traj_info the global index of
    # envs (like which image was used).
    for b in range(len(self.envs)):
        if envs_done_flag[b] < 1e-4:
            self.traj_infos_queue.put(traj_infos[b].terminate(observation[b]))
    self.traj_infos_queue.put(None)  # End sentinel.
def start_envs(self, max_decorrelation_steps=0):
    """Calls ``reset()`` on every environment instance, then steps each one
    through a random number of random actions, and returns the resulting
    agent_inputs buffer (`observation`, `prev_action`, `prev_reward`)."""
    player_traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    if isinstance(self.envs[0], CWTO_EnvWrapperAtari):
        observer_traj_infos = [self.TrajInfoCls(n_obs=env.window_size,
            serial=env.serial) for env in self.envs]
    else:
        observer_traj_infos = [self.TrajInfoCls(n_obs=env.obs_size,
            serial=env.serial) for env in self.envs]
    player_observations = list()
    observer_observations = list()
    for env in self.envs:
        observer_observations.append(env.reset())
        player_observations.append(env.player_observation_space.null_value())
    observer_observation = buffer_from_example(observer_observations[0],
        len(self.envs))
    player_observation = buffer_from_example(player_observations[0],
        len(self.envs))
    for b, obs in enumerate(observer_observations):
        observer_observation[b] = obs  # numpy array or namedarraytuple
    player_prev_action = np.stack(
        [env.player_action_space.null_value() for env in self.envs])
    observer_prev_action = np.stack(
        [env.observer_action_space.null_value() for env in self.envs])
    player_prev_reward = np.zeros(len(self.envs), dtype="float32")
    observer_prev_reward = np.zeros(len(self.envs), dtype="float32")
    player_prev_cost = np.zeros(len(self.envs), dtype="float32")
    observer_prev_cost = np.zeros(len(self.envs), dtype="float32")
    player_done = np.zeros(len(self.envs), dtype=bool)
    observer_done = np.zeros(len(self.envs), dtype=bool)
    if self.rank == 0:
        logger.log("Sampler decorrelating envs, max steps: "
            f"{max_decorrelation_steps}")
    if max_decorrelation_steps != 0:
        for b, env in enumerate(self.envs):
            n_steps = 1 + int(np.random.rand() * max_decorrelation_steps)
            if n_steps % 2 != 0:  # Force n_steps to be even.
                if n_steps < max_decorrelation_steps or n_steps <= 1:
                    n_steps += 1
                else:
                    n_steps -= 1
            for cstep in range(n_steps):
                if env.player_turn:
                    a = env.action_space().sample()
                    o, r, d, info = env.step(a)
                    player_prev_action[b] = a
                    r_obs, cost_obs = env.observer_reward_shaping(
                        r, env.last_obs_act)
                    observer_prev_reward[b] = r_obs
                    observer_prev_cost[b] = cost_obs
                    observer_done[b] = d
                    if cstep > 0:
                        observer_traj_infos[b].step(observer_observation[b],
                            observer_prev_action[b], observer_prev_reward[b],
                            observer_done[b], None, info, cost=cost_obs,
                            obs_act=env.last_obs_act)
                    if d:
                        o = env.reset()
                        observer_prev_reward[b] = 0
                        observer_traj_infos[b] = self.TrajInfoCls(
                            n_obs=env.obs_size, serial=env.serial)
                        player_prev_reward[b] = 0
                        player_traj_infos[b] = self.TrajInfoCls()
                        player_done[b] = d
                    observer_observation[b] = o
                else:
                    if env.serial:
                        while not env.player_turn:
                            a = env.action_space().sample()
                            o, r, d, info = env.step(a)
                            assert not d
                            observer_prev_action[b] = a
                            if env.player_turn:
                                r_ply, cost_ply = env.player_reward_shaping(
                                    r, env.last_obs_act)
                                player_prev_reward[b] = r_ply
                                player_done[b] = d
                                if cstep > 0:
                                    player_traj_infos[b].step(
                                        player_observation[b],
                                        player_prev_action[b],
                                        player_prev_reward[b],
                                        player_done[b], None, info, cost_ply)
                                player_observation[b] = o
                            else:
                                observer_prev_reward[b] = r
                                observer_done[b] = d
                                if cstep > 0:
                                    observer_traj_infos[b].step(
                                        observer_observation[b],
                                        observer_prev_action[b],
                                        observer_prev_reward[b],
                                        observer_done[b], None, info, cost=0)
                                observer_observation[b] = o
                    else:
                        a = env.action_space().sample()
                        o, r, d, info = env.step(a)
                        r_ply, cost_ply = env.player_reward_shaping(
                            r, env.last_obs_act)
                        assert not d
                        observer_prev_action[b] = a
                        player_prev_reward[b] = r_ply
                        player_done[b] = d
                        if cstep > 0:
                            player_traj_infos[b].step(player_observation[b],
                                player_prev_action[b], player_prev_reward[b],
                                player_done[b], None, info, cost_ply)
                        player_observation[b] = o
    # For action-server samplers.
    if hasattr(self, "observer_step_buffer_np") and \
            self.observer_step_buffer_np is not None:
        self.observer_step_buffer_np.observation[:] = observer_observation
        self.observer_step_buffer_np.action[:] = observer_prev_action
        self.observer_step_buffer_np.reward[:] = observer_prev_reward
    if hasattr(self, "player_step_buffer_np") and \
            self.player_step_buffer_np is not None:
        self.player_step_buffer_np.observation[:] = player_observation
        self.player_step_buffer_np.action[:] = player_prev_action
        self.player_step_buffer_np.reward[:] = player_prev_reward
    return (AgentInputs(player_observation, player_prev_action,
        player_prev_reward), player_traj_infos,
        AgentInputs(observer_observation, observer_prev_action,
        observer_prev_reward), observer_traj_infos)
def collect_evaluation(self, itr, max_episodes=1):
    assert len(self.envs) == 1, ('qec eval collector needs max 1 env, '
        'otherwise evaluation will be biased')
    traj_infos = [self.TrajInfoCls() for _ in range(len(self.envs))]
    observations = list()
    for env in self.envs:
        observations.append(env.reset())
    observation = buffer_from_example(observations[0], len(self.envs))
    for b, o in enumerate(observations):
        observation[b] = o
    action = buffer_from_example(self.envs[0].action_space.null_value(),
        len(self.envs))
    reward = np.zeros(len(self.envs), dtype="float32")
    obs_pyt, act_pyt, rew_pyt = torchify_buffer(
        (observation, action, reward))
    self.agent.reset()
    self.agent.eval_mode(itr)
    num_completed_episodes = 0
    for t in range(self.max_T):
        act_pyt, agent_info = self.agent.step(obs_pyt, act_pyt, rew_pyt)
        action = numpify_buffer(act_pyt)
        static_decoder_inputs = []
        correct_labels = []
        env_infos = []
        done = []
        for b, env in enumerate(self.envs):
            o, r, d, env_info = env.step(action[b])
            done.append(d)
            observation[b] = o
            reward[b] = r
            env_infos.append(env_info)
            static_decoder_inputs.append(env_info.static_decoder_input)
            correct_labels.append(env_info.correct_label)
        static_decoder_inputs = np.stack(static_decoder_inputs)
        correct_labels = np.stack(correct_labels)
        label_prediction = np.argmax(
            self.static_decoder(static_decoder_inputs),
            axis=-1).squeeze(axis=1)
        done = label_prediction != correct_labels
        for b, env in enumerate(self.envs):
            traj_infos[b].step(observation[b], action[b], reward[b], done[b],
                agent_info[b], env_infos[b])
            if getattr(env_infos[b], "traj_done", done[b]):
                self.traj_infos_queue.put(
                    traj_infos[b].terminate(observation[b]))
                traj_infos[b] = self.TrajInfoCls()
                observation[b] = env.reset()
            if done[b]:
                action[b] = 0  # Next prev_action.
                reward[b] = 0
                self.agent.reset_one(idx=b)
                num_completed_episodes += 1
        if num_completed_episodes >= max_episodes:
            print('reached max episodes')
            break
        if self.sync.stop_eval.value:
            print('sync stop')
            break
    self.traj_infos_queue.put(None)  # End sentinel.