def test_loss():
    """Dreamer loss on a [T, B, ...] batch should return a scalar FloatTensor that carries a gradient."""
    batch_b = 2
    batch_t = 4
    stoch_state_dim = 3
    deter_state_dim = 4
    action_size = 3
    img_size = (3, 64, 64)  # TODO: figure out why atari games have 4 channels.
    dreamer = make_dreamer(action_size)

    # Categorical action tensor.
    action = torch.randint(action_size, (batch_t, batch_b))
    prev_action = torch.randn(batch_t, batch_b, action_size)
    observation = torch.randn(batch_t, batch_b, *img_size)
    env_reward = torch.randn(batch_t, batch_b, 1)
    prev_reward = torch.randn(batch_t, batch_b)
    done = torch.zeros(batch_t, batch_b, dtype=torch.bool)
    env_info = EnvInfo()
    prev_state = make_rssm_state(batch_t, batch_b, stoch_state_dim, deter_state_dim)
    agent_info = DreamerAgentInfo(prev_state=prev_state)
    agent_samples = AgentSamples(action=action, prev_action=prev_action, agent_info=agent_info)
    env_samples = EnvSamples(observation=observation, reward=env_reward, prev_reward=prev_reward,
                             done=done, env_info=env_info)
    samples = Samples(agent=agent_samples, env=env_samples)
    loss = dreamer.loss(samples)

    # Check we have a single-element FloatTensor with a gradient.
    assert isinstance(loss, torch.FloatTensor)
    assert loss.requires_grad
    assert loss.shape == ()

    # Check it still works if we pass in discrete actions.
    num_actions = 6
    dreamer = make_dreamer(num_actions)
    action = torch.randint(0, num_actions, (batch_t, batch_b))
    prev_action = torch.randint(0, num_actions, (batch_t, batch_b))
    agent_samples = AgentSamples(action=action, prev_action=prev_action, agent_info=agent_info)
    env_samples = EnvSamples(observation=observation, reward=env_reward, prev_reward=prev_reward,
                             done=done, env_info=env_info)
    samples = Samples(agent=agent_samples, env=env_samples)
    loss = dreamer.loss(samples)
    assert isinstance(loss, torch.FloatTensor)
    assert loss.requires_grad
    assert loss.shape == ()
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
        agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)"""
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)

    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B), agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B), agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
        agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B), env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B), env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        reward=reward,
        prev_reward=prev_reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
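# A minimal, self-contained sketch (not part of rlpyt) of the aliasing trick used
# above for `action`/`prev_action` and `reward`/`prev_reward`: allocating T + 1
# slots and taking two overlapping views means that writing step t into `action`
# automatically appears as the "previous" action at step t + 1. Shapes here are
# illustrative only.
import numpy as np

T, B = 4, 2
all_action = np.zeros((T + 1, B), dtype=np.int64)
action = all_action[1:]        # view of rows 1..T
prev_action = all_action[:-1]  # view of rows 0..T-1, shares memory with `action`

action[0] = 7                       # write the actions taken at step 0
assert (prev_action[1] == 7).all()  # step 1 already sees them as previous actions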
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
        agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)"""
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)

    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B), agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B), agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        if agent.dual_model:
            bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            int_bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            agent_buffer = AgentSamplesBsvTwin(*agent_buffer, bootstrap_value=bv,
                int_bootstrap_value=int_bv)
        else:
            bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
            agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B), env_shared)  # All-zero arrays (except 0th index, which should equal o_reset).
    next_observation = buffer_from_example(examples["observation"], (T, B), env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B), env_shared)  # All zero values.
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        next_observation=next_observation,
        prev_reward=prev_reward,
        reward=reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    # torchify_buffer links the two: changes to samples_np will reflect in samples_pyt.
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
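# Hedged sketch (not rlpyt's torchify_buffer) of the memory sharing noted in the
# final comment above: torch.from_numpy wraps the numpy storage without copying,
# so a tensor built over the numpy buffer sees every in-place write to it.
import numpy as np
import torch

samples_np = np.zeros((3, 2), dtype=np.float32)
samples_pyt = torch.from_numpy(samples_np)  # shares memory with samples_np

samples_np[0, 0] = 1.5
assert samples_pyt[0, 0].item() == 1.5  # the write is visible through the tensor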
def build_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
        agent_shared=True, env_shared=True, subprocess=True, examples=None):
    """Recommended to step/reset agent and env in subprocess, so it doesn't
    affect settings in master before forking workers (e.g. torch num_threads
    (MKL) may be set at first forward computation.)

    :param agent: an Agent instance.
    :param env: an environment instance.
    :param batch_spec: a BatchSpec instance.
    """
    if examples is None:
        if subprocess:  # Spawn a subprocess.
            mgr = mp.Manager()  # Manager provides objects that can be shared across processes.
            examples = mgr.dict()  # Examples pickled back to master; shared with the subprocess.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))  # Worker process runs the `target` function with `args`.
            w.start()
            w.join()
        else:
            examples = dict()
            # `examples` is filled in-place by get_example_outputs(), so no return value is needed.
            get_example_outputs(agent, env, examples)

    T, B = batch_spec  # Number of time steps and number of environment instances.
    all_action = buffer_from_example(examples["action"], (T + 1, B), agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B), agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        bv = buffer_from_example(examples["agent_info"].value, (1, B), agent_shared)
        agent_buffer = AgentSamplesBsv(*agent_buffer, bootstrap_value=bv)

    observation = buffer_from_example(examples["observation"], (T, B), env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B), env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    env_buffer = EnvSamples(
        observation=observation,
        reward=reward,
        prev_reward=prev_reward,
        done=done,
        env_info=env_info,
    )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
def build_intrinsic_samples_buffer(agent, env, batch_spec, bootstrap_value=False,
        next_obs=False, agent_shared=True, env_shared=True, subprocess=True,
        examples=None):
    """Replaces ``build_samples_buffer`` to add additional buffer space for
    intrinsic bonus agents.

    If bootstrap_value=True, also adds space for int_bootstrap_value from the
    intrinsic value head. If next_obs=True, also adds space for next
    observations (NOTE: this is memory intensive with raw pixel states, as it
    doubles the space needed to store images; keep it False unless the
    algorithm needs it).
    """
    if examples is None:
        if subprocess:
            mgr = mp.Manager()
            examples = mgr.dict()  # Examples pickled back to master.
            w = mp.Process(target=get_example_outputs,
                args=(agent, env, examples, subprocess))
            w.start()
            w.join()
        else:
            examples = dict()
            get_example_outputs(agent, env, examples)

    T, B = batch_spec
    all_action = buffer_from_example(examples["action"], (T + 1, B), agent_shared)
    action = all_action[1:]
    prev_action = all_action[:-1]  # Writing to action will populate prev_action.
    agent_info = buffer_from_example(examples["agent_info"], (T, B), agent_shared)
    agent_buffer = AgentSamples(
        action=action,
        prev_action=prev_action,
        agent_info=agent_info,
    )
    if bootstrap_value:
        # Added buffer space for the intrinsic bootstrap value.
        bv = buffer_from_example(examples["agent_info"].ext_value, (1, B), agent_shared)
        int_bv = buffer_from_example(examples["agent_info"].int_value, (1, B), agent_shared)
        agent_buffer = IntAgentSamplesBsv(*agent_buffer, bootstrap_value=bv,
            int_bootstrap_value=int_bv)

    observation = buffer_from_example(examples["observation"], (T, B), env_shared)
    all_reward = buffer_from_example(examples["reward"], (T + 1, B), env_shared)
    reward = all_reward[1:]
    prev_reward = all_reward[:-1]  # Writing to reward will populate prev_reward.
    done = buffer_from_example(examples["done"], (T, B), env_shared)
    env_info = buffer_from_example(examples["env_info"], (T, B), env_shared)
    if next_obs:
        # Add buffer space for next observations, if specified.
        next_observation = buffer_from_example(examples["observation"], (T, B), env_shared)
        env_buffer = EnvSamplesPlus(
            observation=observation,
            next_observation=next_observation,
            reward=reward,
            prev_reward=prev_reward,
            done=done,
            env_info=env_info,
        )
    else:
        env_buffer = EnvSamples(
            observation=observation,
            reward=reward,
            prev_reward=prev_reward,
            done=done,
            env_info=env_info,
        )
    samples_np = Samples(agent=agent_buffer, env=env_buffer)
    samples_pyt = torchify_buffer(samples_np)
    return samples_pyt, samples_np, examples
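# Rough, hypothetical sketch of what `buffer_from_example(example, (T, B), shared)`
# provides for a plain array: a zeroed buffer whose trailing shape and dtype come
# from the example and whose leading dims are (T, B). rlpyt's real implementation
# additionally recurses into namedarraytuples and can allocate shared memory; the
# helper name and shapes below are illustrative assumptions, not library code.
import numpy as np

def buffer_from_example_sketch(example, leading_dims):
    example = np.asarray(example)
    return np.zeros(leading_dims + example.shape, dtype=example.dtype)

obs_example = np.zeros((3, 64, 64), dtype=np.uint8)
observation = buffer_from_example_sketch(obs_example, (5, 2))  # (T=5, B=2, 3, 64, 64)
assert observation.shape == (5, 2, 3, 64, 64)
assert observation.dtype == np.uint8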