# Rollout/sampling utility variants collected from rllab-style codebases.
# Module-level imports assumed throughout:
import time

import numpy as np

from rllab.misc import tensor_utils

# Other names used below (logger, ProgBarCounter, e2e_tensor_utils, glob_config,
# sh, get_environment, _get_bare_env, ...) come from each snippet's home module.


def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            save_video=True, video_filename='sim_out.mp4', reset_args=None,
            policy_contexts=None):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    images = []
    o = env.reset(reset_args=reset_args, policy_contexts=policy_contexts)
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:  # and not animated:  # TODO testing
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
            if save_video:
                from PIL import Image
                # the (old) mujoco-py viewer returns (data, width, height)
                image = env.wrapped_env.wrapped_env.get_viewer().get_image()
                pil_image = Image.frombytes('RGB', (image[1], image[2]), image[0])
                images.append(np.flipud(np.array(pil_image)))
    if animated:
        if save_video and len(images) >= max_path_length:
            import moviepy.editor as mpy
            clip = mpy.ImageSequenceClip(images, fps=20 * speedup)
            if video_filename[-3:] == 'gif':
                clip.write_gif(video_filename, fps=20 * speedup)
            else:
                clip.write_videofile(video_filename, fps=20 * speedup)
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout(env, agent, max_path_length=10000, animated=False, speedup=1):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        # note: this variant's env has a nonstandard step() that also
        # receives the current observation
        next_o, r, d, env_info = env.step(a, o)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
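# Minimal usage sketch for the canonical rollout above. The env/policy choices
# are illustrative standard rllab components; treat the exact constructor
# arguments as assumptions rather than a prescription.
def _rollout_usage_example():
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.envs.normalized_env import normalize
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

    env = normalize(CartpoleEnv())
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
    path = rollout(env, policy, max_path_length=100)
    # every entry is stacked over time: (T, obs_dim), (T, act_dim), (T,)
    print(path['observations'].shape, path['actions'].shape,
          path['rewards'].shape)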
def rollout_snn(env, agent, max_path_length=np.inf, reset_start_rollout=True,
                switch_lat_every=0, animated=False, speedup=1):
    """
    :param reset_start_rollout: whether to reset the env at the start of every rollout
    :param switch_lat_every: potentially change latents every so many steps
        (by resetting the agent with a forced latent resample)
    """
    from rllab.envs.normalized_env import NormalizedEnv

    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    if reset_start_rollout:
        o = env.reset()
    else:
        # rollout may be called to produce parts of a trajectory: without a
        # reset it continues from the env's current observation, otherwise it
        # would never advance!
        if isinstance(env, NormalizedEnv):
            o = env.wrapped_env.get_current_obs()
        else:
            o = env.get_current_obs()
    agent.reset()  # this resamples a latent in SNNs!
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        if switch_lat_every > 0 and path_length % switch_lat_every == 0:
            agent.reset(force_resample_lat=True)  # forced latent resample
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        # env_infos concatenates all lower-level paths: each element is an
        # np.array of max_path_length x time_steps_agg x corresp_dim, so the
        # next concatenation done by the sampler at the higher level fails
        # because the dimensions are mismatched (axis 1, not 0)!
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout_record(env, agent, max_path_length=np.inf, animated=False, speedup=1):
    from collections import defaultdict

    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    # environment descriptions
    descs = defaultdict(list)
    for key, val in env.describe().items():
        descs[key].append(val)
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        # NOTE: this runs describe() even after the environment is done, so
        # describe() must tolerate that.
        # NOTE: this appending is dangerous! It assumes that the values of
        # different keys are always simultaneously present, which might not be
        # true; otherwise values of different keys become misaligned later.
        for key, val in env.describe().items():
            descs[key].append(val)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
    results = dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos))
    for key, val in descs.items():
        # keys ending with _varlist should be concatenated, not stacked
        if len(key) > 8 and key[-8:] == '_varlist':
            results[key[:-8]] = np.concatenate(val, axis=0)
        else:
            results[key] = tensor_utils.stack_tensor_list(val)
    return results
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=True):
    violation_cost = 0
    boundary_violation_cost = 0
    observations = []
    actions = []
    rewards = []
    succ_rewards = []
    succ_rate = 0
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        succ_rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    succ_rate = 1
    episode_return = sum(rewards)  # named to avoid shadowing the built-in sum
    print("Episode Reward : ", episode_return)
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        violation_cost=violation_cost,
        boundary_violation_cost=boundary_violation_cost,
        succ_return=succ_rewards,
        succ_rate=succ_rate,
    )
def rollout_policy(agent, env, max_path_length=200, speedup=1,
                   get_image_observations=False, animated=False):
    """
    Generate a rollout for a given policy. Mostly taken from
    https://github.com/bstadie/third_person_im/blob/master/sandbox/bradly/third_person/algos/cyberpunk_trainer.py#L164
    """
    observations = []
    im_observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    path_length = 0
    while path_length <= max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        o = next_o
        if get_image_observations:
            if not animated:
                pixel_array = env.render(mode="rgb_array")
            else:
                pixel_array = env.render()
            if pixel_array is None and not animated:
                # Not convinced that behaviour works for all environments, so
                # until such a time as I'm convinced of this, drop into a
                # debug shell
                print("Problem! Couldn't get pixels! Dropping into debug shell.")
                import pdb
                pdb.set_trace()
            im_observations.append(pixel_array)
        rewards.append(r)
        if d:
            break
    if animated:
        env.render(close=True)
    im_observations = tensor_utils.stack_tensor_list(im_observations)
    observations = tensor_utils.stack_tensor_list(observations)
    rewards = tensor_utils.stack_tensor_list(rewards)
    return dict(
        observations=observations,
        im_observations=im_observations,
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=rewards,
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    count = 0
    # a cycle of four goal offsets, repeated 500 times
    goal_deltas = [np.array((0.09, 0.0, 0.0)), np.array((0.0, 0.09, 0.0)),
                   np.array((-0.09, 0.0, 0.0)), np.array((0.0, -0.09, 0.0))]
    indexes = np.concatenate([goal_deltas for _ in range(500)])
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        if r == 10.0:
            # on success, shift the goal by the next offset in the cycle
            env.wrapped_env.env.goal = env.wrapped_env.env.goal + indexes[count]
            count = count + 1
            print("newnew: {0} {1}".format(env.wrapped_env.env.goal, path_length))
            print(np.array([
                env.wrapped_env.env.sim.data.qvel[x]
                for x in env.wrapped_env.env._ref_joint_vel_indexes
            ]))
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            env.wrapped_env.env.viewer.viewer.add_marker(
                pos=env.wrapped_env.env.goal,
                size=np.array((0.02, 0.02, 0.02)),
                label='goal',
                rgba=[1, 0, 0, 0.5])
            # timestep = 0.05
            # time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rarl_rollout(env, agent1, agent2, policy_num, policy2_num,
                 max_path_length=np.inf, animated=False, speedup=1,
                 always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent1.reset()
    agent2.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a1, agent1_info = agent1.get_action(o)
        a2, agent2_info = agent2.get_action(o)
        # both agents act simultaneously; the env receives the joint action
        action = {
            'action': np.append(a1, a2),
            'policy_num': policy_num,
            'policy2_num': policy2_num,
        }
        next_o, r, d, env_info = env.step(action)
        observations.append(env.observation_space.flatten(o))
        # record the trajectory from the perspective of the active policy
        if policy_num == 1:
            rewards.append(r)
            actions.append(env.action_space.flatten(a1))
            agent_infos.append(agent1_info)
        else:
            rewards.append(r)
            actions.append(env.action_space.flatten(a2))
            agent_infos.append(agent2_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    # freeze grid / start-goal generation so reset() keeps the current task
    if hasattr(env._wrapped_env, 'generate_grid'):
        env._wrapped_env.generate_grid = False
    if hasattr(env._wrapped_env, 'generate_b0_start_goal'):
        env._wrapped_env.generate_b0_start_goal = False
    o = env.reset()
    # map-conditioned policies also need the env/goal/belief images on reset
    if hasattr(agent, 'prob_network') and hasattr(
            agent.prob_network, '_l_gru') and hasattr(
            agent.prob_network._l_gru, 'map'):
        agent.reset(env._wrapped_env.env_img, env._wrapped_env.goal_img,
                    env._wrapped_env.b0_img)
    else:
        agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False, env_start_state=None,
            agent_start_state=None):
    import warnings

    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    if env_start_state is not None:  # not all envs support init_state
        o = env.reset(init_state=env_start_state)
    else:
        o = env.reset()
    if agent_start_state is not None:
        agent.reset(init_state=agent_start_state)
    else:
        agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        # abort the rollout on inf/nan or exploding actions
        if np.any(np.logical_or(np.logical_or(np.isinf(a), np.isnan(a)),
                                np.abs(a) > 1e3)):
            warnings.warn("Invalid action detected")
            rewards[-1] = -1000.0
            break
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False):
    from rllab.spaces import Box  # needed for the action-rescaling check

    observations = []
    next_observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        try:
            if agent.scale_action:  # check if the action needs to be scaled
                if isinstance(env.action_space, Box):
                    # rescale the action from [-1, 1] to the env bounds
                    lb, ub = env.action_space.bounds
                    a = lb + (a + 1.) * 0.5 * (ub - lb)
                    a = np.clip(a, lb, ub)
        except AttributeError:
            pass
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        next_observations.append(env.observation_space.flatten(next_o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
            print(o, r, a, next_o, d)
        path_length += 1
        if d:
            break
        o = next_o
    if animated and not always_return_paths:
        return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        next_observations=tensor_utils.stack_tensor_list(next_observations),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
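# Numeric check of the rescaling branch above: with bounds lb = -2, ub = 2 and
# a policy action a = 0.5 drawn from [-1, 1],
#     lb + (a + 1) * 0.5 * (ub - lb) = -2 + 1.5 * 0.5 * 4 = 1.0
# i.e. the linear map from [-1, 1] onto [lb, ub], clipped to stay in bounds.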
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False, include_original_frames=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    if include_original_frames:
        bare_env = _get_bare_env(env)
        if not hasattr(bare_env, 'get_original_frames'):
            include_original_frames = False
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    next_o = None
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    result = dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        last_observation=next_o,
    )
    if include_original_frames:
        original_frames = bare_env.get_original_frames()
        if original_frames is not None:
            result['original_frames'] = original_frames
    return result
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            init_state=None, no_action=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    dones = []
    if init_state is not None:
        o = env.reset(init_state)
    else:
        o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        if no_action:
            a = np.zeros_like(a)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        dones.append(d)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render(close=False)
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        dones=np.asarray(dones),
        last_obs=o,
    )
def dec_rollout(env, agents, max_path_length=np.inf, animated=False, speedup=1):
    """Decentralized rollout"""
    n_agents = len(env.agents)
    observations = [[] for _ in range(n_agents)]
    actions = [[] for _ in range(n_agents)]
    rewards = [[] for _ in range(n_agents)]
    agent_infos = [[] for _ in range(n_agents)]
    env_infos = [[] for _ in range(n_agents)]
    olist = env.reset()
    assert len(olist) == n_agents, "{} != {}".format(len(olist), n_agents)
    agents.reset(dones=[True for _ in range(n_agents)])
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        alist, agent_info_list = agents.get_actions(olist)
        agent_info_list = tensor_utils.split_tensor_dict_list(agent_info_list)
        # for each agent
        for i, o in enumerate(olist):
            observations[i].append(env.observation_space.flatten(o))
            actions[i].append(env.action_space.flatten(alist[i]))
            if agent_info_list is None:
                agent_infos[i].append({})
            else:
                agent_infos[i].append(agent_info_list[i])
        next_olist, rlist, d, env_info = env.step(np.asarray(alist))
        for i, r in enumerate(rlist):
            rewards[i].append(r)
            env_infos[i].append(env_info)
        path_length += 1
        if d:
            break
        olist = next_olist
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render()
    return [
        dict(
            observations=tensor_utils.stack_tensor_list(observations[i]),
            actions=tensor_utils.stack_tensor_list(actions[i]),
            rewards=tensor_utils.stack_tensor_list(rewards[i]),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos[i]),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos[i]),
        ) for i in range(n_agents)
    ]
def conc_rollout(env, agents, max_path_length=np.inf, animated=False, speedup=1):
    """Concurrent rollout"""
    n_agents = len(env.agents)
    observations = [[] for _ in range(n_agents)]
    actions = [[] for _ in range(n_agents)]
    rewards = [[] for _ in range(n_agents)]
    agent_infos = [[] for _ in range(n_agents)]
    env_infos = [[] for _ in range(n_agents)]
    olist = env.reset()
    for agent in agents:
        agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        alist = []
        # for each agent
        for i, o in enumerate(olist):
            a, ainfo = agents[i].get_action(o)
            alist.append(a)
            observations[i].append(env.observation_space.flatten(o))
            actions[i].append(env.action_space.flatten(a))
            if ainfo is None:
                agent_infos[i].append({})
            else:
                agent_infos[i].append(ainfo)
        next_olist, rlist, d, env_info = env.step(np.asarray(alist))
        for i, r in enumerate(rlist):
            rewards[i].append(r)
            env_infos[i].append(env_info)
        path_length += 1
        if d:
            break
        olist = next_olist
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render()
    return [
        dict(
            observations=tensor_utils.stack_tensor_list(observations[i]),
            actions=tensor_utils.stack_tensor_list(actions[i]),
            rewards=tensor_utils.stack_tensor_list(rewards[i]),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos[i]),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos[i]),
        ) for i in range(n_agents)
    ]
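# How the per-agent output of dec_rollout / conc_rollout might be consumed.
# `make_multiagent_env` and `policies` are placeholders: any env exposing
# `env.agents` plus list-valued reset()/step() fits the interface assumed above.
#
#     env = make_multiagent_env()      # hypothetical constructor
#     paths = conc_rollout(env, policies, max_path_length=500)
#     per_agent_returns = [p['rewards'].sum() for p in paths]   # one per agent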
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    dones = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        if isinstance(env.observation_space, list):
            # multiple shadow envs: flatten per-env observations and actions
            n = len(env.shadow_envs)
            observations.append([
                env.shadow_envs[i].observation_space.flatten_n(o[i])
                for i in range(n)
            ])
            rewards.append(r)
            actions.append([
                env.shadow_envs[i].action_space.flatten_n(a[i])
                for i in range(n)
            ])
        else:
            observations.append(env.observation_space.flatten(o))
            rewards.append(r)
            actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        dones.append(d)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render(close=True)
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        dones=np.asarray(dones),
        last_obs=o,
    )
def rollout(env, agents, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False):
    ##############################################
    # SEEK AGENT
    seek_observations = []
    seek_actions = []
    seek_rewards = []
    seek_agent_infos = []
    seek_env_infos = []
    obs = env.reset()
    agents['seek'].reset()
    seek_path_length = 0
    if animated:
        env.render()
    while seek_path_length < max_path_length:
        a, agent_info = agents['seek'].get_action(obs)
        if animated:
            env.render()
        obs_next, r, d, env_info = env.step(a)
        seek_observations.append(obs)
        seek_rewards.append(r)
        seek_actions.append(env.action_space.flatten(a))
        seek_agent_infos.append(agent_info)
        seek_env_infos.append(env_info)
        seek_path_length += 1
        obs = obs_next
        if d:
            break
    if animated and not always_return_paths:
        return
    # e2e_tensor_utils and glob_config come from this snippet's home project
    seek_paths = dict(
        observations=e2e_tensor_utils.stack_tensor_list(seek_observations),
        actions=tensor_utils.stack_tensor_list(seek_actions),
        rewards=tensor_utils.stack_tensor_list(seek_rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(seek_agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(seek_env_infos),
    )
    seek_paths['actions'] = seek_paths['actions'].astype(glob_config.dtype)
    seek_paths['rewards'] = seek_paths['rewards'].astype(glob_config.dtype)
    return {'seek': seek_paths}
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            save_video=True, video_filename='sim_out.mp4', reset_arg=None):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    images = []
    o = env.reset(reset_args=reset_arg)
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:  # and not animated:  # TODO testing
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
            if save_video:
                from PIL import Image
                image = env.wrapped_env.wrapped_env.get_viewer().get_image()
                pil_image = Image.frombytes('RGB', (image[1], image[2]), image[0])
                images.append(np.flipud(np.array(pil_image)))
    if animated:
        if save_video and len(images) >= max_path_length:
            import moviepy.editor as mpy
            clip = mpy.ImageSequenceClip(images, fps=20 * speedup)
            if video_filename[-3:] == 'gif':
                clip.write_gif(video_filename, fps=20 * speedup)
            else:
                clip.write_videofile(video_filename, fps=20 * speedup)
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout_tendon(env, agent, always_return_paths=True, render_mode="",
                   save_frames=False, lengths=None, goal=None,
                   tangent_vec_goal=None):
    """Adjusted rollout method; agent is the policy."""
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    # reset the environment to the given starting point and goal, or randomly;
    # the original branched three ways on (lengths, goal, tangent_vec_goal)
    # but always issued the same call, which collapses to this condition
    if goal is not None and (tangent_vec_goal is not None or lengths is None):
        o = env._wrapped_env.reset(lengths, goal, tangent_vec_goal)
    else:
        o = env.reset()
    agent.reset()
    if render_mode:
        env.render(mode=render_mode, save_frames=save_frames)
    while True:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        if render_mode:
            env.render(mode=render_mode, save_frames=save_frames)
        if d:
            if render_mode:
                env.render(mode=render_mode, save_frames=save_frames)
            # also append the terminal observation, and only the terminal info
            observations.append(env.observation_space.flatten(next_o))
            env_infos.append(env_info)
            break
        o = next_o
    if not always_return_paths:
        return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout(env, pi_low, pi_high, tx=700, ty=0, max_path_length=np.inf,
            animated=False, speedup=1, always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    x, y, z = env.robot.body_xyz
    roll, pitch, yaw = env.robot.body_rpy  # renamed from r/p to avoid shadowing the reward
    target_theta = np.arctan2(ty - y, tx - x)
    angle_to_target = target_theta - yaw
    print('direction: ', o[0], o[1])
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        # the high-level policy picks a heading from the current and target
        # positions; the low-level policy gets the observation with its
        # direction features overwritten (assumption: the first two entries
        # of the observation encode the heading)
        x, y, z = env.robot.body_xyz  # refresh the position each step
        a_high = pi_high([x, y, tx, ty])
        feed_o = np.array(o, copy=True)  # was undefined in the original
        feed_o[0] = np.cos(a_high)  # set direction
        feed_o[1] = np.sin(a_high)
        a, agent_info = pi_low.get_action(feed_o)  # was `agent`, undefined here
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout_hide(env, agents, mode, max_path_length=np.inf, init_state=None,
                 init_goal=None):
    # reset the model configuration
    env.reset()
    obs = env.reload_model(pose=init_state, goal=init_goal)
    hide_observations = []
    hide_states = []
    hide_actions = []
    hide_rewards = []
    hide_agent_infos = []
    hide_env_infos = []
    print("rollout : HIDE")
    agents['hide'].reset()
    hide_path_length = 0
    while hide_path_length < max_path_length:
        env.render()
        a, agent_info = agents['hide'].get_action(obs)
        hide_states.append(env.unwrapped.get_all_pose())
        obs_next, r, d, env_info = env.step(a)
        hide_observations.append(obs)
        hide_rewards.append(r)
        hide_actions.append(a)
        hide_agent_infos.append(agent_info)
        hide_env_infos.append(env_info)
        hide_path_length += 1
        obs = obs_next
        if d:
            break
    hide_paths = dict(
        observations=e2e_tensor_utils.stack_tensor_list(hide_observations),
        actions=tensor_utils.stack_tensor_list(hide_actions),
        rewards=tensor_utils.stack_tensor_list(hide_rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(hide_agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(hide_env_infos),
        states=hide_states,
    )
    print('Episode done:', hide_path_length)
    return hide_paths
def collect_trajs_for_cost(self, n_trajs, pol, env, dom, cls):
    from tqdm import trange

    paths = []
    for iter_step in trange(0, n_trajs):
        paths.append(
            self.cyberpunk_rollout(agent=pol, env=env,
                                   max_path_length=self.horizon,
                                   reward_extractor=None))
    # stack image observations to (n_trajs, horizon, H, W, C) and tile the
    # class/domain labels to match
    data_matrix = tensor_utils.stack_tensor_list(
        [p['im_observations'] for p in paths])
    class_matrix = np.tile(cls, (n_trajs, self.horizon, 1))
    dom_matrix = np.tile(dom, (n_trajs, self.horizon, 1))
    return dict(data=data_matrix, classes=class_matrix, domains=dom_matrix)
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False, action_noise=0.0):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a_sampled, agent_info = agent.get_action(o)
        # use the distribution mean for evaluation, plus optional Gaussian noise;
        # `a = a + ...` (not `+=`) so the array stored in agent_info is not
        # mutated in place
        a = agent_info['mean']
        a = a + np.random.randn(len(a)) * action_noise
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            # no slow motion:
            # time.sleep(0.05 / speedup)
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            # timestep = 0.05
            # time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            env.wrapped_env.env.viewer.viewer.add_marker(
                pos=env.wrapped_env.env.goal,
                size=np.array((0.02, 0.02, 0.02)),
                label='goal',
                rgba=[1, 0, 0, 0.5])
            # timestep = 0.05
            # time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def reset():
    sh.agent.reset()
    if sh.observations is not None:
        path = dict(
            observations=tensor_utils.stack_tensor_list(sh.observations),
            actions=tensor_utils.stack_tensor_list(sh.actions),
            rewards=tensor_utils.stack_tensor_list(sh.rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(sh.agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(sh.env_infos),
        )
        sh.paths.append(path)
        sh.count += len(sh.observations)
        # check if it is time to update
        if sh.count > sh.algor.batch_size:
            itr = sh.itera
            with logger.prefix('itr #%d | ' % itr):
                paths = sh.paths
                samples_data = sh.algor.sampler.process_samples(itr, paths)
                sh.algor.log_diagnostics(paths)
                sh.algor.optimize_policy(itr, samples_data)
                logger.log("saving snapshot...")
                params = sh.algor.get_itr_snapshot(itr, samples_data)
                sh.algor.current_itr = itr + 1
                params["algo"] = sh.algor
                if sh.algor.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("saved")
                logger.dump_tabular(with_prefix=False)
                sh.paths = []
                if sh.algor.plot:
                    sh.algor.update_plot()
                    if sh.algor.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")  # raw_input under Python 2
            sh.itera += 1
            sh.count = 0
    # reset arrays
    sh.observations, sh.actions, sh.rewards, sh.agent_infos, sh.env_infos = \
        [], [], [], [], []
def cent_rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1):
    """Centralized rollout"""
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        if isinstance(r, (list, np.ndarray)):
            # per-agent rewards must all be equal in the centralized case
            assert (r == r[0]).all()
            r = r[0]
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render()
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def gif_rollout(env, agent, max_path_length=np.inf, save_gif=False, **kwargs):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    initial_simparams = env.initial_simparams
    # agent.reset()  # skipped in this variant
    path_length = 0
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(o.reshape(-1))
        rewards.append(r)
        actions.append(a.reshape(-1))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
    if save_gif:
        actions = [
            np.clip(action, *env.j.action_space_bounds(initial_simparams))
            for action in actions
        ]
        env.save_gif(initial_simparams, np.column_stack(actions),
                     kwargs['filename'])
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def collect_trajs_for_cost(self, n_trajs, pol, env, cls):
    paths = []
    for iter_step in range(0, n_trajs):
        paths.append(
            self.cyberpunk_rollout(agent=pol, env=env,
                                   max_path_length=self.horizon,
                                   reward_extractor=None))
    data_matrix = tensor_utils.stack_tensor_list(
        [p['im_observations'] for p in paths])
    class_matrix = np.tile(cls, (n_trajs, self.horizon, 1))
    return dict(data=data_matrix, classes=class_matrix)
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated:
        env.render(close=True)
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def rollout(env, agent, max_path_length=np.inf, speedup=1, save_video=False,
            video_filename='sim_out.mp4', reset_arg=3, renderMode='human',
            return_images=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    images = []
    o = env.reset(reset_args=reset_arg)
    agent.reset()
    path_length = 0
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        if return_images or save_video:
            images.append(env.render(renderMode))
        path_length += 1
        if d:  # and not animated:  # TODO testing
            break
        o = next_o
    if save_video:
        import moviepy.editor as mpy
        clip = mpy.ImageSequenceClip(images, fps=20 * speedup)
        if video_filename[-3:] == 'gif':
            clip.write_gif(video_filename, fps=20 * speedup)
        else:
            clip.write_videofile(video_filename, fps=20 * speedup)
    result = dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
    if return_images:
        result['images'] = np.array(images)
    return result
def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1,
            always_return_paths=False):
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)
    if animated and not always_return_paths:
        return
    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def sample_paths(N, policy, baseline, env_mode='train', T=1e6, gamma=1,
                 normalized_env=False, env=None):
    # directly specifying env works only when sampling in series
    np.random.seed()  # reseed (needed for multiprocessing)
    if env is None:
        env = get_environment(env_mode)
    T = min(T, env.horizon)
    T = max(1, T)
    # sometimes env is not initialized correctly in multiprocessing;
    # this is just a sanity check and the step size should essentially be zero
    print("####### Worker started #######")
    paths = []
    for ep in range(N):
        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []
        qpos = []
        qvel = []
        o = env.reset()
        if normalized_env:
            qpos.append(env.wrapped_env.env.model.data.qpos.reshape(-1))
            qvel.append(env.wrapped_env.env.model.data.qvel.reshape(-1))
        else:
            qpos.append(env.env.model.data.qpos.reshape(-1))
            qvel.append(env.env.model.data.qvel.reshape(-1))
        done = False
        t = 0
        while t < T and not done:
            a, agent_info = policy.get_action(o)
            next_o, r, done, env_info = env.step(a)
            observations.append(env.observation_space.flatten(o))
            actions.append(env.action_space.flatten(a))
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            if normalized_env:
                qpos.append(env.wrapped_env.env.model.data.qpos.reshape(-1))
                qvel.append(env.wrapped_env.env.model.data.qvel.reshape(-1))
            else:
                qpos.append(env.env.model.data.qpos.reshape(-1))
                qvel.append(env.env.model.data.qvel.reshape(-1))
            o = next_o
            t += 1
        # make a path dictionary; also store the belief used in the trajectory
        try:
            path_belief = env.env.belief
        except Exception as e:
            path_belief = str(e)
        path = dict(
            observations=tensor_utils.stack_tensor_list(observations),
            actions=tensor_utils.stack_tensor_list(actions),
            rewards=tensor_utils.stack_tensor_list(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            qpos=tensor_utils.stack_tensor_list(qpos),
            qvel=tensor_utils.stack_tensor_list(qvel),
            # path_belief=path_belief,
        )
        # TODO: storing the path model is too space-inefficient;
        # need to find an alternative
        # compute returns and advantages using the path
        path_baseline = baseline.predict(path)
        advantages = []
        returns = []
        return_so_far = 0
        for t in range(len(rewards) - 1, -1, -1):
            return_so_far = rewards[t] + gamma * return_so_far
            returns.append(return_so_far)
            advantage = return_so_far - path_baseline[t]
            advantages.append(advantage)
        # advantages and returns were accumulated backward in time
        advantages = np.array(advantages[::-1])
        returns = np.array(returns[::-1])
        # normalize advantages
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
        path["advantages"] = advantages
        path["returns"] = returns
        paths.append(path)
    print("====== Worker finished ======")
    return paths
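# Tiny numeric check of the return/advantage recursion in sample_paths above,
# with assumed values gamma = 0.9, rewards = [1, 1, 1], and a zero baseline:
#     returns    = [1 + 0.9*(1 + 0.9*1), 1 + 0.9*1, 1] = [2.71, 1.9, 1.0]
#     advantages = returns - baseline = [2.71, 1.9, 1.0] before normalization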
def obtain_samples(self, itr):
    import itertools
    import time

    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    while n_samples < self.algo.batch_size:
        t = time.time()
        self.algo.policy.reset(dones)
        actions, agent_infos = self.algo.policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)
    return paths
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
    # reset_args: arguments to pass to the environments on reset
    # return_dict: whether to return paths as a dict keyed by env index, or as
    # a flat list
    import itertools
    import time

    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = {}
    for i in range(self.vec_env.num_envs):
        paths[i] = []
    # if reset_args is not a list/ndarray, use the same args for every env
    if reset_args is not None and not isinstance(reset_args, (list, np.ndarray)):
        reset_args = [reset_args] * self.vec_env.num_envs
    n_samples = 0
    obses = self.vec_env.reset(reset_args)
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions, reset_args)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths[idx].append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
    logger.record_tabular(log_prefix + "EnvExecTime", env_time)
    logger.record_tabular(log_prefix + "ProcessExecTime", process_time)
    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())
    return paths
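# Sketch of post-processing the paths returned by the obtain_samples variants
# above; the sampler/algo wiring mirrors the reset() helper earlier in this
# file, but treat it as an assumption rather than any one repo's exact API.
#
#     paths = sampler.obtain_samples(itr)                      # list of path dicts
#     total_steps = sum(len(p['rewards']) for p in paths)
#     average_return = np.mean([p['rewards'].sum() for p in paths])
#     samples_data = algo.sampler.process_samples(itr, paths)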