def make_env(args):
    symbolic = args.env_kwargs['observation_mode'] != 'cam_rgb'
    args.encoder_type = 'identity' if symbolic else 'pixel'
    env = Env(args.env_name, symbolic, args.seed, 200, 1, 8, args.im_size,
              env_kwargs=args.env_kwargs, normalize_observation=False,
              scale_reward=args.scale_reward, clip_obs=args.clip_obs)
    env.seed(args.seed)
    return env
def generate_env_state(env_name):
    kwargs = env_arg_dict[env_name]
    kwargs['headless'] = True
    kwargs['use_cached_states'] = False
    kwargs['num_variations'] = 1000
    kwargs['save_cached_states'] = True
    # Env wrapper
    env = Env(env_name, False, 100, 200, 1, 8, 128, kwargs)
    return env
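# A hedged helper sketch: pre-generating cached states for every environment registered
# in `env_arg_dict`. Looping over all entries and calling `env.close()` afterwards is an
# assumption, not part of the original code.
def generate_all_env_states():
    for env_name in env_arg_dict:
        env = generate_env_state(env_name)
        env.close()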
def interact(env: Env, agent: Agent, start_obs: Arrayable) -> Tuple[array, array, array]:
    """One step interaction between env and agent.

    :args env: environment
    :args agent: agent
    :args start_obs: initial observation
    :return: (next observation, reward, terminal?)
    """
    action = agent.step(start_obs)
    next_obs, reward, done, information = env.step(action)
    time_limit = information['time_limit'] if 'time_limit' in information else None
    agent.observe(next_obs, reward, done, time_limit)
    return next_obs, reward, done
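# A minimal rollout sketch built on `interact`; the fixed step budget and the way the
# environment/agent are obtained are assumptions, not part of the original code.
def rollout_sketch(env: Env, agent: Agent, nb_steps: int = 100) -> float:
    """Run the agent for a fixed number of steps and return the summed reward."""
    obs = env.reset()
    rewards = []
    for _ in range(nb_steps):
        obs, reward, done = interact(env, agent, obs)
        rewards.append(reward)
        if done:
            obs = env.reset()
    return float(np.sum(rewards))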
def main(policy_file, seed, n_test_rollouts, render, exploit, record_video):
    if torch.cuda.is_available():
        device = torch.device('cuda:1')
        torch.cuda.manual_seed(seed)
    else:
        device = torch.device('cpu')
    np.random.seed(seed=seed)

    json_file = os.path.join(os.path.dirname(policy_file), 'variant.json')
    print('Load variants from {}'.format(json_file))
    with open(json_file) as f:
        vv = json.load(f)
    vv['env_kwargs']['headless'] = 1 - render
    vv['saved_models'] = policy_file

    env = Env(vv['env_name'], vv['symbolic_env'], vv['seed'], vv['max_episode_length'],
              vv['action_repeat'], vv['bit_depth'], env_kwargs=vv['env_kwargs'])
    agent = PlaNetAgent(env, vv, device)

    all_rewards = []
    agent.set_model_eval()
    with torch.no_grad():
        for i in range(n_test_rollouts):
            observation, total_reward = agent.env.reset(), 0
            belief = torch.zeros(1, vv['belief_size'], device=device)
            posterior_state = torch.zeros(1, vv['state_size'], device=device)
            action = torch.zeros(1, env.action_size, device=device)
            for t in range(vv['env_kwargs']['horizon']):
                belief, posterior_state, action, next_observation, reward, done = \
                    agent.update_belief_and_act(agent.env, belief, posterior_state, action,
                                                observation.to(device=agent.device),
                                                explore=(exploit != 0))
                total_reward += reward
                observation = next_observation
                if done:
                    break
            print('episode: {}, total reward: {}'.format(i, total_reward))
            all_rewards.append(total_reward)
    print('Average total reward:', np.mean(np.array(all_rewards)))
def run_task(arg_vv, log_dir, exp_name):
    if arg_vv['algorithm'] == 'planet':
        from planet.config import DEFAULT_PARAMS
    elif arg_vv['algorithm'] == 'dreamer':
        from dreamer.config import DEFAULT_PARAMS
    else:
        raise NotImplementedError

    vv = DEFAULT_PARAMS
    vv.update(**arg_vv)
    vv = update_env_kwargs(vv)
    vv['max_episode_length'] = vv['env_kwargs']['horizon']

    # Configure logger
    logger.configure(dir=log_dir, exp_name=exp_name)
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Configure torch
    if torch.cuda.is_available():
        device = torch.device('cuda:1') if torch.cuda.device_count() > 1 else torch.device('cuda:0')
        torch.cuda.manual_seed(vv['seed'])
    else:
        device = torch.device('cpu')

    # Dump parameters
    with open(osp.join(logger.get_dir(), 'variant.json'), 'w') as f:
        json.dump(vv, f, indent=2, sort_keys=True)

    env = Env(vv['env_name'], vv['symbolic_env'], vv['seed'], vv['max_episode_length'],
              vv['action_repeat'], vv['bit_depth'], vv['image_dim'], env_kwargs=vv['env_kwargs'])

    if vv['algorithm'] == 'planet':
        from planet.planet_agent import PlaNetAgent
        agent = PlaNetAgent(env, vv, device)
        agent.train(train_epoch=vv['train_epoch'])
        env.close()
    elif vv['algorithm'] == 'dreamer':
        from dreamer.dreamer_agent import DreamerAgent
        agent = DreamerAgent(env, vv, device)
        agent.train(train_episode=vv['train_episode'])
        env.close()
def main(args):
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)
    args.__dict__ = update_env_kwargs(args.__dict__)  # Update env_kwargs

    symbolic = args.env_kwargs['observation_mode'] != 'cam_rgb'
    args.encoder_type = 'identity' if symbolic else 'pixel'
    env = Env(args.env_name, symbolic, args.seed, 200, 1, 8, args.pre_transform_image_size,
              env_kwargs=args.env_kwargs, normalize_observation=False,
              scale_reward=args.scale_reward, clip_obs=args.clip_obs)
    env.seed(args.seed)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    args.work_dir = logger.get_dir()
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape
    if args.encoder_type == 'pixel':
        obs_shape = (3, args.image_size, args.image_size)
        pre_aug_obs_shape = (3, args.pre_transform_image_size, args.pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape, action_shape=action_shape, args=args, device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb, chester_logger=logger)

    episode, episode_reward, done, ep_info = 0, 0, True, []
    start_time = time.time()
    for step in range(args.num_train_steps):
        # evaluate agent periodically
        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, video_dir, args.num_eval_episodes, L, step, args)
            if args.save_model and (step % (args.eval_freq * 5) == 0):
                agent.save(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    for key, val in get_info_stats([ep_info]).items():
                        L.log('train/info_' + key, val, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)
            obs = env.reset()
            done = False
            ep_info = []
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, info = env.step(action)
        ep_info.append(info)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env.horizon else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
def evaluate(dt: float,
             epoch: int,
             env: Env,
             agent: Agent,
             eval_gap: float,  # noqa: C901
             time_limit: Optional[float] = None,
             eval_return: bool = False,
             progress_bar: bool = False,
             video: bool = False,
             no_log: bool = False,
             test: bool = False,
             eval_policy: bool = True) -> Optional[float]:
    """Evaluate agent in environment.

    :args dt: time discretization
    :args epoch: index of the current epoch
    :args env: environment
    :args agent: interacting agent
    :args eval_gap: number of normalized epochs (epochs divided by dt) between training steps
    :args time_limit: maximal physical time (number of steps divided by dt) spent in the environment
    :args eval_return: do we evaluate the return (in addition to the specific evaluation)?
    :args progress_bar: use a progress bar?
    :args no_log: do we skip logging results?
    :args video: log a video of the interaction?
    :args test: log to a different test summary
    :args eval_policy: if the exploitation policy is noisy, remove the noise before evaluating
    :return: the evaluated return, or None if no return is evaluated
    """
    log_gap = int(eval_gap / dt)
    agent.eval()
    if not eval_policy and isinstance(agent, OnlineAgent):
        agent.noisy_eval()
    agent.reset()
    R = None
    if eval_return:
        rewards, dones = [], []
        imgs = []
        time_limit = time_limit if time_limit else 10
        nb_steps = int(time_limit / dt)
        info(f"eval> evaluating on a physical time {time_limit}"
             f" ({nb_steps} steps in total)")
        obs = env.reset()
        iter_range = tqdm(range(nb_steps)) if progress_bar else range(nb_steps)
        for _ in iter_range:
            obs, reward, done = interact(env, agent, obs)
            rewards.append(reward)
            dones.append(done)
            if video:
                imgs.append(env.render(mode='rgb_array'))
        R = compute_return(np.stack(rewards, axis=0), np.stack(dones, axis=0))
        tag = "noisy" if not eval_policy else ""
        info(f"eval> At epoch {epoch}, {tag} return: {R}")
        if not no_log:
            if not eval_policy:
                log("Return_noisy", R, epoch)
            elif not video:  # don't log when outputting video
                if not test:
                    log("Return", R, epoch)
                else:
                    log("Return_test", R, epoch)
            if video:
                log_video("demo", epoch, np.stack(imgs, axis=0))
    if not no_log:
        specific_evaluation(epoch, log_gap, dt, env, agent)
    return R
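# A hedged sketch of driving `evaluate` from an outer loop; `nb_epochs` and the omitted
# per-epoch training step are assumptions, not part of the original code.
def evaluation_loop_sketch(dt: float, env: Env, agent: Agent, eval_gap: float, nb_epochs: int) -> None:
    log_gap = max(int(eval_gap / dt), 1)
    for epoch in range(nb_epochs):
        # ... one epoch of training would happen here ...
        if epoch % log_gap == 0:
            R = evaluate(dt, epoch, env, agent, eval_gap,
                         time_limit=10., eval_return=True, progress_bar=False)
            if R is not None:
                info(f"train> epoch {epoch}, evaluated return {R}")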
def full_render(self, mode='human'):
    for remote in self.remotes:
        remote.send(('render', None))
    imgs = [remote.recv() for remote in self.remotes]
    bigimg = tile_images(imgs)
    if mode == 'human':
        import cv2
        cv2.imshow('vecenv', bigimg[:, :, ::-1])
        cv2.waitKey(1)
    elif mode == 'rgb_array':
        return bigimg
    else:
        raise NotImplementedError


Env.register(SubprocVecEnv)


def VEnv(envs):
    if len(envs) == 1:
        return SingleVecEnv(envs)
    else:
        return SubprocVecEnv(envs)


if __name__ == '__main__':
    from envs.pusher import DiscretePusherEnv
    nenvs = 64
    envs = [DiscretePusherEnv() for _ in range(nenvs)]
    vec_env = SubprocVecEnv(envs)
def main():
    load_path = [
        # 'data/corl_data/0717_planet_water/0717_planet_water_2020_07_17_03_05_41_0002',  # PourWater
        'data/corl_data/0716_planet_cloth/0716_planet_cloth_2020_07_16_18_13_13_0004/',  # ClothFlatten
        # './data/corl_data/0723-planet-PassWater/0723-planet-PassWater_2020_07_23_03_11_22_0003',
        # './data/corl_data/0724-planet-TransportTorus/0724-planet-TransportTorus_2020_07_24_03_04_09_0002/',
        # './data/corl_data/0722_planet_rigid_cloth_fold/0722_planet_rigid_cloth_fold_2020_07_22_22_37_24_0003/',  # Rigid Cloth Fold
        # 'data/corl_data/0719_planet_cloth_fold/0719_planet_cloth_fold_2020_07_19_02_35_15_0002',  # Cloth Fold
        # './data/corl_data/0717_planet_rigid_cloth/0717_planet_rigid_cloth_2020_07_17_21_32_45_0001',  # Rigid Cloth Drop
    ]
    seed = 0
    n_test_rollouts = 8
    render = 0
    save_dir = 'data/planet_open_loop_predictions'

    for path in load_path:
        policy_file = osp.join(path, 'models_550.pth')
        if torch.cuda.is_available():
            device = torch.device('cuda:0')
            torch.cuda.manual_seed(seed)
        else:
            device = torch.device('cpu')
        np.random.seed(seed=seed)

        json_file = os.path.join(os.path.dirname(policy_file), 'variant.json')
        print('Load variants from {}'.format(json_file))
        with open(json_file) as f:
            vv = json.load(f)
        vv['env_kwargs']['headless'] = 1
        vv['saved_models'] = policy_file

        env = Env(vv['env_name'], vv['symbolic_env'], vv['seed'], vv['max_episode_length'],
                  vv['action_repeat'], vv['bit_depth'], vv['image_dim'], env_kwargs=vv['env_kwargs'])
        agent = PlaNetAgent(env, vv, device)

        all_rewards, all_frames, all_frames_reconstr = [], [], []
        agent.set_model_eval()
        with torch.no_grad():
            for i in range(n_test_rollouts):
                observation, total_reward = agent.env.reset(), 0
                belief = torch.zeros(1, vv['belief_size'], device=device)
                posterior_state = torch.zeros(1, vv['state_size'], device=device)
                action = torch.zeros(1, env.action_size, device=device)
                initial_belief, initial_posterior = belief.clone(), posterior_state.clone()
                initial_observation = observation.clone()
                recorded_actions = [action]
                frames, frames_reconstr = [observation], [observation]
                for t in range(vv['env_kwargs']['horizon']):
                    belief, posterior_state, action, next_observation, reward, done, info = \
                        agent.update_belief_and_act(agent.env, belief, posterior_state, action,
                                                    observation.to(device=agent.device), explore=False)
                    recorded_actions.append(action)
                    total_reward += reward
                    observation = next_observation
                    frames.append(observation)
                    # frames.extend(info['flex_env_recorded_frames'])
                    if done:
                        break

                # Re-imagine without observation
                belief, state = initial_belief, initial_posterior
                for idx, action in enumerate(recorded_actions):
                    print('idx: ', idx)
                    if idx <= 5:
                        belief, _, _, _, state, _, _ = agent.transition_model(
                            state, action.unsqueeze(dim=0), belief,
                            agent.encoder(frames[idx].to(device=agent.device)).unsqueeze(dim=0))
                    else:
                        belief, state, _, _ = agent.transition_model(posterior_state, action.unsqueeze(dim=0), belief)
                    belief, state = belief.squeeze(dim=0), state.squeeze(dim=0)
                    # print('belief size:', belief.size(), 'state size:', state.size())
                    frames_reconstr.append(agent.observation_model(belief, state).cpu())

                print('episode: {}, total reward: {}'.format(i, total_reward))
                all_rewards.append(total_reward)
                all_frames.append(frames)
                all_frames_reconstr.append(frames_reconstr)

                # Pick key frames
                num_key_frames = 5
                if vv['env_name'] in ['RigidClothDrop', 'ClothDrop']:
                    key_idx = get_spaced_idx(len(frames[:5]), num_key_frames)
                elif vv['env_name'] in ['RigidClothFold', 'ClothFold']:
                    key_idx = get_spaced_idx(len(frames[:15]), num_key_frames)
                else:
                    key_idx = get_spaced_idx(len(frames[:30]), num_key_frames)

                frame = torch.cat([frames[idx] for idx in key_idx], dim=0) + 0.5
                frame_reconstr = torch.cat([frames_reconstr[idx] for idx in key_idx], dim=0) + 0.5
                image_grid = make_grid(torch.cat([frame, frame_reconstr], dim=0), nrow=num_key_frames,
                                       pad_value=0.4706, padding=5)
                save_image(image_grid, osp.join(save_dir, vv['env_name'] + '_{}.png'.format(i)))
                # save_image(torch.as_tensor(frame), osp.join(save_dir, vv['env_name'] + '_gt_{}.png'.format(i)))
                # save_image(torch.as_tensor(frame_reconstr), osp.join(save_dir, vv['env_name'] + '_prediction_{}.png'.format(i)))

        for idx in [0, 4]:
            all_frames_ = all_frames[idx:idx + 4]  # Take four episodes at a time to visualize
            all_frames_reconstr_ = all_frames_reconstr[idx:idx + 4]
            video_frames = []
            for i in range(len(all_frames_[0])):
                frame = torch.cat([x[i] for x in all_frames_])
                frame_reconstr = torch.cat([x[i] for x in all_frames_reconstr_])
                video_frames.append(make_grid(torch.cat([frame, frame_reconstr], dim=3) + 0.5, nrow=4).numpy())
            print(video_frames[0].shape)
            write_video(video_frames, vv['env_name'] + str(idx), save_dir)  # Lossy compression

        print('Average total reward:', np.mean(np.array(all_rewards)))
# Setup
results_dir = os.path.join('results', args.id)
os.makedirs(results_dir, exist_ok=True)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available() and not args.disable_cuda:
    args.device = torch.device('cuda')
    torch.cuda.manual_seed(args.seed)
else:
    args.device = torch.device('cpu')
metrics = {'steps': [], 'episodes': [], 'train_rewards': [], 'test_episodes': [], 'test_rewards': [],
           'observation_loss': [], 'reward_loss': [], 'kl_loss': []}

# Initialise training environment and experience replay memory
env = Env(args.env, args.symbolic_env, args.seed, args.max_episode_length, args.action_repeat, args.bit_depth)
if args.experience_replay != '' and os.path.exists(args.experience_replay):
    D = torch.load(args.experience_replay)
    metrics['steps'], metrics['episodes'] = [D.steps] * D.episodes, list(range(1, D.episodes + 1))
elif not args.test:
    D = ExperienceReplay(args.experience_size, args.symbolic_env, env.observation_size, env.action_size,
                         args.bit_depth, args.device)
    # Initialise dataset D with S random seed episodes
    for s in range(1, args.seed_episodes + 1):
        observation, done, t = env.reset(), False, 0
        while not done:
            action = env.sample_random_action()
            next_observation, reward, done = env.step(action)
            D.append(observation, action, reward, done)
            observation = next_observation
            t += 1