def make_env(cfg): """Helper function to create dm_control environment""" if cfg.env == 'ball_in_cup_catch': domain_name = 'ball_in_cup' task_name = 'catch' elif cfg.env == 'point_mass_easy': domain_name = 'point_mass' task_name = 'easy' else: domain_name = cfg.env.split('_')[0] task_name = '_'.join(cfg.env.split('_')[1:]) # per dreamer: https://github.com/danijar/dreamer/blob/02f0210f5991c7710826ca7881f19c64a012290c/wrappers.py#L26 camera_id = 2 if domain_name == 'quadruped' else 0 env = dmc2gym.make(domain_name=domain_name, task_name=task_name, seed=cfg.seed, visualize_reward=False, from_pixels=True, height=cfg.image_size, width=cfg.image_size, frame_skip=cfg.action_repeat, camera_id=camera_id) env = utils.FrameStack(env, k=cfg.frame_stack) env.seed(cfg.seed) assert env.action_space.low.min() >= -1 assert env.action_space.high.max() <= 1 return env
def make_env(cfg): """Helper function to create dm_control environment""" if cfg.env == 'ball_in_cup_catch': domain_name = 'ball_in_cup' task_name = 'catch' elif cfg.env == 'point_mass_easy': domain_name = 'point_mass' task_name = 'easy' elif cfg.env == 'cartpole_two_poles': domain_name = 'cartpole' task_name = 'two_poles' elif cfg.env == 'cartpole_three_poles': domain_name = 'cartpole' task_name = 'three_poles' else: domain_name = cfg.env.split('_')[0] task_name = '_'.join(cfg.env.split('_')[1:]) # per dreamer: https://github.com/danijar/dreamer/blob/02f0210f5991c7710826ca7881f19c64a012290c/wrappers.py#L26 camera_id = 2 if domain_name == 'quadruped' else 0 env = dmc2gym.make(domain_name=domain_name, task_name=task_name, seed=cfg.seed, visualize_reward=False, from_pixels=True, height=cfg.image_size, width=cfg.image_size, frame_skip=cfg.action_repeat, camera_id=camera_id) # env = dmc2gym_noisy.make( # domain_name=domain_name, # task_name=task_name, # resource_files='../../../../../experiments/distractors/images/*.mp4', # img_source='video', # total_frames=10000, # seed=cfg.seed, # visualize_reward=False, # from_pixels=True, # height=84, # width=84, # frame_skip=cfg.action_repeat, # camera_id=camera_id # ) env = utils.FrameStack(env, k=cfg.frame_stack) env.seed(cfg.seed) assert env.action_space.low.min() >= -1 assert env.action_space.high.max() <= 1 return env
def make_env(cfg): """Helper function to create dm_control environment""" if cfg.env == 'ball_in_cup_catch': domain_name = 'ball_in_cup' task_name = 'catch' elif cfg.env == 'point_mass_easy': domain_name = 'point_mass' task_name = 'easy' else: domain_name = cfg.env.split('_')[0] task_name = '_'.join(cfg.env.split('_')[1:]) # per dreamer: https://github.com/danijar/dreamer/blob/02f0210f5991c7710826ca7881f19c64a012290c/wrappers.py#L26 camera_id = 2 if domain_name == 'quadruped' else 0 # env = dmc2gym.make(domain_name=domain_name, # task_name=task_name, # seed=cfg.seed, # visualize_reward=False, # from_pixels=True, # height=cfg.image_size, # width=cfg.image_size, # frame_skip=cfg.action_repeat, # camera_id=camera_id) # env = gym.make("CarRacing-v0") env_ = gym_tetris.make('TetrisA-v0') env = JoypadSpace(env_, SIMPLE_MOVEMENT) # env = MaxAndSkipEnv(env) # env._max_episode_steps = env_._max_episode_steps max_episode_steps = 10000 env = WrapPyTorch(env, max_episode_steps) env.seed(cfg.seed) # print(env.ram) obs = env.reset() print(obs.shape) # env.seed(cfg.seed) env = utils.FrameStack(env, k=cfg.frame_stack) print("Init env done") # assert env.action_space.low.min() >= -1 # assert env.action_space.high.max() <= 1 return env
def make_env(cfg, eval=False):
    """Helper function to create dm_control environment"""
    if cfg.env == "ball_in_cup_catch":
        domain_name = "ball_in_cup"
        task_name = "catch"
    elif cfg.env == "point_mass_easy":
        domain_name = "point_mass"
        task_name = "easy"
    else:
        domain_name = cfg.env.split("_")[0]
        task_name = "_".join(cfg.env.split("_")[1:])

    # per dreamer: https://github.com/danijar/dreamer/blob/02f0210f5991c7710826ca7881f19c64a012290c/wrappers.py#L26
    camera_id = 2 if domain_name == "quadruped" else 0

    if eval:
        seed = cfg.seed + 1004
    else:
        seed = cfg.seed

    env = dmc2gym.make(
        domain_name=domain_name,
        task_name=task_name,
        seed=seed,
        visualize_reward=False,
        from_pixels=True,
        height=cfg.pre_image_size,
        width=cfg.pre_image_size,
        frame_skip=cfg.action_repeat,
        camera_id=camera_id,
    )
    env = utils.FrameStack(env, k=cfg.frame_stack)
    env.seed(seed)
    assert env.action_space.low.min() >= -1
    assert env.action_space.high.max() <= 1
    return env
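# Hypothetical usage sketch for the variant above. The cfg fields are
# assumptions that mirror the attributes make_env reads, not a real config
# schema from any of these projects.
from types import SimpleNamespace

cfg = SimpleNamespace(env='cheetah_run', seed=1, pre_image_size=100,
                      action_repeat=4, frame_stack=3)
train_env = make_env(cfg)             # seeded with cfg.seed
eval_env = make_env(cfg, eval=True)   # seeded with cfg.seed + 1004
obs = train_env.reset()               # stacked pixel frames, shape (3 * k, H, W)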
def __init__(self, cfg):
    self.work_dir = os.getcwd()
    print(f'workspace: {self.work_dir}')

    self.cfg = cfg

    self.logger = Logger(self.work_dir,
                         save_tb=cfg.log_save_tb,
                         log_frequency=cfg.log_frequency_step,
                         agent=cfg.agent.name,
                         action_repeat=cfg.action_repeat)

    utils.set_seed_everywhere(cfg.seed)
    self.device = torch.device(cfg.device)

    gibson_config_filename = os.path.join(
        os.path.dirname(gibson2.__file__),
        '../examples/configs/hand_drawer.yaml')
    self.env = HandDrawerEnv(config_file=gibson_config_filename,
                             mode='headless')
    self.env = utils.FrameStack(self.env, k=cfg.frame_stack)

    cfg.agent.params.obs_shape = self.env.observation_space.shape
    cfg.agent.params.action_shape = self.env.action_space.shape
    cfg.agent.params.action_range = [
        float(self.env.action_space.low.min()),
        float(self.env.action_space.high.max())
    ]
    self.agent = hydra.utils.instantiate(cfg.agent)

    self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                      self.env.action_space.shape,
                                      cfg.replay_buffer_capacity,
                                      self.cfg.image_pad, self.device)

    self.video_recorder = VideoRecorder(
        self.work_dir if cfg.save_video else None)
    self.step = 0
    loss = BCE + KLD
    return loss


env = dmc2gym.make(domain_name=args.domain_name,
                   task_name=args.task_name,
                   resource_files=args.resource_files,
                   img_source=args.img_source,
                   total_frames=10,
                   seed=args.seed,
                   visualize_reward=False,
                   from_pixels=(args.encoder_type == 'pixel'),
                   height=args.image_size,
                   width=args.image_size,
                   frame_skip=args.action_repeat)
env = utils.FrameStack(env, k=args.frame_stack)

vae = VAE(env.observation_space.shape)
train_dataset = torch.load('train_dataset.pt')
optimizer = torch.optim.Adam(vae.parameters(), lr=1e-3)
train_loader = torch.utils.data.DataLoader(train_dataset['obs'],
                                           batch_size=32,
                                           shuffle=True)

# training loop
for i in range(100):
    total_loss = []
    for obs_batch in train_loader:
        optimizer.zero_grad()
        loss = vae.train(obs_batch.to(device).float())
        loss.backward()
        optimizer.step()
        total_loss.append(loss.item())  # accumulate for per-epoch logging
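# The function whose tail appears above returns `BCE + KLD`, but the snippet
# omits how those terms are computed. The sketch below is the standard VAE
# objective those names usually denote (Bernoulli reconstruction likelihood
# plus KL divergence to a unit-Gaussian prior); `vae_loss` and its arguments
# are illustrative, not this project's API.
import torch
import torch.nn.functional as F


def vae_loss(recon_x, x, mu, logvar):
    # reconstruction term: binary cross-entropy summed over pixels
    BCE = F.binary_cross_entropy(recon_x, x, reduction='sum')
    # KL(N(mu, sigma^2) || N(0, 1)), closed form for a diagonal Gaussian
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD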
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    pre_transform_image_size = args.pre_transform_image_size if 'crop' in args.data_augs else args.image_size
    pre_image_size = args.pre_transform_image_size  # record the pre transform image size for translation

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=pre_transform_image_size,
                       width=pre_transform_image_size,
                       frame_skip=args.action_repeat)
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = env_name + '-' + ts + '-im' + str(args.image_size) + '-b' \
        + str(args.batch_size) + '-s' + str(args.seed) + '-' + args.encoder_type
    args.work_dir = args.work_dir + '/' + exp_name
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (3 * args.frame_stack, pre_transform_image_size,
                             pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        pre_image_size=pre_image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    for step in range(args.num_train_steps):
        # evaluate agent periodically
        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs / 255.)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
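# The `done_bool` line above recurs throughout these scripts: a done signal
# raised only because the time limit expired is masked to 0 so the critic
# keeps bootstrapping through it ("infinite bootstrap"). A standalone sketch
# of the idiom; `mask_timeout_done` is an illustrative name, not part of any
# of these codebases.
def mask_timeout_done(done, episode_step, max_episode_steps):
    """Return 0.0 for time-limit terminations, float(done) otherwise."""
    timed_out = (episode_step + 1) == max_episode_steps
    return 0.0 if timed_out else float(done)


# usage: done_bool = mask_timeout_done(done, episode_step, env._max_episode_steps)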
def main():
    args = parse_args()
    utils.set_seed_everywhere(args.seed)

    # Robot stuff
    action_space = ActionSpace.DELTA_EE_POSE_IMPEDANCE
    blocking_action = True
    env = RobotEnv(name='peg_in_hole',
                   simulation=True,
                   action_space=action_space,
                   isotropic_gains=True,
                   render=False,
                   blocking_action=blocking_action,
                   rotation_axis=(0, 0, 1),
                   observation_type=dict(camera=1, q=0, dq=0, tau=0, x=0,
                                         dx=0))

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # the dmc2gym wrapper standardizes actions
    # assert env.action_space.low.min() >= -1
    # assert env.action_space.high.max() <= 1

    replay_buffer = utils.ReplayBuffer(
        obs_shape=env.observation_space['camera'],
        action_shape=env.action_space.shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device)

    agent = make_agent(obs_shape=env.observation_space['camera'],
                       action_shape=env.action_space.shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, prev_episode_reward, done = 0, 0, 0, True
    start_time = time.time()
    for step in range(args.num_train_steps):
        if done:
            if step > 0:
                L.log('train/duration', time.time() - start_time, step)
                start_time = time.time()
                L.dump(step)

            # evaluate agent periodically
            if step % args.eval_freq == 0 and step > 0:
                L.log('eval/episode', episode, step)
                evaluate(env, agent, video, args.num_eval_episodes, L, step)
                if args.save_model:
                    agent.save(model_dir, step)
                if args.save_buffer:
                    replay_buffer.save(buffer_dir)

            L.log('train/episode_reward', episode_reward, step)

            env.step(np.array([0, 0, 0.1, 0, 0, 0]))  # Prevent getting stuck
            obs = env.reset()
            done, episode_reward, episode_step = False, 0, 0
            episode += 1

            L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)
                temp = action
                print("Temp action: {}".format(temp))
                action = np.multiply(action, env.action_space.high)

        # run training update
        if step >= args.init_steps:
            num_updates = args.init_steps if step == args.init_steps else 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)
        print("E: {} | S: {} | R: {:.4f} | ER: {:.4f} | A: {}".format(
            episode, step, round(reward, 4), round(episode_reward, 4),
            action))

        # Reset environment if agent gets stuck (stuck means for 100 steps no
        # increase in reward)
        if step % 100 == 0 and step > 0:
            if np.abs(prev_episode_reward - episode_reward) < 1e-5:
                # If change in reward is negligible after 100 steps, restart
                env.step(np.array([0, 0, 0.1, 0, 0, 0]))
                obs = env.reset()
            prev_episode_reward = episode_reward

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
def main():
    args = parse_args()
    utils.set_seed_everywhere(args.seed)

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=args.image_size,
                       width=args.image_size,
                       frame_skip=args.action_repeat)
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    args.work_dir = os.path.join(
        args.work_dir,
        f'{args.domain_name}-{args.task_name}-seed{args.seed}-{datetime.now().strftime("%Y%m%d-%H%M")}'
    )
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Using device: ", device)

    # the dmc2gym wrapper standardizes actions
    assert env.action_space.low.min() >= -1
    assert env.action_space.high.max() <= 1

    replay_buffer = utils.ReplayBuffer(obs_shape=env.observation_space.shape,
                                       action_shape=env.action_space.shape,
                                       capacity=args.replay_buffer_capacity,
                                       batch_size=args.batch_size,
                                       device=device)

    agent = make_agent(obs_shape=env.observation_space.shape,
                       action_shape=env.action_space.shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    for step in range(args.num_train_steps):
        if done:
            if step > 0:
                L.log('train/duration', time.time() - start_time, step)
                start_time = time.time()
                L.dump(step)

            # evaluate agent periodically
            if step % args.eval_freq == 0:
                L.log('eval/episode', episode, step)
                evaluate(env, agent, video, args.num_eval_episodes, L, step)
                if args.save_model:
                    agent.save(model_dir, step)
                if args.save_buffer:
                    replay_buffer.save(buffer_dir)

            L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1

            L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # run training update
        if step >= args.init_steps:
            num_updates = args.init_steps if step == args.init_steps else 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    exp_id = str(int(np.random.random() * 100000))
    utils.set_seed_everywhere(args.seed)

    env = env_wrapper.make(
        domain_name=args.domain_name,
        task_name=args.task_name,
        seed=args.seed,
        visualize_reward=False,
        from_pixels=(args.observation_type == 'pixel'
                     or args.observation_type == 'hybrid'),
        cameras=args.cameras,
        height=args.pre_transform_image_size,
        width=args.pre_transform_image_size,
        frame_skip=args.action_repeat,
        reward_type=args.reward_type,
        change_model=args.change_model)
    env.seed(args.seed)
    if args.special_reset is not None:
        env.set_special_reset(args.special_reset)
    if args.demo_special_reset is not None:
        env.set_special_reset(args.demo_special_reset)
    if args.observation_type == 'hybrid':
        env.set_hybrid_obs()

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    if args.task_name is None:
        env_name = args.domain_name
    else:
        env_name = args.domain_name + '-' + args.task_name
    exp_name = args.reward_type + '-' + args.agent + '-' + args.encoder_type \
        + '-' + args.data_augs
    exp_name += '-' + ts + '-' + env_name + '-im' + str(args.image_size) \
        + '-b' + str(args.batch_size) + '-nu' + str(args.num_updates)
    if args.observation_type == 'hybrid':
        exp_name += '-hybrid'
    if args.change_model:
        exp_name += '-change_model'
    if args.bc_only:
        exp_name += '-bc_only'
    exp_name += '-s' + str(args.seed)
    exp_name += '-id' + exp_id
    args.work_dir = args.work_dir + '/' + exp_name
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))
    print("Working in directory:", args.work_dir)

    video = VideoRecorder(video_dir if args.save_video else None,
                          camera_id=args.cameras[0])

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        cpf = 3 * len(args.cameras)
        obs_shape = (cpf * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (cpf * args.frame_stack,
                             args.pre_transform_image_size,
                             args.pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        hybrid_state_shape=env.hybrid_state_shape,
        load_dir=args.replay_buffer_load_dir)

    if args.demo_model_dir is not None:
        # collect demonstrations using a state-trained expert
        episode_step, done = 0, True
        state_obs, obs = None, None
        episode_success = False
        original_encoder_type = args.encoder_type
        args.encoder_type = 'identity'
        if isinstance(env, utils.FrameStack):
            original_env = env.env
        else:
            original_env = env
        expert_agent = make_agent(
            obs_shape=original_env.observation_space.shape,
            action_shape=action_shape,
            args=args,
            device=device,
            hybrid_state_shape=env.hybrid_state_shape)
        args.encoder_type = original_encoder_type
        expert_agent.load(args.demo_model_dir, args.demo_model_step)
        print('Collecting expert trajectories...')
        t = 0
        while t < args.demo_samples:
            if done:
                episode_step = 0
                episode_success = False
                if args.demo_special_reset is not None:
                    env.reset(save_special_steps=True)
                    special_steps_dict = env.special_reset_save
                    obs_list = special_steps_dict['obs']
                    act_list = special_steps_dict['act']
                    reward_list = special_steps_dict['reward']
                    for i in range(len(act_list)):
                        replay_buffer.add(obs_list[i], act_list[i],
                                          reward_list[i], obs_list[i + 1],
                                          False)
                    episode_step += len(act_list)
                    t += len(act_list)
                    obs = obs_list[-1]
                    state_obs = original_env._get_state_obs()
                else:
                    obs = env.reset()
                    state_obs = original_env._get_state_obs()
            action = expert_agent.sample_action(state_obs)
            next_obs, reward, done, info = env.step(action)
            if info.get('is_success'):
                episode_success = True
            state_obs = original_env._get_state_obs()
            # allow infinite bootstrap
            done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
            replay_buffer.add(obs, action, reward, next_obs, done_bool)
            obs = next_obs
            episode_step += 1
            t += 1
            if args.success_demo_only and done and not episode_success:
                # discard the whole episode if it did not succeed
                t -= episode_step
                replay_buffer.idx -= episode_step
        env.set_special_reset(args.special_reset)
        print('Starting with replay buffer filled to {}.'.format(
            replay_buffer.idx))
        # args.init_steps = max(0, args.init_steps - args.replay_buffer_load_pi_t)  # maybe tune this

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device,
                       hybrid_state_shape=env.hybrid_state_shape)

    if args.model_dir is not None:
        agent.load(args.model_dir, args.model_step)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    def eval_and_save():
        if args.save_model:
            agent.save_curl(model_dir, step)
        if args.save_buffer:
            replay_buffer.save(buffer_dir)
        if args.save_sac:
            agent.save(model_dir, step)
        L.log('eval/episode', episode, step)
        print('evaluating')
        evaluate(env, agent, video, args.num_eval_episodes, L, step, args)

    if args.warmup_cpc:
        print("Warming up cpc for " + str(args.warmup_cpc) + ' steps.')
        for i in range(args.warmup_cpc):
            agent.update_cpc_only(replay_buffer, L, step=0,
                                  ema=args.warmup_cpc_ema)
        print('Warmed up cpc.')
    if args.warmup_offline_sac:
        for i in range(args.warmup_offline_sac):
            agent.update_sac_only(replay_buffer, L, step=0)

    if args.bc_only:
        step = 0
        for i in range(100):
            agent.train_bc(replay_buffer)
            step += 1
        eval_and_save()
        return

    time_computing = 0
    time_acting = 0

    callback_fn = None
    step = 0
    if args.synch_update:
        callback_fn = lambda: lambda: [
            agent.update(replay_buffer, L, step,
                         log_networks=nu == 0 and step % args.log_networks_freq == 0)
            for nu in range(args.num_updates)
        ] if step >= args.init_steps and not is_eval else 0
    # pointers should all work properly, and execute in the proper frame
    if callback_fn is not None:
        # envwrapper (camera), framestack, timelimit
        env.env._env.env.set_callback(callback_fn)

    # for step in range(args.num_train_steps):
    while step < args.num_train_steps:
        # evaluate agent periodically
        if step % args.eval_freq == 0:
            is_eval = True
            eval_and_save()
            is_eval = False

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            time_tmp = time.time()
            obs = env.reset()
            time_acting += time.time() - time_tmp
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        if step == args.init_steps and args.demo_samples == 0:
            if args.warmup_cpc:
                print("Warming up cpc for " + str(args.warmup_cpc) + ' steps.')
                for i in range(args.warmup_cpc):
                    agent.update_cpc_only(replay_buffer, L, step=0)
                print('Warmed up cpc.')

        # run training update
        time_tmp = time.time()
        if step >= args.init_steps and not args.synch_update:
            for nu in range(args.num_updates):
                agent.update(replay_buffer, L, step,
                             log_networks=nu == 0 and step % args.log_networks_freq == 0)
        time_computing += time.time() - time_tmp

        time_tmp = time.time()
        next_obs, reward, done, _ = env.step(action)
        time_acting += time.time() - time_tmp

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
        step += 1

    step = args.num_train_steps
    print("time spent computing:", time_computing)
    print("time spent acting:", time_acting)
    eval_and_save()
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    gibson_config_filename = os.path.join(
        os.path.dirname(gibson2.__file__),
        '../examples/configs/hand_drawer.yaml')
    env = HandDrawerEnv(config_file=gibson_config_filename, mode='headless')
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = env_name + '-' + ts + '-im' + str(args.image_size) + '-b' \
        + str(args.batch_size) + '-s' + str(args.seed) + '-' + args.encoder_type
    args.work_dir = args.work_dir + '/' + exp_name
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (2 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (2 * args.frame_stack,
                             args.pre_transform_image_size,
                             args.pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    step = 0  # single evaluation of the freshly-initialized agent
    evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    goal_env = SRLEnv(args.action_repeat, args.environment, args.srl_model,
                      args.pre_transform_image_size,
                      args.pre_transform_image_size, args.renders,
                      args.is_discrete, args.force_down)
    goal_env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        goal_env = utils.FrameStack(goal_env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d-%H:%M:%S", ts)
    env_name = args.environment
    exp_name = env_name + '-' + args.agent + '-' + ts + '-im' \
        + str(args.image_size) + '-b' + str(args.batch_size) + '-s' \
        + str(args.seed) + '-' + args.encoder_type
    args.work_dir = args.work_dir + '/' + exp_name
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))
    pre_buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'pre_buffer'))
    csv_dir = utils.make_dir(os.path.join(args.work_dir, 'csv'))
    image_dir = utils.make_dir(os.path.join(args.work_dir, 'image'))

    log_csv = {
        "step": [],
        "mean_reward": [],
        "mean_distance_to_goal": [],
        "std_distance_to_goal": []
    }

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = goal_env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        if args.agent == 'sac_ae':
            pre_aug_obs_shape = obs_shape
        else:
            pre_aug_obs_shape = (3 * args.frame_stack,
                                 args.pre_transform_image_size,
                                 args.pre_transform_image_size)
    else:
        obs_shape = goal_env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    if args.reward_type == 'dist':
        success_samples = goal_env.get_goal_image()
    goal_env.close()

    def sample_goal():
        success_sample = random.choice(success_samples)
        if args.encoder_type == 'pixel':
            frames = []
            for _ in range(args.frame_stack):
                frames.append(success_sample)
            return np.concatenate(frames, axis=0)
        else:
            return success_sample

    env = SRLEnv(args.action_repeat, args.environment, args.srl_model,
                 args.pre_transform_image_size, args.pre_transform_image_size)
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    pre_replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    # pre-training phase: collect data with random actions
    for step in range(args.pre_training_steps):
        # save model/buffer periodically
        if step % args.eval_freq == 0:
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                pre_replay_buffer.save(pre_buffer_dir)

        if done:
            obs = env.reset()
            done = False
            episode_step = 0
            episode += 1

        # sample action for data collection
        action = env.action_space.sample()
        if args.environment == 'kuka':
            action[2] = -abs(action[2])

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(pre_replay_buffer, L, step, enc_train=True)

        next_obs, reward, done, distance = env.step(action)
        goal_obs = sample_goal()
        if args.reward_type == 'dist':
            reward = agent.dist_reward(next_obs, goal_obs)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        pre_replay_buffer.add(obs, action, reward, next_obs, goal_obs,
                              done_bool)

        obs = next_obs
        episode_step += 1

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    for step in range(args.num_train_steps):
        # evaluate agent periodically
        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            if args.reward_type == 'dist':
                evaluate(env, agent, replay_buffer, video,
                         args.num_eval_episodes, L, csv_dir, log_csv,
                         image_dir, step, args, sample_goal())
            else:
                evaluate(env, agent, replay_buffer, video,
                         args.num_eval_episodes, L, csv_dir, log_csv,
                         image_dir, step, args, None)
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            goal_obs = sample_goal()
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs, goal_obs)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step, enc_train=False)

        next_obs, reward, done, distance = env.step(action)
        if args.reward_type == 'dist':
            reward = agent.dist_reward(next_obs, goal_obs)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, goal_obs, done_bool)

        obs = next_obs
        episode_step += 1
def main():
    args = parse_args()
    utils.set_seed_everywhere(args.seed)

    if args.domain_name == 'carla':
        env = CarlaEnv(
            render_display=args.render,  # for local debugging only
            display_text=args.render,  # for local debugging only
            changing_weather_speed=0.1,  # [0, +inf)
            rl_image_size=args.image_size,
            max_episode_steps=1000,
            frame_skip=args.action_repeat,
            is_other_cars=True,
            port=args.port)
        # TODO: implement env.seed(args.seed) ?
        eval_env = env
    else:
        env = dmc2gym.make(domain_name=args.domain_name,
                           task_name=args.task_name,
                           resource_files=args.resource_files,
                           img_source=args.img_source,
                           total_frames=args.total_frames,
                           seed=args.seed,
                           visualize_reward=False,
                           from_pixels=(args.encoder_type == 'pixel'),
                           height=args.image_size,
                           width=args.image_size,
                           frame_skip=args.action_repeat)
        env.seed(args.seed)
        eval_env = dmc2gym.make(domain_name=args.domain_name,
                                task_name=args.task_name,
                                resource_files=args.eval_resource_files,
                                img_source=args.img_source,
                                total_frames=args.total_frames,
                                seed=args.seed,
                                visualize_reward=False,
                                from_pixels=(args.encoder_type == 'pixel'),
                                height=args.image_size,
                                width=args.image_size,
                                frame_skip=args.action_repeat)

    # stack several consecutive frames together
    if args.encoder_type.startswith('pixel'):
        env = utils.FrameStack(env, k=args.frame_stack)
        eval_env = utils.FrameStack(eval_env, k=args.frame_stack)

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # the dmc2gym wrapper standardizes actions
    assert env.action_space.low.min() >= -1
    assert env.action_space.high.max() <= 1

    replay_buffer = utils.ReplayBuffer(obs_shape=env.observation_space.shape,
                                       action_shape=env.action_space.shape,
                                       capacity=args.replay_buffer_capacity,
                                       batch_size=args.batch_size,
                                       device=device)

    agent = make_agent(obs_shape=env.observation_space.shape,
                       action_shape=env.action_space.shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    for step in range(args.num_train_steps):
        if done:
            if args.decoder_type == 'inverse':
                # fill k_obs with 0s if episode is done
                for i in range(1, args.k):
                    replay_buffer.k_obses[replay_buffer.idx - i] = 0
            if step > 0:
                L.log('train/duration', time.time() - start_time, step)
                start_time = time.time()
                L.dump(step)

            # evaluate agent periodically
            if episode % args.eval_freq == 0:
                L.log('eval/episode', episode, step)
                evaluate(eval_env, agent, video, args.num_eval_episodes, L,
                         step)
                if args.save_model:
                    agent.save(model_dir, step)
                if args.save_buffer:
                    replay_buffer.save(buffer_dir)

            L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            reward = 0

            L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # run training update
        if step >= args.init_steps:
            num_updates = args.init_steps if step == args.init_steps else 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        curr_reward = reward
        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward

        replay_buffer.add(obs, action, curr_reward, reward, next_obs,
                          done_bool)
        np.copyto(replay_buffer.k_obses[replay_buffer.idx - args.k], next_obs)

        obs = next_obs
        episode_step += 1
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    pre_transform_image_size = args.pre_transform_image_size if 'crop' in args.data_augs else args.image_size
    pre_image_size = args.pre_transform_image_size  # record the pre transform image size for translation

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=pre_transform_image_size,
                       width=pre_transform_image_size,
                       frame_skip=args.action_repeat)
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    # exp_name = env_name + '-' + ts + '-im' + str(args.image_size) + '-b' \
    #     + str(args.batch_size) + '-s' + str(args.seed) + '-' + args.encoder_type
    # args.work_dir = args.work_dir + '/' + exp_name

    # modded for checking augmentation and corruption
    if args.augmix:
        aug_name = "augs-augmix"
    else:
        aug_name = "augs-" + args.data_augs
    exp_name = env_name + '-' + aug_name + '-s' + str(args.seed)
    args.work_dir = args.work_dir + '/' + exp_name

    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))
    eval_dir = utils.make_dir(os.path.join(args.work_dir, 'eval'))

    video = VideoRecorder(video_dir if args.save_video else None)

    print("Args:")
    print(args)
    print("work dir:", args.work_dir)
    os.makedirs(args.work_dir, exist_ok=True)
    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (3 * args.frame_stack, pre_transform_image_size,
                             pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        pre_image_size=pre_image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    results = []
    best_mean_reward = 0
    for step in range(args.num_train_steps):
        # evaluate agent periodically
        if step % args.eval_freq == 0:
            agent.load(model_dir, step, map_to_cpu=True)
            L.log('eval/episode', episode, step)
            # evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
            res = evaluate_corruptions(env, agent, video,
                                       args.num_eval_episodes, L, step, args)
            if best_mean_reward < res['mean_ep_reward']:
                best_mean_reward = res['mean_ep_reward']
            res['best_overall'] = best_mean_reward
            results.append(res)
            print(f"Best Mean overall:{best_mean_reward:.4f}")

    import pickle
    results_fname = os.path.join(eval_dir, f"{args.cor_func}{args.cor_sev}.pkl")
    pickle.dump(results, open(results_fname, "wb"))
def main():
    args = parse_args()

    dm_envs = {
        'finger': ['finger', 'spin'],
        'cartpole': ['cartpole', 'swingup'],
        'reacher': ['reacher', 'easy'],
        'cheetah': ['cheetah', 'run'],
        'walker': ['walker', 'walk'],
        'ball': ['ball_in_cup', 'catch'],
        'humanoid': ['humanoid', 'stand'],
        'bring_ball': ['manipulator', 'bring_ball'],
        'bring_peg': ['manipulator', 'bring_peg'],
        'insert_ball': ['manipulator', 'insert_ball'],
        'insert_peg': ['manipulator', 'insert_peg'],
    }

    if args.env == 'cartpole':
        args.action_repeat = 8
    elif args.env in ['finger', 'walker']:
        args.action_repeat = 2
    else:
        args.action_repeat = 4
    args.domain_name, args.task_name = dm_envs[args.env]

    global logger
    logger = wandb.init(
        project='d2rl',
        config=args,
        dir='wandb_logs',
        group='{}_{}'.format(args.domain_name, args.task_name),
    )

    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=args.seed,
                       visualize_reward=False,
                       from_pixels=(args.encoder_type == 'pixel'),
                       height=args.pre_transform_image_size,
                       width=args.pre_transform_image_size,
                       frame_skip=args.action_repeat)
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = env_name + '-' + ts + '-im' + str(args.image_size) + '-b' \
        + str(args.batch_size) + '-s' + str(args.seed) + '-' + args.encoder_type
    # exp_name doubles as the checkpoint root here, since this variant does
    # not create a work_dir; model_dir/buffer_dir are needed by the save
    # flags below
    model_dir = utils.make_dir(os.path.join(exp_name, 'model'))
    buffer_dir = utils.make_dir(os.path.join(exp_name, 'buffer'))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (3 * args.frame_stack,
                             args.pre_transform_image_size,
                             args.pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    for step in range(args.num_train_steps):
        # evaluate agent periodically
        if step % args.eval_freq == 0:
            evaluate(env, agent, args.num_eval_episodes, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    pre_transform_image_size = args.pre_transform_image_size if 'crop' in args.data_augs else args.image_size

    env = dmc2gym.make(
        domain_name=args.domain_name,
        task_name=args.task_name,
        seed=args.seed,
        visualize_reward=False,
        from_pixels=(args.encoder_type == 'pixel'),
        height=pre_transform_image_size,
        width=pre_transform_image_size,
        frame_skip=args.action_repeat
    )
    env.seed(args.seed)

    project_name = args.domain_name + args.task_name
    group_name = "" + str(args.replay_buffer_capacity // 1000) + "k" \
        + str(args.steps_until_freeze // 1000) + "k" + str(args.num_copies)

    # stack several consecutive frames together
    if args.encoder_type == 'pixel':
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    # ts = time.gmtime()
    # ts = time.strftime("%m-%d", ts)
    # env_name = args.domain_name + '-' + args.task_name
    # exp_name = env_name + '-' + ts + '-im' + str(args.image_size) + '-b' \
    #     + str(args.batch_size) + '-s' + str(args.seed) + '-' + args.encoder_type
    original_work_dir = args.work_dir
    exp_name = project_name + "-" + group_name + "-s" + str(args.seed)
    args.work_dir = args.work_dir + '/' + exp_name
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.action_space.shape

    if args.encoder_type == 'pixel':
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (3 * args.frame_stack, pre_transform_image_size,
                             pre_transform_image_size)
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    p = 3 * pre_aug_obs_shape[1] * pre_aug_obs_shape[2]
    l = args.encoder_feature_dim
    c_prime = min(args.num_train_steps,
                  int(np.floor(args.replay_buffer_capacity * p / l / 4 / 2 / args.num_copies)))
    print('If frozen replay capacity will increase to ', c_prime)

    latent_buffer_critic = utils.ReplayBuffer(
        obs_shape=(args.encoder_feature_dim, 1),
        action_shape=action_shape,
        capacity=c_prime,
        batch_size=args.batch_size,
        device=device,
        is_latent=True,
        num_copies=args.num_copies
    )

    latent_buffer_actor = utils.ReplayBuffer(
        obs_shape=(args.encoder_feature_dim, 1),
        action_shape=action_shape,
        capacity=c_prime,
        batch_size=args.batch_size,
        device=device,
        is_latent=True,
        num_copies=args.num_copies
    )

    agent = make_agent(
        obs_shape=obs_shape,
        action_shape=action_shape,
        args=args,
        device=device
    )

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    def get_cropped_obs_batch(obs, next_obs):
        obs = obs.astype(np.uint8)
        next_obs = next_obs.astype(np.uint8)
        cpu_obs_tmp = utils.random_crop(obs, args.image_size)
        obs_tmp = torch.as_tensor(cpu_obs_tmp, device=device).float()
        cpu_next_obs_tmp = utils.random_crop(next_obs, args.image_size)
        next_obs_tmp = torch.as_tensor(cpu_next_obs_tmp, device=device).float()
        return obs_tmp / 255, next_obs_tmp / 255

    def get_latent_obs(network, obses, next_obses):
        network.encoder(obses)
        conv4_obses = network.encoder.outputs['conv4']
        latent_obses = network.encoder.outputs['fc']
        network.encoder(next_obses)
        conv4_next_obses = network.encoder.outputs['conv4']
        latent_next_obses = network.encoder.outputs['fc']
        return latent_obses, latent_next_obses, conv4_obses, conv4_next_obses

    def move_ac_rew_nd(replay_buffer, buffers, num_transitions):
        for buffer in buffers:
            buffer.actions[:num_transitions] = replay_buffer.actions[:num_transitions]
            buffer.rewards[:num_transitions] = replay_buffer.rewards[:num_transitions]
            buffer.not_dones[:num_transitions] = replay_buffer.not_dones[:num_transitions]

    def move_imgs_to_latent(replay_buffer, buffers, networks, tmp_batch_size,
                            num_transitions):
        k = 0
        # move in batches to avoid cuda out of memory
        while k * tmp_batch_size < num_transitions:
            start = k * tmp_batch_size
            end = min((k + 1) * tmp_batch_size, num_transitions)
            # repeat num_copies times along batch dimension to get different crops
            raw_obses_repeated = np.repeat(replay_buffer.obses[start:end],
                                           args.num_copies, axis=0)
            raw_next_obses_repeated = np.repeat(
                replay_buffer.next_obses[start:end], args.num_copies, axis=0)
            tmp_obses, tmp_next_obses = get_cropped_obs_batch(
                raw_obses_repeated, raw_next_obses_repeated)
            conv4_obses, conv4_next_obses = None, None
            for i in range(len(buffers)):
                network, buffer = networks[i], buffers[i]
                # for the actor network we only need to run the fc layer, so
                # use previous conv4_obses from critic network (the networks
                # are tied at their convolutional layers)
                if conv4_obses is not None:
                    latent_obses, latent_next_obses, _, _ = get_latent_obs(
                        network, conv4_obses, conv4_next_obses)
                else:
                    latent_obses, latent_next_obses, conv4_obses, conv4_next_obses = get_latent_obs(
                        network, tmp_obses, tmp_next_obses)
                latent_obses = latent_obses.detach().cpu().numpy()
                latent_next_obses = latent_next_obses.detach().cpu().numpy()
                # store args.num_copies random crops for each observation in
                # the current batch
                buffer.obses[start:end] = latent_obses.reshape(
                    (end - start, args.num_copies, args.encoder_feature_dim, 1))
                buffer.next_obses[start:end] = latent_next_obses.reshape(
                    (end - start, args.num_copies, args.encoder_feature_dim, 1))
                # set buffer.idx and buffer.full appropriately (handles case
                # where buffer.capacity > replay_buffer.capacity)
                buffer.idx = max(replay_buffer.idx, num_transitions)
                buffer.full = num_transitions >= buffer.capacity
            k += 1

    for step in range(args.num_train_steps):
        # evaluate agent periodically
        if step % args.eval_freq == 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
                agent.save(model_dir, step)
            if args.save_buffer:
                buffer_dir = utils.make_dir(
                    os.path.join(args.work_dir, 'buffer' + str(step)))
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs / 255.)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                if step < args.steps_until_freeze:
                    agent.update(replay_buffer, L, step, detach_fc=False)
                elif step == args.steps_until_freeze:
                    print("detaching fc layer")
                    agent.critic.encoder.detach_fc = True
                    agent.critic_target.encoder.detach_fc = True
                    agent.actor.encoder.detach_fc = True
                    num_transitions = min(step, replay_buffer.capacity)
                    # set critic_target params to critic params
                    utils.soft_update_params(agent.critic, agent.critic_target, 1)
                    with torch.no_grad():
                        networks = [agent.critic, agent.actor]
                        buffers = [latent_buffer_critic, latent_buffer_actor]
                        # move actions, rewards, and not_dones to latent buffers
                        move_ac_rew_nd(replay_buffer, buffers, num_transitions)
                        # move obs and next_obs to latent buffers
                        move_imgs_to_latent(replay_buffer, buffers, networks,
                                            100, num_transitions)
                    agent.update_with_latent(latent_buffer_critic,
                                             latent_buffer_actor, L, step)
                else:
                    agent.update_with_latent(latent_buffer_critic,
                                             latent_buffer_actor, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward

        if step <= args.steps_until_freeze:
            replay_buffer.add(obs, action, reward, next_obs, done_bool)
        else:
            # add to latent buffers
            # similar to the "elif step == args.steps_until_freeze" procedure
            raw_obs_repeated = np.repeat(np.expand_dims(obs, axis=0),
                                         args.num_copies, axis=0)
            raw_next_obs_repeated = np.repeat(np.expand_dims(next_obs, axis=0),
                                              args.num_copies, axis=0)
            obs_tmp, next_obs_tmp = get_cropped_obs_batch(raw_obs_repeated,
                                                          raw_next_obs_repeated)
            networks = [agent.critic, agent.actor]
            buffers = [latent_buffer_critic, latent_buffer_actor]
            conv4_obs, conv4_next_obs = None, None
            for i in range(len(buffers)):
                network, buffer = networks[i], buffers[i]
                if conv4_obs is not None:
                    latent_obs, latent_next_obs, _, _ = get_latent_obs(
                        network, conv4_obs, conv4_next_obs)
                else:
                    latent_obs, latent_next_obs, conv4_obs, conv4_next_obs = get_latent_obs(
                        network, obs_tmp, next_obs_tmp)
                latent_obs = latent_obs.unsqueeze(-1).detach().cpu().numpy()
                latent_next_obs = latent_next_obs.unsqueeze(-1).detach().cpu().numpy()
                buffer.add(latent_obs, action, reward, latent_next_obs,
                           done_bool)

        obs = next_obs
        episode_step += 1
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    assert (
        args.init_steps == args.batch_size
        and args.num_train_steps * args.action_repeat in [100000, 500000]
    )
    utils.set_seed_everywhere(args.seed)

    env = dmc2gym.make(
        domain_name=args.domain_name,
        task_name=args.task_name,
        seed=args.seed,
        visualize_reward=False,
        from_pixels=(args.encoder_type == "pixel"),
        height=args.pre_transform_image_size,
        width=args.pre_transform_image_size,
        frame_skip=args.action_repeat,
    )
    env.seed(args.seed)

    # stack several consecutive frames together
    if args.encoder_type == "pixel":
        env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + "-" + args.task_name
    exp_name = (
        env_name
        + "-"
        + ts
        + "-im"
        + str(args.image_size)
        + "-b"
        + str(args.batch_size)
        + "-nes"
        + str(args.num_train_steps * args.action_repeat)
        + "-s"
        + str(args.seed)
        + "-"
        + args.encoder_type
        + "-"
        + args.agent
    )
    args.work_dir = args.work_dir + "/" + exp_name
    utils.make_dir(args.work_dir)
    video_dir = utils.make_dir(os.path.join(args.work_dir, "video"))
    model_dir = utils.make_dir(os.path.join(args.work_dir, "model"))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, "buffer"))

    video = VideoRecorder(video_dir if args.save_video else None)

    with open(os.path.join(args.work_dir, "args.json"), "w") as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    action_shape = env.action_space.shape

    if args.encoder_type == "pixel":
        obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
        pre_aug_obs_shape = (
            3 * args.frame_stack,
            args.pre_transform_image_size,
            args.pre_transform_image_size,
        )
    else:
        obs_shape = env.observation_space.shape
        pre_aug_obs_shape = obs_shape

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        # capacity=args.replay_buffer_capacity,
        capacity=args.batch_size,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
        center_crop_anchor=args.center_crop_anchor,
    )

    agent = make_agent(
        obs_shape=obs_shape, action_shape=action_shape, args=args, device=device
    )

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    reward_window = deque([], maxlen=5)
    best_reward = 0.0
    best_step = 0
    for step in range(args.num_train_steps):
        # evaluate agent periodically
        # if step % args.eval_freq == 0:
        #     L.log("eval/episode", episode, step)
        #     with utils.eval_mode(agent):
        #         evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
        #     if args.save_model:
        #         agent.save_curl(model_dir, step)

        if done:
            if step > 0:
                # if step % args.log_interval == 0:
                L.log("train/duration", time.time() - start_time, step)
                L.log("train/episode_reward", episode_reward, step)
                L.dump(step)
                start_time = time.time()
                reward_window.append(episode_reward)
                # checkpoint whenever the running mean reward hits a new best
                if len(reward_window) == reward_window.maxlen:
                    mean_reward = np.mean(reward_window)
                    if mean_reward > best_reward:
                        best_reward = mean_reward
                        best_step = step
                        agent.save(model_dir, best_step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            # if step % args.log_interval == 0:
            L.log("train/episode", episode, step)

        # sample action for data collection
        if step < args.init_steps:
            action = env.action_space.sample()
        else:
            with utils.eval_mode(agent):
                action = agent.sample_action(obs)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)

        next_obs, reward, done, _ = env.step(action)

        # allow infinite bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1

    # after training, reload the best checkpoint and evaluate it
    L.log("eval/episode", episode, step)
    agent.load(model_dir, best_step)
    with utils.eval_mode(agent):
        evaluate(env, agent, video, args.num_eval_episodes, L, step, args)
def main():
    args = parse_args()
    if args.seed == -1:
        args.__dict__["seed"] = np.random.randint(1, 1000000)
    utils.set_seed_everywhere(args.seed)

    env = Env()
    # stack several consecutive frames together
    env = utils.FrameStack(env, k=args.frame_stack)

    # make directory
    ts = time.gmtime()
    ts = time.strftime("%m-%d", ts)
    env_name = args.domain_name + '-' + args.task_name
    exp_name = env_name + '-' + ts + '-im' + str(args.image_size) + '-b' \
        + str(args.batch_size) + '-s' + str(args.seed) + '-' + args.encoder_type
    args.work_dir = args.work_dir + '/' + exp_name
    utils.make_dir(args.work_dir)
    model_dir = utils.make_dir(os.path.join(args.work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(args.work_dir, 'buffer'))

    with open(os.path.join(args.work_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    torch.cuda.current_device()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    action_shape = env.ACTION_SPACE_SIZE

    obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
    pre_aug_obs_shape = (3 * args.frame_stack,
                         args.pre_transform_image_size,
                         args.pre_transform_image_size)

    replay_buffer = utils.ReplayBuffer(
        obs_shape=pre_aug_obs_shape,
        action_shape=action_shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
        image_size=args.image_size,
    )

    agent = make_agent(obs_shape=obs_shape,
                       action_shape=action_shape,
                       args=args,
                       device=device)
    # agent.load("D:/curl-master/fps-train-09-29-im84-b64-s829604-pixel/model", 2000)

    L = Logger(args.work_dir, use_tb=args.save_tb)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()

    for step in range(args.num_train_steps):
        pre_time = time.time()

        # evaluate agent periodically
        if step % args.eval_freq == 0 and step != 0:
            L.log('eval/episode', episode, step)
            evaluate(env, agent, args.num_eval_episodes, L, step, args)
            if args.save_model:
                agent.save_curl(model_dir, step)
                agent.save(model_dir, step)
            if args.save_buffer:
                replay_buffer.save(buffer_dir)

        if done:
            if step > 0:
                if step % args.log_interval == 0:
                    L.log('train/duration', time.time() - start_time, step)
                    L.dump(step)
                start_time = time.time()
            if step % args.log_interval == 0:
                L.log('train/episode_reward', episode_reward, step)

            obs = env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1
            if step % args.log_interval == 0:
                L.log('train/episode', episode, step)

        # sample action for data collection
        # NOTE: the action is sampled uniformly at random in both branches;
        # the agent's policy is never queried here
        if step < args.init_steps:
            print("random")
            action = random.randint(0, 11)
        else:
            with utils.eval_mode(agent):
                print("random 2")
                action = random.randint(0, 11)

        # run training update
        if step >= args.init_steps:
            num_updates = 1
            for _ in range(num_updates):
                agent.update(replay_buffer, L, step)
            print("Agent selected 2")

        next_obs, reward, done = env.step(action)

        # allow infinite bootstrap (this step limit is effectively never hit)
        done_bool = 0 if episode_step + 1 == 500000000 else float(done)
        episode_reward += reward
        replay_buffer.add(obs, action, reward, next_obs, done_bool)

        obs = next_obs
        episode_step += 1

        time_used = time.time() - pre_time
        print(time_used)
        print("FPS:")
        fps = 1 / time_used
        print(fps)