def run_environment(env_name, episode_length, num_episodes):
    env = EnvWithGoal(create_maze_env.create_maze_env(env_name), env_name)

    def action_fn(obs):
        action_space = env.action_space
        action_space_mean = (action_space.low + action_space.high) / 2.0
        action_space_magn = (action_space.high - action_space.low) / 2.0
        random_action = (action_space_mean +
                         action_space_magn *
                         np.random.uniform(low=-1.0, high=1.0, size=action_space.shape))
        return random_action

    rewards = []
    successes = []
    for ep in range(num_episodes):
        rewards.append(0.0)
        successes.append(False)
        obs = env.reset()
        for _ in range(episode_length):
            env.render()
            print(env.get_image().shape)
            obs, reward, done, _ = env.step(action_fn(obs))
            rewards[-1] += reward
            successes[-1] = success_fn(reward)
            if done:
                break
        print('Episode {} reward: {}, Success: {}'.format(ep + 1, rewards[-1], successes[-1]))

    print('Average Reward over {} episodes: {}'.format(num_episodes, np.mean(rewards)))
    print('Average Success over {} episodes: {}'.format(num_episodes, np.mean(successes)))
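
# Minimal usage sketch for run_environment above. 'AntMaze' and the episode settings are
# illustrative assumptions, not values prescribed by this file; success_fn must already be
# defined elsewhere in the module, as the loop above assumes.
def _example_random_rollouts():
    run_environment(env_name='AntMaze', episode_length=500, num_episodes=5)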
def test_subgoal_transition(self):
    env = EnvWithGoal(create_maze_env(ENV_NAME), ENV_NAME)
    subgoal = Subgoal()
    subgoal_dim = subgoal.action_dim
    state_dim, goal_dim, action_dim, scale_low = spawn_dims(env)
    scale_high = subgoal.action_space.high * np.ones(subgoal_dim)

    agent = HiroAgent(
        state_dim=state_dim,
        action_dim=action_dim,
        goal_dim=goal_dim,
        subgoal_dim=subgoal_dim,
        scale_low=scale_low,
        scale_high=scale_high)

    goal = np.array([5, 5] + [0] * 12)
    state = np.array([0] * 28)
    next_state = np.array([1, 2] + [0] * 26)

    subgoal = agent.subgoal_transition(state, goal, next_state)

    # The absolute target implied by the goal should be preserved across the
    # transition: s[:d] + g == s'[:d] + g', i.e. g' = s[:d] + g - s'[:d]
    d = goal.shape[0]
    np.testing.assert_array_equal(state[:d] + goal, next_state[:d] + subgoal)
def test_low_reward_negative(self):
    env = EnvWithGoal(create_maze_env(ENV_NAME), ENV_NAME)
    subgoal = Subgoal()
    subgoal_dim = subgoal.action_dim
    state_dim, goal_dim, action_dim, scale_low = spawn_dims(env)
    scale_high = subgoal.action_space.high * np.ones(subgoal_dim)

    agent = HiroAgent(
        state_dim=state_dim,
        action_dim=action_dim,
        goal_dim=goal_dim,
        subgoal_dim=subgoal_dim,
        scale_low=scale_low,
        scale_high=scale_high)

    goal = np.array([5, 5] + [0] * 12)

    # Moving toward the goal...
    state = np.array([0] * 28)
    next_state = np.array([1, 2] + [0] * 26)
    reward1 = agent.low_reward(state, goal, next_state)

    # ...should yield a higher intrinsic reward than moving away from it.
    state = np.array([0] * 28)
    next_state = np.array([-1, 2] + [0] * 26)
    reward2 = agent.low_reward(state, goal, next_state)

    self.assertTrue(reward1 > reward2)
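
# For reference, a minimal sketch (not the repo's API) of the two quantities the tests above
# exercise, assuming HiroAgent follows the standard HIRO definitions restricted to the goal
# dimensions, as the comment in run_hiro below also states (g' = s[:d] + g - s'[:d]).
def _sketch_subgoal_transition(state, goal, next_state):
    d = goal.shape[0]
    return state[:d] + goal - next_state[:d]  # keeps the absolute target s[:d] + g fixed


def _sketch_low_reward(state, goal, next_state):
    d = goal.shape[0]
    # Negative distance to the relabelled target: larger when s' moves toward it
    return -np.linalg.norm(state[:d] + goal - next_state[:d])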
def run_hiro(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    if not os.path.exists(os.path.join(args.log_dir, args.log_file)):
        can_load = False
        os.makedirs(os.path.join(args.log_dir, args.log_file))
    else:
        can_load = True
        print("Existing directory found; may be able to load weights.")
    output_dir = os.path.join(args.log_dir, args.log_file)
    print("Logging in {}".format(output_dir))

    if args.env_name in ["MountainCarContinuous-v0", "LunarLanderContinuous-v2", "Pendulum-v0"]:
        env = EnvWithGoal(
            gym.make(args.env_name),
            args.env_name,
            use_real_reward=True,
            should_scale_obs=args.should_reach_subgoal
        )
        # env.env.reward_type = args.reward_type

        if args.env_name == "MountainCarContinuous-v0":
            env.distance_threshold = -1  # We want a positive reward (e.g. a negative distance)
            min_obs, max_obs = env.base_env.observation_space.low, env.base_env.observation_space.high
            man_scale = (max_obs - min_obs) / 2
        elif args.env_name == "LunarLanderContinuous-v2":
            env.distance_threshold = -60  # We want at least a reward of 60 (e.g. a distance of -60)
            # Can't use the observation_space bounds directly, because those go from -inf to +inf,
            # so I just arbitrarily picked a value (no idea if this is good or not)
            man_scale = np.ones(2) * 5  # env.base_env.observation_space.low.shape[0]
        else:
            env.distance_threshold = -150  # We want a reward of 150 (TODO: arbitrary value, fix it)
            min_obs, max_obs = env.base_env.observation_space.low, env.base_env.observation_space.high
            man_scale = (max_obs - min_obs) / 2

        if args.should_reach_subgoal:
            man_scale = np.ones(man_scale.shape)

        controller_goal_dim = man_scale.shape[0]
        no_xy = False  # Can't just take out first dimensions; movement here is different than for ants.
        controller_with_tanh = True
    elif "-v" in args.env_name:
        env = gym.make(args.env_name)
        env.env.reward_type = args.reward_type
        env.distance_threshold = env.env.distance_threshold

        max_action = np.array([1.54302745e+00, 1.21865324e+00, 9.98163424e-01,
                               1.97805133e-04, 7.15193042e-05, 2.56647627e-02,
                               2.30302501e-02, 2.13756120e-02, 1.19019512e-02,
                               6.31742249e-03])
        min_action = np.array([7.95019864e-01, -5.56192570e-02, 3.32176206e-01,
                               0.00000000e+00, 0.00000000e+00, -2.58566763e-02,
                               -2.46581777e-02, -1.77669761e-02, -1.13476014e-02,
                               -5.08970149e-04])

        man_scale = max_action - min_action
        controller_goal_dim = man_scale.shape[0]
        no_xy = False  # Can't just take out first dimensions; movement here is different than for ants.
        controller_with_tanh = True
    else:
        # We'll be running on one of the various Ant envs
        env = EnvWithGoal(create_maze_env(args.env_name), args.env_name)

        # TODO: Where do these magic numbers come from?
        low = np.array((-10, -10, -0.5, -1, -1, -1, -1,
                        -0.5, -0.3, -0.5, -0.3, -0.5, -0.3, -0.5, -0.3))
        high = -low
        man_scale = (high - low) / 2
        controller_goal_dim = man_scale.shape[0]
        # scale = np.array([10, 10, 0.5, 1, 1, 1] + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3)
        no_xy = True
        controller_with_tanh = True

    obs = env.reset()

    goal = obs['desired_goal']
    state = obs['observation']

    # # Write Hyperparameters to file
    # print("---------------------------------------")
    # print("Current Arguments:")
    # with open(os.path.join(args.log_dir, args.log_file, "hps.txt"), 'w') as f:
    #     for arg in vars(args):
    #         print("{}: {}".format(arg, getattr(args, arg)))
    #         f.write("{}: {}\n".format(arg, getattr(args, arg)))
    # print("---------------------------------------\n")

    writer = SummaryWriter(logdir=os.path.join(args.log_dir, args.log_file))
    # torch.cuda.set_device(0)

    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    file_name = 'hiro_{}_{}'.format(args.env_name, current_time)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = state.shape[0]
    goal_dim = goal.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # The goal dim is smaller than the state dim. This is very strange and doesn't seem compatible
    # with the paper and the formula g' = s + g - s' (since the states have different dimensions
    # than the goals). It works because the goal is a subpart of the state, so the update rule
    # they actually use is: g' = s[:goal_dim] + g - s'[:goal_dim]

    # Initialize policy, replay buffers
    controller_policy = hiro.Controller(
        state_dim=state_dim,
        goal_dim=controller_goal_dim,
        action_dim=action_dim,
        max_action=max_action,
        actor_lr=args.ctrl_act_lr,
        critic_lr=args.ctrl_crit_lr,
        ctrl_rew_type=args.ctrl_rew_type,
        no_xy=no_xy,
        use_tanh=controller_with_tanh
    )

    manager_policy = hiro.Manager(
        state_dim=state_dim,
        goal_dim=goal_dim,
        action_dim=controller_goal_dim,
        actor_lr=args.man_act_lr,
        critic_lr=args.man_crit_lr,
        candidate_goals=args.candidate_goals,
        correction=not args.no_correction,
        scale=man_scale,
        should_reach_subgoal=args.should_reach_subgoal,
        subgoal_dist_cost_cf=args.man_subgoal_dist_cf
    )

    calculate_controller_reward = get_reward_function(controller_goal_dim)

    if args.noise_type == "ou":
        man_noise = utils.OUNoise(state_dim, sigma=args.man_noise_sigma)
        ctrl_noise = utils.OUNoise(action_dim, sigma=args.ctrl_noise_sigma)
    elif args.noise_type == "normal":
        man_noise = utils.NormalNoise(sigma=args.man_noise_sigma)
        ctrl_noise = utils.NormalNoise(sigma=args.ctrl_noise_sigma)

    manager_buffer = utils.ReplayBuffer(maxsize=args.man_buffer_size)
    controller_buffer = utils.ReplayBuffer(maxsize=args.ctrl_buffer_size)

    if can_load and args.load:
        try:
            manager_policy.load(output_dir)
            controller_policy.load(output_dir)
            manager_buffer.load(os.path.join(output_dir, "mbuf.npz"))
            controller_buffer.load(os.path.join(output_dir, "cbuf.npz"))
            with open(os.path.join(output_dir, "iter.pkl"), "rb") as f:
                iter = pkl.load(f) + 1
            print("Loaded successfully")
            just_loaded = True
        except Exception as e:
            iter = 0
            just_loaded = False
            print(e, "Not loading")
    else:
        iter = 0
        just_loaded = False

    # Logging Parameters
    total_timesteps = iter
    timesteps_since_eval = 0
    timesteps_since_manager = 0
    episode_timesteps = 0
    timesteps_since_subgoal = 0
    episode_num = 0
    done = True
    evaluations = []

    ACTION_AND_SUBGOAL_LOGGING_FREQUENCY = 1  # Units: episodes

    while total_timesteps < args.max_timesteps:
        # Periodically save everything (controller, manager, buffers and total time steps)
        if args.save_every > 0 and (total_timesteps + 1) % args.save_every == 0:
            print("Saving")
            controller_policy.save(output_dir)
            manager_policy.save(output_dir)
            manager_buffer.save(os.path.join(output_dir, "mbuf.npz"))
            controller_buffer.save(os.path.join(output_dir, "cbuf.npz"))
            with open(os.path.join(output_dir, "iter.pkl"), "wb") as f:
                pkl.dump(total_timesteps, f)

        # If we finished the episode, we might have to (1) train the controller, (2) evaluate the
        # current policy and (3) process the final state/obs and store the manager transition, if
        # it was not just created. We train the controller at the end of every episode and the
        # manager every X timesteps (not episodes!).
        if done:
            if total_timesteps != 0 and not just_loaded:
                print("Timestep", total_timesteps, "Reward for episode", episode_reward)

                # print('Training Controller...')
                ctrl_act_loss, ctrl_crit_loss = controller_policy.train(
                    controller_buffer, episode_timesteps, writer, total_timesteps,
                    args.ctrl_batch_size, args.ctrl_discount, args.ctrl_tau)
                print("Timestep", total_timesteps, "Actor loss", ctrl_act_loss, "Critic loss", ctrl_crit_loss)

                writer.add_scalar('data/controller_actor_loss', ctrl_act_loss, total_timesteps)
                writer.add_scalar('data/controller_critic_loss', ctrl_crit_loss, total_timesteps)
                writer.add_scalar('data/controller_ep_rew', episode_reward, total_timesteps)
                writer.add_scalar('data/manager_ep_rew', episode_reward, total_timesteps)

                # Train Manager periodically
                if timesteps_since_manager >= args.train_manager_freq:
                    # print('Training Manager...')
                    timesteps_since_manager = 0
                    man_act_loss, man_crit_loss = manager_policy.train(
                        controller_policy, manager_buffer,
                        ceil(episode_timesteps / args.train_manager_freq),
                        writer, total_timesteps,
                        args.man_batch_size, args.discount, args.man_tau
                    )
                    writer.add_scalar('data/manager_actor_loss', man_act_loss, total_timesteps)
                    writer.add_scalar('data/manager_critic_loss', man_crit_loss, total_timesteps)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval = 0
                    avg_ep_rew, avg_controller_rew, avg_steps, avg_env_finish = evaluate_policy(
                        env, writer, manager_policy, controller_policy,
                        calculate_controller_reward, args.ctrl_rew_scale,
                        args.manager_propose_freq, len(evaluations),
                        render=args.render_in_eval
                    )
                    writer.add_scalar('eval/avg_ep_rew', avg_ep_rew, total_timesteps)
                    writer.add_scalar('eval/avg_controller_rew', avg_controller_rew, total_timesteps)
                    writer.add_scalar('eval/avg_steps_to_finish', avg_steps, total_timesteps)
                    writer.add_scalar('eval/perc_env_goal_achieved', avg_env_finish, total_timesteps)

                    evaluations.append([avg_ep_rew, avg_controller_rew, avg_steps])

                    if args.save_models:
                        controller_policy.save(file_name + '_controller', directory="./pytorch_models")
                        manager_policy.save(file_name + '_manager', directory="./pytorch_models")

                    np.save("./results/%s" % file_name, evaluations)

                # Process final state/obs, store manager transition, if it was not just created
                if len(manager_transition[-2]) != 1:  # If there's more than 1 state in the transition
                    # Manager transitions are a list of the form
                    # [initial state, final state, goal, subgoal, manager reward, done, states, actions]
                    manager_transition[1] = state        # Store the final state
                    manager_transition[5] = float(True)  # Set done to true

                    # Every manager transition should have the same length of sequences.
                    # In practice, the only reason we care about storing the low level actions is so that
                    # we can adjust the subgoals in the meta transition (to take into account the fact that
                    # the low level controller changed). We try different subgoals, see which one makes
                    # the stored observations / actions the most likely, and pick that one. There's nothing
                    # here that requires a specific length, it's just more convenient. What they do is
                    # put +inf, which results in +inf in the calculations later, and then they replace
                    # all those +inf by 0 in the cost, which solves everything at once.
                    #
                    # Therefore, having actions of different lengths isn't a correctness problem, just more annoying.
                    if len(manager_transition[-2]) <= args.manager_propose_freq:
                        # The original code just had np.inf, but for Lunar Lander that caused problems,
                        # so what I do is simply create an action array filled with np.inf. This seemed
                        # to fix the problem.
                        fake_action = np.repeat([np.inf], manager_transition[-1][-1].shape[0])

                        while len(manager_transition[-2]) <= args.manager_propose_freq:
                            manager_transition[-1].append(fake_action)
                            manager_transition[-2].append(state)

                    manager_buffer.add(manager_transition)

            # Reset environment
            obs = env.reset()
            goal = obs['desired_goal']
            state = obs['observation']
            done = False
            episode_reward = 0
            episode_timesteps = 0
            just_loaded = False
            episode_num += 1

            # Create new manager transition (sample new subgoal)
            subgoal = manager_policy.sample_subgoal(state, goal)
            # print(total_timesteps, subgoal)

            if episode_num % ACTION_AND_SUBGOAL_LOGGING_FREQUENCY == 0:
                for i in range(min(subgoal.shape[0], 3)):
                    writer.add_scalar('values/subgoal_%d' % i, subgoal[i], total_timesteps)

            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [state, None, goal, subgoal, 0, False, [state], []]

        # TODO: Scale action to environment
        action = controller_policy.select_action(state, subgoal)
        action = ctrl_noise.perturb_action(action, max_action)

        if episode_num % ACTION_AND_SUBGOAL_LOGGING_FREQUENCY == 0:
            for i in range(min(action.shape[0], 2)):
                writer.add_scalar('values/action_%d' % i, action[i], total_timesteps)

        # Perform action, get (next_state, reward, done)
        next_tup, manager_reward, env_done, _ = env.step(action)
        writer.add_scalar('values/env_reward', manager_reward, total_timesteps)

        # Update cumulative reward (env. reward) for manager
        manager_transition[4] += manager_reward * args.man_rew_scale

        # Process
        next_goal = next_tup['desired_goal']
        next_state = next_tup['observation']

        # Append low level sequence for off policy correction
        if utils.has_nan_or_inf(action):
            raise Exception()
        manager_transition[-1].append(action)
        manager_transition[-2].append(next_state)

        # Calculate reward, transition subgoal
        # print(np.sum(np.abs(state - next_state)), subgoal)
        controller_reward = calculate_controller_reward(state, subgoal, next_state, args.ctrl_rew_scale)
        subgoal = controller_policy.subgoal_transition(state, subgoal, next_state)
        controller_goal = subgoal

        # Is the episode over?
        if env_done:
            done = True

        episode_reward += controller_reward

        # Store low level transition
        if args.inner_dones:
            ctrl_done = done or timesteps_since_subgoal % args.manager_propose_freq == 0
        else:
            ctrl_done = done

        controller_buffer.add(
            (state, next_state, controller_goal, action, controller_reward, float(ctrl_done), [], []))

        # Update state parameters
        state = next_state
        goal = next_goal

        # Update counters
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
        timesteps_since_manager += 1
        timesteps_since_subgoal += 1

        # Every X timesteps, store manager transition in buffer and pick a new subgoal
        if timesteps_since_subgoal % args.manager_propose_freq == 0:
            # Finish, add transition
            manager_transition[1] = state
            manager_transition[5] = float(done)
            manager_buffer.add(manager_transition)

            subgoal = manager_policy.sample_subgoal(state, goal)
            subgoal = man_noise.perturb_action(subgoal, max_action=man_scale)
            # print(total_timesteps, subgoal)

            if episode_num % ACTION_AND_SUBGOAL_LOGGING_FREQUENCY == 0:
                for i in range(min(subgoal.shape[0], 3)):
                    writer.add_scalar('values/subgoal_%d' % i, subgoal[i], total_timesteps)

            # Reset number of timesteps since we sampled a subgoal
            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [state, None, goal, subgoal, 0, False, [state], []]

    # Final evaluation
    evaluations.append([evaluate_policy(env, writer, manager_policy, controller_policy,
                                        calculate_controller_reward, args.ctrl_rew_scale,
                                        args.manager_propose_freq, len(evaluations))])

    if args.save_models:
        controller_policy.save(file_name + '_controller', directory="./pytorch_models")
        manager_policy.save(file_name + '_manager', directory="./pytorch_models")

    np.save("./results/%s" % file_name, evaluations)
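
# A minimal sketch of the padding convention described in the manager-transition comment
# above: action sequences are padded to a fixed length with +inf, and the off-policy
# correction later zeroes out the cost contributed by the padded steps. The function and
# argument names are illustrative, not part of this repo's API.
def _sketch_correction_cost(stored_actions, policy_actions):
    # stored_actions may contain rows filled with +inf (the fake_action padding above)
    diff = np.square(np.asarray(stored_actions) - np.asarray(policy_actions)).sum(axis=-1)
    diff[np.isinf(diff)] = 0.0  # padded steps contribute nothing to the candidate's cost
    return diff.sum()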
def get_env_and_policy(args):
    # Load environment
    if "-v" in args.env_name:
        env = gym.make(args.env_name)
        env.env.reward_type = args.reward_type
        env.distance_threshold = env.env.distance_threshold

        max_action = np.array([1.54302745e+00, 1.21865324e+00, 9.98163424e-01,
                               1.97805133e-04, 7.15193042e-05, 2.56647627e-02,
                               2.30302501e-02, 2.13756120e-02, 1.19019512e-02,
                               6.31742249e-03])
        min_action = np.array([7.95019864e-01, -5.56192570e-02, 3.32176206e-01,
                               0.00000000e+00, 0.00000000e+00, -2.58566763e-02,
                               -2.46581777e-02, -1.77669761e-02, -1.13476014e-02,
                               -5.08970149e-04])

        man_scale = max_action - min_action
        controller_goal_dim = man_scale.shape[0]
        no_xy = False  # Can't just take out first dimensions; movement here is different than for ants.
    else:
        # We'll be running on one of the various Ant envs
        env = EnvWithGoal(create_maze_env(args.env_name), args.env_name)

        low = np.array((-10, -10, -0.5, -1, -1, -1, -1,
                        -0.5, -0.3, -0.5, -0.3, -0.5, -0.3, -0.5, -0.3))
        high = -low
        man_scale = (high - low) / 2
        controller_goal_dim = man_scale.shape[0]
        # scale = np.array([10, 10, 0.5, 1, 1, 1] + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3)
        no_xy = True

    # Fetch environment meta info
    obs = env.reset()
    goal = obs['desired_goal']
    state = obs['observation']

    state_dim = state.shape[0]
    goal_dim = goal.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy, replay buffers
    controller_policy = hiro.Controller(
        state_dim=state_dim,
        goal_dim=controller_goal_dim,
        action_dim=action_dim,
        max_action=max_action,
        actor_lr=args.ctrl_act_lr,
        critic_lr=args.ctrl_crit_lr,
        ctrl_rew_type=args.ctrl_rew_type,
        no_xy=no_xy,
    )

    manager_policy = hiro.Manager(
        state_dim=state_dim,
        goal_dim=goal_dim,
        action_dim=controller_goal_dim,
        actor_lr=args.man_act_lr,
        critic_lr=args.man_crit_lr,
        candidate_goals=args.candidate_goals,
        correction=not args.no_correction,
        scale=man_scale,
        should_reach_subgoal=args.should_reach_subgoal,
        subgoal_dist_cost_cf=args.man_subgoal_dist_cf
    )

    # Reload weights from file
    output_dir = os.path.join(args.log_dir, args.log_file)
    manager_policy.load(output_dir)
    controller_policy.load(output_dir)

    calculate_controller_reward = get_reward_function(controller_goal_dim)

    return env, controller_policy, manager_policy, calculate_controller_reward
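
# Illustrative only: the pieces returned by get_env_and_policy are typically fed straight
# into evaluate_policy, mirroring the call made inside run_hiro above (the writer location
# is an assumption; any SummaryWriter would do).
def _sketch_eval(args):
    env, controller_policy, manager_policy, ctrl_reward_fn = get_env_and_policy(args)
    writer = SummaryWriter(logdir=os.path.join(args.log_dir, args.log_file))
    return evaluate_policy(env, writer, manager_policy, controller_policy, ctrl_reward_fn,
                           args.ctrl_rew_scale, args.manager_propose_freq, 0)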
def run_hiro(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    if not os.path.exists(os.path.join(args.log_dir, args.log_file)):
        can_load = False
        os.makedirs(os.path.join(args.log_dir, args.log_file))
    else:
        can_load = True
        print("Existing directory found; may be able to load weights.")
    output_dir = os.path.join(args.log_dir, args.log_file)
    print("Logging in {}".format(output_dir))

    if "-v" in args.env_name:
        env = gym.make(args.env_name)
        env.env.reward_type = args.reward_type
        env.distance_threshold = env.env.distance_threshold

        max_action = np.array([1.54302745e+00, 1.21865324e+00, 9.98163424e-01,
                               1.97805133e-04, 7.15193042e-05, 2.56647627e-02,
                               2.30302501e-02, 2.13756120e-02, 1.19019512e-02,
                               6.31742249e-03])
        min_action = np.array([7.95019864e-01, -5.56192570e-02, 3.32176206e-01,
                               0.00000000e+00, 0.00000000e+00, -2.58566763e-02,
                               -2.46581777e-02, -1.77669761e-02, -1.13476014e-02,
                               -5.08970149e-04])

        man_scale = max_action - min_action
        controller_goal_dim = man_scale.shape[0]
        no_xy = False  # Can't just take out first dimensions; movement here is different than for ants.
    else:
        print(args.env_name)
        # We'll be running on one of the various Ant envs
        env = EnvWithGoal(create_maze_env(args.env_name), args.env_name)

        low = np.array((-10, -10, -0.5, -1, -1, -1, -1,
                        -0.5, -0.3, -0.5, -0.3, -0.5, -0.3, -0.5, -0.3))
        high = -low
        man_scale = (high - low) / 2
        controller_goal_dim = man_scale.shape[0]
        # scale = np.array([10, 10, 0.5, 1, 1, 1] + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3)
        no_xy = True

    obs = env.reset()

    goal = obs['desired_goal']
    state = obs['observation']

    # # Write Hyperparameters to file
    # print("---------------------------------------")
    # print("Current Arguments:")
    # with open(os.path.join(args.log_dir, args.log_file, "hps.txt"), 'w') as f:
    #     for arg in vars(args):
    #         print("{}: {}".format(arg, getattr(args, arg)))
    #         f.write("{}: {}\n".format(arg, getattr(args, arg)))
    # print("---------------------------------------\n")

    writer = SummaryWriter(logdir=os.path.join(args.log_dir, args.log_file))
    torch.cuda.set_device(0)

    env_name = type(env).__name__
    file_name = 'hiro_{}'.format(env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = state.shape[0]
    goal_dim = goal.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy, replay buffers
    controller_policy = hiro.Controller(
        state_dim=state_dim,
        goal_dim=controller_goal_dim,
        action_dim=action_dim,
        max_action=max_action,
        actor_lr=args.ctrl_act_lr,
        critic_lr=args.ctrl_crit_lr,
        ctrl_rew_type=args.ctrl_rew_type,
        no_xy=no_xy,
    )

    manager_policy = hiro.Manager(
        state_dim=state_dim,
        goal_dim=goal_dim,
        action_dim=controller_goal_dim,
        actor_lr=args.man_act_lr,
        critic_lr=args.man_crit_lr,
        candidate_goals=args.candidate_goals,
        correction=not args.no_correction,
        scale=man_scale
    )

    calculate_controller_reward = get_reward_function(controller_goal_dim)

    if args.noise_type == "ou":
        man_noise = utils.OUNoise(state_dim, sigma=args.man_noise_sigma)
        ctrl_noise = utils.OUNoise(action_dim, sigma=args.ctrl_noise_sigma)
    elif args.noise_type == "normal":
        man_noise = utils.NormalNoise(sigma=args.man_noise_sigma)
        ctrl_noise = utils.NormalNoise(sigma=args.ctrl_noise_sigma)

    manager_buffer = utils.ReplayBuffer(maxsize=args.man_buffer_size)
    controller_buffer = utils.ReplayBuffer(maxsize=args.ctrl_buffer_size)

    if can_load and args.load:
        try:
            manager_policy.load(output_dir)
            controller_policy.load(output_dir)
            manager_buffer.load(os.path.join(output_dir, "mbuf.npz"))
            controller_buffer.load(os.path.join(output_dir, "cbuf.npz"))
            with open(os.path.join(output_dir, "iter.pkl"), "rb") as f:
                iter = pkl.load(f) + 1
            print("Loaded successfully")
            just_loaded = True
        except Exception as e:
            iter = 0
            just_loaded = False
            print(e, "Not loading")
    else:
        iter = 0
        just_loaded = False

    # Logging Parameters
    total_timesteps = iter
    timesteps_since_eval = 0
    timesteps_since_manager = 0
    episode_timesteps = 0
    timesteps_since_subgoal = 0
    episode_num = 0
    done = True
    evaluations = []

    while total_timesteps < args.max_timesteps:
        if args.save_every > 0 and (total_timesteps + 1) % args.save_every == 0:
            print("Saving")
            controller_policy.save(output_dir)
            manager_policy.save(output_dir)
            manager_buffer.save(os.path.join(output_dir, "mbuf.npz"))
            controller_buffer.save(os.path.join(output_dir, "cbuf.npz"))
            with open(os.path.join(output_dir, "iter.pkl"), "wb") as f:
                pkl.dump(total_timesteps, f)

        if done:
            if total_timesteps != 0 and not just_loaded:
                # print('Training Controller...')
                ctrl_act_loss, ctrl_crit_loss = controller_policy.train(
                    controller_buffer, episode_timesteps,
                    args.ctrl_batch_size, args.ctrl_discount, args.ctrl_tau)
                # print(ctrl_act_loss, ctrl_crit_loss)

                writer.add_scalar('data/controller_actor_loss', ctrl_act_loss, total_timesteps)
                writer.add_scalar('data/controller_critic_loss', ctrl_crit_loss, total_timesteps)
                writer.add_scalar('data/controller_ep_rew', episode_reward, total_timesteps)
                writer.add_scalar('data/manager_ep_rew', manager_transition[4], total_timesteps)

                # Train Manager
                if timesteps_since_manager >= args.train_manager_freq:
                    # print('Training Manager...')
                    timesteps_since_manager = 0
                    man_act_loss, man_crit_loss = manager_policy.train(
                        controller_policy, manager_buffer,
                        ceil(episode_timesteps / args.train_manager_freq),
                        args.man_batch_size, args.discount, args.man_tau)
                    writer.add_scalar('data/manager_actor_loss', man_act_loss, total_timesteps)
                    writer.add_scalar('data/manager_critic_loss', man_crit_loss, total_timesteps)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval = 0
                    avg_ep_rew, avg_controller_rew, avg_steps, avg_env_finish = evaluate_policy(
                        env, writer, manager_policy, controller_policy,
                        calculate_controller_reward, args.ctrl_rew_scale,
                        args.manager_propose_freq, len(evaluations))

                    writer.add_scalar('eval/avg_ep_rew', avg_ep_rew, total_timesteps)
                    writer.add_scalar('eval/avg_controller_rew', avg_controller_rew, total_timesteps)
                    writer.add_scalar('eval/avg_steps_to_finish', avg_steps, total_timesteps)
                    writer.add_scalar('eval/perc_env_goal_achieved', avg_env_finish, total_timesteps)

                    evaluations.append([avg_ep_rew, avg_controller_rew, avg_steps])

                    if args.save_models:
                        controller_policy.save(file_name + '_controller', directory="./pytorch_models")
                        manager_policy.save(file_name + '_manager', directory="./pytorch_models")

                    np.save("./results/%s" % file_name, evaluations)

                # Process final state/obs, store manager transition, if it was not just created
                if len(manager_transition[-2]) != 1:
                    manager_transition[1] = state
                    manager_transition[5] = float(True)

                    # Every manager transition should have the same length of sequences
                    if len(manager_transition[-2]) <= args.manager_propose_freq:
                        while len(manager_transition[-2]) <= args.manager_propose_freq:
                            manager_transition[-1].append(np.inf)
                            manager_transition[-2].append(state)

                    manager_buffer.add(manager_transition)

            # Reset environment
            obs = env.reset()
            goal = obs['desired_goal']
            state = obs['observation']
            done = False
            episode_reward = 0
            episode_timesteps = 0
            just_loaded = False
            episode_num += 1

            # Create new manager transition
            subgoal = manager_policy.sample_goal(state, goal)
            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [state, None, goal, subgoal, 0, False, [state], []]

        # TODO: Scale action to environment
        action = controller_policy.select_action(state, subgoal)
        action = ctrl_noise.perturb_action(action, max_action)

        # Perform action, get (next_state, reward, done)
        next_tup, manager_reward, env_done, _ = env.step(action)

        # Update cumulative reward (env. reward) for manager
        manager_transition[4] += manager_reward * args.man_rew_scale

        # Process
        next_goal = next_tup['desired_goal']
        next_state = next_tup['observation']

        # Append low level sequence for off policy correction
        manager_transition[-1].append(action)
        manager_transition[-2].append(next_state)

        # Calculate reward, transition subgoal
        # print(np.sum(np.abs(state - next_state)), subgoal)
        controller_reward = calculate_controller_reward(state, subgoal, next_state, args.ctrl_rew_scale)
        subgoal = controller_policy.subgoal_transition(state, subgoal, next_state)
        controller_goal = subgoal

        # Is the episode over?
        if env_done:
            done = True

        episode_reward += controller_reward

        # Store low level transition
        if args.inner_dones:
            ctrl_done = done or timesteps_since_subgoal % args.manager_propose_freq == 0
        else:
            ctrl_done = done

        controller_buffer.add(
            (state, next_state, controller_goal, action, controller_reward, float(ctrl_done), [], []))

        # Update state parameters
        state = next_state
        goal = next_goal

        # Update counters
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
        timesteps_since_manager += 1
        timesteps_since_subgoal += 1

        if total_timesteps % 1000 == 0:
            print('total timesteps', total_timesteps)

        if timesteps_since_subgoal % args.manager_propose_freq == 0:
            # Finish, add transition
            manager_transition[1] = state
            manager_transition[5] = float(done)
            manager_buffer.add(manager_transition)

            subgoal = manager_policy.sample_goal(state, goal)
            subgoal = man_noise.perturb_action(subgoal, max_action=man_scale)

            # Reset number of timesteps since we sampled a subgoal
            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [state, None, goal, subgoal, 0, False, [state], []]

    # Final evaluation
    evaluations.append([evaluate_policy(env, writer, manager_policy, controller_policy,
                                        calculate_controller_reward, args.ctrl_rew_scale,
                                        args.manager_propose_freq, len(evaluations))])

    if args.save_models:
        controller_policy.save(file_name + '_controller', directory="./pytorch_models")
        manager_policy.save(file_name + '_manager', directory="./pytorch_models")

    np.save("./results/%s" % file_name, evaluations)
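
# Readability aid only: both run_hiro variants above store manager transitions as plain
# 8-element lists; the field order documented in the comments could equivalently be spelled
# out as a named tuple (the code keeps using lists, this is not part of the repo's API).
from collections import namedtuple

ManagerTransition = namedtuple("ManagerTransition", [
    "initial_state", "final_state", "goal", "subgoal",
    "manager_reward", "done", "states", "actions"])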
def eval_hrac(args):
    if "Maze" in args.env_name:
        env = EnvWithGoal(create_maze_env(args.env_name, args.seed), args.env_name)
    else:
        env = GatherEnv(create_gather_env(args.env_name, args.seed), args.env_name)

    if "Ant" in args.env_name:
        low = np.array((-10, -10, -0.5, -1, -1, -1, -1,
                        -0.5, -0.3, -0.5, -0.3, -0.5, -0.3, -0.5, -0.3))
        max_action = float(env.action_space.high[0])
    else:
        raise NotImplementedError

    high = -low
    man_scale = (high - low) / 2
    controller_goal_dim = 2

    if args.absolute_goal:
        man_scale[0] = 12
        man_scale[1] = 12
        no_xy = False
    else:
        no_xy = True

    obs = env.reset()

    goal = obs["desired_goal"]
    state = obs["observation"]

    torch.cuda.set_device(args.gid)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    file_name = "{}_{}_{}".format(args.env_name, args.algo, args.seed)
    output_data = {"frames": [], "reward": [], "dist": []}

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = state.shape[0]
    if args.env_name == "AntMaze":
        goal_dim = goal.shape[0]
    else:
        goal_dim = 0
    action_dim = env.action_space.shape[0]

    controller_policy = hrac.Controller(
        state_dim=state_dim,
        goal_dim=controller_goal_dim,
        action_dim=action_dim,
        max_action=max_action,
        actor_lr=0,
        critic_lr=0,
        no_xy=no_xy,
        absolute_goal=args.absolute_goal,
        policy_noise=0,
        noise_clip=0
    )

    manager_policy = hrac.Manager(
        state_dim=state_dim,
        goal_dim=goal_dim,
        action_dim=controller_goal_dim,
        actor_lr=0,
        critic_lr=0,
        candidate_goals=10,
        correction=True,
        scale=man_scale,
        goal_loss_coeff=10.,
        absolute_goal=args.absolute_goal
    )

    if args.load:
        try:
            manager_policy.load(args.model_dir, args.env_name, args.algo)
            controller_policy.load(args.model_dir, args.env_name, args.algo)
            print("Loaded successfully.")
            just_loaded = True
        except Exception as e:
            just_loaded = False
            print(e, "Loading failed.")
    else:
        just_loaded = False

    calculate_controller_reward = get_reward_function(
        controller_goal_dim, absolute_goal=args.absolute_goal,
        binary_reward=args.binary_int_reward)

    evaluate_policy(env, args.env_name, manager_policy, controller_policy,
                    calculate_controller_reward, 1.0,
                    args.manager_propose_freq, args.eval_episodes)
def run_hrac(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./models"):
        os.makedirs("./models")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    if not os.path.exists(os.path.join(args.log_dir, args.algo)):
        os.makedirs(os.path.join(args.log_dir, args.algo))
    output_dir = os.path.join(args.log_dir, args.algo)
    print("Logging in {}".format(output_dir))

    if "Maze" in args.env_name:
        env = EnvWithGoal(create_maze_env(args.env_name, args.seed), args.env_name)
    else:
        env = GatherEnv(create_gather_env(args.env_name, args.seed), args.env_name)

    if "Ant" in args.env_name:
        low = np.array((-10, -10, -0.5, -1, -1, -1, -1,
                        -0.5, -0.3, -0.5, -0.3, -0.5, -0.3, -0.5, -0.3))
        max_action = float(env.action_space.high[0])
        policy_noise = 0.2
        noise_clip = 0.5
    else:
        raise NotImplementedError

    high = -low
    man_scale = (high - low) / 2
    controller_goal_dim = 2

    if args.absolute_goal:
        man_scale[0] = 12
        man_scale[1] = 12
        no_xy = False
    else:
        no_xy = True

    obs = env.reset()

    goal = obs["desired_goal"]
    state = obs["observation"]

    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.algo))
    torch.cuda.set_device(args.gid)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    file_name = "{}_{}_{}".format(args.env_name, args.algo, args.seed)
    output_data = {"frames": [], "reward": [], "dist": []}

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = state.shape[0]
    if args.env_name == "AntMaze":
        goal_dim = goal.shape[0]
    else:
        goal_dim = 0
    action_dim = env.action_space.shape[0]

    controller_policy = hrac.Controller(
        state_dim=state_dim,
        goal_dim=controller_goal_dim,
        action_dim=action_dim,
        max_action=max_action,
        actor_lr=args.ctrl_act_lr,
        critic_lr=args.ctrl_crit_lr,
        no_xy=no_xy,
        absolute_goal=args.absolute_goal,
        policy_noise=policy_noise,
        noise_clip=noise_clip
    )

    manager_policy = hrac.Manager(
        state_dim=state_dim,
        goal_dim=goal_dim,
        action_dim=controller_goal_dim,
        actor_lr=args.man_act_lr,
        critic_lr=args.man_crit_lr,
        candidate_goals=args.candidate_goals,
        correction=not args.no_correction,
        scale=man_scale,
        goal_loss_coeff=args.goal_loss_coeff,
        absolute_goal=args.absolute_goal
    )

    calculate_controller_reward = get_reward_function(
        controller_goal_dim, absolute_goal=args.absolute_goal,
        binary_reward=args.binary_int_reward)

    if args.noise_type == "ou":
        man_noise = utils.OUNoise(state_dim, sigma=args.man_noise_sigma)
        ctrl_noise = utils.OUNoise(action_dim, sigma=args.ctrl_noise_sigma)
    elif args.noise_type == "normal":
        man_noise = utils.NormalNoise(sigma=args.man_noise_sigma)
        ctrl_noise = utils.NormalNoise(sigma=args.ctrl_noise_sigma)

    manager_buffer = utils.ReplayBuffer(maxsize=args.man_buffer_size)
    controller_buffer = utils.ReplayBuffer(maxsize=args.ctrl_buffer_size)

    # Initialize adjacency matrix and adjacency network
    n_states = 0
    state_list = []
    state_dict = {}
    adj_mat = np.diag(np.ones(1000, dtype=np.uint8))
    traj_buffer = utils.TrajectoryBuffer(capacity=args.traj_buffer_size)
    a_net = ANet(controller_goal_dim, args.r_hidden_dim, args.r_embedding_dim)
    if args.load_adj_net:
        print("Loading adjacency network...")
        a_net.load_state_dict(torch.load("./models/a_network.pth"))
    a_net.to(device)
    optimizer_r = optim.Adam(a_net.parameters(), lr=args.lr_r)

    if args.load:
        try:
            manager_policy.load("./models")
            controller_policy.load("./models")
            print("Loaded successfully.")
            just_loaded = True
        except Exception as e:
            just_loaded = False
            print(e, "Loading failed.")
    else:
        just_loaded = False

    # Logging Parameters
    total_timesteps = 0
    timesteps_since_eval = 0
    timesteps_since_manager = 0
    episode_timesteps = 0
    timesteps_since_subgoal = 0
    episode_num = 0
    done = True
    evaluations = []

    while total_timesteps < args.max_timesteps:
        if done:
            if total_timesteps != 0 and not just_loaded:
                if episode_num % 10 == 0:
                    print("Episode {}".format(episode_num))

                # Train controller
                ctrl_act_loss, ctrl_crit_loss = controller_policy.train(
                    controller_buffer, episode_timesteps,
                    args.ctrl_batch_size, args.ctrl_discount, args.ctrl_tau)
                if episode_num % 10 == 0:
                    print("Controller actor loss: {:.3f}".format(ctrl_act_loss))
                    print("Controller critic loss: {:.3f}".format(ctrl_crit_loss))
                writer.add_scalar("data/controller_actor_loss", ctrl_act_loss, total_timesteps)
                writer.add_scalar("data/controller_critic_loss", ctrl_crit_loss, total_timesteps)

                writer.add_scalar("data/controller_ep_rew", episode_reward, total_timesteps)
                writer.add_scalar("data/manager_ep_rew", manager_transition[4], total_timesteps)

                # Train manager
                if timesteps_since_manager >= args.train_manager_freq:
                    timesteps_since_manager = 0
                    r_margin = (args.r_margin_pos + args.r_margin_neg) / 2

                    man_act_loss, man_crit_loss, man_goal_loss = manager_policy.train(
                        controller_policy, manager_buffer,
                        ceil(episode_timesteps / args.train_manager_freq),
                        batch_size=args.man_batch_size, discount=args.discount,
                        tau=args.man_tau, a_net=a_net, r_margin=r_margin)

                    writer.add_scalar("data/manager_actor_loss", man_act_loss, total_timesteps)
                    writer.add_scalar("data/manager_critic_loss", man_crit_loss, total_timesteps)
                    writer.add_scalar("data/manager_goal_loss", man_goal_loss, total_timesteps)

                    if episode_num % 10 == 0:
                        print("Manager actor loss: {:.3f}".format(man_act_loss))
                        print("Manager critic loss: {:.3f}".format(man_crit_loss))
                        print("Manager goal loss: {:.3f}".format(man_goal_loss))

                # Evaluate
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval = 0
                    avg_ep_rew, avg_controller_rew, avg_steps, avg_env_finish = evaluate_policy(
                        env, args.env_name, manager_policy, controller_policy,
                        calculate_controller_reward, args.ctrl_rew_scale,
                        args.manager_propose_freq, len(evaluations))

                    writer.add_scalar("eval/avg_ep_rew", avg_ep_rew, total_timesteps)
                    writer.add_scalar("eval/avg_controller_rew", avg_controller_rew, total_timesteps)

                    if "Maze" in args.env_name:
                        writer.add_scalar("eval/avg_steps_to_finish", avg_steps, total_timesteps)
                        writer.add_scalar("eval/perc_env_goal_achieved", avg_env_finish, total_timesteps)

                    evaluations.append([avg_ep_rew, avg_controller_rew, avg_steps])
                    output_data["frames"].append(total_timesteps)
                    if "Maze" in args.env_name:
                        output_data["reward"].append(avg_env_finish)
                    else:
                        output_data["reward"].append(avg_ep_rew)
                    output_data["dist"].append(-avg_controller_rew)

                    if args.save_models:
                        controller_policy.save("./models", args.env_name, args.algo)
                        manager_policy.save("./models", args.env_name, args.algo)

                # Build the state adjacency matrix from stored trajectories and train the adjacency network
                if traj_buffer.full():
                    for traj in traj_buffer.get_trajectory():
                        for i in range(len(traj)):
                            for j in range(1, min(args.manager_propose_freq, len(traj) - i)):
                                s1 = tuple(np.round(traj[i][:controller_goal_dim]).astype(np.int32))
                                s2 = tuple(np.round(traj[i + j][:controller_goal_dim]).astype(np.int32))
                                if s1 not in state_list:
                                    state_list.append(s1)
                                    state_dict[s1] = n_states
                                    n_states += 1
                                if s2 not in state_list:
                                    state_list.append(s2)
                                    state_dict[s2] = n_states
                                    n_states += 1
                                adj_mat[state_dict[s1], state_dict[s2]] = 1
                                adj_mat[state_dict[s2], state_dict[s1]] = 1
                    print("Explored states: {}".format(n_states))
                    flags = np.ones((25, 25))
                    for s in state_list:
                        flags[int(s[0]), int(s[1])] = 0
                    print(flags)

                    if not args.load_adj_net:
                        print("Training adjacency network...")
                        utils.train_adj_net(a_net, state_list, adj_mat[:n_states, :n_states],
                                            optimizer_r, args.r_margin_pos, args.r_margin_neg,
                                            n_epochs=args.r_training_epochs,
                                            batch_size=args.r_batch_size,
                                            device=device, verbose=True)

                        if args.save_models:
                            r_filename = os.path.join(
                                "./models", "{}_{}_a_network.pth".format(args.env_name, args.algo))
                            torch.save(a_net.state_dict(), r_filename)
                            print("----- Adjacency network {} saved. -----".format(episode_num))

                    traj_buffer.reset()

                if len(manager_transition[-2]) != 1:
                    manager_transition[1] = state
                    manager_transition[5] = float(True)
                    manager_buffer.add(manager_transition)

            obs = env.reset()
            goal = obs["desired_goal"]
            state = obs["observation"]
            traj_buffer.create_new_trajectory()
            traj_buffer.append(state)
            done = False
            episode_reward = 0
            episode_timesteps = 0
            just_loaded = False
            episode_num += 1

            subgoal = manager_policy.sample_goal(state, goal)
            timesteps_since_subgoal = 0
            manager_transition = [state, None, goal, subgoal, 0, False, [state], []]

        action = controller_policy.select_action(state, subgoal)
        action = ctrl_noise.perturb_action(action, -max_action, max_action)
        action_copy = action.copy()

        next_tup, manager_reward, env_done, _ = env.step(action_copy)

        # Update cumulative reward for the manager
        manager_transition[4] += manager_reward * args.man_rew_scale

        next_goal = next_tup["desired_goal"]
        next_state = next_tup["observation"]
        traj_buffer.append(next_state)

        # Append low level sequence for off policy correction
        manager_transition[-1].append(action)
        manager_transition[-2].append(next_state)

        controller_reward = calculate_controller_reward(
            state, subgoal, next_state, args.ctrl_rew_scale)
        subgoal = controller_policy.subgoal_transition(state, subgoal, next_state)
        controller_goal = subgoal

        if env_done:
            done = True
        episode_reward += controller_reward

        # Store low level transition
        if args.inner_dones:
            ctrl_done = done or timesteps_since_subgoal % args.manager_propose_freq == 0
        else:
            ctrl_done = done

        controller_buffer.add(
            (state, next_state, controller_goal, action,
             controller_reward, float(ctrl_done), [], []))

        state = next_state
        goal = next_goal

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
        timesteps_since_manager += 1
        timesteps_since_subgoal += 1

        if timesteps_since_subgoal % args.manager_propose_freq == 0:
            manager_transition[1] = state
            manager_transition[5] = float(done)
            manager_buffer.add(manager_transition)

            subgoal = manager_policy.sample_goal(state, goal)

            if not args.absolute_goal:
                subgoal = man_noise.perturb_action(
                    subgoal,
                    min_action=-man_scale[:controller_goal_dim],
                    max_action=man_scale[:controller_goal_dim])
            else:
                subgoal = man_noise.perturb_action(
                    subgoal,
                    min_action=-man_scale[:controller_goal_dim] + 8,
                    max_action=man_scale[:controller_goal_dim] + 8)

            # Reset number of timesteps since we sampled a subgoal
            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [state, None, goal, subgoal, 0, False, [state], []]

    # Final evaluation
    avg_ep_rew, avg_controller_rew, avg_steps, avg_env_finish = evaluate_policy(
        env, args.env_name, manager_policy, controller_policy,
        calculate_controller_reward, args.ctrl_rew_scale,
        args.manager_propose_freq, len(evaluations))
    evaluations.append([avg_ep_rew, avg_controller_rew, avg_steps])
    output_data["frames"].append(total_timesteps)
    if "Maze" in args.env_name:
        output_data["reward"].append(avg_env_finish)
    else:
        output_data["reward"].append(avg_ep_rew)
    output_data["dist"].append(-avg_controller_rew)

    if args.save_models:
        controller_policy.save("./models", args.env_name, args.algo)
        manager_policy.save("./models", args.env_name, args.algo)

    output_df = pd.DataFrame(output_data)
    output_df.to_csv(os.path.join("./results", file_name + ".csv"),
                     float_format="%.4f", index=False)

    print("Training finished.")
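
# A condensed sketch of the adjacency bookkeeping performed inline in run_hrac above:
# visited positions are rounded to integer cells, and two cells are marked adjacent when
# they occur within manager_propose_freq steps of each other in a stored trajectory.
# The standalone-function form and its names are illustrative, not the repo's API.
def _sketch_build_adjacency(trajectories, goal_dim, propose_freq, max_states=1000):
    adj_mat = np.diag(np.ones(max_states, dtype=np.uint8))
    state_dict = {}
    for traj in trajectories:
        cells = [tuple(np.round(s[:goal_dim]).astype(np.int32)) for s in traj]
        for i in range(len(cells)):
            for j in range(1, min(propose_freq, len(cells) - i)):
                for c in (cells[i], cells[i + j]):
                    if c not in state_dict:
                        state_dict[c] = len(state_dict)
                a, b = state_dict[cells[i]], state_dict[cells[i + j]]
                adj_mat[a, b] = adj_mat[b, a] = 1
    return adj_mat, state_dict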
# Select or generate a name for this experiment
if args.exp_name:
    experiment_name = args.exp_name
else:
    if args.eval:
        # Choose the most recently created experiment for evaluation
        dirs_str = listdirs(args.model_path)
        dirs = np.array(list(map(int, dirs_str)))
        experiment_name = dirs_str[np.argmax(dirs)]
    else:
        experiment_name = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
print(experiment_name)

# Environment and its attributes
env = EnvWithGoal(create_maze_env(args.env), args.env)
goal_dim = 2
state_dim = env.state_dim
action_dim = env.action_dim
scale = env.action_space.high * np.ones(action_dim)

# Spawn an agent
if args.td3:
    agent = TD3Agent(
        state_dim=state_dim,
        action_dim=action_dim,
        goal_dim=goal_dim,
        scale=scale,
        model_save_freq=args.model_save_freq,
        model_path=os.path.join(args.model_path, experiment_name),
        buffer_size=args.buffer_size,