Code Example #1
def run_environment(env_name, episode_length, num_episodes):
    env = EnvWithGoal(create_maze_env.create_maze_env(env_name), env_name)

    def action_fn(obs):
        action_space = env.action_space
        action_space_mean = (action_space.low + action_space.high) / 2.0
        action_space_magn = (action_space.high - action_space.low) / 2.0
        random_action = (
            action_space_mean + action_space_magn *
            np.random.uniform(low=-1.0, high=1.0, size=action_space.shape))

        return random_action

    rewards = []
    successes = []
    for ep in range(num_episodes):
        rewards.append(0.0)
        successes.append(False)
        obs = env.reset()
        for _ in range(episode_length):
            env.render()
            print(env.get_image().shape)
            obs, reward, done, _ = env.step(action_fn(obs))
            rewards[-1] += reward
            successes[-1] = success_fn(reward)
            if done:
                break

        print('Episode {} reward: {}, Success: {}'.format(
            ep + 1, rewards[-1], successes[-1]))

    print('Average Reward over {} episodes: {}'.format(num_episodes,
                                                       np.mean(rewards)))
    print('Average Success over {} episodes: {}'.format(
        num_episodes, np.mean(successes)))
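
Note: `success_fn` is called above but not defined in this snippet. A minimal sketch of what such a helper could look like, assuming success is judged by the goal-based reward crossing a fixed threshold (the threshold value is a placeholder, not taken from the original project):

SUCCESS_THRESHOLD = -5.0  # hypothetical cutoff; the original project defines its own criterion

def success_fn(reward):
    # A step counts as a success once the (negative-distance) reward exceeds the threshold.
    return reward > SUCCESS_THRESHOLD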
Code Example #2
    def test_subgoal_transition(self):
        env = EnvWithGoal(create_maze_env(ENV_NAME), ENV_NAME)
        subgoal = Subgoal()

        subgoal_dim = subgoal.action_dim
        state_dim, goal_dim, action_dim, scale_low = spawn_dims(env)
        scale_high = subgoal.action_space.high * np.ones(subgoal_dim)

        agent = HiroAgent(
            state_dim=state_dim,
            action_dim=action_dim,
            goal_dim=goal_dim,
            subgoal_dim=subgoal_dim,
            scale_low=scale_low,
            scale_high=scale_high)

        goal = np.array([5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

        state = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        next_state = np.array([1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        subgoal = agent.subgoal_transition(state, goal, next_state)

        # the transition should keep pointing at the same absolute target
        # (cf. g' = s[:goal_dim] + g - s'[:goal_dim]), so compare element-wise with NumPy
        dim = goal.shape[0]
        np.testing.assert_array_equal(state[:dim] + goal, next_state[:dim] + subgoal)
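
For context, the subgoal transition exercised by this test follows the relative-goal update quoted later on this page, g' = s[:goal_dim] + g - s'[:goal_dim]. A minimal NumPy sketch under that assumption (the real `HiroAgent.subgoal_transition` may differ in details):

import numpy as np

def subgoal_transition(state, goal, next_state):
    # Keep the absolute target fixed while the agent moves:
    # state[:dim] + goal == next_state[:dim] + new_goal
    dim = goal.shape[0]
    return state[:dim] + goal - next_state[:dim]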
Code Example #3
    def test_low_reward_negative(self):
        env = EnvWithGoal(create_maze_env(ENV_NAME), ENV_NAME)
        subgoal = Subgoal()

        subgoal_dim = subgoal.action_dim
        state_dim, goal_dim, action_dim, scale_low = spawn_dims(env)
        scale_high = subgoal.action_space.high * np.ones(subgoal_dim)

        agent = HiroAgent(
            state_dim=state_dim,
            action_dim=action_dim,
            goal_dim=goal_dim,
            subgoal_dim=subgoal_dim,
            scale_low=scale_low,
            scale_high=scale_high)

        goal = np.array([5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

        state = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        next_state = np.array([1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        reward1 = agent.low_reward(state, goal, next_state)

        state = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        next_state = np.array([-1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        reward2 = agent.low_reward(state, goal, next_state)

        self.assertTrue(reward1 > reward2)
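
The ordering asserted here is consistent with the usual HIRO intrinsic reward: the negative Euclidean distance between the position the goal points at (state + goal) and the position actually reached. A sketch under that assumption (the actual `HiroAgent.low_reward` may add scaling or clipping):

import numpy as np

def low_reward(state, goal, next_state):
    # Closer to the intended target => larger (less negative) reward.
    dim = goal.shape[0]
    return -np.linalg.norm(state[:dim] + goal - next_state[:dim])

With the values used in the test, next_state = [1, 2, ...] lies at distance 5 from the target [5, 5], while next_state = [-1, 2, ...] lies at roughly 6.7, so reward1 > reward2 holds.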
Code Example #4
def run_hiro(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    if not os.path.exists(os.path.join(args.log_dir, args.log_file)):
        can_load = False
        os.makedirs(os.path.join(args.log_dir, args.log_file))
    else:
        can_load = True
        print("Existing directory found; may be able to load weights.")
    output_dir = os.path.join(args.log_dir, args.log_file)
    print("Logging in {}".format(output_dir))

    if args.env_name in ["MountainCarContinuous-v0", "LunarLanderContinuous-v2", "Pendulum-v0"]:
        env = EnvWithGoal(
            gym.make(args.env_name),
            args.env_name,
            use_real_reward=True,
            should_scale_obs=args.should_reach_subgoal
        )
        # env.env.reward_type = args.reward_type
        if args.env_name == "MountainCarContinuous-v0":
            env.distance_threshold = -1  # We want a positive reward (i.e. a negative distance)
            min_obs, max_obs = env.base_env.observation_space.low, env.base_env.observation_space.high
            man_scale = (max_obs - min_obs) / 2
        elif args.env_name == "LunarLanderContinuous-v2":
            env.distance_threshold = -60  # We want at least a reward of 60 (i.e. a distance of -60)
            # Can't use the observation_space bounds directly, because those go from -inf to +inf,
            # so an arbitrary, untuned scale is used for the goal dimensions instead.
            man_scale = np.ones(2) * 5
        else:
            env.distance_threshold = -150  # We want a reward of 150 (TODO: placeholder value, needs tuning)
            min_obs, max_obs = env.base_env.observation_space.low, env.base_env.observation_space.high
            man_scale = (max_obs - min_obs) / 2

        if args.should_reach_subgoal:
            man_scale = np.ones(man_scale.shape)

        controller_goal_dim = man_scale.shape[0]
        no_xy = False  # Can't just take out first dimensions; movement here is different than for ants.

        controller_with_tanh = True
    elif "-v" in args.env_name:
        env = gym.make(args.env_name)
        env.env.reward_type = args.reward_type
        env.distance_threshold = env.env.distance_threshold
        max_action = np.array([1.54302745e+00, 1.21865324e+00, 9.98163424e-01, 1.97805133e-04,
                               7.15193042e-05, 2.56647627e-02, 2.30302501e-02, 2.13756120e-02,
                               1.19019512e-02, 6.31742249e-03])
        min_action = np.array(
            [7.95019864e-01, -5.56192570e-02, 3.32176206e-01, 0.00000000e+00, 0.00000000e+00, -2.58566763e-02,
             -2.46581777e-02, -1.77669761e-02, -1.13476014e-02, -5.08970149e-04])
        man_scale = max_action - min_action
        controller_goal_dim = man_scale.shape[0]
        no_xy = False  # Can't just take out first dimensions; movement here is different than for ants.
        controller_with_tanh = True
    else:
        # We'll be running on one of the various Ant envs
        env = EnvWithGoal(create_maze_env(args.env_name), args.env_name)

        # TODO: Where do these magic numbers come from?
        low = np.array((-10, -10, -0.5, -1, -1, -1, -1,
                        -0.5, -0.3, -0.5, -0.3, -0.5, -0.3, -0.5, -0.3))
        high = -low
        man_scale = (high - low) / 2
        controller_goal_dim = man_scale.shape[0]
        # scale = np.array([10, 10, 0.5, 1, 1, 1] + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3)
        no_xy = True
        controller_with_tanh = True

    obs = env.reset()

    goal = obs['desired_goal']
    state = obs['observation']

    # # Write Hyperparameters to file
    # print("---------------------------------------")
    # print("Current Arguments:")
    # with open(os.path.join(args.log_dir, args.log_file, "hps.txt"), 'w') as f:
    #     for arg in vars(args):
    #         print("{}: {}".format(arg, getattr(args, arg)))
    #         f.write("{}: {}\n".format(arg, getattr(args, arg)))
    # print("---------------------------------------\n")

    writer = SummaryWriter(logdir=os.path.join(args.log_dir, args.log_file))
    # torch.cuda.set_device(0)

    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    file_name = 'hiro_{}_{}'.format(args.env_name, current_time)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = state.shape[0]
    goal_dim = goal.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # The goal dim is smaller than the state dim. At first this seems incompatible with the paper's
    # formula g' = s + g - s' (since states and goals have different dimensions).
    # It works because the goal is a sub-part of the state, so the update rule actually used is:
    #    g' = s[:goal_dim] + g - s'[:goal_dim]
    # Initialize policy, replay buffers
    controller_policy = hiro.Controller(
        state_dim=state_dim,
        goal_dim=controller_goal_dim,
        action_dim=action_dim,
        max_action=max_action,
        actor_lr=args.ctrl_act_lr,
        critic_lr=args.ctrl_crit_lr,
        ctrl_rew_type=args.ctrl_rew_type,
        no_xy=no_xy,
        use_tanh=controller_with_tanh
    )

    manager_policy = hiro.Manager(
        state_dim=state_dim,
        goal_dim=goal_dim,
        action_dim=controller_goal_dim,
        actor_lr=args.man_act_lr,
        critic_lr=args.man_crit_lr,
        candidate_goals=args.candidate_goals,
        correction=not args.no_correction,
        scale=man_scale,
        should_reach_subgoal=args.should_reach_subgoal,
        subgoal_dist_cost_cf=args.man_subgoal_dist_cf
    )
    calculate_controller_reward = get_reward_function(controller_goal_dim)

    if args.noise_type == "ou":
        man_noise = utils.OUNoise(state_dim, sigma=args.man_noise_sigma)
        ctrl_noise = utils.OUNoise(action_dim, sigma=args.ctrl_noise_sigma)

    elif args.noise_type == "normal":
        man_noise = utils.NormalNoise(sigma=args.man_noise_sigma)
        ctrl_noise = utils.NormalNoise(sigma=args.ctrl_noise_sigma)

    manager_buffer = utils.ReplayBuffer(maxsize=args.man_buffer_size)
    controller_buffer = utils.ReplayBuffer(maxsize=args.ctrl_buffer_size)

    if can_load and args.load:
        try:
            manager_policy.load(output_dir)
            controller_policy.load(output_dir)
            manager_buffer.load(os.path.join(output_dir, "mbuf.npz"))
            controller_buffer.load(os.path.join(output_dir, "cbuf.npz"))
            with open(os.path.join(output_dir, "iter.pkl"), "rb") as f:
                iter = pkl.load(f) + 1
            print("Loaded successfully")
            just_loaded = True
        except Exception as e:
            iter = 0
            just_loaded = False
            print(e, "Not loading")
    else:
        iter = 0
        just_loaded = False

    # Logging Parameters
    total_timesteps = iter
    timesteps_since_eval = 0
    timesteps_since_manager = 0
    episode_timesteps = 0
    timesteps_since_subgoal = 0
    episode_num = 0
    done = True
    evaluations = []

    ACTION_AND_SUBGOAL_LOGGING_FREQUENCY = 1  # Units: episodes

    while total_timesteps < args.max_timesteps:
        # Periodically save everything (controller, manager, buffers and total time steps)
        if args.save_every > 0 and (total_timesteps + 1) % args.save_every == 0:
            print("Saving")
            controller_policy.save(output_dir)
            manager_policy.save(output_dir)
            manager_buffer.save(os.path.join(output_dir, "mbuf.npz"))
            controller_buffer.save(os.path.join(output_dir, "cbuf.npz"))
            with open(os.path.join(output_dir, "iter.pkl"), "wb") as f:
                pkl.dump(total_timesteps, f)

        # If we finished the episode, we might have to (1) train the controller (2) evaluate the current policy
        # and (3) process final state/obs, store manager transition, if it was not just created
        # We train the controller at the end of every episode and the manager every X timesteps (not episodes!)
        if done:
            if total_timesteps != 0 and not just_loaded:
                print("Timestep", total_timesteps, "Reward for episode", episode_reward)

                # print('Training Controller...')
                ctrl_act_loss, ctrl_crit_loss = controller_policy.train(controller_buffer, episode_timesteps,
                                                                        writer, total_timesteps,
                                                                        args.ctrl_batch_size, args.ctrl_discount,
                                                                        args.ctrl_tau,)

                print("Timestep", total_timesteps, "Actor loss", ctrl_act_loss, "Critic loss", ctrl_crit_loss)
                writer.add_scalar('data/controller_actor_loss', ctrl_act_loss, total_timesteps)
                writer.add_scalar('data/controller_critic_loss', ctrl_crit_loss, total_timesteps)

                writer.add_scalar('data/controller_ep_rew', episode_reward, total_timesteps)
                writer.add_scalar('data/manager_ep_rew', episode_reward, total_timesteps)

                # Train the manager periodically
                if timesteps_since_manager >= args.train_manager_freq:
                    # print('Training Manager...')
                    timesteps_since_manager = 0
                    man_act_loss, man_crit_loss = manager_policy.train(
                        controller_policy,
                        manager_buffer,
                        ceil(episode_timesteps / args.train_manager_freq),
                        writer, total_timesteps,
                        args.man_batch_size, args.discount,
                        args.man_tau
                    )

                    writer.add_scalar('data/manager_actor_loss', man_act_loss, total_timesteps)
                    writer.add_scalar('data/manager_critic_loss', man_crit_loss, total_timesteps)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval = 0
                    avg_ep_rew, avg_controller_rew, avg_steps, avg_env_finish = evaluate_policy(
                        env, writer, manager_policy, controller_policy, calculate_controller_reward,
                        args.ctrl_rew_scale, args.manager_propose_freq, len(evaluations),
                        render=args.render_in_eval
                    )

                    writer.add_scalar('eval/avg_ep_rew', avg_ep_rew, total_timesteps)
                    writer.add_scalar('eval/avg_controller_rew', avg_controller_rew, total_timesteps)
                    writer.add_scalar('eval/avg_steps_to_finish', avg_steps, total_timesteps)
                    writer.add_scalar('eval/perc_env_goal_achieved', avg_env_finish, total_timesteps)

                    evaluations.append([avg_ep_rew, avg_controller_rew, avg_steps])

                    if args.save_models:
                        controller_policy.save(file_name + '_controller', directory="./pytorch_models")
                        manager_policy.save(file_name + '_manager', directory="./pytorch_models")

                    np.save("./results/%s" % file_name, evaluations)

                # Process final state/obs, store manager transition, if it was not just created
                if len(manager_transition[-2]) != 1:  # If there's more than 1 state in the transition
                    # Manager transitions are a list of the form
                    # [initial state, final state, goal, subgoal, manager reward, done, states, actions]
                    manager_transition[1] = state  # Store the final state
                    manager_transition[5] = float(True)  # Set done to true

                    # Every manager transition should have same length of sequences
                    # In practice, the only reason we care about storing the low level actions is so that
                    # we can adjust the subgoals in the meta transition (to take into account the fact that
                    # the low level controller changed). We try different subgoals and see which one makes
                    # the stored observations / actions the most likely and pick that one. There's nothing
                    # here that requires a specific length, it's just more convenient. What they do is
                    # put +inf, which results in +inf in the calculations later, and then they replace
                    # all those +inf by 0 in the cost, which solves everything at once.
                    #
                    # Therefore, having actions of different sizes isn't a potential problem, it's just more annoying.
                    if len(manager_transition[-2]) <= args.manager_propose_freq:
                        # The original code appended a bare np.inf, but for Lunar Lander that caused
                        # problems, so an action-shaped array filled with np.inf is appended instead.
                        # This seemed to fix the problem.
                        fake_action = np.repeat([np.inf], manager_transition[-1][-1].shape[0])
                        while len(manager_transition[-2]) <= args.manager_propose_freq:
                            manager_transition[-1].append(fake_action)
                            manager_transition[-2].append(state)

                    manager_buffer.add(manager_transition)

            # Reset environment
            obs = env.reset()
            goal = obs['desired_goal']
            state = obs['observation']

            done = False
            episode_reward = 0
            episode_timesteps = 0
            just_loaded = False
            episode_num += 1

            # Create new manager transition (sample new subgoal)
            subgoal = manager_policy.sample_subgoal(state, goal)
            # print(total_timesteps, subgoal)

            if episode_num % ACTION_AND_SUBGOAL_LOGGING_FREQUENCY == 0:
                for i in range(min(subgoal.shape[0], 3)):
                    writer.add_scalar('values/subgoal_%d' % i, subgoal[i], total_timesteps)

            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [state, None, goal, subgoal, 0, False, [state], []]

        # TODO: Scale action to environment
        action = controller_policy.select_action(state, subgoal)
        action = ctrl_noise.perturb_action(action, max_action)

        if episode_num % ACTION_AND_SUBGOAL_LOGGING_FREQUENCY == 0:
            for i in range(min(action.shape[0], 2)):
                writer.add_scalar('values/action_%d' % i, action[i], total_timesteps)

        # Perform action, get (next_state, reward, done)
        next_tup, manager_reward, env_done, _ = env.step(action)

        writer.add_scalar('values/env_reward', manager_reward, total_timesteps)

        # Update cumulative reward (env. reward) for manager
        manager_transition[4] += manager_reward * args.man_rew_scale

        # Process
        next_goal = next_tup['desired_goal']
        next_state = next_tup['observation']

        # Append low level sequence for off policy correction
        if utils.has_nan_or_inf(action):
            raise Exception("Action contains NaN or inf values")
        manager_transition[-1].append(action)
        manager_transition[-2].append(next_state)

        # Calculate reward, transition subgoal
        # print(np.sum(np.abs(state - next_state)), subgoal)

        controller_reward = calculate_controller_reward(state, subgoal, next_state, args.ctrl_rew_scale)
        subgoal = controller_policy.subgoal_transition(state, subgoal, next_state)

        controller_goal = subgoal
        # Is the episode over?
        if env_done:
            done = True

        episode_reward += controller_reward

        # Store low level transition
        if args.inner_dones:
            ctrl_done = done or timesteps_since_subgoal % args.manager_propose_freq == 0
        else:
            ctrl_done = done
        controller_buffer.add((state, next_state, controller_goal, action, controller_reward, float(ctrl_done), [], []))

        # Update state parameters
        state = next_state
        goal = next_goal

        # Update counters
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
        timesteps_since_manager += 1
        timesteps_since_subgoal += 1

        # Every X timesteps, store manager transition in buffer and pick a new subgoal
        if timesteps_since_subgoal % args.manager_propose_freq == 0:
            # Finish, add transition
            manager_transition[1] = state
            manager_transition[5] = float(done)

            manager_buffer.add(manager_transition)

            subgoal = manager_policy.sample_subgoal(state, goal)
            subgoal = man_noise.perturb_action(subgoal, max_action=man_scale)
            # print(total_timesteps, subgoal)

            if episode_num % ACTION_AND_SUBGOAL_LOGGING_FREQUENCY == 0:
                for i in range(min(subgoal.shape[0], 3)):
                    writer.add_scalar('values/subgoal_%d' % i, subgoal[i], total_timesteps)

            # Reset number of timesteps since we sampled a subgoal
            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [state, None, goal, subgoal, 0, False, [state], []]

    # Final evaluation
    evaluations.append([evaluate_policy(env, writer, manager_policy, controller_policy,
                                        calculate_controller_reward, args.ctrl_rew_scale,
                                        args.manager_propose_freq, len(evaluations))])

    if args.save_models:
        controller_policy.save(file_name + '_controller', directory="./pytorch_models")
        manager_policy.save(file_name + '_manager', directory="./pytorch_models")

    np.save("./results/%s" % file_name, evaluations)
Code Example #5
File: eval_hiro.py  Project: bonaert/explainable_rl
def get_env_and_policy(args):
    # Load environment
    if "-v" in args.env_name:
        env = gym.make(args.env_name)
        env.env.reward_type = args.reward_type
        env.distance_threshold = env.env.distance_threshold
        max_action = np.array([1.54302745e+00, 1.21865324e+00, 9.98163424e-01, 1.97805133e-04,
                               7.15193042e-05, 2.56647627e-02, 2.30302501e-02, 2.13756120e-02,
                               1.19019512e-02, 6.31742249e-03])
        min_action = np.array(
            [7.95019864e-01, -5.56192570e-02, 3.32176206e-01, 0.00000000e+00, 0.00000000e+00, -2.58566763e-02,
             -2.46581777e-02, -1.77669761e-02, -1.13476014e-02, -5.08970149e-04])
        man_scale = max_action - min_action
        controller_goal_dim = man_scale.shape[0]
        no_xy = False  # Can't just take out first dimensions; movement here is different than for ants.
    else:
        # We'll be running on one of the various Ant envs
        env = EnvWithGoal(create_maze_env(args.env_name), args.env_name)

        low = np.array((-10, -10, -0.5, -1, -1, -1, -1,
                        -0.5, -0.3, -0.5, -0.3, -0.5, -0.3, -0.5, -0.3))
        high = -low
        man_scale = (high - low) / 2
        controller_goal_dim = man_scale.shape[0]
        # scale = np.array([10, 10, 0.5, 1, 1, 1] + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3)
        no_xy = True
    # Fetch environment meta info
    obs = env.reset()
    goal = obs['desired_goal']
    state = obs['observation']
    state_dim = state.shape[0]
    goal_dim = goal.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    # Initialize policy, replay buffers
    controller_policy = hiro.Controller(
        state_dim=state_dim,
        goal_dim=controller_goal_dim,
        action_dim=action_dim,
        max_action=max_action,
        actor_lr=args.ctrl_act_lr,
        critic_lr=args.ctrl_crit_lr,
        ctrl_rew_type=args.ctrl_rew_type,
        no_xy=no_xy,
    )
    manager_policy = hiro.Manager(
        state_dim=state_dim,
        goal_dim=goal_dim,
        action_dim=controller_goal_dim,
        actor_lr=args.man_act_lr,
        critic_lr=args.man_crit_lr,
        candidate_goals=args.candidate_goals,
        correction=not args.no_correction,
        scale=man_scale,
        should_reach_subgoal=args.should_reach_subgoal,
        subgoal_dist_cost_cf=args.man_subgoal_dist_cf
    )
    # Reload weights from file
    output_dir = os.path.join(args.log_dir, args.log_file)
    manager_policy.load(output_dir)
    controller_policy.load(output_dir)
    calculate_controller_reward = get_reward_function(controller_goal_dim)
    return env, controller_policy, manager_policy, calculate_controller_reward
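
A sketch of how this helper would typically be consumed, using only calls that appear elsewhere on this page (`select_action`, `subgoal_transition`, `sample_subgoal` as in `run_hiro` above); it is an illustration, not the project's actual evaluation loop:

def run_one_episode(args):
    env, controller_policy, manager_policy, ctrl_reward_fn = get_env_and_policy(args)
    obs = env.reset()
    state, goal = obs['observation'], obs['desired_goal']
    subgoal = manager_policy.sample_subgoal(state, goal)
    total_env_reward, step, done = 0.0, 0, False
    while not done:
        action = controller_policy.select_action(state, subgoal)
        next_tup, reward, done, _ = env.step(action)
        next_state = next_tup['observation']
        subgoal = controller_policy.subgoal_transition(state, subgoal, next_state)
        state, goal = next_state, next_tup['desired_goal']
        total_env_reward += reward
        step += 1
        if step % args.manager_propose_freq == 0:
            subgoal = manager_policy.sample_subgoal(state, goal)  # propose a fresh subgoal
    return total_env_reward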
Code Example #6
def run_hiro(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    if not os.path.exists(os.path.join(args.log_dir, args.log_file)):
        can_load = False
        os.makedirs(os.path.join(args.log_dir, args.log_file))
    else:
        can_load = True
        print("Existing directory found; may be able to load weights.")
    output_dir = os.path.join(args.log_dir, args.log_file)
    print("Logging in {}".format(output_dir))

    if "-v" in args.env_name:
        env = gym.make(args.env_name)
        env.env.reward_type = args.reward_type
        env.distance_threshold = env.env.distance_threshold
        max_action = np.array([
            1.54302745e+00, 1.21865324e+00, 9.98163424e-01, 1.97805133e-04,
            7.15193042e-05, 2.56647627e-02, 2.30302501e-02, 2.13756120e-02,
            1.19019512e-02, 6.31742249e-03
        ])
        min_action = np.array([
            7.95019864e-01, -5.56192570e-02, 3.32176206e-01, 0.00000000e+00,
            0.00000000e+00, -2.58566763e-02, -2.46581777e-02, -1.77669761e-02,
            -1.13476014e-02, -5.08970149e-04
        ])
        man_scale = max_action - min_action
        controller_goal_dim = man_scale.shape[0]
        no_xy = False  # Can't just take out first dimensions; movement here is different than for ants.
    else:
        print(args.env_name)
        # We'll be running on one of the various Ant envs
        env = EnvWithGoal(create_maze_env(args.env_name), args.env_name)

        low = np.array((-10, -10, -0.5, -1, -1, -1, -1, -0.5, -0.3, -0.5, -0.3,
                        -0.5, -0.3, -0.5, -0.3))
        high = -low
        man_scale = (high - low) / 2
        controller_goal_dim = man_scale.shape[0]
        # scale = np.array([10, 10, 0.5, 1, 1, 1] + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3
        #                  + [60]*3 + [40]*3)
        no_xy = True

    obs = env.reset()

    goal = obs['desired_goal']
    state = obs['observation']

    # # Write Hyperparameters to file
    # print("---------------------------------------")
    # print("Current Arguments:")
    # with open(os.path.join(args.log_dir, args.log_file, "hps.txt"), 'w') as f:
    #     for arg in vars(args):
    #         print("{}: {}".format(arg, getattr(args, arg)))
    #         f.write("{}: {}\n".format(arg, getattr(args, arg)))
    # print("---------------------------------------\n")

    writer = SummaryWriter(logdir=os.path.join(args.log_dir, args.log_file))
    torch.cuda.set_device(0)

    env_name = type(env).__name__
    file_name = 'hiro_{}'.format(env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = state.shape[0]
    goal_dim = goal.shape[0]
    action_dim = env.action_space.shape[0]

    max_action = float(env.action_space.high[0])

    # Initialize policy, replay buffers
    controller_policy = hiro.Controller(
        state_dim=state_dim,
        goal_dim=controller_goal_dim,
        action_dim=action_dim,
        max_action=max_action,
        actor_lr=args.ctrl_act_lr,
        critic_lr=args.ctrl_crit_lr,
        ctrl_rew_type=args.ctrl_rew_type,
        no_xy=no_xy,
    )

    manager_policy = hiro.Manager(state_dim=state_dim,
                                  goal_dim=goal_dim,
                                  action_dim=controller_goal_dim,
                                  actor_lr=args.man_act_lr,
                                  critic_lr=args.man_crit_lr,
                                  candidate_goals=args.candidate_goals,
                                  correction=not args.no_correction,
                                  scale=man_scale)
    calculate_controller_reward = get_reward_function(controller_goal_dim)

    if args.noise_type == "ou":
        man_noise = utils.OUNoise(state_dim, sigma=args.man_noise_sigma)
        ctrl_noise = utils.OUNoise(action_dim, sigma=args.ctrl_noise_sigma)

    elif args.noise_type == "normal":
        man_noise = utils.NormalNoise(sigma=args.man_noise_sigma)
        ctrl_noise = utils.NormalNoise(sigma=args.ctrl_noise_sigma)

    manager_buffer = utils.ReplayBuffer(maxsize=args.man_buffer_size)
    controller_buffer = utils.ReplayBuffer(maxsize=args.ctrl_buffer_size)

    if can_load and args.load:
        try:
            manager_policy.load(output_dir)
            controller_policy.load(output_dir)
            manager_buffer.load(os.path.join(output_dir, "mbuf.npz"))
            controller_buffer.load(os.path.join(output_dir, "cbuf.npz"))
            with open(os.path.join(output_dir, "iter.pkl"), "rb") as f:
                iter = pkl.load(f) + 1
            print("Loaded successfully")
            just_loaded = True
        except Exception as e:
            iter = 0
            just_loaded = False
            print(e, "Not loading")
    else:
        iter = 0
        just_loaded = False

    # Logging Parameters
    total_timesteps = iter
    timesteps_since_eval = 0
    timesteps_since_manager = 0
    episode_timesteps = 0
    timesteps_since_subgoal = 0
    episode_num = 0
    done = True
    evaluations = []

    while total_timesteps < args.max_timesteps:
        if args.save_every > 0 and (total_timesteps +
                                    1) % args.save_every == 0:
            print("Saving")
            controller_policy.save(output_dir)
            manager_policy.save(output_dir)
            manager_buffer.save(os.path.join(output_dir, "mbuf.npz"))
            controller_buffer.save(os.path.join(output_dir, "cbuf.npz"))
            with open(os.path.join(output_dir, "iter.pkl"), "wb") as f:
                pkl.dump(total_timesteps, f)

        if done:
            if total_timesteps != 0 and not just_loaded:
                # print('Training Controller...')
                ctrl_act_loss, ctrl_crit_loss = controller_policy.train(
                    controller_buffer, episode_timesteps, args.ctrl_batch_size,
                    args.ctrl_discount, args.ctrl_tau)

                # print(ctrl_act_loss, ctrl_crit_loss)
                writer.add_scalar('data/controller_actor_loss', ctrl_act_loss,
                                  total_timesteps)
                writer.add_scalar('data/controller_critic_loss',
                                  ctrl_crit_loss, total_timesteps)

                writer.add_scalar('data/controller_ep_rew', episode_reward,
                                  total_timesteps)
                writer.add_scalar('data/manager_ep_rew', manager_transition[4],
                                  total_timesteps)

                # Train Manager
                if timesteps_since_manager >= args.train_manager_freq:
                    # print('Training Manager...')

                    timesteps_since_manager = 0
                    man_act_loss, man_crit_loss = manager_policy.train(
                        controller_policy, manager_buffer,
                        ceil(episode_timesteps / args.train_manager_freq),
                        args.man_batch_size, args.discount, args.man_tau)

                    writer.add_scalar('data/manager_actor_loss', man_act_loss,
                                      total_timesteps)
                    writer.add_scalar('data/manager_critic_loss',
                                      man_crit_loss, total_timesteps)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval = 0
                    avg_ep_rew, avg_controller_rew, avg_steps, avg_env_finish =\
                        evaluate_policy(env, writer, manager_policy, controller_policy, calculate_controller_reward,
                                        args.ctrl_rew_scale, args.manager_propose_freq, len(evaluations))

                    writer.add_scalar('eval/avg_ep_rew', avg_ep_rew,
                                      total_timesteps)
                    writer.add_scalar('eval/avg_controller_rew',
                                      avg_controller_rew, total_timesteps)
                    writer.add_scalar('eval/avg_steps_to_finish', avg_steps,
                                      total_timesteps)
                    writer.add_scalar('eval/perc_env_goal_achieved',
                                      avg_env_finish, total_timesteps)

                    evaluations.append(
                        [avg_ep_rew, avg_controller_rew, avg_steps])

                    if args.save_models:
                        controller_policy.save(file_name + '_controller',
                                               directory="./pytorch_models")
                        manager_policy.save(file_name + '_manager',
                                            directory="./pytorch_models")

                    np.save("./results/%s" % file_name, evaluations)

                # Process final state/obs, store manager transition, if it was not just created
                if len(manager_transition[-2]) != 1:
                    manager_transition[1] = state
                    manager_transition[5] = float(True)

                    # Every manager transition should have same length of sequences
                    if len(manager_transition[-2]
                           ) <= args.manager_propose_freq:
                        while len(manager_transition[-2]
                                  ) <= args.manager_propose_freq:
                            manager_transition[-1].append(np.inf)
                            manager_transition[-2].append(state)

                    manager_buffer.add(manager_transition)

            # Reset environment
            obs = env.reset()
            goal = obs['desired_goal']
            state = obs['observation']

            done = False
            episode_reward = 0
            episode_timesteps = 0
            just_loaded = False
            episode_num += 1

            # Create new manager transition
            subgoal = manager_policy.sample_goal(state, goal)

            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [
                state, None, goal, subgoal, 0, False, [state], []
            ]

        # TODO: Scale action to environment
        action = controller_policy.select_action(state, subgoal)
        action = ctrl_noise.perturb_action(action, max_action)

        # Perform action, get (next_state, reward, done)
        next_tup, manager_reward, env_done, _ = env.step(action)

        # Update cumulative reward (env. reward) for manager
        manager_transition[4] += manager_reward * args.man_rew_scale

        # Process
        next_goal = next_tup['desired_goal']
        next_state = next_tup['observation']

        # Append low level sequence for off policy correction
        manager_transition[-1].append(action)
        manager_transition[-2].append(next_state)

        # Calculate reward, transition subgoal
        # print(np.sum(np.abs(state - next_state)), subgoal)

        controller_reward = calculate_controller_reward(
            state, subgoal, next_state, args.ctrl_rew_scale)
        subgoal = controller_policy.subgoal_transition(state, subgoal,
                                                       next_state)

        controller_goal = subgoal
        # Is the episode over?
        if env_done:
            done = True

        episode_reward += controller_reward

        # Store low level transition
        if args.inner_dones:
            ctrl_done = done or timesteps_since_subgoal % \
                         args.manager_propose_freq == 0
        else:
            ctrl_done = done
        controller_buffer.add(
            (state, next_state, controller_goal, action, controller_reward,
             float(ctrl_done), [], []))

        # Update state parameters
        state = next_state
        goal = next_goal

        # Update counters
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
        timesteps_since_manager += 1
        timesteps_since_subgoal += 1
        if total_timesteps % 1000 == 0:
            print('total timesteps', total_timesteps)
        if timesteps_since_subgoal % args.manager_propose_freq == 0:
            # Finish, add transition
            manager_transition[1] = state
            manager_transition[5] = float(done)

            manager_buffer.add(manager_transition)

            subgoal = manager_policy.sample_goal(state, goal)
            subgoal = man_noise.perturb_action(subgoal, max_action=man_scale)

            # Reset number of timesteps since we sampled a subgoal
            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [
                state, None, goal, subgoal, 0, False, [state], []
            ]

    # Final evaluation
    evaluations.append([
        evaluate_policy(env, writer, manager_policy, controller_policy,
                        calculate_controller_reward, args.ctrl_rew_scale,
                        args.manager_propose_freq, len(evaluations))
    ])

    if args.save_models:
        controller_policy.save(file_name + '_controller',
                               directory="./pytorch_models")
        manager_policy.save(file_name + '_manager',
                            directory="./pytorch_models")

    np.save("./results/%s" % (file_name), evaluations)
Code Example #7
def eval_hrac(args):
    if "Maze" in args.env_name:
        env = EnvWithGoal(create_maze_env(args.env_name, args.seed),
                          args.env_name)
    else:
        env = GatherEnv(create_gather_env(args.env_name, args.seed),
                        args.env_name)

    if "Ant" in args.env_name:
        low = np.array((-10, -10, -0.5, -1, -1, -1, -1, -0.5, -0.3, -0.5, -0.3,
                        -0.5, -0.3, -0.5, -0.3))
        max_action = float(env.action_space.high[0])
    else:
        raise NotImplementedError

    high = -low
    man_scale = (high - low) / 2

    controller_goal_dim = 2

    if args.absolute_goal:
        man_scale[0] = 12
        man_scale[1] = 12
        no_xy = False
    else:
        no_xy = True

    obs = env.reset()

    goal = obs["desired_goal"]
    state = obs["observation"]

    torch.cuda.set_device(args.gid)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    file_name = "{}_{}_{}".format(args.env_name, args.algo, args.seed)
    output_data = {"frames": [], "reward": [], "dist": []}

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = state.shape[0]
    if args.env_name == "AntMaze":
        goal_dim = goal.shape[0]
    else:
        goal_dim = 0
    action_dim = env.action_space.shape[0]

    controller_policy = hrac.Controller(state_dim=state_dim,
                                        goal_dim=controller_goal_dim,
                                        action_dim=action_dim,
                                        max_action=max_action,
                                        actor_lr=0,
                                        critic_lr=0,
                                        no_xy=no_xy,
                                        absolute_goal=args.absolute_goal,
                                        policy_noise=0,
                                        noise_clip=0)

    manager_policy = hrac.Manager(state_dim=state_dim,
                                  goal_dim=goal_dim,
                                  action_dim=controller_goal_dim,
                                  actor_lr=0,
                                  critic_lr=0,
                                  candidate_goals=10,
                                  correction=True,
                                  scale=man_scale,
                                  goal_loss_coeff=10.,
                                  absolute_goal=args.absolute_goal)

    if args.load:
        try:
            manager_policy.load(args.model_dir, args.env_name, args.algo)
            controller_policy.load(args.model_dir, args.env_name, args.algo)
            print("Loaded successfully.")
            just_loaded = True
        except Exception as e:
            just_loaded = False
            print(e, "Loading failed.")
    else:
        just_loaded = False

    calculate_controller_reward = get_reward_function(
        controller_goal_dim,
        absolute_goal=args.absolute_goal,
        binary_reward=args.binary_int_reward)

    evaluate_policy(env, args.env_name, manager_policy, controller_policy,
                    calculate_controller_reward, 1.0,
                    args.manager_propose_freq, args.eval_episodes)
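
`get_reward_function` is used throughout these examples but not defined on this page. A sketch consistent with how it is parameterised here (`controller_goal_dim`, `absolute_goal`, `binary_reward`); the binary success threshold is a placeholder, not a value from the original project:

import numpy as np

def get_reward_function(dim, absolute_goal=False, binary_reward=False):
    def controller_reward(state, goal, next_state, scale):
        if absolute_goal:
            error = goal - next_state[:dim]                 # goal is an absolute position
        else:
            error = state[:dim] + goal - next_state[:dim]   # goal is a relative offset
        distance = np.linalg.norm(error)
        if binary_reward:
            return scale * float(distance <= 1.0)           # placeholder threshold
        return -distance * scale
    return controller_reward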
Code Example #8
def run_hrac(args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./models"):
        os.makedirs("./models")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    if not os.path.exists(os.path.join(args.log_dir, args.algo)):
        os.makedirs(os.path.join(args.log_dir, args.algo))
    output_dir = os.path.join(args.log_dir, args.algo)
    print("Logging in {}".format(output_dir))

    if "Maze" in args.env_name:
        env = EnvWithGoal(create_maze_env(args.env_name, args.seed),
                          args.env_name)
    else:
        env = GatherEnv(create_gather_env(args.env_name, args.seed),
                        args.env_name)

    if "Ant" in args.env_name:
        low = np.array((-10, -10, -0.5, -1, -1, -1, -1, -0.5, -0.3, -0.5, -0.3,
                        -0.5, -0.3, -0.5, -0.3))
        max_action = float(env.action_space.high[0])
        policy_noise = 0.2
        noise_clip = 0.5
    else:
        raise NotImplementedError

    high = -low
    man_scale = (high - low) / 2

    controller_goal_dim = 2

    if args.absolute_goal:
        man_scale[0] = 12
        man_scale[1] = 12
        no_xy = False
    else:
        no_xy = True

    obs = env.reset()

    goal = obs["desired_goal"]
    state = obs["observation"]

    writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.algo))
    torch.cuda.set_device(args.gid)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    file_name = "{}_{}_{}".format(args.env_name, args.algo, args.seed)
    output_data = {"frames": [], "reward": [], "dist": []}

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = state.shape[0]
    if args.env_name == "AntMaze":
        goal_dim = goal.shape[0]
    else:
        goal_dim = 0
    action_dim = env.action_space.shape[0]

    controller_policy = hrac.Controller(state_dim=state_dim,
                                        goal_dim=controller_goal_dim,
                                        action_dim=action_dim,
                                        max_action=max_action,
                                        actor_lr=args.ctrl_act_lr,
                                        critic_lr=args.ctrl_crit_lr,
                                        no_xy=no_xy,
                                        absolute_goal=args.absolute_goal,
                                        policy_noise=policy_noise,
                                        noise_clip=noise_clip)

    manager_policy = hrac.Manager(state_dim=state_dim,
                                  goal_dim=goal_dim,
                                  action_dim=controller_goal_dim,
                                  actor_lr=args.man_act_lr,
                                  critic_lr=args.man_crit_lr,
                                  candidate_goals=args.candidate_goals,
                                  correction=not args.no_correction,
                                  scale=man_scale,
                                  goal_loss_coeff=args.goal_loss_coeff,
                                  absolute_goal=args.absolute_goal)
    calculate_controller_reward = get_reward_function(
        controller_goal_dim,
        absolute_goal=args.absolute_goal,
        binary_reward=args.binary_int_reward)

    if args.noise_type == "ou":
        man_noise = utils.OUNoise(state_dim, sigma=args.man_noise_sigma)
        ctrl_noise = utils.OUNoise(action_dim, sigma=args.ctrl_noise_sigma)

    elif args.noise_type == "normal":
        man_noise = utils.NormalNoise(sigma=args.man_noise_sigma)
        ctrl_noise = utils.NormalNoise(sigma=args.ctrl_noise_sigma)

    manager_buffer = utils.ReplayBuffer(maxsize=args.man_buffer_size)
    controller_buffer = utils.ReplayBuffer(maxsize=args.ctrl_buffer_size)

    # Initialize adjacency matrix and adjacency network
    n_states = 0
    state_list = []
    state_dict = {}
    adj_mat = np.diag(np.ones(1000, dtype=np.uint8))
    traj_buffer = utils.TrajectoryBuffer(capacity=args.traj_buffer_size)
    a_net = ANet(controller_goal_dim, args.r_hidden_dim, args.r_embedding_dim)
    if args.load_adj_net:
        print("Loading adjacency network...")
        a_net.load_state_dict(torch.load("./models/a_network.pth"))
    a_net.to(device)
    optimizer_r = optim.Adam(a_net.parameters(), lr=args.lr_r)

    if args.load:
        try:
            manager_policy.load("./models")
            controller_policy.load("./models")
            print("Loaded successfully.")
            just_loaded = True
        except Exception as e:
            just_loaded = False
            print(e, "Loading failed.")
    else:
        just_loaded = False

    # Logging Parameters
    total_timesteps = 0
    timesteps_since_eval = 0
    timesteps_since_manager = 0
    episode_timesteps = 0
    timesteps_since_subgoal = 0
    episode_num = 0
    done = True
    evaluations = []

    while total_timesteps < args.max_timesteps:
        if done:
            if total_timesteps != 0 and not just_loaded:
                if episode_num % 10 == 0:
                    print("Episode {}".format(episode_num))
                # Train controller
                ctrl_act_loss, ctrl_crit_loss = controller_policy.train(
                    controller_buffer, episode_timesteps, args.ctrl_batch_size,
                    args.ctrl_discount, args.ctrl_tau)
                if episode_num % 10 == 0:
                    print(
                        "Controller actor loss: {:.3f}".format(ctrl_act_loss))
                    print("Controller critic loss: {:.3f}".format(
                        ctrl_crit_loss))
                writer.add_scalar("data/controller_actor_loss", ctrl_act_loss,
                                  total_timesteps)
                writer.add_scalar("data/controller_critic_loss",
                                  ctrl_crit_loss, total_timesteps)

                writer.add_scalar("data/controller_ep_rew", episode_reward,
                                  total_timesteps)
                writer.add_scalar("data/manager_ep_rew", manager_transition[4],
                                  total_timesteps)

                # Train manager
                if timesteps_since_manager >= args.train_manager_freq:
                    timesteps_since_manager = 0
                    r_margin = (args.r_margin_pos + args.r_margin_neg) / 2

                    man_act_loss, man_crit_loss, man_goal_loss = manager_policy.train(
                        controller_policy,
                        manager_buffer,
                        ceil(episode_timesteps / args.train_manager_freq),
                        batch_size=args.man_batch_size,
                        discount=args.discount,
                        tau=args.man_tau,
                        a_net=a_net,
                        r_margin=r_margin)

                    writer.add_scalar("data/manager_actor_loss", man_act_loss,
                                      total_timesteps)
                    writer.add_scalar("data/manager_critic_loss",
                                      man_crit_loss, total_timesteps)
                    writer.add_scalar("data/manager_goal_loss", man_goal_loss,
                                      total_timesteps)

                    if episode_num % 10 == 0:
                        print(
                            "Manager actor loss: {:.3f}".format(man_act_loss))
                        print("Manager critic loss: {:.3f}".format(
                            man_crit_loss))
                        print(
                            "Manager goal loss: {:.3f}".format(man_goal_loss))

                # Evaluate
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval = 0
                    avg_ep_rew, avg_controller_rew, avg_steps, avg_env_finish =\
                        evaluate_policy(env, args.env_name, manager_policy, controller_policy, calculate_controller_reward,
                                        args.ctrl_rew_scale, args.manager_propose_freq, len(evaluations))

                    writer.add_scalar("eval/avg_ep_rew", avg_ep_rew,
                                      total_timesteps)
                    writer.add_scalar("eval/avg_controller_rew",
                                      avg_controller_rew, total_timesteps)
                    if "Maze" in args.env_name:
                        writer.add_scalar("eval/avg_steps_to_finish",
                                          avg_steps, total_timesteps)
                        writer.add_scalar("eval/perc_env_goal_achieved",
                                          avg_env_finish, total_timesteps)

                    evaluations.append(
                        [avg_ep_rew, avg_controller_rew, avg_steps])
                    output_data["frames"].append(total_timesteps)
                    if "Maze" in args.env_name:
                        output_data["reward"].append(avg_env_finish)
                    else:
                        output_data["reward"].append(avg_ep_rew)
                    output_data["dist"].append(-avg_controller_rew)

                    if args.save_models:
                        controller_policy.save("./models", args.env_name,
                                               args.algo)
                        manager_policy.save("./models", args.env_name,
                                            args.algo)

                if traj_buffer.full():
                    for traj in traj_buffer.get_trajectory():
                        for i in range(len(traj)):
                            for j in range(
                                    1,
                                    min(args.manager_propose_freq,
                                        len(traj) - i)):
                                s1 = tuple(
                                    np.round(
                                        traj[i][:controller_goal_dim]).astype(
                                            np.int32))
                                s2 = tuple(
                                    np.round(
                                        traj[i +
                                             j][:controller_goal_dim]).astype(
                                                 np.int32))
                                if s1 not in state_list:
                                    state_list.append(s1)
                                    state_dict[s1] = n_states
                                    n_states += 1
                                if s2 not in state_list:
                                    state_list.append(s2)
                                    state_dict[s2] = n_states
                                    n_states += 1
                                adj_mat[state_dict[s1], state_dict[s2]] = 1
                                adj_mat[state_dict[s2], state_dict[s1]] = 1
                    print("Explored states: {}".format(n_states))
                    flags = np.ones((25, 25))
                    for s in state_list:
                        flags[int(s[0]), int(s[1])] = 0
                    print(flags)
                    if not args.load_adj_net:
                        print("Training adjacency network...")
                        utils.train_adj_net(a_net,
                                            state_list,
                                            adj_mat[:n_states, :n_states],
                                            optimizer_r,
                                            args.r_margin_pos,
                                            args.r_margin_neg,
                                            n_epochs=args.r_training_epochs,
                                            batch_size=args.r_batch_size,
                                            device=device,
                                            verbose=True)

                        if args.save_models:
                            r_filename = os.path.join(
                                "./models", "{}_{}_a_network.pth".format(
                                    args.env_name, args.algo))
                            torch.save(a_net.state_dict(), r_filename)
                            print("----- Adjacency network {} saved. -----".
                                  format(episode_num))

                    traj_buffer.reset()

                if len(manager_transition[-2]) != 1:
                    manager_transition[1] = state
                    manager_transition[5] = float(True)
                    manager_buffer.add(manager_transition)

            obs = env.reset()
            goal = obs["desired_goal"]
            state = obs["observation"]
            traj_buffer.create_new_trajectory()
            traj_buffer.append(state)
            done = False
            episode_reward = 0
            episode_timesteps = 0
            just_loaded = False
            episode_num += 1

            subgoal = manager_policy.sample_goal(state, goal)
            timesteps_since_subgoal = 0

            manager_transition = [
                state, None, goal, subgoal, 0, False, [state], []
            ]

        action = controller_policy.select_action(state, subgoal)
        action = ctrl_noise.perturb_action(action, -max_action, max_action)
        action_copy = action.copy()

        next_tup, manager_reward, env_done, _ = env.step(action_copy)

        # Update cumulative reward for the manager
        manager_transition[4] += manager_reward * args.man_rew_scale

        next_goal = next_tup["desired_goal"]
        next_state = next_tup["observation"]

        traj_buffer.append(next_state)

        # Append low level sequence for off policy correction
        manager_transition[-1].append(action)
        manager_transition[-2].append(next_state)

        controller_reward = calculate_controller_reward(
            state, subgoal, next_state, args.ctrl_rew_scale)
        subgoal = controller_policy.subgoal_transition(state, subgoal,
                                                       next_state)

        controller_goal = subgoal
        if env_done:
            done = True

        episode_reward += controller_reward

        # Store low level transition
        if args.inner_dones:
            ctrl_done = done or timesteps_since_subgoal % args.manager_propose_freq == 0
        else:
            ctrl_done = done

        controller_buffer.add(
            (state, next_state, controller_goal, action, controller_reward,
             float(ctrl_done), [], []))

        state = next_state
        goal = next_goal

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1
        timesteps_since_manager += 1
        timesteps_since_subgoal += 1

        if timesteps_since_subgoal % args.manager_propose_freq == 0:
            manager_transition[1] = state
            manager_transition[5] = float(done)

            manager_buffer.add(manager_transition)

            subgoal = manager_policy.sample_goal(state, goal)

            if not args.absolute_goal:
                subgoal = man_noise.perturb_action(
                    subgoal,
                    min_action=-man_scale[:controller_goal_dim],
                    max_action=man_scale[:controller_goal_dim])
            else:
                subgoal = man_noise.perturb_action(
                    subgoal,
                    min_action=-man_scale[:controller_goal_dim] + 8,
                    max_action=man_scale[:controller_goal_dim] + 8)

            # Reset number of timesteps since we sampled a subgoal
            timesteps_since_subgoal = 0

            # Create a high level transition
            manager_transition = [
                state, None, goal, subgoal, 0, False, [state], []
            ]

    # Final evaluation
    avg_ep_rew, avg_controller_rew, avg_steps, avg_env_finish = evaluate_policy(
        env, args.env_name, manager_policy, controller_policy,
        calculate_controller_reward, args.ctrl_rew_scale,
        args.manager_propose_freq, len(evaluations))
    evaluations.append([avg_ep_rew, avg_controller_rew, avg_steps])
    output_data["frames"].append(total_timesteps)
    if "Maze" in args.env_name:
        output_data["reward"].append(avg_env_finish)
    else:
        output_data["reward"].append(avg_ep_rew)
    output_data["dist"].append(-avg_controller_rew)

    if args.save_models:
        controller_policy.save("./models", args.env_name, args.algo)
        manager_policy.save("./models", args.env_name, args.algo)

    output_df = pd.DataFrame(output_data)
    output_df.to_csv(os.path.join("./results", file_name + ".csv"),
                     float_format="%.4f",
                     index=False)
    print("Training finished.")
Code Example #9
File: main.py  Project: txing-casia/hiro_pytorch
    # Select or Generate a name for this experiment
    if args.exp_name:
        experiment_name = args.exp_name
    else:
        if args.eval:
            # choose the most recently updated experiment for evaluation
            dirs_str = listdirs(args.model_path)
            dirs = np.array(list(map(int, dirs_str)))
            experiment_name = dirs_str[np.argmax(dirs)]
        else:
            experiment_name = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    print(experiment_name)

    # Environment and its attributes
    env = EnvWithGoal(create_maze_env(args.env), args.env)
    goal_dim = 2
    state_dim = env.state_dim
    action_dim = env.action_dim
    scale = env.action_space.high * np.ones(action_dim)

    # Spawn an agent
    if args.td3:
        agent = TD3Agent(
            state_dim=state_dim,
            action_dim=action_dim,
            goal_dim=goal_dim,
            scale=scale,
            model_save_freq=args.model_save_freq,
            model_path=os.path.join(args.model_path, experiment_name),
            buffer_size=args.buffer_size,