Example #1
    def __init__(self, agent_dict={}, model_dict={}):
        """ Initialize Agent object

        Params
        ======
            agent_dict(dict): dictionary containing parameters for the agent
            model_dict(dict): dictionary containing parameters for the agent's model
        """
        if agent_dict.get("enable_gpu", False):
            self.device = torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device("cpu")
        self.num_episodes = agent_dict.get("num_episodes", 10000)
        self.save_after = agent_dict.get("save_after", -1)
        self.name = agent_dict.get("name", "banana_collector")

        self.gamma = agent_dict.get("gamma", 0.9)

        self.epsilon = agent_dict.get("epsilon_start", 1.0)
        self.epsilon_decay = agent_dict.get("epsilon_decay", 0.9)
        self.epsilon_min = agent_dict.get("epsilon_min", 0.1)

        self.tau = agent_dict.get("tau", 0.1)

        self.num_replays = agent_dict.get("num_replays", 1)

        self.criterion = nn.MSELoss()

        memory_size = agent_dict.get("memory_size", 2**14)
        batchsize = agent_dict.get("batchsize", 2**10)
        replay_reg = agent_dict.get("replay_reg", 0.0)

        self.replay_buffer = utils.PrioritizedReplayBuffer(memory_size,
                                                           batchsize,
                                                           epsilon=replay_reg)

        self.decision_model = model.Model(model_dict).to(self.device)
        self.policy_model = model.Model(model_dict).to(self.device)

        self.optimizer = optim.Adam(self.decision_model.parameters(), lr=1E-3)

        utils.copy_model(self.decision_model, self.policy_model, tau=1.0)
        seed = agent_dict.get("seed", 0)

        torch.manual_seed(seed)
        np.random.seed(seed)
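
Example #1 hard-copies the online network into the target network with utils.copy_model(self.decision_model, self.policy_model, tau=1.0), but the helper itself is not shown. A minimal sketch of what such a Polyak soft-update helper typically looks like (the function body below is an assumption for illustration, not the repository's code):

import torch

def copy_model(source, target, tau=1.0):
    # Soft update: target <- tau * source + (1 - tau) * target.
    # With tau=1.0 this reduces to a hard copy of the weights.
    with torch.no_grad():
        for src_param, tgt_param in zip(source.parameters(), target.parameters()):
            tgt_param.data.copy_(tau * src_param.data + (1.0 - tau) * tgt_param.data)
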
Example #2
def train(config, start_timesteps, max_timesteps, policy_noise, expl_noise,
          noise_clip, policy_freq, batch_size, seed, policy,
          prioritized_replay, env_name, eval_freq, discount, tau, use_rank):
    if prioritized_replay:
        alpha = float(config["alpha"])
        beta = float(config["beta"])
    else:
        discount = float(config["discount"])
        tau = float(config["tau"])

    import pybulletgym  # imported for its side effect of registering the PyBullet environments with gym
    warnings.filterwarnings("ignore")
    env = gym.make(env_name)

    # Set seeds
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": discount,
        "tau": tau,
    }

    # Initialize policy
    if policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = policy_noise * max_action
        kwargs["noise_clip"] = noise_clip * max_action
        kwargs["policy_freq"] = policy_freq
        kwargs["prioritized_replay"] = prioritized_replay
        kwargs["use_rank"] = use_rank
        policy = TD3.TD3(**kwargs)
    elif policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    if prioritized_replay:
        replay_buffer = utils.PrioritizedReplayBuffer(state_dim,
                                                      action_dim,
                                                      max_timesteps,
                                                      start_timesteps,
                                                      alpha=alpha,
                                                      beta=beta)
    else:
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, env_name, seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(max_timesteps)):

        episode_timesteps += 1
        # Select action randomly or according to policy
        if t < start_timesteps:
            action = env.action_space.sample()
        else:
            action = (policy.select_action(np.array(state)) + np.random.normal(
                0, max_action * expl_noise, size=action_dim)).clip(
                    -max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= start_timesteps:
            policy.train(replay_buffer, batch_size)

        if done:
            # +1 on t to account for 0-indexing; episode_timesteps needs no +1 because it was already incremented at the top of the loop, even on the final (done) step
            print(
                f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}"
            )
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % eval_freq == 0:
            avg_reward = eval_policy(policy, env_name, seed)
            tune.report(episode_reward_mean=avg_reward)
            evaluations.append(avg_reward)
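
The examples above and below construct utils.PrioritizedReplayBuffer with different signatures, so each repository ships its own implementation. For orientation only, a minimal proportional prioritized buffer (class name, constructor arguments, and methods below are illustrative assumptions, not any of the repositories' code) could look like this:

import numpy as np

class PrioritizedReplayBuffer:
    # Sample transitions with probability proportional to priority**alpha and
    # return importance-sampling weights corrected by beta.
    def __init__(self, max_size, alpha=0.6, beta=0.4):
        self.max_size = max_size
        self.alpha = alpha
        self.beta = beta
        self.storage = []
        self.priorities = np.zeros(max_size, dtype=np.float32)
        self.ptr = 0

    def add(self, state, action, next_state, reward, done):
        # New transitions get the current maximum priority so they are sampled at least once.
        max_prio = self.priorities[:len(self.storage)].max() if self.storage else 1.0
        data = (state, action, next_state, reward, done)
        if len(self.storage) < self.max_size:
            self.storage.append(data)
        else:
            self.storage[self.ptr] = data
        self.priorities[self.ptr] = max_prio
        self.ptr = (self.ptr + 1) % self.max_size

    def sample(self, batch_size):
        prios = self.priorities[:len(self.storage)]
        probs = prios ** self.alpha
        probs /= probs.sum()
        idx = np.random.choice(len(self.storage), batch_size, p=probs)
        weights = (len(self.storage) * probs[idx]) ** (-self.beta)
        weights /= weights.max()
        batch = [self.storage[i] for i in idx]
        return batch, idx, weights

    def update_priorities(self, idx, td_errors, eps=1e-6):
        # Called by the learner after computing TD errors for the sampled batch.
        self.priorities[idx] = np.abs(td_errors) + eps
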
Example #3
    # state_dim = env.observation_space.shape[0]
    state_dim = env.observation_space.shape
    action_dim = env.action_space.n

    # Initialize policy
    if args.policy == "SAC":
        policy = SAC.SAC(state_dim, action_dim, max_action)
    elif args.policy == "DDPG":
        policy = DDPG(state_dim, action_dim, max_action)
    elif args.policy == "DQN":
        policy = DQN(state_dim, action_dim)
    else:
        raise ValueError("invalid policy {}".format(args.policy))

    replay_buffer = utils.PrioritizedReplayBuffer()

    # Evaluate untrained policy
    evaluations = [evaluate_policy(policy)]

    timesteps_since_eval = 0
    episode_num = 0
    done = False
    episode_reward = 0
    episode_timesteps = 0

    checkpoint = tf.train.Checkpoint(policy=policy)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    directory=args.parameter,
                                                    max_to_keep=5)
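
Example #3 only constructs tf.train.Checkpoint and tf.train.CheckpointManager; the actual restore and save calls happen outside the snippet. A short usage sketch of that API (the dummy variable and directory name are placeholders, not part of the example):

import tensorflow as tf

# Stand-in for the policy object being tracked; any trackable object works.
policy = tf.Variable(0.0, name="dummy_policy_weight")

checkpoint = tf.train.Checkpoint(policy=policy)
manager = tf.train.CheckpointManager(checkpoint, directory="./checkpoints", max_to_keep=5)

# Resume from the most recent checkpoint, if one exists.
if manager.latest_checkpoint:
    checkpoint.restore(manager.latest_checkpoint)

# Inside the training loop, save periodically; checkpoints beyond max_to_keep are pruned.
save_path = manager.save()
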
Example #4
def train(config, args):
    if not os.path.exists("./results"):
        os.makedirs("./results")

    if args.save_model and not os.path.exists("./models"):
        os.makedirs("./models")

    import pybulletgym  # imported for its side effect of registering the PyBullet environments with gym
    warnings.filterwarnings("ignore")
    eps_bounds = args.reacher_epsilon_bounds      # just aliasing with shorter variable name
    utils_object = utils.GeneralUtils(args)

    if args.tune_run:
        if args.prioritized_replay:
            args.alpha = float(config["alpha"])
            args.beta = float(config["beta"])
            args.discount = float(config.get("discount", args.discount))
            args.tau = float(config.get("tau", args.tau))
        elif args.custom_env and args.use_hindsight:
            eps_bounds = [float(config["epsilons"][0]), float(config["epsilons"][1])]
            args.seed = int(config["seed"])
        else:
            args.discount = float(config.get("discount", args.discount))
            args.tau = float(config.get("tau", args.tau))
    
    if args.custom_env:
        gym.envs.register(
            id='OurReacher-v0',
            entry_point='our_reacher_env:OurReacherEnv',
            max_episode_steps=50,
            reward_threshold=100.0,
        )

        # assumes epsilon is only used for the custom env or FetchReach, where the episode length is 50 timesteps
        max_episode_steps = 50

        # retrieve epsilon range
        [a, b] = eps_bounds
        epsilons = utils_object.epsilon_calc(a, b, max_episode_steps)
        env = gym.make('OurReacher-v0', epsilon=epsilons[0], render=False)
    else:
        env = gym.make(args.env)

    if utils_object.fetch_reach and utils_object.args.fetch_reach_dense:
        env.env.reward_type = "dense"

    # Set seeds
    env.seed(int(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if utils_object.fetch_reach:
        state_dim = env.reset()["observation"].shape[0]
    else:
        state_dim = env.observation_space.shape[0]
    if args.use_hindsight:          # include both current state and goal state
        if args.custom_env:
            state_dim += 2          # custom Reacher goal is an (x, y) position
        elif utils_object.fetch_reach:
            state_dim += 3          # include fetchreach goal state (x,y,z position)
        else:
            state_dim *= 2

    action_dim = env.action_space.shape[0] 
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        kwargs["prioritized_replay"] = args.prioritized_replay
        kwargs["use_rank"] = args.use_rank
        kwargs["use_hindsight"] = args.use_hindsight
        
        policy = TD3.TD3(**kwargs)
    elif args.policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    exp_descriptors = [
        args.policy, 'CustomReacher' if args.custom_env else args.env,
        f"{'rank' if args.use_rank else 'proportional'}PER" if args.prioritized_replay else '', 
        'HER' if args.use_hindsight else '',
        f"{args.decay_type}decay-eps{f'{eps_bounds[0]}-{eps_bounds[1]}' if eps_bounds[0] != eps_bounds[1] else f'{eps_bounds[0]}'}" if args.custom_env else "",
        f"k{args.k}",
        datetime.now().strftime('%Y%m%d%H%M')
    ]
    if args.tune_run:
        # fudgy: assumes tune_run for non-HER experiments
        exp_descriptors = [
            args.policy, 'CustomReacher' if args.custom_env else args.env,
            f"{'rank' if args.use_rank else 'proportional'}PER" if args.prioritized_replay else '', 
            f"tau{args.tau}", f"discount{args.discount}",
            f"alpha{args.alpha}" if args.prioritized_replay else '',
            f"beta{args.beta}" if args.prioritized_replay else '',
            f"k{args.k}",
            datetime.now().strftime('%Y%m%d%H%M')
        ]

    exp_descriptors = [x for x in exp_descriptors if len(x) > 0]
    file_name = "_".join(exp_descriptors)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./models/{policy_file}")

    if args.prioritized_replay:
        replay_buffer = utils.PrioritizedReplayBuffer(state_dim, action_dim,
                                                      args.max_timesteps, args.start_timesteps,
                                                      alpha=args.alpha, beta=args.beta)
    else:
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)
    
    # Evaluate untrained policy
    evaluations = [eval_policy(policy, args.env, args.seed, utils_object=utils_object)]
 
    state, done = env.reset(), False

    original_episode_reward = 0
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    trajectory = []

    for t in range(int(args.max_timesteps)):
        
        episode_timesteps += 1
        x, goal = utils_object.compute_x_goal(state, env)
        
        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = (
                policy.select_action(np.array(x))
                + np.random.normal(0, max_action * args.expl_noise, size=action_dim)
            ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        if args.use_hindsight:
            if utils_object.fetch_reach:
                goal = state["desired_goal"]
                next_x = np.concatenate([np.array(next_state["observation"]), goal])
            else:
                # env.set_goal(goal)
                next_x = np.concatenate([np.array(next_state), goal])
        elif utils_object.fetch_reach:
            next_x = np.array(next_state["observation"])
        else:
            next_x = next_state

        # Store data in replay buffer
        if not args.use_hindsight:
            replay_buffer.add(x, action, next_x, reward, done_bool)

        trajectory.append((state, action, next_state, reward, done_bool))

        state = next_state
        episode_reward += reward
        if args.custom_env:
            original_episode_reward += env.original_rewards

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            policy.train(replay_buffer, args.batch_size)

        if done:
            if args.use_hindsight:
                replay_buffer.add_hindsight(trajectory, goal, env, k=args.k, fetch_reach=utils_object.fetch_reach)
            # +1 on t to account for 0-indexing; episode_timesteps needs no +1 because it was already incremented at the top of the loop, even on the final (done) step
            print(
                f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} "
                f"Reward: {episode_reward:.3f} Original Reward: {original_episode_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            original_episode_reward = 0
            episode_timesteps = 0
            episode_num += 1
            if args.custom_env:
                epsilon = epsilons[episode_num]
                env.set_epsilon(epsilon)

            trajectory = []

        # Evaluate episode
        if (t + 1) % args.eval_freq == 0:
            evaled_policy = eval_policy(policy, args.env, args.seed, utils_object=utils_object)
            evaluations.append(evaled_policy)
            np.save(f"./results/{file_name}", evaluations)
            if args.save_model:
                policy.save(f"./models/{file_name}")
            if args.plot:
                plotter.plot(file_name, args.custom_env)
            if args.tune_run:
                tune.report(episode_reward_mean=evaled_policy[0])
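
Example #4 buffers the whole episode in trajectory and, when args.use_hindsight is set, defers storage to replay_buffer.add_hindsight(...), whose body is not shown. A rough sketch of the underlying hindsight relabeling idea in its simplest "final-goal" form (the function name, the achieved_goal_fn extractor, and the sparse reward rule are illustrative assumptions, not the repository's implementation):

import numpy as np

def relabel_with_final_goal(replay_buffer, trajectory, achieved_goal_fn, eps=0.05):
    # "Final" HER strategy: replay the episode as if the goal had been the
    # state actually achieved at the end of the episode.
    final_goal = achieved_goal_fn(trajectory[-1][2])  # achieved goal of the last next_state
    for state, action, next_state, reward, done in trajectory:
        x = np.concatenate([np.asarray(state), final_goal])
        next_x = np.concatenate([np.asarray(next_state), final_goal])
        # Sparse reward under the substituted goal: 0 on success, -1 otherwise.
        achieved = achieved_goal_fn(next_state)
        new_reward = 0.0 if np.linalg.norm(achieved - final_goal) < eps else -1.0
        replay_buffer.add(x, action, next_x, new_reward, done)
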
Example #5
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
        "policy_noise": args.policy_noise * max_action,
        "noise_clip": args.noise_clip * max_action,
        "policy_freq": args.policy_freq
    }

    # Initialize policy and replay buffer
    if args.algorithm == "TD3":
        policy = TD3.TD3(**kwargs)
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    elif args.algorithm == "PER_TD3":
        policy = PER_TD3.PER_TD3(**kwargs)
        replay_buffer = utils.PrioritizedReplayBuffer(state_dim, action_dim)

    kwargs["alpha"] = args.alpha
    kwargs["min_priority"] = args.min_priority

    if args.algorithm == "LAP_TD3":
        policy = LAP_TD3.LAP_TD3(**kwargs)
        replay_buffer = utils.PrioritizedReplayBuffer(state_dim, action_dim)

    elif args.algorithm == "PAL_TD3":
        policy = PAL_TD3.PAL_TD3(**kwargs)
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, args.env, args.seed)]