Example 1
def run():
    env = gym.make('CorridorSmall-v10')
    action_space = list(range(env.action_space.n))

    q = Approximator_ResidualBoosting(action_space)
    initial_learning_rate = 0.15
    learning_rate = initial_learning_rate
    initial_epsilon = 0.15
    epsilon = initial_epsilon
    batch_size = 10

    for learning_iteration in range(1000):
        policy = Policy_EpsilonGreedy(q, epsilon)
        episodes = [rollout(policy, env) for _ in range(batch_size)]
        targets = TD0_targets(episodes, q)
        X, Y_target = zip(*targets)
        Y_target = np.reshape(Y_target, (-1, 1))

        learning_rate = decay(initial_learning_rate, learning_iteration)
        epsilon = decay(initial_epsilon, learning_iteration)
        q.learn(learning_rate, X, Y_target)

        if learning_iteration % 1 == 0:  # modulo 1 is always true, so evaluation runs every iteration
            greedy_policy = Policy_Greedy(q)
            reward_sum = avg(
                test_policy(greedy_policy, env) for _ in range(10))
            print(
                f"Episode {learning_iteration*batch_size} Reward {reward_sum} lr {learning_rate} epsilon {epsilon}"
            )
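
The helpers used above (rollout, TD0_targets, avg, decay) come from the example's own project and are not shown here. A minimal sketch of what a decay(initial_value, iteration) schedule matching these call sites could look like, assuming a simple hyperbolic decay with a hypothetical rate constant k:

def decay(initial_value, iteration, k=0.01):
    # Hypothetical schedule: shrink the initial value as the iteration
    # index grows. The exact form and the rate constant k are assumptions,
    # not the original implementation.
    return initial_value / (1.0 + k * iteration)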
Example 2
                G = Qs[tau % n]
                for k in range(tau, min(tau + n - 1, T - 1)):
                    G += Z * deltas[k % n]
                    Z = gamma * Z * ((1 - sigma) * pis[(k + 1) % n] + sigma)
                    p = p * (1 - sigma + sigma * ratios[k % n])
                s = states[tau % n]
                a = actions[tau % n]
                # Update state-action value function.
                Q[s, a] += alpha * p * (G - Q[s, a])
                action_values = [Q[s, i] for i in range(4)]
                policy[s] = np.argmax(action_values)
            t += 1
        epsilon = decay(epsilon)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy


if __name__ == '__main__':
    n = 4
    alpha = 0.0001
    gamma = 1
    sigma = 0.5
    epsilon = 1
    n_episodes = 50000
    n_tests = 10
    env = GridWorld()
    policy = n_step_Q_sigma(env, n, alpha, gamma, sigma, epsilon, n_episodes)
    test_policy(env, policy, n_tests)
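
The test_policy(env, policy, n_tests) helper called above is not shown. A minimal sketch, assuming GridWorld follows the usual reset()/step() interface and that policy maps each discrete state to a greedy action index:

def test_policy(env, policy, n_tests):
    # Hypothetical evaluator: roll out the greedy tabular policy and print
    # the return of each test episode.
    for test in range(n_tests):
        state = env.reset()
        done = False
        episode_return = 0.0
        while not done:
            state, reward, done, _ = env.step(policy[state])
            episode_return += reward
        print(f"Test {test + 1}: return {episode_return}")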
Example 3
# coding: utf-8
import time

from utils import test_policy

if __name__ == '__main__':
    test_policy("test-app", [(10, 0.5), (16, 0.5), (12, 0.5), (7, 0.5),
                             (23, 0.4), (25, 0.5), (14, 0.5), (10, 1)])
Example 4
from metaworld.policies.sawyer_door_lock_v1_policy import SawyerDoorLockV1Policy
import metaworld
import random
from utils import test_policy

ml45 = metaworld.ML45()

name = "door-lock-v1"
env_cls = ml45.test_classes[name]
policy = SawyerDoorLockV1Policy()

all_tasks = [task for task in ml45.test_tasks if task.env_name == name]

env = env_cls()
query_task = random.choice(all_tasks[25:])
env.set_task(query_task)
env.max_path_length = 200
test_policy(env, policy, render=True, stop=False)
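
The test_policy helper imported from utils is not shown. A minimal sketch consistent with the call above, assuming the scripted policy exposes Metaworld's get_action(obs) and that the environment reports task success through info['success']:

def test_policy(env, policy, render=False, stop=True):
    # Hypothetical evaluator for a Metaworld scripted policy.
    obs = env.reset()
    for _ in range(env.max_path_length):
        action = policy.get_action(obs)
        obs, reward, done, info = env.step(action)
        if render:
            env.render()
        # Optionally stop the rollout once the task reports success.
        if stop and info.get('success', 0) > 0:
            break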
Example 5
# coding: utf-8
import time

from utils import test_policy, send_retrain

if __name__ == '__main__':
    test_policy("test-app", [(6, 3), (10, 0.2), (5, 1)])
    send_retrain("test-app")
    time.sleep(2)
    send_retrain("test-app")
    test_policy("test-app", [(4, 1), (2, 0.2), (25, 2)])
    send_retrain("test-app")
Example 6
        "-ld",
        "--log_dir",
        type=str,
        required=False,
        help="directory to store log file in",
    )
    parser.add_argument("-v",
                        "--verbose",
                        help="increase output verbosity",
                        action="store_true")

    args = parser.parse_args()
    env = utils.make_env(args.env_name)
    observation_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    policy = MlpPolicy(observation_size, action_size, args.policy_hidden_dim)
    checkpoint = torch.load(args.policy_path)
    policy.load_state_dict(checkpoint["policy_state_dict"])

    utils.test_policy(
        policy,
        env,
        args.num_episodes,
        args.deterministic,
        args.max_episode_len,
        args.log_dir,
        args.verbose,
    )

    env.close()
Example 7
import gym
import torch

from env_wrappers import ActionNormalizedEnv
from models import DDPG_Actor
from utils import test_policy

model_name = 'ddpg_01'
env_id = "Pendulum-v0"
identity = model_name + '_' + env_id
env = ActionNormalizedEnv(gym.make(env_id))

obs_size = env.observation_space.shape[0]
act_size = env.action_space.shape[0]
act_net = DDPG_Actor(obs_size, act_size)
act_net.load_state_dict(torch.load(identity + '_act.pth'))

mean_return = test_policy(act_net, env, True)
print('mean_return: %.3f' % mean_return)
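
Here, too, test_policy comes from the project's utils module and is not shown. A minimal sketch matching the call test_policy(act_net, env, True), assuming the third argument toggles rendering and that the actor returns a deterministic action for a float32 observation tensor:

import torch

def test_policy(actor, env, render=False, n_episodes=10):
    # Hypothetical evaluator: run greedy episodes and return the mean return.
    returns = []
    for _ in range(n_episodes):
        obs = env.reset()
        done = False
        episode_return = 0.0
        while not done:
            with torch.no_grad():
                action = actor(torch.as_tensor(obs, dtype=torch.float32))
            obs, reward, done, _ = env.step(action.numpy())
            episode_return += reward
            if render:
                env.render()
        returns.append(episode_return)
    return sum(returns) / len(returns)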
Example 8
# coding: utf-8
import time

from utils import test_policy

if __name__ == '__main__':
    test_policy("test-app", [(6, 8), (10, 0.2), (5, 8), (4, 8), (2, 0.2),
                             (25, 8)])
Example 9
            tau = t - n + 1
            if tau > -1:
                Z = 1
                G = Qs[tau % n]
                for k in range(tau, min(tau + n - 1, T - 1)):
                    G += Z * deltas[k % n]
                    Z = gamma * Z * pis[(k + 1) % n]
                s = states[tau % n]
                a = actions[tau % n]
                # Update state-action value function.
                Q[s, a] += alpha * (G - Q[s, a])
                # Make policy greedy w.r.t. Q.
                action_values = [Q[s, i] for i in range(4)]
                policy[s] = np.argmax(action_values)
        epsilon = decay(epsilon)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy


if __name__ == '__main__':
    n = 4
    alpha = 0.01
    gamma = 1
    epsilon = 1
    n_episodes = 1000
    env = GridWorld()
    policy = n_step_tree_backup(env, n, alpha, gamma, epsilon, n_episodes)
    test_policy(env, policy, 10)
Example 10
    def _learn(self):
        try:
            update_count = 0

            if self.log_path is not None:
                writer = SummaryWriter(self.log_path)
                writer.add_text("hyperparameters", f"{self.hp}")

            while update_count < self.hp.max_updates:

                if self.hp.verbose >= 2:
                    print(
                        f"[learner_{self.id}] Beginning Update_{update_count + 1}"
                    )

                # set up tracking variables
                traj_count = 0
                value_fn_loss = 0.0
                policy_loss = 0.0
                policy_entropy = 0.0
                loss = torch.zeros(1,
                                   device=device,
                                   dtype=dtype,
                                   requires_grad=True)
                reward = 0.0

                # process batch of trajectories
                while traj_count < self.hp.batch_size:
                    try:
                        traj = self.q.get(timeout=self.timeout)
                    except queue.Empty as e:
                        print(
                            f"[learner_{self.id}] No trajectory recieved for {self.timeout}"
                            f" seconds. Exiting!")
                        if self.log_path is not None:
                            writer.close()
                        self.completion.set()
                        raise e

                    if self.hp.verbose >= 2:
                        print(f"[learner_{self.id}] Processing traj_{traj.id}")
                    traj_len = len(traj.r)
                    obs = torch.stack(traj.obs)
                    actions = torch.stack(traj.a)
                    r = torch.stack(traj.r)
                    reward += torch.sum(r).item() / self.hp.batch_size
                    disc = self.hp.gamma * (~torch.stack(traj.d))

                    # compute value estimates and logits for observed states
                    v = self.value_fn(obs).squeeze(1)
                    curr_logits = self.policy(obs[:-1])

                    # compute log probs for current and old policies
                    curr_log_probs = action_log_probs(curr_logits, actions)
                    traj_log_probs = action_log_probs(torch.stack(traj.logits),
                                                      actions)

                    # computing v trace targets recursively
                    with torch.no_grad():
                        imp_sampling = torch.exp(curr_log_probs -
                                                 traj_log_probs).squeeze(1)
                        rho = torch.clamp(imp_sampling, max=self.hp.rho_bar)
                        c = torch.clamp(imp_sampling, max=self.hp.c_bar)
                        delta = rho * (r + self.hp.gamma * v[1:] - v[:-1])
                        vt = torch.zeros(traj_len + 1,
                                         device=device,
                                         dtype=dtype)

                        for i in range(traj_len - 1, -1, -1):
                            vt[i] = delta[i] + disc[i] * c[i] * (vt[i + 1] -
                                                                 v[i + 1])
                        vt = torch.add(vt, v)

                        # vt = (vt - torch.mean(vt)) / torch.std(vt)

                        pg_adv = rho * (r + disc * vt[1:] - v[:-1])

                    # print(f"v: {v}")
                    # print(f"vt: {vt}")
                    # print(f"pg_adv: {pg_adv}")
                    # print(f"rho: {rho}")

                    # compute loss as sum of value loss, policy loss and entropy
                    # traj_value_fn_loss = 0.5 * torch.sum(torch.pow(v - vt, 2))
                    # traj_policy_loss = torch.sum(curr_log_probs * pg_adv.detach())
                    # traj_policy_entropy = -1 * torch.sum(
                    #     F.softmax(curr_logits, dim=-1)
                    #     * F.log_softmax(curr_logits, dim=-1)
                    # )
                    traj_value_fn_loss = compute_baseline_loss(v - vt)
                    traj_policy_loss = compute_policy_gradient_loss(
                        curr_logits, actions, pg_adv)
                    traj_policy_entropy = -1 * compute_entropy_loss(
                        curr_logits)
                    traj_loss = (self.hp.v_loss_c * traj_value_fn_loss +
                                 self.hp.policy_loss_c * traj_policy_loss -
                                 self.hp.entropy_c * traj_policy_entropy)
                    loss = torch.add(loss, traj_loss / self.hp.batch_size)
                    value_fn_loss += (traj_value_fn_loss.item() /
                                      self.hp.batch_size)
                    policy_loss += traj_policy_loss.item() / self.hp.batch_size
                    policy_entropy += (traj_policy_entropy.item() /
                                       self.hp.batch_size)
                    traj_count += 1

                if self.hp.verbose >= 2:
                    print(f"[learner_{self.id}] Updating model weights "
                          f" for Update {update_count + 1}")

                # backpropagating loss and updating weights
                # self.policy_optimizer.zero_grad()
                # self.value_fn_optimizer.zero_grad()
                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.policy.parameters(),
                                               self.hp.max_norm)
                torch.nn.utils.clip_grad_norm_(self.value_fn.parameters(),
                                               self.hp.max_norm)
                self.optimizer.step()
                self.scheduler.step()
                # self.policy_optimizer.step()
                # self.value_fn_optimizer.step()

                # log to console
                if self.hp.verbose >= 1:
                    print(
                        f"[learner_{self.id}] Update {update_count + 1} | "
                        f"Batch Mean Reward: {reward:.2f} | Loss: {loss.item():.2f}"
                    )

                # evaluate current policy
                if self.hp.eval_every is not None:
                    if (update_count + 1) % self.hp.eval_every == 0:
                        eval_r, eval_std = utils.test_policy(
                            self.policy,
                            self.hp.env_name,
                            self.hp.eval_eps,
                            True,
                            self.hp.max_timesteps,
                        )
                        if self.hp.verbose >= 1:
                            print(
                                f"[learner_{self.id}] Update {update_count + 1} | "
                                f"Evaluation Reward: {eval_r:.2f}, Std Dev: {eval_std:.2f}"
                            )
                        if self.log_path is not None:
                            writer.add_scalar(
                                f"learner_{self.id}/rewards/evaluation_reward",
                                eval_r,
                                update_count + 1,
                            )

                # log to tensorboard
                if self.log_path is not None:
                    writer.add_scalar(
                        f"learner_{self.id}/rewards/batch_mean_reward",
                        reward,
                        update_count + 1,
                    )
                    writer.add_scalar(
                        f"learner_{self.id}/loss/policy_loss",
                        policy_loss,
                        update_count + 1,
                    )
                    writer.add_scalar(
                        f"learner_{self.id}/loss/value_fn_loss",
                        value_fn_loss,
                        update_count + 1,
                    )
                    writer.add_scalar(
                        f"learner_{self.id}/loss/policy_entropy",
                        policy_entropy,
                        update_count + 1,
                    )
                    writer.add_scalar(f"learner_{self.id}/loss/total_loss",
                                      loss, update_count + 1)

                # save model weights every given interval
                if (update_count + 1) % self.hp.save_every == 0:
                    path = self.log_path / Path(
                        f"IMPALA_{self.hp.env_name}_l{self.id}_{update_count+1}.pt"
                    )
                    self.save(path)
                    print(f"[learner_{self.id}] Saved model weights at "
                          f"update {update_count+1} to {path}")

                # increment update counter
                self.update_counter.increment()
                update_count = self.update_counter.value

            if self.log_path is not None:
                writer.close()

            print(f"[learner_{self.id}] Finished learning")
            self.completion.set()
            return

        except KeyboardInterrupt:
            print(f"[learner_{self.id}] Interrupted")
            if self.log_path is not None:
                writer.close()
            self.completion.set()
            return

        except Exception as e:
            if self.log_path is not None:
                writer.close()
            print(f"[learner_{self.id}] Encoutered exception")
            raise e
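
The loss helpers referenced above (compute_baseline_loss, compute_policy_gradient_loss, compute_entropy_loss) are defined elsewhere in the example's codebase. A sketch of how such helpers are commonly written for IMPALA-style learners; the originals may differ in detail:

import torch
import torch.nn.functional as F

def compute_baseline_loss(advantages):
    # Squared error between value estimates and v-trace targets.
    return 0.5 * torch.sum(advantages ** 2)

def compute_entropy_loss(logits):
    # Negative entropy of the policy (negated above to form the entropy bonus).
    policy = F.softmax(logits, dim=-1)
    log_policy = F.log_softmax(logits, dim=-1)
    return torch.sum(policy * log_policy)

def compute_policy_gradient_loss(logits, actions, advantages):
    # Cross-entropy of the taken actions weighted by the detached
    # policy-gradient advantages.
    cross_entropy = F.cross_entropy(logits, actions.squeeze(-1).long(),
                                    reduction="none")
    return torch.sum(cross_entropy * advantages.detach())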
Example 11
def train_dqn(args):
    """Runs DQN training procedure.
    """
    # setup TensorBoard logging
    writer = SummaryWriter(log_dir=args.logdir)
    # make environment
    env = gym.make(args.env_ID)
    env.seed(args.random_seed)
    # instantiate reward model + buffers and optimizers for training DQN
    q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule = init_dqn(
        args)
    # begin training
    ep_return = 0
    i_episode = 0
    state = env.reset()
    for step in range(args.n_agent_steps):
        # agent interact with env
        epsilon = epsilon_schedule.value(step)
        action = q_net.act(state, epsilon)
        next_state, rew, done, _ = env.step(action)
        # record step info
        replay_buffer.push(state, action, rew, next_state, done)
        ep_return += rew
        # prepare for next step
        state = next_state
        if done:
            state = env.reset()
            writer.add_scalar('1.ep_return', ep_return, step)
            ep_return = 0
            i_episode += 1

            # q_net gradient step at end of each episode
            # we now make learning updates at the end of every episode
            if (step >= args.agent_learning_starts
                    and len(replay_buffer) >= 3 * args.batch_size_agent):
                loss = q_learning_loss(q_net, q_target, replay_buffer, args)
                optimizer_agent.zero_grad()
                loss.backward()
                optimizer_agent.step()
                writer.add_scalar('3.loss', loss, step)
                writer.add_scalar('4.epsilon', epsilon, step)
            if args.epsilon_annealing_scheme == 'exp':
                epsilon_schedule.step()

        # update q_target
        if step % args.target_update_period == 0:  # update target parameters
            for target_param, local_param in zip(q_target.parameters(),
                                                 q_net.parameters()):
                target_param.data.copy_(q_net.tau * local_param.data +
                                        (1.0 - q_net.tau) * target_param.data)

        # evaluate agent performance
        if (step > 0 and step % args.agent_test_period == 0) or step == args.n_agent_steps - 1:
            logging.info(
                "Agent has taken {} steps. Testing performance for 100 episodes"
                .format(step))
            mean_ep_return = test_policy(q_net, args, writer)
            writer.add_scalar('2.mean_ep_return_test', mean_ep_return, step)
            # save current policy
            save_policy(q_net, optimizer_agent, step, args)
            # Possibly end training if mean_ep_return is above the threshold
            if env.spec.reward_threshold is not None and mean_ep_return >= env.spec.reward_threshold:
                raise SystemExit(
                    "Environment solved after {} episodes!".format(i_episode))
    writer.close()
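
The init_dqn, q_learning_loss, test_policy and save_policy helpers belong to the example's project and are not reproduced here. A minimal sketch of a one-step TD loss in the spirit of q_learning_loss, assuming the replay buffer returns batched tensors and that args carries batch_size_agent and gamma:

import torch
import torch.nn.functional as F

def q_learning_loss(q_net, q_target, replay_buffer, args):
    # Hypothetical DQN loss: TD error of the online network against the
    # frozen target network on a sampled minibatch.
    states, actions, rewards, next_states, dones = replay_buffer.sample(
        args.batch_size_agent)
    q_values = q_net(states).gather(1, actions.long().unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        next_q = q_target(next_states).max(dim=1)[0]
        targets = rewards + args.gamma * (1.0 - dones) * next_q
    return F.smooth_l1_loss(q_values, targets)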