Example #1
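TRPO on the bsuite evaluation sweep with cherry: each environment is wrapped for PyTorch, episodes accumulate in an experience replay, and trpo_update is applied every 10 episodes before the replay is emptied.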
def run_trpo():
    ch.debug.debug()
    for i, env_name in enumerate(sweep.SWEEP):
        dm_env = bsuite.load_and_record_to_csv(env_name,
                                               results_dir=TRPO_RESULTS_PATH,
                                               overwrite=True)

        #  Instantiate the env and agent
        env = gym_wrapper.GymWrapper(dm_env)
        env = ch.envs.Torch(env)
        env = ch.envs.Runner(env)
        policy = Policy(env)
        baseline = LinearValue(env.state_size)

        #  Generate the results
        replay = ch.ExperienceReplay()
        for episode in tqdm(range(1, 1 + env.bsuite_num_episodes),
                            desc=env_name):
            replay += env.run(policy, episodes=1)
            if episode % 10 == 0:
                trpo_update(replay, policy, baseline)
                replay.empty()
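
The snippets in this listing are excerpts: they omit their imports and several helpers (Policy, DiagNormalPolicy, BaseLearner, the loss functions, the result paths). A hedged reconstruction of the shared imports they appear to rely on; the exact module paths are assumptions rather than something taken from the original files:

# Assumed common imports for these snippets; module paths are best guesses.
import random
from copy import deepcopy

import gym
import numpy as np
import torch
import torch as th  # some snippets use the th alias
import cherry as ch
import learn2learn as l2l
from torch import autograd, optim
from torch.distributions.kl import kl_divergence
from torch.nn.utils import parameters_to_vector, vector_to_parameters
from tqdm import tqdm

from cherry.algorithms import a2c, ppo, trpo
from cherry.models.robotics import LinearValue

# Example #1 additionally uses bsuite:
import bsuite
from bsuite import sweep
from bsuite.utils import gym_wrapper
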
Example #2
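Meta-RL on Particles2D-v1 with learn2learn's MetaSGD: each sampled task is fast-adapted with an A2C loss, and the mean validation loss across tasks is backpropagated through the adaptation steps and applied with Adam.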
def main(
    experiment='dev',
    env_name='Particles2D-v1',
    adapt_lr=0.1,
    meta_lr=0.01,
    adapt_steps=1,
    num_iterations=200,
    meta_bsz=20,
    adapt_bsz=20,
    tau=1.00,
    gamma=0.99,
    num_workers=2,
    seed=42,
):
    random.seed(seed)
    np.random.seed(seed)
    th.manual_seed(seed)

    def make_env():
        return gym.make(env_name)

    env = l2l.gym.AsyncVectorEnv([make_env for _ in range(num_workers)])
    env.seed(seed)
    env = ch.envs.Torch(env)
    policy = DiagNormalPolicy(env.state_size, env.action_size)
    meta_learner = l2l.algorithms.MetaSGD(policy, lr=meta_lr)
    baseline = LinearValue(env.state_size, env.action_size)
    opt = optim.Adam(policy.parameters(), lr=meta_lr)
    all_rewards = []

    for iteration in range(num_iterations):
        iteration_loss = 0.0
        iteration_reward = 0.0
        for task_config in tqdm(
                env.sample_tasks(meta_bsz)):  # Samples a new config
            learner = meta_learner.clone()
            env.set_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)

            # Fast Adapt
            for step in range(adapt_steps):
                train_episodes = task.run(learner, episodes=adapt_bsz)
                loss = maml_a2c_loss(train_episodes, learner, baseline, gamma,
                                     tau)
                learner.adapt(loss)

            # Compute Validation Loss
            valid_episodes = task.run(learner, episodes=adapt_bsz)
            loss = maml_a2c_loss(valid_episodes, learner, baseline, gamma, tau)
            iteration_loss += loss
            iteration_reward += valid_episodes.reward().sum().item() / adapt_bsz

        # Print statistics
        print('\nIteration', iteration)
        adaptation_reward = iteration_reward / meta_bsz
        print('adaptation_reward', adaptation_reward)
        all_rewards.append(adaptation_reward)

        adaptation_loss = iteration_loss / meta_bsz
        print('adaptation_loss', adaptation_loss.item())

        opt.zero_grad()
        adaptation_loss.backward()
        opt.step()
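
The helpers maml_a2c_loss and compute_advantages are not shown in these snippets. A minimal sketch of what they might look like on top of cherry's TD and GAE utilities; treat the exact signatures, and the use of baseline.fit, as assumptions:

def compute_advantages(baseline, tau, gamma, rewards, dones, states, next_states):
    # Fit the linear baseline on discounted returns, then compute GAE advantages.
    returns = ch.td.discount(gamma, rewards, dones)
    baseline.fit(states, returns)
    values = baseline(states)
    next_values = baseline(next_states)
    # Bootstrap with the next state's value at episode boundaries.
    bootstraps = values * (1.0 - dones) + next_values * dones
    next_value = th.zeros(1, device=values.device)
    return ch.pg.generalized_advantage(tau=tau,
                                       gamma=gamma,
                                       rewards=rewards,
                                       dones=dones,
                                       values=bootstraps,
                                       next_value=next_value)


def maml_a2c_loss(train_episodes, learner, baseline, gamma, tau):
    # Standard A2C policy-gradient loss on one batch of episodes.
    states = train_episodes.state()
    actions = train_episodes.action()
    rewards = train_episodes.reward()
    dones = train_episodes.done()
    next_states = train_episodes.next_state()
    log_probs = learner.log_prob(states, actions)
    advantages = compute_advantages(baseline, tau, gamma,
                                    rewards, dones, states, next_states)
    advantages = ch.normalize(advantages).detach()
    return a2c.policy_loss(log_probs, advantages)
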
Example #3
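A MAML-TRPO setup on AntDirection-v1: a copy of the policy is fast-adapted per task with first-order A2C steps, and the meta-update is a TRPO step (conjugate-gradient direction plus backtracking line search) on a meta-surrogate loss.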
def main(
    env_name='AntDirection-v1',
    adapt_lr=0.1,
    meta_lr=1.0,
    adapt_steps=1,
    num_iterations=1000,
    meta_bsz=40,
    adapt_bsz=20,
    tau=1.00,
    gamma=0.99,
    seed=42,
    num_workers=2,
    cuda=0,
):
    cuda = bool(cuda)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    def make_env():
        env = gym.make(env_name)
        env = ch.envs.ActionSpaceScaler(env)
        return env

    env = l2l.gym.AsyncVectorEnv([make_env for _ in range(num_workers)])
    env.seed(seed)
    env.set_task(env.sample_tasks(1)[0])
    env = ch.envs.Torch(env)
    policy = DiagNormalPolicy(env.state_size, env.action_size)
    if cuda:
        policy.to('cuda')
    baseline = LinearValue(env.state_size, env.action_size)

    for iteration in range(num_iterations):
        iteration_reward = 0.0
        iteration_replays = []
        iteration_policies = []

        for task_config in tqdm(env.sample_tasks(meta_bsz),
                                leave=False,
                                desc='Data'):  # Samples a new config
            clone = deepcopy(policy)
            env.set_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)
            task_replay = []

            # Fast Adapt
            for step in range(adapt_steps):
                train_episodes = task.run(clone, episodes=adapt_bsz)
                clone = fast_adapt_a2c(clone,
                                       train_episodes,
                                       adapt_lr,
                                       baseline,
                                       gamma,
                                       tau,
                                       first_order=True)
                task_replay.append(train_episodes)

            # Compute Validation Loss
            valid_episodes = task.run(clone, episodes=adapt_bsz)
            task_replay.append(valid_episodes)
            iteration_reward += valid_episodes.reward().sum().item() / adapt_bsz
            iteration_replays.append(task_replay)
            iteration_policies.append(clone)

        # Print statistics
        print('\nIteration', iteration)
        adaptation_reward = iteration_reward / meta_bsz
        print('adaptation_reward', adaptation_reward)

        # TRPO meta-optimization
        backtrack_factor = 0.5
        ls_max_steps = 15
        max_kl = 0.01
        if cuda:
            policy.to('cuda', non_blocking=True)
            baseline.to('cuda', non_blocking=True)
            iteration_replays = [[
                r.to('cuda', non_blocking=True) for r in task_replays
            ] for task_replays in iteration_replays]

        # Compute CG step direction
        old_loss, old_kl = meta_surrogate_loss(iteration_replays,
                                               iteration_policies, policy,
                                               baseline, tau, gamma, adapt_lr)
        grad = autograd.grad(old_loss, policy.parameters(), retain_graph=True)
        grad = parameters_to_vector([g.detach() for g in grad])
        Fvp = trpo.hessian_vector_product(old_kl, policy.parameters())
        step = trpo.conjugate_gradient(Fvp, grad)
        shs = 0.5 * torch.dot(step, Fvp(step))
        lagrange_multiplier = torch.sqrt(shs / max_kl)
        step = step / lagrange_multiplier
        step_ = [torch.zeros_like(p.data) for p in policy.parameters()]
        vector_to_parameters(step, step_)
        step = step_
        del old_kl, Fvp, grad
        old_loss.detach_()

        # Line-search
        for ls_step in range(ls_max_steps):
            stepsize = backtrack_factor**ls_step * meta_lr
            clone = deepcopy(policy)
            for p, u in zip(clone.parameters(), step):
                p.data.add_(u.data, alpha=-stepsize)
            new_loss, kl = meta_surrogate_loss(iteration_replays,
                                               iteration_policies, clone,
                                               baseline, tau, gamma, adapt_lr)
            if new_loss < old_loss and kl < max_kl:
                for p, u in zip(policy.parameters(), step):
                    p.data.add_(u.data, alpha=-stepsize)
                break
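
fast_adapt_a2c is likewise not shown. A plausible sketch, reusing the maml_a2c_loss sketch above and learn2learn's maml_update helper (the helper's location and signature are assumptions):

def fast_adapt_a2c(clone, train_episodes, adapt_lr, baseline, gamma, tau,
                   first_order=False):
    # One inner-loop gradient step on the A2C loss; second-order terms are
    # only kept when first_order=False.
    second_order = not first_order
    loss = maml_a2c_loss(train_episodes, clone, baseline, gamma, tau)
    gradients = autograd.grad(loss,
                              clone.parameters(),
                              retain_graph=second_order,
                              create_graph=second_order)
    # maml_update applies params - adapt_lr * grad and returns the updated module.
    return l2l.algorithms.maml.maml_update(clone, adapt_lr, gradients)
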
Example #4
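ProMP on AntDirection-v1: per-task adaptation stores the intermediate policies and replays, and the meta-parameters are then optimized for several PPO steps with a clipped surrogate plus a KL penalty (optionally adapted toward kl_target).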
def main(
    env_name='AntDirection-v1',
    adapt_lr=0.1,
    meta_lr=3e-4,
    adapt_steps=3,
    num_iterations=1000,
    meta_bsz=40,
    adapt_bsz=20,
    ppo_clip=0.3,
    ppo_steps=5,
    tau=1.00,
    gamma=0.99,
    eta=0.0005,
    adaptive_penalty=False,
    kl_target=0.01,
    num_workers=4,
    seed=421,
):
    random.seed(seed)
    np.random.seed(seed)
    th.manual_seed(seed)

    def make_env():
        env = gym.make(env_name)
        env = ch.envs.ActionSpaceScaler(env)
        return env

    env = l2l.gym.AsyncVectorEnv([make_env for _ in range(num_workers)])
    env.seed(seed)
    env = ch.envs.ActionSpaceScaler(env)
    env = ch.envs.Torch(env)
    policy = DiagNormalPolicy(input_size=env.state_size,
                              output_size=env.action_size,
                              hiddens=[64, 64],
                              activation='tanh')
    meta_learner = l2l.algorithms.MAML(policy, lr=meta_lr)
    baseline = LinearValue(env.state_size, env.action_size)
    opt = optim.Adam(meta_learner.parameters(), lr=meta_lr)

    for iteration in range(num_iterations):
        iteration_reward = 0.0
        iteration_replays = []
        iteration_policies = []

        # Sample Trajectories
        for task_config in tqdm(env.sample_tasks(meta_bsz),
                                leave=False,
                                desc='Data'):
            clone = deepcopy(meta_learner)
            env.set_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)
            task_replay = []
            task_policies = []

            # Fast Adapt
            for step in range(adapt_steps):
                for p in clone.parameters():
                    p.detach_().requires_grad_()
                task_policies.append(deepcopy(clone))
                train_episodes = task.run(clone, episodes=adapt_bsz)
                clone = fast_adapt_a2c(clone,
                                       train_episodes,
                                       adapt_lr,
                                       baseline,
                                       gamma,
                                       tau,
                                       first_order=True)
                task_replay.append(train_episodes)

            # Compute Validation Loss
            for p in clone.parameters():
                p.detach_().requires_grad_()
            task_policies.append(deepcopy(clone))
            valid_episodes = task.run(clone, episodes=adapt_bsz)
            task_replay.append(valid_episodes)
            iteration_reward += valid_episodes.reward().sum().item() / adapt_bsz
            iteration_replays.append(task_replay)
            iteration_policies.append(task_policies)

        # Print statistics
        print('\nIteration', iteration)
        adaptation_reward = iteration_reward / meta_bsz
        print('adaptation_reward', adaptation_reward)

        # ProMP meta-optimization
        for ppo_step in tqdm(range(ppo_steps), leave=False, desc='Optim'):
            promp_loss = 0.0
            kl_total = 0.0
            for task_replays, old_policies in zip(iteration_replays,
                                                  iteration_policies):
                new_policy = meta_learner.clone()
                states = task_replays[0].state()
                actions = task_replays[0].action()
                rewards = task_replays[0].reward()
                dones = task_replays[0].done()
                next_states = task_replays[0].next_state()
                old_policy = old_policies[0]
                (old_density, new_density, old_log_probs,
                 new_log_probs) = precompute_quantities(
                     states, actions, old_policy, new_policy)
                advantages = compute_advantages(baseline, tau, gamma, rewards,
                                                dones, states, next_states)
                advantages = ch.normalize(advantages).detach()
                for step in range(adapt_steps):
                    # Compute KL penalty
                    kl_pen = kl_divergence(old_density, new_density).mean()
                    kl_total += kl_pen.item()

                    # Update the clone
                    surr_loss = trpo.policy_loss(new_log_probs, old_log_probs,
                                                 advantages)
                    new_policy.adapt(surr_loss)

                    # Move to next adaptation step
                    states = task_replays[step + 1].state()
                    actions = task_replays[step + 1].action()
                    rewards = task_replays[step + 1].reward()
                    dones = task_replays[step + 1].done()
                    next_states = task_replays[step + 1].next_state()
                    old_policy = old_policies[step + 1]
                    (old_density, new_density, old_log_probs,
                     new_log_probs) = precompute_quantities(
                         states, actions, old_policy, new_policy)

                    # Compute clip loss
                    advantages = compute_advantages(baseline, tau, gamma,
                                                    rewards, dones, states,
                                                    next_states)
                    advantages = ch.normalize(advantages).detach()
                    clip_loss = ppo.policy_loss(new_log_probs,
                                                old_log_probs,
                                                advantages,
                                                clip=ppo_clip)

                    # Combine into ProMP loss
                    promp_loss += clip_loss + eta * kl_pen

            kl_total /= meta_bsz * adapt_steps
            promp_loss /= meta_bsz * adapt_steps
            opt.zero_grad()
            promp_loss.backward(retain_graph=True)
            opt.step()

            # Adapt KL penalty based on desired target
            if adaptive_penalty:
                if kl_total < kl_target / 1.5:
                    eta /= 2.0
                elif kl_total > kl_target * 1.5:
                    eta *= 2.0
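
The ProMP snippet above also relies on precompute_quantities to get the old and new action densities and log-probabilities. A minimal sketch, assuming the policy exposes a density(states) method returning a torch distribution and that per-dimension log-probs are averaged:

def precompute_quantities(states, actions, old_policy, new_policy):
    # Action distributions under the frozen (old) and current (new) policies.
    old_density = old_policy.density(states)
    new_density = new_policy.density(states)
    # Old log-probs are detached so that only the new policy receives gradients.
    old_log_probs = old_density.log_prob(actions).mean(dim=1, keepdim=True).detach()
    new_log_probs = new_density.log_prob(actions).mean(dim=1, keepdim=True)
    return old_density, new_density, old_log_probs, new_log_probs
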
Example #5
A MAML-TRPO setup on a Meta-World-style benchmark (ML1, ML10, or ML45), following the same structure as Example #3 and finishing with an evaluation on held-out tasks.
def main(
    benchmark=ML10,  # Choose between ML1, ML10, ML45
    adapt_lr=0.1,
    meta_lr=0.1,
    adapt_steps=1,
    num_iterations=1000,
    meta_bsz=20,
    adapt_bsz=10,  # Number of episodes to sample per task
    tau=1.00,
    gamma=0.99,
    seed=42,
    num_workers=10,  # Currently tasks are distributed evenly so adapt_bsz should be divisible by num_workers
    cuda=0):
    env = make_env(benchmark, seed, num_workers)

    cuda = bool(cuda)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    policy = DiagNormalPolicy(env.state_size,
                              env.action_size,
                              activation='tanh')
    if cuda:
        policy.to('cuda')
    baseline = LinearValue(env.state_size, env.action_size)

    for iteration in range(num_iterations):
        iteration_reward = 0.0
        iteration_replays = []
        iteration_policies = []

        for task_config in tqdm(env.sample_tasks(meta_bsz),
                                leave=False,
                                desc='Data'):  # Samples a new config
            clone = deepcopy(policy)
            env.set_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)
            task_replay = []

            # Fast Adapt
            for step in range(adapt_steps):
                train_episodes = task.run(clone, episodes=adapt_bsz)
                clone = fast_adapt_a2c(clone,
                                       train_episodes,
                                       adapt_lr,
                                       baseline,
                                       gamma,
                                       tau,
                                       first_order=True)
                task_replay.append(train_episodes)

            # Compute Validation Loss
            valid_episodes = task.run(clone, episodes=adapt_bsz)
            task_replay.append(valid_episodes)

            iteration_reward += valid_episodes.reward().sum().item() / adapt_bsz
            iteration_replays.append(task_replay)
            iteration_policies.append(clone)

        # Print statistics
        print('\nIteration', iteration)
        validation_reward = iteration_reward / meta_bsz
        print('validation_reward', validation_reward)

        # TRPO meta-optimization
        backtrack_factor = 0.5
        ls_max_steps = 15
        max_kl = 0.01
        if cuda:
            policy.to('cuda', non_blocking=True)
            baseline.to('cuda', non_blocking=True)
            iteration_replays = [[
                r.to('cuda', non_blocking=True) for r in task_replays
            ] for task_replays in iteration_replays]

        # Compute CG step direction
        old_loss, old_kl = meta_surrogate_loss(iteration_replays,
                                               iteration_policies, policy,
                                               baseline, tau, gamma, adapt_lr)
        grad = autograd.grad(old_loss, policy.parameters(), retain_graph=True)
        grad = parameters_to_vector([g.detach() for g in grad])
        Fvp = trpo.hessian_vector_product(old_kl, policy.parameters())
        step = trpo.conjugate_gradient(Fvp, grad)
        shs = 0.5 * torch.dot(step, Fvp(step))
        lagrange_multiplier = torch.sqrt(shs / max_kl)
        step = step / lagrange_multiplier
        step_ = [torch.zeros_like(p.data) for p in policy.parameters()]
        vector_to_parameters(step, step_)
        step = step_
        del old_kl, Fvp, grad
        old_loss.detach_()

        # Line-search
        for ls_step in range(ls_max_steps):
            stepsize = backtrack_factor**ls_step * meta_lr
            clone = deepcopy(policy)
            for p, u in zip(clone.parameters(), step):
                p.data.add_(u.data, alpha=-stepsize)
            new_loss, kl = meta_surrogate_loss(iteration_replays,
                                               iteration_policies, clone,
                                               baseline, tau, gamma, adapt_lr)
            if new_loss < old_loss and kl < max_kl:
                for p, u in zip(policy.parameters(), step):
                    p.data.add_(u.data, alpha=-stepsize)
                break

    # Evaluate on a set of unseen tasks
    evaluate(benchmark, policy, baseline, adapt_lr, gamma, tau, num_workers,
             seed)
Example #6
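MAML on 2DNavigation-v0 with PPO meta-optimization: for each task, a fresh clone is re-adapted from the current initialization on the stored training episodes and compared to the stored adapted policy with a clipped PPO loss on the validation episodes.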
def main(
    experiment='dev',
    env_name='2DNavigation-v0',
    adapt_lr=0.1,
    meta_lr=0.01,
    adapt_steps=1,
    num_iterations=20,
    meta_bsz=10,
    adapt_bsz=10,
    tau=1.00,
    gamma=0.99,
    num_workers=1,
    seed=42,
):
    random.seed(seed)
    np.random.seed(seed)
    th.manual_seed(seed)

    def make_env():
        return gym.make(env_name)

    env = l2l.gym.AsyncVectorEnv([make_env for _ in range(num_workers)])
    env.seed(seed)
    env = ch.envs.Torch(env)
    policy = DiagNormalPolicy(env.state_size, env.action_size)
    meta_learner = l2l.MAML(policy, lr=meta_lr)
    baseline = LinearValue(env.state_size, env.action_size)
    opt = optim.Adam(meta_learner.parameters(), lr=meta_lr)

    all_rewards = []
    for iteration in range(num_iterations):
        iteration_reward = 0.0
        iteration_replays = []
        iteration_policies = []
        policy.to('cpu')
        baseline.to('cpu')

        for task_config in tqdm(env.sample_tasks(meta_bsz),
                                leave=False,
                                desc='Data'):  # Samples a new config
            learner = meta_learner.clone()
            env.reset_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)
            task_replay = []

            # Fast Adapt
            for step in range(adapt_steps):
                train_episodes = task.run(learner, episodes=adapt_bsz)
                learner = fast_adapt_a2c(learner,
                                         train_episodes,
                                         adapt_lr,
                                         baseline,
                                         gamma,
                                         tau,
                                         first_order=True)
                task_replay.append(train_episodes)

            # Compute Validation Loss
            valid_episodes = task.run(learner, episodes=adapt_bsz)
            task_replay.append(valid_episodes)
            iteration_reward += valid_episodes.reward().sum().item() / adapt_bsz
            iteration_replays.append(task_replay)
            iteration_policies.append(learner)

        # Print statistics
        print('\nIteration', iteration)
        adaptation_reward = iteration_reward / meta_bsz
        all_rewards.append(adaptation_reward)
        print('adaptation_reward', adaptation_reward)

        # PPO meta-optimization
        for ppo_step in tqdm(range(10), leave=False, desc='Optim'):
            ppo_loss = 0.0
            for task_replays, old_policy in zip(iteration_replays,
                                                iteration_policies):
                train_replays = task_replays[:-1]
                valid_replay = task_replays[-1]

                # Fast adapt new policy, starting from the current init
                new_policy = meta_learner.clone()
                for train_episodes in train_replays:
                    new_policy = fast_adapt_a2c(new_policy, train_episodes,
                                                adapt_lr, baseline, gamma, tau)

                # Compute PPO loss between old and new clones
                states = valid_replay.state()
                actions = valid_replay.action()
                rewards = valid_replay.reward()
                dones = valid_replay.done()
                next_states = valid_replay.next_state()
                old_log_probs = old_policy.log_prob(states, actions).detach()
                new_log_probs = new_policy.log_prob(states, actions)
                advantages = compute_advantages(baseline, tau, gamma, rewards,
                                                dones, states, next_states)
                advantages = ch.normalize(advantages).detach()
                ppo_loss += ppo.policy_loss(new_log_probs,
                                            old_log_probs,
                                            advantages,
                                            clip=0.1)

            ppo_loss /= meta_bsz
            opt.zero_grad()
            ppo_loss.backward()
            opt.step()
Example #7
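MAML with a plain policy-gradient loss (pg_loss) on AntDirection-v1; the learned weights are saved under ./models/ at the end of training.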
def maml_pg(
    env_name='AntDirection-v1',
    policy_hidden=[128],
    adapt_lr=0.001,
    meta_lr=0.001,
    adapt_steps=1,
    num_iterations=200,
    meta_batch_size=20,
    adapt_batch_size=20,
    discount=0.99,
    num_workers=4,
    seed=0,
):
    """
        Runs MAML with policy gradient on environment
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    def make_env():
        return gym.make(env_name)

    env = l2l.gym.AsyncVectorEnv([make_env for _ in range(num_workers)])
    env.seed(seed)
    env = ch.envs.Torch(env)
    policy = Policy(input_size=env.state_size,
                    output_size=env.action_size,
                    hidden_dims=policy_hidden)
    meta_learner = l2l.algorithms.MAML(policy, lr=adapt_lr)
    baseline = LinearValue(env.state_size, env.action_size)
    opt = optim.Adam(policy.parameters(), lr=meta_lr)
    all_rewards = []

    for iteration in range(num_iterations):
        iteration_loss = 0.0
        iteration_reward = 0.0
        for task_config in tqdm(env.sample_tasks(meta_batch_size),
                                leave=False,
                                desc='Data'):
            learner = meta_learner.clone()
            env.set_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)

            # adaptation
            for step in range(adapt_steps):
                train_episodes = task.run(learner, episodes=adapt_batch_size)
                loss = pg_loss(train_episodes, learner, baseline, discount)
                learner.adapt(loss)

            # validation
            valid_episodes = task.run(learner, episodes=adapt_batch_size)
            loss = pg_loss(valid_episodes, learner, baseline, discount)
            iteration_loss += loss
            iteration_reward += valid_episodes.reward().sum().item() / adapt_batch_size

        print('\nIteration', iteration)
        adaptation_reward = iteration_reward / meta_batch_size
        print('adaptation_reward', adaptation_reward)
        all_rewards.append(adaptation_reward)
        adaptation_loss = iteration_loss / meta_batch_size
        # print('adaptation_loss', adaptation_loss.item())

        opt.zero_grad()
        adaptation_loss.backward()
        opt.step()

    torch.save(
        learner.state_dict(), './models/' + env_name + "/" +
        "_".join([str(n) for n in policy_hidden]) + '/pg_maml' + '.pt')
Example #8
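A non-meta baseline: trains a policy-gradient learner from scratch (or from saved weights) on tasks sampled from the environment distribution, averaging train and validation rewards over num_samples runs and saving them under ./performance_data/.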
def train_pg(env_name='AntDirection-v1',
             policy_hidden=[128],
             lr=0.001,
             num_iterations=5,
             batch_size=20,
             discount=0.99,
             num_workers=4,
             seed=0,
             filepath=None,
             mode_str="scratch",
             num_samples=10):
    """
        Trains policy gradient on samples from environment distribution
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    def make_env():
        return gym.make(env_name)

    train_rewards = np.zeros((num_iterations, ))
    val_rewards = np.zeros((num_iterations, ))

    for i in range(num_samples):
        print("Sample task " + str(i))
        env = l2l.gym.AsyncVectorEnv([make_env for _ in range(num_workers)])
        env.seed(seed)
        env = ch.envs.Torch(env)
        policy = Policy(input_size=env.state_size,
                        output_size=env.action_size,
                        hidden_dims=policy_hidden)
        learner = BaseLearner(policy)
        if filepath:
            print("Using weights from ", filepath)
            learner.load_state_dict(torch.load(filepath))
        baseline = LinearValue(env.state_size, env.action_size)
        opt = optim.Adam(policy.parameters(), lr=lr)

        task_config = env.sample_tasks(1)

        for iteration in range(num_iterations):
            task_config = env.sample_tasks(1)[0]
            env.set_task(task_config)
            env.reset()
            task = ch.envs.Runner(env)

            # update policy
            train_episodes = task.run(learner, episodes=batch_size)
            train_loss = pg_loss(train_episodes, learner, baseline, discount)
            train_reward = train_episodes.reward().sum().item() / batch_size
            train_rewards[iteration] += train_reward

            opt.zero_grad()
            train_loss.backward()
            opt.step()

            # validation
            valid_episodes = task.run(learner, episodes=batch_size)
            validation_loss = pg_loss(valid_episodes, learner, baseline,
                                      discount)
            validation_reward = valid_episodes.reward().sum().item() / batch_size
            val_rewards[iteration] += validation_reward

            print('\nIteration', iteration)
            print('Validation Reward', validation_reward)
            # print('Validation loss', validation_loss.item())

    train_rewards /= num_samples
    val_rewards /= num_samples
    # torch.save(learner.state_dict(), './models/' + env_name + "/" + "_".join([str(n) for n in policy_hidden]) + '/pg_' + 'train_' + mode_str + '.pt')
    np.save(
        './performance_data/' + env_name + "/" +
        "_".join([str(n) for n in policy_hidden]) + '/pg_' + 'train_' +
        mode_str + '_train_rewards.npy', train_rewards)
    np.save(
        './performance_data/' + env_name + "/" +
        "_".join([str(n) for n in policy_hidden]) + '/pg_' + 'train_' +
        mode_str + '_val_rewards.npy', val_rewards)