Example #1
def main(args):
    wandb.config.update({
        k: v
        for k, v in vars(args).items()
        if k in ['env_name', 'tau', 'critic_lr']
    })
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           args.seed,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    critic = Critic(int(np.prod(sampler.envs.observation_space.shape)),
                    1,
                    hidden_sizes=(args.hidden_size, ) * args.num_layers)

    metalearner = ActorCriticMetaLearner(sampler,
                                         policy,
                                         critic,
                                         gamma=args.gamma,
                                         fast_lr=args.fast_lr,
                                         tau=args.tau,
                                         device=args.device,
                                         critic_lr=args.critic_lr)
    wandb.watch(metalearner.critic)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        meta_critic_loss = metalearner.step(
            episodes,
            max_kl=args.max_kl,
            cg_iters=args.cg_iters,
            cg_damping=args.cg_damping,
            ls_max_steps=args.ls_max_steps,
            ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Logging
        wandb.log(
            {
                'total_rewards/before_update':
                total_rewards([ep.rewards for ep, _ in episodes])
            },
            step=batch)
        wandb.log(
            {
                'total_rewards/after_update':
                total_rewards([ep.rewards for _, ep in episodes])
            },
            step=batch)
        wandb.log({'meta critic loss': meta_critic_loss.detach().item()},
                  step=batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
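
All of these scripts read their hyperparameters from an args namespace built with argparse. As a point of reference, below is a minimal sketch reconstructed from the attributes accessed in the examples; the option names follow those usages, while the default values are purely illustrative assumptions (each script also defines extra options of its own, e.g. critic_lr in Example #1).

import argparse

import torch


def parse_args():
    # Illustrative CLI, inferred from the attribute accesses in the examples;
    # defaults are assumptions, not the authors' values.
    parser = argparse.ArgumentParser(description='MAML-RL training (sketch)')
    parser.add_argument('--env-name', type=str, default='2DNavigation-v0')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--fast-batch-size', type=int, default=20)
    parser.add_argument('--meta-batch-size', type=int, default=40)
    parser.add_argument('--num-batches', type=int, default=500)
    parser.add_argument('--num-workers', type=int, default=8)
    parser.add_argument('--hidden-size', type=int, default=100)
    parser.add_argument('--num-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--tau', type=float, default=1.0)
    parser.add_argument('--fast-lr', type=float, default=0.1)
    parser.add_argument('--first-order', action='store_true')
    parser.add_argument('--max-kl', type=float, default=1e-2)
    parser.add_argument('--cg-iters', type=int, default=10)
    parser.add_argument('--cg-damping', type=float, default=1e-5)
    parser.add_argument('--ls-max-steps', type=int, default=15)
    parser.add_argument('--ls-backtrack-ratio', type=float, default=0.8)
    parser.add_argument('--output-folder', type=str, default='maml')
    args = parser.parse_args()
    # The examples access args.device.type, so a torch.device is stored here.
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    return args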
Example #2
def main(args):
    # Setup for logging
    tb_writer = SummaryWriter('./logs/tb_{}'.format(
        args.log_name))  # Tensorboard logging
    log = set_log(args)

    # Setup before meta-train starts
    sampler = BatchSampler(env_name=args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers,
                           args=args)

    # NOTE: The observation space is a list [predator0, predator1, ..., prey],
    # hence the index 0 below.
    policy = NormalMLPPolicy(
        input_size=int(np.prod(sampler.envs.observation_space[0].shape)),
        output_size=int(np.prod(sampler.envs.action_space[0].shape)),
        hidden_sizes=(args.hidden_size, ) * args.num_layers)

    baseline = LinearFeatureBaseline(
        input_size=int(np.prod(sampler.envs.observation_space[0].shape)))

    meta_learner = MetaLearner(sampler,
                               policy,
                               baseline,
                               gamma=args.gamma,
                               fast_lr=args.fast_lr,
                               tau=args.tau,
                               device=args.device,
                               args=args,
                               log=log,
                               tb_writer=tb_writer)

    # meta_learner.load(
    #     filename="theta_200", directory="./pytorch_models")

    meta_tester = MetaTester(sampler,
                             policy,
                             baseline,
                             gamma=args.gamma,
                             fast_lr=args.fast_lr,
                             tau=args.tau,
                             device=args.device,
                             args=args,
                             log=log,
                             tb_writer=tb_writer)

    prey = Prey(env=sampler._env,
                args=args,
                log=log,
                tb_writer=tb_writer,
                name="prey",
                i_agent=0)

    # Meta-train starts
    iteration = 0
    while True:
        # Sample train and validation episode
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size,
                                     test=False)
        episodes = meta_learner.sample(tasks,
                                       prey,
                                       first_order=args.first_order,
                                       iteration=iteration)

        # Train meta-policy
        meta_learner.step(episodes=episodes, args=args)

        # Test meta-policy
        if iteration % 10 == 0:
            test_tasks = sampler.sample_tasks(num_tasks=5, test=True)
            meta_tester.few_shot_adaptation(meta_policy=meta_learner.policy,
                                            tasks=test_tasks,
                                            first_order=args.first_order,
                                            iteration=iteration,
                                            prey=prey)

        if iteration % 100 == 0:
            meta_learner.save(iteration)

        iteration += 1
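
Inside meta_learner.sample, each sampled task typically triggers one MAML inner-loop adaptation: collect train episodes, take a single policy-gradient step with the fast learning rate, then collect validation episodes with the adapted parameters. The helper below is a minimal sketch of that adaptation step under standard MAML assumptions, not the exact code of the MetaLearner used here; the first_order flag controls whether the graph is kept for the second-order meta-update.

from collections import OrderedDict

import torch


def adapt(policy, inner_loss, fast_lr=0.1, first_order=False):
    # One inner step: theta' = theta - fast_lr * grad_theta(inner_loss).
    named_params = OrderedDict(policy.named_parameters())
    grads = torch.autograd.grad(inner_loss,
                                list(named_params.values()),
                                create_graph=not first_order)
    # Keeping create_graph=True lets the outer (meta) update differentiate
    # through this step; the first-order approximation drops that graph.
    return OrderedDict(
        (name, param - fast_lr * grad)
        for (name, param), grad in zip(named_params.items(), grads))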
Example #3
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'Point2DWalls-corner-v0',
        'Ant-v0', 'HalfCheetah-v0'
    ])

    writer = SummaryWriter(log_dir=args.log_dir)

    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    json.dump(vars(args),
              open(os.path.join(
                  args.log_dir,
                  'params.json',
              ), 'w'),
              indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)

        logger.logkv('return_avg_pre',
                     total_rewards([ep.rewards for ep, _ in episodes]))
        logger.logkv('return_avg_post',
                     total_rewards([ep.rewards for _, ep in episodes]))
        logger.dumpkvs()
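
Every example aggregates episode returns with a total_rewards helper. In the reference pytorch-maml-rl code it averages, over tasks, the batch-aggregated sum of rewards per episode; a sketch along those lines, assuming each element of episodes_rewards is a tensor of shape (horizon, batch):

import torch


def total_rewards(episodes_rewards, aggregation=torch.mean):
    # Sum rewards over time, aggregate over the episodes of each task,
    # then average across tasks; return a float for logging.
    rewards = torch.mean(torch.stack([
        aggregation(torch.sum(rewards, dim=0))
        for rewards in episodes_rewards
    ], dim=0))
    return rewards.item()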
Example #4
def main(args):
    continuous_actions = (args.env_name in ['AntVel-v1', 'AntDir-v1',
        'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1',
        '2DNavigation-v0', '2DPointEnvCorner-v0'])

    save_folder = './saves/{0}'.format(args.env_name + '/' + args.output_folder)
    if args.output_folder != 'maml-trial' and args.output_folder != 'trial':
        i = 0
        while os.path.exists(save_folder):
            args.output_folder = str(i + 1)
            i += 1
            save_folder = './saves/{0}'.format(args.env_name + '/' + args.output_folder)
            log_directory = './logs/{0}'.format(args.env_name + '/' + args.output_folder)
        os.makedirs(save_folder)
    writer = SummaryWriter('./logs/{0}'.format(args.env_name + '/' + args.output_folder))

    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)


    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
        num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    if args.load_dir is not None:
        policy.load_state_dict(torch.load(args.load_dir))

    metalearner = MetaLearner(sampler, policy, baseline, args, gamma=args.gamma,
        fast_lr=args.fast_lr, tau=args.tau, device=args.device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
            cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps,
            ls_backtrack_ratio=args.ls_backtrack_ratio)

        print('total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch)
        print('total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch)
        
        # Plotting figure
        # plotting(episodes, batch, save_folder,args.num_plots)

        if args.load_dir is not None:
            sys.exit(0)
            
        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
            total_rewards([ep.rewards for ep, _ in episodes]), batch)
        writer.add_scalar('total_rewards/after_update',
            total_rewards([ep.rewards for _, ep in episodes]), batch)

        # Save policy network
        with open(os.path.join(save_folder,
                'policy-{0}.pt'.format(batch)), 'wb') as f:
            torch.save(policy.state_dict(), f)
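
This example can also resume from a checkpoint via --load_dir, and all of the examples periodically dump policy-{batch}.pt files. Below is a minimal sketch of restoring one of those checkpoints for evaluation; the import path and the sizes are assumptions and must match the training-time config.json.

import torch

from maml_rl.policies import NormalMLPPolicy  # assumed package layout

# Rebuild the policy with the same sizes used at training time (values here
# are illustrative), then restore the weights from a saved checkpoint.
obs_dim, act_dim = 2, 2            # e.g. 2DNavigation-v0
hidden_size, num_layers = 100, 2   # must match config.json
policy = NormalMLPPolicy(obs_dim, act_dim,
                         hidden_sizes=(hidden_size,) * num_layers)
checkpoint = './saves/2DNavigation-v0/0/policy-100.pt'  # illustrative path
policy.load_state_dict(torch.load(checkpoint, map_location='cpu'))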
Example #5
def main(args):
    set_random_seed(args.random)

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', '2DNavigationBiased-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.alg))
    save_folder = './saves/{0}'.format(args.alg)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers,
                           seed=args.random)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    if args.alg == 'simul':
        # vanilla maml
        metalearner = MetaLearner(sampler,
                                  policy,
                                  baseline,
                                  gamma=args.gamma,
                                  fast_lr=args.fast_lr,
                                  tau=args.tau,
                                  device=args.device)

        for batch in range(args.meta_policy_num * args.num_batches):
            # first sample tasks under the distribution
            tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
            # get episodes in the form of (train episodes, test episodes after adaption)
            episodes = metalearner.sample(tasks, first_order=args.first_order)
            metalearner.step(episodes,
                             max_kl=args.max_kl,
                             cg_iters=args.cg_iters,
                             cg_damping=args.cg_damping,
                             ls_max_steps=args.ls_max_steps,
                             ls_backtrack_ratio=args.ls_backtrack_ratio)

            # Tensorboard
            writer.add_scalar(
                'maml/before_update',
                total_rewards([ep.rewards for ep, _ in episodes]), batch)
            writer.add_scalar(
                'maml/after_update',
                total_rewards([ep.rewards for _, ep in episodes]), batch)

            # Save policy network
            with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                      'wb') as f:
                torch.save(policy.state_dict(), f)

    elif args.alg == 'greedy':
        # multi-policy maml
        metalearner = KPolicyMetaLearner(sampler,
                                         policy,
                                         baseline,
                                         args.meta_policy_num,
                                         gamma=args.gamma,
                                         fast_lr=args.fast_lr,
                                         tau=args.tau,
                                         device=args.device)

        # visualize the policies' behavior
        trajectories = []
        for policy_idx in range(args.meta_policy_num):
            print(policy_idx)
            metalearner.optimize_policy_index(policy_idx)

            for batch in range(args.num_batches):
                print('batch num %d' % batch)

                tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
                metalearner.evaluate_optimized_policies(tasks)

                episodes = metalearner.sample(tasks,
                                              first_order=args.first_order)
                # loss is computed inside, then update policies
                metalearner.step(episodes,
                                 max_kl=args.max_kl,
                                 cg_iters=args.cg_iters,
                                 cg_damping=args.cg_damping,
                                 ls_max_steps=args.ls_max_steps,
                                 ls_backtrack_ratio=args.ls_backtrack_ratio)

                # not sure what to write in tensorboard...
                for epIdx in range(len(episodes)):
                    writer.add_scalar(
                        'kmaml/pi_' + str(policy_idx) + '_task_' + str(epIdx),
                        total_rewards([episodes[epIdx][1].rewards]), batch)
            # use a random task (no update here anyway) to visualize meta-policies
            tasks = sampler.sample_tasks(num_tasks=1)
            trajectories.append(metalearner.sample_meta_policy(tasks[0]))
        plotTrajectories(trajectories)
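
plotTrajectories is not shown in this snippet. For a 2D navigation task it could be as simple as drawing the (x, y) positions visited by each meta-policy; the helper below is a hypothetical stand-in under that assumption, not the author's implementation.

import matplotlib.pyplot as plt
import numpy as np


def plotTrajectories(trajectories, filename='meta_policies.png'):
    # Hypothetical visualization: each trajectory is assumed to be an
    # array-like of (x, y) positions produced by one meta-policy.
    plt.figure()
    for idx, trajectory in enumerate(trajectories):
        trajectory = np.asarray(trajectory)
        plt.plot(trajectory[:, 0], trajectory[:, 1],
                 label='meta-policy {0}'.format(idx))
    plt.legend()
    plt.savefig(filename)
    plt.close()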
Example #6
baseline = LinearFeatureBaseline(
    int(np.prod(sampler.envs.observation_space.shape)))

metalearner = MetaLearner(sampler,
                          the_model,
                          baseline,
                          gamma=args.gamma,
                          fast_lr=args.fast_lr,
                          tau=args.tau,
                          device=args.device)

test_batch_size = 2
test_reward_before = []
test_reward_after = []

for test_batch in range(test_batch_size):
    #sample one task
    test_task = sampler.sample_tasks(num_tasks=1)
    print("test_task: ", test_task)
    sampler.reset_task(test_task[0])

    #sample some episodes for that task
    episodes = metalearner.sample(test_task, first_order=args.first_order)
    test_reward_before.append(total_rewards([ep.rewards
                                             for ep, _ in episodes]))
    test_reward_after.append(total_rewards([ep.rewards for _, ep in episodes]))

print("before:", test_reward_before, "; after: ", test_reward_after, "\n")
print("before average: ", np.mean(test_reward_before), "after average: ",
      np.mean(test_reward_after))
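
With only two test batches the averages printed above are noisy; here is a small, purely illustrative helper for summarizing the collected lists with mean, spread, and the average adaptation gain.

import numpy as np


def summarize(test_reward_before, test_reward_after):
    # Mean/std of the returns gathered in the loop above, plus the
    # average before-to-after improvement from adaptation.
    before = np.asarray(test_reward_before)
    after = np.asarray(test_reward_after)
    print('before: {:.2f} +/- {:.2f}'.format(before.mean(), before.std()))
    print('after:  {:.2f} +/- {:.2f}'.format(after.mean(), after.std()))
    print('mean improvement: {:.2f}'.format((after - before).mean()))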
Example #7
def main(args):
    save_folder = f'saves/{args.output_folder + get_date_str()}'
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    print('Initializing samplers...')

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

    test_sampler = BatchSampler(args.env_name,
                                test_env=True,
                                batch_size=args.fast_batch_size,
                                num_workers=max(1, args.num_workers // 2))

    policy = NormalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        int(np.prod(sampler.envs.action_space.shape)),
        hidden_sizes=(args.hidden_size, ) * args.num_layers)

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    print('Initializing meta-learners...')

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)  # noqa: E128

    # NOTE: we need this metalearner only to sample test tasks
    test_metalearner = MetaLearner(test_sampler,
                                   policy,
                                   baseline,
                                   gamma=args.gamma,
                                   fast_lr=args.fast_lr,
                                   tau=args.tau,
                                   device=args.device)  # noqa: E128

    print('Starting the training')

    # Initialize logging
    wandb.init()
    wandb.config.update(args)

    task_name2id = {name: i for i, name in enumerate(sampler._env._task_names)}
    task_id2name = sampler._env._task_names
    task2prob = np.ones(sampler._env.num_tasks) / sampler._env.num_tasks
    uniform = np.ones_like(task2prob) / sampler._env.num_tasks

    # outer loop (meta-training)
    for i in range(args.num_batches):
        print(f'Batch {i}')

        # sample trajectories from random tasks
        print(f'\tSampling a batch of {args.meta_batch_size} training tasks')
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size,
                                     task2prob=0.99 * task2prob +
                                     0.01 * uniform)
        # Note: Dirty hack to overcome metaworld dirty hack
        task_names = [sampler._env._task_names[t['task']] for t in tasks]

        # inner loop (adaptation)
        # returns list of tuples (train_episodes, valid_episodes)
        print(f'\tTraining')
        episodes = metalearner.sample(tasks, first_order=args.first_order)

        print(f'\tUpdating the meta-model')
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Logging
        # before: before parameters update
        # after: after parameters adaptation to the task

        r_before = total_rewards([ep.rewards for ep, _ in episodes])
        r_after = total_rewards([ep.rewards for _, ep in episodes])

        test_episode_infos = [ep._info_list for ep, _ in episodes]
        success_rate_before, task_success_rate_before = get_success_rate(
            test_episode_infos, task_names, per_task=True)

        test_episode_infos = [ep._info_list for _, ep in episodes]
        success_rate_after, task_success_rate_after = get_success_rate(
            test_episode_infos, task_names, per_task=True)

        wandb.log(
            {
                'total_rewards/before_update':
                r_before,
                'total_rewards/after_update':
                r_after,
                'success_rate/before_update':
                success_rate_before,
                'success_rate/after_update':
                success_rate_after,
                'success_rate/improvement':
                success_rate_after - success_rate_before,
                'success_rate/before_update_macro':
                np.mean(list(task_success_rate_before.values())),
                'success_rate/after_update_macro':
                np.mean(list(task_success_rate_after.values())),
            },
            step=i)
        wandb.log(
            {
                f'success_rate/after_update/{task}': rate
                for task, rate in task_success_rate_after.items()
            },
            step=i)
        wandb.log(
            {
                f'success_rate/before_update/{task}': rate
                for task, rate in task_success_rate_before.items()
            },
            step=i)
        wandb.log(
            {
                f'success_rate/improvement/{task}':
                task_success_rate_after[task] - task_success_rate_before[task]
                for task in task_success_rate_before.keys()
            },
            step=i)
        wandb.log(
            {
                f'n_acquired_tasks/before_update/at_{x}': sum(
                    rate > x for rate in task_success_rate_before.values())
                for x in [0.001, 0.01, 0.05, 0.1, 0.5]
            },
            step=i)
        wandb.log(
            {
                f'n_acquired_tasks/after_update/at_{x}': sum(
                    rate > x for rate in task_success_rate_after.values())
                for x in [0.001, 0.01, 0.05, 0.1, 0.5]
            },
            step=i)

        if args.active_learning:
            new_task2prob = np.zeros_like(task2prob)

            if args.prob_f == 'linear':
                norm = 1e-7 + sum(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    new_task2prob[task_id] = 1. - rate / norm

            elif args.prob_f == 'softmax':  # softmax(1 - rate)
                # numerical stability trick
                # http://cs231n.github.io/linear-classify/#softmax
                max_f = 1 - min(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    f = 1 - rate
                    new_task2prob[task_id] = np.exp(
                        (f - max_f) / args.temperature)

                new_task2prob = new_task2prob / (1e-7 + sum(new_task2prob))

            elif args.prob_f == 'softmax2':  # 1 - softmax(rate)
                max_f = max(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    new_task2prob[task_id] = np.exp(
                        (rate - max_f) / args.temperature)

                new_task2prob = 1. - new_task2prob / (1e-7 +
                                                      sum(new_task2prob))
            else:
                raise RuntimeError(
                    'prob-f should be either "softmax", "softmax2" or "linear"'
                )

            alpha = args.success_rate_smoothing
            task2prob = alpha * task2prob + (1 - alpha) * new_task2prob

            task2prob /= sum(task2prob)
            assert all(task2prob > 0)  # strictly!

            wandb.log(
                {
                    f'task2prob/{task_id2name[task_id]}': prob
                    for task_id, prob in enumerate(task2prob)
                },
                step=i)

        # meta-test
        if i % args.eval_every == 0:
            print(f'Evaluating on meta-test')

            # save policy network
            _save_path = os.path.join(save_folder, 'policy-{0}.pt'.format(i))
            with open(_save_path, 'wb') as f:
                torch.save(policy.state_dict(), f)
            wandb.save(_save_path)

            # Evaluate on meta-test
            tasks = test_sampler.sample_tasks(num_tasks=2 *
                                              args.meta_batch_size)
            # Note: Dirty hack to overcome metaworld dirty hack
            task_names = [
                test_sampler._env._task_names[t['task']] for t in tasks
            ]

            episodes = test_metalearner.sample(tasks,
                                               first_order=args.first_order)

            r_before = total_rewards([ep.rewards for ep, _ in episodes])
            r_after = total_rewards([ep.rewards for _, ep in episodes])

            test_episode_infos = [ep._info_list for ep, _ in episodes]
            success_rate_before, task_success_rate_before = get_success_rate(
                test_episode_infos, task_names, per_task=True)

            test_episode_infos = [ep._info_list for _, ep in episodes]
            success_rate_after, task_success_rate_after = get_success_rate(
                test_episode_infos, task_names, per_task=True)

            wandb.log(
                {
                    'total_rewards_test/before_update':
                    r_before,
                    'total_rewards_test/after_update':
                    r_after,
                    'success_rate_test/before_update':
                    success_rate_before,
                    'success_rate_test/after_update':
                    success_rate_after,
                    'success_rate_test/improvement':
                    success_rate_after - success_rate_before
                },
                step=i)
            wandb.log(
                {
                    f'success_rate_test/after_update/{task}': rate
                    for task, rate in task_success_rate_after.items()
                },  # noqa: E501
                step=i)
            wandb.log(
                {
                    f'success_rate_test/before_update/{task}': rate
                    for task, rate in task_success_rate_before.items()
                },  # noqa: E501
                step=i)
            wandb.log(
                {
                    f'success_rate_test/improvement/{task}':
                    task_success_rate_after[task] -
                    task_success_rate_before[task]
                    for task in task_success_rate_before.keys()
                },
                step=i)

    print('Saving the final model')
    # save final policy
    _save_path = os.path.join(save_folder, 'policy-final.pt')
    with open(_save_path, 'wb') as f:
        torch.save(policy.state_dict(), f)
    wandb.save(_save_path)
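
The active-learning branch above reweights the task distribution by post-adaptation success rate. Isolated as a standalone function, the 'softmax' variant looks roughly like this (names follow the example; anything else is a sketch):

import numpy as np


def softmax_task_probs(task_success_rate, task_name2id, num_tasks,
                       temperature=1.0):
    # softmax over f = 1 - success_rate, with the usual max-subtraction
    # trick for numerical stability; tasks that fail more often get
    # proportionally more sampling probability.
    new_task2prob = np.zeros(num_tasks)
    max_f = 1.0 - min(task_success_rate.values())
    for task, rate in task_success_rate.items():
        f = 1.0 - rate
        new_task2prob[task_name2id[task]] = np.exp((f - max_f) / temperature)
    return new_task2prob / (1e-7 + new_task2prob.sum())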
Example #8
def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.manual_seed(args.seed)

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(
        sampler,
        policy,
        baseline,
        gamma=args.gamma,
        fast_lr=args.fast_lr,
        tau=args.tau,
        q_inner=args.inner_q == 'true',
        q_residuce_gradient=args.inner_q_residue_gradient == 'true',
        q_soft=args.inner_q_soft == 'true',
        q_soft_temp=args.inner_q_soft_temp,
        device=args.device,
    )

    for batch in range(args.num_batches):
        if args.device.type == 'cuda':
            torch.cuda.empty_cache()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes, adaptation_info = metalearner.sample(
            tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        pre_update_rewards = total_rewards([ep.rewards for ep, _ in episodes])
        post_update_rewards = total_rewards([ep.rewards for _, ep in episodes])

        writer.add_scalar('total_rewards/before_update', pre_update_rewards,
                          batch)
        writer.add_scalar('total_rewards/after_update', post_update_rewards,
                          batch)
        writer.add_scalar('total_rewards/rewards_improvement',
                          post_update_rewards - pre_update_rewards, batch)

        writer.add_scalar('adaptation/pre_update_inner_loss',
                          adaptation_info.mean_pre_update_loss, batch)
        writer.add_scalar('adaptation/post_update_inner_loss',
                          adaptation_info.mean_post_update_loss, batch)
        writer.add_scalar('adaptation/inner_loss_improvement',
                          adaptation_info.mean_loss_improvment, batch)
        writer.add_scalar('adaptation/weight_change',
                          adaptation_info.mean_weight_change, batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
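
The seeding boilerplate at the top of this example recurs across several of the scripts; gathered into one helper it might look like the sketch below (the examples themselves call the underlying functions directly).

import random

import numpy as np
import torch


def set_seed(seed):
    # Same calls as at the top of this example, collected in one place.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)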
Example #9
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearnerNGLVCVPG(sampler,
                                      policy,
                                      baseline,
                                      gamma=args.gamma,
                                      fast_lr=args.fast_lr,
                                      tau=args.tau,
                                      device=args.device,
                                      verbose=args.verbose)

    tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
    for batch in range(args.num_batches):
        start = time.time()
        episodes = metalearner.sample(tasks,
                                      first_order=args.first_order,
                                      cg_iters=args.cg_iters)
        sample_time = time.time() - start
        start = time.time()
        if args.optimizer == 'sgd':
            metalearner.step_sgd(episodes,
                                 max_kl=args.max_kl,
                                 cg_iters=args.cg_iters,
                                 cg_damping=args.cg_damping,
                                 ls_max_steps=args.ls_max_steps,
                                 ls_backtrack_ratio=args.ls_backtrack_ratio)
        else:
            metalearner.step_adam(episodes,
                                  max_kl=args.max_kl,
                                  cg_iters=args.cg_iters,
                                  cg_damping=args.cg_damping,
                                  ls_max_steps=args.ls_max_steps,
                                  ls_backtrack_ratio=args.ls_backtrack_ratio)
        update_time = time.time() - start

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)

        print(
            "Batch {}. before_update: {}, after_update: {}\n sample time {}, update_time {}"
            .format(batch, total_rewards([ep.rewards for ep, _ in episodes]),
                    total_rewards([ep.rewards for _, ep in episodes]),
                    sample_time, update_time))
        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)