Example #1
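# Demonstrates adapting to a single new task: reset the environment to the
# sampled task, collect training episodes, take one inner-loop gradient step
# with metalearner.adapt, and collect validation episodes with the adapted
# parameters. Assumes `env`, `test_task`, `the_model`, `metalearner`, `args`,
# `episodes`, and `numpy as np` are already defined in the surrounding script.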
print("new task: ", test_task[0], ", where 1 is forward")

#task = env.unwrapped.sample_tasks(1)
env.unwrapped.reset_task(test_task[0])
observations = env.reset()
print("new task: ", env.step([1])[3]['task'], ", where 1 is forward")
_theta = env.step([1])[3]['task']
degrees = 180 * _theta['theta'] / np.pi
print("new task in degrees: ", degrees)

train_episodes = metalearner.sampler.sample(the_model,
                                            gamma=args.gamma,
                                            device=args.device)
print("len of train episoid: ", len(train_episodes))
print(train_episodes)
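# One inner-loop adaptation step on the freshly sampled training episodes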
params = metalearner.adapt(train_episodes, first_order=args.first_order)
valid_episodes = metalearner.sampler.sample(the_model,
                                            params=params,
                                            gamma=args.gamma,
                                            device=args.device)
episodes.append((train_episodes, valid_episodes))

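# Roll out the policy once before (param=None) and once after (param=params)
# the adaptation step to compare behaviour on the new task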
for param in [None, params]:
    for i in range(1):
        observations = env.reset()
        if param is None:
            print("New episode before gradient update")
        else:
            print("New episode after one gradient update")

        rewards = 0
Example #2
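# Trains a single "pretrained" policy across sampled tasks as a baseline for
# the meta-learned policy. Assumes the helpers used below (BatchSampler,
# NormalMLPPolicy, CategoricalMLPPolicy, LinearFeatureBaseline, MetaLearner)
# are imported from the surrounding MAML-RL codebase, along with os, json,
# time, numpy as np, and torch.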
def train_pretrained_model(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0'
    ])

    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder + '_pretrained'))
    save_folder = './saves/{0}'.format(args.output_folder + '_pretrained')
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    # batch_size = 2 * args.fast_batch_size to match the amount of data used in meta-learning
    sampler = BatchSampler(args.env_name,
                           batch_size=2 * args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    # Load a previously saved policy so training can resume from args.start_from_batch
    cont_from_batch = 0
    if args.start_from_batch != -1:
        pretrained_model = os.path.join(
            save_folder, 'policy-{0}.pt'.format(args.start_from_batch - 1))
        if os.path.exists(pretrained_model):
            policy.load_state_dict(torch.load(pretrained_model))
            cont_from_batch = args.start_from_batch

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)

    for batch in range(cont_from_batch, args.num_batches):
        print('Currently processing Batch: {}'.format(batch + 1))

        task_sampling_time = time.time()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        task_sampling_time = time.time() - task_sampling_time

        episode_generating_time = time.time()
        episodes = metalearner.sample_for_pretraining(
            tasks, first_order=args.first_order)
        episode_generating_time = time.time() - episode_generating_time

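        # Single policy-gradient update on the pooled episodes; the adapted
        # parameters are loaded straight back into the policy, so there is no
        # separate meta-update in this pretraining loop.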
        learning_step_time = time.time()
        params = metalearner.adapt(episodes, first_order=args.first_order)
        metalearner.policy.load_state_dict(params, strict=True)
        learning_step_time = time.time() - learning_step_time

        print('Task Sampling Time: {}'.format(task_sampling_time))
        print('Episode Generating Time: {}'.format(episode_generating_time))
        print('Learning Step Time: {}'.format(learning_step_time))

        # Tensorboard
        # writer.add_scalar('total_rewards/before_update',
        #     total_rewards([ep.rewards for ep, _ in episodes]), batch)
        # writer.add_scalar('total_rewards/after_update',
        #     total_rewards([ep.rewards for _, ep in episodes]), batch)
        # experiment.log_metric("Avg Disc Reward (Pretrained)", total_rewards([episodes.rewards], args.gamma), batch+1)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(metalearner.policy.state_dict(), f)

    return
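
A minimal sketch of how train_pretrained_model could be invoked from the
command line, assuming only the argument names read off the code above; the
default values shown are placeholders, not the original project's settings.

import argparse

import torch

parser = argparse.ArgumentParser(description='Pretrain a single policy across tasks')
parser.add_argument('--env-name', type=str, default='HalfCheetahDir-v1')
parser.add_argument('--output-folder', type=str, default='halfcheetah-dir')
parser.add_argument('--fast-batch-size', type=int, default=20)
parser.add_argument('--meta-batch-size', type=int, default=40)
parser.add_argument('--num-batches', type=int, default=500)
parser.add_argument('--start-from-batch', type=int, default=-1)
parser.add_argument('--hidden-size', type=int, default=100)
parser.add_argument('--num-layers', type=int, default=2)
parser.add_argument('--gamma', type=float, default=0.95)
parser.add_argument('--fast-lr', type=float, default=0.1)
parser.add_argument('--tau', type=float, default=1.0)
parser.add_argument('--first-order', action='store_true')
parser.add_argument('--num-workers', type=int, default=8)
args = parser.parse_args()

# train_pretrained_model reads args.device.type, so pass a torch.device
args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_pretrained_model(args)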