Example No. 1
def main(args):
    continuous_actions = True

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)
    for batch in range(args.num_batches):
        print("========== BATCH NUMBER {0} ==========".format(batch))
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)

        # Save policy network
        with open(
                os.path.join(save_folder, 'policy-{0}.pt'.format(batch + 256)),
                'wb') as f:
            torch.save(policy.state_dict(), f)
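
All of these main functions read their hyperparameters from an argparse-style args namespace that is built elsewhere. As a rough illustration only, the sketch below shows how such a namespace could be assembled for Example No. 1; the attribute names are taken from the code above, but the flag spellings and default values are assumptions rather than the original script's.

import argparse
import torch

def parse_args():
    # Illustrative parser: the destination names match the `args.*` attributes
    # used above, but the flags and defaults are assumptions.
    parser = argparse.ArgumentParser(description='MAML-RL (illustrative arguments)')
    parser.add_argument('--env-name', type=str, default='2DNavigation-v0')
    parser.add_argument('--output-folder', type=str, default='maml')
    parser.add_argument('--fast-batch-size', type=int, default=20)
    parser.add_argument('--meta-batch-size', type=int, default=40)
    parser.add_argument('--num-batches', type=int, default=200)
    parser.add_argument('--num-workers', type=int, default=4)
    parser.add_argument('--hidden-size', type=int, default=100)
    parser.add_argument('--num-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--tau', type=float, default=1.0)
    parser.add_argument('--fast-lr', type=float, default=0.1)
    parser.add_argument('--first-order', action='store_true')
    parser.add_argument('--max-kl', type=float, default=1e-2)
    parser.add_argument('--cg-iters', type=int, default=10)
    parser.add_argument('--cg-damping', type=float, default=1e-5)
    parser.add_argument('--ls-max-steps', type=int, default=15)
    parser.add_argument('--ls-backtrack-ratio', type=float, default=0.8)
    parser.add_argument('--device', type=str, default='cpu')
    args = parser.parse_args()
    # main() reads args.device.type, so convert the string to a torch.device
    args.device = torch.device(args.device)
    return args

if __name__ == '__main__':
    main(parse_args())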
Example No. 2
def main(args):
    continuous_actions = (args.env_name in [
        'AntVelEnv-v1', 'AntDirEnv-v1', 'HalfCheetahVelEnv-v1',
        'HalfCheetahDirEnv-v1', '2DNavigation-v0'
    ])

    save_folder = os.path.join('tmp', args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    # Load model
    with open(args.model, 'rb') as f:
        state_dict = torch.load(f)
        policy.load_state_dict(state_dict)

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)

    args.meta_batch_size = 81
    # velocities = np.linspace(-1., 3., num=args.meta_batch_size)
    # tasks = [{'velocity': velocity} for velocity in velocities]
    tasks = [{'direction': direction} for direction in [-1, 1]]

    for batch in range(args.num_batches):
        episodes = metalearner.sample(tasks)
        train_returns = [ep.rewards.sum(0).cpu().numpy() for ep, _ in episodes]
        valid_returns = [ep.rewards.sum(0).cpu().numpy() for _, ep in episodes]

        with open(os.path.join(save_folder, '{0}.npz'.format(batch)),
                  'wb') as f:
            np.savez(f, train=train_returns, valid=valid_returns)
        print('Batch {0}'.format(batch))
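
Each of the .npz files written above stores the pre-adaptation and post-adaptation returns of one batch. A minimal sketch of reading one back (the folder name is illustrative; the train and valid keys come from the np.savez call above):

import os
import numpy as np

save_folder = os.path.join('tmp', 'my_output_folder')   # illustrative path
data = np.load(os.path.join(save_folder, '0.npz'))
train_returns = data['train']   # returns before adaptation, one row per task
valid_returns = data['valid']   # returns after adaptation
print(train_returns.mean(), valid_returns.mean())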
Example No. 3
def clone_policy(policy, params=None, with_names=False):

    if params is None:
        params = policy.get_trainable_variables()

    if isinstance(policy, CategoricalMLPPolicy):
        cloned_policy = CategoricalMLPPolicy(input_size=policy.input_size,
                                             output_size=policy.output_size,
                                             hidden_sizes=policy.hidden_sizes,
                                             nonlinearity=policy.nonlinearity)
    elif isinstance(policy, NormalMLPPolicy):
        cloned_policy = NormalMLPPolicy(input_size=policy.input_size,
                                        output_size=policy.output_size,
                                        hidden_sizes=policy.hidden_sizes,
                                        nonlinearity=policy.nonlinearity)
    else:
        raise NotImplementedError('Only `Categorical` and `Normal` '
                                  'policies are valid policies at the moment.')

    #x = tf.zeros(shape=(1, cloned_policy.input_size))
    #cloned_policy(x)

    if with_names:
        cloned_policy.set_params_with_name(params)
    else:
        cloned_policy.set_params(params)

    return cloned_policy
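
A hedged usage sketch for clone_policy: the constructor keywords (input_size, output_size, hidden_sizes, nonlinearity) are taken from the function itself, while the concrete sizes and the tf.nn.relu activation are assumptions for illustration. Cloning yields an independent network of the same architecture, initialised with the source policy's current parameters:

import tensorflow as tf

source = NormalMLPPolicy(input_size=10, output_size=2,
                         hidden_sizes=(100, 100),
                         nonlinearity=tf.nn.relu)   # activation type is an assumption
adapted = clone_policy(source, with_names=True)     # fresh network, same parameter values
assert adapted is not source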
Example No. 4
def main(args):

	args.output_folder = args.env_name

	# TODO
	continuous_actions = (args.env_name in ['AntVel-v1', 'AntDir-v1',
	                                        'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1',
	                                        '2DNavigation-v0'])

	# writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
	save_folder = './saves/{0}'.format(args.output_folder)
	if not os.path.exists(save_folder):
		os.makedirs(save_folder)

	with open(os.path.join(save_folder, 'config.json'), 'w') as f:
		# config = {k: v for (k, v) in vars(args).iteritems() if k != 'device'}
		config = {k: v for (k, v) in vars(args).items() if k != 'device'}
		config.update(device=args.device.type)
		json.dump(config, f, indent=2)
		print(config)

	sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers)

	if continuous_actions:
		policy = NormalMLPPolicy(
			int(np.prod(sampler.envs.observation_space.shape)), # input shape
			int(np.prod(sampler.envs.action_space.shape)), # output shape
			hidden_sizes=(args.hidden_size,) * args.num_layers) # [100, 100]
	else:
		policy = CategoricalMLPPolicy(
			int(np.prod(sampler.envs.observation_space.shape)),
			sampler.envs.action_space.n,
			hidden_sizes=(args.hidden_size,) * args.num_layers)

	baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape)))

	metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
	                          fast_lr=args.fast_lr, tau=args.tau, device=args.device)

	for batch in range(args.num_batches): # number of epochs

		tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
		episodes = metalearner.sample(tasks, first_order=args.first_order)

		metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
		                 cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps,
		                 ls_backtrack_ratio=args.ls_backtrack_ratio)

		# Tensorboard
		# writer.add_scalar('total_rewards/before_update',
		#                   total_rewards([ep.rewards for ep, _ in episodes]), batch)
		# writer.add_scalar('total_rewards/after_update',
		#                   total_rewards([ep.rewards for _, ep in episodes]), batch)


		# # Save policy network
		# with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f:
		# 	torch.save(policy.state_dict(), f)

		print(batch, total_rewards([ep.rewards for ep, _ in episodes]), total_rewards([ep.rewards for _, ep in episodes]))
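
The total_rewards helper these examples log with is not shown. One plausible minimal implementation is sketched below: it sums rewards over time to obtain each trajectory's return and then averages over all trajectories and tasks. Some examples use variants (Example No. 7 passes an extra gamma argument and Example No. 9 unpacks three reward components), so this is an assumption, not the original helper.

import torch

def total_rewards(episodes_rewards):
    # episodes_rewards: one tensor per task, shaped (horizon, num_trajectories),
    # holding per-step rewards.  Sum over time, then average over everything.
    returns = torch.cat([rewards.sum(dim=0) for rewards in episodes_rewards])
    return returns.mean().item()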
Example No. 5
def get_policy_for_env(env, hidden_sizes=(100, 100), nonlinearity='relu'):
    continuous_actions = isinstance(env.action_space, gym.spaces.Box)
    input_size = get_input_size(env)
    nonlinearity = getattr(torch, nonlinearity)

    if continuous_actions:
        output_size = reduce(mul, env.action_space.shape, 1)
        policy = NormalMLPPolicy(input_size,
                                 output_size,
                                 hidden_sizes=tuple(hidden_sizes),
                                 nonlinearity=nonlinearity)
    else:
        output_size = env.action_space.n
        policy = CategoricalMLPPolicy(input_size,
                                      output_size,
                                      hidden_sizes=tuple(hidden_sizes),
                                      nonlinearity=nonlinearity)
    return policy
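
A hedged usage sketch: get_policy_for_env only inspects the environment's action space and its flattened observation size (get_input_size is assumed to return the latter), so any Gym environment can be passed in; the environment ids below are purely illustrative.

import gym

# Box action space -> Gaussian policy head (NormalMLPPolicy).
continuous_env = gym.make('Pendulum-v0')
policy = get_policy_for_env(continuous_env, hidden_sizes=(100, 100), nonlinearity='relu')

# Discrete action space -> CategoricalMLPPolicy.
discrete_env = gym.make('CartPole-v1')
policy = get_policy_for_env(discrete_env, hidden_sizes=(64, 64), nonlinearity='tanh')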
Example No. 6
def get_policy_for_env(env, hidden_sizes=(100, 100), nonlinearity='relu'):
    # Decide whether this is a continuous-control or a discrete-control problem
    continuous_actions = isinstance(env.action_space, gym.spaces.Box)
    input_size = get_input_size(env)  # input size: product of the observation-space dimensions
    nonlinearity = getattr(torch, nonlinearity)  # look up the attribute by name; here it resolves to a torch function

    if continuous_actions:
        output_size = reduce(mul, env.action_space.shape, 1)  # output size: product of the action-space dimensions
        policy = NormalMLPPolicy(input_size,
                                 output_size,
                                 hidden_sizes=tuple(hidden_sizes),
                                 nonlinearity=nonlinearity)
    else:
        output_size = env.action_space.n
        policy = CategoricalMLPPolicy(input_size,
                                      output_size,
                                      hidden_sizes=tuple(hidden_sizes),
                                      nonlinearity=nonlinearity)
    return policy
Example No. 7
def train_meta_learning_model(args):
    # import matplotlib.pyplot as plt
    # import matplotlib.animation as animation
    # from matplotlib import style

    # style.use('fivethirtyeight')
    # fig = plt.figure()
    # ax1 = fig.add_subplot(1,1,1)
    # xs = []
    # ys = []
    # def animate(i):
    #     ax1.clear()
    #     ax1.plot(xs, ys)
    rewards_before_ml = []
    rewards_after_ml = []

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0'
    ])

    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder + '_metalearned'))
    save_folder = './saves/{0}'.format(args.output_folder + '_metalearned')
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    torch.manual_seed(args.random_seed)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    #load pretrained model
    cont_from_batch = 0
    if args.start_from_batch != -1:
        metalearned_model = os.path.join(
            save_folder, 'policy-{0}.pt'.format(args.start_from_batch - 1))
        if os.path.exists(metalearned_model):
            policy.load_state_dict(torch.load(metalearned_model))
            cont_from_batch = args.start_from_batch

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)

    for batch in range(cont_from_batch, args.num_batches):
        print('Currently processing Batch: {}'.format(batch + 1))

        task_sampling_time = time.time()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size,
                                     sampling_type=args.sampling_type,
                                     points_per_dim=args.points_per_dim)
        task_sampling_time = time.time() - task_sampling_time

        episode_generating_time = time.time()
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        episode_generating_time = time.time() - episode_generating_time

        learning_step_time = time.time()
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        learning_step_time = time.time() - learning_step_time

        print('Task Sampling Time: {}'.format(task_sampling_time))
        print('Episode Generating Time: {}'.format(episode_generating_time))
        print('Learning Step Time: {}'.format(learning_step_time))
        reward_before_ml = total_rewards([ep.rewards for ep, _ in episodes],
                                         args.gamma)
        reward_after_ml = total_rewards([ep.rewards for _, ep in episodes],
                                        args.gamma)
        print('Before Update: {} After Update: {}'.format(
            reward_before_ml, reward_after_ml))
        # experiment.log_metric("Avg Reward Before Update (MetaLearned)", reward_before_ml)
        experiment.log_metric("Avg Reward", reward_after_ml, batch + 1)

        rewards_before_ml.append(reward_before_ml)
        rewards_after_ml.append(reward_after_ml)
        # xs.append(batch+1)
        # ys.append(total_rewards([ep.rewards for _, ep in episodes], args.gamma))
        # ani = animation.FuncAnimation(fig, animate, interval=1000)
        # plt.savefig('navg_baseline_monitor')
        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(metalearner.policy.state_dict(), f)

    # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
    # episodes = metalearner.sample(tasks, first_order=args.first_order)
    # print("Avg Reward After Update (MetaLearned)", total_rewards([ep.rewards for _, ep in episodes], args.gamma))

    testing_sampler = BatchSampler(args.env_name,
                                   batch_size=args.testing_fbs,
                                   num_workers=args.num_workers)
    testing_metalearner = MetaLearner(testing_sampler,
                                      metalearner.policy,
                                      baseline,
                                      gamma=args.gamma,
                                      fast_lr=args.fast_lr,
                                      tau=args.tau,
                                      device=args.device)
    test_tasks = testing_sampler.sample_tasks(num_tasks=args.testing_mbs,
                                              sampling_type='rand',
                                              points_per_dim=-1)
    test_episodes = testing_metalearner.sample(test_tasks,
                                               first_order=args.first_order,
                                               no_update=True)
    test_reward = total_rewards([ep.rewards for ep in test_episodes],
                                args.gamma)
    print('-------------------------------------------------')
    print('Test Time reward is: ' + str(test_reward))
    print('-------------------------------------------------')

    pickle_reward_data_file = os.path.join(save_folder, 'reward_data.pkl')
    with open(pickle_reward_data_file, 'wb') as f:
        pickle.dump(rewards_before_ml, f)
        pickle.dump(rewards_after_ml, f)

    pickle_final_reward_file = os.path.join(save_folder, 'final_reward.pkl')
    with open(pickle_final_reward_file, 'wb') as f:
        pickle.dump(test_reward, f)

    return
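
Because reward_data.pkl is written with two consecutive pickle.dump calls, it must be read back with two matching pickle.load calls on the same open file. A minimal sketch (the folder path is illustrative):

import os
import pickle

save_folder = './saves/my_run_metalearned'   # illustrative path
with open(os.path.join(save_folder, 'reward_data.pkl'), 'rb') as f:
    rewards_before_ml = pickle.load(f)   # first object that was dumped
    rewards_after_ml = pickle.load(f)    # second object that was dumped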
Example No. 8
def main(args):
    logging.basicConfig(filename=args.debug_file,
                        level=logging.WARNING,
                        filemode='w')
    logging.getLogger('metalearner').setLevel(logging.INFO)

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'PendulumTheta-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    #if args.baseline == 'critic shared':
    #    policy = NormalMLPPolicyA2C(int(np.prod(sampler.envs.observation_space.shape)),
    #        int(np.prod(sampler.envs.action_space.shape)),
    #        hidden_sizes=(args.hidden_size,) * args.num_layers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)

    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)

    if args.baseline == 'linear':
        baseline = LinearFeatureBaseline(
            int(np.prod(sampler.envs.observation_space.shape)))
    elif args.baseline == 'critic separate':
        baseline = CriticFunction(
            int(np.prod(sampler.envs.observation_space.shape)),
            1,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    #elif args.baseline == 'critic shared':
    # RANJANI TO DO

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device,
                              baseline_type=args.baseline,
                              cliprange=args.cliprange,
                              noptepochs=args.noptepochs,
                              usePPO=args.usePPO,
                              nminibatches=args.nminibatches,
                              ppo_lr=args.ppo_lr,
                              useSGD=args.useSGD,
                              ppo_momentum=args.ppo_momentum,
                              grad_clip=args.grad_clip)

    for batch in range(args.num_batches):
        print("*********************** Batch: " + str(batch) +
              "  ****************************")

        print("Creating tasks...")
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)

        print("Creating episodes...")
        episodes, grad_norm = metalearner.sample(tasks,
                                                 first_order=args.first_order)

        print("Taking a meta step...")
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        print("Writing results to tensorboard...")
        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)

        if grad_norm:
            writer.add_scalar('PPO mb grad norm', np.average(grad_norm), batch)
            print(np.average(grad_norm))

        print("Saving policy network...")
        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
        print("***************************************************")
Example No. 9
def main(args):
    env_name = 'RVONavigationAll-v0'  #['2DNavigation-v0', 'RVONavigation-v0',  'RVONavigationAll-v0']
    test_folder = './{0}'.format('test_nav')
    fast_batch_size = 40  # number of trajectories
    saved_policy_file = os.path.join(
        './TrainingResults/result3/saves/{0}'.format('maml-2DNavigation-dir'),
        'policy-180.pt')

    sampler = BatchSampler(env_name, batch_size=fast_batch_size, num_workers=3)
    policy = NormalMLPPolicy(int(np.prod(
        sampler.envs.observation_space.shape)),
                             int(np.prod(sampler.envs.action_space.shape)),
                             hidden_sizes=(100, ) * 2)

    # Loading policy
    if os.path.isfile(saved_policy_file):
        policy_info = torch.load(saved_policy_file,
                                 map_location=lambda storage, loc: storage)
        policy.load_state_dict(policy_info)
        print('Loaded saved policy')
    else:
        sys.exit("The requested policy does not exist for loading")

    # Creating test folder
    if not os.path.exists(test_folder):
        os.makedirs(test_folder)

    # Generate tasks
    # goal = [[-0.8, 0.9]]
    # task = [{'goal': goal}][0]
    tasks = sampler.sample_tasks(num_tasks=1)
    task = tasks[0]

    # Start validation
    print("Starting to test...Total step = ", args.grad_steps)
    start_time = time.time()
    # baseline = LinearFeatureBaseline(int(np.prod(sampler.envs.observation_space.shape)))
    baseline = LinearFeatureBaseline(int(np.prod((2, ))))
    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=0.9,
                              fast_lr=0.01,
                              tau=0.99,
                              device='cpu')

    # test_episodes = metalearner.sample(tasks)
    # for train, valid in test_episodes:
    #     total_reward, dist_reward, col_reward = total_rewards(train.rewards)
    #     print(total_reward)
    #     total_reward, dist_reward, col_reward = total_rewards(valid.rewards)
    #     print(total_reward)

    test_episodes = metalearner.test(task, n_grad=args.grad_steps)
    print('-------------------')
    for n_grad, ep in test_episodes:
        total_reward, dist_reward, col_reward = total_rewards(ep.rewards)
        print(total_reward)
    #     with open(os.path.join(test_folder, 'test_episodes_grad'+str(n_grad)+'.pkl'), 'wb') as f:
    #         pickle.dump([ep.observations.cpu().numpy(), ep], f)

    # with open(os.path.join(test_folder, 'task.pkl'), 'wb') as f:
    #     pickle.dump(task, f)
    print('Finished test. Time elapsed = {}'.format(
        time_elapsed(time.time() - start_time)))
Example No. 10
def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.manual_seed(args.seed)

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(
        sampler,
        policy,
        baseline,
        gamma=args.gamma,
        fast_lr=args.fast_lr,
        tau=args.tau,
        q_inner=args.inner_q == 'true',
        q_residuce_gradient=args.inner_q_residue_gradient == 'true',
        q_soft=args.inner_q_soft == 'true',
        q_soft_temp=args.inner_q_soft_temp,
        device=args.device,
    )

    for batch in range(args.num_batches):
        if args.device.type == 'cuda':
            torch.cuda.empty_cache()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes, adaptation_info = metalearner.sample(
            tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        pre_update_rewards = total_rewards([ep.rewards for ep, _ in episodes])
        post_update_rewards = total_rewards([ep.rewards for _, ep in episodes])

        writer.add_scalar('total_rewards/before_update', pre_update_rewards,
                          batch)
        writer.add_scalar('total_rewards/after_update', post_update_rewards,
                          batch)
        writer.add_scalar('total_rewards/rewards_improvement',
                          post_update_rewards - pre_update_rewards, batch)

        writer.add_scalar('adaptation/pre_update_inner_loss',
                          adaptation_info.mean_pre_update_loss, batch)
        writer.add_scalar('adaptation/post_update_inner_loss',
                          adaptation_info.mean_post_update_loss, batch)
        writer.add_scalar('adaptation/inner_loss_improvement',
                          adaptation_info.mean_loss_improvment, batch)
        writer.add_scalar('adaptation/weight_change',
                          adaptation_info.mean_weight_change, batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
Example No. 11
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    writer = tf.summary.create_file_writer('./logs/{0}'.format(
        args.output_folder))

    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

    # Create policy for the given task
    with tf.name_scope('policy') as scope:
        if continuous_actions:
            policy = NormalMLPPolicy(
                int(np.prod(sampler.envs.observation_space.shape)),
                int(np.prod(sampler.envs.action_space.shape)),
                hidden_sizes=(args.hidden_size, ) * args.num_layers,
                name=scope)
        else:
            policy = CategoricalMLPPolicy(
                int(np.prod(sampler.envs.observation_space.shape)),
                sampler.envs.action_space.n,
                hidden_sizes=(args.hidden_size, ) * args.num_layers,
                name=scope)

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    optimizer = ConjugateGradientOptimizer(args.cg_damping, args.cg_iters,
                                           args.ls_backtrack_ratio,
                                           args.ls_max_steps, args.max_kl,
                                           policy)

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              optimizer=optimizer,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau)

    optimizer.setup(metalearner)

    for batch in range(args.num_batches):
        print(f"----------Batch number {batch+1}----------")
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes)

        with writer.as_default():
            return_before = total_rewards([ep.rewards for ep, _ in episodes])
            return_after = total_rewards([ep.rewards for _, ep in episodes])
            tf.summary.scalar('total_rewards/before_update', return_before,
                              batch)
            tf.summary.scalar('total_rewards/after_update', return_after,
                              batch)
            print(
                f"{batch+1}:: \t Before: {return_before} \t After: {return_after}"
            )
            writer.flush()

        if (batch + 1) % args.save_iters == 0:
            # Save policy network
            policy.save_weights(save_folder + f"/policy-{batch+1}",
                                overwrite=True)
            baseline.save_weights(save_folder + f"/baseline-{batch + 1}",
                                  overwrite=True)
            print(f"Policy saved at iteration {batch+1}")
Example No. 12
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

    if args.env_name == 'AntVel-v1':
        param_bounds = {"goal": [0, 3]}

    if args.env_name == 'AntPos-v0':
        param_bounds = {"x": [-3, 3], "y": [-3, 3]}

    teacher = TeacherController(args.teacher,
                                args.nb_test_episodes,
                                param_bounds,
                                seed=args.seed,
                                teacher_params={})
    tree = TreeLSTM(args.tree_hidden_layer,
                    len(param_bounds.keys()),
                    args.cluster_0,
                    args.cluster_1,
                    device=args.device)
    if continuous_actions:
        policy = NormalMLPPolicy(int(
            np.prod(sampler.envs.observation_space.shape) +
            args.tree_hidden_layer),
                                 int(np.prod(sampler.envs.action_space.shape)),
                                 hidden_sizes=(args.hidden_size, ) *
                                 args.num_layers,
                                 tree=tree)
    else:
        policy = CategoricalMLPPolicy(int(
            np.prod(sampler.envs.observation_space.shape) +
            args.tree_hidden_layer),
                                      sampler.envs.action_space.n,
                                      hidden_sizes=(args.hidden_size, ) *
                                      args.num_layers,
                                      tree=tree)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)) +
        args.tree_hidden_layer)

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              tree=tree,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)

    all_tasks = []
    for batch in range(args.num_batches):
        print("starting iteration {}".format(batch))
        tasks = []
        for _ in range(args.meta_batch_size):
            if args.env_name == 'AntPos-v0':
                tasks.append(
                    {"position": teacher.task_generator.sample_task()})
            if args.env_name == 'AntVel-v1':
                tasks.append(
                    {"velocity": teacher.task_generator.sample_task()[0]})
        all_tasks.append(tasks)
        # tasks = np.array(tasks)
        # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        with open('./logs/{0}/task_list.pkl'.format(args.output_folder),
                  'wb') as pf:
            pickle.dump(all_tasks, pf)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)

        tr = [ep.rewards for _, ep in episodes]
        tr = [torch.mean(torch.sum(rewards, dim=0)).item() for rewards in tr]
        print("rewards:", tr)
        for t in range(args.meta_batch_size):
            if args.env_name == 'AntPos-v0':
                teacher.task_generator.update(tasks[t]["position"], tr[t])
            if args.env_name == 'AntVel-v1':
                teacher.task_generator.update(np.array([tasks[t]["velocity"]]),
                                              tr[t])

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)

        # Save tree
        torch.save(tree, os.path.join(save_folder,
                                      'tree-{0}.pt'.format(batch)))
Example No. 13
def main(args):
    save_folder = f'saves/{args.output_folder + get_date_str()}'
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    print('Initializing samplers...')

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

    test_sampler = BatchSampler(args.env_name,
                                test_env=True,
                                batch_size=args.fast_batch_size,
                                num_workers=max(1, args.num_workers // 2))

    policy = NormalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        int(np.prod(sampler.envs.action_space.shape)),
        hidden_sizes=(args.hidden_size, ) * args.num_layers)

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    print('Initializing meta-learners...')

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)  # noqa: E128

    # NOTE: we need this metalearner only to sample test tasks
    test_metalearner = MetaLearner(test_sampler,
                                   policy,
                                   baseline,
                                   gamma=args.gamma,
                                   fast_lr=args.fast_lr,
                                   tau=args.tau,
                                   device=args.device)  # noqa: E128

    print('Starting the training')

    # Initialize logging
    wandb.init()
    wandb.config.update(args)

    task_name2id = {name: i for i, name in enumerate(sampler._env._task_names)}
    task_id2name = sampler._env._task_names
    task2prob = np.ones(sampler._env.num_tasks) / sampler._env.num_tasks
    uniform = np.ones_like(task2prob) / sampler._env.num_tasks

    # outer loop (meta-training)
    for i in range(args.num_batches):
        print(f'Batch {i}')

        # sample trajectories from random tasks
        print(f'\tSampling a batch of {args.meta_batch_size} training tasks')
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size,
                                     task2prob=0.99 * task2prob +
                                     0.01 * uniform)
        # Note: Dirty hack to overcome metaworld dirty hack
        task_names = [sampler._env._task_names[t['task']] for t in tasks]

        # inner loop (adaptation)
        # returns list of tuples (train_episodes, valid_episodes)
        print(f'\tTraining')
        episodes = metalearner.sample(tasks, first_order=args.first_order)

        print(f'\tUpdating the meta-model')
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Logging
        # before: before parameters update
        # after: after parameters adaptation to the task

        r_before = total_rewards([ep.rewards for ep, _ in episodes])
        r_after = total_rewards([ep.rewards for _, ep in episodes])

        test_episode_infos = [ep._info_list for ep, _ in episodes]
        success_rate_before, task_success_rate_before = get_success_rate(
            test_episode_infos, task_names, per_task=True)

        test_episode_infos = [ep._info_list for _, ep in episodes]
        success_rate_after, task_success_rate_after = get_success_rate(
            test_episode_infos, task_names, per_task=True)

        wandb.log(
            {
                'total_rewards/before_update':
                r_before,
                'total_rewards/after_update':
                r_after,
                'success_rate/before_update':
                success_rate_before,
                'success_rate/after_update':
                success_rate_after,
                'success_rate/improvement':
                success_rate_after - success_rate_before,
                'success_rate/before_update_macro':
                np.mean(list(task_success_rate_before.values())),
                'success_rate/after_update_macro':
                np.mean(list(task_success_rate_after.values())),
            },
            step=i)
        wandb.log(
            {
                f'success_rate/after_update/{task}': rate
                for task, rate in task_success_rate_after.items()
            },
            step=i)
        wandb.log(
            {
                f'success_rate/before_update/{task}': rate
                for task, rate in task_success_rate_before.items()
            },
            step=i)
        wandb.log(
            {
                f'success_rate/improvement/{task}':
                task_success_rate_after[task] - task_success_rate_before[task]
                for task in task_success_rate_before.keys()
            },
            step=i)
        wandb.log(
            {
                f'n_acquired_tasks/before_update/at_{x}': sum(
                    rate > x for rate in task_success_rate_before.values())
                for x in [0.001, 0.01, 0.05, 0.1, 0.5]
            },
            step=i)
        wandb.log(
            {
                f'n_acquired_tasks/after_update/at_{x}': sum(
                    rate > x for rate in task_success_rate_after.values())
                for x in [0.001, 0.01, 0.05, 0.1, 0.5]
            },
            step=i)

        if args.active_learning:
            new_task2prob = np.zeros_like(task2prob)

            if args.prob_f == 'linear':
                norm = 1e-7 + sum(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    new_task2prob[task_id] = 1. - rate / norm

            elif args.prob_f == 'softmax':  # softmax(1 - rate)
                # numerical stability trick
                # http://cs231n.github.io/linear-classify/#softmax
                max_f = 1 - min(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    f = 1 - rate
                    new_task2prob[task_id] = np.exp(
                        (f - max_f) / args.temperature)

                new_task2prob = new_task2prob / (1e-7 + sum(new_task2prob))

            elif args.prob_f == 'softmax2':  # 1 - softmax(rate)
                max_f = max(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    new_task2prob[task_id] = np.exp(
                        (rate - max_f) / args.temperature)

                new_task2prob = 1. - new_task2prob / (1e-7 +
                                                      sum(new_task2prob))
            else:
                raise RuntimeError(
                    'prob-f should be either "softmax", "softmax2" or "linear"'
                )

            alpha = args.success_rate_smoothing
            task2prob = alpha * task2prob + (1 - alpha) * new_task2prob

            task2prob /= sum(task2prob)
            assert all(task2prob > 0)  # strictly!

            wandb.log(
                {
                    f'task2prob/{task_id2name[task_id]}': prob
                    for task_id, prob in enumerate(task2prob)
                },
                step=i)

        # meta-test
        if i % args.eval_every == 0:
            print(f'Evaluating on meta-test')

            # save policy network
            _save_path = os.path.join(save_folder, 'policy-{0}.pt'.format(i))
            with open(_save_path, 'wb') as f:
                torch.save(policy.state_dict(), f)
            wandb.save(_save_path)

            # Evaluate on meta-test
            tasks = test_sampler.sample_tasks(num_tasks=2 *
                                              args.meta_batch_size)
            # Note: Dirty hack to overcome metaworld dirty hack
            task_names = [
                test_sampler._env._task_names[t['task']] for t in tasks
            ]

            episodes = test_metalearner.sample(tasks,
                                               first_order=args.first_order)

            r_before = total_rewards([ep.rewards for ep, _ in episodes])
            r_after = total_rewards([ep.rewards for _, ep in episodes])

            test_episode_infos = [ep._info_list for ep, _ in episodes]
            success_rate_before, task_success_rate_before = get_success_rate(
                test_episode_infos, task_names, per_task=True)

            test_episode_infos = [ep._info_list for _, ep in episodes]
            success_rate_after, task_success_rate_after = get_success_rate(
                test_episode_infos, task_names, per_task=True)

            wandb.log(
                {
                    'total_rewards_test/before_update':
                    r_before,
                    'total_rewards_test/after_update':
                    r_after,
                    'success_rate_test/before_update':
                    success_rate_before,
                    'success_rate_test/after_update':
                    success_rate_after,
                    'success_rate_test/improvement':
                    success_rate_after - success_rate_before
                },
                step=i)
            wandb.log(
                {
                    f'success_rate_test/after_update/{task}': rate
                    for task, rate in task_success_rate_after.items()
                },  # noqa: E501
                step=i)
            wandb.log(
                {
                    f'success_rate_test/before_update/{task}': rate
                    for task, rate in task_success_rate_before.items()
                },  # noqa: E501
                step=i)
            wandb.log(
                {
                    f'success_rate_test/improvement/{task}':
                    task_success_rate_after[task] -
                    task_success_rate_before[task]
                    for task in task_success_rate_before.keys()
                },
                step=i)

    print('Saving the final model')
    # save final policy
    _save_path = os.path.join(save_folder, 'policy-final.pt')
    with open(_save_path, 'wb') as f:
        torch.save(policy.state_dict(), f)
    wandb.save(_save_path)
Example No. 14
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'Point2DWalls-corner-v0',
        'Ant-v0', 'HalfCheetah-v0'
    ])

    writer = SummaryWriter(log_dir=args.log_dir)

    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    json.dump(vars(args),
              open(os.path.join(
                  args.log_dir,
                  'params.json',
              ), 'w'),
              indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)

        logger.logkv('return_avg_pre',
                     total_rewards([ep.rewards for ep, _ in episodes]))
        logger.logkv('return_avg_post',
                     total_rewards([ep.rewards for _, ep in episodes]))
        logger.dumpkvs()
Example No. 15
def main(args):
    set_random_seed(args.random)

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', '2DNavigationBiased-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.alg))
    save_folder = './saves/{0}'.format(args.alg)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers,
                           seed=args.random)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    if args.alg == 'simul':
        # vanilla maml
        metalearner = MetaLearner(sampler,
                                  policy,
                                  baseline,
                                  gamma=args.gamma,
                                  fast_lr=args.fast_lr,
                                  tau=args.tau,
                                  device=args.device)

        for batch in range(args.meta_policy_num * args.num_batches):
            # first sample tasks under the distribution
            tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
            # get episodes in the form of (train episodes, test episodes after adaption)
            episodes = metalearner.sample(tasks, first_order=args.first_order)
            metalearner.step(episodes,
                             max_kl=args.max_kl,
                             cg_iters=args.cg_iters,
                             cg_damping=args.cg_damping,
                             ls_max_steps=args.ls_max_steps,
                             ls_backtrack_ratio=args.ls_backtrack_ratio)

            # Tensorboard
            writer.add_scalar(
                'maml/before_update',
                total_rewards([ep.rewards for ep, _ in episodes]), batch)
            writer.add_scalar(
                'maml/after_update',
                total_rewards([ep.rewards for _, ep in episodes]), batch)

            # Save policy network
            with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                      'wb') as f:
                torch.save(policy.state_dict(), f)

    elif args.alg == 'greedy':
        # multi-policy maml
        metalearner = KPolicyMetaLearner(sampler,
                                         policy,
                                         baseline,
                                         args.meta_policy_num,
                                         gamma=args.gamma,
                                         fast_lr=args.fast_lr,
                                         tau=args.tau,
                                         device=args.device)

        # visualize the policies' behavior
        trajectories = []
        for policy_idx in range(args.meta_policy_num):
            print(policy_idx)
            metalearner.optimize_policy_index(policy_idx)

            for batch in range(args.num_batches):
                print('batch num %d' % batch)

                tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
                metalearner.evaluate_optimized_policies(tasks)

                episodes = metalearner.sample(tasks,
                                              first_order=args.first_order)
                # loss is computed inside, then update policies
                metalearner.step(episodes,
                                 max_kl=args.max_kl,
                                 cg_iters=args.cg_iters,
                                 cg_damping=args.cg_damping,
                                 ls_max_steps=args.ls_max_steps,
                                 ls_backtrack_ratio=args.ls_backtrack_ratio)

                # not sure what to write in tensorboard...
                for epIdx in range(len(episodes)):
                    writer.add_scalar(
                        'kmaml/pi_' + str(policy_idx) + '_task_' + str(epIdx),
                        total_rewards([episodes[epIdx][1].rewards]), batch)
            # use a random task (no update here anyway) to visualize meta-policies
            tasks = sampler.sample_tasks(num_tasks=1)
            trajectories.append(metalearner.sample_meta_policy(tasks[0]))
        plotTrajectories(trajectories)
Example No. 16
def main(args):
    group_name = ''.join([
        random.choice(string.ascii_letters + string.digits) for n in range(4)
    ])
    wandb.init(group=group_name, job_type='optimizer', tensorboard=True)
    wandb.config.update(args)

    device = torch.device(args.device)

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(group_name,
                           args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
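Every script in this collection calls a total_rewards helper that is not reproduced here. A minimal sketch consistent with how it is called, assuming each element of episodes_rewards is a (horizon, num_rollouts) reward tensor:

import torch


def total_rewards(episodes_rewards, aggregation=torch.mean):
    # Sum each rollout's rewards over time, aggregate across rollouts,
    # then average across the tasks in the meta-batch.
    rewards = torch.mean(torch.stack([
        aggregation(torch.sum(rewards, dim=0))
        for rewards in episodes_rewards
    ], dim=0))
    return rewards.item()

Note that one of the later scripts unpacks three values (total, distance, collision) from total_rewards, so that project evidently uses a modified variant of this helper.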
Example No. 17
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'Pusher'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    if not args.hierarchical:

        sampler = BatchSampler(args.env_name,
                               batch_size=args.fast_batch_size,
                               num_workers=args.num_workers)
        if continuous_actions:
            policy = NormalMLPPolicy(
                int(np.prod(sampler.envs.observation_space.shape)),
                int(np.prod(sampler.envs.action_space.shape)),
                hidden_sizes=(args.hidden_size, ) * args.num_layers)
        else:
            policy = CategoricalMLPPolicy(
                int(np.prod(sampler.envs.observation_space.shape)),
                sampler.envs.action_space.n,
                hidden_sizes=(args.hidden_size, ) * args.num_layers)
        baseline = LinearFeatureBaseline(
            int(np.prod(sampler.envs.observation_space.shape)))

        metalearner = MetaLearner(sampler,
                                  policy,
                                  baseline,
                                  gamma=args.gamma,
                                  fast_lr=args.fast_lr,
                                  tau=args.tau,
                                  device=args.device)

        for batch in range(args.num_batches):
            tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
            episodes = metalearner.sample(tasks, first_order=args.first_order)
            metalearner.step(episodes,
                             max_kl=args.max_kl,
                             cg_iters=args.cg_iters,
                             cg_damping=args.cg_damping,
                             ls_max_steps=args.ls_max_steps,
                             ls_backtrack_ratio=args.ls_backtrack_ratio)

            print('Total Rewards',
                  str(total_rewards([ep.rewards for _, ep in episodes])))
            # Tensorboard
            writer.add_scalar(
                'total_rewards/before_update',
                total_rewards([ep.rewards for ep, _ in episodes]), batch)
            writer.add_scalar(
                'total_rewards/after_update',
                total_rewards([ep.rewards for _, ep in episodes]), batch)

            if (batch + 1) % args.save_every == 0:
                # Save policy network
                with open(
                        os.path.join(save_folder,
                                     'policy-{0}.pt'.format(batch)),
                        'wb') as f:
                    torch.save(policy, f)

    else:

        sampler = BatchSampler(args.env_name,
                               batch_size=args.fast_batch_size,
                               num_workers=args.num_workers)

        # Get the policies
        higher_policy, lower_trainer, baseline = hierarchical_meta_policy(
            args.env_name,
            args.skills_dim,
            sampler=sampler,
            net_size=args.hidden_size,
            output_size=1)

        # Define the hierarchical meta learner
        hr_meta_learner = HierarchicalMetaLearner(sampler,
                                                  higher_policy,
                                                  baseline,
                                                  gamma=args.gamma,
                                                  fast_lr=args.fast_lr,
                                                  tau=args.tau,
                                                  device=args.device)

        # Training procedure
        for batch in range(args.num_batches):

            # Train the lower level policy
            lower_trainer.train()

            # Now freeze the lower level policy
            lower_networks = lower_trainer.networks
            lower_policy = lower_networks[0]
            lower_policy.trainable = False

            # Sample the different tasks
            tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)

            # Sample the different episodes for the different tasks
            episodes = hr_meta_learner.sample(tasks,
                                              lower_policy,
                                              first_order=args.first_order)

            hr_meta_learner.step(episodes,
                                 max_kl=args.max_kl,
                                 cg_iters=args.cg_iters,
                                 cg_damping=args.cg_damping,
                                 ls_max_steps=args.ls_max_steps,
                                 ls_backtrack_ratio=args.ls_backtrack_ratio)

            print('Total Rewards',
                  str(total_rewards([ep.rewards for _, ep in episodes])))

            lower_policy.trainable = True

            # Tensorboard
            writer.add_scalar(
                'total_rewards/before_update',
                total_rewards([ep.rewards for ep, _ in episodes]), batch)
            writer.add_scalar(
                'total_rewards/after_update',
                total_rewards([ep.rewards for _, ep in episodes]), batch)

            if (batch + 1) % args.save_every == 0:
                # Save the policy networks
                with open(
                        os.path.join(save_folder,
                                     'h_policy-{0}.pt'.format(batch)),
                        'wb') as f:
                    torch.save(higher_policy, f)

                with open(
                        os.path.join(save_folder,
                                     'l_policy-{0}.pt'.format(batch)),
                        'wb') as f:
                    torch.save(lower_policy, f)

    with open(os.path.join(save_folder, 'baseline.pt'), 'wb') as f:
        torch.save(baseline, f)
Example No. 18
def k_shot_experiments(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0'
    ])

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy_pretrained = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
        policy_metalearned = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
        policy_random = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy_pretrained = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
        policy_metalearned = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
        policy_random = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)

    # save_folder_pretrained = './saves/{0}'.format(args.output_folder + '_pretrained')
    # pretrained_model = os.path.join(save_folder_pretrained, 'policy-{0}.pt'.format(args.num_batches-1))
    # policy_pretrained.load_state_dict(torch.load(pretrained_model))

    save_folder_metalearned = './saves/{0}'.format(args.output_folder +
                                                   '_metalearned')
    metalearned_model = os.path.join(
        save_folder_metalearned, 'policy-{0}.pt'.format(args.num_batches - 1))
    policy_metalearned.load_state_dict(torch.load(metalearned_model))

    # metalearned_tester = k_shot_tester(args.K_shot_batch_num, policy_metalearned, args.K_shot_batch_size, args.K_shot_num_tasks, 'MetaLearned', args)
    # avg_discounted_returns_metalearned = metalearned_tester.run_k_shot_exp()
    # print('Metalearned KSHOT result: ', avg_discounted_returns_metalearned)
    # print('Mean: ', torch.mean(avg_discounted_returns_metalearned, 0))
    results_folder = './saves/{0}'.format(args.output_folder + '_results')
    if not os.path.exists(results_folder):
        os.makedirs(results_folder)
    kshot_fig_path1 = os.path.join(results_folder, 'kshot_testing')
    # kshot_fig_path2 = os.path.join(results_folder, 'ml_pre_diff')
    result_data_path = os.path.join(results_folder, 'data_')

    metalearned_tester = k_shot_tester(args.K_shot_batch_num,
                                       policy_metalearned,
                                       args.K_shot_batch_size,
                                       args.K_shot_num_tasks, 'MetaLearned',
                                       args)
    avg_discounted_returns_metalearned = metalearned_tester.run_k_shot_exp()
    # pretrained_tester = k_shot_tester(args.K_shot_batch_num, policy_pretrained, args.K_shot_batch_size, args.K_shot_num_tasks, 'Pretrained', args)
    # avg_discounted_returns_pretrained = pretrained_tester.run_k_shot_exp()

    # random_tester = k_shot_tester(args.K_shot_batch_num, policy_random, args.K_shot_batch_size, args.K_shot_num_tasks, 'Random', args)
    # avg_discounted_returns_random = random_tester.run_k_shot_exp()

    plt.figure('K Shot: Testing Curves')
    # plt.plot([i for i in range(args.K_shot_batch_num + 1)], avg_discounted_returns_pretrained, color=np.array([0.,0.,1.]), label='Pre-Trained')
    # plt.plot([i for i in range(args.K_shot_batch_num + 1)], avg_discounted_returns_metalearned, color=np.array([0.,1.,0.]), label='Meta-Learned')
    # plt.plot([i for i in range(args.K_shot_batch_num + 1)], avg_discounted_returns_random, color=np.array([0.,0.,0.]), label='Random')
    # plt.errorbar([i for i in range(args.K_shot_batch_num + 1)], torch.mean(avg_discounted_returns_pretrained, 0).tolist(), torch.std(avg_discounted_returns_pretrained, 0).tolist(), color=np.array([0.,0.,1.]), label='Pre-Trained', capsize=5, capthick=2)
    plt.errorbar([i for i in range(args.K_shot_batch_num + 1)],
                 torch.mean(avg_discounted_returns_metalearned, 0).tolist(),
                 torch.std(avg_discounted_returns_metalearned, 0).tolist(),
                 color=np.array([0., 1., 0.]),
                 label='Meta-Learned',
                 capsize=5,
                 capthick=2)
    # plt.errorbar([i for i in range(args.K_shot_batch_num + 1)], torch.mean(avg_discounted_returns_random, 0).tolist(), torch.std(avg_discounted_returns_random, 0).tolist(), color=np.array([0.,0.,0.]), label='Random', capsize=5, capthick=2)

    plt.xlabel('Gradient Descent Iteration Number')
    plt.ylabel('Average Discounted Return')
    plt.title('K Shot: Testing Curves')
    plt.legend(loc='upper left')
    plt.savefig(kshot_fig_path1)
    # plt.show()

    # plt.figure('K Shot: Difference between Metalearned and Pretrained')

    # plt.errorbar([i for i in range(args.K_shot_batch_num + 1)], torch.mean(avg_discounted_returns_metalearned-avg_discounted_returns_pretrained, 0).tolist(), torch.std(avg_discounted_returns_metalearned-avg_discounted_returns_pretrained, 0).tolist(), color=np.array([0.,0.,0.]), capsize=5, capthick=2)

    # plt.xlabel('Gradient Descent Iteration Number')
    # plt.ylabel('Average Discounted Return Difference')
    # plt.title('K Shot: Difference between Metalearned and Pretrained')
    # plt.savefig(kshot_fig_path2)
    # plt.show()

    #save torch tensor results to combine with other experiments
    # torch.save(avg_discounted_returns_pretrained, result_data_path + 'pretrained')
    torch.save(avg_discounted_returns_metalearned,
               result_data_path + 'metalearned')
    return
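The tensors saved above are meant to be combined with results from other runs later. A hedged sketch of such a combination step (the file paths, labels, and the assumed tensor shape of (num_tasks, K_shot_batch_num + 1) are not taken from the original script):

import matplotlib.pyplot as plt
import torch


def plot_combined_kshot(paths, labels, out_path='kshot_combined.png'):
    plt.figure('K Shot: Combined Testing Curves')
    for path, label in zip(paths, labels):
        returns = torch.load(path)  # assumed shape: (num_tasks, K + 1)
        steps = list(range(returns.shape[1]))
        plt.errorbar(steps,
                     torch.mean(returns, 0).tolist(),
                     torch.std(returns, 0).tolist(),
                     label=label, capsize=5, capthick=2)
    plt.xlabel('Gradient Descent Iteration Number')
    plt.ylabel('Average Discounted Return')
    plt.legend(loc='upper left')
    plt.savefig(out_path)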
Example No. 19
def train_pretrained_model(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0'
    ])

    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder + '_pretrained'))
    save_folder = './saves/{0}'.format(args.output_folder + '_pretrained')
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    # batch_size is 2 * fast_batch_size to match the amount of data used in meta-learning
    sampler = BatchSampler(args.env_name,
                           batch_size=2 * args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    # Load pretrained model if resuming from an earlier batch
    cont_from_batch = 0
    if args.start_from_batch != -1:
        pretrained_model = os.path.join(
            save_folder, 'policy-{0}.pt'.format(args.start_from_batch - 1))
        if os.path.exists(pretrained_model):
            policy.load_state_dict(torch.load(pretrained_model))
            cont_from_batch = args.start_from_batch

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)

    for batch in range(cont_from_batch, args.num_batches):
        print('Currently processing Batch: {}'.format(batch + 1))

        task_sampling_time = time.time()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        task_sampling_time = time.time() - task_sampling_time

        episode_generating_time = time.time()
        episodes = metalearner.sample_for_pretraining(
            tasks, first_order=args.first_order)
        episode_generating_time = time.time() - episode_generating_time

        learning_step_time = time.time()
        params = metalearner.adapt(episodes, first_order=args.first_order)
        metalearner.policy.load_state_dict(params, strict=True)
        learning_step_time = time.time() - learning_step_time

        print('Task Sampling Time: {}'.format(task_sampling_time))
        print('Episode Generating Time: {}'.format(episode_generating_time))
        print('Learning Step Time: {}'.format(learning_step_time))

        # Tensorboard
        # writer.add_scalar('total_rewards/before_update',
        #     total_rewards([ep.rewards for ep, _ in episodes]), batch)
        # writer.add_scalar('total_rewards/after_update',
        #     total_rewards([ep.rewards for _, ep in episodes]), batch)
        # experiment.log_metric("Avg Disc Reward (Pretrained)", total_rewards([episodes.rewards], args.gamma), batch+1)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(metalearner.policy.state_dict(), f)

    return
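Resuming above relies on passing --start-from-batch by hand. A hypothetical helper (not part of the original script) that locates the newest policy-N.pt in save_folder could supply that value automatically:

import os
import re


def latest_checkpoint_batch(save_folder):
    # Hypothetical helper: returns the largest N for which policy-N.pt
    # exists in save_folder, or -1 if no checkpoint has been written yet.
    pattern = re.compile(r'policy-(\d+)\.pt$')
    batches = [int(match.group(1))
               for name in os.listdir(save_folder)
               for match in [pattern.match(name)] if match]
    return max(batches) if batches else -1

args.start_from_batch could then default to latest_checkpoint_batch(save_folder) + 1.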
Example No. 20
def main(args):
    # Setup for logging
    tb_writer = SummaryWriter('./logs/tb_{}'.format(
        args.log_name))  # Tensorboard logging
    log = set_log(args)

    # Setup before meta-train starts
    sampler = BatchSampler(env_name=args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers,
                           args=args)

    # NOTE Observation space is a list with [predator0, predator1, ..., prey]
    # Thus using the index of 0
    policy = NormalMLPPolicy(
        input_size=int(np.prod(sampler.envs.observation_space[0].shape)),
        output_size=int(np.prod(sampler.envs.action_space[0].shape)),
        hidden_sizes=(args.hidden_size, ) * args.num_layers)

    baseline = LinearFeatureBaseline(
        input_size=int(np.prod(sampler.envs.observation_space[0].shape)))

    meta_learner = MetaLearner(sampler,
                               policy,
                               baseline,
                               gamma=args.gamma,
                               fast_lr=args.fast_lr,
                               tau=args.tau,
                               device=args.device,
                               args=args,
                               log=log,
                               tb_writer=tb_writer)

    # meta_learner.load(
    #     filename="theta_200", directory="./pytorch_models")

    meta_tester = MetaTester(sampler,
                             policy,
                             baseline,
                             gamma=args.gamma,
                             fast_lr=args.fast_lr,
                             tau=args.tau,
                             device=args.device,
                             args=args,
                             log=log,
                             tb_writer=tb_writer)

    prey = Prey(env=sampler._env,
                args=args,
                log=log,
                tb_writer=tb_writer,
                name="prey",
                i_agent=0)

    # Meta-train starts
    iteration = 0
    while True:
        # Sample train and validation episode
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size,
                                     test=False)
        episodes = meta_learner.sample(tasks,
                                       prey,
                                       first_order=args.first_order,
                                       iteration=iteration)

        # Train meta-policy
        meta_learner.step(episodes=episodes, args=args)

        # Test meta-policy
        if iteration % 10 == 0:
            test_tasks = sampler.sample_tasks(num_tasks=5, test=True)
            meta_tester.few_shot_adaptation(meta_policy=meta_learner.policy,
                                            tasks=test_tasks,
                                            first_order=args.first_order,
                                            iteration=iteration,
                                            prey=prey)

        if iteration % 100 == 0:
            meta_learner.save(iteration)

        iteration += 1
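set_log is imported from elsewhere in that project and is not shown here. A minimal sketch of a compatible logger factory, assuming a single named logger keyed by args.log_name that writes to a file and the console (the real implementation may differ):

import logging
import os


def set_log(args):
    # Assumed behaviour: one named logger writing to ./logs/<log_name>.log
    # and mirroring messages to the console.
    os.makedirs('./logs', exist_ok=True)
    log = logging.getLogger(args.log_name)
    log.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s %(message)s')
    for handler in (logging.FileHandler('./logs/{}.log'.format(args.log_name)),
                    logging.StreamHandler()):
        handler.setFormatter(formatter)
        log.addHandler(handler)
    return log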
Example No. 21
def main(args):
    wandb.config.update({
        k: v
        for k, v in vars(args).items()
        if k in ['env_name', 'tau', 'critic_lr']
    })
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           args.seed,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    critic = Critic(int(np.prod(sampler.envs.observation_space.shape)),
                    1,
                    hidden_sizes=(args.hidden_size, ) * args.num_layers)

    metalearner = ActorCriticMetaLearner(sampler,
                                         policy,
                                         critic,
                                         gamma=args.gamma,
                                         fast_lr=args.fast_lr,
                                         tau=args.tau,
                                         device=args.device,
                                         critic_lr=args.critic_lr)
    wandb.watch(metalearner.critic)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        meta_critic_loss = metalearner.step(
            episodes,
            max_kl=args.max_kl,
            cg_iters=args.cg_iters,
            cg_damping=args.cg_damping,
            ls_max_steps=args.ls_max_steps,
            ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Logging
        wandb.log(
            {
                'total_rewards/before_update':
                total_rewards([ep.rewards for ep, _ in episodes])
            },
            step=batch)
        wandb.log(
            {
                'total_rewards/after_update':
                total_rewards([ep.rewards for _, ep in episodes])
            },
            step=batch)
        wandb.log({'meta critic loss': meta_critic_loss.detach().item()},
                  step=batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
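The Critic passed to ActorCriticMetaLearner above is constructed with an observation size, an output size of 1, and hidden_sizes. A hedged sketch of such a state-value network (the layer choices are assumptions, not the project's actual class):

import torch.nn as nn


class CriticSketch(nn.Module):
    def __init__(self, input_size, output_size, hidden_sizes=(100, 100)):
        super(CriticSketch, self).__init__()
        layers, last_size = [], input_size
        for size in hidden_sizes:
            layers += [nn.Linear(last_size, size), nn.Tanh()]
            last_size = size
        layers.append(nn.Linear(last_size, output_size))
        self.net = nn.Sequential(*layers)

    def forward(self, observations):
        # Returns one state-value estimate per observation.
        return self.net(observations)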
Example No. 22
def main(args):
    continuous_actions = (args.env_name in ['AntVel-v1', 'AntDir-v1',
        'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1',
        '2DNavigation-v0', '2DPointEnvCorner-v0'])

    save_folder = './saves/{0}'.format(args.env_name + '/' + args.output_folder)
    if args.output_folder != 'maml-trial' and args.output_folder != 'trial':
        # Pick a fresh numbered output folder instead of overwriting an old run
        i = 0
        while os.path.exists(save_folder):
            args.output_folder = str(i + 1)
            i += 1
            save_folder = './saves/{0}'.format(args.env_name + '/' + args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    writer = SummaryWriter('./logs/{0}'.format(args.env_name + '/' + args.output_folder))

    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)


    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
        num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    if args.load_dir is not None:
        policy.load_state_dict(torch.load(args.load_dir))

    metalearner = MetaLearner(sampler, policy, baseline, args, gamma=args.gamma,
        fast_lr=args.fast_lr, tau=args.tau, device=args.device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
            cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps,
            ls_backtrack_ratio=args.ls_backtrack_ratio)

        print('total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch)
        print('total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch)
        
        # Plotting figure
        # plotting(episodes, batch, save_folder,args.num_plots)

        if args.load_dir is not None:
            sys.exit(0)
            
        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
            total_rewards([ep.rewards for ep, _ in episodes]), batch)
        writer.add_scalar('total_rewards/after_update',
            total_rewards([ep.rewards for _, ep in episodes]), batch)

        # Save policy network
        with open(os.path.join(save_folder,
                'policy-{0}.pt'.format(batch)), 'wb') as f:
            torch.save(policy.state_dict(), f)
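The commented-out plotting call above refers to a helper that is not included here. A minimal sketch, assuming each element of episodes is a (train, valid) pair whose observations begin with 2D coordinates (the indexing is an assumption):

import os

import matplotlib.pyplot as plt


def plotting(episodes, batch, save_folder, num_plots=1):
    # Draw a few pre-/post-update 2D trajectories for one meta-batch.
    plt.figure('batch {0}'.format(batch))
    for train_episodes, valid_episodes in episodes[:num_plots]:
        before = train_episodes.observations[:, 0, :2].cpu().numpy()
        after = valid_episodes.observations[:, 0, :2].cpu().numpy()
        plt.plot(before[:, 0], before[:, 1], '--', label='before update')
        plt.plot(after[:, 0], after[:, 1], '-', label='after update')
    plt.legend(loc='best')
    plt.savefig(os.path.join(save_folder, 'plot-{0}.png'.format(batch)))
    plt.close()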
def main(args):

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'RVONavigation-v0',
        'RVONavigationAll-v0'
    ])

    assert continuous_actions

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    log_traj_folder = './logs/{0}'.format(args.output_traj_folder)

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    if not os.path.exists(log_traj_folder):
        os.makedirs(log_traj_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    # log_reward_total_file = open('./logs/reward_total.txt', 'a')
    # log_reward_dist_file = open('./logs/reward_dist.txt', 'a')
    # log_reward_col_file = open('./logs/reward_col.txt', 'a')

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

    # print(sampler.envs.observation_space.shape)
    # print(sampler.envs.action_space.shape)

    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    # baseline = LinearFeatureBaseline(
    #     int(np.prod(sampler.envs.observation_space.shape)))
    baseline = LinearFeatureBaseline(int(np.prod((2, ))))

    resume_training = True

    if resume_training:
        saved_policy_path = os.path.join(
            './TrainingResults/result2/saves/{0}'.format(
                'maml-2DNavigation-dir'), 'policy-180.pt')
        if os.path.isfile(saved_policy_path):
            print('Loading a saved policy')
            policy_info = torch.load(saved_policy_path)
            policy.load_state_dict(policy_info)
        else:
            sys.exit("The requested policy does not exist for loading")

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)

    start_time = time.time()
    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)

        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # print("observations shape: ")
        # print(episodes[0][1].observations.shape)

        # Tensorboard
        total_reward_be, dist_reward_be, col_reward_be = total_rewards(
            [ep.rewards for ep, _ in episodes])
        total_reward_af, dist_reward_af, col_reward_af = total_rewards(
            [ep.rewards for _, ep in episodes])

        log_reward_total_file = open('./logs/reward_total.txt', 'a')
        log_reward_dist_file = open('./logs/reward_dist.txt', 'a')
        log_reward_col_file = open('./logs/reward_col.txt', 'a')

        log_reward_total_file.write(
            str(batch) + ',' + str(total_reward_be) + ',' +
            str(total_reward_af) + '\n')
        log_reward_dist_file.write(
            str(batch) + ',' + str(dist_reward_be) + ',' +
            str(dist_reward_af) + '\n')
        log_reward_col_file.write(
            str(batch) + ',' + str(col_reward_be) + ',' + str(col_reward_af) +
            '\n')

        # Close immediately so the appended lines are flushed to disk each batch
        log_reward_total_file.close()
        log_reward_dist_file.close()
        log_reward_col_file.close()

        writer.add_scalar('total_rewards/before_update', total_reward_be,
                          batch)
        writer.add_scalar('total_rewards/after_update', total_reward_af, batch)

        writer.add_scalar('distance_reward/before_update', dist_reward_be,
                          batch)
        writer.add_scalar('distance_reward/after_update', dist_reward_af,
                          batch)

        writer.add_scalar('collision_rewards/before_update', col_reward_be,
                          batch)
        writer.add_scalar('collision_rewards/after_update', col_reward_af,
                          batch)

        if batch % args.save_every == 0:  # save the model only periodically to save time and disk space
            # Save policy network
            print('Saving model {}'.format(batch))
            with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                      'wb') as f:
                torch.save(policy.state_dict(), f)

        if batch % 30 == 0:
            with open(
                    os.path.join(
                        log_traj_folder,
                        'train_episodes_observ_' + str(batch) + '.pkl'),
                    'wb') as f:
                pickle.dump(
                    [ep.observations.cpu().numpy() for ep, _ in episodes], f)
            with open(
                    os.path.join(
                        log_traj_folder,
                        'valid_episodes_observ_' + str(batch) + '.pkl'),
                    'wb') as f:
                pickle.dump(
                    [ep.observations.cpu().numpy() for _, ep in episodes], f)

            # with open(os.path.join(log_traj_folder, 'train_episodes_ped_state_'+str(batch)+'.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for ep, _ in episodes], f)
            # with open(os.path.join(log_traj_folder, 'valid_episodes_ped_state_'+str(batch)+'.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for _, ep in episodes], f)
            # save tasks
            # a sample task list of 2: [{'goal': array([0.0209588 , 0.15981938])}, {'goal': array([0.45034602, 0.17282322])}]
            with open(
                    os.path.join(log_traj_folder,
                                 'tasks_' + str(batch) + '.pkl'), 'wb') as f:
                pickle.dump(tasks, f)

        else:
            # supposed to be overwritten for each batch
            with open(
                    os.path.join(log_traj_folder,
                                 'latest_train_episodes_observ.pkl'),
                    'wb') as f:
                pickle.dump(
                    [ep.observations.cpu().numpy() for ep, _ in episodes], f)
            with open(
                    os.path.join(log_traj_folder,
                                 'latest_valid_episodes_observ.pkl'),
                    'wb') as f:
                pickle.dump(
                    [ep.observations.cpu().numpy() for _, ep in episodes], f)

            # with open(os.path.join(log_traj_folder, 'latest_train_episodes_ped_state.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for ep, _ in episodes], f)
            # with open(os.path.join(log_traj_folder, 'latest_valid_episodes_ped_state.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for _, ep in episodes], f)

            with open(os.path.join(log_traj_folder, 'latest_tasks.pkl'),
                      'wb') as f:
                pickle.dump(tasks, f)

        print('finished epoch {}; time elapsed: {}'.format(
            batch, time_elapsed(time.time() - start_time)))
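time_elapsed is used in the progress message above but not defined in this excerpt; a minimal sketch that formats a duration given in seconds (the exact formatting is an assumption):

def time_elapsed(seconds):
    # Format a duration in seconds as HH:MM:SS.
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, secs)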
Example No. 24
# The Ant flips over far too easily; max_kl and fast_lr both seem to need to stay small.

# TODO: try giving it more episodes

continuous_actions = (args.env_name in [
    'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
    'HalfCheetahDir-v1', '2DNavigation-v0', 'PendulumTheta-v0'
])

sampler = BatchSampler(args.env_name,
                       batch_size=args.fast_batch_size,
                       num_workers=args.num_workers)

if continuous_actions:
    the_model = NormalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        int(np.prod(sampler.envs.action_space.shape)),
        hidden_sizes=(args.hidden_size, ) * args.num_layers)
else:
    the_model = CategoricalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        sampler.envs.action_space.n,
        hidden_sizes=(args.hidden_size, ) * args.num_layers)

# Load the saved policy weights. The original snippet referenced an undefined
# `batch` here; loading the final checkpoint is an assumption.
save_folder = './saves/{0}'.format(args.output_folder)
batch = args.num_batches - 1  # assumed: evaluate the last saved checkpoint
the_model.load_state_dict(
    torch.load(os.path.join(save_folder, 'policy-{0}.pt'.format(batch))))

baseline = LinearFeatureBaseline(
    int(np.prod(sampler.envs.observation_space.shape)))
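The snippet stops right after loading the checkpoint and building the baseline. A hedged sketch of one adaptation-and-evaluation step that continues this snippet and mirrors the training scripts elsewhere in this collection (assuming the same MetaLearner and total_rewards are importable):

metalearner = MetaLearner(sampler,
                          the_model,
                          baseline,
                          gamma=args.gamma,
                          fast_lr=args.fast_lr,
                          tau=args.tau,
                          device=args.device)
tasks = sampler.sample_tasks(num_tasks=1)
episodes = metalearner.sample(tasks, first_order=True)
print('before update:', total_rewards([ep.rewards for ep, _ in episodes]))
print('after update:', total_rewards([ep.rewards for _, ep in episodes]))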
Example No. 25
def main(args):
    continuous_actions = (args.env_name in ['AntVel-v1', 'AntDir-v1',
        'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1',
        '2DNavigation-v0'])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
        num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearnerNGLVCVPG(sampler, policy, baseline, gamma=args.gamma,
        fast_lr=args.fast_lr, tau=args.tau, device=args.device, verbose=args.verbose)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        start = time.time()
        episodes, kls, param_diffs = metalearner.sample(tasks, first_order=args.first_order, cg_iters=args.cg_iters)
        sample_time = time.time() - start
        start = time.time()
        if args.optimizer == 'sgd':
            metalearner.step_sgd(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps,
                ls_backtrack_ratio=args.ls_backtrack_ratio)
        else:
            metalearner.step_adam(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                             cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps,
                             ls_backtrack_ratio=args.ls_backtrack_ratio)
        update_time = time.time() - start

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
            total_rewards([ep.rewards for ep, _ in episodes]), batch)
        writer.add_scalar('total_rewards/after_update',
            total_rewards([ep.rewards for _, ep in episodes]), batch)
        writer.add_scalar('kl-mean between meta update',
                          torch.mean(torch.stack(kls)), batch)
        writer.add_scalar('kl-std between meta update',
                          torch.std(torch.stack(kls)), batch)
        writer.add_scalar('Euclidean-distance-mean between meta update',
                          torch.mean(torch.stack(param_diffs)), batch)
        writer.add_scalar('Euclidean-distance-std between meta update',
                          torch.std(torch.stack(param_diffs)), batch)

        print("Batch {}. before_update: {}, after_update: {}\n sample time {}, update_time {}".format(batch,
                         total_rewards([ep.rewards for ep, _ in episodes]),
                         total_rewards([ep.rewards for _, ep in episodes]), sample_time, update_time))
        print("Batch {}. kl-divergence between meta update: {}, kl std: {}".format(
            batch, torch.mean(torch.stack(kls)), torch.std(torch.stack(kls))))
        print("Batch {}. Euclidean-distance-mean meta update: {}, Euclidean-distance-std: {}".format(
            batch, torch.mean(torch.stack(param_diffs)), torch.std(torch.stack(param_diffs))))
        # Save policy network
        with open(os.path.join(save_folder,
                'policy-{0}.pt'.format(batch)), 'wb') as f:
            torch.save(policy.state_dict(), f)
Example No. 26
def eval(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    log_folder = './logs/{0}'.format(args.output_folder)
    if not os.path.exists(log_folder):
        os.makedirs(log_folder)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

    if args.env_name == 'AntPos-v0':
        param_bounds = {"x": [-3, 3], "y": [-3, 3]}
    else:
        raise NotImplementedError(
            'param_bounds is only defined for AntPos-v0 in this script')

    tree = TreeLSTM(args.tree_hidden_layer,
                    len(param_bounds.keys()),
                    args.cluster_0,
                    args.cluster_1,
                    device=args.device)

    if continuous_actions:
        policy = NormalMLPPolicy(int(
            np.prod(sampler.envs.observation_space.shape) +
            args.tree_hidden_layer),
                                 int(np.prod(sampler.envs.action_space.shape)),
                                 hidden_sizes=(args.hidden_size, ) *
                                 args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    policy.eval()
    tree.eval()

    all_tasks = []
    # torch.autograd.set_detect_anomaly(True)
    reward_list = []
    for batch in range(args.num_batches + 1):
        print("starting iteration {}".format(batch))
        try:
            policy.load_state_dict(
                torch.load(
                    os.path.join(save_folder, 'policy-{0}.pt'.format(batch))))
            tree = torch.load(
                os.path.join(save_folder, 'tree-{0}.pt'.format(batch)))
            tree.eval()
        except Exception:
            with open(
                    './logs/{0}/reward_list_eval.pkl'.format(
                        args.output_folder), 'wb') as pf:
                pickle.dump(reward_list, pf)

            print(reward_list)
            return

        # tree.load_state_dict(torch.load(os.path.join(save_folder,
        #                        'tree-{0}.pt'.format(batch))))

        tasks = sampler.sample_tasks(args.meta_batch_size)

        all_tasks.append(tasks)
        # tasks = np.array(tasks)
        # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        with open('./logs/{0}/task_list_eval.pkl'.format(args.output_folder),
                  'wb') as pf:
            pickle.dump(all_tasks, pf)

        print("evaluating...".format(batch))
        all_rewards = []
        for task in tasks:
            print(task["position"])
            episodes = sampler.sample(policy, task, tree=tree)
            # print("training...".format(batch))

            # tr = [ep.rewards for ep in episodes]
            # tr = np.mean([torch.mean(torch.sum(rewards, dim=0)).item() for rewards in tr])
            all_rewards.append(total_rewards(episodes.rewards))

        reward_list.append(np.mean(all_rewards))

    with open('./logs/{0}/reward_list_eval.pkl'.format(args.output_folder),
              'wb') as pf:
        pickle.dump(reward_list, pf)

    print(reward_list)
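The eval loop above pickles one mean reward per checkpoint. A hedged sketch of turning that file into a learning curve (the output filename below is an assumption):

import pickle

import matplotlib.pyplot as plt


def plot_eval_rewards(output_folder):
    with open('./logs/{0}/reward_list_eval.pkl'.format(output_folder), 'rb') as pf:
        reward_list = pickle.load(pf)
    plt.plot(range(len(reward_list)), reward_list)
    plt.xlabel('Checkpoint batch')
    plt.ylabel('Mean total reward over sampled tasks')
    plt.savefig('./logs/{0}/reward_curve_eval.png'.format(output_folder))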
Example No. 27
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'Point2DWalls-corner-v0',
        'Ant-v0', 'HalfCheetah-v0'
    ])

    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    json.dump(vars(args),
              open(os.path.join(
                  args.log_dir,
                  'params.json',
              ), 'w'),
              indent=2)

    sampler = BatchSamplerMultiworld(args)
    sampler_val = BatchSamplerMultiworld(args, val=True)

    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers,
            bias_transformation_size=args.bias_transformation_size,
            init_gain=args.init_gain,
        )
    else:
        raise NotImplementedError
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              entropy_coef=args.entropy_coef,
                              device=args.device)

    start_time = time.time()

    processes = []

    for batch in range(args.num_batches):
        metalearner.reset()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        if sampler.rewarder.fit_counter > 0:
            metalearner.step(episodes,
                             max_kl=args.max_kl,
                             cg_iters=args.cg_iters,
                             cg_damping=args.cg_damping,
                             ls_max_steps=args.ls_max_steps,
                             ls_backtrack_ratio=args.ls_backtrack_ratio)

        if batch % args.rewarder_fit_period == 0:
            sampler.fit_rewarder(logger)

        if args.rewarder == 'unsupervised':
            sampler.log_unsupervised(logger)
        log_main(logger, episodes, batch, args, start_time, metalearner)

        if batch % args.save_period == 0 or batch == args.num_batches - 1:
            save_model_maml(args, policy, batch)

        if batch % args.val_period == 0 or batch == args.num_batches - 1:
            val(args, sampler_val, policy, baseline, batch)

        if batch % args.vis_period == 0 or batch == args.num_batches - 1:
            if args.plot:
                p = Popen(
                    'python maml_rl/utils/visualize.py --log-dir {}'.format(
                        args.log_dir),
                    shell=True)
                processes.append(p)

        logger.dumpkvs()
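save_model_maml is called above but defined elsewhere in that project. A minimal sketch consistent with how checkpoints are written in the other scripts of this collection (the path layout under args.log_dir is an assumption):

import os

import torch


def save_model_maml(args, policy, batch):
    save_folder = os.path.join(args.log_dir, 'saves')
    os.makedirs(save_folder, exist_ok=True)
    with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f:
        torch.save(policy.state_dict(), f)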
Example No. 28
        main(args)
    else:
        env = gym.make(id)
        # maml = []
        #indexes = [e for e in range(400) if e % 10 == 9]
        #indexes = [0] + indexes
        indexes = [399]
        num_test_tasks = 100
        buckets = 1
        successes = []
        for index in indexes:
            sampler = BatchSampler(args.env_name,
                                   batch_size=args.fast_batch_size,
                                   num_workers=args.num_workers)
            model = NormalMLPPolicy(
                int(np.prod(sampler.envs.observation_space.shape)),
                int(np.prod(sampler.envs.action_space.shape)),
                hidden_sizes=(args.hidden_size, ) * args.num_layers)
            checkpoint = torch.load(
                '../final_models/meta/{0}/policy-{1}.pt'.format(
                    args.to_pickle, index))
            model.load_state_dict(checkpoint)
            baseline = LinearFeatureBaseline(
                int(np.prod(sampler.envs.observation_space.shape)))

            metalearner = MetaLearner(sampler,
                                      model,
                                      baseline,
                                      gamma=args.gamma,
                                      fast_lr=args.fast_lr,
                                      tau=args.tau,
                                      device=args.device)