def experiment(variant, expl_env, eval_env, policy, checkpoint_dir, load_iter):
    print(f'Environment: {expl_env.name}')
    print(f'Policy: {policy.name}')

    expl_env = NormalizedBoxEnv(expl_env)
    eval_env = NormalizedBoxEnv(eval_env)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    # Optionally resume training from a saved checkpoint iteration.
    if load_iter is not None:
        load_dir = os.path.join(checkpoint_dir, 'iteration_' + load_iter)
        model_checkpoint = torch.load(os.path.join(load_dir, 'model.pt'))
        with open(os.path.join(load_dir, 'replay_buffer.pkl'), 'rb') as f:
            attr_dict = pickle.load(f)
        start_epoch = attr_dict['iteration'] + 1
    else:
        model_checkpoint = None
        attr_dict = None
        start_epoch = 0

    layer_size = variant['layer_size']
    number_of_layers = variant['number_of_layers']
    hidden_sizes = [layer_size] * number_of_layers
    q_input_size = obs_dim + action_dim
    q_output_size = 1
    qf1 = FlattenMlp(input_size=q_input_size,
                     output_size=q_output_size,
                     hidden_sizes=hidden_sizes)
    qf2 = FlattenMlp(input_size=q_input_size,
                     output_size=q_output_size,
                     hidden_sizes=hidden_sizes)
    target_qf1 = FlattenMlp(input_size=q_input_size,
                            output_size=q_output_size,
                            hidden_sizes=hidden_sizes)
    target_qf2 = FlattenMlp(input_size=q_input_size,
                            output_size=q_output_size,
                            hidden_sizes=hidden_sizes)
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(eval_env, eval_policy)
    expl_path_collector = MdpPathCollector(expl_env, policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env,
                                    attr_dict)
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         checkpoint=model_checkpoint,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        save_dir=checkpoint_dir,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train(start_epoch=start_epoch)
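
A hedged launcher sketch for the experiment() above; the make_env() factory, the .name attributes, and every hyperparameter value are illustrative assumptions rather than this repository's actual spec.

# Hypothetical launcher -- make_env(), the kwarg values, and checkpoint_dir are placeholders.
if __name__ == '__main__':
    variant = dict(
        layer_size=256,
        number_of_layers=2,
        replay_buffer_size=int(1e6),
        trainer_kwargs=dict(
            discount=0.99,
            soft_target_tau=5e-3,
            policy_lr=3e-4,
            qf_lr=3e-4,
            use_automatic_entropy_tuning=True,
        ),
        algorithm_kwargs=dict(
            num_epochs=1000,
            batch_size=256,
            max_path_length=1000,
            num_eval_steps_per_epoch=5000,
            num_expl_steps_per_train_loop=1000,
            num_trains_per_train_loop=1000,
            min_num_steps_before_training=1000,
        ),
    )
    expl_env = make_env()  # assumed factory; experiment() expects envs exposing .name
    eval_env = make_env()
    policy = TanhGaussianPolicy(
        obs_dim=expl_env.observation_space.low.size,
        action_dim=expl_env.action_space.low.size,
        hidden_sizes=[variant['layer_size']] * variant['number_of_layers'],
    )
    policy.name = 'tanh_gaussian'  # experiment() prints policy.name
    experiment(variant, expl_env, eval_env, policy,
               checkpoint_dir='./checkpoints', load_iter=None)
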
Example #2
def experiment(variant):
    env_name = variant["env_name"]
    env_kwargs = variant["env_kwargs"]
    expl_env = make_robosuite_env(env_name, env_kwargs)
    eval_env = make_robosuite_env(env_name, env_kwargs)
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant["layer_size"]
    qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant["replay_buffer_size"],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant["trainer_kwargs"])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant["algorithm_kwargs"])
    algorithm.to(ptu.device)
    algorithm.train()
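
For the robosuite version of experiment() above, a hedged sketch of the variant dict it expects; the task name and env_kwargs keys are assumptions about what make_robosuite_env forwards to robosuite, and the hyperparameter values are purely illustrative.

# Hypothetical robosuite variant -- task name, env_kwargs keys, and values are placeholders.
variant = dict(
    env_name='Lift',
    env_kwargs=dict(robots='Panda', horizon=500, reward_shaping=True),
    layer_size=256,
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(discount=0.99, soft_target_tau=5e-3,
                        policy_lr=3e-4, qf_lr=3e-4,
                        use_automatic_entropy_tuning=True),
    algorithm_kwargs=dict(num_epochs=500, batch_size=256, max_path_length=500,
                          num_eval_steps_per_epoch=2500,
                          num_expl_steps_per_train_loop=1000,
                          num_trains_per_train_loop=1000,
                          min_num_steps_before_training=1000),
)
experiment(variant)
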
Example #3

    def __init__(self,
                 env,
                 policy,
                 discriminator,
                 policy_optimizer,
                 expert_replay_buffer,
                 num_trajs_per_update=8,
                 batch_size=1024,
                 disc_lr=1e-3,
                 disc_optimizer_class=optim.Adam,
                 use_grad_pen=True,
                 grad_pen_weight=10,
                 plotter=None,
                 render_eval_paths=False,
                 eval_deterministic=True,
                 **kwargs):
        # Sanity check: disc_lr must be set explicitly (the 1e-3 default is
        # treated as a sentinel and rejected).
        assert disc_lr != 1e-3, 'Just checking that this is being taken from the spec file'
        if eval_deterministic:
            eval_policy = MakeDeterministic(policy)
        else:
            eval_policy = policy

        # FOR IN THE AIR FETCH EASY LARGER Z RANGE
        # self.acts_max = Variable(ptu.from_numpy(np.array([0.24995736, 0.2499716 , 0.24999983, 0.01499852])), requires_grad=False)
        # self.acts_min = Variable(ptu.from_numpy(np.array([-0.24989959, -0.24995068, -0.2499989 , -0.01499998])), requires_grad=False)
        # self.observation_max = Variable(ptu.from_numpy(np.array([0.0499439 , 0.04998455, 0.00098634, 0.09421162, 0.10457129,
        # 0.3022664 , 0.05094975, 0.05090175, 0.01024486, 0.01029508])), requires_grad=False)
        # self.observation_min = Variable(ptu.from_numpy(np.array([-4.98090099e-02, -4.97771561e-02, -1.10015137e-01, -9.60775777e-02,
        # -1.03508767e-01, -3.50153560e-03,  0.00000000e+00, -8.67902630e-08,
        # -9.47353981e-03, -9.62584145e-03])), requires_grad=False)
        # self.SCALE = 0.99

        # FOR IN THE AIR FETCH EASY LARGER X-Y RANGE
        # self.acts_max = Variable(ptu.from_numpy(np.array([0.24999749, 0.2499975 , 0.2499998 , 0.01499951])), requires_grad=False)
        # self.acts_min = Variable(ptu.from_numpy(np.array([-0.24999754, -0.24999917, -0.24999704, -0.01499989])), requires_grad=False)
        # self.observation_max = Variable(ptu.from_numpy(np.array([0.14953716, 0.14865454, 0.00155898, 0.28595684, 0.27644423,
        # 0.20200016, 0.05094223, 0.05082468, 0.01033346, 0.0103368 ])), requires_grad=False)
        # self.observation_min = Variable(ptu.from_numpy(np.array([-1.49931348e-01, -1.49895902e-01, -1.10015137e-01, -2.80037372e-01,
        # -2.82756899e-01, -3.44387360e-03,  0.00000000e+00, -8.67902630e-08,
        # -9.53356933e-03, -9.71619128e-03])), requires_grad=False)
        # self.SCALE = 0.99

        # FOR IN THE AIR FETCH EASY LARGER OBJECT RANGE
        # Empirical per-dimension min/max bounds of actions and observations
        # recorded for this task variant.
        self.acts_max = Variable(ptu.from_numpy(
            np.array([0.24999844, 0.24999035, 0.24999848, 0.01499987])),
                                 requires_grad=False)
        self.acts_min = Variable(ptu.from_numpy(
            np.array([-0.24999948, -0.24999969, -0.24999971, -0.01499985])),
                                 requires_grad=False)
        self.observation_max = Variable(ptu.from_numpy(
            np.array([
                0.14981718, 0.14922823, 0.00105448, 0.19316468, 0.20144443,
                0.20205348, 0.05088978, 0.05087405, 0.01012868, 0.01011336
            ])),
                                        requires_grad=False)
        self.observation_min = Variable(ptu.from_numpy(
            np.array([
                -1.49439076e-01, -1.49636276e-01, -1.10015137e-01,
                -1.99832936e-01, -1.96645722e-01, -3.35041414e-03,
                0.00000000e+00, -8.67902630e-08, -9.49761703e-03,
                -9.71219664e-03
            ])),
                                        requires_grad=False)
        self.SCALE = 0.99

        super().__init__(env=env,
                         exploration_policy=policy,
                         eval_policy=eval_policy,
                         expert_replay_buffer=expert_replay_buffer,
                         policy_optimizer=policy_optimizer,
                         **kwargs)

        self.discriminator = discriminator
        self.rewardf_eval_statistics = None
        self.disc_optimizer = disc_optimizer_class(
            self.discriminator.parameters(),
            lr=disc_lr,
        )

        self.num_trajs_per_update = num_trajs_per_update
        self.traj_len = 65
        self.batch_size = batch_size

        self.bce = nn.BCEWithLogitsLoss()
        # Binary targets for the discriminator: the first half of each
        # concatenated batch is labeled 1 and the second half 0.
        self.bce_targets = torch.cat(
            [torch.ones(self.batch_size, 1),
             torch.zeros(self.batch_size, 1)],
            dim=0)
        self.bce_targets = Variable(self.bce_targets)
        if ptu.gpu_enabled():
            self.bce.cuda()
            self.bce_targets = self.bce_targets.cuda()

        self.use_grad_pen = use_grad_pen
        self.grad_pen_weight = grad_pen_weight
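
The fields initialized above (self.bce, self.bce_targets, self.disc_optimizer, self.use_grad_pen, self.grad_pen_weight) are typically consumed by a discriminator update elsewhere in the trainer. A hedged, standalone sketch of such an update follows; the method name, the batch format, and the expert-first/label-1 convention are assumptions, not this trainer's actual implementation.

    # Hedged sketch (not this trainer's actual method): one GAIL-style
    # discriminator step using the fields set up in __init__. `expert_batch`
    # and `policy_batch` are assumed to be dicts of tensors with keys
    # 'observations' and 'actions', each of shape [batch_size, dim] and
    # already normalized; expert samples come first and receive label 1,
    # matching the [ones; zeros] layout of self.bce_targets.
    def _sketch_disc_update(self, expert_batch, policy_batch):
        expert_inputs = torch.cat(
            [expert_batch['observations'], expert_batch['actions']], dim=1)
        policy_inputs = torch.cat(
            [policy_batch['observations'], policy_batch['actions']], dim=1)

        disc_logits = self.discriminator(
            torch.cat([expert_inputs, policy_inputs], dim=0))
        disc_loss = self.bce(disc_logits, self.bce_targets)

        if self.use_grad_pen:
            # WGAN-GP style penalty on random interpolations between
            # expert and policy inputs.
            eps = torch.rand(expert_inputs.size(0), 1,
                             device=expert_inputs.device)
            interp = (eps * expert_inputs +
                      (1 - eps) * policy_inputs).detach().requires_grad_(True)
            grads = torch.autograd.grad(
                outputs=self.discriminator(interp).sum(),
                inputs=interp,
                create_graph=True)[0]
            grad_pen = ((grads.norm(2, dim=1) - 1) ** 2).mean()
            disc_loss = disc_loss + self.grad_pen_weight * grad_pen

        self.disc_optimizer.zero_grad()
        disc_loss.backward()
        self.disc_optimizer.step()
        return disc_loss.detach()
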
Example #4
def gather_eval_data(alg,
                     num_rollouts_per_context=8,
                     deterministic=True,
                     num_diff_context=1,
                     eval_params_sampler=None,
                     expert_buffer_for_eval_tasks=None,
                     evaluating_expert=False,
                     eval_deterministic=True,
                     eval_no_task_info=False):
    context_sizes = [1]
    if not evaluating_expert:
        alg.encoder.eval()

    all_statistics = {}
    task_num = 0

    # env = alg.env
    # NOTE: the evaluation environment is hard-coded here instead of reusing alg.env.
    env = Walker2DRandomDynamicsEnv()

    _means = []
    _stds = []

    for task_params, obs_task_params in eval_params_sampler:
        env.reset(task_params=task_params, obs_task_params=obs_task_params)
        task_rets = []
        print('\tEvaluating task {}...'.format(obs_task_params))
        print(task_params)
        task_num += 1
        task_id = env.task_identifier

        for context_size in context_sizes:
            _cont_size_dict = {}

            for c_idx in range(num_diff_context):
                if not evaluating_expert:
                    if eval_no_task_info:
                        print('Evaluating with no task information!')
                        new_task_params = {}
                        for k in task_params:
                            new_task_params[k] = np.ones(task_params[k].shape)
                        raise NotImplementedError()
                    else:
                        list_of_trajs = alg.expert_buffer_for_eval_tasks.sample_trajs_from_task(
                            task_id, context_size)
                    alg.encoder.eval()
                    post_dist = alg.encoder([list_of_trajs])
                    z = post_dist.sample()
                    z = z.cpu().data.numpy()[0]
                    post_cond_policy = PostCondMLPPolicyWrapper(alg.main_policy, z)
                    post_cond_policy.policy.eval()
                else:
                    # if eval_no_task_info:
                    #     print('Evaluating with no task information!')
                    #     post_cond_policy = alg.get_eval_policy(0.0*np.ones(obs_task_params.shape))
                    # else:
                    #     post_cond_policy = alg.get_eval_policy(np.ones(obs_task_params))

                    # For evaluating a standard walker expert
                    # post_cond_policy = alg.policy
                    # post_cond_policy = alg.eval_policy
                    post_cond_policy = MakeDeterministic(alg.policy)

                post_cond_policy.deterministic = eval_deterministic
                context_returns = []
                for _ in range(num_rollouts_per_context):
                    stacked_path = rollout_path(env, task_params,
                                                obs_task_params,
                                                post_cond_policy,
                                                alg.max_path_length)
                    context_returns.append(np.sum(stacked_path['rewards']))
                task_rets.extend(context_returns)

        all_statistics[task_id] = task_rets
        print('\nReturns: %.4f +/- %.4f' %
              (np.mean(task_rets), np.std(task_rets)))
        _means.append(np.mean(task_rets))
        _stds.append(np.std(task_rets))
    for m, s in zip(_means, _stds):
        print('%.4f +/- %.4f' % (m, s))
    return all_statistics
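
A hedged invocation sketch for gather_eval_data(); load_alg_from_checkpoint and build_eval_params_sampler are placeholders for however the surrounding evaluation script provides the algorithm object and the task-parameter sampler.

# Hypothetical call site; both helpers below are assumed, not part of this codebase.
alg = load_alg_from_checkpoint('path/to/checkpoint')   # assumed helper
eval_params_sampler = build_eval_params_sampler()      # assumed iterable of (task_params, obs_task_params)
stats = gather_eval_data(
    alg,
    num_rollouts_per_context=8,
    num_diff_context=1,
    eval_params_sampler=eval_params_sampler,
    evaluating_expert=False,
    eval_deterministic=True,
)
per_task_means = [np.mean(rets) for rets in stats.values()]
print('Average return across %d tasks: %.4f' % (len(stats), np.mean(per_task_means)))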