def setup_method(self):
    self.env = DummyDictEnv()
    obs = self.env.reset()
    self.replay_buffer = HerReplayBuffer(
        env_spec=self.env.spec,
        size_in_transitions=3,
        time_horizon=1,
        replay_k=0.4,
        reward_fun=self.env.compute_reward)
    # process observations
    self.d_g = obs['desired_goal']
    self.a_g = obs['achieved_goal']
    self.obs = obs['observation']
def her_ddpg_fetchreach(ctxt=None, seed=1):
    """Train DDPG + HER on the goal-conditioned FetchReach env.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(snapshot_config=ctxt) as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )

        exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                       policy,
                                                       sigma=0.2)

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
        )

        replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100,
                                        replay_k=0.4,
                                        reward_fun=env.compute_reward)

        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            steps_per_epoch=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_policy=exploration_policy,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            buffer_batch_size=256,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, batch_size=100)
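The function above only defines the experiment; it does not launch it. In the garage releases this API comes from, such a function is normally wrapped with wrap_experiment so that the snapshot configuration is passed in as ctxt. The launcher below is a minimal sketch under that assumption; it is not part of the snippet above.

# Hypothetical launcher: assumes `wrap_experiment` is importable from garage
# in this release and that her_ddpg_fetchreach is defined as above.
from garage import wrap_experiment

# wrap_experiment creates the experiment directory and snapshotter, then
# passes them to the function as `ctxt`; calling the wrapped function trains.
launch = wrap_experiment(her_ddpg_fetchreach)
launch(seed=1)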
Example #4
def her_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow HER model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make(env_id)))

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
        )

        exploration_policy = AddOrnsteinUhlenbeckNoise(
            env_spec=env.spec, policy=policy, sigma=hyper_parameters['sigma'])

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=hyper_parameters['replay_buffer_size'],
            time_horizon=hyper_parameters['n_rollout_steps'],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            steps_per_epoch=hyper_parameters['steps_per_epoch'],
            policy_lr=hyper_parameters['policy_lr'],
            qf_lr=hyper_parameters['qf_lr'],
            target_update_tau=hyper_parameters['tau'],
            n_train_steps=hyper_parameters['n_train_steps'],
            discount=hyper_parameters['discount'],
            exploration_policy=exploration_policy,
            policy_optimizer=tf.compat.v1.train.AdamOptimizer,
            qf_optimizer=tf.compat.v1.train.AdamOptimizer,
            buffer_batch_size=256,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_parameters['n_epochs'],
                     batch_size=hyper_parameters['n_rollout_steps'])
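her_garage_tf reads every setting from a module-level hyper_parameters dict that the snippet does not show. The sketch below lists exactly the keys the function uses; the values are illustrative placeholders, not the benchmark's published configuration.

# Illustrative only: keys mirror what her_garage_tf reads above; the values
# are placeholder assumptions rather than the benchmark's actual settings.
hyper_parameters = {
    'policy_hidden_sizes': (256, 256, 256),
    'qf_hidden_sizes': (256, 256, 256),
    'sigma': 0.2,                    # scale of the Ornstein-Uhlenbeck noise
    'replay_buffer_size': int(1e6),
    'n_rollout_steps': 100,          # also used as time_horizon and batch_size
    'steps_per_epoch': 20,
    'policy_lr': 1e-3,
    'qf_lr': 1e-3,
    'tau': 0.05,                     # target-network soft-update coefficient
    'n_train_steps': 40,
    'discount': 0.9,
    'n_epochs': 50,
}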
Example #5
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                        size_in_transitions=int(1e6),
                                        time_horizon=100,
                                        replay_k=0.4,
                                        reward_fun=env.compute_reward)

        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            steps_per_epoch=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, batch_size=100)
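Setting input_include_goal=True on the policy and Q-function (as in the example above) tells the networks to consume a goal-augmented observation instead of the raw dict returned by the goal-conditioned environment. A rough sketch of that convention, not garage's exact internal code:

import numpy as np

# Sketch only: goal-conditioned inputs are conventionally the environment
# observation concatenated with the desired goal.
def augment_observation(obs_dict):
    return np.concatenate([obs_dict['observation'], obs_dict['desired_goal']])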
Example #6
def run_task(*_):
    """Wrap the DDPG + HER training task in the run_task function.

    :param _: Ignored by this function.
    """
    env = TfEnv(gym.make('FetchReach-v1'))

    action_noise = OUStrategy(env.spec, sigma=0.2)

    policy = ContinuousMLPPolicy(
        env_spec=env.spec,
        name="Policy",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        output_nonlinearity=tf.nn.tanh,
        input_include_goal=True,
    )

    qf = ContinuousMLPQFunction(
        env_spec=env.spec,
        name="QFunction",
        hidden_sizes=[256, 256, 256],
        hidden_nonlinearity=tf.nn.relu,
        input_include_goal=True,
    )

    replay_buffer = HerReplayBuffer(env_spec=env.spec,
                                    size_in_transitions=int(1e6),
                                    time_horizon=100,
                                    replay_k=0.4,
                                    reward_fun=env.compute_reward)

    ddpg = DDPG(
        env,
        policy=policy,
        policy_lr=1e-3,
        qf_lr=1e-3,
        qf=qf,
        replay_buffer=replay_buffer,
        plot=False,
        target_update_tau=0.05,
        n_epochs=50,
        n_epoch_cycles=20,
        max_path_length=100,
        n_train_steps=40,
        discount=0.9,
        exploration_strategy=action_noise,
        policy_optimizer=tf.train.AdamOptimizer,
        qf_optimizer=tf.train.AdamOptimizer,
        buffer_batch_size=256,
        input_include_goal=True,
    )

    ddpg.train()
Example #7
def run_task(*_):
    with LocalRunner() as runner:
        env = TfEnv(gym.make('FetchReach-v1'))

        action_noise = OUStrategy(env.spec, sigma=0.2)

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            name='Policy',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            name='QFunction',
            hidden_sizes=[256, 256, 256],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=int(1e6),
            time_horizon=100,
            replay_k=0.4,
            reward_fun=env.compute_reward)

        ddpg = DDPG(
            env_spec=env.spec,
            policy=policy,
            policy_lr=1e-3,
            qf_lr=1e-3,
            qf=qf,
            replay_buffer=replay_buffer,
            target_update_tau=0.05,
            n_epoch_cycles=20,
            max_path_length=100,
            n_train_steps=40,
            discount=0.9,
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        runner.setup(algo=ddpg, env=env)

        runner.train(n_epochs=50, n_epoch_cycles=20)
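The run_task-style entry points above are not meant to be called directly; in the older garage API they are handed to the run_experiment helper, which sets up snapshotting and then invokes run_task. A minimal sketch, assuming that older entry point:

# Hypothetical launcher for the run_task variants above; assumes the older
# garage API where run_experiment configures snapshotting and calls run_task.
from garage.experiment import run_experiment

run_experiment(
    run_task,
    snapshot_mode='last',  # keep only the most recent snapshot
    seed=1,
)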
Example #8
def run_garage(env, seed, log_dir):
    """Create the garage model and run training.

    Replace DDPG with the algorithm you want to benchmark.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log directory path.
    :return: Path to the tabular log file (progress.csv).
    """
    deterministic.set_seed(seed)
    env.reset()

    with LocalRunner() as runner:
        env = TfEnv(normalize(env))

        action_noise = OUStrategy(env.spec, sigma=params['sigma'])

        policy = ContinuousMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=params['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=params['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params['replay_buffer_size'],
            time_horizon=params['n_rollout_steps'],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env_spec=env.spec,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params['policy_lr'],
            qf_lr=params['qf_lr'],
            target_update_tau=params['tau'],
            n_train_steps=params['n_train_steps'],
            discount=params['discount'],
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        logger.add_output(dowel.StdOutput())
        logger.add_output(dowel.CsvOutput(tabular_log_file))
        logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     n_epoch_cycles=params['n_epoch_cycles'],
                     batch_size=params['n_rollout_steps'])

        logger.remove_all()

        return tabular_log_file
Example #9
def run_garage(env, seed, log_dir):
    """
    Create garage model and training.

    Replace the ppo with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return:
    """
    deterministic.set_seed(seed)
    env.reset()

    with LocalRunner() as runner:
        env = TfEnv(env)

        action_noise = OUStrategy(env.spec, sigma=params["sigma"])

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=params["policy_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh,
            input_include_goal=True,
        )

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=params["qf_hidden_sizes"],
            hidden_nonlinearity=tf.nn.relu,
            input_include_goal=True,
        )

        replay_buffer = HerReplayBuffer(
            env_spec=env.spec,
            size_in_transitions=params["replay_buffer_size"],
            time_horizon=params["n_rollout_steps"],
            replay_k=0.4,
            reward_fun=env.compute_reward,
        )

        algo = DDPG(
            env,
            policy=policy,
            qf=qf,
            replay_buffer=replay_buffer,
            policy_lr=params["policy_lr"],
            qf_lr=params["qf_lr"],
            plot=False,
            target_update_tau=params["tau"],
            n_epochs=params["n_epochs"],
            n_epoch_cycles=params["n_epoch_cycles"],
            n_train_steps=params["n_train_steps"],
            discount=params["discount"],
            exploration_strategy=action_noise,
            policy_optimizer=tf.train.AdamOptimizer,
            qf_optimizer=tf.train.AdamOptimizer,
            buffer_batch_size=256,
            input_include_goal=True,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, "progress.csv")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(log_dir)

        runner.setup(algo, env)
        runner.train(
            n_epochs=params['n_epochs'],
            n_epoch_cycles=params['n_epoch_cycles'],
            batch_size=params["n_rollout_steps"])

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file
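Like hyper_parameters above, the params dict used by both run_garage benchmark variants is defined elsewhere. The keys below are the ones the code actually reads; the values are placeholder assumptions rather than the benchmark's published configuration.

# Illustrative only: keys mirror what the run_garage functions read above.
params = {
    'policy_hidden_sizes': (256, 256, 256),
    'qf_hidden_sizes': (256, 256, 256),
    'sigma': 0.2,                    # scale of the Ornstein-Uhlenbeck noise
    'replay_buffer_size': int(1e6),
    'n_rollout_steps': 100,          # time_horizon and sampler batch size
    'policy_lr': 1e-3,
    'qf_lr': 1e-3,
    'tau': 0.05,                     # target-network soft-update coefficient
    'n_train_steps': 40,
    'discount': 0.9,
    'n_epochs': 50,
    'n_epoch_cycles': 20,
}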
Example #10
class TestHerReplayBuffer:
    def setup_method(self):
        self.env = DummyDictEnv()
        self.obs = self.env.reset()
        self.replay_buffer = HerReplayBuffer(
            env_spec=self.env.spec,
            size_in_transitions=3,
            time_horizon=1,
            replay_k=0.4,
            reward_fun=self.env.compute_reward)

    def _add_single_transition(self):
        self.replay_buffer.add_transition(
            observation=self.obs,
            action=self.env.action_space.sample(),
            terminal=False,
            next_observation=self.obs)

    def _add_transitions(self):
        self.replay_buffer.add_transitions(
            observation=[self.obs],
            action=[self.env.action_space.sample()],
            terminal=[False],
            next_observation=[self.obs])

    def test_add_transition_dtype(self):
        self._add_single_transition()
        sample = self.replay_buffer.sample(1)

        assert sample['observation'].dtype == self.env.observation_space[
            'observation'].dtype
        assert sample['achieved_goal'].dtype == self.env.observation_space[
            'achieved_goal'].dtype
        assert sample['goal'].dtype == self.env.observation_space[
            'desired_goal'].dtype
        assert sample['action'].dtype == self.env.action_space.dtype

    def test_add_transitions_dtype(self):
        self._add_transitions()
        sample = self.replay_buffer.sample(1)

        assert sample['observation'].dtype == self.env.observation_space[
            'observation'].dtype
        assert sample['achieved_goal'].dtype == self.env.observation_space[
            'achieved_goal'].dtype
        assert sample['goal'].dtype == self.env.observation_space[
            'desired_goal'].dtype
        assert sample['action'].dtype == self.env.action_space.dtype

    def test_eviction_policy(self):
        self.replay_buffer.add_transitions(
            observation=[self.obs, self.obs],
            next_observation=[self.obs, self.obs],
            terminal=[False, False],
            action=[1, 2])
        assert not self.replay_buffer.full
        self.replay_buffer.add_transitions(
            observation=[self.obs, self.obs],
            next_observation=[self.obs, self.obs],
            terminal=[False, False],
            action=[3, 4])
        assert self.replay_buffer.full
        self.replay_buffer.add_transitions(
            observation=[self.obs, self.obs],
            next_observation=[self.obs, self.obs],
            terminal=[False, False],
            action=[5, 6])
        self.replay_buffer.add_transitions(
            observation=[self.obs, self.obs],
            next_observation=[self.obs, self.obs],
            terminal=[False, False],
            action=[7, 8])

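        # Capacity is 3 transitions and episodes wrap around ring-buffer
        # style, so only the three most recently added actions (6, 7, 8)
        # remain, stored in wrap-around order.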
        assert np.array_equal(self.replay_buffer._buffer['action'],
                              [[7], [8], [6]])
        assert self.replay_buffer.n_transitions_stored == 3

    def test_pickleable(self):
        self._add_transitions()
        replay_buffer_pickled = pickle.loads(pickle.dumps(self.replay_buffer))
        assert replay_buffer_pickled._buffer.keys(
        ) == self.replay_buffer._buffer.keys()
        for k in replay_buffer_pickled._buffer:
            assert replay_buffer_pickled._buffer[
                k].shape == self.replay_buffer._buffer[k].shape
        sample = self.replay_buffer.sample(1)
        sample2 = replay_buffer_pickled.sample(1)
        for k in self.replay_buffer._buffer:
            assert sample[k].shape == sample2[k].shape
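Every snippet on this page passes replay_k=0.4. In the HER sampling scheme these buffers follow (as in the original HER implementation), replay_k is the ratio of relabeled to original transitions, so the probability that a sampled transition has its goal replaced by a future achieved goal is 1 - 1/(1 + replay_k). A quick sanity-check sketch:

# Relation between replay_k and the HER relabeling probability; shown only to
# make the replay_k=0.4 setting concrete, not taken from the buffer's code.
def her_relabel_probability(replay_k):
    """Probability that a sampled transition gets a relabeled (future) goal."""
    return 1.0 - 1.0 / (1.0 + replay_k)

assert abs(her_relabel_probability(0.4) - 2.0 / 7.0) < 1e-12  # ~0.29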
Example #11
    def _initialize(self):
        with tf.name_scope(self.name, "DDPG"):
            with tf.name_scope("setup_networks"):
                """Set up the actor, critic and target network."""
                # Set up the actor and critic network
                self.actor._build_net(trainable=True)
                self.critic._build_net(trainable=True)

                # Create target actor and critic network
                target_actor = copy(self.actor)
                target_critic = copy(self.critic)

                # Set up the target network
                target_actor.name = "TargetActor"
                target_actor._build_net(trainable=False)
                target_critic.name = "TargetCritic"
                target_critic._build_net(trainable=False)

            input_shapes = dims_to_shapes(self.input_dims)

            # Initialize replay buffer
            if self.use_her:
                buffer_shapes = {
                    key: (self.n_rollout_steps + 1
                          if key == "observation" or key == "achieved_goal"
                          else self.n_rollout_steps, *input_shapes[key])
                    for key, val in input_shapes.items()
                }

                replay_buffer = HerReplayBuffer(
                    buffer_shapes=buffer_shapes,
                    size_in_transitions=self.replay_buffer_size,
                    time_horizon=self.n_rollout_steps,
                    sample_transitions=make_her_sample(
                        self.replay_k, self.env.compute_reward))
            else:
                replay_buffer = ReplayBuffer(
                    buffer_shapes=input_shapes,
                    max_buffer_size=self.replay_buffer_size)

            # Set up target init and update function
            with tf.name_scope("setup_target"):
                actor_init_ops, actor_update_ops = get_target_ops(
                    self.actor.global_vars, target_actor.global_vars, self.tau)
                critic_init_ops, critic_update_ops = get_target_ops(
                    self.critic.global_vars, target_critic.global_vars,
                    self.tau)
                target_init_op = actor_init_ops + critic_init_ops
                target_update_op = actor_update_ops + critic_update_ops

            f_init_target = tensor_utils.compile_function(
                inputs=[], outputs=target_init_op)
            f_update_target = tensor_utils.compile_function(
                inputs=[], outputs=target_update_op)

            with tf.name_scope("inputs"):
                obs_dim = (
                    self.input_dims["observation"] + self.input_dims["goal"]
                ) if self.use_her else self.input_dims["observation"]
                y = tf.placeholder(tf.float32, shape=(None, 1), name="input_y")
                obs = tf.placeholder(
                    tf.float32,
                    shape=(None, obs_dim),
                    name="input_observation")
                actions = tf.placeholder(
                    tf.float32,
                    shape=(None, self.input_dims["action"]),
                    name="input_action")

            # Set up actor training function
            next_action = self.actor.get_action_sym(obs, name="actor_action")
            next_qval = self.critic.get_qval_sym(
                obs, next_action, name="actor_qval")
            with tf.name_scope("action_loss"):
                action_loss = -tf.reduce_mean(next_qval)
                if self.actor_weight_decay > 0.:
                    actor_reg = tc.layers.apply_regularization(
                        tc.layers.l2_regularizer(self.actor_weight_decay),
                        weights_list=self.actor.regularizable_vars)
                    action_loss += actor_reg

            with tf.name_scope("minimize_action_loss"):
                actor_train_op = self.actor_optimizer(
                    self.actor_lr, name="ActorOptimizer").minimize(
                        action_loss, var_list=self.actor.trainable_vars)

            f_train_actor = tensor_utils.compile_function(
                inputs=[obs], outputs=[actor_train_op, action_loss])

            # Set up critic training function
            qval = self.critic.get_qval_sym(obs, actions, name="q_value")
            with tf.name_scope("qval_loss"):
                qval_loss = tf.reduce_mean(tf.squared_difference(y, qval))
                if self.critic_weight_decay > 0.:
                    critic_reg = tc.layers.apply_regularization(
                        tc.layers.l2_regularizer(self.critic_weight_decay),
                        weights_list=self.critic.regularizable_vars)
                    qval_loss += critic_reg

            with tf.name_scope("minimize_critic_loss"):
                critic_train_op = self.critic_optimizer(
                    self.critic_lr, name="CriticOptimizer").minimize(
                        qval_loss, var_list=self.critic.trainable_vars)

            f_train_critic = tensor_utils.compile_function(
                inputs=[y, obs, actions],
                outputs=[critic_train_op, qval_loss, qval])

            self.f_train_actor = f_train_actor
            self.f_train_critic = f_train_critic
            self.f_init_target = f_init_target
            self.f_update_target = f_update_target
            self.replay_buffer = replay_buffer
            self.target_critic = target_critic
            self.target_actor = target_actor
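get_target_ops is not shown in this snippet, but in this style of DDPG setup the init ops hard-copy the online weights into the target networks and the update ops apply a Polyak (soft) update with coefficient tau. A minimal TF1-style sketch of such a helper, under that assumption:

import tensorflow as tf

# Hedged sketch of a get_target_ops-style helper: hard-copy init ops plus
# soft-update ops (target <- tau * online + (1 - tau) * target).
def get_target_ops_sketch(online_vars, target_vars, tau):
    init_ops = [tf.assign(t, o) for o, t in zip(online_vars, target_vars)]
    update_ops = [
        tf.assign(t, tau * o + (1.0 - tau) * t)
        for o, t in zip(online_vars, target_vars)
    ]
    return init_ops, update_ops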