def setup_staging_areas(self):
    for idx, device in enumerate(self._devices):
        with tf.device(device):
            inputs = self._input.get_input_tensors()
            dtypes = [x.dtype for x in inputs]
            stage = StagingArea(dtypes, shapes=None)
            self._stage_ops.append(stage.put(inputs))
            self._areas.append(stage)
            outputs = stage.get()
            for vin, vout in zip(inputs, outputs):
                vout.set_shape(vin.get_shape())
            self._unstage_ops.append(outputs)
def stage_data(self, batch, memory_gb=1, n_threads=4):
    """Stage `batch` on the GPU via a StagingArea that is kept filled by a queue runner."""
    with tf.device('/gpu:0'):
        dtypes = [t.dtype for t in batch]
        shapes = [t.get_shape() for t in batch]
        SA = StagingArea(dtypes, shapes=shapes, memory_limit=memory_gb * 1e9)
        get, put, clear = SA.get(), SA.put(batch), SA.clear()
        tf.train.add_queue_runner(
            tf.train.QueueRunner(queue=SA, enqueue_ops=[put] * n_threads,
                                 close_op=clear, cancel_op=clear))
    return get
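# Hedged usage sketch, not part of the original code: a self-contained TF 1.x
# example of the same StagingArea-plus-QueueRunner pattern used by stage_data()
# above. The toy input tensors, the capacity bound, and the step count are
# assumptions for illustration only.
import tensorflow as tf
from tensorflow.contrib.staging import StagingArea

# stand-in "input pipeline": two tensors that yield new values on every put
batch = [tf.random_uniform([32, 8]), tf.random_uniform([32, 1])]

with tf.device('/gpu:0'):
    sa = StagingArea([t.dtype for t in batch],
                     shapes=[t.get_shape() for t in batch],
                     capacity=4)              # bounded so the runner cannot race ahead
    put_op = sa.put(batch)
    features, labels = sa.get()

# register a runner that keeps the staging area filled in the background
tf.train.add_queue_runner(
    tf.train.QueueRunner(queue=sa, enqueue_ops=[put_op],
                         close_op=sa.clear(), cancel_op=sa.clear()))

with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for _ in range(10):
        f, l = sess.run([features, labels])   # consumes one pre-staged batch
    coord.request_stop()
    coord.join(threads)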
def setup_staging_areas(self):
    for idx, device in enumerate(self._devices):
        with tf.device(device):
            inputs = self._input.get_input_tensors()
            dtypes = [x.dtype for x in inputs]
            stage = StagingArea(dtypes, shapes=None)
            self._stage_ops.append(stage.put(inputs))
            self._areas.append(stage)
            outputs = stage.get()
            if isinstance(outputs, tf.Tensor):  # when size=1, TF doesn't return a list
                outputs = [outputs]
            for vin, vout in zip(inputs, outputs):
                vout.set_shape(vin.get_shape())
            self._unstage_ops.append(outputs)
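# Hedged sketch (assumption, not from the original repo): how the per-device
# stage/unstage ops built by setup_staging_areas() are typically driven. The
# areas are pre-filled once, then every training step refills them while the
# train op consumes the previously staged batch, keeping each device one step
# ahead. `stage_ops` and `train_op` are assumed names.
import tensorflow as tf

class StagingDriver(object):
    def __init__(self, sess, stage_ops, train_op):
        self._sess = sess
        self._prefill_op = tf.group(*stage_ops)                 # put one batch per device
        self._train_and_refill = tf.group(train_op, *stage_ops) # consume + re-stage together

    def prefill(self):
        self._sess.run(self._prefill_op)   # run once before the training loop

    def step(self):
        self._sess.run(self._train_and_refill)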
def _prepare_staging(self):
    with tf.variable_scope('staging', reuse=tf.AUTO_REUSE):
        staging_area_tf = StagingArea(
            dtypes=[tf.float32 for _ in self._stage_shapes.keys()],
            shapes=[(None, *shape) for shape in self._stage_shapes.values()])
        input_ph_tf = [
            tf.placeholder(tf.float32, shape=(None, *shape))
            for shape in self._stage_shapes.values()
        ]
        staging_op_tf = staging_area_tf.put(input_ph_tf)
        batch_tf = OrderedDict([
            (key, batch_item) for key, batch_item in zip(
                self._stage_shapes.keys(), staging_area_tf.get())
        ])
    return staging_area_tf, input_ph_tf, staging_op_tf, batch_tf
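# Hedged usage sketch (hypothetical helper, not from the source): pushing one
# numpy batch through the placeholders returned by _prepare_staging() and then
# reading the staged values back out of batch_tf.
def stage_and_fetch(sess, input_ph_tf, staging_op_tf, batch_tf, numpy_batch):
    # enqueue the batch onto the staging area ...
    sess.run(staging_op_tf, feed_dict=dict(zip(input_ph_tf, numpy_batch)))
    # ... any op built from batch_tf (or batch_tf itself) then dequeues it
    return sess.run(dict(batch_tf))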
class DDPG_PDDL(Policy): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, n_preds, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ Policy.__init__(self, input_dims, T, rollout_batch_size, **kwargs) self.hidden = hidden self.layers = layers self.max_u = max_u self.network_class = network_class self.sample_transitions = sample_transitions self.scope = scope self.subtract_goals = subtract_goals self.relative_goals = relative_goals self.clip_obs = clip_obs self.Q_lr = Q_lr self.pi_lr = pi_lr self.batch_size = batch_size self.buffer_size = buffer_size self.clip_pos_returns = clip_pos_returns self.gamma = gamma self.polyak = polyak self.clip_return = clip_return self.norm_eps = norm_eps self.norm_clip = norm_clip self.action_l2 = action_l2 self.n_preds = n_preds self.rep_lr = Q_lr if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) self.rep_network = import_function(kwargs['rep_network_class']) # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. 
buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *self.input_shapes[key]) for key, val in self.input_shapes.items()} buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T+1, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) # Creat rep. network with tf.variable_scope(self.scope): self._create_rep_network(reuse=reuse) self.obs2preds_buffer = Obs2PredsBuffer(buffer_len=2000) def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False, exploit=True): noise_eps = noise_eps if not exploit else 0. random_eps = random_eps if not exploit else 0. o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch(episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _sync_rep_optimizers(self): self.rep_adam.sync() # self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad def _update(self, Q_grad, pi_grad): import os # print("PID: {}. 
Updating AC.".format(os.getpid())) self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): transitions = self.buffer.sample(self.batch_size) o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): if stage: self.stage_batch() critic_loss, actor_loss, Q_grad, pi_grad = self._grads() self._update(Q_grad, pi_grad) return critic_loss, actor_loss def train_representation(self): rep_batch_size = 64 batch = self.obs2preds_buffer.sample_batch(rep_batch_size) indexes = batch['indexes'] feed_dict = {self.obs2preds_model.inputs_o: batch['obs'], self.obs2preds_model.inputs_g: batch['goals'], self.obs2preds_model.preds: batch['preds']} rep_grad = self.sess.run([self.rep_grad_tf], feed_dict=feed_dict)[0] self.rep_adam.update(rep_grad, self.rep_lr) # opti_res, celoss, celosses = self.sess.run([self.obs2preds_model.optimizer, # self.obs2preds_model.celoss, # self.obs2preds_model.celosses], # feed_dict=feed_dict) # # celosses = np.mean(celosses, axis=-1) _, celosses_after = self.predict_representation(batch) celoss = np.mean(celosses_after) return celoss, celosses_after, indexes def predict_representation(self, batch): feed_dict = {self.obs2preds_model.inputs_o: batch['obs'], self.obs2preds_model.inputs_g: batch['goals']} pred_dist = self.sess.run([self.obs2preds_model.prob_out], feed_dict=feed_dict) losses = None if 'preds' in batch: preds = batch['preds'] if len(preds.shape) != 3: preds_probdist = np.zeros(shape=[preds.shape[0], preds.shape[1], 2]) for j,p in enumerate(preds): for i, v in enumerate(p): preds_probdist[j][i][int(v)] = 1 preds = preds_probdist feed_dict.update({self.obs2preds_model.preds: preds}) pred_dist, loss = self.sess.run([self.obs2preds_model.prob_out, self.obs2preds_model.celosses], feed_dict=feed_dict) loss = np.mean(loss, axis=-1) losses = np.reshape(loss,newshape=(preds.shape[0])) preds = prob_dist2discrete(pred_dist) return preds, losses def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_rep_network(self, reuse=False): self.obs2preds_model = self.rep_network(self.n_preds, self.dimo, self.dimg) self.rep_loss_tf = tf.reduce_mean(self.obs2preds_model.celoss) rep_grads_tf = tf.gradients(self.rep_loss_tf, self._vars('obs2preds')) self.rep_grad_tf = flatten_grads(grads=rep_grads_tf, var_list=self._vars('obs2preds')) self.rep_adam = MpiAdam(self._vars('obs2preds'), scale_grad_by_procs=False) self._sync_rep_optimizers() def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." 
% (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix is not '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. 
""" # [print(key, ": ", item) for key,item in self.__dict__.items()] excluded_subnames = ['_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic', 'obs2preds_buffer', 'obs2preds_model'] state = {k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames])} state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name and 'obs2preds_buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name and 'obs2preds_buffer' not in x.name] assert(len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node)
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class_actor_critic, network_class_discriminator, polyak, batch_size, Q_lr, pi_lr, mi_lr, sk_lr, r_scale, mi_r_scale, sk_r_scale, et_r_scale, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, env_name, max_timesteps, pretrain_weights, finetune_pi, mi_prioritization, sac, reuse=False, history_len=10000, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function( self.network_class_actor_critic) self.create_discriminator = import_function( self.network_class_discriminator) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimz = self.input_dims['z'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] self.env_name = env_name # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) stage_shapes['w'] = (None, ) stage_shapes['m'] = (None, ) stage_shapes['s'] = (None, ) stage_shapes['m_w'] = () stage_shapes['s_w'] = () stage_shapes['r_w'] = () stage_shapes['e_w'] = () self.stage_shapes = stage_shapes # Create network. 
with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(pretrain_weights, mi_prioritization, reuse=reuse) # Configure the replay buffer. buffer_shapes = { key: (self.T if key != 'o' else self.T + 1, *input_shapes[key]) for key, val in input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T + 1, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions, mi_prioritization) self.mi_r_history = deque(maxlen=history_len) self.gl_r_history = deque(maxlen=history_len) self.sk_r_history = deque(maxlen=history_len) self.et_r_history = deque(maxlen=history_len) self.mi_current = 0 self.finetune_pi = finetune_pi def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def get_actions(self, o, z, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute if self.sac: vals = [policy.mu_tf] else: vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.z_tf: z.reshape(-1, self.dimz), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn( *u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * ( self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ # update the mutual information reward into the episode batch episode_batch['m'] = np.empty([episode_batch['o'].shape[0], 1]) episode_batch['s'] = np.empty([episode_batch['o'].shape[0], 1]) # # self.buffer.store_episode(episode_batch, self) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode_batch) transitions = self.sample_transitions(self, False, episode_batch, num_normalizing_transitions, 0, 0, 0) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions[ 'g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def 
_sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() self.mi_adam.sync() self.sk_adam.sync() def _grads_mi(self, data): mi, mi_grad = self.sess.run([ self.main_ir.mi_tf, self.mi_grad_tf, ], feed_dict={self.o_tau_tf: data}) return mi, mi_grad def _grads_sk(self, o_s_batch, z_s_batch): sk, sk_grad = self.sess.run([ self.main_ir.sk_tf, self.sk_grad_tf, ], feed_dict={ self.main_ir.o_tf: o_s_batch, self.main_ir.z_tf: z_s_batch }) return sk, sk_grad def _grads(self): critic_loss, actor_loss, Q_grad, pi_grad, neg_logp_pi, e_w = self.sess.run( [ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf, self.main.neg_logp_pi_tf, self.e_w_tf, ]) return critic_loss, actor_loss, Q_grad, pi_grad, neg_logp_pi, e_w def _update_mi(self, mi_grad): self.mi_adam.update(mi_grad, self.mi_lr) def _update_sk(self, sk_grad): self.sk_adam.update(sk_grad, self.sk_lr) def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self, ir, t): transitions = self.buffer.sample(self, ir, self.batch_size, self.mi_r_scale, self.sk_r_scale, t) weights = np.ones_like(transitions['r']).copy() if ir: self.mi_r_history.extend( ((np.clip((self.mi_r_scale * transitions['m']), *(0, 1)) - (1 if not self.mi_r_scale == 0 else 0)) * transitions['m_w']).tolist()) self.sk_r_history.extend( ((np.clip(self.sk_r_scale * transitions['s'], *(-1, 0))) * 1.00).tolist()) self.gl_r_history.extend(self.r_scale * transitions['r']) o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o_2, ag_2, g) transitions['w'] = weights.flatten().copy() # note: ordered dict transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] return transitions_batch def stage_batch(self, ir, t, batch=None): if batch is None: batch = self.sample_batch(ir, t) assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def run_mi(self, o_s): feed_dict = {self.o_tau_tf: o_s.copy()} neg_l = self.sess.run(self.main_ir.mi_tf, feed_dict=feed_dict) return neg_l def run_sk(self, o, z): feed_dict = {self.main_ir.o_tf: o, self.main_ir.z_tf: z} sk_r = self.sess.run(self.main_ir.sk_r_tf, feed_dict=feed_dict) return sk_r def train_mi(self, data, stage=True): mi, mi_grad = self._grads_mi(data) self._update_mi(mi_grad) self.mi_current = -mi.mean() return -mi.mean() def train_sk(self, o_s_batch, z_s_batch, stage=True): sk, sk_grad = self._grads_sk(o_s_batch, z_s_batch) self._update_sk(sk_grad) return -sk.mean() def train(self, t, stage=True): if not self.buffer.current_size == 0: if stage: self.stage_batch(ir=True, t=t) critic_loss, actor_loss, Q_grad, pi_grad, neg_logp_pi, e_w = self._grads( ) self._update(Q_grad, pi_grad) self.et_r_history.extend(((np.clip( (self.et_r_scale * neg_logp_pi), *(-1, 0))) * e_w).tolist()) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, 
pretrain_weights, mi_prioritization, reuse=False): if self.sac: logger.info("Creating a SAC agent with action space %d x %s..." % (self.dimu, self.max_u)) else: logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) batch_tf['w'] = tf.reshape(batch_tf['w'], [-1, 1]) batch_tf['m'] = tf.reshape(batch_tf['m'], [-1, 1]) batch_tf['s'] = tf.reshape(batch_tf['s'], [-1, 1]) self.o_tau_tf = tf.placeholder(tf.float32, shape=(None, None, self.dimo)) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # intrinsic reward (ir) network for mutual information with tf.variable_scope('ir') as vs: if reuse: vs.reuse_variables() self.main_ir = self.create_discriminator(batch_tf, net_type='ir', **self.__dict__) vs.reuse_variables() # loss functions mi_grads_tf = tf.gradients(tf.reduce_mean(self.main_ir.mi_tf), self._vars('ir/state_mi')) assert len(self._vars('ir/state_mi')) == len(mi_grads_tf) self.mi_grads_vars_tf = zip(mi_grads_tf, self._vars('ir/state_mi')) self.mi_grad_tf = flatten_grads(grads=mi_grads_tf, var_list=self._vars('ir/state_mi')) self.mi_adam = MpiAdam(self._vars('ir/state_mi'), scale_grad_by_procs=False) sk_grads_tf = tf.gradients(tf.reduce_mean(self.main_ir.sk_tf), self._vars('ir/skill_ds')) assert len(self._vars('ir/skill_ds')) == len(sk_grads_tf) self.sk_grads_vars_tf = zip(sk_grads_tf, self._vars('ir/skill_ds')) self.sk_grad_tf = flatten_grads(grads=sk_grads_tf, var_list=self._vars('ir/skill_ds')) self.sk_adam = MpiAdam(self._vars('ir/skill_ds'), scale_grad_by_procs=False) target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, self.clip_return if self.clip_pos_returns else np.inf) self.e_w_tf = batch_tf['e_w'] if not self.sac: self.main.neg_logp_pi_tf = tf.zeros(1) target_tf = tf.clip_by_value( self.r_scale * batch_tf['r'] * batch_tf['r_w'] + (tf.clip_by_value(self.mi_r_scale * batch_tf['m'], *(0, 1)) - (1 if not self.mi_r_scale == 0 else 0)) * batch_tf['m_w'] + (tf.clip_by_value(self.sk_r_scale * batch_tf['s'], *(-1, 0))) * batch_tf['s_w'] + (tf.clip_by_value(self.et_r_scale * self.main.neg_logp_pi_tf, *(-1, 0))) * self.e_w_tf + self.gamma * target_Q_pi_tf, *clip_range) self.td_error_tf = tf.stop_gradient(target_tf) - self.main.Q_tf self.errors_tf = tf.square(self.td_error_tf) self.errors_tf = tf.reduce_mean(batch_tf['w'] * self.errors_tf) self.Q_loss_tf = tf.reduce_mean(self.errors_tf) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) 
self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') # polyak averaging self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() if pretrain_weights: load_weight(self.sess, pretrain_weights, ['state_mi']) if self.finetune_pi: load_weight(self.sess, pretrain_weights, ['main']) self._sync_optimizers() if pretrain_weights and self.finetune_pi: load_weight(self.sess, pretrain_weights, ['target']) else: self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] logs += [('mi_reward/mean', np.mean(self.mi_r_history))] logs += [('mi_reward/std', np.std(self.mi_r_history))] logs += [('mi_reward/max', np.max(self.mi_r_history))] logs += [('mi_reward/min', np.min(self.mi_r_history))] logs += [('mi_train/-neg_l', self.mi_current)] logs += [('sk_reward/mean', np.mean(self.sk_r_history))] logs += [('sk_reward/std', np.std(self.sk_r_history))] logs += [('sk_reward/max', np.max(self.sk_r_history))] logs += [('sk_reward/min', np.min(self.sk_r_history))] logs += [('et_reward/mean', np.mean(self.et_r_history))] logs += [('et_reward/std', np.std(self.et_r_history))] logs += [('et_reward/max', np.max(self.et_r_history))] logs += [('et_reward/min', np.min(self.et_r_history))] logs += [('gl_reward/mean', np.mean(self.gl_r_history))] logs += [('gl_reward/std', np.std(self.gl_r_history))] logs += [('gl_reward/max', np.max(self.gl_r_history))] logs += [('gl_reward/min', np.min(self.gl_r_history))] if prefix is not '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. 
""" excluded_subnames = [ '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'sample_transitions', 'stage_shapes', 'create_actor_critic', 'create_discriminator', '_history' ] state = { k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames]) } state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None if 'env_name' not in state: state['env_name'] = 'FetchPickAndPlace-v1' if 'network_class_discriminator' not in state: state[ 'network_class_discriminator'] = 'baselines.her.discriminator:Discriminator' if 'mi_r_scale' not in state: state['mi_r_scale'] = 1 if 'mi_lr' not in state: state['mi_lr'] = 0.001 if 'sk_r_scale' not in state: state['sk_r_scale'] = 1 if 'sk_lr' not in state: state['sk_lr'] = 0.001 if 'et_r_scale' not in state: state['et_r_scale'] = 1 if 'finetune_pi' not in state: state['finetune_pi'] = None if 'no_train_mi' not in state: state['no_train_mi'] = None if 'load_weight' not in state: state['load_weight'] = None if 'pretrain_weights' not in state: state['pretrain_weights'] = None if 'mi_prioritization' not in state: state['mi_prioritization'] = None if 'sac' not in state: state['sac'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert (len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node)
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, temperature, prioritization, env_name, alpha, beta0, beta_iters, eps, max_timesteps, rank_method, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] self.prioritization = prioritization self.env_name = env_name self.temperature = temperature self.rank_method = rank_method # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) stage_shapes['w'] = (None, ) self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. 
buffer_shapes = { key: (self.T if key != 'o' else self.T + 1, *input_shapes[key]) for key, val in input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T + 1, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size if self.prioritization == 'entropy': self.buffer = ReplayBufferEntropy(buffer_shapes, buffer_size, self.T, self.sample_transitions, self.prioritization, self.env_name) elif self.prioritization == 'tderror': self.buffer = PrioritizedReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions, alpha, self.env_name) if beta_iters is None: beta_iters = max_timesteps self.beta_schedule = LinearSchedule(beta_iters, initial_p=beta0, final_p=1.0) else: self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn( *u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * ( self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def get_td_errors(self, o, g, u): o, g = self._preprocess_og(o, g, g) vals = [self.td_error_tf] r = np.ones((o.reshape(-1, self.dimo).shape[0], 1)) feed = { self.target.o_tf: o.reshape(-1, self.dimo), self.target.g_tf: g.reshape(-1, self.dimg), self.bath_tf_r: r, self.main.o_tf: o.reshape(-1, self.dimo), self.main.g_tf: g.reshape(-1, self.dimg), self.main.u_tf: u.reshape(-1, self.dimu) } td_errors = self.sess.run(vals, feed_dict=feed) td_errors = td_errors.copy() return td_errors def fit_density_model(self): self.buffer.fit_density_model() def store_episode(self, episode_batch, dump_buffer, rank_method, epoch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ if self.prioritization == 'tderror': self.buffer.store_episode(episode_batch, dump_buffer) elif self.prioritization == 'entropy': self.buffer.store_episode(episode_batch, rank_method, epoch) else: self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode_batch) if self.prioritization == 'entropy': if not self.buffer.current_size == 0 and not len( episode_batch['ag']) == 0: transitions = self.sample_transitions( episode_batch, num_normalizing_transitions, 'none', 
1.0, True) elif self.prioritization == 'tderror': transitions, weights, episode_idxs = \ self.sample_transitions(self.buffer, episode_batch, num_normalizing_transitions, beta=0) else: transitions = self.sample_transitions( episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions[ 'g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def dump_buffer(self, epoch): self.buffer.dump_buffer(epoch) def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! critic_loss, actor_loss, Q_grad, pi_grad, td_error = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf, self.td_error_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad, td_error def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self, t): if self.prioritization == 'entropy': transitions = self.buffer.sample(self.batch_size, self.rank_method, temperature=self.temperature) weights = np.ones_like(transitions['r']).copy() elif self.prioritization == 'tderror': transitions, weights, idxs = self.buffer.sample( self.batch_size, beta=self.beta_schedule.value(t)) else: transitions = self.buffer.sample(self.batch_size) weights = np.ones_like(transitions['r']).copy() o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o_2, ag_2, g) transitions['w'] = weights.flatten().copy() # note: ordered dict transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] if self.prioritization == 'tderror': return (transitions_batch, idxs) else: return transitions_batch def stage_batch(self, t, batch=None): if batch is None: if self.prioritization == 'tderror': batch, idxs = self.sample_batch(t) else: batch = self.sample_batch(t) assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) if self.prioritization == 'tderror': return idxs def train(self, t, dump_buffer, stage=True): if not self.buffer.current_size == 0: if stage: if self.prioritization == 'tderror': idxs = self.stage_batch(t) else: self.stage_batch(t) critic_loss, actor_loss, Q_grad, pi_grad, td_error = self._grads() if self.prioritization == 'tderror': new_priorities = np.abs(td_error) + self.eps # td_error if dump_buffer: T = self.buffer.buffers['u'].shape[1] episode_idxs = idxs // T t_samples = idxs % T batch_size = td_error.shape[0] with self.buffer.lock: for i in range(batch_size): self.buffer.buffers['td'][episode_idxs[i]][ t_samples[i]] = td_error[i] self.buffer.update_priorities(idxs, new_priorities) self._update(Q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return 
res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) batch_tf['w'] = tf.reshape(batch_tf['w'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.td_error_tf = tf.stop_gradient(target_tf) - self.main.Q_tf self.errors_tf = tf.square(self.td_error_tf) self.errors_tf = tf.reduce_mean(batch_tf['w'] * self.errors_tf) self.Q_loss_tf = tf.reduce_mean(self.errors_tf) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix is not '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = [ '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic' ] state = { k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames]) } state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None state['env_name'] = None # No need for playing the policy self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert (len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node)
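# Hedged sketch of the 'tderror' prioritization step performed in train() above:
# absolute TD errors plus a small eps become the new sampling priorities that are
# written back with buffer.update_priorities(idxs, new_priorities).
import numpy as np

def priorities_from_td(td_error, eps=1e-6):
    return np.abs(td_error) + eps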
class DDPG(object): @store_args def __init__(self, FLAGS, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, bc_loss, q_filter, num_demo, demo_batch_size, prm_loss_weight, aux_loss_weight, # sample_transitions, gamma, reuse=False, **kwargs): sample_transitions, gamma, td3_policy_freq, td3_policy_noise, td3_noise_clip, reuse=False, *agent_params, **kwargs): ## """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Added functionality to use demonstrations for training to Overcome exploration problem. Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused bc_loss: whether or not the behavior cloning loss should be used as an auxilliary loss q_filter: whether or not a filter on the q value update should be used when training with demonstartions num_demo: Number of episodes in to be used in the demonstration buffer demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread prm_loss_weight: Weight corresponding to the primary loss aux_loss_weight: Weight corresponding to the auxilliary loss also called the cloning loss agent_params: for HAC agent params """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] # self.dimo1= self.input_dims['o1'] ##A.R add for TD3 (has obs0, obs1) self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] #추가된 내용 #parameters for using TD3 variant of DDPG #https://arxiv.org/abs/1802.09477 self.td3_policy_freq = td3_policy_freq self.td3_policy_noise = td3_policy_noise self.td3_noise_clip = td3_noise_clip ## for HAC self.FLAGS = FLAGS # Prepare staging area for feeding data to the model. 
stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: # for key in ['o', 'o1', 'g']: #o1 added by A.R stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None,) self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. buffer_shapes = {key: (self.T-1 if key != 'o' else self.T, *input_shapes[key]) # origin : buffer_shapes = {key: (self.T-1 if key != 'o' else self.T, *input_shapes[key]) # buffer_shapes = {key: (self.T-1 if key != 'o' and key != 'o1' else self.T, *input_shapes[key]) #A.Rㅇ for key, val in input_shapes.items()} buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) global DEMO_BUFFER DEMO_BUFFER = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) #initialize the demo buffer; in the same way as the primary data buffer print("@ ddgp.py , buffer={}".format(self.buffer)) # self.meta_controller = DDPG(self.dimo + self.dimg, self.dimo, self.clip_obs) # ## # self.low_replay_buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) # self.high_replay_buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) # ## def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): # def _preprocess_og(self, o, o1, ag, g): #A.R if self.relative_goals: ## goal reshape 해주는 곳. ag vs g..흠 g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) #상대적인 골로 만들어 주는구나?.. ''' def simple_goal_subtract(a, b): assert a.shape == b.shape return a - b ''' g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) # o1 = np.clip(o1, -self.clip_obs, self.clip_obs) #A.R g = np.clip(g, -self.clip_obs, self.clip_obs) # return o, o1, g return o, g def step(self, obs): # FLAGS = FLAGS actions = self.get_actions(obs['observation'], obs['achieved_goal'], obs['desired_goal']) # actions = self.get_actions(obs['observation'], obs['achieved_goal'], obs['desired_goal'], FLAGS) # print("for debug, obs : {}".format(obs['observation'])) return actions, None, None, None # def get_actions(self, o, o1, ag, g, noise_eps=0., random_eps=0., use_target_net=False, ##o1이 target 네트워크 def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, # def get_actions(self, o, ag, g, FLAGS, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): # o, o1, g = self._preprocess_og(o, o1, ag, g) ## o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # rollout.py에서 넘어온다. 
# values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def init_demo_buffer(self, demoDataFile, update_stats=True): #function that initializes the demo buffer demoData = np.load(demoDataFile) #load the demonstration data from data file info_keys = [key.replace('info_', '') for key in self.input_dims.keys() if key.startswith('info_')] info_values = [np.empty((self.T - 1, 1, self.input_dims['info_' + key]), np.float32) for key in info_keys] demo_data_obs = demoData['obs'] demo_data_acs = demoData['acs'] demo_data_info = demoData['info'] for epsd in range(self.num_demo): # we initialize the whole demo buffer at the start of the training obs, acts, goals, achieved_goals = [], [] ,[] ,[] i = 0 for transition in range(self.T - 1): obs.append([demo_data_obs[epsd][transition].get('observation')]) acts.append([demo_data_acs[epsd][transition]]) goals.append([demo_data_obs[epsd][transition].get('desired_goal')]) achieved_goals.append([demo_data_obs[epsd][transition].get('achieved_goal')]) for idx, key in enumerate(info_keys): info_values[idx][transition, i] = demo_data_info[epsd][transition][key] obs.append([demo_data_obs[epsd][self.T - 1].get('observation')]) achieved_goals.append([demo_data_obs[epsd][self.T - 1].get('achieved_goal')]) episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals) for key, value in zip(info_keys, info_values): episode['info_{}'.format(key)] = value episode = convert_episode_to_batch_major(episode) global DEMO_BUFFER DEMO_BUFFER.store_episode(episode) # create the observation dict and append them into the demonstration buffer logger.debug("Demo buffer size currently ", DEMO_BUFFER.get_current_size()) #print out the demonstration buffer size if update_stats: # add transitions to normalizer to normalize the demo data as well episode['o_2'] = episode['o'][:, 1:, :] episode['ag_2'] = episode['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch(episode) transitions = self.sample_transitions(episode, num_normalizing_transitions) o, g, ag = transitions['o'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() episode.clear() logger.info("Demo buffer size: ", DEMO_BUFFER.get_current_size()) #print out the demonstration buffer size def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch(episode_batch) transitions = 
self.sample_transitions(episode_batch, num_normalizing_transitions) o, g, ag = transitions['o'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): if self.bc_loss: #use demonstration buffer to sample as well if bc_loss flag is set TRUE transitions = self.buffer.sample(self.batch_size - self.demo_batch_size) global DEMO_BUFFER transitions_demo = DEMO_BUFFER.sample(self.demo_batch_size) #sample from the demo buffer for k, values in transitions_demo.items(): rolloutV = transitions[k].tolist() for v in values: rolloutV.append(v.tolist()) transitions[k] = np.array(rolloutV) else: transitions = self.buffer.sample(self.batch_size) #otherwise only sample from primary buffer o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] # o1, o1_2, g = transitions['o1'], transitions['o1_2'] ## A.R ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] print("@ ddpg, sample_batch, transitions_batch={}".format(transitions_batch)) return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): if stage: self.stage_batch() critic_loss, actor_loss, Q_grad, pi_grad = self._grads() ## fetch the current losses and gradients self._update(Q_grad, pi_grad) ## apply the Adam updates return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target1_net_op) self.sess.run(self.init_target2_net_op) def update_target_net(self): # self.sess.run(self.update_target_net_op) self.sess.run(self.update_target1_net_op) self.sess.run(self.update_target2_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 # why does this assert trip here, and why not the next time? return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) # print("DEBUG, {}".format(res)) return res def _create_network(self, reuse=False): ## num_demo added logger.info("Debug : Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # self.num_demo = num_demo # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() ## simply dequeue the staged batch batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) #choose only the demo buffer samples mask = np.concatenate((np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis = 0) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() print("tf.variable_scope(main) = {}".format(tf.variable_scope('target1'))) #-1 with tf.variable_scope('target1') as vs: if reuse: vs.reuse_variables() target1_batch_tf = batch_tf.copy() target1_batch_tf['o'] = batch_tf['o_2'] target1_batch_tf['g'] = batch_tf['g_2'] self.target1 = self.create_actor_critic( target1_batch_tf, net_type='target1', **self.__dict__) vs.reuse_variables() print("tf.variable_scope(target1) = {}".format(tf.variable_scope('target1'))) # print("batch= {}".format(target1_batch_tf)) # print(type('target')) #<class 'baselines.her.actor_critic.ActorCritic'> assert len(self._vars("main")) == len(self._vars("target1")) with tf.variable_scope('target2') as vs: if reuse: vs.reuse_variables() target2_batch_tf = batch_tf.copy() target2_batch_tf['o'] = batch_tf['o_2'] target2_batch_tf['g'] = batch_tf['g_2'] self.target2 = self.create_actor_critic( target2_batch_tf, net_type='target2', **self.__dict__) vs.reuse_variables() print("tf.variable_scope(target2) = {}".format(tf.variable_scope('target2'))) print("batch= {}".format(target2_batch_tf)) assert len(self._vars("main")) == len(self._vars("target2")) for nd in range(self.num_demo): ##A.R NOTE: this loop re-creates the loss ops num_demo times ## Compute the target Q value; use the minimum of Q1 and Q2 target1_Q_pi_tf = self.target1.Q_pi_tf ##A.R policy training target2_Q_pi_tf = self.target2.Q_pi_tf ##A.R # target_Q_pi_tf = tf.minimum(target1_Q_pi_tf, target2_Q_pi_tf) # target1_Q_tf = self.target1.Q_tf ##A.R policy training # target2_Q_tf = self.target2.Q_tf ##A.R # print('target1={}/////target2={}'.format(target1_Q_tf,target2_Q_tf)) target_Q_pi_tf = tf.minimum(target1_Q_pi_tf, target2_Q_pi_tf) # target_Q_tf = tf.minimum(target1_Q_tf, target2_Q_tf) ## alternative code # print("{}///{}///{}".format(target1_Q_pi_tf,target2_Q_pi_tf,tf.minimum(target1_Q_pi_tf, target2_Q_pi_tf))) #### # code from the TD3 reference not present here: target_Q = reward + (done * discount * target_Q).detach() (L109) -> done at L428 together with the clipping # loss functions # for policy training, Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1]) # target_Q_pi_tf = self.target.Q_pi_tf #original code clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_Q_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) # target_Q_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_tf, *clip_range) ## alternative code # self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) ## # current_Q1, current_Q2 = self.critic(state, action) # for critic training, Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True) # target_Q_pi_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_tf, *clip_range) #original code # self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) #critic training ## Get current Q estimates, for critic Q current_Q1 = self.main.Q_tf ##A.R current_Q2 = self.main.Q_tf # NOTE: both estimates currently point to the same critic tensor, so the twin-critic minimum has no effect here # print("Q1={}".format(current_Q1)) ## Compute critic loss ## Torch => critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q) self.Q_loss_tf = tf.losses.mean_squared_error(current_Q1, target_Q_tf) + tf.losses.mean_squared_error(current_Q2, target_Q_tf) # print("critic_loss ={}".format(self.Q_loss_tf)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) assert len(self._vars('main/Q')) == len(Q_grads_tf) ## Optimize the critic with the Adam optimizer self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) assert len(self._vars('main/Q')) == len(Q_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) # ## Delayed policy updates if nd % self.td3_policy_freq == 0: # print("num_demo = {}".format(nd)) target1_Q_pi_tf = self.target1.Q_pi_tf ##A.R policy training target2_Q_pi_tf = self.target2.Q_pi_tf ##A.R tf.print(target1_Q_pi_tf, [target1_Q_pi_tf]) tf.print(target2_Q_pi_tf, [target2_Q_pi_tf]) # print(target2_Q_pi_tf) target_Q_pi_tf = tf.minimum(target1_Q_pi_tf, target2_Q_pi_tf) # target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) # Compute actor loss if self.bc_loss == 1 and self.q_filter == 1: # train with demonstrations and use bc_loss and q_filter both maskMain = tf.reshape(tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1]) #where is the demonstrator action better than actor action according to the critic?
choose those samples only #define the cloning loss on the actor's actions only on the samples which adhere to the above masks self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0) - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) #primary loss scaled by it's respective weight prm_loss_weight self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) #L2 loss on action values scaled by the same weight prm_loss_weight self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxilliary loss scaled by its weight aux_loss_weight elif self.bc_loss == 1 and self.q_filter == 0: # train with demonstrations without q_filter self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask((self.main.pi_tf), mask) - tf.boolean_mask((batch_tf['u']), mask))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf else: #If not training with demonstrations self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) # self.pi_loss_tf = -tf.reduce_mean(self.main.pi_tf) ## what about target1? # self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) # actor_loss = -tf.reduce_mean(self.main.Q_tf) # actor_loss += self.action_l2 * tf.reduce_mean(tf.square(self.main.Q_tf / self.max_u)) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/pi')) == len(pi_grads_tf) # Optimize the actor # Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # Update the frozen target models ## torch code # for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): # target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target1_vars = self._vars('target1/Q') + self._vars('target1/pi') ##A.R self.target2_vars = self._vars('target2/Q') + self._vars('target2/pi') ##A.R if target_Q_pi_tf == target1_Q_pi_tf: target_vars = self.target1_vars else: target_vars = self.target2_vars # self.target_vars = self._vars('target/Q') + self._vars('target/pi') #original self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') self.init_target1_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target1_vars, self.main_vars))) self.init_target2_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target2_vars, self.main_vars))) self.update_target_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(target_vars, self.main_vars))) self.update_target1_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(target_vars, self.main_vars))) self.update_target2_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), zip(target_vars, self.main_vars))) tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() # Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) # pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) # assert len(self._vars('main/Q')) == len(Q_grads_tf) # assert len(self._vars('main/pi')) == len(pi_grads_tf) # self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) # self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) # self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) # self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers # self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) # self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging # self.main_vars = self._vars('main/Q') + self._vars('main/pi') # self.target1_vars = self._vars('target1/Q') + self._vars('target1/pi') ##A.R # self.target2_vars = self._vars('target2/Q') + self._vars('target2/pi') ##A.R # # self.target_vars = self._vars('target/Q') + self._vars('target/pi') #original # self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') # self.init_target1_net_op = list( # map(lambda v: v[0].assign(v[1]), zip(self.target1_vars, self.main_vars))) # self.init_target2_net_op = list( # map(lambda v: v[0].assign(v[1]), zip(self.target2_vars, self.main_vars))) # self.update_target_net_op = list( # map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) #original # self.init_target_net_op = list( # map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) # self.update_target_net_op = list( # map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # # initialize all variables # tf.variables_initializer(self._global_vars('')).run() # self._sync_optimizers() # self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix is not '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = ['_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', # 'main', 'target', 'lock', 'env', 'sample_transitions', #original code 'main', 'target1', 'target2', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic'] state = {k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames])} state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. 
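# When loading for playback only, sample_transitions is stubbed out; __init__ then rebuilds the graph and the saved TF variable values are restored below.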
state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert(len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node) def save(self, save_path): tf_util.save_variables(save_path)
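# The class above forms its bootstrap target TD3-style: take the minimum of the two target critics, clip the
# return, and only refresh the actor/target networks every td3_policy_freq steps. Below is a minimal NumPy
# sketch of those two ingredients (illustrative only; function and variable names here are hypothetical, not
# part of the class's API).
import numpy as np

def td3_target(r, q1_target_next, q2_target_next, gamma=0.98, clip_return=50., clip_pos_returns=True):
    """Clipped double-Q target: r + gamma * min(Q1', Q2'), clipped as in the code above."""
    high = 0. if clip_pos_returns else np.inf
    return np.clip(r + gamma * np.minimum(q1_target_next, q2_target_next), -clip_return, high)

def polyak_update(target_params, main_params, polyak=0.95):
    """target <- polyak * target + (1 - polyak) * main, mirroring the update_target*_net_op lists."""
    return [polyak * t + (1. - polyak) * m for t, m in zip(target_params, main_params)]

if __name__ == '__main__':
    r = np.array([0., -1.])
    q1_next, q2_next = np.array([1.2, 0.5]), np.array([0.9, 0.7])
    # positive targets are clipped to 0 when clip_pos_returns=True
    print(td3_target(r, q1_next, q2_next))
    # a delayed actor update would additionally gate the pi/target updates, e.g.
    # if step % td3_policy_freq == 0: update actor and Polyak-average the targets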
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'GHER.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ # # print("\n\n\n\n1--", input_dims, "\n2--", buffer_size, "\n3--", hidden, # "\n4--", layers, "\n5--", network_class, "\n6--", polyak, "\n7--", batch_size, # "\n8--", Q_lr, "\n9--", pi_lr, "\n10--", norm_eps, "\n11--", norm_clip, # "\n12--", max_u, "\n13--", action_l2, "\n14--", clip_obs, "\n15--", scope, "\n16--", T, # "\n17--", rollout_batch_size, "\n18--", subtract_goals, "\n19--", relative_goals, # "\n20--", clip_pos_returns, "\n21--", clip_return, # "\n22--", sample_transitions, "\n23--", gamma) """ Example of parameter values in the FetchReach-v1 run: Input_dims (dict of ints): {'o': 10, 'u': 4, 'g': 3, 'info_is_success': 1} (o, u, g are both input to the network) Buffer_size (int): 1E6 (total number of experience pool samples) Hidden (int): 256 (number of hidden layer neurons) Layers (int): 3 (three-layer neural network) Network_class (str): GHER.ActorCritic' Polyak (float): 0.95 (smooth parameter updated by target-Network) Batch_size (int): 256 (bulk size) Q_lr (float): 0.001 (learning rate) Pi_lr (float): 0.001 (learning rate) Norm_eps (float): 0.01 (to avoid data overflow) Norm_clip (float): 5 (norm_clip) Max_u (float): 1.0 (the range of the action is [-1.0, 1.0]) Action_l2 (float): 1.0 (loss coefficient of the actor network) Clip_obs (float): 200 (obs is limited to (-200, +200)) Scope (str): "ddpg" (scope named field used by tensorflow) T (int): 50 (the number of cycles of interaction) Rollout_batch_size (int): 2 (number of parallel rollouts per DDPG 
agent) Subtract_goals (function): A function that preprocesses the goal, with inputs a and b, and outputs a - b Relative_goals (boolean): False (True if the goal should be preprocessed with subtract_goals) Clip_pos_returns (boolean): True (whether positive returns are clipped to zero) Clip_return (float): 50 (limit the range of return to [-clip_return, clip_return]) Sample_transitions (function): The function returned by her.py; its parameters are defined in config.py Gamma (float): 0.98 (the discount factor used in the Q network update) sample_transitions comes from the HER definition and is a key component """ if self.clip_return is None: self.clip_return = np.inf # The creation of the network structure and calculation graph is done by the actor_critic.py file self.create_actor_critic = import_function(self.network_class) # Extract dimensions input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] # 10 self.dimg = self.input_dims['g'] # 3 self.dimu = self.input_dims['u'] # 4 # print("+++", input_shapes) # {'o': (10,), 'u': (4,), 'g': (3,), 'info_is_success': (1,)} # https://www.tensorflow.org/performance/performance_models # StagingArea provides simpler functionality and can be executed in parallel with other phases in the CPU and GPU. # Split the input pipeline into 3 separate parallel operations, and this is scalable to take advantage of large multi-core environments # Define the required storage variables. Suppose self.dimo=10, self.dimg=5, self.dimu=5 # Then state_shapes={'o': (None, 10), 'g': (None, 5), 'u': (None, 5)} # Add the variables used by the target network at the same time: state_shapes={'o_2': (None, 10), 'g_2': (None, 5)} # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) # Reward is a scalar self.stage_shapes = stage_shapes # After executing, self.stage_shapes = # OrderedDict([('g', (None, 3)), ('o', (None, 10)), ('u', (None, 4)), ('o_2', (None, 10)), ('g_2', (None, 3)), ('r', (None,))]) # including g, o, u, the o_2 and g_2 used by the target network, and the reward r # Create network.
# Create tf variables based on state_shape, including g, o, u, o_2, g_2, r # self.buffer_ph_tf = [<tf.Tensor 'ddpg/Placeholder:0' shape=(?, 3) dtype=float32>, # <tf.Tensor 'ddpg/Placeholder_1:0' shape=(?, 10) dtype=float32>, # <tf.Tensor 'ddpg/Placeholder_2:0' shape=(?, 4) dtype=float32>, # <tf.Tensor 'ddpg/Placeholder_3:0' shape=(?, 10) dtype=float32>, # <tf.Tensor 'ddpg/Placeholder_4:0' shape=(?, 3) dtype=float32>, # <tf.Tensor 'ddpg/Placeholder_5:0' shape=(?,) dtype=float32>] with tf.variable_scope(self.scope): # Create a StagingArea variable self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) # Create a Tensorflow variable placeholder self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] # Connect the tensorflow placeholders to the StagingArea self.stage_op = self.staging_tf.put(self.buffer_ph_tf) # self._create_network(reuse=reuse) # Experience pool related operations # When T = 50, after execution, buffer_shapes= # {'o': (51, 10), 'u': (50, 4), 'g': (50, 3), 'info_is_success': (50, 1), 'ag': (51, 3)} # Note that u, g and info record the 50 steps of one cycle, while o (and ag) also include the final state and therefore have one extra entry buffer_shapes = { key: (self.T if key != 'o' else self.T + 1, *input_shapes[key]) for key, val in input_shapes.items() } # buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) # buffer_shapes['ag'] = (self.T + 1, self.dimg) # # print("+++", buffer_shapes) # buffer_size is measured in samples (transitions) # self.buffer_size=1E6 self.rollout_batch_size=2 buffer_size=1E6 buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) def _random_action(self, n): """ Randomly sample n actions from [-self.max_u, +self.max_u] """ return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): """ Preprocess obs, goal and achieved_goal. If self.relative_goals=True, then goal = goal - achieved_goal """ if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) # flatten to (-1, dimg) ag = ag.reshape(-1, self.dimg) # flatten to (-1, dimg) g = self.subtract_goals(g, ag) # g = g - ag g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): """ Select the action according to the self.main network, then add Gaussian noise, clip, apply the epsilon-greedy operation, and output the processed action """ # If self.relative_goals=True, the goal is made relative; otherwise only clipping is applied o, g = self._preprocess_og(o, ag, g) # After calling the function self._create_network of this class, the self.main network and the self.target network are created, both of which are ActorCritic objects.
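# Note: the feed_dict below supplies zeros for u_tf; only pi_tf / Q_pi_tf are fetched, and in the
# baselines-style ActorCritic this mirrors, those heads do not depend on u_tf, so the zero action is unused.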
policy = self.target if use_target_net else self.main # Select an action based on self.main # actor Network output action tensor vals = [policy.pi_tf] # print("+++") # print(vals.shape) # Enter the vals of the actor output into the critic network again, and get the output as Q_pi_tf if compute_Q: vals += [policy.Q_pi_tf] # The construction of feed_dict, including obs, goal and action, as input to Actor and Critic feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } # Execute the current policy network, output ret. ret[0] for action, ret[1] for Q value ret = self.sess.run(vals, feed_dict=feed) # action postprocessing # Add Gaussian noise to Action. np.random.randn refers to sampling from a Gaussian distribution, the noise obeys Gaussian distribution u = ret[0] noise = noise_eps * self.max_u * np.random.randn( *u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) # After adding noise clip # Perform epsilon-greedy operation, epsilon for random_eps # Np.random.binomial refers to the binomial distribution, the output is 0 or 1, and the probability of output is 1 is random_eps # If the binomial distribution outputs 0, then u+=0 is equivalent to no operation; if the output is 1, then u = u + (random_action - u) = random_action u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * ( self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u # if len(ret) == 1: return ret[0] else: return ret def store_episode(self, episode_batch, update_stats=True, verbose=False): """ Episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T Call the store_episode function in replay_buffer to store samples for one sample period O_stats and g_stats update and store the mean and standard deviation of obs and goal, respectively, and update them regularly """ # Episode_batch stores a sample of the cycle generated by generate_rollout in rollout.py # episode_batch is a dictionary, the keys include o, g, u, ag, info, and the values of the values are respectively # o (2, 51, 10), u (2, 50, 4), g (2, 50, 3), ag (2, 51, 3), info_is_success (2, 50, 1) # where the first dimension is the number of workers, and the second dimension is determined by the length of the cycle. 
self.buffer.store_episode(episode_batch, verbose=verbose) # Update the mean and standard deviation of o_stats and g_stats if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch[ 'ag'][:, 1:, :] # Extract next_obs and next_state num_normalizing_transitions = transitions_in_episode_batch( episode_batch) # Convert period to total number of samples # Call the sampling function in sample_transitions # Episode_batch is a dictionary with key and element shapes respectively o (2, 51, 10) u (2, 50, 4) g (2, 50, 3) ag (2, 51, 3) info_is_success (2, 50, 1) # o_2 (2, 50, 10) ag_2 (2, 50, 3) # Num_normalizing_transitions=100, there are 2 workers, each worker contains 50 samples of 1 cycle transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) # The sampled samples are preprocessed and used to update the calculations o_stats and g_stats, defined in the Normalizer, for storing mean and std o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions[ 'g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): """ Returns the number of samples in the current experience pool """ return self.buffer.get_current_size() def _sync_optimizers(self): """ Q_adam and pi_adam are operators for updating actor networks and critic networks. """ self.Q_adam.sync() self.pi_adam.sync() def _grads(self): """ Return loss function and gradient Q_loss_tf, main.Q_pi_tf, Q_grad_tf, pi_grad_tf are all defined in the _create_network function """ # Avoid feed_dict here for performance! critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf, ]) return critic_loss, actor_loss, Q_grad, pi_grad def _update(self, Q_grad, pi_grad): """ Update main Actor and Critic network The updated op is defined in _create_network """ self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): """ Sampling is called by calling the sample function in replay_buffer.py , which is derived from the definition in her.py The returned sample consists of batch, which is used to build the feed_dict in the self.stage_batch function. Feed_dict will be used as input to select actions and update network parameters Calls to sample a batch, then preprocesses o and g. 
The key of the sample includes o, o_2, ag, ag_2, g """ # Call sample and return transition to dictionary, key and val.shape # o (256, 10) u (256, 4) g (256, 3) info_is_success (256, 1) ag (256, 3) o_2 (256, 10) ag_2 (256, 3) r (256,) # print("In DDPG: ", self.batch_size) transitions = self.buffer.sample(self.batch_size) o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o_2, ag_2, g) transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) # tensorboard visualization self.tfboard_sample_batch = batch self.tfboard_sample_tf = self.buffer_ph_tf def train(self, stage=True): """ Calculate the gradient and then update Self.stage_batch was executed before the parameter update was executed in the train to build the feed_dict used for training. This function is called. The self.sample_batch function, which in turn calls self.buffer.sample, which calls config_her in config.py, which configures the parameters of her.py functions. The operators in train are defined in self._create_network . """ if stage: self.stage_batch( ) # Returns a feed_dict constructed using the sampling method of her.py to calculate the gradient critic_loss, actor_loss, Q_grad, pi_grad = self._grads() self._update(Q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): """ Update the target network, update_target_net_op is defined in the function _create_network """ self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): """ Define the calculation flow graph required to calculate Actor and Critic losses """ logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages # Define Normalizer objects for the rules obs and goal respectively with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. # Used to store the data structure of a batch sample, which is OrderedDict. 
After execution, batch_tf is as follows: # OrderedDict([('g', <tf.Tensor 'ddpg/ddpg/StagingArea_get:0' shape=(?, 3) dtype=float32>), # ('o', <tf.Tensor 'ddpg/ddpg/StagingArea_get:1' shape=(?, 10) dtype=float32>), # ('u', <tf.Tensor 'ddpg/ddpg/StagingArea_get:2' shape=(?, 4) dtype=float32>), # ('o_2', <tf.Tensor 'ddpg/ddpg/StagingArea_get:3' shape=(?, 10) dtype=float32>), # ('g_2', <tf.Tensor 'ddpg/ddpg/StagingArea_get:4' shape=(?, 3) dtype=float32>), # ('r', <tf.Tensor 'ddpg/Reshape:0' shape=(?, 1) dtype=float32>)]) # Defined batch_tf variable will be used as input to the neural network batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # Create main network according to ActorCritic.py # When creating an ActorCritic network, you don't need to explicitly pass arguments. Use self.__dict__ to assign the corresponding parameters of the DDPG class directly to the corresponding parameters of ActorCritic. # print(self.main.__dict__) # {'inputs_tf': OrderedDict([('g', <tf.Tensor 'ddpg/ddpg/StagingArea_get:0' shape=(?, 3) dtype=float32>), ('o', <tf.Tensor ' Ddpg/ddpg/StagingArea_get:1' shape=(?, 10) dtype=float32>), ('u', <tf.Tensor 'ddpg/ddpg/StagingArea_get:2' shape=(?, 4) dtype=float32> ), ('o_2', <tf.Tensor 'ddpg/ddpg/StagingArea_get:3' shape=(?, 10) dtype=float32>), ('g_2', <tf.Tensor 'ddpg/ddpg/StagingArea_get:4 ' shape=(?, 3) dtype=float32>), ('r', <tf.Tensor 'ddpg/Reshape:0' shape=(?, 1) dtype=float32>)]), # 'net_type': 'main', 'reuse': False, 'buffer_size': 1000000, 'hidden': 256, 'layers': 3, 'network_class': 'GHER.actor_critic:ActorCritic', # 'polyak': 0.95, 'batch_size': 256, 'Q_lr': 0.001, 'pi_lr': 0.001, 'norm_eps': 0.01, 'norm_clip': 5, 'max_u': 1.0, # 'action_l2': 1.0, 'clip_obs': 200.0, 'scope': 'ddpg', 'relative_goals': False, 'input_dims': {'o': 10, 'u': 4, 'g': 3, 'info_is_success': 1}, # 'T': 50, 'clip_pos_returns': True, 'clip_return': 49.996, 'rollout_batch_size': 2, 'subtract_goals': <function simple_goal_subtract at 0x7fcf72caa510>, 'sample_transitions': <function make_sample_her_transitions.<locals>._sample_her_transitions at 0x7fcf6e2ce048>, # 'gamma': 0.98, 'info': {'env_name': 'FetchReach-v1'}, 'use_mpi': True, 'create_actor_critic': <class 'GHER.actor_critic.ActorCritic'>, # 'dimo': 10, 'dimg': 3, 'dimu': 4, 'stage_shapes': OrderedDict([('g', (None, 3)), ('o', (None, 10)), ('u', (None, 4)), ('o_2', (None, 10)), ('g_2', (None, 3)), ('r', (None,))]), ' Staging_tf': <tensorflow.python.ops.data_flow_ops.StagingArea object at 0x7fcf6e2dddd8>, # 'buffer_ph_tf': [<tf.Tensor 'ddpg/Placeholder:0' shape=(?, 3) dtype=float32>, <tf.Tensor 'ddpg/Placeholder_1:0' shape=(?, 10) dtype=float32 >, <tf.Tensor 'ddpg/Placeholder_2:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'ddpg/Placeholder_3:0' shape=(?, 10) dtype=float32>, <tf .Tensor 'ddpg/Placeholder_4:0' shape=(?, 3) dtype=float32>, <tf.Tensor 'ddpg/Placeholder_5:0' shape=(?,) dtype=float32>], # 'stage_op': <tf.Operation 'ddpg/ddpg/StagingArea_put' type=Stage>, 'sess': <tensorflow.python.client.session.InteractiveSession object at 0x7fcf6e2dde10>, 'o_stats': <GHER.normalizer.Normalizer Object at 0x7fcf6e2ee940>, 'g_stats': <GHER.normalizer.Normalizer object at 0x7fcf6e2ee898>, # 'o_tf': <tf.Tensor 'ddpg/ddpg/StagingArea_get:1' shape=(?, 10) dtype=float32>, 'g_tf': <tf.Tensor 'ddpg/ddpg/StagingArea_get:0' shape=( ?, 3) dtype=float32>, 'u_tf': <tf.Tensor 'ddpg/ddpg/StagingArea_get:2' shape=(?, 4) dtype=float32>, 
'pi_tf': <tf.Tensor 'ddpg/main /pi/mul:0' shape=(?, 4) dtype=float32>, 'Q_pi_tf': <tf.Tensor 'ddpg/main/Q/_3/BiasAdd:0' shape=(?, 1) dtype=float32 >, '_input_Q': <tf.Tensor 'ddpg/main/Q/concat_1:0' shape=(?, 17) dtype=float32>, 'Q_tf': <tf.Tensor 'ddpg/main/Q/_3_1/ BiasAdd: 0' shape=(?, 1) dtype=float32>} with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() # O_2, g_2 is used to create target network with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf[ 'g_2'] # Since the target network is used to calculate the target-Q value, o and g need to use the value of the next state. self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions # To calculate Critic's target-Q value, you need to use the Actor's target network and Critic's target network. # target_Q_pi_tf uses the next state o_2 and g_2 target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) # The loss function of Critic is the square of the difference between target_tf and Q_tf. Note that the gradient is not passed through target_tf. self.Q_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) # The loss function of the Actor is the opposite of the Q value obtained by the actor's output in the main network. self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) # Add regulars to Actors self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) # Calculating the gradient Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip( Q_grads_tf, self._vars('main/Q')) # Gradient and variable name correspond self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars( 'main/pi' ) # Put together the parameters of the Actor and Critic network self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( # Target Initialization operation, the main network parameter is directly assigned to the target map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( # In the target update operation, the main network and the target network need to be weighted according to the parameter polyak map( lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # # Tensorboard visualization # tf.summary.scalar("Q_target-Q-mean", tf.reduce_mean(target_tf)) # tf.summary.histogram("Q_target-Q", target_tf) # tf.summary.scalar("Q_Td-error-mean", tf.reduce_mean(target_tf - self.main.Q_tf)) # tf.summary.histogram("Q_Td-error", target_tf - self.main.Q_tf) # tf.summary.scalar("Q_reward-mean", tf.reduce_mean(batch_tf['r'])) # tf.summary.histogram("Q_reward", batch_tf['r']) # tf.summary.scalar("Q_loss_tf", self.Q_loss_tf) # tf.summary.scalar("pi_loss_tf", self.pi_loss_tf) # self.merged = tf.summary.merge_all() # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix is not '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def tfboard_func(self, summary_writer, step): """ Tensorboard visualization """ self.sess.run(self.stage_op, feed_dict=dict( zip(self.tfboard_sample_tf, self.tfboard_sample_batch))) summary = self.sess.run(self.merged) summary_writer.add_summary(summary, global_step=step) print("S" + str(step), end=",") def __getstate__(self): """ Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = [ '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic' ] state = { k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames]) } state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert (len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node) # ----------------------------------------- def updata_loss_all(self, verbose=False): assert self.buffer.current_size > 0 idxes = np.arange(self.buffer.current_size) print("--------------------------------------") print("Updata All loss start...") self.buffer.update_rnnLoss(idxes, verbose=verbose) print("Updata All loss end ...")
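# The _create_network above defines the standard DDPG/HER losses: the critic regresses onto the clipped
# bootstrap target r + gamma * Q'(o_2, g_2, pi'(o_2, g_2)), and the actor maximises Q(o, g, pi(o, g)) with an
# L2 penalty on the scaled actions. A small NumPy restatement of those formulas (a sketch, not this repo's API):
import numpy as np

def ddpg_losses(r, q_main, q_pi_main, q_pi_target_next, pi_actions,
                gamma=0.98, clip_return=50., clip_pos_returns=True,
                action_l2=1., max_u=1.):
    """NumPy analogue of Q_loss_tf and pi_loss_tf in _create_network (values only, no gradients)."""
    high = 0. if clip_pos_returns else np.inf
    target = np.clip(r + gamma * q_pi_target_next, -clip_return, high)  # target_tf (stop_gradient in TF)
    q_loss = np.mean(np.square(target - q_main))                        # Q_loss_tf
    pi_loss = -np.mean(q_pi_main)                                       # pi_loss_tf
    pi_loss += action_l2 * np.mean(np.square(pi_actions / max_u))       # L2 action regulariser
    return q_loss, pi_loss

# e.g. ddpg_losses(r=np.zeros(4), q_main=np.zeros(4), q_pi_main=np.ones(4),
#                  q_pi_target_next=np.ones(4), pi_actions=np.zeros((4, 4)))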
class Algorithm(object): @store_args def __init__(self, buffer, input_dims, hidden, layers, polyak, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, subtract_goals, relative_goals, clip_pos_returns, clip_return, gamma, vloss_type='normal', priority=False, reuse=False, **kwargs): """ buffer (object): buffer to save transitions input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) hidden (int): number of units in the hidden layers layers (int): number of hidden layers polyak (float): coefficient for Polyak-averaging of the target network Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] gamma (float): gamma used for Q learning updates vloss_type (str): value loss type, 'normal', 'tf_gamma', 'target' priority(boolean): use priority or not reuse (boolean): whether or not the networks should be reused """ if self.clip_return is None: self.clip_return = np.inf self.dimo, self.dimg, self.dimu = self.input_dims[ 'o'], self.input_dims['g'], self.input_dims['u'] self.stage_shapes = self.get_stage_shapes() self.init_target_net_op = None self.update_target_net_op = None # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) logger.log('value loss type: {}'.format(self.vloss_type)) def get_stage_shapes(self): # Prepare staging area for feeding data to the model. 
save data for HER input_shapes = dims_to_shapes(self.input_dims) stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) if self.vloss_type == 'tf_gamma': stage_shapes['gamma'] = (None, ) if self.priority: stage_shapes['w'] = (None, ) return stage_shapes def _create_normalizer(self, reuse): with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('u_stats') as vs: if reuse: vs.reuse_variables() self.u_stats = Normalizer(self.dimu, self.norm_eps, self.norm_clip, sess=self.sess) def _get_batch_tf(self): batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) if self.priority: batch_tf['w'] = tf.reshape(batch_tf['w'], [-1, 1]) return batch_tf def _create_target_main(self, AC_class, reuse, batch_tf): with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = AC_class(batch_tf, self.dimo, self.dimg, self.dimu, self.max_u, self.o_stats, self.g_stats, self.hidden, self.layers, self.sess) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = AC_class(target_batch_tf, self.dimo, self.dimg, self.dimu, self.max_u, self.o_stats, self.g_stats, self.hidden, self.layers, self.sess) vs.reuse_variables() assert len(get_var(self.scope + "/main")) == len( get_var(self.scope + '/target')) def _clip_target(self, batch_tf, clip_range, target_V_tf): if self.vloss_type == 'tf_gamma': target_tf = tf.clip_by_value( batch_tf['r'] + batch_tf['gamma'] * target_V_tf, *clip_range) elif self.vloss_type == 'target': target_tf = tf.clip_by_value(batch_tf['r'], *clip_range) else: target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_V_tf, *clip_range) return target_tf def _create_network(self, reuse=False): raise NotImplementedError def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, g, ag=None): if self.relative_goals and ag: g_shape = g.shape g, ag = g.reshape(-1, self.dimg), ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def step(self, obs): # act without noise actions = self.get_actions(obs['observation'], obs['achieved_goal'], obs['desired_goal']) return actions, None, None, None def simple_get_action(self, o, g, use_target_net=False): o, g = self._preprocess_og(o=o, g=g) policy = self.target if use_target_net else self.main # in n-step self.target performs better action = self.sess.run(policy.pi_tf, feed_dict={ policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg) }) return action def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o=o, g=g, ag=ag) u = self.simple_get_action(o, g, use_target_net) if compute_Q: Q_pi = self.get_Q_fun(o, g) noise = noise_eps * 
self.max_u * np.random.randn( *u.shape) # gaussian noise u = np.clip(u + noise, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * ( self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] if compute_Q: return [u, Q_pi] else: return u def get_Q_fun(self, o, g, u=None, Q_pi=True): o, g = self._preprocess_og(o, g) policy = self.target if Q_pi or (u is None): return policy.get_Q_pi(o, g) else: return policy.get_Q(o, g, u) def store_episode(self, episode_batch, update_stats=True): """episode_batch: array of batch_size x (T or T+1) x dim_key, 'o' is of size T+1, others are of size T""" self.buffer.store_episode(episode_batch) if update_stats: # episode doesn't has key o_2 os, gs, ags = episode_batch['o'].copy(), episode_batch['g'].copy( ), episode_batch['ag'].copy() os, gs = self._preprocess_og(o=os, g=gs, ag=ags) # update normalizer online self.o_stats.update_all(os) self.g_stats.update_all(gs) def _sync_optimizers(self): raise NotImplementedError def _grads(self): # Avoid feed_dict here for performance! raise NotImplementedError def _update(self, Q_grad, pi_grad): raise NotImplementedError def stage_batch(self, batch=None): if batch is None: if self.priority: transitions, idxes = self.buffer.sample() else: transitions = self.buffer.sample() o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o=o, g=g, ag=ag) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o=o_2, g=g, ag=ag_2) batch = [transitions[key] for key in self.stage_shapes.keys()] assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) if self.priority: return idxes def train(self, stage=True): if stage: idxes = self.stage_batch() critic_loss, actor_loss, Value_grad, pi_grad, abs_td_error = self._grads( ) self._update(Value_grad, pi_grad) if self.priority: self.buffer.update_priorities(idxes, abs_td_error) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def logs_stats(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] logs += [('stats_u/mean', np.mean(self.sess.run([self.u_stats.mean])))] logs += [('stats_u/std', np.mean(self.sess.run([self.u_stats.std])))] if prefix != '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def save(self, save_path): tf_util.save_variables(save_path)
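# Algorithm._clip_target above supports three ways of forming the value target, selected by vloss_type. A
# NumPy sketch of the three branches (illustrative only; reading the staged per-sample gamma as an n-step
# discount is an assumption, the code only shows that 'gamma' is staged per transition):
import numpy as np

def clip_target(r, target_v_next, gamma=0.98, clip_return=50., clip_pos_returns=True,
                vloss_type='normal', per_sample_gamma=None):
    """Mirror of Algorithm._clip_target: 'normal', 'tf_gamma' (staged gamma), or 'target' (reward is the target)."""
    high = 0. if clip_pos_returns else np.inf
    if vloss_type == 'tf_gamma':
        target = r + per_sample_gamma * target_v_next  # batch_tf['gamma'] in the TF version
    elif vloss_type == 'target':
        target = r                                     # the staged reward already is the full target
    else:
        target = r + gamma * target_v_next             # standard one-step bootstrap
    return np.clip(target, -clip_return, high)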
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, reuse=False, env=None, to_goal=None, nearby_action_penalty=False, nearby_penalty_weight=0, sample_expert=False, expert_batch_size=0., bc_loss=0., anneal_bc=0., terminate_bootstrapping=False, mask_q = False, two_qs=False, anneal_discriminator=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None,) if two_qs: stage_shapes['r2'] = (None,) stage_shapes['w_q2'] = (None, ) stage_shapes['successes'] = (None,) if nearby_action_penalty: stage_shapes['far_from_goal'] = (None, ) if sample_expert: stage_shapes['is_demo'] = (None, ) stage_shapes['annealing_factor'] = (None, ) self.stage_shapes = stage_shapes # Create network. 
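# Depending on the constructor flags above, stage_shapes also carries 'r2'/'w_q2'/'successes' (two_qs),
# 'far_from_goal' (nearby_action_penalty) and 'is_demo'/'annealing_factor' (sample_expert), so those extra
# signals are staged alongside the standard HER keys.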
# print(self.stage_shapes.keys()) with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key]) for key, val in input_shapes.items()} buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T+1, self.dimg) buffer_shapes['successes'] = (self.T,) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) self.expert_buffer = None self.all_variables = self._global_vars('') if to_goal is None: print("to goal is none!") self.to_goal = (0, 2) else: self.to_goal = to_goal self.to_goal_func = (lambda x: x[self.to_goal[0] : self.to_goal[1]]) if len(self.to_goal) == 2 else (lambda x: x[np.array(self.to_goal)]) self.nearby_action_penalty = nearby_action_penalty self.nearby_penalty_weight = nearby_penalty_weight def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32), } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def get_action(self, o, noise=0.1): return self.get_actions([o], self.to_goal_func(o), self.env.current_goal, noise_eps=noise), None def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch(episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def 
get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.Q_adam.sync() if self.two_qs: self.Q2_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! if not self.two_qs: critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad else: critic_loss, critic_loss2, actor_loss, Q_grad, Q2_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.Q2_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.Q2_grad_tf, self.pi_grad_tf ]) return critic_loss, critic_loss2, actor_loss, Q_grad, Q2_grad, pi_grad def _update(self, Q_grad, pi_grad, Q2_grad=None): self.Q_adam.update(Q_grad, self.Q_lr) if self.two_qs: self.Q2_adam.update(Q2_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch_helper(self, buffer, batch_size, expert=False, annealing_factor=1., w_q2=1.): transitions = buffer.sample(batch_size) o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) transitions['is_demo'] = int(expert) * np.ones_like(transitions['r']).astype(np.float32) transitions['annealing_factor'] = annealing_factor * np.ones_like(transitions['r']).astype(np.float32) if self.two_qs: transitions['w_q2'] = w_q2 * np.ones_like(transitions['r']).astype(np.float32) if self.anneal_discriminator: transitions['r'] = transitions['r'] + w_q2 * transitions['r2'] transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] return transitions_batch def sample_batch(self, annealing_factor=1., w_q2=1.): transitions_batch = None if self.batch_size > 0: transitions_batch = self.sample_batch_helper(self.buffer, self.batch_size, w_q2=w_q2) if self.sample_expert and self.expert_buffer is not None: expert_batch = self.sample_batch_helper(self.expert_buffer, self.expert_batch_size, expert=True, annealing_factor=annealing_factor, w_q2=w_q2) transitions_batch = expert_batch if transitions_batch is None else\ [np.concatenate([normal, expert]) for (normal, expert) in zip(transitions_batch, expert_batch)] return transitions_batch def stage_batch(self, batch=None, annealing_factor=1., w_q2=1.): if batch is None: batch = self.sample_batch(annealing_factor=annealing_factor, w_q2=w_q2) # return goals that are trained on assert len(self.buffer_ph_tf) == len(batch) # if not (batch[5] <= batch[6]).all(): # import pdb; # pdb.set_trace() #print(batch[5]) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) return batch def train(self, stage=True, annealing_factor=1., q_annealing=1.): if stage: batch = self.stage_batch(annealing_factor=annealing_factor, w_q2=q_annealing) if not self.two_qs: critic_loss, actor_loss, Q_grad, pi_grad = self._grads() self._update(Q_grad, pi_grad) else: critic_loss, critic_loss2, actor_loss, Q_grad, Q2_grad, pi_grad = self._grads() self._update(Q_grad, pi_grad, Q2_grad=Q2_grad) return critic_loss, actor_loss, batch def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = 
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): # logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats', reuse=reuse) as vs: # if reuse: # vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats', reuse=reuse) as vs: # if reuse: # vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) batch_tf['successes'] = tf.reshape(batch_tf['successes'], [-1, 1]) # networks with tf.variable_scope('main', reuse=reuse) as vs: # if reuse: # vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target', reuse=reuse) as vs: # if reuse: # vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf if self.two_qs: target_Q2_pi_tf = self.target.Q2_pi_tf # clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) clip_range = (-np.inf, self.clip_return) # print(clip_range) if self.terminate_bootstrapping: target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * (1 - batch_tf['successes']) * target_Q_pi_tf, *clip_range) if self.two_qs: target2_tf = tf.clip_by_value(batch_tf['r2'] + self.gamma * (1 - batch_tf['successes']) * target_Q2_pi_tf, *clip_range) else: target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) if self.two_qs: target2_tf = tf.clip_by_value(batch_tf['r2'] + self.gamma * target_Q2_pi_tf, *clip_range) if self.nearby_action_penalty: target_tf -= tf.reshape(batch_tf['far_from_goal'] * self.nearby_penalty_weight * tf.norm(self.main.pi_tf - batch_tf['u'], axis=-1), (-1, 1)) self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) if self.two_qs: self.Q2_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target2_tf) - self.main.Q2_tf)) if self.mask_q: self.pi_loss_tf = 0 else: if self.two_qs: self.pi_loss_tf = -tf.reduce_mean((1 - batch_tf['w_q2'])[:, None] * self.main.Q_pi_tf + batch_tf['w_q2'][:, None] * self.main.Q2_pi_tf) else: self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf/ self.max_u)) if self.sample_expert: self.pi_loss_tf += (1 - self.anneal_bc * tf.to_float(tf.greater_equal(self.target.Q_pi_tf, self.target.Q_tf))) * \ self.bc_loss * tf.reduce_mean(batch_tf['is_demo'] * batch_tf['annealing_factor'] * tf.reduce_sum(tf.square(self.main.pi_tf - batch_tf['u']), axis=-1 )) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) if self.two_qs: Q2_grads_tf = tf.gradients(self.Q2_loss_tf, self._vars('main/2Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert 
len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) if self.two_qs: self.Q2_grads_vars_tf = zip(Q2_grads_tf, self._vars('main/2Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) if self.two_qs: self.Q2_grad_tf = flatten_grads(grads=Q2_grads_tf, var_list=self._vars('main/2Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) if self.two_qs: self.Q2_adam = MpiAdam(self._vars('main/2Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') + (self._vars('main/2Q') if self.two_qs else []) self.target_vars = self._vars('target/Q') + self._vars('target/pi') + (self._vars('target/2Q') if self.two_qs else []) self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] # import pdb; pdb.set_trace() self.stage_batch() # logs += [('action_diff', np.mean(self.sess.run([tf.norm(self.main.u_tf - self.main.pi_tf, axis=-1)])))] if prefix != '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs # to be compatible with rollout collection in rllab def reset(self): pass def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ # import pdb; pdb.set_trace() excluded_subnames = ['_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic', 'all_variables', 'to_goal_func'] state = {k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames])} state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run([x for x in self.all_variables if 'buffer' not in x.name]) # print("global variables", tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)) # print("in get state") return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] # print(vars) # print(state['tf']) assert(len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node)
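# --- Illustrative sketch (not part of the original code) ---------------------
# The init_target_net_op / update_target_net_op built above implement a hard
# copy followed by Polyak averaging of the target network towards the main
# network. A NumPy equivalent of the per-variable update, with hypothetical
# names:
import numpy as np

def polyak_update(target_params, main_params, polyak=0.95):
    """target <- polyak * target + (1 - polyak) * main, applied per variable."""
    return [polyak * t + (1.0 - polyak) * m for t, m in zip(target_params, main_params)]

# usage: after many updates the target parameters track the main parameters
target, main = [np.zeros(3)], [np.ones(3)]
for _ in range(100):
    target = polyak_update(target, main)
print(target[0])  # close to 1.0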
class PGGD(object): DIMO = 0 @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, reuse=False, **kwargs): """Implementation of PGGD that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per PGGD agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ # ------------------ # To access information of environment name and stuff self.kwargs = kwargs # ------------------ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] # ---------------------- input_shapes['o'] = (None, ) # ---------------------- # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) # ---------------------- stage_shapes['G'] = (None, ) # ---------------------- self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. 
buffer_shapes = { key: (self.T, *input_shapes[key]) if key != 'o' else (self.T + 1, PGGD.DIMO) for key, val in input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T + 1, self.dimg) # ------------------- buffer_shapes['G'] = (self.T, ) buffer_shapes['sigma'] = (self.T, self.dimu) self.weight_path = None # ------------------- buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g # ------------------------------- # If observation has more dimensions than what the policy takes in # then just truncate it. def get_actions(self, o, ag, g, exploit=False): # if len(o.shape) == 1: # o = o[:self.dimo] # g = g[:self.dimg] # ag = ag[:self.dimg] # else: # o = o[:,:self.dimo] # g = g[:,:self.dimg] # ag = ag[:,:self.dimg] o, g = self._preprocess_og(o, ag, g) policy = self.main # values to compute if exploit: vals = [policy.da_tf] else: vals = [policy.a_tf] vals += [policy.raw_tf, policy.sigma_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u, raw, sigma = ret if u.shape[0] == 1: u = u[0] raw = raw[0] sigma = sigma[0] u = u.copy() raw = raw.copy() sigma = sigma.copy() return u, raw, sigma # ------------------------------- def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions[ 'g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats if 'Variation' in self.kwargs['info']['env_name']: o = transitions['o'][:, 1:] # o = np.concatenate([transitions['o'][:,:ENV_FEATURES], # transitions['o'][:,ENV_FEATURES+1:]], axis=1) else: o = transitions['o'] self.o_stats.update(o) self.G_stats.update(transitions['G']) self.sigma_stats.update(transitions['sigma']) # self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() # self.g_stats.recompute_stats() self.G_stats.recompute_stats() self.sigma_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! 
pi_loss, pi_grad, mu = self.sess.run( [self.pi_loss_tf, self.pi_grad_tf, self.main.mu_tf]) # print(np.mean(mu), np.mean(pi_grad), np.mean(pi_loss)) return pi_loss, pi_grad def _update(self, pi_grad): self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): transitions = self.buffer.sample(self.batch_size) o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o_2, ag_2, g) transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] # print(transitions['G']) return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): if stage: self.stage_batch() pi_loss, pi_grad = self._grads() self._update(pi_grad) # print(np.mean(pi_grad)) return pi_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a PGGD agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() o_stats_dim = self.dimo if 'Variation' in self.kwargs['info']['env_name']: print("Found Variation in env name") o_stats_dim -= 1 self.o_stats = Normalizer(o_stats_dim, self.norm_eps, self.norm_clip, sess=self.sess) # -------------- with tf.variable_scope('G_stats') as vs: if reuse: vs.reuse_variables() self.G_stats = Normalizer(1, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('sigma_stats') as vs: if reuse: vs.reuse_variables() self.sigma_stats = Normalizer(self.dimu, self.norm_eps, self.norm_clip, sess=self.sess) # -------------- with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. 
batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # ------------ batch_tf['G'] = tf.reshape(batch_tf['G'], [ -1, ]) # ------------ # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # --------------------------- # loss functions log_prob = tf.reduce_sum(tf.log( tf.clip_by_value(self.main.a_prob_tf, 1e-10, 1.0)), axis=1) neg_weighted_log_prob = -tf.multiply(batch_tf['G'], log_prob) self.pi_loss_tf = tf.reduce_mean(neg_weighted_log_prob) # https://github.com/tensorflow/tensorflow/issues/783 def replace_none_with_zero(grads, var_list): return [ grad if grad is not None else tf.zeros_like(var) for var, grad in zip(var_list, grads) ] pi_grads_tf = replace_none_with_zero( tf.gradients(self.pi_loss_tf, self._vars('main/pi')), self._vars('main/pi')) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # --------------------------- # optimizers self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging # self.main_vars = self._vars('main/Q') + self._vars('main/pi') # self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') + self._global_vars('G_stats') + self._global_vars( 'sigma_stats') # self.init_target_net_op = list( # map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) # self.update_target_net_op = list( # map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() # self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] logs += [('stats_G/mean', np.mean(self.sess.run([self.G_stats.mean])))] logs += [('stats_G/std', np.mean(self.sess.run([self.G_stats.std])))] logs += [('stats_stddev/mean', np.mean(self.sess.run([self.sigma_stats.mean])))] if prefix != '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training.
""" excluded_subnames = [ '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic' ] state = { k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames]) } state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def set_sample_transitions(self, fn): self.sample_transitions = fn self.buffer.sample_transitions = fn def set_obs_size(self, dims): self.input_dims = dims self.dimo = dims['o'] self.dimg = dims['g'] self.dimu = dims['u'] def save_weights(self, path): self.main.save_weights(self.sess, path) self.weight_path = path def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v self.weight_path = state['weight_path'] # Hard override... # This is due to the fact that the directory that the weights are saved to # might not be the same when it is loaded again # TODO: Delete this!!!! self.weight_path = "/Users/matt/RL/Results/5-3blocks-GPGGD-3-256/weights" # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert (len(vars) == len(state["tf"])) node = [ tf.no_op() if 'o_stats' in var.name else tf.assign(var, val) for var, val in zip(vars, state["tf"]) ] self.sess.run(node) if self.weight_path != None: print("Reading weights for sure this time!") print(self.weight_path) print(tf.train.latest_checkpoint(self.weight_path)) self.main.load_weights(self.sess, self.weight_path)
class DDPG(ParallelModule): def __init__(self, env_spec, task_spec, buffer_size, network_params, normalizer_params, polyak, batch_size, Q_lr, pi_lr, max_u, action_l2, clip_obs, scope, random_eps, noise_eps, train_steps, relative_goals, clip_pos_returns, clip_return, replay_strategy, replay_k, noise_type, share_experience, noise_adaptation, reuse=False): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Added functionality to use demonstrations for training to overcome the exploration problem. Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function): function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss q_filter: whether or not a filter on the q value update should be used when training with demonstrations num_demo: number of episodes to be used in the demonstration buffer demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread prm_loss_weight: Weight corresponding to the primary loss aux_loss_weight: Weight corresponding to the auxiliary loss, also called the cloning loss """ super().__init__(scope) self.replay_k = replay_k self.replay_strategy = replay_strategy self.clip_pos_returns = clip_pos_returns self.relative_goals = relative_goals self.train_steps = train_steps self.noise_eps = noise_eps self.random_eps = random_eps self.clip_obs = clip_obs self.action_l2 = action_l2 self.max_u = max_u self.pi_lr = pi_lr self.Q_lr = Q_lr self.batch_size = batch_size self.normalizer_params = normalizer_params self.polyak = polyak self.buffer_size = buffer_size self._env_spec = env_spec self._T = self._env_spec['T'] self._task_spec = task_spec self.network_params = network_params self._share_experience = share_experience self._noise_adaptation = noise_adaptation self._task_spec = deepcopy(task_spec) self._task_spec['buffer_size'] = 0 self._task = Task(**self._task_spec) self._gamma = 1. - 1. / self._T self.clip_return = (1. / (1.
- self._gamma)) if clip_return else np.inf if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(network_params['net_type']) self.input_dims = dict( o=self._env_spec['o_dim'], a=self._env_spec['a_dim'], g=self._task_spec['g_dim'], ) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self._env_spec['o_dim'] self.dimg = self._task_spec['g_dim'] self.dima = self._env_spec['a_dim'] # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_next'] = stage_shapes[key] stage_shapes['r'] = (None, ) self.stage_shapes = stage_shapes self._action_noise, self._parameter_noise = get_noise_from_string( self._env_spec, noise_type) # Create network. with tf.variable_scope(self._scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) buffer_shapes = dict() buffer_shapes['o'] = (self.dimo, ) buffer_shapes['o_next'] = buffer_shapes['o'] buffer_shapes['g'] = (self.dimg, ) buffer_shapes['ag'] = (self.dimg, ) buffer_shapes['ag_next'] = (self.dimg, ) buffer_shapes['a'] = (self.dima, ) self.sample_transitions = make_sample_her_transitions( self.replay_strategy, self.replay_k, self._task.reward_done_success) self._buffer = ReplayBuffer(buffer_shapes, self.buffer_size, self._T, self.sample_transitions) def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dima)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = simple_goal_subtract(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def step(self, obs): actions = self.get_actions(obs['observation'], obs['achieved_goal'], obs['desired_goal']) return actions, None, None, None def pre_rollout(self): if self._parameter_noise is not None: self.adapt_param_noise() self.sess.run(self.perturbe_op, feed_dict={ self.param_noise_stddev: self._parameter_noise.current_stddev }) def get_actions(self, o, ag, g, noise_eps=None, random_eps=None, use_target_net=False, compute_Q=False, success_rate=None, mode=TRAIN): g = self._task.mg_fn(g) ag = self._task.mg_fn(ag) o, g = self._preprocess_og(o, ag, g) if mode == EVAL: policy = self.target if use_target_net else self.main else: if self._parameter_noise is not None: policy = self.perturbed else: policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.a_tf: np.zeros((o.size // self.dimo, self.dima), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] if mode == EVAL: u = np.clip(u, -self.max_u, self.max_u) else: if self._action_noise is not None: noise = self._action_noise() assert u.shape[0] == 1 u = u[0] u += noise u = np.clip(u, -self.max_u, self.max_u) if self._parameter_noise is None and self._action_noise is None: if noise_eps is None: noise_eps = self.noise_eps if random_eps is 
None: random_eps = self.random_eps if self._noise_adaptation: noise_eps *= 1 - success_rate random_eps *= 1 - success_rate noise = noise_eps * self.max_u * np.random.randn( *u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape( -1, 1) * (self._random_action(u.shape[0]) - u ) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def store_transitions(self, episode, info, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ new_episode = episode if not self._share_experience: new_episode = { key: [ np.asarray(value[i][np.squeeze(episode['tasks'][i]) == self._task_spec['id']]) for i in range(len(value)) if np.any( np.squeeze(episode['tasks'][i]) == self._task_spec['id']) ] for key, value in episode.items() } batch_sizes = [len(value) for value in new_episode.values()] assert np.all(np.array(batch_sizes) == batch_sizes[0]) batch_size = batch_sizes[0] if batch_size == 0: return new_batch = deepcopy( dict( o=new_episode['o'], o_next=new_episode['o_next'], a=new_episode['a'], ag=[self._task.mg_fn(ag) for ag in new_episode['ag']], ag_next=[ self._task.mg_fn(ag_next) for ag_next in new_episode['ag_next'] ], g=[self._task.mg_fn(g) for g in new_episode['g']], g_next=[ self._task.mg_fn(g_next) for g_next in new_episode['g_next'] ], r=new_episode['r'], )) self._buffer.store_episode(new_batch) if update_stats: num_normalizing_transitions = transitions_in_episode_batch( new_batch) new_batch['ep_T'] = np.asarray( [ep.shape[0] for ep in list(new_batch.values())[0]]) new_batch = { key: np.asarray(value) for key, value in new_batch.items() } transitions = self.sample_transitions(new_batch, num_normalizing_transitions) o, g, ag = transitions['o'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self._buffer.get_current_size() def sample_batch(self): transitions = self._buffer.sample( self.batch_size) #otherwise only sample from primary buffer o, o_next, g = transitions['o'], transitions['o_next'], transitions[ 'g'] ag, ag_next = transitions['ag'], transitions['ag_next'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_next'], transitions['g_next'] = self._preprocess_og( o_next, ag_next, g) transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! 
Q_loss, pi_loss, Q, q_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.target.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf ]) return Q_loss, pi_loss, Q, q_grad, pi_grad def _update(self, q_grad, pi_grad): self.Q_adam.update(q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def train(self, stage=True): if self._buffer.get_current_size() == 0: return {} Q_losses = [] pi_losses = [] Qs = [] for i in range(self.train_steps): if stage: self.stage_batch() Q_loss, pi_loss, Q, q_grads, pi_grads = self._grads() self._update(q_grads, pi_grads) # Q_loss, pi_loss, Q, q_grads, pi_grads, *_ = self.sess.run([self.Q_loss_tf, self.main.Q_pi_tf, # self.target.Q_pi_tf, # self.Q_grads_tf, self.pi_grads_tf, # self.Q_train_op, self.pi_train_op]) Q_losses.append(Q_loss) pi_losses.append(np.mean(pi_loss)) Qs.append(Q) self.update_target_net() return { 'Q_loss': ('scalar', np.mean(Q_losses)), 'pi_loss': ('scalar', np.mean(pi_losses)), 'Q': ('hist', np.hstack(Qs)), 'q_grads': ('hist', np.hstack([q_grad.reshape(-1) for q_grad in q_grads])), 'pi_grads': ('hist', np.hstack([pi_grad.reshape(-1) for pi_grad in pi_grads])), } def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self._buffer.clear_buffer() def _create_network(self, reuse=False): self.sess = tf.get_default_session() or tf.InteractiveSession( config=tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)) # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer( self.dimo, self.normalizer_params['eps'], self.normalizer_params['default_clip_range'], sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer( self.dimg, self.normalizer_params['eps'], self.normalizer_params['default_clip_range'], sess=self.sess) # mini-batch sampling. 
batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks network_params = deepcopy(self.network_params) del network_params['net_type'] with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **network_params, **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_next'] target_batch_tf['g'] = batch_tf['g_next'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **network_params, **self.__dict__) vs.reuse_variables() assert len(self.vars("main")) == len(self.vars("target")) if self._parameter_noise is not None: with tf.variable_scope('perturbed') as vs: if reuse: vs.reuse_variables() self.perturbed = self.create_actor_critic(batch_tf, net_type='perturbed', **network_params, **self.__dict__) vs.reuse_variables() assert len(self.vars("main")) == len(self.vars("perturbed")) with tf.variable_scope('adaptive_param_noise') as vs: if reuse: vs.reuse_variables() self.adaptive_param_noise = self.create_actor_critic( batch_tf, net_type='adaptive_param_noise', **network_params, **self.__dict__) vs.reuse_variables() assert len(self.vars("main")) == len( self.vars("adaptive_param_noise")) self.adaptive_param_noise_distance = tf.sqrt( tf.reduce_mean( tf.square(self.main.pi_tf - self.adaptive_param_noise.pi_tf))) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self._gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) self.Q_grads_tf = tf.gradients(self.Q_loss_tf, self.vars('main/Q')) self.pi_grads_tf = tf.gradients(self.pi_loss_tf, self.vars('main/pi')) assert len(self.vars('main/Q')) == len(self.Q_grads_tf) assert len(self.vars('main/pi')) == len(self.pi_grads_tf) self.Q_grads_vars_tf = zip(self.Q_grads_tf, self.vars('main/Q')) self.pi_grads_vars_tf = zip(self.pi_grads_tf, self.vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=self.Q_grads_tf, var_list=self.vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=self.pi_grads_tf, var_list=self.vars('main/pi')) self.Q_adam = MpiAdam(self.vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self.vars('main/pi'), scale_grad_by_procs=False) # self.Q_adam = tf.train.AdamOptimizer(learning_rate=self.Q_lr) # self.pi_adam = tf.train.AdamOptimizer(learning_rate=self.pi_lr) # self.Q_train_op = self.Q_adam.minimize(self.Q_loss_tf, var_list=self.vars('main/Q')) # self.pi_train_op = self.pi_adam.minimize(self.pi_loss_tf, var_list=self.vars('main/pi')) # polyak averaging self.main_vars = self.vars('main/Q') + self.vars('main/pi') self.target_vars = self.vars('target/Q') + self.vars('target/pi') self.stats_vars = self.global_vars('o_stats') + self.global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # perturbe if self._parameter_noise is not None: self.perturbed_vars = self.vars('perturbed/Q') + self.vars( 'perturbed/pi') self.perturbe_op = list( map( lambda v: v[0].assign(v[1] + tf.random_normal( tf.shape(v[1]), mean=0., stddev=self.param_noise_stddev)), zip(self.perturbed_vars, self.main_vars))) self.adaptive_param_noise_vars = self.vars( 'adaptive_param_noise/Q') + self.vars( 'adaptive_param_noise/pi') self.adaptive_params_noise_perturbe_op = list( map( lambda v: v[0].assign(v[1] + tf.random_normal( tf.shape(v[1]), mean=0., stddev=self.param_noise_stddev)), zip(self.adaptive_param_noise_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self.global_vars('')).run() self._sync_optimizers() self._init_target_net() self._sync_source_tf = [ tf.placeholder(tf.float32, x.shape) for x in self.vars('') ] self._sync_op_tf = [ target.assign(source) for target, source in zip(self.vars(''), self._sync_source_tf) ] self._global_sync_source_tf = [ tf.placeholder(tf.float32, x.shape) for x in self.global_vars('') ] self._global_sync_op_tf = [ target.assign(source) for target, source in zip( self.global_vars(''), self._global_sync_source_tf) ] def adapt_param_noise(self): if not self._buffer.get_current_size() > 0: return self.sess.run(self.adaptive_params_noise_perturbe_op, feed_dict={ self.param_noise_stddev: self._parameter_noise.current_stddev }) self.stage_batch() distance = self.sess.run(self.adaptive_param_noise_distance) self._parameter_noise.adapt(distance) def vars(self, scope=''): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self._scope + '/' + scope) if scope == '': res += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self._scope + '/' + 'o_stats/mean') res += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self._scope + '/' + 'o_stats/std') res += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self._scope + '/' + 'g_stats/mean') res += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self._scope + '/' + 'g_stats/std') assert len(res) > 0 return res def global_vars(self, scope=''): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self._scope + '/' + scope) return res def get_params(self, scope=''): return (self._scope, [(str(var), self.sess.run(var)) for var in self.vars(scope)]) def get_global_params(self, scope=''): return (self._scope, [(str(var), self.sess.run(var)) for var in self.global_vars(scope)]) def set_params(self, params, scope=''): params = [param[1] for param in params] self.sess.run(self._sync_op_tf, feed_dict=dict([ (ph, param) for ph, param in zip(self._sync_source_tf, params) ])) def set_global_params(self, params, scope=''): params = [param[1] for param in params] self.sess.run( self._global_sync_op_tf, feed_dict=dict([ (ph, param) for ph, param in zip(self._global_sync_source_tf, params) ]))
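# --- Illustrative sketch (not part of the original code) ---------------------
# adapt_param_noise above measures the action-space distance between the main
# policy and its parameter-noise-perturbed copy and passes it to
# self._parameter_noise.adapt(). The concrete noise object comes from
# get_noise_from_string, which is not shown here; the class below is a
# hypothetical stand-in for the usual adaptation rule (grow the stddev while
# the perturbation is too small, shrink it when it is too large):
class AdaptiveParamNoise(object):
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.2,
                 adaptation_coefficient=1.01):
        self.current_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient

    def adapt(self, distance):
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient  # too noisy: shrink
        else:
            self.current_stddev *= self.adaptation_coefficient  # too tame: grow

# usage: an observed perturbation distance larger than desired shrinks the stddev
noise = AdaptiveParamNoise()
noise.adapt(0.5)
print(noise.current_stddev)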
class NAF(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, bc_loss, q_filter, num_demo, demo_batch_size, prm_loss_weight, aux_loss_weight, sample_transitions, gamma, reuse=False, use_seperate_networks=False, **kwargs): if self.clip_return is None: self.clip_return = np.inf if use_seperate_networks: self.create_naf_network = import_function( "her.naf_utils.naf_network_seperate:Network") else: self.create_naf_network = import_function( "her.naf_utils.naf_network_shared:Network") input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] self.counter = 0 # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. buffer_shapes = { key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key]) for key, val in input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) global DEMO_BUFFER DEMO_BUFFER = ReplayBuffer( buffer_shapes, buffer_size, self.T, self.sample_transitions ) #initialize the demo buffer; in the same way as the primary data buffer def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def step(self, obs): actions = self.get_actions(obs['observation'], obs['achieved_goal'], obs['desired_goal']) return actions, None, None, None def get_actions(self, o, ag, g, noise_eps=0., random_eps=0.1, use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn( *u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * ( self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if 
len(ret) == 1: return ret[0] else: return ret def init_demo_buffer( self, demoDataFile, update_stats=True): #function that initializes the demo buffer demoData = np.load( demoDataFile) #load the demonstration data from data file info_keys = [ key.replace('info_', '') for key in self.input_dims.keys() if key.startswith('info_') ] info_values = [ np.empty((self.T - 1, 1, self.input_dims['info_' + key]), np.float32) for key in info_keys ] demo_data_obs = demoData['obs'] demo_data_acs = demoData['acs'] demo_data_info = demoData['info'] for epsd in range( self.num_demo ): # we initialize the whole demo buffer at the start of the training obs, acts, goals, achieved_goals = [], [], [], [] i = 0 for transition in range(self.T - 1): obs.append( [demo_data_obs[epsd][transition].get('observation')]) acts.append([demo_data_acs[epsd][transition]]) goals.append( [demo_data_obs[epsd][transition].get('desired_goal')]) achieved_goals.append( [demo_data_obs[epsd][transition].get('achieved_goal')]) for idx, key in enumerate(info_keys): info_values[idx][transition, i] = demo_data_info[epsd][transition][key] obs.append([demo_data_obs[epsd][self.T - 1].get('observation')]) achieved_goals.append( [demo_data_obs[epsd][self.T - 1].get('achieved_goal')]) episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals) for key, value in zip(info_keys, info_values): episode['info_{}'.format(key)] = value episode = convert_episode_to_batch_major(episode) global DEMO_BUFFER DEMO_BUFFER.store_episode( episode ) # create the observation dict and append them into the demonstration buffer logger.debug("Demo buffer size currently ", DEMO_BUFFER.get_current_size() ) #print out the demonstration buffer size if update_stats: # add transitions to normalizer to normalize the demo data as well episode['o_2'] = episode['o'][:, 1:, :] episode['ag_2'] = episode['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode) transitions = self.sample_transitions( episode, num_normalizing_transitions) o, g, ag = transitions['o'], transitions['g'], transitions[ 'ag'] transitions['o'], transitions['g'] = self._preprocess_og( o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() episode.clear() logger.info("Demo buffer size: ", DEMO_BUFFER.get_current_size() ) #print out the demonstration buffer size def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, g, ag = transitions['o'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.adam.sync() def _grads(self): # Avoid feed_dict here for performance! 
Q_grad = self.sess.run([self.Q_grad_tf]) return Q_grad def _update(self, Q_grad): self.adam.update(Q_grad, self.Q_lr) def sample_batch(self): if self.bc_loss: #use demonstration buffer to sample as well if bc_loss flag is set TRUE transitions = self.buffer.sample(self.batch_size - self.demo_batch_size) global DEMO_BUFFER transitions_demo = DEMO_BUFFER.sample( self.demo_batch_size) #sample from the demo buffer for k, values in transitions_demo.items(): rolloutV = transitions[k].tolist() for v in values: rolloutV.append(v.tolist()) transitions[k] = np.array(rolloutV) else: transitions = self.buffer.sample( self.batch_size) #otherwise only sample from primary buffer o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o_2, ag_2, g) transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): if stage: self.stage_batch() Q_grad = self._grads() self._update(Q_grad[0]) def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) #choose only the demo buffer samples mask = np.concatenate( (np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis=0) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_naf_network(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_naf_network(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_value = self.target.value clip_range = (-self.clip_return, 0. 
if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_value, *clip_range) self.Q_loss_tf = tf.reduce_mean(tf.square(target_tf - self.main.Q)) tf.summary.histogram("Q_loss", self.Q_loss_tf) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main')) assert len(self._vars('main')) == len(Q_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main')) # optimizers self.adam = MpiAdam(self._vars('main'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main') self.target_vars = self._vars('target') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() visualize = True if visualize: writer = tf.summary.FileWriter("output", self.sess.graph) writer.close() saver = tf.train.Saver() saver.save(self.sess, "./models/model.ckpt") def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix != '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = [ '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic' ] state = { k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames]) } state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert (len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node) def save(self, save_path): tf_util.save_variables(save_path)
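# --- Illustrative, standalone sketch (not part of the classes above): the clipped one-step TD
# target behind Q_loss_tf, written in plain numpy. With sparse, non-positive HER rewards the
# upper clip of 0 keeps the bootstrapped return from going positive. Values are invented.
import numpy as np

def td_target(r, q_next, gamma, clip_return, clip_pos_returns=True):
    upper = 0. if clip_pos_returns else np.inf
    return np.clip(r + gamma * q_next, -clip_return, upper)

r = np.array([[-1.], [0.]])            # reward is -1 until the goal is reached
q_next = np.array([[-12.3], [-0.4]])   # target critic's value of the next state
q_pred = np.array([[-11.0], [-0.2]])   # main critic's current prediction
target = td_target(r, q_next, gamma=0.98, clip_return=50.)
q_loss = np.mean(np.square(target - q_pred))   # the MSE the critic gradients descend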
class DDPG(object): @store_args def __init__(self, input_dims, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, normalize_obs, sample_transitions, gamma, buffers=None, reuse=False, tasks_ag_id=None, tasks_g_id=None, task_replay='', t_id=None, eps_task=None, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused, buffers (list): buffers to be used to store new transition (usually one per task + 1 task_ag_id (list): indices to find achieved goals for each task in the achieved goal vector task_g_id (list): indices to find agoals for each task in the goal vector task_replay (str): defines the task replay strategy (see train.py for info) t_id (int): index of the task corresponding to this policy when using a task-experts structure eps_task (float): epsilon parameter for the epsilon greedy strategy (task choice) """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) self.normalize_obs = normalize_obs input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimag = self.input_dims['ag'] self.dimu = self.input_dims['u'] if self.structure == 'curious' or self.structure == 'task_experts': self.dimtd = self.input_dims['task_descr'] # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, 1) self.stage_shapes = stage_shapes if t_id is not None: self.scope += str(t_id) # Create network. 
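# --- Illustrative, standalone sketch (not part of the __init__ above): how the staging shapes
# are derived from input_dims. The dimensions mimic a Fetch-style task and are invented.
from collections import OrderedDict

input_dims = {'o': 10, 'u': 4, 'g': 3, 'info_is_success': 1}
input_shapes = {key: (val,) for key, val in input_dims.items()}   # dims_to_shapes equivalent

stage_shapes = OrderedDict()
for key in sorted(input_dims.keys()):
    if key.startswith('info_'):          # info_* entries are never staged
        continue
    stage_shapes[key] = (None, *input_shapes[key])
for key in ['o', 'g']:                   # next-step copies consumed by the target network
    stage_shapes[key + '_2'] = stage_shapes[key]
stage_shapes['r'] = (None, 1)            # this variant stages the reward as (None, 1)
# -> OrderedDict([('g', (None, 3)), ('o', (None, 10)), ('u', (None, 4)),
#                 ('o_2', (None, 10)), ('g_2', (None, 3)), ('r', (None, 1))])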
with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # addition for multi-task structures if self.structure == 'curious' or self.structure == 'task_experts': self.tasks_g_id = tasks_g_id self.tasks_ag_id = tasks_ag_id self.nb_tasks = len(tasks_g_id) if buffers is not None: self.buffer = buffers if type(self.buffer) is list: if len(self.buffer) > 5: # distractor buffers are equal for i in range(6, len(self.buffer)): self.buffer[i] = self.buffer[5] self.first = True def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimag) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def get_actions(self, o, ag, g, task_descr=None, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } # addition for multi-task structures if self.structure == 'curious' or self.structure == 'task_experts': feed[policy.td_tf] = task_descr.reshape(-1, self.dimtd) ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn( *u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * ( self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def store_episode(self, episode_batch, cp, n_ep, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ # decompose episode_batch in episodes batch_size = episode_batch['ag'].shape[0] # addition in the case of curious goals, compute count of achieved goal that moved in the n modules self.cp = cp self.n_episodes = n_ep # addition for multi-task structures if self.structure == 'curious' or self.structure == 'task_experts': new_count_local = np.zeros([self.nb_tasks]) new_count_total = np.zeros([self.nb_tasks]) # add a new transition in a buffer only if the corresponding outcome has changed compare to the initial outcome for b in range(batch_size): active_tasks = [] for j in range(self.nb_tasks): if any(episode_batch['change'] [b, -1, self.tasks_ag_id[j][:len(self.tasks_g_id[j])]]): new_count_local[j] += 1 if self.nb_tasks < 5 or j < 5: active_tasks.append(j) MPI.COMM_WORLD.Allreduce(new_count_local, new_count_total, op=MPI.SUM) ep = dict() for key in episode_batch.keys(): ep[key] = episode_batch[key][b].reshape([ 1, episode_batch[key].shape[1], episode_batch[key].shape[2] ]) if 'buffer' in self.task_replay or self.task_replay == 'hand_designed': if len(active_tasks) == 0: ind_buffer = [0] else: for task in active_tasks: self.buffer[task + 
1].store_episode(ep) else: self.buffer.store_episode(ep) elif self.structure == 'flat' or self.structure == 'task_experts': for b in range(batch_size): ep = dict() for key in episode_batch.keys(): ep[key] = episode_batch[key][b].reshape([ 1, episode_batch[key].shape[1], episode_batch[key].shape[2] ]) self.buffer.store_episode(ep) # update statistics for goal and observation normalizations if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode_batch) if self.structure == 'curious' or self.structure == 'task_experts': transitions = self.sample_transitions( episode_batch, num_normalizing_transitions, task_to_replay=None) else: transitions = self.sample_transitions( episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions[ 'g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return sum( [self.buffer[i].get_current_size() for i in range(self.nb_tasks)]) def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): # addition for multi-task structures if self.structure == 'curious' or self.structure == 'task_experts': if self.structure == 'curious': if 'buffer' in self.task_replay or self.task_replay == 'hand_designed': buffers_sizes = np.array([ self.buffer[i].current_size * self.T for i in range(self.nb_tasks + 1) ]) self.proportions = np.zeros([self.nb_tasks + 1]) buffers_sizes = np.array([ self.buffer[i].current_size * self.T for i in range(self.nb_tasks + 1) ]) self.proportions = np.zeros([self.nb_tasks + 1]) if buffers_sizes[1:].sum() < self.T: ind_valid_buffers = np.array([0]) n_valid = 1 self.proportions = buffers_sizes / buffers_sizes.sum( ) * self.batch_size else: ind_valid_buffers = np.argwhere(buffers_sizes[1:] > 0) ind_valid_buffers = ind_valid_buffers.reshape( [ind_valid_buffers.size]) n_valid = len(ind_valid_buffers) # draw transition from random buffers (random tasks) if self.task_replay == 'replay_task_random_buffer': proba = 1 / ind_valid_buffers.size * np.ones( [n_valid]) elif self.task_replay == 'replay_task_cp_buffer': CP = self.cp[ind_valid_buffers] if CP.sum() == 0: proba = (1 / n_valid) * np.ones([n_valid]) else: proba = self.eps_task * (1 / n_valid) * np.ones([n_valid]) + \ (1 - self.eps_task) * CP / CP.sum() proba[-1] = 1 - proba[:-1].sum() self.proportions[ind_valid_buffers + 1] = proba * self.batch_size self.proportions = self.proportions.astype(np.int) remain = self.batch_size - self.proportions.sum() for i in range(remain): self.proportions[ind_valid_buffers[i % n_valid] + 1] += 1 self.proportions = self.proportions.astype(np.int) elif self.task_replay == 'replay_cp_task_transition': CP = self.cp.copy() if CP.sum() == 0: proba = (1 / self.nb_tasks) * np.ones([self.nb_tasks]) else: proba = 
self.eps_task * (1 / self.nb_tasks) * np.ones([self.nb_tasks]) + \ (1 - self.eps_task) * CP / CP.sum() proba[-1] = 1 - proba[:-1].sum() transitions = self.buffer.sample(self.batch_size, task_to_replay=None, cp_proba=proba) else: transitions = self.buffer.sample(self.batch_size, task_to_replay=None, cp_proba=None) elif self.structure == 'task_experts': if self.task_replay == 'replay_current_task_buffer': buffers_sizes = np.array([ self.buffer[i].current_size * self.T for i in range(self.nb_tasks + 1) ]) ind_valid_buffers = np.argwhere(buffers_sizes > 0) ind_valid_buffers = ind_valid_buffers.reshape( [ind_valid_buffers.size]) n_valid = len(ind_valid_buffers) self.proportions = np.zeros([self.nb_tasks + 1]) if buffers_sizes[self.t_id + 1] > 0: self.proportions[self.t_id + 1] = 1 else: self.proportions[ind_valid_buffers] = 1 / len( ind_valid_buffers) self.proportions *= self.batch_size self.proportions = self.proportions.astype(np.int) remain = self.batch_size - self.proportions.sum() for i in range(remain): self.proportions[ind_valid_buffers[i % n_valid]] += 1 self.proportions = self.proportions.astype(np.int) else: transitions = self.buffer.sample(self.batch_size, task_to_replay=None, cp_proba=None) if 'buffer' in self.task_replay or self.task_replay == 'hand_designed': assert self.proportions.sum() == self.batch_size # sample transitions from different buffers trans = [] for i in range(self.nb_tasks + 1): if self.proportions[i] > 0: if self.structure == 'curious': if i > 0: task_to_replay = i - 1 else: task_to_replay = None else: task_to_replay = self.t_id trans.append(self.buffer[i].sample( self.proportions[i], task_to_replay=task_to_replay)) # concatenate transitions from different buffers and shuffle shuffle_inds = np.arange(self.batch_size) np.random.shuffle(shuffle_inds) transitions = dict() for key in trans[0].keys(): tmp = np.array([]).reshape([0, trans[0][key].shape[1]]) for ts in trans: tmp = np.concatenate([tmp, ts[key]]) transitions[key] = tmp[shuffle_inds, :] elif self.structure == 'flat': transitions = self.buffer.sample(self.batch_size) o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o_2, ag_2, g) # #test addition !! # transitions['task_descr'] = np.zeros([256, 8]) transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): if stage: self.stage_batch() critic_loss, actor_loss, Q_grad, pi_grad = self._grads() self._update(Q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): for i in range(self.nb_tasks): self.buffer[i].clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): if MPI.COMM_WORLD.Get_rank() == 0: logger.info("g a DDPG agent with action space %d x %s..." 
% (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix is not '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def save_weights(self, path): to_save = [] scopes_var = ['main/Q', 'main/pi', 'target/Q', 'target/pi'] scopes_global_var = ['o_stats', 'g_stats'] for s in scopes_var: tmp = [] for v in self._vars(s): tmp.append(v.eval()) to_save.append(tmp) for s in scopes_global_var: tmp = [] for v in self._global_vars(s): tmp.append(v.eval()) to_save.append(tmp) with open(path + '_weights.pkl', 'wb') as f: pickle.dump(to_save, f) def load_weights(self, path): with open(path + '_weights.pkl', 'rb') as f: weights = pickle.load(f) scopes_var = ['main/Q', 'main/pi', 'target/Q', 'target/pi'] scopes_global_var = ['o_stats', 'g_stats'] for i_s, s in enumerate(scopes_var): for i_v, v in enumerate(self._vars(s)): v.load(weights[i_s][i_v]) for i_s, s in enumerate(scopes_global_var): for i_v, v in enumerate(self._global_vars(s)): v.load(weights[i_s + len(scopes_var)][i_v]) def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = [ '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic' ] state = { k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames]) } # state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert (len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node)
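# --- Illustrative, standalone sketch (not part of the class above): turning per-task
# competence-progress (CP) estimates into the epsilon-greedy sampling proportions used when
# drawing transitions from the per-task buffers. Values are invented.
import numpy as np

def task_proportions(cp, eps_task, batch_size):
    n = len(cp)
    if cp.sum() == 0:
        proba = np.ones(n) / n                                   # no signal yet: uniform
    else:
        proba = eps_task / n + (1. - eps_task) * cp / cp.sum()   # epsilon-greedy mix
    proba[-1] = 1. - proba[:-1].sum()                            # make it sum to exactly 1
    counts = (proba * batch_size).astype(int)
    counts[:batch_size - counts.sum()] += 1                      # spread the rounding remainder
    return proba, counts

proba, counts = task_proportions(np.array([0., .3, .1, .6]), eps_task=0.2, batch_size=256)
assert counts.sum() == 256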
class DDPG_HER_HRL_POLICY(HRL_Policy): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ self.ep_ctr = 0 self.hist_bins = 50 self.draw_hist_freq = 3 self._reset_hists() self.shared_pi_err_coeff = kwargs['shared_pi_err_coeff'] HRL_Policy.__init__(self, input_dims, T, rollout_batch_size, **kwargs) self.hidden = hidden self.layers = layers self.max_u = max_u self.network_class = network_class self.sample_transitions = sample_transitions self.scope = scope self.subtract_goals = subtract_goals self.relative_goals = relative_goals self.clip_obs = clip_obs self.Q_lr = Q_lr self.pi_lr = pi_lr self.batch_size = batch_size self.buffer_size = buffer_size self.clip_pos_returns = clip_pos_returns self.gamma = gamma self.polyak = polyak self.clip_return = clip_return self.norm_eps = norm_eps self.norm_clip = norm_clip self.action_l2 = action_l2 if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) self.stage_shapes['gamma'] = (None, ) # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. 
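# --- Illustrative, standalone sketch: the episode-buffer shape convention configured just below.
# 'o' and 'ag' keep T+1 entries (they include the state after the last action); every other key
# keeps T. Dimensions are invented.
T = 50
input_shapes = {'o': (10,), 'u': (4,), 'g': (3,)}
buffer_shapes = {key: (T if key != 'o' else T + 1, *shape)
                 for key, shape in input_shapes.items()}
buffer_shapes['ag'] = (T + 1, 3)
# -> {'o': (51, 10), 'u': (50, 4), 'g': (50, 3), 'ag': (51, 3)}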
buffer_shapes = { key: (self.T if key != 'o' else self.T + 1, *self.input_shapes[key]) for key, val in self.input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T + 1, self.dimg) buffer_shapes['p'] = (buffer_shapes['g'][0], 1) buffer_shapes['steps'] = buffer_shapes['p'] buffer_size = self.buffer_size #// self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) self.preproc_lr = (self.Q_lr + self.pi_lr) / 2 def _reset_hists(self): self.hists = {"attn": None, "prob_in": None, "rnd": None} def draw_hists(self, img_dir): for hist_name, hist in self.hists.items(): if hist is None: continue step_size = 1.0 / self.hist_bins xs = np.arange(0, 1, step_size) hist /= (self.ep_ctr * self.T) fig, ax = plt.subplots() ax.bar(xs, hist, step_size) plt.savefig(img_dir + "/{}_hist_l_{}_ep_{}.png".format( hist_name, self.h_level, self.ep_ctr)) self._reset_hists() if self.child_policy is not None: self.child_policy.draw_hists(img_dir) def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False, exploit=True, **kwargs): # noise_eps = noise_eps if not exploit else 0. # random_eps = random_eps if not exploit else 0. o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf, policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] q = ret[1] noise = noise_eps * self.max_u * np.random.randn( *u.shape) # gaussian noise noisy_u = u + noise u = np.clip(noisy_u, -self.max_u, self.max_u) random_u = np.random.binomial(1, random_eps, u.shape[0]).reshape( -1, 1) * (self._random_action(u.shape[0]) - noisy_u) # eps-greedy u += random_u u = u[0].copy() self.update_hists(feed, policy) return u, q def update_hists(self, feed, policy): vals = [] hist_names_to_consider = [] for hist_name, hist in self.hists.items(): if hist_name in self.main.__dict__: hist_names_to_consider.append(hist_name) vals.append(eval("policy.{}".format(hist_name))) ret = self.sess.run(vals, feed_dict=feed) for val_idx, hist_name in enumerate(hist_names_to_consider): this_vals = ret[val_idx] this_hists = np.histogram(this_vals, self.hist_bins, range=(0, 1)) if self.hists[hist_name] is None: self.hists[hist_name] = this_hists[0] / this_vals.shape[1] else: self.hists[hist_name] += this_hists[0] / this_vals.shape[1] def scale_and_offset_action(self, u): scaled_u = u.copy() scaled_u *= self.subgoal_scale scaled_u += self.subgoal_offset return scaled_u def inverse_scale_and_offset_action(self, scaled_u): u = scaled_u.copy() u -= self.subgoal_offset u /= self.subgoal_scale return u def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ # print("Storing Episode h-level = {}".format(self.h_level)) 
self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] # num_normalizing_transitions = episode_batch['u'].shape[1] num_normalizing_transitions = transitions_in_episode_batch( episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions[ 'g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.o_stats.recompute_stats() self.g_stats.update(transitions['g']) self.g_stats.recompute_stats() self.ep_ctr += 1 # print("Done storing Episode") def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() self.shared_preproc_adam.sync() def _grads(self): # Avoid feed_dict here for performance! critic_loss, actor_loss, preproc_loss, Q_grad, pi_grad, preproc_grad = self.sess.run( [ self.Q_loss_tf, self.main.Q_pi_tf, self.shared_preproc_loss_tf, self.Q_grad_tf, self.pi_grad_tf, self.shared_preproc_grad_tf ]) return critic_loss, actor_loss, preproc_loss, Q_grad, pi_grad, preproc_grad def _update(self, Q_grad, pi_grad, preproc_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) self.shared_preproc_adam.update(preproc_grad, self.preproc_lr) def sample_batch(self): transitions = self.buffer.sample(self.batch_size) o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o_2, ag_2, g) transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): if stage: self.stage_batch() critic_loss, actor_loss, preproc_loss, Q_grad, pi_grad, preproc_grad = self._grads( ) self._update(Q_grad, pi_grad, preproc_grad) return critic_loss, actor_loss, preproc_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) # assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a DDPG_HRL agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. 
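# --- Illustrative, standalone sketch: the transition-dependent discount used by the target
# below. Each staged transition carries its own 'gamma' entry (for instance reflecting how many
# low-level steps a subgoal lasted -- an assumption here), so the bootstrap term becomes
# r + gamma * gamma_t * Q'(s', pi(s')). Values are invented.
import numpy as np

gamma = 0.98
r = np.array([-1., -1., 0.])
gamma_t = np.array([1.0, 0.5, 1.0])        # per-transition multiplier staged next to the reward
q_next = np.array([-8.0, -8.0, -0.1])
target = np.clip(r + gamma * gamma_t * q_next, -50., 0.)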
batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) # target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) target_tf = tf.clip_by_value( batch_tf['r'] + tf.transpose(self.gamma * batch_tf['gamma']) * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) self.shared_q_err_coeff = 1.0 - self.shared_pi_err_coeff self.shared_preproc_loss_tf = ( self.shared_q_err_coeff * self.Q_loss_tf + self.shared_pi_err_coeff * self.pi_loss_tf) if "shared_preproc_err" in self.main.__dict__: self.shared_preproc_loss_tf += self.main.shared_preproc_err Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) shared_preproc_grads_tf = tf.gradients( self.shared_preproc_loss_tf, self._vars('main/shared_preproc')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) assert len( self._vars('main/shared_preproc')) == len(shared_preproc_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.shared_preproc_grads_vars_tf = zip( shared_preproc_grads_tf, self._vars('main/shared_preproc')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) self.shared_preproc_grad_tf = flatten_grads( grads=shared_preproc_grads_tf, var_list=self._vars('main/shared_preproc')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) self.shared_preproc_adam = MpiAdam(self._vars('main/shared_preproc'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars( 'main/pi') + self._vars('main/shared_preproc') self.target_vars = self._vars('target/Q') + self._vars( 'target/pi') + self._vars('target/shared_preproc') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix='policy'): logs = [] logs += [('buffer_size', int(self.buffer.current_size))] logs = log_formater(logs, prefix + "_{}".format(self.h_level)) if self.child_policy is not None: child_logs = self.child_policy.logs(prefix=prefix) logs += child_logs return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = [ '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic' ] state = { k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames]) } state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert (len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node)
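# --- Illustrative, standalone sketch (not part of the class above): the Polyak averaging applied
# by update_target_net_op, in plain numpy. A polyak value close to 1 makes the target network a
# slowly-trailing copy of the main network. Values are invented.
import numpy as np

def polyak_update(target_vars, main_vars, polyak=0.95):
    return [polyak * t + (1. - polyak) * m for t, m in zip(target_vars, main_vars)]

target, main = [np.zeros(3)], [np.ones(3)]
for _ in range(3):
    target = polyak_update(target, main)
# after three updates every component of target[0] equals 1 - 0.95**3 = 0.142625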
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, action_scale, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, bc_loss, q_filter, num_demo, demo_batch_size, prm_loss_weight, aux_loss_weight, sample_transitions, gamma, temperature, prioritization, env_name, alpha, beta0, beta_iters, total_timesteps, rank_method, reuse=False, **kwargs): """ Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Added functionality to use demonstrations for training to Overcome exploration problem. Args: :param input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) :param buffer_size (int): number of transitions that are stored in the replay buffer :param hidden (int): number of units in the hidden layers :param layers (int): number of hidden layers :param network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') :param polyak (float): coefficient for Polyak-averaging of the target network :param batch_size (int): batch size for training :param Q_lr (float): learning rate for the Q (critic) network :param pi_lr (float): learning rate for the pi (actor) network :param norm_eps (float): a small value used in the normalizer to avoid numerical instabilities :param norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] :param action_scale(float): maximum action magnitude, i.e. actions are in [-max_u, max_u] :param action_l2 (float): coefficient for L2 penalty on the actions :param clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] :param scope (str): the scope used for the TensorFlow graph :param T (int): the time horizon for rollouts :param rollout_batch_size (int): number of parallel rollouts per DDPG agent :param subtract_goals (function): function that subtracts goals from each other :param relative_goals (boolean): whether or not relative goals should be fed into the network :param clip_pos_returns (boolean): whether or not positive returns should be clipped :param clip_return (float): clip returns to be in [-clip_return, clip_return] :param sample_transitions (function) function that samples from the replay buffer :param gamma (float): gamma used for Q learning updates :param reuse (boolean): whether or not the networks should be reused :param bc_loss: whether or not the behavior cloning loss should be used as an auxilliary loss :param q_filter: whether or not a filter on the q value update should be used when training with demonstartions :param num_demo: Number of episodes in to be used in the demonstration buffer :param demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread :param prm_loss_weight: Weight corresponding to the primary loss :param aux_loss_weight: Weight corresponding to the auxilliary loss also called the cloning loss """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function( self.network_class) # points to actor_critic.py self.input_dims = input_dims input_shapes = dims_to_shapes(input_dims) self.dimo = input_dims['o'] self.dimg = input_dims['g'] self.dimu = input_dims['u'] self.sample_count = 1 self.cycle_count = 1 self.critic_loss_episode = [] self.actor_loss_episode = [] self.critic_loss_avg = [] self.actor_loss_avg = [] # Energy based parameters 
self.prioritization = prioritization self.env_name = env_name self.temperature = temperature self.rank_method = rank_method # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Creates DDPG agent # Configure the replay buffer. buffer_shapes = { key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key]) for key, val in input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size # print("begin init") if self.prioritization == 'energy': self.buffer = ReplayBufferEnergy(buffer_shapes, buffer_size, self.T, self.sample_transitions, self.prioritization, self.env_name) # elif self.prioritization == 'tderror': # self.buffer = PrioritizedReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions, alpha) # if beta_iters is None: # beta_iters = total_timesteps # self.beta_schedule = LinearSchedule(beta_iters, initial_p=beta0, final_p=1.0) else: self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) # print("finish init") def _random_action(self, n): return np.random.uniform(low=-self.action_scale, high=self.action_scale, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: # no self.relative_goals print("self.relative_goals: ", self.relative_goals) g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) # Clip (limit) the values in an array. 
o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g # Not used def step(self, obs): actions = self.get_actions(obs['observation'], obs['achieved_goal'], obs['desired_goal']) return actions, None, None, None def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) # Use target network use main network policy = self.target if use_target_net else self.main # values to compute policy_weights = [policy.actor_tf] if compute_Q: policy_weights += [policy.critic_with_actor_tf] # feeds agent_feed = { policy.obs: o.reshape(-1, self.dimo), policy.goals: g.reshape(-1, self.dimg), policy.actions: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } # Evaluating policy weights with agent information ret = self.sess.run(policy_weights, feed_dict=agent_feed) # print(ret) # action postprocessing action = ret[0] noise = noise_eps * self.action_scale * np.random.randn( *action.shape) # gaussian noise action += noise action = np.clip(action, -self.action_scale, self.action_scale) action += np.random.binomial(1, random_eps, action.shape[0]).reshape( -1, 1) * (self._random_action(action.shape[0]) - action ) # eps-greedy if action.shape[0] == 1: action = action[0] action = action.copy() ret[0] = action if len(ret) == 1: return ret[0] else: return ret # Not used # def init_demo_buffer(self, demoDataFile, update_stats=True): # function that initializes the demo buffer # # demoData = np.load(demoDataFile) # load the demonstration data from data file # info_keys = [key.replace('info_', '') for key in self.input_dims.keys() if key.startswith('info_')] # info_values = [np.empty((self.T - 1, 1, self.input_dims['info_' + key]), np.float32) for key in info_keys] # # demo_data_obs = demoData['obs'] # demo_data_acs = demoData['acs'] # demo_data_info = demoData['info'] # # for epsd in range(self.num_demo): # we initialize the whole demo buffer at the start of the training # obs, acts, goals, achieved_goals = [], [], [], [] # i = 0 # for transition in range(self.T - 1): # obs.append([demo_data_obs[epsd][transition].get('observation')]) # acts.append([demo_data_acs[epsd][transition]]) # goals.append([demo_data_obs[epsd][transition].get('desired_goal')]) # achieved_goals.append([demo_data_obs[epsd][transition].get('achieved_goal')]) # for idx, key in enumerate(info_keys): # info_values[idx][transition, i] = demo_data_info[epsd][transition][key] # # obs.append([demo_data_obs[epsd][self.T - 1].get('observation')]) # achieved_goals.append([demo_data_obs[epsd][self.T - 1].get('achieved_goal')]) # # episode = dict(observations=obs, # u=acts, # g=goals, # ag=achieved_goals) # for key, value in zip(info_keys, info_values): # episode['info_{}'.format(key)] = value # # episode = convert_episode_to_batch_major(episode) # global DEMO_BUFFER # DEMO_BUFFER.ddpg_store_episode( # episode) # create the observation dict and append them into the demonstration buffer # logger.debug("Demo buffer size currently ", # DEMO_BUFFER.get_current_size()) # print out the demonstration buffer size # # if update_stats: # # add transitions to normalizer to normalize the demo data as well # episode['o_2'] = episode['o'][:, 1:, :] # episode['ag_2'] = episode['ag'][:, 1:, :] # num_normalizing_transitions = transitions_in_episode_batch(episode) # transitions = self.sample_transitions(episode, num_normalizing_transitions) # # o, g, ag = transitions['o'], transitions['g'], transitions['ag'] # transitions['o'], 
transitions['g'] = self._preprocess_og(o, ag, g) # # No need to preprocess the o_2 and g_2 since this is only used for stats # # self.o_stats.update(transitions['o']) # self.g_stats.update(transitions['g']) # # self.o_stats.recompute_stats() # self.g_stats.recompute_stats() # episode.clear() # # logger.info("Demo buffer size: ", DEMO_BUFFER.get_current_size()) # print out the demonstration buffer size def ddpg_store_episode(self, episode_batch, dump_buffer, w_potential, w_linear, w_rotational, rank_method, clip_energy, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ # if self.prioritization == 'tderror': # self.buffer.store_episode(episode_batch, dump_buffer) # print("DDPG BEGIN STORE episode") if self.prioritization == 'energy': self.buffer.store_episode(episode_batch, w_potential, w_linear, w_rotational, rank_method, clip_energy) else: self.buffer.store_episode(episode_batch) # print("DDPG END STORE episode") if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode_batch) # print("START ddpg sample transition") # n_cycles calls HER sampler if self.prioritization == 'energy': if not self.buffer.current_size == 0 and not len( episode_batch['ag']) == 0: transitions = self.sample_transitions( episode_batch, num_normalizing_transitions, 'none', 1.0, self.sample_count, self.cycle_count, True) # elif self.prioritization == 'tderror': # transitions, weights, episode_idxs = \ # self.sample_transitions(self.buffer, episode_batch, num_normalizing_transitions, beta=0) else: transitions = self.sample_transitions( episode_batch, num_normalizing_transitions) # print("END ddpg sample transition") # print("DDPG END STORE episode 2") o, g, ag = transitions['o'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.critic_optimiser.sync() self.actor_optimiser.sync() def _grads(self): # Avoid feed_dict here for performance! 
critic_loss, actor_loss, critic_grad, actor_grad, td_error = self.sess.run( [ self.critic_loss_tf, # MSE of target_tf - main.critic_tf self.main.critic_with_actor_tf, # actor_loss self.critic_grads, self.actor_grads, self.td_error_tf ]) return critic_loss, actor_loss, critic_grad, actor_grad, td_error def _update(self, critic_grads, actor_grads): self.critic_optimiser.update(critic_grads, self.Q_lr) self.actor_optimiser.update(actor_grads, self.pi_lr) def sample_batch(self, t): # print("Begin Sample batch") if self.prioritization == 'energy': transitions = self.buffer.sample(self.batch_size, self.rank_method, temperature=self.temperature) weights = np.ones_like(transitions['r']).copy() # print("reach?") # elif self.prioritization == 'tderror': # transitions, weights, idxs = self.buffer.sample(self.batch_size, beta=self.beta_schedule.value(t)) else: transitions = self.buffer.sample(self.batch_size) weights = np.ones_like(transitions['r']).copy() o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o_2, ag_2, g) transitions['w'] = weights.flatten().copy() # note: ordered dict transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] # if self.prioritization == 'tderror': # return (transitions_batch, idxs) # else: # print("End sample batch") return transitions_batch def stage_batch(self, t, batch=None): if batch is None: # if self.prioritization == 'tderror': # batch, idxs = self.sample_batch(t) # else: batch = self.sample_batch(t) assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) # if self.prioritization == 'tderror': # return idxs def ddpg_train(self, t, dump_buffer, stage=True): if stage: # if self.prioritization == 'tderror': # idxs = self.stage_batch(t) # else: self.stage_batch(t) self.critic_loss, self.actor_loss, Q_grad, pi_grad, td_error = self._grads( ) # if self.prioritization == 'tderror': # new_priorities = np.abs(td_error) + self.eps # td_error # if dump_buffer: # T = self.buffer.buffers['u'].shape[1] # episode_idxs = idxs // T # t_samples = idxs % T # batch_size = td_error.shape[0] # with self.buffer.lock: # for i in range(batch_size): # self.buffer.buffers['td'][episode_idxs[i]][t_samples[i]] = td_error[i] # # self.buffer.update_priorities(idxs, new_priorities) # Update gradients for actor and critic networks self._update(Q_grad, pi_grad) # My variables self.visual_actor_loss = 1 - self.actor_loss self.critic_loss_episode.append(self.critic_loss) self.actor_loss_episode.append(self.visual_actor_loss) # print("Critic loss: ", self.critic_loss, " Actor loss: ", self.actor_loss) return self.critic_loss, np.mean(self.actor_loss) def _init_target_net(self): self.sess.run(self.init_target_net_op) def ddpg_update_target_net(self): # print("ddpg_cycle", self.cycle_count) self.cycle_count += 1 self.critic_loss_avg = np.mean(self.critic_loss_episode) self.actor_loss_avg = np.mean(self.actor_loss_episode) self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): 
logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.action_scale)) self.sess = tf_util.get_session() # running averages with tf.variable_scope('o_stats') as variable_scope: if reuse: variable_scope.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as variable_scope: if reuse: variable_scope.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # choose only the demo buffer samples mask = np.concatenate( (np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis=0) # networks with tf.variable_scope('main') as variable_scope: if reuse: variable_scope.reuse_variables() # Create actor critic network self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) variable_scope.reuse_variables() with tf.variable_scope('target') as variable_scope: if reuse: variable_scope.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) variable_scope.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_critic_actor_tf = self.target.critic_with_actor_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_critic_actor_tf, *clip_range) # MSE of target_tf - critic_tf. This is the TD Learning step self.td_error_tf = tf.stop_gradient(target_tf) - self.main.critic_tf self.critic_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - self.main.critic_tf)) # self.actor_loss_tf = -tf.reduce_mean(self.main.critic_with_actor_tf) self.actor_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.actor_tf / self.action_scale)) # Constructs symbolic derivatives of sum of critic_loss_tf vs _vars('main/Q') critic_grads_tf = tf.gradients(self.critic_loss_tf, self._vars('main/Q')) actor_grads_tf = tf.gradients(self.actor_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(critic_grads_tf) assert len(self._vars('main/pi')) == len(actor_grads_tf) self.critic_grads_vars_tf = zip(critic_grads_tf, self._vars('main/Q')) self.actor_grads_vars_tf = zip(actor_grads_tf, self._vars('main/pi')) # Flattens variables and their gradients. 
self.critic_grads = flatten_grads(grads=critic_grads_tf, var_list=self._vars('main/Q')) self.actor_grads = flatten_grads(grads=actor_grads_tf, var_list=self._vars('main/pi')) # optimizers self.critic_optimiser = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.actor_optimiser = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging used to update target network self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') # list( map( lambda( assign() ), zip())) self.init_target_net_op = list( map( # Apply lambda to each item item in the zipped list lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) # Polyak-Ruppert averaging where most recent iterations are weighted more than past iterations. self.update_target_net_op = list( map( # Apply lambda to each item item in the zipped list lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), # polyak averaging zip(self.target_vars, self.main_vars)) # [(target_vars, main_vars), (), ...] ) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('actor_critic/critic_loss', self.critic_loss_avg)] logs += [('actor_critic/actor_loss', self.actor_loss_avg)] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] # logs += [('critic_loss', np.mean(self.sess.run([self.critic_loss])))] # logs += [('actor_loss', np.mean(self.sess.run([self.actor_loss])))] if prefix != '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = [ '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic' ] state = { k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames]) } state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert (len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node) def save(self, save_path): tf_util.save_variables(save_path)
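# --- Illustrative, standalone sketch (not part of the class above): the action post-processing
# used by get_actions -- Gaussian exploration noise, clipping to the action range, then with
# probability random_eps replacing the action by a uniformly random one. Values are invented.
import numpy as np

def postprocess_actions(u, max_u, noise_eps, random_eps):
    u = u + noise_eps * max_u * np.random.randn(*u.shape)         # additive gaussian noise
    u = np.clip(u, -max_u, max_u)
    replace = np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1)
    random_u = np.random.uniform(-max_u, max_u, size=u.shape)
    return u + replace * (random_u - u)                           # eps-greedy re-draw

u = postprocess_actions(np.zeros((5, 4)), max_u=1., noise_eps=0.2, random_eps=0.3)
assert u.shape == (5, 4) and np.all(np.abs(u) <= 1.)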
class DDPG(object):
    @store_args
    def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
                 polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u,
                 action_l2, clip_obs, scope, T, rollout_batch_size,
                 subtract_goals, relative_goals, clip_pos_returns, clip_return,
                 sample_transitions, gamma, reuse=False, **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'GHER.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
        """
        # print("\n\n\n\n1--", input_dims, "\n2--", buffer_size, "\n3--", hidden,
        #       "\n4--", layers, "\n5--", network_class, "\n6--", polyak, "\n7--", batch_size,
        #       "\n8--", Q_lr, "\n9--", pi_lr, "\n10--", norm_eps, "\n11--", norm_clip,
        #       "\n12--", max_u, "\n13--", action_l2, "\n14--", clip_obs, "\n15--", scope, "\n16--", T,
        #       "\n17--", rollout_batch_size, "\n18--", subtract_goals, "\n19--", relative_goals,
        #       "\n20--", clip_pos_returns, "\n21--", clip_return,
        #       "\n22--", sample_transitions, "\n23--", gamma)
        """
        Example parameter values for a FetchReach-v1 run:
            input_dims (dict of ints): {'o': 10, 'u': 4, 'g': 3, 'info_is_success': 1} (o, u and g are all network inputs)
            buffer_size (int): 1E6 (total number of transitions held in the replay buffer)
            hidden (int): 256 (number of units per hidden layer)
            layers (int): 3 (three hidden layers)
            network_class (str): 'GHER.ActorCritic'
            polyak (float): 0.95 (smoothing coefficient for the target-network update)
            batch_size (int): 256 (batch size)
            Q_lr (float): 0.001 (learning rate)
            pi_lr (float): 0.001 (learning rate)
            norm_eps (float): 0.01 (used to avoid numerical overflow)
            norm_clip (float): 5 (norm_clip)
            max_u (float): 1.0 (actions lie in [-1.0, 1.0])
            action_l2 (float): 1.0 (regularisation coefficient for the actor loss)
            clip_obs (float): 200 (observations are clipped to (-200, +200))
            scope (str): "ddpg" (TensorFlow scope name)
            T (int): 50 (number of interaction steps per episode)
            rollout_batch_size (int): 2 (number of parallel rollouts per DDPG agent)
            subtract_goals (function): goal-preprocessing function; given a and b it returns a - b
            relative_goals (boolean): False (True if goals should be passed through subtract_goals)
            clip_pos_returns (boolean): True (whether positive returns are clipped away)
            clip_return (float): 50 (returns are limited to [-clip_return, clip_return])
            sample_transitions (function): the function returned by her.py; its arguments are set in config.py
            gamma (float): 0.98 (discount factor used for the Q-network update)
        sample_transitions comes from the HER definition and is the key ingredient here.
        """
        if self.clip_return is None:
            self.clip_return = np.inf

        # The network structure and computation graph are created in actor_critic.py
        self.create_actor_critic = import_function(self.network_class)

        # Extract dimensions
        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']  # 10
        self.dimg = self.input_dims['g']  # 3
        self.dimu = self.input_dims['u']  # 4
        # print("+++", input_shapes)  # {'o': (10,), 'u': (4,), 'g': (3,), 'info_is_success': (1,)}

        # https://www.tensorflow.org/performance/performance_models
        # StagingArea offers a simple way to run data transfer in parallel with other stages,
        # on CPU as well as GPU. Splitting the input pipeline into independent, parallel stages
        # scales well and makes good use of large multi-core machines.
        # Define the required staging variables. Assuming self.dimo=10, self.dimg=5, self.dimu=5,
        # stage_shapes = {'o': (None, 10), 'g': (None, 5), 'u': (None, 5)}; the variables used by
        # the target network are added as well: {'o_2': (None, 10), 'g_2': (None, 5)}.
        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, )  # the reward is a scalar
        self.stage_shapes = stage_shapes
        # After this block, self.stage_shapes =
        # OrderedDict([('g', (None, 3)), ('o', (None, 10)), ('u', (None, 4)), ('o_2', (None, 10)), ('g_2', (None, 3)), ('r', (None,))])
        # i.e. g, o, u, the o_2 and g_2 used by the target network, and the reward r.

        # Create network.
        # Create the tf placeholders according to stage_shapes: g, o, u, o_2, g_2, r
        # self.buffer_ph_tf = [<tf.Tensor 'ddpg/Placeholder:0' shape=(?, 3) dtype=float32>,
        #                      <tf.Tensor 'ddpg/Placeholder_1:0' shape=(?, 10) dtype=float32>,
        #                      <tf.Tensor 'ddpg/Placeholder_2:0' shape=(?, 4) dtype=float32>,
        #                      <tf.Tensor 'ddpg/Placeholder_3:0' shape=(?, 10) dtype=float32>,
        #                      <tf.Tensor 'ddpg/Placeholder_4:0' shape=(?, 3) dtype=float32>,
        #                      <tf.Tensor 'ddpg/Placeholder_5:0' shape=(?,) dtype=float32>]
        with tf.variable_scope(self.scope):
            # Create the StagingArea
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            # Create the TensorFlow placeholders
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            # Tie the placeholders to the StagingArea
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(reuse=reuse)

        # Replay-buffer setup.
        # With T = 50 the resulting buffer_shapes are
        # {'o': (51, 10), 'u': (50, 4), 'g': (50, 3), 'info_is_success': (50, 1), 'ag': (51, 3)}
        # u and g record the T transitions of an episode, while o and ag need T + 1 entries
        # because they also contain the state reached after the final action.
        buffer_shapes = {
            key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
            for key, val in input_shapes.items()
        }
        # buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        # buffer_shapes['ag'] = (self.T + 1, self.dimg)
        # print("+++", buffer_shapes)

        # buffer_size is counted in transitions.
        # self.buffer_size=1E6  self.rollout_batch_size=2  buffer_size=1E6
        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)

    def _random_action(self, n):
        """ Sample n actions uniformly from [-self.max_u, +self.max_u]. """
        return np.random.uniform(low=-self.max_u,
                                 high=self.max_u,
                                 size=(n, self.dimu))

    def _preprocess_og(self, o, ag, g):
        """ Preprocess obs, goal and achieved_goal.
            If self.relative_goals=True, then goal = goal - achieved_goal. """
        if self.relative_goals:
            g_shape = g.shape
            g = g.reshape(-1, self.dimg)    # reshape to (-1, dimg)
            ag = ag.reshape(-1, self.dimg)  # reshape to (-1, dimg)
            g = self.subtract_goals(g, ag)  # g = g - ag
            g = g.reshape(*g_shape)
        o = np.clip(o, -self.clip_obs, self.clip_obs)
        g = np.clip(g, -self.clip_obs, self.clip_obs)
        return o, g

    def get_actions(self, o, ag, g, noise_eps=0., random_eps=0.,
                    use_target_net=False, compute_Q=False):
        """ Select an action with the self.main network, then add Gaussian noise, clip,
            and apply epsilon-greedy exploration; return the processed action. """
        # If self.relative_goals=True the goal is preprocessed, otherwise it is only clipped
        o, g = self._preprocess_og(o, ag, g)
        # After self._create_network has been called, self.main and self.target both exist
        # and are ActorCritic objects
        policy = self.target if use_target_net else self.main
        # Select the action with self.main:
        # tensor holding the actions produced by the actor network
        vals = [policy.pi_tf]
        # print("+++")
        # print(vals.shape)
        # Feed the actor output back into the critic to obtain Q_pi_tf
        if compute_Q:
            vals += [policy.Q_pi_tf]
        # Build the feed_dict with obs, goal and action, i.e. the actor and critic inputs
        feed = {
            policy.o_tf: o.reshape(-1, self.dimo),
            policy.g_tf: g.reshape(-1, self.dimg),
            policy.u_tf: np.zeros((o.size // self.dimo, self.dimu),
                                  dtype=np.float32)
        }
        # Run the current policy network; ret[0] is the action, ret[1] the Q value
        ret = self.sess.run(vals, feed_dict=feed)

        # action postprocessing
        # Add Gaussian noise to the action; np.random.randn samples from a standard normal
        u = ret[0]
        noise = noise_eps * self.max_u * np.random.randn(*u.shape)  # gaussian noise
        u += noise
        u = np.clip(u, -self.max_u, self.max_u)  # clip after adding noise
        # epsilon-greedy exploration with epsilon = random_eps:
        # np.random.binomial draws 0 or 1, where 1 has probability random_eps.
        # If the binomial draw is 0, u += 0 leaves the action unchanged; if it is 1,
        # u = u + (random_action - u) = random_action.
        u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (
            self._random_action(u.shape[0]) - u)  # eps-greedy

        if u.shape[0] == 1:
            u = u[0]
        u = u.copy()
        ret[0] = u

        if len(ret) == 1:
            return ret[0]
        else:
            return ret

    def store_episode(self, episode_batch, update_stats=True, verbose=False):
        """
        episode_batch: array of batch_size x (T or T+1) x dim_key
                       'o' is of size T+1, others are of size T
        Calls store_episode of the replay_buffer to store one episode of samples.
        o_stats and g_stats keep the running mean and standard deviation of obs and goal
        and are updated periodically.
        """
        # episode_batch holds one episode produced by generate_rollout in rollout.py.
        # It is a dict with keys o, g, u, ag, info and value shapes
        # o (2, 51, 10), u (2, 50, 4), g (2, 50, 3), ag (2, 51, 3), info_is_success (2, 50, 1),
        # where the first dimension is the number of workers and the second follows from the episode length.
        self.buffer.store_episode(episode_batch, verbose=verbose)

        # Update the running mean and std of o_stats and g_stats
        if update_stats:
            # add transitions to normalizer
            episode_batch['o_2'] = episode_batch['o'][:, 1:, :]
            episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :]  # extract next obs and next achieved goal
            num_normalizing_transitions = transitions_in_episode_batch(
                episode_batch)  # convert episodes into a total number of transitions
            # Call the sampling function from sample_transitions.
            # episode_batch is a dict with keys/shapes o (2, 51, 10) u (2, 50, 4) g (2, 50, 3) ag (2, 51, 3)
            # info_is_success (2, 50, 1) o_2 (2, 50, 10) ag_2 (2, 50, 3)
            # num_normalizing_transitions=100 because there are 2 workers, each with one episode of 50 samples
            transitions = self.sample_transitions(episode_batch,
                                                  num_normalizing_transitions)

            # The sampled transitions are preprocessed and then used to update o_stats and g_stats,
            # which are Normalizer objects storing mean and std
            o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions[
                'g'], transitions['ag']
            transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
            # No need to preprocess the o_2 and g_2 since this is only used for stats
            self.o_stats.update(transitions['o'])
            self.g_stats.update(transitions['g'])
            self.o_stats.recompute_stats()
            self.g_stats.recompute_stats()

    def get_current_buffer_size(self):
        """ Return the current number of transitions in the replay buffer. """
        return self.buffer.get_current_size()

    def _sync_optimizers(self):
        """ Q_adam and pi_adam are the optimizers that update the critic and actor networks. """
        self.Q_adam.sync()
        self.pi_adam.sync()

    def _grads(self):
        """ Return losses and gradients.
            Q_loss_tf, main.Q_pi_tf, Q_grad_tf and pi_grad_tf are all defined in _create_network. """
        # Avoid feed_dict here for performance!
        critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([
            self.Q_loss_tf,
            self.main.Q_pi_tf,
            self.Q_grad_tf,
            self.pi_grad_tf,
        ])
        return critic_loss, actor_loss, Q_grad, pi_grad

    def _update(self, Q_grad, pi_grad):
        """ Update the actor and critic of the main network.
            The update ops are defined in _create_network. """
        self.Q_adam.update(Q_grad, self.Q_lr)
        self.pi_adam.update(pi_grad, self.pi_lr)

    def sample_batch(self):
        """ Calls sample of replay_buffer.py, whose sampling strategy comes from her.py.
            The returned samples form a batch that self.stage_batch uses to build the feed_dict,
            which in turn feeds action selection and the network updates.
            Samples one batch and then preprocesses o and g.
            The keys of the samples include o, o_2, ag, ag_2, g. """
        # After sample() the transitions form a dict with keys and val.shape:
        # o (256, 10) u (256, 4) g (256, 3) info_is_success (256, 1) ag (256, 3) o_2 (256, 10) ag_2 (256, 3) r (256,)
        # print("In DDPG: ", self.batch_size)
        transitions = self.buffer.sample(self.batch_size)
        o, o_2, g = transitions['o'], transitions['o_2'], transitions['g']
        ag, ag_2 = transitions['ag'], transitions['ag_2']
        transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
        transitions['o_2'], transitions['g_2'] = self._preprocess_og(
            o_2, ag_2, g)

        transitions_batch = [
            transitions[key] for key in self.stage_shapes.keys()
        ]
        return transitions_batch

    def stage_batch(self, batch=None):
        if batch is None:
            batch = self.sample_batch()
        assert len(self.buffer_ph_tf) == len(batch)
        self.sess.run(self.stage_op,
                      feed_dict=dict(zip(self.buffer_ph_tf, batch)))
        # tensorboard visualisation
        self.tfboard_sample_batch = batch
        self.tfboard_sample_tf = self.buffer_ph_tf

    def train(self, stage=True):
        """ Compute the gradients, then update.
            Before the parameter update, train calls self.stage_batch to build the feed_dict
            used for training. That function calls self.sample_batch, which calls
            self.buffer.sample, which uses config_her from config.py to configure the
            functions defined in her.py. The ops used in train are defined in self._create_network. """
        if stage:
            self.stage_batch()  # stages the feed_dict built with the her.py sampling scheme
        critic_loss, actor_loss, Q_grad, pi_grad = self._grads()
        self._update(Q_grad, pi_grad)
        return critic_loss, actor_loss

    def _init_target_net(self):
        self.sess.run(self.init_target_net_op)

    def update_target_net(self):
        """ Update the target network; update_target_net_op is defined in _create_network. """
        self.sess.run(self.update_target_net_op)

    def clear_buffer(self):
        self.buffer.clear_buffer()

    def _vars(self, scope):
        res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                scope=self.scope + '/' + scope)
        assert len(res) > 0
        return res

    def _global_vars(self, scope):
        res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                scope=self.scope + '/' + scope)
        return res

    def _create_network(self, reuse=False):
        """ Build the computation graph needed for the actor and critic losses. """
        logger.info("Creating a DDPG agent with action space %d x %s..." %
                    (self.dimu, self.max_u))
        self.sess = tf.get_default_session()
        if self.sess is None:
            self.sess = tf.InteractiveSession()

        # running averages
        # Normalizer objects used to normalise obs and goal, respectively
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)

        # mini-batch sampling.
# 用于存储一个批量样本的数据结构,为OrderedDict,执行后 batch_tf 如下: # OrderedDict([('g', <tf.Tensor 'ddpg/ddpg/StagingArea_get:0' shape=(?, 3) dtype=float32>), # ('o', <tf.Tensor 'ddpg/ddpg/StagingArea_get:1' shape=(?, 10) dtype=float32>), # ('u', <tf.Tensor 'ddpg/ddpg/StagingArea_get:2' shape=(?, 4) dtype=float32>), # ('o_2', <tf.Tensor 'ddpg/ddpg/StagingArea_get:3' shape=(?, 10) dtype=float32>), # ('g_2', <tf.Tensor 'ddpg/ddpg/StagingArea_get:4' shape=(?, 3) dtype=float32>), # ('r', <tf.Tensor 'ddpg/Reshape:0' shape=(?, 1) dtype=float32>)]) # 定义的 batch_tf 变量将作为神经网络的输入 batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # # 根据 ActorCritic.py 创建 main network # 在创建 ActorCritic 网络时,不需要显式的传参,利用 self.__dict__将DDPG类的对应参数直接赋值给 ActorCritic 的对应参数 # print(self.main.__dict__) # {'inputs_tf': OrderedDict([('g', <tf.Tensor 'ddpg/ddpg/StagingArea_get:0' shape=(?, 3) dtype=float32>), ('o', <tf.Tensor 'ddpg/ddpg/StagingArea_get:1' shape=(?, 10) dtype=float32>), ('u', <tf.Tensor 'ddpg/ddpg/StagingArea_get:2' shape=(?, 4) dtype=float32>), ('o_2', <tf.Tensor 'ddpg/ddpg/StagingArea_get:3' shape=(?, 10) dtype=float32>), ('g_2', <tf.Tensor 'ddpg/ddpg/StagingArea_get:4' shape=(?, 3) dtype=float32>), ('r', <tf.Tensor 'ddpg/Reshape:0' shape=(?, 1) dtype=float32>)]), # 'net_type': 'main', 'reuse': False, 'buffer_size': 1000000, 'hidden': 256, 'layers': 3, 'network_class': 'GHER.actor_critic:ActorCritic', # 'polyak': 0.95, 'batch_size': 256, 'Q_lr': 0.001, 'pi_lr': 0.001, 'norm_eps': 0.01, 'norm_clip': 5, 'max_u': 1.0, # 'action_l2': 1.0, 'clip_obs': 200.0, 'scope': 'ddpg', 'relative_goals': False, 'input_dims': {'o': 10, 'u': 4, 'g': 3, 'info_is_success': 1}, # 'T': 50, 'clip_pos_returns': True, 'clip_return': 49.996, 'rollout_batch_size': 2, 'subtract_goals': <function simple_goal_subtract at 0x7fcf72caa510>, 'sample_transitions': <function make_sample_her_transitions.<locals>._sample_her_transitions at 0x7fcf6e2ce048>, # 'gamma': 0.98, 'info': {'env_name': 'FetchReach-v1'}, 'use_mpi': True, 'create_actor_critic': <class 'GHER.actor_critic.ActorCritic'>, # 'dimo': 10, 'dimg': 3, 'dimu': 4, 'stage_shapes': OrderedDict([('g', (None, 3)), ('o', (None, 10)), ('u', (None, 4)), ('o_2', (None, 10)), ('g_2', (None, 3)), ('r', (None,))]), 'staging_tf': <tensorflow.python.ops.data_flow_ops.StagingArea object at 0x7fcf6e2dddd8>, # 'buffer_ph_tf': [<tf.Tensor 'ddpg/Placeholder:0' shape=(?, 3) dtype=float32>, <tf.Tensor 'ddpg/Placeholder_1:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'ddpg/Placeholder_2:0' shape=(?, 4) dtype=float32>, <tf.Tensor 'ddpg/Placeholder_3:0' shape=(?, 10) dtype=float32>, <tf.Tensor 'ddpg/Placeholder_4:0' shape=(?, 3) dtype=float32>, <tf.Tensor 'ddpg/Placeholder_5:0' shape=(?,) dtype=float32>], # 'stage_op': <tf.Operation 'ddpg/ddpg/StagingArea_put' type=Stage>, 'sess': <tensorflow.python.client.session.InteractiveSession object at 0x7fcf6e2dde10>, 'o_stats': <GHER.normalizer.Normalizer object at 0x7fcf6e2ee940>, 'g_stats': <GHER.normalizer.Normalizer object at 0x7fcf6e2ee898>, # 'o_tf': <tf.Tensor 'ddpg/ddpg/StagingArea_get:1' shape=(?, 10) dtype=float32>, 'g_tf': <tf.Tensor 'ddpg/ddpg/StagingArea_get:0' shape=(?, 3) dtype=float32>, 'u_tf': <tf.Tensor 'ddpg/ddpg/StagingArea_get:2' shape=(?, 4) dtype=float32>, 'pi_tf': <tf.Tensor 'ddpg/main/pi/mul:0' shape=(?, 4) dtype=float32>, 'Q_pi_tf': <tf.Tensor 'ddpg/main/Q/_3/BiasAdd:0' shape=(?, 1) dtype=float32>, '_input_Q': <tf.Tensor 
        # 'ddpg/main/Q/concat_1:0' shape=(?, 17) dtype=float32>, 'Q_tf': <tf.Tensor 'ddpg/main/Q/_3_1/BiasAdd:0' shape=(?, 1) dtype=float32>}
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf,
                                                 net_type='main',
                                                 **self.__dict__)
            vs.reuse_variables()
        # o_2 and g_2 are used to build the target network
        with tf.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            # the target network computes the target-Q value, so o and g must be next-state values
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(target_batch_tf,
                                                   net_type='target',
                                                   **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # loss functions
        # The critic target-Q value needs both the target actor and the target critic;
        # target_Q_pi_tf is evaluated on the next state, i.e. o_2 and g_2
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else np.inf)
        target_tf = tf.clip_by_value(
            batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
        # The critic loss is the squared difference between target_tf and Q_tf;
        # note that no gradient flows through target_tf
        self.Q_loss_tf = tf.reduce_mean(
            tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))
        # The actor loss is the negative Q value obtained by feeding the actor output of the
        # main network back into its critic
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        # L2 regularisation on the actor
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))

        # gradients
        Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))  # pair each gradient with its variable
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf,
                                       var_list=self._vars('main/Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf,
                                        var_list=self._vars('main/pi'))

        # optimizers
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'),
                               scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = self._vars('main/Q') + self._vars('main/pi')  # actor and critic parameters together
        self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars(
            'g_stats')
        self.init_target_net_op = list(
            # at initialisation the main parameters are copied directly into the target network
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            # during updates, main and target parameters are mixed with weight polyak
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # # tensorboard visualisation
        # tf.summary.scalar("Q_target-Q-mean", tf.reduce_mean(target_tf))
        # tf.summary.histogram("Q_target-Q", target_tf)
        # tf.summary.scalar("Q_Td-error-mean", tf.reduce_mean(target_tf - self.main.Q_tf))
        # tf.summary.histogram("Q_Td-error", target_tf - self.main.Q_tf)
        # tf.summary.scalar("Q_reward-mean", tf.reduce_mean(batch_tf['r']))
        # tf.summary.histogram("Q_reward", batch_tf['r'])
        # tf.summary.scalar("Q_loss_tf", self.Q_loss_tf)
        # tf.summary.scalar("pi_loss_tf", self.pi_loss_tf)
        # self.merged = tf.summary.merge_all()

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()

    def logs(self, prefix=''):
        logs = []
        logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))]
        logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))]
        logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))]
        logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))]

        if prefix != '' and not prefix.endswith('/'):
            return [(prefix + '/' + key, val) for key, val in logs]
        else:
            return logs

    def tfboard_func(self, summary_writer, step):
        """ tensorboard visualisation """
        self.sess.run(self.stage_op,
                      feed_dict=dict(
                          zip(self.tfboard_sample_tf, self.tfboard_sample_batch)))
        summary = self.sess.run(self.merged)
        summary_writer.add_summary(summary, global_step=step)
        print("S" + str(step), end=",")

    def __getstate__(self):
        """ Our policies can be loaded from pkl, but after unpickling you cannot continue training. """
        excluded_subnames = [
            '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main',
            'target', 'lock', 'env', 'sample_transitions', 'stage_shapes',
            'create_actor_critic'
        ]

        state = {
            k: v
            for k, v in self.__dict__.items()
            if all([not subname in k for subname in excluded_subnames])
        }
        state['buffer_size'] = self.buffer_size
        state['tf'] = self.sess.run(
            [x for x in self._global_vars('') if 'buffer' not in x.name])
        return state

    def __setstate__(self, state):
        if 'sample_transitions' not in state:
            # We don't need this for playing the policy.
            state['sample_transitions'] = None
        self.__init__(**state)
        # set up stats (they are overwritten in __init__)
        for k, v in state.items():
            if k[-6:] == '_stats':
                self.__dict__[k] = v
        # load TF variables
        vars = [x for x in self._global_vars('') if 'buffer' not in x.name]
        assert (len(vars) == len(state["tf"]))
        node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])]
        self.sess.run(node)

    # -----------------------------------------
    def updata_loss_all(self, verbose=False):
        assert self.buffer.current_size > 0
        idxes = np.arange(self.buffer.current_size)
        print("--------------------------------------")
        print("Update all losses: start...")
        self.buffer.update_rnnLoss(idxes, verbose=verbose)
        print("Update all losses: end.")
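# get_actions() above post-processes the deterministic actor output with Gaussian exploration
# noise, clipping to [-max_u, max_u], and an epsilon-greedy switch to a uniform random action.
# A small NumPy sketch of just that post-processing step (the argument names mirror the method
# above; the default values are assumptions):
import numpy as np

def postprocess_actions(u, max_u=1.0, noise_eps=0.2, random_eps=0.3):
    """u: array of shape (batch, dimu) with the actions proposed by the actor."""
    u = u + noise_eps * max_u * np.random.randn(*u.shape)        # Gaussian noise
    u = np.clip(u, -max_u, max_u)                                # keep actions in range
    random_u = np.random.uniform(-max_u, max_u, size=u.shape)    # candidate random actions
    take_random = np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1)
    return u + take_random * (random_u - u)                      # eps-greedy mix

print(postprocess_actions(np.zeros((4, 2))))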
class DDPG(object): @store_args def __init__(self, use_aux_tasks, input_dims, image_input_shapes, buffer_size, hidden, layers, dim_latent_repr, cnn_nonlinear, use_bottleneck_layer, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, scope, T, rollout_batch_size, clip_pos_returns, clip_return, log_loss, sample_transitions, gamma, rank, serialized=False, reuse=False, clip_grad_range=None, aux_filter_interval=None, scale_grad_by_procs=False, aux_update_interval=5, aux_base_lr=5, **kwargs): """ See the documentation in main.py """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function( 'cnn_actor_critic:CNNActorCritic') input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] if self.use_aux_tasks: self.dim_bw_frame = self.input_dims['info_bw_frame'] self.dim_op_flow = self.input_dims['info_op_flow'] self.dim_transformed_frame = self.input_dims[ 'info_transformed_frame'] # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() include_info = [ 'info_state_obs', 'info_transformed_frame', 'info_transformation', 'info_op_flow', 'info_bw_frame' ] for key in sorted(self.input_dims.keys()): if key.startswith('info_') and not key in include_info: continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): if self.use_aux_tasks: # Initialize OL-AUX self.num_auxiliary_tasks = 5 self.aux_weights_lr = self.aux_base_lr * self.aux_update_interval self.aux_weight_vector_Q_tf = tf.Variable( initial_value=1 * tf.ones(self.num_auxiliary_tasks), dtype=tf.float32, name='aux_weights') self.aux_weight_grads_buffer = [] self.log_aux_losses_Q = self.log_aux_tasks_losses_pi = None # Logging buffer for aux losses if self.aux_filter_interval is not None: self.all_grad_history = deque( maxlen=self.aux_filter_interval) self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=self.reuse) # Configure the replay buffer. 
buffer_shapes = { key: (self.T if key != 'o' and not key.startswith('info_') else self.T + 1, *input_shapes[key]) for key, val in input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T + 1, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn( *u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * ( self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions[ 'g'], transitions['ag'] transitions['o'], transitions['g'] = o.copy(), g.copy() # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() if self.use_aux_tasks: self.bw_frame_stats.update(transitions['info_bw_frame']) self.op_flow_stats.update(transitions['info_op_flow']) self.transformed_frame_stats.update( transitions['info_transformed_frame']) self.bw_frame_stats.recompute_stats() self.op_flow_stats.recompute_stats() self.transformed_frame_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! 
assert not self.serialized run_vars = [ self.Q_loss_tf, self.pi_loss_tf, self.Q_grad_tf, self.pi_grad_tf ] if self.use_aux_tasks: run_vars.append(self.main_task_Q_cnn_grad_flatten_tf) run_vars.extend( self.main.loss_auxiliary_tasks_Q_tf) # Q Aux losses run_vars.extend(self.aux_Q_cnn_grads_flatten_tf) # Q Aux grads run_vars.extend( self.main.loss_auxiliary_tasks_pi_tf) # pi Aux losses assert len( self.aux_Q_cnn_grads_flatten_tf) == self.num_auxiliary_tasks rets = self.sess.run(run_vars) aux_losses_pi = copy.copy(rets[-self.num_auxiliary_tasks:]) aux_grads_Q = copy.copy( rets[-2 * self.num_auxiliary_tasks:-self.num_auxiliary_tasks]) aux_losses_Q = copy.copy(rets[-3 * self.num_auxiliary_tasks:-2 * self.num_auxiliary_tasks]) rets = rets[:-3 * self.num_auxiliary_tasks] + [aux_losses_pi] + [ aux_losses_Q ] + [aux_grads_Q] else: rets = self.sess.run(run_vars) return rets # noinspection PyAttributeOutsideInit def train(self, stage=True): # import cProfile, pstats, io # pr = cProfile.Profile() # pr.enable() if stage: self.stage_batch() if self.use_aux_tasks: critic_loss, actor_loss, Q_grad, pi_grad, main_task_grad, \ aux_losses_pi, aux_losses_Q, aux_task_grads_Q = self._grads() self.log_aux_losses_Q = [loss for loss in aux_losses_Q] self.log_aux_losses_pi = [loss for loss in aux_losses_pi] self._update(Q_grad, pi_grad) self._update_aux_weights(main_task_grad, aux_task_grads_Q) else: critic_loss, actor_loss, Q_grad, pi_grad = self._grads() self._update(Q_grad, pi_grad) # pr.disable() # s = io.StringIO() # ps = pstats.Stats(pr, stream=s).sort_stats('time') # ps.print_stats(20) # print(s.getvalue()) return critic_loss, actor_loss def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): transitions = self.buffer.sample(self.batch_size) o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] transitions['o'], transitions['g'] = o.copy(), g.copy() transitions['o_2'], transitions['g_2'] = o_2.copy(), g.copy() transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def _update_aux_weights(self, main_task_grad, aux_task_grads): """ Called during each iteration. 
But only update the auxiliary task weights according to the update interval :param main_task_grad: Gradient of the main task (of cnn) :param aux_task_grads: A list of the gradients from each of the auxiliary tasks (of cnn) """ main_task_grad, aux_task_grads = self.aux_weight_updater.get_syncd_grad( main_task_grad, aux_task_grads) aux_weight_grad = np.zeros([self.num_auxiliary_tasks]) aux_task_grads = np.array(aux_task_grads) main_task_grad = np.array(main_task_grad) if self.aux_filter_interval is not None: self.all_grad_history.append( (main_task_grad.copy(), aux_task_grads.copy())) main_task_grad = np.mean(np.array( [grad[0] for grad in self.all_grad_history]), axis=0) aux_task_grads = np.mean(np.array( [grad[1] for grad in self.all_grad_history]), axis=0) for i, aux_task_grad in enumerate(aux_task_grads): aux_weight_grad[i] = self.Q_lr * np.dot(aux_task_grad, main_task_grad) self.aux_weight_grads_buffer.append(aux_weight_grad) if len(self.aux_weight_grads_buffer) == self.aux_update_interval: aggregate_aux_weight_grad = np.mean(np.array( self.aux_weight_grads_buffer), axis=0) self.aux_weight_updater.update(self.aux_weights_lr * aggregate_aux_weight_grad) self.aux_weight_grads_buffer = [] def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) if self.use_aux_tasks: with tf.variable_scope('bw_frame_stats') as vs: if reuse: vs.reuse_variables() self.bw_frame_stats = Normalizer(self.dim_bw_frame, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('op_flow_stats') as vs: if reuse: vs.reuse_variables() self.op_flow_stats = Normalizer(self.dim_op_flow, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('transformed_frame_stats') as vs: if reuse: vs.reuse_variables() self.transformed_frame_stats = Normalizer( self.dim_transformed_frame, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. 
batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) if self.use_aux_tasks: self.main.build_auxiliary_tasks() vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) if self.use_aux_tasks and self.log_loss: self.pi_loss_tf = tf.clip_by_value( self.pi_loss_tf, np.finfo(float).eps, np.Inf) # So that log can be applied self.Q_loss_tf = tf.log(self.Q_loss_tf) self.pi_loss_tf = tf.log(self.pi_loss_tf) self.action_l2_loss_tf = self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) self.pi_loss_tf += self.action_l2_loss_tf if self.use_aux_tasks: if self.log_loss: for i, loss_tf in enumerate( self.main.loss_auxiliary_tasks_Q_tf): self.Q_loss_tf += tf.stop_gradient( self.aux_weight_vector_Q_tf[i]) * tf.log( loss_tf + self.log_min_loss) # Use the same weight of the auxiliary tasks in Q function also for pi. # Also possible to use separate aux weight vectors for Q and pi for i, loss_tf in enumerate( self.main.loss_auxiliary_tasks_pi_tf): self.pi_loss_tf += tf.stop_gradient( self.aux_weight_vector_Q_tf[i]) * tf.log( loss_tf + self.log_min_loss) else: for i, loss_tf in enumerate( self.main.loss_auxiliary_tasks_Q_tf): self.Q_loss_tf += tf.stop_gradient( self.aux_weight_vector_Q_tf[i]) * loss_tf for i, loss_tf in enumerate( self.main.loss_auxiliary_tasks_pi_tf): self.pi_loss_tf += tf.stop_gradient( self.aux_weight_vector_Q_tf[i]) * loss_tf Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'), name='Q_gradient') pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'), name='pi_gradient') self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q'), clip_grad_range=self.clip_grad_range) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi'), clip_grad_range=self.clip_grad_range) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=self.scale_grad_by_procs) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=self.scale_grad_by_procs) if self.use_aux_tasks: self.aux_weight_updater = MpiAuxUpdate(self._vars('aux_weights'), scale_grad_by_procs=True) if self.use_aux_tasks: # Get gradient from the auxiliary tasks w.r.t. 
the shared cnn if self.log_loss: aux_Q_cnn_grads_tf = [ tf.gradients( tf.log(loss_tf + self.log_min_loss, name=loss_name), self._vars('main/Q/cnn')) for (loss_tf, loss_name) in zip(self.main.loss_auxiliary_tasks_Q_tf, self.main.name_auxiliary_tasks) ] else: aux_Q_cnn_grads_tf = [ tf.gradients(loss_tf, self._vars('main/Q/cnn')) for loss_tf in self.main.loss_auxiliary_tasks_Q_tf ] self.aux_Q_cnn_grads_flatten_tf = [ flatten_grads(grads=aux_grad_tf, var_list=self._vars('main/Q/cnn'), clip_grad_range=self.clip_grad_range) for aux_grad_tf in aux_Q_cnn_grads_tf ] # Get gradient of cnn from the main task self.main_task_Q_cnn_grad_tf = tf.gradients( self.Q_loss_tf, self._vars('main/Q/cnn'), name='aux_update_main_gradient_Q') self.main_task_Q_cnn_grad_flatten_tf = flatten_grads( grads=self.main_task_Q_cnn_grad_tf, var_list=self._vars('main/Q/cnn'), clip_grad_range=self.clip_grad_range) # polyak averaging, excluding the auxiliary variables self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.main_vars = [ var for var in self.main_vars if var not in (self._vars('main/Q/aux') + self._vars('main/pi/aux')) ] self.target_vars = self._vars('target/Q') + self._vars('target/pi') assert len(self.main_vars) == len(self.target_vars) self.stats_vars = self._global_vars('o_stats') + self._global_vars('bw_frame_stats') + \ self._global_vars('op_flow_stats') + self._global_vars('g_stats') + \ self._global_vars('transformed_frame_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() if self.use_aux_tasks: self.aux_weight_updater.sync() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] transitions = self.buffer.sample(self.batch_size) action_mean = np.mean(np.abs(transitions['u'])) action_std = np.std(transitions['u']) logs += [('buffer_a/abs_mean', action_mean)] logs += [('buffer_a/std', action_std)] if self.use_aux_tasks: # Log auxiliary task losses (After the log operator) for (aux_task_name, aux_task_weight) in zip(self.main.name_auxiliary_tasks, self.log_aux_losses_Q): logs += [('aux_losses_Q/' + aux_task_name, aux_task_weight)] # Log auxiliary task weights curr_aux_weights = self.sess.run(self.aux_weight_vector_Q_tf) for (aux_task_name, aux_task_weight) in zip(self.main.name_auxiliary_tasks, curr_aux_weights): logs += [('aux_weights_Q/' + aux_task_name, aux_task_weight)] if prefix is not '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. 
""" excluded_subnames = [ '_tf', '_op', '_vars', '_adam', '_updater', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic' ] state = { k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames]) } state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None state['serialized'] = True self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert (len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node)
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, replay_k, reward_fun=None, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ if self.clip_return is None: self.clip_return = np.inf # Create the actor critic networks. network_class is defined in actor_critic.py # This class is assigned to network_class when DDPG objest is created self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] # Prepare staging area for feeding data to the model. 
stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) # Next state (o_2) and goal at next state (g_2) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None,) self.stage_shapes = stage_shapes # Adding variable for correcting bias - Ameet self.stage_shapes_new = OrderedDict() self.stage_shapes_new['bias'] = (None,) ############################################## # Create network # Staging area is a datatype in tf to input data into GPUs with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) # Adding bias term from section 3.4 - Ameet self.staging_tf_new = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes_new.keys()], shapes=list(self.stage_shapes_new.values())) self.buffer_ph_tf_new = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes_new.values()] self.stage_op_new = self.staging_tf_new.put(self.buffer_ph_tf_new) ############################################ self._create_network(reuse=reuse) # Configure the replay buffer buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key]) for key, val in input_shapes.items()} buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T+1, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size # conf represents the parameters required for initializing the priority_queue # Remember: The bias gets annealed only conf.total_steps number of times conf = {'size': self.buffer_size, 'learn_start': self.batch_size, 'batch_size': self.batch_size, # Using some heuristic to set the partition_num as it matters only when the buffer is not full (unlikely) 'partition_size': (self.replay_k)*100} self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions, conf, self.replay_k) # global_steps represents the number of batches used for updates self.global_step = 0 self.debug = {} def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) # Preprocessing by clipping the goal and state variables # Not sure about the relative_goal part def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g # target is the target policy network and main is the one which is updated # target is updated by moving the parameters towards that of the main # pi_tf is the output of the policy network, Q_pi_tf is the output of the Q network used for training pi_tf # i.e., Q_pi_tf uses the pi_tf's action to evaluate the value # While just Q_tf uses the action which was actually taken def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // 
self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ ###### Remove the l value - Supposed to be a list of length 2 # First entry consists of transitions with actual goals and second is alternate goals self.buffer.store_episode(episode_batch) # ###### Debug # # This functions was used to check the hypothesis that if TD error is high # # for a state with some goal, it is high for that states with all other goals # self.debug_td_error_alternate_actual(debug_transitions) # Updating stats ## Change this-------------- update_stats = False ###-------------------------- if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch(episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() # This function is purely for Debugging purposes def debug_td_error_alternate_actual(self, debug_transitions): actual_transitions, alternate_transitions = debug_transitions[0], debug_transitions[1] actual_transitions, alternate_transitions = self.td_error_convert_to_format(actual_transitions),\ self.td_error_convert_to_format(alternate_transitions) # Calculated priorities priorities = [] priorities.append(self.get_priorities(actual_transitions)) priorities.append(self.get_priorities(alternate_transitions)) f = open('act_alt_goals.txt', 'a') # Length of priorities[0] is 100 and priorities[1] is 400 for i in range(len(priorities[0])): f.write(str(priorities[0][i])+" : ") for k in range(4): f.write(str(priorities[1][i*self.replay_k+k])+" : ") f.write('\n') f.write("Done Storing One Rollout\n\n\n") # f.write('The number of transitions are: '+str(len(priorities[0]))+" :: "+str(len(priorities[1]))+"\n") # This function is purely for Debugging purposes def td_error_convert_to_format(self, sample_transitions): # sample_transitions is now a list of transitions, convert it to the usual {key: batch X dim_key} keys = sample_transitions[0].keys() # print("Keys in _sample_her_transitions are: "+str(keys)) transitions = {} for key in keys: # Initialize for all the keys transitions[key] = [] # Add transitions one by one to the list for single_transition in range(len(sample_transitions)): transitions[key].append(sample_transitions[single_transition][key]) transitions[key] = np.array(transitions[key]) # Reconstruct info dictionary for reward computation. 
info = {} for key, value in transitions.items(): if key.startswith('info_'): info[key.replace('info_', '')] = value # print("The keys in transitions are: "+str(transitions.keys())) reward_params = {k: transitions[k] for k in ['ag_2', 'g']} reward_params['info'] = info transitions['r'] = self.reward_fun(**reward_params) # transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) # for k in transitions.keys()} return transitions def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad # Adam update for Q and pi networks def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) # Sample a batch for mini batch gradient descent, already defined in replay_buffer.py def sample_batch(self): # Increment the global step self.global_step += 1 transitions, w, rank_e_id = self.buffer.sample(self.batch_size, self.global_step, self.uniform_priority) priorities = self.get_priorities(transitions) # ##### Debug function # self.debug_td_error(transitions, priorities) # ##### o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) # # Remove # print("Stage Shape keys in sample_batch are: "+str(self.stage_shapes.keys())) transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] # Updates the priorities of the sampled transitions in the priority queue self.buffer.update_priority(rank_e_id, priorities) return transitions_batch, [w] # This function is purely for debugging purposes def debug_td_error(self, transitions, priorities): f = open('td_error_debug.txt', 'a') self.debug['actual_goals'] = 0 self.debug['alternate_goals'] = 0 trans = transitions['is_actual_goal'] for t in range(trans.shape[0]): if trans[t]: self.debug['actual_goals'] += 1 # f.write('Actual goal transition: '+str(priorities[t])+'\n') else: self.debug['alternate_goals'] += 1 # f.write('Alternate goal transition: '+str(priorities[t])+'\n') f.write('Ratio is: '+str(float(self.debug['alternate_goals'])/self.debug['actual_goals'])+'\n') del transitions['is_actual_goal'] ###### Debug End def get_priorities(self, transitions): pi_target = self.target.pi_tf Q_pi_target = self.target.Q_pi_tf Q_main = self.main.Q_tf o = transitions['o'] o_2 = transitions['o_2'] u = transitions['u'] g = transitions['g'] r = transitions['r'] # Check this with Srikanth ag = transitions['ag'] priorities = np.zeros(o.shape[0]) # file_obj = open("priorities_print","a") for i in range(o.shape[0]): o_2_i = np.clip(o_2[i], -self.clip_obs, self.clip_obs) o_i, g_i = self._preprocess_og(o[i], ag[i], g[i]) u_i = u[i] # Not sure about the o_2_i.size // self.dimo. 
I guess we need not pass one at a time feed_target = { self.target.o_tf: o_2_i.reshape(-1, self.dimo), self.target.g_tf: g_i.reshape(-1, self.dimg), self.target.u_tf: np.zeros((o_2_i.size // self.dimo, self.dimu), dtype=np.float32) } # u_tf for main network is just the action taken at that state feed_main = { self.main.o_tf: o_i.reshape(-1, self.dimo), self.main.g_tf: g_i.reshape(-1, self.dimg), self.main.u_tf: u_i.reshape(-1, self.dimu) } TD = r[i] + self.gamma*self.sess.run(Q_pi_target, feed_dict=feed_target) - self.sess.run(Q_main, feed_dict=feed_main) priorities[i] = abs(TD) text = str(TD) # file_obj.write(text) # file_obj.close() return priorities def stage_batch(self, batch=None): if batch is None: batch, bias = self.sample_batch() # print("Batch type is: "+str(type(batch))) # print("Batch Shape is: "+str(len(batch))) # print(str(type(batch[0]))) assert len(self.buffer_ph_tf) == len(batch), "Expected: "+str(len(self.buffer_ph_tf))+" Got: "+str(len(batch)) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) ##### Adding for bias - Ameet assert len(self.buffer_ph_tf_new) == len(bias), "Expected: "+str(len(self.buffer_ph_tf_new))+" Got: "+str(len(bias)) self.sess.run(self.stage_op_new, feed_dict=dict(zip(self.buffer_ph_tf_new, bias))) ##### # print("Completed stage batch") def train(self, stage=True): if stage: self.stage_batch() critic_loss, actor_loss, Q_grad, pi_grad = self._grads() # print("In ddpg priority:: The shapes of Q_grad and pi_grad are: "+str(Q_grad.shape)+"::"+str(pi_grad.shape)) # print("Their types are::"+str(type(Q_grad))) self._update(Q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. 
batch = self.staging_tf.get() batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) ########### Getting the bias terms - Ameet bias = self.staging_tf_new.get() bias_tf = OrderedDict([(key, bias[i]) for i, key in enumerate(self.stage_shapes_new.keys())]) bias_tf['bias'] = tf.reshape(bias_tf['bias'], [-1, 1]) ####################################### # Create main and target networks, each will have a pi_tf, Q_tf and Q_pi_tf with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) ############## Added for bias - Ameet error = (tf.stop_gradient(target_tf) - self.main.Q_tf) * bias_tf['bias'] self.Q_loss_tf = tf.reduce_mean(tf.square(error)) # self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf * bias_tf['bias']) # Note that the following statement does not include bias because of the remark in the IEEE paper self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) ############## # Regularization - L2 - Check - Penalty for taking the best action self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) ################### Shape Info ####Shape of Q_grads_tf is: 8 ####Shape of Q_grads_tf[0] is: (17, 256) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging # 'main/Q' is a way of communicating the scope of the variables # _vars has a way to understand this self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') # Update the networks # target net is updated by using polyak averaging # target net is initialized by just copying the main net self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix != '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = ['_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic'] state = {k: v for k, v in self.__dict__.items() if all([subname not in k for subname in excluded_subnames])} state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert len(vars) == len(state["tf"]) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node)
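The prioritized variant above uses the absolute TD error as a replay priority and stages an extra 'bias' tensor that multiplies the Bellman error in the critic loss. The following is a minimal NumPy sketch of that idea, independent of the class above; the function names, `alpha`, `beta`, and the toy data are illustrative assumptions, not taken from the snippet.

import numpy as np

def td_error_priorities(r, q_next, q_cur, gamma=0.98):
    # absolute one-step TD error |r + gamma * Q'(s', pi(s')) - Q(s, a)|
    return np.abs(r + gamma * q_next - q_cur)

def sampling_probs_and_weights(priorities, alpha=0.6, beta=0.4, eps=1e-6):
    # proportional prioritization: p_i ~ (|TD_i| + eps)^alpha,
    # importance-sampling weights w_i = (N * p_i)^(-beta), normalized to <= 1
    scaled = (priorities + eps) ** alpha
    probs = scaled / scaled.sum()
    weights = (len(priorities) * probs) ** (-beta)
    return probs, weights / weights.max()

# toy usage
rng = np.random.default_rng(0)
r, q_next, q_cur = rng.normal(size=(3, 8))
probs, bias = sampling_probs_and_weights(td_error_priorities(r, q_next, q_cur))
idx = rng.choice(8, size=4, p=probs)  # indices of a prioritized mini-batch
print(idx, bias[idx])                 # bias[idx] plays the role of the staged 'bias' term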
class DDPG(object): def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, time_horizon, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, reuse=False): """ Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). :param input_dims: ({str: int}) dimensions for the observation (o), the goal (g), and the actions (u) :param buffer_size: (int) number of transitions that are stored in the replay buffer :param hidden: (int) number of units in the hidden layers :param layers: (int) number of hidden layers :param network_class: (str) the network class that should be used (e.g. 'baselines.her.ActorCritic') :param polyak: (float) coefficient for Polyak-averaging of the target network :param batch_size: (int) batch size for training :param q_lr: (float) learning rate for the Q (critic) network :param pi_lr: (float) learning rate for the pi (actor) network :param norm_eps: (float) a small value used in the normalizer to avoid numerical instabilities :param norm_clip: (float) normalized inputs are clipped to be in [-norm_clip, norm_clip] :param max_u: (float) maximum action magnitude, i.e. actions are in [-max_u, max_u] :param action_l2: (float) coefficient for L2 penalty on the actions :param clip_obs: (float) clip observations before normalization to be in [-clip_obs, clip_obs] :param scope: (str) the scope used for the TensorFlow graph :param time_horizon: (int) the time horizon for rollouts :param rollout_batch_size: (int) number of parallel rollouts per DDPG agent :param subtract_goals: (function (numpy Number, numpy Number): numpy Number) function that subtracts goals from each other :param relative_goals: (boolean) whether or not relative goals should be fed into the network :param clip_pos_returns: (boolean) whether or not positive returns should be clipped :param clip_return: (float) clip returns to be in [-clip_return, clip_return] :param sample_transitions: (function (dict, int): dict) function that samples from the replay buffer :param gamma: (float) gamma used for Q learning updates :param reuse: (boolean) whether or not the networks should be reused """ # Updated in experiments/config.py self.input_dims = input_dims self.buffer_size = buffer_size self.hidden = hidden self.layers = layers self.network_class = network_class self.polyak = polyak self.batch_size = batch_size self.q_lr = q_lr self.pi_lr = pi_lr self.norm_eps = norm_eps self.norm_clip = norm_clip self.max_u = max_u self.action_l2 = action_l2 self.clip_obs = clip_obs self.scope = scope self.time_horizon = time_horizon self.rollout_batch_size = rollout_batch_size self.subtract_goals = subtract_goals self.relative_goals = relative_goals self.clip_pos_returns = clip_pos_returns self.clip_return = clip_return self.sample_transitions = sample_transitions self.gamma = gamma self.reuse = reuse if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dim_obs = self.input_dims['o'] self.dim_goal = self.input_dims['g'] self.dim_action = self.input_dims['u'] # Prepare staging area for feeding data to the model. 
stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. buffer_shapes = { key: (self.time_horizon if key != 'o' else self.time_horizon + 1, *input_shapes[key]) for key, val in input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], self.dim_goal) buffer_shapes['ag'] = (self.time_horizon + 1, self.dim_goal) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.time_horizon, self.sample_transitions) def _random_action(self, num): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(num, self.dim_action)) def _preprocess_obs_goal(self, obs, achieved_goal, goal): if self.relative_goals: g_shape = goal.shape goal = goal.reshape(-1, self.dim_goal) achieved_goal = achieved_goal.reshape(-1, self.dim_goal) goal = self.subtract_goals(goal, achieved_goal) goal = goal.reshape(*g_shape) obs = np.clip(obs, -self.clip_obs, self.clip_obs) goal = np.clip(goal, -self.clip_obs, self.clip_obs) return obs, goal def get_actions(self, obs, achieved_goal, goal, noise_eps=0., random_eps=0., use_target_net=False, compute_q=False): """ return the action from an observation and goal :param obs: (numpy Number) the observation :param achieved_goal: (numpy Number) the achieved goal :param goal: (numpy Number) the goal :param noise_eps: (float) the noise epsilon :param random_eps: (float) the random epsilon :param use_target_net: (bool) whether or not to use the target network :param compute_q: (bool) whether or not to compute Q value :return: (numpy float or float) the actions """ obs, goal = self._preprocess_obs_goal(obs, achieved_goal, goal) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_q: vals += [policy.q_pi_tf] # feed feed = { policy.o_tf: obs.reshape(-1, self.dim_obs), policy.g_tf: goal.reshape(-1, self.dim_goal), policy.u_tf: np.zeros((obs.size // self.dim_obs, self.dim_action), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing action = ret[0] noise = noise_eps * self.max_u * np.random.randn( *action.shape) # gaussian noise action += noise action = np.clip(action, -self.max_u, self.max_u) # eps-greedy n_ac = action.shape[0] action += np.random.binomial(1, random_eps, n_ac).reshape( -1, 1) * (self._random_action(n_ac) - action) if action.shape[0] == 1: action = action[0] action = action.copy() ret[0] = action if len(ret) == 1: return ret[0] else: return ret def store_episode(self, episode_batch, update_stats=True): """ Story the episode transitions :param episode_batch: (numpy Number) array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T :param update_stats: (bool) whether to update stats or not """ self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] 
episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) obs, _, goal, achieved_goal = transitions['o'], transitions[ 'o_2'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_obs_goal( obs, achieved_goal, goal) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): """ returns the current buffer size :return: (int) buffer size """ return self.buffer.get_current_size() def _sync_optimizers(self): self.q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! critic_loss, actor_loss, q_grad, pi_grad = self.sess.run([ self.q_loss_tf, self.main.q_pi_tf, self.q_grad_tf, self.pi_grad_tf ]) return critic_loss, actor_loss, q_grad, pi_grad def _update(self, q_grad, pi_grad): self.q_adam.update(q_grad, self.q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): """ sample a batch :return: (dict) the batch """ transitions = self.buffer.sample(self.batch_size) obs, obs_2, goal = transitions['o'], transitions['o_2'], transitions[ 'g'] achieved_goal, achieved_goal_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_obs_goal( obs, achieved_goal, goal) transitions['o_2'], transitions['g_2'] = self._preprocess_obs_goal( obs_2, achieved_goal_2, goal) transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] return transitions_batch def stage_batch(self, batch=None): """ apply a batch to staging :param batch: (dict) the batch to add to staging, if None: self.sample_batch() """ if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): """ train DDPG :param stage: (bool) enable staging :return: (float, float) critic loss, actor loss """ if stage: self.stage_batch() critic_loss, actor_loss, q_grad, pi_grad = self._grads() self._update(q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): """ update the target network """ self.sess.run(self.update_target_net_op) def clear_buffer(self): """ clears the replay buffer """ self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dim_action, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as scope: if reuse: scope.reuse_variables() self.o_stats = Normalizer(self.dim_obs, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as scope: if reuse: scope.reuse_variables() self.g_stats = Normalizer(self.dim_goal, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. 
batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks with tf.variable_scope('main') as scope: if reuse: scope.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) scope.reuse_variables() with tf.variable_scope('target') as scope: if reuse: scope.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) scope.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_q_pi_tf = self.target.q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_q_pi_tf, *clip_range) self.q_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - self.main.q_tf)) self.pi_loss_tf = -tf.reduce_mean(self.main.q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) q_grads_tf = tf.gradients(self.q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.q_grads_vars_tf = zip(q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.q_grad_tf = flatten_grads(grads=q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): """ create a log dictionary :param prefix: (str) the prefix for evey index :return: ({str: Any}) the log """ logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix is not '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. 
""" excluded_subnames = [ '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic' ] state = { k: v for k, v in self.__dict__.items() if all([subname not in k for subname in excluded_subnames]) } state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for key, value in state.items(): if key[-6:] == '_stats': self.__dict__[key] = value # load TF variables _vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert len(_vars) == len(state["tf"]) node = [tf.assign(var, val) for var, val in zip(_vars, state["tf"])] self.sess.run(node)
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, bc_loss, q_filter, num_demo, demo_batch_size, prm_loss_weight, aux_loss_weight, sample_transitions, gamma, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Added functionality to use demonstrations for training to Overcome exploration problem. Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused bc_loss: whether or not the behavior cloning loss should be used as an auxilliary loss q_filter: whether or not a filter on the q value update should be used when training with demonstartions num_demo: Number of episodes in to be used in the demonstration buffer demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread prm_loss_weight: Weight corresponding to the primary loss aux_loss_weight: Weight corresponding to the auxilliary loss also called the cloning loss """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None,) self.stage_shapes = stage_shapes # Create network. 
with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. buffer_shapes = {key: (self.T-1 if key != 'o' else self.T, *input_shapes[key]) for key, val in input_shapes.items()} buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) global DEMO_BUFFER DEMO_BUFFER = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) #initialize the demo buffer; in the same way as the primary data buffer def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def step(self, obs): actions = self.get_actions(obs['observation'], obs['achieved_goal'], obs['desired_goal']) return actions, None, None, None def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def init_demo_buffer(self, demoDataFile, update_stats=True): #function that initializes the demo buffer demoData = np.load(demoDataFile) #load the demonstration data from data file info_keys = [key.replace('info_', '') for key in self.input_dims.keys() if key.startswith('info_')] info_values = [np.empty((self.T - 1, 1, self.input_dims['info_' + key]), np.float32) for key in info_keys] demo_data_obs = demoData['obs'] demo_data_acs = demoData['acs'] demo_data_info = demoData['info'] for epsd in range(self.num_demo): # we initialize the whole demo buffer at the start of the training obs, acts, goals, achieved_goals = [], [] ,[] ,[] i = 0 for transition in range(self.T - 1): obs.append([demo_data_obs[epsd][transition].get('observation')]) acts.append([demo_data_acs[epsd][transition]]) goals.append([demo_data_obs[epsd][transition].get('desired_goal')]) achieved_goals.append([demo_data_obs[epsd][transition].get('achieved_goal')]) for idx, key in enumerate(info_keys): info_values[idx][transition, i] = demo_data_info[epsd][transition][key] obs.append([demo_data_obs[epsd][self.T - 1].get('observation')]) achieved_goals.append([demo_data_obs[epsd][self.T - 
1].get('achieved_goal')]) episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals) for key, value in zip(info_keys, info_values): episode['info_{}'.format(key)] = value episode = convert_episode_to_batch_major(episode) global DEMO_BUFFER DEMO_BUFFER.store_episode(episode) # create the observation dict and append them into the demonstration buffer logger.debug("Demo buffer size currently ", DEMO_BUFFER.get_current_size()) #print out the demonstration buffer size if update_stats: # add transitions to normalizer to normalize the demo data as well episode['o_2'] = episode['o'][:, 1:, :] episode['ag_2'] = episode['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch(episode) transitions = self.sample_transitions(episode, num_normalizing_transitions) o, g, ag = transitions['o'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() episode.clear() logger.info("Demo buffer size: ", DEMO_BUFFER.get_current_size()) #print out the demonstration buffer size def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch(episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, g, ag = transitions['o'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! 
critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): if self.bc_loss: #use demonstration buffer to sample as well if bc_loss flag is set TRUE transitions = self.buffer.sample(self.batch_size - self.demo_batch_size) global DEMO_BUFFER transitions_demo = DEMO_BUFFER.sample(self.demo_batch_size) #sample from the demo buffer for k, values in transitions_demo.items(): rolloutV = transitions[k].tolist() for v in values: rolloutV.append(v.tolist()) transitions[k] = np.array(rolloutV) else: transitions = self.buffer.sample(self.batch_size) #otherwise only sample from primary buffer o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): if stage: self.stage_batch() critic_loss, actor_loss, Q_grad, pi_grad = self._grads() self._update(Q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) #choose only the demo buffer samples mask = np.concatenate((np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis = 0) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. 
if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) if self.bc_loss ==1 and self.q_filter == 1 : # train with demonstrations and use bc_loss and q_filter both maskMain = tf.reshape(tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1]) #where is the demonstrator action better than actor action according to the critic? choose those samples only #define the cloning loss on the actor's actions only on the samples which adhere to the above masks self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0) - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) #primary loss scaled by it's respective weight prm_loss_weight self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) #L2 loss on action values scaled by the same weight prm_loss_weight self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxilliary loss scaled by its weight aux_loss_weight elif self.bc_loss == 1 and self.q_filter == 0: # train with demonstrations without q_filter self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask((self.main.pi_tf), mask) - tf.boolean_mask((batch_tf['u']), mask))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf else: #If not training with demonstrations self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix != '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = ['_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic'] state = {k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames])} state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert(len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node) def save(self, save_path): tf_util.save_variables(save_path)
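The bc_loss / q_filter branches above add a behaviour-cloning term on the demonstration slice of each batch and, when the Q-filter is on, keep only samples where the critic scores the demonstrator action above the current policy action. Below is a minimal NumPy sketch of that masking logic under simplified shapes; the function and argument names are illustrative, not from the snippet.

import numpy as np

def cloning_loss(pi_u, demo_u, q_u, q_pi, batch_size, demo_batch_size, q_filter=True):
    # demo transitions occupy the last demo_batch_size rows of the batch
    demo_mask = np.zeros(batch_size, dtype=bool)
    demo_mask[batch_size - demo_batch_size:] = True
    keep = demo_mask
    if q_filter:
        # keep only demo samples where the critic prefers the demonstrator action
        keep = keep & (q_u > q_pi)
    return float(np.sum(np.square(pi_u[keep] - demo_u[keep])))

# toy usage
rng = np.random.default_rng(1)
batch, demo, dim_u = 8, 3, 2
pi_u, demo_u = rng.normal(size=(2, batch, dim_u))
q_u, q_pi = rng.normal(size=(2, batch))
print(cloning_loss(pi_u, demo_u, q_u, q_pi, batch, demo))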
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, bc_loss, q_filter, num_demo, demo_batch_size, prm_loss_weight, aux_loss_weight, sample_transitions, gamma, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Added functionality to use demonstrations for training to Overcome exploration problem. Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused bc_loss: whether or not the behavior cloning loss should be used as an auxilliary loss q_filter: whether or not a filter on the q value update should be used when training with demonstartions num_demo: Number of episodes in to be used in the demonstration buffer demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread prm_loss_weight: Weight corresponding to the primary loss aux_loss_weight: Weight corresponding to the auxilliary loss also called the cloning loss """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) self.stage_shapes = stage_shapes # Create network. 
with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. buffer_shapes = { key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key]) for key, val in input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) global DEMO_BUFFER DEMO_BUFFER = ReplayBuffer( buffer_shapes, buffer_size, self.T, self.sample_transitions ) #initialize the demo buffer; in the same way as the primary data buffer def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def step(self, obs): actions = self.get_actions(obs['observation'], obs['achieved_goal'], obs['desired_goal']) return actions, None, None, None def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn( *u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * ( self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def init_demo_buffer( self, demoDataFile, update_stats=True): #function that initializes the demo buffer demoData = np.load( demoDataFile) #load the demonstration data from data file info_keys = [ key.replace('info_', '') for key in self.input_dims.keys() if key.startswith('info_') ] info_values = [ np.empty((self.T - 1, 1, self.input_dims['info_' + key]), np.float32) for key in info_keys ] demo_data_obs = demoData['obs'] demo_data_acs = demoData['acs'] demo_data_info = demoData['info'] for epsd in range( self.num_demo ): # we initialize the whole demo buffer at the start of the training obs, acts, goals, achieved_goals = [], [], [], [] i = 0 for transition in range(self.T - 1): obs.append( [demo_data_obs[epsd][transition].get('observation')]) acts.append([demo_data_acs[epsd][transition]]) goals.append( [demo_data_obs[epsd][transition].get('desired_goal')]) achieved_goals.append( [demo_data_obs[epsd][transition].get('achieved_goal')]) for idx, key in enumerate(info_keys): info_values[idx][transition, i] = demo_data_info[epsd][transition][key] obs.append([demo_data_obs[epsd][self.T - 1].get('observation')]) achieved_goals.append( 
[demo_data_obs[epsd][self.T - 1].get('achieved_goal')]) episode = dict(o=obs, u=acts, g=goals, ag=achieved_goals) for key, value in zip(info_keys, info_values): episode['info_{}'.format(key)] = value episode = convert_episode_to_batch_major(episode) global DEMO_BUFFER DEMO_BUFFER.store_episode( episode ) # create the observation dict and append them into the demonstration buffer logger.debug("Demo buffer size currently ", DEMO_BUFFER.get_current_size() ) #print out the demonstration buffer size if update_stats: # add transitions to normalizer to normalize the demo data as well episode['o_2'] = episode['o'][:, 1:, :] episode['ag_2'] = episode['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode) transitions = self.sample_transitions( episode, num_normalizing_transitions) o, g, ag = transitions['o'], transitions['g'], transitions[ 'ag'] transitions['o'], transitions['g'] = self._preprocess_og( o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() episode.clear() logger.info("Demo buffer size: ", DEMO_BUFFER.get_current_size() ) #print out the demonstration buffer size def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, g, ag = transitions['o'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! 
critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): if self.bc_loss: #use demonstration buffer to sample as well if bc_loss flag is set TRUE transitions = self.buffer.sample(self.batch_size - self.demo_batch_size) global DEMO_BUFFER transitions_demo = DEMO_BUFFER.sample( self.demo_batch_size) #sample from the demo buffer for k, values in transitions_demo.items(): rolloutV = transitions[k].tolist() for v in values: rolloutV.append(v.tolist()) transitions[k] = np.array(rolloutV) else: transitions = self.buffer.sample( self.batch_size) #otherwise only sample from primary buffer o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o_2, ag_2, g) assert np.array_equal(transitions['g_2'], transitions['g']) transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): if stage: self.stage_batch() critic_loss, actor_loss, Q_grad, pi_grad = self._grads() self._update(Q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. 
batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) #choose only the demo buffer samples mask = np.concatenate( (np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis=0) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) if self.bc_loss == 1 and self.q_filter == 1: # train with demonstrations and use bc_loss and q_filter both maskMain = tf.reshape( tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1] ) #where is the demonstrator action better than actor action according to the critic? choose those samples only #define the cloning loss on the actor's actions only on the samples which adhere to the above masks self.cloning_loss_tf = tf.reduce_sum( tf.square( tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0) - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean( self.main.Q_pi_tf ) #primary loss scaled by it's respective weight prm_loss_weight self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u) ) #L2 loss on action values scaled by the same weight prm_loss_weight self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxilliary loss scaled by its weight aux_loss_weight elif self.bc_loss == 1 and self.q_filter == 0: # train with demonstrations without q_filter self.cloning_loss_tf = tf.reduce_sum( tf.square( tf.boolean_mask((self.main.pi_tf), mask) - tf.boolean_mask((batch_tf['u']), mask))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean( self.main.Q_pi_tf) self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf else: #If not training with demonstrations self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) 
self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix != '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = [ '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic' ] state = { k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames]) } state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert (len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node) def save(self, save_path): tf_util.save_variables(save_path)
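sample_batch above fills each training batch with (batch_size - demo_batch_size) replay transitions followed by demo_batch_size demonstration transitions, which is why the demo mask in _create_network selects the tail of the batch. A minimal NumPy sketch of that mixing with dict-of-arrays buffers follows; the buffer layout and names are assumptions for illustration.

import numpy as np

def sample_mixed_batch(replay, demo, batch_size, demo_batch_size, rng):
    # draw from the replay buffer first, then append demo transitions key by key
    n_replay = batch_size - demo_batch_size
    idx_r = rng.integers(len(replay['o']), size=n_replay)
    idx_d = rng.integers(len(demo['o']), size=demo_batch_size)
    return {k: np.concatenate([replay[k][idx_r], demo[k][idx_d]], axis=0) for k in replay}

# toy usage
rng = np.random.default_rng(2)
replay = {'o': rng.normal(size=(100, 4)), 'u': rng.normal(size=(100, 2))}
demo = {'o': rng.normal(size=(20, 4)), 'u': rng.normal(size=(20, 2))}
batch = sample_mixed_batch(replay, demo, batch_size=16, demo_batch_size=4, rng=rng)
print({k: v.shape for k, v in batch.items()})  # demo rows are the last 4 of each array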
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, r_bias, bias_clip_low, bias_clip_high, n_epochs, ismuti, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. 
buffer_shapes = { key: (self.T if key != 'o' else self.T + 1, *input_shapes[key]) for key, val in input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T + 1, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) self.total_epoch_r_mean_bias = [] self.total_epoch_r_std_bias = [] self.rb = r_bias self.bias_clip_low = bias_clip_low self.bias_clip_high = bias_clip_high self.isMuti = ismuti self.epcoch_num = 0 self.isPlot = False self.picdir = '' self.rewdir = '' def save_reward_pic(self, reward): with open(self.rewdir, "wb") as fp: pickle.dump(reward, fp) plt.clf() print('min:', np.min(reward)) print('max:', np.max(reward)) plt.figure(figsize=(10, 8)) high = min(np.max(reward), 5.) bins = np.arange(0., high, 0.1) plt.hist(reward, bins, alpha=0.5, weights=[1. / len(reward)] * len(reward)) # alpha sets the transparency; 0 is fully transparent font2 = { 'size': 18, } plt.xticks(fontsize=16) plt.yticks(fontsize=16) plt.xlabel('Bias', font2) plt.ylabel('Prob', font2) plt.grid(True) plt.xlim([0.0, high]) plt.savefig(self.picdir) print('save pic path:', self.picdir) def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False, compute_r_bias=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) if compute_r_bias: return ret[0] # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch(episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def
_sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def recompute_reward(self, transitions): re_transitions = transitions['re_transitions'] T = re_transitions['u'].shape[1] batch_size = re_transitions['u'].shape[0] # o(256, 51, 25) u (256, 50, 4) g (256, 50, 3) future_g(256,50,3) info_is_success (256, 50, 1) o_2 (256, 50, 25) ag_2 (256, 50, 3) re_transitions['o'] = re_transitions['o'][:, :T, :].reshape( batch_size * T, self.dimo) re_transitions['ag'] = re_transitions['ag'][:, :T, :].reshape( batch_size * T, self.dimg) re_transitions['future_g'] = re_transitions['future_g'].reshape( batch_size * T, self.dimg) re_transitions['g'] = re_transitions['g'].reshape( batch_size * T, self.dimg) re_transitions['u'] = re_transitions['u'].reshape( batch_size * T, self.dimu) u1 = self.get_actions(re_transitions['o'], re_transitions['ag'], re_transitions['future_g'], compute_r_bias=True) u2 = self.get_actions(re_transitions['o'], re_transitions['ag'], re_transitions['g'], compute_r_bias=True) r_b = self.rb * (np.square(LA.norm(u2 - re_transitions['u'], axis=1)) - np.square(LA.norm(u1 - re_transitions['u'], axis=1))) r_b = np.sum(r_b.reshape(batch_size, T), axis=1) e_r_b = np.exp(r_b) her_indexes = re_transitions['her_index'] other_indexes = re_transitions['other_index'] rank = MPI.COMM_WORLD.Get_rank() if rank == 0: self.total_epoch_r_mean_bias.append(e_r_b.mean()) self.total_epoch_r_std_bias.append(e_r_b.std()) if self.isPlot: self.save_reward_pic(e_r_b) self.isPlot = False if self.isMuti: transitions['r'] *= np.clip(e_r_b, self.bias_clip_low, self.bias_clip_high) else: # batch projection e_r_b = np.clip(e_r_b, self.bias_clip_low, self.bias_clip_high) e_r_b_mean = np.mean(e_r_b[her_indexes]) transitions['r'][other_indexes] /= e_r_b_mean del transitions['re_transitions'] del transitions['origin_g'] return transitions def sample_batch(self): transitions = self.buffer.sample(self.batch_size) # lky recompute reward transitions = self.recompute_reward(transitions) o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o_2, ag_2, g) transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): if stage: self.stage_batch() critic_loss, actor_loss, Q_grad, pi_grad = self._grads() self._update(Q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def 
_create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix != '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = [ '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic' ] state = { k: v for k, v in self.__dict__.items() if all([subname not in k for subname in excluded_subnames]) } state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert (len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node)
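# A minimal, self-contained sketch (not part of the class above) of the
# importance-style weight that recompute_reward() builds: per step it compares
# how far the logged action is from what the current policy would output under
# the original goal versus under the relabelled (future) goal, sums over the
# episode, exponentiates, and clips. All names below (exp_bias_weight,
# u_logged, u_hindsight, u_original) are illustrative assumptions.
import numpy as np


def exp_bias_weight(u_logged, u_hindsight, u_original, rb=1.0,
                    clip_low=0.1, clip_high=10.0):
    # shapes: (batch, T, dim_u); returns one weight per episode
    per_step = rb * (np.square(np.linalg.norm(u_original - u_logged, axis=-1))
                     - np.square(np.linalg.norm(u_hindsight - u_logged, axis=-1)))
    r_b = per_step.sum(axis=1)  # (batch,)
    return np.clip(np.exp(r_b), clip_low, clip_high)


# toy check: weights are positive and clipped to [clip_low, clip_high]
rng = np.random.default_rng(0)
u = rng.normal(size=(4, 5, 3))
print(exp_bias_weight(u_logged=u,
                      u_hindsight=u + 0.1 * rng.normal(size=u.shape),
                      u_original=u + 0.5 * rng.normal(size=u.shape)))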
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None,) self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. 
buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key]) for key, val in input_shapes.items()} buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T+1, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) # clip observations and goals policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } # ret = action given by the current policy (eval of NN) ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) # Below: for each mini-batch we take action u (the one given by the policy) with probability # 1-random_eps, and a random action (u + random_action - u) with probability random_eps u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch(episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! 
critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): transitions = self.buffer.sample(self.batch_size) o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): if stage: self.stage_batch() critic_loss, actor_loss, Q_grad, pi_grad = self._grads() self._update(Q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions # self.XX.pi_tf is the action policy we ll use for exploration (TO CONFIRM) # self.XX.Q_pi_tf is the Q network used to train this policy # self.XX.Q_tf target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. 
if self.clip_pos_returns else np.inf) # Bellman target y_i = r + gamma * Q(o', pi(o')), with returns clipped if necessary: target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) # loss function for Q_tf where we exclude target_tf from the gradient computation: self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) # the actor loss maximizes the main network's Q_pi estimate: self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) # add an L2 penalty on the policy's actions (scaled by max_u): self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) # define the gradients of the Q_loss and pi_loss w.r.t. their variables respectively Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) # zip the gradients together with their respective variables self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) # flattened gradients and variables self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers (MpiAdam aggregates the flattened gradients across MPI workers before each Adam step) self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging used for the update of the target networks in both pi and Q nets self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') # operation to initialize the target nets at the main nets' values self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) # operation to update the target nets from the main nets using polyak averaging self.update_target_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() # synchronize network weights across MPI workers (broadcast from rank 0) self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix != '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = ['_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic'] state = {k: v for k, v in self.__dict__.items() if all([subname not in k for subname in excluded_subnames])} state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy.
state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert(len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node)
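# A rough usage sketch (an assumption, not taken from the source) of how the
# DDPG class above is normally driven by an outer HER training loop: episodes
# produced by a rollout worker are stored with store_episode(), each call to
# train() samples a replay-buffer batch, pushes it through the StagingArea via
# stage_batch() and applies one gradient step, and the target networks are
# Polyak-updated once per cycle. `policy`, `collect_episode`, `n_cycles` and
# `n_batches` are illustrative names.
def run_training_cycles(policy, collect_episode, n_cycles=50, n_batches=40):
    losses = []
    for _ in range(n_cycles):
        episode = collect_episode()      # dict of arrays, batch x T(+1) x dim
        policy.store_episode(episode)    # also updates the o/g normalizers
        for _ in range(n_batches):
            # stage=True (default): sample_batch() -> stage_batch() -> _grads()/_update()
            critic_loss, actor_loss = policy.train()
            losses.append((critic_loss, actor_loss))
        policy.update_target_net()       # Polyak-average main into target
    return losses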
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, gg_k, replay_strategy, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) self.replay_strategy = replay_strategy self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] if self.replay_strategy == C.REPLAY_STRATEGY_GEN_K: self.max_g = kwargs['max_g'] self.d0 = kwargs['d0'] self.slope = kwargs['slope'] self.goal_lr = kwargs['goal_lr'] # reward shaping parameters self.rshape_lambda = kwargs['rshape_lambda'] self.reshape_p = kwargs['rshape_p'] self.rshaping = kwargs['rshaping'] self.input_dims['e'] = self.dimg * self.T self.input_dims['mask'] = self.T self.dime = self.input_dims['e'] self.dim_mask = self.input_dims['mask'] input_shapes = dims_to_shapes(self.input_dims) # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) self.stage_shapes = stage_shapes # Create network. 
with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. buffer_shapes = { key: (self.T if key != 'o' else self.T + 1, *input_shapes[key]) for key, val in input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T + 1, self.dimg) if self.replay_strategy in [ C.REPLAY_STRATEGY_BEST_K, C.REPLAY_STRATEGY_GEN_K, C.REPLAY_STRATEGY_GEN_K_GMM ]: buffer_shapes['gg'] = (self.T, self.gg_k, self.dimg) if self.replay_strategy in [ C.REPLAY_STRATEGY_BEST_K, C.REPLAY_STRATEGY_GEN_K_GMM ]: buffer_shapes['gg_idx'] = (self.T, self.gg_k) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def _preprocess_e(self, e): e = np.clip(e, -self.clip_obs, self.clip_obs) return e # def td_error(self, o, g): # vals = [self.Q_loss_tf] def get_target_q_val(self, o, ag, g): vals = [self.target.Q_pi_tf] feed = { self.target.o_tf: o.reshape(-1, self.dimo), self.target.g_tf: g.reshape(-1, self.dimg) } ret = self.sess.run(vals, feed_dict=feed) return ret[0] def get_goals(self, u_goal, e, mask, use_target_net=False): """ :param u_goal: batch_size * dim_u dimensional array :param e: batch_size * (T*dim_g) dimensional array :param mask: batch_size * T dimensional array :param use_target_net: True/False :return: """ e = self._preprocess_e(e) policy = self.target if use_target_net else self.main vals = [ policy.goal_tf, policy.distance, policy.e_reshaped, policy.goal_tf_repeated, policy.reward_sum ] # feed feed = { policy.e_tf: e.reshape(-1, self.dime), policy.mask_tf: mask.reshape(-1, self.dim_mask), policy.u_tf: u_goal.reshape(-1, self.dimu) } ret = self.sess.run(vals, feed_dict=feed) # print("Generated goal: ") # print("Goal: ", ret[0]) # print("Distance: ", ret[1]) # print("Episode: ", ret[2]) # print("Goal repeated: ", ret[3]) # print("Reward: ", np.average(ret[4])) # print('---------------------------------------------------------------') # for var in self._vars('main/goal'): # print("Name: " + var.name) # print("Shape: " + str(var.shape)) # print(var.eval()) return ret[0] def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn( *u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += 
np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * ( self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions[ 'g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() if self.replay_strategy == C.REPLAY_STRATEGY_GEN_K: e = transitions['e'] transitions['e'] = self._preprocess_e(e) self.e_stats.update(transitions['e']) self.e_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() if self.replay_strategy == C.REPLAY_STRATEGY_GEN_K: self.goal_adam.sync() self.Q_goal_adam.sync() self.pi_goal_adam.sync() def _grads(self): # Avoid feed_dict here for performance! tf_list = [ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf ] if self.replay_strategy == C.REPLAY_STRATEGY_GEN_K: tf_list.extend([self.goal_loss_tf, self.goal_grad_tf]) tf_list.extend([self.Q_goal_loss_tf, self.Q_goal_grad_tf]) tf_list.extend([self.pi_goal_loss_tf, self.pi_goal_grad_tf]) tf_list.extend([self.main.mask_tf, self.main.d]) return self.sess.run(tf_list) def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def _update_goal(self, goal_grad, Q_goal_grad, pi_goal_grad): # self.Q_goal_adam.update(Q_goal_grad, self.Q_lr) # self.pi_goal_adam.update(pi_goal_grad, self.pi_lr) self.goal_adam.update(goal_grad, self.goal_lr) def sample_batch(self): transitions = self.buffer.sample(self.batch_size) return self.batch_from_transitions(transitions) def batch_from_transitions(self, transitions): """ transitions is a dictionary with keys: ['o', 'ag', 'u', 'o_2', 'ag_2', 'r', 'g'] batch is a processed batch (normalizing, clipping, relative goal) for staging, and has the keys ['o', 'ag', 'u', 'o_2', 'ag_2', 'r', g', 'g_2'] """ # preprocess observations and goals o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o_2, ag_2, g) if self.replay_strategy == C.REPLAY_STRATEGY_GEN_K: e = transitions['e'] transitions['e'] = self._preprocess_e(e) # Set the correct order of keys in the batch transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() ] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): # 
print("*********************************Training*******************************") if stage: self.stage_batch() if self.replay_strategy != C.REPLAY_STRATEGY_GEN_K: critic_loss, actor_loss, Q_grad, pi_grad = self._grads() else: critic_loss, actor_loss, Q_grad, pi_grad,\ goal_loss, goal_grad, Q_goal_loss, Q_goal_grad, \ pi_goal_loss, pi_goal_grad, x, y = self._grads() self._update_goal(goal_grad, Q_goal_grad, pi_goal_grad) self._update(Q_grad, pi_grad) # print("Loss: ", goal_loss) # print("mask: ", np.sum(x, axis=1)) # print("distance: ", y) # print("Reward: ", r) # if self.replay_strategy == C.REPLAY_STRATEGY_GEN_K: # goal_loss = self.sess.run(self.target_Q_goal_tf) # # self.goal_adam.update(goal_grad, self.goal_lr) # print("Goal loss: ", goal_loss) if self.replay_strategy == C.REPLAY_STRATEGY_GEN_K: self.sess.run(self.copy_normal_to_goal_op) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) if self.replay_strategy == C.REPLAY_STRATEGY_GEN_K: # running averages with tf.variable_scope('e_stats') as vs: if reuse: vs.reuse_variables() self.e_stats = Normalizer(self.dime, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. 
if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf_vec = tf.square( tf.stop_gradient(target_tf) - self.main.Q_tf) self.Q_loss_tf = tf.reduce_mean(self.Q_loss_tf_vec) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') if self.replay_strategy == C.REPLAY_STRATEGY_GEN_K: # loss functions for goal generation network target_Q_pi_goal_tf = self.target.Q_pi_goal_tf target_goal_tf = tf.clip_by_value( self.main.reward + self.gamma * target_Q_pi_goal_tf, *clip_range) self.goal_loss_tf = -self.LAMBDA * tf.reduce_mean( tf.square( tf.stop_gradient(target_goal_tf) - self.main.Q_goal_tf)) # self.goal_loss_tf += 0.0 * tf.reduce_mean(tf.square(self.main.goal_tf / self.max_g)) # self.goal_loss_tf = 0 # self.reward_sum = tf.reduce_mean(self.main.reward_sum) self.goal_loss_tf += -tf.reduce_mean(self.main.reward_sum) # loss functions for Q_goal and pi_goal self.Q_goal_loss_tf = tf.reduce_mean( tf.square( tf.stop_gradient(target_goal_tf) - self.main.Q_goal_tf)) self.pi_goal_loss_tf = -tf.reduce_mean(self.main.Q_pi_goal_tf) self.pi_goal_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_goal_tf / self.max_u)) # gradients goal_grads_tf = tf.gradients(self.goal_loss_tf, self._vars('main/goal')) self.goal_grad_tf = flatten_grads(grads=goal_grads_tf, var_list=self._vars('main/goal')) Q_goal_grads_tf = tf.gradients(self.Q_goal_loss_tf, self._vars('main/gQ')) self.Q_goal_grad_tf = flatten_grads(grads=Q_goal_grads_tf, var_list=self._vars('main/gQ')) pi_goal_grads_tf = tf.gradients(self.pi_goal_loss_tf, self._vars('main/gpi')) self.pi_goal_grad_tf = flatten_grads( grads=pi_goal_grads_tf, var_list=self._vars('main/gpi')) assert len(self._vars('main/goal')) == len(goal_grads_tf) assert len(self._vars('main/gQ')) == len(Q_goal_grads_tf) assert len(self._vars('main/gpi')) == len(pi_goal_grads_tf) # optimizers self.goal_adam = MpiAdam(self._vars('main/goal'), scale_grad_by_procs=False) self.Q_goal_adam = MpiAdam(self._vars('main/gQ'), scale_grad_by_procs=False) self.pi_goal_adam = MpiAdam(self._vars('main/gpi'), scale_grad_by_procs=False) self.main_vars += self._vars('main/goal') + self._vars( 'main/gQ') + self._vars('main/gpi') self.target_vars += self._vars('target/goal') + self._vars( 'target/gQ') + self._vars('target/gpi') self.stats_vars += self._global_vars('e_stats') self.normal_vars = self._vars('main/Q') + self._vars( 'main/pi') + self._vars('target/Q') + self._vars('target/pi') self.goal_vars = self._vars('main/gQ') + 
self._vars( 'main/gpi') + self._vars('target/gQ') + self._vars( 'target/gpi') self.copy_normal_to_goal_op = list( map(lambda v: v[0].assign(0 * v[0] + 1 * v[1]), zip(self.goal_vars, self.normal_vars))) self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix != '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = [ '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic' ] state = { k: v for k, v in self.__dict__.items() if all([subname not in k for subname in excluded_subnames]) } state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert (len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node)
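# A small standalone illustration (plain TF1, outside the class) of the two
# target-network operators constructed above: the init op copies the main
# variables into the target variables once, and the Polyak op moves the target
# a fraction (1 - polyak) of the way towards the main network on every call.
# Variable names here are illustrative.
import numpy as np
import tensorflow as tf

polyak = 0.95
main_var = tf.Variable(np.ones(3, dtype=np.float32))
target_var = tf.Variable(np.zeros(3, dtype=np.float32))

init_target_op = target_var.assign(main_var)
update_target_op = target_var.assign(polyak * target_var + (1. - polyak) * main_var)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(init_target_op)     # target starts exactly at main
    sess.run(update_target_op)   # afterwards it only tracks main slowly
    print(sess.run(target_var))  # -> [1. 1. 1.] since main never changed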
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, bc_loss, q_filter, num_demo, demo_batch_size, prm_loss_weight, aux_loss_weight, sample_transitions, gamma, reuse=False, pre_train_model=False, update_model=True, feature_net_path='', **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Added functionality to use demonstrations for training to Overcome exploration problem. Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused bc_loss: whether or not the behavior cloning loss should be used as an auxiliary loss q_filter: whether or not a filter on the q value update should be used when training with demonstartions num_demo: Number of episodes in to be used in the demonstration buffer demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread prm_loss_weight: Weight corresponding to the primary loss aux_loss_weight: Weight corresponding to the auxiliary loss also called the cloning loss """ if self.clip_return is None: self.clip_return = np.inf # ADDED self.use_contact = (self.contact_dim > 0) self.pre_train_model = pre_train_model self.feature_net_path = feature_net_path self.process_type = kwargs['process_type'] self.contact_dim = kwargs['contact_dim'] self.__dict__['use_contact'] = self.use_contact self.__dict__['pre_train'] = self.pre_train_model self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] - self.contact_dim self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] self.feature_dim = kwargs['feature_dim'] self.contact_point_dim = self.contact_dim // self.fixed_num_of_contact # Prepare staging area for feeding data to 
the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None, ) self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # order: ['g', 'o', 'u', 'o_2', 'g_2', 'r']) if self.pre_train_model == 'cpc': self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self.cpc_shape = OrderedDict() self.cpc_shape['obs_neg'] = (None, self.fixed_num_of_contact, self.contact_point_dim) self.cpc_shape['obs_pos'] = (None, self.fixed_num_of_contact, self.contact_point_dim) self.cpc_staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.cpc_shape.keys()], shapes=list(self.cpc_shape.values())) self.cpc_buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.cpc_shape.values() ] self.cpc_stage_op = self.cpc_staging_tf.put( self.cpc_buffer_ph_tf) else: self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values() ] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self.update_model = update_model if self.pre_train_model != 'none': self.__dict__['feature_net_path'] = self.feature_net_path self.__dict__['clip_obs'] = self.clip_obs self._create_network(reuse=reuse) # Configure the replay buffer. 
buffer_shapes = { key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key]) for key, val in input_shapes.items() } buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): # self.clip_obs = 200 if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) if len(o.shape) == 1: o[-self.dimo:] = np.clip(o[-self.dimo:], -self.clip_obs, self.clip_obs) elif len(o.shape) == 2: o[:, -self.dimo:] = np.clip(o[:, -self.dimo:], -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def step(self, obs): actions = self.get_actions(obs['observation'], obs['achieved_goal'], obs['desired_goal']) return actions, None, None, None def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] """lines added here, remove later""" ori = o[:, -7:-4].copy() noise = np.random.normal(0, 7e-4, ori.shape) o[:, -7:-4] += noise feed = { policy.o_tf: o.reshape(-1, self.dimo + self.contact_dim), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // (self.dimo + self.contact_dim), self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) u = ret[0] noise = noise_eps * self.max_u * np.random.randn( *u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * ( self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ """lines added, remove later""" ori = episode_batch['o'][:, :, -7:-4].copy() noise = np.random.normal(0, 7e-4, ori.shape) episode_batch['o'][:, :, -7:-4] += noise self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch( episode_batch) # change goals here, recompute rewards transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, g, ag = transitions['o'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stat """Normalization stuff here. 
""" self.o_stats.update(transitions['o'][:, -self.o_stats.size:]) self.g_stats.update(transitions['g']) if self.pre_train_model in ['cpc', 'curl']: feed_dict = {self.main.o_tf: transitions['o']} features = self.sess.run(self.main.features, feed_dict=feed_dict) features = np.clip(features, -self.clip_obs, self.clip_obs) self.feature_stats.update(features) self.feature_stats.recompute_stats() # elif self.process_type == 'max_pool': # feed_dict = {self.main.o_tf:transitions['o']} # features = self.sess.run(self.main.features, feed_dict=feed_dict) # self.feature_stats.update(features) # self.feature_stats.recompute_stats() self.o_stats.recompute_stats() self.g_stats.recompute_stats() return transitions['o'] def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() if self.pre_train_model == 'supervised': self.feature_adam.sync() elif self.pre_train_model == 'cpc': self.cpc.sync() elif self.pre_train_model == 'curl': self.curl_adam.sync() self.encoder_adam.sync() def _grads(self): # Avoid feed_dict here for performance! critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): transitions = self.buffer.sample( self.batch_size) #otherwise only sample from primary buffer o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og( o_2, ag_2, g) transitions_batch = [ transitions[key] for key in self.stage_shapes.keys() if key not in ['obs_pos', 'obs_neg'] ] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) """lines added, remove them later""" ori = batch[1][:, -7:-4].copy() noise = np.random.normal(0, 7e-4, ori.shape) batch[1][:, -7:-4] += noise noise = np.random.normal(0, 7e-4, ori.shape) batch[3][:, -7:-4] += noise self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) if self.pre_train_model == 'supervised': assert batch[1].shape[1] == 583, "must use full observations" # 253, 251, 246, 233, 232, 220, 215, 210 # feature_loss, max_feature_loss, feature_grad = self.sess.run([self.feature_loss_tf, self.max_feature_loss, self.feature_grad_tf]) feature_loss, feature_grad = self.sess.run( [self.feature_loss_tf, self.feature_grad_tf]) self.feature_adam.update(feature_grad, 1e-3) self.sess.run(self.update_feature_weights_target) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) # writer = tf.summary.FileWriter("home/vioichigo/try/tactile-baselines/graph", self.sess.graph) # print(self.sess.run(self.main.features)) # writer.close() return feature_loss elif self.pre_train_model == 'cpc': # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) # run_metadata = tf.RunMetadata() # obs = pickle.load(open('/home/vioichigo/try/tactile-baselines/dataset/HandManipulateEgg-v0/50000obs.pickle', 'rb')) # indices = np.random.randint(obs.shape[0], size=batch[1].shape[0] * (self.main.n_neg - 1)) # obs_neg = obs[indices] # obs_pos = batch[3][:, :self.contact_dim].reshape((-1, self.fixed_num_of_contact, self.contact_dim//self.fixed_num_of_contact)) # 
# self.sess.run(self.cpc_stage_op, feed_dict=dict(zip(self.cpc_buffer_ph_tf, [obs_neg, obs_pos])), options=run_options, run_metadata=run_metadata) # first = time.time() # # self.sess.run(self.cpc_stage_op, feed_dict=dict(zip(self.cpc_buffer_ph_tf, [obs_neg, obs_pos]))) # start = time.time() # print("feed:", start - first) # feed_dict = {self.cpc_inputs_tf['obs_pos']: obs_pos, self.cpc_inputs_tf['obs_neg']: obs_neg} # # dict(zip(self.cpc_inputs_tf, [obs_neg, obs_pos])) # cpc_loss, cpc_grad = self.sess.run([self.cpc_loss_tf, self.cpc_grad_tf], feed_dict=feed_dict, options=run_options, run_metadata=run_metadata) # tl = timeline.Timeline(run_metadata.step_stats) # ctf = tl.generate_chrome_trace_format() # with open('./timeline.json', 'w') as f: # f.write(ctf) # now = time.time() # print("compute_loss", now - start) # self.cpc_adam.update(cpc_grad, 1e-3) # print("update weights", time.time() - now) # self.sess.run(self.update_cpc_weights_target) # self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) return 1 elif self.pre_train_model == 'curl': curl_loss, curl_grad, encoder_grad = self.sess.run( [self.curl_loss, self.curl_grad_tf, self.encoder_grad_tf]) self.curl_adam.update(curl_grad, 1e-3) self.encoder_adam.update(encoder_grad, 1e-3) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) self.sess.run(self.update_curl_weights_op) return curl_loss # return cpc_loss def train(self, stage=True): if stage: if self.pre_train_model == 'none': self.stage_batch() else: feature_loss = self.stage_batch() critic_loss, actor_loss, Q_grad, pi_grad = self._grads() self._update(Q_grad, pi_grad) if self.pre_train_model == 'none': return critic_loss, actor_loss else: return critic_loss, actor_loss, feature_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) if self.pre_train_model == 'supervised': if not self.update_model: res = [x for x in res if x.name.find('predicted_pos') == -1] elif self.pre_train_model == 'cpc': if not self.update_model: res = [x for x in res if x.name.find('new_cpc') == -1] # elif self.pre_train_model == 'curl': # res = [x for x in res if x.name.find('W') == -1] assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): # running averages self.global_step = tf.Variable(0, name='global_step', trainable=False) self.increment_global_step = tf.assign_add( self.global_step, 1, name='increment_global_step') with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() """Normalization stuff here. 
""" if self.use_contact and self.process_type in ['none', 'test']: self.o_stats = Normalizer(self.dimo + self.contact_dim, self.norm_eps, self.norm_clip, sess=self.sess) else: self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) if self.pre_train_model == 'cpc': with tf.variable_scope('feature_stats') as vs: if reuse: vs.reuse_variables() z_dim = pickle.load( open(self.feature_net_path + 'params.pickle', 'rb'))[0] self.feature_stats = Normalizer(z_dim, self.norm_eps, self.norm_clip, sess=self.sess) self.__dict__['feature_normalizer'] = self.feature_stats elif self.pre_train_model == 'curl': with tf.variable_scope('feature_stats') as vs: if reuse: vs.reuse_variables() self.feature_stats = Normalizer(32, self.norm_eps, self.norm_clip, sess=self.sess) self.__dict__['feature_normalizer'] = self.feature_stats # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) if self.pre_train_model == 'cpc': cpc_batch = self.cpc_staging_tf.get() cpc_batch_tf = OrderedDict([ (key, cpc_batch[i]) for i, key in enumerate(self.cpc_shape.keys()) ]) # self.cpc_batch_tf = {} # self.cpc_batch_tf['obs_neg'] = tf.placeholder(tf.float32, shape=(None, self.fixed_num_of_contact, self.contact_point_dim)) # self.cpc_batch_tf['obs_pos'] = tf.placeholder(tf.float32, shape=(None, self.fixed_num_of_contact, self.contact_point_dim)) # self.__dict__['cpc_inputs_tf'] = self.cpc_batch_tf #choose only the demo buffer samples # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: # reuse = False if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] #next_observations target_batch_tf['g'] = batch_tf['g_2'] #next_goals self.target = self.create_actor_critic(target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. 
if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) # else: #If not training with demonstrations self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) if self.pre_train_model == 'supervised': self.feature_net_var = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='ddpg/main/pi/process/predicted_pos') pos = batch_tf['o'][:, self.contact_dim:][:, -7:-4] self.feature_loss_tf = tf.reduce_mean( tf.square(pos - self.main.features)) # self.max_feature_loss = tf.reduce_max(tf.square(pos - self.main.features)) feature_grads_tf = tf.gradients(self.feature_loss_tf, self.feature_net_var) assert len(self.feature_net_var) == len(feature_grads_tf) self.feature_grads_vars_tf = zip(feature_grads_tf, self.feature_net_var) self.feature_grad_tf = flatten_grads(grads=feature_grads_tf, var_list=self.feature_net_var) self.feature_adam = MpiAdam(self.feature_net_var, scale_grad_by_procs=False) target_vars = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='ddpg/target/pi/process/predicted_pos') self.update_feature_weights_target = [ tf.assign(new, old) for (new, old) in zip(target_vars, self.feature_net_var) ] elif self.pre_train_model == 'cpc': self.cpc_var = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='ddpg/main/pi/process/new_cpc') pos = tf.reshape(batch_tf['o_2'][:, :self.contact_dim], [ -1, self.fixed_num_of_contact, self.contact_dim // self.fixed_num_of_contact ]) with tf.variable_scope('auxiliary'): self.cpc_loss_tf = compute_cpc_loss( self.main.z_dim, self.main.pos_features, self.main.neg_features, self.main.next, process_type=self.process_type, n_neg=self.main.n_neg, type=self.main.type) cpc_grads_tf = tf.gradients(self.cpc_loss_tf, self.cpc_var) assert len(self.cpc_var) == len(cpc_grads_tf) self.cpc_grads_vars_tf = zip(cpc_grads_tf, self.cpc_var) self.cpc_grad_tf = flatten_grads(grads=cpc_grads_tf, var_list=self.cpc_var) self.cpc_adam = MpiAdam(self.cpc_var, scale_grad_by_procs=False) target_vars = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='ddpg/target/pi/process/new_cpc') self.update_cpc_weights_target = [ tf.assign(new, old) for (new, old) in zip(target_vars, self.cpc_var) ] elif self.pre_train_model == 'curl': self.W = tf.get_variable("W", shape=[self.main.z_dim, self.main.z_dim], trainable=True) self.encoder_var = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='ddpg/main/pi/curl') self.encoder_adam = MpiAdam(self.encoder_var, scale_grad_by_procs=False) self.curl_var = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='ddpg/main/pi/curl') + [self.W] # + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='ddpg/target/pi/curl') self.curl_adam = MpiAdam(self.curl_var, scale_grad_by_procs=False) z_a = self.main.features z_pos = tf.stop_gradient(self.target.features) Wz = tf.matmul(self.W, tf.transpose(z_pos)) # (z_dim,B) logits = tf.matmul(z_a, Wz) # (B,B) logits = logits - tf.reduce_max(logits, 1)[:, None] labels = tf.range(tf.shape(logits)[0]) # positives on the diagonal self.curl_loss = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=labels)) target_curl_vars = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='ddpg/target/pi/curl') self.update_curl_weights_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1.
- self.polyak) * v[1]), zip(target_curl_vars, self.encoder_var))) curl_grads_tf = tf.gradients(self.curl_loss, self.curl_var) self.curl_grad_tf = flatten_grads(grads=curl_grads_tf, var_list=self.curl_var) encoder_grads_tf = tf.gradients(self.curl_loss, self.encoder_var) self.encoder_grad_tf = flatten_grads(grads=encoder_grads_tf, var_list=self.encoder_var) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars( 'g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix != '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = [ '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic' ] state = { k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames]) } state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run( [x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert (len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node) def save(self, save_path): tf_util.save_variables(save_path)
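# --- Illustrative sketch (not part of the original implementation) ---
# The 'curl' branch of DDPG_PDDL._create_network builds a CURL-style InfoNCE objective:
# anchor features z_a are scored against stop-gradient target features z_pos through a
# learned bilinear matrix W, and the positive for row i is the diagonal entry (i, i).
# The standalone function below is a minimal sketch of that computation, assuming TF 1.x
# (tensorflow is already imported as tf at module level); shapes and names are illustrative.
def curl_infonce_loss(z_a, z_pos, W):
    """z_a, z_pos: (B, z_dim) feature batches; W: (z_dim, z_dim) bilinear weight."""
    Wz = tf.matmul(W, tf.transpose(z_pos))                            # (z_dim, B)
    logits = tf.matmul(z_a, Wz)                                       # (B, B) similarity matrix
    logits = logits - tf.reduce_max(logits, axis=1, keepdims=True)    # numerical stability
    labels = tf.range(tf.shape(logits)[0])                            # positives on the diagonal
    # integer class labels -> sparse cross-entropy, averaged to a scalar loss
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels))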
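# --- Illustrative sketch: clipped TD target, with made-up numbers ---
# Both DDPG classes in this file clip the one-step TD target before the critic loss:
#     target = clip(r + gamma * Q_pi(o_2, g_2), -clip_return, 0 or +inf)
# With the sparse HER reward r in {-1, 0} the return is bounded, so clipping keeps the
# critic target inside the achievable range. A small NumPy walk-through of the same
# arithmetic (all numbers below are invented for illustration):
import numpy as np
r = np.array([-1., -1., 0.])            # sparse rewards
next_q = np.array([-30., -60., -2.])    # Q_pi at the next state
gamma, clip_return, clip_pos_returns = 0.98, 50., True
clip_range = (-clip_return, 0. if clip_pos_returns else np.inf)
target = np.clip(r + gamma * next_q, *clip_range)
# target == [-30.4, -50., -1.96]: the middle entry hits the -clip_return floor, and any
# positive value would be clipped to 0 because clip_pos_returns is True.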
class DDPG(object): @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, sample_transitions, gamma, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None,) self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. 
buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key]) for key, val in input_shapes.items()} buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T+1, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) def _random_action(self, n): return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) def _preprocess_og(self, o, ag, g): if self.relative_goals: g_shape = g.shape g = g.reshape(-1, self.dimg) ag = ag.reshape(-1, self.dimg) g = self.subtract_goals(g, ag) g = g.reshape(*g_shape) o = np.clip(o, -self.clip_obs, self.clip_obs) g = np.clip(g, -self.clip_obs, self.clip_obs) return o, g def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): o, g = self._preprocess_og(o, ag, g) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] if compute_Q: vals += [policy.Q_pi_tf] # feed feed = { policy.o_tf: o.reshape(-1, self.dimo), policy.g_tf: g.reshape(-1, self.dimg), policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing u = ret[0] noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise u += noise u = np.clip(u, -self.max_u, self.max_u) u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy if u.shape[0] == 1: u = u[0] u = u.copy() ret[0] = u if len(ret) == 1: return ret[0] else: return ret def store_episode(self, episode_batch, update_stats=True): """ episode_batch: array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, others are of size T """ self.buffer.store_episode(episode_batch) if update_stats: # add transitions to normalizer episode_batch['o_2'] = episode_batch['o'][:, 1:, :] episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :] num_normalizing_transitions = transitions_in_episode_batch(episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) self.g_stats.update(transitions['g']) self.o_stats.recompute_stats() self.g_stats.recompute_stats() def get_current_buffer_size(self): return self.buffer.get_current_size() def _sync_optimizers(self): self.Q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! 
critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf ]) return critic_loss, actor_loss, Q_grad, pi_grad def _update(self, Q_grad, pi_grad): self.Q_adam.update(Q_grad, self.Q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): transitions = self.buffer.sample(self.batch_size) o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] ag, ag_2 = transitions['ag'], transitions['ag_2'] transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] return transitions_batch def stage_batch(self, batch=None): if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): if stage: self.stage_batch() critic_loss, actor_loss, Q_grad, pi_grad = self._grads() self._update(Q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): self.sess.run(self.update_target_net_op) def clear_buffer(self): self.buffer.clear_buffer() def _vars(self, scope): res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.scope + '/' + scope) assert len(res) > 0 return res def _global_vars(self, scope): res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope + '/' + scope) return res def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. 
if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net() def logs(self, prefix=''): logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))] logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))] if prefix is not '' and not prefix.endswith('/'): return [(prefix + '/' + key, val) for key, val in logs] else: return logs def __getstate__(self): """Our policies can be loaded from pkl, but after unpickling you cannot continue training. """ excluded_subnames = ['_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic'] state = {k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames])} state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name]) return state def __setstate__(self, state): if 'sample_transitions' not in state: # We don't need this for playing the policy. state['sample_transitions'] = None self.__init__(**state) # set up stats (they are overwritten in __init__) for k, v in state.items(): if k[-6:] == '_stats': self.__dict__[k] = v # load TF variables vars = [x for x in self._global_vars('') if 'buffer' not in x.name] assert(len(vars) == len(state["tf"])) node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] self.sess.run(node)
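# --- Illustrative sketch: how the staging pipeline is laid out and driven ---
# For a toy input_dims such as {'o': 10, 'g': 3, 'u': 4, 'info_is_success': 1}
# (purely illustrative values), DDPG.__init__ above produces, in sorted key order and
# skipping 'info_*' entries:
#     stage_shapes = OrderedDict([('g', (None, 3)), ('o', (None, 10)), ('u', (None, 4)),
#                                 ('o_2', (None, 10)), ('g_2', (None, 3)), ('r', (None,))])
# stage_batch() zips one ndarray per key, in this same order, onto buffer_ph_tf, so a single
# stage_op run pushes the whole transition batch into the StagingArea and _grads() can read
# it without a feed_dict. A condensed view of one training step (`agent` stands for a
# constructed DDPG instance; the name is hypothetical):
batch = agent.sample_batch()                                          # one ndarray per stage_shapes key
agent.sess.run(agent.stage_op,
               feed_dict=dict(zip(agent.buffer_ph_tf, batch)))        # stage the batch on device
critic_loss, actor_loss, Q_grad, pi_grad = agent._grads()             # consume the staged batch
agent._update(Q_grad, pi_grad)                                        # MpiAdam applies the flat gradients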
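# --- Illustrative sketch: the action post-processing in get_actions ---
# get_actions() perturbs the deterministic actor output in two stages: Gaussian noise scaled
# by noise_eps * max_u followed by clipping, then an epsilon-greedy swap that, with probability
# random_eps per sample, replaces the whole action with a uniform random one. A standalone
# NumPy version of that logic (function name and arguments are illustrative):
def postprocess_actions(u, max_u, noise_eps, random_eps):
    u = u + noise_eps * max_u * np.random.randn(*u.shape)         # Gaussian exploration noise
    u = np.clip(u, -max_u, max_u)
    random_u = np.random.uniform(-max_u, max_u, size=u.shape)     # candidate random actions
    mask = np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1)
    return u + mask * (random_u - u)                               # eps-greedy: replace entire action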