Code Example #1
    def _create_network(self,
                        pretrain_weights,
                        mi_prioritization,
                        reuse=False):
        if self.sac:
            logger.info("Creating a SAC agent with action space %d x %s..." %
                        (self.dimu, self.max_u))
        else:
            logger.info("Creating a DDPG agent with action space %d x %s..." %
                        (self.dimu, self.max_u))

        self.sess = tf.get_default_session()
        if self.sess is None:
            self.sess = tf.InteractiveSession()

        # running averages
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([
            (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())
        ])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])
        batch_tf['w'] = tf.reshape(batch_tf['w'], [-1, 1])
        batch_tf['m'] = tf.reshape(batch_tf['m'], [-1, 1])
        batch_tf['s'] = tf.reshape(batch_tf['s'], [-1, 1])

        self.o_tau_tf = tf.placeholder(tf.float32,
                                       shape=(None, None, self.dimo))

        # networks
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf,
                                                 net_type='main',
                                                 **self.__dict__)
            vs.reuse_variables()
        with tf.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(target_batch_tf,
                                                   net_type='target',
                                                   **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # intrinsic reward (ir) network for mutual information
        with tf.variable_scope('ir') as vs:
            if reuse:
                vs.reuse_variables()
            self.main_ir = self.create_discriminator(batch_tf,
                                                     net_type='ir',
                                                     **self.__dict__)
            vs.reuse_variables()

        # loss functions

        mi_grads_tf = tf.gradients(tf.reduce_mean(self.main_ir.mi_tf),
                                   self._vars('ir/state_mi'))
        assert len(self._vars('ir/state_mi')) == len(mi_grads_tf)
        self.mi_grads_vars_tf = zip(mi_grads_tf, self._vars('ir/state_mi'))
        self.mi_grad_tf = flatten_grads(grads=mi_grads_tf,
                                        var_list=self._vars('ir/state_mi'))
        self.mi_adam = MpiAdam(self._vars('ir/state_mi'),
                               scale_grad_by_procs=False)

        sk_grads_tf = tf.gradients(tf.reduce_mean(self.main_ir.sk_tf),
                                   self._vars('ir/skill_ds'))
        assert len(self._vars('ir/skill_ds')) == len(sk_grads_tf)
        self.sk_grads_vars_tf = zip(sk_grads_tf, self._vars('ir/skill_ds'))
        self.sk_grad_tf = flatten_grads(grads=sk_grads_tf,
                                        var_list=self._vars('ir/skill_ds'))
        self.sk_adam = MpiAdam(self._vars('ir/skill_ds'),
                               scale_grad_by_procs=False)

        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return,
                      self.clip_return if self.clip_pos_returns else np.inf)

        self.e_w_tf = batch_tf['e_w']

        if not self.sac:
            self.main.neg_logp_pi_tf = tf.zeros(1)

        target_tf = tf.clip_by_value(
            self.r_scale * batch_tf['r'] * batch_tf['r_w'] +
            (tf.clip_by_value(self.mi_r_scale * batch_tf['m'], *(0, 1)) -
             (1 if not self.mi_r_scale == 0 else 0)) * batch_tf['m_w'] +
            (tf.clip_by_value(self.sk_r_scale * batch_tf['s'], *(-1, 0))) *
            batch_tf['s_w'] +
            (tf.clip_by_value(self.et_r_scale * self.main.neg_logp_pi_tf,
                              *(-1, 0))) * self.e_w_tf +
            self.gamma * target_Q_pi_tf, *clip_range)

        self.td_error_tf = tf.stop_gradient(target_tf) - self.main.Q_tf
        self.errors_tf = tf.square(self.td_error_tf)
        self.errors_tf = tf.reduce_mean(batch_tf['w'] * self.errors_tf)
        self.Q_loss_tf = tf.reduce_mean(self.errors_tf)

        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))
        Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf,
                                       var_list=self._vars('main/Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf,
                                        var_list=self._vars('main/pi'))

        # optimizers
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'),
                               scale_grad_by_procs=False)

        self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        self.target_vars = self._vars('target/Q') + self._vars('target/pi')

        # polyak averaging
        self.stats_vars = self._global_vars('o_stats') + self._global_vars(
            'g_stats')
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        if pretrain_weights:
            load_weight(self.sess, pretrain_weights, ['state_mi'])
            if self.finetune_pi:
                load_weight(self.sess, pretrain_weights, ['main'])

        self._sync_optimizers()
        if pretrain_weights and self.finetune_pi:
            load_weight(self.sess, pretrain_weights, ['target'])
        else:
            self._init_target_net()
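The example above only builds the graph: the flat gradient tensors (Q_grad_tf, pi_grad_tf, mi_grad_tf, sk_grad_tf) and their MpiAdam optimizers are consumed later in the training step. Below is a minimal sketch of that step, assuming the baselines-style MpiAdam.update(flat_grad, stepsize) API; the method name _update and the learning-rate attributes Q_lr / pi_lr are illustrative assumptions, not taken from the example.

    def _update(self, Q_grad, pi_grad):
        # Q_grad / pi_grad: flat NumPy gradients obtained by running
        # self.Q_grad_tf / self.pi_grad_tf in the session.
        self.Q_adam.update(Q_grad, self.Q_lr)     # MPI-averaged Adam step for the critic
        self.pi_adam.update(pi_grad, self.pi_lr)  # MPI-averaged Adam step for the actor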
Code Example #2
    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." %
                    (self.dimu, self.max_u))

        self.sess = tf.get_default_session()
        if self.sess is None:
            self.sess = tf.InteractiveSession()

        # running averages
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([
            (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())
        ])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])
        batch_tf['w'] = tf.reshape(batch_tf['w'], [-1, 1])

        # networks
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf,
                                                 net_type='main',
                                                 **self.__dict__)
            vs.reuse_variables()
        with tf.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(target_batch_tf,
                                                   net_type='target',
                                                   **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else np.inf)
        target_tf = tf.clip_by_value(
            batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)

        self.td_error_tf = tf.stop_gradient(target_tf) - self.main.Q_tf
        self.errors_tf = tf.square(self.td_error_tf)
        self.errors_tf = tf.reduce_mean(batch_tf['w'] * self.errors_tf)
        self.Q_loss_tf = tf.reduce_mean(self.errors_tf)
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))
        Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf,
                                       var_list=self._vars('main/Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf,
                                        var_list=self._vars('main/pi'))

        # optimizers
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'),
                               scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars(
            'g_stats')
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()
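Both the init and update ops above implement Polyak averaging of the target network toward the main network. A standalone NumPy sketch of the same update rule (names and numbers are illustrative only):

    import numpy as np

    def polyak_update(target_params, main_params, polyak=0.95):
        # target <- polyak * target + (1 - polyak) * main, element-wise per variable
        return [polyak * t + (1.0 - polyak) * m
                for t, m in zip(target_params, main_params)]

    target = [np.zeros(3)]
    main = [np.ones(3)]
    target = polyak_update(target, main)  # -> [array([0.05, 0.05, 0.05])]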
Code Example #3
File: ddpg.py  Project: beyretb/baselines-Dot-To-Dot
    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))

        self.sess = tf.get_default_session()
        if self.sess is None:
            self.sess = tf.InteractiveSession()

        # running averages
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([(key, batch[i])
                                for i, key in enumerate(self.stage_shapes.keys())])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

        # networks
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
            vs.reuse_variables()
        with tf.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(
                target_batch_tf, net_type='target', **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # loss functions
        # self.XX.pi_tf is the action policy we'll use for exploration (TO CONFIRM)
        # self.XX.Q_pi_tf is the Q network used to train this policy
        # self.XX.Q_tf

        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
        # target y_i = r + gamma*Q part of the Bellman equation (with returns clipped if necessary):
        target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
        # loss function for Q_tf where we exclude target_tf from the gradient computation:
        self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))

        # loss function for the action policy is that of the main Q_pi network:
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        # add L2 regularization term from the policy itself:
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))

        # define the gradients of the Q_loss and pi_loss wrt to their variables respectively
        Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)

        # zip the gradients together with their respective variables
        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))

        # flattened gradients and variables
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi'))

        # optimizers (using MPI for parallel updates of the network (TO CONFIRM))
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False)

        # polyak averaging used for the update of the target networks in both pi and Q nets
        self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
        # operation to initialize the target nets at the main nets' values
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars)))
        # operation to update the target nets from the main nets using polyak averaging
        self.update_target_net_op = list(
            map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()  # CHECK WHAT THIS DOES ????
        self._init_target_net()
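The comments above describe the clipped one-step Bellman target. A standalone NumPy sketch of that computation, with illustrative values that are not taken from the example:

    import numpy as np

    r = np.array([[-1.0], [0.0]])             # sparse HER-style rewards
    target_Q_pi = np.array([[-3.0], [-0.5]])  # target critic at the target policy action
    gamma, clip_return = 0.98, 50.0

    y = np.clip(r + gamma * target_Q_pi, -clip_return, 0.0)  # y_i = clip(r + gamma * Q')
    Q_main = np.array([[-2.0], [-1.0]])                      # main critic Q(s, a)
    Q_loss = np.mean(np.square(y - Q_main))                  # stop_gradient(y) in the TF version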
Code Example #4
    def _create_network(self, reuse=False): ## num_demo added -2
        logger.info("Debug : Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))
        self.sess = tf_util.get_session()
        # self.num_demo = num_demo

        # running averages
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get() ## just pulls the staged batch out
        batch_tf = OrderedDict([(key, batch[i])
                                for i, key in enumerate(self.stage_shapes.keys())])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

        #choose only the demo buffer samples
        mask = np.concatenate((np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis = 0)

        # networks
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
            vs.reuse_variables()
            print("tf.variable_scope(main) = {}".format(tf.variable_scope('target1'))) #-1

        with tf.variable_scope('target1') as vs:
            if reuse:
                vs.reuse_variables()
            target1_batch_tf = batch_tf.copy()
            target1_batch_tf['o'] = batch_tf['o_2']
            target1_batch_tf['g'] = batch_tf['g_2']
            self.target1 = self.create_actor_critic(
                target1_batch_tf, net_type='target1', **self.__dict__)
            vs.reuse_variables()
            print("tf.variable_scope(target1) = {}".format(tf.variable_scope('target1')))
            # print("batch= {}".format(target1_batch_tf))
            # print(type('target')) #<class 'baselines.her.actor_critic.ActorCritic'>
        assert len(self._vars("main")) == len(self._vars("target1"))

        with tf.variable_scope('target2') as vs:
            if reuse:
                vs.reuse_variables()
            target2_batch_tf = batch_tf.copy()
            target2_batch_tf['o'] = batch_tf['o_2']
            target2_batch_tf['g'] = batch_tf['g_2']
            self.target2 = self.create_actor_critic(
                target2_batch_tf, net_type='target2', **self.__dict__)
            vs.reuse_variables()
            print("tf.variable_scope(target2) = {}".format(tf.variable_scope('target2')))
            print("batch= {}".format(target2_batch_tf))
        assert len(self._vars("main")) == len(self._vars("target2"))

        for nd in range(self.num_demo):       

            ##A.R
            ## Compute the target Q value; use the min of Q1 and Q2.

            target1_Q_pi_tf = self.target1.Q_pi_tf ##A.R policy training
            target2_Q_pi_tf = self.target2.Q_pi_tf ##A.R
            # target_Q_pi_tf = tf.minimum(target1_Q_pi_tf, target2_Q_pi_tf)
            # target1_Q_tf = self.target1.Q_tf ##A.R policy training
            # target2_Q_tf = self.target2.Q_tf ##A.R
            # print('target1={}/////target2={}'.format(target1_Q_tf,target2_Q_tf))
            target_Q_pi_tf = tf.minimum(target1_Q_pi_tf, target2_Q_pi_tf)
            # target_Q_tf = tf.minimum(target1_Q_tf, target2_Q_tf) ## alternative code

            # print("{}///{}///{}".format(target1_Q_pi_tf,target2_Q_pi_tf,tf.minimum(target1_Q_pi_tf, target2_Q_pi_tf)))
            ####
            # Code from the original TD3 missing here: target_Q = reward + (done * discount * target_Q).detach() (L109) -> done and clipped at L428

            # loss functions
            # for policy training, Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
            # target_Q_pi_tf = self.target.Q_pi_tf #original code
            clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
            target_Q_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
            # target_Q_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_tf, *clip_range) ## alternative code
            # self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))
            ##
            # current_Q1, current_Q2 = self.critic(state, action)

            # for critic training, Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True)
            # target_Q_pi_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_tf, *clip_range) #original code
            
            # self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) # critic training
            
            ## Get current Q estimates, for critic Q
            current_Q1 = self.main.Q_tf ##A.R
            current_Q2 = self.main.Q_tf
            # print("Q1={}".format(current_Q1))

            ## Compute critic loss
            ## Torch => critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q) 
            self.Q_loss_tf = tf.losses.mean_squared_error(current_Q1, target_Q_tf)+ tf.losses.mean_squared_error(current_Q2,target_Q_tf)
            # self.Q_loss_tf = tf.losses.mean_squared_error(current_Q1, target_Q_tf)+ tf.losses.mean_squared_error(current_Q2,target_Q_tf)
            # print("critic_loss ={}".format(self.Q_loss_tf))

            Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
            assert len(self._vars('main/Q')) == len(Q_grads_tf)

            ## Optimize the critic with the Adam optimizer
            self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
            assert len(self._vars('main/Q')) == len(Q_grads_tf)
            self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
            self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q'))

            # ## Delayed policy updates
            if nd % self.td3_policy_freq == 0:
                # print("num_demo = {}".format(nd))
                target1_Q_pi_tf = self.target1.Q_pi_tf ##A.R policy training
                target2_Q_pi_tf = self.target2.Q_pi_tf ##A.R
                tf.print(target1_Q_pi_tf, [target1_Q_pi_tf])
                tf.print(target2_Q_pi_tf, [target2_Q_pi_tf])
                # print(target2_Q_pi_tf)
                target_Q_pi_tf = tf.minimum(target1_Q_pi_tf, target2_Q_pi_tf)

                # target_Q_pi_tf = self.target.Q_pi_tf
                clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
                target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
                self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))
                # Compute actor loss
                if self.bc_loss ==1 and self.q_filter == 1 : # train with demonstrations and use bc_loss and q_filter both
                    maskMain = tf.reshape(tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1]) #where is the demonstrator action better than actor action according to the critic? choose those samples only
                    #define the cloning loss on the actor's actions only on the samples which adhere to the above masks
                    self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0) - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0)))
                    self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) #primary loss scaled by its respective weight prm_loss_weight
                    self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) #L2 loss on action values scaled by the same weight prm_loss_weight
                    self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxiliary loss scaled by its weight aux_loss_weight

                elif self.bc_loss == 1 and self.q_filter == 0: # train with demonstrations without q_filter
                    self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask((self.main.pi_tf), mask) - tf.boolean_mask((batch_tf['u']), mask)))
                    self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf)
                    self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
                    self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf

                else: # If not training with demonstrations
                    self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
                    self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
                # self.pi_loss_tf = -tf.reduce_mean(self.main.pi_tf) ## what about target1?
                # self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
                # actor_loss = -tf.reduce_mean(self.main.Q_tf)
                # actor_loss += self.action_l2 * tf.reduce_mean(tf.square(self.main.Q_tf / self.max_u))

                pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
                assert len(self._vars('main/pi')) == len(pi_grads_tf)

                # Optimize the actor 
                # Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
                self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False)
                assert len(self._vars('main/pi')) == len(pi_grads_tf)
                self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
                self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi'))

                # Update the frozen target models
            ## torch code
                # for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                #     target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
                

                self.main_vars = self._vars('main/Q') + self._vars('main/pi')
                self.target1_vars = self._vars('target1/Q') + self._vars('target1/pi') ##A.R
                self.target2_vars = self._vars('target2/Q') + self._vars('target2/pi') ##A.R
                if target_Q_pi_tf == target1_Q_pi_tf:
                    target_vars = self.target1_vars
                else:
                    target_vars = self.target2_vars
                # self.target_vars = self._vars('target/Q') + self._vars('target/pi') #original
                self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
                self.init_target1_net_op = list(
                    map(lambda v: v[0].assign(v[1]), zip(self.target1_vars, self.main_vars)))
                self.init_target2_net_op = list(
                    map(lambda v: v[0].assign(v[1]), zip(self.target2_vars, self.main_vars)))

                self.update_target_net_op = list(
                    map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(target_vars, self.main_vars)))
                self.update_target1_net_op = list(
                    map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(target_vars, self.main_vars)))
                self.update_target2_net_op = list(
                    map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(target_vars, self.main_vars)))


                tf.variables_initializer(self._global_vars('')).run()
                self._sync_optimizers()
                self._init_target_net()
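The core TD3 ingredient the example above is reaching for is the clipped double-Q target: take the element-wise minimum of the two target critics before forming the Bellman target. A standalone NumPy sketch with illustrative values:

    import numpy as np

    r = np.array([[-1.0], [0.0]])
    target_Q1 = np.array([[-3.0], [-0.2]])   # target critic 1 at the target policy action
    target_Q2 = np.array([[-2.5], [-0.6]])   # target critic 2 at the target policy action
    gamma, clip_return = 0.98, 50.0

    target_Q = np.minimum(target_Q1, target_Q2)           # clipped double-Q estimate
    y = np.clip(r + gamma * target_Q, -clip_return, 0.0)  # Bellman target, clipped as above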
Code Example #5
    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." %
                    (self.dimu, self.max_u))
        self.sess = tf_util.get_session()

        # running averages
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([
            (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())
        ])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

        #choose only the demo buffer samples
        mask = np.concatenate(
            (np.zeros(self.batch_size - self.demo_batch_size),
             np.ones(self.demo_batch_size)),
            axis=0)

        # networks
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf,
                                                 net_type='main',
                                                 **self.__dict__)
            vs.reuse_variables()
        with tf.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(target_batch_tf,
                                                   net_type='target',
                                                   **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else np.inf)
        target_tf = tf.clip_by_value(
            batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
        self.Q_loss_tf = tf.reduce_mean(
            tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))

        if self.bc_loss == 1 and self.q_filter == 1:  # train with demonstrations and use bc_loss and q_filter both
            maskMain = tf.reshape(
                tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf,
                                mask), [-1]
            )  #where is the demonstrator action better than actor action according to the critic? choose those samples only
            #define the cloning loss on the actor's actions only on the samples which adhere to the above masks
            self.cloning_loss_tf = tf.reduce_sum(
                tf.square(
                    tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask),
                                    maskMain,
                                    axis=0) -
                    tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask),
                                    maskMain,
                                    axis=0)))
            self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(
                self.main.Q_pi_tf
            )  # primary loss scaled by its respective weight prm_loss_weight
            self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(
                tf.square(self.main.pi_tf / self.max_u)
            )  #L2 loss on action values scaled by the same weight prm_loss_weight
            self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf  # adding the cloning loss to the actor loss as an auxiliary loss scaled by its weight aux_loss_weight

        elif self.bc_loss == 1 and self.q_filter == 0:  # train with demonstrations without q_filter
            self.cloning_loss_tf = tf.reduce_sum(
                tf.square(
                    tf.boolean_mask((self.main.pi_tf), mask) -
                    tf.boolean_mask((batch_tf['u']), mask)))
            self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(
                self.main.Q_pi_tf)
            self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(
                tf.square(self.main.pi_tf / self.max_u))
            self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf

        else:  # If not training with demonstrations
            self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
            self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
                tf.square(self.main.pi_tf / self.max_u))

        Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf,
                                       var_list=self._vars('main/Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf,
                                        var_list=self._vars('main/pi'))

        # optimizers
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'),
                               scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars(
            'g_stats')
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()
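The behaviour-cloning branch above relies on two masks: a fixed mask selecting the demo samples at the end of the batch, and (when q_filter is on) a Q-filter keeping only those demo samples where the critic rates the demonstrator's action above the policy's. A standalone NumPy sketch of that selection, with illustrative numbers:

    import numpy as np

    batch_size, demo_batch_size = 6, 2
    demo_mask = np.concatenate((np.zeros(batch_size - demo_batch_size),
                                np.ones(demo_batch_size))).astype(bool)

    Q_demo_action = np.array([0.1, 0.3, 0.2, 0.4, 0.9, 0.2])    # Q(s, u) for the buffer action
    Q_policy_action = np.array([0.2, 0.1, 0.3, 0.5, 0.4, 0.6])  # Q(s, pi(s))
    q_filter = (Q_demo_action > Q_policy_action)[demo_mask]     # applied to demo samples only

    pi = np.random.randn(batch_size, 4)   # policy actions
    u = np.random.randn(batch_size, 4)    # buffer (demonstrator) actions
    cloning_loss = np.sum(np.square(pi[demo_mask][q_filter] - u[demo_mask][q_filter]))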
Code Example #6
File: ddpg.py  Project: RyanRizzo96/RL_baselines
    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." %
                    (self.dimu, self.action_scale))
        self.sess = tf_util.get_session()

        # running averages
        with tf.variable_scope('o_stats') as variable_scope:
            if reuse:
                variable_scope.reuse_variables()
            self.o_stats = Normalizer(self.dimo,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('g_stats') as variable_scope:
            if reuse:
                variable_scope.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([
            (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())
        ])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

        # choose only the demo buffer samples
        mask = np.concatenate(
            (np.zeros(self.batch_size - self.demo_batch_size),
             np.ones(self.demo_batch_size)),
            axis=0)

        # networks
        with tf.variable_scope('main') as variable_scope:
            if reuse:
                variable_scope.reuse_variables()

            # Create actor critic network
            self.main = self.create_actor_critic(batch_tf,
                                                 net_type='main',
                                                 **self.__dict__)
            variable_scope.reuse_variables()

        with tf.variable_scope('target') as variable_scope:
            if reuse:
                variable_scope.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(target_batch_tf,
                                                   net_type='target',
                                                   **self.__dict__)
            variable_scope.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # loss functions
        target_critic_actor_tf = self.target.critic_with_actor_tf
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else np.inf)

        target_tf = tf.clip_by_value(
            batch_tf['r'] + self.gamma * target_critic_actor_tf, *clip_range)

        # MSE of target_tf - critic_tf. This is the TD Learning step
        self.critic_loss_tf = tf.reduce_mean(
            tf.square(tf.stop_gradient(target_tf) - self.main.critic_tf))

        # Actor loss: maximise the critic's evaluation of the policy action (plus the L2 action penalty below)
        self.actor_loss_tf = -tf.reduce_mean(self.main.critic_with_actor_tf)
        self.actor_loss_tf += self.action_l2 * tf.reduce_mean(
            tf.square(self.main.actor_tf / self.action_scale))

        # Constructs symbolic derivatives of the sum of critic_loss_tf w.r.t. _vars('main/Q')
        critic_grads_tf = tf.gradients(self.critic_loss_tf,
                                       self._vars('main/Q'))
        actor_grads_tf = tf.gradients(self.actor_loss_tf,
                                      self._vars('main/pi'))
        assert len(self._vars('main/Q')) == len(critic_grads_tf)
        assert len(self._vars('main/pi')) == len(actor_grads_tf)
        self.critic_grads_vars_tf = zip(critic_grads_tf, self._vars('main/Q'))
        self.actor_grads_vars_tf = zip(actor_grads_tf, self._vars('main/pi'))

        # Flattens variables and their gradients.
        self.critic_grads = flatten_grads(grads=critic_grads_tf,
                                          var_list=self._vars('main/Q'))
        self.actor_grads = flatten_grads(grads=actor_grads_tf,
                                         var_list=self._vars('main/pi'))

        # optimizers
        self.critic_optimiser = MpiAdam(self._vars('main/Q'),
                                        scale_grad_by_procs=False)
        self.actor_optimiser = MpiAdam(self._vars('main/pi'),
                                       scale_grad_by_procs=False)

        # polyak averaging used to update target network
        self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars(
            'g_stats')

        # list( map( lambda( assign() ), zip()))
        self.init_target_net_op = list(
            map(  # Apply lambda to each item in the zipped list
                lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))

        # Polyak-Ruppert averaging where most recent iterations are weighted more than past iterations.
        self.update_target_net_op = list(
            map(  # Apply lambda to each item in the zipped list
                lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) *
                                      v[1]),  # polyak averaging
                zip(self.target_vars,
                    self.main_vars))  # [(target_vars, main_vars), (), ...]
        )

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()
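For reference, the actor loss defined above (maximise the critic's evaluation of the policy action, plus an L2 penalty on the scaled action) in standalone NumPy form, with illustrative values:

    import numpy as np

    critic_with_actor = np.array([[-2.0], [-0.5]])  # Q(s, pi(s)) from the main critic
    actions = np.array([[0.4, -0.2], [1.0, 0.8]])   # pi(s)
    action_scale, action_l2 = 1.0, 1.0

    actor_loss = (-np.mean(critic_with_actor)
                  + action_l2 * np.mean(np.square(actions / action_scale)))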
Code Example #7
File: ddpg.py  Project: mk37972/gym_adjustments
    def _create_network(self, reuse=False):
        #        logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))
        self.sess = tf_util.get_session()

        # running averages
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([
            (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())
        ])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])
        #choose only the demo buffer samples
        mask = np.concatenate(
            (np.zeros(self.batch_size - self.demo_batch_size),
             np.ones(self.demo_batch_size)),
            axis=0)

        # networks
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf,
                                                 net_type='main',
                                                 **self.__dict__)
            vs.reuse_variables()
        with tf.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(target_batch_tf,
                                                   net_type='target',
                                                   **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else np.inf)
        target_tf = tf.clip_by_value(
            batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
        self.Q_loss_tf = tf.reduce_mean(
            tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))

        if self.bc_loss == 1 and self.q_filter == 1:  # train with demonstrations and use bc_loss and q_filter both
            self.maskMain = tf.reshape(
                tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf,
                                mask), [-1]
            )  #where is the demonstrator action better than actor action according to the critic? choose those samples only
            #define the cloning loss on the actor's actions only on the samples which adhere to the above masks
            self.cloning_loss_tf = tf.reduce_sum(
                tf.square(
                    tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask),
                                    self.maskMain,
                                    axis=0) -
                    tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask),
                                    self.maskMain,
                                    axis=0)))
            self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(
                self.main.Q_pi_tf
            )  # primary loss scaled by its respective weight prm_loss_weight
            self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(
                tf.square(self.main.pi_tf / self.max_u)
            )  #L2 loss on action values scaled by the same weight prm_loss_weight
            self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf  #* self.w_loss # adding the cloning loss to the actor loss as an auxiliary loss scaled by its weight aux_loss_weight

        elif self.bc_loss == 1 and self.q_filter == 0:  # train with demonstrations without q_filter
            self.maskMain = tf.constant([0.0])
            self.cloning_loss_tf = tf.reduce_sum(
                tf.square(
                    tf.boolean_mask((self.main.pi_tf), mask) -
                    tf.boolean_mask((batch_tf['u']), mask)))
            self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(
                self.main.Q_pi_tf)
            self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(
                tf.square(self.main.pi_tf / self.max_u))
            self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf

        else:  # If not training with demonstrations
            self.maskMain = tf.constant([0.0])
            self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
            self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
                tf.square(self.main.pi_tf / self.max_u))

        Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))

        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)

        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))

        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf,
                                       var_list=self._vars('main/Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf,
                                        var_list=self._vars('main/pi'))

        # optimizers
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'),
                               scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars(
            'g_stats')
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()

        # # load weights from pretrained model
        # weightData = np.load('./hand_dapg/dapg/policies/saved_weights.npz', allow_pickle=True)
        # kernel1 = weightData['kernel1']
        # kernel2 = weightData['kernel2']
        # kernel3 = weightData['kernel3']
        # bias1 = weightData['bias1']
        # bias2 = weightData['bias2']
        # bias3 = weightData['bias3']
        # o_mean = weightData['o_mean']
        # o_std = weightData['o_std']

        # # print([n.name for n in tf.get_default_graph().as_graph_def().node])
        # k1 = self.sess.graph.get_tensor_by_name('ddpg/main/pi/_0/kernel:0')
        # b1 = self.sess.graph.get_tensor_by_name('ddpg/main/pi/_0/bias:0')
        # k2 = self.sess.graph.get_tensor_by_name('ddpg/main/pi/_1/kernel:0')
        # b2 = self.sess.graph.get_tensor_by_name('ddpg/main/pi/_1/bias:0')
        # k3 = self.sess.graph.get_tensor_by_name('ddpg/main/pi/_2/kernel:0')
        # b3 = self.sess.graph.get_tensor_by_name('ddpg/main/pi/_2/bias:0')
        # o_m = self.sess.graph.get_tensor_by_name('ddpg/o_stats/mean:0')
        # o_s = self.sess.graph.get_tensor_by_name('ddpg/o_stats/std:0')
        # o_sumsq = self.sess.graph.get_tensor_by_name('ddpg/o_stats/sumsq:0')
        # o_sum = self.sess.graph.get_tensor_by_name('ddpg/o_stats/sum:0')
        # o_count = self.sess.graph.get_tensor_by_name('ddpg/o_stats/count:0')

        # # feed the weights and biases, normalization stats
        # self.sess.run(tf.assign(k1,tf.concat([tf.transpose(kernel1, perm=[1,0]), tf.zeros(shape=(9,32))],axis=0)))
        # self.sess.run(tf.assign(k2,tf.transpose(kernel2, perm=[1,0])))
        # self.sess.run(tf.assign(k3,tf.transpose(kernel3, perm=[1,0])))
        # self.sess.run(tf.assign(b1,bias1))
        # self.sess.run(tf.assign(b2,bias2))
        # self.sess.run(tf.assign(b3,bias3))
        # self.sess.run(tf.assign(o_m,o_mean))
        # self.sess.run(tf.assign(o_s,o_std))
        # self.sess.run(tf.assign(o_sum,o_mean*1e5))
        # self.sess.run(tf.assign(o_sumsq,np.square(o_mean)*1e5))
        # self.sess.run(tf.assign(o_count,[1e5]))

        self._sync_optimizers()
        self._init_target_net()
Code Example #8
File: value_ensemble_v1.py  Project: zzyunzhi/vds
    def _create_double_network(self, reuse=False):
        # logger.info("Creating a q function ensemble with action space %d x %s..." % (self.dimu, self.max_u))
        self.sess = tf_util.get_session()

        # running averages, separate from alg (this is within a different scope)
        # assume reuse is False
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)

        self.V_loss_tf = [None] * self.size_ensemble
        self.V_fun = [None] * self.size_ensemble
        self.V_target_fun = [None] * self.size_ensemble
        self.V_grads_vars_tf = [None] * self.size_ensemble
        self.V_grad_tf = [None] * self.size_ensemble
        self.V_adam = [None] * self.size_ensemble

        self.init_target_net_op = [None] * self.size_ensemble
        self.update_target_net_op = [None] * self.size_ensemble

        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else self.clip_return)

        for e in range(self.size_ensemble):
            # mini-batch sampling
            batch = self.staging_tf[e].get()
            batch_tf = OrderedDict([
                (key, batch[i])
                for i, key in enumerate(self.stage_shapes.keys())
            ])
            batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

            # networks (no target network for now)
            with tf.variable_scope(f've_{e}') as vs:
                if reuse:
                    vs.reuse_variables()
                v_function = self.create_v_function(batch_tf, **self.__dict__)
                vs.reuse_variables()

            with tf.variable_scope(f've_{e}_target') as vs:
                if reuse:
                    vs.reuse_variables()
                target_batch_tf = batch_tf.copy()
                target_batch_tf['o'] = batch_tf['o_2']
                target_batch_tf['g'] = batch_tf['g_2']
                target_batch_tf['u'] = batch_tf['u_2']
                v_target_function = self.create_v_function(
                    target_batch_tf, **self.__dict__)
                vs.reuse_variables()

            # loss functions
            target_tf = tf.clip_by_value(
                batch_tf['r'] + self.gamma * v_target_function.V_tf,
                *clip_range)
            V_loss_tf = tf.reduce_mean(
                tf.square(tf.stop_gradient(target_tf) - v_function.V_tf))

            V_scope = f've_{e}/V'
            V_grads_tf = tf.gradients(V_loss_tf, self._vars(V_scope))
            assert len(self._vars(V_scope)) == len(V_grads_tf)
            V_grads_vars_tf = zip(V_grads_tf, self._vars(V_scope))
            V_grad_tf = flatten_grads(grads=V_grads_tf,
                                      var_list=self._vars(V_scope))

            # optimizers
            V_adam = MpiAdam(self._vars(V_scope), scale_grad_by_procs=False)

            # store in attribute lists
            self.V_loss_tf[e] = V_loss_tf
            self.V_fun[e] = v_function
            self.V_target_fun[e] = v_target_function
            self.V_grads_vars_tf[e] = V_grads_vars_tf
            self.V_grad_tf[e] = V_grad_tf
            self.V_adam[e] = V_adam

        # polyak averaging
        main_vars = sum(
            [self._vars(f've_{e}/V') for e in range(self.size_ensemble)], [])
        target_vars = sum([
            self._vars(f've_{e}_target/V') for e in range(self.size_ensemble)
        ], [])
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]), zip(target_vars, main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(target_vars, main_vars)))

        assert len(main_vars) == len(target_vars)

        # report loss as the average of value function loss over the ensemble
        # self.V_loss_tf = tf.reduce_mean(self.V_loss_tf)

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()
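The ensemble version above keeps one flat gradient tensor and one MpiAdam per member. A minimal sketch, assuming the baselines-style MpiAdam.update(flat_grad, stepsize) API, of how those per-member lists might be consumed in a training step; the method name _update_ensemble and the learning-rate attribute V_lr are illustrative assumptions, not taken from the example.

    def _update_ensemble(self):
        flat_grads = self.sess.run(self.V_grad_tf)   # one flat NumPy gradient per ensemble member
        for adam, grad in zip(self.V_adam, flat_grads):
            adam.update(grad, self.V_lr)             # independent Adam step for each member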
Code Example #9
    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))

        self.sess = tf.get_default_session()
        if self.sess is None:
            self.sess = tf.InteractiveSession()

        # running averages
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([(key, batch[i])
                                for i, key in enumerate(self.stage_shapes.keys())])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

        ########### Getting the bias terms - Ameet
        bias = self.staging_tf_new.get()
        bias_tf = OrderedDict([(key, bias[i])
                                for i, key in enumerate(self.stage_shapes_new.keys())])
        bias_tf['bias'] = tf.reshape(bias_tf['bias'], [-1, 1])
        #######################################

        # Create main and target networks, each will have a pi_tf, Q_tf and Q_pi_tf
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
            vs.reuse_variables()
        with tf.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(
                target_batch_tf, net_type='target', **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
        target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
        ############## Added for bias - Ameet
        error = (tf.stop_gradient(target_tf) - self.main.Q_tf) * bias_tf['bias']
        self.Q_loss_tf = tf.reduce_mean(tf.square(error))
        # self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf * bias_tf['bias'])
        # Note that the following statement does not include bias because of the remark in the IEEE paper
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        ##############
        # L2 regularization: penalize large actions proposed by the current policy
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
        Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
        # Shape info (for reference): len(Q_grads_tf) == 8, Q_grads_tf[0].shape == (17, 256)
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi'))

        # optimizers
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False)

        # polyak averaging
        # 'main/Q' names a variable scope; _vars resolves it to the matching trainable variables
        self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
        # Update the networks
        # target net is updated by using polyak averaging
        # target net is initialized by just copying the main net
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()
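
Note: the critic loss in this example re-weights the TD error with the staged bias term before squaring it. An illustrative NumPy sketch of that computation, with made-up array names (r, q_next, q_pred, bias) and assuming clip_pos_returns=True:

import numpy as np

def weighted_q_loss(r, q_next, q_pred, bias, gamma=0.98, clip_return=50.0):
    # clipped one-step Bellman target, mirroring tf.clip_by_value above
    target = np.clip(r + gamma * q_next, -clip_return, 0.0)
    # the bias term re-weights the TD error before squaring
    error = (target - q_pred) * bias
    return np.mean(np.square(error))

r = np.array([-1.0, -1.0, 0.0])
q_next = np.array([-3.0, -7.5, -0.2])
q_pred = np.array([-4.0, -8.0, -0.5])
bias = np.array([1.0, 0.8, 1.2])
print(weighted_q_loss(r, q_next, q_pred, bias))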
Code example #10
File: value_ensemble_v1.py  Project: zzyunzhi/vds
    def _create_network(self, reuse=False):
        # logger.info("Creating a q function ensemble with action space %d x %s..." % (self.dimu, self.max_u))
        # self.sess = tf_util.get_session()
        self.sess = tf.get_default_session()
        assert self.sess is not None

        # running averages, separate from alg (this is within a different scope)
        # assume reuse is False
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)

        self.V_loss_tf = [None] * self.size_ensemble
        self.V_fun = [None] * self.size_ensemble
        self.V_grads_vars_tf = [None] * self.size_ensemble
        self.V_grad_tf = [None] * self.size_ensemble
        self.V_adam = [None] * self.size_ensemble
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else self.clip_return)

        for e in range(self.size_ensemble):
            # mini-batch sampling
            batch = self.staging_tf[e].get()
            batch_tf = OrderedDict([
                (key, batch[i])
                for i, key in enumerate(self.stage_shapes.keys())
            ])
            batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

            # networks (no target network for now)
            with tf.variable_scope("ve_{}".format(e)) as vs:
                if reuse:
                    vs.reuse_variables()
                v_function = self.create_v_function(batch_tf, **self.__dict__)
                vs.reuse_variables()

            # loss functions
            V_2_tf = v_function.V_2_tf
            target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * V_2_tf,
                                         *clip_range)
            V_loss_tf = tf.reduce_mean(
                tf.square(tf.stop_gradient(target_tf) - v_function.V_tf))

            V_scope = 've_{}/V'.format(e)
            V_grads_tf = tf.gradients(V_loss_tf, self._vars(V_scope))
            assert len(self._vars(V_scope)) == len(V_grads_tf)
            V_grads_vars_tf = zip(V_grads_tf, self._vars(V_scope))
            V_grad_tf = flatten_grads(grads=V_grads_tf,
                                      var_list=self._vars(V_scope))

            # optimizers
            V_adam = MpiAdam(self._vars(V_scope), scale_grad_by_procs=False)

            # store in attribute lists
            self.V_loss_tf[e] = V_loss_tf
            self.V_fun[e] = v_function
            self.V_grads_vars_tf[e] = V_grads_vars_tf
            self.V_grad_tf[e] = V_grad_tf
            self.V_adam[e] = V_adam

        n_vars = [
            len(self._vars("ve_{}".format(e)))
            for e in range(self.size_ensemble)
        ]
        assert np.all(np.asarray(n_vars) == n_vars[0]), n_vars

        # report loss as the average of value function loss over the ensemble
        # self.V_loss_tf = tf.reduce_mean(self.V_loss_tf)

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
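
Note: each ensemble member above gets its own clipped Bellman target and mean-squared loss. A hedged NumPy sketch of that per-member loss, with illustrative array names (r, v_next, v_pred of shape (size_ensemble, batch_size)) and assuming clip_pos_returns=False so the clip range is [-clip_return, clip_return]:

import numpy as np

def ensemble_v_losses(r, v_next, v_pred, gamma=0.98, clip_return=50.0):
    # clipped target per member, then mean-squared error over the batch
    target = np.clip(r + gamma * v_next, -clip_return, clip_return)
    per_member = np.mean(np.square(target - v_pred), axis=1)
    return per_member, per_member.mean()  # per-member losses and their average

r = -np.ones((3, 4))
v_next = -np.random.rand(3, 4)
v_pred = -np.random.rand(3, 4)
losses, avg_loss = ensemble_v_losses(r, v_next, v_pred)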
Code example #11
File: value_ensemble_v1.py  Project: zzyunzhi/vds
class ValueEnsemble:
    @store_args
    def __init__(self,
                 *,
                 input_dims,
                 size_ensemble,
                 use_Q,
                 use_double_network,
                 buffer_size,
                 hidden,
                 layers,
                 batch_size,
                 lr,
                 norm_eps,
                 norm_clip,
                 polyak,
                 max_u,
                 clip_obs,
                 scope,
                 T,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 sample_transitions,
                 gamma,
                 reuse=False,
                 **kwargs):
        """Implementation of value function ensemble.

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            size_ensemble (int): number of value functions in the ensemble
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            batch_size (int): batch size for training
            lr (float): learning rate for the Q (critic) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped in Bellman update
            inference_clip_pos_returns (boolean): whether or not output of the value output used for disagreement should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
        """
        if self.use_double_network:
            self.use_Q = True
            self.create_v_function = DoubleQFunction
        elif self.use_Q:
            self.create_v_function = QFunction
        else:
            self.create_v_function = VFunction

        if self.clip_return is None:
            self.clip_return = np.inf
        # self.inference_clip_range = (-self.clip_return, 0. if inference_clip_pos_returns else self.clip_return)

        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        if self.use_Q:
            stage_shapes['u_2'] = stage_shapes['u']
        stage_shapes['r'] = (None, )
        self.stage_shapes = stage_shapes

        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = [None] * self.size_ensemble
            self.stage_ops = [None] * self.size_ensemble
            self.buffer_ph_tf = []
            for e in range(self.size_ensemble):
                staging_tf = StagingArea(
                    dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                    shapes=list(self.stage_shapes.values()))
                buffer_ph_tf = [
                    tf.placeholder(tf.float32, shape=shape)
                    for shape in self.stage_shapes.values()
                ]
                stage_op = staging_tf.put(buffer_ph_tf)

                # store in attribute list
                self.staging_tf[e] = staging_tf
                self.buffer_ph_tf.extend(buffer_ph_tf)
                self.stage_ops[e] = stage_op

            if self.use_double_network:
                self._create_double_network(reuse=reuse)
            else:
                self._create_network(reuse=reuse)

        # Configure the replay buffer.
        buffer_shapes = {
            key: (self.T - 1 if key != 'o' else self.T, *input_shapes[key])
            for key, val in input_shapes.items()
        }
        buffer_shapes['ag'] = (self.T, self.dimg)
        # if self.use_Q:
        #     buffer_shapes['u_2'] = (self.T-1, self.dimu)

        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                   self.sample_transitions)

    # @property
    # def buffer_full(self):
    #     return self.buffer.full

    # def buffer_get_transitions_stored(self):
    #     return self.buffer.get_transitions_stored()

    def get_values(self, o, ag, g, u=None):
        if self.size_ensemble == 0:
            return None
        if u is not None:
            assert self.use_Q
            u = self._preprocess_u(u)
        o, g = self._preprocess_og(o, ag, g)
        # values to compute
        vars = [v_function.V_tf for v_function in self.V_fun]
        # feed
        feed = {}
        for e in range(self.size_ensemble):
            feed[self.V_fun[e].o_tf] = o.reshape(-1, self.dimo)
            feed[self.V_fun[e].g_tf] = g.reshape(-1, self.dimg)
            if self.use_Q:
                feed[self.V_fun[e].u_tf] = u.reshape(-1, self.dimu)

        ret = self.sess.run(vars, feed_dict=feed)
        # value prediction postprocessing
        # ret = np.clip(ret, -self.clip_return, 0. if self.clip_pos_returns else self.clip_return)
        ret = np.clip(ret, -self.clip_return,
                      0. if self.clip_pos_returns else np.inf)
        return ret

    def _sample_batch(self, policy):
        batch_size_in_transitions = self.batch_size * self.size_ensemble
        transitions = self.buffer.sample(batch_size_in_transitions)

        # label policy
        if self.use_Q:
            u = transitions['u']
            u_2 = policy.get_actions(o=transitions['o_2'],
                                     ag=transitions['ag_2'],
                                     g=transitions['g'])
            transitions['u'] = self._preprocess_u(u)
            transitions['u_2'] = self._preprocess_u(u_2)

        o, o_2, g = transitions['o'], transitions['o_2'], transitions['g']
        ag, ag_2 = transitions['ag'], transitions['ag_2']
        transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
        transitions['o_2'], transitions['g_2'] = self._preprocess_og(
            o_2, ag_2, g)

        transitions_batches = [
            transitions[key][e * self.batch_size:(e + 1) * self.batch_size]
            for e in range(self.size_ensemble)
            for key in self.stage_shapes.keys()
        ]

        return transitions_batches

    def _stage_batch(self, policy):
        batches = self._sample_batch(policy=policy)
        assert len(self.buffer_ph_tf) == len(batches)
        self.sess.run(self.stage_ops,
                      feed_dict=dict(zip(self.buffer_ph_tf, batches)))

    def logs(self, prefix=''):
        logs = []
        logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))]
        logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))]
        logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))]
        logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))]

        if prefix != '' and not prefix.endswith('/'):
            return [(prefix + '/' + key, val) for key, val in logs]
        else:
            return logs

    def train(self, policy):
        self._stage_batch(policy=policy)
        V_loss, V_grad = self._grads()
        self._update(V_grad)
        assert len(V_loss) == self.size_ensemble
        return np.mean(V_loss)

    def _update(self, V_grad):
        for e in range(self.size_ensemble):
            self.V_adam[e].update(V_grad[e], self.lr)

    def _create_network(self, reuse=False):
        # logger.info("Creating a q function ensemble with action space %d x %s..." % (self.dimu, self.max_u))
        # self.sess = tf_util.get_session()
        self.sess = tf.get_default_session()
        assert self.sess is not None

        # running averages, separate from alg (this is within a different scope)
        # assume reuse is False
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)

        self.V_loss_tf = [None] * self.size_ensemble
        self.V_fun = [None] * self.size_ensemble
        self.V_grads_vars_tf = [None] * self.size_ensemble
        self.V_grad_tf = [None] * self.size_ensemble
        self.V_adam = [None] * self.size_ensemble
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else self.clip_return)

        for e in range(self.size_ensemble):
            # mini-batch sampling
            batch = self.staging_tf[e].get()
            batch_tf = OrderedDict([
                (key, batch[i])
                for i, key in enumerate(self.stage_shapes.keys())
            ])
            batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

            # networks (no target network for now)
            with tf.variable_scope("ve_{}".format(e)) as vs:
                if reuse:
                    vs.reuse_variables()
                v_function = self.create_v_function(batch_tf, **self.__dict__)
                vs.reuse_variables()

            # loss functions
            V_2_tf = v_function.V_2_tf
            target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * V_2_tf,
                                         *clip_range)
            V_loss_tf = tf.reduce_mean(
                tf.square(tf.stop_gradient(target_tf) - v_function.V_tf))

            V_scope = 've_{}/V'.format(e)
            V_grads_tf = tf.gradients(V_loss_tf, self._vars(V_scope))
            assert len(self._vars(V_scope)) == len(V_grads_tf)
            V_grads_vars_tf = zip(V_grads_tf, self._vars(V_scope))
            V_grad_tf = flatten_grads(grads=V_grads_tf,
                                      var_list=self._vars(V_scope))

            # optimizers
            V_adam = MpiAdam(self._vars(V_scope), scale_grad_by_procs=False)

            # store in attribute lists
            self.V_loss_tf[e] = V_loss_tf
            self.V_fun[e] = v_function
            self.V_grads_vars_tf[e] = V_grads_vars_tf
            self.V_grad_tf[e] = V_grad_tf
            self.V_adam[e] = V_adam

        n_vars = [
            len(self._vars("ve_{}".format(e)))
            for e in range(self.size_ensemble)
        ]
        assert np.all(np.asarray(n_vars) == n_vars[0]), n_vars

        # report loss as the average of value function loss over the ensemble
        # self.V_loss_tf = tf.reduce_mean(self.V_loss_tf)

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()

    def _create_double_network(self, reuse=False):
        # logger.info("Creating a q function ensemble with action space %d x %s..." % (self.dimu, self.max_u))
        self.sess = tf_util.get_session()

        # running averages, separate from alg (this is within a different scope)
        # assume reuse is False
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)

        self.V_loss_tf = [None] * self.size_ensemble
        self.V_fun = [None] * self.size_ensemble
        self.V_target_fun = [None] * self.size_ensemble
        self.V_grads_vars_tf = [None] * self.size_ensemble
        self.V_grad_tf = [None] * self.size_ensemble
        self.V_adam = [None] * self.size_ensemble

        self.init_target_net_op = [None] * self.size_ensemble
        self.update_target_net_op = [None] * self.size_ensemble

        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else self.clip_return)

        for e in range(self.size_ensemble):
            # mini-batch sampling
            batch = self.staging_tf[e].get()
            batch_tf = OrderedDict([
                (key, batch[i])
                for i, key in enumerate(self.stage_shapes.keys())
            ])
            batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

            # networks (main and target)
            with tf.variable_scope(f've_{e}') as vs:
                if reuse:
                    vs.reuse_variables()
                v_function = self.create_v_function(batch_tf, **self.__dict__)
                vs.reuse_variables()

            with tf.variable_scope(f've_{e}_target') as vs:
                if reuse:
                    vs.reuse_variables()
                target_batch_tf = batch_tf.copy()
                target_batch_tf['o'] = batch_tf['o_2']
                target_batch_tf['g'] = batch_tf['g_2']
                target_batch_tf['u'] = batch_tf['u_2']
                v_target_function = self.create_v_function(
                    target_batch_tf, **self.__dict__)
                vs.reuse_variables()

            # loss functions
            target_tf = tf.clip_by_value(
                batch_tf['r'] + self.gamma * v_target_function.V_tf,
                *clip_range)
            V_loss_tf = tf.reduce_mean(
                tf.square(tf.stop_gradient(target_tf) - v_function.V_tf))

            V_scope = f've_{e}/V'
            V_grads_tf = tf.gradients(V_loss_tf, self._vars(V_scope))
            assert len(self._vars(V_scope)) == len(V_grads_tf)
            V_grads_vars_tf = zip(V_grads_tf, self._vars(V_scope))
            V_grad_tf = flatten_grads(grads=V_grads_tf,
                                      var_list=self._vars(V_scope))

            # optimizers
            V_adam = MpiAdam(self._vars(V_scope), scale_grad_by_procs=False)

            # store in attribute lists
            self.V_loss_tf[e] = V_loss_tf
            self.V_fun[e] = v_function
            self.V_target_fun[e] = v_target_function
            self.V_grads_vars_tf[e] = V_grads_vars_tf
            self.V_grad_tf[e] = V_grad_tf
            self.V_adam[e] = V_adam

        # polyak averaging
        main_vars = sum(
            [self._vars(f've_{e}/V') for e in range(self.size_ensemble)], [])
        target_vars = sum([
            self._vars(f've_{e}_target/V') for e in range(self.size_ensemble)
        ], [])
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]), zip(target_vars, main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(target_vars, main_vars)))

        assert len(main_vars) == len(target_vars)

        # report loss as the average of value function loss over the ensemble
        # self.V_loss_tf = tf.reduce_mean(self.V_loss_tf)

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()

    def _init_target_net(self):
        self.sess.run(self.init_target_net_op)

    def update_target_net(self):
        if self.use_double_network:
            self.sess.run(self.update_target_net_op)
        else:
            pass

    def _vars(self, scope):
        res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                scope=self.scope + '/' + scope)
        assert len(res) > 0
        return res

    def _global_vars(self, scope):
        res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                scope=self.scope + '/' + scope)
        return res

    def _sync_optimizers(self):
        for e in range(self.size_ensemble):
            self.V_adam[e].sync()

    def _grads(self):
        """
        returns:
            V_loss (scalar)
            V_grad (list)
        """
        V_loss, V_grad = self.sess.run([
            self.V_loss_tf,
            self.V_grad_tf,
        ])
        return V_loss, V_grad

    def get_current_buffer_size(self):
        return self.buffer.get_current_size()

    def store_episode(self, episode_batch, update_stats=True):
        """
        episode_batch: array of batch_size x (T or T+1) x dim_key
                       'o' is of size T+1, others are of size T
        """
        # if self.use_Q:
        #     u_2 = policy.get_actions(o=episode_batch['o'][:, 1:, :], ag=episode_batch['ag'][:, 1:, :], g=episode_batch['g'])  # (batch_size x t x dimu)
        #     self.buffer.store_episode({**episode_batch, 'u_2': u_2.reshape(episode_batch['u'].shape)})
        # else:
        #     self.buffer.store_episode(episode_batch)
        self.buffer.store_episode(episode_batch)

        if update_stats:
            # add transitions to normalizer

            # # flatten episode batch
            # o = episode_batch['o']#[:, :-1, :]
            # g = episode_batch['g']#[:, :-1, :]
            # ag = episode_batch['ag']#[:, :-1, :]
            # o = np.reshape(o, (-1, self.dimo))
            # g = np.reshape(g, (-1, self.dimg))
            # ag = np.reshape(ag, (-1, self.dimg))
            # o, g = self._preprocess_og(o, ag, g)
            #
            # self.o_stats.update(o)
            # self.g_stats.update(g)
            #
            # self.o_stats.recompute_stats()
            # self.g_stats.recompute_stats()

            episode_batch['o_2'] = episode_batch['o'][:, 1:, :]
            episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :]
            num_normalizing_transitions = transitions_in_episode_batch(
                episode_batch)
            transitions = self.sample_transitions(episode_batch,
                                                  num_normalizing_transitions)

            o, g, ag = transitions['o'], transitions['g'], transitions['ag']
            transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
            # No need to preprocess the o_2 and g_2 since this is only used for stats

            self.o_stats.update(transitions['o'])
            self.g_stats.update(transitions['g'])

            self.o_stats.recompute_stats()
            self.g_stats.recompute_stats()

    def _preprocess_og(self, o, ag, g):
        if self.relative_goals:
            g_shape = g.shape
            g = g.reshape(-1, self.dimg)
            ag = ag.reshape(-1, self.dimg)
            g = self.subtract_goals(g, ag)
            g = g.reshape(*g_shape)
        o = np.clip(o, -self.clip_obs, self.clip_obs)
        g = np.clip(g, -self.clip_obs, self.clip_obs)
        return o, g

    def _preprocess_u(self, u):
        return np.clip(u, -self.max_u, self.max_u)

    def __getstate__(self):
        """Our policies can be loaded from pkl, but after unpickling you cannot continue training.
        """
        excluded_subnames = [
            '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats',
            'V_fun', 'V_target_fun', 'lock', 'env', 'sample_transitions',
            'stage_shapes', 'create_v_function'
        ]

        state = {
            k: v
            for k, v in self.__dict__.items()
            if all(subname not in k for subname in excluded_subnames)
        }
        state['buffer_size'] = self.buffer_size
        state['tf'] = self.sess.run(
            [x for x in self._global_vars('') if 'buffer' not in x.name])
        return state

    def __setstate__(self, state):
        if 'sample_transitions' not in state:
            # We don't need this for playing the policy.
            state['sample_transitions'] = None
        if 'use_Q' not in state:
            state['use_Q'] = False  # a hack to accommodate old data
        if 'create_v_function' in state:
            del state['create_v_function']

        self.__init__(**state)
        # set up stats (they are overwritten in __init__)
        for k, v in state.items():
            if k[-6:] == '_stats':
                self.__dict__[k] = v
        # load TF variables
        vars = [x for x in self._global_vars('') if 'buffer' not in x.name]
        assert (len(vars) == len(state["tf"]))
        node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])]
        self.sess.run(node)

    def save(self, save_path):
        tf_util.save_variables(save_path)
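
Note: get_values above clips the raw session outputs before returning them. A minimal NumPy stand-in for that post-processing step (array and argument names are illustrative):

import numpy as np

def postprocess_values(raw_values, clip_return=50.0, clip_pos_returns=True):
    # raw_values: (size_ensemble, batch_size) predictions from the ensemble
    upper = 0.0 if clip_pos_returns else np.inf
    return np.clip(raw_values, -clip_return, upper)

raw = np.array([[-60.0, -3.0, 1.5],
                [-10.0, 0.2, -49.0]])
print(postprocess_values(raw))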
Code example #12
File: ddpg.py  Project: dingyiming0427/goalgail
    def _create_network(self, reuse=False):
        # logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))

        self.sess = tf.get_default_session()
        if self.sess is None:
            self.sess = tf.InteractiveSession()

        # running averages
        with tf.variable_scope('o_stats', reuse=reuse) as vs:
            # if reuse:
            #     vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
        with tf.variable_scope('g_stats', reuse=reuse) as vs:
            # if reuse:
            #     vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)
        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([(key, batch[i])
                                for i, key in enumerate(self.stage_shapes.keys())])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])
        batch_tf['successes'] = tf.reshape(batch_tf['successes'], [-1, 1])
        # networks
        with tf.variable_scope('main', reuse=reuse) as vs:
            # if reuse:
            #     vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
            vs.reuse_variables()
        with tf.variable_scope('target', reuse=reuse) as vs:
            # if reuse:
            #     vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(
                target_batch_tf, net_type='target',  **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))
        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        if self.two_qs:
            target_Q2_pi_tf = self.target.Q2_pi_tf
        # clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
        clip_range = (-np.inf, self.clip_return)
        # print(clip_range)
        if self.terminate_bootstrapping:
            target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * (1 - batch_tf['successes']) * target_Q_pi_tf, *clip_range)
            if self.two_qs:
                target2_tf = tf.clip_by_value(batch_tf['r2'] + self.gamma * (1 - batch_tf['successes']) * target_Q2_pi_tf, *clip_range)
        else:
            target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
            if self.two_qs:
                target2_tf = tf.clip_by_value(batch_tf['r2'] + self.gamma * target_Q2_pi_tf, *clip_range)
        if self.nearby_action_penalty:
            target_tf -= tf.reshape(batch_tf['far_from_goal'] * self.nearby_penalty_weight * tf.norm(self.main.pi_tf - batch_tf['u'], axis=-1), (-1, 1))

        self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))
        if self.two_qs:
            self.Q2_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target2_tf) - self.main.Q2_tf))

        if self.mask_q:
            self.pi_loss_tf = 0
        else:
            if self.two_qs:
                self.pi_loss_tf = -tf.reduce_mean((1 - batch_tf['w_q2'])[:, None] * self.main.Q_pi_tf + batch_tf['w_q2'][:, None] * self.main.Q2_pi_tf)
            else:
                self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
        if self.sample_expert:
            self.pi_loss_tf += (1 - self.anneal_bc * tf.to_float(tf.greater_equal(self.target.Q_pi_tf, self.target.Q_tf))) * \
                self.bc_loss * tf.reduce_mean(batch_tf['is_demo'] * batch_tf['annealing_factor'] *
                tf.reduce_sum(tf.square(self.main.pi_tf - batch_tf['u']), axis=-1 ))

        Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))

        if self.two_qs:
            Q2_grads_tf = tf.gradients(self.Q2_loss_tf, self._vars('main/2Q'))
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))

        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)

        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
        if self.two_qs:
            self.Q2_grads_vars_tf = zip(Q2_grads_tf, self._vars('main/2Q'))
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))

        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q'))
        if self.two_qs:
            self.Q2_grad_tf = flatten_grads(grads=Q2_grads_tf, var_list=self._vars('main/2Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi'))

        # optimizers
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        if self.two_qs:
            self.Q2_adam = MpiAdam(self._vars('main/2Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = self._vars('main/Q') + self._vars('main/pi') + (self._vars('main/2Q') if self.two_qs else [])
        self.target_vars = self._vars('target/Q') + self._vars('target/pi') + (self._vars('target/2Q') if self.two_qs else [])
        self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars)))
        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()
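
Note: with terminate_bootstrapping enabled, this variant masks the bootstrap term with the success indicator and clips the target only from above. A small NumPy sketch of that target, with assumed array names (r, successes, q_next):

import numpy as np

def masked_target(r, successes, q_next, gamma=0.98, clip_return=50.0):
    # once the goal is reached (success == 1) the bootstrap term is dropped
    target = r + gamma * (1.0 - successes) * q_next
    return np.clip(target, -np.inf, clip_return)

r = np.array([-1.0, 0.0, -1.0])
successes = np.array([0.0, 1.0, 0.0])
q_next = np.array([-5.0, -4.0, -2.0])
print(masked_target(r, successes, q_next))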
Code example #13
File: pggd.py  Project: matthew9671/BlockPuzzle-gym
    def _create_network(self, reuse=False):
        logger.info("Creating a PGGD agent with action space %d x %s..." %
                    (self.dimu, self.max_u))

        self.sess = tf.get_default_session()
        if self.sess is None:
            self.sess = tf.InteractiveSession()

        # running averages
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            o_stats_dim = self.dimo
            if 'Variation' in self.kwargs['info']['env_name']:
                print("Found Variation in env name")
                o_stats_dim -= 1
            self.o_stats = Normalizer(o_stats_dim,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        # --------------
        with tf.variable_scope('G_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.G_stats = Normalizer(1,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('sigma_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.sigma_stats = Normalizer(self.dimu,
                                          self.norm_eps,
                                          self.norm_clip,
                                          sess=self.sess)
        # --------------
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([
            (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())
        ])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])
        # ------------
        batch_tf['G'] = tf.reshape(batch_tf['G'], [-1])
        # ------------

        # networks
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf,
                                                 net_type='main',
                                                 **self.__dict__)
            vs.reuse_variables()
        with tf.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(target_batch_tf,
                                                   net_type='target',
                                                   **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # ---------------------------
        # loss functions
        log_prob = tf.reduce_sum(tf.log(
            tf.clip_by_value(self.main.a_prob_tf, 1e-10, 1.0)),
                                 axis=1)
        neg_weighted_log_prob = -tf.multiply(batch_tf['G'], log_prob)
        self.pi_loss_tf = tf.reduce_mean(neg_weighted_log_prob)

        # https://github.com/tensorflow/tensorflow/issues/783
        def replace_none_with_zero(grads, var_list):
            return [
                grad if grad is not None else tf.zeros_like(var)
                for var, grad in zip(var_list, grads)
            ]

        pi_grads_tf = replace_none_with_zero(
            tf.gradients(self.pi_loss_tf, self._vars('main/pi')),
            self._vars('main/pi'))
        assert len(self._vars('main/pi')) == len(pi_grads_tf)
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf,
                                        var_list=self._vars('main/pi'))
        # ---------------------------

        # optimizers
        self.pi_adam = MpiAdam(self._vars('main/pi'),
                               scale_grad_by_procs=False)

        # polyak averaging
        # self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        # self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars(
            'g_stats') + self._global_vars('G_stats') + self._global_vars(
                'sigma_stats')
        # self.init_target_net_op = list(
        #     map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars)))
        # self.update_target_net_op = list(
        #     map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
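
Note: the PGGD loss above is the return-weighted negative log-probability of the sampled actions. A NumPy sketch of the same quantity with illustrative names (action_probs, returns):

import numpy as np

def pggd_policy_loss(action_probs, returns, eps=1e-10):
    # action_probs: (batch, dim_u) probabilities of the actions actually taken
    log_prob = np.sum(np.log(np.clip(action_probs, eps, 1.0)), axis=1)
    return np.mean(-returns * log_prob)

action_probs = np.array([[0.9, 0.5], [0.2, 0.7]])
returns = np.array([1.5, -0.3])
print(pggd_policy_loss(action_probs, returns))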
Code example #14
File: ddpg.py  Project: matthew9671/BlockPuzzle-gym
    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." %
                    (self.dimu, self.max_u))

        self.sess = tf.get_default_session()
        if self.sess is None:
            self.sess = tf.InteractiveSession()

        # running averages
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()

            o_stats_dim = self.dimo
            if 'Variation' in self.kwargs['info']['env_name']:
                print("Found Variation in env name")
                o_stats_dim -= 1
            self.o_stats = Normalizer(o_stats_dim,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([
            (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())
        ])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

        # networks
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf,
                                                 net_type='main',
                                                 **self.__dict__)
            vs.reuse_variables()
        with tf.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(target_batch_tf,
                                                   net_type='target',
                                                   **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else np.inf)
        target_tf = tf.clip_by_value(
            batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
        self.Q_loss_tf = tf.reduce_mean(
            tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))

        # https://github.com/tensorflow/tensorflow/issues/783
        def replace_none_with_zero(grads, var_list):
            result = [
                grad if grad is not None else tf.zeros_like(var)
                for var, grad in zip(var_list, grads)
            ]
            # count = 0
            # for grad in grads:
            #     if grad is None:
            #         count += 1
            # print(count)
            return result

        # print(tf.gradients(self.Q_loss_tf, self._vars('main/Q')))
        Q_grads_tf = replace_none_with_zero(
            tf.gradients(self.Q_loss_tf, self._vars('main/Q')),
            self._vars('main/Q'))
        # print(Q_grads_tf)
        # print(tf.gradients(self.pi_loss_tf, self._vars('main/pi')))
        pi_grads_tf = replace_none_with_zero(
            tf.gradients(self.pi_loss_tf, self._vars('main/pi')),
            self._vars('main/pi'))
        # print(pi_grads_tf)
        # assert(False)
        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf,
                                       var_list=self._vars('main/Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf,
                                        var_list=self._vars('main/pi'))

        # optimizers
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'),
                               scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars(
            'g_stats')
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()
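
Note: the replace_none_with_zero helper in the last two examples guards against tf.gradients returning None for variables that are not connected to the loss. The same idea, framework-free, with NumPy arrays standing in for the variables:

import numpy as np

def replace_none_with_zero(grads, var_list):
    # substitute a zero array of matching shape wherever a gradient is missing
    return [g if g is not None else np.zeros_like(v)
            for v, g in zip(var_list, grads)]

var_list = [np.ones((2, 2)), np.ones(3)]
grads = [np.full((2, 2), 0.5), None]  # second variable is "disconnected"
print(replace_none_with_zero(grads, var_list))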