def _create_network(self, reuse=False):
    """Assemble the DDPG graph and initialise every variable it owns.

    The method, in order: picks a TensorFlow session, creates the
    observation/goal normalizers, pulls one staged mini-batch, builds the
    ``main`` and ``target`` actor-critic networks, defines the critic and
    actor losses with their gradients, creates one MpiAdam optimizer per
    network part, defines the ops that initialise / polyak-update the target
    network, and finally runs the variable initializer.

    Args:
        reuse: when truthy, every variable scope opened here is switched to
            reuse mode before anything is created inside it.
    """
    banner = "Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)
    logger.info(banner)

    # Prefer an already-active session; only open an interactive one as a fallback.
    active_sess = tf.get_default_session()
    self.sess = tf.InteractiveSession() if active_sess is None else active_sess

    # Running-average normalizers for observations ('o_stats') and goals ('g_stats').
    for stats_name, stats_dim in (('o_stats', self.dimo), ('g_stats', self.dimg)):
        with tf.variable_scope(stats_name) as scope:
            if reuse:
                scope.reuse_variables()
            setattr(self, stats_name,
                    Normalizer(stats_dim, self.norm_eps, self.norm_clip, sess=self.sess))

    # Mini-batch sampling: name the staged tensors after the stage_shapes keys.
    staged = self.staging_tf.get()
    batch_tf = OrderedDict(
        (name, staged[pos]) for pos, name in enumerate(self.stage_shapes.keys()))
    batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

    # Main network, fed with the current observation/goal.
    with tf.variable_scope('main') as scope:
        if reuse:
            scope.reuse_variables()
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        scope.reuse_variables()

    # Target network, fed with the next observation/goal ('o_2' / 'g_2').
    with tf.variable_scope('target') as scope:
        if reuse:
            scope.reuse_variables()
        next_batch_tf = batch_tf.copy()
        next_batch_tf['o'] = batch_tf['o_2']
        next_batch_tf['g'] = batch_tf['g_2']
        self.target = self.create_actor_critic(
            next_batch_tf, net_type='target', **self.__dict__)
        scope.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target"))

    # Bellman target y = r + gamma * Q_target(s', pi_target(s')), clipped to
    # [-clip_return, 0] or [-clip_return, inf) depending on clip_pos_returns.
    next_q_tf = self.target.Q_pi_tf
    clip_lo = -self.clip_return
    clip_hi = 0. if self.clip_pos_returns else np.inf
    target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * next_q_tf, clip_lo, clip_hi)

    # Critic loss: mean squared TD error; the target is excluded from the gradient.
    td_tf = tf.stop_gradient(target_tf) - self.main.Q_tf
    self.Q_loss_tf = tf.reduce_mean(tf.square(td_tf))

    # Actor loss: maximise Q(s, pi(s)) plus an L2 penalty on the scaled action.
    q_term_tf = -tf.reduce_mean(self.main.Q_pi_tf)
    l2_term_tf = self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
    self.pi_loss_tf = q_term_tf + l2_term_tf

    # Gradients of each loss with respect to its own sub-network only.
    q_vars = self._vars('main/Q')
    pi_vars = self._vars('main/pi')
    Q_grads_tf = tf.gradients(self.Q_loss_tf, q_vars)
    pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_vars)
    assert len(q_vars) == len(Q_grads_tf)
    assert len(pi_vars) == len(pi_grads_tf)
    self.Q_grads_vars_tf = zip(Q_grads_tf, q_vars)
    self.pi_grads_vars_tf = zip(pi_grads_tf, pi_vars)

    # Flattened gradients, one tensor per sub-network.
    self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=q_vars)
    self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_vars)

    # One MpiAdam optimizer per sub-network.
    self.Q_adam = MpiAdam(q_vars, scale_grad_by_procs=False)
    self.pi_adam = MpiAdam(pi_vars, scale_grad_by_procs=False)

    # Variable groups used by the target-network ops and by the normalizers.
    self.main_vars = q_vars + pi_vars
    self.target_vars = self._vars('target/Q') + self._vars('target/pi')
    self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')

    # Hard copy main -> target, and polyak-averaged soft update.
    self.init_target_net_op = [
        tgt.assign(src) for tgt, src in zip(self.target_vars, self.main_vars)]
    self.update_target_net_op = [
        tgt.assign(self.polyak * tgt + (1. - self.polyak) * src)
        for tgt, src in zip(self.target_vars, self.main_vars)]

    # Initialise all variables, then run the optimizer sync and target init
    # helpers (both are defined outside this method).
    init_op = tf.variables_initializer(self._global_vars(''))
    init_op.run()
    self._sync_optimizers()
    self._init_target_net()
def _create_network(self, pretrain_weights, mi_prioritization, reuse=False):
    """Assemble the SAC/DDPG graph with the intrinsic-reward discriminator.

    Builds the normalizers, the ``main``/``target`` actor-critic networks and
    the ``ir`` discriminator network, the gradients and MpiAdam optimizers for
    the ``ir/state_mi`` and ``ir/skill_ds`` variables, the critic/actor losses
    and optimizers, and the target-network ops. It then initialises all
    variables and optionally restores pretrained weights.

    Args:
        pretrain_weights: when truthy, passed to ``load_weight`` to restore
            the 'state_mi' variables (and 'main'/'target' as well when
            ``self.finetune_pi`` is set).
        mi_prioritization: accepted for interface compatibility; not read in
            this method.
        reuse: when truthy, every variable scope opened here is switched to
            reuse mode before anything is created inside it.
    """
    agent_kind = "SAC" if self.sac else "DDPG"
    if agent_kind == "SAC":
        logger.info("Creating a SAC agent with action space %d x %s..." % (self.dimu, self.max_u))
    else:
        logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))

    # Prefer an already-active session; only open an interactive one as a fallback.
    active_sess = tf.get_default_session()
    self.sess = tf.InteractiveSession() if active_sess is None else active_sess

    # Running-average normalizers for observations and goals.
    for stats_name, stats_dim in (('o_stats', self.dimo), ('g_stats', self.dimg)):
        with tf.variable_scope(stats_name) as scope:
            if reuse:
                scope.reuse_variables()
            setattr(self, stats_name,
                    Normalizer(stats_dim, self.norm_eps, self.norm_clip, sess=self.sess))

    # Mini-batch sampling: name the staged tensors after the stage_shapes keys.
    staged = self.staging_tf.get()
    batch_tf = OrderedDict(
        (name, staged[pos]) for pos, name in enumerate(self.stage_shapes.keys()))
    # 'r', 'w', 'm' and 's' are reshaped into column vectors.
    for column_key in ('r', 'w', 'm', 's'):
        batch_tf[column_key] = tf.reshape(batch_tf[column_key], [-1, 1])

    self.o_tau_tf = tf.placeholder(tf.float32, shape=(None, None, self.dimo))

    # Main network, fed with the current observation/goal.
    with tf.variable_scope('main') as scope:
        if reuse:
            scope.reuse_variables()
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        scope.reuse_variables()

    # Target network, fed with the next observation/goal ('o_2' / 'g_2').
    with tf.variable_scope('target') as scope:
        if reuse:
            scope.reuse_variables()
        next_batch_tf = batch_tf.copy()
        next_batch_tf['o'] = batch_tf['o_2']
        next_batch_tf['g'] = batch_tf['g_2']
        self.target = self.create_actor_critic(next_batch_tf, net_type='target', **self.__dict__)
        scope.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target"))

    # Intrinsic reward (ir) network for mutual information.
    with tf.variable_scope('ir') as scope:
        if reuse:
            scope.reuse_variables()
        self.main_ir = self.create_discriminator(batch_tf, net_type='ir', **self.__dict__)
        scope.reuse_variables()

    # Gradients + optimizer for the 'ir/state_mi' variables (objective: mean of mi_tf).
    mi_objective_tf = tf.reduce_mean(self.main_ir.mi_tf)
    mi_vars = self._vars('ir/state_mi')
    mi_grads_tf = tf.gradients(mi_objective_tf, mi_vars)
    assert len(mi_vars) == len(mi_grads_tf)
    self.mi_grads_vars_tf = zip(mi_grads_tf, mi_vars)
    self.mi_grad_tf = flatten_grads(grads=mi_grads_tf, var_list=mi_vars)
    self.mi_adam = MpiAdam(mi_vars, scale_grad_by_procs=False)

    # Gradients + optimizer for the 'ir/skill_ds' variables (objective: mean of sk_tf).
    sk_objective_tf = tf.reduce_mean(self.main_ir.sk_tf)
    sk_vars = self._vars('ir/skill_ds')
    sk_grads_tf = tf.gradients(sk_objective_tf, sk_vars)
    assert len(sk_vars) == len(sk_grads_tf)
    self.sk_grads_vars_tf = zip(sk_grads_tf, sk_vars)
    self.sk_grad_tf = flatten_grads(grads=sk_grads_tf, var_list=sk_vars)
    self.sk_adam = MpiAdam(sk_vars, scale_grad_by_procs=False)

    next_q_tf = self.target.Q_pi_tf
    clip_lo = -self.clip_return
    clip_hi = self.clip_return if self.clip_pos_returns else np.inf

    self.e_w_tf = batch_tf['e_w']
    if not self.sac:
        # Without SAC the entropy term is a constant zero tensor.
        self.main.neg_logp_pi_tf = tf.zeros(1)

    # The Bellman target is accumulated term by term:
    #   scaled task reward
    # + (clipped MI reward shifted by -1 unless mi_r_scale == 0) * m_w
    # + clipped skill reward * s_w
    # + clipped entropy term * e_w
    # + discounted target Q
    backup_tf = self.r_scale * batch_tf['r'] * batch_tf['r_w']
    mi_clipped_tf = tf.clip_by_value(self.mi_r_scale * batch_tf['m'], 0, 1)
    mi_shift = 0 if self.mi_r_scale == 0 else 1
    backup_tf = backup_tf + (mi_clipped_tf - mi_shift) * batch_tf['m_w']
    sk_clipped_tf = tf.clip_by_value(self.sk_r_scale * batch_tf['s'], -1, 0)
    backup_tf = backup_tf + sk_clipped_tf * batch_tf['s_w']
    et_clipped_tf = tf.clip_by_value(self.et_r_scale * self.main.neg_logp_pi_tf, -1, 0)
    backup_tf = backup_tf + et_clipped_tf * self.e_w_tf
    backup_tf = backup_tf + self.gamma * next_q_tf
    target_tf = tf.clip_by_value(backup_tf, clip_lo, clip_hi)

    # Critic loss: weighted mean of the squared TD error (target held constant).
    self.td_error_tf = tf.stop_gradient(target_tf) - self.main.Q_tf
    squared_td_tf = tf.square(self.td_error_tf)
    self.errors_tf = tf.reduce_mean(batch_tf['w'] * squared_td_tf)
    self.Q_loss_tf = tf.reduce_mean(self.errors_tf)

    # Actor loss: maximise Q(s, pi(s)) plus an L2 penalty on the scaled action.
    q_term_tf = -tf.reduce_mean(self.main.Q_pi_tf)
    l2_term_tf = self.action_l2 * tf.reduce_mean(
        tf.square(self.main.pi_tf / self.max_u))
    self.pi_loss_tf = q_term_tf + l2_term_tf

    # Gradients of each loss with respect to its own sub-network only.
    q_vars = self._vars('main/Q')
    pi_vars = self._vars('main/pi')
    Q_grads_tf = tf.gradients(self.Q_loss_tf, q_vars)
    pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_vars)
    assert len(q_vars) == len(Q_grads_tf)
    assert len(pi_vars) == len(pi_grads_tf)
    self.Q_grads_vars_tf = zip(Q_grads_tf, q_vars)
    self.pi_grads_vars_tf = zip(pi_grads_tf, pi_vars)
    self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=q_vars)
    self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_vars)

    # One MpiAdam optimizer per sub-network.
    self.Q_adam = MpiAdam(q_vars, scale_grad_by_procs=False)
    self.pi_adam = MpiAdam(pi_vars, scale_grad_by_procs=False)

    self.main_vars = q_vars + pi_vars
    self.target_vars = self._vars('target/Q') + self._vars('target/pi')

    # Polyak averaging: hard copy main -> target, and soft update.
    self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
    self.init_target_net_op = [
        tgt.assign(src) for tgt, src in zip(self.target_vars, self.main_vars)]
    self.update_target_net_op = [
        tgt.assign(self.polyak * tgt + (1. - self.polyak) * src)
        for tgt, src in zip(self.target_vars, self.main_vars)]

    # Initialise all variables.
    init_op = tf.variables_initializer(self._global_vars(''))
    init_op.run()

    # Optionally restore pretrained weights before syncing the optimizers.
    restore_policy = bool(pretrain_weights) and bool(self.finetune_pi)
    if pretrain_weights:
        load_weight(self.sess, pretrain_weights, ['state_mi'])
        if self.finetune_pi:
            load_weight(self.sess, pretrain_weights, ['main'])

    self._sync_optimizers()

    # The target net is either restored from the same weights or initialised
    # through the class's own target-init helper.
    if restore_policy:
        load_weight(self.sess, pretrain_weights, ['target'])
    else:
        self._init_target_net()
def _create_network(self, reuse=False):
    """Assemble the DDPG graph with optional behaviour-cloning on demo samples.

    Builds the normalizers, the ``main``/``target`` actor-critic networks, the
    critic loss, and an actor loss that (depending on ``self.bc_loss`` and
    ``self.q_filter``) may include a cloning loss computed on the trailing
    ``self.demo_batch_size`` entries of the batch. It then creates gradients,
    MpiAdam optimizers and target-network ops, and initialises all variables.

    Args:
        reuse: when truthy, every variable scope opened here is switched to
            reuse mode before anything is created inside it.
    """
    logger.info(
        "Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))
    self.sess = tf_util.get_session()

    # Running-average normalizers for observations and goals.
    for stats_name, stats_dim in (('o_stats', self.dimo), ('g_stats', self.dimg)):
        with tf.variable_scope(stats_name) as scope:
            if reuse:
                scope.reuse_variables()
            setattr(self, stats_name,
                    Normalizer(stats_dim, self.norm_eps, self.norm_clip, sess=self.sess))

    # Mini-batch sampling: name the staged tensors after the stage_shapes keys.
    staged = self.staging_tf.get()
    batch_tf = OrderedDict(
        (name, staged[pos]) for pos, name in enumerate(self.stage_shapes.keys()))
    batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

    # Selects only the demo-buffer samples: zeros for the leading
    # (batch_size - demo_batch_size) entries, ones for the trailing ones.
    rollout_part = np.zeros(self.batch_size - self.demo_batch_size)
    demo_part = np.ones(self.demo_batch_size)
    mask = np.concatenate((rollout_part, demo_part), axis=0)

    # Main network, fed with the current observation/goal.
    with tf.variable_scope('main') as scope:
        if reuse:
            scope.reuse_variables()
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        scope.reuse_variables()

    # Target network, fed with the next observation/goal ('o_2' / 'g_2').
    with tf.variable_scope('target') as scope:
        if reuse:
            scope.reuse_variables()
        next_batch_tf = batch_tf.copy()
        next_batch_tf['o'] = batch_tf['o_2']
        next_batch_tf['g'] = batch_tf['g_2']
        self.target = self.create_actor_critic(next_batch_tf, net_type='target', **self.__dict__)
        scope.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target"))

    # Critic loss: mean squared error against the clipped Bellman target.
    next_q_tf = self.target.Q_pi_tf
    clip_lo = -self.clip_return
    clip_hi = 0. if self.clip_pos_returns else np.inf
    target_tf = tf.clip_by_value(
        batch_tf['r'] + self.gamma * next_q_tf, clip_lo, clip_hi)
    self.Q_loss_tf = tf.reduce_mean(
        tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))

    # Cloning loss (only when training with demonstrations).
    bc_enabled = self.bc_loss == 1
    if bc_enabled and self.q_filter == 1:
        # Q-filter: keep only the demo samples for which the critic rates the
        # demonstrator action above the actor's action.
        demo_is_better = tf.reshape(
            tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1])
        kept_pi_tf = tf.boolean_mask(
            tf.boolean_mask((self.main.pi_tf), mask), demo_is_better, axis=0)
        kept_u_tf = tf.boolean_mask(
            tf.boolean_mask((batch_tf['u']), mask), demo_is_better, axis=0)
        self.cloning_loss_tf = tf.reduce_sum(tf.square(kept_pi_tf - kept_u_tf))
        with_cloning = True
    elif bc_enabled and self.q_filter == 0:
        # No Q-filter: clone on every demo sample.
        demo_pi_tf = tf.boolean_mask((self.main.pi_tf), mask)
        demo_u_tf = tf.boolean_mask((batch_tf['u']), mask)
        self.cloning_loss_tf = tf.reduce_sum(tf.square(demo_pi_tf - demo_u_tf))
        with_cloning = True
    else:
        # Not training with demonstrations.
        with_cloning = False

    # Actor loss.
    if with_cloning:
        # Primary loss and action L2 penalty are both scaled by
        # prm_loss_weight; the cloning loss is an auxiliary term scaled by
        # aux_loss_weight.
        primary_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf)
        penalty_tf = self.prm_loss_weight * self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))
        self.pi_loss_tf = primary_tf + penalty_tf
        self.pi_loss_tf = self.pi_loss_tf + self.aux_loss_weight * self.cloning_loss_tf
    else:
        primary_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        penalty_tf = self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))
        self.pi_loss_tf = primary_tf + penalty_tf

    # Gradients of each loss with respect to its own sub-network only.
    q_vars = self._vars('main/Q')
    pi_vars = self._vars('main/pi')
    Q_grads_tf = tf.gradients(self.Q_loss_tf, q_vars)
    pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_vars)
    assert len(q_vars) == len(Q_grads_tf)
    assert len(pi_vars) == len(pi_grads_tf)
    self.Q_grads_vars_tf = zip(Q_grads_tf, q_vars)
    self.pi_grads_vars_tf = zip(pi_grads_tf, pi_vars)
    self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=q_vars)
    self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_vars)

    # One MpiAdam optimizer per sub-network.
    self.Q_adam = MpiAdam(q_vars, scale_grad_by_procs=False)
    self.pi_adam = MpiAdam(pi_vars, scale_grad_by_procs=False)

    # Polyak averaging: hard copy main -> target, and soft update.
    self.main_vars = q_vars + pi_vars
    self.target_vars = self._vars('target/Q') + self._vars('target/pi')
    self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
    self.init_target_net_op = [
        tgt.assign(src) for tgt, src in zip(self.target_vars, self.main_vars)]
    self.update_target_net_op = [
        tgt.assign(self.polyak * tgt + (1. - self.polyak) * src)
        for tgt, src in zip(self.target_vars, self.main_vars)]

    # Initialise all variables, then run the optimizer sync and target init
    # helpers (both are defined outside this method).
    init_op = tf.variables_initializer(self._global_vars(''))
    init_op.run()
    self._sync_optimizers()
    self._init_target_net()
def _create_network(self, reuse=False):
    """Assemble the DDPG graph whose critic loss uses per-sample weights.

    Same pipeline as the plain DDPG graph (normalizers, ``main``/``target``
    networks, losses, gradients, MpiAdam optimizers, target-network ops,
    variable initialisation), except that the squared TD error is multiplied
    by the batch entry ``'w'`` before averaging, and the raw TD error is
    exposed as ``self.td_error_tf``.

    Args:
        reuse: when truthy, every variable scope opened here is switched to
            reuse mode before anything is created inside it.
    """
    banner = "Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)
    logger.info(banner)

    # Prefer an already-active session; only open an interactive one as a fallback.
    active_sess = tf.get_default_session()
    self.sess = tf.InteractiveSession() if active_sess is None else active_sess

    # Running-average normalizers for observations and goals.
    for stats_name, stats_dim in (('o_stats', self.dimo), ('g_stats', self.dimg)):
        with tf.variable_scope(stats_name) as scope:
            if reuse:
                scope.reuse_variables()
            setattr(self, stats_name,
                    Normalizer(stats_dim, self.norm_eps, self.norm_clip, sess=self.sess))

    # Mini-batch sampling: name the staged tensors after the stage_shapes keys.
    staged = self.staging_tf.get()
    batch_tf = OrderedDict(
        (name, staged[pos]) for pos, name in enumerate(self.stage_shapes.keys()))
    # Rewards and sample weights are reshaped into column vectors.
    for column_key in ('r', 'w'):
        batch_tf[column_key] = tf.reshape(batch_tf[column_key], [-1, 1])

    # Main network, fed with the current observation/goal.
    with tf.variable_scope('main') as scope:
        if reuse:
            scope.reuse_variables()
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        scope.reuse_variables()

    # Target network, fed with the next observation/goal ('o_2' / 'g_2').
    with tf.variable_scope('target') as scope:
        if reuse:
            scope.reuse_variables()
        next_batch_tf = batch_tf.copy()
        next_batch_tf['o'] = batch_tf['o_2']
        next_batch_tf['g'] = batch_tf['g_2']
        self.target = self.create_actor_critic(next_batch_tf, net_type='target', **self.__dict__)
        scope.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target"))

    # Clipped Bellman target.
    next_q_tf = self.target.Q_pi_tf
    clip_lo = -self.clip_return
    clip_hi = 0. if self.clip_pos_returns else np.inf
    target_tf = tf.clip_by_value(
        batch_tf['r'] + self.gamma * next_q_tf, clip_lo, clip_hi)

    # Critic loss: weighted mean of the squared TD error (target held constant).
    self.td_error_tf = tf.stop_gradient(target_tf) - self.main.Q_tf
    squared_td_tf = tf.square(self.td_error_tf)
    self.errors_tf = tf.reduce_mean(batch_tf['w'] * squared_td_tf)
    self.Q_loss_tf = tf.reduce_mean(self.errors_tf)

    # Actor loss: maximise Q(s, pi(s)) plus an L2 penalty on the scaled action.
    q_term_tf = -tf.reduce_mean(self.main.Q_pi_tf)
    l2_term_tf = self.action_l2 * tf.reduce_mean(
        tf.square(self.main.pi_tf / self.max_u))
    self.pi_loss_tf = q_term_tf + l2_term_tf

    # Gradients of each loss with respect to its own sub-network only.
    q_vars = self._vars('main/Q')
    pi_vars = self._vars('main/pi')
    Q_grads_tf = tf.gradients(self.Q_loss_tf, q_vars)
    pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_vars)
    assert len(q_vars) == len(Q_grads_tf)
    assert len(pi_vars) == len(pi_grads_tf)
    self.Q_grads_vars_tf = zip(Q_grads_tf, q_vars)
    self.pi_grads_vars_tf = zip(pi_grads_tf, pi_vars)
    self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=q_vars)
    self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_vars)

    # One MpiAdam optimizer per sub-network.
    self.Q_adam = MpiAdam(q_vars, scale_grad_by_procs=False)
    self.pi_adam = MpiAdam(pi_vars, scale_grad_by_procs=False)

    # Polyak averaging: hard copy main -> target, and soft update.
    self.main_vars = q_vars + pi_vars
    self.target_vars = self._vars('target/Q') + self._vars('target/pi')
    self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
    self.init_target_net_op = [
        tgt.assign(src) for tgt, src in zip(self.target_vars, self.main_vars)]
    self.update_target_net_op = [
        tgt.assign(self.polyak * tgt + (1. - self.polyak) * src)
        for tgt, src in zip(self.target_vars, self.main_vars)]

    # Initialise all variables, then run the optimizer sync and target init
    # helpers (both are defined outside this method).
    init_op = tf.variables_initializer(self._global_vars(''))
    init_op.run()
    self._sync_optimizers()
    self._init_target_net()
def _create_network(self, reuse=False):
    """Assemble the demo-augmented DDPG graph and expose the Q-filter mask.

    Builds the normalizers, the ``main``/``target`` actor-critic networks, the
    critic loss, and an actor loss that may include a behaviour-cloning term
    on the trailing ``self.demo_batch_size`` batch entries. ``self.maskMain``
    holds the Q-filter mask when the filter is active and a constant
    ``[0.0]`` tensor otherwise. Gradients, MpiAdam optimizers and
    target-network ops are then created and all variables initialised.

    Args:
        reuse: when truthy, every variable scope opened here is switched to
            reuse mode before anything is created inside it.
    """
    # The "Creating a DDPG agent ..." log message is intentionally disabled here.
    self.sess = tf_util.get_session()

    # Running-average normalizers for observations and goals.
    for stats_name, stats_dim in (('o_stats', self.dimo), ('g_stats', self.dimg)):
        with tf.variable_scope(stats_name) as scope:
            if reuse:
                scope.reuse_variables()
            setattr(self, stats_name,
                    Normalizer(stats_dim, self.norm_eps, self.norm_clip, sess=self.sess))

    # Mini-batch sampling: name the staged tensors after the stage_shapes keys.
    staged = self.staging_tf.get()
    batch_tf = OrderedDict(
        (name, staged[pos]) for pos, name in enumerate(self.stage_shapes.keys()))
    batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

    # Selects only the demo-buffer samples: zeros for the leading
    # (batch_size - demo_batch_size) entries, ones for the trailing ones.
    rollout_part = np.zeros(self.batch_size - self.demo_batch_size)
    demo_part = np.ones(self.demo_batch_size)
    mask = np.concatenate((rollout_part, demo_part), axis=0)

    # Main network, fed with the current observation/goal.
    with tf.variable_scope('main') as scope:
        if reuse:
            scope.reuse_variables()
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        scope.reuse_variables()

    # Target network, fed with the next observation/goal ('o_2' / 'g_2').
    with tf.variable_scope('target') as scope:
        if reuse:
            scope.reuse_variables()
        next_batch_tf = batch_tf.copy()
        next_batch_tf['o'] = batch_tf['o_2']
        next_batch_tf['g'] = batch_tf['g_2']
        self.target = self.create_actor_critic(next_batch_tf, net_type='target', **self.__dict__)
        scope.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target"))

    # Critic loss: mean squared error against the clipped Bellman target.
    next_q_tf = self.target.Q_pi_tf
    clip_lo = -self.clip_return
    clip_hi = 0. if self.clip_pos_returns else np.inf
    target_tf = tf.clip_by_value(
        batch_tf['r'] + self.gamma * next_q_tf, clip_lo, clip_hi)
    self.Q_loss_tf = tf.reduce_mean(
        tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))

    # Cloning loss and Q-filter mask.
    bc_enabled = self.bc_loss == 1
    if bc_enabled and self.q_filter == 1:
        # Q-filter: keep only the demo samples for which the critic rates the
        # demonstrator action above the actor's action.
        self.maskMain = tf.reshape(
            tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1])
        kept_pi_tf = tf.boolean_mask(
            tf.boolean_mask((self.main.pi_tf), mask), self.maskMain, axis=0)
        kept_u_tf = tf.boolean_mask(
            tf.boolean_mask((batch_tf['u']), mask), self.maskMain, axis=0)
        self.cloning_loss_tf = tf.reduce_sum(tf.square(kept_pi_tf - kept_u_tf))
        with_cloning = True
    elif bc_enabled and self.q_filter == 0:
        # No Q-filter: placeholder mask, clone on every demo sample.
        self.maskMain = tf.constant([0.0])
        demo_pi_tf = tf.boolean_mask((self.main.pi_tf), mask)
        demo_u_tf = tf.boolean_mask((batch_tf['u']), mask)
        self.cloning_loss_tf = tf.reduce_sum(tf.square(demo_pi_tf - demo_u_tf))
        with_cloning = True
    else:
        # Not training with demonstrations: placeholder mask, no cloning loss.
        self.maskMain = tf.constant([0.0])
        with_cloning = False

    # Actor loss.
    if with_cloning:
        # Primary loss and action L2 penalty are both scaled by
        # prm_loss_weight; the cloning loss is an auxiliary term scaled by
        # aux_loss_weight.
        primary_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf)
        penalty_tf = self.prm_loss_weight * self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))
        self.pi_loss_tf = primary_tf + penalty_tf
        self.pi_loss_tf = self.pi_loss_tf + self.aux_loss_weight * self.cloning_loss_tf
    else:
        primary_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        penalty_tf = self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))
        self.pi_loss_tf = primary_tf + penalty_tf

    # Gradients of each loss with respect to its own sub-network only.
    q_vars = self._vars('main/Q')
    pi_vars = self._vars('main/pi')
    Q_grads_tf = tf.gradients(self.Q_loss_tf, q_vars)
    pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_vars)
    assert len(q_vars) == len(Q_grads_tf)
    assert len(pi_vars) == len(pi_grads_tf)
    self.Q_grads_vars_tf = zip(Q_grads_tf, q_vars)
    self.pi_grads_vars_tf = zip(pi_grads_tf, pi_vars)
    self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=q_vars)
    self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_vars)

    # One MpiAdam optimizer per sub-network.
    self.Q_adam = MpiAdam(q_vars, scale_grad_by_procs=False)
    self.pi_adam = MpiAdam(pi_vars, scale_grad_by_procs=False)

    # Polyak averaging: hard copy main -> target, and soft update.
    self.main_vars = q_vars + pi_vars
    self.target_vars = self._vars('target/Q') + self._vars('target/pi')
    self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
    self.init_target_net_op = [
        tgt.assign(src) for tgt, src in zip(self.target_vars, self.main_vars)]
    self.update_target_net_op = [
        tgt.assign(self.polyak * tgt + (1. - self.polyak) * src)
        for tgt, src in zip(self.target_vars, self.main_vars)]

    # Initialise all variables.
    init_op = tf.variables_initializer(self._global_vars(''))
    init_op.run()

    # NOTE: a disabled (commented-out) snippet used to sit here. It loaded
    # kernels, biases and observation mean/std from
    # './hand_dapg/dapg/policies/saved_weights.npz' and assigned them to the
    # 'ddpg/main/pi/_0.._2' kernel/bias tensors and the 'ddpg/o_stats'
    # mean/std/sum/sumsq/count tensors. It is not executed.

    self._sync_optimizers()
    self._init_target_net()
def _create_network(self, reuse=False):
    """Build the twin-target (TD3-style) actor-critic graph and initialise it.

    Creates the observation/goal normalizers, one ``main`` actor-critic and
    two target actor-critics (``target1``, ``target2``), the critic loss
    against the element-wise minimum of the two target Q values, the actor
    loss (optionally with a behaviour-cloning term on the demo part of the
    batch), gradients and MpiAdam optimizers for ``main/Q`` and ``main/pi``,
    and the init / polyak-update ops for both target networks. Finally it
    initialises all variables and runs the optimizer-sync and target-init
    helpers.

    The graph is built exactly once. A Python loop at graph-construction
    time cannot delay policy updates; any policy-update schedule has to be
    applied by the code that runs the training ops.

    Args:
        reuse: when truthy, every variable scope opened here is switched to
            reuse mode before anything is created inside it.
    """
    logger.info("Debug : Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))
    self.sess = tf_util.get_session()

    # Running-average normalizers for observations and goals.
    with tf.variable_scope('o_stats') as vs:
        if reuse:
            vs.reuse_variables()
        self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
    with tf.variable_scope('g_stats') as vs:
        if reuse:
            vs.reuse_variables()
        self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

    # Mini-batch sampling: fetch the staged tensors and name them after the
    # stage_shapes keys.
    batch = self.staging_tf.get()
    batch_tf = OrderedDict([(key, batch[i])
                            for i, key in enumerate(self.stage_shapes.keys())])
    batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

    # Selects only the demo-buffer samples: zeros for the leading
    # (batch_size - demo_batch_size) entries, ones for the trailing ones.
    mask = np.concatenate((np.zeros(self.batch_size - self.demo_batch_size),
                           np.ones(self.demo_batch_size)), axis=0)

    # Main network, fed with the current observation/goal.
    with tf.variable_scope('main') as vs:
        if reuse:
            vs.reuse_variables()
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        vs.reuse_variables()

    # First target network, fed with the next observation/goal.
    with tf.variable_scope('target1') as vs:
        if reuse:
            vs.reuse_variables()
        target1_batch_tf = batch_tf.copy()
        target1_batch_tf['o'] = batch_tf['o_2']
        target1_batch_tf['g'] = batch_tf['g_2']
        self.target1 = self.create_actor_critic(
            target1_batch_tf, net_type='target1', **self.__dict__)
        vs.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target1"))

    # Second target network, fed with the same next observation/goal.
    with tf.variable_scope('target2') as vs:
        if reuse:
            vs.reuse_variables()
        target2_batch_tf = batch_tf.copy()
        target2_batch_tf['o'] = batch_tf['o_2']
        target2_batch_tf['g'] = batch_tf['g_2']
        self.target2 = self.create_actor_critic(
            target2_batch_tf, net_type='target2', **self.__dict__)
        vs.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target2"))

    # ---- Critic -----------------------------------------------------------
    # Use the minimum of the two target Q values, then form the clipped
    # Bellman target r + gamma * min(Q1', Q2').
    target1_Q_pi_tf = self.target1.Q_pi_tf
    target2_Q_pi_tf = self.target2.Q_pi_tf
    target_Q_pi_tf = tf.minimum(target1_Q_pi_tf, target2_Q_pi_tf)
    clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
    # stop_gradient makes it explicit that the target is a constant for the
    # critic update.
    target_Q_tf = tf.stop_gradient(
        tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range))

    # NOTE(review): there is a single main critic, so both "current Q"
    # estimates are the same tensor and the loss is twice its MSE. Kept as is
    # so the gradient scale is unchanged; confirm whether a second critic
    # head was intended.
    current_Q1 = self.main.Q_tf
    current_Q2 = self.main.Q_tf
    # self.Q_loss_tf is the loss that is differentiated below. It is not
    # overwritten afterwards, so the reported loss matches the applied gradient.
    self.Q_loss_tf = (tf.losses.mean_squared_error(current_Q1, target_Q_tf)
                      + tf.losses.mean_squared_error(current_Q2, target_Q_tf))

    q_vars = self._vars('main/Q')
    Q_grads_tf = tf.gradients(self.Q_loss_tf, q_vars)
    assert len(q_vars) == len(Q_grads_tf)
    self.Q_adam = MpiAdam(q_vars, scale_grad_by_procs=False)
    self.Q_grads_vars_tf = zip(Q_grads_tf, q_vars)
    self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=q_vars)

    # ---- Actor ------------------------------------------------------------
    if self.bc_loss == 1 and self.q_filter == 1:
        # Train with demonstrations using both the cloning loss and the
        # Q-filter: keep only demo samples where the critic rates the
        # demonstrator action above the actor's action.
        maskMain = tf.reshape(
            tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1])
        # Cloning loss on the actor's actions, restricted to those samples.
        self.cloning_loss_tf = tf.reduce_sum(tf.square(
            tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0)
            - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0)))
        # Primary loss and action L2 penalty, both scaled by prm_loss_weight.
        self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))
        # Auxiliary cloning loss scaled by aux_loss_weight.
        self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf
    elif self.bc_loss == 1 and self.q_filter == 0:
        # Train with demonstrations without the Q-filter.
        self.cloning_loss_tf = tf.reduce_sum(tf.square(
            tf.boolean_mask((self.main.pi_tf), mask)
            - tf.boolean_mask((batch_tf['u']), mask)))
        self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))
        self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf
    else:
        # Not training with demonstrations.
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))

    pi_vars = self._vars('main/pi')
    pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_vars)
    assert len(pi_vars) == len(pi_grads_tf)
    self.pi_adam = MpiAdam(pi_vars, scale_grad_by_procs=False)
    self.pi_grads_vars_tf = zip(pi_grads_tf, pi_vars)
    self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_vars)

    # ---- Target networks --------------------------------------------------
    self.main_vars = q_vars + pi_vars
    self.target1_vars = self._vars('target1/Q') + self._vars('target1/pi')
    self.target2_vars = self._vars('target2/Q') + self._vars('target2/pi')
    self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')

    # Hard copy main -> each target.
    self.init_target1_net_op = [
        tgt.assign(src) for tgt, src in zip(self.target1_vars, self.main_vars)]
    self.init_target2_net_op = [
        tgt.assign(src) for tgt, src in zip(self.target2_vars, self.main_vars)]

    # Polyak-averaged soft updates. Each op list is bound to its own target's
    # variables. Comparing graph tensors with `==` at build time cannot
    # select "the network that produced the minimum", so no such selection
    # is attempted here.
    self.update_target1_net_op = [
        tgt.assign(self.polyak * tgt + (1. - self.polyak) * src)
        for tgt, src in zip(self.target1_vars, self.main_vars)]
    self.update_target2_net_op = [
        tgt.assign(self.polyak * tgt + (1. - self.polyak) * src)
        for tgt, src in zip(self.target2_vars, self.main_vars)]
    # The combined op list keeps the pre-existing attribute name and updates
    # BOTH targets, so neither network is left stale.
    self.update_target_net_op = self.update_target1_net_op + self.update_target2_net_op

    # Initialise all variables, then run the optimizer sync and target init
    # helpers (both are defined outside this method).
    tf.variables_initializer(self._global_vars('')).run()
    self._sync_optimizers()
    self._init_target_net()
def _create_network(self, reuse=False):
    """Build the TensorFlow graph of the demonstration-aware DDPG agent.

    Creates the observation/goal normalizers, the ``main`` and ``target``
    actor-critic networks, the critic loss, the actor loss (optionally with
    a behaviour-cloning term restricted to the demonstration rows of the
    batch), their gradients, the ``MpiAdam`` optimizers and the ops that
    initialize/update the target network. Finally all global variables are
    initialized, the optimizers are synced and the target net is initialized.

    Args:
        reuse: when true, ``reuse_variables()`` is called on every variable
            scope before anything is built inside it.
    """
    logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))
    self.sess = tf_util.get_session()

    # Running averages used for observation and goal normalization.
    with tf.variable_scope('o_stats') as scope:
        if reuse:
            scope.reuse_variables()
        self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
    with tf.variable_scope('g_stats') as scope:
        if reuse:
            scope.reuse_variables()
        self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

    # Mini-batch sampling: label the staged tensors with the stage_shapes keys.
    staged = self.staging_tf.get()
    batch_tf = OrderedDict(
        (key, staged[idx]) for idx, key in enumerate(self.stage_shapes.keys()))
    batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

    # Row selector for the demonstration samples: zeros for the first
    # (batch_size - demo_batch_size) rows, ones for the last demo_batch_size.
    demo_rows = np.concatenate(
        (np.zeros(self.batch_size - self.demo_batch_size),
         np.ones(self.demo_batch_size)),
        axis=0)

    # Networks. The target net is fed the next observation/goal (o_2, g_2).
    with tf.variable_scope('main') as scope:
        if reuse:
            scope.reuse_variables()
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        scope.reuse_variables()
    with tf.variable_scope('target') as scope:
        if reuse:
            scope.reuse_variables()
        target_batch_tf = batch_tf.copy()
        target_batch_tf['o'] = batch_tf['o_2']
        target_batch_tf['g'] = batch_tf['g_2']
        self.target = self.create_actor_critic(
            target_batch_tf, net_type='target', **self.__dict__)
        scope.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target"))

    # Critic loss: squared error against the clipped target
    # r + gamma * Q_target; the target is excluded from the gradient.
    next_q_tf = self.target.Q_pi_tf
    clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
    target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * next_q_tf, *clip_range)
    self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))

    def weighted_actor_loss():
        # Primary loss and action L2 penalty scaled by prm_loss_weight, plus
        # the cloning loss scaled by aux_loss_weight.
        loss = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf)
        loss += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))
        loss += self.aux_loss_weight * self.cloning_loss_tf
        return loss

    if self.bc_loss == 1 and self.q_filter == 1:
        # Behaviour cloning with Q-filter: among the demo rows keep only those
        # where the critic rates the stored action above the actor's action.
        q_filter_rows = tf.reshape(
            tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, demo_rows), [-1])
        actor_u = tf.boolean_mask(
            tf.boolean_mask(self.main.pi_tf, demo_rows), q_filter_rows, axis=0)
        demo_u = tf.boolean_mask(
            tf.boolean_mask(batch_tf['u'], demo_rows), q_filter_rows, axis=0)
        self.cloning_loss_tf = tf.reduce_sum(tf.square(actor_u - demo_u))
        self.pi_loss_tf = weighted_actor_loss()
    elif self.bc_loss == 1 and self.q_filter == 0:
        # Behaviour cloning on all demo rows, no Q-filter.
        actor_u = tf.boolean_mask(self.main.pi_tf, demo_rows)
        demo_u = tf.boolean_mask(batch_tf['u'], demo_rows)
        self.cloning_loss_tf = tf.reduce_sum(tf.square(actor_u - demo_u))
        self.pi_loss_tf = weighted_actor_loss()
    else:
        # Not training with demonstrations: plain actor loss.
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))

    # Gradients of each loss w.r.t. its own variables.
    q_vars = self._vars('main/Q')
    pi_vars = self._vars('main/pi')
    Q_grads_tf = tf.gradients(self.Q_loss_tf, q_vars)
    pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_vars)
    assert len(q_vars) == len(Q_grads_tf)
    assert len(pi_vars) == len(pi_grads_tf)
    self.Q_grads_vars_tf = zip(Q_grads_tf, q_vars)
    self.pi_grads_vars_tf = zip(pi_grads_tf, pi_vars)
    self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=q_vars)
    self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_vars)

    # Optimizers
    self.Q_adam = MpiAdam(q_vars, scale_grad_by_procs=False)
    self.pi_adam = MpiAdam(pi_vars, scale_grad_by_procs=False)

    # Polyak averaging: target <- polyak * target + (1 - polyak) * main.
    self.main_vars = q_vars + pi_vars
    self.target_vars = self._vars('target/Q') + self._vars('target/pi')
    self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
    self.init_target_net_op = [
        target_var.assign(main_var)
        for target_var, main_var in zip(self.target_vars, self.main_vars)
    ]
    self.update_target_net_op = [
        target_var.assign(self.polyak * target_var + (1. - self.polyak) * main_var)
        for target_var, main_var in zip(self.target_vars, self.main_vars)
    ]

    # Initialize all variables
    tf.variables_initializer(self._global_vars('')).run()
    self._sync_optimizers()
    self._init_target_net()
def _create_network(self, reuse=False):
    """Build the TensorFlow graph of the DDPG agent (critic/actor naming).

    Creates the observation/goal normalizers, the ``main`` and ``target``
    actor-critic networks, the critic (TD) loss and the actor loss, their
    gradients, the ``MpiAdam`` optimizers and the ops that initialize and
    polyak-update the target network. Finally all global variables are
    initialized, the optimizers are synced and the target net is initialized.

    Args:
        reuse: when true, ``reuse_variables()`` is called on every variable
            scope before anything is built inside it.
    """
    logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.action_scale))
    self.sess = tf_util.get_session()

    # Running averages used for observation and goal normalization.
    with tf.variable_scope('o_stats') as scope:
        if reuse:
            scope.reuse_variables()
        self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
    with tf.variable_scope('g_stats') as scope:
        if reuse:
            scope.reuse_variables()
        self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

    # Mini-batch sampling: label the staged tensors with the stage_shapes keys.
    staged = self.staging_tf.get()
    batch_tf = OrderedDict(
        (key, staged[idx]) for idx, key in enumerate(self.stage_shapes.keys()))
    batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

    # NOTE: the demo-sample mask that used to be computed here was never
    # read by this method, so the dead computation has been removed.

    # Networks. The target net is fed the next observation/goal (o_2, g_2).
    with tf.variable_scope('main') as scope:
        if reuse:
            scope.reuse_variables()
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        scope.reuse_variables()
    with tf.variable_scope('target') as scope:
        if reuse:
            scope.reuse_variables()
        target_batch_tf = batch_tf.copy()
        target_batch_tf['o'] = batch_tf['o_2']
        target_batch_tf['g'] = batch_tf['g_2']
        self.target = self.create_actor_critic(
            target_batch_tf, net_type='target', **self.__dict__)
        scope.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target"))

    # Critic loss (TD learning step): mean squared error between the clipped
    # target r + gamma * Q_target and the main critic; the target is excluded
    # from the gradient.
    target_critic_actor_tf = self.target.critic_with_actor_tf
    clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
    target_tf = tf.clip_by_value(
        batch_tf['r'] + self.gamma * target_critic_actor_tf, *clip_range)
    self.critic_loss_tf = tf.reduce_mean(
        tf.square(tf.stop_gradient(target_tf) - self.main.critic_tf))

    # Actor loss: negated mean critic value of the actor's action, plus an
    # L2 penalty on the action scaled by action_scale.
    self.actor_loss_tf = -tf.reduce_mean(self.main.critic_with_actor_tf)
    self.actor_loss_tf += self.action_l2 * tf.reduce_mean(
        tf.square(self.main.actor_tf / self.action_scale))

    # Symbolic gradients of each loss w.r.t. its own variables.
    critic_vars = self._vars('main/Q')
    actor_vars = self._vars('main/pi')
    critic_grads_tf = tf.gradients(self.critic_loss_tf, critic_vars)
    actor_grads_tf = tf.gradients(self.actor_loss_tf, actor_vars)
    assert len(critic_vars) == len(critic_grads_tf)
    assert len(actor_vars) == len(actor_grads_tf)
    self.critic_grads_vars_tf = zip(critic_grads_tf, critic_vars)
    self.actor_grads_vars_tf = zip(actor_grads_tf, actor_vars)

    # Flattened gradients.
    self.critic_grads = flatten_grads(grads=critic_grads_tf, var_list=critic_vars)
    self.actor_grads = flatten_grads(grads=actor_grads_tf, var_list=actor_vars)

    # Optimizers
    self.critic_optimiser = MpiAdam(critic_vars, scale_grad_by_procs=False)
    self.actor_optimiser = MpiAdam(actor_vars, scale_grad_by_procs=False)

    # Polyak averaging: target <- polyak * target + (1 - polyak) * main.
    self.main_vars = critic_vars + actor_vars
    self.target_vars = self._vars('target/Q') + self._vars('target/pi')
    self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
    self.init_target_net_op = [
        target_var.assign(main_var)
        for target_var, main_var in zip(self.target_vars, self.main_vars)
    ]
    self.update_target_net_op = [
        target_var.assign(self.polyak * target_var + (1. - self.polyak) * main_var)
        for target_var, main_var in zip(self.target_vars, self.main_vars)
    ]

    # Initialize all variables
    tf.variables_initializer(self._global_vars('')).run()
    self._sync_optimizers()
    self._init_target_net()
def _create_double_network(self, reuse=False):
    """Build an ensemble of value functions, each with its own target copy.

    For every ensemble member ``e`` this creates a value function in scope
    ``ve_{e}`` and a second one in scope ``ve_{e}_target`` that is fed the
    next observation/goal/action (``o_2``, ``g_2``, ``u_2``), a squared
    TD-error loss, its gradients and an ``MpiAdam`` optimizer. The results
    are stored per member in the ``self.V_*`` lists. Afterwards the ops that
    initialize and polyak-update all target variables are built, all global
    variables are initialized, the optimizers are synced and the target
    nets are initialized.

    Args:
        reuse: when true, ``reuse_variables()`` is called on every variable
            scope before anything is built inside it.
    """
    self.sess = tf_util.get_session()

    # Running averages used for observation and goal normalization.
    with tf.variable_scope('o_stats') as vs:
        if reuse:
            vs.reuse_variables()
        self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
    # Bind the g_stats scope itself: without `as vs` the reuse flag would be
    # set on the stale o_stats scope instead of this one.
    with tf.variable_scope('g_stats') as vs:
        if reuse:
            vs.reuse_variables()
        self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

    # Per-member containers. The two *_net_op lists are placeholders that
    # are replaced by the real op lists after the loop.
    self.V_loss_tf = [None] * self.size_ensemble
    self.V_fun = [None] * self.size_ensemble
    self.V_target_fun = [None] * self.size_ensemble
    self.V_grads_vars_tf = [None] * self.size_ensemble
    self.V_grad_tf = [None] * self.size_ensemble
    self.V_adam = [None] * self.size_ensemble
    self.init_target_net_op = [None] * self.size_ensemble
    self.update_target_net_op = [None] * self.size_ensemble

    clip_range = (-self.clip_return, 0. if self.clip_pos_returns else self.clip_return)

    for e in range(self.size_ensemble):
        # Mini-batch sampling: each member reads from its own staging area.
        batch = self.staging_tf[e].get()
        batch_tf = OrderedDict(
            (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()))
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

        # Main value function of this member.
        with tf.variable_scope(f've_{e}') as vs:
            if reuse:
                vs.reuse_variables()
            v_function = self.create_v_function(batch_tf, **self.__dict__)
            vs.reuse_variables()

        # Target value function, evaluated on the next transition.
        with tf.variable_scope(f've_{e}_target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            target_batch_tf['u'] = batch_tf['u_2']
            v_target_function = self.create_v_function(
                target_batch_tf, **self.__dict__)
            vs.reuse_variables()

        # Loss: squared error against the clipped target r + gamma * V_target;
        # the target is excluded from the gradient.
        target_tf = tf.clip_by_value(
            batch_tf['r'] + self.gamma * v_target_function.V_tf, *clip_range)
        V_loss_tf = tf.reduce_mean(
            tf.square(tf.stop_gradient(target_tf) - v_function.V_tf))

        member_vars = self._vars(f've_{e}/V')
        V_grads_tf = tf.gradients(V_loss_tf, member_vars)
        assert len(member_vars) == len(V_grads_tf)
        V_grads_vars_tf = zip(V_grads_tf, member_vars)
        V_grad_tf = flatten_grads(grads=V_grads_tf, var_list=member_vars)

        # Optimizer
        V_adam = MpiAdam(member_vars, scale_grad_by_procs=False)

        # Store in the attribute lists.
        self.V_loss_tf[e] = V_loss_tf
        self.V_fun[e] = v_function
        self.V_target_fun[e] = v_target_function
        self.V_grads_vars_tf[e] = V_grads_vars_tf
        self.V_grad_tf[e] = V_grad_tf
        self.V_adam[e] = V_adam

    # Polyak averaging over the variables of all members.
    main_vars = sum(
        [self._vars(f've_{e}/V') for e in range(self.size_ensemble)], [])
    target_vars = sum(
        [self._vars(f've_{e}_target/V') for e in range(self.size_ensemble)], [])
    # Check before pairing: zip() would silently drop unmatched variables.
    assert len(main_vars) == len(target_vars)
    self.init_target_net_op = [
        target_var.assign(main_var)
        for target_var, main_var in zip(target_vars, main_vars)
    ]
    self.update_target_net_op = [
        target_var.assign(self.polyak * target_var + (1. - self.polyak) * main_var)
        for target_var, main_var in zip(target_vars, main_vars)
    ]

    # Initialize all variables
    tf.variables_initializer(self._global_vars('')).run()
    self._sync_optimizers()
    self._init_target_net()
def _create_network(self, reuse=False):
    """Build the TensorFlow graph of the DDPG agent with bias-weighted TD errors.

    Creates the observation/goal normalizers, the ``main`` and ``target``
    actor-critic networks, a critic loss whose TD error is multiplied by the
    staged ``bias`` term, the (unweighted) actor loss, their gradients, the
    ``MpiAdam`` optimizers and the ops that initialize and polyak-update the
    target network. Finally all global variables are initialized, the
    optimizers are synced and the target net is initialized.

    Args:
        reuse: when true, ``reuse_variables()`` is called on every variable
            scope before anything is built inside it.
    """
    logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))
    self.sess = tf.get_default_session()
    if self.sess is None:
        self.sess = tf.InteractiveSession()

    # Running averages used for observation and goal normalization.
    with tf.variable_scope('o_stats') as scope:
        if reuse:
            scope.reuse_variables()
        self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
    with tf.variable_scope('g_stats') as scope:
        if reuse:
            scope.reuse_variables()
        self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

    # Mini-batch sampling: label the staged tensors with the stage_shapes keys.
    staged = self.staging_tf.get()
    batch_tf = OrderedDict(
        (key, staged[idx]) for idx, key in enumerate(self.stage_shapes.keys()))
    batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

    # Bias terms come from a second staging area with its own shapes.
    staged_bias = self.staging_tf_new.get()
    bias_tf = OrderedDict(
        (key, staged_bias[idx]) for idx, key in enumerate(self.stage_shapes_new.keys()))
    bias_tf['bias'] = tf.reshape(bias_tf['bias'], [-1, 1])

    # Main and target networks; the target net is fed the next
    # observation/goal (o_2, g_2).
    with tf.variable_scope('main') as scope:
        if reuse:
            scope.reuse_variables()
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        scope.reuse_variables()
    with tf.variable_scope('target') as scope:
        if reuse:
            scope.reuse_variables()
        target_batch_tf = batch_tf.copy()
        target_batch_tf['o'] = batch_tf['o_2']
        target_batch_tf['g'] = batch_tf['g_2']
        self.target = self.create_actor_critic(
            target_batch_tf, net_type='target', **self.__dict__)
        scope.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target"))

    # Critic loss: the TD error against the clipped target
    # r + gamma * Q_target is scaled by the bias before squaring.
    next_q_tf = self.target.Q_pi_tf
    clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
    target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * next_q_tf, *clip_range)
    td_error = (tf.stop_gradient(target_tf) - self.main.Q_tf) * bias_tf['bias']
    self.Q_loss_tf = tf.reduce_mean(tf.square(td_error))

    # Actor loss: the bias is deliberately not applied here; an L2 penalty
    # on the action (scaled by max_u) is added.
    self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
    self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
        tf.square(self.main.pi_tf / self.max_u))

    # Gradients of each loss w.r.t. its own variables.
    q_vars = self._vars('main/Q')
    pi_vars = self._vars('main/pi')
    Q_grads_tf = tf.gradients(self.Q_loss_tf, q_vars)
    pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_vars)
    assert len(q_vars) == len(Q_grads_tf)
    assert len(pi_vars) == len(pi_grads_tf)
    self.Q_grads_vars_tf = zip(Q_grads_tf, q_vars)
    self.pi_grads_vars_tf = zip(pi_grads_tf, pi_vars)
    self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=q_vars)
    self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_vars)

    # Optimizers
    self.Q_adam = MpiAdam(q_vars, scale_grad_by_procs=False)
    self.pi_adam = MpiAdam(pi_vars, scale_grad_by_procs=False)

    # Variable groups, selected by scope name.
    self.main_vars = q_vars + pi_vars
    self.target_vars = self._vars('target/Q') + self._vars('target/pi')
    self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')

    # The target net is initialized by copying the main net and updated by
    # polyak averaging: target <- polyak * target + (1 - polyak) * main.
    self.init_target_net_op = [
        target_var.assign(main_var)
        for target_var, main_var in zip(self.target_vars, self.main_vars)
    ]
    self.update_target_net_op = [
        target_var.assign(self.polyak * target_var + (1. - self.polyak) * main_var)
        for target_var, main_var in zip(self.target_vars, self.main_vars)
    ]

    # Initialize all variables
    tf.variables_initializer(self._global_vars('')).run()
    self._sync_optimizers()
    self._init_target_net()
def _create_network(self, reuse=False):
    """Build an ensemble of value functions (no target networks).

    For every ensemble member ``e`` this creates a value function in scope
    ``ve_{e}``, a squared TD-error loss that bootstraps from the function's
    own ``V_2_tf`` output, its gradients and an ``MpiAdam`` optimizer. The
    results are stored per member in the ``self.V_*`` lists. Finally all
    global variables are initialized and the optimizers are synced.

    Requires a default TensorFlow session to be active.

    Args:
        reuse: when true, ``reuse_variables()`` is called on every variable
            scope before anything is built inside it.
    """
    self.sess = tf.get_default_session()
    assert self.sess is not None

    # Running averages used for observation and goal normalization.
    with tf.variable_scope('o_stats') as vs:
        if reuse:
            vs.reuse_variables()
        self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
    # Bind the g_stats scope itself: without `as vs` the reuse flag would be
    # set on the stale o_stats scope instead of this one.
    with tf.variable_scope('g_stats') as vs:
        if reuse:
            vs.reuse_variables()
        self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

    # Per-member containers.
    self.V_loss_tf = [None] * self.size_ensemble
    self.V_fun = [None] * self.size_ensemble
    self.V_grads_vars_tf = [None] * self.size_ensemble
    self.V_grad_tf = [None] * self.size_ensemble
    self.V_adam = [None] * self.size_ensemble

    clip_range = (-self.clip_return, 0. if self.clip_pos_returns else self.clip_return)

    for e in range(self.size_ensemble):
        # Mini-batch sampling: each member reads from its own staging area.
        batch = self.staging_tf[e].get()
        batch_tf = OrderedDict(
            (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()))
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

        # Value function of this member (no target network).
        with tf.variable_scope("ve_{}".format(e)) as vs:
            if reuse:
                vs.reuse_variables()
            v_function = self.create_v_function(batch_tf, **self.__dict__)
            vs.reuse_variables()

        # Loss: squared error against the clipped target r + gamma * V_2;
        # the target is excluded from the gradient.
        V_2_tf = v_function.V_2_tf
        target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * V_2_tf, *clip_range)
        V_loss_tf = tf.reduce_mean(
            tf.square(tf.stop_gradient(target_tf) - v_function.V_tf))

        member_vars = self._vars('ve_{}/V'.format(e))
        V_grads_tf = tf.gradients(V_loss_tf, member_vars)
        assert len(member_vars) == len(V_grads_tf)
        V_grads_vars_tf = zip(V_grads_tf, member_vars)
        V_grad_tf = flatten_grads(grads=V_grads_tf, var_list=member_vars)

        # Optimizer
        V_adam = MpiAdam(member_vars, scale_grad_by_procs=False)

        # Store in the attribute lists.
        self.V_loss_tf[e] = V_loss_tf
        self.V_fun[e] = v_function
        self.V_grads_vars_tf[e] = V_grads_vars_tf
        self.V_grad_tf[e] = V_grad_tf
        self.V_adam[e] = V_adam

    # Every member must own the same number of variables.
    n_vars = [
        len(self._vars("ve_{}".format(e))) for e in range(self.size_ensemble)
    ]
    assert np.all(np.asarray(n_vars) == n_vars[0]), n_vars

    # Initialize all variables
    tf.variables_initializer(self._global_vars('')).run()
    self._sync_optimizers()
def _create_network(self, reuse=False):
    """Build the TensorFlow graph of the DDPG agent with optional second critic.

    Creates the observation/goal normalizers, the ``main`` and ``target``
    actor-critic networks, the critic loss(es), the actor loss with its
    optional terms, their gradients, the ``MpiAdam`` optimizers and the ops
    that initialize and polyak-update the target network. Finally all global
    variables are initialized, the optimizers are synced and the target net
    is initialized.

    Behaviour switches read from ``self``: ``two_qs`` (second critic using
    ``r2`` and the ``main/2Q`` variables), ``terminate_bootstrapping``
    (bootstrap term multiplied by ``1 - successes``),
    ``nearby_action_penalty``, ``mask_q`` (actor loss starts from 0 instead
    of the critic value) and ``sample_expert`` (behaviour-cloning term).

    Args:
        reuse: passed as ``reuse=`` to every ``tf.variable_scope``.
    """
    self.sess = tf.get_default_session()
    if self.sess is None:
        self.sess = tf.InteractiveSession()

    # Running averages used for observation and goal normalization.
    with tf.variable_scope('o_stats', reuse=reuse):
        self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
    with tf.variable_scope('g_stats', reuse=reuse):
        self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

    # Mini-batch sampling: label the staged tensors with the stage_shapes keys.
    staged = self.staging_tf.get()
    batch_tf = OrderedDict(
        (key, staged[idx]) for idx, key in enumerate(self.stage_shapes.keys()))
    batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])
    batch_tf['successes'] = tf.reshape(batch_tf['successes'], [-1, 1])

    # Networks. The target net is fed the next observation/goal (o_2, g_2).
    with tf.variable_scope('main', reuse=reuse) as scope:
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        scope.reuse_variables()
    with tf.variable_scope('target', reuse=reuse) as scope:
        target_batch_tf = batch_tf.copy()
        target_batch_tf['o'] = batch_tf['o_2']
        target_batch_tf['g'] = batch_tf['g_2']
        self.target = self.create_actor_critic(
            target_batch_tf, net_type='target', **self.__dict__)
        scope.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target"))

    # Bootstrapped critic targets, clipped from above only.
    next_q_tf = self.target.Q_pi_tf
    if self.two_qs:
        next_q2_tf = self.target.Q2_pi_tf
    clip_range = (-np.inf, self.clip_return)
    if self.terminate_bootstrapping:
        # The bootstrap term is switched off where 'successes' is 1.
        target_tf = tf.clip_by_value(
            batch_tf['r'] + self.gamma * (1 - batch_tf['successes']) * next_q_tf,
            *clip_range)
        if self.two_qs:
            target2_tf = tf.clip_by_value(
                batch_tf['r2'] + self.gamma * (1 - batch_tf['successes']) * next_q2_tf,
                *clip_range)
    else:
        target_tf = tf.clip_by_value(
            batch_tf['r'] + self.gamma * next_q_tf, *clip_range)
        if self.two_qs:
            target2_tf = tf.clip_by_value(
                batch_tf['r2'] + self.gamma * next_q2_tf, *clip_range)

    if self.nearby_action_penalty:
        # Lower the target by the weighted distance between the actor's
        # action and the stored action, gated by 'far_from_goal'.
        target_tf -= tf.reshape(
            batch_tf['far_from_goal'] * self.nearby_penalty_weight
            * tf.norm(self.main.pi_tf - batch_tf['u'], axis=-1),
            (-1, 1))

    # Critic loss(es); the targets are excluded from the gradient.
    self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))
    if self.two_qs:
        self.Q2_loss_tf = tf.reduce_mean(
            tf.square(tf.stop_gradient(target2_tf) - self.main.Q2_tf))

    # Actor loss.
    if self.mask_q:
        self.pi_loss_tf = 0
    elif self.two_qs:
        # Per-sample blend of both critics using the 'w_q2' weights.
        self.pi_loss_tf = -tf.reduce_mean(
            (1 - batch_tf['w_q2'])[:, None] * self.main.Q_pi_tf
            + batch_tf['w_q2'][:, None] * self.main.Q2_pi_tf)
    else:
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
    self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf/ self.max_u))
    if self.sample_expert:
        # Behaviour-cloning term on demo samples; the leading factor is
        # reduced by anneal_bc where target Q_pi >= target Q.
        self.pi_loss_tf += (
            (1 - self.anneal_bc * tf.to_float(
                tf.greater_equal(self.target.Q_pi_tf, self.target.Q_tf)))
            * self.bc_loss
            * tf.reduce_mean(
                batch_tf['is_demo'] * batch_tf['annealing_factor']
                * tf.reduce_sum(tf.square(self.main.pi_tf - batch_tf['u']), axis=-1)))

    # Gradients of each loss w.r.t. its own variables.
    q_vars = self._vars('main/Q')
    Q_grads_tf = tf.gradients(self.Q_loss_tf, q_vars)
    if self.two_qs:
        q2_vars = self._vars('main/2Q')
        Q2_grads_tf = tf.gradients(self.Q2_loss_tf, q2_vars)
    else:
        q2_vars = []
    pi_vars = self._vars('main/pi')
    pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_vars)
    assert len(q_vars) == len(Q_grads_tf)
    assert len(pi_vars) == len(pi_grads_tf)

    self.Q_grads_vars_tf = zip(Q_grads_tf, q_vars)
    if self.two_qs:
        self.Q2_grads_vars_tf = zip(Q2_grads_tf, q2_vars)
    self.pi_grads_vars_tf = zip(pi_grads_tf, pi_vars)

    self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=q_vars)
    if self.two_qs:
        self.Q2_grad_tf = flatten_grads(grads=Q2_grads_tf, var_list=q2_vars)
    self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_vars)

    # Optimizers
    self.Q_adam = MpiAdam(q_vars, scale_grad_by_procs=False)
    if self.two_qs:
        self.Q2_adam = MpiAdam(q2_vars, scale_grad_by_procs=False)
    self.pi_adam = MpiAdam(pi_vars, scale_grad_by_procs=False)

    # Polyak averaging: target <- polyak * target + (1 - polyak) * main.
    self.main_vars = q_vars + pi_vars + q2_vars
    self.target_vars = (self._vars('target/Q') + self._vars('target/pi')
                        + (self._vars('target/2Q') if self.two_qs else []))
    self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
    self.init_target_net_op = [
        target_var.assign(main_var)
        for target_var, main_var in zip(self.target_vars, self.main_vars)
    ]
    self.update_target_net_op = [
        target_var.assign(self.polyak * target_var + (1. - self.polyak) * main_var)
        for target_var, main_var in zip(self.target_vars, self.main_vars)
    ]

    # Initialize all variables
    tf.variables_initializer(self._global_vars('')).run()
    self._sync_optimizers()
    self._init_target_net()
def _create_network(self, reuse=False):
    """Build the TensorFlow graph of the PGGD (policy-gradient) agent.

    Creates normalizers for observations, goals, ``G`` and ``sigma``, the
    ``main`` and ``target`` actor-critic networks, a policy loss equal to the
    mean of ``-G * log_prob`` of the main network's action probabilities,
    its gradients and an ``MpiAdam`` optimizer for the policy variables.
    No target-network init/update ops are built here. Finally all global
    variables are initialized and the optimizer is synced.

    Args:
        reuse: when true, ``reuse_variables()`` is called on every variable
            scope before anything is built inside it.
    """
    logger.info("Creating a PGGD agent with action space %d x %s..." % (self.dimu, self.max_u))
    self.sess = tf.get_default_session()
    if self.sess is None:
        self.sess = tf.InteractiveSession()

    # Running averages. The observation normalizer is one dimension smaller
    # when the environment name contains 'Variation'.
    with tf.variable_scope('o_stats') as scope:
        if reuse:
            scope.reuse_variables()
        o_stats_dim = self.dimo
        if 'Variation' in self.kwargs['info']['env_name']:
            print("Found Variation in env name")
            o_stats_dim -= 1
        self.o_stats = Normalizer(o_stats_dim, self.norm_eps, self.norm_clip, sess=self.sess)
    with tf.variable_scope('G_stats') as scope:
        if reuse:
            scope.reuse_variables()
        self.G_stats = Normalizer(1, self.norm_eps, self.norm_clip, sess=self.sess)
    with tf.variable_scope('sigma_stats') as scope:
        if reuse:
            scope.reuse_variables()
        self.sigma_stats = Normalizer(self.dimu, self.norm_eps, self.norm_clip, sess=self.sess)
    with tf.variable_scope('g_stats') as scope:
        if reuse:
            scope.reuse_variables()
        self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

    # Mini-batch sampling: label the staged tensors with the stage_shapes keys.
    staged = self.staging_tf.get()
    batch_tf = OrderedDict(
        (key, staged[idx]) for idx, key in enumerate(self.stage_shapes.keys()))
    batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])
    batch_tf['G'] = tf.reshape(batch_tf['G'], [-1, ])

    # Networks. The target net is fed the next observation/goal (o_2, g_2).
    with tf.variable_scope('main') as scope:
        if reuse:
            scope.reuse_variables()
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        scope.reuse_variables()
    with tf.variable_scope('target') as scope:
        if reuse:
            scope.reuse_variables()
        target_batch_tf = batch_tf.copy()
        target_batch_tf['o'] = batch_tf['o_2']
        target_batch_tf['g'] = batch_tf['g_2']
        self.target = self.create_actor_critic(
            target_batch_tf, net_type='target', **self.__dict__)
        scope.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target"))

    # Policy loss: probabilities are clipped to [1e-10, 1.0] before the log,
    # log-probs are summed over axis 1 and weighted by G.
    clipped_prob = tf.clip_by_value(self.main.a_prob_tf, 1e-10, 1.0)
    log_prob = tf.reduce_sum(tf.log(clipped_prob), axis=1)
    neg_weighted_log_prob = -tf.multiply(batch_tf['G'], log_prob)
    self.pi_loss_tf = tf.reduce_mean(neg_weighted_log_prob)

    # tf.gradients yields None for variables the loss does not depend on;
    # substitute zeros of the variable's shape
    # (see https://github.com/tensorflow/tensorflow/issues/783).
    raw_pi_grads = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
    pi_vars = self._vars('main/pi')
    pi_grads_tf = [
        grad if grad is not None else tf.zeros_like(var)
        for var, grad in zip(pi_vars, raw_pi_grads)
    ]
    assert len(self._vars('main/pi')) == len(pi_grads_tf)
    self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
    self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi'))

    # Optimizer
    self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False)

    # Normalizer variables of all four statistics scopes.
    self.stats_vars = (self._global_vars('o_stats') + self._global_vars('g_stats')
                       + self._global_vars('G_stats') + self._global_vars('sigma_stats'))

    # Initialize all variables
    tf.variables_initializer(self._global_vars('')).run()
    self._sync_optimizers()
def _create_network(self, reuse=False):
    """Build the TensorFlow graph of the DDPG agent (``Variation``-aware).

    Creates the observation/goal normalizers, the ``main`` and ``target``
    actor-critic networks, the critic and actor losses, their gradients
    (with missing gradients replaced by zeros), the ``MpiAdam`` optimizers
    and the ops that initialize and polyak-update the target network.
    Finally all global variables are initialized, the optimizers are synced
    and the target net is initialized.

    Args:
        reuse: when true, ``reuse_variables()`` is called on every variable
            scope before anything is built inside it.
    """
    logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))
    self.sess = tf.get_default_session()
    if self.sess is None:
        self.sess = tf.InteractiveSession()

    # Running averages. The observation normalizer is one dimension smaller
    # when the environment name contains 'Variation'.
    with tf.variable_scope('o_stats') as scope:
        if reuse:
            scope.reuse_variables()
        o_stats_dim = self.dimo
        if 'Variation' in self.kwargs['info']['env_name']:
            print("Found Variation in env name")
            o_stats_dim -= 1
        self.o_stats = Normalizer(o_stats_dim, self.norm_eps, self.norm_clip, sess=self.sess)
    with tf.variable_scope('g_stats') as scope:
        if reuse:
            scope.reuse_variables()
        self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

    # Mini-batch sampling: label the staged tensors with the stage_shapes keys.
    staged = self.staging_tf.get()
    batch_tf = OrderedDict(
        (key, staged[idx]) for idx, key in enumerate(self.stage_shapes.keys()))
    batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

    # Networks. The target net is fed the next observation/goal (o_2, g_2).
    with tf.variable_scope('main') as scope:
        if reuse:
            scope.reuse_variables()
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        scope.reuse_variables()
    with tf.variable_scope('target') as scope:
        if reuse:
            scope.reuse_variables()
        target_batch_tf = batch_tf.copy()
        target_batch_tf['o'] = batch_tf['o_2']
        target_batch_tf['g'] = batch_tf['g_2']
        self.target = self.create_actor_critic(
            target_batch_tf, net_type='target', **self.__dict__)
        scope.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target"))

    # Critic loss: squared error against the clipped target
    # r + gamma * Q_target; the target is excluded from the gradient.
    next_q_tf = self.target.Q_pi_tf
    clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
    target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * next_q_tf, *clip_range)
    self.Q_loss_tf = tf.reduce_mean(
        tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))

    # Actor loss: negated mean critic value plus an L2 penalty on the
    # action scaled by max_u.
    self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
    self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
        tf.square(self.main.pi_tf / self.max_u))

    def zero_filled(grads, var_list):
        # tf.gradients yields None for variables the loss does not depend on;
        # substitute zeros of the variable's shape
        # (see https://github.com/tensorflow/tensorflow/issues/783).
        return [
            tf.zeros_like(var) if grad is None else grad
            for var, grad in zip(var_list, grads)
        ]

    Q_grads_tf = zero_filled(
        tf.gradients(self.Q_loss_tf, self._vars('main/Q')), self._vars('main/Q'))
    pi_grads_tf = zero_filled(
        tf.gradients(self.pi_loss_tf, self._vars('main/pi')), self._vars('main/pi'))

    q_vars = self._vars('main/Q')
    pi_vars = self._vars('main/pi')
    assert len(q_vars) == len(Q_grads_tf)
    assert len(pi_vars) == len(pi_grads_tf)
    self.Q_grads_vars_tf = zip(Q_grads_tf, q_vars)
    self.pi_grads_vars_tf = zip(pi_grads_tf, pi_vars)
    self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=q_vars)
    self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_vars)

    # Optimizers
    self.Q_adam = MpiAdam(q_vars, scale_grad_by_procs=False)
    self.pi_adam = MpiAdam(pi_vars, scale_grad_by_procs=False)

    # Polyak averaging: target <- polyak * target + (1 - polyak) * main.
    self.main_vars = q_vars + pi_vars
    self.target_vars = self._vars('target/Q') + self._vars('target/pi')
    self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
    self.init_target_net_op = [
        target_var.assign(main_var)
        for target_var, main_var in zip(self.target_vars, self.main_vars)
    ]
    self.update_target_net_op = [
        target_var.assign(self.polyak * target_var + (1. - self.polyak) * main_var)
        for target_var, main_var in zip(self.target_vars, self.main_vars)
    ]

    # Initialize all variables
    tf.variables_initializer(self._global_vars('')).run()
    self._sync_optimizers()
    self._init_target_net()
def _create_network(self, reuse=False):
    """Build the TensorFlow graph of the DDPG agent.

    Creates the observation/goal normalizers, the ``main`` and ``target``
    actor-critic networks, the critic and actor losses, their gradients,
    the ``MpiAdam`` optimizers and the ops that initialize and polyak-update
    the target network. Finally all global variables are initialized, the
    optimizers are synced and the target net is initialized.

    Args:
        reuse: when true, ``reuse_variables()`` is called on every variable
            scope before anything is built inside it.
    """
    logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))
    self.sess = tf.get_default_session()
    if self.sess is None:
        self.sess = tf.InteractiveSession()

    # Running averages used for observation and goal normalization.
    with tf.variable_scope('o_stats') as scope:
        if reuse:
            scope.reuse_variables()
        self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
    with tf.variable_scope('g_stats') as scope:
        if reuse:
            scope.reuse_variables()
        self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

    # Mini-batch sampling: label the staged tensors with the stage_shapes keys.
    staged = self.staging_tf.get()
    batch_tf = OrderedDict(
        (key, staged[idx]) for idx, key in enumerate(self.stage_shapes.keys()))
    batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

    # Networks. The target net is fed the next observation/goal (o_2, g_2).
    with tf.variable_scope('main') as scope:
        if reuse:
            scope.reuse_variables()
        self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
        scope.reuse_variables()
    with tf.variable_scope('target') as scope:
        if reuse:
            scope.reuse_variables()
        target_batch_tf = batch_tf.copy()
        target_batch_tf['o'] = batch_tf['o_2']
        target_batch_tf['g'] = batch_tf['g_2']
        self.target = self.create_actor_critic(
            target_batch_tf, net_type='target', **self.__dict__)
        scope.reuse_variables()
    assert len(self._vars("main")) == len(self._vars("target"))

    # Critic loss: squared error against the clipped target
    # r + gamma * Q_target; the target is excluded from the gradient.
    next_q_tf = self.target.Q_pi_tf
    clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
    target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * next_q_tf, *clip_range)
    self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))

    # Actor loss: negated mean critic value plus an L2 penalty on the
    # action scaled by max_u.
    self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
    self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
        tf.square(self.main.pi_tf / self.max_u))

    # Gradients of each loss w.r.t. its own variables.
    q_vars = self._vars('main/Q')
    pi_vars = self._vars('main/pi')
    Q_grads_tf = tf.gradients(self.Q_loss_tf, q_vars)
    pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_vars)
    assert len(q_vars) == len(Q_grads_tf)
    assert len(pi_vars) == len(pi_grads_tf)
    self.Q_grads_vars_tf = zip(Q_grads_tf, q_vars)
    self.pi_grads_vars_tf = zip(pi_grads_tf, pi_vars)
    self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=q_vars)
    self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_vars)

    # Optimizers
    self.Q_adam = MpiAdam(q_vars, scale_grad_by_procs=False)
    self.pi_adam = MpiAdam(pi_vars, scale_grad_by_procs=False)

    # Polyak averaging: target <- polyak * target + (1 - polyak) * main.
    self.main_vars = q_vars + pi_vars
    self.target_vars = self._vars('target/Q') + self._vars('target/pi')
    self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
    self.init_target_net_op = [
        target_var.assign(main_var)
        for target_var, main_var in zip(self.target_vars, self.main_vars)
    ]
    self.update_target_net_op = [
        target_var.assign(self.polyak * target_var + (1. - self.polyak) * main_var)
        for target_var, main_var in zip(self.target_vars, self.main_vars)
    ]

    # Initialize all variables
    tf.variables_initializer(self._global_vars('')).run()
    self._sync_optimizers()
    self._init_target_net()