def _create_actor_network(self, joints_input, is_online, reuse_flag):
    name_prefix = '{}_actor_{}'.format(self.name_prefix, 'online' if is_online else 'target')
    activation = get_activation(self.config['action_predictor']['activation'])
    # hidden layer sizes, plus an output layer with one unit per joint
    layers = self.config['action_predictor']['layers'] + [self.number_of_joints]
    current = self._generate_policy_features(joints_input, name_prefix, reuse_flag)
    for i, layer_size in enumerate(layers[:-1]):
        current = tf.layers.dense(
            current, layer_size, activation=activation,
            name='{}_{}'.format(name_prefix, i), reuse=reuse_flag)
    # output head: tanh squashing followed by L2 normalization yields a unit-length action direction
    tanh_preactivation = tf.layers.dense(
        current, layers[-1], activation=None,
        name='{}_tanh'.format(name_prefix), reuse=reuse_flag)
    action = tf.nn.l2_normalize(tf.nn.tanh(tanh_preactivation), 1)
    return action, tanh_preactivation
def create_reward_network(
        self, joints_inputs, action_inputs, goal_joints_inputs, goal_pose_inputs, images_3d):
    name_prefix = 'reward'
    # get the next joints
    clipped_next_joints, unclipped_next_joints = self._next_state_model(joints_inputs, action_inputs)

    # predict the transition classification
    layers = self.config['reward']['layers'] + [3]
    scale = 0.0
    if 'l2_regularization_coefficient' in self.config['reward']:
        scale = self.config['reward']['l2_regularization_coefficient']
    current = tf.concat(
        (clipped_next_joints, self._generate_goal_features(goal_joints_inputs, goal_pose_inputs)), axis=1)
    # add vision if needed
    if self.is_vision_enabled:
        visual_inputs = DqnModel(name_prefix).predict(images_3d, self._reuse_flag)
        current = tf.concat((current, visual_inputs), axis=1)
    for i, layer_size in enumerate(layers):
        _activation = None if i == len(layers) - 1 else get_activation(self.config['reward']['activation'])
        current = tf.layers.dense(
            current, layer_size, activation=_activation,
            name='{}_layers_{}'.format(name_prefix, i),
            kernel_regularizer=tf_layers.l2_regularizer(scale),
            reuse=self._reuse_flag)
    softmax_logits = current
    softmax_res = tf.nn.softmax(softmax_logits)

    # if a one-hot label is fed, is_labeled will be 1.0; otherwise it will be 0.0
    is_labeled = tf.expand_dims(tf.reduce_max(self.transition_label, axis=1), axis=1)
    # use the ground-truth label when available, otherwise fall back to the predicted class probabilities
    reward_calculation_input = self.transition_label + tf.multiply(1.0 - is_labeled, softmax_res)

    # get the classification reward
    classification_reward = tf.layers.dense(
        reward_calculation_input, 1, activation=None, use_bias=False,
        name='{}_classification_reward'.format(name_prefix), reuse=self._reuse_flag)

    # get the clipping-related reward
    # clipped_difference = tf.expand_dims(tf.norm(unclipped_next_joints - clipped_next_joints, axis=1), axis=1)  # this is the original
    # clipped_difference = tf.expand_dims(tf.reduce_sum(tf.zeros_like(clipped_next_joints), axis=1), axis=1)  # this variant passes no gradient back through the clipping term
    clipped_difference = tf.expand_dims(
        tf.reduce_sum(tf.abs(unclipped_next_joints - clipped_next_joints), axis=1), axis=1)
    clipping_reward = tf.layers.dense(
        clipped_difference, 1, activation=None, use_bias=False,
        name='{}_clipping_weight'.format(name_prefix), reuse=self._reuse_flag)

    total_reward = classification_reward + clipping_reward
    self._reuse_flag = True
    return total_reward, softmax_logits
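# Hypothetical mini-example (not from the original source): illustrates the labeled/unlabeled
# gating used in create_reward_network above, in plain numpy. When transition_label is one-hot,
# is_labeled == 1.0 and the label is used as-is; when it is all zeros, the softmax prediction is
# used instead. The helper name _blend_label_with_prediction is an assumption for illustration.
import numpy as np

def _blend_label_with_prediction(transition_label, softmax_res):
    is_labeled = np.max(transition_label, axis=1, keepdims=True)
    return transition_label + (1.0 - is_labeled) * softmax_res

# labeled transition: keeps the one-hot label
# _blend_label_with_prediction(np.array([[0., 1., 0.]]), np.array([[0.2, 0.5, 0.3]]))  -> [[0., 1., 0.]]
# unlabeled transition: falls back to the predicted class probabilities
# _blend_label_with_prediction(np.array([[0., 0., 0.]]), np.array([[0.2, 0.5, 0.3]]))  -> [[0.2, 0.5, 0.3]]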
def _create_critic_network(self, joints_input, action_input, is_online, reuse_flag, add_regularization_loss):
    name_prefix = '{}_critic_{}'.format(self.name_prefix, 'online' if is_online else 'target')
    layers_before_action = self.config['critic']['layers_before_action']
    layers_after_action = self.config['critic']['layers_after_action']
    activation = get_activation(self.config['critic']['activation'])

    current = self._generate_policy_features(joints_input, name_prefix, reuse_flag)
    scale = self.config['critic']['l2_regularization_coefficient'] if add_regularization_loss else 0.0
    # state-only layers
    for i, layer_size in enumerate(layers_before_action):
        current = tf.layers.dense(
            current, layer_size, activation=activation,
            name='{}_before_action_{}'.format(name_prefix, i), reuse=reuse_flag,
            kernel_regularizer=layers.l2_regularizer(scale))
    # inject the action and continue with the joint state-action layers
    current = tf.concat((current, action_input), axis=1)
    for i, layer_size in enumerate(layers_after_action):
        _activation = None if i == len(layers_after_action) - 1 else activation
        current = tf.layers.dense(
            current, layer_size, activation=_activation,
            name='{}_after_action_{}'.format(name_prefix, i), reuse=reuse_flag,
            kernel_regularizer=layers.l2_regularizer(scale))

    if self.config['critic']['last_layer_tanh']:
        # bounded head: tanh output scaled by a learnable positive stretch factor
        q_val = tf.layers.dense(
            current, 1, activation=tf.nn.tanh,
            name='{}_tanh_layer'.format(name_prefix), reuse=reuse_flag,
            kernel_regularizer=layers.l2_regularizer(scale))
        q_val_with_stretch = tf.layers.dense(
            tf.ones_like(q_val), 1, activation=tf.abs, use_bias=False,
            name='{}_stretch'.format(name_prefix), reuse=reuse_flag,
            kernel_regularizer=layers.l2_regularizer(scale)) * q_val
        return q_val_with_stretch
        # gamma = self.config['model']['gamma']
        # stretch = 1.0 / (1.0 - gamma)
        # q_val_with_stretch = stretch * q_val
    else:
        # unbounded linear head
        q_val = tf.layers.dense(
            current, 1, activation=None,
            name='{}_linear_layer'.format(name_prefix), reuse=reuse_flag,
            kernel_regularizer=layers.l2_regularizer(scale))
        return q_val
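# Hypothetical usage sketch (not part of the original source): one way the builders above might be
# wired together for an online/target actor-critic pair, in the DDPG style this code suggests.
# The method name _build_actor_critic_sketch and its arguments are assumptions for illustration;
# only _create_actor_network and _create_critic_network come from the code above.
def _build_actor_critic_sketch(self, joints_input, action_input):
    # online networks are the ones trained directly; target networks are updated separately
    online_action, _ = self._create_actor_network(joints_input, is_online=True, reuse_flag=False)
    online_q = self._create_critic_network(
        joints_input, action_input, is_online=True, reuse_flag=False, add_regularization_loss=True)
    # the target critic is evaluated on the target actor's action when forming bootstrap targets
    target_action, _ = self._create_actor_network(joints_input, is_online=False, reuse_flag=False)
    target_q = self._create_critic_network(
        joints_input, target_action, is_online=False, reuse_flag=False, add_regularization_loss=False)
    return online_action, online_q, target_q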