import tensorflow as tf

# NOTE: `cat_entropy` and `kl_divergence` used below are project-level helpers
# (categorical entropy and KL divergence between softmax distributions) and are
# expected to be imported alongside this module.


def aac_loss_def(act_target, adv_target, r_target, pi_logits, pi_vf, pi_prime_logits,
                 entropy_beta, epsilon=None, name='_aac_', verbose=False):
    """
    Advantage Actor Critic loss definition.
    Paper: https://arxiv.org/abs/1602.01783

    Args:
        act_target:         tensor holding policy actions targets;
        adv_target:         tensor holding policy estimated advantages targets;
        r_target:           tensor holding policy empirical returns targets;
        pi_logits:          policy logits output tensor;
        pi_vf:              policy value function output tensor;
        pi_prime_logits:    not used;
        entropy_beta:       entropy regularization constant;
        epsilon:            not used;
        name:               scope;
        verbose:            summary level.

    Returns:
        tensor holding estimated AAC loss;
        list of related tensorboard summaries.
    """
    with tf.name_scope(name + '/aac'):
        neg_pi_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=pi_logits,
            labels=act_target
        )
        pi_loss = tf.reduce_mean(neg_pi_log_prob * adv_target)
        vf_loss = 0.5 * tf.losses.mean_squared_error(r_target, pi_vf)
        entropy = tf.reduce_mean(cat_entropy(pi_logits))

        loss = pi_loss + vf_loss - entropy * entropy_beta

        mean_vf = tf.reduce_mean(pi_vf)
        mean_t_target = tf.reduce_mean(r_target)

        summaries = [
            tf.summary.scalar('policy_loss', pi_loss),
            tf.summary.scalar('value_loss', vf_loss),
        ]
        if verbose:
            summaries += [
                tf.summary.scalar('entropy', entropy),
                tf.summary.scalar('value_fn', mean_vf),
                # tf.summary.scalar('empirical_return', mean_t_target),
                # tf.summary.histogram('value_fn', pi_vf),
                # tf.summary.histogram('empirical_return', r_target),
            ]

    return loss, summaries
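# --- Usage sketch (illustrative only) -----------------------------------------
# A minimal wiring example for `aac_loss_def`, assuming a one-hot 4-action space
# and a toy dense policy/value head; placeholder names, shapes and the helper
# name `_example_aac_wiring` are hypothetical, not part of this module.
def _example_aac_wiring(n_actions=4, feature_dim=32):
    act_pl = tf.placeholder(tf.float32, [None, n_actions], name='action_pl')  # one-hot actions
    adv_pl = tf.placeholder(tf.float32, [None], name='advantage_pl')
    r_pl = tf.placeholder(tf.float32, [None], name='return_pl')
    features = tf.placeholder(tf.float32, [None, feature_dim], name='features_pl')

    pi_logits = tf.layers.dense(features, n_actions, name='pi_logits')
    pi_vf = tf.squeeze(tf.layers.dense(features, 1, name='pi_vf'), axis=-1)

    loss, summaries = aac_loss_def(
        act_target=act_pl,
        adv_target=adv_pl,
        r_target=r_pl,
        pi_logits=pi_logits,
        pi_vf=pi_vf,
        pi_prime_logits=None,  # unused by this loss
        entropy_beta=0.01,
        verbose=True,
    )
    train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)
    summary_op = tf.summary.merge(summaries)
    return train_op, summary_op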
def meta_loss_def_1_0(
        act_target_train,
        act_target_test,
        adv_target_train,
        adv_target_test,
        r_target_train,
        r_target_test,
        pi_logits_train,
        pi_logits_test,
        pi_vf_train,
        pi_vf_test,
        pi_prime_logits,
        entropy_beta,
        epsilon=None,
        name='_meta_',
        verbose=False
):
    """
    Meta-update loss over paired 'train' (support) and 'test' (query) rollouts:
    an AAC-style policy term weighted by test advantages, value terms for both
    value heads, and an entropy bonus on the test policy.

    Args:
        act_target_train/_test:     tensors holding policy actions targets;
        adv_target_train/_test:     tensors holding policy estimated advantages targets;
        r_target_train/_test:       tensors holding policy empirical returns targets;
        pi_logits_train/_test:      policy logits output tensors;
        pi_vf_train/_test:          policy value function output tensors;
        pi_prime_logits:            not used;
        entropy_beta:               entropy regularization constant;
        epsilon:                    not used;
        name:                       scope;
        verbose:                    summary level.

    Returns:
        tensor holding estimated meta-loss;
        list of related tensorboard summaries.
    """
    with tf.name_scope(name + '/meta'):
        neg_pi_log_prob_train = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=pi_logits_train,
            labels=act_target_train
        )
        neg_pi_log_prob_test = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=pi_logits_test,
            labels=act_target_test
        )
        # Policy term: sum of train and test negative log-probabilities,
        # both weighted by test advantages:
        pi_loss = tf.reduce_mean(
            (neg_pi_log_prob_train + neg_pi_log_prob_test) * adv_target_test
        )
        # Value terms: as written, both value heads are regressed towards
        # test returns (r_target_train is left unused):
        vf_loss_train = 0.5 * tf.losses.mean_squared_error(r_target_test, pi_vf_train)
        vf_loss_test = 0.5 * tf.losses.mean_squared_error(r_target_test, pi_vf_test)
        entropy = tf.reduce_mean(cat_entropy(pi_logits_test))

        loss = pi_loss + vf_loss_test + vf_loss_train - entropy * entropy_beta

        mean_vf_test = tf.reduce_mean(pi_vf_test)
        mean_vf_train = tf.reduce_mean(pi_vf_train)

        summaries = [
            tf.summary.scalar('meta_policy_loss', pi_loss),
            tf.summary.scalar('meta_value_loss_test', vf_loss_test),
        ]
        if verbose:
            summaries += [
                tf.summary.scalar('entropy', entropy),
                tf.summary.scalar('value_fn_test', mean_vf_test),
                tf.summary.scalar('value_fn_train', mean_vf_train),
            ]

    return loss, summaries
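# --- Usage sketch (illustrative only) -----------------------------------------
# Call-shape example for `meta_loss_def_1_0` over a support ('train') and a
# query ('test') rollout. Plain placeholders stand in for real policy outputs;
# the helper name `_example_meta_wiring` and the 4-action space are assumptions.
def _example_meta_wiring(n_actions=4):
    def rollout_pl(tag):
        act = tf.placeholder(tf.float32, [None, n_actions], name='act_{}_pl'.format(tag))
        adv = tf.placeholder(tf.float32, [None], name='adv_{}_pl'.format(tag))
        ret = tf.placeholder(tf.float32, [None], name='r_{}_pl'.format(tag))
        logits = tf.placeholder(tf.float32, [None, n_actions], name='logits_{}_pl'.format(tag))
        vf = tf.placeholder(tf.float32, [None], name='vf_{}_pl'.format(tag))
        return act, adv, ret, logits, vf

    act_tr, adv_tr, r_tr, logits_tr, vf_tr = rollout_pl('train')
    act_ts, adv_ts, r_ts, logits_ts, vf_ts = rollout_pl('test')

    meta_loss, meta_summaries = meta_loss_def_1_0(
        act_target_train=act_tr, act_target_test=act_ts,
        adv_target_train=adv_tr, adv_target_test=adv_ts,
        r_target_train=r_tr, r_target_test=r_ts,
        pi_logits_train=logits_tr, pi_logits_test=logits_ts,
        pi_vf_train=vf_tr, pi_vf_test=vf_ts,
        pi_prime_logits=None,  # unused
        entropy_beta=0.01,
        verbose=True,
    )
    return meta_loss, meta_summaries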
def ppo_loss_def(act_target, adv_target, r_target, pi_logits, pi_vf, pi_prime_logits,
                 entropy_beta, epsilon, name='_ppo_', verbose=False):
    """
    PPO clipped surrogate loss definition, as (7) in https://arxiv.org/pdf/1707.06347.pdf

    Args:
        act_target:         tensor holding policy actions targets;
        adv_target:         tensor holding policy estimated advantages targets;
        r_target:           tensor holding policy empirical returns targets;
        pi_logits:          policy logits output tensor;
        pi_vf:              policy value function output tensor;
        pi_prime_logits:    old_policy logits output tensor;
        entropy_beta:       entropy regularization constant;
        epsilon:            L^Clip epsilon tensor;
        name:               scope;
        verbose:            summary level.

    Returns:
        tensor holding estimated PPO L^Clip loss;
        list of related tensorboard summaries.
    """
    # act_target = tf.placeholder(tf.float32, [None, env.action_space.n], name="on_policy_action_pl")
    # adv_target = tf.placeholder(tf.float32, [None], name="on_policy_advantage_pl")
    # r_target = tf.placeholder(tf.float32, [None], name="on_policy_return_pl")
    with tf.name_scope(name + '/ppo'):
        pi_log_prob = - tf.nn.softmax_cross_entropy_with_logits(
            logits=pi_logits,
            labels=act_target
        )
        pi_old_log_prob = tf.stop_gradient(
            - tf.nn.softmax_cross_entropy_with_logits(
                logits=pi_prime_logits,
                labels=act_target
            )
        )
        pi_ratio = tf.exp(pi_log_prob - pi_old_log_prob)

        surr1 = pi_ratio * adv_target  # surrogate from conservative policy iteration
        surr2 = tf.clip_by_value(pi_ratio, 1.0 - epsilon, 1.0 + epsilon) * adv_target

        pi_surr_loss = - tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
        vf_loss = tf.losses.mean_squared_error(r_target, pi_vf)  # value fn. loss
        entropy = tf.reduce_mean(cat_entropy(pi_logits))

        loss = pi_surr_loss + vf_loss - entropy * entropy_beta

        # Info:
        mean_pi_ratio = tf.reduce_mean(pi_ratio)
        mean_vf = tf.reduce_mean(pi_vf)
        mean_kl_old_new = tf.reduce_mean(kl_divergence(pi_prime_logits, pi_logits))

        summaries = [
            tf.summary.scalar('l_clip_loss', pi_surr_loss),
            tf.summary.scalar('value_loss', vf_loss),
        ]
        if verbose:
            summaries += [
                tf.summary.scalar('entropy', entropy),
                tf.summary.scalar('Dkl_old_new', mean_kl_old_new),
                tf.summary.scalar('pi_ratio', mean_pi_ratio),
                tf.summary.scalar('value_f', mean_vf),
            ]

    return loss, summaries
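# --- Usage sketch (illustrative only) -----------------------------------------
# PPO wiring example with a current and an 'old' (prime) policy head over shared
# features. Gradients do not flow into the prime head: its log-probability is
# wrapped in tf.stop_gradient inside `ppo_loss_def`. All names and shapes below
# are assumptions for demonstration.
def _example_ppo_wiring(n_actions=4, feature_dim=32):
    act_pl = tf.placeholder(tf.float32, [None, n_actions], name='action_pl')
    adv_pl = tf.placeholder(tf.float32, [None], name='advantage_pl')
    r_pl = tf.placeholder(tf.float32, [None], name='return_pl')
    features = tf.placeholder(tf.float32, [None, feature_dim], name='features_pl')

    pi_logits = tf.layers.dense(features, n_actions, name='pi')
    pi_vf = tf.squeeze(tf.layers.dense(features, 1, name='vf'), axis=-1)
    pi_prime_logits = tf.layers.dense(features, n_actions, name='pi_prime')  # old policy snapshot

    loss, summaries = ppo_loss_def(
        act_target=act_pl,
        adv_target=adv_pl,
        r_target=r_pl,
        pi_logits=pi_logits,
        pi_vf=pi_vf,
        pi_prime_logits=pi_prime_logits,
        entropy_beta=0.01,
        epsilon=tf.constant(0.2),  # L^CLIP epsilon; can also be an annealed tensor
        verbose=True,
    )
    train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)
    return train_op, tf.summary.merge(summaries)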
def __init__(
        self,
        data_batch,
        labels_batch=None,
        keep_prob=tf.ones([], ),
        activation=tf.nn.elu,
        name='model',
        reuse=False
):
    """
    Builds a 1d-convolutional encoder with two output heads: a regression head
    predicting the log-transformed target sum and a binary classification head
    predicting the target flag.

    Args:
        data_batch:     dict of input tensors, expects key 'features';
        labels_batch:   optional dict of label tensors, expects keys 'target_sum'
                        and 'target_flag'; if None, no losses or metrics are built;
        keep_prob:      dropout keep probability tensor;
        activation:     activation function;
        name:           variable scope name;
        reuse:          variable scope reuse flag.
    """
    self.data_batch = data_batch
    self.labels_batch = labels_batch

    with tf.variable_scope(name_or_scope=name, reuse=reuse):
        hidden = conv_1d_casual_attention_encoder(
            data_batch['features'],
            keep_prob=keep_prob,
            conv_1d_num_filters=64,
            conv_1d_filter_size=2,
            conv_1d_activation=activation,
            reuse=False,
        )
        hidden = tf.layers.flatten(hidden)
        # print(hidden.shape)

        # hidden = tf.layers.dense(
        #     inputs=hidden,
        #     units=512,
        #     activation=activation,
        # )
        hidden = noisy_linear(
            x=hidden,
            size=64,
            activation_fn=activation,
            name='dense1'
        )
        hidden = tf.nn.dropout(hidden, keep_prob=keep_prob)

        # Regression head over the log-transformed target sum:
        self.predicted_log_sum = tf.layers.dense(
            inputs=hidden,
            units=1,
            activation=activation,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
        )
        # self.predicted_log_sum = noisy_linear(
        #     x=hidden,
        #     size=1,
        #     activation_fn=activation,
        #     name='log_sum'
        # )
        # Inverse of a log1p-style transform, clipped to a non-negative range:
        self.predicted_target_sum = tf.clip_by_value(
            tf.exp(self.predicted_log_sum) - 1,
            clip_value_min=0,
            clip_value_max=1e20
        )
        # Binary classification head over the target flag:
        self.predicted_flag_logits = tf.layers.dense(
            inputs=hidden,
            units=2,
            activation=activation,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
        )
        # self.predicted_flag_logits = noisy_linear(
        #     x=tf.concat([hidden, self.predicted_log_sum], axis=-1),
        #     size=2,
        #     activation_fn=activation,
        #     name='flag'
        # )
        self.predicted_flag_probs = tf.nn.softmax(self.predicted_flag_logits)
        self.predicted_flag = tf.argmax(
            self.predicted_flag_probs,
            axis=-1
        )
        self.class_entropy = tf.reduce_mean(cat_entropy(self.predicted_flag_logits))

        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)

        if labels_batch is not None:
            # RMSE on the log-sum regression head:
            self.regress_loss = tf.losses.mean_squared_error(
                labels=labels_batch['target_sum'][..., None],
                predictions=self.predicted_log_sum
            )
            self.regress_loss = tf.sqrt(self.regress_loss)

            self.class_loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=self.predicted_flag_logits,
                    labels=labels_batch['target_flag'],
                )
            )
            self.auc, self.auc_update_op = tf.metrics.auc(
                labels=labels_batch['target_flag'],
                predictions=self.predicted_flag_probs,
                weights=None,
                num_thresholds=200,
            )
        else:
            self.regress_loss = None
            self.class_loss = None
            self.auc = 0
            self.auc_update_op = None
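# --- Usage sketch (illustrative only) -----------------------------------------
# Hypothetical instantiation of the model above. The class name `Model`, the
# [batch, time, channels] feature shape and the label keys are inferred from how
# __init__ indexes its inputs; all of them are assumptions, not definitions
# taken from this file.
def _example_model_wiring():
    data_batch = {
        'features': tf.placeholder(tf.float32, [None, 128, 4], name='features_pl'),
    }
    labels_batch = {
        'target_sum': tf.placeholder(tf.float32, [None], name='target_sum_pl'),
        'target_flag': tf.placeholder(tf.float32, [None, 2], name='target_flag_pl'),  # one-hot labels
    }
    keep_prob_pl = tf.placeholder_with_default(tf.ones([]), shape=[], name='keep_prob_pl')

    model = Model(  # `Model` is a placeholder name for the class defining __init__ above
        data_batch=data_batch,
        labels_batch=labels_batch,
        keep_prob=keep_prob_pl,
        name='model',
    )
    total_loss = model.regress_loss + model.class_loss
    train_op = tf.train.AdamOptimizer(1e-3).minimize(total_loss, var_list=model.var_list)
    return train_op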
def _make_loss(self, pi, pi_prime, name='base', verbose=True, **kwargs):
    """
    Defines policy state encoder classification loss, placeholders and summaries.

    Args:
        pi:         policy network obj.
        pi_prime:   optional policy network obj.
        name:       str, name scope
        verbose:    summary level

    Returns:
        tensor holding estimated loss graph
        list of related summaries
    """
    with tf.name_scope(name):
        # On-policy AAC loss definition:
        pi.on_pi_act_target = tf.placeholder(
            tf.float32, [None, self.ref_env.action_space.n], name="on_policy_action_pl")
        pi.on_pi_adv_target = tf.placeholder(tf.float32, [None], name="on_policy_advantage_pl")
        pi.on_pi_r_target = tf.placeholder(tf.float32, [None], name="on_policy_return_pl")

        clip_epsilon = tf.cast(self.clip_epsilon * self.learn_rate_decayed / self.opt_learn_rate, tf.float32)

        on_pi_loss, on_pi_summaries = self.on_policy_loss(
            act_target=pi.on_pi_act_target,
            adv_target=pi.on_pi_adv_target,
            r_target=pi.on_pi_r_target,
            pi_logits=pi.on_logits,
            pi_vf=pi.on_vf,
            pi_prime_logits=pi_prime.on_logits,
            entropy_beta=self.model_beta,
            epsilon=clip_epsilon,
            name='on_policy',
            verbose=verbose
        )
        # Classification loss for price movements prediction:

        # oracle_labels = tf.one_hot(tf.argmax(pi.expert_actions, axis=-1), depth=4)

        if self.class_use_rnn:
            class_logits = pi.on_logits

        else:
            class_logits = pi.on_simple_logits

        # class_loss = tf.reduce_mean(
        #     tf.nn.softmax_cross_entropy_with_logits_v2(
        #         labels=pi.expert_actions,  # oracle_labels,
        #         logits=class_logits,
        #     )
        # )
        class_loss = tf.losses.mean_squared_error(
            labels=pi.expert_actions[..., 1:3],
            predictions=tf.nn.softmax(class_logits)[..., 1:3],
        )
        entropy = tf.reduce_mean(cat_entropy(class_logits))

        # self.accuracy = tf.metrics.accuracy(
        #     labels=tf.argmax(pi.expert_actions, axis=-1),
        #     predictions=tf.argmax(class_logits, axis=-1)
        # )
        self.accuracy = tf.metrics.accuracy(
            labels=tf.argmax(pi.expert_actions[..., 1:3], axis=-1),
            predictions=tf.argmax(class_logits[..., 1:3], axis=-1)
        )
        model_summaries = [
            tf.summary.scalar('class_loss', class_loss),
            tf.summary.scalar('class_accuracy', self.accuracy[0])
        ]
        # Accumulate total loss:
        loss = float(self.class_lambda) * class_loss + float(self.aac_lambda) * on_pi_loss \
            - float(self.model_beta) * entropy

        model_summaries += on_pi_summaries

    return loss, model_summaries