def create_pretrain_loss(self):
    pretrain_loss = self.loss_for_reconstruction()
    # training updates
    train_op = get_train_op(self.learning_rate, pretrain_loss, self.g_params, self.clip_val)
    return pretrain_loss, train_op
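# Hypothetical sketch of the shared `get_train_op` helper assumed by the calls in
# this section: an Adam optimizer with optional global-norm gradient clipping.
# The signature is inferred from the call sites; the project's actual helper may
# differ (optimizer choice, clipping scheme, global-step handling).
import tensorflow as tf


def get_train_op(learning_rate, loss, variables, clip_val=None):
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
    grads = tf.gradients(loss, variables)
    if clip_val is not None:
        # Clip by global norm to keep the policy-gradient updates stable.
        grads, _ = tf.clip_by_global_norm(grads, clip_val)
    return optimizer.apply_gradients(zip(grads, variables))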
def create_critic_loss(self, cumulative_rewards, missing=None):
    # Weight the regression so that only the selected positions contribute.
    if missing is not None:
        missing = tf.cast(missing, tf.float32)
    else:
        missing = 1.0
    loss = tf.compat.v1.losses.mean_squared_error(
        labels=cumulative_rewards,
        predictions=self.estimated_values,
        weights=missing)
    critic_vars = [
        v for v in tf.compat.v1.trainable_variables()
        if v.op.name.startswith(self.name)
    ]
    train_op = get_train_op(self.generator.learning_rate, loss, critic_vars)
    return loss, train_op
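# Toy NumPy check of the masked regression above (hypothetical values). With 0/1
# weights, tf.compat.v1.losses.mean_squared_error's default SUM_BY_NONZERO_WEIGHTS
# reduction averages the squared error over the weighted positions only.
import numpy as np

returns   = np.array([[1.0, 0.5, 0.2, 0.0]])
estimates = np.array([[0.8, 0.9, 0.2, 1.0]])
mask      = np.array([[1.0, 1.0, 0.0, 0.0]])   # 1 = position counted by the critic loss
masked_mse = ((returns - estimates) ** 2 * mask).sum() / np.count_nonzero(mask)
print(masked_mse)   # 0.1 -> only the first two positions are regressed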
def create_pretrain_loss(self):
    pretrain_recon_loss = self.loss_for_reconstruction()
    # Cross-entropy of the predicted action distribution against the taken
    # actions, averaged over the batch and sequence.
    pretrain_act_loss = -tf.reduce_sum(
        tf.one_hot(tf.cast(tf.reshape(self.acts, [-1]), tf.int32),
                   self.n_actions, 1.0, 0.0) *
        clip_and_log(tf.reshape(self.a_predictions, [-1, self.n_actions]))
    ) / (self.sequence_length * self.batch_size)
    # training updates
    pretrain_loss = pretrain_recon_loss + pretrain_act_loss
    train_op = get_train_op(self.learning_rate, pretrain_loss, self.g_params)
    return pretrain_loss, pretrain_act_loss, train_op
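# Hypothetical sketch of `clip_and_log`, assumed to be the usual numerically safe
# log of a probability: clip into (0, 1] before taking the log so that zero
# probabilities cannot produce -inf. The actual epsilon/bounds may differ.
import tensorflow as tf


def clip_and_log(probs, eps=1e-20):
    return tf.math.log(tf.clip_by_value(probs, eps, 1.0))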
def create_adversarial_loss(self, dis_predictions):
    missing = tf.cast(self.missing, tf.float32)
    # Per-position rewards: log of the discriminator's "real" probability.
    rewards = tf.nn.sigmoid(dis_predictions)
    rewards = clip_and_log(rewards)
    log_probs = self.gen_log_p * missing

    rewards_list = tf.unstack(rewards, axis=1)
    missing_list = tf.unstack(missing, axis=1)

    # Cumulative discounted returns. The true value function V*(s).
    cumulative_rewards = []
    for t in range(self.sequence_length):
        cum_value = tf.zeros(shape=[self.batch_size])
        for s in range(t, self.sequence_length):
            cum_value += missing_list[s] * np.power(self.reward_gamma, (s - t)) * rewards_list[s]
        cumulative_rewards.append(cum_value)
    cumulative_rewards = tf.stack(cumulative_rewards, axis=1)

    self.critic_loss, self.critic_updates = self.critic.create_critic_loss(
        cumulative_rewards, self.missing)

    # Unstack tensors into per-timestep lists.
    baselines = tf.unstack(self.critic.estimated_values, axis=1)
    log_probs_list = tf.unstack(log_probs, axis=1)

    g_loss = 0.
    for t in range(self.sequence_length):
        log_probability = log_probs_list[t]
        cum_advantage = tf.zeros(shape=[self.batch_size])
        for s in range(t, self.sequence_length):
            cum_advantage += missing_list[s] * np.power(self.reward_gamma, (s - t)) * rewards_list[s]
        cum_advantage -= baselines[t]
        # Clip advantages.
        cum_advantage = tf.clip_by_value(cum_advantage, -self.clip_val, self.clip_val)
        g_loss += tf.multiply(missing_list[t] * log_probability,
                              tf.stop_gradient(cum_advantage))

    # Minimize -g_loss, i.e. maximize the REINFORCE-with-baseline surrogate.
    train_op = get_train_op(self.learning_rate, -g_loss, self.g_params, self.clip_val)
    return g_loss, train_op
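# Minimal NumPy check (toy values) that the O(T^2) double loop above equals the
# usual backward recursion for masked, discounted returns: R_t = m_t * r_t + gamma * R_{t+1}.
import numpy as np


def discounted_returns(rewards, mask, gamma):
    returns = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = mask[t] * rewards[t] + gamma * running
        returns[t] = running
    return returns


rewards = np.array([0.1, -0.2, 0.3, 0.05])
mask    = np.array([1.0, 0.0, 1.0, 1.0])
gamma   = 0.95
double_loop = np.array([
    sum(mask[s] * gamma ** (s - t) * rewards[s] for s in range(t, 4))
    for t in range(4)])
assert np.allclose(double_loop, discounted_returns(rewards, mask, gamma))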
def create_loss(self, fake_predictions, real_predictions, missing):
    real_labels = tf.ones_like(real_predictions)
    # Generated (missing) positions are labeled fake; copied-through real tokens
    # keep the "real" label. Both losses are evaluated only at missing positions.
    fake_labels = 1 - missing
    loss_real = tf.compat.v1.losses.sigmoid_cross_entropy(
        real_labels, real_predictions, weights=missing)
    loss_fake = tf.compat.v1.losses.sigmoid_cross_entropy(
        fake_labels, fake_predictions, weights=missing)
    loss = (loss_fake + loss_real) / 2.
    dis_vars = [
        param for param in tf.compat.v1.trainable_variables()
        if 'discriminator' in param.name
    ]
    train_op = get_train_op(self.generator.learning_rate, loss, dis_vars)
    return loss_fake, loss_real, train_op
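# Toy NumPy check of the weighting above (hypothetical logits and mask). With
# weights = missing, only re-generated positions contribute, and there
# fake_labels = 1 - missing = 0, so generated fills are pushed toward "fake".
# The stable formula below matches tf.nn.sigmoid_cross_entropy_with_logits.
import numpy as np


def sigmoid_xent(labels, logits):
    return np.maximum(logits, 0) - logits * labels + np.log1p(np.exp(-np.abs(logits)))


missing = np.array([[0., 1., 1., 0.]])
fake_logits = np.array([[2.0, -1.0, 0.5, -0.3]])
fake_labels = 1 - missing
loss_fake = (sigmoid_xent(fake_labels, fake_logits) * missing).sum() / np.count_nonzero(missing)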
def create_loss(self, fake_predictions, real_predictions, fake_sequence, real_sequence,
                fake_missing, real_missing, fake_weights=1.0, real_weights=1.0):
    real_labels = tf.ones_like(real_predictions)
    fake_labels = tf.zeros_like(fake_predictions)

    # Tokens that were not masked are compared directly; masked-out positions are
    # zeroed here and mapped to -1 below so they can never count as a match.
    real_presented = tf.cast(real_sequence, tf.float32) * (1 - real_missing)
    fake_presented = tf.cast(fake_sequence, tf.float32) * (1 - fake_missing)
    # all presented tokens > 0 but missing = -1
    _fake_presented = tf.where(tf.math.equal(fake_presented, 0),
                               tf.ones_like(fake_presented) * -1,
                               fake_presented)
    # Where the fake sequence reproduces the real token, relabel that position as real.
    fake_labels = tf.where(tf.math.equal(real_presented, _fake_presented),
                           real_labels, fake_labels)

    loss_real = tf.compat.v1.losses.sigmoid_cross_entropy(
        real_labels, real_predictions, weights=real_weights)
    loss_fake = tf.compat.v1.losses.sigmoid_cross_entropy(
        fake_labels, fake_predictions, weights=fake_weights)
    loss = (loss_fake + loss_real) / 2.
    dis_vars = [
        param for param in tf.compat.v1.trainable_variables()
        if 'discriminator' in param.name
    ]
    train_op = get_train_op(self.generator.learning_rate, loss, dis_vars)
    return loss_fake, loss_real, train_op
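# Toy NumPy check of the match-based relabeling above (hypothetical token ids,
# with a single shared mask for brevity): positions where the fake sequence
# reproduces the real token are relabeled 1, while masked-out zeros are mapped
# to -1 so they can never produce a spurious match.
import numpy as np

real_seq = np.array([[5., 7., 9., 3.]])
fake_seq = np.array([[5., 2., 9., 8.]])
missing  = np.array([[0., 1., 0., 1.]])   # 1 = masked / re-generated
real_presented = real_seq * (1 - missing)
fake_presented = fake_seq * (1 - missing)
fake_presented = np.where(fake_presented == 0, -1.0, fake_presented)
fake_labels = np.where(real_presented == fake_presented, 1.0, 0.0)
print(fake_labels)   # [[1. 0. 1. 0.]] -> copied-through tokens are labeled "real"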
def create_adversarial_loss(self, dis_predictions):
    missing = get_mask(self.gen_act)
    # mask_sent = tf.cast(get_mask_for_pad(self.gen_x, self.gen_act), tf.float32)
    mask_sent = tf.ones_like(missing, tf.float32)

    # Per-position rewards: log of the discriminator's "real" probability.
    rewards = tf.nn.sigmoid(dis_predictions)
    rewards = clip_and_log(rewards)

    missing = tf.cast(missing, tf.float32)
    present = (1 - missing)

    mask_act_log_probs = clip_and_log(self.gen_mask_act_p)
    util_act_log_probs = clip_and_log(1 - self.gen_mask_act_p)
    gen_log_p = self.gen_log_p
    gen_act_log_p = self.gen_act_log_p - tf.stop_gradient(util_act_log_probs)
    probs4Tok = tf.exp(gen_log_p)
    probs4Man = tf.exp(gen_act_log_p)
    log_probs = gen_log_p * missing
    act_log_probs = gen_act_log_p * present

    rewards_list = tf.unstack(rewards, axis=1)
    missing_list = tf.unstack(missing, axis=1)
    present_list = tf.unstack(present, axis=1)
    mask_sent_list = tf.unstack(mask_sent, axis=1)

    # Cumulative discounted returns. The true value function V*(s).
    cumulative_rewards = []
    for t in range(self.sequence_length):
        cum_value = tf.zeros(shape=[self.batch_size])
        for s in range(t, self.sequence_length):
            cum_value += mask_sent_list[s] * np.power(self.reward_gamma, (s - t)) * rewards_list[s]
        cumulative_rewards.append(cum_value)
    cumulative_rewards = tf.stack(cumulative_rewards, axis=1)

    self.critic_loss, self.critic_updates = self.critic.create_critic_loss(
        cumulative_rewards, missing=mask_sent)

    # Unstack tensors into per-timestep lists.
    baselines = tf.unstack(self.critic.estimated_values, axis=1)
    probs4Tok_list = tf.unstack(probs4Tok, axis=1)
    probs4Man_list = tf.unstack(probs4Man, axis=1)
    log_probs_list = tf.unstack(log_probs, axis=1)
    act_log_probs_list = tf.unstack(act_log_probs, axis=1)
    mask_act_log_probs_list = tf.unstack(mask_act_log_probs, axis=1)
    util_act_log_probs_list = tf.unstack(util_act_log_probs, axis=1)

    g_loss = 0.
    for t in range(self.sequence_length):
        prob = probs4Tok_list[t]
        act_prob = probs4Man_list[t]
        log_probability = log_probs_list[t]
        act_log_probability = act_log_probs_list[t]
        mask_log_prob = mask_act_log_probs_list[t]
        util_log_prob = util_act_log_probs_list[t]
        mask_prob = tf.exp(mask_log_prob)
        util_prob = tf.exp(util_log_prob)

        cum_advantage = tf.zeros(shape=[self.batch_size])
        for s in range(t, self.sequence_length):
            cum_advantage += mask_sent_list[s] * np.power(self.reward_gamma, (s - t)) * rewards_list[s]
        cum_advantage -= baselines[t]
        # Clip advantages.
        cum_advantage = tf.clip_by_value(cum_advantage, -self.clip_val, self.clip_val)

        # Credit missing positions through the token policy and present positions
        # through the action policy, each weighted by a stop-gradient of the
        # other head's probability.
        g_loss += tf.multiply(missing_list[t] * log_probability * tf.stop_gradient(mask_prob),
                              tf.stop_gradient(cum_advantage))
        g_loss += tf.multiply(present_list[t] * act_log_probability * tf.stop_gradient(util_prob),
                              tf.stop_gradient(cum_advantage))
        g_loss += tf.multiply(missing_list[t] * mask_log_prob * tf.stop_gradient(prob),
                              tf.stop_gradient(cum_advantage))
        g_loss += tf.multiply(present_list[t] * util_log_prob * tf.stop_gradient(act_prob),
                              tf.stop_gradient(cum_advantage))

    # Minimize -g_loss, i.e. maximize the policy-gradient surrogate.
    train_op = get_train_op(self.learning_rate, -g_loss, self.g_params, self.clip_val)
    return g_loss, train_op
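# Single-timestep sketch (toy tensors, hypothetical values) of the surrogate
# pattern above: the advantage and the other head's probability enter through
# tf.stop_gradient, so gradients flow only into the log-probability term being
# credited, and only where the mask selects that position.
import tensorflow as tf

tf.compat.v1.disable_eager_execution()
log_p = tf.compat.v1.get_variable("toy_log_p", initializer=tf.constant([-1.2, -0.4]))
advantage = tf.constant([0.5, -0.3])
mask_prob = tf.constant([0.9, 0.7])       # other head's probability, held constant
mask = tf.constant([1.0, 0.0])            # 1 = position credited by this term
surrogate = tf.reduce_sum(mask * log_p * tf.stop_gradient(mask_prob) *
                          tf.stop_gradient(advantage))
grads = tf.gradients(surrogate, [log_p])  # nonzero only where mask == 1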