def train(self, training_num):
    total_a_loss = 0
    total_c_loss = 0

    for i in range(training_num):
        self.current_step += 1
        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        value_next = tf.stop_gradient(
            self.target_critic(ns, self.target_actor(ns)))
        target_value = r + (1 - d) * self.gamma * value_next

        with tf.GradientTape(persistent=True) as tape:
            critic_loss = 0.5 * tf.reduce_mean(
                tf.square(target_value - self.critic(s, a)))
            actor_loss = -tf.reduce_mean(self.critic(s, self.actor(s)))

        #critic training
        critic_grad = tape.gradient(critic_loss,
                                    self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        #actor training
        actor_grad = tape.gradient(actor_loss,
                                   self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

        soft_update(self.actor, self.target_actor, self.tau)
        soft_update(self.critic, self.target_critic, self.tau)
        del tape

        total_a_loss += actor_loss.numpy()
        total_c_loss += critic_loss.numpy()

    return [['Loss/Actor', total_a_loss], ['Loss/Critic', total_c_loss]]
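
#Every train method in this file calls soft_update for its target
#networks, but the helper is not defined here. A minimal sketch of the
#usual Polyak average, theta' <- tau * theta + (1 - tau) * theta',
#assuming both arguments are Keras models with matching variable order;
#the project's real helper may differ.
import tensorflow as tf


def soft_update(network, target_network, tau):
    for v, target_v in zip(network.trainable_variables,
                           target_network.trainable_variables):
        #blend each online weight into the corresponding target weight
        target_v.assign(tau * v + (1.0 - tau) * target_v)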
def train(self, training_num):
    total_a_loss = 0
    total_c1_loss, total_c2_loss = 0, 0

    for i in range(training_num):
        self.current_step += 1
        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        #target policy smoothing: add clipped noise to the target action
        target_action = tf.clip_by_value(
            self.target_actor(ns) + tf.clip_by_value(
                tf.random.normal(shape=self.target_actor(ns).shape,
                                 mean=0,
                                 stddev=self.target_noise),
                -self.noise_clip, self.noise_clip), -1, 1)

        #clipped double-Q target
        target_value = tf.stop_gradient(
            r + self.gamma * (1 - d) *
            tf.minimum(self.target_critic1(ns, target_action),
                       self.target_critic2(ns, target_action)))

        with tf.GradientTape(persistent=True) as tape:
            critic1_loss = 0.5 * tf.reduce_mean(
                tf.square(target_value - self.critic1(s, a)))
            critic2_loss = 0.5 * tf.reduce_mean(
                tf.square(target_value - self.critic2(s, a)))

        critic1_grad = tape.gradient(critic1_loss,
                                     self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(critic1_grad, self.critic1.trainable_variables))

        critic2_grad = tape.gradient(critic2_loss,
                                     self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(critic2_grad, self.critic2.trainable_variables))
        del tape

        #delayed policy and target updates; actor_loss and tape2 only
        #exist on these steps, so they are accumulated and deleted here
        if self.current_step % self.policy_delay == 0:
            with tf.GradientTape() as tape2:
                actor_loss = -tf.reduce_mean(self.critic1(s, self.actor(s)))

            actor_grad = tape2.gradient(actor_loss,
                                        self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))
            del tape2

            soft_update(self.actor, self.target_actor, self.tau)
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)

            total_a_loss += actor_loss.numpy()

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()

    return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss],
            ['Loss/Critic2', total_c2_loss]]
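
#For reference, the TD3 target above implements (Fujimoto et al., 2018):
#  a' = clip( mu_target(s') + clip(eps, -c, c), -1, 1 ),  eps ~ N(0, sigma)
#  y  = r + gamma * (1 - d) * min( Q1_target(s', a'), Q2_target(s', a') )
#The clipped noise smooths the target policy; the min over the twin target
#critics counteracts Q-value overestimation.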
def train(self, training_num):
    total_a_loss = 0
    total_c1_loss, total_c2_loss = 0, 0
    total_alpha_loss = 0

    for i in range(training_num):
        self.current_step += 1
        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        ns_action, ns_logpi = self.actor(ns)
        target_min_aq = tf.minimum(self.target_critic1(ns, ns_action),
                                   self.target_critic2(ns, ns_action))
        target_q = tf.stop_gradient(
            r + self.gamma * (1 - d) *
            (target_min_aq - self.alpha.numpy() * ns_logpi))

        #critic training
        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = 0.5 * tf.reduce_mean(
                tf.square(self.critic1(s, a) - target_q))
            critic2_loss = 0.5 * tf.reduce_mean(
                tf.square(self.critic2(s, a) - target_q))

        critic1_gradients = tape1.gradient(critic1_loss,
                                           self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(critic1_gradients, self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(critic2_loss,
                                           self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(critic2_gradients, self.critic2.trainable_variables))
        del tape1

        #actor training
        with tf.GradientTape() as tape2:
            s_action, s_logpi = self.actor(s)
            min_aq_rep = tf.minimum(self.critic1(s, s_action),
                                    self.critic2(s, s_action))
            actor_loss = 0.5 * tf.reduce_mean(
                self.alpha.numpy() * s_logpi - min_aq_rep)

        actor_gradients = tape2.gradient(actor_loss,
                                         self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_gradients, self.actor.trainable_variables))
        del tape2

        #alpha(temperature) training
        if self.train_alpha:
            with tf.GradientTape() as tape3:
                _, s_logpi = self.actor(s)
                alpha_loss = -(tf.exp(self.log_alpha) * tf.stop_gradient(
                    s_logpi + self.target_entropy))
                alpha_loss = tf.nn.compute_average_loss(
                    alpha_loss)  #from softlearning package

            alpha_grad = tape3.gradient(alpha_loss, [self.log_alpha])
            self.alpha_optimizer.apply_gradients(
                zip(alpha_grad, [self.log_alpha]))
            del tape3

        if self.current_step % self.critic_update == 0:
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)

        total_a_loss += actor_loss.numpy()
        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()
        if self.train_alpha:
            total_alpha_loss += alpha_loss.numpy()

    return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss],
            ['Loss/Critic2', total_c2_loss],
            ['Loss/alpha', total_alpha_loss],
            ['Alpha', tf.exp(self.log_alpha).numpy()]]
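
#The temperature objective above follows the standard SAC recipe,
#  J(alpha) = E[ -alpha * (log pi(a|s) + H_bar) ],
#where H_bar is `self.target_entropy`. Its value is not set in this file;
#a common heuristic (an assumption here, not shown in the source) is the
#negative action dimensionality:
#
#  target_entropy = -np.prod(env.action_space.shape)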
def train(self, local_step):
    self.current_step += 1

    total_a_loss = 0
    total_c1_loss, total_c2_loss = 0, 0
    total_alpha_loss = 0
    loss_list = []

    #sample an augmented batch (augmentations applied inside rad_sample)
    s, a, r, ns, d = self.buffer.rad_sample(self.batch_size, self.aug_funcs,
                                            self.pre_image_size)

    ns_action, ns_logpi = self.actor(self.encoder(ns))
    target_min_aq = tf.minimum(
        self.target_critic1(self.target_encoder(ns), ns_action),
        self.target_critic2(self.target_encoder(ns), ns_action))
    target_q = tf.stop_gradient(
        r + self.gamma * (1 - d) *
        (target_min_aq - self.alpha.numpy() * ns_logpi))

    #critic update (encoder trained jointly with the critics)
    with tf.GradientTape(persistent=True) as tape1:
        critic1_loss = tf.reduce_mean(
            tf.square(self.critic1(self.encoder(s), a) - target_q))
        critic2_loss = tf.reduce_mean(
            tf.square(self.critic2(self.encoder(s), a) - target_q))

    critic1_gradients = tape1.gradient(
        critic1_loss,
        self.encoder.trainable_variables + self.critic1.trainable_variables)
    self.critic1_optimizer.apply_gradients(
        zip(
            critic1_gradients, self.encoder.trainable_variables +
            self.critic1.trainable_variables))

    critic2_gradients = tape1.gradient(
        critic2_loss,
        self.encoder.trainable_variables + self.critic2.trainable_variables)
    self.critic2_optimizer.apply_gradients(
        zip(
            critic2_gradients, self.encoder.trainable_variables +
            self.critic2.trainable_variables))
    del tape1

    #actor update (encoder gradients blocked)
    with tf.GradientTape() as tape2:
        s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s)))
        min_aq_rep = tf.minimum(
            self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
            self.critic2(tf.stop_gradient(self.encoder(s)), s_action))
        actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi -
                                    min_aq_rep)

    actor_gradients = tape2.gradient(actor_loss,
                                     self.actor.trainable_variables)
    self.actor_optimizer.apply_gradients(
        zip(actor_gradients, self.actor.trainable_variables))
    del tape2

    #alpha(temperature) update
    if self.train_alpha:
        with tf.GradientTape() as tape3:
            _, s_logpi = self.actor(self.encoder(s))
            alpha_loss = -tf.exp(self.log_alpha) * tf.stop_gradient(
                s_logpi + self.target_entropy)
            alpha_loss = tf.nn.compute_average_loss(alpha_loss)

        log_alpha_gradients = tape3.gradient(alpha_loss, [self.log_alpha])
        self.log_alpha_optimizer.apply_gradients(
            zip(log_alpha_gradients, [self.log_alpha]))
        del tape3

    if self.current_step % self.critic_update == 0:
        soft_update(self.critic1, self.target_critic1, self.tau)
        soft_update(self.critic2, self.target_critic2, self.tau)
        soft_update(self.encoder, self.target_encoder, self.encoder_tau)

    total_c1_loss += critic1_loss.numpy()
    total_c2_loss += critic2_loss.numpy()
    loss_list.append(['Loss/Critic1', total_c1_loss])
    loss_list.append(['Loss/Critic2', total_c2_loss])

    total_a_loss += actor_loss.numpy()
    loss_list.append(['Loss/Actor', total_a_loss])

    if self.train_alpha:
        total_alpha_loss += alpha_loss.numpy()
        loss_list.append(['Loss/Alpha', total_alpha_loss])

    loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

    return loss_list
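
#`rad_sample` is assumed to apply pixel-space augmentations (RAD, Laskin
#et al., 2020) before returning the batch. A minimal sketch of the usual
#random-crop augmentation, cropping a batch of HWC images from
#`pre_image_size` down to `output_size`; the buffer's real implementation
#may differ.
import numpy as np


def random_crop(imgs, output_size):
    #imgs: (batch, height, width, channels)
    n, h, w, _ = imgs.shape
    tops = np.random.randint(0, h - output_size + 1, n)
    lefts = np.random.randint(0, w - output_size + 1, n)
    return np.stack([
        img[t:t + output_size, l:l + output_size]
        for img, t, l in zip(imgs, tops, lefts)
    ])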
def train(self, local_step):
    self.current_step += 1

    total_a_loss = 0
    total_c1_loss, total_c2_loss = 0, 0
    total_cpc_loss = 0
    loss_list = []

    s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample(
        self.batch_size, self.image_size)
    obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"]

    #contrastive (CPC) update: CURL head and encoder
    with tf.GradientTape(persistent=True) as tape:
        z_a = self.encoder(obs_anchor)
        z_pos = tf.stop_gradient(self.target_encoder(obs_pos))
        logits = self.curl.compute_logits(z_a, z_pos)
        labels = tf.range(logits.shape[0], dtype='int64')
        cpc_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits))

    cpc_gradients = tape.gradient(cpc_loss, self.curl.trainable_variables)
    self.cpc_optimizer.apply_gradients(
        zip(cpc_gradients, self.curl.trainable_variables))

    encoder_gradients = tape.gradient(cpc_loss,
                                      self.encoder.trainable_variables)
    self.encoder_optimizer.apply_gradients(
        zip(encoder_gradients, self.encoder.trainable_variables))

    total_cpc_loss += cpc_loss.numpy()
    loss_list.append(['Loss/CPC', total_cpc_loss])
    del tape

    #TD3 update runs every second step; critic losses only exist then
    if self.current_step % 2 == 0:
        target_action = tf.clip_by_value(
            self.target_actor(self.target_encoder(ns)) + tf.clip_by_value(
                tf.random.normal(shape=self.target_actor(
                    self.target_encoder(ns)).shape,
                                 mean=0,
                                 stddev=self.target_noise),
                -self.noise_clip, self.noise_clip), -1, 1)

        target_value = tf.stop_gradient(
            r + self.gamma * (1 - d) * tf.minimum(
                self.target_critic1(self.target_encoder(ns), target_action),
                self.target_critic2(self.target_encoder(ns),
                                    target_action)))

        with tf.GradientTape(persistent=True) as tape:
            critic1_loss = 0.5 * tf.reduce_mean(
                tf.square(target_value - self.critic1(self.encoder(s), a)))
            critic2_loss = 0.5 * tf.reduce_mean(
                tf.square(target_value - self.critic2(self.encoder(s), a)))

        critic1_grad = tape.gradient(
            critic1_loss, self.encoder.trainable_variables +
            self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(
                critic1_grad, self.encoder.trainable_variables +
                self.critic1.trainable_variables))

        critic2_grad = tape.gradient(
            critic2_loss, self.encoder.trainable_variables +
            self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(
                critic2_grad, self.encoder.trainable_variables +
                self.critic2.trainable_variables))
        del tape

        #delayed policy and target updates
        if self.current_step % (2 * self.policy_delay) == 0:
            with tf.GradientTape() as tape2:
                actor_loss = -tf.reduce_mean(
                    self.critic1(
                        tf.stop_gradient(self.encoder(s)),
                        self.actor(tf.stop_gradient(self.encoder(s)))))

            actor_grad = tape2.gradient(actor_loss,
                                        self.actor.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(actor_grad, self.actor.trainable_variables))

            soft_update(self.actor, self.target_actor, self.tau)
            soft_update(self.critic1, self.target_critic1, self.tau)
            soft_update(self.critic2, self.target_critic2, self.tau)
            soft_update(self.encoder, self.target_encoder,
                        self.encoder_tau)

            total_a_loss += actor_loss.numpy()
            loss_list.append(['Loss/Actor', total_a_loss])

        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()
        loss_list.append(['Loss/Critic1', total_c1_loss])
        loss_list.append(['Loss/Critic2', total_c2_loss])

    return loss_list
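
#The CPC block above is the InfoNCE objective from CURL: for batch element
#i, the anchor embedding z_a[i] should score highest against its own
#positive z_pos[i] (a different augmentation of the same frame), so the
#"class label" for row i of the (batch x batch) logits matrix is simply i:
#  L = -E_i[ log( exp(sim(z_a_i, z_pos_i)) / sum_j exp(sim(z_a_i, z_pos_j)) ) ]
#which is exactly sparse softmax cross-entropy with labels tf.range(batch).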
def train(self, local_step):
    self.current_step += 1

    total_a_loss = 0
    total_c1_loss, total_c2_loss = 0, 0
    total_cpc_loss = 0
    total_alpha_loss = 0
    loss_list = []

    s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample(
        self.batch_size, self.image_size)
    obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"]

    ns_action, ns_logpi = self.actor(self.encoder(ns))
    target_min_aq = tf.minimum(
        self.target_critic1(self.target_encoder(ns), ns_action),
        self.target_critic2(self.target_encoder(ns), ns_action))
    target_q = tf.stop_gradient(
        r + self.gamma * (1 - d) *
        (target_min_aq - self.alpha.numpy() * ns_logpi))

    #critic update (encoder trained jointly with the critics)
    with tf.GradientTape(persistent=True) as tape1:
        critic1_loss = tf.reduce_mean(
            tf.square(self.critic1(self.encoder(s), a) - target_q))
        critic2_loss = tf.reduce_mean(
            tf.square(self.critic2(self.encoder(s), a) - target_q))

    critic1_gradients = tape1.gradient(
        critic1_loss,
        self.encoder.trainable_variables + self.critic1.trainable_variables)
    self.critic1_optimizer.apply_gradients(
        zip(
            critic1_gradients, self.encoder.trainable_variables +
            self.critic1.trainable_variables))

    critic2_gradients = tape1.gradient(
        critic2_loss,
        self.encoder.trainable_variables + self.critic2.trainable_variables)
    self.critic2_optimizer.apply_gradients(
        zip(
            critic2_gradients, self.encoder.trainable_variables +
            self.critic2.trainable_variables))
    del tape1

    #actor update (encoder gradients blocked)
    with tf.GradientTape() as tape2:
        s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s)))
        min_aq_rep = tf.minimum(
            self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
            self.critic2(tf.stop_gradient(self.encoder(s)), s_action))
        actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi -
                                    min_aq_rep)

    actor_gradients = tape2.gradient(actor_loss,
                                     self.actor.trainable_variables)
    self.actor_optimizer.apply_gradients(
        zip(actor_gradients, self.actor.trainable_variables))
    del tape2

    #alpha(temperature) update
    if self.train_alpha:
        with tf.GradientTape() as tape3:
            _, s_logpi = self.actor(self.encoder(s))
            alpha_loss = -(tf.exp(self.log_alpha) *
                           tf.stop_gradient(s_logpi + self.target_entropy))
            alpha_loss = tf.nn.compute_average_loss(alpha_loss)

        log_alpha_gradients = tape3.gradient(alpha_loss, [self.log_alpha])
        self.log_alpha_optimizer.apply_gradients(
            zip(log_alpha_gradients, [self.log_alpha]))
        del tape3

    if self.current_step % self.critic_update == 0:
        soft_update(self.critic1, self.target_critic1, self.tau)
        soft_update(self.critic2, self.target_critic2, self.tau)
        soft_update(self.encoder, self.target_encoder, self.encoder_tau)

    #contrastive (CPC) update: CURL head and encoder
    with tf.GradientTape(persistent=True) as tape4:
        z_a = self.encoder(obs_anchor)
        z_pos = tf.stop_gradient(self.target_encoder(obs_pos))
        logits = self.curl.compute_logits(z_a, z_pos)
        labels = tf.range(logits.shape[0], dtype='int64')
        cpc_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits))

    cpc_gradients = tape4.gradient(cpc_loss, self.curl.trainable_variables)
    self.cpc_optimizer.apply_gradients(
        zip(cpc_gradients, self.curl.trainable_variables))

    encoder_gradients = tape4.gradient(cpc_loss,
                                       self.encoder.trainable_variables)
    self.encoder_optimizer.apply_gradients(
        zip(encoder_gradients, self.encoder.trainable_variables))
    del tape4

    total_c1_loss += critic1_loss.numpy()
    total_c2_loss += critic2_loss.numpy()
    loss_list.append(['Loss/Critic1', total_c1_loss])
    loss_list.append(['Loss/Critic2', total_c2_loss])

    total_a_loss += actor_loss.numpy()
    loss_list.append(['Loss/Actor', total_a_loss])

    total_cpc_loss += cpc_loss.numpy()
    loss_list.append(['Loss/CPC', total_cpc_loss])

    if self.train_alpha:
        total_alpha_loss += alpha_loss.numpy()
        loss_list.append(['Loss/Alpha', total_alpha_loss])

    loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

    return loss_list
def train(self, training_step):
    self.current_step += 1

    total_a_loss = 0
    total_c1_loss, total_c2_loss = 0, 0
    total_v_loss = 0
    total_cpc_loss = 0
    loss_list = []

    s, a, r, ns, d, cpc_kwargs = self.buffer.cpc_sample(
        self.batch_size, self.image_size)
    obs_anchor, obs_pos = cpc_kwargs["obs_anchor"], cpc_kwargs["obs_pos"]

    #v network update
    s_action, s_logpi = self.actor(self.encoder(s))
    min_aq = tf.minimum(self.critic1(self.encoder(s), s_action),
                        self.critic2(self.encoder(s), s_action))
    target_v = tf.stop_gradient(min_aq - self.alpha * s_logpi)

    with tf.GradientTape() as tape1:
        v_loss = 0.5 * tf.reduce_mean(
            tf.square(
                self.v_network(tf.stop_gradient(self.encoder(s))) -
                target_v))

    v_gradients = tape1.gradient(v_loss, self.v_network.trainable_variables)
    self.v_network_optimizer.apply_gradients(
        zip(v_gradients, self.v_network.trainable_variables))
    del tape1

    #critic update (encoder trained jointly with the critics)
    target_q = tf.stop_gradient(
        r + self.gamma * (1 - d) *
        self.target_v_network(self.target_encoder(ns)))

    with tf.GradientTape(persistent=True) as tape2:
        critic1_loss = 0.5 * tf.reduce_mean(
            tf.square(self.critic1(self.encoder(s), a) - target_q))
        critic2_loss = 0.5 * tf.reduce_mean(
            tf.square(self.critic2(self.encoder(s), a) - target_q))

    critic1_gradients = tape2.gradient(
        critic1_loss,
        self.encoder.trainable_variables + self.critic1.trainable_variables)
    critic2_gradients = tape2.gradient(
        critic2_loss,
        self.encoder.trainable_variables + self.critic2.trainable_variables)

    self.critic1_optimizer.apply_gradients(
        zip(
            critic1_gradients, self.encoder.trainable_variables +
            self.critic1.trainable_variables))
    self.critic2_optimizer.apply_gradients(
        zip(
            critic2_gradients, self.encoder.trainable_variables +
            self.critic2.trainable_variables))
    del tape2

    #actor update (encoder gradients blocked)
    with tf.GradientTape() as tape3:
        s_action, s_logpi = self.actor(tf.stop_gradient(self.encoder(s)))
        min_aq_rep = tf.minimum(
            self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
            self.critic2(tf.stop_gradient(self.encoder(s)), s_action))
        actor_loss = tf.reduce_mean(self.alpha * s_logpi - min_aq_rep)

    actor_gradients = tape3.gradient(actor_loss,
                                     self.actor.trainable_variables)
    self.actor_optimizer.apply_gradients(
        zip(actor_gradients, self.actor.trainable_variables))

    soft_update(self.v_network, self.target_v_network, self.tau)

    #contrastive (CPC) update: CURL head and encoder
    with tf.GradientTape(persistent=True) as tape4:
        z_a = self.encoder(obs_anchor)
        z_pos = tf.stop_gradient(self.target_encoder(obs_pos))
        logits = self.curl.compute_logits(z_a, z_pos)
        labels = tf.range(logits.shape[0], dtype='int64')
        cpc_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels, logits))

    cpc_gradients = tape4.gradient(cpc_loss, self.curl.trainable_variables)
    self.cpc_optimizer.apply_gradients(
        zip(cpc_gradients, self.curl.trainable_variables))

    encoder_gradients = tape4.gradient(cpc_loss,
                                       self.encoder.trainable_variables)
    self.encoder_optimizer.apply_gradients(
        zip(encoder_gradients, self.encoder.trainable_variables))

    soft_update(self.encoder, self.target_encoder, self.encoder_tau)
    del tape4

    total_v_loss += v_loss.numpy()
    loss_list.append(['Loss/V', total_v_loss])

    total_c1_loss += critic1_loss.numpy()
    total_c2_loss += critic2_loss.numpy()
    loss_list.append(['Loss/Critic1', total_c1_loss])
    loss_list.append(['Loss/Critic2', total_c2_loss])

    total_a_loss += actor_loss.numpy()
    loss_list.append(['Loss/Actor', total_a_loss])

    total_cpc_loss += cpc_loss.numpy()
    loss_list.append(['Loss/CPC', total_cpc_loss])

    return loss_list
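
#`curl.compute_logits`, used by the CURL updates above, is not shown in
#this file. In the CURL paper the similarity is a learned bilinear form
#with a row-max subtraction for numerical stability; a minimal sketch,
#assuming `self.W` is a trainable (latent_dim, latent_dim) variable on the
#CURL module:
import tensorflow as tf


def compute_logits(self, z_a, z_pos):
    #Wz: (latent_dim, batch); logits[i, j] = z_a[i]^T W z_pos[j]
    Wz = tf.matmul(self.W, z_pos, transpose_b=True)
    logits = tf.matmul(z_a, Wz)
    #subtract the row max so the softmax is numerically stable
    logits = logits - tf.reduce_max(logits, axis=1, keepdims=True)
    return logits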
def train(self, local_step):
    #update order: critic -> transition -> reward -> encoder -> actor
    set1, set2 = self.buffer.dbc_sample(self.batch_size)
    s, a, r, ns, d = set1
    s2, a2, r2, ns2, d2 = set2

    target_action = tf.clip_by_value(
        self.target_actor(self.target_encoder(ns)) + tf.clip_by_value(
            tf.random.normal(shape=self.target_actor(
                self.target_encoder(ns)).shape,
                             mean=0,
                             stddev=self.target_noise), -self.noise_clip,
            self.noise_clip), -1, 1)

    target_value = tf.stop_gradient(
        r + self.gamma * (1 - d) * tf.minimum(
            self.target_critic1(self.target_encoder(ns), target_action),
            self.target_critic2(self.target_encoder(ns), target_action)))

    #train critics (encoder trained jointly)
    with tf.GradientTape(persistent=True) as tape1:
        critic1_loss = 0.5 * tf.reduce_mean(
            tf.square(target_value - self.critic1(self.encoder(s), a)))
        critic2_loss = 0.5 * tf.reduce_mean(
            tf.square(target_value - self.critic2(self.encoder(s), a)))

    critic1_grad = tape1.gradient(
        critic1_loss,
        self.encoder.trainable_variables + self.critic1.trainable_variables)
    self.critic1_optimizer.apply_gradients(
        zip(
            critic1_grad, self.encoder.trainable_variables +
            self.critic1.trainable_variables))

    critic2_grad = tape1.gradient(
        critic2_loss,
        self.encoder.trainable_variables + self.critic2.trainable_variables)
    self.critic2_optimizer.apply_gradients(
        zip(
            critic2_grad, self.encoder.trainable_variables +
            self.critic2.trainable_variables))
    del tape1

    #train dynamics (encoder trained jointly)
    with tf.GradientTape() as tape2:
        feature = self.encoder(s)
        next_feature = self.encoder(ns)
        mu, sigma = self.dynamics_model(tf.concat([feature, a], axis=1))
        if sigma[0][0].numpy() == 0:
            #deterministic model: fall back to unit variance
            sigma = tf.ones_like(mu)
        diff = (mu - tf.stop_gradient(next_feature)) / sigma
        dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) +
                                       tf.math.log(sigma))

    dynamics_gradients = tape2.gradient(
        dynamics_loss, self.encoder.trainable_variables +
        self.dynamics_model.trainable_variables)
    self.dynamics_optimizer.apply_gradients(
        zip(
            dynamics_gradients, self.encoder.trainable_variables +
            self.dynamics_model.trainable_variables))
    del tape2

    #train reward model (encoder trained jointly)
    with tf.GradientTape() as tape3:
        feature = self.encoder(s)
        sample_dynamics = self.dynamics_model.sample(
            tf.concat([feature, a], axis=1))
        reward_prediction = self.reward_model(sample_dynamics)
        reward_loss = tf.reduce_mean(tf.square(reward_prediction - r))

    reward_gradients = tape3.gradient(
        reward_loss, self.encoder.trainable_variables +
        self.reward_model.trainable_variables)
    self.reward_optimizer.apply_gradients(
        zip(
            reward_gradients, self.encoder.trainable_variables +
            self.reward_model.trainable_variables))
    del tape3

    #train encoder on the bisimulation objective
    with tf.GradientTape() as tape4:
        feature1 = self.encoder(s)
        feature2 = self.encoder(s2)
        mu1, sigma1 = self.dynamics_model(tf.concat([feature1, a], axis=1))
        mu2, sigma2 = self.dynamics_model(tf.concat([feature2, a2],
                                                    axis=1))

        z_dist = tf.abs(feature1 - feature2)
        r_dist = tf.abs(r - r2)
        transition_dist = tf.sqrt(
            tf.square(tf.abs(mu1 - mu2)) +
            tf.square(tf.abs(sigma1 - sigma2)))
        bisimilarity = tf.stop_gradient(
            tf.cast(r_dist, tf.float32) +
            self.gamma * tf.cast(transition_dist, tf.float32))
        encoder_loss = self.bisim_coef * tf.reduce_mean(
            tf.square(z_dist - bisimilarity))

    encoder_gradients = tape4.gradient(encoder_loss,
                                       self.encoder.trainable_variables)
    self.encoder_optimizer.apply_gradients(
        zip(encoder_gradients, self.encoder.trainable_variables))
    del tape4

    #delayed policy and target updates
    if local_step % self.policy_delay == 0:
        with tf.GradientTape() as tape5:
            actor_loss = -tf.reduce_mean(
                self.critic1(
                    tf.stop_gradient(self.encoder(s)),
                    self.actor(tf.stop_gradient(self.encoder(s)))))

        actor_grad = tape5.gradient(actor_loss,
                                    self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))
        del tape5

        soft_update(self.actor, self.target_actor, self.tau)
        soft_update(self.critic1, self.target_critic1, self.tau)
        soft_update(self.critic2, self.target_critic2, self.tau)
        soft_update(self.encoder, self.target_encoder, self.encoder_tau)
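
#`dbc_sample` is assumed to return two independently drawn batches so the
#bisimulation loss can compare random state pairs. A minimal sketch over a
#plain list-of-tuples buffer (`self.storage` is illustrative, not the
#project's actual attribute):
import numpy as np


def dbc_sample(self, batch_size):
    def one_batch():
        ids = np.random.randint(0, len(self.storage), batch_size)
        s, a, r, ns, d = map(np.stack,
                             zip(*[self.storage[i] for i in ids]))
        return s, a, r, ns, d

    return one_batch(), one_batch()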
def train(self, local_step):
    self.current_step += 1

    total_a_loss = 0
    total_c1_loss, total_c2_loss = 0, 0
    total_alpha_loss = 0
    total_encoder_loss = 0
    total_dynamics_loss = 0
    total_reward_loss = 0
    loss_list = []

    s, a, r, ns, d = self.buffer.sample(self.batch_size)

    ns_action, ns_logpi = self.actor(self.encoder(ns))
    target_min_aq = tf.minimum(
        self.target_critic1(self.target_encoder(ns), ns_action),
        self.target_critic2(self.target_encoder(ns), ns_action))
    target_q = tf.stop_gradient(
        r + self.gamma * (1 - d) *
        (target_min_aq - self.alpha.numpy() * ns_logpi))

    #train critics (encoder trained jointly)
    with tf.GradientTape(persistent=True) as tape1:
        critic1_loss = tf.reduce_mean(
            tf.square(self.critic1(self.encoder(s), a) - target_q))
        critic2_loss = tf.reduce_mean(
            tf.square(self.critic2(self.encoder(s), a) - target_q))

    critic1_gradients = tape1.gradient(
        critic1_loss,
        self.encoder.trainable_variables + self.critic1.trainable_variables)
    self.critic1_optimizer.apply_gradients(
        zip(
            critic1_gradients, self.encoder.trainable_variables +
            self.critic1.trainable_variables))

    critic2_gradients = tape1.gradient(
        critic2_loss,
        self.encoder.trainable_variables + self.critic2.trainable_variables)
    self.critic2_optimizer.apply_gradients(
        zip(
            critic2_gradients, self.encoder.trainable_variables +
            self.critic2.trainable_variables))
    del tape1

    #delayed actor update (encoder gradients blocked)
    if self.current_step % self.actor_update == 0:
        with tf.GradientTape() as tape2:
            s_action, s_logpi = self.actor(
                tf.stop_gradient(self.encoder(s)))
            min_aq_rep = tf.minimum(
                self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                self.critic2(tf.stop_gradient(self.encoder(s)), s_action))
            actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi -
                                        min_aq_rep)

        actor_gradients = tape2.gradient(actor_loss,
                                         self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_gradients, self.actor.trainable_variables))
        del tape2

        #alpha(temperature) update
        if self.train_alpha:
            with tf.GradientTape() as tape3:
                _, s_logpi = self.actor(self.encoder(s))
                alpha_loss = -(tf.exp(self.log_alpha) * tf.stop_gradient(
                    s_logpi + self.target_entropy))
                alpha_loss = tf.nn.compute_average_loss(alpha_loss)

            log_alpha_gradients = tape3.gradient(alpha_loss,
                                                 [self.log_alpha])
            self.log_alpha_optimizer.apply_gradients(
                zip(log_alpha_gradients, [self.log_alpha]))
            del tape3

    if self.current_step % self.critic_update == 0:
        soft_update(self.critic1, self.target_critic1, self.tau)
        soft_update(self.critic2, self.target_critic2, self.tau)
        soft_update(self.encoder, self.target_encoder, self.encoder_tau)

    #train encoder on the bisimulation objective
    with tf.GradientTape() as tape4:
        #pair each state with a shuffled partner from the same batch
        new_ids = np.arange(len(s))
        np.random.shuffle(new_ids)
        s2 = tf.gather(s, new_ids)

        feature = self.encoder(s)
        feature2 = self.encoder(s2)

        reward = self.reward_model(tf.stop_gradient(feature))
        reward2 = self.reward_model(tf.stop_gradient(feature2))

        feature_action, _ = self.actor(tf.stop_gradient(feature), True)
        feature2_action, _ = self.actor(tf.stop_gradient(feature2), True)
        mu, sigma = self.dynamics_model(tf.stop_gradient(feature),
                                        feature_action)
        mu2, sigma2 = self.dynamics_model(tf.stop_gradient(feature2),
                                          feature2_action)

        z_dist = tf.reshape(tf.keras.losses.huber(feature, feature2),
                            shape=[-1, 1])
        r_dist = tf.reshape(tf.keras.losses.huber(reward, reward2),
                            shape=[-1, 1])
        transition_dist = tf.sqrt(
            tf.square(mu - mu2) + tf.square(sigma - sigma2))

        bisimilarity = r_dist + self.gamma * transition_dist
        encoder_loss = tf.reduce_mean(tf.square(z_dist - bisimilarity))

    encoder_gradients = tape4.gradient(encoder_loss,
                                       self.encoder.trainable_variables)
    self.encoder_optimizer.apply_gradients(
        zip(encoder_gradients, self.encoder.trainable_variables))

    #train dynamics (encoder trained jointly)
    with tf.GradientTape() as tape5:
        feature = self.encoder(s)
        mu, sigma = self.dynamics_model(feature, a)
        if sigma[0][0].numpy() == 0:
            if not self.dynamics_model.deterministic:
                print("error")
            #deterministic model: fall back to unit variance
            sigma = tf.ones_like(mu)
        next_feature = self.encoder(ns)
        diff = (mu - tf.stop_gradient(next_feature)) / sigma
        dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) +
                                       tf.math.log(sigma))

    dynamics_gradients = tape5.gradient(
        dynamics_loss, self.encoder.trainable_variables +
        self.dynamics_model.trainable_variables)
    self.dynamics_optimizer.apply_gradients(
        zip(
            dynamics_gradients, self.encoder.trainable_variables +
            self.dynamics_model.trainable_variables))

    #train reward model (encoder trained jointly)
    with tf.GradientTape() as tape6:
        feature = self.encoder(s)
        sample_dynamics = self.dynamics_model.sample(feature, a)
        reward_prediction = self.reward_model(sample_dynamics)
        reward_loss = tf.reduce_mean(tf.square(reward_prediction - r))

    reward_gradients = tape6.gradient(
        reward_loss, self.encoder.trainable_variables +
        self.reward_model.trainable_variables)
    self.reward_optimizer.apply_gradients(
        zip(
            reward_gradients, self.encoder.trainable_variables +
            self.reward_model.trainable_variables))

    total_c1_loss += critic1_loss.numpy()
    total_c2_loss += critic2_loss.numpy()
    loss_list.append(['Loss/Critic1', total_c1_loss])
    loss_list.append(['Loss/Critic2', total_c2_loss])

    if self.current_step % self.actor_update == 0:
        total_a_loss += actor_loss.numpy()
        loss_list.append(['Loss/Actor', total_a_loss])

    total_encoder_loss += encoder_loss.numpy()
    loss_list.append(['Loss/Encoder', total_encoder_loss])

    total_dynamics_loss += dynamics_loss.numpy()
    loss_list.append(['Loss/Dynamics', total_dynamics_loss])

    total_reward_loss += reward_loss.numpy()
    loss_list.append(['Loss/Reward', total_reward_loss])

    if self.current_step % self.actor_update == 0 and self.train_alpha:
        total_alpha_loss += alpha_loss.numpy()
        loss_list.append(['Loss/Alpha', total_alpha_loss])

    loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

    return loss_list
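
#The dynamics loss above is the Gaussian negative log-likelihood of the
#next latent state, up to constants: for a diagonal Gaussian N(mu, sigma^2),
#  -log p(z') = 0.5 * ((z' - mu) / sigma)^2 + log(sigma) + const,
#which matches `0.5 * diff**2 + log(sigma)`, with the encoder target
#stop-gradient'ed so the model is fit to a fixed next-state embedding.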
def train(self, local_step):
    set1, set2 = self.buffer.dbc_sample(self.batch_size)
    s, a, r, ns, d = set1
    s2, a2, r2, ns2, d2 = set2

    target_min_aq = tf.minimum(
        self.target_critic1(self.target_encoder(ns),
                            self.actor(self.encoder(ns))),
        self.target_critic2(self.target_encoder(ns),
                            self.actor(self.encoder(ns))))
    target_q = tf.stop_gradient(
        r + self.gamma * (1 - d) *
        (target_min_aq -
         self.alpha.numpy() * self.actor.log_pi(self.encoder(ns))))

    #train critics (encoder trained jointly)
    with tf.GradientTape(persistent=True) as tape1:
        critic1_loss = tf.reduce_mean(
            tf.square(self.critic1(self.encoder(s), a) - target_q))
        critic2_loss = tf.reduce_mean(
            tf.square(self.critic2(self.encoder(s), a) - target_q))

    critic1_gradients = tape1.gradient(
        critic1_loss,
        self.encoder.trainable_variables + self.critic1.trainable_variables)
    self.critic1_optimizer.apply_gradients(
        zip(
            critic1_gradients, self.encoder.trainable_variables +
            self.critic1.trainable_variables))

    critic2_gradients = tape1.gradient(
        critic2_loss,
        self.encoder.trainable_variables + self.critic2.trainable_variables)
    self.critic2_optimizer.apply_gradients(
        zip(
            critic2_gradients, self.encoder.trainable_variables +
            self.critic2.trainable_variables))
    del tape1

    #train dynamics (encoder used together)
    next_feature = self.encoder(ns)
    with tf.GradientTape() as tape2:
        feature = self.encoder(s)
        mu, sigma = self.dynamics_model(tf.concat([feature, a], axis=1))
        if sigma[0][0].numpy() == 0:
            if not self.dynamics_model.deterministic:
                print("error")
            #deterministic model: fall back to unit variance
            sigma = tf.ones_like(mu)
        diff = (mu - tf.stop_gradient(next_feature)) / sigma
        dynamics_loss = tf.reduce_mean(0.5 * tf.square(diff) +
                                       tf.math.log(sigma))

    dynamics_gradients = tape2.gradient(
        dynamics_loss, self.encoder.trainable_variables +
        self.dynamics_model.trainable_variables)
    self.dynamics_optimizer.apply_gradients(
        zip(
            dynamics_gradients, self.encoder.trainable_variables +
            self.dynamics_model.trainable_variables))
    del tape2

    #train rewards (encoder used together)
    with tf.GradientTape() as tape3:
        feature = self.encoder(s)
        sample_dynamics = self.dynamics_model.sample(
            tf.concat([feature, a], axis=1))
        reward_prediction = self.reward_model(sample_dynamics)
        reward_loss = tf.reduce_mean(tf.square(reward_prediction - r))

    reward_gradients = tape3.gradient(
        reward_loss, self.encoder.trainable_variables +
        self.reward_model.trainable_variables)
    self.reward_optimizer.apply_gradients(
        zip(
            reward_gradients, self.encoder.trainable_variables +
            self.reward_model.trainable_variables))
    del tape3

    #train encoder on the bisimulation objective
    with tf.GradientTape() as tape4:
        feature1 = self.encoder(s)
        feature2 = self.encoder(s2)
        mu1, sigma1 = self.dynamics_model(tf.concat([feature1, a], axis=1))
        mu2, sigma2 = self.dynamics_model(tf.concat([feature2, a2],
                                                    axis=1))

        z_dist = tf.abs(feature1 - feature2)
        r_dist = tf.abs(r - r2)
        transition_dist = tf.sqrt(
            tf.square(tf.abs(mu1 - mu2)) +
            tf.square(tf.abs(sigma1 - sigma2)))
        #.numpy() detaches the target from the graph
        bisimilarity = (tf.cast(r_dist, tf.float32) + self.gamma *
                        tf.cast(transition_dist, tf.float32)).numpy()
        encoder_loss = self.bisim_coef * tf.reduce_mean(
            tf.square(z_dist - bisimilarity))

    encoder_gradients = tape4.gradient(encoder_loss,
                                       self.encoder.trainable_variables)
    self.encoder_optimizer.apply_gradients(
        zip(encoder_gradients, self.encoder.trainable_variables))
    del tape4

    #delayed actor and alpha updates
    if local_step % 2 == 0:
        with tf.GradientTape() as tape5:
            mu, sigma = self.actor.mu_sigma(
                tf.stop_gradient(self.encoder(s)))
            output = mu + tf.random.normal(shape=mu.shape) * sigma
            min_aq_rep = tf.minimum(
                self.critic1(tf.stop_gradient(self.encoder(s)), output),
                self.critic2(tf.stop_gradient(self.encoder(s)), output))
            actor_loss = tf.reduce_mean(
                self.alpha.numpy() *
                self.actor.log_pi(tf.stop_gradient(self.encoder(s))) -
                min_aq_rep)

        actor_gradients = tape5.gradient(actor_loss,
                                         self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_gradients, self.actor.trainable_variables))
        del tape5

        if self.train_alpha:
            with tf.GradientTape() as tape6:
                alpha_loss = -(tf.exp(self.log_alpha) * tf.stop_gradient(
                    self.actor.log_pi(self.encoder(s)) +
                    self.target_entropy))
                alpha_loss = tf.nn.compute_average_loss(alpha_loss)

            log_alpha_gradients = tape6.gradient(alpha_loss,
                                                 [self.log_alpha])
            self.log_alpha_optimizer.apply_gradients(
                zip(log_alpha_gradients, [self.log_alpha]))
            del tape6

    soft_update(self.critic1, self.target_critic1, self.tau)
    soft_update(self.critic2, self.target_critic2, self.tau)
    soft_update(self.encoder, self.target_encoder, self.encoder_tau)
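
#The encoder objective above is the DBC bisimulation loss (Zhang et al.,
#2021): latent distances are regressed onto the bisimulation target
#  d(s_i, s_j) ~ |r_i - r_j| + gamma * W_2(P(.|s_i, a_i), P(.|s_j, a_j)),
#where the 2-Wasserstein distance between the diagonal Gaussians predicted
#by the dynamics model has the closed form
#  W_2^2 = ||mu_1 - mu_2||^2 + ||sigma_1 - sigma_2||^2,
#which is what the `transition_dist` term computes.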
def train(self, local_step):
    self.current_step += 1

    total_a_loss = 0
    total_c1_loss, total_c2_loss = 0, 0
    total_alpha_loss = 0
    total_ae_loss = 0
    loss_list = []

    s, a, r, ns, d = self.buffer.sample(self.batch_size)

    ns_action, ns_logpi = self.actor(self.encoder(ns))
    target_min_aq = tf.minimum(
        self.target_critic1(self.target_encoder(ns), ns_action),
        self.target_critic2(self.target_encoder(ns), ns_action))
    target_q = tf.stop_gradient(
        r + self.gamma * (1 - d) *
        (target_min_aq - self.alpha.numpy() * ns_logpi))

    #critic update (encoder trained jointly)
    with tf.GradientTape(persistent=True) as tape1:
        critic1_loss = tf.reduce_mean(
            tf.square(self.critic1(self.encoder(s), a) - target_q))
        critic2_loss = tf.reduce_mean(
            tf.square(self.critic2(self.encoder(s), a) - target_q))

    critic1_gradients = tape1.gradient(
        critic1_loss,
        self.encoder.trainable_variables + self.critic1.trainable_variables)
    self.critic1_optimizer.apply_gradients(
        zip(
            critic1_gradients, self.encoder.trainable_variables +
            self.critic1.trainable_variables))

    critic2_gradients = tape1.gradient(
        critic2_loss,
        self.encoder.trainable_variables + self.critic2.trainable_variables)
    self.critic2_optimizer.apply_gradients(
        zip(
            critic2_gradients, self.encoder.trainable_variables +
            self.critic2.trainable_variables))
    del tape1

    #actor update (delayed, encoder gradients blocked)
    if self.current_step % self.actor_update == 0:
        with tf.GradientTape() as tape2:
            s_action, s_logpi = self.actor(
                tf.stop_gradient(self.encoder(s)))
            min_aq_rep = tf.minimum(
                self.critic1(tf.stop_gradient(self.encoder(s)), s_action),
                self.critic2(tf.stop_gradient(self.encoder(s)), s_action))
            actor_loss = tf.reduce_mean(self.alpha.numpy() * s_logpi -
                                        min_aq_rep)

        actor_gradients = tape2.gradient(actor_loss,
                                         self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_gradients, self.actor.trainable_variables))
        del tape2

        #alpha update
        if self.train_alpha:
            with tf.GradientTape() as tape3:
                _, s_logpi = self.actor(self.encoder(s))
                alpha_loss = -(tf.exp(self.log_alpha) * tf.stop_gradient(
                    s_logpi + self.target_entropy))
                alpha_loss = tf.nn.compute_average_loss(alpha_loss)

            log_alpha_gradients = tape3.gradient(alpha_loss,
                                                 [self.log_alpha])
            self.log_alpha_optimizer.apply_gradients(
                zip(log_alpha_gradients, [self.log_alpha]))
            del tape3

    #encoder, decoder update; tape4 is deleted inside this block so it is
    #never referenced on steps where the autoencoder update is skipped
    if self.current_step % self.decoder_update == 0:
        with tf.GradientTape(persistent=True) as tape4:
            feature = self.encoder(s)
            recovered_s = self.decoder(feature)
            real_s = preprocess_obs(s)

            rec_loss = tf.reduce_mean(tf.square(recovered_s - real_s))
            #L2 penalty on the latent code
            latent_loss = tf.reduce_mean(
                0.5 * tf.reduce_sum(tf.square(feature), axis=1))
            ae_loss = rec_loss + self.decoder_latent_lambda * latent_loss

        encoder_gradients = tape4.gradient(ae_loss,
                                           self.encoder.trainable_variables)
        decoder_gradients = tape4.gradient(ae_loss,
                                           self.decoder.trainable_variables)
        self.encoder_optimizer.apply_gradients(
            zip(encoder_gradients, self.encoder.trainable_variables))
        self.decoder_optimizer.apply_gradients(
            zip(decoder_gradients, self.decoder.trainable_variables))
        del tape4

    #target network update
    if self.current_step % self.critic_update == 0:
        soft_update(self.critic1, self.target_critic1, self.tau)
        soft_update(self.critic2, self.target_critic2, self.tau)
        soft_update(self.encoder, self.target_encoder, self.encoder_tau)

    total_c1_loss += critic1_loss.numpy()
    total_c2_loss += critic2_loss.numpy()
    loss_list.append(['Loss/Critic1', total_c1_loss])
    loss_list.append(['Loss/Critic2', total_c2_loss])

    if self.current_step % self.decoder_update == 0:
        total_ae_loss += ae_loss.numpy()
        loss_list.append(['Loss/AutoEncoder', total_ae_loss])

    if self.current_step % self.actor_update == 0:
        total_a_loss += actor_loss.numpy()
        loss_list.append(['Loss/Actor', total_a_loss])
        if self.train_alpha:
            total_alpha_loss += alpha_loss.numpy()
            loss_list.append(['Loss/Alpha', total_alpha_loss])

    loss_list.append(['Alpha', tf.exp(self.log_alpha).numpy()])

    return loss_list
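
#`preprocess_obs` is not defined in this file. In the SAC-AE reference
#implementation the decoder target is the observation quantized to a few
#bits per channel, dithered, and centered around zero; a TF sketch under
#that assumption:
import tensorflow as tf


def preprocess_obs(obs, bits=5):
    #reduce to `bits` bits per channel, add dither noise, center at zero
    bins = 2**bits
    obs = tf.floor(tf.cast(obs, tf.float32) / 2**(8 - bits))
    obs = obs / bins + tf.random.uniform(tf.shape(obs)) / bins
    return obs - 0.5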
def train(self, training_num):
    for i in range(training_num):
        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        target_min_aq = tf.minimum(self.target_critic1(ns, self.actor(ns)),
                                   self.target_critic2(ns, self.actor(ns)))
        target_q = tf.stop_gradient(
            r + self.gamma * (1 - d) *
            (target_min_aq - self.alpha.numpy() * self.actor.log_pi(ns)))

        #critic training
        with tf.GradientTape(persistent=True) as tape1:
            critic1_loss = tf.reduce_mean(
                tf.square(self.critic1(s, a) - target_q))
            critic2_loss = tf.reduce_mean(
                tf.square(self.critic2(s, a) - target_q))

        critic1_gradients = tape1.gradient(critic1_loss,
                                           self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(critic1_gradients, self.critic1.trainable_variables))

        critic2_gradients = tape1.gradient(critic2_loss,
                                           self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(critic2_gradients, self.critic2.trainable_variables))
        del tape1

        #actor training
        with tf.GradientTape() as tape2:
            mu, sigma = self.actor.mu_sigma(s)
            output = mu + tf.random.normal(shape=mu.shape) * sigma
            min_aq_rep = tf.minimum(self.critic1(s, output),
                                    self.critic2(s, output))
            actor_loss = tf.reduce_mean(self.alpha.numpy() *
                                        self.actor.log_pi(s) - min_aq_rep)

        actor_gradients = tape2.gradient(actor_loss,
                                         self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_gradients, self.actor.trainable_variables))
        del tape2

        #alpha(temperature) training
        if self.train_alpha:
            with tf.GradientTape() as tape3:
                alpha_loss = -(tf.exp(self.log_alpha) * tf.stop_gradient(
                    self.actor.log_pi(s) + self.target_entropy))
                alpha_loss = tf.nn.compute_average_loss(
                    alpha_loss)  #from softlearning package

            alpha_grad = tape3.gradient(alpha_loss, [self.log_alpha])
            self.alpha_optimizer.apply_gradients(
                zip(alpha_grad, [self.log_alpha]))
            del tape3

        soft_update(self.critic1, self.target_critic1, self.tau)
        soft_update(self.critic2, self.target_critic2, self.tau)
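
#`actor.log_pi` is assumed to be the tanh-squashed Gaussian log-density
#used by SAC. A minimal sketch of the change-of-variables correction for a
#pre-squash sample u ~ N(mu, sigma^2) mapped to a = tanh(u); the actor
#class itself is not shown, so the names here are illustrative:
import numpy as np
import tensorflow as tf


def squashed_gaussian_log_pi(mu, sigma, u, eps=1e-6):
    #log-density of u under the diagonal Gaussian N(mu, sigma^2)
    gaussian_logp = tf.reduce_sum(
        -0.5 * tf.square((u - mu) / sigma) - tf.math.log(sigma) -
        0.5 * np.log(2.0 * np.pi),
        axis=1,
        keepdims=True)
    #change-of-variables correction for the tanh squashing
    correction = tf.reduce_sum(tf.math.log(1.0 - tf.square(tf.tanh(u)) +
                                           eps),
                               axis=1,
                               keepdims=True)
    return gaussian_logp - correction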
def train(self, training_num):
    for i in range(training_num):
        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        min_aq = tf.minimum(self.critic1(s, self.actor(s)),
                            self.critic2(s, self.actor(s)))
        target_v = tf.stop_gradient(min_aq -
                                    self.alpha * self.actor.log_pi(s))

        #v_network training
        with tf.GradientTape(persistent=True) as tape1:
            v_loss = 0.5 * tf.reduce_mean(
                tf.square(self.v_network(s) - target_v))

        v_gradients = tape1.gradient(v_loss,
                                     self.v_network.trainable_variables)
        self.v_network_optimizer.apply_gradients(
            zip(v_gradients, self.v_network.trainable_variables))
        del tape1

        target_q = tf.stop_gradient(r + self.gamma * (1 - d) *
                                    self.target_v_network(ns))

        #critic training
        with tf.GradientTape(persistent=True) as tape2:
            critic1_loss = 0.5 * tf.reduce_mean(
                tf.square(self.critic1(s, a) - target_q))
            critic2_loss = 0.5 * tf.reduce_mean(
                tf.square(self.critic2(s, a) - target_q))

        critic1_gradients = tape2.gradient(critic1_loss,
                                           self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(critic1_gradients, self.critic1.trainable_variables))

        critic2_gradients = tape2.gradient(critic2_loss,
                                           self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(critic2_gradients, self.critic2.trainable_variables))
        del tape2

        #actor training
        with tf.GradientTape() as tape3:
            mu, sigma = self.actor.mu_sigma(s)
            output = mu + tf.random.normal(shape=sigma.shape) * sigma
            min_aq_rep = tf.minimum(self.critic1(s, output),
                                    self.critic2(s, output))
            actor_loss = tf.reduce_mean(self.alpha * self.actor.log_pi(s) -
                                        min_aq_rep)

        actor_grad = tape3.gradient(actor_loss,
                                    self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))
        del tape3

        soft_update(self.v_network, self.target_v_network, self.tau)
def train(self, training_num):
    total_a_loss = 0
    total_c1_loss, total_c2_loss = 0, 0
    total_v_loss = 0

    for i in range(training_num):
        self.current_step += 1
        s, a, r, ns, d = self.buffer.sample(self.batch_size)

        s_action, s_logpi = self.actor(s)
        min_aq = tf.minimum(self.critic1(s, s_action),
                            self.critic2(s, s_action))
        target_v = tf.stop_gradient(min_aq - self.alpha * s_logpi)

        #v_network training
        with tf.GradientTape() as tape1:
            v_loss = 0.5 * tf.reduce_mean(
                tf.square(self.v_network(s) - target_v))

        v_gradients = tape1.gradient(v_loss,
                                     self.v_network.trainable_variables)
        self.v_network_optimizer.apply_gradients(
            zip(v_gradients, self.v_network.trainable_variables))

        target_q = tf.stop_gradient(r + self.gamma * (1 - d) *
                                    self.target_v_network(ns))

        #critic training
        with tf.GradientTape(persistent=True) as tape2:
            critic1_loss = 0.5 * tf.reduce_mean(
                tf.square(self.critic1(s, a) - target_q))
            critic2_loss = 0.5 * tf.reduce_mean(
                tf.square(self.critic2(s, a) - target_q))

        critic1_gradients = tape2.gradient(critic1_loss,
                                           self.critic1.trainable_variables)
        self.critic1_optimizer.apply_gradients(
            zip(critic1_gradients, self.critic1.trainable_variables))

        critic2_gradients = tape2.gradient(critic2_loss,
                                           self.critic2.trainable_variables)
        self.critic2_optimizer.apply_gradients(
            zip(critic2_gradients, self.critic2.trainable_variables))

        #actor training
        with tf.GradientTape() as tape3:
            s_action, s_logpi = self.actor(s)
            min_aq_rep = tf.minimum(self.critic1(s, s_action),
                                    self.critic2(s, s_action))
            actor_loss = tf.reduce_mean(self.alpha * s_logpi - min_aq_rep)

        actor_grad = tape3.gradient(actor_loss,
                                    self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

        soft_update(self.v_network, self.target_v_network, self.tau)
        del tape1, tape2, tape3

        total_a_loss += actor_loss.numpy()
        total_c1_loss += critic1_loss.numpy()
        total_c2_loss += critic2_loss.numpy()
        total_v_loss += v_loss.numpy()

    return [['Loss/Actor', total_a_loss], ['Loss/Critic1', total_c1_loss],
            ['Loss/Critic2', total_c2_loss], ['Loss/V', total_v_loss]]
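
#For reference, the SAC-v1 targets above (Haarnoja et al., 2018) are
#  V target:  E_{a~pi}[ min(Q1(s,a), Q2(s,a)) - alpha * log pi(a|s) ]
#  Q target:  r + gamma * (1 - d) * V_target(s')
#Only the V network keeps a Polyak-averaged target copy; the later SAC
#variants in this file drop V and use twin target critics instead.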