def build_training_op(self, loss):
  """Builds the training operation.

  Defines the optimization operation and performs the gradient
  computation for both TPU and non-TPU training.

  Args:
    loss: the loss tensor to minimize.

  Returns:
    The op that applies the clipped gradients.
  """
  adam_optimizer = tf.train.AdamOptimizer(
      learning_rate=self._decayed_learning_rate, epsilon=1e-5)
  if self._use_tpu:
    # Notes from: learning/brain/research/dune/examples/v2018_09/train.py
    # If we use TPUs, reduce_mean runs on each chip separately and by default
    # only the loss of the first chip is reported.
    #
    # You can either:
    # - execute this if, which synchronizes the losses
    #   across the chips to obtain the full loss on all samples.
    # - or remove this section, gaining some performance and getting the
    #   loss only from the first chip.
    # Compute gradients and average the loss across replicas.
    adam_optimizer = tf.tpu.CrossShardOptimizer(adam_optimizer)
    tpu_sum_loss = contrib_tpu.cross_replica_sum(loss / self._tpu_num_shards)
    grads_and_vars = adam_optimizer.compute_gradients(
        tpu_sum_loss, self.total_params)
    sum_grads = []
    sum_vars = []
    for (grad, var) in grads_and_vars:
      if grad is None:
        sum_grads.append(grad)
        sum_vars.append(var)
      else:
        # Sum the gradient across replicas, then renormalize.
        sum_grads.append(
            contrib_tpu.cross_replica_sum(grad) / self._tpu_num_shards)
        sum_vars.append(var)
    norm_grads, _ = tf.clip_by_global_norm(sum_grads, 0.5)
    grads_and_vars = list(zip(norm_grads, sum_vars))
  else:
    grads_and_vars = adam_optimizer.compute_gradients(
        loss, self.total_params)
    grads, variables = zip(*grads_and_vars)
    norm_grads, _ = tf.clip_by_global_norm(grads, 0.5)
    grads_and_vars = list(zip(norm_grads, variables))

  return adam_optimizer.apply_gradients(
      grads_and_vars, global_step=tf.train.get_global_step())
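# For reference: tf.clip_by_global_norm rescales the whole list of gradients
# by a single factor rather than clipping each tensor independently, so the
# update direction is preserved. A NumPy sketch of the same arithmetic
# (illustrative only; the example gradients are made up):
import numpy as np

def clip_by_global_norm_reference(grads, clip_norm):
    # global_norm = sqrt(sum_i ||g_i||^2); every gradient is scaled by
    # clip_norm / max(global_norm, clip_norm).
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    scale = clip_norm / max(global_norm, clip_norm)
    return [g * scale for g in grads], global_norm

grads = [np.array([3.0, 4.0]), np.array([12.0])]  # global norm = 13
clipped, norm = clip_by_global_norm_reference(grads, clip_norm=0.5)
print(norm, [g.tolist() for g in clipped])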
def __init__(self, sess, state_dim, action_dim, learning_rate, global_critic):
    self.sess = sess
    self.global_critic = global_critic
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.learning_rate = learning_rate

    # Build the critic network.
    self.model, self.phi, self.states = build_network(self.state_dim)

    # Placeholder for the temporal-difference targets.
    self.td_targets = tf.placeholder(tf.float32, [None, 1])

    # Worker loss and gradients.
    v_values = self.model.output  # abstract tensor
    loss = tf.reduce_sum(tf.square(self.td_targets - v_values))
    dj_dphi = tf.gradients(loss, self.phi)

    # Gradient clipping.
    dj_dphi, _ = tf.clip_by_global_norm(dj_dphi, 40)

    # Update the global network with the worker's gradients.
    grads = zip(dj_dphi, self.global_critic.phi)
    self.critic_optimizer = tf.train.AdamOptimizer(
        self.learning_rate).apply_gradients(grads)
def __init__(self, mdp, n_input, lr, n_h1=400, n_h2=300, l2=10,
             name='deep_irl_fc'):
    super(DeepIRLFC, self).__init__(mdp, lr)
    self.n_input = n_input
    self.lr = lr
    self.n_h1 = n_h1
    self.n_h2 = n_h2
    self.name = name

    self.sess = tf.compat.v1.Session()
    self.input_s, self.reward, self.theta = self._build_network(self.name)
    self.optimizer = tf.train.AdamOptimizer(lr)

    self.grad_r = tf.placeholder(tf.float32, [None, 1])
    self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.theta])
    self.grad_l2 = tf.gradients(self.l2_loss, self.theta)

    # d(reward)/d(theta), weighted by -grad_r, plus the L2 penalty gradient.
    self.grad_theta = tf.gradients(self.reward, self.theta, -self.grad_r)
    self.grad_theta = [
        tf.add(l2 * self.grad_l2[i], self.grad_theta[i])
        for i in range(len(self.grad_l2))
    ]
    self.grad_theta, _ = tf.clip_by_global_norm(self.grad_theta, 100.0)
    self.grad_norms = tf.global_norm(self.grad_theta)
    self.optimize = self.optimizer.apply_gradients(
        zip(self.grad_theta, self.theta))
    self.sess.run(tf.compat.v1.global_variables_initializer())
def output(image, labels, optimize, loss, out, reshaped_labels):
    """Handles the output of a model.

    Args:
        image: input image
        labels: labels
        optimize: bool, whether to construct the graph with an optimizer
        loss: objective of the model
        out: logits
        reshaped_labels: labels as an (x, 2) tensor

    Returns:
        If optimize:
            image, labels, and the optimizer op used to train.
        Else:
            image, labels, prob_of_cell (shape (b, s, s, 1), probability of a
            pixel being a cone), and correct_prediction (number of correct
            pixel classifications).
    """
    if optimize:
        optimizer = tf.train.RMSPropOptimizer(1e-3)
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        # Occasionally gradients would explode, so clip by global norm.
        gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        # Tells us if there are NaN values in any tensors.
        grad_checks = [
            tf.check_numerics(grad, 'Gradients exploding')
            for grad in gradients if grad is not None
        ]
        with tf.control_dependencies(grad_checks):
            optimize = optimizer.apply_gradients(zip(gradients, variables))
        return image, labels, optimize
    else:
        # Convert scores to probabilities.
        probs = tf.nn.softmax(out)
        # Gives the classification.
        prediction = tf.argmax(probs, 1)
        # Count how many classifications are correct.
        correct_prediction = tf.equal(tf.argmax(reshaped_labels, 1),
                                      prediction)
        # If you give images of shape (b, s, s, 1), then probs is (b, s, s, 1)
        # and each value of probs is the probability that the corresponding
        # pixel belongs to a cone.
        prob_of_cell = probs[:, 1]
        return image, labels, prob_of_cell, correct_prediction
def train(self, X, Y):
    # train_log = train_log.reshape(-1, self._batch_size, 21)
    global_step = tf.Variable(0, trainable=False, dtype=tf.int32,
                              name='global_step')
    starter_learning_rate = 0.01
    optimizer = tf.train.AdadeltaOptimizer(starter_learning_rate,
                                           epsilon=1e-06)

    # Compute the gradients for each variable.
    grads_and_vars = optimizer.compute_gradients(-self._loss)

    # Gradient clipping.
    grads, variables = zip(*grads_and_vars)
    grads_clipped, _ = tf.clip_by_global_norm(grads, clip_norm=1)
    apply_gradients_op = optimizer.apply_gradients(
        zip(grads_clipped, variables), global_step=global_step)

    # Start the session.
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())

    # for batch_session in train_log:
    #     inputs, targets = self._get_batch_train_sample(batch_session)
    # Get the loss and the probabilities that the model outputs.
    for step in range(1000):
        _, loss_, pred, probs = self.sess.run(
            [apply_gradients_op, self._loss, self._predictions,
             self._probabilities],
            feed_dict={
                self._inputs: X,
                self._targets: Y,
                self._state_placeholder: np.zeros(
                    (self._lstm_num_layers, 2, self._batch_size,
                     self._lstm_num_hidden)),
                self.keep_prob: 0.9
            })
        print(step, loss_)
def _build_train(self):
  """Build training ops."""
  print('-' * 80)
  print('Building train graph')
  reg_loss, loss = self._forward(self.x_train, self.y_train,
                                 self.train_params, self.batch_init_states,
                                 is_training=True)

  tf_vars = tf.trainable_variables()
  global_step = tf.train.get_or_create_global_step()
  lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) /
              tf.cast(self.params.bptt_steps, dtype=tf.float32))
  learning_rate = utils.get_lr(global_step, self.params) * lr_scale
  # learning_rate = tf.Print(
  #     learning_rate,
  #     [learning_rate, lr_scale, self.base_bptt, tf.shape(self.y_train)],
  #     message='lr: ', summarize=3)
  grads = tf.gradients(reg_loss, tf_vars)
  clipped_grads, grad_norm = tf.clip_by_global_norm(
      grads, self.params.grad_bound)

  (self.update_moving_avg_ops, self.use_moving_avg_vars,
   self.restore_normal_vars) = self._create_average_ops()
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  train_op = optimizer.apply_gradients(zip(clipped_grads, tf_vars),
                                       global_step=global_step)

  self.train_loss = loss
  self.train_op = train_op
  self.grad_norm = grad_norm
  self.learning_rate = learning_rate
def _create_optimizer(self, args):
    """Create optimizer to minimize loss.

    Args:
        args: Various arguments and specifications
    """
    # First extract mean and std for prior dists, dist over g, and dist over x.
    g_prior_mean, g_prior_logstd = tf.split(
        self.g_prior, [args.latent_dim, args.latent_dim], axis=1)
    g_prior_std = tf.exp(g_prior_logstd) + 1e-6
    g_mean, g_logstd = tf.split(self.g_dists,
                                [args.latent_dim, args.latent_dim], axis=1)
    g_std = tf.exp(g_logstd) + 1e-6

    # Get predictions for x and reconstructions.
    self.x_pred_norm = self._get_decoder_output(args, self.z_vals)
    self.x_pred = self.x_pred_norm * self.scale + self.shift

    # First component of loss: NLL of observed states.
    x_reshape = tf.reshape(
        self.x, [args.batch_size, 2 * args.seq_length, args.state_dim])
    x_pred_reshape = tf.reshape(
        self.x_pred_norm, [args.batch_size, args.seq_length, args.state_dim])
    self.x_pred_init = x_pred_reshape * self.scale + self.shift  # needed for ilqr

    # Add in predictions for how the system will evolve.
    self.x_pred_reshape = tf.concat([x_pred_reshape, self.x_future_norm],
                                    axis=1)
    self.x_pred_reshape_unnorm = self.x_pred_reshape * self.scale + self.shift

    # Prediction loss.
    self.pred_loss = tf.reduce_sum(
        tf.square(x_reshape - self.x_pred_reshape))

    # Weight the loss at t = T more heavily.
    self.pred_loss += 20.0 * tf.reduce_sum(
        tf.square(x_reshape[:, args.seq_length - 1] -
                  x_pred_reshape[:, args.seq_length - 1]))

    # Define the reconstructed state needed for ilqr.
    self.rec_state = self._get_decoder_output(
        args, self.z1) * self.scale + self.shift

    # Second component of loss: KLD between approximate posterior and prior.
    g_prior_dist = tf.distributions.Normal(loc=g_prior_mean,
                                           scale=g_prior_std)
    g_dist = tf.distributions.Normal(loc=g_mean, scale=g_std)
    self.kl_loss = tf.reduce_sum(
        tf.distributions.kl_divergence(g_dist, g_prior_dist))

    # Sum with regularization losses to form the total cost.
    self.cost = self.pred_loss + self.kl_weight * self.kl_loss + tf.reduce_sum(
        tf.losses.get_regularization_losses())

    # Perform the parameter update.
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    tvars = tf.trainable_variables()
    self.grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                           args.grad_clip)
    self.train = optimizer.apply_gradients(zip(self.grads, tvars))
def init_optimizer(self):
    """Configure the optimization algorithm."""
    # The paper uses SGD, with the learning rate decayed from 1.0 to 0.1.
    # Gradients and SGD update operation for training the model.
    trainable_params = tf.trainable_variables()
    if self.config['optimizer'] == 'adadelta':
        self.opt = tf.train.AdadeltaOptimizer(learning_rate=self.lr)
    elif self.config['optimizer'] == 'adam':
        self.opt = tf.train.AdamOptimizer(learning_rate=self.lr)
    elif self.config['optimizer'] == 'rmsprop':
        self.opt = tf.train.RMSPropOptimizer(learning_rate=self.lr)
    else:
        self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)

    # Compute gradients of the loss w.r.t. all trainable variables.
    gradients = tf.gradients(self.loss, trainable_params)
    # Clip gradients by the given maximum_gradient_norm.
    clip_gradients, _ = tf.clip_by_global_norm(
        gradients, self.config['max_gradient_norm'])
    # Update the model.
    self.train_op = self.opt.apply_gradients(
        zip(clip_gradients, trainable_params), global_step=self.global_step)
def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate,
             entropy_beta, global_actor):
    self.sess = sess
    self.global_actor = global_actor
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.action_bound = action_bound
    self.learning_rate = learning_rate

    # Upper and lower bounds on the standard deviation.
    self.std_bound = [1e-2, 1]

    # Build the worker's actor network.
    self.model, self.theta, self.states = build_network(
        self.state_dim, self.action_dim, self.action_bound)

    # Placeholders for the actions and advantages.
    self.actions = tf.placeholder(tf.float32, [None, self.action_dim])
    self.advantages = tf.placeholder(tf.float32, [None, 1])

    # Policy probability density function and entropy.
    mu_a, std_a = self.model.output
    log_policy_pdf, entropy = self.log_pdf(mu_a, std_a, self.actions)

    # Worker loss and gradients.
    loss_policy = log_policy_pdf * self.advantages
    loss = tf.reduce_sum(-loss_policy - entropy_beta * entropy)
    dj_dtheta = tf.gradients(loss, self.theta)

    # Gradient clipping.
    dj_dtheta, _ = tf.clip_by_global_norm(dj_dtheta, 40)

    # Update the global network with the worker's gradients.
    grads = zip(dj_dtheta, self.global_actor.theta)
    self.actor_optimizer = tf.train.AdamOptimizer(
        self.learning_rate).apply_gradients(grads)
def init_optimizer(self):
    print("setting optimizer..")
    # Add the L2 loss to the main loss, then backpropagate.
    self.l2_loss = tf.losses.get_regularization_loss()
    tf.summary.scalar("l2_loss", self.l2_loss)
    self.total_loss = tf.add(self.loss, self.l2_loss)
    tf.summary.scalar('final_loss', self.total_loss)

    # We need to define a dependency on the update ops before calculating
    # the total_loss.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    if update_ops:
        updates = tf.group(*update_ops)
        self.final_loss = control_flow_ops.with_dependencies(
            [updates], self.total_loss)
    else:
        self.final_loss = self.total_loss

    with tf.control_dependencies(update_ops):
        trainable_params = tf.trainable_variables()
        opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        # Compute gradients of the loss w.r.t. all trainable variables.
        gradients = tf.gradients(self.final_loss, trainable_params)
        # Clip gradients by the given maximum_gradient_norm.
        clip_gradients, _ = tf.clip_by_global_norm(gradients,
                                                   self.max_gradient_norm)
        # Update the model.
        self.update = opt.apply_gradients(
            zip(clip_gradients, trainable_params),
            global_step=self.global_step)
def create_optimizer(loss,
                     learning_rate,
                     num_train_steps,
                     weight_decay_rate=0.0,
                     use_tpu=False,
                     warmup_steps=0,
                     warmup_proportion=0,
                     lr_decay_power=1.0,
                     layerwise_lr_decay_power=-1,
                     n_transformer_layers=None,
                     name="adamw",
                     var_map=None):
  """Creates an optimizer and training op."""
  global_step = tf.train.get_or_create_global_step()
  learning_rate = tf.train.polynomial_decay(learning_rate,
                                            global_step,
                                            num_train_steps,
                                            end_learning_rate=0.0,
                                            power=lr_decay_power,
                                            cycle=False)
  warmup_steps = max(num_train_steps * warmup_proportion, warmup_steps)
  learning_rate *= tf.minimum(
      1.0,
      tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32))
  if layerwise_lr_decay_power > 0:
    learning_rate = _get_layer_lrs(learning_rate, layerwise_lr_decay_power,
                                   n_transformer_layers)
  if name == "recadam":
    optimizer = RecAdamOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=weight_decay_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        anneal_k=0.5,
        anneal_t0=500,
        anneal_w=1.0,
        pretrain_cof=5000.0,
        pretrain_params=var_map)
  else:
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=weight_decay_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
  if use_tpu:
    optimizer = tf.tpu.CrossShardOptimizer(optimizer)
  tvars = tf.trainable_variables()
  grads = tf.gradients(loss, tvars)
  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
  train_op = optimizer.apply_gradients(zip(grads, tvars),
                                       global_step=global_step)
  new_global_step = global_step + 1
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op
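# _get_layer_lrs is not shown in this snippet. Judging from the variant
# further below that indexes its result by variable-name prefix, it plausibly
# returns a dict mapping each layer's prefix to a geometrically decayed
# learning rate. A minimal sketch under that assumption (the helper name and
# prefix scheme here are illustrative, not the source's):
def _get_layer_lrs_sketch(learning_rate, layer_decay, n_layers):
    # Depth 0 is the embeddings; each transformer layer sits one step
    # deeper; the task-specific head is deepest and keeps the full rate.
    key_to_depth = {'embeddings/': 0}
    for layer in range(n_layers):
        key_to_depth['encoder/layer_%d/' % layer] = layer + 1
    key_to_depth['task_specific/'] = n_layers + 1
    return {
        key: learning_rate * (layer_decay ** (n_layers + 1 - depth))
        for key, depth in key_to_depth.items()
    }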
def _make_training_step(self, loss: tf.Tensor) -> tf.Tensor:
    """Constructs a training step from the loss parameter and
    hyperparameters."""
    optimizer_name = self.hyperparameters["optimizer"].lower()
    if optimizer_name == "sgd":
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=self.hyperparameters["learning_rate"])
    elif optimizer_name == "rmsprop":
        optimizer = tf.train.RMSPropOptimizer(
            learning_rate=self.hyperparameters["learning_rate"],
            decay=self.hyperparameters["learning_rate_decay"],
            momentum=self.hyperparameters["momentum"],
        )
    elif optimizer_name == "adam":
        optimizer = tf.train.AdamOptimizer(
            learning_rate=self.hyperparameters["learning_rate"])
    else:
        raise Exception('Unknown optimizer "%s".' %
                        (self.hyperparameters["optimizer"]))

    # Calculate and clip gradients.
    trainable_vars = self._sess.graph.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES)
    gradients = tf.gradients(loss, trainable_vars)
    clipped_gradients, _ = tf.clip_by_global_norm(
        gradients, self.hyperparameters["gradient_clip_value"])

    # Drop variables whose gradient is None (not reachable from the loss).
    pruned_clipped_gradients = []
    for (gradient, trainable_var) in zip(clipped_gradients, trainable_vars):
        if gradient is None:
            continue
        pruned_clipped_gradients.append((gradient, trainable_var))
    return optimizer.apply_gradients(pruned_clipped_gradients)
def nll_gnp_step_bandits(model, data, optimizer_config):
  """Applies gradient updates and returns appropriate metrics.

  Args:
    model: An instance of SNP Regressor.
    data: A 6-tuple consisting of context_x, context_y, target_x, target_y,
      unseen_target_y, and unseen_target_a (the unseen targets, i.e.
      target_x - context_x).
    optimizer_config: A dictionary with two keys: an 'optimizer' object and
      a 'max_grad_norm' for clipping gradients.

  Returns:
    nll_term: Negative log-likelihood of model for unseen targets.
    local_kl: KL loss for latent variables of unseen targets.
    global_kl: KL loss for global latent variable.
  """
  (context_x, context_y, target_x, target_y, unseen_target_y,
   unseen_target_a) = data
  num_context = tf.shape(context_x)[1]
  with tf.GradientTape() as tape:
    prediction = model(context_x, context_y, target_x, target_y)
    unseen_predictions = prediction[:, num_context:]
    nll_term = nll(unseen_target_y, unseen_predictions, unseen_target_a)
    local_kl = tf.reduce_mean(
        tf.reduce_sum(model.losses[-1][:, num_context:], axis=[1, 2]))
    global_kl = tf.reduce_mean(tf.reduce_sum(model.losses[-2], axis=-1))
    loss = nll_term + local_kl + global_kl
  gradients = tape.gradient(loss, model.trainable_variables)
  max_grad_norm = optimizer_config['max_grad_norm']
  optimizer = optimizer_config['optimizer']
  clipped_gradients, _ = tf.clip_by_global_norm(gradients, max_grad_norm)
  optimizer.apply_gradients(zip(clipped_gradients,
                                model.trainable_variables))
  return nll_term, local_kl, global_kl
def _add_train_op(self):
    """Sets self._train_op, the op to run for training."""
    # Take gradients of the trainable variables w.r.t. the loss function to
    # minimize.
    loss_to_minimize = self._total_loss if self._hps.coverage else self._loss
    tvars = tf.trainable_variables()
    gradients = tf.gradients(
        loss_to_minimize, tvars,
        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

    # Clip the gradients.
    with tf.device("/gpu:0"):
        grads, global_norm = tf.clip_by_global_norm(
            gradients, self._hps.max_grad_norm)

    # Add a summary.
    tf.summary.scalar('global_norm', global_norm)

    # Apply the Adagrad optimizer.
    optimizer = tf.train.AdagradOptimizer(
        self._hps.lr, initial_accumulator_value=self._hps.adagrad_init_acc)
    with tf.device("/gpu:0"):
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step=self.global_step,
            name='train_step')
def build_optimizer(self):
    # Clip gradients by global norm before applying them.
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      self.grad_clip)
    train_op = tf.train.AdamOptimizer(self.learning_rate)
    self.optimizer = train_op.apply_gradients(zip(grads, tvars))
def _build_train(self):
  """Build training ops."""
  print('-' * 80)
  print('Building train graph')
  reg_loss, loss = self._forward(self.x_train, self.y_train,
                                 self.train_params, self.batch_init_states,
                                 is_training=True)

  tf_vars = [
      v for v in tf.trainable_variables() if v.name.startswith(self.name)
  ]
  global_step = tf.train.get_or_create_global_step()
  lr_scale = (tf.cast(tf.shape(self.y_train)[-1], dtype=tf.float32) /
              tf.cast(self.params.bptt_steps, dtype=tf.float32))
  learning_rate = utils.get_lr(global_step, self.params) * lr_scale
  grads = tf.gradients(reg_loss, tf_vars)
  # Clip only when a gradient bound is given; otherwise just track the norm.
  if self.params.grad_bound:
    grads, grad_norm = tf.clip_by_global_norm(grads, self.params.grad_bound)
  else:
    grad_norm = tf.global_norm(grads)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  train_op = optimizer.apply_gradients(zip(grads, tf_vars),
                                       global_step=global_step)

  self.train_loss = loss
  self.train_op = train_op
  self.grad_norm = grad_norm
  self.learning_rate = learning_rate
def build_train(self, initial_lr):
    """Builds the training op with exponentially decayed learning rate and
    gradient clipping."""
    # count_number_trainable_params(verbose=True)  # TODO remove

    # Decay the learning rate by manually incrementing decay_step.
    decay_step = tf.Variable(0.0, name='decay_step', trainable=False)
    learning_rate = tf.train.exponential_decay(initial_lr, decay_step, 1,
                                               0.8, staircase=True,
                                               name="learning_rate")

    trainable_variables = tf.trainable_variables()
    optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=0.9)

    # Clip gradients.
    grads = tf.gradients(self.loss, trainable_variables)
    grads, _ = tf.clip_by_global_norm(grads, 1.0,
                                      use_norm=tf.global_norm(grads))
    train_op = optimizer.apply_gradients(zip(grads, trainable_variables))

    self.decay_step = decay_step
    self.learning_rate = learning_rate
    self.train_op = train_op
def eager_train_step(detection_model,
                     features,
                     labels,
                     unpad_groundtruth_tensors,
                     optimizer,
                     learning_rate,
                     add_regularization_loss=True,
                     clip_gradients_value=None,
                     global_step=None,
                     num_replicas=1.0):
  is_training = True
  detection_model._is_training = is_training
  tf.keras.backend.set_learning_phase(is_training)

  labels = model_lib.unstack_batch(
      labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

  with tf.GradientTape() as tape:
    losses_dict, _ = _compute_losses_and_predictions_dicts(
        detection_model, features, labels, add_regularization_loss)
    total_loss = losses_dict["Loss/total_loss"]
    total_loss = tf.math.divide(total_loss,
                                tf.constant(num_replicas, dtype=tf.float32))
    losses_dict["Loss/normalized_total_loss"] = total_loss

  for loss_type in losses_dict:
    tf.compat.v2.summary.scalar(loss_type, losses_dict[loss_type],
                                step=global_step)

  trainable_variables = detection_model.trainable_variables
  gradients = tape.gradient(total_loss, trainable_variables)
  if clip_gradients_value:
    gradients, _ = tf.clip_by_global_norm(gradients, clip_gradients_value)
  optimizer.apply_gradients(zip(gradients, trainable_variables))
  return total_loss
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps,
                     use_tpu):
  """Creates an optimizer training op."""
  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate.
  learning_rate = tf.train.polynomial_decay(learning_rate,
                                            global_step,
                                            num_train_steps,
                                            end_learning_rate=0.0,
                                            power=1.0,
                                            cycle=False)

  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
  # learning rate will be `global_step/num_warmup_steps * init_lr`.
  if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = ((1.0 - is_warmup) * learning_rate +
                     is_warmup * warmup_learning_rate)

  # It is recommended that you use this optimizer for fine tuning, since this
  # is how the model was trained (note that the Adam m/v variables are NOT
  # loaded from init_checkpoint.)
  optimizer = AdamWeightDecayOptimizer(
      learning_rate=learning_rate,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

  if use_tpu:
    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

  tvars = tf.trainable_variables()
  grads = tf.gradients(loss, tvars)

  # This is how the model was pre-trained.
  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

  train_op = optimizer.apply_gradients(zip(grads, tvars),
                                       global_step=global_step)

  # Normally the global step update is done inside of `apply_gradients`.
  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
  # a different optimizer, you should probably take this line out.
  new_global_step = global_step + 1
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op
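# The warmup/decay arithmetic above is easy to sanity-check outside the
# graph. A pure-Python mirror of the same schedule (the step values below
# are made up for illustration): linear warmup to init_lr over
# num_warmup_steps, then linear (power=1.0 polynomial) decay to 0.0 at
# num_train_steps.
def bert_lr_schedule(step, init_lr, num_train_steps, num_warmup_steps):
    if num_warmup_steps and step < num_warmup_steps:
        return init_lr * step / num_warmup_steps
    decay_steps = min(step, num_train_steps)
    return init_lr * (1.0 - decay_steps / num_train_steps)

for s in (0, 500, 1000, 5500, 10000):
    print(s, bert_lr_schedule(s, init_lr=5e-5, num_train_steps=10000,
                              num_warmup_steps=1000))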
def clip_gradients(gvs, value_clip=0, norm_clip=0):
  """Clips gradients."""
  grads, vs = zip(*gvs)
  grads = list(grads)

  if value_clip > 0:
    for i, g in enumerate(grads):
      if g is not None:
        grads[i] = tf.clip_by_value(g, -value_clip, value_clip)

  if norm_clip > 0:
    n_params = sum(np.prod(g.shape) for g in grads if g is not None)

    # n_params is most likely tf.Dimension and cannot be converted
    # to float directly.
    norm_clip *= np.sqrt(float(int(n_params)))

    grads_to_clip = [(i, g) for i, g in enumerate(grads) if g is not None]
    idx, grads_to_clip = zip(*grads_to_clip)
    clipped_grads = tf.clip_by_global_norm(grads_to_clip, norm_clip)[0]

    for i, g in zip(idx, clipped_grads):
      grads[i] = g

  return [item for item in zip(grads, vs)]
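# Why the snippet above scales norm_clip by sqrt(n_params): if every
# gradient component had magnitude c, the global norm would be
# c * sqrt(n_params), so a per-component threshold of norm_clip corresponds
# to a global-norm threshold of norm_clip * sqrt(n_params). A tiny numeric
# check (illustrative values only):
import numpy as np

c, n_params = 0.1, 10000
g = np.full(n_params, c)
print(np.linalg.norm(g), c * np.sqrt(n_params))  # both 10.0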
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
  """Applying gradients and tune hyperparams with YellowFin.

  Args:
    grads_and_vars: List of (gradient, variable) pairs as returned by
      compute_gradients().
    global_step: Optional Variable to increment by one after the variables
      have been updated.
    name: Optional name for the returned operation. Defaults to the name
      passed to the Optimizer constructor.

  Returns:
    (A group of operations)
    Variable Update with Momentum ops,
    YellowFin ops (Curvature, Variance, Distance),
    SingleStep and lr_mu tuning ops,
    Step increment ops.
  """
  self._grad, self._vars = zip(*[(g, t) for g, t in grads_and_vars
                                 if g is not None])

  # Variable update with momentum, with optional gradient clipping.
  with tf.variable_scope("apply_updates"):
    if self._clip_thresh_var is not None:
      self._grad, _ = tf.clip_by_global_norm(self._grad,
                                             self._clip_thresh_var)
    apply_grad_op = self._momentum_optimizer.apply_gradients(
        zip(self._grad, self._vars), global_step=global_step, name=name)

  # Begin lr and mu tuning.
  with tf.variable_scope("prepare_yellowFin_variables"):
    # The dependencies ideally only need to be after clip is done,
    # i.e. they depend on self._grad. However, control_dependencies
    # does not support indexed slices for sparse gradients.
    # The alternative dependencies here might be slightly slower due
    # to less parallelization.
    with tf.control_dependencies([apply_grad_op]):
      prepare_variables_op = self._prepare_variables()

  with tf.variable_scope("yellowfin"):
    with tf.control_dependencies([prepare_variables_op]):
      yellowfin_op = self._yellowfin()

  # Update the YellowFin step variable.
  with tf.control_dependencies([yellowfin_op]):
    self._increment_step_op = tf.assign_add(self._step, 1).op

  return tf.group(apply_grad_op, prepare_variables_op, yellowfin_op,
                  self._increment_step_op)
def _update_actor(self, obs, mask):
    """Updates parameters of the actor given samples from the batch.

    Args:
        obs: A tfe.Variable with a batch of observations.
        mask: A tfe.Variable with a batch of masks.
    """
    with tf.GradientTape() as tape:
        if self.use_td3:
            q_pred, _ = self.critic(obs, self.actor(obs))
        else:
            q_pred = self.critic(obs, self.actor(obs))
        if self.use_absorbing_state:
            # Don't update the actor for absorbing states,
            # and skip the update if all states are absorbing.
            a_mask = 1.0 - tf.maximum(0.0, -mask)
            if tf.reduce_sum(a_mask) < 1e-8:
                return
            actor_loss = -tf.reduce_sum(
                q_pred * a_mask) / tf.reduce_sum(a_mask)
        else:
            actor_loss = -tf.reduce_mean(q_pred)
    grads = tape.gradient(actor_loss, self.actor.variables)
    # Clipping makes training more stable.
    grads, _ = tf.clip_by_global_norm(grads, 40.0)
    self.actor_optimizer.apply_gradients(zip(grads, self.actor.variables),
                                         global_step=self.actor_step)

    with contrib_summary.record_summaries_every_n_global_steps(
            100, self.actor_step):
        contrib_summary.scalar('actor/loss', actor_loss,
                               step=self.actor_step)
def ProcessGradients(grads_and_vars,
                     global_gradient_clip=0.0,
                     sanitize_gradients=False,
                     normalize_gradients=False):
  tf.logging.info("Processing gradients")
  grads, vars_ = list(zip(*grads_and_vars))

  if sanitize_gradients:
    # Replace non-finite gradient entries with zeros.
    new_grads = []
    for g in grads:
      if g is not None:
        g = tf.where(tf.is_finite(g), g, tf.zeros_like(g))
      new_grads.append(g)
    grads = new_grads

  if normalize_gradients:
    # Rescale each gradient to unit L2 norm.
    new_grads = []
    for g in grads:
      if g is not None:
        g *= tf.rsqrt(tf.maximum(1e-12, tf.reduce_sum(tf.square(g))))
      new_grads.append(g)
    grads = new_grads

  if global_gradient_clip > 0:
    grads, grad_norm = tf.clip_by_global_norm(grads, global_gradient_clip)
    grads_and_vars = list(zip(grads, vars_))
  else:
    grad_norm = tf.global_norm(grads)

  tf.summary.scalar("global_grad_norm", grad_norm)
  return grads_and_vars
def add_train_op(self, lr_method, lr, loss, clip=-1):
    """Defines self.train_op that performs an update on a batch.

    Args:
        lr_method: (string) sgd method, for example "adam"
        lr: (tf.placeholder) tf.float32, learning rate
        loss: (tensor) tf.float32 loss to minimize
        clip: (python float) clipping of gradient. If < 0, no clipping
    """
    _lr_m = lr_method.lower()  # lower to make sure

    with tf.variable_scope("train_step"):
        if _lr_m == 'adam':  # sgd method
            optimizer = tf.train.AdamOptimizer(lr)
        elif _lr_m == 'adagrad':
            optimizer = tf.train.AdagradOptimizer(lr)
        elif _lr_m == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(lr)
        elif _lr_m == 'rmsprop':
            optimizer = tf.train.RMSPropOptimizer(lr)
        else:
            raise NotImplementedError("Unknown method {}".format(_lr_m))

        if clip > 0:  # gradient clipping if clip is positive
            grads, vs = zip(*optimizer.compute_gradients(loss))
            grads, gnorm = tf.clip_by_global_norm(grads, clip)
            self.train_op = optimizer.apply_gradients(zip(grads, vs))
        else:
            self.train_op = optimizer.minimize(loss)
def _finish(self, caches):
    """Applies the accumulated update steps, with optional global-norm
    clipping and moving averages of the variables."""
    if self.clip > 0:
        S_t = [cache['s_t'] for cache in caches]
        S_t, _ = tf.clip_by_global_norm(S_t, self.clip)
        for cache, s_t in zip(caches, S_t):
            cache['s_t'] = s_t

    for cache in caches:
        x_tm1 = cache['x_tm1']
        s_t = cache['s_t']
        updates = cache['updates']
        with tf.name_scope('update_' + x_tm1.op.name), tf.device(
                x_tm1.device):
            if 'idxs' in cache:
                idxs = cache['idxs']
                x_t = tf.scatter_sub(x_tm1, idxs, s_t)
                if self.chi > 0:
                    x_t_ = tf.gather(x_t, idxs)
                    x_bar_t, t_x_bar = self._sparse_moving_average(
                        x_tm1, idxs, x_t_, 'x', beta=self.chi)
            else:
                x_t = tf.assign_sub(x_tm1, s_t)
                if self.chi > 0:
                    x_bar_t, t_x_bar = self._dense_moving_average(
                        x_tm1, x_t, 'x', beta=self.chi)
        updates.append(x_t)
        if self.chi > 0:
            updates.extend([x_bar_t, t_x_bar])

    update_ops = [tf.group(*cache['updates']) for cache in caches]
    return tf.group(*update_ops, name='update')
def apply_gradients(self, grads_and_vars, *args, **kwargs):
    if self._clip_norm == np.inf:
        return self._opt.apply_gradients(grads_and_vars, *args, **kwargs)
    grads, vars_ = list(zip(*grads_and_vars))
    clipped_grads, _ = tf.clip_by_global_norm(grads, self._clip_norm)
    return self._opt.apply_gradients(zip(clipped_grads, vars_), *args,
                                     **kwargs)
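# For context, a method like the one above typically lives on a thin wrapper
# around another optimizer. A minimal sketch of such a wrapper (the class
# name and constructor are assumptions, not the source's):
import numpy as np
import tensorflow.compat.v1 as tf

class ClippingOptimizer(object):
    """Hypothetical wrapper: clips by global norm, then delegates."""

    def __init__(self, opt, clip_norm=np.inf):
        self._opt = opt
        self._clip_norm = clip_norm

    def compute_gradients(self, *args, **kwargs):
        return self._opt.compute_gradients(*args, **kwargs)

    def apply_gradients(self, grads_and_vars, *args, **kwargs):
        if self._clip_norm == np.inf:
            return self._opt.apply_gradients(grads_and_vars, *args,
                                             **kwargs)
        grads, vars_ = list(zip(*grads_and_vars))
        clipped_grads, _ = tf.clip_by_global_norm(grads, self._clip_norm)
        return self._opt.apply_gradients(zip(clipped_grads, vars_),
                                         *args, **kwargs)

# Usage: behaves like the wrapped optimizer, with clipping added, e.g.
#   opt = ClippingOptimizer(tf.train.AdamOptimizer(1e-3), clip_norm=5.0)
#   train_op = opt.apply_gradients(opt.compute_gradients(loss))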
def create_optimizer(
    loss,
    learning_rate,
    num_train_steps,
    weight_decay_rate=0.0,
    use_tpu=False,
    warmup_steps=0,
    warmup_proportion=0,
    lr_decay_power=1.0,
    layerwise_lr_decay_power=-1,
    n_transformer_layers=None,
    decoder_layers=None,
):
    """Creates an optimizer and training op."""
    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.polynomial_decay(
        learning_rate,
        global_step,
        num_train_steps,
        end_learning_rate=0.0,
        power=lr_decay_power,
        cycle=False,
    )
    warmup_steps = max(num_train_steps * warmup_proportion, warmup_steps)
    learning_rate *= tf.minimum(
        1.0,
        tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32),
    )
    cp_learning_rate = learning_rate
    if layerwise_lr_decay_power > 0:
        # _get_layer_lrs returns a dict from variable-name prefix to LR;
        # the shared embeddings and the decoder's final layer norm keep the
        # undecayed rate.
        learning_rate = _get_layer_lrs(
            learning_rate,
            layerwise_lr_decay_power,
            n_transformer_layers,
            decoder_layers,
        )
        learning_rate['embedding_shared_weights/'] = cp_learning_rate
        learning_rate['decoder_stack/layer_normalization/'] = cp_learning_rate
        print(learning_rate)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=weight_decay_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'],
    )
    if use_tpu:
        optimizer = tf.tpu.CrossShardOptimizer(optimizer)
    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)
    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
def step_fn(self, params, model):
  """A single step for supervised learning."""
  images, labels = tf.raw_ops.InfeedDequeueTuple(
      dtypes=params.train_dtypes, shapes=params.train_shapes)

  if labels.dtype == tf.int32:
    labels = tf.one_hot(labels, depth=params.num_classes, dtype=tf.float32)
  global_step = tf.train.get_or_create_global_step()

  train_batch_size = tf.cast(params.train_batch_size, tf.float32)
  num_replicas = tf.cast(params.num_replicas, tf.float32)

  with tf.variable_scope(MODEL_SCOPE):
    logits = model(images, training=True)

  cross_entropy = tf.losses.softmax_cross_entropy(
      onehot_labels=labels,
      logits=logits,
      label_smoothing=params.label_smoothing,
      reduction=tf.losses.Reduction.SUM) / train_batch_size

  l2_reg_rate = tf.cast(params.weight_decay / params.num_replicas,
                        tf.float32)
  weight_dec = common_utils.get_l2_loss()
  total_loss = cross_entropy + weight_dec * l2_reg_rate

  variables = tf.trainable_variables()
  gradients = tf.gradients(total_loss, variables)
  gradients = [tf.tpu.cross_replica_sum(g) for g in gradients]
  gradients, grad_norm = tf.clip_by_global_norm(gradients,
                                                params.grad_bound)

  learning_rate, optimizer = common_utils.get_optimizer(params)
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  train_op = tf.cond(
      tf.math.is_finite(grad_norm),
      lambda: optimizer.apply_gradients(zip(gradients, variables),
                                        global_step=global_step),
      tf.no_op)

  with tf.control_dependencies(update_ops + [train_op]):
    ema_train_op = common_utils.setup_ema(params,
                                          f'{MODEL_SCOPE}/{model.name}')

  with tf.control_dependencies([ema_train_op]):
    logs = collections.OrderedDict()
    logs['global_step'] = tf.cast(global_step, tf.float32)
    logs['loss/total'] = total_loss
    logs['loss/weight_decay'] = weight_dec / num_replicas
    logs['loss/cross_entropy'] = cross_entropy
    logs['loss/lr'] = tf.identity(learning_rate) / num_replicas
    logs['loss/grad_norm'] = grad_norm / num_replicas

    tensors = [tf.expand_dims(t, axis=0) for t in logs.values()]
    self.step_info = {k: [tf.float32, [1]] for k in logs.keys()}
    outfeed_enqueue_op = tf.cond(
        common_utils.should_log(params),
        lambda: tf.raw_ops.OutfeedEnqueueTuple(inputs=tensors), tf.no_op)
  return outfeed_enqueue_op
def __init__(self):
    # Placeholders.
    self.sph_user = tf.sparse_placeholder(tf.int32, name='sph_user')
    self.sph_doc = tf.sparse_placeholder(tf.int32, name='sph_doc')
    self.sph_con = tf.sparse_placeholder(tf.int32, name='sph_con')
    self.ph_reward = tf.placeholder(tf.float32, name='ph_reward')
    self.ph_nq = tf.placeholder(
        tf.float32, shape=[pd['batch_size'], pd['rnn_max_len']],
        name='ph_nq')

    # Main networks.
    self.dst_embed, self.mq = self.build_net('main')
    # Target networks.
    _, self.tq = self.build_net('target')

    # TD error: r + gamma * next_q - q.
    diff = tf.reshape(self.ph_reward, [-1]) + tf.scalar_mul(
        tf.constant(pd['gamma']),
        tf.reshape(self.ph_nq, [-1])) - tf.reshape(self.mq, [-1])
    self.loss = tf.reduce_mean(tf.square(diff))
    self.a_grads = tf.clip_by_global_norm(
        tf.gradients(self.mq, self.dst_embed), pd['grad_clip'])[0]

    vs = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                           scope='main/value')
    vs.extend(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                          scope='main/feat_embedding'))
    self.grads = tf.clip_by_global_norm(tf.gradients(self.loss, vs),
                                        pd['grad_clip'])[0]
    with tf.variable_scope('train_value'):
        optimizer = tf.train.AdamOptimizer(pd['lr'])
        self.opt = optimizer.apply_gradients(zip(self.grads, vs))

    self.m_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope="main/value")
    self.m_params.extend(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                          scope='main/feat_embedding'))
    self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope="target/value")
    self.t_params.extend(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                          scope='target/feat_embedding'))

    # Soft update of the target network toward the main network.
    alpha = pd['double_networks_sync_step']
    self.sync_op = [
        tf.assign(t, (1.0 - alpha) * t + alpha * m)
        for t, m in zip(self.t_params, self.m_params)
    ]

    self.total_loss, self.batch_counter = 0.0, 0
def minimize_with_clipping(optimizer, loss):
    # `max_global_gradient_norm` comes from the enclosing scope.
    grads_and_vars = optimizer.compute_gradients(loss)
    if max_global_gradient_norm is not None:
        grads, variables = zip(*grads_and_vars)
        grads, _ = tf.clip_by_global_norm(grads, max_global_gradient_norm)
        grads_and_vars = list(zip(grads, variables))
    return optimizer.apply_gradients(grads_and_vars)