def FProp(self, theta, inputs, *extra_inputs):
  initial_step_seed = py_utils.GetStepSeed()
  final_step_seed = py_utils.GenerateSeedFromName(
      tf.no_op(name='new_step_seed').name)
  num_layers = len(self.sub_layers)

  def Bak(inputs, outputs, d_outputs):
    """Backward step."""
    del inputs  # unused
    output_acts, step_seeds = outputs
    d_outputs = d_outputs[0]

    d_layer_thetas = []
    for layer_idx in reversed(range(num_layers)):
      f_seed, g_seed = step_seeds[layer_idx]
      layer = self.sub_layers[layer_idx]
      layer_theta = theta.sub_layers[layer_idx]
      input_acts, d_inputs, d_theta = layer.ReverseAndGrad(
          layer_theta, output_acts, d_outputs, f_seed, g_seed, *extra_inputs)
      d_layer_thetas.append(d_theta)
      # Passes reconstructed inputs to the previous layer.
      output_acts = input_acts
      d_outputs = d_inputs
    py_utils.ResetStepSeed(final_step_seed)
    d_theta = py_utils.NestedMap()
    d_theta.sub_layers = list(reversed(d_layer_thetas))
    extra_grads = [tf.zeros_like(t) for t in extra_inputs]
    return [
        tf.zeros_like(initial_step_seed), d_theta, d_inputs, extra_grads
    ]

  def Fwd(xs):
    """Forward pass."""
    initial_step_seed, theta, acts, extra_inputs = xs
    py_utils.ResetStepSeed(initial_step_seed)
    layer_step_seeds = []
    for layer_theta, layer in zip(theta.sub_layers, self.sub_layers):
      acts, f_seed, g_seed = layer.FProp(layer_theta, acts, *extra_inputs)
      layer_step_seeds += [(f_seed, g_seed)]
    return [acts, layer_step_seeds]

  if self.params.custom_gradient:
    acts, _ = py_utils.CallDefun(
        Fwd, [initial_step_seed, theta, inputs, extra_inputs], Bak)
    py_utils.ResetStepSeed(final_step_seed)
    return acts
  else:
    acts = inputs
    for layer_theta, layer in zip(theta.sub_layers, self.sub_layers):
      acts, _, _ = layer.FProp(layer_theta, acts, *extra_inputs)
    return acts
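# A minimal sketch of the reversible-coupling idea behind ReverseAndGrad above
# (illustrative only; _f and _g are hypothetical stand-ins, not the Lingvo
# sub-layer API): the outputs carry enough information to reconstruct the
# inputs exactly, so the backward pass can recompute activations instead of
# storing them.
import tensorflow as tf


def _f(x):
  # Stand-in for one half of the coupling; any deterministic function works.
  return tf.tanh(x)


def _g(x):
  # Stand-in for the other half of the coupling.
  return 0.5 * x


def _ReversibleForward(x1, x2):
  y1 = x1 + _f(x2)
  y2 = x2 + _g(y1)
  return y1, y2


def _ReversibleBackward(y1, y2):
  # Invert the coupling: recover the inputs from the outputs alone.
  x2 = y2 - _g(y1)
  x1 = y1 - _f(x2)
  return x1, x2


# Round trip: the reconstructed inputs match the originals up to float
# rounding, which is what lets Bak pass `input_acts` to the previous layer.
y1, y2 = _ReversibleForward(tf.constant([1.0, 2.0]), tf.constant([-0.5, 0.25]))
x1, x2 = _ReversibleBackward(y1, y2)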
def SendRecv(graph, dtype):
  to_send = np.array(3.1415 + 2j).astype(dtype.as_numpy_dtype)
  with graph.as_default():
    # `shape`, `sender` and `recver` are resolved from the enclosing scope.
    ch = sendrecv.Channel(dtype, shape, sender, recver, "test")
    with tf.device(sender):
      # py_utils.CallDefun requires non-empty inputs. Same below.
      def Send(_):
        src_val = tf.constant(to_send)
        ch.Send(src_val)
        return tf.convert_to_tensor(1.0)

      send_op = py_utils.CallDefun(Send, tf.convert_to_tensor(0))
    with tf.device(recver):
      def Recv(_):
        return ch.Recv()

      recv_val = py_utils.CallDefun(Recv, tf.convert_to_tensor(0))
  return send_op, recv_val, to_send
def FProp(self, theta, current_step):
  return py_utils.CallDefun(self._combined,
                            tf.convert_to_tensor(current_step))
def FProp(self, theta, current_step):
  return py_utils.CallDefun(self._exp,
                            tf.cast(current_step, dtype=self.params.dtype))
def FProp(self, theta, input_tensor):
  p = self.params
  if self._output_tensor is not None:
    raise ValueError('FProp was already called.')

  def _Gradient(inputs, _, original_grad):
    # Compute the gradients for each loss w.r.t. the inputs.
    # TODO(jngiam): Look into whether TF dedups this computation.
    per_loss_grads = []
    for loss, _ in self._losses:
      per_loss_grad = tf.gradients(loss, self._output_tensor)[0]
      if per_loss_grad is None:
        tf.logging.warning(
            'Loss %s did not result in a gradient during '
            'GradDrop computation.', loss)
      else:
        per_loss_grads.append(per_loss_grad)

    if not per_loss_grads:
      raise ValueError('No valid gradients for GradDrop.')

    # Multiply the gradients with the inputs.
    grads = per_loss_grads
    if p.use_input_sign_only:
      input_abs = tf.abs(
          tf.cast(tf.abs(inputs) <= p.epsilon, tf.float32) + inputs)
      grads = [grad * ((inputs) / (input_abs)) for grad in grads]
    else:
      grads = [grad * inputs for grad in grads]

    # Sum gradient over batch, assuming that batch is always on dim 0.
    if p.marginalize_batch_dim:
      grads = [tf.reduce_sum(grad, axis=0, keepdims=True) for grad in grads]

    # First discretize all gradients into their sign values.
    grad_sign_positive = [tf.cast(grad > 0.0, tf.float32) for grad in grads]
    grad_sign_negative = [tf.cast(grad < 0.0, tf.float32) for grad in grads]

    # Calculate the probability of positive gradients based on equation (1)
    # in the GradDrop paper.
    grad_abs_sum = tf.add_n([tf.abs(grad) for grad in grads])
    prob_pos = (tf.add_n(grads) / (2. * grad_abs_sum + p.epsilon))

    # Implementation of different scales for the keep function. Larger
    # scales result in steeper keep functions.
    prob_pos *= p.keep_prob_function_scale

    if p.keep_prob_function == 'sigmoid':
      # Standard sigmoid has derivative of 0.25 at 0 so the factor of 4.0
      # allows the function scale in sigmoid to be compatible with the
      # function scale in the linear case.
      prob_pos = tf.sigmoid(4.0 * prob_pos)
    elif p.keep_prob_function == 'linear':
      prob_pos += 0.5

    # The main, default mode of GradDrop. Only gradients of one sign are
    # kept, and which sign is calculated via equation (1) of the main paper.
    prob_pos = tf.cast(prob_pos >= tf.random.uniform(prob_pos.shape),
                       tf.float32) - 0.5
    grad_masks = [
        (gsp - gsn) * prob_pos >= 0
        for (gsn, gsp) in zip(grad_sign_negative, grad_sign_positive)
    ]

    # This diag value gives us the percentage of grads which are kept.
    gradmask_diag = [tf.cast(gm, tf.float32) for gm in grad_masks]
    diag = tf.reduce_mean(tf.add_n(gradmask_diag) / len(grad_masks))
    summary_utils.scalar('average_grad_mask', diag)

    leak_ratios = [leak_ratio for _, leak_ratio in self._losses]
    transformed_per_loss_grads = [
        grad * (leak + (1.0 - leak) * tf.cast(grad_mask, tf.float32))
        for (leak, grad, grad_mask) in zip(leak_ratios, per_loss_grads,
                                           grad_masks)
    ]

    transformed_grad = tf.cast(
        tf.add_n(transformed_per_loss_grads), original_grad.dtype)

    if not p.keep_gradnorm_constant:
      return transformed_grad

    transformed_grad_norm = tf.sqrt(tf.reduce_sum(transformed_grad**2))
    original_grad_norm = tf.sqrt(tf.reduce_sum(original_grad**2))
    return transformed_grad * original_grad_norm / (
        transformed_grad_norm + p.epsilon)

  output_tensor = py_utils.CallDefun(tf.identity, input_tensor, _Gradient)
  self._output_tensor = tf.identity(output_tensor)
  return self._output_tensor
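# A minimal, self-contained sketch (not part of the Lingvo GradDrop code) of
# the pattern _Gradient uses above: an identity forward pass whose backward
# pass rewrites the incoming gradient. Plain tf.custom_gradient is used here
# instead of py_utils.CallDefun, and the gradient transform is a simple
# unit-norm rescaling standing in for the per-sign GradDrop masking.
import tensorflow as tf


@tf.custom_gradient
def _IdentityWithRewrittenGrad(x):

  def Grad(dy):
    # Transform the incoming gradient; GradDrop would mask it per-sign here.
    return dy / (tf.norm(dy) + 1e-8)

  return tf.identity(x), Grad


# Usage: the forward value is unchanged; only the gradient flowing back
# through this node is rescaled.
x = tf.constant([1.0, -2.0, 3.0])
with tf.GradientTape() as tape:
  tape.watch(x)
  loss = tf.reduce_sum(_IdentityWithRewrittenGrad(x)**2)
d_x = tape.gradient(loss, x)  # 2*x, rescaled to unit norm.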