def _TruncateTargetSequence(self, targets):
  """Truncate padded time steps from all sequences."""
  # The following tensors are all in the [batch, time] shape.
  p = self.params
  # Let's make a copy of targets.
  targets = targets.Pack(targets.Flatten())
  target_ids = targets.ids
  target_labels = targets.labels
  target_weights = targets.weights
  target_paddings = targets.paddings
  max_seq_length = tf.to_int32(
      tf.reduce_max(tf.reduce_sum(1.0 - target_paddings, 1)))
  summary_utils.scalar('max_seq_length', max_seq_length)
  # Assert to make sure after max_seq_length, all are padded steps for all
  # sequences.
  target_paddings = py_utils.with_dependencies([
      py_utils.assert_equal(
          tf.constant(True, tf.bool),
          tf.reduce_all(target_paddings[:, max_seq_length:] > 0.5))
  ], target_paddings)
  target_ids = py_utils.with_dependencies([
      AssertIdShape(
          py_utils.GetShape(target_ids), py_utils.GetShape(target_labels),
          py_utils.GetShape(target_paddings),
          py_utils.GetShape(target_weights))
  ], target_ids)
  targets.ids = target_ids[:, :max_seq_length]
  targets.labels = target_labels[:, :max_seq_length]
  targets.weights = target_weights[:, :max_seq_length]
  targets.paddings = target_paddings[:, :max_seq_length]
  return targets
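# Illustrative sketch (not part of the library): how truncation to the longest
# unpadded length works on a toy [batch, time] paddings matrix, written with
# NumPy instead of TF ops purely for clarity.
import numpy as np

paddings = np.array([[0., 0., 1., 1.],
                     [0., 0., 0., 1.]])  # two sequences of lengths 2 and 3
ids = np.array([[11, 12, 0, 0],
                [21, 22, 23, 0]])

max_seq_length = int(np.max(np.sum(1.0 - paddings, axis=1)))  # -> 3
assert np.all(paddings[:, max_seq_length:] > 0.5)  # only padded steps remain
ids, paddings = ids[:, :max_seq_length], paddings[:, :max_seq_length]
print(ids.shape)  # (2, 3)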
def _Moments(inputs, mask, enable_cross_replica_sum_on_tpu=False):
  """Computes mean and variance over the valid data points in inputs."""
  inputs = py_utils.with_dependencies([
      py_utils.assert_equal(tf.rank(inputs), tf.rank(mask)),
      py_utils.assert_greater_equal(mask, tf.zeros_like(mask)),
  ], inputs)
  rank = tf.rank(mask)
  reduce_over_dims = tf.range(0, rank - 1)
  sum_v = tf.reduce_sum(inputs * tf.cast(mask, inputs.dtype),
                        reduce_over_dims)
  count_v = tf.reduce_sum(mask, reduce_over_dims)
  # Input shape is guaranteed to be a multiple of mask shape because the
  # inputs * mask op above was successfully broadcasted.
  mask_multiplier = tf.shape(inputs)[:-1] // tf.shape(mask)[:-1]
  count_v *= tf.cast(tf.reduce_prod(mask_multiplier), count_v.dtype)
  if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
    sum_v = tf.tpu.cross_replica_sum(sum_v)
    count_v = tf.tpu.cross_replica_sum(count_v)

  count_v = tf.maximum(count_v, 1.0)
  mean = sum_v / count_v
  sum_vv = tf.reduce_sum((inputs - mean) * (inputs - mean) * mask,
                         reduce_over_dims)
  if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
    sum_vv = tf.tpu.cross_replica_sum(sum_vv)
  variance = py_utils.with_dependencies([
      py_utils.assert_greater_equal(sum_vv, tf.zeros_like(sum_vv)),
  ], sum_vv / count_v)
  return mean, variance
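# Illustrative sketch (not part of the library): the masked mean/variance that
# _Moments computes, written out with NumPy for a [batch, time, dim] input and
# a [batch, time, 1] mask, reducing over all but the last dimension.
import numpy as np

inputs = np.random.randn(2, 3, 4).astype(np.float32)    # [batch, time, dim]
mask = np.array([[[1.], [1.], [0.]],
                 [[1.], [0.], [0.]]], np.float32)        # 1 = valid, 0 = padded

sum_v = (inputs * mask).sum(axis=(0, 1))                 # [dim]
count_v = max(mask.sum(), 1.0)                           # number of valid steps
mean = sum_v / count_v                                   # per-dim mean over valid steps
var = (((inputs - mean) ** 2) * mask).sum(axis=(0, 1)) / count_v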
def FProp(self, theta, inputs, paddings, class_emb):
  """Apply batch normalization.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    inputs: The inputs tensor. Shaped [batch, ..., dim].
    paddings: The paddings tensor. Shaped [batch, ..., 1], with the same rank
      as the input tensor.
    class_emb: The conditioning inputs, shaped [batch, emb_dim].

  Returns:
    Output after applying batch normalization, with the same shape as
    'inputs'.
  """
  p = self.params
  batch = py_utils.GetShape(inputs)[0]
  class_emb = py_utils.HasShape(class_emb, [batch, p.class_emb_dim])
  if not py_utils.use_tpu():
    class_emb = py_utils.with_dependencies([
        py_utils.assert_less_equal(
            tf.cast(class_emb, tf.int32), 1, name='one_hot_assert1'),
        py_utils.assert_greater_equal(
            tf.cast(class_emb, tf.int32), 0, name='one_hot_assert2'),
        py_utils.assert_equal(
            tf.ones([batch], tf.int32),
            tf.cast(tf.reduce_sum(class_emb, -1), tf.int32),
            name='one_hot_assert3'),
    ], class_emb)

  with tf.name_scope(p.name):
    norm_mean, norm_variance, beta, gamma = self.ComputeAndUpdateMoments(
        theta, inputs, paddings=paddings, class_emb=class_emb)
    return self._ComputeBN(inputs, paddings, gamma, beta, norm_mean,
                           norm_variance)
def SplitTensors(xs, num_splits):
  """Splits tensors in `xs` evenly into num_splits along the 1st dimension.

  Args:
    xs: A tuple of tensors. Each tensor's 1st dimension is the same size.
    num_splits: A python integer.

  Returns:
    A tuple of lists of tensors, num elements in the tuple = len(xs).

    i-th element in each list corresponds to i-th split of each tensor in xs
    along the first dimension of each tensor.
  """
  # assert first dim of all tensors in xs is equal
  batch_dims = [tf.shape(x)[0] for x in xs]
  all_batch_dims = tf.stack(batch_dims)

  all_batch_dims = py_utils.with_dependencies([
      py_utils.assert_equal(
          all_batch_dims,
          tf.shape(xs[0])[0],
          message='first dim of tensors in xs must match'),
      py_utils.assert_greater_equal(
          tf.shape(xs[0])[0],
          num_splits,
          message='first dim of tensors in xs must be at least num_splits')
  ], all_batch_dims)

  splits = ComputeSplits(tf.shape(xs[0])[0], num_splits)
  # add the above assertion into the compute graph
  splits = py_utils.with_dependencies([all_batch_dims], splits)
  split_xs = [
      tf.split(axis=0, num_or_size_splits=splits, value=x) for x in xs
  ]
  return split_xs
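# Illustrative sketch (an assumption, not the library's ComputeSplits): one
# common way to split a first dimension of size `batch` as evenly as possible
# into `num_splits` pieces, producing the kind of size vector that tf.split
# consumes above.
def _even_split_sizes(batch, num_splits):
  base, remainder = divmod(batch, num_splits)
  # The first `remainder` pieces get one extra element.
  return [base + (1 if i < remainder else 0) for i in range(num_splits)]

print(_even_split_sizes(5, 2))   # [3, 2]
print(_even_split_sizes(10, 4))  # [3, 3, 2, 2]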
def ComputeMoments(inputs,
                   padding,
                   reduce_over_dims,
                   cumulative_axis=None,
                   enable_cross_replica_sum_on_tpu=False,
                   keepdims=False):
  """Computes mean and variance over the valid data points in inputs."""
  mask = 1.0 - padding
  inputs = py_utils.with_dependencies([
      py_utils.assert_equal(tf.rank(inputs), tf.rank(mask)),
      py_utils.assert_greater_equal(mask, tf.zeros_like(mask)),
  ], inputs)
  sum_v = tf.reduce_sum(
      inputs * tf.cast(mask, inputs.dtype), reduce_over_dims,
      keepdims=keepdims)
  count_v = tf.reduce_sum(mask, reduce_over_dims, keepdims=keepdims)

  if cumulative_axis is not None:
    sum_v = tf.math.cumsum(sum_v, axis=cumulative_axis)
    count_v = tf.math.cumsum(count_v, axis=cumulative_axis)
  # Input shape is guaranteed to be a multiple of mask shape because the
  # inputs * mask op above was successfully broadcasted.
  input_size_on_reduced_dims = tf.reduce_prod(
      tf.gather(tf.shape(inputs), reduce_over_dims))
  mask_size_on_reduced_dims = tf.reduce_prod(
      tf.gather(tf.shape(mask), reduce_over_dims))
  mask_multiplier = tf.math.truediv(input_size_on_reduced_dims,
                                    mask_size_on_reduced_dims)
  count_v *= tf.cast(mask_multiplier, count_v.dtype)
  if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
    sum_v = tf.tpu.cross_replica_sum(sum_v)
    count_v = tf.tpu.cross_replica_sum(count_v)

  count_v = tf.maximum(count_v, 1.0)
  mean = sum_v / count_v
  sum_vv = tf.reduce_sum((inputs - mean) * (inputs - mean) * mask,
                         reduce_over_dims,
                         keepdims=keepdims)
  if cumulative_axis is not None:
    sum_vv = tf.math.cumsum(sum_vv, axis=cumulative_axis)
  if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
    sum_vv = tf.tpu.cross_replica_sum(sum_vv)
  variance = py_utils.with_dependencies([
      py_utils.assert_greater_equal(sum_vv, tf.zeros_like(sum_vv)),
  ], sum_vv / count_v)
  return mean, variance
def _InferenceSubgraph_Default(self):
  with tf.name_scope('inference'):
    src_strings = tf.placeholder(tf.string, shape=[None])
    _, src_ids, src_paddings = self.input_generator.StringsToIds(
        src_strings, is_source=True)

    # Truncate paddings at the end.
    max_seq_length = tf.to_int32(
        tf.reduce_max(tf.reduce_sum(1.0 - src_paddings, 1)))
    src_paddings = py_utils.with_dependencies([
        py_utils.assert_equal(
            tf.constant(True, tf.bool),
            tf.reduce_all(src_paddings[:, max_seq_length:] > 0.5))
    ], src_paddings)
    src_ids = src_ids[:, :max_seq_length]
    src_paddings = src_paddings[:, :max_seq_length]

    src_input_map = py_utils.NestedMap(ids=src_ids, paddings=src_paddings)
    encoder_outputs = self.enc.FPropDefaultTheta(src_input_map)
    decoder_outs = self.dec.BeamSearchDecode(encoder_outputs)

    topk_hyps = decoder_outs.topk_hyps
    topk_ids = decoder_outs.topk_ids
    topk_lens = decoder_outs.topk_lens

    topk_decoded = self.input_generator.IdsToStrings(topk_ids, topk_lens - 1)
    topk_decoded = tf.reshape(topk_decoded, tf.shape(topk_hyps))

    feeds = py_utils.NestedMap({'src_strings': src_strings})
    fetches = py_utils.NestedMap({
        'src_ids': src_ids,
        'topk_decoded': topk_decoded,
        'topk_scores': decoder_outs.topk_scores,
        'topk_hyps': topk_hyps,
    })
    return fetches, feeds
def FProp(self, theta, input_batch):
  """Embeds source ids and transforms with TransformerStack.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    input_batch: A `.NestedMap` with fields:

      - ids: The inputs tensor. It is expected to be of shape [batch, time].
      - paddings: The paddings tensor. Expected shape [batch, time].

  Returns:
    A NestedMap containing:

    - encoded: The encoded features, either a tensor of shape
      [time, batch, depth], or a list of tensors if is_transparent is set in
      transformer_stack.
    - padding: of shape [time, batch]
    - segment_id: [time, batch] if packed inputs are supported by the model
      (and all layers), or None otherwise.
    - embedded_inputs: [time, batch, depth] embedded inputs tokens without
      positional encodings.
  """
  p = self.params
  with tf.name_scope(p.name):
    src_segment_id = None
    src_segment_pos = None
    input_ids = py_utils.with_dependencies([
        py_utils.assert_shape_match(
            tf.shape(input_batch.ids), tf.shape(input_batch.paddings)),
        py_utils.assert_equal(tf.rank(input_batch.ids), 2)
    ], input_batch.ids)

    if (not py_utils.use_tpu() and
        tf.flags.FLAGS.transformer_encoder_truncates_inputs):
      max_seq_length = tf.cast(
          tf.reduce_max(tf.reduce_sum(1.0 - input_batch.paddings, 1)),
          tf.int32)
      paddings = py_utils.with_dependencies([
          py_utils.assert_equal(
              tf.constant(True, tf.bool),
              tf.reduce_all(input_batch.paddings[:, max_seq_length:] > 0.5))
      ], input_batch.paddings)
      input_ids = input_ids[:, :max_seq_length]
      paddings = paddings[:, :max_seq_length]
      if p.packed_input:
        src_segment_id = input_batch.segment_ids[:, :max_seq_length]
        src_segment_pos = input_batch.segment_pos[:, :max_seq_length]
    else:
      paddings = input_batch.paddings
      if p.packed_input:
        src_segment_id = input_batch.segment_ids
        src_segment_pos = input_batch.segment_pos

    max_time = tf.shape(input_ids)[1]

    # Input token embeddings + positional embeddings
    input_embs = self.token_emb.EmbLookup(theta.token_emb,
                                          tf.reshape(input_ids, [-1]))
    input_embs = tf.reshape(input_embs,
                            [-1, max_time, p.token_emb.embedding_dim])
    # [time, batch, dim]
    orig_input_embs = tf.transpose(input_embs, [1, 0, 2])

    if p.packed_input:
      position_embs = self.position_emb.FPropWithPosition(
          theta.position_emb, src_segment_pos)
    else:
      position_embs = self.position_emb.FProp(theta.position_emb, max_time)
      position_embs = tf.reshape(position_embs,
                                 [1, max_time, p.token_emb.embedding_dim])
    input_embs += position_embs

    if p.model_dim != p.token_emb.embedding_dim:
      input_embs = self.emb_proj.FProp(theta.emb_proj, input_embs)

    paddings = tf.transpose(paddings)
    if p.packed_input:
      src_segment_id = tf.transpose(src_segment_id)
    input_embs = self.input_dropout.FProp(theta.input_dropout, input_embs)

    # [time, batch, dim]
    transformer_input = tf.transpose(input_embs, [1, 0, 2])

    encoded, padding, segment_id = self.transformer_stack.FProp(
        theta.transformer_stack, transformer_input, paddings, src_segment_id)
    return py_utils.NestedMap(
        encoded=encoded,
        padding=padding,
        segment_id=segment_id,
        embedded_inputs=orig_input_embs)
def MergeBeamSearchOutputs(max_hyps_per_beam, beam_search_outputs):
  """Merges beam search hyps from multiple decoders.

  Args:
    max_hyps_per_beam: the number of top hyps in the merged results. Must be
      less than or equal to total number of input hyps.
    beam_search_outputs: a list of BeamSearchDecodeOutput objects. Must share
      the same source_batch and max sequence length.

  Returns:
    A BeamSearchDecodeOutput object containing max_hyps_per_beam hypotheses
    per beam.
  """
  source_batch = tf.shape(beam_search_outputs[0].topk_hyps)[0]
  value_dict = {}
  for output in beam_search_outputs:
    hyps_per_beam = py_utils.with_dependencies([
        py_utils.assert_equal(source_batch,
                              tf.shape(output.topk_hyps)[0]),
    ], tf.shape(output.topk_hyps)[1])
    for k, v in six.iteritems(output._asdict()):
      if v is None:
        continue
      if k == 'done_hyps':
        v = tf.transpose(v)
      if k not in value_dict:
        value_dict[k] = []
      value_dict[k].append(tf.reshape(v, [source_batch, hyps_per_beam, -1]))

  # Concatenate the tensors along the 'num_hyps_per_beam' dimension.
  concatenated = {}
  for k, values in six.iteritems(value_dict):
    if len(values) != len(beam_search_outputs):
      raise ValueError('Incomplete values for %s: %s' %
                       (k, beam_search_outputs))
    concatenated[k] = tf.concat(values, axis=1)

  scores = concatenated['topk_scores']
  scores = tf.where(
      tf.equal(concatenated['topk_lens'], 0), tf.fill(tf.shape(scores), -1e6),
      scores)
  scores = tf.squeeze(scores, -1)

  # Select top max_hyps_per_beam indices per beam.
  _, top_indices = tf.nn.top_k(scores, max_hyps_per_beam)
  batch_ids = tf.tile(
      tf.expand_dims(tf.range(source_batch), -1), [1, max_hyps_per_beam])
  # [source_batch, max_hyps_per_beam, 2]
  gather_indices = tf.stack([batch_ids, top_indices], axis=-1)

  # Gather the merged top hyps according to 'gather_indices'.
  top = beam_search_outputs[0]._asdict()
  total_hyps = source_batch * max_hyps_per_beam
  for k, v in six.iteritems(concatenated):
    v = tf.gather_nd(v, gather_indices)
    if k == 'done_hyps':
      v = tf.transpose(tf.reshape(v, [total_hyps, -1]))
    elif k == 'topk_hyps':
      v = tf.reshape(v, [source_batch, max_hyps_per_beam])
    elif k == 'topk_ids':
      v = tf.reshape(v, [total_hyps, -1])
    elif k in ('topk_lens', 'topk_scores', 'topk_decoded'):
      v = tf.reshape(v, [total_hyps])
    else:
      raise ValueError('Unexpected field: %s' % k)
    top[k] = v
  return BeamSearchDecodeOutput(**top)
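# Illustrative sketch (not part of the library): selecting the top
# max_hyps_per_beam hypotheses per source sentence once the per-decoder scores
# have been concatenated along the hyp dimension, mirroring the tf.nn.top_k +
# tf.gather_nd step above, in NumPy.
import numpy as np

scores = np.array([[0.1, 0.9, 0.4, 0.7],    # [source_batch, total_hyps]
                   [0.8, 0.2, 0.6, 0.3]])
max_hyps_per_beam = 2

top_indices = np.argsort(-scores, axis=1)[:, :max_hyps_per_beam]  # [batch, k]
batch_ids = np.tile(np.arange(scores.shape[0])[:, None], [1, max_hyps_per_beam])
top_scores = scores[batch_ids, top_indices]   # same gather as tf.gather_nd
print(top_indices)   # [[1 3] [0 2]]
print(top_scores)    # [[0.9 0.7] [0.8 0.6]]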
def FProp(self, theta, inputs, query_vec=None):
  """Combines the list of input tensors into a single tensor.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    inputs: A list of tensors of shape [..., hidden_dim] or
      [..., [pre_proj_input_dims[i]]] if pre_proj_input_dims is specified.
    query_vec: A tensor of shape [..., hidden_dim].

  Returns:
    A tensor with the same shape as the input tensors.

  Raises:
    ValueError: If p.merger_op is not supported.
  """
  p = self.params
  n_sources = len(inputs)

  if p.pre_proj_input_dims and len(p.pre_proj_input_dims) != n_sources:
    raise ValueError('pre_proj_input_dims must be specified for each input.')

  if n_sources == 1:
    return inputs[0]

  # Pre-projection operation.
  if p.pre_proj_input_dims:
    for i in range(n_sources):
      inputs[i] = self.pre_proj[i].FProp(theta.pre_proj[i], inputs[i])

  tensor_pairs = list(zip(inputs[:-1], inputs[1:]))
  if p.merger_op == 'mean':
    # Simply take the mean, all dims must match.
    with tf.control_dependencies([
        py_utils.assert_shape_match(tf.shape(t1), tf.shape(t2))
        for t1, t2 in tensor_pairs
    ]):
      output = tf.add_n(inputs) / n_sources
  elif p.merger_op == 'sum':
    # Sum up all sources, all dims must match.
    with tf.control_dependencies([
        py_utils.assert_shape_match(tf.shape(t1), tf.shape(t2))
        for t1, t2 in tensor_pairs
    ]):
      output = tf.add_n(inputs)
  elif p.merger_op == 'weighted_sum':
    # Weighted sum of all sources, all dims must match.
    # For weighted_sum, assume input is a list of rank 3 tensors
    inputs = tf.stack(inputs)
    inputs = py_utils.HasRank(inputs, 4)
    with tf.control_dependencies([
        py_utils.assert_shape_match(tf.shape(t1), tf.shape(t2))
        for t1, t2 in tensor_pairs
    ]):
      w = tf.expand_dims(
          tf.expand_dims(tf.expand_dims(self._sum_weight, 1), 1), 1)
      w = tf.tile(w, [
          1,
          tf.shape(inputs)[1],
          tf.shape(inputs)[2],
          tf.shape(inputs)[3]
      ])
      output = tf.reduce_sum(inputs * w, axis=0)
  elif p.merger_op == 'atten':
    # Apply attention over the concatenated tensor, all dims must match.
    with tf.control_dependencies([
        py_utils.assert_shape_match(tf.shape(t1), tf.shape(t2))
        for t1, t2 in tensor_pairs
    ]):
      inputs = tf.stack(inputs, axis=0)
      batch_size = tf.shape(inputs)[1]
      paddings = tf.zeros([n_sources, batch_size], dtype=inputs.dtype)
      self.atten.InitForSourcePacked(theta.atten, inputs, inputs, paddings)
      output, _, _ = self.atten.ComputeContextVector(
          theta.atten, tf.reshape(query_vec, [-1, p.query_dim]))
  elif p.merger_op == 'concat':
    # Concatenate over the last dim, all dims but last must match.
    with tf.control_dependencies([
        py_utils.assert_equal(tf.shape(t1)[:-1], tf.shape(t2)[:-1])
        for t1, t2 in tensor_pairs
    ]):
      output = tf.concat(inputs, axis=-1)
  elif p.merger_op == 'gated_avg':
    output = self.gated_average.FProp(theta.gated_average, inputs)
  else:
    raise ValueError('Unrecognized merge op!')

  return output
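# Illustrative sketch (not part of the library): the 'weighted_sum' merger on a
# list of [time, batch, dim] tensors, with one scalar weight per source, in
# NumPy. The real layer learns `self._sum_weight` as a variable.
import numpy as np

sources = [np.ones((5, 2, 3)), 2 * np.ones((5, 2, 3))]   # two sources
weights = np.array([0.25, 0.75])                          # one weight per source

stacked = np.stack(sources)                # [n_sources, time, batch, dim]
output = (weights[:, None, None, None] * stacked).sum(axis=0)
print(output[0, 0, 0])                     # 0.25 * 1 + 0.75 * 2 = 1.75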
def ResidualsToBBoxes(self,
                      anchor_bboxes,
                      residuals,
                      min_angle_rad=-np.pi,
                      max_angle_rad=np.pi):
  r"""Converts anchor_bboxes and residuals to predicted bboxes.

  This converts predicted residuals into bboxes using the following
  formulae::

    x_predicted = x_a + x_residual * diagonal_xy
    y_predicted = y_a + y_residual * diagonal_xy
    z_predicted = z_a + z_residual * dz_a

    dx_predicted = dx_a * exp(dx_residual)
    dy_predicted = dy_a * exp(dy_residual)
    dz_predicted = dz_a * exp(dz_residual)

    # Adding the residual, and bounding it between
    # [min_angle_rad, max_angle_rad]
    phi_predicted = NormalizeAngleRad(phi_a + phi_residual, min_angle_rad,
                                      max_angle_rad)

  These equations follow from those in LocalizationResiduals, where we solve
  for the \*_gt variables.

  Args:
    anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
      phi), corresponding to each anchor bbox parameters.
    residuals: tf.float32 of the same shape as anchor_bboxes containing
      predicted residuals at each anchor location.
    min_angle_rad: Scalar with the minimum angle allowed (before wrapping) in
      radians.
    max_angle_rad: Scalar with the maximum angle allowed (before wrapping) in
      radians. This value usually should be pi.

  Returns:
    A tf.float32 tensor of the same shape as anchor_bboxes with predicted
    bboxes.
  """
  anchor_bboxes_shape = py_utils.GetShape(anchor_bboxes)
  anchor_bboxes = py_utils.with_dependencies(
      [py_utils.assert_equal(anchor_bboxes_shape[-1], 7)], anchor_bboxes)
  residuals = py_utils.HasShape(residuals, anchor_bboxes_shape)

  x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(
      anchor_bboxes, num=7, axis=-1)
  (x_residual, y_residual, z_residual, dx_residual, dy_residual, dz_residual,
   phi_residual) = tf.unstack(residuals, num=7, axis=-1)

  diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))

  x_predicted = x_a + x_residual * diagonal_xy
  y_predicted = y_a + y_residual * diagonal_xy
  z_predicted = z_a + z_residual * dz_a

  dx_predicted = dx_a * tf.exp(dx_residual)
  dy_predicted = dy_a * tf.exp(dy_residual)
  dz_predicted = dz_a * tf.exp(dz_residual)

  # We bound the angle between [min_angle_rad, max_angle_rad], which should
  # be passed in depending on the heading handling in the calling model.
  # If the model uses a sine(delta_phi) transformation in the loss, then it
  # cannot distinguish direction and a [0, np.pi]
  # [min_angle_rad, max_angle_rad] should be used.
  # If there is a heading encoding that is directional, most likely you
  # should use a [-np.pi, np.pi] [min_angle_rad, max_angle_rad].
  phi_predicted = phi_a + phi_residual
  phi_predicted = geometry.WrapAngleRad(phi_predicted, min_angle_rad,
                                        max_angle_rad)

  return tf.stack([
      x_predicted, y_predicted, z_predicted,
      dx_predicted, dy_predicted, dz_predicted,
      phi_predicted,
  ], axis=-1)  # pyformat: disable
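# Illustrative sketch (an assumption: this mirrors, but is not, the library's
# geometry.WrapAngleRad): wrapping an angle into [min_angle_rad, max_angle_rad)
# with modular arithmetic, which is how the predicted heading is bounded above.
import numpy as np

def wrap_angle_rad(phi, min_rad, max_rad):
  span = max_rad - min_rad
  return np.mod(phi - min_rad, span) + min_rad

print(wrap_angle_rad(np.pi + 0.1, -np.pi, np.pi))   # ~ -pi + 0.1
print(wrap_angle_rad(-0.2, 0.0, np.pi))             # ~ pi - 0.2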
def LocalizationResiduals(self, anchor_bboxes, assigned_gt_bboxes):
  """Computes the anchor residuals for every bbox.

  For a given bbox, compute residuals in the following way:

  Let ``anchor_bbox = (x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a)``
  and ``assigned_gt_bbox = (x_gt, y_gt, z_gt, dx_gt, dy_gt, dz_gt, phi_gt)``

  Define ``diagonal_xy = sqrt(dx_a^2 + dy_a^2)``

  Then the corresponding residuals are given by::

    x_residual = (x_gt - x_a) / (diagonal_xy)
    y_residual = (y_gt - y_a) / (diagonal_xy)
    z_residual = (z_gt - z_a) / (dz_a)

    dx_residual = log(dx_gt / dx_a)
    dy_residual = log(dy_gt / dy_a)
    dz_residual = log(dz_gt / dz_a)

    phi_residual = phi_gt - phi_a

  The normalization for x and y residuals by the diagonal was first proposed
  by [1]. Intuitively, this reflects that objects can usually move freely in
  the x-y plane, including diagonally. On the other hand, moving in the
  z-axis (up and down) can be considered orthogonal to x-y.

  For phi_residual, one way to frame the loss is with
  SmoothL1(sine(phi_residual - phi_predicted)). The use of sine to wrap the
  phi residual was proposed by [2]. This stems from the observation that
  bboxes at phi and phi + pi are the same bbox, fully overlapping in 3D
  space, except that the direction is different. Note that the use of sine
  makes this residual invariant to direction when a symmetric loss like
  SmoothL1 is used. In ResidualsToBBoxes, we ensure that the phi predicted is
  between [0, pi).

  The Huber (SmoothL1) loss can then be applied to the delta between these
  target residuals and the model predicted residuals.

  [1] VoxelNet: End-to-End Learning for Point Cloud Based 3D Object Detection
      https://arxiv.org/abs/1711.06396

  [2] SECOND: Sparsely Embedded Convolutional Detection
      https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf

  Args:
    anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
      phi), corresponding to each anchor bbox parameters.
    assigned_gt_bboxes: tf.float32 of the same shape as anchor_bboxes
      containing the corresponding assigned ground-truth bboxes.

  Returns:
    A tf.float32 tensor of the same shape as anchor_bboxes with target
    residuals for every corresponding bbox.
  """
  anchor_bboxes_shape = py_utils.GetShape(anchor_bboxes)
  anchor_bboxes = py_utils.with_dependencies(
      [py_utils.assert_equal(anchor_bboxes_shape[-1], 7)], anchor_bboxes)
  assigned_gt_bboxes = py_utils.HasShape(assigned_gt_bboxes,
                                         anchor_bboxes_shape)

  x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(
      anchor_bboxes, num=7, axis=-1)
  x_gt, y_gt, z_gt, dx_gt, dy_gt, dz_gt, phi_gt = tf.unstack(
      assigned_gt_bboxes, num=7, axis=-1)

  diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))

  # The anchor dimensions are usually hard-coded params given to the input
  # generator and should not be 0. We use CheckNumerics to ensure that is the
  # case.
  x_residual = py_utils.CheckNumerics((x_gt - x_a) / diagonal_xy)
  y_residual = py_utils.CheckNumerics((y_gt - y_a) / diagonal_xy)
  z_residual = py_utils.CheckNumerics((z_gt - z_a) / dz_a)

  dx_residual = py_utils.CheckNumerics(tf.log(dx_gt / dx_a))
  dy_residual = py_utils.CheckNumerics(tf.log(dy_gt / dy_a))
  dz_residual = py_utils.CheckNumerics(tf.log(dz_gt / dz_a))

  phi_residual = phi_gt - phi_a

  return tf.stack([
      x_residual, y_residual, z_residual,
      dx_residual, dy_residual, dz_residual,
      phi_residual,
  ], axis=-1)  # pyformat: disable
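# Illustrative sketch (not part of the library): a numeric round trip showing
# that the residual definitions above invert to the ResidualsToBBoxes formulae,
# for a single anchor/ground-truth pair in NumPy.
import numpy as np

anchor = np.array([0., 0., 1., 4., 2., 1.5, 0.0])   # x, y, z, dx, dy, dz, phi
gt = np.array([1., -2., 1.2, 3., 2.5, 1.5, 0.3])

x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = anchor
x_g, y_g, z_g, dx_g, dy_g, dz_g, phi_g = gt
diag = np.sqrt(dx_a**2 + dy_a**2)

residuals = np.array([
    (x_g - x_a) / diag, (y_g - y_a) / diag, (z_g - z_a) / dz_a,
    np.log(dx_g / dx_a), np.log(dy_g / dy_a), np.log(dz_g / dz_a),
    phi_g - phi_a,
])

# Invert, as ResidualsToBBoxes does.
recovered = np.array([
    x_a + residuals[0] * diag, y_a + residuals[1] * diag,
    z_a + residuals[2] * dz_a,
    dx_a * np.exp(residuals[3]), dy_a * np.exp(residuals[4]),
    dz_a * np.exp(residuals[5]),
    phi_a + residuals[6],
])
np.testing.assert_allclose(recovered, gt)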
def AddAttentionSummaryBatchMajor(name,
                                  attention_tensors,
                                  src_paddings,
                                  tgt_paddings,
                                  transcripts=None,
                                  max_outputs=3):
  """Adds an image summary showing the attention probability matrix and state.

  Unlike AddAttentionSummary(), this function expects all tensors to have the
  batch dimension in axis 0.

  Args:
    name: Summary name.
    attention_tensors: A list of 3D tensors shaped [batch_size, target_len,
      source_len] where attention[b, i, j] is the probability for the i-th
      output attending to the j-th input for element b in the batch.
    src_paddings: A tensor of binary paddings shaped [batch, source_len] for
      the source sequence, or a list of such tensors with the same length as
      attention_tensors giving separate paddings for each entry in
      attention_tensors.
    tgt_paddings: A tensor of binary paddings shaped [batch, target_len] for
      the target sequence, or a list of such tensors with the same length as
      attention_tensors giving separate paddings for each entry in
      attention_tensors.
    transcripts: Optional, transcripts shaped [batch, source_len] for the
      source sequence.
    max_outputs: Integer maximum number of elements of the batch to plot.
  """

  def VerifyLen(paddings):
    length = len(paddings) if isinstance(paddings, list) else 1
    if length != 1 and length != len(attention_tensors):
      raise ValueError('Bad length of paddings list {}'.format(length))

  VerifyLen(src_paddings)
  VerifyLen(tgt_paddings)

  # Verify shapes.
  for i, attention_tensor in enumerate(attention_tensors):
    src, tgt = src_paddings, tgt_paddings
    src = src[0 if len(src) == 1 else i] if isinstance(src, list) else src
    tgt = tgt[0 if len(tgt) == 1 else i] if isinstance(tgt, list) else tgt
    tgt_shape = py_utils.GetShape(tgt)
    attention_tensors[i] = tf.identity(
        py_utils.with_dependencies([
            py_utils.assert_equal(
                py_utils.GetShape(attention_tensor),
                tgt_shape[:2] + [py_utils.GetShape(src)[1]] + tgt_shape[2:])
        ], attention_tensor),
        re.sub(':.*$', '', GetTensorName(attention_tensor, name, i)))

  if not _ShouldAddSummary():
    return

  def ToLengths(paddings):
    paddings = paddings if isinstance(paddings, list) else [paddings]
    return [SequenceLength(p) for p in paddings]

  def Get(lengths, i):
    return lengths[0 if len(lengths) == 1 else i]

  src_lens = ToLengths(src_paddings)
  tgt_lens = ToLengths(tgt_paddings)

  with plot.MatplotlibFigureSummary(
      name + '/Attention',
      max_outputs=max_outputs,
      gridspec_kwargs={'hspace': 0.3}) as fig:
    for n, atten in enumerate(attention_tensors):
      # Diagnostic metric that decreases as attention picks up.
      max_entropy = tf.math.log(tf.cast(Get(src_lens, n), tf.float32))
      max_entropy = tf.expand_dims(tf.expand_dims(max_entropy, -1), -1)
      atten_normalized_entropy = -atten * tf.math.log(atten +
                                                      1e-10) / max_entropy
      scalar(name + '/Attention/average_normalized_entropy/%d' % n,
             tf.reduce_mean(atten_normalized_entropy))
      args = [atten, Get(src_lens, n), Get(tgt_lens, n)]
      if transcripts is not None and n == 0:
        args.append(transcripts)
      fig.AddSubplot(
          args,
          TrimPaddingAndPlotAttention,
          title=GetTensorName(atten, name, n),
          xlabel='Input',
          ylabel='Output')
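# Illustrative sketch (not part of the library): the per-target-position
# normalized attention entropy whose elementwise terms the summary above
# averages. It is ~1.0 for uniform attention and approaches 0.0 as attention
# becomes peaked, computed here for a single [target_len, source_len] matrix
# in NumPy.
import numpy as np

source_len = 4
atten = np.full((3, source_len), 1.0 / source_len)   # uniform attention
max_entropy = np.log(source_len)
normalized_entropy = (-atten * np.log(atten + 1e-10) / max_entropy).sum(axis=-1)
print(normalized_entropy)   # ~[1. 1. 1.]

peaked = np.array([[0.97, 0.01, 0.01, 0.01]] * 3)
print((-peaked * np.log(peaked + 1e-10) / max_entropy).sum(axis=-1))  # ~0.12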
def ResidualsToBBoxes(self, anchor_bboxes, residuals):
  r"""Converts anchor_bboxes and residuals to predicted bboxes.

  This converts predicted residuals into bboxes using the following formulae:

    x_predicted = x_a + x_residual \* diagonal_xy
    y_predicted = y_a + y_residual \* diagonal_xy
    z_predicted = z_a + z_residual \* dz_a

    dx_predicted = dx_a \* exp(dx_residual)
    dy_predicted = dy_a \* exp(dy_residual)
    dz_predicted = dz_a \* exp(dz_residual)

    phi_predicted = phi_a + phi_residual

  These equations follow from those in LocalizationResiduals, where we solve
  for the \*_gt variables.

  Args:
    anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
      phi), corresponding to each anchor bbox parameters.
    residuals: tf.float32 of the same shape as anchor_bboxes containing
      predicted residuals at each anchor location.

  Returns:
    A tf.float32 tensor of the same shape as anchor_bboxes with predicted
    bboxes.
  """
  anchor_bboxes_shape = py_utils.GetShape(anchor_bboxes)
  anchor_bboxes = py_utils.with_dependencies(
      [py_utils.assert_equal(anchor_bboxes_shape[-1], 7)], anchor_bboxes)
  residuals = py_utils.HasShape(residuals, anchor_bboxes_shape)

  x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(
      anchor_bboxes, num=7, axis=-1)
  (x_residual, y_residual, z_residual, dx_residual, dy_residual, dz_residual,
   phi_residual) = tf.unstack(residuals, num=7, axis=-1)

  diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))

  x_predicted = x_a + x_residual * diagonal_xy
  y_predicted = y_a + y_residual * diagonal_xy
  z_predicted = z_a + z_residual * dz_a

  dx_predicted = dx_a * tf.exp(dx_residual)
  dy_predicted = dy_a * tf.exp(dy_residual)
  dz_predicted = dz_a * tf.exp(dz_residual)

  # Assuming a sine(delta_phi) transformation is used in the loss, it is not
  # possible to distinguish direction, hence, we use floormod here to ensure
  # that the predicted phi is always in [0, np.pi) for consistency. A separate
  # direction classifier should be added to the model if needed.
  phi_predicted = phi_a + phi_residual
  phi_predicted = tf.floormod(phi_predicted, np.pi)

  return tf.stack([
      x_predicted, y_predicted, z_predicted,
      dx_predicted, dy_predicted, dz_predicted,
      phi_predicted,
  ], axis=-1)  # pyformat: disable
def FProp(self, theta, input_batch):
  """Embeds source ids and transforms with TransformerStack.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    input_batch: A `.NestedMap` with fields:

      - ids: The inputs tensor. It is expected to be of shape [batch, time].
      - paddings: The paddings tensor. Expected shape [batch, time].
      - task_ids: If p.task_emb is provided, must contain per-token task ids
        of shape [batch, time].

  Returns:
    A NestedMap containing

    - encoded: The encoded features, either a tensor of shape
      [time, batch, depth], or a list of tensors if is_transparent is set in
      transformer_stack.
    - padding: of shape [time, batch]
    - segment_id: [time, batch] if packed inputs are supported by the model
      (and all layers), or None otherwise.
    - embedded_inputs: [time, batch, depth] embedded inputs tokens without
      positional encodings.
  """
  p = self.params
  with tf.name_scope(p.name):
    src_segment_id = None
    src_segment_pos = None
    input_ids = py_utils.with_dependencies([
        py_utils.assert_shape_match(
            tf.shape(input_batch.ids), tf.shape(input_batch.paddings)),
        py_utils.assert_equal(tf.rank(input_batch.ids), 2)
    ], input_batch.ids)

    if (not py_utils.use_tpu() and
        FLAGS.transformer_encoder_truncates_inputs):
      max_seq_length = tf.cast(
          tf.reduce_max(tf.reduce_sum(1.0 - input_batch.paddings, 1)),
          tf.int32)
      paddings = py_utils.with_dependencies([
          py_utils.assert_equal(
              tf.constant(True, tf.bool),
              tf.reduce_all(input_batch.paddings[:, max_seq_length:] > 0.5))
      ], input_batch.paddings)
      input_ids = input_ids[:, :max_seq_length]
      paddings = paddings[:, :max_seq_length]
      if p.packed_input:
        src_segment_id = input_batch.segment_ids[:, :max_seq_length]
        src_segment_pos = input_batch.segment_pos[:, :max_seq_length]
    else:
      paddings = input_batch.paddings
      if p.packed_input:
        src_segment_id = input_batch.segment_ids
        src_segment_pos = input_batch.segment_pos

    max_time = tf.shape(input_ids)[1]

    # Input token embeddings + positional embeddings
    if not p.shared_emb:
      input_embs = self.token_emb.EmbLookup(theta.token_emb,
                                            tf.reshape(input_ids, [-1]))
    else:
      input_embs = self.softmax.EmbLookup(theta.softmax,
                                          tf.reshape(input_ids, [-1]))

    input_embs = tf.reshape(input_embs,
                            [-1, max_time, p.token_emb.embedding_dim])
    # [time, batch, dim]
    orig_input_embs = tf.transpose(input_embs, [1, 0, 2])

    if p.packed_input:
      position_embs = self.position_emb.FPropWithPosition(
          theta.position_emb, src_segment_pos)
    else:
      position_embs = self.position_emb.FProp(theta.position_emb, max_time)
      position_embs = tf.reshape(position_embs,
                                 [1, max_time, p.token_emb.embedding_dim])

    # Position embeddings are simply added to token embeddings.
    input_embs += position_embs

    if p.individually_tagged_input:
      assert not p.packed_input
      # Look up tag embeddings; this assumes that the tags arriving on
      # input_batch.segment_ids (originating as common.source_segment_id
      # in the input NMTExample) have been reserved in the WPM vocabulary
      # as context tags, e.g. the ids for <src_token> and <ctxt_token> in
      # wide source context experiments.
      input_tags = py_utils.with_dependencies([
          py_utils.assert_shape_match(
              tf.shape(input_batch.segment_ids), tf.shape(input_batch.ids)),
          py_utils.assert_equal(tf.rank(input_batch.segment_ids), 2)
      ], input_batch.segment_ids)
      tag_embeddings = self.token_emb.EmbLookup(theta.token_emb,
                                                tf.reshape(input_tags, [-1]))
      tag_embeddings = tf.reshape(tag_embeddings,
                                  [-1, max_time, p.token_emb.embedding_dim])

      # Concatenate the tag embeddings to the input embeddings, and then
      # project back to the original embedding dimensionality.
      concat_embs = tf.concat([input_embs, tag_embeddings], -1)
      input_embs = self.concat_emb_and_tag_proj.FProp(
          theta.concat_emb_and_tag_proj, concat_embs)

    if p.ln_input:
      input_embs = self.layer_norm_input.FProp(theta.layer_norm_input,
                                               input_embs)

    if p.task_emb:
      input_embs += self.task_emb.EmbLookup(theta.task_emb,
                                            input_batch.task_ids)
    summary_utils.histogram('input_embs', input_embs)
    if p.model_dim != p.token_emb.embedding_dim:
      input_embs = self.emb_proj.FProp(theta.emb_proj, input_embs)
      summary_utils.histogram('emb_proj', input_embs)

    paddings = tf.cast(tf.transpose(paddings), py_utils.FPropDtype(p))
    if p.packed_input:
      src_segment_id = tf.transpose(src_segment_id)
    input_embs = self.input_dropout.FProp(theta.input_dropout, input_embs)

    # [time, batch, dim]
    transformer_input = tf.transpose(input_embs, [1, 0, 2])

    if not self.do_eval and p.apply_source_mask:
      # Augment padding for masked source word positions.
      dtype = paddings.dtype
      source_mask = tf.where(
          tf.equal(input_ids, p.source_mask_id),
          tf.ones_like(input_ids, dtype=dtype),
          tf.zeros_like(input_ids, dtype=dtype))
      # Make sure padding is between 0 and 1.
      paddings = tf.clip_by_value(paddings + tf.transpose(source_mask), 0.0,
                                  1.0)

    encoded, padding, segment_id = self.transformer_stack.FProp(
        theta.transformer_stack, transformer_input, paddings, src_segment_id)
    return py_utils.NestedMap(
        encoded=encoded,
        padding=padding,
        segment_id=segment_id,
        embedded_inputs=orig_input_embs)
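# Illustrative sketch (not part of the library): the source-mask padding
# augmentation above, which treats tokens equal to source_mask_id as padded
# during training, written in NumPy with [time, batch]-major paddings.
import numpy as np

source_mask_id = 9
input_ids = np.array([[5, 9, 7],
                      [9, 3, 4]])                  # [batch, time]
paddings = np.array([[0., 0.],
                     [0., 0.],
                     [0., 1.]])                    # [time, batch]

source_mask = (input_ids == source_mask_id).astype(np.float32)   # [batch, time]
paddings = np.clip(paddings + source_mask.T, 0.0, 1.0)
print(paddings)   # masked positions now count as padding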
def FProp(self, theta, input_batch, interpolation_batch=None, lambdas=None):
  # pyformat: disable
  """Interpolates source ids in input_batch and interpolation_batch.

  Refer to Eq. (4) in paper https://arxiv.org/abs/2106.04060.
  It reduces to a standard Transformer encoder when interpolation_batch is
  None.

  Args:
    theta: A `.NestedMap` object containing weights' values of this layer and
      its children layers.
    input_batch: A `.NestedMap` with fields:

      - ids: The inputs tensor. It is expected to be of shape [batch, time].
      - paddings: The paddings tensor. Expected shape [batch, time].
      - task_ids: If p.task_emb is provided, must contain per-token task ids
        of shape [batch, time].

    interpolation_batch: A `.NestedMap` with fields:

      - ids: The inputs tensor. It is expected to be of shape [batch, time].
      - paddings: The paddings tensor. Expected shape [batch, time].
      - task_ids: If p.task_emb is provided, must contain per-token task ids
        of shape [batch, time].
      - embs: Embeddings of ids.

    lambdas: A pair of tensors to combine embeddings of ids in input_batch
      and interpolation_batch.

  Returns:
    A NestedMap of

    - encoded: The encoded features, either a tensor of shape
      [time, batch, depth], or a list of tensors if is_transparent is set in
      transformer_stack.
    - padding: of shape [time, batch]
    - segment_id: [time, batch] if packed inputs are supported by the model
      (and all layers), or None otherwise.
    - embedded_inputs: [time, batch, depth] embedded inputs tokens without
      positional encodings.
  """
  # pyformat: enable
  p = self.params
  with tf.name_scope(p.name):
    src_segment_id = None
    src_segment_pos = None
    input_ids = py_utils.with_dependencies([
        py_utils.assert_shape_match(
            tf.shape(input_batch.ids), tf.shape(input_batch.paddings)),
        py_utils.assert_equal(tf.rank(input_batch.ids), 2)
    ], input_batch.ids)

    max_seq_length = None
    if (not py_utils.use_tpu() and
        FLAGS.transformer_encoder_truncates_inputs):
      max_seq_length = tf.cast(
          tf.reduce_max(tf.reduce_sum(1.0 - input_batch.paddings, 1)),
          tf.int32)
      paddings = py_utils.with_dependencies([
          py_utils.assert_equal(
              tf.constant(True, tf.bool),
              tf.reduce_all(input_batch.paddings[:, max_seq_length:] > 0.5))
      ], input_batch.paddings)
      input_ids = input_ids[:, :max_seq_length]
      paddings = paddings[:, :max_seq_length]
      if p.packed_input:
        src_segment_id = input_batch.segment_ids[:, :max_seq_length]
        src_segment_pos = input_batch.segment_pos[:, :max_seq_length]
    else:
      paddings = input_batch.paddings
      if p.packed_input:
        src_segment_id = input_batch.segment_ids
        src_segment_pos = input_batch.segment_pos

    max_time = tf.shape(input_ids)[1]

    # Input token embeddings + positional embeddings
    if not p.shared_emb:
      input_embs = self.token_emb.EmbLookup(theta.token_emb,
                                            tf.reshape(input_ids, [-1]))
    else:
      input_embs = self.softmax.EmbLookup(theta.softmax,
                                          tf.reshape(input_ids, [-1]))

    if interpolation_batch is not None:
      other_input_ids = interpolation_batch.ids
      if not p.shared_emb:
        other_input_embs = self.token_emb.EmbLookup(
            theta.token_emb, tf.reshape(other_input_ids, [-1]))
      else:
        other_input_embs = self.softmax.EmbLookup(
            theta.softmax, tf.reshape(other_input_ids, [-1]))
      lambdas = [tf.expand_dims(a, -1) for a in lambdas]
      if 'embs' in input_batch and input_batch.embs is not None:
        input_embs = input_batch.embs
      if ('embs' in interpolation_batch and
          interpolation_batch.embs is not None):
        other_input_embs = interpolation_batch.embs
      else:
        input_embs = tf.reshape(
            input_embs,
            [-1, tf.shape(input_ids)[1], p.token_emb.embedding_dim])
        other_input_embs = tf.reshape(
            other_input_embs,
            [-1, tf.shape(other_input_ids)[1], p.token_emb.embedding_dim])
      input_embs = lambdas[0] * input_embs + lambdas[1] * other_input_embs
      paddings = paddings + interpolation_batch.paddings - 1.0
      paddings = tf.clip_by_value(paddings, 0.0, 1.0)

    input_embs = tf.reshape(input_embs,
                            [-1, max_time, p.token_emb.embedding_dim])
    orig_input_embs = input_embs

    if p.task_emb:
      if interpolation_batch is None:
        input_embs += self.task_emb.EmbLookup(theta.task_emb,
                                              input_batch.task_ids)
      else:
        task_embs = self.task_emb.EmbLookup(theta.task_emb,
                                            input_batch.task_ids)
        other_task_embs = self.task_emb.EmbLookup(
            theta.task_emb, interpolation_batch.task_ids)
        task_embs = lambdas[0] * task_embs + lambdas[1] * other_task_embs
        input_embs += task_embs

    if p.packed_input:
      position_embs = self.position_emb.FPropWithPosition(
          theta.position_emb, src_segment_pos)
    else:
      position_embs = self.position_emb.FProp(theta.position_emb, max_time)
      position_embs = tf.reshape(position_embs,
                                 [1, max_time, p.token_emb.embedding_dim])
    input_embs += position_embs

    if p.model_dim != p.token_emb.embedding_dim:
      input_embs = self.emb_proj.FProp(theta.emb_proj, input_embs)

    paddings = tf.cast(tf.transpose(paddings), py_utils.FPropDtype(p))
    if p.packed_input:
      src_segment_id = tf.transpose(src_segment_id)
    input_embs = self.input_dropout.FProp(theta.input_dropout, input_embs)

    # [time, batch, dim]
    transformer_input = tf.transpose(input_embs, [1, 0, 2])

    if not self.do_eval and p.apply_source_mask:
      # Augment padding for masked source word positions.
      dtype = paddings.dtype
      source_mask = tf.where(
          tf.equal(input_ids, p.source_mask_id),
          tf.ones_like(input_ids, dtype=dtype),
          tf.zeros_like(input_ids, dtype=dtype))
      # Make sure padding is between 0 and 1.
      paddings = tf.clip_by_value(paddings + tf.transpose(source_mask), 0.0,
                                  1.0)

    encoded, padding, segment_id = self.transformer_stack.FProp(
        theta.transformer_stack, transformer_input, paddings, src_segment_id)
    return py_utils.NestedMap(
        encoded=encoded,
        padding=padding,
        segment_id=segment_id,
        embedded_inputs=orig_input_embs)
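# Illustrative sketch (not part of the library): the embedding interpolation
# used above when interpolation_batch is given. Token embeddings of the two
# batches are mixed with per-example weights (lambdas), and the paddings are
# combined with clip(p1 + p2 - 1, 0, 1), so a position stays padded only where
# both batches are padded. Written in NumPy.
import numpy as np

embs_a = np.ones((2, 3, 4))             # [batch, time, dim]
embs_b = np.zeros((2, 3, 4))
lam_a = np.array([0.7, 0.5]).reshape(2, 1, 1)   # per-example mixing weights
lam_b = 1.0 - lam_a

mixed = lam_a * embs_a + lam_b * embs_b
print(mixed[0, 0, 0], mixed[1, 0, 0])   # 0.7, 0.5

paddings_a = np.array([[0., 0., 1.], [0., 1., 1.]])
paddings_b = np.array([[0., 1., 1.], [0., 0., 1.]])
mixed_paddings = np.clip(paddings_a + paddings_b - 1.0, 0.0, 1.0)
print(mixed_paddings)   # [[0. 0. 1.] [0. 0. 1.]]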