def train(self, sentences):
    """Runs one next-token-prediction training step on `sentences`.

    The sentences are tokenized with `self._tokenize`, converted to ids with
    `self._words_to_indices`, and pushed through the embedding table, the RNN
    layer and the logit layer. Every variable watched by the gradient tape is
    then updated in place by subtracting its raw gradient (no learning rate
    is applied).

    Args:
      sentences: Batch of sentences in whatever form `self._tokenize` accepts.

    Returns:
      The masked mean cross-entropy over all positions that hold a valid,
      non-terminal token, as computed before the variables were updated.
    """
    indices, values, dense_shape = self._tokenize(sentences)
    sparse_tokens = tf.sparse.SparseTensor(
        indices=indices, values=values, dense_shape=dense_shape)
    dense_tokens = tf.sparse.to_dense(sparse_tokens, default_value="")

    sparse_ids = tf.sparse.SparseTensor(
        indices=sparse_tokens.indices,
        values=self._words_to_indices(sparse_tokens.values),
        dense_shape=sparse_tokens.dense_shape)
    dense_ids = tf.sparse.to_dense(sparse_ids, default_value=0)

    # Every position is trained to predict the id that follows it.
    input_ids = dense_ids[:, :-1]
    target_ids = dense_ids[:, 1:]
    input_tokens = dense_tokens[:, :-1]

    # Only positions holding a real, non-terminal token contribute to the
    # loss: padding ("") and the end marker ("<E>") are masked out.
    is_padding = tf.equal(input_tokens, "")
    is_terminal = tf.equal(input_tokens, "<E>")
    keep = tf.logical_not(tf.logical_or(is_padding, is_terminal))
    keep_as_int = tf.cast(keep, tf.int32)

    with tf.GradientTape() as tape:
        embedded = tf.nn.embedding_lookup(self._embeddings, input_ids)
        initial_state = self._lstm_cell.get_initial_state(embedded)
        rnn_output = self._rnn_layer(
            inputs=embedded, initial_state=initial_state)

        # Collapse the leading dimensions so that each row is one time step.
        flat_output = tf.reshape(
            rnn_output, [-1, self._lstm_cell.output_size])
        logits = self._logit_layer(flat_output)

        flat_targets = tf.reshape(target_ids, [-1])
        flat_weights = tf.cast(tf.reshape(keep_as_int, [-1]), tf.float32)

        per_token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=flat_targets, logits=logits)

        # Mean of the per-token losses over the unmasked positions.
        weighted_total = tf.reduce_sum(
            tf.multiply(per_token_loss, flat_weights))
        final_loss = tf.math.divide(
            weighted_total, tf.reduce_sum(flat_weights), name="final_loss")

    variables = tape.watched_variables()
    for variable, gradient in zip(
            variables, tape.gradient(final_loss, variables)):
        variable.assign_sub(gradient)

    return final_loss
def _multiply(self, x, y):
    """Returns the product of `x` and `y`.

    Uses the Python `*` operator when `self._use_operator` is truthy and
    `tf.multiply` otherwise.
    """
    if not self._use_operator:
        return tf.multiply(x, y)
    return x * y
def _train_op_fn(loss,
                 optimizer_fn,
                 l2_regularization=-1,
                 gradient_max_norm=-1,
                 use_synchronous_optimizer=False):
    """Builds the op that optimizes `loss`.

    Optionally adds an L2 penalty over every trainable variable, clips the
    gradients by global norm, and wraps the optimizer in a
    `SyncReplicasOptimizer`.

    Args:
      loss: The training loss before regularization.
      optimizer_fn: Zero-argument callable returning the optimizer to use.
      l2_regularization: Float multiplier applied to the L2 loss of each
        trainable variable. No penalty is added unless it is > 0.
      gradient_max_norm: Float global-norm bound for the gradients. Clipping
        is skipped unless it is > 0.
      use_synchronous_optimizer: Whether to use synchronous optimization. The
        global step is passed to `apply_gradients` only in this case.

    Returns:
      A `(train_op, train_hooks)` tuple. `train_op` groups the gradient
      application and is created under a control dependency on the
      `UPDATE_OPS` collection; `train_hooks` is a list that holds the
      sync-replicas hook when synchronous optimization is enabled and is
      empty otherwise.
    """
    regularized_loss = loss
    if l2_regularization > 0:
        penalties = [
            tf.multiply(
                tf.nn.l2_loss(variable),
                l2_regularization,
                name="l2_weight_loss")
            for variable in tf.compat.v1.trainable_variables()
        ]
        regularized_loss = tf.add_n([*penalties, loss], name="total_loss")

    global_step = tf.compat.v1.train.get_or_create_global_step()
    optimizer = optimizer_fn()
    hooks = []
    if use_synchronous_optimizer:
        run_config = tf.estimator.RunConfig()
        num_workers = run_config.num_worker_replicas + 1
        num_to_aggregate = num_workers - _compute_tolerance(num_workers)
        optimizer = tf.compat.v1.train.SyncReplicasOptimizer(
            optimizer,
            replicas_to_aggregate=num_to_aggregate,
            total_num_replicas=num_workers)
        hooks.append(optimizer.make_session_run_hook(run_config.is_chief))

    grads_and_vars = optimizer.compute_gradients(
        loss=regularized_loss, var_list=tf.compat.v1.trainable_variables())

    # TODO(b/172564129): switch to tf.contrib.estimator.clip_gradients_by_norm
    if gradient_max_norm > 0.0:
        gradients = [grad for grad, _ in grads_and_vars]
        variables = [var for _, var in grads_and_vars]
        clipped, _ = tf.clip_by_global_norm(gradients, gradient_max_norm)
        grads_and_vars = list(zip(clipped, variables))

    if use_synchronous_optimizer:
        apply_args = (grads_and_vars, global_step)
    else:
        apply_args = (grads_and_vars,)
    apply_op = optimizer.apply_gradients(*apply_args)

    pending_updates = tf.compat.v1.get_collection(
        tf.compat.v1.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(pending_updates):
        train_op = tf.group(apply_op)
    return train_op, hooks
def fft_convolve(audio: tf.Tensor,
                 impulse_response: tf.Tensor,
                 padding: Text = 'same',
                 delay_compensation: int = -1) -> tf.Tensor:
    """Convolves audio with (optionally time-varying) impulse responses.

    The audio is cut into non-overlapping frames, one per impulse response
    frame. Each frame is filtered by multiplication in the frequency domain,
    and the filtered frames are overlap-added back into a single signal.

    Args:
      audio: Input audio. Tensor of shape [batch, audio_timesteps].
      impulse_response: Finite impulse response to convolve. Either a 2-D
        Tensor of shape [batch, ir_size] (one linear time-invariant filter)
        or a 3-D Tensor of shape [batch, ir_frames, ir_size] (a linear
        time-varying filter; the audio is chopped into equally sized blocks
        to match ir_frames).
      padding: Either 'valid' or 'same'. With 'same' the output has the same
        size as the input audio (audio_timesteps). With 'valid' the output is
        extended to include the tail of the impulse response
        (audio_timesteps + ir_timesteps - 1).
      delay_compensation: Samples to crop from the start of the output audio
        to compensate for the group delay of the impulse response. A value
        below 0 defaults to automatically calculating a constant group delay
        of the windowed linear phase filter from
        frequency_impulse_response().

    Returns:
      audio_out: Convolved audio. Tensor of shape
        [batch, audio_timesteps + ir_timesteps - 1] ('valid' padding) or
        [batch, audio_timesteps] ('same' padding).

    Raises:
      ValueError: If audio and impulse response have different batch sizes.
      ValueError: If the audio cannot be split into evenly spaced frames,
        i.e. the number of audio frames produced differs from the number of
        impulse response frames.
    """
    audio = tf_float32(audio)
    impulse_response = tf_float32(impulse_response)

    # A 2-D (time-invariant) impulse response is treated as a single frame.
    if len(impulse_response.shape.as_list()) == 2:
        impulse_response = impulse_response[:, tf.newaxis, :]

    ir_batch, n_ir_frames, ir_size = impulse_response.shape.as_list()
    audio_batch, audio_size = audio.shape.as_list()

    if audio_batch != ir_batch:
        raise ValueError(
            'Batch size of audio ({}) and impulse response ({}) must '
            'be the same.'.format(audio_batch, ir_batch))

    # Non-overlapping framing: the hop equals the frame length.
    frame_size = int(np.ceil(audio_size / n_ir_frames))
    hop_size = frame_size
    framed_audio = tf.signal.frame(
        audio, frame_size, hop_size, pad_end=True)

    n_audio_frames = int(framed_audio.shape[1])
    if n_audio_frames != n_ir_frames:
        raise ValueError(
            'Number of Audio frames ({}) and impulse response frames ({}) do not '
            'match. For small hop size = ceil(audio_size / n_ir_frames), '
            'number of impulse response frames must be a multiple of the audio '
            'size.'.format(n_audio_frames, n_ir_frames))

    # Multiplying the spectra is equivalent to convolving in time; both
    # signals are transformed with the same FFT length.
    fft_size = get_fft_size(frame_size, ir_size, power_of_2=True)
    filtered_spectrum = tf.multiply(
        tf.signal.rfft(framed_audio, [fft_size]),
        tf.signal.rfft(impulse_response, [fft_size]))

    # Back to the time domain, then stitch the frames together.
    filtered_frames = tf.signal.irfft(filtered_spectrum)
    audio_out = tf.signal.overlap_and_add(filtered_frames, hop_size)

    # Crop and shift the output audio.
    return crop_and_compensate_delay(
        audio_out, audio_size, ir_size, padding, delay_compensation)
def mul_or_and(x1, x2):
    """Combines `x1` and `x2` by logical AND (bool) or by multiplication.

    When `x1` has dtype `tf.bool`, `x2` is asserted to be boolean as well and
    the result is `tf.logical_and(x1, x2)`; for any other dtype the result is
    `tf.multiply(x1, x2)`.
    """
    if x1.dtype != tf.bool:
        return tf.multiply(x1, x2)
    assert x2.dtype == tf.bool
    return tf.logical_and(x1, x2)
def draw_samples(self, alpha, scale):
    r"""Draws one sample per element from the robust distribution.

    This function implements Algorithm 1 of the paper. Rather than drawing N
    samples from a single distribution, it samples from a set of different
    distributions, each parametrized by its own alpha and scale. This is done
    by repeatedly performing N instances of rejection sampling, one per
    distribution, until at least one proposal has been accepted for each of
    the N distributions.

    All samples assume a zero mean --- to get non-zero mean samples, add each
    mean to each sample.

    Args:
      alpha: A TF tensor/scalar or numpy array/scalar of floats where each
        element is the shape parameter of that element's distribution. Must
        be >= 0.
      scale: A TF tensor/scalar or numpy array/scalar of floats where each
        element is the scale parameter of that element's distribution. Must
        be > 0 and have the same shape and dtype as `alpha`.

    Returns:
      A TF tensor with the same shape and precision as `alpha` and `scale`
      where each element is a sample drawn from the zero-mean distribution
      specified for that element by `alpha` and `scale`.
    """
    dtype = alpha.dtype
    # `scale` must have the same type as `alpha`.
    tf.debugging.assert_type(scale, dtype)

    checks = [
        # `scale` must be > 0.
        tf.Assert(tf.reduce_all(scale > 0.), [scale]),
        # `alpha` must be >= 0.
        tf.Assert(tf.reduce_all(alpha >= 0.), [alpha]),
        # `alpha` and `scale` must have the same shape.
        tf.Assert(
            tf.reduce_all(tf.equal(tf.shape(alpha), tf.shape(scale))),
            [tf.shape(alpha), tf.shape(scale)]),
    ]

    with tf.control_dependencies(checks):
        sample_shape = tf.shape(alpha)

        # Proposal and acceptance distributions for rejection sampling. The
        # sqrt(2) scaling of the Cauchy distribution corrects for our
        # differing conventions for standardization.
        proposal_dist = tfp.distributions.Cauchy(loc=0., scale=tf.sqrt(2.))
        acceptance_dist = tfp.distributions.Uniform(low=0., high=1.)

        def keep_going(_, done):
            """Continues looping while any element is still unaccepted."""
            return tf.logical_not(tf.reduce_all(done))

        def propose_and_accept(current, done):
            """Draws N proposals and keeps the ones that pass rejection."""
            proposal = tf.cast(proposal_dist.sample(sample_shape), dtype)
            unit = tf.cast(1, dtype)

            # Likelihood of each proposal under its target distribution.
            nll = self.nllfun(proposal, alpha, unit)

            # Bound the NLL. The approximate loss is not used as it may
            # cause unpredictable behavior in the context of sampling.
            bound_loss = general.lossfun(
                proposal, tf.cast(0, dtype), tf.cast(1, dtype),
                approximate=False)
            nll_bound = bound_loss + self.log_base_partition_function(alpha)

            # One uniform draw per proposal decides acceptance.
            threshold = tf.cast(acceptance_dist.sample(sample_shape), dtype)
            newly_accepted = threshold <= tf.math.exp(nll_bound - nll)

            # Accepted proposals overwrite the stored sample and set the
            # element's bit in `done`.
            updated = tf.where(newly_accepted, proposal, current)
            return (updated, tf.logical_or(newly_accepted, done))

        # The initial samples are placeholders that get overwritten; the
        # initial acceptance flags must be all False.
        initial_state = (
            tf.zeros(sample_shape, dtype),
            tf.zeros(sample_shape, dtype=bool),
        )
        unit_scale_samples, _ = tf.while_loop(
            cond=keep_going, body=propose_and_accept, loop_vars=initial_state)

        # The distribution is a location-scale family, so samples drawn from
        # p(x | 0, \alpha, 1) are simply scaled by `scale`.
        samples = tf.multiply(unit_scale_samples, scale)
    return samples
def update_confusion_matrix_variables(
    variables_to_update,
    y_true,
    y_pred,
    thresholds,
    top_k=None,
    class_id=None,
    sample_weight=None,
    multi_label=False,
    label_weights=None,
    thresholds_distributed_evenly=False,
):
    """Returns op to update the given confusion matrix variables.

    For every pair of values in y_true and y_pred:

    true_positive: y_true == True and y_pred > thresholds
    false_negatives: y_true == True and y_pred <= thresholds
    true_negatives: y_true == False and y_pred <= thresholds
    false_positive: y_true == False and y_pred > thresholds

    The results are weighted and summed. When several thresholds are given,
    the same computation is repeated for every threshold.

    For estimation of these metrics over a stream of data, the function
    creates an `update_op` operation that updates the given variables.

    If `sample_weight` is `None`, weights default to 1. Use weights of 0 to
    mask values.

    Args:
      variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid
        keys and corresponding variables to update as values. When `None`,
        the function returns `None` without doing anything.
      y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`.
      y_pred: A floating point `Tensor` of arbitrary shape and whose values
        are in the range `[0, 1]`.
      thresholds: A float value, float tensor, python list, or tuple of float
        thresholds in `[0, 1]`, or NEG_INF (used when top_k is set).
      top_k: Optional int, indicates that the positive labels should be
        limited to the top k predictions.
      class_id: Optional int, limits the prediction and labels to the class
        specified by this argument.
      sample_weight: Optional `Tensor` whose rank is either 0, or the same
        rank as `y_true`, and must be broadcastable to `y_true` (i.e., all
        dimensions must be either `1`, or the same as the corresponding
        `y_true` dimension).
      multi_label: Optional boolean indicating whether multidimensional
        prediction/labels should be treated as multilabel responses, or
        flattened into a single label. When True, the values of
        `variables_to_update` must have a second dimension equal to the
        number of labels in y_true and y_pred, and those tensors must not be
        RaggedTensors.
      label_weights: (optional) tensor of non-negative weights for multilabel
        data. The weights are applied when calculating TP, FP, FN, and TN
        without explicit multilabel handling (i.e. when the data is to be
        flattened).
      thresholds_distributed_evenly: Boolean, whether the thresholds are
        evenly distributed within the list. An optimized method will be used
        if this is the case. See
        _update_confusion_matrix_variables_optimized() for more details.

    Returns:
      Update op.

    Raises:
      ValueError: If `y_pred` and `y_true` have mismatched shapes, or if
        `sample_weight` is not `None` and its shape doesn't match `y_pred`,
        or if `variables_to_update` contains invalid keys, or if
        `label_weights` is given together with `multi_label=True`.
    """
    if multi_label and label_weights is not None:
        raise ValueError(
            "`label_weights` for multilabel data should be handled "
            "outside of `update_confusion_matrix_variables` when "
            "`multi_label` is True.")
    if variables_to_update is None:
        return

    has_valid_key = any(
        key for key in variables_to_update if key in list(ConfusionMatrix))
    if not has_valid_key:
        raise ValueError(
            "Please provide at least one valid confusion matrix "
            "variable to update. Valid variable key options are: "
            f'"{list(ConfusionMatrix)}". Received: "{variables_to_update.keys()}"'
        )

    # All inputs are cast to the dtype of the variables being accumulated.
    variable_dtype = next(iter(variables_to_update.values())).dtype
    y_true = tf.cast(y_true, dtype=variable_dtype)
    y_pred = tf.cast(y_pred, dtype=variable_dtype)

    if thresholds_distributed_evenly:
        # Detect a leading or trailing epsilon added to the thresholds for
        # floating point imprecision; those end thresholds are handled as a
        # corner case. At this point thresholds should be a list/array with
        # more than 2 items within [0, 1]. See
        # is_evenly_distributed_thresholds() for more details.
        thresholds_with_epsilon = thresholds[0] < 0.0 or thresholds[-1] > 1.0

    thresholds = tf.convert_to_tensor(thresholds, dtype=variable_dtype)
    num_thresholds = thresholds.shape.as_list()[0]

    if multi_label:
        one_thresh = tf.equal(
            tf.cast(1, dtype=tf.int32),
            tf.rank(thresholds),
            name="one_set_of_thresholds_cond",
        )
    else:
        (y_pred, y_true), _ = ragged_assert_compatible_and_get_flat_values(
            [y_pred, y_true], sample_weight)
        one_thresh = tf.cast(True, dtype=tf.bool)

    invalid_keys = [
        key for key in variables_to_update if key not in list(ConfusionMatrix)
    ]
    if invalid_keys:
        raise ValueError(
            f'Invalid keys: "{invalid_keys}". '
            f'Valid variable key options are: "{list(ConfusionMatrix)}"')

    if sample_weight is not None:
        sample_weight = tf.cast(sample_weight, dtype=variable_dtype)
        y_pred, y_true, sample_weight = (
            losses_utils.squeeze_or_expand_dimensions(
                y_pred, y_true, sample_weight=sample_weight))
    else:
        y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(
            y_pred, y_true)
    y_pred.shape.assert_is_compatible_with(y_true.shape)

    if top_k is not None:
        y_pred = _filter_top_k(y_pred, top_k)
    if class_id is not None:
        y_true = y_true[..., class_id]
        y_pred = y_pred[..., class_id]

    if thresholds_distributed_evenly:
        return _update_confusion_matrix_variables_optimized(
            variables_to_update,
            y_true,
            y_pred,
            thresholds,
            multi_label=multi_label,
            sample_weights=sample_weight,
            label_weights=label_weights,
            thresholds_with_epsilon=thresholds_with_epsilon,
        )

    pred_shape = tf.shape(y_pred)
    num_predictions = pred_shape[0]
    num_labels = (
        1 if y_pred.shape.ndims == 1
        else tf.math.reduce_prod(pred_shape[1:], axis=0))
    thresh_label_tile = tf.where(
        one_thresh, num_labels, tf.ones([], dtype=tf.int32))

    # Add a leading dimension for thresholding and pick the tiling layout.
    # Predictions and labels are flattened when not multilabel.
    if multi_label:
        predictions_extra_dim = tf.expand_dims(y_pred, 0)
        labels_extra_dim = tf.expand_dims(tf.cast(y_true, dtype=tf.bool), 0)
        thresh_pretile_shape = [num_thresholds, 1, -1]
        thresh_tiles = [1, num_predictions, thresh_label_tile]
        data_tiles = [num_thresholds, 1, 1]
    else:
        predictions_extra_dim = tf.reshape(y_pred, [1, -1])
        labels_extra_dim = tf.reshape(
            tf.cast(y_true, dtype=tf.bool), [1, -1])
        thresh_pretile_shape = [num_thresholds, -1]
        thresh_tiles = [1, num_predictions * num_labels]
        data_tiles = [num_thresholds, 1]

    # Tile the thresholds for every prediction.
    thresh_tiled = tf.tile(
        tf.reshape(thresholds, thresh_pretile_shape), tf.stack(thresh_tiles))

    # Tile the predictions for every threshold, then compare.
    preds_tiled = tf.tile(predictions_extra_dim, data_tiles)
    pred_is_pos = tf.greater(preds_tiled, thresh_tiled)

    # Tile labels by number of thresholds.
    label_is_pos = tf.tile(labels_extra_dim, data_tiles)

    weights_tiled = None
    if sample_weight is not None:
        sample_weight = tf.__internal__.ops.broadcast_weights(
            tf.cast(sample_weight, dtype=variable_dtype), y_pred)
        weights_tiled = tf.tile(
            tf.reshape(sample_weight, thresh_tiles), data_tiles)

    if label_weights is not None and not multi_label:
        label_weights = tf.expand_dims(label_weights, 0)
        label_weights = tf.__internal__.ops.broadcast_weights(
            label_weights, y_pred)
        label_weights_tiled = tf.tile(
            tf.reshape(label_weights, thresh_tiles), data_tiles)
        if weights_tiled is None:
            weights_tiled = label_weights_tiled
        else:
            weights_tiled = tf.multiply(weights_tiled, label_weights_tiled)

    def accumulate(label, pred, weights, var):
        """Adds the weighted per-threshold count of `label & pred` to `var`."""
        hits = tf.cast(tf.logical_and(label, pred), dtype=var.dtype)
        if weights is not None:
            hits = hits * tf.cast(weights, dtype=var.dtype)
        return var.assign_add(tf.reduce_sum(hits, 1))

    # Map each confusion-matrix entry to its (label, prediction) conditions.
    # Negations are only built when an entry that needs them is requested.
    conditions = {
        ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
    }
    wants_tn = ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
    wants_fp = ConfusionMatrix.FALSE_POSITIVES in variables_to_update
    wants_fn = ConfusionMatrix.FALSE_NEGATIVES in variables_to_update

    if wants_fn or wants_tn:
        pred_is_neg = tf.logical_not(pred_is_pos)
        conditions[ConfusionMatrix.FALSE_NEGATIVES] = (
            label_is_pos, pred_is_neg)

    if wants_fp or wants_tn:
        label_is_neg = tf.logical_not(label_is_pos)
        conditions[ConfusionMatrix.FALSE_POSITIVES] = (
            label_is_neg, pred_is_pos)
        if wants_tn:
            conditions[ConfusionMatrix.TRUE_NEGATIVES] = (
                label_is_neg, pred_is_neg)

    update_ops = [
        accumulate(label, pred, weights_tiled, variables_to_update[entry])
        for entry, (label, pred) in conditions.items()
        if entry in variables_to_update
    ]
    return tf.group(update_ops)
def test_false(self):
    """`ps.cond` with a Python `False` predicate yields the false branch."""
    four = tf.constant(4)
    three = tf.constant(3)

    def when_true():
        return tf.multiply(four, 16)

    def when_false():
        return tf.multiply(three, 3)

    result = ps.cond(False, when_true, when_false)
    self.assertEqual(self.evaluate(result), 9)
def compute_weighted_loss(
    losses,
    sample_weight=None,
    reduction=ReductionV2.SUM_OVER_BATCH_SIZE,
    name=None,
):
    """Computes the weighted loss.

    Args:
      losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
      sample_weight: Optional `Tensor` whose rank is either 0, or the same
        rank as `losses`, or be broadcastable to `losses`. Defaults to a
        weight of 1.0 when `None`.
      reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to
        loss. Default value is `SUM_OVER_BATCH_SIZE`; `AUTO` is treated as
        `SUM_OVER_BATCH_SIZE`.
      name: Optional name for the op.

    Raises:
      ValueError: If the shape of `sample_weight` is not compatible with
        `losses`.

    Returns:
      Weighted loss `Tensor` of the same type as `losses`. If `reduction` is
      `NONE`, this has the same shape as `losses`; otherwise, it is scalar.
    """
    ReductionV2.validate(reduction)

    # When this function is called directly (e.g. canned estimator use
    # cases), 'AUTO' simply means 'SUM_OVER_BATCH_SIZE'.
    if reduction == ReductionV2.AUTO:
        reduction = ReductionV2.SUM_OVER_BATCH_SIZE
    if sample_weight is None:
        sample_weight = 1.0

    with backend.name_scope(name or "weighted_loss"):
        # Record the reduction on the graph for loss normalization when
        # distributing to multiple replicas. Used only for the estimator +
        # v1 optimizer flow.
        tf.compat.v1.get_default_graph()._last_loss_reduction = reduction

        # Keras and ragged tensors are passed through unconverted.
        passthrough_types = (keras_tensor.KerasTensor, tf.RaggedTensor)
        if not isinstance(losses, passthrough_types):
            losses = tf.convert_to_tensor(losses)
        if not isinstance(sample_weight, passthrough_types):
            sample_weight = tf.convert_to_tensor(sample_weight)

        # Non-float losses (e.g. int or bool) are computed in float32 to
        # avoid losing precision, and cast back to their dtype at the end.
        original_dtype = losses.dtype
        cast_back = not original_dtype.is_floating
        if cast_back:
            losses = tf.cast(losses, "float32")
        sample_weight = tf.cast(sample_weight, losses.dtype)

        # Align the dimensions of `sample_weight` with `losses` if possible.
        losses, _, sample_weight = squeeze_or_expand_dimensions(
            losses, None, sample_weight)

        # Weight each loss, then apply the requested reduction.
        loss = reduce_weighted_loss(
            tf.multiply(losses, sample_weight), reduction)
        if cast_back:
            loss = tf.cast(loss, original_dtype)
        return loss
def selective_crop_and_resize(features,
                              boxes,
                              box_levels,
                              boundaries,
                              output_size=7,
                              sample_offset=0.5,
                              use_einsum_gather=False):
    """Crop and resize boxes on a set of feature maps.

    Given multiple feature maps indexed by level, and a set of boxes where
    each box is mapped to a certain level, selectively crops and resizes the
    boxes from the corresponding feature maps to generate the box features.

    This follows the ROIAlign technique (see
    https://arxiv.org/pdf/1703.06870.pdf, figure 3 for reference). For each
    feature map, an (output_size, output_size) set of pixels corresponding to
    the box location is selected, and bilinear interpolation is used to
    select the feature value for each pixel.

    For performance, the gather and interpolation are performed on all levels
    as a single operation:
      1. The multi-level features are gathered into a
         [batch_size, num_boxes, output_size*2, output_size*2, num_filters]
         Tensor holding the four neighboring feature points of each vertex
         in the output grid.
      2. The interpolation kernel of shape
         [batch_size, num_boxes, output_size*2, output_size*2] is computed.
         The last 2 axes can be seen as stacking 2x2 interpolation kernels
         for all vertices in the output grid.
      3. The gathered features and the interpolation kernel are multiplied
         element-wise, then 2x2 average pooling reduces the spatial
         dimension to output_size.

    Args:
      features: a 5-D tensor of shape [batch_size, num_levels, max_height,
        max_width, num_filters] where cropping and resizing are based.
      boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
        information of each box w.r.t. the corresponding feature map.
        boxes[:, :, 0:2] are the grid position in (y, x) (float) of the
        top-left corner of each box. boxes[:, :, 2:4] are the box sizes in
        (h, w) (float) in terms of the number of pixels of the corresponding
        feature map size.
      box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1]
        representing the 0-based corresponding feature level index of each
        box.
      boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2]
        representing the boundary (in (y, x)) of the corresponding feature
        map for each box. Any resampled grid points that go beyond the
        boundary will be clipped.
      output_size: a scalar indicating the output crop size.
      sample_offset: a float number in [0, 1] indicating the subpixel sample
        offset from the grid point.
      use_einsum_gather: whether to use einsum in place of gather. Einsum can
        improve performance when the feature size is not large and is
        friendly to model partitioning; gather performs better when the
        feature size is very large and there are multiple box levels.

    Returns:
      features_per_box: a 5-D tensor of shape
        [batch_size, num_boxes, output_size, output_size, num_filters]
        representing the cropped features.
    """
    (batch_size, num_levels, max_feature_height, max_feature_width,
     num_filters) = features.get_shape().as_list()
    _, num_boxes, _ = boxes.get_shape().as_list()
    # Each output vertex uses two neighboring grid points per axis.
    grid_len = output_size * 2

    kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
        boxes, boundaries, output_size, sample_offset)
    x_indices = tf.cast(
        tf.reshape(box_gridx0x1, [batch_size, num_boxes, grid_len]),
        dtype=tf.int32)
    y_indices = tf.cast(
        tf.reshape(box_gridy0y1, [batch_size, num_boxes, grid_len]),
        dtype=tf.int32)

    if use_einsum_gather:
        # Bilinear interpolation is done during the last two gathers:
        #        f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
        #                              [f10, f11]]
        #        [[f00, f01],
        #         [f10, f11]] = tf.einsum(tf.einsum(features, y_one_hot),
        #                                 x_one_hot)
        # where [hy, ly] and [hx, lx] are the bilinear interpolation kernel.
        grid_y_one_hot, grid_x_one_hot = get_grid_one_hot(
            box_gridy0y1, box_gridx0x1, max_feature_height,
            max_feature_width)

        # shape is [batch_size, num_boxes, output_size, height]
        y_weight = tf.reduce_sum(
            tf.multiply(grid_y_one_hot, kernel_y), axis=-2)
        # shape is [batch_size, num_boxes, output_size, width]
        x_weight = tf.reduce_sum(
            tf.multiply(grid_x_one_hot, kernel_x), axis=-2)

        # Gather along the y axis.
        # shape is [batch_size, num_boxes, output_size, width, features]
        gathered_y = tf.einsum(
            'bmhwf,bmoh->bmowf', features,
            tf.cast(y_weight, features.dtype))
        # Gather along the x axis.
        # shape is [batch_size, num_boxes, output_size, output_size, features]
        return tf.einsum(
            'bmhwf,bmow->bmhof', gathered_y,
            tf.cast(x_weight, features.dtype))

    # Gather path: compute, for every sampled grid point, its row index into
    # the features flattened to [-1, num_filters].
    row_stride = max_feature_width
    level_stride = max_feature_height * row_stride
    batch_stride = num_levels * level_stride

    batch_offsets = tf.tile(
        tf.reshape(tf.range(batch_size) * batch_stride,
                   [batch_size, 1, 1, 1]),
        [1, num_boxes, grid_len, grid_len])
    level_offsets = tf.tile(
        tf.reshape(box_levels * level_stride,
                   [batch_size, num_boxes, 1, 1]),
        [1, 1, grid_len, grid_len])
    y_offsets = tf.tile(
        tf.reshape(y_indices * row_stride,
                   [batch_size, num_boxes, grid_len, 1]),
        [1, 1, 1, grid_len])
    x_offsets = tf.tile(
        tf.reshape(x_indices, [batch_size, num_boxes, 1, grid_len]),
        [1, 1, grid_len, 1])

    flat_indices = tf.reshape(
        batch_offsets + level_offsets + y_offsets + x_offsets, [-1])
    flat_features = tf.reshape(features, [-1, num_filters])

    # TODO(wangtao): replace tf.gather with tf.gather_nd and try to get
    # similar performance.
    gathered = tf.reshape(
        tf.gather(flat_features, flat_indices),
        [batch_size, num_boxes, grid_len, grid_len, num_filters])
    return feature_bilinear_interpolation(gathered, kernel_y, kernel_x)
def bigbird_block_sparse_attention(query_layer,
                                   key_layer,
                                   value_layer,
                                   band_mask,
                                   from_mask,
                                   to_mask,
                                   from_blocked_mask,
                                   to_blocked_mask,
                                   num_attention_heads,
                                   num_rand_blocks,
                                   size_per_head,
                                   batch_size,
                                   from_seq_length,
                                   to_seq_length,
                                   from_block_size,
                                   to_block_size,
                                   seed=None,
                                   plan_from_length=None,
                                   plan_num_rand_blocks=None):
  """BigBird attention sparse calculation using blocks in linear time.

  Assumes from_seq_length//from_block_size == to_seq_length//to_block_size.

  The query sequence is split into blocks and handled in five groups: the
  first block and the last block attend to every key; the second and the
  second-to-last blocks attend to four fixed key blocks plus their random
  blocks; all remaining (middle) blocks attend to a three-block sliding
  window, their random blocks, and the first and last key blocks.

  Side effect: calls `np.random.seed(seed)`, which reseeds NumPy's global
  random state.

  NOTE(review): the mask arguments are described as "(optional)" below, but
  every one of them is passed to `tf.cast` unconditionally, so `None` is
  presumably not accepted by this function - confirm against callers.

  Args:
    query_layer: float Tensor of shape [batch_size, num_attention_heads,
      from_seq_length, size_per_head]
    key_layer: float Tensor of shape [batch_size, num_attention_heads,
      to_seq_length, size_per_head]
    value_layer: float Tensor of shape [batch_size, num_attention_heads,
      to_seq_length, size_per_head]
    band_mask: (optional) int32 Tensor of shape [batch_size, 1,
      from_seq_length//from_block_size-4, from_block_size, 3*to_block_size].
      The values should be 1 or 0. The attention scores will effectively be
      set to -infinity for any positions in the mask that are 0, and will be
      unchanged for positions that are 1.
    from_mask: (optional) int32 Tensor of shape [batch_size, 1,
      from_seq_length, 1]. The values should be 1 or 0. The attention scores
      will effectively be set to -infinity for any positions in the mask that
      are 0, and will be unchanged for positions that are 1.
    to_mask: (optional) int32 Tensor of shape [batch_size, 1, 1,
      to_seq_length]. The values should be 1 or 0. The attention scores will
      effectively be set to -infinity for any positions in the mask that are
      0, and will be unchanged for positions that are 1.
    from_blocked_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length//from_block_size, from_block_size].
      Same as from_mask, just reshaped.
    to_blocked_mask: (optional) int32 Tensor of shape [batch_size,
      to_seq_length//to_block_size, to_block_size].
      Same as to_mask, just reshaped.
    num_attention_heads: int. Number of attention heads.
    num_rand_blocks: int. Number of random chunks per row.
    size_per_head: int. Size of each attention head.
    batch_size: int. Batch size for computation.
    from_seq_length: int. length of from sequence.
    to_seq_length: int. length of to sequence.
    from_block_size: int. size of block in from sequence.
    to_block_size: int. size of block in to sequence.
    seed: (Optional) int. Random seed for generating random mask.
    plan_from_length: (Optional) list. Plan of where to put random attn. It
      divides the block matrix into chunks, where each chunk will have
      some random attn. Only consulted when from_seq_length is not one of
      1024, 3072 or 4096; computed via `get_rand_attn_plan` when None.
    plan_num_rand_blocks: (Optional) list. Number of random per block given by
      plan_from_length.

  Returns:
    float Tensor of shape [batch_size, from_seq_length, num_attention_heads,
      size_per_head].

  Raises:
    AssertionError: if the number of from-blocks differs from the number of
      to-blocks.
  """
  assert from_seq_length // from_block_size == to_seq_length // to_block_size

  # cast masks to float
  from_mask = tf.cast(from_mask, tf.float32)
  to_mask = tf.cast(to_mask, tf.float32)
  band_mask = tf.cast(band_mask, tf.float32)
  from_blocked_mask = tf.cast(from_blocked_mask, tf.float32)
  to_blocked_mask = tf.cast(to_blocked_mask, tf.float32)

  # generate random attention and corresponding masks
  # NOTE: this reseeds NumPy's *global* RNG.
  np.random.seed(seed)
  if from_seq_length in [1024, 3072, 4096]:  # old plans used in paper
    # One independently drawn random-block table per head, truncated to the
    # number of query blocks that use random attention (all but first/last).
    rand_attn = [
        bigbird_block_rand_mask(  # pylint: disable=g-complex-comprehension
            MAX_SEQ_LEN, MAX_SEQ_LEN,
            from_block_size, to_block_size, num_rand_blocks,
            last_idx=1024)[:(from_seq_length // from_block_size - 2)]
        for _ in range(num_attention_heads)
    ]
  else:
    if plan_from_length is None:
      plan_from_length, plan_num_rand_blocks = get_rand_attn_plan(
          from_seq_length, from_block_size, num_rand_blocks)

    rand_attn = bigbird_block_rand_mask_with_head(
        from_seq_length=from_seq_length,
        to_seq_length=to_seq_length,
        from_block_size=from_block_size,
        to_block_size=to_block_size,
        num_heads=num_attention_heads,
        plan_from_length=plan_from_length,
        plan_num_rand_blocks=plan_num_rand_blocks)
  # Stack the per-head tables and replicate them across the batch so the same
  # random pattern is used for every example.
  rand_attn = np.stack(rand_attn, axis=0)
  rand_attn = tf.constant(rand_attn, dtype=tf.int32)
  rand_attn = tf.expand_dims(rand_attn, 0)
  rand_attn = tf.repeat(rand_attn, batch_size, 0)

  rand_mask = create_rand_mask_from_inputs(
      from_blocked_mask, to_blocked_mask, rand_attn,
      num_attention_heads, num_rand_blocks,
      batch_size, from_seq_length, from_block_size,
  )

  # Define shorthands
  h = num_attention_heads
  r = num_rand_blocks
  d = size_per_head
  b = batch_size
  m = from_seq_length
  n = to_seq_length
  wm = from_block_size
  wn = to_block_size

  blocked_query_matrix = tf.reshape(query_layer, (b, h, m // wm, wm, -1))
  blocked_key_matrix = tf.reshape(key_layer, (b, h, n // wn, wn, -1))
  blocked_value_matrix = tf.reshape(value_layer, (b, h, n // wn, wn, -1))
  # Gather the randomly selected key/value blocks for every query block that
  # uses random attention, then merge the (r, wn) axes into one key axis.
  gathered_key = tf.reshape(
      tf.gather(blocked_key_matrix, rand_attn, batch_dims=2, name="gather_key"),
      (b, h, m // wm - 2, r * wn, -1))  # [b, h, n//wn-2, r, wn, -1]
  gathered_value = tf.reshape(
      tf.gather(
          blocked_value_matrix, rand_attn, batch_dims=2, name="gather_value"),
      (b, h, m // wm - 2, r * wn, -1))  # [b, h, n//wn-2, r, wn, -1]

  # --- First query block: attends to every key position. ---
  first_product = tf.einsum(
      "BHQD,BHKD->BHQK", blocked_query_matrix[:, :, 0],
      key_layer)  # [b, h, wm, -1] x [b, h, n, -1] ==> [b, h, wm, n]
  first_product = tf.multiply(first_product, 1.0 / np.sqrt(d))
  # Masked (0) key positions get a large negative bias before the softmax.
  first_product += (1.0 - to_mask) * -10000.0
  first_attn_weights = tf.nn.softmax(first_product)  # [b, h, wm, n]
  first_context_layer = tf.einsum(
      "BHQK,BHKD->BHQD", first_attn_weights,
      value_layer)  # [b, h, wm, n] x [b, h, n, -1] ==> [b, h, wm, -1]
  first_context_layer = tf.expand_dims(first_context_layer, 2)

  # --- Second query block: key blocks 0, 1, 2, the last block, and the
  # random blocks selected for it. ---
  second_key_mat = tf.concat([
      blocked_key_matrix[:, :, 0], blocked_key_matrix[:, :, 1],
      blocked_key_matrix[:, :, 2], blocked_key_matrix[:, :, -1],
      gathered_key[:, :, 0]], 2)  # [b, h, (4+r)*wn, -1]
  second_value_mat = tf.concat([
      blocked_value_matrix[:, :, 0], blocked_value_matrix[:, :, 1],
      blocked_value_matrix[:, :, 2], blocked_value_matrix[:, :, -1],
      gathered_value[:, :, 0]], 2)  # [b, h, (4+r)*wn, -1]
  second_product = tf.einsum(
      "BHQD,BHKD->BHQK", blocked_query_matrix[:, :, 1], second_key_mat
  )  # [b, h, wm, -1] x [b, h, (4+r)*wn, -1] ==> [b, h, wm, (4+r)*wn]
  # The sequence mask covers the four fixed blocks (ones for the random part);
  # the random mask covers the random part (ones for the fixed blocks). Their
  # element-wise minimum is the combined mask.
  second_seq_pad = tf.concat([
      to_mask[:, :, :, :3 * wn], to_mask[:, :, :, -wn:],
      tf.ones([b, 1, 1, r * wn], dtype=tf.float32)], 3)
  second_rand_pad = tf.concat(
      [tf.ones([b, h, wm, 4 * wn], dtype=tf.float32), rand_mask[:, :, 0]], 3)
  second_product = tf.multiply(second_product, 1.0 / np.sqrt(d))
  second_product += (1.0 -
                     tf.minimum(second_seq_pad, second_rand_pad)) * -10000.0
  second_attn_weights = tf.nn.softmax(
      second_product)  # [b , h, wm, (4+r)*wn]
  second_context_layer = tf.einsum(
      "BHQK,BHKD->BHQD", second_attn_weights, second_value_mat
  )  # [b, h, wm, (4+r)*wn] x [b, h, (4+r)*wn, -1] ==> [b, h, wm, -1]
  second_context_layer = tf.expand_dims(second_context_layer, 2)

  # --- Middle query blocks (index 2 .. -3): sliding window of the three
  # neighbouring key blocks, plus random blocks, plus first and last key
  # blocks. The three shifted slices below line up, for each middle query
  # block, its left neighbour, itself and its right neighbour. ---
  exp_blocked_key_matrix = tf.concat([
      blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2],
      blocked_key_matrix[:, :, 3:-1]], 3)  # [b, h, m//wm-4, 3*wn, -1]
  exp_blocked_value_matrix = tf.concat([
      blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2],
      blocked_value_matrix[:, :, 3:-1]], 3)  # [b, h, m//wm-4, 3*wn, -1]
  middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
  inner_band_product = tf.einsum(
      "BHLQD,BHLKD->BHLQK", middle_query_matrix, exp_blocked_key_matrix
  )  # [b, h, m//wm-4, wm, -1] x [b, h, m//wm-4, 3*wn, -1]
  #     ==> [b, h, m//wm-4, wm, 3*wn]
  inner_band_product = tf.multiply(inner_band_product, 1.0 / np.sqrt(d))
  rand_band_product = tf.einsum(
      "BHLQD,BHLKD->BHLQK", middle_query_matrix, gathered_key[:, :, 1:-1]
  )  # [b, h, m//wm-4, wm, -1] x [b, h, m//wm-4, r*wn, -1]
  #     ==> [b, h, m//wm-4, wm, r*wn]
  rand_band_product = tf.multiply(rand_band_product, 1.0 / np.sqrt(d))
  first_band_product = tf.einsum(
      "BHLQD,BHKD->BHLQK", middle_query_matrix, blocked_key_matrix[:, :, 0]
  )  # [b, h, m//wm-4, wm, -1] x [b, h, wn, -1] ==> [b, h, m//wm-4, wm, wn]
  first_band_product = tf.multiply(first_band_product, 1.0 / np.sqrt(d))
  last_band_product = tf.einsum(
      "BHLQD,BHKD->BHLQK", middle_query_matrix, blocked_key_matrix[:, :, -1]
  )  # [b, h, m//wm-4, wm, -1] x [b, h, wn, -1] ==> [b, h, m//wm-4, wm, wn]
  last_band_product = tf.multiply(last_band_product, 1.0 / np.sqrt(d))
  # Apply the matching mask to each of the four score groups.
  inner_band_product += (1.0 - band_mask) * -10000.0
  first_band_product += (
      1.0 - tf.expand_dims(to_mask[:, :, :, :wn], 3)) * -10000.0
  last_band_product += (
      1.0 - tf.expand_dims(to_mask[:, :, :, -wn:], 3)) * -10000.0
  rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * -10000.0
  # Concatenate so a single softmax normalises over all attended keys.
  # Key-axis layout: [first (wn) | band (3*wn) | random (r*wn) | last (wn)].
  band_product = tf.concat([
      first_band_product, inner_band_product, rand_band_product,
      last_band_product], -1)  # [b, h, m//wm-4, wm, (5+r)*wn]
  attn_weights = tf.nn.softmax(band_product)  # [b, h, m//wm-4, wm, (5+r)*wn]
  # Slice the joint weights back into the four groups (same layout as the
  # concat above) and accumulate each group's weighted values.
  context_layer = tf.einsum(
      "BHLQK,BHLKD->BHLQD", attn_weights[:, :, :, :, wn:4 * wn],
      exp_blocked_value_matrix
  )  # [b, h, m//wm-4, wm, 3*wn] x [b, h, m//wm-4, 3*wn, -1]
  #     ==> [b, h, m//wm-4, wm, -1]
  context_layer += tf.einsum(
      "BHLQK,BHLKD->BHLQD", attn_weights[:, :, :, :, 4 * wn:-wn],
      gathered_value[:, :, 1:-1]
  )  # [b, h, m//wm-4, wm, r*wn] x [b, h, m//wm-4, r*wn, -1]
  #     ==> [b, h, m//wm-4, wm, -1]
  context_layer += tf.einsum(
      "BHLQK,BHKD->BHLQD", attn_weights[:, :, :, :, :wn],
      blocked_value_matrix[:, :, 0]
  )  # [b, h, m//wm-4, wm, wn] x [b, h, wn, -1] ==> [b, h, m//wm-4, wm, -1]
  context_layer += tf.einsum(
      "BHLQK,BHKD->BHLQD", attn_weights[:, :, :, :, -wn:],
      blocked_value_matrix[:, :, -1]
  )  # [b, h, m//wm-4, wm, wn] x [b, h, wn, -1] ==> [b, h, m//wm-4, wm, -1]

  # --- Second-to-last query block: key blocks 0, -3, -2, -1 and its random
  # blocks (mirror image of the second block). ---
  second_last_key_mat = tf.concat([
      blocked_key_matrix[:, :, 0], blocked_key_matrix[:, :, -3],
      blocked_key_matrix[:, :, -2], blocked_key_matrix[:, :, -1],
      gathered_key[:, :, -1]], 2)  # [b, h, (4+r)*wn, -1]
  second_last_value_mat = tf.concat([
      blocked_value_matrix[:, :, 0], blocked_value_matrix[:, :, -3],
      blocked_value_matrix[:, :, -2], blocked_value_matrix[:, :, -1],
      gathered_value[:, :, -1]], 2)  # [b, h, (4+r)*wn, -1]
  second_last_product = tf.einsum(
      "BHQD,BHKD->BHQK", blocked_query_matrix[:, :, -2], second_last_key_mat
  )  # [b, h, wm, -1] x [b, h, (4+r)*wn, -1] ==> [b, h, wm, (4+r)*wn]
  second_last_seq_pad = tf.concat([
      to_mask[:, :, :, :wn], to_mask[:, :, :, -3 * wn:],
      tf.ones([b, 1, 1, r * wn], dtype=tf.float32)], 3)
  second_last_rand_pad = tf.concat(
      [tf.ones([b, h, wm, 4 * wn], dtype=tf.float32), rand_mask[:, :, -1]], 3)
  second_last_product = tf.multiply(second_last_product, 1.0 / np.sqrt(d))
  second_last_product += (
      1.0 - tf.minimum(second_last_seq_pad, second_last_rand_pad)) * -10000.0
  second_last_attn_weights = tf.nn.softmax(
      second_last_product)  # [b, h, wm, (4+r)*wn]
  second_last_context_layer = tf.einsum(
      "BHQK,BHKD->BHQD", second_last_attn_weights, second_last_value_mat
  )  # [b, h, wm, (4+r)*wn] x [b, h, (4+r)*wn, -1] ==> [b, h, wm, -1]
  second_last_context_layer = tf.expand_dims(second_last_context_layer, 2)

  # --- Last query block: attends to every key position. ---
  last_product = tf.einsum(
      "BHQD,BHKD->BHQK", blocked_query_matrix[:, :, -1],
      key_layer)  # [b, h, wm, -1] x [b, h, n, -1] ==> [b, h, wm, n]
  last_product = tf.multiply(last_product, 1.0 / np.sqrt(d))
  last_product += (1.0 - to_mask) * -10000.0
  last_attn_weights = tf.nn.softmax(last_product)  # [b, h, wm, n]
  last_context_layer = tf.einsum(
      "BHQK,BHKD->BHQD", last_attn_weights,
      value_layer)  # [b, h, wm, n] x [b, h, n, -1] ==> [b, h, wm, -1]
  last_context_layer = tf.expand_dims(last_context_layer, 2)

  # Reassemble the per-block results along the block axis, flatten the blocks
  # back into the sequence axis, zero out masked query positions, and move the
  # head axis after the sequence axis.
  context_layer = tf.concat([
      first_context_layer, second_context_layer, context_layer,
      second_last_context_layer, last_context_layer
  ], 2)
  context_layer = tf.reshape(context_layer, (b, h, m, -1)) * from_mask
  context_layer = tf.transpose(context_layer, (0, 2, 1, 3))
  return context_layer
def positive_fcn():
    """Return sum(x_m * y_m, axis=0) divided by sqrt(x_p) * sqrt(y_p)."""
    # All four operands come from the enclosing scope.
    numerator = tf.math.reduce_sum(tf.multiply(x_m, y_m), axis=0)
    denominator = tf.multiply(tf.math.sqrt(x_p), tf.math.sqrt(y_p))
    return tf.divide(numerator, denominator)
def call(self, y_true, y_pred):
    """See tf.keras.losses.Loss."""
    # The wrapped loss returns unreduced per-item losses and their weights;
    # this method returns their element-wise product.
    unreduced = self._loss.compute_unreduced_loss(
        labels=y_true, logits=y_pred)
    per_item_losses, per_item_weights = unreduced
    return tf.multiply(per_item_losses, per_item_weights)
def boolean_mask(boxlist, indicator, fields=None, scope=None,
                 use_static_shapes=False, indicator_sum=None):
    """Select the boxes of a BoxList that `indicator` marks as True.

    The returned BoxList holds the selected boxes and, by default, every extra
    field of the input (indexed along the first dimension). A subset of fields
    can be requested instead.

    Args:
      boxlist: BoxList holding N boxes
      indicator: a rank-1 boolean tensor
      fields: (optional) list of fields to also gather from.  If None
        (default), all fields are gathered from.  Pass an empty fields list to
        only gather the box coordinates. Only consulted on the dynamic-shape
        path; the static-shape path delegates to `gather` without it.
      scope: name scope.
      use_static_shapes: Whether to use an implementation with static shape
        guarantees.
      indicator_sum: An integer containing the sum of `indicator` vector. Only
        required if `use_static_shapes` is True.

    Returns:
      subboxlist: a BoxList corresponding to the subset of the input BoxList
        specified by indicator

    Raises:
      ValueError: if `indicator` is not a rank-1 boolean tensor, if
        `use_static_shapes` is True and `indicator_sum` is not a truthy int,
        or if a requested field is missing from `boxlist`.
    """
    with tf.name_scope(scope, 'BooleanMask'):
        if indicator.shape.ndims != 1:
            raise ValueError('indicator should have rank 1')
        if indicator.dtype != tf.bool:
            raise ValueError('indicator should be a boolean tensor')

        if not use_static_shapes:
            # Dynamic path: mask the boxes and each requested field directly.
            kept_boxes = tf.boolean_mask(boxlist.get(), indicator)
            subboxlist = box_list.BoxList(kept_boxes)
            selected_fields = (
                boxlist.get_extra_fields() if fields is None else fields)
            for field in selected_fields:
                if not boxlist.has_field(field):
                    raise ValueError(
                        'boxlist must contain all specified fields')
                kept_values = tf.boolean_mask(
                    boxlist.get_field(field), indicator)
                subboxlist.add_field(field, kept_values)
            return subboxlist

        # Static path: turn the mask into `indicator_sum` gather indices using
        # only fixed-shape ops.
        if not indicator_sum or not isinstance(indicator_sum, int):
            raise ValueError('`indicator_sum` must be a of type int')
        mask_as_float = tf.cast(indicator, dtype=tf.float32)
        # 1-based rank of every selected position, 0 for unselected ones.
        running_rank = tf.cumsum(mask_as_float)
        indexed_positions = tf.cast(
            tf.multiply(running_rank, mask_as_float), dtype=tf.int32)
        # Row i is one-hot at column (rank - 1); unselected rows (index -1)
        # are all zeros.
        one_hot_selector = tf.one_hot(
            indexed_positions - 1, indicator_sum, dtype=tf.float32)
        position_ids = tf.cast(
            tf.range(tf.shape(indicator)[0]), dtype=tf.float32)
        sampled_indices = tf.cast(
            tf.tensordot(position_ids, one_hot_selector, axes=[0, 0]),
            dtype=tf.int32)
        return gather(boxlist, sampled_indices, use_static_shapes=True)
def _static_subsample(self, indicator, batch_size, labels):
    """Returns subsampled minibatch.

    Args:
      indicator: boolean tensor of shape [N] whose True entries can be sampled.
        N should be a compile time constant.
      batch_size: desired batch size. This scalar cannot be None.
      labels: boolean tensor of shape [N] denoting positive(=True) and negative
        (=False) examples. N should be a compile time constant.

    Returns:
      sampled_idx_indicator: boolean tensor of shape [N], True for entries which
        are sampled. It ensures the length of output of the subsample is always
        batch_size, even when number of examples set to True in indicator is
        less than batch_size.

    Raises:
      ValueError: if `indicator` or `labels` does not have a fully defined
        static shape, or if `batch_size` is not a Python int.
    """
    # Check if indicator and labels have a static size.
    # (Each message is a single literal: the previous implicit concatenation
    # dropped the space and produced "...is_static isTrue".)
    if not indicator.shape.is_fully_defined():
        raise ValueError(
            'indicator must be static in shape when is_static is True')
    if not labels.shape.is_fully_defined():
        raise ValueError(
            'labels must be static in shape when is_static is True')
    if not isinstance(batch_size, int):
        raise ValueError(
            'batch_size has to be an integer when is_static is True.')

    input_length = tf.shape(input=indicator)[0]

    # Set the number of examples set True in indicator to be at least
    # batch_size.
    num_true_sampled = tf.reduce_sum(
        input_tensor=tf.cast(indicator, tf.float32))
    additional_false_sample = tf.less_equal(
        tf.cumsum(tf.cast(tf.logical_not(indicator), tf.float32)),
        batch_size - num_true_sampled)
    indicator = tf.logical_or(indicator, additional_false_sample)

    # Shuffle indicator and label. Need to store the permutation to restore the
    # order post sampling.
    permutation = tf.random.shuffle(tf.range(input_length))
    indicator = ops.matmul_gather_on_zeroth_axis(
        tf.cast(indicator, tf.float32), permutation)
    labels = ops.matmul_gather_on_zeroth_axis(
        tf.cast(labels, tf.float32), permutation)

    # index (starting from 1) when indicator is True, 0 when False
    indicator_idx = tf.where(
        tf.cast(indicator, tf.bool), tf.range(1, input_length + 1),
        tf.zeros(input_length, tf.int32))

    # Replace -1 for negative, +1 for positive labels
    signed_label = tf.where(
        tf.cast(labels, tf.bool), tf.ones(input_length, tf.int32),
        tf.scalar_mul(-1, tf.ones(input_length, tf.int32)))
    # negative of index for negative label, positive index for positive label,
    # 0 when indicator is False.
    signed_indicator_idx = tf.multiply(indicator_idx, signed_label)
    # Descending sort: positives first, zeros in the middle, negatives last.
    sorted_signed_indicator_idx = tf.nn.top_k(
        signed_indicator_idx, input_length, sorted=True).values

    [num_positive_samples,
     num_negative_samples] = self._get_num_pos_neg_samples(
         sorted_signed_indicator_idx, batch_size)

    sampled_idx = self._get_values_from_start_and_end(
        sorted_signed_indicator_idx, num_positive_samples,
        num_negative_samples, batch_size)

    # Shift the indices to start from 0 and remove any samples that are set as
    # False.
    sampled_idx = tf.abs(sampled_idx) - tf.ones(batch_size, tf.int32)
    sampled_idx = tf.multiply(
        tf.cast(tf.greater_equal(sampled_idx, tf.constant(0)), tf.int32),
        sampled_idx)

    sampled_idx_indicator = tf.cast(
        tf.reduce_sum(
            input_tensor=tf.one_hot(sampled_idx, depth=input_length), axis=0),
        tf.bool)

    # project back the order based on stored permutations
    reprojections = tf.one_hot(
        permutation, depth=input_length, dtype=tf.float32)
    return tf.cast(
        tf.tensordot(
            tf.cast(sampled_idx_indicator, tf.float32),
            reprojections,
            axes=[0, 0]), tf.bool)
def test_true(self):
    """`ps.cond` with a Python `True` predicate returns the first branch."""
    two = tf.constant(2)
    five = tf.constant(5)
    # First branch yields 2 * 16 == 32; the second would yield 5 * 5 == 25.
    result = ps.cond(
        True,
        lambda: tf.multiply(two, 16),
        lambda: tf.multiply(five, 5))
    self.assertEqual(self.evaluate(result), 32)
def train(self, sentences):
    """Runs one next-token-prediction gradient step and returns the loss.

    Tokenizes `sentences`, predicts each following token with the embedding /
    RNN / logit layers, averages the cross-entropy over valid positions, and
    subtracts the raw gradient from every variable watched by the tape.

    Args:
      sentences: input passed unchanged to `self._tokenize`.

    Returns:
      The masked mean cross-entropy loss (op name "final_loss").
    """
    token_ids, token_values, token_dense_shape = self._tokenize(sentences)
    tokens_sparse = tf.sparse.SparseTensor(
        indices=token_ids,
        values=token_values,
        dense_shape=token_dense_shape)
    # Missing positions become "" in the string view and 0 in the id view.
    tokens = tf.sparse.to_dense(tokens_sparse, default_value="")

    word_ids = self._words_to_indices(tokens_sparse.values)
    sparse_lookup_ids = tf.sparse.SparseTensor(
        indices=tokens_sparse.indices,
        values=word_ids,
        dense_shape=tokens_sparse.dense_shape)
    lookup_ids = tf.sparse.to_dense(sparse_lookup_ids, default_value=0)

    # Targets are the next word for each word of the sentence.
    tokens_ids_seq = lookup_ids[:, 0:-1]
    tokens_ids_target = lookup_ids[:, 1:]
    tokens_prefix = tokens[:, 0:-1]

    # Only positions holding a real, non-terminal token count towards the
    # loss: neither padding ("") nor the end marker ("<E>").
    is_padding = tf.equal(tokens_prefix, "")
    is_end_marker = tf.equal(tokens_prefix, "<E>")
    mask = tf.logical_and(
        tf.logical_not(is_padding), tf.logical_not(is_end_marker))
    input_mask = tf.cast(mask, tf.int32)

    with tf.GradientTape() as tape:
        sentence_embeddings = tf.nn.embedding_lookup(
            self._embeddings, tokens_ids_seq)
        lstm_initial_state = self._lstm_cell.get_initial_state(
            sentence_embeddings)
        lstm_output = self._rnn_layer(
            inputs=sentence_embeddings, initial_state=lstm_initial_state)

        # Flatten (batch, time) into a single leading axis.
        flat_output = tf.reshape(
            lstm_output, [-1, self._lstm_cell.output_size])
        logits = self._logit_layer(flat_output)

        targets = tf.reshape(tokens_ids_target, [-1])
        weights = tf.cast(tf.reshape(input_mask, [-1]), tf.float32)

        losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=targets, logits=logits)

        # Final loss is the mean loss for all token losses.
        weighted_loss_sum = tf.reduce_sum(tf.multiply(losses, weights))
        weight_sum = tf.reduce_sum(weights)
        final_loss = tf.math.divide(
            weighted_loss_sum, weight_sum, name="final_loss")

    # Plain gradient descent with an implicit step size of 1.
    watched = tape.watched_variables()
    gradients = tape.gradient(final_loss, watched)
    for variable, gradient in zip(watched, gradients):
        variable.assign_sub(gradient)

    return final_loss
def _update_confusion_matrix_variables_optimized(
    variables_to_update,
    y_true,
    y_pred,
    thresholds,
    multi_label=False,
    sample_weights=None,
    label_weights=None,
    thresholds_with_epsilon=False,
):
    """Update confusion matrix variables with memory efficient alternative.

    Note that the thresholds need to be evenly distributed within the list, eg,
    the diff between consecutive elements are the same.

    To compute TP/FP/TN/FN, we are measuring a binary classifier
      C(t) = (predictions >= t)
    at each threshold 't'. So we have
      TP(t) = sum( C(t) * true_labels )
      FP(t) = sum( C(t) * false_labels )

    But, computing C(t) requires computation for each t. To make it fast,
    observe that C(t) is a cumulative integral, and so if we have
      thresholds = [t_0, ..., t_{n-1}];  t_0 < ... < t_{n-1}
    where n = num_thresholds, and if we can compute the bucket function
      B(i) = Sum( (predictions == t), t_i <= t < t{i+1} )
    then we get
      C(t_i) = sum( B(j), j >= i )
    which is the reversed cumulative sum in tf.cumsum().

    We can compute B(i) efficiently by taking advantage of the fact that
    our thresholds are evenly distributed, in that
      width = 1.0 / (num_thresholds - 1)
      thresholds = [0.0, 1*width, 2*width, 3*width, ..., 1.0]
    Given a prediction value p, we can map it to its bucket by
      bucket_index(p) = floor( p * (num_thresholds - 1) )
    so we can use tf.math.unsorted_segment_sum() to update the buckets in one
    pass.

    Consider following example:
    y_true = [0, 0, 1, 1]
    y_pred = [0.1, 0.5, 0.3, 0.9]
    thresholds = [0.0, 0.5, 1.0]
    num_buckets = 2   # [0.0, 1.0], (1.0, 2.0]
    bucket_index(y_pred) = tf.math.floor(y_pred * num_buckets)
                         = tf.math.floor([0.2, 1.0, 0.6, 1.8])
                         = [0, 0, 0, 1]
    # The meaning of this bucket is that if any of the label is true,
    # then 1 will be added to the corresponding bucket with the index.
    # Eg, if the label for 0.2 is true, then 1 will be added to bucket 0. If
    # the label for 1.8 is true, then 1 will be added to bucket 1.
    #
    # Note the second item "1.0" is floored to 0, since the value need to be
    # strictly larger than the bucket lower bound.
    # In the implementation, we use tf.math.ceil() - 1 to achieve this.
    tp_bucket_value = tf.math.unsorted_segment_sum(true_labels, bucket_indices,
                                                   num_segments=num_thresholds)
                    = [1, 1, 0]
    # For [1, 1, 0] here, it means there is 1 true value contributed by bucket
    # 0, and 1 value contributed by bucket 1. When we aggregate them to
    # together, the result become [a + b + c, b + c, c], since large thresholds
    # will always contribute to the value for smaller thresholds.
    true_positive = tf.math.cumsum(tp_bucket_value, reverse=True)
                  = [2, 1, 0]

    This implementation exhibits a run time and space complexity of O(T + N),
    where T is the number of thresholds and N is the size of predictions.
    Metrics that rely on standard implementation instead exhibit a complexity
    of O(T * N).

    Args:
      variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
        and corresponding variables to update as values.
      y_true: A floating point `Tensor` whose shape matches `y_pred`. Will be
        cast to `bool`.
      y_pred: A floating point `Tensor` of arbitrary shape and whose values are
        in the range `[0, 1]`. Values outside that range are clipped to it.
      thresholds: A sorted floating point `Tensor` with value in `[0, 1]`.
        It need to be evenly distributed (the diff between each element need
        to be the same). Its first dimension is read from the static shape.
      multi_label: Optional boolean indicating whether multidimensional
        prediction/labels should be treated as multilabel responses, or
        flattened into a single label. When True, the values of
        `variables_to_update` must have a second dimension equal to the number
        of labels in y_true and y_pred, and those tensors must not be
        RaggedTensors.
      sample_weights: Optional `Tensor` whose rank is either 0, or the same
        rank as `y_true`, and must be broadcastable to `y_true` (i.e., all
        dimensions must be either `1`, or the same as the corresponding
        `y_true` dimension).
      label_weights: Optional tensor of non-negative weights for multilabel
        data. The weights are applied when calculating TP, FP, FN, and TN
        without explicit multilabel handling (i.e. when the data is to be
        flattened).
      thresholds_with_epsilon: Optional boolean indicating whether the leading
        and trailing thresholds has any epsilon added for floating point
        imprecisions. It will change how we handle the leading and trailing
        bucket.

    Returns:
      Update op.
    """
    num_thresholds = thresholds.shape.as_list()[0]

    # Normalise both weight inputs to something that multiplies cleanly with
    # the (possibly flattened) labels: a scalar 1.0 when absent, otherwise a
    # tensor broadcast against y_pred.
    if sample_weights is None:
        sample_weights = 1.0
    else:
        sample_weights = tf.__internal__.ops.broadcast_weights(
            tf.cast(sample_weights, dtype=y_pred.dtype), y_pred
        )
        if not multi_label:
            sample_weights = tf.reshape(sample_weights, [-1])
    if label_weights is None:
        label_weights = 1.0
    else:
        label_weights = tf.expand_dims(label_weights, 0)
        label_weights = tf.__internal__.ops.broadcast_weights(
            label_weights, y_pred
        )
        if not multi_label:
            label_weights = tf.reshape(label_weights, [-1])
    weights = tf.multiply(sample_weights, label_weights)

    # We shouldn't need this, but in case there are predict value that is out
    # of the range of [0.0, 1.0]
    y_pred = tf.clip_by_value(y_pred, clip_value_min=0.0, clip_value_max=1.0)

    # Round-trip through bool so every non-zero label becomes exactly 1.
    y_true = tf.cast(tf.cast(y_true, tf.bool), y_true.dtype)
    if not multi_label:
        y_true = tf.reshape(y_true, [-1])
        y_pred = tf.reshape(y_pred, [-1])

    # Per-element weight that counts towards positives / negatives.
    true_labels = tf.multiply(y_true, weights)
    false_labels = tf.multiply((1.0 - y_true), weights)

    # Compute the bucket indices for each prediction value.
    # Since the predict value has to be strictly greater than the thresholds,
    # eg, buckets like [0, 0.5], (0.5, 1], and 0.5 belongs to first bucket.
    # We have to use math.ceil(val) - 1 for the bucket.
    # A prediction of exactly 0.0 therefore maps to index -1 here.
    # NOTE(review): presumably unsorted_segment_sum ignores negative segment
    # ids, so such predictions count towards no threshold - confirm.
    bucket_indices = tf.math.ceil(y_pred * (num_thresholds - 1)) - 1

    if thresholds_with_epsilon:
        # In this case, the first bucket should actually take into account
        # since the any prediction between [0.0, 1.0] should be larger than the
        # first threshold. We change the bucket value from -1 to 0.
        bucket_indices = tf.nn.relu(bucket_indices)

    bucket_indices = tf.cast(bucket_indices, tf.int32)

    if multi_label:
        # We need to run bucket segment sum for each of the label class. In the
        # multi_label case, the rank of the label is 2. We first transpose it
        # so that the label dim becomes the first and we can parallel run
        # though them.
        true_labels = tf.transpose(true_labels)
        false_labels = tf.transpose(false_labels)
        bucket_indices = tf.transpose(bucket_indices)

        def gather_bucket(label_and_bucket_index):
            # Segment-sum one label class: element 0 is that class's weights,
            # element 1 its bucket indices.
            label, bucket_index = (
                label_and_bucket_index[0],
                label_and_bucket_index[1],
            )
            return tf.math.unsorted_segment_sum(
                data=label,
                segment_ids=bucket_index,
                num_segments=num_thresholds,
            )

        tp_bucket_v = tf.vectorized_map(
            gather_bucket, (true_labels, bucket_indices)
        )
        fp_bucket_v = tf.vectorized_map(
            gather_bucket, (false_labels, bucket_indices)
        )
        # Reverse cumulative sum over the threshold axis, then transpose back
        # so thresholds lead again.
        tp = tf.transpose(tf.cumsum(tp_bucket_v, reverse=True, axis=1))
        fp = tf.transpose(tf.cumsum(fp_bucket_v, reverse=True, axis=1))
    else:
        tp_bucket_v = tf.math.unsorted_segment_sum(
            data=true_labels,
            segment_ids=bucket_indices,
            num_segments=num_thresholds,
        )
        fp_bucket_v = tf.math.unsorted_segment_sum(
            data=false_labels,
            segment_ids=bucket_indices,
            num_segments=num_thresholds,
        )
        tp = tf.cumsum(tp_bucket_v, reverse=True)
        fp = tf.cumsum(fp_bucket_v, reverse=True)

    # fn = sum(true_labels) - tp
    # tn = sum(false_labels) - fp
    # The totals are only computed when TN or FN is requested; they are only
    # read below under those same conditions.
    if (
        ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
        or ConfusionMatrix.FALSE_NEGATIVES in variables_to_update
    ):
        if multi_label:
            total_true_labels = tf.reduce_sum(true_labels, axis=1)
            total_false_labels = tf.reduce_sum(false_labels, axis=1)
        else:
            total_true_labels = tf.reduce_sum(true_labels)
            total_false_labels = tf.reduce_sum(false_labels)

    # Accumulate into whichever of the four variables the caller supplied.
    update_ops = []
    if ConfusionMatrix.TRUE_POSITIVES in variables_to_update:
        variable = variables_to_update[ConfusionMatrix.TRUE_POSITIVES]
        update_ops.append(variable.assign_add(tp))
    if ConfusionMatrix.FALSE_POSITIVES in variables_to_update:
        variable = variables_to_update[ConfusionMatrix.FALSE_POSITIVES]
        update_ops.append(variable.assign_add(fp))
    if ConfusionMatrix.TRUE_NEGATIVES in variables_to_update:
        variable = variables_to_update[ConfusionMatrix.TRUE_NEGATIVES]
        tn = total_false_labels - fp
        update_ops.append(variable.assign_add(tn))
    if ConfusionMatrix.FALSE_NEGATIVES in variables_to_update:
        variable = variables_to_update[ConfusionMatrix.FALSE_NEGATIVES]
        fn = total_true_labels - tp
        update_ops.append(variable.assign_add(fn))
    return tf.group(update_ops)
def _scale_one_loss(l):
    """Wrap loss callable `l` in a tf.function that scales it by `multiplier`.

    Kept as its own def so that `l` is bound per call rather than captured as
    a loop variable by a lambda created inside the caller's loop.
    """
    scaled_loss_fn = tf.function(lambda: tf.multiply(multiplier, l()))
    # Request a concrete function right away; its return value is not needed.
    scaled_loss_fn.get_concrete_function()
    return scaled_loss_fn
def lattice_rule_sample(generating_vectors: types.IntTensor,
                        dim: types.IntTensor,
                        num_results: types.IntTensor,
                        sequence_indices: types.IntTensor = None,
                        additive_shift: types.FloatTensor = None,
                        apply_tent_transform: bool = False,
                        validate_args: bool = False,
                        dtype: tf.DType = None,
                        name: str = None) -> types.RealTensor:
    r"""Constructs a lattice rule from a generating vector.

    Point `i` has coordinates `frac(i * frac(g / num_results) + shift)`, where
    `g` is the first `dim` entries of `generating_vectors` and `shift` is the
    optional additive shift.

    #### Examples

    ```python
    import tensorflow as tf
    import tf_quant_finance as tff

    # Example: Sampling 1,000 points from 2D generating vectors.

    generating_vectors = tf.constant([1, 387275, 314993, 50301], dtype=tf.int32)

    dim = 2
    num_results = 1000

    tff.math.qmc.lattice_rule_sample(generating_vectors, dim, num_results)
    # ==> tf.Tensor([
    #             [0.,         0.        ],
    #             [0.001,      0.2749939 ],
    #             [0.002,      0.5499878 ],
    #             ...
    #             [0.99700004, 0.1689148 ],
    #             [0.998,      0.4439087 ],
    #             [0.9990001,  0.7189026 ],
    #         ], shape=(1000, 2), dtype=float32)
    ```

    Args:
      generating_vectors: Rank-1 `Tensor` of positive integers: the vector
        from which to sample points. Its dtype is used for all integer
        arguments.
      dim: Positive scalar `Tensor` of integers with rank 0. The event size of
        the sampled points. Must not exceed the size of `generating_vectors`.
      num_results: Positive scalar `Tensor` of integers with rank 0. The
        maximum number of points to sample.
      sequence_indices: Optional rank-1 `Tensor` of non-negative integers. The
        elements of the sequence to return specified by their position in the
        sequence.
        Default value: `None` which corresponds to the `[0, num_results)`
        range.
      additive_shift: Optional `Tensor` of real values with the same `shape`
        as `generating_vectors`; only its first `dim` entries are used. The
        additive shift to add to all the points (modulo 1) before applying the
        tent transform.
        Default value: `None`.
      apply_tent_transform: Python `bool` indicating whether to apply a tent
        transform to the sampled points.
        Default value: `False`.
      validate_args: Python `bool` indicating whether to validate arguments.
        Default value: `False`.
      dtype: Optional `dtype`. The `dtype` of the output `Tensor` (either
        `float32` or `float64`).
        Default value: `None` which maps to `float32`.
      name: Python `str` name prefixed to ops created by this function.
        Default value: `None` which maps to `sample_lattice_rule`.

    Returns:
      A `Tensor` of lattice-rule samples with `shape` `(num_samples, dim)`,
      where `num_samples` is the size of `sequence_indices` (`num_results`
      when `sequence_indices` is `None`).
    """
    with tf.name_scope(name or 'sample_lattice_rule'):
        # shape: (?,)
        generating_vectors = tf.convert_to_tensor(
            generating_vectors, name='generating_vectors')

        int_dtype = generating_vectors.dtype
        real_dtype = dtype or tf.float32

        dim = tf.convert_to_tensor(dim, dtype=int_dtype, name='dim')
        num_results = tf.convert_to_tensor(
            num_results, dtype=int_dtype, name='num_results')

        # The three argument checks are only created when requested.
        control_deps = [
            tf.debugging.assert_equal(
                tf.rank(generating_vectors),
                1,
                message='generating_vectors must have rank 1'),
            tf.debugging.assert_less_equal(
                dim,
                tf.size(generating_vectors, out_type=int_dtype),
                message='dim must not exceed the size of generating_vectors'),
            tf.debugging.assert_positive(
                num_results, message='num_results must be positive'),
        ] if validate_args else []

        with tf.control_dependencies(control_deps):
            # shape: (num_samples,)
            if sequence_indices is None:
                sequence_indices = tf.range(0, num_results)
            sequence_indices = tf.cast(
                sequence_indices, int_dtype, name='sequence_indices')

            unit = tf.ones(shape=(), dtype=real_dtype)

            # shape: (dim,)
            leading_vector = tf.cast(generating_vectors[:dim], real_dtype)
            # shape: ()
            real_num_results = tf.cast(num_results, real_dtype)
            # shape: (dim,)
            scaled_vector = tf.divide(leading_vector, real_num_results)

            # shape: (num_samples, 1)
            index_column = tf.expand_dims(
                tf.cast(sequence_indices, real_dtype), axis=1)
            # shape: (1, dim)
            direction_row = tf.expand_dims(
                tf.math.floormod(scaled_vector, unit), axis=0)
            # shape: (num_samples, dim)
            points = tf.multiply(index_column, direction_row)

            if additive_shift is not None:
                additive_shift = tf.cast(
                    additive_shift, real_dtype, name='additive_shift')
                # shape: (num_samples, dim)
                points = points + additive_shift[:dim]

            # Wrap every coordinate back into [0, 1).
            # shape: (num_samples, dim)
            points = tf.math.floormod(points, unit)

            # shape: (num_samples, dim)
            if apply_tent_transform:
                return utils.tent_transform(points)
            return points
def volume_coefficient(basis):
    """Computes `0.5 * logdet(basis @ basis^T)`.

    Args:
      basis: Tensor accepted by `tf.linalg.matmul`; presumably its rows are
        the basis vectors (confirm against callers).

    Returns:
      Half the log-determinant of the Gram matrix `basis @ basis^T`.
    """
    gram = tf.linalg.matmul(basis, basis, transpose_b=True)
    log_det = tf.linalg.logdet(gram)
    return tf.multiply(log_det, 0.5)
def _scale_one_loss(l):  # Separate def avoids lambda capture of loop var.
    """Wraps the loss callable `l` so that its result is scaled.

    Args:
      l: Zero-argument callable returning a loss value.

    Returns:
      A `tf.function` that, when called, returns
      `regularization_loss_multiplier * l()`.
    """
    scaled_loss_fn = tf.function(
        lambda: tf.multiply(regularization_loss_multiplier, l()))
    # Request a concrete function right away; the returned object itself is
    # not needed (presumably this forces tracing now - confirm).
    scaled_loss_fn.get_concrete_function()
    return scaled_loss_fn
def call(self, inputs):
    """Multiplies `inputs` by `self.my_var` in an op named 'my_op'."""
    scale = self.my_var
    return tf.multiply(inputs, scale, name='my_op')
def call(self, text, features):
    """Modulates the transformed `features` with a mask derived from `text`.

    Args:
      text: Input passed to `self.text_encoder`.
      features: Input passed through `self.fc1` and then `self.fc2`.

    Returns:
      The elementwise product of the `fc2` output and the mask produced by
      `self.fc_film` from the text embedding.
    """
    # Text branch first, then the feature branch, matching the order in which
    # the sub-layers are invoked.
    film_mask = self.fc_film(self.text_encoder(text))
    hidden = self.fc2(self.fc1(features))
    return tf.multiply(hidden, film_mask)
def update_state(self, values, sample_weight=None):
    """Accumulates statistics for computing the metric.

    Adds the (optionally weighted) sum of `values` to `self.total` and, for
    reductions that need a denominator, adds the matching count to
    `self.count`.

    Args:
      values: Per-example value.
      sample_weight: Optional weighting of each example. Defaults to 1.

    Returns:
      Update op. For the `SUM` reduction this is the `total` assign-add op;
      otherwise it is the `count` assign-add op, which is created under a
      control dependency on the `total` update.

    Raises:
      RuntimeError: If `values` cannot be cast to `self._dtype`.
      NotImplementedError: If `self.reduction` is not `SUM`,
        `SUM_OVER_BATCH_SIZE` or `WEIGHTED_MEAN`.
    """
    # presumably validates ragged inputs and flattens them - confirm against
    # metrics_utils.
    [values], sample_weight = \
        metrics_utils.ragged_assert_compatible_and_get_flat_values(
            [values], sample_weight)
    try:
        values = tf.cast(values, self._dtype)
    except (ValueError, TypeError):
        # Turn the cast failure into a more descriptive error for the user.
        msg = (
            'The output of a metric function can only be a single Tensor. '
            f'Received: {values}. ')
        if isinstance(values, dict):
            msg += (
                'To return a dict of values, implement a custom Metric '
                'subclass.')
        raise RuntimeError(msg)
    if sample_weight is not None:
        sample_weight = tf.cast(sample_weight, self._dtype)
        # Update dimensions of weights to match with values if possible.
        values, _, sample_weight = losses_utils.squeeze_or_expand_dimensions(
            values, sample_weight=sample_weight)
        try:
            # Broadcast weights if possible.
            sample_weight = tf.__internal__.ops.broadcast_weights(
                sample_weight, values)
        except ValueError:
            # Reduce values to same ndim as weight array
            ndim = backend.ndim(values)
            weight_ndim = backend.ndim(sample_weight)
            # Collapse the trailing axes of `values` beyond the weight rank:
            # summed for SUM, averaged for every other reduction.
            if self.reduction == metrics_utils.Reduction.SUM:
                values = tf.reduce_sum(
                    values, axis=list(range(weight_ndim, ndim)))
            else:
                values = tf.reduce_mean(
                    values, axis=list(range(weight_ndim, ndim)))
        values = tf.multiply(values, sample_weight)
    value_sum = tf.reduce_sum(values)
    # Create the `total` update only after `value_sum` has been computed.
    with tf.control_dependencies([value_sum]):
        update_total_op = self.total.assign_add(value_sum)
    # Exit early if the reduction doesn't have a denominator.
    if self.reduction == metrics_utils.Reduction.SUM:
        return update_total_op
    # Update `count` for reductions that require a denominator.
    if self.reduction == metrics_utils.Reduction.SUM_OVER_BATCH_SIZE:
        num_values = tf.cast(tf.size(values), self._dtype)
    elif self.reduction == metrics_utils.Reduction.WEIGHTED_MEAN:
        # Without weights every value counts once; with weights the
        # denominator is the sum of the weights.
        if sample_weight is None:
            num_values = tf.cast(tf.size(values), self._dtype)
        else:
            num_values = tf.reduce_sum(sample_weight)
    else:
        # NOTE(review): this is reached after the `total` update above has
        # already been issued.
        raise NotImplementedError(
            f'Reduction "{self.reduction}" not implemented. Expected '
            '"sum", "weighted_mean", or "sum_over_batch_size".')
    # Make the `count` update run after the `total` update.
    with tf.control_dependencies([update_total_op]):
        return self.count.assign_add(num_values)
def cosine_distance(x, y):
    """Returns one minus the cosine similarity of two same-shape tensors.

    Each input is L2-normalized with `tf.math.l2_normalize` (default axis);
    the normalized tensors are then multiplied elementwise and summed over
    all elements.
    """
    unit_x = tf.math.l2_normalize(x)
    unit_y = tf.math.l2_normalize(y)
    similarity = tf.reduce_sum(tf.multiply(unit_x, unit_y))
    return 1. - similarity
def multiply_tt(y):
    """Returns the mean of weights * responsibilities * log-pdf for index `y`.

    Reads `weights`, `responsabilities`, `X` and `self` from the enclosing
    scope.
    """
    weighted_resp = tf.multiply(
        weights[:, y, tf.newaxis], responsabilities[y])
    log_pdf = self.compute_log_pdf(X, y)
    return tf.reduce_mean(tf.multiply(weighted_resp, log_pdf))