def _double_factorial_loop_condition(n, result, two):
  del result  # Unused.
  return tf.cast(tf.math.count_nonzero(tf.greater_equal(n, two)), tf.bool)
def assign_and_sample_proposals(proposed_boxes,
                                gt_boxes,
                                gt_classes,
                                num_samples_per_image=512,
                                mix_gt_boxes=True,
                                fg_fraction=0.25,
                                fg_iou_thresh=0.5,
                                bg_iou_thresh_hi=0.5,
                                bg_iou_thresh_lo=0.0):
  """Assigns the proposals with groundtruth classes and performs subsampling.

  Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
  following algorithm to generate the final `num_samples_per_image` RoIs.
  1. Calculates the IoU between each proposal box and each gt_box.
  2. Assigns each proposed box with a groundtruth class and box by choosing
     the largest IoU overlap.
  3. Samples `num_samples_per_image` boxes from all proposed boxes, and
     returns box_targets, class_targets, and RoIs.

  Args:
    proposed_boxes: a tensor of shape of [batch_size, N, 4]. N is the number
      of proposals before groundtruth assignment. The last dimension is the
      box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
      format.
    gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
      coordinates of gt_boxes are in the pixel coordinates of the scaled
      image. This tensor might have padding of values -1 indicating the
      invalid box coordinates.
    gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
      tensor might have paddings with values of -1 indicating the invalid
      classes.
    num_samples_per_image: an integer representing the RoI minibatch size per
      image.
    mix_gt_boxes: a bool indicating whether to mix the groundtruth boxes
      before sampling proposals.
    fg_fraction: a float representing the target fraction of the RoI minibatch
      that is labeled foreground (i.e., class > 0).
    fg_iou_thresh: a float representing the IoU overlap threshold for an RoI
      to be considered foreground (if >= fg_iou_thresh).
    bg_iou_thresh_hi: a float representing the IoU overlap threshold for an
      RoI to be considered background (class = 0 if overlap in [LO, HI)).
    bg_iou_thresh_lo: a float representing the IoU overlap threshold for an
      RoI to be considered background (class = 0 if overlap in [LO, HI)).

  Returns:
    sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
      coordinates of the sampled RoIs, where K is the number of the sampled
      RoIs, i.e. K = num_samples_per_image.
    sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
      box coordinates of the matched groundtruth boxes of the sampled RoIs.
    sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
      classes of the matched groundtruth boxes of the sampled RoIs.
    sampled_gt_indices: a tensor of shape of [batch_size, K], storing the
      indices of the sampled groundtruth boxes in the original `gt_boxes`
      tensor, i.e. gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
  """
  with tf.name_scope('sample_proposals'):
    if mix_gt_boxes:
      boxes = tf.concat([proposed_boxes, gt_boxes], axis=1)
    else:
      boxes = proposed_boxes

    (matched_gt_boxes, matched_gt_classes, matched_gt_indices,
     matched_iou, _) = box_matching(boxes, gt_boxes, gt_classes)

    positive_match = tf.greater(matched_iou, fg_iou_thresh)
    negative_match = tf.logical_and(
        tf.greater_equal(matched_iou, bg_iou_thresh_lo),
        tf.less(matched_iou, bg_iou_thresh_hi))
    ignored_match = tf.less(matched_iou, 0.0)

    # Re-assign negatively matched boxes to the background class.
    matched_gt_classes = tf.where(negative_match,
                                  tf.zeros_like(matched_gt_classes),
                                  matched_gt_classes)
    matched_gt_indices = tf.where(negative_match,
                                  tf.zeros_like(matched_gt_indices),
                                  matched_gt_indices)

    sample_candidates = tf.logical_and(
        tf.logical_or(positive_match, negative_match),
        tf.logical_not(ignored_match))

    sampler = (
        balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
            positive_fraction=fg_fraction, is_static=True))

    batch_size, _ = sample_candidates.get_shape().as_list()
    sampled_indicators = []
    for i in range(batch_size):
      sampled_indicator = sampler.subsample(sample_candidates[i],
                                            num_samples_per_image,
                                            positive_match[i])
      sampled_indicators.append(sampled_indicator)
    sampled_indicators = tf.stack(sampled_indicators)

    _, sampled_indices = tf.nn.top_k(
        tf.cast(sampled_indicators, dtype=tf.int32),
        k=num_samples_per_image,
        sorted=True)

    sampled_indices_shape = tf.shape(sampled_indices)
    batch_indices = (
        tf.expand_dims(tf.range(sampled_indices_shape[0]), axis=-1) *
        tf.ones([1, sampled_indices_shape[-1]], dtype=tf.int32))
    gather_nd_indices = tf.stack([batch_indices, sampled_indices], axis=-1)

    sampled_rois = tf.gather_nd(boxes, gather_nd_indices)
    sampled_gt_boxes = tf.gather_nd(matched_gt_boxes, gather_nd_indices)
    sampled_gt_classes = tf.gather_nd(matched_gt_classes, gather_nd_indices)
    sampled_gt_indices = tf.gather_nd(matched_gt_indices, gather_nd_indices)

    return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
            sampled_gt_indices)
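# A minimal shape-level sketch (not part of the original module) of how the
# sampler above might be called from a detection model. The `rpn_rois`,
# `gt_boxes`, and `gt_classes` names here are illustrative placeholders, and
# `import tensorflow as tf` is assumed as elsewhere in this file.
sampled_rois, sampled_gt_boxes, sampled_gt_classes, sampled_gt_indices = (
    assign_and_sample_proposals(
        rpn_rois,     # [batch_size, N, 4] proposals from the RPN.
        gt_boxes,     # [batch_size, MAX_NUM_INSTANCES, 4], padded with -1.
        gt_classes,   # [batch_size, MAX_NUM_INSTANCES], padded with -1.
        num_samples_per_image=512,
        fg_fraction=0.25))
# All four outputs have a leading shape of [batch_size, 512, ...].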
def _build_outputs(self, images, labels, mode):
  is_training = mode == mode_keys.TRAIN
  model_outputs = {}

  if 'anchor_boxes' in labels:
    anchor_boxes = labels['anchor_boxes']
  else:
    anchor_boxes = anchor.Anchor(
        self._params.architecture.min_level,
        self._params.architecture.max_level,
        self._params.anchor.num_scales,
        self._params.anchor.aspect_ratios,
        self._params.anchor.anchor_size,
        images.get_shape().as_list()[1:3]).multilevel_boxes
    batch_size = tf.shape(images)[0]
    for level in anchor_boxes:
      anchor_boxes[level] = tf.tile(
          tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1])

  backbone_features = self._backbone_fn(images, is_training)
  fpn_features = self._fpn_fn(backbone_features, is_training)

  rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
      fpn_features, is_training)
  model_outputs.update({
      'rpn_score_outputs': rpn_score_outputs,
      'rpn_box_outputs': rpn_box_outputs,
  })
  rpn_rois, _ = self._generate_rois_fn(rpn_box_outputs, rpn_score_outputs,
                                       anchor_boxes,
                                       labels['image_info'][:, 1, :],
                                       is_training)

  if is_training:
    rpn_rois = tf.stop_gradient(rpn_rois)

    # Sample proposals.
    rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
        self._sample_rois_fn(rpn_rois, labels['gt_boxes'],
                             labels['gt_classes']))
    self.add_scalar_summary(
        'fg_bg_ratio_{}'.format(0),
        tf.reduce_sum(
            tf.cast(tf.greater(matched_gt_classes, 0), tf.float32)) /
        tf.reduce_sum(
            tf.cast(tf.greater_equal(matched_gt_classes, 0), tf.float32)))

    # Create bounding box training targets.
    box_targets = box_utils.encode_boxes(
        matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
    # If the target is background, the box target is set to all 0s.
    box_targets = tf.where(
        tf.tile(
            tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
            [1, 1, 4]),
        tf.zeros_like(box_targets),
        box_targets)
    model_outputs.update({
        'class_targets': matched_gt_classes,
        'box_targets': box_targets,
    })

  _, num_rois_before_cat, _ = rpn_rois.get_shape().as_list()
  if is_training and self._feat_distill:
    tf.logging.info(f'rois before concat distill boxes: {rpn_rois}')
    # [batch_size, num_rois + max_distill_rois, 4]
    rpn_rois = tf.concat([rpn_rois, labels['roi_boxes']], axis=1)
    tf.logging.info(f'rois after concat distill boxes: {rpn_rois}')

  roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_features, rpn_rois, output_size=7)

  if is_training and self._feat_distill:
    tf.logging.info(f'rois before split: {rpn_rois}')
    rpn_rois, _ = tf.split(
        rpn_rois, [num_rois_before_cat, self._max_distill_rois], axis=1)
    tf.logging.info(f'rois after split: {rpn_rois}')

  (class_outputs, box_outputs, distill_feat_outputs,
   distill_class_outputs) = self._frcnn_head_fn(roi_features, is_training)
  model_outputs.update({
      'class_outputs': class_outputs,
      'box_outputs': box_outputs,
  })
  if is_training and self._feat_distill:
    model_outputs.update({'distill_feat_outputs': distill_feat_outputs})

  if not is_training:
    detection_results = self._generate_detections_fn(
        box_outputs,
        class_outputs,
        rpn_rois,
        labels['image_info'][:, 1:2, :],
        bbox_per_class=not self._params.frcnn_head.class_agnostic_bbox_pred,
        distill_class_outputs=distill_class_outputs,
    )
    model_outputs.update(detection_results)

  if not self._include_mask:
    return model_outputs

  if is_training:
    rpn_rois, classes, mask_targets = self._sample_masks_fn(
        rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices,
        labels['gt_masks'])
    mask_targets = tf.stop_gradient(mask_targets)

    classes = tf.cast(classes, dtype=tf.int32)
    model_outputs.update({
        'mask_targets': mask_targets,
        'sampled_class_targets': classes,
    })
  else:
    rpn_rois = detection_results['detection_boxes']
    classes = tf.cast(detection_results['detection_classes'], dtype=tf.int32)

  mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_features, rpn_rois, output_size=14)
  mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes, is_training)

  if is_training:
    model_outputs.update({
        'mask_outputs': mask_outputs,
    })
  else:
    model_outputs.update({'detection_masks': tf.nn.sigmoid(mask_outputs)})

  return model_outputs
def _scan_step_fn(state, example, packed_length, queue_size, spacing,
                  num_sequences, token_dtype):  # pylint: disable=g-doc-args
  """Transform function used by tf.data.experimental.scan to process an example.

  This is written as a stateless function rather than a class method because we
  trace it with AutoGraph (in order to simplify the conditional), and this way
  we don't have to worry about handling re-tracing semantics.

  Args:
    See the SequenceDatasetPacker class.

  Returns:
    The updated queue state, and either a packed example or a dummy sequence
    which will be filtered out downstream.
  """
  # Convert TensorArray tuples to lists since we'll need to replace them.
  availability, contents, top_index = state

  lengths = tf.concat([tf.shape(i) for i in example], axis=0)
  start_availability = availability.stack()
  can_fit = tf.reduce_all(tf.greater_equal(start_availability, lengths),
                          axis=1)
  any_can_fit = tf.reduce_any(can_fit, axis=0)

  # AutoGraph will convert this block to a tf.cond
  if any_can_fit:
    # This indicates where in the FFD queue rotation a given index sits
    shifted_range = (
        tf.range(queue_size, dtype=INDEX_DTYPE) - top_index) % queue_size

    # Mark any indices which cannot accommodate the current example.
    exclusion_mask = tf.cast(tf.logical_not(can_fit), INDEX_DTYPE) * queue_size

    # Index in [0, queue_size) in which to place the sample. Note, this index
    # is the position in the actual TensorArray, not the index of the FFD
    # queue.
    queue_index = (tf.reduce_min(shifted_range + exclusion_mask) +
                   top_index) % queue_size

    # NOTE(taylorrobie): We emit a non-empty Tensor for downstream checks.
    output_contents = -tf.ones((1, num_sequences), dtype=token_dtype)

  else:
    index_range = top_index * packed_length + tf.range(packed_length)
    output_contents = contents.gather(index_range)

    # Reset the queue state.
    availability = availability.write(
        top_index,
        packed_length * tf.ones((num_sequences,), dtype=INDEX_DTYPE))
    empty_contents = tf.zeros((packed_length, num_sequences * 2),
                              dtype=token_dtype)
    contents = contents.scatter(index_range, empty_contents)

    queue_index = top_index
    top_index = (top_index + 1) % queue_size

  pre_assign_availability = availability.read(queue_index)
  space_left = pre_assign_availability - lengths - spacing
  availability = availability.write(queue_index, space_left)

  # ============================================================================
  # == Update contents =========================================================
  # ============================================================================
  # Consider the following case for a seq-to-seq packing:
  #   (padding is represented as underscores)
  #
  #   Queue starting state:
  #     [1, 3, 2, 4, 6, 1, _, _, _, _, _, ...]
  #     [5, 9, _, _, _, _, _, _, _, _, _, ...]
  #
  #   Examples:
  #     [4, 2, 4], [3]
  #
  #   Desired new queue state:
  #     [1, 3, 2, 4, 6, 1, _, _, 4, 2, 4, _, _, ...]
  #     [5, 9, _, _, 3, _, _, _, _, _, _, _, _, ...]
  #
  # This could be accomplished by creating a TensorArray for each of the two
  # sequences, and scattering into the respective arrays. However TensorArray
  # writes are extremely expensive relative to other operations. So instead we
  # store the contents in a single TensorArray of shape (packed_length, 2), and
  # we pad and concatenate the examples such that they can be added in a single
  # assign:
  #
  #        [_, _, _, _, 4, 2, 4]
  #        [3, _, _, _, _, _, _]
  #                 +
  #   [1, 3, 2, 4, 6, 1, _, _, _, _, _, ...]
  #   [5, 9, _, _, _, _, _, _, _, _, _, ...]
  #
  # And in practice, the extra work of padding is negligible compared to
  # the gain from vectorizing the TensorArray assign. We also store a bit mask
  # denoting where sequences start which is used to compute segment and
  # position metadata:
  #
  #        [_, _, _, _, 1, _, _]
  #        [1, _, _, _, _, _, _]
  #                 +
  #   [1, _, _, _, _, _, _, _, _, _, _, ...]
  #   [1, _, _, _, _, _, _, _, _, _, _, ...]
  #
  # Both the contents and the mask are concatenated in the same TensorArray
  # for performance.

  start_index = packed_length - pre_assign_availability
  end_index = start_index + lengths
  leftmost = tf.reduce_min(start_index, axis=0)
  rightmost = tf.reduce_max(end_index, axis=0)
  delta = rightmost - leftmost
  pad_indices = [
      tf.stack((start_index[i] - leftmost, rightmost - end_index[i]))
      for i in range(num_sequences)
  ]

  padded_examples = [
      tf.pad(ex, padding[tf.newaxis, :])
      for ex, padding in zip(example, pad_indices)
  ]
  padded_examples = tf.transpose(tf.stack(padded_examples))
  mask_update = tf.one_hot(start_index - leftmost, delta,
                           dtype=contents.dtype, axis=0)

  content_update = tf.concat([padded_examples, mask_update], axis=1)

  index_range = (
      queue_index * packed_length +  # Offset into the right section.
      tf.range(delta, dtype=INDEX_DTYPE) + leftmost)
  contents = contents.scatter(index_range,
                              contents.gather(index_range) + content_update)

  state = (availability, contents, top_index)
  return state, (tf.logical_not(any_can_fit), output_contents)
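# A small, self-contained sketch (not part of the original module) of the
# pad-and-concatenate idea described in the comments above: two new sequences
# are padded to a shared window and added to the packed buffer with one
# vectorized scatter instead of one TensorArray write per sequence. All values
# below are made-up illustrations.
import tensorflow as tf

packed_length = 10
buffer = tf.zeros((packed_length, 2), dtype=tf.int32)  # columns: inputs, targets
new_inputs = tf.constant([4, 2, 4])   # to be placed starting at row 4
new_targets = tf.constant([3])        # to be placed starting at row 2
start = tf.constant([4, 2])
end = start + tf.constant([3, 1])
leftmost = tf.reduce_min(start)
rightmost = tf.reduce_max(end)
# Pad both sequences to the common window [leftmost, rightmost).
padded = tf.stack([
    tf.pad(new_inputs, [[start[0] - leftmost, rightmost - end[0]]]),
    tf.pad(new_targets, [[start[1] - leftmost, rightmost - end[1]]]),
], axis=1)  # shape (rightmost - leftmost, 2)
rows = tf.range(leftmost, rightmost)
update = tf.scatter_nd(rows[:, tf.newaxis], padded, tf.shape(buffer))
buffer = buffer + update  # a single update covers both sequences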
def _parse_train_data(self, data): """Parse data for ShapeMask training.""" classes = data['groundtruth_classes'] boxes = data['groundtruth_boxes'] masks = data['groundtruth_instance_masks'] is_crowds = data['groundtruth_is_crowd'] # Skips annotations with `is_crowd` = True. if self._skip_crowd_during_training and self._is_training: num_groundtrtuhs = tf.shape(classes)[0] with tf.control_dependencies([num_groundtrtuhs, is_crowds]): indices = tf.cond( tf.greater(tf.size(is_crowds), 0), lambda: tf.where(tf.logical_not(is_crowds))[:, 0], lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64)) classes = tf.gather(classes, indices) boxes = tf.gather(boxes, indices) masks = tf.gather(masks, indices) # Gets original image and its size. image = data['image'] image_shape = tf.shape(image)[0:2] # If not using category, makes all categories with id = 0. if not self._use_category: classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32) # Normalizes image with mean and std pixel values. image = input_utils.normalize_image(image) # Flips image randomly during training. if self._aug_rand_hflip: image, boxes, masks = input_utils.random_horizontal_flip( image, boxes, masks) # Converts boxes from normalized coordinates to pixel coordinates. boxes = box_utils.denormalize_boxes(boxes, image_shape) # Resizes and crops image. image, image_info = input_utils.resize_and_crop_image( image, self._output_size, self._output_size, aug_scale_min=self._aug_scale_min, aug_scale_max=self._aug_scale_max) image_scale = image_info[2, :] offset = image_info[3, :] # Resizes and crops boxes and masks. boxes = input_utils.resize_and_crop_boxes(boxes, image_scale, self._output_size, offset) # Filters out ground truth boxes that are all zeros. indices = input_utils.get_non_empty_box_indices(boxes) boxes = tf.gather(boxes, indices) classes = tf.gather(classes, indices) masks = tf.gather(masks, indices) # Assigns anchors. input_anchor = anchor.Anchor(self._min_level, self._max_level, self._num_scales, self._aspect_ratios, self._anchor_size, self._output_size) anchor_labeler = anchor.AnchorLabeler(input_anchor, self._match_threshold, self._unmatched_threshold) (cls_targets, box_targets, num_positives) = anchor_labeler.label_anchors( boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32)) # Sample groundtruth masks/boxes/classes for mask branch. num_masks = tf.shape(masks)[0] mask_shape = tf.shape(masks)[1:3] # Pad sampled boxes/masks/classes to a constant batch size. padded_boxes = input_utils.pad_to_fixed_size(boxes, self._num_sampled_masks) padded_classes = input_utils.pad_to_fixed_size(classes, self._num_sampled_masks) padded_masks = input_utils.pad_to_fixed_size(masks, self._num_sampled_masks) # Randomly sample groundtruth masks for mask branch training. For the image # without groundtruth masks, it will sample the dummy padded tensors. rand_indices = tf.random.shuffle( tf.range(tf.maximum(num_masks, self._num_sampled_masks))) rand_indices = tf.mod(rand_indices, tf.maximum(num_masks, 1)) rand_indices = rand_indices[0:self._num_sampled_masks] rand_indices = tf.reshape(rand_indices, [self._num_sampled_masks]) sampled_boxes = tf.gather(padded_boxes, rand_indices) sampled_classes = tf.gather(padded_classes, rand_indices) sampled_masks = tf.gather(padded_masks, rand_indices) # Jitter the sampled boxes to mimic the noisy detections. 
sampled_boxes = box_utils.jitter_boxes( sampled_boxes, noise_scale=self._box_jitter_scale) sampled_boxes = box_utils.clip_boxes(sampled_boxes, self._output_size) # Compute mask targets in feature crop. A feature crop fully contains a # sampled box. mask_outer_boxes = box_utils.compute_outer_boxes( sampled_boxes, tf.shape(image)[0:2], scale=self._outer_box_scale) mask_outer_boxes = box_utils.clip_boxes(mask_outer_boxes, self._output_size) # Compensate the offset of mask_outer_boxes to map it back to original image # scale. mask_outer_boxes_ori = mask_outer_boxes mask_outer_boxes_ori += tf.tile(tf.expand_dims(offset, axis=0), [1, 2]) mask_outer_boxes_ori /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2]) norm_mask_outer_boxes_ori = box_utils.normalize_boxes( mask_outer_boxes_ori, mask_shape) # Set sampled_masks shape to [batch_size, height, width, 1]. sampled_masks = tf.cast(tf.expand_dims(sampled_masks, axis=-1), tf.float32) mask_targets = tf.image.crop_and_resize( sampled_masks, norm_mask_outer_boxes_ori, box_ind=tf.range(self._num_sampled_masks), crop_size=[self._mask_crop_size, self._mask_crop_size], method='bilinear', extrapolation_value=0, name='train_mask_targets') mask_targets = tf.where(tf.greater_equal(mask_targets, 0.5), tf.ones_like(mask_targets), tf.zeros_like(mask_targets)) mask_targets = tf.squeeze(mask_targets, axis=-1) if self._up_sample_factor > 1: fine_mask_targets = tf.image.crop_and_resize( sampled_masks, norm_mask_outer_boxes_ori, box_ind=tf.range(self._num_sampled_masks), crop_size=[ self._mask_crop_size * self._up_sample_factor, self._mask_crop_size * self._up_sample_factor ], method='bilinear', extrapolation_value=0, name='train_mask_targets') fine_mask_targets = tf.where( tf.greater_equal(fine_mask_targets, 0.5), tf.ones_like(fine_mask_targets), tf.zeros_like(fine_mask_targets)) fine_mask_targets = tf.squeeze(fine_mask_targets, axis=-1) else: fine_mask_targets = mask_targets # If bfloat16 is used, casts input image to tf.bfloat16. if self._use_bfloat16: image = tf.cast(image, dtype=tf.bfloat16) valid_image = tf.cast(tf.not_equal(num_masks, 0), tf.int32) if self._mask_train_class == 'all': mask_is_valid = valid_image * tf.ones_like(sampled_classes, tf.int32) else: # Get the intersection of sampled classes with training splits. mask_valid_classes = tf.cast( tf.expand_dims( class_utils.coco_split_class_ids(self._mask_train_class), 1), sampled_classes.dtype) match = tf.reduce_any( tf.equal(tf.expand_dims(sampled_classes, 0), mask_valid_classes), 0) mask_is_valid = valid_image * tf.cast(match, tf.int32) # Packs labels for model_fn outputs. labels = { 'cls_targets': cls_targets, 'box_targets': box_targets, 'anchor_boxes': input_anchor.multilevel_boxes, 'num_positives': num_positives, 'image_info': image_info, # For ShapeMask. 'mask_boxes': sampled_boxes, 'mask_outer_boxes': mask_outer_boxes, 'mask_targets': mask_targets, 'fine_mask_targets': fine_mask_targets, 'mask_classes': sampled_classes, 'mask_is_valid': mask_is_valid, } return image, labels
def gte(self, x, y):
  return tf.greater_equal(x, y)
def _update_random_mask(self, weights, mask):
  """Randomly identifies a subset of weights to be set to zero in the network.

  If the pruning method is specified as 'random_cumulative', at each pruning
  step a random subset of weights is set to zero taking into account which
  weights are still non-zero. If the pruning method is specified to be
  'random_independent', the random weights selected at each pruning step are
  entirely independent of previous pruning steps.

  Args:
    weights: The weight tensor that needs to be masked.
    mask: The mask from the previous pruning update.

  Returns:
    new_mask: A tensor of the same size and shape as weights containing 0 or 1.

  Raises:
    ValueError: Raises ValueError if sparsity is not defined
  """
  if self._sparsity is None:
    raise ValueError('Sparsity variable undefined')
  sparsity = self._get_sparsity(weights.op.name)

  with tf.name_scope(weights.op.name + '_pruning_ops'):
    if self._pruning_method == 'random_cumulative':
      # Compute the total number of weights in the layer.
      total_weights = tf.size(weights)
      mask = tf.reshape(mask, [total_weights])

      # Add a random vector because if there are ties, sort simply selects
      # based upon index position (starts from beginning of vector).
      random_noise = tf.random_uniform(
          shape=mask.shape, minval=0.0001, maxval=0.0003)
      mask = tf.cast(tf.add(random_noise, mask), tf.float32)

      # Rank the binary mask by magnitude. Weights already on are selected,
      # plus a random subset of all other weights.
      sorted_mask = sort(mask, direction='DESCENDING')

      # Multiply the desired sparsity fraction by the number of weights.
      num_weights = tf.reshape(
          tf.cast(tf.cast(total_weights, tf.float32) * sparsity, tf.int32),
          [1])
      percentile = tf.gather_nd(sorted_mask, num_weights)

      one_mask = tf.ones([total_weights])
      zero_mask = tf.zeros([total_weights])
      feature_ranking = tf.where(
          tf.greater_equal(percentile, mask), one_mask, zero_mask)
      new_mask = tf.reshape(feature_ranking, weights.get_shape())
    else:
      drop_out = tf.nn.dropout(
          tf.ones_like(weights), keep_prob=(1. - self._sparsity))
      new_mask = tf.cast(drop_out, tf.float32)

  return self._sparsity, new_mask
0.70710677 * temp_3
S1.append(tf.concat([S1_real, S1_im], 1))
x_ind_reshaped = tf.reshape(X_IND, [batch_size, 4 * K])
LOSS.append(
    np.log(i) * tf.reduce_mean(
        tf.reduce_mean(tf.square(x_ind_reshaped - S2[-1]), 1)))
BER.append(
    tf.reduce_mean(
        tf.cast(
            tf.logical_or(
                tf.not_equal(tf.sign(x_real), tf.sign(S1[-1][:, 0:K])),
                tf.not_equal(tf.sign(x_imag), tf.sign(S1[-1][:, K:2 * K]))),
            tf.float32)))

Max_Val = tf.reduce_max(S3, axis=2, keep_dims=True)
Greater = tf.greater_equal(S3, Max_Val)
BER2 = tf.round(tf.cast(Greater, tf.float32))
BER3 = tf.not_equal(BER2, X_IND)
BER4 = tf.reduce_sum(tf.cast(BER3, tf.float32), 2)
BER5 = tf.cast(tf.greater(BER4, 0), tf.float32)
SER = tf.reduce_mean(BER5)

TOTAL_LOSS = tf.add_n(LOSS)

saver = tf.train.Saver()
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(startingLearningRate, global_step,
                                           decay_step, decay_factor,
                                           staircase=True)
def test_ge(self):
  input1 = tf.placeholder(shape=(4, 32, 32, 3), dtype=tf.float32)
  input2 = tf.placeholder(shape=(4, 32, 32, 3), dtype=tf.float32)
  output = tf.greater_equal(input1, input2)
  self._test_conversion('ge', [input1, input2], [output])
def loss(self, prediction_dict):
    """
    Returns cost for RCNN based on:

    Args:
        prediction_dict with keys:
            rcnn:
                cls_score: shape (num_proposals, num_classes + 1)
                    Has the class scoring for each of the proposals. Classes
                    are 1-indexed with 0 being the background.

                cls_prob: shape (num_proposals, num_classes + 1)
                    Application of softmax on cls_score.

                bbox_offsets: shape (num_proposals, num_classes * 4)
                    Has the offset for each proposal for each class.
                    We have to compare only the proposals labeled with the
                    offsets for that label.

            target:
                cls_target: shape (num_proposals,)
                    Has the correct label for each of the proposals.
                    0 => background
                    1..n => 1-indexed classes

                bbox_offsets_target: shape (num_proposals, 4)
                    Has the true offset of each proposal for the true
                    label. In case of not having a true label
                    (non-background) then it's just zeroes.

    Returns:
        loss_dict with keys:
            rcnn_cls_loss: The cross-entropy or log-loss of the
                classification tasks between the num_classes + background.
            rcnn_reg_loss: The smooth L1 loss for the bounding box
                regression task to adjust correctly labeled boxes.
    """
    with tf.name_scope('RCNNLoss'):
        cls_score = prediction_dict['rcnn']['cls_score']
        # cls_prob = prediction_dict['rcnn']['cls_prob']
        # Cast target explicitly as int32.
        cls_target = tf.cast(
            prediction_dict['target']['cls'], tf.int32
        )

        # First we need to calculate the log loss between cls_prob and
        # cls_target.

        # We only care for the targets that are >= 0
        not_ignored = tf.reshape(tf.greater_equal(
            cls_target, 0), [-1], name='not_ignored')
        # We apply boolean mask to score, prob and target.
        cls_score_labeled = tf.boolean_mask(
            cls_score, not_ignored, name='cls_score_labeled')
        # cls_prob_labeled = tf.boolean_mask(
        #     cls_prob, not_ignored, name='cls_prob_labeled')
        cls_target_labeled = tf.boolean_mask(
            cls_target, not_ignored, name='cls_target_labeled')

        tf.summary.scalar(
            'batch_size',
            tf.shape(cls_score_labeled)[0], ['rcnn']
        )

        # Transform to one-hot vector
        cls_target_one_hot = tf.one_hot(
            cls_target_labeled, depth=self._num_classes + 1,
            name='cls_target_one_hot'
        )

        # We get cross entropy loss of each proposal.
        cross_entropy_per_proposal = (
            tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=tf.stop_gradient(cls_target_one_hot),
                logits=cls_score_labeled
            )
        )

        if self._debug:
            prediction_dict['_debug']['losses'] = {}
            # Save the cross entropy per proposal to be able to
            # visualize proposals with high and low error.
            prediction_dict['_debug']['losses'][
                'cross_entropy_per_proposal'
            ] = (
                cross_entropy_per_proposal
            )

        # Second we need to calculate the smooth l1 loss between
        # `bbox_offsets` and `bbox_offsets_target`.
        bbox_offsets = prediction_dict['rcnn']['bbox_offsets']
        bbox_offsets_target = (
            prediction_dict['target']['bbox_offsets']
        )

        # We only want the non-background labels bounding boxes.
        not_ignored = tf.reshape(tf.greater(cls_target, 0), [-1])
        bbox_offsets_labeled = tf.boolean_mask(
            bbox_offsets, not_ignored, name='bbox_offsets_labeled')
        bbox_offsets_target_labeled = tf.boolean_mask(
            bbox_offsets_target, not_ignored,
            name='bbox_offsets_target_labeled'
        )

        cls_target_labeled = tf.boolean_mask(
            cls_target, not_ignored, name='cls_target_labeled')
        # `cls_target_labeled` is based on `cls_target` which has
        # `num_classes` + 1 classes.
        # For making `one_hot` with depth `num_classes` work we need
        # to lower them to make them 0-indexed.
        cls_target_labeled = cls_target_labeled - 1

        cls_target_one_hot = tf.one_hot(
            cls_target_labeled, depth=self._num_classes,
            name='cls_target_one_hot'
        )

        # cls_target now is (num_labeled, num_classes)
        bbox_flatten = tf.reshape(
            bbox_offsets_labeled, [-1, 4], name='bbox_flatten')

        # We use the flatten cls_target_one_hot as boolean mask for the
        # bboxes.
        cls_flatten = tf.cast(tf.reshape(
            cls_target_one_hot, [-1]), tf.bool, 'cls_flatten_as_bool')

        bbox_offset_cleaned = tf.boolean_mask(
            bbox_flatten, cls_flatten, 'bbox_offset_cleaned')

        # Calculate the smooth l1 loss between the "cleaned" bboxes
        # offsets (that means, the useful results) and the labeled
        # targets.
        reg_loss_per_proposal = smooth_l1_loss(
            bbox_offset_cleaned, bbox_offsets_target_labeled,
            sigma=self._l1_sigma
        )

        tf.summary.scalar(
            'rcnn_foreground_samples',
            tf.shape(bbox_offset_cleaned)[0], ['rcnn']
        )

        if self._debug:
            # Also save reg loss per proposal to be able to visualize
            # good and bad proposals in debug mode.
            prediction_dict['_debug']['losses'][
                'reg_loss_per_proposal'
            ] = (
                reg_loss_per_proposal
            )

        return {
            'rcnn_cls_loss': tf.reduce_mean(cross_entropy_per_proposal),
            'rcnn_reg_loss': tf.reduce_mean(reg_loss_per_proposal),
        }
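# The `smooth_l1_loss` helper used above is defined elsewhere in the codebase.
# For reference, here is a minimal sketch of the standard sigma-weighted smooth
# L1 formulation (as in Fast R-CNN); the project's actual implementation may
# differ in details such as reduction.
def smooth_l1_loss_sketch(bbox_prediction, bbox_target, sigma=3.0):
    """0.5*(sigma*x)^2 if |x| < 1/sigma^2, else |x| - 0.5/sigma^2, per coord."""
    sigma2 = sigma ** 2
    diff = bbox_prediction - bbox_target
    abs_diff = tf.abs(diff)
    below = tf.cast(tf.less(abs_diff, 1.0 / sigma2), tf.float32)
    per_coord = (below * 0.5 * sigma2 * tf.square(diff) +
                 (1.0 - below) * (abs_diff - 0.5 / sigma2))
    return tf.reduce_sum(per_coord, axis=1)  # one loss value per proposal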
def _build(self, all_anchors, gt_boxes, im_shape): """ We compare anchors to GT and using the minibatch size and the different config settings (clobber, foreground fraction, etc), we end up with training targets *only* for the elements we want to use in the batch, while everything else is ignored. Basically what it does is, first generate the targets for all (valid) anchors, and then start subsampling the positive (foreground) and the negative ones (background) based on the number of samples of each type that we want. Args: all_anchors: A Tensor with all the bounding boxes coords of the anchors. Its shape should be (num_anchors, 4). gt_boxes: A Tensor with the ground truth bounding boxes of the image of the batch being processed. Its shape should be (num_gt, 5). The last dimension is used for the label. im_shape: Shape of original image (height, width) in order to define anchor targers in respect with gt_boxes. Returns: Tuple of the tensors of: labels: (1, 0, -1) for each anchor. Shape (num_anchors, 1) bbox_targets: 4d bbox targets as specified by paper. Shape (num_anchors, 4) max_overlaps: Max IoU overlap with ground truth boxes. Shape (num_anchors, 1) """ # Keep only the coordinates of gt_boxes gt_boxes = gt_boxes[:, :4] all_anchors = all_anchors[:, :4] # Only keep anchors inside the image (x_min_anchor, y_min_anchor, x_max_anchor, y_max_anchor) = tf.unstack(all_anchors, axis=1) anchor_filter = tf.logical_and( tf.logical_and( tf.greater_equal(x_min_anchor, -self._allowed_border), tf.greater_equal(y_min_anchor, -self._allowed_border)), tf.logical_and( tf.less(x_max_anchor, im_shape[1] + self._allowed_border), tf.less(y_max_anchor, im_shape[0] + self._allowed_border))) # We (force) reshape the filter so that we can use it as a boolean mask anchor_filter = tf.reshape(anchor_filter, [-1]) # Filter anchors. anchors = tf.boolean_mask(all_anchors, anchor_filter, name='filter_anchors') # Generate array with the labels for all_anchors. labels = tf.fill((tf.gather(tf.shape(all_anchors), [0])), -1) labels = tf.boolean_mask(labels, anchor_filter, name='filter_labels') # Intersection over union (IoU) overlap between the anchors and the # ground truth boxes. overlaps = bbox_overlap_tf(tf.to_float(anchors), tf.to_float(gt_boxes)) # Generate array with the IoU value of the closest GT box for each # anchor. max_overlaps = tf.reduce_max(overlaps, axis=1) if not self._clobber_positives: # Assign bg labels first so that positive labels can clobber them. # First we get an array with True where IoU is less than # self._negative_overlap negative_overlap_nonzero = tf.less(max_overlaps, self._negative_overlap) # Finally we set 0 at True indices labels = tf.where(condition=negative_overlap_nonzero, x=tf.zeros(tf.shape(labels)), y=tf.to_float(labels)) # Get the value of the max IoU for the closest anchor for each gt. gt_max_overlaps = tf.reduce_max(overlaps, axis=0) # Find all the indices that match (at least one, but could be more). gt_argmax_overlaps = tf.squeeze(tf.equal(overlaps, gt_max_overlaps)) gt_argmax_overlaps = tf.where(gt_argmax_overlaps)[:, 0] # Eliminate duplicates indices. gt_argmax_overlaps, _ = tf.unique(gt_argmax_overlaps) # Order the indices for sparse_to_dense compatibility gt_argmax_overlaps, _ = tf.nn.top_k(gt_argmax_overlaps, k=tf.shape(gt_argmax_overlaps)[-1]) gt_argmax_overlaps = tf.reverse(gt_argmax_overlaps, [0]) # Foreground label: for each ground-truth, anchor with highest overlap. # When the argmax is many items we use all of them (for consistency). 
# We set 1 at gt_argmax_overlaps_cond indices gt_argmax_overlaps_cond = tf.sparse_to_dense(gt_argmax_overlaps, tf.shape( labels, out_type=tf.int64), True, default_value=False) labels = tf.where(condition=gt_argmax_overlaps_cond, x=tf.ones(tf.shape(labels)), y=tf.to_float(labels)) # Foreground label: above threshold Intersection over Union (IoU) # First we get an array with True where IoU is greater or equal than # self._positive_overlap positive_overlap_inds = tf.greater_equal(max_overlaps, self._positive_overlap) # Finally we set 1 at True indices labels = tf.where(condition=positive_overlap_inds, x=tf.ones(tf.shape(labels)), y=labels) if self._clobber_positives: # Assign background labels last so that negative labels can clobber # positives. First we get an array with True where IoU is less than # self._negative_overlap negative_overlap_nonzero = tf.less(max_overlaps, self._negative_overlap) # Finally we set 0 at True indices labels = tf.where(condition=negative_overlap_nonzero, x=tf.zeros(tf.shape(labels)), y=labels) # Subsample positive labels if we have too many def subsample_positive(): # Shuffle the foreground indices disable_fg_inds = tf.random_shuffle(fg_inds, seed=self._seed) # Select the indices that we have to ignore, this is # `tf.shape(fg_inds)[0] - num_fg` because we want to get only # `num_fg` foreground labels. disable_place = (tf.shape(fg_inds)[0] - num_fg) disable_fg_inds = disable_fg_inds[:disable_place] # Order the indices for sparse_to_dense compatibility disable_fg_inds, _ = tf.nn.top_k(disable_fg_inds, k=tf.shape(disable_fg_inds)[-1]) disable_fg_inds = tf.reverse(disable_fg_inds, [0]) disable_fg_inds = tf.sparse_to_dense(disable_fg_inds, tf.shape(labels, out_type=tf.int64), True, default_value=False) # Put -1 to ignore the anchors in the selected indices return tf.where(condition=tf.squeeze(disable_fg_inds), x=tf.to_float(tf.fill(tf.shape(labels), -1)), y=labels) num_fg = tf.to_int32(self._foreground_fraction * self._minibatch_size) # Get foreground indices, get True in the indices where we have a one. fg_inds = tf.equal(labels, 1) # We get only the indices where we have True. fg_inds = tf.squeeze(tf.where(fg_inds), axis=1) fg_inds_size = tf.size(fg_inds) # Condition for check if we have too many positive labels. subsample_positive_cond = fg_inds_size > num_fg # Check the condition and subsample positive labels. labels = tf.cond(subsample_positive_cond, true_fn=subsample_positive, false_fn=lambda: labels) # Subsample negative labels if we have too many def subsample_negative(): # Shuffle the background indices disable_bg_inds = tf.random_shuffle(bg_inds, seed=self._seed) # Select the indices that we have to ignore, this is # `tf.shape(bg_inds)[0] - num_bg` because we want to get only # `num_bg` background labels. disable_place = (tf.shape(bg_inds)[0] - num_bg) disable_bg_inds = disable_bg_inds[:disable_place] # Order the indices for sparse_to_dense compatibility disable_bg_inds, _ = tf.nn.top_k(disable_bg_inds, k=tf.shape(disable_bg_inds)[-1]) disable_bg_inds = tf.reverse(disable_bg_inds, [0]) disable_bg_inds = tf.sparse_to_dense(disable_bg_inds, tf.shape(labels, out_type=tf.int64), True, default_value=False) # Put -1 to ignore the anchors in the selected indices return tf.where(condition=tf.squeeze(disable_bg_inds), x=tf.to_float(tf.fill(tf.shape(labels), -1)), y=labels) # Recalculate the foreground indices after (maybe) disable some of them # Get foreground indices, get True in the indices where we have a one. 
fg_inds = tf.equal(labels, 1) # We get only the indices where we have True. fg_inds = tf.squeeze(tf.where(fg_inds), axis=1) fg_inds_size = tf.size(fg_inds) num_bg = tf.to_int32(self._minibatch_size - fg_inds_size) # Get background indices, get True in the indices where we have a zero. bg_inds = tf.equal(labels, 0) # We get only the indices where we have True. bg_inds = tf.squeeze(tf.where(bg_inds), axis=1) bg_inds_size = tf.size(bg_inds) # Condition for check if we have too many positive labels. subsample_negative_cond = bg_inds_size > num_bg # Check the condition and subsample positive labels. labels = tf.cond(subsample_negative_cond, true_fn=subsample_negative, false_fn=lambda: labels) # Return bbox targets with shape (anchors.shape[0], 4). # Find the closest gt box for each anchor. argmax_overlaps = tf.argmax(overlaps, axis=1) # Eliminate duplicates. argmax_overlaps_unique, _ = tf.unique(argmax_overlaps) # Filter the gt_boxes. # We get only the indices where we have "inside anchors". anchor_filter_inds = tf.where(anchor_filter) gt_boxes = tf.gather(gt_boxes, argmax_overlaps) bbox_targets = encode_tf(anchors, gt_boxes) # For the anchors that arent foreground, we ignore the bbox_targets. anchor_foreground_filter = tf.equal(labels, 1) bbox_targets = tf.where(condition=anchor_foreground_filter, x=bbox_targets, y=tf.zeros_like(bbox_targets)) # We unroll "inside anchors" value for all anchors (for shape # compatibility). # We complete the missed indices with zeros # (because scatter_nd has zeros as default). bbox_targets = tf.scatter_nd(indices=tf.to_int32(anchor_filter_inds), updates=bbox_targets, shape=tf.shape(all_anchors)) labels_scatter = tf.scatter_nd(indices=tf.to_int32(anchor_filter_inds), updates=labels, shape=[tf.shape(all_anchors)[0]]) # We have to put -1 to ignore the indices with 0 generated by # scatter_nd, otherwise it will be considered as background. labels = tf.where(condition=anchor_filter, x=labels_scatter, y=tf.to_float(tf.fill(tf.shape(labels_scatter), -1))) max_overlaps = tf.scatter_nd(indices=tf.to_int32(anchor_filter_inds), updates=max_overlaps, shape=[tf.shape(all_anchors)[0]]) return labels, bbox_targets, max_overlaps
def _build(self, conv_feature_map, proposals, im_shape, base_network,
           gt_boxes=None, is_training=False):
    """
    Classifies & refines proposals based on the pooled feature map.

    Args:
        conv_feature_map: The feature map of the image, extracted using the
            pretrained network.
            Shape: (num_proposals, pool_height, pool_width, 512).
        proposals: A Tensor with the bounding boxes proposed by the RPN.
            Shape: (total_num_proposals, 4).
            Encoding: (x1, y1, x2, y2).
        im_shape: A Tensor with the shape of the image in the form of
            (image_height, image_width).
        gt_boxes (optional): A Tensor with the ground truth boxes of the
            image.
            Shape: (total_num_gt, 5).
            Encoding: (x1, y1, x2, y2, label).
        is_training (optional): A boolean to determine if we are using the
            module for training or just inference.

    Returns:
        prediction_dict: a dict with the object predictions.
            It should have the keys:
            objects:
            labels:
            probs:

            rcnn:
            target:
    """
    self._instantiate_layers()

    prediction_dict = {'_debug': {}}

    if gt_boxes is not None:
        proposals_target, bbox_offsets_target = self._rcnn_target(
            proposals, gt_boxes)

        if is_training:
            with tf.name_scope('prepare_batch'):
                # We flatten to set shape, but it is already a flat Tensor.
                in_batch_proposals = tf.reshape(
                    tf.greater_equal(proposals_target, 0), [-1]
                )
                proposals = tf.boolean_mask(
                    proposals, in_batch_proposals)
                bbox_offsets_target = tf.boolean_mask(
                    bbox_offsets_target, in_batch_proposals)
                proposals_target = tf.boolean_mask(
                    proposals_target, in_batch_proposals)

        prediction_dict['target'] = {
            'cls': proposals_target,
            'bbox_offsets': bbox_offsets_target,
        }

    roi_prediction = self._roi_pool(proposals, conv_feature_map, im_shape)

    if self._debug:
        # Save raw roi prediction in debug mode.
        prediction_dict['_debug']['roi'] = roi_prediction

    pooled_features = roi_prediction['roi_pool']
    features = base_network._build_tail(
        pooled_features, is_training=is_training
    )

    if self._use_mean:
        # We avg our height and width dimensions for a more
        # "memory-friendly" Tensor.
        features = tf.reduce_mean(features, [1, 2])

    # We treat num proposals as batch number so that when flattening we
    # get a (num_proposals, flatten_pooled_feature_map_size) Tensor.
    flatten_features = tf.contrib.layers.flatten(features)
    net = tf.identity(flatten_features)

    if is_training:
        net = tf.nn.dropout(net, keep_prob=self._dropout_keep_prob)

    if self._debug:
        prediction_dict['_debug']['flatten_net'] = net

    # After flattening we are left with a Tensor of shape
    # (num_proposals, pool_height * pool_width * 512).
    # The first dimension works as batch size when applied to snt.Linear.
    for i, layer in enumerate(self._layers):
        # Through FC layer.
        net = layer(net)

        # Apply activation and dropout.
        variable_summaries(
            net, 'fc_{}_preactivationout'.format(i), 'reduced'
        )
        net = self._activation(net)
        if self._debug:
            prediction_dict['_debug']['layer_{}_out'.format(i)] = net
        variable_summaries(net, 'fc_{}_out'.format(i), 'reduced')
        if is_training:
            net = tf.nn.dropout(net, keep_prob=self._dropout_keep_prob)

    cls_score = self._classifier_layer(net)
    cls_prob = tf.nn.softmax(cls_score, axis=1)
    bbox_offsets = self._bbox_layer(net)

    prediction_dict['rcnn'] = {
        'cls_score': cls_score,
        'cls_prob': cls_prob,
        'bbox_offsets': bbox_offsets,
    }

    # Get final object proposals based on the probability, the offsets and
    # the original proposals.
    proposals_pred = self._rcnn_proposal(
        proposals, bbox_offsets, cls_prob, im_shape)

    # objects, objects_labels, and objects_labels_prob are the only keys
    # that matter for drawing objects.
    prediction_dict['objects'] = proposals_pred['objects']
    prediction_dict['labels'] = proposals_pred['proposal_label']
    prediction_dict['probs'] = proposals_pred['proposal_label_prob']

    if self._debug:
        prediction_dict['_debug']['proposal'] = proposals_pred

    # Calculate summaries for results
    variable_summaries(cls_prob, 'cls_prob', 'reduced')
    variable_summaries(bbox_offsets, 'bbox_offsets', 'reduced')

    if self._debug:
        variable_summaries(pooled_features, 'pooled_features', 'full')
        layer_summaries(self._classifier_layer, 'full')
        layer_summaries(self._bbox_layer, 'full')

    return prediction_dict
def _compute_inner_update_onsbet(self, var, grad, state):
  update_ops = []

  eta = tf.cast(state.get_hyper(ETA), var.dtype.base_dtype)
  betting_domain = tf.cast(
      state.get_hyper(BETTING_DOMAIN), var.dtype.base_dtype)

  wealth = state.get_slot(var, INNER_WEALTH)
  betting_fraction = state.get_slot(var, OUTER_BETTING_FRACTION)
  inner_betting_fraction = state.get_slot(var, INNER_BETTING_FRACTION)
  sum_grad_squared = state.get_slot(var, INNER_SUM_GRAD_SQUARED)
  inner_maximum_gradient = state.get_slot(var, INNER_MAXIMUM_GRADIENT)

  inner_maximum_gradient_updated = self._assign(
      inner_maximum_gradient, tf.maximum(inner_maximum_gradient,
                                         tf.abs(grad)))
  update_ops.append(inner_maximum_gradient_updated)

  clipped_old_betting_fraction = tf.clip_by_value(betting_fraction,
                                                  -betting_domain,
                                                  betting_domain)

  # Process grad to respect truncation to [-betting_domain, betting_domain]
  truncated_grad = tf.where(
      tf.greater_equal(
          grad * (betting_fraction - clipped_old_betting_fraction), 0),
      grad, tf.zeros(tf.shape(grad)))

  wealth_delta = -betting_fraction * truncated_grad
  wealth_updated = self._assign_add(wealth, wealth_delta)
  update_ops.append(wealth_updated)

  # This is the gradient with respect to the betting fraction v
  # used by the ONS algorithm - a kind of "inner inner grad".
  # Heuristic: We also scale v_grad down by the inner maximum gradient so as
  # to make it ``unitless''. This is helpful because the learning rate for
  # ONS is proportional to sum v_grad**2, and so the scale of the learning
  # rate and of v_grad are unlikely to be properly matched without this.
  if self.rescale_inner:
    v_grad = truncated_grad / (
        (1.0 - inner_betting_fraction * truncated_grad) *
        inner_maximum_gradient_updated)
  else:
    v_grad = truncated_grad / (
        (1.0 - inner_betting_fraction * truncated_grad))

  sum_grad_squared_updated = self._assign_add(sum_grad_squared,
                                              tf.square(v_grad))
  update_ops.append(sum_grad_squared_updated)

  new_inner_betting_fraction = inner_betting_fraction - eta * v_grad / (
      sum_grad_squared_updated)
  new_inner_betting_fraction = tf.clip_by_value(new_inner_betting_fraction,
                                                -betting_domain,
                                                betting_domain)
  inner_betting_fraction_updated = self._assign(inner_betting_fraction,
                                                new_inner_betting_fraction)
  update_ops.append(inner_betting_fraction_updated)

  if self.output_summaries:
    mean_inner_betting_fraction_summary = tf.reduce_mean(
        tf.abs(inner_betting_fraction_updated))
    max_inner_betting_fraction_summary = tf.reduce_max(
        tf.abs(inner_betting_fraction_updated))

    inner_maximum_gradient_summary = tf.reduce_max(
        inner_maximum_gradient_updated)

    tf.summary.scalar(self._name + "/mean_inner_betting/" + var.name,
                      mean_inner_betting_fraction_summary)
    tf.summary.scalar(self._name + "/max_inner_betting/" + var.name,
                      max_inner_betting_fraction_summary)
    tf.summary.scalar(self._name + "/inner_maximum_gradient/" + var.name,
                      inner_maximum_gradient_summary)

  betting_fraction_updated = self._assign(
      betting_fraction, inner_betting_fraction_updated * wealth_updated)
  update_ops.append(betting_fraction_updated)

  clipped_betting_fraction = tf.clip_by_value(betting_fraction_updated,
                                              -betting_domain, betting_domain)

  return clipped_betting_fraction, tf.group(*update_ops)
def _compute_inner_update_scinol(self, var, grad, state):
  update_ops = []

  betting_domain = tf.cast(
      state.get_hyper(BETTING_DOMAIN), var.dtype.base_dtype)

  reward = state.get_slot(var, INNER_REWARD)
  betting_fraction = state.get_slot(var, OUTER_BETTING_FRACTION)
  sum_grad_squared = state.get_slot(var, INNER_SUM_GRAD_SQUARED)
  sum_grad = state.get_slot(var, INNER_SUM_GRAD)
  inner_maximum_gradient = state.get_slot(var, INNER_MAXIMUM_GRADIENT)

  # Clip the inner gradient to respect the previous inner_maximum_gradient
  # value. This introduces at most an additive constant overhead in the regret
  # since the inner betting fraction lies in a bounded domain.
  clipped_grad = tf.clip_by_value(grad, -inner_maximum_gradient,
                                  inner_maximum_gradient)

  with tf.control_dependencies([clipped_grad]):
    inner_maximum_gradient_updated = self._assign(
        inner_maximum_gradient,
        tf.maximum(inner_maximum_gradient, tf.abs(grad)))
    update_ops.append(inner_maximum_gradient_updated)

  clipped_old_betting_fraction = tf.clip_by_value(betting_fraction,
                                                  -betting_domain,
                                                  betting_domain)

  # Process grad to respect truncation to [-betting_domain, betting_domain]
  truncated_grad = tf.where(
      tf.greater_equal(
          clipped_grad * (betting_fraction - clipped_old_betting_fraction),
          0.0), clipped_grad, tf.zeros(tf.shape(clipped_grad)))

  reward_delta = -betting_fraction * truncated_grad
  reward_updated = self._assign_add(reward, reward_delta)
  update_ops.append(reward_updated)

  sum_grad_squared_updated = self._assign_add(sum_grad_squared,
                                              tf.square(truncated_grad))
  update_ops.append(sum_grad_squared_updated)

  sum_grad_updated = self._assign_add(sum_grad, truncated_grad)
  update_ops.append(sum_grad_updated)

  # The second term in this maximum, inner_maximum_gradient_updated / self.eta
  # is a hack to force the betting fraction to not be too big at first.
  scaling = tf.minimum(
      tf.rsqrt(sum_grad_squared_updated +
               tf.square(inner_maximum_gradient_updated)),
      self.eta / inner_maximum_gradient_updated)
  theta = -sum_grad_updated * scaling

  # The rescale_inner flag is a hack that rescales epsilon_v by the
  # maximum inner gradient.
  if self.rescale_inner:
    epsilon_scaling = inner_maximum_gradient_updated
  else:
    epsilon_scaling = 1.0

  inner_betting_fraction = tf.sign(theta) * tf.minimum(tf.abs(theta),
                                                       1.0) * scaling / 2.0
  new_betting_fraction = inner_betting_fraction * (
      reward_updated + epsilon_scaling * self.epsilon_v)

  betting_fraction_updated = self._assign(betting_fraction,
                                          new_betting_fraction)
  update_ops.append(betting_fraction_updated)

  clipped_betting_fraction = tf.clip_by_value(betting_fraction_updated,
                                              -betting_domain, betting_domain)

  if self.output_summaries:
    mean_unclipped_betting_fraction_summary = tf.reduce_mean(
        tf.abs(betting_fraction_updated))
    max_unclipped_betting_fraction_summary = tf.reduce_max(
        tf.abs(betting_fraction_updated))

    mean_clipped_betting_fraction_summary = tf.reduce_mean(
        tf.abs(clipped_betting_fraction))
    max_clipped_betting_fraction_summary = tf.reduce_max(
        tf.abs(clipped_betting_fraction))

    max_abs_gradient = tf.reduce_max(tf.abs(grad))
    max_truncated_grad = tf.reduce_max(tf.abs(truncated_grad))

    tf.summary.scalar(self._name + "/mean_unclipped_bet/" + var.name,
                      mean_unclipped_betting_fraction_summary)
    tf.summary.scalar(self._name + "/max_unclipped_bet/" + var.name,
                      max_unclipped_betting_fraction_summary)
    tf.summary.scalar(self._name + "/mean_clipped_bet/" + var.name,
                      mean_clipped_betting_fraction_summary)
    tf.summary.scalar(self._name + "/max_clipped_bet/" + var.name,
                      max_clipped_betting_fraction_summary)

    tf.summary.scalar(self._name + "/max_abs_inner_grad/" + var.name,
                      max_abs_gradient)
    tf.summary.scalar(
        self._name + "/max_abs_truncated_inner_grad/" + var.name,
        max_truncated_grad)

  return clipped_betting_fraction, tf.group(*update_ops)
def random_crop(image_list, crop_height, crop_width):
  """Crops the given list of images.

  The function applies the same crop to each image in the list. This can be
  effectively applied when there are multiple image inputs of the same
  dimension such as:

    image, depths, normals = random_crop([image, depths, normals], 120, 150)

  Args:
    image_list: a list of image tensors of the same dimension but possibly
      varying channel.
    crop_height: the new height.
    crop_width: the new width.

  Returns:
    the image_list with cropped images.

  Raises:
    ValueError: if there are multiple image inputs provided with different
      size or the images are smaller than the crop dimensions.
  """
  if not image_list:
    raise ValueError('Empty image_list.')

  # Compute the rank assertions.
  rank_assertions = []
  for i in range(len(image_list)):
    image_rank = tf.rank(image_list[i])
    rank_assert = tf.Assert(
        tf.equal(image_rank, 3), [
            'Wrong rank for tensor %s [expected] [actual]',
            image_list[i].name, 3, image_rank
        ])
    rank_assertions.append(rank_assert)

  with tf.control_dependencies([rank_assertions[0]]):
    image_shape = tf.shape(image_list[0])
  image_height = image_shape[0]
  image_width = image_shape[1]
  crop_size_assert = tf.Assert(
      tf.logical_and(
          tf.greater_equal(image_height, crop_height),
          tf.greater_equal(image_width, crop_width)),
      ['Crop size greater than the image size.'])

  asserts = [rank_assertions[0], crop_size_assert]

  for i in range(1, len(image_list)):
    image = image_list[i]
    asserts.append(rank_assertions[i])
    with tf.control_dependencies([rank_assertions[i]]):
      shape = tf.shape(image)
    height = shape[0]
    width = shape[1]

    height_assert = tf.Assert(
        tf.equal(height, image_height), [
            'Wrong height for tensor %s [expected][actual]', image.name,
            height, image_height
        ])
    width_assert = tf.Assert(
        tf.equal(width, image_width), [
            'Wrong width for tensor %s [expected][actual]', image.name,
            width, image_width
        ])
    asserts.extend([height_assert, width_assert])

  # Create a random bounding box.
  #
  # Use tf.random_uniform and not numpy.random.rand as doing the former would
  # generate random numbers at graph eval time, unlike the latter which
  # generates random numbers at graph definition time.
  with tf.control_dependencies(asserts):
    max_offset_height = tf.reshape(image_height - crop_height + 1, [])
    max_offset_width = tf.reshape(image_width - crop_width + 1, [])
  offset_height = tf.random_uniform([],
                                    maxval=max_offset_height,
                                    dtype=tf.int32)
  offset_width = tf.random_uniform([],
                                   maxval=max_offset_width,
                                   dtype=tf.int32)

  return [
      _crop(image, offset_height, offset_width, crop_height, crop_width)
      for image in image_list
  ]
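# A short usage sketch (not from the original file): cropping an image together
# with an aligned per-pixel label map so both receive the identical random
# crop. The tensors below are illustrative placeholders built in graph mode.
image = tf.random_uniform([240, 320, 3])    # H x W x C input image
labels = tf.random_uniform([240, 320, 1])   # aligned per-pixel annotation
cropped_image, cropped_labels = random_crop([image, labels], 224, 224)
# Both outputs are 224 x 224 and share the same randomly chosen offset.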
def match_boxes(anchors, groundtruth_boxes, positives_threshold=0.5,
                negatives_threshold=0.4, force_match_groundtruth=True):
  """
  If an anchor has IoU over `positives_threshold` with any groundtruth box,
  it will be assigned a positive label.
  Anchors which have the highest IoU for a groundtruth box will
  also be assigned a positive label.
  Meanwhile, if other anchors have IoU less than `negatives_threshold`
  with all groundtruth boxes, their labels will be negative.

  Matching algorithm:
  1) for each groundtruth box choose the anchor with largest IoU,
  2) remove this set of anchors from the set of all anchors,
  3) for each remaining anchor choose the groundtruth box with largest IoU,
     but only if this IoU is larger than `positives_threshold`,
  4) remove this set of matched anchors from the set of all anchors,
  5) for each remaining anchor if it has IoU less than `negatives_threshold`
     with all groundtruth boxes set it to `negative`, otherwise set it
     to `ignore`.

  Note: after step 1, it could happen that for some two groundtruth boxes
  chosen anchors are the same. Let's hope this never happens.
  Also see the comments below.

  Arguments:
    anchors: a float tensor with shape [num_anchors, 4].
    groundtruth_boxes: a float tensor with shape [N, 4].
    positives_threshold: a float number.
    negatives_threshold: a float number.
    force_match_groundtruth: a boolean, whether to try to make sure
      that all groundtruth boxes are matched.
  Returns:
    an int tensor with shape [num_anchors], possible values
      that it can contain are [-2, -1, 0, 1, 2, ..., (N - 1)],
      where numbers in the range [0, N - 1] mean indices of the groundtruth
      boxes, `-1` means that an anchor box is negative (background),
      and `-2` means that we must ignore this anchor box.
  """
  assert positives_threshold >= negatives_threshold

  # for each anchor box choose the groundtruth box with largest iou
  similarity_matrix = iou(groundtruth_boxes, anchors)  # shape [N, num_anchors]
  matches = tf.argmax(
      similarity_matrix, axis=0, output_type=tf.int32)  # shape [num_anchors]
  matched_vals = tf.reduce_max(
      similarity_matrix, axis=0)  # shape [num_anchors]
  is_positive = tf.to_int32(
      tf.greater_equal(matched_vals, positives_threshold))

  if positives_threshold == negatives_threshold:
    is_negative = 1 - is_positive
    matches = matches * is_positive + (-1 * is_negative)
  else:
    is_negative = tf.to_int32(tf.greater(negatives_threshold, matched_vals))
    to_ignore = (1 - is_positive) * (1 - is_negative)
    matches = matches * is_positive + (-1 * is_negative) + (-2 * to_ignore)

  # after this, it could happen that some groundtruth
  # boxes are not matched with any anchor box

  if force_match_groundtruth:
    # now we must ensure that each row (groundtruth box) is matched to
    # at least one column (which is not guaranteed
    # otherwise if `positives_threshold` is high)

    # for each groundtruth box choose the anchor box with largest iou
    # (force match for each groundtruth box)
    forced_matches_ids = tf.argmax(
        similarity_matrix, axis=1, output_type=tf.int32)  # shape [N]

    # if all indices in forced_matches_ids are different
    # then all rows will be matched
    num_anchors = tf.shape(anchors)[0]
    forced_matches_indicators = tf.one_hot(
        forced_matches_ids, depth=num_anchors,
        dtype=tf.int32)  # shape [N, num_anchors]
    forced_match_row_ids = tf.argmax(
        forced_matches_indicators, axis=0,
        output_type=tf.int32)  # shape [num_anchors]

    # some forced matches could be very bad!
    forced_matches_values = tf.reduce_max(
        similarity_matrix, axis=1)  # shape [N]
    small_iou = 0.05  # this requires that forced match has at least small intersection
    is_okay = tf.to_int32(
        tf.greater_equal(forced_matches_values, small_iou))  # shape [N]
    forced_matches_indicators = forced_matches_indicators * tf.expand_dims(
        is_okay, axis=1)

    forced_match_mask = tf.greater(
        tf.reduce_max(forced_matches_indicators, axis=0),
        0)  # shape [num_anchors]
    matches = tf.where(forced_match_mask, forced_match_row_ids, matches)
    # even after this it could happen that some rows aren't matched,
    # but I believe that this event has low probability

  return matches
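# An illustrative follow-up (not part of the original code) showing how the
# returned `matches` tensor is typically turned into per-anchor class targets.
# `gt_labels` is a hypothetical [N] int tensor of groundtruth class ids.
matches = match_boxes(anchors, groundtruth_boxes)
is_fg = tf.greater_equal(matches, 0)       # matched to some groundtruth box
is_ignored = tf.equal(matches, -2)         # neither clearly fg nor bg
gathered = tf.gather(gt_labels, tf.maximum(matches, 0))
class_targets = tf.where(is_fg, gathered, tf.zeros_like(gathered))  # 0 = background
# Anchors flagged by `is_ignored` would normally be excluded from the loss.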
def prepare_encoder_input(features, hparams, embed_scope=None, embed_token_fn=common_embed.embed_tokens): """Prepares the input for the screen encoder. Args: features: the feature dict. hparams: the hyperparameter. embed_scope: the embedding variable scope. embed_token_fn: the function for embedding tokens. Returns: object_embedding: a Tensor of shape [batch_size, num_steps, max_object_count, embed_depth] object_mask: a binary tensor of shape [batch_size, num_steps, max_object_count] nonpadding_bias: a Tensor of shape [batch_size, num_steps, max_object_count] """ with tf.control_dependencies( [tf.assert_equal(tf.rank(features["obj_text"]), 4)]): if hparams.get("synthetic_screen_noise", 0.) > 0.: num_objects = tf.shape(features["obj_text"])[2] # [batch, length, num_objects] target_obj_mask = tf.cast( tf.one_hot(features["objects"], depth=num_objects), tf.bool) num_tokens = tf.shape(features["obj_text"])[-1] target_obj_mask = tf.tile(tf.expand_dims(target_obj_mask, 3), [1, 1, 1, num_tokens]) # Randomly keep tokens keep_mask = tf.greater_equal( tf.random_uniform(shape=tf.shape(features["obj_text"])), hparams.synthetic_screen_noise) # Keep paddings keep_mask = tf.logical_or(tf.equal(features["obj_text"], 0), keep_mask) # Keep targets target_obj_mask = tf.logical_or(target_obj_mask, keep_mask) features["obj_text"] = tf.where( target_obj_mask, features["obj_text"], tf.random_uniform(shape=tf.shape(features["obj_text"]), maxval=50000, dtype=tf.int32)) text_embeddings, _ = embed_token_fn(features["obj_text"], hparams.task_vocab_size, hparams.hidden_size, hparams, embed_scope=embed_scope) with tf.variable_scope("obj_text_embed", reuse=tf.AUTO_REUSE): if hparams.obj_text_aggregation == "max": embed_bias = tf.cast(tf.less(features["obj_text"], 2), tf.float32) * -1e7 with tf.control_dependencies( [tf.assert_equal(tf.rank(embed_bias), 4)]): text_embeddings = tf.reduce_max( text_embeddings + tf.expand_dims(embed_bias, 4), -2) no_txt_embed = tf.get_variable(name="no_txt_embed", shape=[hparams.hidden_size]) shape = common_layers.shape_list(text_embeddings) no_txt_embed = tf.tile( tf.reshape(no_txt_embed, [1, 1, 1, hparams.hidden_size]), [shape[0], shape[1], shape[2], 1]) text_embeddings = tf.maximum(text_embeddings, no_txt_embed) elif hparams.obj_text_aggregation == "sum": # [batch, step, #max_obj, #max_token] 0 for padded tokens real_objects = tf.cast( tf.greater_equal(features["obj_text"], 2), tf.float32) # [batch, step, #max_obj, hidden] 0s for padded objects text_embeddings = tf.reduce_sum( text_embeddings * tf.expand_dims(real_objects, 4), -2) elif hparams.obj_text_aggregation == "mean": shape_list = common_layers.shape_list(text_embeddings) embeddings = tf.reshape(text_embeddings, [-1] + shape_list[3:]) emb_sum = tf.reduce_sum(tf.abs(embeddings), axis=-1) non_paddings = tf.not_equal(emb_sum, 0.0) embeddings = common_embed.average_bag_of_embeds( embeddings, non_paddings, use_bigrams=True, bigram_embed_scope=embed_scope, append_start_end=True) text_embeddings = tf.reshape( embeddings, shape_list[:3] + [hparams.hidden_size]) else: raise ValueError("Unrecognized token aggregation %s" % (hparams.obj_text_aggregation)) with tf.control_dependencies([ tf.assert_equal(tf.rank(features["obj_type"]), 3), tf.assert_equal(tf.rank(features["obj_clickable"]), 3) ]): with tf.variable_scope("encode_object_attr", reuse=tf.AUTO_REUSE): type_embedding = tf.nn.embedding_lookup(params=tf.get_variable( name="embed_type_w", shape=[hparams.get("num_types", 100), hparams.hidden_size]), ids=tf.maximum( features["obj_type"], 
0)) clickable_embedding = tf.nn.embedding_lookup( params=tf.get_variable(name="embed_clickable_w", shape=[2, hparams.hidden_size]), ids=features["obj_clickable"]) with tf.control_dependencies( [tf.assert_equal(tf.rank(features["obj_screen_pos"]), 4)]): def _create_embed(feature_name, vocab_size, depth): """Embed a position feature.""" pos_embedding_list = [] with tf.variable_scope("encode_object_" + feature_name, reuse=tf.AUTO_REUSE): num_featues = common_layers.shape_list( features[feature_name])[-1] for i in range(num_featues): pos_embedding_list.append( tf.nn.embedding_lookup( params=tf.get_variable(name=feature_name + "_embed_w_%d" % i, shape=[vocab_size, depth]), ids=features[feature_name][:, :, :, i])) pos_embedding = tf.add_n(pos_embedding_list) return pos_embedding pos_embedding = _create_embed("obj_screen_pos", hparams.max_pixel_pos, hparams.hidden_size) if "all" == hparams.screen_embedding_feature or ( "dom" in hparams.screen_embedding_feature): dom_embedding = _create_embed("obj_dom_pos", hparams.max_dom_pos, hparams.hidden_size) object_embed = tf.zeros_like(text_embeddings, dtype=tf.float32) if hparams.screen_embedding_feature == "all": object_embed = (text_embeddings + type_embedding + pos_embedding + dom_embedding) elif "text" in hparams.screen_embedding_feature: object_embed += text_embeddings elif "type" in hparams.screen_embedding_feature: object_embed += type_embedding elif "pos" in hparams.screen_embedding_feature: object_embed += pos_embedding elif "dom" in hparams.screen_embedding_feature: object_embed += dom_embedding elif "click" in hparams.screen_embedding_feature: object_embed += clickable_embedding object_mask = tf.cast(tf.not_equal(features["obj_type"], -1), tf.float32) object_embed = object_embed * tf.expand_dims(object_mask, 3) att_bias = (1. - object_mask) * common_attention.large_compatible_negative( object_embed.dtype) return object_embed, object_mask, att_bias
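# A minimal sketch (hypothetical tensors, not from the original code) of the
# "sum" token-aggregation path above: token ids below 2 are treated as
# padding/special and masked out before summing token embeddings per object.
import tensorflow as tf

obj_text = tf.constant([[[[5, 7, 0],
                          [2, 0, 0]]]])               # [batch=1, step=1, obj=2, token=3]
token_emb = tf.random_normal([1, 1, 2, 3, 8])         # [..., token, hidden=8]
real_tokens = tf.cast(tf.greater_equal(obj_text, 2), tf.float32)
object_emb = tf.reduce_sum(
    token_emb * tf.expand_dims(real_tokens, 4), -2)   # [1, 1, 2, 8]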
def build_genie_model(feat_dict, cfg, batch_size, seq_len, is_training=True, seq_varlens=None, dtype=tf.float32): """Builds a Piano Genie model. Args: feat_dict: Dictionary containing input tensors. cfg: Configuration object. batch_size: Number of items in batch. seq_len: Length of each batch item. is_training: Set to False for evaluation. seq_varlens: If not None, a tensor with the batch sequence lengths. dtype: Model weight type. Returns: A dict containing tensors for relevant model config. """ out_dict = {} # Parse features pitches = util.demidify(feat_dict["midi_pitches"]) velocities = feat_dict["velocities"] pitches_scalar = ((tf.cast(pitches, tf.float32) / 87.) * 2.) - 1. # Create sequence lens if is_training and cfg.train_randomize_seq_len: seq_lens = tf.random_uniform([batch_size], minval=cfg.train_seq_len_min, maxval=seq_len + 1, dtype=tf.int32) stp_varlen_mask = tf.sequence_mask(seq_lens, maxlen=seq_len, dtype=tf.float32) elif seq_varlens is not None: seq_lens = seq_varlens stp_varlen_mask = tf.sequence_mask(seq_varlens, maxlen=seq_len, dtype=tf.float32) else: seq_lens = tf.ones([batch_size], dtype=tf.int32) * seq_len stp_varlen_mask = None # Encode if (cfg.stp_emb_unconstrained or cfg.stp_emb_vq or cfg.stp_emb_iq or cfg.seq_emb_unconstrained or cfg.seq_emb_vae or cfg.lor_emb_unconstrained): # Build encoder features enc_feats = [] if cfg.enc_pitch_scalar: enc_feats.append(tf.expand_dims(pitches_scalar, axis=-1)) else: enc_feats.append(tf.one_hot(pitches, 88)) if "delta_times_int" in cfg.enc_aux_feats: enc_feats.append( tf.one_hot(feat_dict["delta_times_int"], cfg.data_max_discrete_times + 1)) if "velocities" in cfg.enc_aux_feats: enc_feats.append( tf.one_hot(velocities, cfg.data_max_discrete_velocities + 1)) enc_feats = tf.concat(enc_feats, axis=2) with tf.variable_scope("encoder"): enc_stp, enc_seq = simple_lstm_encoder( enc_feats, seq_lens, rnn_celltype=cfg.rnn_celltype, rnn_nlayers=cfg.rnn_nlayers, rnn_nunits=cfg.rnn_nunits, rnn_bidirectional=cfg.enc_rnn_bidirectional, dtype=dtype) latents = [] # Step embeddings (single vector per timestep) if cfg.stp_emb_unconstrained: with tf.variable_scope("stp_emb_unconstrained"): stp_emb_unconstrained = tf.layers.dense( enc_stp, cfg.stp_emb_unconstrained_embedding_dim) out_dict["stp_emb_unconstrained"] = stp_emb_unconstrained latents.append(stp_emb_unconstrained) # Quantized step embeddings with VQ-VAE if cfg.stp_emb_vq: import sonnet as snt # pylint:disable=g-import-not-at-top with tf.variable_scope("stp_emb_vq"): with tf.variable_scope("pre_vq"): # pre_vq_encoding is tf.float32 of [batch_size, seq_len, embedding_dim] pre_vq_encoding = tf.layers.dense(enc_stp, cfg.stp_emb_vq_embedding_dim) with tf.variable_scope("quantizer"): assert stp_varlen_mask is None vq_vae = snt.nets.VectorQuantizer( embedding_dim=cfg.stp_emb_vq_embedding_dim, num_embeddings=cfg.stp_emb_vq_codebook_size, commitment_cost=cfg.stp_emb_vq_commitment_cost) vq_vae_output = vq_vae(pre_vq_encoding, is_training=is_training) stp_emb_vq_quantized = vq_vae_output["quantize"] stp_emb_vq_discrete = tf.reshape( tf.argmax(vq_vae_output["encodings"], axis=1, output_type=tf.int32), [batch_size, seq_len]) stp_emb_vq_codebook = tf.transpose(vq_vae.embeddings) out_dict["stp_emb_vq_quantized"] = stp_emb_vq_quantized out_dict["stp_emb_vq_discrete"] = stp_emb_vq_discrete out_dict["stp_emb_vq_loss"] = vq_vae_output["loss"] out_dict["stp_emb_vq_codebook"] = stp_emb_vq_codebook out_dict["stp_emb_vq_codebook_ppl"] = vq_vae_output["perplexity"] latents.append(stp_emb_vq_quantized) # This 
tensor retrieves continuous embeddings from codebook. It should # *never* be used during training. out_dict["stp_emb_vq_quantized_lookup"] = tf.nn.embedding_lookup( stp_emb_vq_codebook, stp_emb_vq_discrete) # Integer-quantized step embeddings with straight-through if cfg.stp_emb_iq: with tf.variable_scope("stp_emb_iq"): with tf.variable_scope("pre_iq"): # pre_iq_encoding is tf.float32 of [batch_size, seq_len] pre_iq_encoding = tf.layers.dense(enc_stp, 1)[:, :, 0] def iqst(x, n): """Integer quantization with straight-through estimator.""" eps = 1e-7 s = float(n - 1) xp = tf.clip_by_value((x + 1) / 2.0, -eps, 1 + eps) xpp = tf.round(s * xp) xppp = 2 * (xpp / s) - 1 return xpp, x + tf.stop_gradient(xppp - x) with tf.variable_scope("quantizer"): # Pass rounded vals to decoder w/ straight-through estimator stp_emb_iq_discrete_f, stp_emb_iq_discrete_rescaled = iqst( pre_iq_encoding, cfg.stp_emb_iq_nbins) stp_emb_iq_discrete = tf.cast(stp_emb_iq_discrete_f + 1e-4, tf.int32) stp_emb_iq_discrete_f = tf.cast(stp_emb_iq_discrete, tf.float32) stp_emb_iq_quantized = tf.expand_dims( stp_emb_iq_discrete_rescaled, axis=2) # Determine which elements round to valid indices stp_emb_iq_inrange = tf.logical_and( tf.greater_equal(pre_iq_encoding, -1), tf.less_equal(pre_iq_encoding, 1)) stp_emb_iq_inrange_mask = tf.cast(stp_emb_iq_inrange, tf.float32) stp_emb_iq_valid_p = weighted_avg(stp_emb_iq_inrange_mask, stp_varlen_mask) # Regularize to encourage encoder to output in range stp_emb_iq_range_penalty = weighted_avg( tf.square(tf.maximum(tf.abs(pre_iq_encoding) - 1, 0)), stp_varlen_mask) # Regularize to correlate latent finite differences to input stp_emb_iq_dlatents = pre_iq_encoding[:, 1:] - pre_iq_encoding[:, : -1] if cfg.stp_emb_iq_contour_dy_scalar: stp_emb_iq_dnotes = pitches_scalar[:, 1:] - pitches_scalar[:, : -1] else: stp_emb_iq_dnotes = tf.cast( pitches[:, 1:] - pitches[:, :-1], tf.float32) if cfg.stp_emb_iq_contour_exp == 1: power_func = tf.identity elif cfg.stp_emb_iq_contour_exp == 2: power_func = tf.square else: raise NotImplementedError() if cfg.stp_emb_iq_contour_comp == "product": comp_func = tf.multiply elif cfg.stp_emb_iq_contour_comp == "quotient": def comp_func(x, y): return tf.divide(x, y + 1e-6) else: raise NotImplementedError() stp_emb_iq_contour_penalty = weighted_avg( power_func( tf.maximum( cfg.stp_emb_iq_contour_margin - comp_func(stp_emb_iq_dnotes, stp_emb_iq_dlatents), 0)), None if stp_varlen_mask is None else stp_varlen_mask[:, 1:]) # Regularize to maintain note consistency stp_emb_iq_note_held = tf.cast( tf.equal(pitches[:, 1:] - pitches[:, :-1], 0), tf.float32) if cfg.stp_emb_iq_deviate_exp == 1: power_func = tf.abs elif cfg.stp_emb_iq_deviate_exp == 2: power_func = tf.square if stp_varlen_mask is None: mask = stp_emb_iq_note_held else: mask = stp_varlen_mask[:, 1:] * stp_emb_iq_note_held stp_emb_iq_deviate_penalty = weighted_avg( power_func(stp_emb_iq_dlatents), mask) # Calculate perplexity of discrete encoder posterior if stp_varlen_mask is None: mask = stp_emb_iq_inrange_mask else: mask = stp_varlen_mask * stp_emb_iq_inrange_mask stp_emb_iq_discrete_oh = tf.one_hot(stp_emb_iq_discrete, cfg.stp_emb_iq_nbins) stp_emb_iq_avg_probs = weighted_avg(stp_emb_iq_discrete_oh, mask, axis=[0, 1], expand_mask=True) stp_emb_iq_discrete_ppl = tf.exp( -tf.reduce_sum(stp_emb_iq_avg_probs * tf.log(stp_emb_iq_avg_probs + 1e-10))) out_dict["stp_emb_iq_quantized"] = stp_emb_iq_quantized out_dict["stp_emb_iq_discrete"] = stp_emb_iq_discrete out_dict["stp_emb_iq_valid_p"] = stp_emb_iq_valid_p 
out_dict["stp_emb_iq_range_penalty"] = stp_emb_iq_range_penalty out_dict["stp_emb_iq_contour_penalty"] = stp_emb_iq_contour_penalty out_dict["stp_emb_iq_deviate_penalty"] = stp_emb_iq_deviate_penalty out_dict["stp_emb_iq_discrete_ppl"] = stp_emb_iq_discrete_ppl latents.append(stp_emb_iq_quantized) # This tensor converts discrete values to continuous. # It should *never* be used during training. out_dict["stp_emb_iq_quantized_lookup"] = tf.expand_dims( 2. * (stp_emb_iq_discrete_f / (cfg.stp_emb_iq_nbins - 1.)) - 1., axis=2) # Sequence embedding (single vector per sequence) if cfg.seq_emb_unconstrained: with tf.variable_scope("seq_emb_unconstrained"): seq_emb_unconstrained = tf.layers.dense( enc_seq, cfg.seq_emb_unconstrained_embedding_dim) out_dict["seq_emb_unconstrained"] = seq_emb_unconstrained seq_emb_unconstrained = tf.stack([seq_emb_unconstrained] * seq_len, axis=1) latents.append(seq_emb_unconstrained) # Sequence embeddings (variational w/ reparameterization trick) if cfg.seq_emb_vae: with tf.variable_scope("seq_emb_vae"): seq_emb_vae = tf.layers.dense(enc_seq, cfg.seq_emb_vae_embedding_dim * 2) mean = seq_emb_vae[:, :cfg.seq_emb_vae_embedding_dim] stddev = 1e-6 + tf.nn.softplus( seq_emb_vae[:, cfg.seq_emb_vae_embedding_dim:]) seq_emb_vae = mean + stddev * tf.random_normal( tf.shape(mean), 0, 1, dtype=dtype) kl = tf.reduce_mean( 0.5 * tf.reduce_sum(tf.square(mean) + tf.square(stddev) - tf.log(1e-8 + tf.square(stddev)) - 1, axis=1)) out_dict["seq_emb_vae"] = seq_emb_vae out_dict["seq_emb_vae_kl"] = kl seq_emb_vae = tf.stack([seq_emb_vae] * seq_len, axis=1) latents.append(seq_emb_vae) # Low-rate embeddings if cfg.lor_emb_unconstrained: assert seq_len % cfg.lor_emb_n == 0 with tf.variable_scope("lor_emb_unconstrained"): # Downsample step embeddings rnn_embedding_dim = int(enc_stp.get_shape()[-1]) enc_lor = tf.reshape(enc_stp, [ batch_size, seq_len // cfg.lor_emb_n, cfg.lor_emb_n * rnn_embedding_dim ]) lor_emb_unconstrained = tf.layers.dense( enc_lor, cfg.lor_emb_unconstrained_embedding_dim) out_dict["lor_emb_unconstrained"] = lor_emb_unconstrained # Upsample lo-rate embeddings for decoding lor_emb_unconstrained = tf.expand_dims(lor_emb_unconstrained, axis=2) lor_emb_unconstrained = tf.tile(lor_emb_unconstrained, [1, 1, cfg.lor_emb_n, 1]) lor_emb_unconstrained = tf.reshape( lor_emb_unconstrained, [batch_size, seq_len, cfg.lor_emb_unconstrained_embedding_dim]) latents.append(lor_emb_unconstrained) # Build decoder features dec_feats = latents if cfg.dec_autoregressive: # Retrieve pitch numbers curr_pitches = pitches last_pitches = curr_pitches[:, :-1] last_pitches = tf.pad(last_pitches, [[0, 0], [1, 0]], constant_values=-1) # Prepend <SOS> token out_dict["dec_last_pitches"] = last_pitches dec_feats.append(tf.one_hot(last_pitches + 1, 89)) if cfg.dec_pred_velocity: curr_velocities = velocities last_velocities = curr_velocities[:, :-1] last_velocities = tf.pad(last_velocities, [[0, 0], [1, 0]]) dec_feats.append( tf.one_hot(last_velocities, cfg.data_max_discrete_velocities + 1)) if "delta_times_int" in cfg.dec_aux_feats: dec_feats.append( tf.one_hot(feat_dict["delta_times_int"], cfg.data_max_discrete_times + 1)) if "velocities" in cfg.dec_aux_feats: assert not cfg.dec_pred_velocity dec_feats.append( tf.one_hot(feat_dict["velocities"], cfg.data_max_discrete_velocities + 1)) assert dec_feats dec_feats = tf.concat(dec_feats, axis=2) # Decode with tf.variable_scope("decoder"): dec_stp, dec_initial_state, dec_final_state = simple_lstm_decoder( dec_feats, seq_lens, batch_size, 
rnn_celltype=cfg.rnn_celltype, rnn_nlayers=cfg.rnn_nlayers, rnn_nunits=cfg.rnn_nunits) with tf.variable_scope("pitches"): dec_recons_logits = tf.layers.dense(dec_stp, 88) dec_recons_loss = weighted_avg( tf.nn.sparse_softmax_cross_entropy_with_logits( logits=dec_recons_logits, labels=pitches), stp_varlen_mask) out_dict["dec_initial_state"] = dec_initial_state out_dict["dec_final_state"] = dec_final_state out_dict["dec_recons_logits"] = dec_recons_logits out_dict["dec_recons_scores"] = tf.nn.softmax(dec_recons_logits, axis=-1) out_dict["dec_recons_preds"] = tf.argmax(dec_recons_logits, output_type=tf.int32, axis=-1) out_dict["dec_recons_midi_preds"] = util.remidify( out_dict["dec_recons_preds"]) out_dict["dec_recons_loss"] = dec_recons_loss if cfg.dec_pred_velocity: with tf.variable_scope("velocities"): dec_recons_velocity_logits = tf.layers.dense( dec_stp, cfg.data_max_discrete_velocities + 1) dec_recons_velocity_loss = weighted_avg( tf.nn.sparse_softmax_cross_entropy_with_logits( logits=dec_recons_velocity_logits, labels=velocities), stp_varlen_mask) out_dict["dec_recons_velocity_logits"] = dec_recons_velocity_logits out_dict["dec_recons_velocity_loss"] = dec_recons_velocity_loss # Stats if cfg.stp_emb_vq or cfg.stp_emb_iq: discrete = out_dict["stp_emb_vq_discrete" if cfg. stp_emb_vq else "stp_emb_iq_discrete"] dx = pitches[:, 1:] - pitches[:, :-1] dy = discrete[:, 1:] - discrete[:, :-1] contour_violation = tf.reduce_mean( tf.cast(tf.less(dx * dy, 0), tf.float32)) dx_hold = tf.equal(dx, 0) deviate_violation = weighted_avg( tf.cast(tf.not_equal(dy, 0), tf.float32), tf.cast(dx_hold, tf.float32)) out_dict["contour_violation"] = contour_violation out_dict["deviate_violation"] = deviate_violation return out_dict
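# Standalone sketch of the integer-quantization straight-through estimator
# (iqst) used in the stp_emb_iq branch above: the forward pass rounds onto one
# of n bins while the backward pass is the identity. The helper is copied from
# the model code; the toy inputs are illustrative. Assumes TF1.x graph mode.
import tensorflow as tf

def iqst(x, n):
  """Integer quantization with straight-through estimator."""
  eps = 1e-7
  s = float(n - 1)
  xp = tf.clip_by_value((x + 1) / 2.0, -eps, 1 + eps)
  xpp = tf.round(s * xp)
  xppp = 2 * (xpp / s) - 1
  return xpp, x + tf.stop_gradient(xppp - x)

x = tf.constant([-1.0, -0.1, 0.3, 1.0])
bins, x_st = iqst(x, n=8)                        # bins take values in {0, ..., 7}
grad = tf.gradients(tf.reduce_sum(x_st), x)[0]   # all ones: gradients pass straight through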
def _filter_short(note_sequence_tensor, seq_len):
  """Returns a boolean op that is True iff the sequence has at least `seq_len` steps."""
  note_sequence_len = tf.shape(note_sequence_tensor)[0]
  return tf.greater_equal(note_sequence_len, seq_len)
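# Hedged usage sketch: _filter_short as a tf.data predicate that drops note
# sequences shorter than seq_len. `note_sequences` is a hypothetical dataset
# whose elements are [length, depth] tensors.
seq_len = 32
filtered = note_sequences.filter(lambda t: _filter_short(t, seq_len))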
def hash_in_range(self, buckets, base, limit):
  """Return true if the hashing key falls in the range [base, limit)."""
  hash_bucket = tf.string_to_hash_bucket_fast(self.scene_id, buckets)
  return tf.logical_and(tf.greater_equal(hash_bucket, base),
                        tf.less(hash_bucket, limit))
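# Hedged usage sketch: a deterministic 90/10 train/validation split keyed on
# scene_id via the method above. `example` and the bucket counts are
# illustrative.
is_train = example.hash_in_range(buckets=100, base=0, limit=90)
is_val = example.hash_in_range(buckets=100, base=90, limit=100)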
def overlap_mask(depth1, pose1_c2w, depth2, pose2_c2w, intrinsics):
  """Compute the overlap masks of two views using triangulation.

  The masks have the same shape as the input images. A pixel value is true if
  it can be seen by both cameras.

  Args:
    depth1: [HEIGHT, WIDTH, 1] the depth map of the first view.
    pose1_c2w: [3, 4] camera pose matrix (camera to world) of the first view.
      pose1_c2w[:, :3] is the rotation and pose1_c2w[:, -1] is the translation.
    depth2: [HEIGHT, WIDTH, 1] the depth map of the second view.
    pose2_c2w: [3, 4] camera pose matrix (camera to world) of the second view.
      pose2_c2w[:, :3] is the rotation and pose2_c2w[:, -1] is the translation.
    intrinsics: [3, 3] camera's intrinsic matrix.

  Returns:
    [HEIGHT, WIDTH] two overlap masks of the two inputs respectively.
  """
  pose1_w2c = tf.matrix_inverse(
      tf.concat([pose1_c2w, tf.constant([[0., 0., 0., 1.]])], 0))[:3]
  pose2_w2c = tf.matrix_inverse(
      tf.concat([pose2_c2w, tf.constant([[0., 0., 0., 1.]])], 0))[:3]
  p_world1 = image_to_world_projection(depth1, intrinsics, pose1_c2w)
  p_image1_in_2, z1_c2 = world_to_image_projection(p_world1, intrinsics,
                                                   pose2_w2c)
  p_world2 = image_to_world_projection(depth2, intrinsics, pose2_c2w)
  p_image2_in_1, z2_c1 = world_to_image_projection(p_world2, intrinsics,
                                                   pose1_w2c)
  shape = depth1.shape.as_list()
  height, width = shape[0], shape[1]
  height = tf.cast(height, tf.float32)
  width = tf.cast(width, tf.float32)
  # Error tolerance.
  eps = 1e-4
  # Check that the object seen by camera 2 is also projected to camera 1's
  # image plane and is in front of camera 1.
  mask_h2_in_1 = tf.logical_and(
      tf.less_equal(p_image2_in_1[:, :, 1], height + eps),
      tf.greater_equal(p_image2_in_1[:, :, 1], 0. - eps))
  mask_w2_in_1 = tf.logical_and(
      tf.less_equal(p_image2_in_1[:, :, 0], width + eps),
      tf.greater_equal(p_image2_in_1[:, :, 0], 0. - eps))
  # Check that the projected points are within the image boundaries and in
  # front of the camera.
  mask2_in_1 = tf.logical_and(
      tf.logical_and(mask_h2_in_1, mask_w2_in_1),
      tf.squeeze(z2_c1, -1) > 0)
  # Check that the object seen by camera 1 is also projected to camera 2's
  # image plane and is in front of camera 2.
  mask_h1_in_2 = tf.logical_and(
      tf.less_equal(p_image1_in_2[:, :, 1], height + eps),
      tf.greater_equal(p_image1_in_2[:, :, 1], 0. - eps))
  mask_w1_in_2 = tf.logical_and(
      tf.less_equal(p_image1_in_2[:, :, 0], width + eps),
      tf.greater_equal(p_image1_in_2[:, :, 0], 0. - eps))
  # Check that the projected points are within the image boundaries and in
  # front of the camera.
  mask1_in_2 = tf.logical_and(
      tf.logical_and(mask_h1_in_2, mask_w1_in_2),
      tf.squeeze(z1_c2, -1) > 0)
  return mask1_in_2, mask2_in_1
def p_sample_loop_trajectory(self, denoise_fn, *, shape, noise_fn=tf.random_normal, repeat_noise_steps=-1): """ Generate samples, returning intermediate images Useful for visualizing how denoised images evolve over time Args: repeat_noise_steps (int): Number of denoising timesteps in which the same noise is used across the batch. If >= 0, the initial noise is the same for all batch elemements. """ i_0 = tf.constant(self.num_timesteps - 1, dtype=tf.int32) assert isinstance(shape, (tuple, list)) img_0 = noise_like(shape, noise_fn, repeat_noise_steps >= 0) times = tf.Variable([i_0]) imgs = tf.Variable([img_0]) # Steps with repeated noise times, imgs = tf.while_loop( cond=lambda times_, _: tf.less_equal( self.num_timesteps - times_[-1], repeat_noise_steps), body=lambda times_, imgs_: [ tf.concat([times_, [times_[-1] - 1]], 0), tf.concat([ imgs_, [ self.p_sample(denoise_fn=denoise_fn, x=imgs_[-1], t=tf.fill([shape[0]], times_[-1]), noise_fn=noise_fn, repeat_noise=True) ] ], 0) ], loop_vars=[times, imgs], shape_invariants=[ tf.TensorShape([None, *i_0.shape]), tf.TensorShape([None, *img_0.shape]) ], back_prop=False) # Steps with different noise for each batch element times, imgs = tf.while_loop( cond=lambda times_, _: tf.greater_equal(times_[-1], 0), body=lambda times_, imgs_: [ tf.concat([times_, [times_[-1] - 1]], 0), tf.concat([ imgs_, [ self.p_sample(denoise_fn=denoise_fn, x=imgs_[-1], t=tf.fill([shape[0]], times_[-1]), noise_fn=noise_fn, repeat_noise=False) ] ], 0) ], loop_vars=[times, imgs], shape_invariants=[ tf.TensorShape([None, *i_0.shape]), tf.TensorShape([None, *img_0.shape]) ], back_prop=False) assert imgs[-1].shape == shape return times, imgs
def build():
  """Builds the TensorFlow graph."""
  inputs, lengths = None, None
  if mode in ('train', 'eval'):
    inputs, _, lengths = magenta.common.get_padded_batch(
        sequence_example_file_paths, hparams.batch_size, input_size,
        shuffle=mode == 'train')
  elif mode == 'generate':
    inputs = tf.placeholder(tf.float32,
                            [hparams.batch_size, None, input_size])

  cell = events_rnn_graph.make_rnn_cell(
      hparams.rnn_layer_sizes,
      dropout_keep_prob=hparams.dropout_keep_prob if mode == 'train' else 1.0,
      attn_length=hparams.attn_length,
      residual_connections=hparams.residual_connections)

  rnn_nade = RnnNade(cell,
                     num_dims=input_size,
                     num_hidden=hparams.nade_hidden_units)

  if mode in ('train', 'eval'):
    log_probs, cond_probs = rnn_nade.log_prob(inputs, lengths)

    inputs_flat = tf.to_float(
        magenta.common.flatten_maybe_padded_sequences(inputs, lengths))
    predictions_flat = tf.to_float(tf.greater_equal(cond_probs, .5))

    if mode == 'train':
      loss = tf.reduce_mean(-log_probs)
      perplexity = tf.reduce_mean(tf.exp(log_probs))
      correct_predictions = tf.to_float(
          tf.equal(inputs_flat, predictions_flat))
      accuracy = tf.reduce_mean(correct_predictions)
      precision = (tf.reduce_sum(inputs_flat * predictions_flat) /
                   tf.reduce_sum(predictions_flat))
      recall = (tf.reduce_sum(inputs_flat * predictions_flat) /
                tf.reduce_sum(inputs_flat))

      optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)
      train_op = contrib_slim.learning.create_train_op(
          loss, optimizer, clip_gradient_norm=hparams.clip_norm)
      tf.add_to_collection('train_op', train_op)

      vars_to_summarize = {
          'loss': loss,
          'metrics/perplexity': perplexity,
          'metrics/accuracy': accuracy,
          'metrics/precision': precision,
          'metrics/recall': recall,
      }
    elif mode == 'eval':
      vars_to_summarize, update_ops = contrib_metrics.aggregate_metric_map({
          'loss': tf.metrics.mean(-log_probs),
          'metrics/perplexity': tf.metrics.mean(tf.exp(log_probs)),
          'metrics/accuracy': tf.metrics.accuracy(inputs_flat,
                                                  predictions_flat),
          'metrics/precision': tf.metrics.precision(inputs_flat,
                                                    predictions_flat),
          'metrics/recall': tf.metrics.recall(inputs_flat, predictions_flat),
      })
      for updates_op in update_ops.values():
        tf.add_to_collection('eval_ops', updates_op)
      precision = vars_to_summarize['metrics/precision']
      recall = vars_to_summarize['metrics/recall']
      f1_score = tf.where(
          tf.greater(precision + recall, 0),
          2 * ((precision * recall) / (precision + recall)), 0)
      vars_to_summarize['metrics/f1_score'] = f1_score
    for var_name, var_value in vars_to_summarize.items():
      tf.summary.scalar(var_name, var_value)
      tf.add_to_collection(var_name, var_value)
  elif mode == 'generate':
    initial_state = rnn_nade.zero_state(hparams.batch_size)
    final_state = rnn_nade.steps(inputs, initial_state)
    samples, log_prob = rnn_nade.sample_single(initial_state)

    tf.add_to_collection('inputs', inputs)
    tf.add_to_collection('sample', samples)
    tf.add_to_collection('log_prob', log_prob)

    # Flatten state tuples for metagraph compatibility.
    for state in tf.nest.flatten(initial_state):
      tf.add_to_collection('initial_state', state)
    for state in tf.nest.flatten(final_state):
      tf.add_to_collection('final_state', state)
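# Hedged sketch of the guarded F1 computation above: the tf.where guard keeps
# the metric at 0 instead of NaN when precision + recall == 0. Toy scalars.
import tensorflow as tf

precision = tf.constant(0.)
recall = tf.constant(0.)
f1_score = tf.where(tf.greater(precision + recall, 0),
                    2 * ((precision * recall) / (precision + recall)),
                    0.)
# f1_score evaluates to 0.0 rather than NaN.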
def dot_product_area_attention(q, k, v, bias, dropout_rate=0.0, image_shapes=None, name=None, attention_image_summary=None, save_weights_to=None, dropout_broadcast_dims=None, max_area_width=1, max_area_height=1, memory_height=1, area_key_mode="mean", area_value_mode="sum", top_k_areas=0, area_temperature=1.0, training=True): """Dot-product area attention. Args: q: Tensor with shape [..., length_q, depth_k]. k: Tensor with shape [..., length_kv, depth_k]. Leading dimensions must match with q. v: Tensor with shape [..., length_kv, depth_v] Leading dimensions must match with q. bias: bias Tensor (see attention_bias()) dropout_rate: a float. image_shapes: optional tuple of integer scalars. see comments for attention_image_summary() name: an optional string attention_image_summary: the callback for making image summary of attention. save_weights_to: an optional dictionary to capture attention weights for visualization; the weights tensor will be appended there under a string key created from the variable scope (including name). dropout_broadcast_dims: an optional list of integers less than rank of q. Specifies in which dimensions to broadcast the dropout decisions. max_area_width: the max width allowed for an area. max_area_height: the max height allowed for an area. memory_height: the height of the memory. area_key_mode: the mode for computing area keys, which can be "mean", "concat", "sum", "sample_concat", and "sample_sum". area_value_mode: the mode for computing area values, which can be either "mean", or "sum". top_k_areas: Use the top key areas for attention. area_temperature: the temperature for attention softmax. training: indicating if it is in the training mode. Returns: Tensor with shape [..., length_q, depth_v]. """ tf.logging.info( "dot_product_area_attention: " "area_h=%d, area_w=%d, mem_h=%d, " "area_key_mode=%s, area_value_mode=%s, " "area_temperature=%f", max_area_height, max_area_width, memory_height, area_key_mode, area_value_mode, area_temperature) with tf.variable_scope(name, default_name="dot_product_area_attention", values=[q, k, v]) as scope: mem_shape = common_layers.shape_list(k) batch_size = mem_shape[0] head_size = mem_shape[1] length = mem_shape[2] depth = mem_shape[3] k_area = compute_area_key(tf.reshape(k, [-1, length, depth]), max_area_width=max_area_width, max_area_height=max_area_height, height=memory_height, mode=area_key_mode, training=training) if area_value_mode == "mean": v_area, _, _, _, _ = compute_area_features( tf.reshape(v, [-1, length, depth]), max_area_width=max_area_width, max_area_height=max_area_height, height=memory_height) elif area_value_mode == "max": v_area, _, _ = basic_pool(tf.reshape(v, [-1, length, depth]), max_area_width=max_area_width, max_area_height=max_area_height, height=memory_height, fn=tf.reduce_max) elif area_value_mode == "sum": _, _, v_area, _, _ = compute_area_features( tf.reshape(v, [-1, length, depth]), max_area_width=max_area_width, max_area_height=max_area_height, height=memory_height) else: raise ValueError("Unsupported area value mode=%s" % area_value_mode) k = tf.reshape(k_area, [batch_size, head_size, -1, depth]) v = tf.reshape(v_area, [batch_size, head_size, -1, depth]) logits = tf.matmul(q, k, transpose_b=True) # [..., length_q, length_kv] if bias is not None: bias = common_layers.cast_like(bias, logits) with tf.name_scope("compute_area_att_bias", values=[bias]): bias_shape = common_layers.shape_list(bias) mem_length = bias_shape[-1] bias_values = tf.reshape(tf.to_float(tf.less(bias, -1)), [-1, mem_length, 1]) _, 
_, padding_sum, _, _ = compute_area_features( bias_values, max_area_width=max_area_width, max_area_height=max_area_height, height=memory_height) bias = tf.where(tf.cast(tf.to_int32(padding_sum), tf.bool), tf.fill(tf.shape(padding_sum), -np.inf), tf.zeros_like(padding_sum, dtype=tf.float32)) bias = tf.reshape( bias, [bias_shape[0], bias_shape[1], bias_shape[2], -1]) logits += bias logits = logits / area_temperature weights = tf.nn.softmax(logits, name="attention_weights") if top_k_areas > 0: tf.logging.info("area_attention top_k_areas=%d", top_k_areas) top_k = tf.minimum( common_layers.shape_list(weights)[-1], top_k_areas) top_weights, _ = tf.nn.top_k(weights, k=top_k) min_values = tf.reduce_min(top_weights, -1, keepdims=True) weights = tf.where(tf.greater_equal(weights, min_values), weights, tf.zeros_like(weights)) weights = tf.div(weights, tf.reduce_sum(weights, -1, keepdims=True)) if save_weights_to is not None: save_weights_to[scope.name] = weights save_weights_to[scope.name + "/logits"] = logits # Drop out attention links for each head. weights = common_layers.dropout_with_broadcast_dims( weights, 1.0 - dropout_rate, broadcast_dims=dropout_broadcast_dims) if common_layers.should_generate_summaries( ) and attention_image_summary: attention_image_summary(weights, image_shapes) return tf.matmul(weights, v)
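# Standalone sketch (toy values) of the top_k_areas filtering above: keep only
# the k largest attention weights per query and renormalize them.
import tensorflow as tf

weights = tf.constant([[0.5, 0.3, 0.15, 0.05]])
top_weights, _ = tf.nn.top_k(weights, k=2)
min_values = tf.reduce_min(top_weights, -1, keepdims=True)
kept = tf.where(tf.greater_equal(weights, min_values),
                weights, tf.zeros_like(weights))
kept = tf.div(kept, tf.reduce_sum(kept, -1, keepdims=True))
# kept evaluates to [[0.625, 0.375, 0., 0.]].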
def input_fn_dataset(dataset, flags): """Gets the model input from the given dataset.""" features = {} dataset_descriptor = dataset_descriptors[flags.dataset] def process_label(label): """Preprocesses the label.""" label = tf.image.decode_image(label, channels=1) ignore_label = 255 label = tf.cast(label, tf.int32) if flags.preprocess_divide_label: label /= 255 label = resize_im(label, flags.image_size, ignore_label, 1) label = tf.cast(label, tf.int32) return label def _parse_function(*args): """Parses the tf example.""" serialized_example = args[-1] context_feature_names = { dataset_descriptor.image_id: tf.FixedLenFeature([], tf.string), } sequence_feature_names = {} if flags.use_ref_exp: context_feature_names[REF_EXP_ID] = tf.FixedLenFeature([], tf.string) if flags.use_labels: if dataset_descriptor.has_candidate: context_feature_names[ SELECTED_CANDIDATE_ID] = tf.FixedLenFeature([], tf.int64) sequence_feature_names[ ELEMENTS_MASK_ID] = tf.FixedLenSequenceFeature([], tf.string) else: context_feature_names[ dataset_descriptor.label_id] = tf.FixedLenFeature( [], tf.string) if dataset_descriptor.has_elements_boxes: sequence_feature_names[ dataset_descriptor. elements_box_id] = tf.FixedLenSequenceFeature([4], dtype=tf.float32) if flags.use_elements_texts: sequence_feature_names[ dataset_descriptor. elements_text_id] = tf.FixedLenSequenceFeature([], dtype=tf.string) if flags.use_elements_neighbors: sequence_feature_names[ ELEMENTS_NEIGHBORS_ID] = tf.FixedLenSequenceFeature( [], dtype=tf.string) if flags.use_elements_ref_match: sequence_feature_names[ ELEMENTS_REF_MATCH_ID] = tf.FixedLenSequenceFeature( [], dtype=tf.string) if flags.use_groundtruth_box: context_feature_names[GROUNDTRUTH_XMIN_ID] = tf.FixedLenFeature( [], tf.float32) context_feature_names[GROUNDTRUTH_XMAX_ID] = tf.FixedLenFeature( [], tf.float32) context_feature_names[GROUNDTRUTH_YMIN_ID] = tf.FixedLenFeature( [], tf.float32) context_feature_names[GROUNDTRUTH_YMAX_ID] = tf.FixedLenFeature( [], tf.float32) context_features, sequence_features = tf.parse_single_sequence_example( serialized_example, context_features=context_feature_names, sequence_features=sequence_feature_names, ) features.update(context_features) features.update(sequence_features) if flags.use_elements_texts: features[ELEMENTS_TEXT_ID] = features.pop( dataset_descriptor.elements_text_id) if dataset_descriptor.has_elements_boxes: features[ELEMENTS_BOX_ID] = features.pop( dataset_descriptor.elements_box_id) image = features.pop(dataset_descriptor.image_id) image = tf.image.decode_image(image, channels=3) image = tf.cast(image, tf.float32) mean_pixel = tf.reshape( feature_extractor.mean_pixel(flags.model_variant), [1, 1, 3]) features[IMAGE_PAD_WEIGHTS_ID] = tf.ones_like(image[:, :, 0:1]) features[IMAGE_PAD_WEIGHTS_ID] = resize_im( features[IMAGE_PAD_WEIGHTS_ID], flags.image_size, 0, 1) features[IMAGE_PAD_WEIGHTS_ID] = tf.squeeze( features[IMAGE_PAD_WEIGHTS_ID], 2) if dataset_descriptor.has_elements_boxes: image = resize_im(image, flags.image_size, mean_pixel, 3, features) else: image = resize_im(image, flags.image_size, mean_pixel, 3) if flags.use_labels: if dataset_descriptor.has_candidate: features[ELEMENTS_MASK_ID] = tf.map_fn( process_label, features.pop(ELEMENTS_MASK_ID), parallel_iterations=128, dtype=tf.int32, name="mask_map") features[LABEL_ID] = tf.gather_nd( features[ELEMENTS_MASK_ID], [features[SELECTED_CANDIDATE_ID]]) else: label = features.pop(dataset_descriptor.label_id) label = process_label(label) features[LABEL_ID] = label if flags.use_elements_texts: 
features[ELEMENTS_EXIST_ID] = tf.ones_like( features[ELEMENTS_TEXT_ID], dtype=tf.int32) elif dataset_descriptor.has_elements_boxes: features[ELEMENTS_EXIST_ID] = tf.ones(tf.shape( features[ELEMENTS_BOX_ID])[:1], dtype=tf.int32) if flags.use_elements_neighbors: features[ELEMENTS_NEIGHBORS_ID] = convert_string_neighbors( features[ELEMENTS_NEIGHBORS_ID]) features[IMAGE_ID] = image return features dataset = dataset.map(_parse_function, num_parallel_calls=flags.dataset_threads).prefetch( flags.batch_size) padded_shapes = { IMAGE_ID: [None, None, None], } if flags.use_labels: padded_shapes[LABEL_ID] = [None, None, None] if flags.use_groundtruth_box: padded_shapes[GROUNDTRUTH_XMIN_ID] = [] padded_shapes[GROUNDTRUTH_XMAX_ID] = [] padded_shapes[GROUNDTRUTH_YMIN_ID] = [] padded_shapes[GROUNDTRUTH_YMAX_ID] = [] if flags.use_elements_texts: padded_shapes[ELEMENTS_TEXT_ID] = [None] padded_shapes[ELEMENTS_EXIST_ID] = [None] if dataset_descriptor.has_elements_boxes: padded_shapes[ELEMENTS_BOX_ID] = [None, None] padded_shapes[ELEMENTS_EXIST_ID] = [None] if flags.use_elements_neighbors: padded_shapes[ELEMENTS_NEIGHBORS_ID] = [None, None] if flags.use_elements_ref_match: padded_shapes[ELEMENTS_REF_MATCH_ID] = [None] padded_shapes[IMAGE_PAD_WEIGHTS_ID] = [None, None] if flags.use_ref_exp: padded_shapes.update({ REF_EXP_ID: [], }) if dataset_descriptor.has_candidate: padded_shapes.update({ SELECTED_CANDIDATE_ID: [], ELEMENTS_MASK_ID: [None, None, None, None], }) dataset = dataset.padded_batch(flags.batch_size, padded_shapes=padded_shapes) dataset = dataset.prefetch(1) try: iterator = dataset.make_one_shot_iterator() feature_map = iterator.get_next() except ValueError: # This means the input pipeline uses placeholders probably because it's in # inference mode. feature_map = tf.contrib.data.get_single_element(dataset) feature_map[IMAGE_ID] = tf.reshape( feature_map[IMAGE_ID], [-1, flags.image_size, flags.image_size, 3]) assert_ops = [] if dataset_descriptor.has_elements_boxes: assert_ops.append( assert_or_warn( tf.greater_equal(tf.reduce_min(feature_map[ELEMENTS_BOX_ID]), -.001), [ "Bounding box is negative", tf.reduce_min(feature_map[ELEMENTS_BOX_ID]) ], flags.incorrect_boxes_as_errors)) assert_ops.append( assert_or_warn( tf.less_equal( tf.reduce_max(feature_map[ELEMENTS_BOX_ID][:, :, 0] + feature_map[ELEMENTS_BOX_ID][:, :, 2]), 1.001), [ "Bounding box x dim is too large.", tf.reduce_max(feature_map[ELEMENTS_BOX_ID][:, :, 0] + feature_map[ELEMENTS_BOX_ID][:, :, 2]) ], flags.incorrect_boxes_as_errors)) assert_ops.append( assert_or_warn( tf.less_equal( tf.reduce_max(feature_map[ELEMENTS_BOX_ID][:, :, 1] + feature_map[ELEMENTS_BOX_ID][:, :, 3]), 1.001), [ "Bounding box y dim is too large.", tf.reduce_max(feature_map[ELEMENTS_BOX_ID][:, :, 1] + feature_map[ELEMENTS_BOX_ID][:, :, 3]) ], flags.incorrect_boxes_as_errors)) with tf.control_dependencies(assert_ops): if dataset_descriptor.has_elements_boxes: feature_map[ELEMENTS_BOX_ID].set_shape([None, None, 4]) feature_map[ELEMENTS_EXIST_ID] = tf.cast( feature_map[ELEMENTS_EXIST_ID], tf.bool) if flags.use_labels: if flags.output_mode == "segment" or flags.output_mode == "regression": feature_map[LABEL_ID] = tf.reshape( feature_map[LABEL_ID], [-1, flags.image_size, flags.image_size, 1]) return feature_map
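# Hedged sketch of the padded-batch step above: ragged per-example tensors are
# padded to the longest example in each batch. The dataset and feature keys
# here are illustrative, not the ones defined by the flags.
toy_padded_shapes = {"image": [None, None, 3], "elements_box": [None, 4]}
toy_batched = parsed_dataset.padded_batch(4, padded_shapes=toy_padded_shapes)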
def _is_enough_agreement(example):
  """Returns whether `example['agreement_count']` reaches `required_agreement`.

  `required_agreement` is captured from the enclosing scope.
  """
  return tf.greater_equal(example['agreement_count'], required_agreement)
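# Hedged usage sketch: filtering a tf.data.Dataset of parsed examples with the
# predicate above. `examples_dataset` and the threshold are illustrative;
# `required_agreement` is the variable captured from the enclosing scope.
required_agreement = 3
filtered = examples_dataset.filter(_is_enough_agreement)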
def compare_dims(a, b, x):
  """At least `x` of `a` and `b` `Tensors` are equal."""
  match = tf.equal(a, b)
  match = tf.cast(match, tf.int32)
  return tf.greater_equal(tf.reduce_sum(match), x)
def _static_subsample(self, indicator, batch_size, labels):
  """Returns subsampled minibatch.

  Args:
    indicator: boolean tensor of shape [N] whose True entries can be sampled.
      N should be a compile time constant.
    batch_size: desired batch size. This scalar cannot be None.
    labels: boolean tensor of shape [N] denoting positive(=True) and negative
      (=False) examples. N should be a compile time constant.

  Returns:
    sampled_idx_indicator: boolean tensor of shape [N], True for entries which
      are sampled. It ensures the length of output of the subsample is always
      batch_size, even when number of examples set to True in indicator is
      less than batch_size.

  Raises:
    ValueError: if labels and indicator are not 1D boolean tensors.
  """
  # Check if indicator and labels have a static size.
  if not indicator.shape.is_fully_defined():
    raise ValueError('indicator must be static in shape when is_static is '
                     'True')
  if not labels.shape.is_fully_defined():
    raise ValueError('labels must be static in shape when is_static is '
                     'True')
  if not isinstance(batch_size, int):
    raise ValueError('batch_size has to be an integer when is_static is '
                     'True.')

  input_length = tf.shape(indicator)[0]

  # Set the number of examples set True in indicator to be at least
  # batch_size.
  num_true_sampled = tf.reduce_sum(tf.cast(indicator, tf.float32))
  additional_false_sample = tf.less_equal(
      tf.cumsum(tf.cast(tf.logical_not(indicator), tf.float32)),
      batch_size - num_true_sampled)
  indicator = tf.logical_or(indicator, additional_false_sample)

  # Shuffle indicator and label. Need to store the permutation to restore the
  # order post sampling.
  permutation = tf.random_shuffle(tf.range(input_length))
  indicator = ops.matmul_gather_on_zeroth_axis(
      tf.cast(indicator, tf.float32), permutation)
  labels = ops.matmul_gather_on_zeroth_axis(
      tf.cast(labels, tf.float32), permutation)

  # Index (starting from 1) when indicator is True, 0 when False.
  indicator_idx = tf.where(
      tf.cast(indicator, tf.bool), tf.range(1, input_length + 1),
      tf.zeros(input_length, tf.int32))

  # Replace -1 for negative, +1 for positive labels.
  signed_label = tf.where(
      tf.cast(labels, tf.bool), tf.ones(input_length, tf.int32),
      tf.scalar_mul(-1, tf.ones(input_length, tf.int32)))
  # Negative of index for negative label, positive index for positive label,
  # 0 when indicator is False.
  signed_indicator_idx = tf.multiply(indicator_idx, signed_label)
  sorted_signed_indicator_idx = tf.nn.top_k(
      signed_indicator_idx, input_length, sorted=True).values

  [num_positive_samples,
   num_negative_samples] = self._get_num_pos_neg_samples(
       sorted_signed_indicator_idx, batch_size)

  sampled_idx = self._get_values_from_start_and_end(
      sorted_signed_indicator_idx, num_positive_samples,
      num_negative_samples, batch_size)

  # Shift the indices to start from 0 and remove any samples that are set as
  # False.
  sampled_idx = tf.abs(sampled_idx) - tf.ones(batch_size, tf.int32)
  sampled_idx = tf.multiply(
      tf.cast(tf.greater_equal(sampled_idx, tf.constant(0)), tf.int32),
      sampled_idx)

  sampled_idx_indicator = tf.cast(
      tf.reduce_sum(tf.one_hot(sampled_idx, depth=input_length), axis=0),
      tf.bool)

  # Project back the order based on stored permutations.
  reprojections = tf.one_hot(permutation, depth=input_length,
                             dtype=tf.float32)
  return tf.cast(
      tf.tensordot(tf.cast(sampled_idx_indicator, tf.float32),
                   reprojections, axes=[0, 0]),
      tf.bool)
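# Toy illustration (hypothetical values) of the signed-index trick above:
# after multiplying 1-based indices by +/-1 labels, a descending sort puts
# positive samples at the front, negatives at the back, and unsampleable
# entries (zeros) in the middle, so a fixed-size minibatch can be sliced from
# both ends.
import tensorflow as tf

indicator_idx = tf.constant([1, 0, 3, 4, 0, 6])   # 1-based index where sampleable, else 0
signed_label = tf.constant([1, -1, -1, 1, 1, -1])
signed_idx = indicator_idx * signed_label          # [1, 0, -3, 4, 0, -6]
sorted_idx = tf.nn.top_k(signed_idx, k=6, sorted=True).values
# sorted_idx evaluates to [4, 1, 0, 0, -3, -6].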
def _at_least_x_are_equal(a, b, x):
  """At least `x` of `a` and `b` `Tensors` are equal."""
  match = tf.equal(a, b)
  match = tf.cast(match, tf.int32)
  return tf.greater_equal(tf.reduce_sum(match), x)
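# Hedged usage sketch for the equality helper above (compare_dims is
# analogous): decide whether a random crop failed, i.e. returned a tensor with
# essentially the original shape, and fall back accordingly. `image` and
# `cropped` are hypothetical tensors.
original_shape = tf.shape(image)
bad_crop = _at_least_x_are_equal(original_shape, tf.shape(cropped), 3)
result = tf.cond(bad_crop,
                 lambda: tf.image.resize_images(image, [224, 224]),
                 lambda: tf.image.resize_images(cropped, [224, 224]))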
def _double_factorial_loop_body(n, result, two):
  # Accumulates log(n) for entries still >= two, building up the double
  # factorial in log space.
  result = tf.where(tf.greater_equal(n, two), result + tf.math.log(n), result)
  return n - two, result, two
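# Hedged sketch of driving the loop body above with tf.while_loop to compute
# log(n!!) elementwise. The stopping condition used here (tf.reduce_any) is an
# illustrative stand-in for the dedicated loop-condition helper; toy inputs.
import tensorflow as tf

n = tf.constant([5.0, 6.0])
two = tf.constant(2.0)
result = tf.zeros_like(n)
_, log_double_factorial, _ = tf.while_loop(
    lambda n, result, two: tf.reduce_any(tf.greater_equal(n, two)),
    _double_factorial_loop_body,
    [n, result, two])
# tf.exp(log_double_factorial) is approximately [15., 48.]  (5!! = 15, 6!! = 48).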