Example #1
def _double_factorial_loop_condition(n, result, two):
    del result  # Unused
    return tf.cast(tf.math.count_nonzero(tf.greater_equal(n, two)), tf.bool)
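A minimal usage sketch (not from the original repo) of how this loop condition might pair with a body function in tf.while_loop to compute an element-wise double factorial; the body below is a hypothetical counterpart written for illustration, assuming the condition above is in scope:

import tensorflow as tf

def _double_factorial_loop_body(n, result, two):
    # Hypothetical body: multiply the running product by n wherever n >= two,
    # then decrement n by two.
    result = tf.where(tf.greater_equal(n, two), result * n, result)
    return n - two, result, two

n = tf.constant([5.0, 6.0, 7.0])
two = tf.constant(2.0)
result = tf.ones_like(n)
_, double_factorial, _ = tf.while_loop(
    _double_factorial_loop_condition, _double_factorial_loop_body,
    [n, result, two])
# double_factorial -> [15., 48., 105.]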
Example #2
def assign_and_sample_proposals(proposed_boxes,
                                gt_boxes,
                                gt_classes,
                                num_samples_per_image=512,
                                mix_gt_boxes=True,
                                fg_fraction=0.25,
                                fg_iou_thresh=0.5,
                                bg_iou_thresh_hi=0.5,
                                bg_iou_thresh_lo=0.0):
    """Assigns the proposals with groundtruth classes and performs subsmpling.

  Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
  following algorithm to generate the final `num_samples_per_image` RoIs.
    1. Calculates the IoU between each proposal box and each gt box.
    2. Assigns each proposed box with a groundtruth class and box by choosing
       the largest IoU overlap.
    3. Samples `num_samples_per_image` boxes from all proposed boxes, and
       returns box_targets, class_targets, and RoIs.

  Args:
    proposed_boxes: a tensor of shape [batch_size, N, 4]. N is the number
      of proposals before groundtruth assignment. The last dimension is the
      box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
      format.
    gt_boxes: a tensor of shape [batch_size, MAX_NUM_INSTANCES, 4].
      The coordinates of gt_boxes are in the pixel coordinates of the scaled
      image. This tensor might be padded with values of -1 indicating invalid
      box coordinates.
    gt_classes: a tensor of shape [batch_size, MAX_NUM_INSTANCES]. This
      tensor might be padded with values of -1 indicating invalid classes.
    num_samples_per_image: an integer representing the RoI minibatch size per
      image.
    mix_gt_boxes: a bool indicating whether to mix the groundtruth boxes before
      sampling proposals.
    fg_fraction: a float representing the target fraction of the RoI minibatch
      that is labeled foreground (i.e., class > 0).
    fg_iou_thresh: a float representing the IoU overlap threshold for an RoI to
      be considered foreground (if >= fg_iou_thresh).
    bg_iou_thresh_hi: a float representing the IoU overlap threshold for an RoI
      to be considered background (class = 0 if overlap is in [LO, HI)).
    bg_iou_thresh_lo: a float representing the IoU overlap threshold for an RoI
      to be considered background (class = 0 if overlap is in [LO, HI)).

  Returns:
    sampled_rois: a tensor of shape [batch_size, K, 4], representing the
      coordinates of the sampled RoIs, where K is the number of sampled
      RoIs, i.e. K = num_samples_per_image.
    sampled_gt_boxes: a tensor of shape [batch_size, K, 4], storing the
      box coordinates of the matched groundtruth boxes of the sampled RoIs.
    sampled_gt_classes: a tensor of shape [batch_size, K], storing the
      classes of the matched groundtruth boxes of the sampled RoIs.
    sampled_gt_indices: a tensor of shape [batch_size, K], storing the
      indices of the sampled groundtruth boxes in the original `gt_boxes`
      tensor, i.e. gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
  """

    with tf.name_scope('sample_proposals'):
        if mix_gt_boxes:
            boxes = tf.concat([proposed_boxes, gt_boxes], axis=1)
        else:
            boxes = proposed_boxes

        (matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
         _) = box_matching(boxes, gt_boxes, gt_classes)

        positive_match = tf.greater(matched_iou, fg_iou_thresh)
        negative_match = tf.logical_and(
            tf.greater_equal(matched_iou, bg_iou_thresh_lo),
            tf.less(matched_iou, bg_iou_thresh_hi))
        ignored_match = tf.less(matched_iou, 0.0)

        # re-assign negatively matched boxes to the background class.
        matched_gt_classes = tf.where(negative_match,
                                      tf.zeros_like(matched_gt_classes),
                                      matched_gt_classes)
        matched_gt_indices = tf.where(negative_match,
                                      tf.zeros_like(matched_gt_indices),
                                      matched_gt_indices)

        sample_candidates = tf.logical_and(
            tf.logical_or(positive_match, negative_match),
            tf.logical_not(ignored_match))

        sampler = (
            balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
                positive_fraction=fg_fraction, is_static=True))

        batch_size, _ = sample_candidates.get_shape().as_list()
        sampled_indicators = []
        for i in range(batch_size):
            sampled_indicator = sampler.subsample(sample_candidates[i],
                                                  num_samples_per_image,
                                                  positive_match[i])
            sampled_indicators.append(sampled_indicator)
        sampled_indicators = tf.stack(sampled_indicators)
        _, sampled_indices = tf.nn.top_k(tf.cast(sampled_indicators,
                                                 dtype=tf.int32),
                                         k=num_samples_per_image,
                                         sorted=True)

        sampled_indices_shape = tf.shape(sampled_indices)
        batch_indices = (
            tf.expand_dims(tf.range(sampled_indices_shape[0]), axis=-1) *
            tf.ones([1, sampled_indices_shape[-1]], dtype=tf.int32))
        gather_nd_indices = tf.stack([batch_indices, sampled_indices], axis=-1)

        sampled_rois = tf.gather_nd(boxes, gather_nd_indices)
        sampled_gt_boxes = tf.gather_nd(matched_gt_boxes, gather_nd_indices)
        sampled_gt_classes = tf.gather_nd(matched_gt_classes,
                                          gather_nd_indices)
        sampled_gt_indices = tf.gather_nd(matched_gt_indices,
                                          gather_nd_indices)

        return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
                sampled_gt_indices)
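A self-contained toy sketch (illustrative values, not the original pipeline) of the foreground/background assignment rule used above: IoU above fg_iou_thresh is foreground, IoU in [bg_iou_thresh_lo, bg_iou_thresh_hi) is background, and a negative IoU (padding) is ignored:

import tensorflow as tf

# Toy matched IoU values for 5 proposals; -1.0 marks a padded/invalid match.
matched_iou = tf.constant([0.8, 0.3, 0.55, 0.1, -1.0])
fg_iou_thresh, bg_iou_thresh_lo, bg_iou_thresh_hi = 0.5, 0.0, 0.5

positive_match = tf.greater(matched_iou, fg_iou_thresh)
negative_match = tf.logical_and(
    tf.greater_equal(matched_iou, bg_iou_thresh_lo),
    tf.less(matched_iou, bg_iou_thresh_hi))
ignored_match = tf.less(matched_iou, 0.0)

sample_candidates = tf.logical_and(
    tf.logical_or(positive_match, negative_match),
    tf.logical_not(ignored_match))
# positive_match    -> [True, False, True, False, False]
# negative_match    -> [False, True, False, True, False]
# ignored_match     -> [False, False, False, False, True]
# sample_candidates -> [True, True, True, True, False]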
Example #3
    def _build_outputs(self, images, labels, mode):
        is_training = mode == mode_keys.TRAIN
        model_outputs = {}

        if 'anchor_boxes' in labels:
            anchor_boxes = labels['anchor_boxes']
        else:
            anchor_boxes = anchor.Anchor(
                self._params.architecture.min_level,
                self._params.architecture.max_level,
                self._params.anchor.num_scales,
                self._params.anchor.aspect_ratios,
                self._params.anchor.anchor_size,
                images.get_shape().as_list()[1:3]).multilevel_boxes

            batch_size = tf.shape(images)[0]
            for level in anchor_boxes:
                anchor_boxes[level] = tf.tile(
                    tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1])

        backbone_features = self._backbone_fn(images, is_training)
        fpn_features = self._fpn_fn(backbone_features, is_training)

        rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
            fpn_features, is_training)
        model_outputs.update({
            'rpn_score_outputs': rpn_score_outputs,
            'rpn_box_outputs': rpn_box_outputs,
        })
        rpn_rois, _ = self._generate_rois_fn(rpn_box_outputs,
                                             rpn_score_outputs, anchor_boxes,
                                             labels['image_info'][:, 1, :],
                                             is_training)

        if is_training:
            rpn_rois = tf.stop_gradient(rpn_rois)

            # Sample proposals.
            rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
                self._sample_rois_fn(rpn_rois, labels['gt_boxes'],
                                     labels['gt_classes']))

            self.add_scalar_summary(
                'fg_bg_ratio_{}'.format(0),
                tf.reduce_sum(
                    tf.cast(tf.greater(matched_gt_classes, 0), tf.float32)) /
                tf.reduce_sum(
                    tf.cast(tf.greater_equal(matched_gt_classes, 0),
                            tf.float32)))

            # Create bounding box training targets.
            box_targets = box_utils.encode_boxes(
                matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])
            # If the target is background, the box target is set to all 0s.
            box_targets = tf.where(
                tf.tile(
                    tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
                    [1, 1, 4]), tf.zeros_like(box_targets), box_targets)
            model_outputs.update({
                'class_targets': matched_gt_classes,
                'box_targets': box_targets,
            })

        _, num_rois_before_cat, _ = rpn_rois.get_shape().as_list()

        if is_training and self._feat_distill:
            tf.logging.info(f'rois before concat distill boxes: {rpn_rois}')
            rpn_rois = tf.concat([rpn_rois, labels['roi_boxes']], axis=1)
            # [batch_size, num_rois+max_distill_rois, 4]
            tf.logging.info(f'rois after concat distill boxes: {rpn_rois}')

        roi_features = spatial_transform_ops.multilevel_crop_and_resize(
            fpn_features, rpn_rois, output_size=7)

        if is_training and self._feat_distill:
            tf.logging.info(f'rois before split: {rpn_rois}')
            rpn_rois, _ = tf.split(
                rpn_rois, [num_rois_before_cat, self._max_distill_rois],
                axis=1)
            tf.logging.info(f'rois after split: {rpn_rois}')

        (class_outputs, box_outputs, distill_feat_outputs,
         distill_class_outputs) = self._frcnn_head_fn(roi_features,
                                                      is_training)
        model_outputs.update({
            'class_outputs': class_outputs,
            'box_outputs': box_outputs,
        })
        if is_training and self._feat_distill:
            model_outputs.update(
                {'distill_feat_outputs': distill_feat_outputs})

        if not is_training:
            detection_results = self._generate_detections_fn(
                box_outputs,
                class_outputs,
                rpn_rois,
                labels['image_info'][:, 1:2, :],
                bbox_per_class=not self._params.frcnn_head.
                class_agnostic_bbox_pred,
                distill_class_outputs=distill_class_outputs,
            )
            model_outputs.update(detection_results)

        if not self._include_mask:
            return model_outputs

        if is_training:
            rpn_rois, classes, mask_targets = self._sample_masks_fn(
                rpn_rois, matched_gt_boxes, matched_gt_classes,
                matched_gt_indices, labels['gt_masks'])
            mask_targets = tf.stop_gradient(mask_targets)

            classes = tf.cast(classes, dtype=tf.int32)

            model_outputs.update({
                'mask_targets': mask_targets,
                'sampled_class_targets': classes,
            })
        else:
            rpn_rois = detection_results['detection_boxes']
            classes = tf.cast(detection_results['detection_classes'],
                              dtype=tf.int32)

        mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
            fpn_features, rpn_rois, output_size=14)

        mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes,
                                           is_training)

        if is_training:
            model_outputs.update({
                'mask_outputs': mask_outputs,
            })
        else:
            model_outputs.update(
                {'detection_masks': tf.nn.sigmoid(mask_outputs)})

        return model_outputs
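A small standalone sketch (toy tensors, hypothetical values) of the background-target zeroing used in the training branch above, where RoIs matched to class 0 get all-zero box regression targets:

import tensorflow as tf

# One image, three RoIs; class 0 means background.
matched_gt_classes = tf.constant([[2, 0, 5]])
box_targets = tf.random.uniform([1, 3, 4])

# Zero out regression targets for background RoIs.
box_targets = tf.where(
    tf.tile(tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
            [1, 1, 4]),
    tf.zeros_like(box_targets), box_targets)
# Row 1 of box_targets is now all zeros; rows 0 and 2 are unchanged.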
Example #4
def _scan_step_fn(state, example, packed_length, queue_size, spacing,
                  num_sequences, token_dtype):  # pylint: disable=g-doc-args
    """Transform function used by tf.data.experimental.scan to process an example.

  This is written as a stateless function rather than a class method because we
  trace it with AutoGraph (in order to simplify the conditional), and this way
  we don't have to worry about handling re-tracing semantics.

  Args:
    See the SequenceDatasetPacker class.

  Returns:
    The updated queue state, and either a packed example or a dummy sequence
    which will be filtered out downstream.
  """

    # Convert TensorArray tuples to lists since we'll need to replace them.
    availability, contents, top_index = state

    lengths = tf.concat([tf.shape(i) for i in example], axis=0)
    start_availability = availability.stack()
    can_fit = tf.reduce_all(tf.greater_equal(start_availability, lengths),
                            axis=1)
    any_can_fit = tf.reduce_any(can_fit, axis=0)

    # AutoGraph will convert this block to a tf.cond
    if any_can_fit:
        # This indicates where in the FFD queue rotation a given index sits
        shifted_range = (tf.range(queue_size, dtype=INDEX_DTYPE) -
                         top_index) % queue_size

        # Mark any indices which cannot accommodate the current example.
        exclusion_mask = tf.cast(tf.logical_not(can_fit),
                                 INDEX_DTYPE) * queue_size

        # Index in [0, queue_size) in which to place the sample. Note, this index
        # is the position in the actual TensorArray, not the index of the FFD queue.
        queue_index = (tf.reduce_min(shifted_range + exclusion_mask) +
                       top_index) % queue_size

        # NOTE(taylorrobie): We emit a non-empty Tensor for downstream checks.
        output_contents = -tf.ones((1, num_sequences), dtype=token_dtype)

    else:
        index_range = top_index * packed_length + tf.range(packed_length)
        output_contents = contents.gather(index_range)

        # Reset the queue state.
        availability = availability.write(
            top_index,
            packed_length * tf.ones((num_sequences, ), dtype=INDEX_DTYPE))
        empty_contents = tf.zeros((packed_length, num_sequences * 2),
                                  dtype=token_dtype)
        contents = contents.scatter(index_range, empty_contents)

        queue_index = top_index
        top_index = (top_index + 1) % queue_size

    pre_assign_availability = availability.read(queue_index)
    space_left = pre_assign_availability - lengths - spacing
    availability = availability.write(queue_index, space_left)

    # ============================================================================
    # == Update contents =========================================================
    # ============================================================================
    # Consider the following case for a seq-to-seq packing:
    #   (padding is represented as underscores)
    #
    #   Queue starting state:
    #     [1, 3, 2, 4, 6, 1, _, _, _, _, _, ...]
    #     [5, 9, _, _, _, _, _, _, _, _, _, ...]
    #
    #   Examples:
    #     [4, 2, 4], [3]
    #
    #   Desired new queue state:
    #     [1, 3, 2, 4, 6, 1, _, _, 4, 2, 4, _, _, ...]
    #     [5, 9, _, _, 3, _, _, _, _, _, _, _, _, ...]
    #
    # This could be accomplished by creating a TensorArray for each of the two
    # sequences, and scattering into the respective arrays. However TensorArray
    # writes are extremely expensive relative to other operations. So instead we
    # store the contents in a single TensorArray of shape (packed_length, 2), and
    # we pad and concatenate the examples such that they can be added in a single
    # assign:
    #
    #              [_, _, _, _, 4, 2, 4]
    #              [3, _, _, _, _, _, _]
    #                        +
    #  [1, 3, 2, 4, 6, 1, _, _, _, _, _, ...]
    #  [5, 9, _, _, _, _, _, _, _, _, _, ...]
    #
    # And in practice, the extra work of padding is negligible compared to
    # the gain from vectorizing the TensorArray assign. We also store a bit mask
    # denoting where sequences start which is used to compute segment and
    # position metadata:
    #
    #              [_, _, _, _, 1, _, _]
    #              [1, _, _, _, _, _, _]
    #                        +
    #  [1, _, _, _, _, _, _, _, _, _, _, ...]
    #  [1, _, _, _, _, _, _, _, _, _, _, ...]
    #
    # Both the contents and the mask are concatenated in the same TensorArray
    # for performance.

    start_index = packed_length - pre_assign_availability
    end_index = start_index + lengths
    leftmost = tf.reduce_min(start_index, axis=0)
    rightmost = tf.reduce_max(end_index, axis=0)
    delta = rightmost - leftmost
    pad_indices = [
        tf.stack((start_index[i] - leftmost, rightmost - end_index[i]))
        for i in range(num_sequences)
    ]

    padded_examples = [
        tf.pad(ex, padding[tf.newaxis, :])
        for ex, padding in zip(example, pad_indices)
    ]
    padded_examples = tf.transpose(tf.stack(padded_examples))
    mask_update = tf.one_hot(start_index - leftmost,
                             delta,
                             dtype=contents.dtype,
                             axis=0)

    content_update = tf.concat([padded_examples, mask_update], axis=1)

    index_range = (
        queue_index * packed_length +  # Offset into the right section.
        tf.range(delta, dtype=INDEX_DTYPE) + leftmost)
    contents = contents.scatter(index_range,
                                contents.gather(index_range) + content_update)

    state = (availability, contents, top_index)
    return state, (tf.logical_not(any_can_fit), output_contents)
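A standalone toy version (illustrative numbers only) of the pad-and-concat trick described in the comment above: both sequences are padded to the common window [leftmost, rightmost) so a single addition/scatter can update the packed rows:

import tensorflow as tf

num_sequences = 2
example = [tf.constant([4, 2, 4]), tf.constant([3])]
lengths = tf.concat([tf.shape(i) for i in example], axis=0)   # [3, 1]
start_index = tf.constant([6, 4])   # how much of each row is already filled
end_index = start_index + lengths   # [9, 5]

leftmost = tf.reduce_min(start_index)    # 4
rightmost = tf.reduce_max(end_index)     # 9
delta = rightmost - leftmost             # 5

pad_indices = [
    tf.stack((start_index[i] - leftmost, rightmost - end_index[i]))
    for i in range(num_sequences)
]
padded = [tf.pad(ex, padding[tf.newaxis, :])
          for ex, padding in zip(example, pad_indices)]
update = tf.transpose(tf.stack(padded))   # shape (delta, num_sequences)
# update -> [[0, 3], [0, 0], [4, 0], [2, 0], [4, 0]]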
Example #5
    def _parse_train_data(self, data):
        """Parse data for ShapeMask training."""
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        masks = data['groundtruth_instance_masks']
        is_crowds = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtruths = tf.shape(classes)[0]
            with tf.control_dependencies([num_groundtruths, is_crowds]):
                indices = tf.cond(
                    tf.greater(tf.size(is_crowds), 0),
                    lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)
            masks = tf.gather(masks, indices)

        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(image)[0:2]

        # If not using category, binarizes classes: all foreground classes become 1.
        if not self._use_category:
            classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            image, boxes, masks = input_utils.random_horizontal_flip(
                image, boxes, masks)

        # Converts boxes from normalized coordinates to pixel coordinates.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            self._output_size,
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_scale = image_info[2, :]
        offset = image_info[3, :]

        # Resizes and crops boxes and masks.
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  self._output_size, offset)

        # Filters out ground truth boxes that are all zeros.
        indices = input_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)
        masks = tf.gather(masks, indices)

        # Assigns anchors.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size, self._output_size)
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        # Sample groundtruth masks/boxes/classes for mask branch.
        num_masks = tf.shape(masks)[0]
        mask_shape = tf.shape(masks)[1:3]

        # Pad sampled boxes/masks/classes to a constant batch size.
        padded_boxes = input_utils.pad_to_fixed_size(boxes,
                                                     self._num_sampled_masks)
        padded_classes = input_utils.pad_to_fixed_size(classes,
                                                       self._num_sampled_masks)
        padded_masks = input_utils.pad_to_fixed_size(masks,
                                                     self._num_sampled_masks)

        # Randomly sample groundtruth masks for mask branch training. For images
        # without groundtruth masks, the dummy padded tensors are sampled.
        rand_indices = tf.random.shuffle(
            tf.range(tf.maximum(num_masks, self._num_sampled_masks)))
        rand_indices = tf.mod(rand_indices, tf.maximum(num_masks, 1))
        rand_indices = rand_indices[0:self._num_sampled_masks]
        rand_indices = tf.reshape(rand_indices, [self._num_sampled_masks])

        sampled_boxes = tf.gather(padded_boxes, rand_indices)
        sampled_classes = tf.gather(padded_classes, rand_indices)
        sampled_masks = tf.gather(padded_masks, rand_indices)
        # Jitter the sampled boxes to mimic the noisy detections.
        sampled_boxes = box_utils.jitter_boxes(
            sampled_boxes, noise_scale=self._box_jitter_scale)
        sampled_boxes = box_utils.clip_boxes(sampled_boxes, self._output_size)
        # Compute mask targets in feature crop. A feature crop fully contains a
        # sampled box.
        mask_outer_boxes = box_utils.compute_outer_boxes(
            sampled_boxes, tf.shape(image)[0:2], scale=self._outer_box_scale)
        mask_outer_boxes = box_utils.clip_boxes(mask_outer_boxes,
                                                self._output_size)
        # Compensate for the offset of mask_outer_boxes to map it back to the
        # original image scale.
        mask_outer_boxes_ori = mask_outer_boxes
        mask_outer_boxes_ori += tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
        mask_outer_boxes_ori /= tf.tile(tf.expand_dims(image_scale, axis=0),
                                        [1, 2])
        norm_mask_outer_boxes_ori = box_utils.normalize_boxes(
            mask_outer_boxes_ori, mask_shape)

        # Set sampled_masks shape to [batch_size, height, width, 1].
        sampled_masks = tf.cast(tf.expand_dims(sampled_masks, axis=-1),
                                tf.float32)
        mask_targets = tf.image.crop_and_resize(
            sampled_masks,
            norm_mask_outer_boxes_ori,
            box_ind=tf.range(self._num_sampled_masks),
            crop_size=[self._mask_crop_size, self._mask_crop_size],
            method='bilinear',
            extrapolation_value=0,
            name='train_mask_targets')
        mask_targets = tf.where(tf.greater_equal(mask_targets, 0.5),
                                tf.ones_like(mask_targets),
                                tf.zeros_like(mask_targets))
        mask_targets = tf.squeeze(mask_targets, axis=-1)
        if self._up_sample_factor > 1:
            fine_mask_targets = tf.image.crop_and_resize(
                sampled_masks,
                norm_mask_outer_boxes_ori,
                box_ind=tf.range(self._num_sampled_masks),
                crop_size=[
                    self._mask_crop_size * self._up_sample_factor,
                    self._mask_crop_size * self._up_sample_factor
                ],
                method='bilinear',
                extrapolation_value=0,
                name='train_mask_targets')
            fine_mask_targets = tf.where(
                tf.greater_equal(fine_mask_targets, 0.5),
                tf.ones_like(fine_mask_targets),
                tf.zeros_like(fine_mask_targets))
            fine_mask_targets = tf.squeeze(fine_mask_targets, axis=-1)
        else:
            fine_mask_targets = mask_targets

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        valid_image = tf.cast(tf.not_equal(num_masks, 0), tf.int32)
        if self._mask_train_class == 'all':
            mask_is_valid = valid_image * tf.ones_like(sampled_classes,
                                                       tf.int32)
        else:
            # Get the intersection of sampled classes with training splits.
            mask_valid_classes = tf.cast(
                tf.expand_dims(
                    class_utils.coco_split_class_ids(self._mask_train_class),
                    1), sampled_classes.dtype)
            match = tf.reduce_any(
                tf.equal(tf.expand_dims(sampled_classes, 0),
                         mask_valid_classes), 0)
            mask_is_valid = valid_image * tf.cast(match, tf.int32)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': input_anchor.multilevel_boxes,
            'num_positives': num_positives,
            'image_info': image_info,
            # For ShapeMask.
            'mask_boxes': sampled_boxes,
            'mask_outer_boxes': mask_outer_boxes,
            'mask_targets': mask_targets,
            'fine_mask_targets': fine_mask_targets,
            'mask_classes': sampled_classes,
            'mask_is_valid': mask_is_valid,
        }
        return image, labels
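A reduced sketch (toy masks and boxes, hypothetical sizes) of the mask-target construction above: each sampled instance mask is cropped with its normalized outer box via tf.image.crop_and_resize and the bilinear output is binarized at 0.5:

import tensorflow as tf

num_sampled_masks, mask_crop_size = 4, 28
masks = tf.cast(tf.random.uniform([num_sampled_masks, 32, 32, 1]) > 0.5,
                tf.float32)
# Normalized outer boxes in [ymin, xmin, ymax, xmax] format.
norm_boxes = tf.constant([[0.0, 0.0, 0.5, 0.5]] * num_sampled_masks)

crops = tf.image.crop_and_resize(
    masks, norm_boxes, tf.range(num_sampled_masks),
    crop_size=[mask_crop_size, mask_crop_size], method='bilinear')
mask_targets = tf.where(tf.greater_equal(crops, 0.5),
                        tf.ones_like(crops), tf.zeros_like(crops))
mask_targets = tf.squeeze(mask_targets, axis=-1)   # [4, 28, 28]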
Example #6
    def gte(self, x, y):
        return tf.greater_equal(x, y)
Example #7
    def _update_random_mask(self, weights, mask):
        """Randomly identifies subset of weights to be set to zero in the network.

       If pruning method is specified as 'random_cumulative', at each pruning
       step a random subset of weights is set to zero taking into account which
       weights are still non-zero.

       If pruning method is specified to be 'random_independent', the random
       weights selected at each pruning step are entirely independent
       of previous pruning steps.

    Args:
      weights: The weight tensor that needs to be masked.
      mask: The mask from the previous pruning update.

    Returns:
      A tuple of the sparsity variable and `new_mask`, a tensor of the same
        size and shape as weights containing 0 or 1.
    Raises:
      ValueError: Raises ValueError if sparsity is not defined.
    """

        if self._sparsity is None:
            raise ValueError('Sparsity variable undefined')

        sparsity = self._get_sparsity(weights.op.name)
        with tf.name_scope(weights.op.name + '_pruning_ops'):

            if self._pruning_method == 'random_cumulative':
                # compute the total number of weights in the layer.
                total_weights = tf.size(weights)
                mask = tf.reshape(mask, [total_weights])

                # Add a random vector because, if there are ties, sort simply
                # selects based upon index position (starting from the
                # beginning of the vector).
                random_noise = tf.random_uniform(shape=mask.shape,
                                                 minval=0.0001,
                                                 maxval=0.0003)
                mask = tf.cast(tf.add(random_noise, mask), tf.float32)

                # Rank the binary mask by magnitude. Weights that are already
                # on are selected, plus a random subset of all other weights.
                sorted_mask = sort(mask, direction='DESCENDING')

                # multiply desired sparsity fraction by the number of weights.
                num_weights = tf.reshape(
                    tf.cast(
                        tf.cast(total_weights, tf.float32) * sparsity,
                        tf.int32), [1])
                percentile = tf.gather_nd(sorted_mask, num_weights)

                one_mask = tf.ones([total_weights])
                zero_mask = tf.zeros([total_weights])

                feature_ranking = tf.where(tf.greater_equal(percentile, mask),
                                           one_mask, zero_mask)
                new_mask = tf.reshape(feature_ranking, weights.get_shape())

            else:
                drop_out = tf.nn.dropout(tf.ones_like(weights),
                                         keep_prob=(1. - self._sparsity))
                new_mask = tf.cast(drop_out, tf.float32)

        return self._sparsity, new_mask
Example #8
              0.70710677*temp_3
    S1.append(tf.concat([S1_real, S1_im], 1))
    x_ind_reshaped = tf.reshape(X_IND, [batch_size, 4 * K])
    LOSS.append(
        np.log(i) *
        tf.reduce_mean(tf.reduce_mean(tf.square(x_ind_reshaped - S2[-1]), 1)))
    BER.append(
        tf.reduce_mean(
            tf.cast(
                tf.logical_or(
                    tf.not_equal(tf.sign(x_real), tf.sign(S1[-1][:, 0:K])),
                    tf.not_equal(tf.sign(x_imag),
                                 tf.sign(S1[-1][:, K:2 * K]))), tf.float32)))

Max_Val = tf.reduce_max(S3, axis=2, keep_dims=True)
Greater = tf.greater_equal(S3, Max_Val)
BER2 = tf.round(tf.cast(Greater, tf.float32))
BER3 = tf.not_equal(BER2, X_IND)
BER4 = tf.reduce_sum(tf.cast(BER3, tf.float32), 2)
BER5 = tf.cast(tf.greater(BER4, 0), tf.float32)
SER = tf.reduce_mean(BER5)
TOTAL_LOSS = tf.add_n(LOSS)

saver = tf.train.Saver()

global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(startingLearningRate,
                                           global_step,
                                           decay_step,
                                           decay_factor,
                                           staircase=True)
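A self-contained sketch (toy scores only) of the hard-decision step used above to compute the symbol error rate: each entry is compared against its row maximum with tf.greater_equal to produce a one-hot decision:

import tensorflow as tf

S3 = tf.constant([[[0.1, 0.7, 0.2],
                   [0.5, 0.3, 0.2]]])   # toy (batch, K, num_symbols) scores
max_val = tf.reduce_max(S3, axis=2, keepdims=True)
hard_decision = tf.round(tf.cast(tf.greater_equal(S3, max_val), tf.float32))
# hard_decision -> [[[0., 1., 0.], [1., 0., 0.]]]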
Example #9
    def test_ge(self):
        input1 = tf.placeholder(shape=(4, 32, 32, 3), dtype=tf.float32)
        input2 = tf.placeholder(shape=(4, 32, 32, 3), dtype=tf.float32)
        output = tf.greater_equal(input1, input2)

        self._test_conversion('ge', [input1, input2], [output])
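For reference, a standalone eager-mode check (not part of the conversion test harness above) of the op being exercised:

import tensorflow as tf

a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
b = tf.constant([[2.0, 2.0], [2.0, 5.0]])
print(tf.greater_equal(a, b))   # [[False, True], [True, False]]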
Example #10
    def loss(self, prediction_dict):
        """
        Returns cost for RCNN based on:

        Args:
            prediction_dict with keys:
                rcnn:
                    cls_score: shape (num_proposals, num_classes + 1)
                        Has the class scoring for each of the proposals. Classes
                        are 1-indexed with 0 being the background.

                    cls_prob: shape (num_proposals, num_classes + 1)
                        Application of softmax on cls_score.

                    bbox_offsets: shape (num_proposals, num_classes * 4)
                        Has the offset for each proposal for each class.
                        We have to compare only the proposals labeled with the
                        offsets for that label.

                target:
                    cls_target: shape (num_proposals,)
                        Has the correct label for each of the proposals.
                        0 => background
                        1..n => 1-indexed classes

                    bbox_offsets_target: shape (num_proposals, 4)
                        Has the true offset of each proposal for the true
                        label.
                        In case of not having a true label (non-background)
                        then it's just zeroes.

        Returns:
            loss_dict with keys:
                rcnn_cls_loss: The cross-entropy or log-loss of the
                    classification task over the num_classes plus background.
                rcnn_reg_loss: The smooth L1 loss for the bounding box
                    regression task to adjust correctly labeled boxes.

        """
        with tf.name_scope('RCNNLoss'):
            cls_score = prediction_dict['rcnn']['cls_score']
            # cls_prob = prediction_dict['rcnn']['cls_prob']
            # Cast target explicitly as int32.
            cls_target = tf.cast(
                prediction_dict['target']['cls'], tf.int32
            )

            # First we need to calculate the log loss between cls_prob and
            # cls_target

            # We only care for the targets that are >= 0
            not_ignored = tf.reshape(tf.greater_equal(
                cls_target, 0), [-1], name='not_ignored')
            # We apply boolean mask to score, prob and target.
            cls_score_labeled = tf.boolean_mask(
                cls_score, not_ignored, name='cls_score_labeled')
            # cls_prob_labeled = tf.boolean_mask(
            #    cls_prob, not_ignored, name='cls_prob_labeled')
            cls_target_labeled = tf.boolean_mask(
                cls_target, not_ignored, name='cls_target_labeled')

            tf.summary.scalar(
                'batch_size',
                tf.shape(cls_score_labeled)[0], ['rcnn']
            )

            # Transform to one-hot vector
            cls_target_one_hot = tf.one_hot(
                cls_target_labeled, depth=self._num_classes + 1,
                name='cls_target_one_hot'
            )

            # We get cross entropy loss of each proposal.
            cross_entropy_per_proposal = (
                tf.nn.softmax_cross_entropy_with_logits_v2(
                    labels=tf.stop_gradient(cls_target_one_hot),
                    logits=cls_score_labeled
                )
            )

            if self._debug:
                prediction_dict['_debug']['losses'] = {}
                # Save the cross entropy per proposal to be able to
                # visualize proposals with high and low error.
                prediction_dict['_debug']['losses'][
                    'cross_entropy_per_proposal'
                ] = (
                    cross_entropy_per_proposal
                )

            # Second we need to calculate the smooth l1 loss between
            # `bbox_offsets` and `bbox_offsets_target`.
            bbox_offsets = prediction_dict['rcnn']['bbox_offsets']
            bbox_offsets_target = (
                prediction_dict['target']['bbox_offsets']
            )

            # We only want the non-background labels bounding boxes.
            not_ignored = tf.reshape(tf.greater(cls_target, 0), [-1])
            bbox_offsets_labeled = tf.boolean_mask(
                bbox_offsets, not_ignored, name='bbox_offsets_labeled')
            bbox_offsets_target_labeled = tf.boolean_mask(
                bbox_offsets_target, not_ignored,
                name='bbox_offsets_target_labeled'
            )

            cls_target_labeled = tf.boolean_mask(
                cls_target, not_ignored, name='cls_target_labeled')
            # `cls_target_labeled` is based on `cls_target` which has
            # `num_classes` + 1 classes.
            # for making `one_hot` with depth `num_classes` to work we need
            # to lower them to make them 0-index.
            cls_target_labeled = cls_target_labeled - 1

            cls_target_one_hot = tf.one_hot(
                cls_target_labeled, depth=self._num_classes,
                name='cls_target_one_hot'
            )

            # cls_target now is (num_labeled, num_classes)
            bbox_flatten = tf.reshape(
                bbox_offsets_labeled, [-1, 4], name='bbox_flatten')

            # We use the flatten cls_target_one_hot as boolean mask for the
            # bboxes.
            cls_flatten = tf.cast(tf.reshape(
                cls_target_one_hot, [-1]), tf.bool, 'cls_flatten_as_bool')

            bbox_offset_cleaned = tf.boolean_mask(
                bbox_flatten, cls_flatten, 'bbox_offset_cleaned')

            # Calculate the smooth l1 loss between the "cleaned" bboxes
            # offsets (that means, the useful results) and the labeled
            # targets.
            reg_loss_per_proposal = smooth_l1_loss(
                bbox_offset_cleaned, bbox_offsets_target_labeled,
                sigma=self._l1_sigma
            )

            tf.summary.scalar(
                'rcnn_foreground_samples',
                tf.shape(bbox_offset_cleaned)[0], ['rcnn']
            )

            if self._debug:
                # Also save reg loss per proposals to be able to visualize
                # good and bad proposals in debug mode.
                prediction_dict['_debug']['losses'][
                    'reg_loss_per_proposal'
                ] = (
                    reg_loss_per_proposal
                )

            return {
                'rcnn_cls_loss': tf.reduce_mean(cross_entropy_per_proposal),
                'rcnn_reg_loss': tf.reduce_mean(reg_loss_per_proposal),
            }
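A compact toy sketch (hypothetical shapes) of the per-class offset selection above: the flattened one-hot targets are used as a boolean mask so that only the 4 offsets predicted for each proposal's true class are kept:

import tensorflow as tf

num_classes = 3
bbox_offsets_labeled = tf.random.uniform([2, num_classes * 4])  # 2 proposals
cls_target_labeled = tf.constant([2, 0])   # 0-indexed true classes

cls_target_one_hot = tf.one_hot(cls_target_labeled, depth=num_classes)
bbox_flatten = tf.reshape(bbox_offsets_labeled, [-1, 4])          # (6, 4)
cls_flatten = tf.cast(tf.reshape(cls_target_one_hot, [-1]), tf.bool)

bbox_offset_cleaned = tf.boolean_mask(bbox_flatten, cls_flatten)  # (2, 4)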
Example #11
    def _build(self, all_anchors, gt_boxes, im_shape):
        """
        We compare anchors to GT boxes and, using the minibatch size and the
        different config settings (clobber, foreground fraction, etc.), we end
        up with training targets *only* for the elements we want to use in the
        batch, while everything else is ignored.

        Basically, it first generates the targets for all (valid) anchors, and
        then subsamples the positive (foreground) and negative (background)
        ones based on the number of samples of each type that we want.

        Args:
            all_anchors:
                A Tensor with all the bounding boxes coords of the anchors.
                Its shape should be (num_anchors, 4).
            gt_boxes:
                A Tensor with the ground truth bounding boxes of the image of
                the batch being processed. Its shape should be (num_gt, 5).
                The last dimension is used for the label.
            im_shape:
                Shape of the original image (height, width), used to define
                anchor targets with respect to gt_boxes.

        Returns:
            Tuple of the tensors of:
                labels: (1, 0, -1) for each anchor.
                    Shape (num_anchors, 1)
                bbox_targets: 4d bbox targets as specified by paper.
                    Shape (num_anchors, 4)
                max_overlaps: Max IoU overlap with ground truth boxes.
                    Shape (num_anchors, 1)
        """
        # Keep only the coordinates of gt_boxes
        gt_boxes = gt_boxes[:, :4]
        all_anchors = all_anchors[:, :4]

        # Only keep anchors inside the image
        (x_min_anchor, y_min_anchor, x_max_anchor,
         y_max_anchor) = tf.unstack(all_anchors, axis=1)

        anchor_filter = tf.logical_and(
            tf.logical_and(
                tf.greater_equal(x_min_anchor, -self._allowed_border),
                tf.greater_equal(y_min_anchor, -self._allowed_border)),
            tf.logical_and(
                tf.less(x_max_anchor, im_shape[1] + self._allowed_border),
                tf.less(y_max_anchor, im_shape[0] + self._allowed_border)))

        # We (force) reshape the filter so that we can use it as a boolean mask
        anchor_filter = tf.reshape(anchor_filter, [-1])
        # Filter anchors.
        anchors = tf.boolean_mask(all_anchors,
                                  anchor_filter,
                                  name='filter_anchors')

        # Generate array with the labels for all_anchors.
        labels = tf.fill((tf.gather(tf.shape(all_anchors), [0])), -1)
        labels = tf.boolean_mask(labels, anchor_filter, name='filter_labels')

        # Intersection over union (IoU) overlap between the anchors and the
        # ground truth boxes.
        overlaps = bbox_overlap_tf(tf.to_float(anchors), tf.to_float(gt_boxes))

        # Generate array with the IoU value of the closest GT box for each
        # anchor.
        max_overlaps = tf.reduce_max(overlaps, axis=1)
        if not self._clobber_positives:
            # Assign bg labels first so that positive labels can clobber them.
            # First we get an array with True where IoU is less than
            # self._negative_overlap
            negative_overlap_nonzero = tf.less(max_overlaps,
                                               self._negative_overlap)

            # Finally we set 0 at True indices
            labels = tf.where(condition=negative_overlap_nonzero,
                              x=tf.zeros(tf.shape(labels)),
                              y=tf.to_float(labels))
        # Get the value of the max IoU for the closest anchor for each gt.
        gt_max_overlaps = tf.reduce_max(overlaps, axis=0)

        # Find all the indices that match (at least one, but could be more).
        gt_argmax_overlaps = tf.squeeze(tf.equal(overlaps, gt_max_overlaps))
        gt_argmax_overlaps = tf.where(gt_argmax_overlaps)[:, 0]
        # Eliminate duplicates indices.
        gt_argmax_overlaps, _ = tf.unique(gt_argmax_overlaps)
        # Order the indices for sparse_to_dense compatibility
        gt_argmax_overlaps, _ = tf.nn.top_k(gt_argmax_overlaps,
                                            k=tf.shape(gt_argmax_overlaps)[-1])
        gt_argmax_overlaps = tf.reverse(gt_argmax_overlaps, [0])

        # Foreground label: for each ground-truth, anchor with highest overlap.
        # When the argmax is many items we use all of them (for consistency).
        # We set 1 at gt_argmax_overlaps_cond indices
        gt_argmax_overlaps_cond = tf.sparse_to_dense(gt_argmax_overlaps,
                                                     tf.shape(
                                                         labels,
                                                         out_type=tf.int64),
                                                     True,
                                                     default_value=False)

        labels = tf.where(condition=gt_argmax_overlaps_cond,
                          x=tf.ones(tf.shape(labels)),
                          y=tf.to_float(labels))

        # Foreground label: above threshold Intersection over Union (IoU)
        # First we get an array with True where IoU is greater or equal than
        # self._positive_overlap
        positive_overlap_inds = tf.greater_equal(max_overlaps,
                                                 self._positive_overlap)
        # Finally we set 1 at True indices
        labels = tf.where(condition=positive_overlap_inds,
                          x=tf.ones(tf.shape(labels)),
                          y=labels)

        if self._clobber_positives:
            # Assign background labels last so that negative labels can clobber
            # positives. First we get an array with True where IoU is less than
            # self._negative_overlap
            negative_overlap_nonzero = tf.less(max_overlaps,
                                               self._negative_overlap)
            # Finally we set 0 at True indices
            labels = tf.where(condition=negative_overlap_nonzero,
                              x=tf.zeros(tf.shape(labels)),
                              y=labels)

        # Subsample positive labels if we have too many
        def subsample_positive():
            # Shuffle the foreground indices
            disable_fg_inds = tf.random_shuffle(fg_inds, seed=self._seed)
            # Select the indices that we have to ignore, this is
            # `tf.shape(fg_inds)[0] - num_fg` because we want to get only
            # `num_fg` foreground labels.
            disable_place = (tf.shape(fg_inds)[0] - num_fg)
            disable_fg_inds = disable_fg_inds[:disable_place]
            # Order the indices for sparse_to_dense compatibility
            disable_fg_inds, _ = tf.nn.top_k(disable_fg_inds,
                                             k=tf.shape(disable_fg_inds)[-1])
            disable_fg_inds = tf.reverse(disable_fg_inds, [0])
            disable_fg_inds = tf.sparse_to_dense(disable_fg_inds,
                                                 tf.shape(labels,
                                                          out_type=tf.int64),
                                                 True,
                                                 default_value=False)
            # Put -1 to ignore the anchors in the selected indices
            return tf.where(condition=tf.squeeze(disable_fg_inds),
                            x=tf.to_float(tf.fill(tf.shape(labels), -1)),
                            y=labels)

        num_fg = tf.to_int32(self._foreground_fraction * self._minibatch_size)
        # Get foreground indices, get True in the indices where we have a one.
        fg_inds = tf.equal(labels, 1)
        # We get only the indices where we have True.
        fg_inds = tf.squeeze(tf.where(fg_inds), axis=1)
        fg_inds_size = tf.size(fg_inds)
        # Condition to check if we have too many positive labels.
        subsample_positive_cond = fg_inds_size > num_fg
        # Check the condition and subsample positive labels.
        labels = tf.cond(subsample_positive_cond,
                         true_fn=subsample_positive,
                         false_fn=lambda: labels)

        # Subsample negative labels if we have too many
        def subsample_negative():
            # Shuffle the background indices
            disable_bg_inds = tf.random_shuffle(bg_inds, seed=self._seed)

            # Select the indices that we have to ignore, this is
            # `tf.shape(bg_inds)[0] - num_bg` because we want to get only
            # `num_bg` background labels.
            disable_place = (tf.shape(bg_inds)[0] - num_bg)
            disable_bg_inds = disable_bg_inds[:disable_place]
            # Order the indices for sparse_to_dense compatibility
            disable_bg_inds, _ = tf.nn.top_k(disable_bg_inds,
                                             k=tf.shape(disable_bg_inds)[-1])
            disable_bg_inds = tf.reverse(disable_bg_inds, [0])
            disable_bg_inds = tf.sparse_to_dense(disable_bg_inds,
                                                 tf.shape(labels,
                                                          out_type=tf.int64),
                                                 True,
                                                 default_value=False)
            # Put -1 to ignore the anchors in the selected indices
            return tf.where(condition=tf.squeeze(disable_bg_inds),
                            x=tf.to_float(tf.fill(tf.shape(labels), -1)),
                            y=labels)

        # Recalculate the foreground indices after (maybe) disable some of them

        # Get foreground indices, get True in the indices where we have a one.
        fg_inds = tf.equal(labels, 1)
        # We get only the indices where we have True.
        fg_inds = tf.squeeze(tf.where(fg_inds), axis=1)
        fg_inds_size = tf.size(fg_inds)

        num_bg = tf.to_int32(self._minibatch_size - fg_inds_size)
        # Get background indices, get True in the indices where we have a zero.
        bg_inds = tf.equal(labels, 0)
        # We get only the indices where we have True.
        bg_inds = tf.squeeze(tf.where(bg_inds), axis=1)
        bg_inds_size = tf.size(bg_inds)
        # Condition to check if we have too many negative labels.
        subsample_negative_cond = bg_inds_size > num_bg
        # Check the condition and subsample negative labels.
        labels = tf.cond(subsample_negative_cond,
                         true_fn=subsample_negative,
                         false_fn=lambda: labels)

        # Return bbox targets with shape (anchors.shape[0], 4).

        # Find the closest gt box for each anchor.
        argmax_overlaps = tf.argmax(overlaps, axis=1)
        # Eliminate duplicates.
        argmax_overlaps_unique, _ = tf.unique(argmax_overlaps)
        # Filter the gt_boxes.
        # We get only the indices where we have "inside anchors".
        anchor_filter_inds = tf.where(anchor_filter)
        gt_boxes = tf.gather(gt_boxes, argmax_overlaps)

        bbox_targets = encode_tf(anchors, gt_boxes)

        # For the anchors that aren't foreground, we ignore the bbox_targets.
        anchor_foreground_filter = tf.equal(labels, 1)
        bbox_targets = tf.where(condition=anchor_foreground_filter,
                                x=bbox_targets,
                                y=tf.zeros_like(bbox_targets))

        # We unroll "inside anchors" value for all anchors (for shape
        # compatibility).

        # We complete the missed indices with zeros
        # (because scatter_nd has zeros as default).
        bbox_targets = tf.scatter_nd(indices=tf.to_int32(anchor_filter_inds),
                                     updates=bbox_targets,
                                     shape=tf.shape(all_anchors))

        labels_scatter = tf.scatter_nd(indices=tf.to_int32(anchor_filter_inds),
                                       updates=labels,
                                       shape=[tf.shape(all_anchors)[0]])
        # We have to put -1 to ignore the indices with 0 generated by
        # scatter_nd, otherwise it will be considered as background.
        labels = tf.where(condition=anchor_filter,
                          x=labels_scatter,
                          y=tf.to_float(tf.fill(tf.shape(labels_scatter), -1)))

        max_overlaps = tf.scatter_nd(indices=tf.to_int32(anchor_filter_inds),
                                     updates=max_overlaps,
                                     shape=[tf.shape(all_anchors)[0]])

        return labels, bbox_targets, max_overlaps
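A self-contained toy sketch (made-up anchors and image shape) of the "anchors inside the image" filter at the start of the method above:

import tensorflow as tf

all_anchors = tf.constant([[10., 10., 50., 50.],
                           [-5., 20., 40., 60.],
                           [100., 100., 260., 200.]])   # (x1, y1, x2, y2)
im_shape = tf.constant([224., 256.])                    # (height, width)
allowed_border = 0.

x_min, y_min, x_max, y_max = tf.unstack(all_anchors, axis=1)
anchor_filter = tf.logical_and(
    tf.logical_and(tf.greater_equal(x_min, -allowed_border),
                   tf.greater_equal(y_min, -allowed_border)),
    tf.logical_and(tf.less(x_max, im_shape[1] + allowed_border),
                   tf.less(y_max, im_shape[0] + allowed_border)))

anchors = tf.boolean_mask(all_anchors, anchor_filter)
# anchor_filter -> [True, False, False]; only the first anchor is kept.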
Example #12
    def _build(self, conv_feature_map, proposals, im_shape, base_network,
               gt_boxes=None, is_training=False):
        """
        Classifies & refines proposals based on the pooled feature map.

        Args:
            conv_feature_map: The feature map of the image, extracted
                using the pretrained network.
                Shape: (num_proposals, pool_height, pool_width, 512).
            proposals: A Tensor with the bounding boxes proposed by the RPN.
                Shape: (total_num_proposals, 4).
                Encoding: (x1, y1, x2, y2).
            im_shape: A Tensor with the shape of the image in the form of
                (image_height, image_width).
            gt_boxes (optional): A Tensor with the ground truth boxes of the
                image.
                Shape: (total_num_gt, 5).
                Encoding: (x1, y1, x2, y2, label).
            is_training (optional): A boolean indicating whether the module is
                being used for training or just for inference.

        Returns:
            prediction_dict: a dict with the object predictions.
                It should have the keys:
                objects:
                labels:
                probs:

                rcnn:
                target:

        """
        self._instantiate_layers()

        prediction_dict = {'_debug': {}}

        if gt_boxes is not None:
            proposals_target, bbox_offsets_target = self._rcnn_target(
                proposals, gt_boxes)

            if is_training:
                with tf.name_scope('prepare_batch'):
                    # We flatten to set shape, but it is already a flat Tensor.
                    in_batch_proposals = tf.reshape(
                        tf.greater_equal(proposals_target, 0), [-1]
                    )
                    proposals = tf.boolean_mask(
                        proposals, in_batch_proposals)
                    bbox_offsets_target = tf.boolean_mask(
                        bbox_offsets_target, in_batch_proposals)
                    proposals_target = tf.boolean_mask(
                        proposals_target, in_batch_proposals)

            prediction_dict['target'] = {
                'cls': proposals_target,
                'bbox_offsets': bbox_offsets_target,
            }

        roi_prediction = self._roi_pool(proposals, conv_feature_map, im_shape)

        if self._debug:
            # Save raw roi prediction in debug mode.
            prediction_dict['_debug']['roi'] = roi_prediction

        pooled_features = roi_prediction['roi_pool']
        features = base_network._build_tail(
            pooled_features, is_training=is_training
        )

        if self._use_mean:
            # We avg our height and width dimensions for a more
            # "memory-friendly" Tensor.
            features = tf.reduce_mean(features, [1, 2])

        # We treat num proposals as batch number so that when flattening we
        # get a (num_proposals, flatten_pooled_feature_map_size) Tensor.
        flatten_features = tf.contrib.layers.flatten(features)
        net = tf.identity(flatten_features)

        if is_training:
            net = tf.nn.dropout(net, keep_prob=self._dropout_keep_prob)

        if self._debug:
            prediction_dict['_debug']['flatten_net'] = net

        # After flattening we are left with a Tensor of shape
        # (num_proposals, pool_height * pool_width * 512).
        # The first dimension works as batch size when applied to snt.Linear.
        for i, layer in enumerate(self._layers):
            # Through FC layer.
            net = layer(net)

            # Apply activation and dropout.
            variable_summaries(
                net, 'fc_{}_preactivationout'.format(i), 'reduced'
            )
            net = self._activation(net)
            if self._debug:
                prediction_dict['_debug']['layer_{}_out'.format(i)] = net

            variable_summaries(net, 'fc_{}_out'.format(i), 'reduced')
            if is_training:
                net = tf.nn.dropout(net, keep_prob=self._dropout_keep_prob)

        cls_score = self._classifier_layer(net)
        cls_prob = tf.nn.softmax(cls_score, axis=1)
        bbox_offsets = self._bbox_layer(net)

        prediction_dict['rcnn'] = {
            'cls_score': cls_score,
            'cls_prob': cls_prob,
            'bbox_offsets': bbox_offsets,
        }

        # Get final object proposals based on the probability, the offsets and
        # the original proposals.
        proposals_pred = self._rcnn_proposal(
            proposals, bbox_offsets, cls_prob, im_shape)

        # objects, objects_labels, and objects_labels_prob are the only keys
        # that matter for drawing objects.
        prediction_dict['objects'] = proposals_pred['objects']
        prediction_dict['labels'] = proposals_pred['proposal_label']
        prediction_dict['probs'] = proposals_pred['proposal_label_prob']

        if self._debug:
            prediction_dict['_debug']['proposal'] = proposals_pred

        # Calculate summaries for results
        variable_summaries(cls_prob, 'cls_prob', 'reduced')
        variable_summaries(bbox_offsets, 'bbox_offsets', 'reduced')

        if self._debug:
            variable_summaries(pooled_features, 'pooled_features', 'full')
            layer_summaries(self._classifier_layer, 'full')
            layer_summaries(self._bbox_layer, 'full')

        return prediction_dict
Example #13
  def _compute_inner_update_onsbet(self, var, grad, state):
    update_ops = []

    eta = tf.cast(state.get_hyper(ETA), var.dtype.base_dtype)
    betting_domain = tf.cast(
        state.get_hyper(BETTING_DOMAIN), var.dtype.base_dtype)

    wealth = state.get_slot(var, INNER_WEALTH)
    betting_fraction = state.get_slot(var, OUTER_BETTING_FRACTION)
    inner_betting_fraction = state.get_slot(var, INNER_BETTING_FRACTION)
    sum_grad_squared = state.get_slot(var, INNER_SUM_GRAD_SQUARED)
    inner_maximum_gradient = state.get_slot(var, INNER_MAXIMUM_GRADIENT)

    inner_maximum_gradient_updated = self._assign(
        inner_maximum_gradient, tf.maximum(inner_maximum_gradient,
                                           tf.abs(grad)))
    update_ops.append(inner_maximum_gradient_updated)

    clipped_old_betting_fraction = tf.clip_by_value(betting_fraction,
                                                    -betting_domain,
                                                    betting_domain)

    # Process grad to respect truncation to [-betting_domain, betting_domain]
    truncated_grad = tf.where(
        tf.greater_equal(
            grad * (betting_fraction - clipped_old_betting_fraction), 0), grad,
        tf.zeros(tf.shape(grad)))

    wealth_delta = -betting_fraction * truncated_grad
    wealth_updated = self._assign_add(wealth, wealth_delta)
    update_ops.append(wealth_updated)

    # This is the gradient with respect to the betting fraction v
    # used by the ONS algorithm - a kind of "inner inner grad".
    # Heuristic: We also scale v_grad down by the inner maximum gradient so as
    # to make it "unitless". This is helpful because the learning rate for
    # ONS is proportional to sum v_grad**2, and so the scale of the learning
    # rate and of v_grad are unlikely to be properly matched without this.
    if self.rescale_inner:
      v_grad = truncated_grad / (
          (1.0 - inner_betting_fraction * truncated_grad) *
          inner_maximum_gradient_updated)
    else:
      v_grad = truncated_grad / (
          (1.0 - inner_betting_fraction * truncated_grad))

    sum_grad_squared_updated = self._assign_add(sum_grad_squared,
                                                tf.square(v_grad))
    update_ops.append(sum_grad_squared_updated)

    new_inner_betting_fraction = inner_betting_fraction - eta * v_grad / (
        sum_grad_squared_updated)
    new_inner_betting_fraction = tf.clip_by_value(new_inner_betting_fraction,
                                                  -betting_domain,
                                                  betting_domain)
    inner_betting_fraction_updated = self._assign(inner_betting_fraction,
                                                  new_inner_betting_fraction)
    update_ops.append(inner_betting_fraction_updated)

    if self.output_summaries:
      mean_inner_betting_fraction_summary = tf.reduce_mean(
          tf.abs(inner_betting_fraction_updated))
      max_inner_betting_fraction_summary = tf.reduce_max(
          tf.abs(inner_betting_fraction_updated))
      inner_maximum_gradient_summary = tf.reduce_max(
          inner_maximum_gradient_updated)
      tf.summary.scalar(self._name + "/mean_inner_betting/" + var.name,
                        mean_inner_betting_fraction_summary)
      tf.summary.scalar(self._name + "/max_inner_betting/" + var.name,
                        max_inner_betting_fraction_summary)
      tf.summary.scalar(self._name + "/inner_maximum_gradient/" + var.name,
                        inner_maximum_gradient_summary)

    betting_fraction_updated = self._assign(
        betting_fraction, inner_betting_fraction_updated * wealth_updated)
    update_ops.append(betting_fraction_updated)

    clipped_betting_fraction = tf.clip_by_value(betting_fraction_updated,
                                                -betting_domain, betting_domain)

    return clipped_betting_fraction, tf.group(*update_ops)
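For intuition, here is a simplified scalar re-derivation of the inner ONS update above in plain NumPy. It collapses the separate inner/outer betting fractions and the maximum-gradient rescaling into one loop, so it is only a sketch of the idea; eta and betting_domain are illustrative constants.

import numpy as np

def ons_betting_step(grad, bet, wealth, sum_v_grad_sq,
                     eta=0.5, betting_domain=0.5):
    """One simplified ONS coin-betting step on a scalar parameter."""
    # Truncate the gradient so it cannot push the bet further out of the domain.
    clipped_bet = np.clip(bet, -betting_domain, betting_domain)
    truncated_grad = grad if grad * (bet - clipped_bet) >= 0 else 0.0
    # Wealth update: bet a fraction of wealth against the gradient.
    wealth += -bet * truncated_grad
    # Gradient with respect to the betting fraction (the "inner inner grad").
    v_grad = truncated_grad / (1.0 - bet * truncated_grad)
    sum_v_grad_sq += v_grad ** 2
    # ONS step on the bet, then clip back into the betting domain.
    if sum_v_grad_sq > 0:
        bet = bet - eta * v_grad / sum_v_grad_sq
    bet = np.clip(bet, -betting_domain, betting_domain)
    return bet, wealth, sum_v_grad_sq

bet, wealth, s = 0.0, 1.0, 0.0
for g in [0.3, -0.2, 0.1]:
    bet, wealth, s = ons_betting_step(g, bet, wealth, s)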
Ejemplo n.º 14
0
  def _compute_inner_update_scinol(self, var, grad, state):
    update_ops = []

    betting_domain = tf.cast(
        state.get_hyper(BETTING_DOMAIN), var.dtype.base_dtype)

    reward = state.get_slot(var, INNER_REWARD)
    betting_fraction = state.get_slot(var, OUTER_BETTING_FRACTION)
    sum_grad_squared = state.get_slot(var, INNER_SUM_GRAD_SQUARED)
    sum_grad = state.get_slot(var, INNER_SUM_GRAD)
    inner_maximum_gradient = state.get_slot(var, INNER_MAXIMUM_GRADIENT)

    # clip inner gradient to respect previous inner_maximum_gradient value
    # This introduces at most an additive constant overhead in the regret
    # since the inner betting fraction lies in a bounded domain.
    clipped_grad = tf.clip_by_value(grad, -inner_maximum_gradient,
                                    inner_maximum_gradient)

    with tf.control_dependencies([clipped_grad]):
      inner_maximum_gradient_updated = self._assign(
          inner_maximum_gradient,
          tf.maximum(inner_maximum_gradient, tf.abs(grad)))
      update_ops.append(inner_maximum_gradient_updated)

    clipped_old_betting_fraction = tf.clip_by_value(betting_fraction,
                                                    -betting_domain,
                                                    betting_domain)

    # Process grad to respect truncation to [-betting_domain, betting_domain]
    truncated_grad = tf.where(
        tf.greater_equal(
            clipped_grad * (betting_fraction - clipped_old_betting_fraction),
            0.0), clipped_grad, tf.zeros(tf.shape(clipped_grad)))

    reward_delta = -betting_fraction * truncated_grad
    reward_updated = self._assign_add(reward, reward_delta)
    update_ops.append(reward_updated)

    sum_grad_squared_updated = self._assign_add(sum_grad_squared,
                                                tf.square(truncated_grad))
    update_ops.append(sum_grad_squared_updated)

    sum_grad_updated = self._assign_add(sum_grad, truncated_grad)
    update_ops.append(sum_grad_updated)

    # The second term in this minimum, self.eta / inner_maximum_gradient_updated,
    # is a hack to keep the betting fraction from growing too big at first.
    scaling = tf.minimum(
        tf.rsqrt(sum_grad_squared_updated +
                 tf.square(inner_maximum_gradient_updated)),
        self.eta / inner_maximum_gradient_updated)
    theta = -sum_grad_updated * scaling

    # The rescale_inner flag is a hack that rescales epsilon_v by the
    # maximum inner gradient.
    if self.rescale_inner:
      epsilon_scaling = inner_maximum_gradient_updated
    else:
      epsilon_scaling = 1.0

    inner_betting_fraction = tf.sign(theta) * tf.minimum(tf.abs(theta),
                                                         1.0) * scaling / 2.0
    new_betting_fraction = inner_betting_fraction * (
        reward_updated + epsilon_scaling * self.epsilon_v)

    betting_fraction_updated = self._assign(betting_fraction,
                                            new_betting_fraction)
    update_ops.append(betting_fraction_updated)

    clipped_betting_fraction = tf.clip_by_value(betting_fraction_updated,
                                                -betting_domain, betting_domain)

    if self.output_summaries:
      mean_unclipped_betting_fraction_summary = tf.reduce_mean(
          tf.abs(betting_fraction_updated))
      max_unclipped_betting_fraction_summary = tf.reduce_max(
          tf.abs(betting_fraction_updated))

      mean_clipped_betting_fraction_summary = tf.reduce_mean(
          tf.abs(clipped_betting_fraction))
      max_clipped_betting_fraction_summary = tf.reduce_max(
          tf.abs(clipped_betting_fraction))

      max_abs_gradient = tf.reduce_max(tf.abs(grad))
      max_truncated_grad = tf.reduce_max(tf.abs(truncated_grad))

      tf.summary.scalar(self._name + "/mean_unclipped_bet/" + var.name,
                        mean_unclipped_betting_fraction_summary)
      tf.summary.scalar(self._name + "/max_unclipped_bet/" + var.name,
                        max_unclipped_betting_fraction_summary)
      tf.summary.scalar(self._name + "/mean_clipped_bet/" + var.name,
                        mean_clipped_betting_fraction_summary)
      tf.summary.scalar(self._name + "/max_clipped_bet/" + var.name,
                        max_clipped_betting_fraction_summary)

      tf.summary.scalar(self._name + "/max_abs_inner_grad/" + var.name,
                        max_abs_gradient)
      tf.summary.scalar(
          self._name + "/max_abs_truncated_inner_grad/" + var.name,
          max_truncated_grad)
    return clipped_betting_fraction, tf.group(*update_ops)
def random_crop(image_list, crop_height, crop_width):
    """Crops the given list of images.

  The function applies the same crop to each image in the list. This can be
  effectively applied when there are multiple image inputs of the same
  dimension such as:
    image, depths, normals = random_crop([image, depths, normals], 120, 150)
  Args:
    image_list: a list of image tensors with the same spatial dimensions but
      possibly varying numbers of channels.
    crop_height: the new height.
    crop_width: the new width.

  Returns:
    the image_list with cropped images.
  Raises:
    ValueError: if there are multiple image inputs provided with different
      sizes or the images are smaller than the crop dimensions.
  """
    if not image_list:
        raise ValueError('Empty image_list.')

    # Compute the rank assertions.
    rank_assertions = []
    for i in range(len(image_list)):
        image_rank = tf.rank(image_list[i])
        rank_assert = tf.Assert(tf.equal(image_rank, 3), [
            'Wrong rank for tensor  %s [expected] [actual]',
            image_list[i].name, 3, image_rank
        ])
        rank_assertions.append(rank_assert)

    with tf.control_dependencies([rank_assertions[0]]):
        image_shape = tf.shape(image_list[0])
    image_height = image_shape[0]
    image_width = image_shape[1]
    crop_size_assert = tf.Assert(
        tf.logical_and(tf.greater_equal(image_height, crop_height),
                       tf.greater_equal(image_width, crop_width)),
        ['Crop size greater than the image size.'])

    asserts = [rank_assertions[0], crop_size_assert]

    for i in range(1, len(image_list)):
        image = image_list[i]
        asserts.append(rank_assertions[i])
        with tf.control_dependencies([rank_assertions[i]]):
            shape = tf.shape(image)
        height = shape[0]
        width = shape[1]

        height_assert = tf.Assert(tf.equal(height, image_height), [
            'Wrong height for tensor %s [expected][actual]', image.name,
            height, image_height
        ])
        width_assert = tf.Assert(tf.equal(width, image_width), [
            'Wrong width for tensor %s [expected][actual]', image.name, width,
            image_width
        ])
        asserts.extend([height_assert, width_assert])

    # Create a random bounding box.
    #
    # Use tf.random_uniform and not numpy.random.rand as doing the former would
    # generate random numbers at graph eval time, unlike the latter which
    # generates random numbers at graph definition time.
    with tf.control_dependencies(asserts):
        max_offset_height = tf.reshape(image_height - crop_height + 1, [])
        max_offset_width = tf.reshape(image_width - crop_width + 1, [])
    offset_height = tf.random_uniform([],
                                      maxval=max_offset_height,
                                      dtype=tf.int32)
    offset_width = tf.random_uniform([],
                                     maxval=max_offset_width,
                                     dtype=tf.int32)

    return [
        _crop(image, offset_height, offset_width, crop_height, crop_width)
        for image in image_list
    ]
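A usage sketch for random_crop, assuming TF1 graph mode (the helper uses tf.random_uniform and tensor .name) and the _crop helper defined elsewhere in the same module: the same crop window is applied to an image and its aligned label mask, so they stay pixel-aligned.

import tensorflow as tf

image = tf.placeholder(tf.float32, [240, 320, 3])
mask = tf.placeholder(tf.float32, [240, 320, 1])
cropped_image, cropped_mask = random_crop([image, mask],
                                          crop_height=120,
                                          crop_width=150)
# Both outputs share the same random offset and have spatial shape [120, 150].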
def match_boxes(anchors,
                groundtruth_boxes,
                positives_threshold=0.5,
                negatives_threshold=0.4,
                force_match_groundtruth=True):
    """
    If an anchor has IoU over `positives_threshold` with any groundtruth box,
    it will be assigned a positive label.
    Anchors which have the highest IoU for a groundtruth box will
    also be assigned a positive label.
    Meanwhile, if the remaining anchors have IoU less than `negatives_threshold`
    with all groundtruth boxes, their labels will be negative.

    Matching algorithm:
    1) for each groundtruth box choose the anchor with largest IoU,
    2) remove this set of anchors from the set of all anchors,
    3) for each remaining anchor choose the groundtruth box with largest IoU,
       but only if this IoU is larger than `positives_threshold`,
    4) remove this set of matched anchors from the set of all anchors,
    5) for each remaining anchor if it has IoU less than `negatives_threshold`
       with all groundtruth boxes set it to `negative`, otherwise set it to `ignore`.

    Note: after step 1 it can happen that the same anchor is chosen for two
    different groundtruth boxes, in which case one of them ends up unmatched.
    Also see the comments below.

    Arguments:
        anchors: a float tensor with shape [num_anchors, 4].
        groundtruth_boxes: a float tensor with shape [N, 4].
        positives_threshold: a float number.
        negatives_threshold: a float number.
        force_match_groundtruth: a boolean, whether to try to make sure
            that all groundtruth boxes are matched.
    Returns:
        an int tensor with shape [num_anchors], possible values
            that it can contain are [-2, -1, 0, 1, 2, ..., (N - 1)],
            where numbers in the range [0, N - 1] mean indices of the groundtruth boxes,
            `-1` means that an anchor box is negative (background),
            and `-2` means that we must ignore this anchor box.
    """
    assert positives_threshold >= negatives_threshold

    # for each anchor box choose the groundtruth box with largest iou
    similarity_matrix = iou(groundtruth_boxes,
                            anchors)  # shape [N, num_anchors]
    matches = tf.argmax(similarity_matrix, axis=0,
                        output_type=tf.int32)  # shape [num_anchors]
    matched_vals = tf.reduce_max(similarity_matrix,
                                 axis=0)  # shape [num_anchors]
    is_positive = tf.to_int32(
        tf.greater_equal(matched_vals, positives_threshold))

    if positives_threshold == negatives_threshold:
        is_negative = 1 - is_positive
        matches = matches * is_positive + (-1 * is_negative)
    else:
        is_negative = tf.to_int32(tf.greater(negatives_threshold,
                                             matched_vals))
        to_ignore = (1 - is_positive) * (1 - is_negative)
        matches = matches * is_positive + (-1 * is_negative) + (-2 * to_ignore)

    # after this, it could happen that some groundtruth
    # boxes are not matched with any anchor box

    if force_match_groundtruth:
        # now we must ensure that each row (groundtruth box) is matched to
        # at least one column (which is not guaranteed
        # otherwise if `positives_threshold` is high)

        # for each groundtruth box choose the anchor box with largest iou
        # (force match for each groundtruth box)
        forced_matches_ids = tf.argmax(similarity_matrix,
                                       axis=1,
                                       output_type=tf.int32)  # shape [N]
        # if all indices in forced_matches_ids are different then all rows will be matched

        num_anchors = tf.shape(anchors)[0]
        forced_matches_indicators = tf.one_hot(
            forced_matches_ids, depth=num_anchors,
            dtype=tf.int32)  # shape [N, num_anchors]
        forced_match_row_ids = tf.argmax(
            forced_matches_indicators, axis=0,
            output_type=tf.int32)  # shape [num_anchors]

        # some forced matches could be very bad!
        forced_matches_values = tf.reduce_max(similarity_matrix,
                                              axis=1)  # shape [N]
        small_iou = 0.05  # a forced match must have at least this much intersection
        is_okay = tf.to_int32(
            tf.greater_equal(forced_matches_values, small_iou))  # shape [N]
        forced_matches_indicators = forced_matches_indicators * tf.expand_dims(
            is_okay, axis=1)

        forced_match_mask = tf.greater(
            tf.reduce_max(forced_matches_indicators, axis=0),
            0)  # shape [num_anchors]
        matches = tf.where(forced_match_mask, forced_match_row_ids, matches)
        # even after this it could happen that some rows aren't matched,
        # but this event should have a low probability

    return matches
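The iou helper used by match_boxes is not shown here. Below is a hedged, minimal implementation of pairwise IoU between [ymin, xmin, ymax, xmax] boxes, plus a toy call to match_boxes (which, as written, needs TF1 for tf.to_int32). This is an illustrative implementation, not the original one.

import tensorflow as tf

def iou(boxes1, boxes2):
    """Pairwise IoU, shapes [N, 4] and [M, 4] -> [N, M]."""
    ymin1, xmin1, ymax1, xmax1 = tf.split(boxes1, 4, axis=1)  # each [N, 1]
    ymin2, xmin2, ymax2, xmax2 = tf.unstack(boxes2, axis=1)   # each [M]
    inter_h = tf.maximum(
        tf.minimum(ymax1, ymax2) - tf.maximum(ymin1, ymin2), 0.0)  # [N, M]
    inter_w = tf.maximum(
        tf.minimum(xmax1, xmax2) - tf.maximum(xmin1, xmin2), 0.0)
    inter = inter_h * inter_w
    area1 = (ymax1 - ymin1) * (xmax1 - xmin1)                      # [N, 1]
    area2 = (ymax2 - ymin2) * (xmax2 - xmin2)                      # [M]
    return inter / (area1 + area2 - inter)

anchors = tf.constant([[0., 0., 10., 10.],
                       [5., 5., 15., 15.],
                       [20., 20., 30., 30.]])
groundtruth = tf.constant([[0., 0., 9., 9.]])
labels = match_boxes(anchors, groundtruth)
# With the default thresholds this yields [0, -1, -1]: the first anchor matches
# groundtruth box 0, the other two are background.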
Ejemplo n.º 17
0
def prepare_encoder_input(features,
                          hparams,
                          embed_scope=None,
                          embed_token_fn=common_embed.embed_tokens):
    """Prepares the input for the screen encoder.

  Args:
    features: the feature dict.
    hparams: the hyperparameters.
    embed_scope: the embedding variable scope.
    embed_token_fn: the function for embedding tokens.
  Returns:
    object_embedding: a Tensor of shape
        [batch_size, num_steps, max_object_count, embed_depth]
    object_mask: a binary tensor of shape
        [batch_size, num_steps, max_object_count]
    att_bias: an attention bias Tensor of shape
        [batch_size, num_steps, max_object_count] with large negative values
        at padded object positions.
  """
    with tf.control_dependencies(
        [tf.assert_equal(tf.rank(features["obj_text"]), 4)]):
        if hparams.get("synthetic_screen_noise", 0.) > 0.:
            num_objects = tf.shape(features["obj_text"])[2]
            # [batch, length, num_objects]
            target_obj_mask = tf.cast(
                tf.one_hot(features["objects"], depth=num_objects), tf.bool)
            num_tokens = tf.shape(features["obj_text"])[-1]
            target_obj_mask = tf.tile(tf.expand_dims(target_obj_mask, 3),
                                      [1, 1, 1, num_tokens])
            # Randomly keep tokens
            keep_mask = tf.greater_equal(
                tf.random_uniform(shape=tf.shape(features["obj_text"])),
                hparams.synthetic_screen_noise)
            # Keep paddings
            keep_mask = tf.logical_or(tf.equal(features["obj_text"], 0),
                                      keep_mask)
            # Keep targets
            target_obj_mask = tf.logical_or(target_obj_mask, keep_mask)
            features["obj_text"] = tf.where(
                target_obj_mask, features["obj_text"],
                tf.random_uniform(shape=tf.shape(features["obj_text"]),
                                  maxval=50000,
                                  dtype=tf.int32))
        text_embeddings, _ = embed_token_fn(features["obj_text"],
                                            hparams.task_vocab_size,
                                            hparams.hidden_size,
                                            hparams,
                                            embed_scope=embed_scope)
        with tf.variable_scope("obj_text_embed", reuse=tf.AUTO_REUSE):
            if hparams.obj_text_aggregation == "max":
                embed_bias = tf.cast(tf.less(features["obj_text"], 2),
                                     tf.float32) * -1e7
                with tf.control_dependencies(
                    [tf.assert_equal(tf.rank(embed_bias), 4)]):
                    text_embeddings = tf.reduce_max(
                        text_embeddings + tf.expand_dims(embed_bias, 4), -2)
                    no_txt_embed = tf.get_variable(name="no_txt_embed",
                                                   shape=[hparams.hidden_size])
                    shape = common_layers.shape_list(text_embeddings)
                    no_txt_embed = tf.tile(
                        tf.reshape(no_txt_embed,
                                   [1, 1, 1, hparams.hidden_size]),
                        [shape[0], shape[1], shape[2], 1])
                    text_embeddings = tf.maximum(text_embeddings, no_txt_embed)
            elif hparams.obj_text_aggregation == "sum":
                # [batch, step, #max_obj, #max_token]  0 for padded tokens
                real_objects = tf.cast(
                    tf.greater_equal(features["obj_text"], 2), tf.float32)
                # [batch, step, #max_obj, hidden]   0s for padded objects
                text_embeddings = tf.reduce_sum(
                    text_embeddings * tf.expand_dims(real_objects, 4), -2)
            elif hparams.obj_text_aggregation == "mean":
                shape_list = common_layers.shape_list(text_embeddings)
                embeddings = tf.reshape(text_embeddings, [-1] + shape_list[3:])
                emb_sum = tf.reduce_sum(tf.abs(embeddings), axis=-1)
                non_paddings = tf.not_equal(emb_sum, 0.0)
                embeddings = common_embed.average_bag_of_embeds(
                    embeddings,
                    non_paddings,
                    use_bigrams=True,
                    bigram_embed_scope=embed_scope,
                    append_start_end=True)
                text_embeddings = tf.reshape(
                    embeddings, shape_list[:3] + [hparams.hidden_size])
            else:
                raise ValueError("Unrecognized token aggregation %s" %
                                 (hparams.obj_text_aggregation))
    with tf.control_dependencies([
            tf.assert_equal(tf.rank(features["obj_type"]), 3),
            tf.assert_equal(tf.rank(features["obj_clickable"]), 3)
    ]):
        with tf.variable_scope("encode_object_attr", reuse=tf.AUTO_REUSE):
            type_embedding = tf.nn.embedding_lookup(params=tf.get_variable(
                name="embed_type_w",
                shape=[hparams.get("num_types", 100), hparams.hidden_size]),
                                                    ids=tf.maximum(
                                                        features["obj_type"],
                                                        0))
            clickable_embedding = tf.nn.embedding_lookup(
                params=tf.get_variable(name="embed_clickable_w",
                                       shape=[2, hparams.hidden_size]),
                ids=features["obj_clickable"])
    with tf.control_dependencies(
        [tf.assert_equal(tf.rank(features["obj_screen_pos"]), 4)]):

        def _create_embed(feature_name, vocab_size, depth):
            """Embed a position feature."""
            pos_embedding_list = []
            with tf.variable_scope("encode_object_" + feature_name,
                                   reuse=tf.AUTO_REUSE):
                num_features = common_layers.shape_list(
                    features[feature_name])[-1]
                for i in range(num_features):
                    pos_embedding_list.append(
                        tf.nn.embedding_lookup(
                            params=tf.get_variable(name=feature_name +
                                                   "_embed_w_%d" % i,
                                                   shape=[vocab_size, depth]),
                            ids=features[feature_name][:, :, :, i]))
                pos_embedding = tf.add_n(pos_embedding_list)
                return pos_embedding

        pos_embedding = _create_embed("obj_screen_pos", hparams.max_pixel_pos,
                                      hparams.hidden_size)
    if "all" == hparams.screen_embedding_feature or (
            "dom" in hparams.screen_embedding_feature):
        dom_embedding = _create_embed("obj_dom_pos", hparams.max_dom_pos,
                                      hparams.hidden_size)
    object_embed = tf.zeros_like(text_embeddings, dtype=tf.float32)
    if hparams.screen_embedding_feature == "all":
        object_embed = (text_embeddings + type_embedding + pos_embedding +
                        dom_embedding)
    elif "text" in hparams.screen_embedding_feature:
        object_embed += text_embeddings
    elif "type" in hparams.screen_embedding_feature:
        object_embed += type_embedding
    elif "pos" in hparams.screen_embedding_feature:
        object_embed += pos_embedding
    elif "dom" in hparams.screen_embedding_feature:
        object_embed += dom_embedding
    elif "click" in hparams.screen_embedding_feature:
        object_embed += clickable_embedding
    object_mask = tf.cast(tf.not_equal(features["obj_type"], -1), tf.float32)
    object_embed = object_embed * tf.expand_dims(object_mask, 3)
    att_bias = (1. - object_mask) * common_attention.large_compatible_negative(
        object_embed.dtype)
    return object_embed, object_mask, att_bias
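A small standalone sketch of the "sum" token-aggregation branch above: token embeddings are summed per object while padded token ids (ids below 2 are treated as padding/special, following the code above) contribute nothing. The embedding table, vocabulary size and shapes here are illustrative.

import tensorflow as tf

obj_text = tf.constant([[[[5, 7, 0, 0], [3, 0, 0, 0]]]])  # [batch=1, step=1, obj=2, token=4]
vocab_size, hidden_size = 10, 8
table = tf.random.uniform([vocab_size, hidden_size])
text_embeddings = tf.nn.embedding_lookup(table, obj_text)  # [1, 1, 2, 4, 8]
real_tokens = tf.cast(tf.greater_equal(obj_text, 2), tf.float32)
object_text_embedding = tf.reduce_sum(
    text_embeddings * tf.expand_dims(real_tokens, 4), axis=-2)  # [1, 1, 2, 8]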
Ejemplo n.º 18
0
def build_genie_model(feat_dict,
                      cfg,
                      batch_size,
                      seq_len,
                      is_training=True,
                      seq_varlens=None,
                      dtype=tf.float32):
    """Builds a Piano Genie model.

    Args:
      feat_dict: Dictionary containing input tensors.
      cfg: Configuration object.
      batch_size: Number of items in batch.
      seq_len: Length of each batch item.
      is_training: Set to False for evaluation.
      seq_varlens: If not None, a tensor with the batch sequence lengths.
      dtype: Model weight type.

    Returns:
      A dict containing the relevant model output tensors.
    """
    out_dict = {}

    # Parse features
    pitches = util.demidify(feat_dict["midi_pitches"])
    velocities = feat_dict["velocities"]
    pitches_scalar = ((tf.cast(pitches, tf.float32) / 87.) * 2.) - 1.

    # Create sequence lens
    if is_training and cfg.train_randomize_seq_len:
        seq_lens = tf.random_uniform([batch_size],
                                     minval=cfg.train_seq_len_min,
                                     maxval=seq_len + 1,
                                     dtype=tf.int32)
        stp_varlen_mask = tf.sequence_mask(seq_lens,
                                           maxlen=seq_len,
                                           dtype=tf.float32)
    elif seq_varlens is not None:
        seq_lens = seq_varlens
        stp_varlen_mask = tf.sequence_mask(seq_varlens,
                                           maxlen=seq_len,
                                           dtype=tf.float32)
    else:
        seq_lens = tf.ones([batch_size], dtype=tf.int32) * seq_len
        stp_varlen_mask = None

    # Encode
    if (cfg.stp_emb_unconstrained or cfg.stp_emb_vq or cfg.stp_emb_iq
            or cfg.seq_emb_unconstrained or cfg.seq_emb_vae
            or cfg.lor_emb_unconstrained):
        # Build encoder features
        enc_feats = []
        if cfg.enc_pitch_scalar:
            enc_feats.append(tf.expand_dims(pitches_scalar, axis=-1))
        else:
            enc_feats.append(tf.one_hot(pitches, 88))
        if "delta_times_int" in cfg.enc_aux_feats:
            enc_feats.append(
                tf.one_hot(feat_dict["delta_times_int"],
                           cfg.data_max_discrete_times + 1))
        if "velocities" in cfg.enc_aux_feats:
            enc_feats.append(
                tf.one_hot(velocities, cfg.data_max_discrete_velocities + 1))
        enc_feats = tf.concat(enc_feats, axis=2)

        with tf.variable_scope("encoder"):
            enc_stp, enc_seq = simple_lstm_encoder(
                enc_feats,
                seq_lens,
                rnn_celltype=cfg.rnn_celltype,
                rnn_nlayers=cfg.rnn_nlayers,
                rnn_nunits=cfg.rnn_nunits,
                rnn_bidirectional=cfg.enc_rnn_bidirectional,
                dtype=dtype)

    latents = []

    # Step embeddings (single vector per timestep)
    if cfg.stp_emb_unconstrained:
        with tf.variable_scope("stp_emb_unconstrained"):
            stp_emb_unconstrained = tf.layers.dense(
                enc_stp, cfg.stp_emb_unconstrained_embedding_dim)

        out_dict["stp_emb_unconstrained"] = stp_emb_unconstrained
        latents.append(stp_emb_unconstrained)

    # Quantized step embeddings with VQ-VAE
    if cfg.stp_emb_vq:
        import sonnet as snt  # pylint:disable=g-import-not-at-top
        with tf.variable_scope("stp_emb_vq"):
            with tf.variable_scope("pre_vq"):
                # pre_vq_encoding is tf.float32 of [batch_size, seq_len, embedding_dim]
                pre_vq_encoding = tf.layers.dense(enc_stp,
                                                  cfg.stp_emb_vq_embedding_dim)

            with tf.variable_scope("quantizer"):
                assert stp_varlen_mask is None
                vq_vae = snt.nets.VectorQuantizer(
                    embedding_dim=cfg.stp_emb_vq_embedding_dim,
                    num_embeddings=cfg.stp_emb_vq_codebook_size,
                    commitment_cost=cfg.stp_emb_vq_commitment_cost)
                vq_vae_output = vq_vae(pre_vq_encoding,
                                       is_training=is_training)

                stp_emb_vq_quantized = vq_vae_output["quantize"]
                stp_emb_vq_discrete = tf.reshape(
                    tf.argmax(vq_vae_output["encodings"],
                              axis=1,
                              output_type=tf.int32), [batch_size, seq_len])
                stp_emb_vq_codebook = tf.transpose(vq_vae.embeddings)

        out_dict["stp_emb_vq_quantized"] = stp_emb_vq_quantized
        out_dict["stp_emb_vq_discrete"] = stp_emb_vq_discrete
        out_dict["stp_emb_vq_loss"] = vq_vae_output["loss"]
        out_dict["stp_emb_vq_codebook"] = stp_emb_vq_codebook
        out_dict["stp_emb_vq_codebook_ppl"] = vq_vae_output["perplexity"]
        latents.append(stp_emb_vq_quantized)

        # This tensor retrieves continuous embeddings from codebook. It should
        # *never* be used during training.
        out_dict["stp_emb_vq_quantized_lookup"] = tf.nn.embedding_lookup(
            stp_emb_vq_codebook, stp_emb_vq_discrete)

    # Integer-quantized step embeddings with straight-through
    if cfg.stp_emb_iq:
        with tf.variable_scope("stp_emb_iq"):
            with tf.variable_scope("pre_iq"):
                # pre_iq_encoding is tf.float32 of [batch_size, seq_len]
                pre_iq_encoding = tf.layers.dense(enc_stp, 1)[:, :, 0]

            def iqst(x, n):
                """Integer quantization with straight-through estimator."""
                eps = 1e-7
                s = float(n - 1)
                xp = tf.clip_by_value((x + 1) / 2.0, -eps, 1 + eps)
                xpp = tf.round(s * xp)
                xppp = 2 * (xpp / s) - 1
                return xpp, x + tf.stop_gradient(xppp - x)

            with tf.variable_scope("quantizer"):
                # Pass rounded vals to decoder w/ straight-through estimator
                stp_emb_iq_discrete_f, stp_emb_iq_discrete_rescaled = iqst(
                    pre_iq_encoding, cfg.stp_emb_iq_nbins)
                stp_emb_iq_discrete = tf.cast(stp_emb_iq_discrete_f + 1e-4,
                                              tf.int32)
                stp_emb_iq_discrete_f = tf.cast(stp_emb_iq_discrete,
                                                tf.float32)
                stp_emb_iq_quantized = tf.expand_dims(
                    stp_emb_iq_discrete_rescaled, axis=2)

                # Determine which elements round to valid indices
                stp_emb_iq_inrange = tf.logical_and(
                    tf.greater_equal(pre_iq_encoding, -1),
                    tf.less_equal(pre_iq_encoding, 1))
                stp_emb_iq_inrange_mask = tf.cast(stp_emb_iq_inrange,
                                                  tf.float32)
                stp_emb_iq_valid_p = weighted_avg(stp_emb_iq_inrange_mask,
                                                  stp_varlen_mask)

                # Regularize to encourage encoder to output in range
                stp_emb_iq_range_penalty = weighted_avg(
                    tf.square(tf.maximum(tf.abs(pre_iq_encoding) - 1, 0)),
                    stp_varlen_mask)

                # Regularize to correlate latent finite differences to input
                stp_emb_iq_dlatents = (
                    pre_iq_encoding[:, 1:] - pre_iq_encoding[:, :-1])
                if cfg.stp_emb_iq_contour_dy_scalar:
                    stp_emb_iq_dnotes = (
                        pitches_scalar[:, 1:] - pitches_scalar[:, :-1])
                else:
                    stp_emb_iq_dnotes = tf.cast(
                        pitches[:, 1:] - pitches[:, :-1], tf.float32)
                if cfg.stp_emb_iq_contour_exp == 1:
                    power_func = tf.identity
                elif cfg.stp_emb_iq_contour_exp == 2:
                    power_func = tf.square
                else:
                    raise NotImplementedError()
                if cfg.stp_emb_iq_contour_comp == "product":
                    comp_func = tf.multiply
                elif cfg.stp_emb_iq_contour_comp == "quotient":

                    def comp_func(x, y):
                        return tf.divide(x, y + 1e-6)
                else:
                    raise NotImplementedError()

                stp_emb_iq_contour_penalty = weighted_avg(
                    power_func(
                        tf.maximum(
                            cfg.stp_emb_iq_contour_margin -
                            comp_func(stp_emb_iq_dnotes, stp_emb_iq_dlatents),
                            0)),
                    None if stp_varlen_mask is None else stp_varlen_mask[:, 1:])

                # Regularize to maintain note consistency
                stp_emb_iq_note_held = tf.cast(
                    tf.equal(pitches[:, 1:] - pitches[:, :-1], 0), tf.float32)
                if cfg.stp_emb_iq_deviate_exp == 1:
                    power_func = tf.abs
                elif cfg.stp_emb_iq_deviate_exp == 2:
                    power_func = tf.square

                if stp_varlen_mask is None:
                    mask = stp_emb_iq_note_held
                else:
                    mask = stp_varlen_mask[:, 1:] * stp_emb_iq_note_held
                stp_emb_iq_deviate_penalty = weighted_avg(
                    power_func(stp_emb_iq_dlatents), mask)

                # Calculate perplexity of discrete encoder posterior
                if stp_varlen_mask is None:
                    mask = stp_emb_iq_inrange_mask
                else:
                    mask = stp_varlen_mask * stp_emb_iq_inrange_mask
                stp_emb_iq_discrete_oh = tf.one_hot(stp_emb_iq_discrete,
                                                    cfg.stp_emb_iq_nbins)
                stp_emb_iq_avg_probs = weighted_avg(stp_emb_iq_discrete_oh,
                                                    mask,
                                                    axis=[0, 1],
                                                    expand_mask=True)
                stp_emb_iq_discrete_ppl = tf.exp(
                    -tf.reduce_sum(stp_emb_iq_avg_probs *
                                   tf.log(stp_emb_iq_avg_probs + 1e-10)))

        out_dict["stp_emb_iq_quantized"] = stp_emb_iq_quantized
        out_dict["stp_emb_iq_discrete"] = stp_emb_iq_discrete
        out_dict["stp_emb_iq_valid_p"] = stp_emb_iq_valid_p
        out_dict["stp_emb_iq_range_penalty"] = stp_emb_iq_range_penalty
        out_dict["stp_emb_iq_contour_penalty"] = stp_emb_iq_contour_penalty
        out_dict["stp_emb_iq_deviate_penalty"] = stp_emb_iq_deviate_penalty
        out_dict["stp_emb_iq_discrete_ppl"] = stp_emb_iq_discrete_ppl
        latents.append(stp_emb_iq_quantized)

        # This tensor converts discrete values to continuous.
        # It should *never* be used during training.
        out_dict["stp_emb_iq_quantized_lookup"] = tf.expand_dims(
            2. * (stp_emb_iq_discrete_f / (cfg.stp_emb_iq_nbins - 1.)) - 1.,
            axis=2)

    # Sequence embedding (single vector per sequence)
    if cfg.seq_emb_unconstrained:
        with tf.variable_scope("seq_emb_unconstrained"):
            seq_emb_unconstrained = tf.layers.dense(
                enc_seq, cfg.seq_emb_unconstrained_embedding_dim)

        out_dict["seq_emb_unconstrained"] = seq_emb_unconstrained

        seq_emb_unconstrained = tf.stack([seq_emb_unconstrained] * seq_len,
                                         axis=1)
        latents.append(seq_emb_unconstrained)

    # Sequence embeddings (variational w/ reparameterization trick)
    if cfg.seq_emb_vae:
        with tf.variable_scope("seq_emb_vae"):
            seq_emb_vae = tf.layers.dense(enc_seq,
                                          cfg.seq_emb_vae_embedding_dim * 2)

            mean = seq_emb_vae[:, :cfg.seq_emb_vae_embedding_dim]
            stddev = 1e-6 + tf.nn.softplus(
                seq_emb_vae[:, cfg.seq_emb_vae_embedding_dim:])
            seq_emb_vae = mean + stddev * tf.random_normal(
                tf.shape(mean), 0, 1, dtype=dtype)

            kl = tf.reduce_mean(
                0.5 * tf.reduce_sum(tf.square(mean) + tf.square(stddev) -
                                    tf.log(1e-8 + tf.square(stddev)) - 1,
                                    axis=1))

        out_dict["seq_emb_vae"] = seq_emb_vae
        out_dict["seq_emb_vae_kl"] = kl

        seq_emb_vae = tf.stack([seq_emb_vae] * seq_len, axis=1)
        latents.append(seq_emb_vae)

    # Low-rate embeddings
    if cfg.lor_emb_unconstrained:
        assert seq_len % cfg.lor_emb_n == 0

        with tf.variable_scope("lor_emb_unconstrained"):
            # Downsample step embeddings
            rnn_embedding_dim = int(enc_stp.get_shape()[-1])
            enc_lor = tf.reshape(enc_stp, [
                batch_size, seq_len // cfg.lor_emb_n,
                cfg.lor_emb_n * rnn_embedding_dim
            ])
            lor_emb_unconstrained = tf.layers.dense(
                enc_lor, cfg.lor_emb_unconstrained_embedding_dim)

            out_dict["lor_emb_unconstrained"] = lor_emb_unconstrained

            # Upsample lo-rate embeddings for decoding
            lor_emb_unconstrained = tf.expand_dims(lor_emb_unconstrained,
                                                   axis=2)
            lor_emb_unconstrained = tf.tile(lor_emb_unconstrained,
                                            [1, 1, cfg.lor_emb_n, 1])
            lor_emb_unconstrained = tf.reshape(
                lor_emb_unconstrained,
                [batch_size, seq_len, cfg.lor_emb_unconstrained_embedding_dim])

            latents.append(lor_emb_unconstrained)

    # Build decoder features
    dec_feats = latents

    if cfg.dec_autoregressive:
        # Retrieve pitch numbers
        curr_pitches = pitches
        last_pitches = curr_pitches[:, :-1]
        last_pitches = tf.pad(last_pitches, [[0, 0], [1, 0]],
                              constant_values=-1)  # Prepend <SOS> token
        out_dict["dec_last_pitches"] = last_pitches
        dec_feats.append(tf.one_hot(last_pitches + 1, 89))

        if cfg.dec_pred_velocity:
            curr_velocities = velocities
            last_velocities = curr_velocities[:, :-1]
            last_velocities = tf.pad(last_velocities, [[0, 0], [1, 0]])
            dec_feats.append(
                tf.one_hot(last_velocities,
                           cfg.data_max_discrete_velocities + 1))

    if "delta_times_int" in cfg.dec_aux_feats:
        dec_feats.append(
            tf.one_hot(feat_dict["delta_times_int"],
                       cfg.data_max_discrete_times + 1))
    if "velocities" in cfg.dec_aux_feats:
        assert not cfg.dec_pred_velocity
        dec_feats.append(
            tf.one_hot(feat_dict["velocities"],
                       cfg.data_max_discrete_velocities + 1))

    assert dec_feats
    dec_feats = tf.concat(dec_feats, axis=2)

    # Decode
    with tf.variable_scope("decoder"):
        dec_stp, dec_initial_state, dec_final_state = simple_lstm_decoder(
            dec_feats,
            seq_lens,
            batch_size,
            rnn_celltype=cfg.rnn_celltype,
            rnn_nlayers=cfg.rnn_nlayers,
            rnn_nunits=cfg.rnn_nunits)

        with tf.variable_scope("pitches"):
            dec_recons_logits = tf.layers.dense(dec_stp, 88)

        dec_recons_loss = weighted_avg(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=dec_recons_logits, labels=pitches), stp_varlen_mask)

        out_dict["dec_initial_state"] = dec_initial_state
        out_dict["dec_final_state"] = dec_final_state
        out_dict["dec_recons_logits"] = dec_recons_logits
        out_dict["dec_recons_scores"] = tf.nn.softmax(dec_recons_logits,
                                                      axis=-1)
        out_dict["dec_recons_preds"] = tf.argmax(dec_recons_logits,
                                                 output_type=tf.int32,
                                                 axis=-1)
        out_dict["dec_recons_midi_preds"] = util.remidify(
            out_dict["dec_recons_preds"])
        out_dict["dec_recons_loss"] = dec_recons_loss

        if cfg.dec_pred_velocity:
            with tf.variable_scope("velocities"):
                dec_recons_velocity_logits = tf.layers.dense(
                    dec_stp, cfg.data_max_discrete_velocities + 1)

            dec_recons_velocity_loss = weighted_avg(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=dec_recons_velocity_logits, labels=velocities),
                stp_varlen_mask)

            out_dict["dec_recons_velocity_logits"] = dec_recons_velocity_logits
            out_dict["dec_recons_velocity_loss"] = dec_recons_velocity_loss

    # Stats
    if cfg.stp_emb_vq or cfg.stp_emb_iq:
        discrete = out_dict[
            "stp_emb_vq_discrete" if cfg.stp_emb_vq else "stp_emb_iq_discrete"]
        dx = pitches[:, 1:] - pitches[:, :-1]
        dy = discrete[:, 1:] - discrete[:, :-1]
        contour_violation = tf.reduce_mean(
            tf.cast(tf.less(dx * dy, 0), tf.float32))

        dx_hold = tf.equal(dx, 0)
        deviate_violation = weighted_avg(
            tf.cast(tf.not_equal(dy, 0), tf.float32),
            tf.cast(dx_hold, tf.float32))

        out_dict["contour_violation"] = contour_violation
        out_dict["deviate_violation"] = deviate_violation

    return out_dict
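The iqst helper defined inside the model above (integer quantization with a straight-through estimator) can be checked in isolation. The sketch below, with illustrative values, shows that the forward pass rounds inputs in [-1, 1] to one of n bins while the gradient of the returned value with respect to the input passes straight through as 1.

import tensorflow as tf

def iqst(x, n):
    """Integer quantization with straight-through estimator (as above)."""
    eps = 1e-7
    s = float(n - 1)
    xp = tf.clip_by_value((x + 1) / 2.0, -eps, 1 + eps)
    xpp = tf.round(s * xp)
    xppp = 2 * (xpp / s) - 1
    return xpp, x + tf.stop_gradient(xppp - x)

x = tf.constant([-0.9, 0.1, 0.8])
with tf.GradientTape() as tape:
    tape.watch(x)
    bins, rescaled = iqst(x, n=8)
    loss = tf.reduce_sum(rescaled)
grad = tape.gradient(loss, x)  # all ones: the gradient passes straight through
# bins is [0., 4., 6.]: the discrete bin index assigned to each input.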
Ejemplo n.º 19
0
 def _filter_short(note_sequence_tensor, seq_len):
   note_sequence_len = tf.shape(note_sequence_tensor)[0]
   return tf.greater_equal(note_sequence_len, seq_len)
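A usage sketch: _filter_short above is written as a tf.data predicate, so it can drop note-sequence tensors that are shorter than the training window. The dummy sequences and window length below are illustrative.

import numpy as np
import tensorflow as tf

seq_len = 16
sequences = [np.zeros([4, 2]), np.zeros([20, 2]), np.zeros([16, 2])]
dataset = tf.data.Dataset.from_generator(
    lambda: iter(sequences), output_types=tf.float64, output_shapes=[None, 2])
dataset = dataset.filter(lambda seq: _filter_short(seq, seq_len))
# Only the length-20 and length-16 sequences pass the predicate.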
Ejemplo n.º 20
0
 def hash_in_range(self, buckets, base, limit):
     """Return true if the hashing key falls in the range [base, limit)."""
     hash_bucket = tf.string_to_hash_bucket_fast(self.scene_id, buckets)
     return tf.logical_and(tf.greater_equal(hash_bucket, base),
                           tf.less(hash_bucket, limit))
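A standalone sketch of the same idea: hash an example id into a fixed number of buckets and route it to a split deterministically, so the same example always lands in the same split. The bucket counts, range and example id below are illustrative.

import tensorflow as tf

def in_validation_split(example_id, buckets=100, val_buckets=10):
    """True if the id hashes into the first `val_buckets` of `buckets`."""
    bucket = tf.strings.to_hash_bucket_fast(example_id, buckets)
    return tf.logical_and(tf.greater_equal(bucket, 0),
                          tf.less(bucket, val_buckets))

print(in_validation_split(tf.constant("scene_00042")))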
Ejemplo n.º 21
0
def overlap_mask(depth1, pose1_c2w, depth2, pose2_c2w, intrinsics):
    """Compute the overlap masks of two views using triangulation.

  The masks have the same spatial shape as the input depth maps. A pixel value
  is true if the corresponding point can be seen by both cameras.

  Args:
    depth1: [HEIGHT, WIDTH, 1] the depth map of the first view.
    pose1_c2w: [3, 4] camera pose matrix (camera to world) of the first view.
      pose1_c2w[:, :3] is the rotation and pose1_c2w[:, -1] is the translation.
    depth2: [HEIGHT, WIDTH, 1] the depth map of the second view.
    pose2_c2w: [3, 4] camera pose matrix (camera to world) of the second view.
      pose2_c2w[:, :3] is the rotation and pose2_c2w[:, -1] is the translation.
    intrinsics: [3, 3] camera's intrinsic matrix.

  Returns:
    [HEIGHT, WIDTH] two overlap masks of the two inputs respectively.
  """

    pose1_w2c = tf.matrix_inverse(
        tf.concat([pose1_c2w, tf.constant([[0., 0., 0., 1.]])], 0))[:3]
    pose2_w2c = tf.matrix_inverse(
        tf.concat([pose2_c2w, tf.constant([[0., 0., 0., 1.]])], 0))[:3]

    p_world1 = image_to_world_projection(depth1, intrinsics, pose1_c2w)
    p_image1_in_2, z1_c2 = world_to_image_projection(p_world1, intrinsics,
                                                     pose2_w2c)

    p_world2 = image_to_world_projection(depth2, intrinsics, pose2_c2w)
    p_image2_in_1, z2_c1 = world_to_image_projection(p_world2, intrinsics,
                                                     pose1_w2c)

    shape = depth1.shape.as_list()
    height, width = shape[0], shape[1]
    height = tf.cast(height, tf.float32)
    width = tf.cast(width, tf.float32)
    # Error tolerance.
    eps = 1e-4
    # Check that points seen by camera 2 also project onto camera 1's image
    # plane and lie in front of camera 1.
    mask_h2_in_1 = tf.logical_and(
        tf.less_equal(p_image2_in_1[:, :, 1], height + eps),
        tf.greater_equal(p_image2_in_1[:, :, 1], 0. - eps))
    mask_w2_in_1 = tf.logical_and(
        tf.less_equal(p_image2_in_1[:, :, 0], width + eps),
        tf.greater_equal(p_image2_in_1[:, :, 0], 0. - eps))
    # check the projected points are within the image boundaries and in front of
    # the camera.
    mask2_in_1 = tf.logical_and(tf.logical_and(mask_h2_in_1, mask_w2_in_1),
                                tf.squeeze(z2_c1, -1) > 0)

    # Check that points seen by camera 1 also project onto camera 2's image
    # plane and lie in front of camera 2.
    mask_h1_in_2 = tf.logical_and(
        tf.less_equal(p_image1_in_2[:, :, 1], height + eps),
        tf.greater_equal(p_image1_in_2[:, :, 1], 0. - eps))
    mask_w1_in_2 = tf.logical_and(
        tf.less_equal(p_image1_in_2[:, :, 0], width + eps),
        tf.greater_equal(p_image1_in_2[:, :, 0], 0. - eps))
    # check the projected points are within the image boundaries and in front of
    # the camera.
    mask1_in_2 = tf.logical_and(tf.logical_and(mask_h1_in_2, mask_w1_in_2),
                                tf.squeeze(z1_c2, -1) > 0)

    return mask1_in_2, mask2_in_1
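The two halves of overlap_mask repeat the same test. The sketch below isolates it, assuming hypothetical inputs: a projected point contributes to the overlap mask only if it lands inside the image bounds (with tolerance eps) and has positive depth in the other camera.

import tensorflow as tf

def in_view_mask(pixels, depth, height, width, eps=1e-4):
    """pixels: [H, W, 2] (x, y) projected coordinates; depth: [H, W, 1]."""
    in_h = tf.logical_and(pixels[:, :, 1] <= height + eps,
                          pixels[:, :, 1] >= -eps)
    in_w = tf.logical_and(pixels[:, :, 0] <= width + eps,
                          pixels[:, :, 0] >= -eps)
    return tf.logical_and(tf.logical_and(in_h, in_w),
                          tf.squeeze(depth, -1) > 0)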
Ejemplo n.º 22
0
 def p_sample_loop_trajectory(self,
                              denoise_fn,
                              *,
                              shape,
                              noise_fn=tf.random_normal,
                              repeat_noise_steps=-1):
     """
 Generate samples, returning intermediate images
 Useful for visualizing how denoised images evolve over time
 Args:
   repeat_noise_steps (int): Number of denoising timesteps in which the same noise
     is used across the batch. If >= 0, the initial noise is the same for all batch elemements.
 """
     i_0 = tf.constant(self.num_timesteps - 1, dtype=tf.int32)
     assert isinstance(shape, (tuple, list))
     img_0 = noise_like(shape, noise_fn, repeat_noise_steps >= 0)
     times = tf.Variable([i_0])
     imgs = tf.Variable([img_0])
     # Steps with repeated noise
     times, imgs = tf.while_loop(
         cond=lambda times_, _: tf.less_equal(
             self.num_timesteps - times_[-1], repeat_noise_steps),
         body=lambda times_, imgs_: [
             tf.concat([times_, [times_[-1] - 1]], 0),
             tf.concat([
                 imgs_,
                 [
                     self.p_sample(denoise_fn=denoise_fn,
                                   x=imgs_[-1],
                                   t=tf.fill([shape[0]], times_[-1]),
                                   noise_fn=noise_fn,
                                   repeat_noise=True)
                 ]
             ], 0)
         ],
         loop_vars=[times, imgs],
         shape_invariants=[
             tf.TensorShape([None, *i_0.shape]),
             tf.TensorShape([None, *img_0.shape])
         ],
         back_prop=False)
     # Steps with different noise for each batch element
     times, imgs = tf.while_loop(
         cond=lambda times_, _: tf.greater_equal(times_[-1], 0),
         body=lambda times_, imgs_: [
             tf.concat([times_, [times_[-1] - 1]], 0),
             tf.concat([
                 imgs_,
                 [
                     self.p_sample(denoise_fn=denoise_fn,
                                   x=imgs_[-1],
                                   t=tf.fill([shape[0]], times_[-1]),
                                   noise_fn=noise_fn,
                                   repeat_noise=False)
                 ]
             ], 0)
         ],
         loop_vars=[times, imgs],
         shape_invariants=[
             tf.TensorShape([None, *i_0.shape]),
             tf.TensorShape([None, *img_0.shape])
         ],
         back_prop=False)
     assert imgs[-1].shape == shape
     return times, imgs
Ejemplo n.º 23
0
    def build():
        """Builds the Tensorflow graph."""
        inputs, lengths = None, None

        if mode in ('train', 'eval'):
            inputs, _, lengths = magenta.common.get_padded_batch(
                sequence_example_file_paths,
                hparams.batch_size,
                input_size,
                shuffle=mode == 'train')

        elif mode == 'generate':
            inputs = tf.placeholder(tf.float32,
                                    [hparams.batch_size, None, input_size])

        cell = events_rnn_graph.make_rnn_cell(
            hparams.rnn_layer_sizes,
            dropout_keep_prob=hparams.dropout_keep_prob
            if mode == 'train' else 1.0,
            attn_length=hparams.attn_length,
            residual_connections=hparams.residual_connections)

        rnn_nade = RnnNade(cell,
                           num_dims=input_size,
                           num_hidden=hparams.nade_hidden_units)

        if mode in ('train', 'eval'):
            log_probs, cond_probs = rnn_nade.log_prob(inputs, lengths)

            inputs_flat = tf.to_float(
                magenta.common.flatten_maybe_padded_sequences(inputs, lengths))
            predictions_flat = tf.to_float(tf.greater_equal(cond_probs, .5))

            if mode == 'train':
                loss = tf.reduce_mean(-log_probs)
                perplexity = tf.reduce_mean(tf.exp(log_probs))
                correct_predictions = tf.to_float(
                    tf.equal(inputs_flat, predictions_flat))
                accuracy = tf.reduce_mean(correct_predictions)
                precision = (tf.reduce_sum(inputs_flat * predictions_flat) /
                             tf.reduce_sum(predictions_flat))
                recall = (tf.reduce_sum(inputs_flat * predictions_flat) /
                          tf.reduce_sum(inputs_flat))

                optimizer = tf.train.AdamOptimizer(
                    learning_rate=hparams.learning_rate)

                train_op = contrib_slim.learning.create_train_op(
                    loss, optimizer, clip_gradient_norm=hparams.clip_norm)
                tf.add_to_collection('train_op', train_op)

                vars_to_summarize = {
                    'loss': loss,
                    'metrics/perplexity': perplexity,
                    'metrics/accuracy': accuracy,
                    'metrics/precision': precision,
                    'metrics/recall': recall,
                }
            elif mode == 'eval':
                vars_to_summarize, update_ops = contrib_metrics.aggregate_metric_map(
                    {
                        'loss':
                        tf.metrics.mean(-log_probs),
                        'metrics/perplexity':
                        tf.metrics.mean(tf.exp(log_probs)),
                        'metrics/accuracy':
                        tf.metrics.accuracy(inputs_flat, predictions_flat),
                        'metrics/precision':
                        tf.metrics.precision(inputs_flat, predictions_flat),
                        'metrics/recall':
                        tf.metrics.recall(inputs_flat, predictions_flat),
                    })
                for updates_op in update_ops.values():
                    tf.add_to_collection('eval_ops', updates_op)

            precision = vars_to_summarize['metrics/precision']
            recall = vars_to_summarize['metrics/recall']
            f1_score = tf.where(
                tf.greater(precision + recall, 0),
                2 * ((precision * recall) / (precision + recall)), 0)
            vars_to_summarize['metrics/f1_score'] = f1_score
            for var_name, var_value in vars_to_summarize.items():
                tf.summary.scalar(var_name, var_value)
                tf.add_to_collection(var_name, var_value)

        elif mode == 'generate':
            initial_state = rnn_nade.zero_state(hparams.batch_size)

            final_state = rnn_nade.steps(inputs, initial_state)
            samples, log_prob = rnn_nade.sample_single(initial_state)

            tf.add_to_collection('inputs', inputs)
            tf.add_to_collection('sample', samples)
            tf.add_to_collection('log_prob', log_prob)

            # Flatten state tuples for metagraph compatibility.
            for state in tf.nest.flatten(initial_state):
                tf.add_to_collection('initial_state', state)
            for state in tf.nest.flatten(final_state):
                tf.add_to_collection('final_state', state)
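A minimal sketch of the binary precision/recall/F1 computation used in the training branch above, on plain tensors (thresholded conditional probabilities against binary targets), with the same tf.where guard against a zero denominator. The values are illustrative.

import tensorflow as tf

targets = tf.constant([1., 0., 1., 1., 0.])
cond_probs = tf.constant([0.9, 0.4, 0.3, 0.8, 0.6])
predictions = tf.cast(tf.greater_equal(cond_probs, 0.5), tf.float32)

true_positives = tf.reduce_sum(targets * predictions)
precision = true_positives / tf.reduce_sum(predictions)
recall = true_positives / tf.reduce_sum(targets)
f1_score = tf.where(tf.greater(precision + recall, 0),
                    2 * precision * recall / (precision + recall), 0.)
# Here precision = recall = 2/3, so f1_score = 2/3.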
Ejemplo n.º 24
0
def dot_product_area_attention(q,
                               k,
                               v,
                               bias,
                               dropout_rate=0.0,
                               image_shapes=None,
                               name=None,
                               attention_image_summary=None,
                               save_weights_to=None,
                               dropout_broadcast_dims=None,
                               max_area_width=1,
                               max_area_height=1,
                               memory_height=1,
                               area_key_mode="mean",
                               area_value_mode="sum",
                               top_k_areas=0,
                               area_temperature=1.0,
                               training=True):
    """Dot-product area attention.

  Args:
    q: Tensor with shape [..., length_q, depth_k].
    k: Tensor with shape [..., length_kv, depth_k]. Leading dimensions must
      match with q.
    v: Tensor with shape [..., length_kv, depth_v] Leading dimensions must
      match with q.
    bias: bias Tensor (see attention_bias())
    dropout_rate: a float.
    image_shapes: optional tuple of integer scalars.
      see comments for attention_image_summary()
    name: an optional string
    attention_image_summary: the callback for making image summary of attention.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    dropout_broadcast_dims: an optional list of integers less than rank of q.
      Specifies in which dimensions to broadcast the dropout decisions.
    max_area_width: the max width allowed for an area.
    max_area_height: the max height allowed for an area.
    memory_height: the height of the memory.
    area_key_mode: the mode for computing area keys, which can be "mean",
      "concat", "sum", "sample_concat", and "sample_sum".
    area_value_mode: the mode for computing area values, which can be "mean",
      "max", or "sum".
    top_k_areas: Use the top key areas for attention.
    area_temperature: the temperature for attention softmax.
    training: whether the layer is in training mode.
  Returns:
    Tensor with shape [..., length_q, depth_v].
  """

    tf.logging.info(
        "dot_product_area_attention: "
        "area_h=%d, area_w=%d, mem_h=%d, "
        "area_key_mode=%s, area_value_mode=%s, "
        "area_temperature=%f", max_area_height, max_area_width, memory_height,
        area_key_mode, area_value_mode, area_temperature)
    with tf.variable_scope(name,
                           default_name="dot_product_area_attention",
                           values=[q, k, v]) as scope:
        mem_shape = common_layers.shape_list(k)
        batch_size = mem_shape[0]
        head_size = mem_shape[1]
        length = mem_shape[2]
        depth = mem_shape[3]
        k_area = compute_area_key(tf.reshape(k, [-1, length, depth]),
                                  max_area_width=max_area_width,
                                  max_area_height=max_area_height,
                                  height=memory_height,
                                  mode=area_key_mode,
                                  training=training)
        if area_value_mode == "mean":
            v_area, _, _, _, _ = compute_area_features(
                tf.reshape(v, [-1, length, depth]),
                max_area_width=max_area_width,
                max_area_height=max_area_height,
                height=memory_height)
        elif area_value_mode == "max":
            v_area, _, _ = basic_pool(tf.reshape(v, [-1, length, depth]),
                                      max_area_width=max_area_width,
                                      max_area_height=max_area_height,
                                      height=memory_height,
                                      fn=tf.reduce_max)
        elif area_value_mode == "sum":
            _, _, v_area, _, _ = compute_area_features(
                tf.reshape(v, [-1, length, depth]),
                max_area_width=max_area_width,
                max_area_height=max_area_height,
                height=memory_height)
        else:
            raise ValueError("Unsupported area value mode=%s" %
                             area_value_mode)
        k = tf.reshape(k_area, [batch_size, head_size, -1, depth])
        v = tf.reshape(v_area, [batch_size, head_size, -1, depth])
        logits = tf.matmul(q, k,
                           transpose_b=True)  # [..., length_q, length_kv]
        if bias is not None:
            bias = common_layers.cast_like(bias, logits)
            with tf.name_scope("compute_area_att_bias", values=[bias]):
                bias_shape = common_layers.shape_list(bias)
                mem_length = bias_shape[-1]
                bias_values = tf.reshape(tf.to_float(tf.less(bias, -1)),
                                         [-1, mem_length, 1])
                _, _, padding_sum, _, _ = compute_area_features(
                    bias_values,
                    max_area_width=max_area_width,
                    max_area_height=max_area_height,
                    height=memory_height)
                bias = tf.where(tf.cast(tf.to_int32(padding_sum), tf.bool),
                                tf.fill(tf.shape(padding_sum), -np.inf),
                                tf.zeros_like(padding_sum, dtype=tf.float32))
                bias = tf.reshape(
                    bias, [bias_shape[0], bias_shape[1], bias_shape[2], -1])
            logits += bias
        logits = logits / area_temperature
        weights = tf.nn.softmax(logits, name="attention_weights")
        if top_k_areas > 0:
            tf.logging.info("area_attention top_k_areas=%d", top_k_areas)
            top_k = tf.minimum(
                common_layers.shape_list(weights)[-1], top_k_areas)
            top_weights, _ = tf.nn.top_k(weights, k=top_k)
            min_values = tf.reduce_min(top_weights, -1, keepdims=True)
            weights = tf.where(tf.greater_equal(weights, min_values), weights,
                               tf.zeros_like(weights))
            weights = tf.div(weights, tf.reduce_sum(weights, -1,
                                                    keepdims=True))
        if save_weights_to is not None:
            save_weights_to[scope.name] = weights
            save_weights_to[scope.name + "/logits"] = logits
        # Drop out attention links for each head.
        weights = common_layers.dropout_with_broadcast_dims(
            weights, 1.0 - dropout_rate, broadcast_dims=dropout_broadcast_dims)
        if common_layers.should_generate_summaries(
        ) and attention_image_summary:
            attention_image_summary(weights, image_shapes)
        return tf.matmul(weights, v)
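
A minimal usage sketch (not part of the original source): it assumes the keyword arguments documented above and calls dot_product_area_attention on randomly initialized multi-head tensors of shape [batch, heads, length, depth].

# Hypothetical usage sketch; keyword names are taken from the docstring above,
# and the shapes and values are illustrative only.
import tensorflow as tf

batch, heads, length, depth = 2, 4, 16, 32
q = tf.random_normal([batch, heads, length, depth])
k = tf.random_normal([batch, heads, length, depth])
v = tf.random_normal([batch, heads, length, depth])

context = dot_product_area_attention(
    q=q, k=k, v=v,
    bias=None,
    dropout_rate=0.1,
    max_area_width=2,        # allow areas up to 2 positions wide
    max_area_height=1,       # 1-D memory, so height stays 1
    memory_height=1,
    area_key_mode="mean",
    area_value_mode="sum",
    top_k_areas=0,           # keep all areas
    training=True)
# context has shape [batch, heads, length, depth].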
Ejemplo n.º 25
0
def input_fn_dataset(dataset, flags):
    """Gets the model input from the given dataset."""
    features = {}
    dataset_descriptor = dataset_descriptors[flags.dataset]

    def process_label(label):
        """Preprocesses the label."""
        label = tf.image.decode_image(label, channels=1)
        ignore_label = 255
        label = tf.cast(label, tf.int32)

        if flags.preprocess_divide_label:
            label /= 255

        label = resize_im(label, flags.image_size, ignore_label, 1)
        label = tf.cast(label, tf.int32)
        return label

    def _parse_function(*args):
        """Parses the tf example."""
        serialized_example = args[-1]

        context_feature_names = {
            dataset_descriptor.image_id: tf.FixedLenFeature([], tf.string),
        }
        sequence_feature_names = {}
        if flags.use_ref_exp:
            context_feature_names[REF_EXP_ID] = tf.FixedLenFeature([],
                                                                   tf.string)

        if flags.use_labels:
            if dataset_descriptor.has_candidate:
                context_feature_names[
                    SELECTED_CANDIDATE_ID] = tf.FixedLenFeature([], tf.int64)
                sequence_feature_names[
                    ELEMENTS_MASK_ID] = tf.FixedLenSequenceFeature([],
                                                                   tf.string)
            else:
                context_feature_names[
                    dataset_descriptor.label_id] = tf.FixedLenFeature(
                        [], tf.string)

        if dataset_descriptor.has_elements_boxes:
            sequence_feature_names[
                dataset_descriptor.
                elements_box_id] = tf.FixedLenSequenceFeature([4],
                                                              dtype=tf.float32)
        if flags.use_elements_texts:
            sequence_feature_names[
                dataset_descriptor.
                elements_text_id] = tf.FixedLenSequenceFeature([],
                                                               dtype=tf.string)
        if flags.use_elements_neighbors:
            sequence_feature_names[
                ELEMENTS_NEIGHBORS_ID] = tf.FixedLenSequenceFeature(
                    [], dtype=tf.string)
        if flags.use_elements_ref_match:
            sequence_feature_names[
                ELEMENTS_REF_MATCH_ID] = tf.FixedLenSequenceFeature(
                    [], dtype=tf.string)

        if flags.use_groundtruth_box:
            context_feature_names[GROUNDTRUTH_XMIN_ID] = tf.FixedLenFeature(
                [], tf.float32)
            context_feature_names[GROUNDTRUTH_XMAX_ID] = tf.FixedLenFeature(
                [], tf.float32)
            context_feature_names[GROUNDTRUTH_YMIN_ID] = tf.FixedLenFeature(
                [], tf.float32)
            context_feature_names[GROUNDTRUTH_YMAX_ID] = tf.FixedLenFeature(
                [], tf.float32)

        context_features, sequence_features = tf.parse_single_sequence_example(
            serialized_example,
            context_features=context_feature_names,
            sequence_features=sequence_feature_names,
        )

        features.update(context_features)
        features.update(sequence_features)

        if flags.use_elements_texts:
            features[ELEMENTS_TEXT_ID] = features.pop(
                dataset_descriptor.elements_text_id)
        if dataset_descriptor.has_elements_boxes:
            features[ELEMENTS_BOX_ID] = features.pop(
                dataset_descriptor.elements_box_id)

        image = features.pop(dataset_descriptor.image_id)
        image = tf.image.decode_image(image, channels=3)

        image = tf.cast(image, tf.float32)
        mean_pixel = tf.reshape(
            feature_extractor.mean_pixel(flags.model_variant), [1, 1, 3])

        features[IMAGE_PAD_WEIGHTS_ID] = tf.ones_like(image[:, :, 0:1])
        features[IMAGE_PAD_WEIGHTS_ID] = resize_im(
            features[IMAGE_PAD_WEIGHTS_ID], flags.image_size, 0, 1)
        features[IMAGE_PAD_WEIGHTS_ID] = tf.squeeze(
            features[IMAGE_PAD_WEIGHTS_ID], 2)

        if dataset_descriptor.has_elements_boxes:
            image = resize_im(image, flags.image_size, mean_pixel, 3, features)
        else:
            image = resize_im(image, flags.image_size, mean_pixel, 3)

        if flags.use_labels:
            if dataset_descriptor.has_candidate:
                features[ELEMENTS_MASK_ID] = tf.map_fn(
                    process_label,
                    features.pop(ELEMENTS_MASK_ID),
                    parallel_iterations=128,
                    dtype=tf.int32,
                    name="mask_map")
                features[LABEL_ID] = tf.gather_nd(
                    features[ELEMENTS_MASK_ID],
                    [features[SELECTED_CANDIDATE_ID]])
            else:
                label = features.pop(dataset_descriptor.label_id)
                label = process_label(label)
                features[LABEL_ID] = label

        if flags.use_elements_texts:
            features[ELEMENTS_EXIST_ID] = tf.ones_like(
                features[ELEMENTS_TEXT_ID], dtype=tf.int32)
        elif dataset_descriptor.has_elements_boxes:
            features[ELEMENTS_EXIST_ID] = tf.ones(tf.shape(
                features[ELEMENTS_BOX_ID])[:1],
                                                  dtype=tf.int32)

        if flags.use_elements_neighbors:
            features[ELEMENTS_NEIGHBORS_ID] = convert_string_neighbors(
                features[ELEMENTS_NEIGHBORS_ID])

        features[IMAGE_ID] = image

        return features

    dataset = dataset.map(_parse_function,
                          num_parallel_calls=flags.dataset_threads).prefetch(
                              flags.batch_size)

    padded_shapes = {
        IMAGE_ID: [None, None, None],
    }
    if flags.use_labels:
        padded_shapes[LABEL_ID] = [None, None, None]
        if flags.use_groundtruth_box:
            padded_shapes[GROUNDTRUTH_XMIN_ID] = []
            padded_shapes[GROUNDTRUTH_XMAX_ID] = []
            padded_shapes[GROUNDTRUTH_YMIN_ID] = []
            padded_shapes[GROUNDTRUTH_YMAX_ID] = []
    if flags.use_elements_texts:
        padded_shapes[ELEMENTS_TEXT_ID] = [None]
        padded_shapes[ELEMENTS_EXIST_ID] = [None]
    if dataset_descriptor.has_elements_boxes:
        padded_shapes[ELEMENTS_BOX_ID] = [None, None]
        padded_shapes[ELEMENTS_EXIST_ID] = [None]
    if flags.use_elements_neighbors:
        padded_shapes[ELEMENTS_NEIGHBORS_ID] = [None, None]
    if flags.use_elements_ref_match:
        padded_shapes[ELEMENTS_REF_MATCH_ID] = [None]

    padded_shapes[IMAGE_PAD_WEIGHTS_ID] = [None, None]

    if flags.use_ref_exp:
        padded_shapes.update({
            REF_EXP_ID: [],
        })
    if dataset_descriptor.has_candidate:
        padded_shapes.update({
            SELECTED_CANDIDATE_ID: [],
            ELEMENTS_MASK_ID: [None, None, None, None],
        })

    dataset = dataset.padded_batch(flags.batch_size,
                                   padded_shapes=padded_shapes)
    dataset = dataset.prefetch(1)

    try:
        iterator = dataset.make_one_shot_iterator()
        feature_map = iterator.get_next()
    except ValueError:
        # The input pipeline probably uses placeholders (e.g., it is running in
        # inference mode), so fetch a single element from the dataset directly.
        feature_map = tf.contrib.data.get_single_element(dataset)

    feature_map[IMAGE_ID] = tf.reshape(
        feature_map[IMAGE_ID], [-1, flags.image_size, flags.image_size, 3])

    assert_ops = []
    if dataset_descriptor.has_elements_boxes:
        assert_ops.append(
            assert_or_warn(
                tf.greater_equal(tf.reduce_min(feature_map[ELEMENTS_BOX_ID]),
                                 -.001),
                [
                    "Bounding box is negative",
                    tf.reduce_min(feature_map[ELEMENTS_BOX_ID])
                ], flags.incorrect_boxes_as_errors))

        assert_ops.append(
            assert_or_warn(
                tf.less_equal(
                    tf.reduce_max(feature_map[ELEMENTS_BOX_ID][:, :, 0] +
                                  feature_map[ELEMENTS_BOX_ID][:, :, 2]),
                    1.001), [
                        "Bounding box x dim is too large.",
                        tf.reduce_max(feature_map[ELEMENTS_BOX_ID][:, :, 0] +
                                      feature_map[ELEMENTS_BOX_ID][:, :, 2])
                    ], flags.incorrect_boxes_as_errors))

        assert_ops.append(
            assert_or_warn(
                tf.less_equal(
                    tf.reduce_max(feature_map[ELEMENTS_BOX_ID][:, :, 1] +
                                  feature_map[ELEMENTS_BOX_ID][:, :, 3]),
                    1.001), [
                        "Bounding box y dim is too large.",
                        tf.reduce_max(feature_map[ELEMENTS_BOX_ID][:, :, 1] +
                                      feature_map[ELEMENTS_BOX_ID][:, :, 3])
                    ], flags.incorrect_boxes_as_errors))

    with tf.control_dependencies(assert_ops):
        if dataset_descriptor.has_elements_boxes:
            feature_map[ELEMENTS_BOX_ID].set_shape([None, None, 4])
            feature_map[ELEMENTS_EXIST_ID] = tf.cast(
                feature_map[ELEMENTS_EXIST_ID], tf.bool)
        if flags.use_labels:
            if flags.output_mode == "segment" or flags.output_mode == "regression":
                feature_map[LABEL_ID] = tf.reshape(
                    feature_map[LABEL_ID],
                    [-1, flags.image_size, flags.image_size, 1])
    return feature_map
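
The heavy lifting in _parse_function is the context/sequence split performed by tf.parse_single_sequence_example. A self-contained toy sketch of that pattern follows; the feature names here are illustrative and are not the dataset descriptor's actual keys.

# Toy demonstration of the parsing pattern used in _parse_function: context
# features hold per-example values, sequence features hold one entry per
# element. Feature names below are illustrative only.
import tensorflow as tf

example = tf.train.SequenceExample()
example.context.feature["image/id"].bytes_list.value.append(b"img_0")
boxes = example.feature_lists.feature_list["elements/box"]
boxes.feature.add().float_list.value.extend([0.1, 0.1, 0.4, 0.3])
boxes.feature.add().float_list.value.extend([0.5, 0.2, 0.3, 0.3])

context, sequence = tf.parse_single_sequence_example(
    example.SerializeToString(),
    context_features={"image/id": tf.FixedLenFeature([], tf.string)},
    sequence_features={
        "elements/box": tf.FixedLenSequenceFeature([4], dtype=tf.float32)
    })
# context["image/id"] is a scalar string; sequence["elements/box"] has shape
# [num_elements, 4], matching how the box features are padded later.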
Ejemplo n.º 26
0
def _is_enough_agreement(example):
    return tf.greater_equal(example['agreement_count'], required_agreement)
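
Presumably this predicate filters a tf.data pipeline; a one-line sketch under that assumption, with required_agreement bound in the enclosing scope and each element being a feature dict with an 'agreement_count' key:

# Hypothetical usage: drop examples whose raters did not agree often enough.
dataset = dataset.filter(_is_enough_agreement)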
Ejemplo n.º 27
0
def compare_dims(a, b, x):
    """At least `x` of `a` and `b` `Tensors` are true."""
    match = tf.equal(a, b)
    match = tf.cast(match, tf.int32)
    return tf.greater_equal(tf.reduce_sum(match), x)
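
A toy invocation (illustrative values only) showing what the predicate returns:

# Two of the three entries match, so the result is True for any x <= 2.
a = tf.constant([1, 2, 3])
b = tf.constant([1, 2, 7])
enough_matches = compare_dims(a, b, 2)  # reduce_sum(match) = 2 >= 2 -> True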
Ejemplo n.º 28
0
    def _static_subsample(self, indicator, batch_size, labels):
        """Returns subsampled minibatch.

    Args:
      indicator: boolean tensor of shape [N] whose True entries can be sampled.
        N should be a compile-time constant.
      batch_size: desired batch size. This scalar cannot be None.
      labels: boolean tensor of shape [N] denoting positive (=True) and negative
        (=False) examples. N should be a compile-time constant.

    Returns:
      sampled_idx_indicator: boolean tensor of shape [N], True for entries that
        are sampled. Exactly batch_size entries are marked True, even when the
        number of True entries in indicator is less than batch_size.

    Raises:
      ValueError: if labels and indicator are not 1D boolean tensors.
    """
        # Check if indicator and labels have a static size.
        if not indicator.shape.is_fully_defined():
            raise ValueError('indicator must be static in shape when '
                             'is_static is True')
        if not labels.shape.is_fully_defined():
            raise ValueError('labels must be static in shape when '
                             'is_static is True')
        if not isinstance(batch_size, int):
            raise ValueError('batch_size has to be an integer when '
                             'is_static is True.')

        input_length = tf.shape(indicator)[0]

        # Ensure that at least batch_size entries in indicator are True by
        # flipping additional False entries to True when necessary.
        num_true_sampled = tf.reduce_sum(tf.cast(indicator, tf.float32))
        additional_false_sample = tf.less_equal(
            tf.cumsum(tf.cast(tf.logical_not(indicator), tf.float32)),
            batch_size - num_true_sampled)
        indicator = tf.logical_or(indicator, additional_false_sample)

        # Shuffle indicator and label. Need to store the permutation to restore the
        # order post sampling.
        permutation = tf.random_shuffle(tf.range(input_length))
        indicator = ops.matmul_gather_on_zeroth_axis(
            tf.cast(indicator, tf.float32), permutation)
        labels = ops.matmul_gather_on_zeroth_axis(tf.cast(labels, tf.float32),
                                                  permutation)

        # index (starting from 1) when indicator is True, 0 when False
        indicator_idx = tf.where(tf.cast(indicator, tf.bool),
                                 tf.range(1, input_length + 1),
                                 tf.zeros(input_length, tf.int32))

        # Encode each label as -1 (negative) or +1 (positive).
        signed_label = tf.where(
            tf.cast(labels, tf.bool), tf.ones(input_length, tf.int32),
            tf.scalar_mul(-1, tf.ones(input_length, tf.int32)))
        # negative of index for negative label, positive index for positive label,
        # 0 when indicator is False.
        signed_indicator_idx = tf.multiply(indicator_idx, signed_label)
        sorted_signed_indicator_idx = tf.nn.top_k(signed_indicator_idx,
                                                  input_length,
                                                  sorted=True).values

        [num_positive_samples, num_negative_samples
         ] = self._get_num_pos_neg_samples(sorted_signed_indicator_idx,
                                           batch_size)

        sampled_idx = self._get_values_from_start_and_end(
            sorted_signed_indicator_idx, num_positive_samples,
            num_negative_samples, batch_size)

        # Shift the indices to start from 0 and remove any samples that are set as
        # False.
        sampled_idx = tf.abs(sampled_idx) - tf.ones(batch_size, tf.int32)
        sampled_idx = tf.multiply(
            tf.cast(tf.greater_equal(sampled_idx, tf.constant(0)), tf.int32),
            sampled_idx)

        sampled_idx_indicator = tf.cast(
            tf.reduce_sum(tf.one_hot(sampled_idx, depth=input_length), axis=0),
            tf.bool)

        # Project the sampled indicator back to the original (pre-shuffle)
        # order using the stored permutation.
        reprojections = tf.one_hot(permutation,
                                   depth=input_length,
                                   dtype=tf.float32)
        return tf.cast(
            tf.tensordot(tf.cast(sampled_idx_indicator, tf.float32),
                         reprojections,
                         axes=[0, 0]), tf.bool)
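
A hand-worked toy trace of the signed-index trick above (values chosen for illustration, with the shuffle step omitted for clarity; not taken from the original tests):

# indicator            = [True, True, False, True]
# labels               = [True, False, False, False]
# indicator_idx        = [1, 2, 0, 4]    (1-based index where sampleable, else 0)
# signed_label         = [1, -1, -1, -1]
# signed_indicator_idx = [1, -2, 0, -4]
# tf.nn.top_k(..., sorted=True).values = [1, 0, -2, -4]
# Positive samples end up at the front and negative samples at the back, so
# _get_values_from_start_and_end can take num_positive_samples from the start
# and num_negative_samples from the end to build a class-balanced minibatch.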
Ejemplo n.º 29
0
def _at_least_x_are_equal(a, b, x):
    """At least `x` of `a` and `b` `Tensors` are equal."""
    match = tf.equal(a, b)
    match = tf.cast(match, tf.int32)
    return tf.greater_equal(tf.reduce_sum(match), x)
Ejemplo n.º 30
0
def _double_factorial_loop_body(n, result, two):
    result = tf.where(tf.greater_equal(n, two), result + tf.math.log(n), result)
    return n - two, result, two
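
This body pairs with a tf.while_loop stopping condition to accumulate log(n!!); a hedged sketch of how it might be driven (the condition lambda here is an assumption, not code from the original source):

n = tf.constant([7.0, 8.0])
two = tf.constant(2.0)
log_result = tf.zeros_like(n)

# Assumed driver: loop while any entry of n is still >= 2.
_, log_double_factorial, _ = tf.while_loop(
    cond=lambda n, result, two: tf.reduce_any(tf.greater_equal(n, two)),
    body=_double_factorial_loop_body,
    loop_vars=[n, log_result, two])
# tf.exp(log_double_factorial) evaluates to [105., 384.], i.e. 7!! and 8!!.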