Example #1
    def resize_and_crop_boxes(self):
        """Resizes boxes and crops them to the self._output dimension."""
        boxlist = preprocessor.box_list.BoxList(self._boxes)
        # boxlist is in range of [0, 1], so here we pass the scale_height/width
        # instead of just scale.
        boxes = preprocessor.box_list_scale(boxlist, self._scaled_height,
                                            self._scaled_width).get()
        # Adjust box coordinates based on the offset.
        box_offset = tf.stack([
            self._crop_offset_y,
            self._crop_offset_x,
            self._crop_offset_y,
            self._crop_offset_x,
        ])
        boxes -= tf.cast(tf.reshape(box_offset, [1, 4]), tf.float32)
        # Clip the boxes.
        boxes = self.clip_boxes(boxes)
        # Filter out ground truth boxes that are degenerate (zero area).
        indices = tf.where(
            tf.not_equal(
                (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]), 0))
        boxes = tf.gather_nd(boxes, indices)
        classes = tf.gather_nd(self._classes, indices)
        return boxes, classes
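The tf.where / tf.gather_nd pair above is a common idiom for dropping degenerate boxes. A minimal, self-contained sketch of just that step (the box values are invented for illustration):

import tensorflow as tf

boxes = tf.constant([[0., 0., 10., 10.],   # valid
                     [5., 5., 5., 9.],     # zero height, gets dropped
                     [1., 2., 3., 4.]])    # valid
classes = tf.constant([1, 2, 3])

# Keep rows whose area (y2 - y1) * (x2 - x1) is non-zero.
areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
indices = tf.where(tf.not_equal(areas, 0))
filtered_boxes = tf.gather_nd(boxes, indices)      # shape [2, 4]
filtered_classes = tf.gather_nd(classes, indices)  # shape [2]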
Example #2
def _stitch(features):
    """Stitch features on the first dimension."""
    full_mask = tf.greater(features['task'], 1)
    step_mask = tf.reduce_any(full_mask, axis=-1)
    step_mask_exclude_last = tf.pad(step_mask, [[0, 0], [0, 1]],
                                    constant_values=False)[:, 1:]
    num_sequences = common_layers.shape_list(features['task'])[0]
    num_steps = common_layers.shape_list(features['task'])[1]
    connectors = tf.constant(PADDED_CONCATENATORS)
    # Select connectors
    connector_indices = tf.random.uniform([num_sequences * num_steps],
                                          minval=0,
                                          maxval=len(PADDED_CONCATENATORS),
                                          dtype=tf.int32)
    selected_connectors = tf.reshape(
        tf.gather(connectors, connector_indices),
        [num_sequences, num_steps,
         len(PADDED_CONCATENATORS[0])])
    selected_connectors = tf.multiply(selected_connectors,
                                      tf.expand_dims(
                                          tf.to_int32(step_mask_exclude_last),
                                          2),
                                      name='connector_mask')
    features['task'] = tf.concat([features['task'], selected_connectors],
                                 axis=-1)
    ref_offsets = tf.expand_dims(
        tf.cumsum(tf.reduce_sum(tf.to_int32(tf.greater(features['task'], 1)),
                                -1),
                  exclusive=True,
                  axis=-1), 2)
    features['task'] = tf.reshape(features['task'], [num_sequences, -1])
    full_mask = tf.greater(features['task'], 1)
    full_mask_int = tf.to_int32(full_mask)
    indices = tf.where(
        tf.sequence_mask(lengths=tf.reduce_sum(full_mask_int, -1)))
    values = tf.boolean_mask(tf.reshape(features['task'], [-1]),
                             tf.reshape(full_mask, [-1]))
    sparse_task = tf.sparse.SparseTensor(indices=indices,
                                         values=values,
                                         dense_shape=tf.to_int64(
                                             tf.shape(features['task'])))
    # Stitch task and raw_task
    stitched_features = {}
    stitched_features['task'] = tf.sparse_tensor_to_dense(sparse_task)
    max_len = tf.reduce_max(
        tf.reduce_sum(tf.to_int32(tf.greater(stitched_features['task'], 1)),
                      -1))
    stitched_features['task'] = stitched_features['task'][:, :max_len]
    if 'raw_task' in features:
        connector_strs = tf.reshape(
            tf.gather(tf.constant(CONCATENATORS_STR), connector_indices),
            [num_sequences, num_steps])
        masked_connector_strs = tf.where(step_mask_exclude_last,
                                         connector_strs,
                                         tf.fill(tf.shape(connector_strs), ''))
        stitched_features['raw_task'] = tf.strings.reduce_join(
            tf.strings.reduce_join(tf.concat([
                tf.expand_dims(features['raw_task'], 2),
                tf.expand_dims(masked_connector_strs, 2)
            ],
                                             axis=2),
                                   axis=-1), -1)
    # Stitch screen sequences
    action_lengths = tf.reduce_sum(
        tf.to_int32(
            tf.greater(features['verb_refs'][:, :, 0, 1],
                       features['verb_refs'][:, :, 0, 0])), -1)
    max_action_length = tf.reduce_max(action_lengths)

    def _pad(tensor, padding_value=0):
        shape_list = common_layers.shape_list(tensor)
        assert len(shape_list) >= 2
        padding_list = [[0, 0], [0, 1]] + [[0, 0]] * (len(shape_list) - 2)
        return tf.pad(tensor[:, :max_action_length],
                      padding_list,
                      constant_values=padding_value)

    for key in features.keys():
        if key.endswith('_refs'):
            features[key] = tf.squeeze(features[key], 2)
            ref_mask = tf.expand_dims(
                tf.to_int32(
                    tf.not_equal(features[key][:, :, 0], features[key][:, :,
                                                                       1])), 2)
            stitched_features[key] = tf.multiply((features[key] + ref_offsets),
                                                 ref_mask,
                                                 name='ref_mask')
            stitched_features[key] = _pad(stitched_features[key])
        elif key in [
                'verbs', 'objects', 'consumed', 'obj_dom_pos', 'obj_text',
                'obj_type', 'obj_clickable', 'obj_screen_pos', 'verb_refs',
                'obj_refs', 'input_refs', 'obj_dom_dist'
        ]:
            features[key] = tf.squeeze(features[key], 2)
            stitched_features[key] = features[key]
            stitched_features[key] = _pad(
                stitched_features[key],
                padding_value=-1 if key == 'obj_type' else 0)
        elif key not in ['task', 'raw_task']:
            stitched_features[key] = features[key][:, 0]
    # Append eos to 'task'
    stitched_features['task'] = tf.pad(stitched_features['task'],
                                       [[0, 0], [0, 1]])
    task_mask = tf.to_int32(tf.greater(stitched_features['task'], 1))
    task_eos_mask = tf.pad(task_mask, [[0, 0], [1, 0]],
                           constant_values=1)[:, :-1]
    stitched_features['task'] = stitched_features['task'] + (task_eos_mask -
                                                             task_mask)
    # Append eos
    verb_mask = tf.to_int32(tf.greater(stitched_features['verbs'], 1))
    verb_eos_mask = tf.pad(verb_mask, [[0, 0], [1, 0]],
                           constant_values=1)[:, :-1]
    verb_eos = verb_eos_mask - verb_mask
    stitched_features['verbs'] = stitched_features['verbs'] + verb_eos
    # Append last step refs to 'verb_refs'
    task_lengths = tf.where(tf.equal(stitched_features['task'], 1))[:, 1]
    eos_pos = tf.to_int32(tf.stack([task_lengths, task_lengths + 1], axis=1))
    action_mask = tf.to_int32(
        tf.sequence_mask(action_lengths, max_action_length + 1))
    action_and_eos_mask = tf.pad(action_mask, [[0, 0], [1, 0]],
                                 constant_values=1)[:, :-1]
    verb_ref_eos = action_and_eos_mask - action_mask
    eos_refs = tf.multiply(tf.tile(tf.expand_dims(eos_pos, 1),
                                   [1, max_action_length + 1, 1]),
                           tf.expand_dims(verb_ref_eos, 2),
                           name='verb_ref_eos')
    stitched_features['verb_refs'] += eos_refs
    return stitched_features
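The SparseTensor step in the middle of _stitch performs left-packing: non-padding ids (ids > 1, matching the code above) are compacted to the front of each row. A minimal sketch of just that trick, with made-up token ids:

import tensorflow as tf

task = tf.constant([[5, 0, 7, 9, 0],
                    [0, 3, 0, 0, 4]])
mask = tf.greater(task, 1)
lengths = tf.reduce_sum(tf.cast(mask, tf.int32), -1)
indices = tf.where(tf.sequence_mask(lengths=lengths))
values = tf.boolean_mask(tf.reshape(task, [-1]), tf.reshape(mask, [-1]))
packed = tf.sparse.to_dense(
    tf.sparse.SparseTensor(indices=indices, values=values,
                           dense_shape=tf.cast(tf.shape(task), tf.int64)))
# packed == [[5, 7, 9, 0, 0], [3, 4, 0, 0, 0]]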
Example #3
    def create_training_ops(
        self,
        phi_all,
        values,
        target_values,
        advantages,
        deltas_training,
        delta_sums_training,
        pieces_training,
        old_probs,
        params,
    ):
        clip_param, c1, c2, c3, c4, e = params["clipping_parameter"], params[
            "value_loss"], params["policy_loss"], params[
                "entropy_loss"], params["impossibility_loss"], 10**-6
        #current phi(a|s)
        p_mask = tf.reshape(tf.one_hot(pieces_training[:, :], self.n_pieces),
                            (-1, 1, 1, self.n_pieces),
                            name='p_mask')
        values = tf.reduce_sum(values * p_mask, axis=[2, 3])

        phi = tf.reduce_sum(phi_all * p_mask, axis=3, keepdims=True)
        delta_phi = phi * tf.cast(deltas_training, tf.float32)
        delta_sum_phi = phi * tf.cast(delta_sums_training, tf.float32)
        probability = (tf.reduce_sum(delta_phi, axis=[1, 2]) +
                       e) / (tf.reduce_sum(delta_sum_phi, axis=[1, 2]) + e)

        #probability ratio
        r = tf.maximum(probability, e) / tf.maximum(old_probs, e)
        clipped_r = tf.clip_by_value(r, 1 - clip_param, 1 + clip_param)
        r_saturation = tf.reduce_mean(
            tf.cast(tf.not_equal(r, clipped_r), tf.float32))

        advnorm = adv_normalizer(0.01, safety=2.0, clip_val=4.0)
        if self.settings["compress_advantages"]:
            advantages = advnorm(advantages)

        policy_loss = tf.minimum(r * advantages, clipped_r * advantages)
        #impossibility loss
        impossibility_loss_tf = phi * (
            1 - tf.minimum(1.0, tf.cast(delta_sums_training, tf.float32)))
        #entropy
        entropy_bonus = action_entropy = N.action_entropy(
            delta_sum_phi /
            tf.reduce_sum(tf.cast(delta_sums_training, tf.float32) + e,
                          axis=[
                              1,
                              2,
                              3,
                          ],
                          keepdims=True) + e)

        #tally up
        self.value_loss_tf = c1 * tf.losses.mean_squared_error(
            values, target_values)  #reduce loss
        self.policy_loss_tf = -c2 * tf.reduce_mean(
            policy_loss)  #increase expected advantages
        self.entropy_loss_tf = -c3 * tf.reduce_mean(
            entropy_bonus)  #increase entropy
        self.impossibility_loss_tf = c4 * tf.reduce_mean(impossibility_loss_tf)
        self.regularizer_tf = self.settings["nn_regularizer"] * tf.add_n(
            [tf.nn.l2_loss(v) for v in self.main_net.variables])
        self.loss_tf = self.value_loss_tf + self.policy_loss_tf + self.impossibility_loss_tf + self.entropy_loss_tf + self.regularizer_tf
        training_ops = self.settings["optimizer"](
            learning_rate=params['lr']).minimize(self.loss_tf)
        #Stats: we like stats.
        self.output_as_stats(action_entropy, name='entropy')
        self.output_as_stats(entropy_bonus,
                             name='entropy_bonus',
                             only_mean=True)
        self.output_as_stats(values, name='values')
        self.output_as_stats(target_values, name='target_values')
        self.output_as_stats(r_saturation,
                             name='clip_saturation',
                             only_mean=True)
        self.output_as_stats(advnorm.a_mean,
                             name='advantage_compressor',
                             only_mean=True)
        self.output_as_stats(advnorm.a_max,
                             name='advantage_compressor_max',
                             only_mean=True)
        self.output_as_stats(advnorm.a_saturation,
                             name='advantage_compressor_saturation',
                             only_mean=True)
        self.output_as_stats(self.loss_tf, name='tot_loss', only_mean=True)
        self.output_as_stats(self.value_loss_tf,
                             name='value_loss',
                             only_mean=True)
        self.output_as_stats(-self.policy_loss_tf,
                             name='policy_loss',
                             only_mean=True)
        self.output_as_stats(-self.entropy_loss_tf,
                             name='entropy_loss',
                             only_mean=True)
        self.output_as_stats(self.impossibility_loss_tf,
                             name='impossibility_loss',
                             only_mean=True)
        self.output_as_stats(self.regularizer_tf,
                             name='reg_loss',
                             only_mean=True)
        self.output_as_stats(params["entropy_loss"],
                             name='params/entropy_loss_weight',
                             only_mean=True)
        for param_name in params:
            self.output_as_stats(params[param_name],
                                 name='params/' + param_name,
                                 only_mean=True)
        return [training_ops, advnorm.update_op]
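The core of the objective above is the PPO-style clipped surrogate. Stripped of the masking and bookkeeping, the ratio clipping reduces to the following sketch (the epsilon and clip values are illustrative defaults):

import tensorflow as tf

def clipped_surrogate(probability, old_probs, advantages,
                      clip_param=0.2, e=1e-6):
    # Probability ratio r = pi_new / pi_old, guarded against zeros.
    r = tf.maximum(probability, e) / tf.maximum(old_probs, e)
    clipped_r = tf.clip_by_value(r, 1 - clip_param, 1 + clip_param)
    # Pessimistic (elementwise minimum) surrogate objective.
    return tf.minimum(r * advantages, clipped_r * advantages)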
Example #4
def build_bert_inputs(example):
  """Convert example <Tensor [30, 70]> into bert inputs."""
  k_size = FLAGS.k_size

  CLS_ID = tf.constant([101], dtype=tf.int64)  # pylint: disable=invalid-name
  SEP_ID = tf.constant([102], dtype=tf.int64)  # pylint: disable=invalid-name
  max_len = tf.constant([FLAGS.max_para_length])
  context_size = tf.constant([FLAGS.context_size])

  intermediate_examples_tensor = tf.reduce_sum(tf.abs(example), 1)
  examples_zero_vector = tf.zeros(shape=(1, 1), dtype=tf.int64)
  examples_bool_mask = tf.squeeze(
      tf.not_equal(intermediate_examples_tensor, examples_zero_vector))
  paragraph_len = tf.reduce_sum(tf.cast(examples_bool_mask, tf.int32))

  start = tf.random.uniform([1],
                            0,
                            tf.reshape(paragraph_len, []) -
                            tf.reshape(context_size, []) + 1,
                            dtype=tf.int32)

  # Slice the document into the before, after and context.
  # Discard the zero padding.
  sizes = tf.squeeze(
      tf.concat([[
          start, context_size, paragraph_len - context_size - start,
          max_len - paragraph_len
      ]], 0))
  before, context, after, _ = tf.split(example, sizes, axis=0)

  # Gather the context removing zero padding at end of sentences.
  non_zeros = tf.where(tf.not_equal(context, tf.zeros_like(context)))
  context_gathered = tf.gather_nd(context, non_zeros)

  # Flip before so we select the 4 sentences closest to target
  before = tf.reverse(before, axis=[0])

  # pad both to longer than needed
  paddings = tf.constant([[0, 8], [0, 0]])
  before = tf.pad(before, paddings)
  after = tf.pad(after, paddings)

  # Extend targets to 3 sentences
  # pad both
  before_minus_one = before[1:][:k_size]
  before_minus_two = before[2:][:k_size]
  after_plus_one = after[1:][:k_size]
  after_plus_two = after[2:][:k_size]
  before = before[:k_size]
  after = after[:k_size]

  before = tf.concat([before_minus_two, before_minus_one, before], axis=1)
  after = tf.concat([after, after_plus_one, after_plus_two], axis=1)
  ############################################################################

  # These 8 sentences are the 8 surrounding targets. Some are padding.
  targets = tf.concat([before, after], axis=0)

  # Remove the padding from the surrounding sentences
  # E.g. if context starts at beginning of paragraph, before is all padding
  intermediate_tensor = tf.reduce_sum(tf.abs(targets), 1)
  zero_vector = tf.zeros(shape=(1, 1), dtype=tf.int64)
  bool_mask = tf.squeeze(tf.not_equal(intermediate_tensor, zero_vector))
  bool_mask.set_shape([None])
  targets = tf.boolean_mask(targets, bool_mask)

  # Randomly select 4 targets
  # We will also select the label_types for each selected target
  indices = tf.range(0, limit=tf.shape(targets)[0], dtype=tf.int32)
  shuffled_indices = tf.random.shuffle(indices)[:k_size]

  targets = tf.gather(targets, shuffled_indices)
  if k_size == 4:
    full_labels = tf.concat([tf.range(3, -1, -1), tf.range(4, 8)], axis=0)
  elif k_size == 3:
    full_labels = tf.concat([tf.range(2, -1, -1), tf.range(3, 6)], axis=0)
  elif k_size == 2:
    full_labels = tf.concat([tf.range(1, -1, -1), tf.range(2, 4)], axis=0)
  elif k_size == 1:
    full_labels = tf.concat([tf.range(0, -1, -1), tf.range(1, 2)], axis=0)
  label_types = tf.boolean_mask(full_labels, bool_mask)
  label_types = tf.gather(label_types, shuffled_indices)

  # create inputs
  bert_inputs = []
  input_masks = []
  segment_ids = []

  # make context
  ctx_segment_id = tf.concat([
      tf.zeros_like(CLS_ID, dtype=tf.int64),
      tf.zeros_like(context_gathered),
      tf.zeros_like(SEP_ID, dtype=tf.int64)
  ],
                             axis=0)
  ctx_segment_id = pad_and_cut(ctx_segment_id, FLAGS.max_seq_length)
  segment_ids.append(ctx_segment_id)

  new_ctx_input = tf.concat([CLS_ID, context_gathered, SEP_ID], axis=0)
  ctx_input_mask = tf.ones_like(new_ctx_input)
  ctx_input_mask = pad_and_cut(ctx_input_mask, FLAGS.max_seq_length)
  input_masks.append(ctx_input_mask)
  padded_new_ctx_input = pad_and_cut(new_ctx_input, FLAGS.max_seq_length)
  bert_inputs.append(padded_new_ctx_input)

  for i in range(k_size):
    target_non_zero = tf.where(
        tf.not_equal(targets[i], tf.zeros_like(targets[i])))
    targets_stripped = tf.gather_nd(targets[i], target_non_zero)
    if FLAGS.include_context:
      segment_id = tf.concat([
          tf.zeros_like(CLS_ID, dtype=tf.int64),
          tf.zeros_like(context_gathered),
          tf.zeros_like(SEP_ID, dtype=tf.int64),
          tf.ones_like(targets_stripped),
          tf.ones_like(SEP_ID, dtype=tf.int64)
      ],
                             axis=0)
    else:
      segment_id = tf.concat([
          tf.zeros_like(CLS_ID, dtype=tf.int64),
          tf.zeros_like(targets_stripped),
          tf.zeros_like(SEP_ID, dtype=tf.int64)
      ],
                             axis=0)
    segment_id = pad_and_cut(segment_id, FLAGS.max_seq_length)
    segment_ids.append(segment_id)
    if FLAGS.include_context:
      new_input = tf.concat(
          [CLS_ID, context_gathered, SEP_ID, targets_stripped, SEP_ID], axis=0)
    else:
      new_input = tf.concat([CLS_ID, targets_stripped, SEP_ID], axis=0)
    input_mask = tf.ones_like(new_input)
    input_mask = pad_and_cut(input_mask, FLAGS.max_seq_length)
    input_masks.append(input_mask)
    padded_new_input = pad_and_cut(new_input, FLAGS.max_seq_length)
    bert_inputs.append(padded_new_input)
  bert_inputs = tf.stack(bert_inputs, axis=0)
  input_masks = tf.stack(input_masks, axis=0)
  segment_ids = tf.stack(segment_ids, axis=0)

  out = Outputs_And_Context(bert_inputs, input_masks, segment_ids, label_types,
                            context_gathered)

  return out
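pad_and_cut is used throughout this example but not shown. A plausible implementation (an assumption, not necessarily the original helper) right-pads a 1-D tensor with zeros up to max_len and truncates anything longer:

import tensorflow as tf

def pad_and_cut(tensor, max_len):
    # Assumed behaviour: right-pad with zeros, then cut to exactly max_len.
    pad_amount = tf.maximum(max_len - tf.shape(tensor)[0], 0)
    padded = tf.pad(tensor, [[0, pad_amount]])
    return padded[:max_len]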
Example #5
def detection_loss(cls_outputs, box_outputs, labels, params):
    """Computes total detection loss.

    Computes total detection loss including box and class loss from all levels.

    Args:
      cls_outputs: an OrderedDict with keys representing levels and values
        representing logits in [batch_size, height, width, num_anchors].
      box_outputs: an OrderedDict with keys representing levels and values
        representing box regression targets in [batch_size, height, width,
        num_anchors * 4].
      labels: the dictionary returned from the dataloader that includes
        groundtruth targets.
      params: the dictionary including training parameters specified in
        the default_hparams function in this file.

    Returns:
      total_loss: a float tensor representing total loss reduced from
        class and box losses from all levels.
      cls_loss: a float tensor representing total class loss.
      box_loss: a float tensor representing total box regression loss.
      box_iou_loss: a float tensor representing total box iou loss.
    """
    # Sum all positives in a batch for normalization and avoid zero
    # num_positives_sum, which would lead to inf loss during training
    num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0
    levels = cls_outputs.keys()

    cls_losses = []
    box_losses = []
    box_iou_losses = []
    for level in levels:
        if params['data_format'] == 'channels_first':
            labels['cls_targets_%d' % level] = tf.transpose(
                labels['cls_targets_%d' % level], [0, 3, 1, 2])
            labels['box_targets_%d' % level] = tf.transpose(
                labels['box_targets_%d' % level], [0, 3, 1, 2])
        # Onehot encoding for classification labels.
        cls_targets_at_level = tf.one_hot(labels['cls_targets_%d' % level],
                                          params['num_classes'])
        if params['data_format'] == 'channels_first':
            bs, _, width, height, _ = cls_targets_at_level.get_shape().as_list(
            )
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, -1, width, height])
        else:
            bs, width, height, _, _ = cls_targets_at_level.get_shape().as_list(
            )
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, width, height, -1])
        box_targets_at_level = labels['box_targets_%d' % level]
        cls_loss = _classification_loss(cls_outputs[level],
                                        cls_targets_at_level,
                                        num_positives_sum,
                                        alpha=params['alpha'],
                                        gamma=params['gamma'])
        if params['data_format'] == 'channels_first':
            cls_loss = tf.reshape(
                cls_loss, [bs, -1, width, height, params['num_classes']])
        else:
            cls_loss = tf.reshape(
                cls_loss, [bs, width, height, -1, params['num_classes']])
        cls_loss *= tf.cast(
            tf.expand_dims(tf.not_equal(labels['cls_targets_%d' % level], -2),
                           -1), tf.float32)
        cls_losses.append(tf.reduce_sum(cls_loss))
        box_losses.append(
            _box_loss(box_outputs[level],
                      box_targets_at_level,
                      num_positives_sum,
                      delta=params['delta']))
        if params['iou_loss_type']:
            box_iou_losses.append(
                _box_iou_loss(box_outputs[level], box_targets_at_level,
                              num_positives_sum, params['iou_loss_type']))

    # Sum per level losses to total loss.
    cls_loss = tf.add_n(cls_losses)
    box_loss = tf.add_n(box_losses)
    box_iou_loss = tf.add_n(box_iou_losses) if box_iou_losses else 0
    total_loss = (cls_loss + params['box_loss_weight'] * box_loss +
                  params['iou_loss_weight'] * box_iou_loss)
    return total_loss, cls_loss, box_loss, box_iou_loss
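_classification_loss above takes alpha and gamma arguments, which suggests a focal loss. A minimal per-element sketch under that assumption (not necessarily the repository's exact implementation):

import tensorflow as tf

def focal_loss(logits, targets, normalizer, alpha=0.25, gamma=1.5):
    # targets is a one-hot float tensor with the same shape as logits.
    positive_label_mask = tf.equal(targets, 1.0)
    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=targets, logits=logits)
    probs = tf.sigmoid(logits)
    probs_gt = tf.where(positive_label_mask, probs, 1.0 - probs)
    # Down-weight well-classified examples by (1 - p_t) ** gamma.
    modulator = tf.pow(1.0 - probs_gt, gamma)
    loss = modulator * cross_entropy
    weighted_loss = tf.where(positive_label_mask, alpha * loss,
                             (1.0 - alpha) * loss)
    return weighted_loss / normalizer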
Example #6
def add_distance_loss_to_center(labels, logits, groundtruth_coords):
    """Add distance loss function for ClickRegression."""
    weights = tf.to_int32(
        tf.not_equal(
            labels,
            model_input.dataset_descriptors[FLAGS.dataset].ignore_label))
    labels *= weights

    # Use GT box to get center if it exists. Less computation required.
    # Otherwise, calculate from label mask.
    if FLAGS.use_groundtruth_box:
        center_x = (groundtruth_coords['xmin'] +
                    groundtruth_coords['xmax']) / 2.0
        center_y = (groundtruth_coords['ymin'] +
                    groundtruth_coords['ymax']) / 2.0
        center = tf.stack([center_y, center_x], axis=1)
    else:
        # Make an array of coordinates (each row contains two coordinates)
        ii, jj = tf.meshgrid(tf.range(FLAGS.image_size),
                             tf.range(FLAGS.image_size),
                             indexing='ij')
        coords = tf.stack([tf.reshape(ii, (-1, )),
                           tf.reshape(jj, (-1, ))],
                          axis=-1)
        coords = tf.cast(coords, tf.int32)

        # Rearrange input into one vector per volume
        volumes_flat = tf.reshape(
            labels, [-1, FLAGS.image_size * FLAGS.image_size * 1, 1])
        # Compute total mass for each volume. Add ZERO_DIV_OFFSET to prevent division by 0
        total_mass = tf.cast(tf.reduce_sum(volumes_flat, axis=1),
                             tf.float32) + ZERO_DIV_OFFSET
        # Compute centre of mass
        center = tf.cast(tf.reduce_sum(volumes_flat * coords, axis=1),
                         tf.float32) / total_mass
        center = center / FLAGS.image_size

    # Normalize coordinates by size of image
    logits = logits / FLAGS.image_size

    # Calculate loss based on the distance metric specified
    # Loss added later in model_fn by tf.losses.get_total_loss()
    if FLAGS.distance_metric == 'mse':
        tf.losses.mean_squared_error(center, logits)
    elif FLAGS.distance_metric in [
            'euclidean', 'euclidean_sqrt', 'euclidean_iter'
    ]:
        distance_to_center = tf.sqrt(
            tf.reduce_sum(tf.square(logits - center), axis=-1) +
            ZERO_DIV_OFFSET)
        if FLAGS.ratio_box_distance:
            distance_to_box = calc_distance_to_edge(groundtruth_coords, logits)
            box_distance_to_center = (tf.to_float(distance_to_center) -
                                      distance_to_box)
            loss = distance_to_center / (box_distance_to_center +
                                         ZERO_DIV_OFFSET)
        else:
            loss = distance_to_center

        if FLAGS.distance_metric == 'euclidean_sqrt':
            loss = tf.sqrt(loss)
        if FLAGS.distance_metric == 'euclidean_iter':
            iter_num = tf.to_float(tf.train.get_or_create_global_step())
            step = (iter_num // FLAGS.euclidean_step) + 1.0
            loss = tf.pow(loss, tf.to_float(1.0 / step))
        tf.losses.compute_weighted_loss(loss)
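The centre-of-mass branch is easier to see on a toy input. A standalone sketch with a 4x4 binary mask in place of the label map:

import tensorflow as tf

mask = tf.constant([[0., 0., 0., 0.],
                    [0., 1., 1., 0.],
                    [0., 1., 1., 0.],
                    [0., 0., 0., 0.]])
size = 4
ii, jj = tf.meshgrid(tf.range(size), tf.range(size), indexing='ij')
coords = tf.cast(tf.stack([tf.reshape(ii, (-1,)), tf.reshape(jj, (-1,))],
                          axis=-1), tf.float32)
flat = tf.reshape(mask, [-1, 1])
total_mass = tf.reduce_sum(flat) + 1e-5
center = tf.reduce_sum(flat * coords, axis=0) / total_mass  # ~[1.5, 1.5]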
Example #7
    def map_fn(x):
        """Internal function to flat_map over.

    Consumes a batch of input examples and produces a variable number of output
    examples.

    Args:
      x: a single example
    Returns:
      a tf.data.Dataset
    """
        partial = empty_example.copy()
        i = tf.zeros([], dtype=tf.int32)
        first_key, *_ = keys
        dynamic_batch_size = tf.shape(x[first_key])[0]
        outputs = {}
        for k in keys:
            outputs[k] = tf.TensorArray(tf.int32,
                                        size=0,
                                        dynamic_size=True,
                                        element_shape=[length[k]])
            outputs[k + "_position"] = tf.TensorArray(
                tf.int32, size=0, dynamic_size=True, element_shape=[length[k]])

        def cond_fn(i, partial, outputs):
            del partial, outputs
            return i < dynamic_batch_size

        def body_fn(i, partial, outputs):
            """Body function for while_loop.

      Args:
        i: integer scalar
        partial: dictionary of Tensor (partially-constructed example)
        outputs: dictionary of TensorArray
      Returns:
        A triple containing the new values of the inputs.
      """
            can_append = True
            one_example = {}
            for k in keys:
                val = tf.cast(x[k][i], tf.int32)
                val = val[:tf.
                          reduce_sum(tf.cast(tf.not_equal(val, 0), tf.int32))]
                one_example[k] = val
            for k in keys:
                can_append = tf.logical_and(
                    can_append,
                    tf.less_equal(
                        tf.size(partial[k]) + tf.size(one_example[k]),
                        length[k]))

            def false_fn():
                return write_packed_example(partial, outputs)

            def true_fn():
                return partial, outputs

            partial, outputs = tf.cond(can_append, true_fn, false_fn)
            new_partial = {}
            for k in keys:
                new_seq = one_example[k][:length[k]]
                new_seq_len = tf.size(new_seq)
                new_partial[k] = tf.concat([partial[k], new_seq], 0)
                new_partial[k + "_position"] = tf.concat([
                    partial[k + "_position"],
                    tf.range(new_seq_len, dtype=tf.int32)
                ], 0)
            partial = new_partial
            return i + 1, partial, outputs

        i, partial, outputs = tf.while_loop(
            cond_fn,
            body_fn, (i, partial, outputs),
            back_prop=False,
            shape_invariants=(
                tf.TensorShape([]),
                {k: tf.TensorShape([None])
                 for k in keys_etc},
                {k: tf.TensorShape(None)
                 for k in keys_etc},
            ))
        partial, outputs = write_packed_example(partial, outputs)
        packed = {k: outputs[k].stack() for k in keys_etc}
        for k in keys:
            packed[k + "_segmentation"] = (tf.cumsum(
                tf.cast(tf.equal(packed[k + "_position"], 0), tf.int32),
                axis=1) * tf.cast(tf.not_equal(packed[k], 0), tf.int32))
        return packed
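The closing cumsum line is the packing trick that assigns segment ids: a new segment starts wherever the per-token position counter resets to 0. A tiny illustration:

import tensorflow as tf

positions = tf.constant([[0, 1, 2, 0, 1, 0]])   # three packed examples
tokens = tf.constant([[5, 6, 7, 8, 9, 0]])      # trailing 0 is padding
starts = tf.cast(tf.equal(positions, 0), tf.int32)
segmentation = tf.cumsum(starts, axis=1) * tf.cast(
    tf.not_equal(tokens, 0), tf.int32)
# segmentation == [[1, 1, 1, 2, 2, 0]]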
Example #8
    def call(self,
             input_tensor,
             label_ids,
             positions=None,
             label_weights=None,
             padding_token_id=None,
             mlm_is_entity_mask=None,
             mlm_is_not_entity_mask=None):
        """Get loss and log probs for the masked LM."""
        if padding_token_id is not None:
            pad_mask = tf.cast(tf.not_equal(label_ids, padding_token_id),
                               tf.float32)
        if label_weights is not None:
            if padding_token_id is not None:
                label_weights *= pad_mask
        else:
            if padding_token_id is not None:
                label_weights = pad_mask
            else:
                label_weights = tf.ones_like(label_ids, tf.float32)

        if positions is not None:
            input_tensor = gather_indexes(input_tensor, positions)
        else:
            input_tensor = tf.reshape(input_tensor, [-1, self.hidden_size])
        input_tensor.set_shape([None, self.hidden_size])

        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = self.linear_fn(input_tensor)
                input_tensor = self.layer_norm(input_tensor)

            logits = tf.matmul(input_tensor,
                               self.output_weights,
                               transpose_b=True)
            logits = tf.nn.bias_add(logits, self.output_bias)
            log_probs = tf.nn.log_softmax(logits, axis=-1)

            batch_size = tf.shape(label_ids)[0]
            mlm_labels_per_sample = tf.shape(label_ids)[1]
            label_ids_flattened = tf.reshape(label_ids, [-1])

            label_weights_flattened = tf.reshape(label_weights, [-1])

            one_hot_labels = tf.one_hot(label_ids_flattened,
                                        depth=self.vocab_size,
                                        dtype=tf.float32)

            # The `positions` tensor might be zero-padded (if the sequence is too
            # short to have the maximum number of predictions). The `label_weights`
            # tensor has a value of 1.0 for every real prediction and 0.0 for the
            # padding predictions.
            per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                              axis=[-1])
            mlm_predictions = tf.argmax(log_probs,
                                        axis=-1,
                                        output_type=tf.int32)
            loss = tf.reduce_sum(
                label_weights_flattened *
                per_example_loss) / (tf.reduce_sum(label_weights) + 1e-5)

            def weighted_sum_per_sample(values1, values2, weights):
                weights_per_sample = tf.reduce_sum(weights, 1)
                weights_denominator = weights_per_sample + 1e-5
                return (tf.reduce_sum(values1 * weights, 1) /
                        weights_denominator,
                        tf.reduce_sum(values2 * weights, 1) /
                        weights_denominator, weights_per_sample)

            mlm_loss = tf.reshape(per_example_loss,
                                  [batch_size, mlm_labels_per_sample])
            mlm_accuracy = tf.reshape(
                tf.cast(tf.equal(mlm_predictions, label_ids_flattened),
                        tf.float32), [batch_size, mlm_labels_per_sample])
            (mlm_loss_per_sample, mlm_accuracy_per_sample,
             mlm_weight_per_sample) = weighted_sum_per_sample(
                 mlm_loss, mlm_accuracy, label_weights)

            if mlm_is_entity_mask is not None:
                (mlm_loss_per_entity_sample, mlm_accuracy_per_entity_sample,
                 mlm_weight_per_entity_sample) = weighted_sum_per_sample(
                     mlm_loss, mlm_accuracy,
                     label_weights * mlm_is_entity_mask)
            else:
                mlm_loss_per_entity_sample = None
                mlm_accuracy_per_entity_sample = None
                mlm_weight_per_entity_sample = None

            if mlm_is_not_entity_mask is not None:
                (mlm_loss_per_non_entity_sample,
                 mlm_accuracy_per_non_entity_sample,
                 mlm_weight_per_non_entity_sample) = weighted_sum_per_sample(
                     mlm_loss, mlm_accuracy,
                     label_weights * mlm_is_not_entity_mask)
            else:
                mlm_loss_per_non_entity_sample = None
                mlm_accuracy_per_non_entity_sample = None
                mlm_weight_per_non_entity_sample = None

        return LanguageModelOutput(
            loss=loss,
            mlm_predictions=mlm_predictions,
            mlm_loss_per_sample=mlm_loss_per_sample,
            mlm_accuracy_per_sample=mlm_accuracy_per_sample,
            mlm_weight_per_sample=mlm_weight_per_sample,
            mlm_loss_per_entity_sample=mlm_loss_per_entity_sample,
            mlm_accuracy_per_entity_sample=mlm_accuracy_per_entity_sample,
            mlm_weight_per_entity_sample=mlm_weight_per_entity_sample,
            mlm_loss_per_non_entity_sample=mlm_loss_per_non_entity_sample,
            mlm_accuracy_per_non_entity_sample=
            mlm_accuracy_per_non_entity_sample,
            mlm_weight_per_non_entity_sample=mlm_weight_per_non_entity_sample)
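gather_indexes above appears to be the standard BERT-style helper that extracts the hidden states at the masked positions; a sketch assuming that behaviour:

import tensorflow as tf

def gather_indexes(sequence_tensor, positions):
    """Gathers the vectors at `positions` over a minibatch."""
    shape = tf.shape(sequence_tensor)
    batch_size, seq_length, width = shape[0], shape[1], shape[2]
    # Offset each row's positions so they index into the flattened batch.
    flat_offsets = tf.reshape(tf.range(batch_size) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence = tf.reshape(sequence_tensor, [-1, width])
    return tf.gather(flat_sequence, flat_positions)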
Example #9
    def __init__(self, item_num, args, reuse=None):
        self.args = args
        self.is_training = tf.placeholder(tf.bool, shape=())
        self.input_seq = tf.placeholder(tf.int32, shape=(None, args.maxlen))
        self.pos = tf.placeholder(tf.int32, shape=None)
        self.exemplar_logits = tf.placeholder(tf.float32, shape=(None, None))
        self.exemplar_pos = tf.placeholder(tf.int32, shape=None)
        self.max_item = tf.placeholder(tf.int32, shape=())
        self.lr = tf.placeholder(tf.float32, shape=())
        self.dropout_rate = tf.placeholder(tf.float32, shape=())
        pos = self.pos
        mask = tf.expand_dims(tf.to_float(tf.not_equal(self.input_seq, 0)), -1)

        with tf.variable_scope("SASRec", reuse=reuse):
            # sequence embedding, item embedding table
            self.seq, item_emb_table = embedding(self.input_seq,
                                                 vocab_size=item_num + 1,
                                                 num_units=args.hidden_units,
                                                 zero_pad=True,
                                                 scale=True,
                                                 l2_reg=args.l2_emb,
                                                 scope="input_embeddings",
                                                 with_t=True,
                                                 reuse=reuse
                                                 )

            # Positional Encoding
            t, pos_emb_table = embedding(
                tf.tile(tf.expand_dims(tf.range(tf.shape(self.input_seq)[1]), 0), [tf.shape(self.input_seq)[0], 1]),
                vocab_size=args.maxlen,
                num_units=args.hidden_units,
                zero_pad=False,
                scale=False,
                l2_reg=args.l2_emb,
                scope="dec_pos",
                reuse=reuse,
                with_t=True
            )
            self.seq += t

            # Dropout
            self.seq = tf.layers.dropout(self.seq,
                                         rate=self.dropout_rate,
                                         training=tf.convert_to_tensor(self.is_training),
                                         seed=args.random_seed)

            self.seq *= mask

            # Build blocks
            for i in range(args.num_blocks):
                with tf.variable_scope("num_blocks_%d" % i):
                    # Self-attention
                    self.seq = multihead_attention(queries=normalize(self.seq),
                                                   keys=self.seq,
                                                   num_units=args.hidden_units,
                                                   num_heads=args.num_heads,
                                                   dropout_rate=self.dropout_rate,
                                                   seed=args.random_seed,
                                                   is_training=self.is_training,
                                                   causality=True,
                                                   scope="self_attention")

                    # Feed forward
                    self.seq = feedforward(normalize(self.seq), num_units=[args.hidden_units, args.hidden_units],
                                           dropout_rate=self.dropout_rate, is_training=self.is_training,
                                           seed=args.random_seed)
                    self.seq *= mask

            self.seq = normalize(self.seq)

        # find representation
        self.rep = self.seq[:, -1, :]

        # define loss
        seq_emb = tf.reshape(self.rep, [tf.shape(self.input_seq)[0], args.hidden_units])
        indices = pos - 1
        self.labels = tf.one_hot(indices, self.max_item)
        item_emb = tf.nn.embedding_lookup(item_emb_table, tf.range(1, self.max_item + 1))
        self.logits = tf.matmul(seq_emb, tf.transpose(item_emb))
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=self.logits))

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

        # prediction
        self.test_item = tf.placeholder(tf.int32, shape=None)
        self.test_item_emb = tf.nn.embedding_lookup(item_emb_table, self.test_item)
        self.test_logits = tf.matmul(seq_emb, tf.transpose(self.test_item_emb))
        self.test_logits = tf.reshape(self.test_logits, [tf.shape(self.input_seq)[0], tf.shape(self.test_item)[0]])
        self.pred_last = tf.argsort(tf.argsort(-self.test_logits))
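The last line relies on the double-argsort trick: the argsort of an argsort gives each element's rank, so pred_last holds the rank of every test item per sequence. A tiny illustration:

import tensorflow as tf

logits = tf.constant([[0.1, 0.9, 0.4]])
ranks = tf.argsort(tf.argsort(-logits))
# ranks == [[2, 0, 1]]: the highest logit gets rank 0.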
Example #10
def c(i, j, k):
    return tf.equal(
        tf.not_equal(tf.less(i + j, 10), tf.less(j * k, 100)),
        tf.greater_equal(k, i + j))
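A quick sketch of how a boolean predicate like c plugs into tf.while_loop as the loop condition; the body function and start values below are purely illustrative:

import tensorflow as tf

def count_up(i, j, k):
    # Illustrative body: advance the three counters at different rates.
    return i + 1, j + 2, k + 3

i, j, k = tf.while_loop(c, count_up,
                        [tf.constant(0), tf.constant(5), tf.constant(30)])
# For these start values the condition holds twice, ending at (2, 9, 36).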
Example #11
    def body(self,
             features,
             decode_step=None,
             cache=None,
             decoding_stats=None,
             add_summary=True):
        encoder_output = None
        extra_losses = []
        padding_bias = None
        if not self.hparams.fast_decode:
            decode_step = None
        if "inputs" in features:
            inputs = features["inputs"]
            # remove the last two dimensions that are always 1.
            inputs = tf.reshape(
                inputs,
                utils.shape_list(inputs)[:2] + [self.hidden_size])
            # Padding bias only used for seq2seq models.
            padding_bias = utils.embedding_to_padding(inputs)
            # Mask random positions
            shape = utils.shape_list(inputs)
            if self.hparams.input_dropout:
                inputs = tf.where(
                    tf.random.uniform(shape) < self.hparams.input_dropout,
                    tf.zeros_like(inputs), inputs)
            if self.hparams.add_timing_signal:
                inputs += utils.get_timing_signal_1d(self.hparams.max_length,
                                                     self.hidden_size)
            if cache is not None and -1 in cache:
                encoder_output = cache[-1]
            else:
                encoder_output = utils.transformer_encoder_layers(
                    inputs=inputs,
                    num_layers=self.num_encoder_layers,
                    hparams=self.hparams,
                    losses=extra_losses,
                    name="encoder",
                    token_bias=features.get("token_bias_inputs"),
                    padding_bias=padding_bias)
            if cache is not None and -1 not in cache:
                cache[-1] = encoder_output
        targets = tf.to_int32(features["targets"])
        # remove the last two dimensions that are always 1.
        targets = tf.reshape(targets, utils.shape_list(targets)[:2])
        # Clamp targets to max_target_length
        targets = targets[:, :self.hparams.max_target_length]
        if self.is_decode:
            targets = self.process_partial_targets_decoding(targets)
        decoder_input = self.prepare_decoder(targets)

        decoder_output = utils.transformer_decoder_layers(
            inputs=decoder_input,
            num_layers=self.num_decoder_layers,
            hparams=self.hparams,
            encoder_output=encoder_output,
            decode_step=decode_step,
            losses=extra_losses,
            cache=cache,
            name="decoder",
            decoding_stats=decoding_stats,
            token_bias_inputs=features.get("token_bias_inputs"),
            token_bias_targets=features.get("token_bias_targets"),
            padding_bias=padding_bias)
        logits = self.produce_output(decoder_output)

        # Return logits as-is in decoding mode
        if self.is_decode:
            return logits

        # Add cross entropy loss
        one_hot_targets = tf.one_hot(tf.cast(targets, dtype=tf.int32),
                                     self.vocab_size)
        x_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=one_hot_targets, logits=logits)
        weights = tf.to_float(tf.not_equal(targets, 0))
        loss = tf.reduce_sum(x_entropy * weights) / tf.reduce_sum(weights)
        if add_summary:
            tf.summary.scalar("losses/weight", tf.reduce_sum(weights))
            tf.summary.scalar("losses/x_entropy",
                              tf.reduce_sum(x_entropy * weights))

        loss_dict = {"training": loss}
        if extra_losses:
            loss_dict["extra_loss"] = tf.add_n(extra_losses)
        # hack for T2T metrics
        logits = tf.reshape(
            logits,
            utils.shape_list(logits)[:2] + [1, 1] +
            utils.shape_list(logits)[-1:])
        return logits, loss_dict
Example #12
    def compute_loss(self, y_true, y_pred):
        """Compute mutlibox loss.

        # Arguments
            y_true: Ground truth targets,
                tensor of shape (?, num_boxes, 4 + num_classes + 8),
                priors in ground truth are fictitious,
                y_true[:, :, -8] has 1 if prior should be penalized
                    or in other words is assigned to some ground truth box,
                y_true[:, :, -7:] are all 0.
            y_pred: Predicted logits,
                tensor of shape (?, num_boxes, 4 + num_classes + 8).

        # Returns
            loss: Loss for prediction, tensor of shape (?,).
        """
        batch_size = tf.shape(y_true)[0]
        num_boxes = tf.to_float(tf.shape(y_true)[1])

        # loss for all priors
        conf_loss = self._softmax_loss(y_true[:, :, 4:-8], y_pred[:, :, 4:-8])
        loc_loss = self._l1_smooth_loss(y_true[:, :, :4], y_pred[:, :, :4])

        # get positives loss
        num_pos = tf.reduce_sum(y_true[:, :, -8], axis=-1)
        pos_loc_loss = tf.reduce_sum(loc_loss * y_true[:, :, -8], axis=1)
        pos_conf_loss = tf.reduce_sum(conf_loss * y_true[:, :, -8], axis=1)

        # get negatives loss, we penalize only confidence here
        num_neg = tf.minimum(self.neg_pos_ratio * num_pos, num_boxes - num_pos)
        pos_num_neg_mask = tf.greater(num_neg, 0)
        has_min = tf.to_float(tf.reduce_any(pos_num_neg_mask))
        num_neg = tf.concat(
            axis=0,
            values=[num_neg, [(1 - has_min) * self.negatives_for_hard]])
        num_neg_batch = tf.reduce_min(
            tf.boolean_mask(num_neg, tf.greater(num_neg, 0)))
        num_neg_batch = tf.to_int32(num_neg_batch)
        confs_start = 4 + self.background_label_id + 1
        confs_end = confs_start + self.num_classes - 1
        max_confs = tf.reduce_max(y_pred[:, :, confs_start:confs_end], axis=2)
        _, indices = tf.nn.top_k(max_confs * (1 - y_true[:, :, -8]),
                                 k=num_neg_batch)
        batch_idx = tf.expand_dims(tf.range(0, batch_size), 1)
        batch_idx = tf.tile(batch_idx, (1, num_neg_batch))
        full_indices = (tf.reshape(batch_idx, [-1]) * tf.to_int32(num_boxes) +
                        tf.reshape(indices, [-1]))
        # full_indices = tf.concat(2, [tf.expand_dims(batch_idx, 2),
        #                              tf.expand_dims(indices, 2)])
        # neg_conf_loss = tf.gather_nd(conf_loss, full_indices)
        neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]), full_indices)
        neg_conf_loss = tf.reshape(neg_conf_loss, [batch_size, num_neg_batch])
        neg_conf_loss = tf.reduce_sum(neg_conf_loss, axis=1)

        # loss is sum of positives and negatives
        total_loss = pos_conf_loss + neg_conf_loss
        total_loss /= (num_pos + tf.to_float(num_neg_batch))
        num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos,
                           tf.ones_like(num_pos))
        total_loss += (self.alpha * pos_loc_loss) / num_pos
        return total_loss
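_l1_smooth_loss above is presumably the usual smooth-L1 (Huber) box loss from SSD; a minimal sketch under that assumption:

import tensorflow as tf

def l1_smooth_loss(y_true, y_pred):
    abs_loss = tf.abs(y_true - y_pred)
    sq_loss = 0.5 * tf.square(y_true - y_pred)
    # Quadratic near zero, linear beyond |error| = 1.
    l1_loss = tf.where(tf.less(abs_loss, 1.0), sq_loss, abs_loss - 0.5)
    return tf.reduce_sum(l1_loss, -1)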
Example #13
def filter_random_lighting(sequence_dir):
    sequence_name = tf.string_split([sequence_dir], '/').values[-1]
    lighting = tf.substr(sequence_name, 0, 6)
    return tf.not_equal(lighting, 'random')
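A predicate like filter_random_lighting is typically handed to tf.data.Dataset.filter; a small usage sketch with invented directory names:

import tensorflow as tf

sequence_dirs = tf.data.Dataset.from_tensor_slices(
    ['/data/random_01', '/data/office_02'])
filtered = sequence_dirs.filter(filter_random_lighting)
# Keeps only '/data/office_02'.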
Example #14
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    logging.info("*** Model: Params ***")
    for name in sorted(params.keys()):
      logging.info("  %s = %s", name, params[name])
    logging.info("*** Model: Features ***")
    for name in sorted(features.keys()):
      logging.info("  name = %s, shape = %s", name, features[name].shape)

    model = modeling.ReadItTwiceBertModel(
        config=model_config, use_one_hot_embeddings=use_one_hot_embeddings)

    span_prediction_layer = modeling.SpanPredictionHead(
        intermediate_size=model_config.intermediate_size,
        dropout_rate=model_config.hidden_dropout_prob)

    # [batch_size, main_seq_length]
    token_ids = features["token_ids"]
    main_seq_length = tf.shape(token_ids)[1]
    block_ids = features["block_ids"]
    block_pos = features["block_pos"]
    answer_type = features["answer_type"]
    supporting_fact = features["is_supporting_fact"]

    annotation_begins = features.get("entity_annotation_begins")
    annotation_ends = features.get("entity_annotation_ends")
    annotation_labels = features.get("entity_annotation_labels")

    # Do not attend to padding tokens
    # [batch_size, main_seq_length, main_seq_length]
    att_mask = tf.tile(
        tf.expand_dims(tf.not_equal(token_ids, padding_token_id), 1),
        [1, main_seq_length, 1])
    att_mask = tf.cast(att_mask, dtype=tf.int32)

    main_output = model(
        token_ids=token_ids,
        training=(mode == tf.estimator.ModeKeys.TRAIN),
        block_ids=block_ids,
        block_pos=block_pos,
        att_mask=att_mask,
        annotation_begins=annotation_begins,
        annotation_ends=annotation_ends,
        annotation_labels=annotation_labels,
        enable_side_inputs=enable_side_inputs,
        num_replicas_concat=num_replicas_concat,
        cross_block_attention_mode=cross_block_attention_mode)

    span_logits = span_prediction_layer(
        hidden_states=main_output.final_hidden_states,
        token_ids=token_ids,
        padding_token_id=padding_token_id,
        ignore_prefix_length=features["prefix_length"],
        training=(mode == tf.estimator.ModeKeys.TRAIN))

    # The "pooler" converts the encoded sequence tensor of shape
    # [batch_size, seq_length, hidden_size] to a tensor of shape
    # [batch_size, hidden_size]. This is necessary for segment-level
    # (or segment-pair-level) classification tasks where we need a fixed
    # dimensional representation of the segment.
    with tf.variable_scope("pooler"):
      # We "pool" the model by simply taking the hidden state corresponding
      # to the first token. We assume that this has been pre-trained.
      first_token_tensor = tf.squeeze(
          main_output.final_hidden_states[:, 0:1, :], axis=1)
      pooled_output = tf.layers.dense(
          first_token_tensor,
          model_config.hidden_size,
          activation=tf.tanh,
          kernel_initializer=tf.truncated_normal_initializer(
              stddev=model_config.initializer_range))

    yesno_logits = yesno_model(pooled_output)
    supporting_fact_logits = supporting_fact_model(pooled_output)

    tvars = tf.trainable_variables()

    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = checkpoint_utils.get_assignment_map_from_checkpoint(
          tvars, init_checkpoint)
      if use_tpu:

        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
      else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                   init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
      host_inputs = dict()

      span_prediction_loss = losses.BatchSpanCrossEntropyLoss()

      total_loss = 0
      qa_loss = span_prediction_loss(
          logits=span_logits,
          annotation_begins=features["answer_annotation_begins"],
          annotation_ends=features["answer_annotation_ends"],
          annotation_labels=features["answer_annotation_labels"],
          block_ids=block_ids,
          num_replicas=num_replicas_concat,
          eps=1e-5)
      host_inputs["train_metrics/qa_loss"] = tf.expand_dims(qa_loss, 0)
      total_loss += qa_loss

      # example_mask = tf.cast(tf.not_equal(block_ids, 0), tf.float32)
      # yesno_loss = compute_pooled_loss(yesno_logits, answer_type, 3,
      #                                  example_mask)
      # supporting_fact_loss = compute_supporting_facts_loss(
      #     supporting_fact_logits, supporting_fact, example_mask)
      hotpot_qa_loss = hotpot_qa_losses.BatchSpanCrossEntropyLoss()
      yesno_loss, supporting_fact_loss = hotpot_qa_loss(
          yesno_logits,
          answer_type,
          supporting_fact_logits,
          supporting_fact,
          block_ids,
          eps=1e-5)

      host_inputs["train_metrics/yesno_loss"] = tf.expand_dims(yesno_loss, 0)
      total_loss += yesno_loss

      host_inputs["train_metrics/supporting_fact_loss"] = tf.expand_dims(
          supporting_fact_loss, 0)
      total_loss += supporting_fact_loss

      # Add regularization losses.
      if model.losses:
        total_loss += tf.math.add_n(model.losses)

      train_op = optimization.create_optimizer(
          total_loss,
          learning_rate,
          num_train_steps,
          num_warmup_steps,
          use_tpu,
          optimizer,
          poly_power,
          start_warmup_step,
          learning_rate_schedule,
          reduce_loss_sum=True)

      host_inputs.update({
          "global_step":
              tf.expand_dims(tf.train.get_or_create_global_step(), 0),
          "train_metrics/loss":
              tf.expand_dims(total_loss, 0),
      })

      host_call = (functools.partial(
          record_summary_host_fn,
          metrics_dir=os.path.join(FLAGS.output_dir,
                                   "train_metrics")), host_inputs)

      output_spec = tf.estimator.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          scaffold_fn=scaffold_fn,
          host_call=host_call)
    elif mode == tf.estimator.ModeKeys.PREDICT:
      begin_logits_values, begin_logits_indices = tf.math.top_k(
          span_logits[:, :, 0],
          k=nbest_logits_for_eval,
      )
      end_logits_values, end_logits_indices = tf.math.top_k(
          span_logits[:, :, 1],
          k=nbest_logits_for_eval,
      )

      predictions = {
          "block_ids": tf.identity(block_ids),
          "begin_logits_values": begin_logits_values,
          "begin_logits_indices": begin_logits_indices,
          "end_logits_values": end_logits_values,
          "end_logits_indices": end_logits_indices,
          "token_ids": tf.identity(token_ids),
          "answer_type": answer_type,
          "yesno_logits": yesno_logits,
          "supporting_fact_logits": supporting_fact_logits,
          "is_supporting_fact": supporting_fact,
      }
      output_spec = tf.estimator.tpu.TPUEstimatorSpec(
          mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    else:
      raise ValueError("Only TRAIN and PREDICT modes is supported: %s" % mode)

    return output_spec
Example #15
    def build_graph(self, image, edgemap):
        image = image - tf.constant([104, 116, 122], dtype='float32')
        image = tf.transpose(image, [0, 3, 1, 2])
        edgemap = tf.expand_dims(edgemap, 3, name='edgemap4d')

        def branch(name, l, up):
            with tf.variable_scope(name):
                l = Conv2D('convfc',
                           l,
                           1,
                           kernel_size=1,
                           activation=tf.identity,
                           use_bias=True,
                           kernel_initializer=tf.constant_initializer())
                while up != 1:
                    l = CaffeBilinearUpSample('upsample{}'.format(up), l, 2)
                    up = up // 2
                return l

        with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu), \
                argscope([Conv2D, MaxPooling], data_format='NCHW'):
            l = Conv2D('conv1_1', image, 64)
            l = Conv2D('conv1_2', l, 64)
            b1 = branch('branch1', l, 1)
            l = MaxPooling('pool1', l, 2)

            l = Conv2D('conv2_1', l, 128)
            l = Conv2D('conv2_2', l, 128)
            b2 = branch('branch2', l, 2)
            l = MaxPooling('pool2', l, 2)

            l = Conv2D('conv3_1', l, 256)
            l = Conv2D('conv3_2', l, 256)
            l = Conv2D('conv3_3', l, 256)
            b3 = branch('branch3', l, 4)
            l = MaxPooling('pool3', l, 2)

            l = Conv2D('conv4_1', l, 512)
            l = Conv2D('conv4_2', l, 512)
            l = Conv2D('conv4_3', l, 512)
            b4 = branch('branch4', l, 8)
            l = MaxPooling('pool4', l, 2)

            l = Conv2D('conv5_1', l, 512)
            l = Conv2D('conv5_2', l, 512)
            l = Conv2D('conv5_3', l, 512)
            b5 = branch('branch5', l, 16)

            final_map = Conv2D('convfcweight',
                               tf.concat([b1, b2, b3, b4, b5], 1),
                               1,
                               kernel_size=1,
                               kernel_initializer=tf.constant_initializer(0.2),
                               use_bias=False,
                               activation=tf.identity)
        costs = []
        for idx, b in enumerate([b1, b2, b3, b4, b5, final_map]):
            b = tf.transpose(b, [0, 2, 3, 1])
            output = tf.nn.sigmoid(b, name='output{}'.format(idx + 1))
            xentropy = class_balanced_sigmoid_cross_entropy(
                b, edgemap, name='xentropy{}'.format(idx + 1))
            costs.append(xentropy)

        # some magic threshold
        pred = tf.cast(tf.greater(output, 0.5), tf.int32, name='prediction')
        wrong = tf.cast(tf.not_equal(pred, edgemap), tf.float32)
        wrong = tf.reduce_mean(wrong, name='train_error')

        wd_w = tf.train.exponential_decay(2e-4, get_global_step_var(), 80000,
                                          0.7, True)
        wd_cost = tf.multiply(wd_w,
                              regularize_cost('.*/W', tf.nn.l2_loss),
                              name='wd_cost')
        costs.append(wd_cost)

        add_param_summary(('.*/W', ['histogram']))  # monitor W
        total_cost = tf.add_n(costs, name='cost')
        add_moving_summary(wrong, total_cost, *costs)
        return total_cost
Example #16
0
    def parse_train_data(self, data):
        """Parse data for ShapeMask training."""
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        masks = data['groundtruth_instance_masks']
        is_crowds = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtruths = tf.shape(classes)[0]
            with tf.control_dependencies([num_groundtruths, is_crowds]):
                indices = tf.cond(
                    tf.greater(tf.size(is_crowds), 0),
                    lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)
            masks = tf.gather(masks, indices)

        # If not using category, collapses all positive class ids into a
        # single foreground class (1).
        if not self._use_category:
            classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

        image = self.get_normalized_image(data)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            image, boxes, masks = input_utils.random_horizontal_flip(
                image, boxes, masks)

        # Converts boxes from normalized coordinates to pixel coordinates.
        image_shape = tf.shape(image)[0:2]
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            self._output_size,
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        self._train_image_scale = image_info[2, :]
        self._train_offset = image_info[3, :]

        # Resizes and crops boxes and masks.
        boxes = input_utils.resize_and_crop_boxes(boxes,
                                                  self._train_image_scale,
                                                  image_info[1, :],
                                                  self._train_offset)

        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)
        masks = tf.gather(masks, indices)

        # Assigns anchors.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size, self._output_size)
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        # Sample groundtruth masks/boxes/classes for mask branch.
        num_masks = tf.shape(masks)[0]
        mask_shape = tf.shape(masks)[1:3]

        # Pad sampled boxes/masks/classes to a constant batch size.
        padded_boxes = input_utils.pad_to_fixed_size(boxes,
                                                     self._num_sampled_masks)
        padded_classes = input_utils.pad_to_fixed_size(classes,
                                                       self._num_sampled_masks)
        padded_masks = input_utils.pad_to_fixed_size(masks,
                                                     self._num_sampled_masks)

        # Randomly sample groundtruth masks for mask branch training. For images
        # without groundtruth masks, the dummy padded tensors are sampled.
        rand_indices = tf.random.shuffle(
            tf.range(tf.maximum(num_masks, self._num_sampled_masks)))
        rand_indices = tf.mod(rand_indices, tf.maximum(num_masks, 1))
        rand_indices = rand_indices[0:self._num_sampled_masks]
        rand_indices = tf.reshape(rand_indices, [self._num_sampled_masks])

        sampled_boxes = tf.gather(padded_boxes, rand_indices)
        sampled_classes = tf.gather(padded_classes, rand_indices)
        sampled_masks = tf.gather(padded_masks, rand_indices)
        # Jitter the sampled boxes to mimic the noisy detections.
        sampled_boxes = box_utils.jitter_boxes(
            sampled_boxes, noise_scale=self._box_jitter_scale)
        sampled_boxes = box_utils.clip_boxes(sampled_boxes, self._output_size)
        # Compute mask targets in feature crop. A feature crop fully contains a
        # sampled box.
        mask_outer_boxes = box_utils.compute_outer_boxes(
            sampled_boxes, tf.shape(image)[0:2], scale=self._outer_box_scale)
        mask_outer_boxes = box_utils.clip_boxes(mask_outer_boxes,
                                                self._output_size)
        # Compensate for the offset of mask_outer_boxes to map it back to the
        # original image scale.
        mask_outer_boxes_ori = mask_outer_boxes
        mask_outer_boxes_ori += tf.tile(
            tf.expand_dims(self._train_offset, axis=0), [1, 2])
        mask_outer_boxes_ori /= tf.tile(
            tf.expand_dims(self._train_image_scale, axis=0), [1, 2])
        norm_mask_outer_boxes_ori = box_utils.normalize_boxes(
            mask_outer_boxes_ori, mask_shape)

        # Set sampled_masks shape to [batch_size, height, width, 1].
        sampled_masks = tf.cast(tf.expand_dims(sampled_masks, axis=-1),
                                tf.float32)
        mask_targets = tf.image.crop_and_resize(
            sampled_masks,
            norm_mask_outer_boxes_ori,
            box_ind=tf.range(self._num_sampled_masks),
            crop_size=[self._mask_crop_size, self._mask_crop_size],
            method='bilinear',
            extrapolation_value=0,
            name='train_mask_targets')
        mask_targets = tf.where(tf.greater_equal(mask_targets, 0.5),
                                tf.ones_like(mask_targets),
                                tf.zeros_like(mask_targets))
        mask_targets = tf.squeeze(mask_targets, axis=-1)
        if self._up_sample_factor > 1:
            fine_mask_targets = tf.image.crop_and_resize(
                sampled_masks,
                norm_mask_outer_boxes_ori,
                box_ind=tf.range(self._num_sampled_masks),
                crop_size=[
                    self._mask_crop_size * self._up_sample_factor,
                    self._mask_crop_size * self._up_sample_factor
                ],
                method='bilinear',
                extrapolation_value=0,
                name='train_mask_targets')
            fine_mask_targets = tf.where(
                tf.greater_equal(fine_mask_targets, 0.5),
                tf.ones_like(fine_mask_targets),
                tf.zeros_like(fine_mask_targets))
            fine_mask_targets = tf.squeeze(fine_mask_targets, axis=-1)
        else:
            fine_mask_targets = mask_targets

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        valid_image = tf.cast(tf.not_equal(num_masks, 0), tf.int32)
        if self._mask_train_class == 'all':
            mask_is_valid = valid_image * tf.ones_like(sampled_classes,
                                                       tf.int32)
        else:
            # Get the intersection of sampled classes with training splits.
            mask_valid_classes = tf.cast(
                tf.expand_dims(
                    class_utils.coco_split_class_ids(self._mask_train_class),
                    1), sampled_classes.dtype)
            match = tf.reduce_any(
                tf.equal(tf.expand_dims(sampled_classes, 0),
                         mask_valid_classes), 0)
            mask_is_valid = valid_image * tf.cast(match, tf.int32)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': input_anchor.multilevel_boxes,
            'num_positives': num_positives,
            'image_info': image_info,
            # For ShapeMask.
            'mask_boxes': sampled_boxes,
            'mask_outer_boxes': mask_outer_boxes,
            'mask_targets': mask_targets,
            'fine_mask_targets': fine_mask_targets,
            'mask_classes': sampled_classes,
            'mask_is_valid': mask_is_valid,
        }
        return image, labels
Example #17
0
    def call(self,
             item_states,
             item_ids,
             global_item_states,
             global_item_ids,
             labels_mask=None,
             labels_weight=None):
        """Calls the layer.

    Args:
      item_states: <float32>[batch_size, hidden_size]
      item_ids: <int32>[batch_size]
      global_item_states: <float32>[global_batch_size, hidden_size]
      global_item_ids: <int32>[global_batch_size]
      labels_mask: <bool>[batch_size, global_batch_size]
      labels_weight: <float32>[batch_size, global_batch_size]

    Returns:
        total_loss: <float>
    """
        # [batch_size, 1]
        item_ids_expanded = tf.expand_dims(item_ids, 1)
        # [1, global_batch_size]
        global_item_ids_expanded = tf.expand_dims(global_item_ids, 0)

        # Positive labels when IDs are the same
        # [batch_size, global_batch_size]
        labels = tf.equal(item_ids_expanded, global_item_ids_expanded)
        if labels_mask is not None:
            labels = tf.logical_and(labels, labels_mask)

        # In two cases the loss is ignored (labels_weight is 0):
        # (1) either of the IDs is the padding ID;
        # (2) a sample is compared against itself.
        both_ids_are_not_padding = tf.logical_and(
            tf.not_equal(item_ids_expanded, self.padding_id),
            tf.not_equal(global_item_ids_expanded, self.padding_id))
        if labels_weight is None:
            labels_weight = tf.cast(both_ids_are_not_padding, tf.float32)
        else:
            labels_weight = labels_weight * tf.cast(both_ids_are_not_padding,
                                                    tf.float32)
        # Hacky way to tell if samples are exactly the same --
        # their IDs are the same and their states are approximately the same.
        samples_are_the_same = tf.logical_and(
            tf.less(
                tf.norm(tf.expand_dims(item_states, 1) -
                        tf.expand_dims(global_item_states, 0),
                        axis=2), 1e-5), labels)
        # [batch_size, global_batch_size]
        labels_weight = (labels_weight *
                         (1 - tf.cast(samples_are_the_same, tf.float32)))

        # [batch_size, global_batch_size]
        labels = tf.stop_gradient(tf.cast(labels, tf.float32))
        labels_weight = tf.stop_gradient(tf.cast(labels_weight, tf.float32))

        if self.apply_linear_layer:
            item_states = self.linear_fn(item_states)

        # [batch_size, global_batch_size]
        logits = tf.matmul(item_states, global_item_states, transpose_b=True)
        logits += self.bias_term

        # [batch_size, global_batch_size]
        loss_per_sample = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=labels, logits=logits)
        loss_per_sample *= labels_weight
        # Compute a weighted mean over the global axis; a plain sum would
        # grow with global_batch_size and make the loss too large.
        loss_per_sample = tf.reduce_sum(loss_per_sample, 1)
        loss_per_sample /= (tf.reduce_sum(labels_weight, 1) + 1e-5)
        return tf.reduce_sum(loss_per_sample)
Example #18
0
def compute_mel_filterbank_features(waveforms,
                                    sample_rate=16000,
                                    dither=1.0 / np.iinfo(np.int16).max,
                                    preemphasis=0.97,
                                    frame_length=25,
                                    frame_step=10,
                                    fft_length=None,
                                    window_fn=functools.partial(
                                        tf.signal.hann_window, periodic=True),
                                    lower_edge_hertz=80.0,
                                    upper_edge_hertz=7600.0,
                                    num_mel_bins=80,
                                    log_noise_floor=1e-3,
                                    apply_mask=True):
    """Implement mel-filterbank extraction using tf ops.

  Args:
    waveforms: float32 tensor with shape [batch_size, max_len]
    sample_rate: sampling rate of the waveform
    dither: stddev of Gaussian noise added to waveform to prevent quantization
      artefacts
    preemphasis: waveform high-pass filtering constant
    frame_length: frame length in ms
    frame_step: frame step in ms
    fft_length: number of fft bins
    window_fn: windowing function
    lower_edge_hertz: lowest frequency of the filterbank
    upper_edge_hertz: highest frequency of the filterbank
    num_mel_bins: filterbank size
    log_noise_floor: clip small values to this floor to keep the log finite
    apply_mask: When working on a batch of samples, set padding frames to zero
  Returns:
    filterbanks: a float32 tensor with shape [batch_size, len, num_bins, 1]
  """
    # `stfts` is a complex64 Tensor representing the short-time Fourier
    # Transform of each signal in `signals`. Its shape is
    # [batch_size, ?, fft_unique_bins]
    # where fft_unique_bins = fft_length // 2 + 1

    # Find each waveform's length: one plus the largest index whose value is
    # != 0. Note that samples that are exactly 0.0 are quite common inside a
    # waveform, so simply doing sum(waveforms != 0, axis=-1) would not work.
    wav_lens = tf.reduce_max(
        tf.expand_dims(tf.range(tf.shape(waveforms)[1]), 0) *
        tf.to_int32(tf.not_equal(waveforms, 0.0)),
        axis=-1) + 1
    if dither > 0:
        waveforms += tf.random_normal(tf.shape(waveforms), stddev=dither)
    if preemphasis > 0:
        waveforms = waveforms[:, 1:] - preemphasis * waveforms[:, :-1]
        wav_lens -= 1
    frame_length = int(frame_length * sample_rate / 1e3)
    frame_step = int(frame_step * sample_rate / 1e3)
    if fft_length is None:
        fft_length = int(2**(np.ceil(np.log2(frame_length))))

    stfts = tf.contrib.signal.stft(waveforms,
                                   frame_length=frame_length,
                                   frame_step=frame_step,
                                   fft_length=fft_length,
                                   window_fn=window_fn,
                                   pad_end=True)

    stft_lens = (wav_lens + (frame_step - 1)) // frame_step
    masks = tf.to_float(
        tf.less_equal(tf.expand_dims(tf.range(tf.shape(stfts)[1]), 0),
                      tf.expand_dims(stft_lens, 1)))

    # An energy spectrogram is the magnitude of the complex-valued STFT.
    # A float32 Tensor of shape [batch_size, ?, 257].
    magnitude_spectrograms = tf.abs(stfts)

    # Warp the linear-scale, magnitude spectrograms into the mel-scale.
    num_spectrogram_bins = magnitude_spectrograms.shape[-1].value
    linear_to_mel_weight_matrix = (
        tf.contrib.signal.linear_to_mel_weight_matrix(num_mel_bins,
                                                      num_spectrogram_bins,
                                                      sample_rate,
                                                      lower_edge_hertz,
                                                      upper_edge_hertz))
    mel_spectrograms = tf.tensordot(magnitude_spectrograms,
                                    linear_to_mel_weight_matrix, 1)
    # Note: Shape inference for tensordot does not currently handle this case.
    mel_spectrograms.set_shape(magnitude_spectrograms.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))

    log_mel_sgram = tf.log(tf.maximum(log_noise_floor, mel_spectrograms))

    if apply_mask:
        log_mel_sgram *= tf.expand_dims(tf.to_float(masks), -1)

    return tf.expand_dims(log_mel_sgram, -1, name="mel_sgrams")
Example #19
0
def prepare_encoder_input(features,
                          hparams,
                          embed_scope=None,
                          embed_token_fn=common_embed.embed_tokens):
    """Prepares the input for the screen encoder.

  Args:
    features: the feature dict.
    hparams: the hyperparameters.
    embed_scope: the embedding variable scope.
    embed_token_fn: the function for embedding tokens.
  Returns:
    object_embedding: a Tensor of shape
        [batch_size, num_steps, max_object_count, embed_depth]
    object_mask: a binary tensor of shape
        [batch_size, num_steps, max_object_count]
    att_bias: a Tensor of shape
        [batch_size, num_steps, max_object_count]
  """
    with tf.control_dependencies(
        [tf.assert_equal(tf.rank(features["obj_text"]), 4)]):
        if hparams.get("synthetic_screen_noise", 0.) > 0.:
            num_objects = tf.shape(features["obj_text"])[2]
            # [batch, length, num_objects]
            target_obj_mask = tf.cast(
                tf.one_hot(features["objects"], depth=num_objects), tf.bool)
            num_tokens = tf.shape(features["obj_text"])[-1]
            target_obj_mask = tf.tile(tf.expand_dims(target_obj_mask, 3),
                                      [1, 1, 1, num_tokens])
            # Randomly keep tokens
            keep_mask = tf.greater_equal(
                tf.random_uniform(shape=tf.shape(features["obj_text"])),
                hparams.synthetic_screen_noise)
            # Keep paddings
            keep_mask = tf.logical_or(tf.equal(features["obj_text"], 0),
                                      keep_mask)
            # Keep targets
            target_obj_mask = tf.logical_or(target_obj_mask, keep_mask)
            features["obj_text"] = tf.where(
                target_obj_mask, features["obj_text"],
                tf.random_uniform(shape=tf.shape(features["obj_text"]),
                                  maxval=50000,
                                  dtype=tf.int32))
        text_embeddings, _ = embed_token_fn(features["obj_text"],
                                            hparams.task_vocab_size,
                                            hparams.hidden_size,
                                            hparams,
                                            embed_scope=embed_scope)
        with tf.variable_scope("obj_text_embed", reuse=tf.AUTO_REUSE):
            if hparams.obj_text_aggregation == "max":
                embed_bias = tf.cast(tf.less(features["obj_text"], 2),
                                     tf.float32) * -1e7
                with tf.control_dependencies(
                    [tf.assert_equal(tf.rank(embed_bias), 4)]):
                    text_embeddings = tf.reduce_max(
                        text_embeddings + tf.expand_dims(embed_bias, 4), -2)
                    no_txt_embed = tf.get_variable(name="no_txt_embed",
                                                   shape=[hparams.hidden_size])
                    shape = common_layers.shape_list(text_embeddings)
                    no_txt_embed = tf.tile(
                        tf.reshape(no_txt_embed,
                                   [1, 1, 1, hparams.hidden_size]),
                        [shape[0], shape[1], shape[2], 1])
                    text_embeddings = tf.maximum(text_embeddings, no_txt_embed)
            elif hparams.obj_text_aggregation == "sum":
                # [batch, step, #max_obj, #max_token]  0 for padded tokens
                real_objects = tf.cast(
                    tf.greater_equal(features["obj_text"], 2), tf.float32)
                # [batch, step, #max_obj, hidden]   0s for padded objects
                text_embeddings = tf.reduce_sum(
                    text_embeddings * tf.expand_dims(real_objects, 4), -2)
            elif hparams.obj_text_aggregation == "mean":
                shape_list = common_layers.shape_list(text_embeddings)
                embeddings = tf.reshape(text_embeddings, [-1] + shape_list[3:])
                emb_sum = tf.reduce_sum(tf.abs(embeddings), axis=-1)
                non_paddings = tf.not_equal(emb_sum, 0.0)
                embeddings = common_embed.average_bag_of_embeds(
                    embeddings,
                    non_paddings,
                    use_bigrams=True,
                    bigram_embed_scope=embed_scope,
                    append_start_end=True)
                text_embeddings = tf.reshape(
                    embeddings, shape_list[:3] + [hparams.hidden_size])
            else:
                raise ValueError("Unrecognized token aggregation %s" %
                                 (hparams.obj_text_aggregation))
    with tf.control_dependencies([
            tf.assert_equal(tf.rank(features["obj_type"]), 3),
            tf.assert_equal(tf.rank(features["obj_clickable"]), 3)
    ]):
        with tf.variable_scope("encode_object_attr", reuse=tf.AUTO_REUSE):
            type_embedding = tf.nn.embedding_lookup(params=tf.get_variable(
                name="embed_type_w",
                shape=[hparams.get("num_types", 100), hparams.hidden_size]),
                                                    ids=tf.maximum(
                                                        features["obj_type"],
                                                        0))
            clickable_embedding = tf.nn.embedding_lookup(
                params=tf.get_variable(name="embed_clickable_w",
                                       shape=[2, hparams.hidden_size]),
                ids=features["obj_clickable"])
    with tf.control_dependencies(
        [tf.assert_equal(tf.rank(features["obj_screen_pos"]), 4)]):

        def _create_embed(feature_name, vocab_size, depth):
            """Embed a position feature."""
            pos_embedding_list = []
            with tf.variable_scope("encode_object_" + feature_name,
                                   reuse=tf.AUTO_REUSE):
                num_features = common_layers.shape_list(
                    features[feature_name])[-1]
                for i in range(num_features):
                    pos_embedding_list.append(
                        tf.nn.embedding_lookup(
                            params=tf.get_variable(name=feature_name +
                                                   "_embed_w_%d" % i,
                                                   shape=[vocab_size, depth]),
                            ids=features[feature_name][:, :, :, i]))
                pos_embedding = tf.add_n(pos_embedding_list)
                return pos_embedding

        pos_embedding = _create_embed("obj_screen_pos", hparams.max_pixel_pos,
                                      hparams.hidden_size)
    if "all" == hparams.screen_embedding_feature or (
            "dom" in hparams.screen_embedding_feature):
        dom_embedding = _create_embed("obj_dom_pos", hparams.max_dom_pos,
                                      hparams.hidden_size)
    object_embed = tf.zeros_like(text_embeddings, dtype=tf.float32)
    if hparams.screen_embedding_feature == "all":
        object_embed = (text_embeddings + type_embedding + pos_embedding +
                        dom_embedding)
    elif "text" in hparams.screen_embedding_feature:
        object_embed += text_embeddings
    elif "type" in hparams.screen_embedding_feature:
        object_embed += type_embedding
    elif "pos" in hparams.screen_embedding_feature:
        object_embed += pos_embedding
    elif "dom" in hparams.screen_embedding_feature:
        object_embed += dom_embedding
    elif "click" in hparams.screen_embedding_feature:
        object_embed += clickable_embedding
    object_mask = tf.cast(tf.not_equal(features["obj_type"], -1), tf.float32)
    object_embed = object_embed * tf.expand_dims(object_mask, 3)
    att_bias = (1. - object_mask) * common_attention.large_compatible_negative(
        object_embed.dtype)
    return object_embed, object_mask, att_bias
Example #20
0
def main(unused_argv):
    FLAGS.comb_dropout_keep_prob = 1.0
    FLAGS.image_keep_prob = 1.0
    FLAGS.elements_keep_prob = 1.0

    # Get dataset-dependent information.

    tf.gfile.MakeDirs(FLAGS.eval_logdir)
    tf.logging.info('Evaluating on %s set', FLAGS.split)

    with tf.Graph().as_default():
        samples = model_input.get_input_fn(FLAGS)()

        # Get model segmentation predictions.
        num_classes = model_input.dataset_descriptors[
            FLAGS.dataset].num_classes
        output_to_num_classes = model.get_output_to_num_classes(FLAGS)

        if tuple(FLAGS.eval_scales) == (1.0, ):
            tf.logging.info('Performing single-scale test.')
            predictions, probs = model.predict_labels(
                samples['image'],
                samples,
                FLAGS,
                outputs_to_num_classes=output_to_num_classes,
                image_pyramid=FLAGS.image_pyramid,
                merge_method=FLAGS.merge_method,
                atrous_rates=FLAGS.atrous_rates,
                add_image_level_feature=FLAGS.add_image_level_feature,
                aspp_with_batch_norm=FLAGS.aspp_with_batch_norm,
                aspp_with_separable_conv=FLAGS.aspp_with_separable_conv,
                multi_grid=FLAGS.multi_grid,
                depth_multiplier=FLAGS.depth_multiplier,
                output_stride=FLAGS.output_stride,
                decoder_output_stride=FLAGS.decoder_output_stride,
                decoder_use_separable_conv=FLAGS.decoder_use_separable_conv,
                crop_size=[FLAGS.image_size, FLAGS.image_size],
                logits_kernel_size=FLAGS.logits_kernel_size,
                model_variant=FLAGS.model_variant)
        else:
            tf.logging.info('Performing multi-scale test.')
            predictions, probs = model.predict_labels_multi_scale(
                samples['image'],
                samples,
                FLAGS,
                outputs_to_num_classes=output_to_num_classes,
                eval_scales=FLAGS.eval_scales,
                add_flipped_images=FLAGS.add_flipped_images,
                merge_method=FLAGS.merge_method,
                atrous_rates=FLAGS.atrous_rates,
                add_image_level_feature=FLAGS.add_image_level_feature,
                aspp_with_batch_norm=FLAGS.aspp_with_batch_norm,
                aspp_with_separable_conv=FLAGS.aspp_with_separable_conv,
                multi_grid=FLAGS.multi_grid,
                depth_multiplier=FLAGS.depth_multiplier,
                output_stride=FLAGS.output_stride,
                decoder_output_stride=FLAGS.decoder_output_stride,
                decoder_use_separable_conv=FLAGS.decoder_use_separable_conv,
                crop_size=[FLAGS.image_size, FLAGS.image_size],
                logits_kernel_size=FLAGS.logits_kernel_size,
                model_variant=FLAGS.model_variant)

        metric_map = {}
        for output in output_to_num_classes:
            output_predictions = predictions[output]
            output_probs = probs[output]
            if output == 'segment':
                output_predictions = tf.expand_dims(output_predictions, 3)
                if num_classes == 2:
                    labels = samples['label']

                    iou, weights = model.foreground_iou(
                        labels, output_predictions, FLAGS)
                    soft_iou, _ = model.foreground_iou(
                        labels, output_probs[:, :, :, 1:2], FLAGS)

                    metric_map['mIOU'] = tf.metrics.mean(iou)
                    metric_map['soft_mIOU'] = tf.metrics.mean(soft_iou)

                    high_prob_overlaps = calc_high_prob_overlaps(
                        labels, output_probs, weights)
                    metric_map['highestOverlaps'] = tf.metrics.mean(
                        high_prob_overlaps)

                    output_probs *= weights

                else:
                    output_predictions = tf.reshape(output_predictions,
                                                    shape=[-1])
                    labels = tf.reshape(samples['label'], shape=[-1])
                    weights = tf.to_float(
                        tf.not_equal(
                            labels, model_input.dataset_descriptors[
                                FLAGS.dataset].ignore_label))

                    # Set ignore_label regions to label 0, because metrics.mean_iou
                    # requires range of labels=[0, dataset.num_classes).
                    # Note the ignore_label regions are not evaluated since
                    # the corresponding regions contain weights=0.
                    labels = tf.where(
                        tf.equal(
                            labels, model_input.dataset_descriptors[
                                FLAGS.dataset].ignore_label),
                        tf.zeros_like(labels), labels)

                    predictions_tag = 'mIOU'
                    for eval_scale in FLAGS.eval_scales:
                        predictions_tag += '_' + str(eval_scale)
                    if FLAGS.add_flipped_images:
                        predictions_tag += '_flipped'

                    # Define the evaluation metric.
                    metric_map[
                        predictions_tag] = contrib_slim.metrics.mean_iou(
                            output_predictions,
                            labels,
                            num_classes,
                            weights=weights)

                def label_summary(labels, weights, name):
                    tf.summary.image(
                        name,
                        tf.reshape(
                            tf.cast(
                                tf.to_float(labels * 255) /
                                tf.to_float(num_classes), tf.uint8) *
                            tf.cast(weights, tf.uint8),
                            [-1, FLAGS.image_size, FLAGS.image_size, 1]), 8)

                label_summary(labels, weights, 'label')
                label_summary(output_predictions, weights,
                              'output_predictions')
                tf.summary.image('logits',
                                 tf.expand_dims(output_probs[:, :, :, 1], 3))

            elif output == 'regression':
                labels = samples['label']
                ignore_mask = model.get_ignore_mask(labels, FLAGS)

                accurate = calc_accuracy_in_box(labels, output_probs,
                                                ignore_mask)
                metric_map['inBoxAccuracy'] = tf.metrics.mean(accurate)

        tf.summary.image('image', samples['image'], 8)

        metrics_to_values, metrics_to_updates = contrib_slim.metrics.aggregate_metric_map(
            metric_map)

        for metric_name, metric_value in metrics_to_values.items():
            metric_value = tf.Print(metric_value, [metric_value], metric_name)
            tf.summary.scalar(metric_name, metric_value)

        num_batches = int(
            math.ceil(FLAGS.num_samples / float(FLAGS.batch_size)))

        tf.logging.info('Eval num images %d', FLAGS.num_samples)
        tf.logging.info('Eval batch size %d and num batch %d',
                        FLAGS.batch_size, num_batches)

        contrib_slim.evaluation.evaluation_loop(
            master='',
            checkpoint_dir=FLAGS.checkpoint_dir,
            logdir=FLAGS.eval_logdir,
            num_evals=num_batches,
            eval_op=list(metrics_to_updates.values()),
            summary_op=tf.summary.merge_all(),
            max_number_of_evaluations=None,
            eval_interval_secs=FLAGS.eval_interval_secs)
Example #21
0
def detection_loss(cls_outputs, box_outputs, labels, params):
    """Computes total detection loss.

  Computes total detection loss including box and class loss from all levels.
  Args:
    cls_outputs: an OrderedDict with keys representing levels and values
      representing logits in [batch_size, height, width, num_anchors].
    box_outputs: an OrderedDict with keys representing levels and values
      representing box regression targets in [batch_size, height, width,
      num_anchors * 4].
    labels: the dictionary returned from the dataloader that includes
      groundtruth targets.
    params: the dictionary including training parameters specified in
      the default_hparams function in this file.

  Returns:
    total_loss: a float tensor representing the total loss reduced over
      class and box losses from all levels.
    cls_loss: a float tensor representing the total class loss.
    box_loss: a float tensor representing the total box regression loss.
  """
    # Sum all positives in a batch for normalization and avoid zero
    # num_positives_sum, which would lead to inf loss during training
    num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0
    positives_momentum = params.get('positives_momentum', None) or 0
    if positives_momentum > 0:
        # normalize the num_positive_examples for training stability.
        moving_normalizer_var = tf.Variable(
            0.0,
            name='moving_normalizer',
            dtype=tf.float32,
            synchronization=tf.VariableSynchronization.ON_READ,
            trainable=False,
            aggregation=tf.VariableAggregation.MEAN)
        num_positives_sum = tf.keras.backend.moving_average_update(
            moving_normalizer_var,
            num_positives_sum,
            momentum=params['positives_momentum'])
    elif positives_momentum < 0:
        num_positives_sum = utils.cross_replica_mean(num_positives_sum)

    levels = cls_outputs.keys()
    cls_losses = []
    box_losses = []
    for level in levels:
        # Onehot encoding for classification labels.
        cls_targets_at_level = tf.one_hot(labels['cls_targets_%d' % level],
                                          params['num_classes'],
                                          dtype=cls_outputs[level].dtype)

        if params['data_format'] == 'channels_first':
            bs, _, width, height, _ = (
                cls_targets_at_level.get_shape().as_list())
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, -1, width, height])
        else:
            bs, width, height, _, _ = (
                cls_targets_at_level.get_shape().as_list())
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, width, height, -1])
        box_targets_at_level = labels['box_targets_%d' % level]

        cls_loss = focal_loss(cls_outputs[level],
                              cls_targets_at_level,
                              params['alpha'],
                              params['gamma'],
                              normalizer=num_positives_sum,
                              label_smoothing=params['label_smoothing'])

        if params['data_format'] == 'channels_first':
            cls_loss = tf.reshape(
                cls_loss, [bs, -1, width, height, params['num_classes']])
        else:
            cls_loss = tf.reshape(
                cls_loss, [bs, width, height, -1, params['num_classes']])

        cls_loss *= tf.cast(
            tf.expand_dims(tf.not_equal(labels['cls_targets_%d' % level], -2),
                           -1), cls_loss.dtype)
        cls_loss_sum = tf.reduce_sum(cls_loss)
        cls_losses.append(tf.cast(cls_loss_sum, tf.float32))

        if params['box_loss_weight']:
            box_losses.append(
                _box_loss(box_outputs[level],
                          box_targets_at_level,
                          num_positives_sum,
                          delta=params['delta']))

    # Sum per level losses to total loss.
    cls_loss = tf.add_n(cls_losses)
    box_loss = tf.add_n(box_losses) if box_losses else tf.constant(0.)

    total_loss = (cls_loss + params['box_loss_weight'] * box_loss)

    return total_loss, cls_loss, box_loss
Example #22
0
def build_genie_model(feat_dict,
                      cfg,
                      batch_size,
                      seq_len,
                      is_training=True,
                      seq_varlens=None,
                      dtype=tf.float32):
  """Builds a Piano Genie model.

  Args:
    feat_dict: Dictionary containing input tensors.
    cfg: Configuration object.
    batch_size: Number of items in batch.
    seq_len: Length of each batch item.
    is_training: Set to False for evaluation.
    seq_varlens: If not None, a tensor with the batch sequence lengths.
    dtype: Model weight type.

  Returns:
    A dict containing tensors for relevant model config.
  """
  out_dict = {}

  # Parse features
  pitches = util.demidify(feat_dict["midi_pitches"])
  velocities = feat_dict["velocities"]
  pitches_scalar = ((tf.cast(pitches, tf.float32) / 87.) * 2.) - 1.

  # Create sequence lens
  if is_training and cfg.train_randomize_seq_len:
    seq_lens = tf.random_uniform(
        [batch_size],
        minval=cfg.train_seq_len_min,
        maxval=seq_len + 1,
        dtype=tf.int32)
    stp_varlen_mask = tf.sequence_mask(
        seq_lens, maxlen=seq_len, dtype=tf.float32)
  elif seq_varlens is not None:
    seq_lens = seq_varlens
    stp_varlen_mask = tf.sequence_mask(
        seq_varlens, maxlen=seq_len, dtype=tf.float32)
  else:
    seq_lens = tf.ones([batch_size], dtype=tf.int32) * seq_len
    stp_varlen_mask = None

  # Encode
  if (cfg.stp_emb_unconstrained or cfg.stp_emb_vq or cfg.stp_emb_iq or
      cfg.seq_emb_unconstrained or cfg.seq_emb_vae or
      cfg.lor_emb_unconstrained):
    # Build encoder features
    enc_feats = []
    if cfg.enc_pitch_scalar:
      enc_feats.append(tf.expand_dims(pitches_scalar, axis=-1))
    else:
      enc_feats.append(tf.one_hot(pitches, 88))
    if "delta_times_int" in cfg.enc_aux_feats:
      enc_feats.append(
          tf.one_hot(feat_dict["delta_times_int"],
                     cfg.data_max_discrete_times + 1))
    if "velocities" in cfg.enc_aux_feats:
      enc_feats.append(
          tf.one_hot(velocities, cfg.data_max_discrete_velocities + 1))
    enc_feats = tf.concat(enc_feats, axis=2)

    with tf.variable_scope("encoder"):
      enc_stp, enc_seq = simple_lstm_encoder(
          enc_feats,
          seq_lens,
          rnn_celltype=cfg.rnn_celltype,
          rnn_nlayers=cfg.rnn_nlayers,
          rnn_nunits=cfg.rnn_nunits,
          rnn_bidirectional=cfg.enc_rnn_bidirectional,
          dtype=dtype)

  latents = []

  # Step embeddings (single vector per timestep)
  if cfg.stp_emb_unconstrained:
    with tf.variable_scope("stp_emb_unconstrained"):
      stp_emb_unconstrained = tf.layers.dense(
          enc_stp, cfg.stp_emb_unconstrained_embedding_dim)

    out_dict["stp_emb_unconstrained"] = stp_emb_unconstrained
    latents.append(stp_emb_unconstrained)

  # Quantized step embeddings with VQ-VAE
  if cfg.stp_emb_vq:
    import sonnet as snt  # pylint:disable=g-import-not-at-top,import-outside-toplevel
    with tf.variable_scope("stp_emb_vq"):
      with tf.variable_scope("pre_vq"):
        # pre_vq_encoding is tf.float32 of [batch_size, seq_len, embedding_dim]
        pre_vq_encoding = tf.layers.dense(enc_stp, cfg.stp_emb_vq_embedding_dim)

      with tf.variable_scope("quantizer"):
        assert stp_varlen_mask is None
        vq_vae = snt.nets.VectorQuantizer(
            embedding_dim=cfg.stp_emb_vq_embedding_dim,
            num_embeddings=cfg.stp_emb_vq_codebook_size,
            commitment_cost=cfg.stp_emb_vq_commitment_cost)
        vq_vae_output = vq_vae(pre_vq_encoding, is_training=is_training)

        stp_emb_vq_quantized = vq_vae_output["quantize"]
        stp_emb_vq_discrete = tf.reshape(
            tf.argmax(vq_vae_output["encodings"], axis=1, output_type=tf.int32),
            [batch_size, seq_len])
        stp_emb_vq_codebook = tf.transpose(vq_vae.embeddings)

    out_dict["stp_emb_vq_quantized"] = stp_emb_vq_quantized
    out_dict["stp_emb_vq_discrete"] = stp_emb_vq_discrete
    out_dict["stp_emb_vq_loss"] = vq_vae_output["loss"]
    out_dict["stp_emb_vq_codebook"] = stp_emb_vq_codebook
    out_dict["stp_emb_vq_codebook_ppl"] = vq_vae_output["perplexity"]
    latents.append(stp_emb_vq_quantized)

    # This tensor retrieves continuous embeddings from codebook. It should
    # *never* be used during training.
    out_dict["stp_emb_vq_quantized_lookup"] = tf.nn.embedding_lookup(
        stp_emb_vq_codebook, stp_emb_vq_discrete)

  # Integer-quantized step embeddings with straight-through
  if cfg.stp_emb_iq:
    with tf.variable_scope("stp_emb_iq"):
      with tf.variable_scope("pre_iq"):
        # pre_iq_encoding is tf.float32 of [batch_size, seq_len]
        pre_iq_encoding = tf.layers.dense(enc_stp, 1)[:, :, 0]

      def iqst(x, n):
        """Integer quantization with straight-through estimator."""
        eps = 1e-7
        s = float(n - 1)
        xp = tf.clip_by_value((x + 1) / 2.0, -eps, 1 + eps)
        xpp = tf.round(s * xp)
        xppp = 2 * (xpp / s) - 1
        return xpp, x + tf.stop_gradient(xppp - x)

      with tf.variable_scope("quantizer"):
        # Pass rounded vals to decoder w/ straight-through estimator
        stp_emb_iq_discrete_f, stp_emb_iq_discrete_rescaled = iqst(
            pre_iq_encoding, cfg.stp_emb_iq_nbins)
        stp_emb_iq_discrete = tf.cast(stp_emb_iq_discrete_f + 1e-4, tf.int32)
        stp_emb_iq_discrete_f = tf.cast(stp_emb_iq_discrete, tf.float32)
        stp_emb_iq_quantized = tf.expand_dims(
            stp_emb_iq_discrete_rescaled, axis=2)

        # Determine which elements round to valid indices
        stp_emb_iq_inrange = tf.logical_and(
            tf.greater_equal(pre_iq_encoding, -1),
            tf.less_equal(pre_iq_encoding, 1))
        stp_emb_iq_inrange_mask = tf.cast(stp_emb_iq_inrange, tf.float32)
        stp_emb_iq_valid_p = weighted_avg(stp_emb_iq_inrange_mask,
                                          stp_varlen_mask)

        # Regularize to encourage encoder to output in range
        stp_emb_iq_range_penalty = weighted_avg(
            tf.square(tf.maximum(tf.abs(pre_iq_encoding) - 1, 0)),
            stp_varlen_mask)

        # Regularize to correlate latent finite differences to input
        stp_emb_iq_dlatents = pre_iq_encoding[:, 1:] - pre_iq_encoding[:, :-1]
        if cfg.stp_emb_iq_contour_dy_scalar:
          stp_emb_iq_dnotes = pitches_scalar[:, 1:] - pitches_scalar[:, :-1]
        else:
          stp_emb_iq_dnotes = tf.cast(pitches[:, 1:] - pitches[:, :-1],
                                      tf.float32)
        if cfg.stp_emb_iq_contour_exp == 1:
          power_func = tf.identity
        elif cfg.stp_emb_iq_contour_exp == 2:
          power_func = tf.square
        else:
          raise NotImplementedError()
        if cfg.stp_emb_iq_contour_comp == "product":
          comp_func = tf.multiply
        elif cfg.stp_emb_iq_contour_comp == "quotient":
          comp_func = lambda x, y: tf.divide(x, y + 1e-6)
        else:
          raise NotImplementedError()

        stp_emb_iq_contour_penalty = weighted_avg(
            power_func(
                tf.maximum(
                    cfg.stp_emb_iq_contour_margin - comp_func(
                        stp_emb_iq_dnotes, stp_emb_iq_dlatents), 0)),
            None if stp_varlen_mask is None else stp_varlen_mask[:, 1:])

        # Regularize to maintain note consistency
        stp_emb_iq_note_held = tf.cast(
            tf.equal(pitches[:, 1:] - pitches[:, :-1], 0), tf.float32)
        if cfg.stp_emb_iq_deviate_exp == 1:
          power_func = tf.abs
        elif cfg.stp_emb_iq_deviate_exp == 2:
          power_func = tf.square

        if stp_varlen_mask is None:
          mask = stp_emb_iq_note_held
        else:
          mask = stp_varlen_mask[:, 1:] * stp_emb_iq_note_held
        stp_emb_iq_deviate_penalty = weighted_avg(
            power_func(stp_emb_iq_dlatents), mask)

        # Calculate perplexity of discrete encoder posterior
        if stp_varlen_mask is None:
          mask = stp_emb_iq_inrange_mask
        else:
          mask = stp_varlen_mask * stp_emb_iq_inrange_mask
        stp_emb_iq_discrete_oh = tf.one_hot(stp_emb_iq_discrete,
                                            cfg.stp_emb_iq_nbins)
        stp_emb_iq_avg_probs = weighted_avg(
            stp_emb_iq_discrete_oh,
            mask,
            axis=[0, 1],
            expand_mask=True)
        stp_emb_iq_discrete_ppl = tf.exp(-tf.reduce_sum(
            stp_emb_iq_avg_probs * tf.log(stp_emb_iq_avg_probs + 1e-10)))

    out_dict["stp_emb_iq_quantized"] = stp_emb_iq_quantized
    out_dict["stp_emb_iq_discrete"] = stp_emb_iq_discrete
    out_dict["stp_emb_iq_valid_p"] = stp_emb_iq_valid_p
    out_dict["stp_emb_iq_range_penalty"] = stp_emb_iq_range_penalty
    out_dict["stp_emb_iq_contour_penalty"] = stp_emb_iq_contour_penalty
    out_dict["stp_emb_iq_deviate_penalty"] = stp_emb_iq_deviate_penalty
    out_dict["stp_emb_iq_discrete_ppl"] = stp_emb_iq_discrete_ppl
    latents.append(stp_emb_iq_quantized)

    # This tensor converts discrete values to continuous.
    # It should *never* be used during training.
    out_dict["stp_emb_iq_quantized_lookup"] = tf.expand_dims(
        2. * (stp_emb_iq_discrete_f / (cfg.stp_emb_iq_nbins - 1.)) - 1., axis=2)

  # Sequence embedding (single vector per sequence)
  if cfg.seq_emb_unconstrained:
    with tf.variable_scope("seq_emb_unconstrained"):
      seq_emb_unconstrained = tf.layers.dense(
          enc_seq, cfg.seq_emb_unconstrained_embedding_dim)

    out_dict["seq_emb_unconstrained"] = seq_emb_unconstrained

    seq_emb_unconstrained = tf.stack([seq_emb_unconstrained] * seq_len, axis=1)
    latents.append(seq_emb_unconstrained)

  # Sequence embeddings (variational w/ reparameterization trick)
  if cfg.seq_emb_vae:
    with tf.variable_scope("seq_emb_vae"):
      seq_emb_vae = tf.layers.dense(enc_seq, cfg.seq_emb_vae_embedding_dim * 2)

      mean = seq_emb_vae[:, :cfg.seq_emb_vae_embedding_dim]
      stddev = 1e-6 + tf.nn.softplus(
          seq_emb_vae[:, cfg.seq_emb_vae_embedding_dim:])
      seq_emb_vae = mean + stddev * tf.random_normal(
          tf.shape(mean), 0, 1, dtype=dtype)

      kl = tf.reduce_mean(0.5 * tf.reduce_sum(
          tf.square(mean) + tf.square(stddev) - tf.log(1e-8 + tf.square(stddev))
          - 1,
          axis=1))

    out_dict["seq_emb_vae"] = seq_emb_vae
    out_dict["seq_emb_vae_kl"] = kl

    seq_emb_vae = tf.stack([seq_emb_vae] * seq_len, axis=1)
    latents.append(seq_emb_vae)

  # Low-rate embeddings
  if cfg.lor_emb_unconstrained:
    assert seq_len % cfg.lor_emb_n == 0

    with tf.variable_scope("lor_emb_unconstrained"):
      # Downsample step embeddings
      rnn_embedding_dim = int(enc_stp.get_shape()[-1])
      enc_lor = tf.reshape(enc_stp, [
          batch_size, seq_len // cfg.lor_emb_n,
          cfg.lor_emb_n * rnn_embedding_dim
      ])
      lor_emb_unconstrained = tf.layers.dense(
          enc_lor, cfg.lor_emb_unconstrained_embedding_dim)

      out_dict["lor_emb_unconstrained"] = lor_emb_unconstrained

      # Upsample lo-rate embeddings for decoding
      lor_emb_unconstrained = tf.expand_dims(lor_emb_unconstrained, axis=2)
      lor_emb_unconstrained = tf.tile(lor_emb_unconstrained,
                                      [1, 1, cfg.lor_emb_n, 1])
      lor_emb_unconstrained = tf.reshape(
          lor_emb_unconstrained,
          [batch_size, seq_len, cfg.lor_emb_unconstrained_embedding_dim])

      latents.append(lor_emb_unconstrained)

  # Build decoder features
  dec_feats = latents

  if cfg.dec_autoregressive:
    # Retrieve pitch numbers
    curr_pitches = pitches
    last_pitches = curr_pitches[:, :-1]
    last_pitches = tf.pad(
        last_pitches, [[0, 0], [1, 0]],
        constant_values=-1)  # Prepend <SOS> token
    out_dict["dec_last_pitches"] = last_pitches
    dec_feats.append(tf.one_hot(last_pitches + 1, 89))

    if cfg.dec_pred_velocity:
      curr_velocities = velocities
      last_velocities = curr_velocities[:, :-1]
      last_velocities = tf.pad(last_velocities, [[0, 0], [1, 0]])
      dec_feats.append(
          tf.one_hot(last_velocities, cfg.data_max_discrete_velocities + 1))

  if "delta_times_int" in cfg.dec_aux_feats:
    dec_feats.append(
        tf.one_hot(feat_dict["delta_times_int"],
                   cfg.data_max_discrete_times + 1))
  if "velocities" in cfg.dec_aux_feats:
    assert not cfg.dec_pred_velocity
    dec_feats.append(
        tf.one_hot(feat_dict["velocities"],
                   cfg.data_max_discrete_velocities + 1))

  assert dec_feats
  dec_feats = tf.concat(dec_feats, axis=2)

  # Decode
  with tf.variable_scope("decoder"):
    dec_stp, dec_initial_state, dec_final_state = simple_lstm_decoder(
        dec_feats,
        seq_lens,
        batch_size,
        rnn_celltype=cfg.rnn_celltype,
        rnn_nlayers=cfg.rnn_nlayers,
        rnn_nunits=cfg.rnn_nunits)

    with tf.variable_scope("pitches"):
      dec_recons_logits = tf.layers.dense(dec_stp, 88)

    dec_recons_loss = weighted_avg(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=dec_recons_logits, labels=pitches), stp_varlen_mask)

    out_dict["dec_initial_state"] = dec_initial_state
    out_dict["dec_final_state"] = dec_final_state
    out_dict["dec_recons_logits"] = dec_recons_logits
    out_dict["dec_recons_scores"] = tf.nn.softmax(dec_recons_logits, axis=-1)
    out_dict["dec_recons_preds"] = tf.argmax(
        dec_recons_logits, output_type=tf.int32, axis=-1)
    out_dict["dec_recons_midi_preds"] = util.remidify(
        out_dict["dec_recons_preds"])
    out_dict["dec_recons_loss"] = dec_recons_loss

    if cfg.dec_pred_velocity:
      with tf.variable_scope("velocities"):
        dec_recons_velocity_logits = tf.layers.dense(
            dec_stp, cfg.data_max_discrete_velocities + 1)

      dec_recons_velocity_loss = weighted_avg(
          tf.nn.sparse_softmax_cross_entropy_with_logits(
              logits=dec_recons_velocity_logits, labels=velocities),
          stp_varlen_mask)

      out_dict["dec_recons_velocity_logits"] = dec_recons_velocity_logits
      out_dict["dec_recons_velocity_loss"] = dec_recons_velocity_loss

  # Stats
  if cfg.stp_emb_vq or cfg.stp_emb_iq:
    discrete = out_dict[
        "stp_emb_vq_discrete" if cfg.stp_emb_vq else "stp_emb_iq_discrete"]
    dx = pitches[:, 1:] - pitches[:, :-1]
    dy = discrete[:, 1:] - discrete[:, :-1]
    contour_violation = tf.reduce_mean(tf.cast(tf.less(dx * dy, 0), tf.float32))

    dx_hold = tf.equal(dx, 0)
    deviate_violation = weighted_avg(
        tf.cast(tf.not_equal(dy, 0), tf.float32), tf.cast(dx_hold, tf.float32))

    out_dict["contour_violation"] = contour_violation
    out_dict["deviate_violation"] = deviate_violation

  return out_dict
Example #23
0
  def build():
    """Builds the Tensorflow graph."""
    inputs, labels, lengths = None, None, None

    if mode in ('train', 'eval'):
      if isinstance(no_event_label, numbers.Number):
        label_shape = []
      else:
        label_shape = [len(no_event_label)]
      inputs, labels, lengths = magenta.common.get_padded_batch(
          sequence_example_file_paths, hparams.batch_size, input_size,
          label_shape=label_shape, shuffle=mode == 'train')

    elif mode == 'generate':
      inputs = tf.placeholder(tf.float32, [hparams.batch_size, None,
                                           input_size])

    if isinstance(encoder_decoder,
                  magenta.music.OneHotIndexEventSequenceEncoderDecoder):
      expanded_inputs = tf.one_hot(
          tf.cast(tf.squeeze(inputs, axis=-1), tf.int64),
          encoder_decoder.input_depth)
    else:
      expanded_inputs = inputs

    dropout_keep_prob = 1.0 if mode == 'generate' else hparams.dropout_keep_prob

    if hparams.use_cudnn:
      outputs, initial_state, final_state = make_cudnn(
          expanded_inputs, hparams.rnn_layer_sizes, hparams.batch_size, mode,
          dropout_keep_prob=dropout_keep_prob,
          residual_connections=hparams.residual_connections)

    else:
      cell = make_rnn_cell(
          hparams.rnn_layer_sizes,
          dropout_keep_prob=dropout_keep_prob,
          attn_length=hparams.attn_length,
          residual_connections=hparams.residual_connections)

      initial_state = cell.zero_state(hparams.batch_size, tf.float32)

      outputs, final_state = tf.nn.dynamic_rnn(
          cell, inputs, sequence_length=lengths, initial_state=initial_state,
          swap_memory=True)

    outputs_flat = magenta.common.flatten_maybe_padded_sequences(
        outputs, lengths)
    if isinstance(num_classes, numbers.Number):
      num_logits = num_classes
    else:
      num_logits = sum(num_classes)
    logits_flat = contrib_layers.linear(outputs_flat, num_logits)

    if mode in ('train', 'eval'):
      labels_flat = magenta.common.flatten_maybe_padded_sequences(
          labels, lengths)

      if isinstance(num_classes, numbers.Number):
        softmax_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels_flat, logits=logits_flat)
        predictions_flat = tf.argmax(logits_flat, axis=1)
      else:
        logits_offsets = np.cumsum([0] + num_classes)
        softmax_cross_entropy = []
        predictions = []
        for i in range(len(num_classes)):
          softmax_cross_entropy.append(
              tf.nn.sparse_softmax_cross_entropy_with_logits(
                  labels=labels_flat[:, i],
                  logits=logits_flat[
                      :, logits_offsets[i]:logits_offsets[i + 1]]))
          predictions.append(
              tf.argmax(logits_flat[
                  :, logits_offsets[i]:logits_offsets[i + 1]], axis=1))
        predictions_flat = tf.stack(predictions, 1)

      correct_predictions = tf.to_float(
          tf.equal(labels_flat, predictions_flat))
      event_positions = tf.to_float(tf.not_equal(labels_flat, no_event_label))
      no_event_positions = tf.to_float(tf.equal(labels_flat, no_event_label))

      # Compute the total number of time steps across all sequences in the
      # batch. For some models this will be different from the number of RNN
      # steps.
      def batch_labels_to_num_steps(batch_labels, lengths):
        num_steps = 0
        for labels, length in zip(batch_labels, lengths):
          num_steps += encoder_decoder.labels_to_num_steps(labels[:length])
        return np.float32(num_steps)
      num_steps = tf.py_func(
          batch_labels_to_num_steps, [labels, lengths], tf.float32)

      if mode == 'train':
        loss = tf.reduce_mean(softmax_cross_entropy)
        perplexity = tf.exp(loss)
        accuracy = tf.reduce_mean(correct_predictions)
        event_accuracy = (
            tf.reduce_sum(correct_predictions * event_positions) /
            tf.reduce_sum(event_positions))
        no_event_accuracy = (
            tf.reduce_sum(correct_predictions * no_event_positions) /
            tf.reduce_sum(no_event_positions))

        loss_per_step = tf.reduce_sum(softmax_cross_entropy) / num_steps
        perplexity_per_step = tf.exp(loss_per_step)

        optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)

        train_op = contrib_slim.learning.create_train_op(
            loss, optimizer, clip_gradient_norm=hparams.clip_norm)
        tf.add_to_collection('train_op', train_op)

        vars_to_summarize = {
            'loss': loss,
            'metrics/perplexity': perplexity,
            'metrics/accuracy': accuracy,
            'metrics/event_accuracy': event_accuracy,
            'metrics/no_event_accuracy': no_event_accuracy,
            'metrics/loss_per_step': loss_per_step,
            'metrics/perplexity_per_step': perplexity_per_step,
        }
      elif mode == 'eval':
        vars_to_summarize, update_ops = contrib_metrics.aggregate_metric_map({
            'loss':
                tf.metrics.mean(softmax_cross_entropy),
            'metrics/accuracy':
                tf.metrics.accuracy(labels_flat, predictions_flat),
            'metrics/per_class_accuracy':
                tf.metrics.mean_per_class_accuracy(labels_flat,
                                                   predictions_flat,
                                                   num_classes),
            'metrics/event_accuracy':
                tf.metrics.recall(event_positions, correct_predictions),
            'metrics/no_event_accuracy':
                tf.metrics.recall(no_event_positions, correct_predictions),
            'metrics/loss_per_step':
                tf.metrics.mean(
                    tf.reduce_sum(softmax_cross_entropy) / num_steps,
                    weights=num_steps),
        })
        for updates_op in update_ops.values():
          tf.add_to_collection('eval_ops', updates_op)

        # Perplexity is just exp(loss) and doesn't need its own update op.
        vars_to_summarize['metrics/perplexity'] = tf.exp(
            vars_to_summarize['loss'])
        vars_to_summarize['metrics/perplexity_per_step'] = tf.exp(
            vars_to_summarize['metrics/loss_per_step'])

      for var_name, var_value in six.iteritems(vars_to_summarize):
        tf.summary.scalar(var_name, var_value)
        tf.add_to_collection(var_name, var_value)

    elif mode == 'generate':
      temperature = tf.placeholder(tf.float32, [])
      if isinstance(num_classes, numbers.Number):
        softmax_flat = tf.nn.softmax(
            tf.div(logits_flat, tf.fill([num_classes], temperature)))
        softmax = tf.reshape(
            softmax_flat, [hparams.batch_size, -1, num_classes])
      else:
        logits_offsets = np.cumsum([0] + num_classes)
        softmax = []
        for i in range(len(num_classes)):
          sm = tf.nn.softmax(
              tf.div(
                  logits_flat[:, logits_offsets[i]:logits_offsets[i + 1]],
                  tf.fill([num_classes[i]], temperature)))
          sm = tf.reshape(sm, [hparams.batch_size, -1, num_classes[i]])
          softmax.append(sm)

      tf.add_to_collection('inputs', inputs)
      tf.add_to_collection('temperature', temperature)
      tf.add_to_collection('softmax', softmax)
      # Flatten state tuples for metagraph compatibility.
      for state in tf_nest.flatten(initial_state):
        tf.add_to_collection('initial_state', state)
      for state in tf_nest.flatten(final_state):
        tf.add_to_collection('final_state', state)
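A note on the multi-head branch above: when num_classes is a list, the concatenated logits are sliced into per-head blocks with np.cumsum offsets, and each head gets its own sparse softmax cross entropy and argmax. Below is a minimal, self-contained sketch of that pattern; the shapes, batch size, and random inputs are illustrative, not taken from the original model.

import numpy as np
import tensorflow.compat.v1 as tf

num_classes = [4, 6]                      # two softmax heads
offsets = np.cumsum([0] + num_classes)    # [0, 4, 10]
logits_flat = tf.random.normal([8, sum(num_classes)])
labels_flat = tf.stack(
    [tf.random.uniform([8], maxval=n, dtype=tf.int64) for n in num_classes], 1)

losses, preds = [], []
for i, n in enumerate(num_classes):
  head_logits = logits_flat[:, offsets[i]:offsets[i + 1]]   # [8, n]
  losses.append(tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=labels_flat[:, i], logits=head_logits))
  preds.append(tf.argmax(head_logits, axis=1))
loss = tf.reduce_mean(tf.add_n(losses))   # combine per-example head losses
predictions_flat = tf.stack(preds, 1)     # [8, 2], one prediction per head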
Example #24
0
    def test_ne(self):
        input1 = tf.placeholder(shape=(4, 32, 32, 3), dtype=tf.float32)
        input2 = tf.placeholder(shape=(4, 32, 32, 3), dtype=tf.float32)
        output = tf.not_equal(input1, input2)

        self._test_conversion('ne', [input1, input2], [output])
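Since every example in this collection revolves around tf.not_equal, a tiny eager-mode check of its semantics may help (my own illustration, not part of the test suite above): it compares elementwise, broadcasts like other binary ops, and returns a bool tensor.

import tensorflow as tf

a = tf.constant([[1, 2], [3, 4]])
b = tf.constant([1, 4])                  # broadcast across rows
print(tf.not_equal(a, b).numpy())
# [[False  True]
#  [ True False]]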
Example #25
0
def build_distractors(distractor_examples, context):
  """Create inputs with distractors."""

  CLS_ID = tf.constant([101], dtype=tf.int64)  # pylint: disable=invalid-name
  SEP_ID = tf.constant([102], dtype=tf.int64)  # pylint: disable=invalid-name

  bert_inputs = []
  input_masks = []
  segment_ids = []
  # Number of distractor candidates to sample from each example.
  sample_size = int(
      (FLAGS.num_choices - FLAGS.k_size) / (FLAGS.data_window_size - 1))
  for example in distractor_examples:
    # Randomly sample `sample_size` sentence positions from this example.
    intermediate_examples_tensor = tf.reduce_sum(tf.abs(example), 1)
    examples_zero_vector = tf.zeros(shape=(1, 1), dtype=tf.int64)
    examples_bool_mask = tf.squeeze(
        tf.not_equal(intermediate_examples_tensor, examples_zero_vector))
    paragraph_len = tf.reduce_sum(tf.cast(examples_bool_mask, tf.int32))
    indices = tf.range(0, limit=paragraph_len, dtype=tf.int32)
    shuffled_indices = tf.random.shuffle(indices)[:sample_size]

    # extend examples / targets
    distractor_cand = example
    distractor_cand_plus_one = distractor_cand[1:]
    distractor_cand_plus_two = distractor_cand[2:]

    # pad extensions
    paddings_one = tf.constant([[0, 1], [0, 0]])
    distractor_cand_plus_one = tf.pad(distractor_cand_plus_one, paddings_one)

    paddings_two = tf.constant([[0, 2], [0, 0]])
    distractor_cand_plus_two = tf.pad(distractor_cand_plus_two, paddings_two)

    distractor_cand_ext = tf.concat(
        [distractor_cand, distractor_cand_plus_one, distractor_cand_plus_two],
        axis=1)

    distractors = tf.gather(distractor_cand_ext, shuffled_indices)
    for i in range(sample_size):
      distractors_non_zero = tf.where(
          tf.not_equal(distractors[i], tf.zeros_like(distractors[i])))
      distractors_stripped = tf.gather_nd(distractors[i], distractors_non_zero)
      if FLAGS.include_context:
        segment_id = tf.concat([
            tf.zeros_like(CLS_ID, dtype=tf.int64),
            tf.zeros_like(context),
            tf.zeros_like(SEP_ID, dtype=tf.int64),
            tf.ones_like(distractors_stripped),
            tf.ones_like(SEP_ID, dtype=tf.int64)
        ],
                               axis=0)
      else:
        segment_id = tf.concat([
            tf.zeros_like(CLS_ID, dtype=tf.int64),
            tf.zeros_like(distractors_stripped),
            tf.zeros_like(SEP_ID, dtype=tf.int64)
        ],
                               axis=0)
      segment_id = pad_and_cut(segment_id, FLAGS.max_seq_length)
      segment_ids.append(segment_id)
      if FLAGS.include_context:
        new_input = tf.concat(
            [CLS_ID, context, SEP_ID, distractors_stripped, SEP_ID], axis=0)
      else:
        new_input = tf.concat([CLS_ID, distractors_stripped, SEP_ID], axis=0)

      input_mask = tf.ones_like(new_input)
      input_mask = pad_and_cut(input_mask, FLAGS.max_seq_length)
      input_masks.append(input_mask)
      padded_new_input = pad_and_cut(new_input, FLAGS.max_seq_length)
      bert_inputs.append(padded_new_input)

  bert_inputs = tf.stack(bert_inputs, axis=0)
  input_masks = tf.stack(input_masks, axis=0)
  segment_ids = tf.stack(segment_ids, axis=0)
  out = Outputs_And_Context(bert_inputs, input_masks, segment_ids, None, None)
  return out
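The strip-padding idiom inside the loop above is worth seeing in isolation. A sketch with made-up token IDs (eager mode): tf.not_equal marks the real tokens, tf.where returns their indices, and tf.gather_nd drops the zero padding.

import tensorflow as tf

ids = tf.constant([101, 2023, 2003, 102, 0, 0], dtype=tf.int64)
non_zero = tf.where(tf.not_equal(ids, tf.zeros_like(ids)))  # [[0], [1], [2], [3]]
stripped = tf.gather_nd(ids, non_zero)
print(stripped.numpy())  # [ 101 2023 2003  102]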
Example #26
0
 def create_mobile_mask(input_mask):
     return tf.reduce_all(tf.not_equal(0, input_mask),
                          axis=2,
                          keepdims=True)
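A quick illustrative check of create_mobile_mask (toy values, my own): a position survives only if every feature along axis 2 is non-zero.

import tensorflow as tf

mask_in = tf.constant([[[1, 2], [0, 5]]])  # shape [1, 2, 2]
print(tf.reduce_all(tf.not_equal(0, mask_in),
                    axis=2, keepdims=True).numpy())
# [[[ True]
#   [False]]]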
Example #27
0
 def fn_not_eos():
     return tf.not_equal(  # Check if the last predicted element is an EOS
         tf.squeeze(result[:, -1, :, :]), text_encoder.EOS_ID)
Example #28
0
    def call(self,
             yesno_logits,
             yesno_labels,
             supporting_fact_logits,
             supporting_fact_labels,
             block_ids,
             num_replicas=None,
             eps=0):
        """Calls the layer.

    Args:
      yesno_logits: <float32>[batch_size, 3] Logits per position.
      yesno_labels: <int32>[batch_size] Yes/no classification labels.
      supporting_fact_logits: <float32>[batch_size] Logits per position for
        supporting facts classification.
      supporting_fact_labels: <int32>[batch_size] Supporting fact labels.
      block_ids: <int32>[batch_size] Block IDs of every sample in the batch.
      num_replicas: Number of replicas to gather summaries from. If None
        (default) then cross-replica summaries are not used.
      eps: <float> Small constant for numerical stability.

    Returns:
      A (yes_no_span_loss, supporting_facts_loss) tuple of <float> losses.
    """
        batch_size = tf.shape(supporting_fact_logits)[0]
        supporting_fact_logits = tf.expand_dims(supporting_fact_logits, 1)
        supporting_fact_labels = tf.expand_dims(supporting_fact_labels, 1)
        example_mask = tf.cast(tf.expand_dims(tf.not_equal(block_ids, 0), 1),
                               tf.float32)

        # (1) Aggregate block_ids across global batch. Compute cross block mask.
        all_block_ids = block_ids
        if num_replicas:
            all_block_ids = tpu_utils.cross_replica_concat(
                tensor=all_block_ids,
                num_replicas=num_replicas,
                name='block_ids_concat')

        # [batch_size, global_batch_size]
        cross_blocks_eq_mask = tf.cast(
            tf.equal(tf.expand_dims(block_ids, 1),
                     tf.expand_dims(all_block_ids, 0)), tf.float32)

        # (2) Apply softmax over all positions in the (global) batch
        # across the blocks with the same `block_id`.

        # [batch_size, 3, 1]
        yes_no_span_probs = losses.cross_batch_softmax(
            tf.expand_dims(yesno_logits, 2), cross_blocks_eq_mask,
            num_replicas)
        yes_no_span_probs = tf.squeeze(yes_no_span_probs, 2)

        # [batch_size, 1]
        supporting_facts_probs = losses.cross_batch_softmax(
            tf.expand_dims(supporting_fact_logits, 2), cross_blocks_eq_mask,
            num_replicas)
        supporting_facts_probs = tf.squeeze(supporting_facts_probs, 2)

        # (3) Prepare one-hot labels, masked by the supporting-fact labels.

        supporting_fact_labels = tf.cast(supporting_fact_labels, tf.float32)

        # [batch_size, 3]
        yes_no_span_one_hot = tf.one_hot(yesno_labels,
                                         depth=3,
                                         dtype=tf.float32)
        yes_no_span_one_hot = yes_no_span_one_hot * supporting_fact_labels

        # (4) Compute the losses from the cross-block probabilities, averaged
        # over valid (non-padding) examples.

        def mean_loss(all_losses):
            return tf.reduce_sum(all_losses * example_mask) / (
                tf.reduce_sum(example_mask) + eps)

        supporting_facts_loss = -mean_loss(
            tf.log(supporting_facts_probs * supporting_fact_labels + eps))

        yes_no_span_loss = -mean_loss(
            tf.log(yes_no_span_probs * yes_no_span_one_hot + eps))

        return yes_no_span_loss, supporting_facts_loss
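The block-equality mask built in step (1) is the heart of this loss, so here it is in isolation (toy values, single replica, my own sketch): comparing block_ids against themselves with broadcasting yields, for every example, a row marking all batch entries that belong to the same block. A softmax masked by this matrix then normalizes only over positions within the same block.

import tensorflow as tf

block_ids = tf.constant([7, 7, 3, 0])  # 0 marks a padding example
cross_blocks_eq_mask = tf.cast(
    tf.equal(tf.expand_dims(block_ids, 1),   # [4, 1]
             tf.expand_dims(block_ids, 0)),  # [1, 4]
    tf.float32)
print(cross_blocks_eq_mask.numpy())
# [[1. 1. 0. 0.]
#  [1. 1. 0. 0.]
#  [0. 0. 1. 0.]
#  [0. 0. 0. 1.]]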
Example #29
0
    def _create_inference(self):
        """
        Inference used for learning model parameters
        """
        # Mapped embeddings for users (u^c, u^i and u^s)
        self.u_c = tf.nn.embedding_lookup(self.uw_c, self.input_u)
        self.u_c = tf.reshape(self.u_c, [-1, self.embedding_size])
        self.u_i = tf.nn.embedding_lookup(self.uw_i, self.input_u)
        self.u_i = tf.reshape(self.u_i, [-1, self.embedding_size])
        self.u_s = tf.nn.embedding_lookup(self.uw_s, self.input_u)
        self.u_s = tf.reshape(self.u_s, [-1, self.embedding_size])
        # Our contribution. (We have not added a separate u^c2: if we reuse
        # the same u^c, we share knowledge across all domains, which may
        # actually be favorable.)
        # TODO: Test sharing only between the item and questionnaire domains.
        self.u_q = tf.nn.embedding_lookup(self.uw_q, self.input_u)
        self.u_q = tf.reshape(self.u_q, [-1, self.embedding_size])

        # Attentive transferred embeddings for users (p^I_u and p^S_u)
        self.P_iu, self.item_w = self._item_attentive_transfer()
        self.P_su, self.social_w = self._social_attentive_transfer()
        # Our contribution
        self.P_qu, self.question_w = self._questionnaire_attentive_transfer()

        # adding dropout on transferred embeddings to avoid overfitting
        self.P_iu = tf.nn.dropout(self.P_iu, self.dropout_keep_prob)
        self.P_su = tf.nn.dropout(self.P_su, self.dropout_keep_prob)
        self.P_qu = tf.nn.dropout(self.P_qu, self.dropout_keep_prob)

        # Looking up item embeddings from data
        self.pos_item = tf.nn.embedding_lookup(self.Q, self.input_ur)
        # Mask marking valid item ratings (entries equal to n_items are padding)
        self.pos_n_ratings = tf.cast(tf.not_equal(self.input_ur, self.n_items),
                                     'float32')
        # Zero out the embeddings of padding items using the mask
        self.pos_item = tf.einsum('ab,abc->abc', self.pos_n_ratings,
                                  self.pos_item)
        # Transferred embeddings for items multiplied with item embeddings
        self.pos_r = tf.einsum('ac,abc->abc', self.P_iu, self.pos_item)
        # Need to multiply with H_i as well
        self.pos_r = tf.einsum('ajk,kl->ajl', self.pos_r, self.H_i)
        self.pos_r = tf.reshape(self.pos_r, [-1, max_items])

        # Social embeddings lookup
        self.pos_friend = tf.nn.embedding_lookup(self.G, self.input_uf)
        # Mask marking valid friends (entries equal to n_users are padding)
        self.pos_n_friends = tf.cast(tf.not_equal(self.input_uf, self.n_users),
                                     'float32')
        # Zero out the embeddings of padding friends using the mask
        self.pos_friend = tf.einsum('ab,abc->abc', self.pos_n_friends,
                                    self.pos_friend)
        # Multiplying with social attentive transferred user embeddings
        self.pos_f = tf.einsum('ac,abc->abc', self.P_su, self.pos_friend)
        # Need to multiply with H_s as well
        self.pos_f = tf.einsum('abc,cd->abd', self.pos_f, self.H_s)
        self.pos_f = tf.reshape(self.pos_f, [-1, max_friends])

        # Questionnaire embeddings lookup
        self.pos_questions = tf.nn.embedding_lookup(self.V, self.input_uq)
        # Mask marking answered questions (entries equal to n_items are padding)
        self.pos_n_questions = tf.cast(
            tf.not_equal(self.input_uq, self.n_items),
            'float32')  # TODO: should this use the number of questions instead?
        # Zero out the embeddings of padding questions using the mask
        self.pos_questions = tf.einsum('ab,abc->abc', self.pos_n_questions,
                                       self.pos_questions)
        # Multiplying with question attentive transferred user embeddings
        self.pos_q = tf.einsum('ac,abc->abc', self.P_qu, self.pos_questions)
        # Need to multiply with H_q as well
        self.pos_q = tf.einsum('abc,cd->abd', self.pos_q, self.H_q)
        self.pos_q = tf.reshape(self.pos_q, [-1, self.max_questions])
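The 'ab,abc->abc' einsum used throughout this example is an embedding-masking idiom: multiplying a [batch, len] 0/1 mask into a [batch, len, dim] lookup zeroes the rows that came from padding IDs. A sketch with toy shapes (all names illustrative):

import tensorflow as tf

n_items = 5
ids = tf.constant([[0, 2, n_items]])                    # last entry is padding
emb = tf.random.normal([n_items + 1, 4])                # embedding table
looked_up = tf.nn.embedding_lookup(emb, ids)            # [1, 3, 4]
keep = tf.cast(tf.not_equal(ids, n_items), tf.float32)  # [1, 3] mask
masked = tf.einsum('ab,abc->abc', keep, looked_up)      # padding row is now zeros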
Example #30
0
def _loss_function(conf_gt, conf_logits, reg_gt, reg_logits, config):
    """
    Creates the PPN loss function.

    Returns (conf_loss, point_loss)

    conf_gt:
        Ground truth confidence, i.e. 1 for close anchors, 0 for anchors
        that are too far off and -1 for anchors to be ignored. Must have
        shape (?, fh, fw, k).
    conf_logits:
        PPN confidence output, must have shape (?, fh, fw, k).
    reg_gt:
        Ground truth point offsets, need only have valid values for the
        anchors with conf_gt of 1. Must have shape (?, fh, fw, 2k).
    reg_logits:
        PPN anchor offset output, must have shape (?, fh, fw, 2k).
    config:
        The configuration dictionary. See ppn.config.ppn_config.
    """
    import tensorflow.compat.v1 as tf

    # mask out the invalid anchors:
    #     only penalize confidence of valid (i.e. not ignored) anchors
    #     only penalize points of positive anchors
    valid_mask = tf.stop_gradient(tf.not_equal(conf_gt, -1))
    pos_mask = tf.stop_gradient(tf.equal(conf_gt, 1))
    num_valid = tf.stop_gradient(tf.count_nonzero(valid_mask, dtype=tf.int32))
    num_pos = tf.stop_gradient(tf.count_nonzero(pos_mask, dtype=tf.int32))
    valid_conf_gt = tf.boolean_mask(conf_gt, valid_mask)
    valid_conf_logits = tf.boolean_mask(conf_logits, valid_mask)
    pos_reg_gt = tf.boolean_mask(reg_gt, pos_mask)
    pos_reg_logits = tf.boolean_mask(reg_logits, pos_mask)

    if config['loss_function'] == 'crossentropy':
        # get the confidence loss using sigmoidal cross entropy
        conf_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=tf.cast(valid_conf_gt, tf.float32),
            logits=valid_conf_logits)
    else:
        # get the confidence loss using focal loss
        conf_loss = _binary_focal_loss_with_logits(
            labels=tf.cast(valid_conf_gt, tf.float32),
            logits=valid_conf_logits,
            gamma=config['focal_gamma'],
            pos_weight=config['focal_pos_weight'])
        if config['focal_normalized']:
            # normalize by the number of valid anchors
            conf_loss = conf_loss / tf.cast(num_valid, tf.float32)
    conf_loss = tf.reduce_sum(conf_loss)

    # get the point loss using MSE
    point_loss = tf.losses.mean_squared_error(
        labels=pos_reg_gt,
        predictions=pos_reg_logits,
        reduction=tf.losses.Reduction.SUM)

    # zero out the losses if there were no valid points
    conf_loss = tf.where(tf.equal(num_valid, 0), 0.0, conf_loss,
                         name='conf_loss')
    point_loss = tf.where(tf.equal(num_pos, 0), 0.0, point_loss,
                          name='point_loss')

    # normalize losses to contribute equally and add
    N_conf, N_reg = config['N_conf'], config['N_reg']
    return ((1.0/N_conf) * conf_loss, (1.0/N_reg) * point_loss)
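_binary_focal_loss_with_logits is referenced above but not shown. For completeness, here is a minimal sketch of one common formulation (Lin et al., 2017), written against the same arguments, under the assumption that pos_weight scales the positive term the way tf.nn.weighted_cross_entropy_with_logits does; this is not the original implementation.

import tensorflow.compat.v1 as tf

def _binary_focal_loss_with_logits(labels, logits, gamma, pos_weight):
    # labels: float32 in {0, 1}; logits: same shape as labels (assumed).
    p = tf.sigmoid(logits)
    # Per-element cross entropy with the positive term scaled by pos_weight.
    ce = tf.nn.weighted_cross_entropy_with_logits(
        labels=labels, logits=logits, pos_weight=pos_weight)
    # Focal modulation: p_t is the probability of the true class, so easy
    # (high-confidence) examples are down-weighted by (1 - p_t) ** gamma.
    p_t = labels * p + (1.0 - labels) * (1.0 - p)
    return tf.pow(1.0 - p_t, gamma) * ce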