def resize_and_crop_boxes(self):
    """Resizes boxes and crops them to the output dimension."""
    boxlist = preprocessor.box_list.BoxList(self._boxes)
    # boxlist is in range of [0, 1], so here we pass the scaled height/width
    # instead of just the scale.
    boxes = preprocessor.box_list_scale(boxlist, self._scaled_height,
                                        self._scaled_width).get()
    # Adjust box coordinates based on the crop offset.
    box_offset = tf.stack([
        self._crop_offset_y,
        self._crop_offset_x,
        self._crop_offset_y,
        self._crop_offset_x,
    ])
    boxes -= tf.cast(tf.reshape(box_offset, [1, 4]), tf.float32)
    # Clip the boxes.
    boxes = self.clip_boxes(boxes)
    # Filter out ground truth boxes that are illegal (zero area).
    indices = tf.where(
        tf.not_equal(
            (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]), 0))
    boxes = tf.gather_nd(boxes, indices)
    classes = tf.gather_nd(self._classes, indices)
    return boxes, classes
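# A minimal standalone sketch (not part of the original class) of the
# degenerate-box filter above: boxes whose height * width is zero after
# cropping are dropped together with their class labels. Assumes TF1-style
# graph code like the rest of this file.
import tensorflow.compat.v1 as tf

boxes = tf.constant([[0., 0., 10., 10.],
                     [5., 5., 5., 20.]])   # second box has zero height
classes = tf.constant([1, 2])
areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
keep = tf.where(tf.not_equal(areas, 0))
boxes_kept = tf.gather_nd(boxes, keep)      # -> [[0., 0., 10., 10.]]
classes_kept = tf.gather_nd(classes, keep)  # -> [1]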
def _stitch(features):
    """Stitches features on the first dimension."""
    full_mask = tf.greater(features['task'], 1)
    step_mask = tf.reduce_any(full_mask, axis=-1)
    step_mask_exclude_last = tf.pad(step_mask, [[0, 0], [0, 1]],
                                    constant_values=False)[:, 1:]
    num_sequences = common_layers.shape_list(features['task'])[0]
    num_steps = common_layers.shape_list(features['task'])[1]
    connectors = tf.constant(PADDED_CONCATENATORS)
    # Select connectors.
    connector_indices = tf.random.uniform([num_sequences * num_steps],
                                          minval=0,
                                          maxval=len(PADDED_CONCATENATORS),
                                          dtype=tf.int32)
    selected_connectors = tf.reshape(
        tf.gather(connectors, connector_indices),
        [num_sequences, num_steps, len(PADDED_CONCATENATORS[0])])
    selected_connectors = tf.multiply(
        selected_connectors,
        tf.expand_dims(tf.to_int32(step_mask_exclude_last), 2),
        name='connector_mask')
    features['task'] = tf.concat([features['task'], selected_connectors],
                                 axis=-1)
    ref_offsets = tf.expand_dims(
        tf.cumsum(
            tf.reduce_sum(tf.to_int32(tf.greater(features['task'], 1)), -1),
            exclusive=True, axis=-1), 2)
    features['task'] = tf.reshape(features['task'], [num_sequences, -1])
    full_mask = tf.greater(features['task'], 1)
    full_mask_int = tf.to_int32(full_mask)
    indices = tf.where(
        tf.sequence_mask(lengths=tf.reduce_sum(full_mask_int, -1)))
    values = tf.boolean_mask(tf.reshape(features['task'], [-1]),
                             tf.reshape(full_mask, [-1]))
    sparse_task = tf.sparse.SparseTensor(
        indices=indices, values=values,
        dense_shape=tf.to_int64(tf.shape(features['task'])))
    # Stitch task and raw_task.
    stitched_features = {}
    stitched_features['task'] = tf.sparse_tensor_to_dense(sparse_task)
    max_len = tf.reduce_max(
        tf.reduce_sum(tf.to_int32(tf.greater(stitched_features['task'], 1)),
                      -1))
    stitched_features['task'] = stitched_features['task'][:, :max_len]
    if 'raw_task' in features:
        connector_strs = tf.reshape(
            tf.gather(tf.constant(CONCATENATORS_STR), connector_indices),
            [num_sequences, num_steps])
        masked_connector_strs = tf.where(
            step_mask_exclude_last, connector_strs,
            tf.fill(tf.shape(connector_strs), ''))
        stitched_features['raw_task'] = tf.strings.reduce_join(
            tf.strings.reduce_join(
                tf.concat([
                    tf.expand_dims(features['raw_task'], 2),
                    tf.expand_dims(masked_connector_strs, 2)
                ], axis=2), axis=-1), -1)
    # Stitch screen sequences.
    action_lengths = tf.reduce_sum(
        tf.to_int32(
            tf.greater(features['verb_refs'][:, :, 0, 1],
                       features['verb_refs'][:, :, 0, 0])), -1)
    max_action_length = tf.reduce_max(action_lengths)

    def _pad(tensor, padding_value=0):
        shape_list = common_layers.shape_list(tensor)
        assert len(shape_list) >= 2
        padding_list = [[0, 0], [0, 1]] + [[0, 0]] * (len(shape_list) - 2)
        return tf.pad(tensor[:, :max_action_length], padding_list,
                      constant_values=padding_value)

    for key in features.keys():
        if key.endswith('_refs'):
            features[key] = tf.squeeze(features[key], 2)
            ref_mask = tf.expand_dims(
                tf.to_int32(
                    tf.not_equal(features[key][:, :, 0],
                                 features[key][:, :, 1])), 2)
            stitched_features[key] = tf.multiply(
                (features[key] + ref_offsets), ref_mask, name='ref_mask')
            stitched_features[key] = _pad(stitched_features[key])
        elif key in [
                'verbs', 'objects', 'consumed', 'obj_dom_pos', 'obj_text',
                'obj_type', 'obj_clickable', 'obj_screen_pos', 'verb_refs',
                'obj_refs', 'input_refs', 'obj_dom_dist'
        ]:
            features[key] = tf.squeeze(features[key], 2)
            stitched_features[key] = features[key]
            stitched_features[key] = _pad(
                stitched_features[key],
                padding_value=-1 if key == 'obj_type' else 0)
        elif key not in ['task', 'raw_task']:
            stitched_features[key] = features[key][:, 0]

    # Append eos to 'task'.
    stitched_features['task'] = tf.pad(stitched_features['task'],
                                       [[0, 0], [0, 1]])
    task_mask = tf.to_int32(tf.greater(stitched_features['task'], 1))
    task_eos_mask = tf.pad(task_mask, [[0, 0], [1, 0]],
                           constant_values=1)[:, :-1]
    stitched_features['task'] = (stitched_features['task'] +
                                 (task_eos_mask - task_mask))
    # Append eos to 'verbs'.
    verb_mask = tf.to_int32(tf.greater(stitched_features['verbs'], 1))
    verb_eos_mask = tf.pad(verb_mask, [[0, 0], [1, 0]],
                           constant_values=1)[:, :-1]
    verb_eos = verb_eos_mask - verb_mask
    stitched_features['verbs'] = stitched_features['verbs'] + verb_eos
    # Append last step refs to 'verb_refs'.
    task_lengths = tf.where(tf.equal(stitched_features['task'], 1))[:, 1]
    eos_pos = tf.to_int32(tf.stack([task_lengths, task_lengths + 1], axis=1))
    action_mask = tf.to_int32(
        tf.sequence_mask(action_lengths, max_action_length + 1))
    action_and_eos_mask = tf.pad(action_mask, [[0, 0], [1, 0]],
                                 constant_values=1)[:, :-1]
    verb_ref_eos = action_and_eos_mask - action_mask
    eos_refs = tf.multiply(
        tf.tile(tf.expand_dims(eos_pos, 1), [1, max_action_length + 1, 1]),
        tf.expand_dims(verb_ref_eos, 2), name='verb_ref_eos')
    stitched_features['verb_refs'] += eos_refs
    return stitched_features
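# A small standalone sketch (assumption: not from the original file) of the
# shifted-mask EOS trick used above: padding the nonpadding mask with a
# leading 1 and dropping the last column leaves a 1 exactly at the first
# padding position, where the EOS token (id 1) is then written.
import tensorflow.compat.v1 as tf

task = tf.constant([[5, 7, 0, 0],
                    [9, 4, 6, 0]])
mask = tf.to_int32(tf.greater(task, 1))
eos_mask = tf.pad(mask, [[0, 0], [1, 0]], constant_values=1)[:, :-1]
task_with_eos = task + (eos_mask - mask)
# -> [[5, 7, 1, 0], [9, 4, 6, 1]]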
def create_training_ops(
        self,
        phi_all,
        values,
        target_values,
        advantages,
        deltas_training,
        delta_sums_training,
        pieces_training,
        old_probs,
        params,
):
    clip_param = params["clipping_parameter"]
    c1 = params["value_loss"]
    c2 = params["policy_loss"]
    c3 = params["entropy_loss"]
    c4 = params["impossibility_loss"]
    e = 10**-6
    # current phi(a|s)
    p_mask = tf.reshape(tf.one_hot(pieces_training[:, :], self.n_pieces),
                        (-1, 1, 1, self.n_pieces), name='p_mask')
    values = tf.reduce_sum(values * p_mask, axis=[2, 3])
    phi = tf.reduce_sum(phi_all * p_mask, axis=3, keepdims=True)
    delta_phi = phi * tf.cast(deltas_training, tf.float32)
    delta_sum_phi = phi * tf.cast(delta_sums_training, tf.float32)
    probability = ((tf.reduce_sum(delta_phi, axis=[1, 2]) + e) /
                   (tf.reduce_sum(delta_sum_phi, axis=[1, 2]) + e))
    # probability ratio
    r = tf.maximum(probability, e) / tf.maximum(old_probs, e)
    clipped_r = tf.clip_by_value(r, 1 - clip_param, 1 + clip_param)
    r_saturation = tf.reduce_mean(
        tf.cast(tf.not_equal(r, clipped_r), tf.float32))
    advnorm = adv_normalizer(0.01, safety=2.0, clip_val=4.0)
    if self.settings["compress_advantages"]:
        advantages = advnorm(advantages)
    policy_loss = tf.minimum(r * advantages, clipped_r * advantages)
    # impossibility loss
    impossibility_loss_tf = phi * (
        1 - tf.minimum(1.0, tf.cast(delta_sums_training, tf.float32)))
    # entropy
    entropy_bonus = action_entropy = N.action_entropy(
        delta_sum_phi /
        tf.reduce_sum(tf.cast(delta_sums_training, tf.float32) + e,
                      axis=[1, 2, 3], keepdims=True) + e)
    # tally up
    self.value_loss_tf = c1 * tf.losses.mean_squared_error(
        values, target_values)  # reduce loss
    self.policy_loss_tf = -c2 * tf.reduce_mean(
        policy_loss)  # increase expected advantages
    self.entropy_loss_tf = -c3 * tf.reduce_mean(
        entropy_bonus)  # increase entropy
    self.impossibility_loss_tf = c4 * tf.reduce_mean(impossibility_loss_tf)
    self.regularizer_tf = self.settings["nn_regularizer"] * tf.add_n(
        [tf.nn.l2_loss(v) for v in self.main_net.variables])
    self.loss_tf = (self.value_loss_tf + self.policy_loss_tf +
                    self.impossibility_loss_tf + self.entropy_loss_tf +
                    self.regularizer_tf)
    training_ops = self.settings["optimizer"](
        learning_rate=params['lr']).minimize(self.loss_tf)
    # Stats: we like stats.
    self.output_as_stats(action_entropy, name='entropy')
    self.output_as_stats(entropy_bonus, name='entropy_bonus', only_mean=True)
    self.output_as_stats(values, name='values')
    self.output_as_stats(target_values, name='target_values')
    self.output_as_stats(r_saturation, name='clip_saturation',
                         only_mean=True)
    self.output_as_stats(advnorm.a_mean, name='advantage_compressor',
                         only_mean=True)
    self.output_as_stats(advnorm.a_max, name='advantage_compressor_max',
                         only_mean=True)
    self.output_as_stats(advnorm.a_saturation,
                         name='advantage_compressor_saturation',
                         only_mean=True)
    self.output_as_stats(self.loss_tf, name='tot_loss', only_mean=True)
    self.output_as_stats(self.value_loss_tf, name='value_loss',
                         only_mean=True)
    self.output_as_stats(-self.policy_loss_tf, name='policy_loss',
                         only_mean=True)
    self.output_as_stats(-self.entropy_loss_tf, name='entropy_loss',
                         only_mean=True)
    self.output_as_stats(self.impossibility_loss_tf,
                         name='impossibility_loss', only_mean=True)
    self.output_as_stats(self.regularizer_tf, name='reg_loss',
                         only_mean=True)
    self.output_as_stats(params["entropy_loss"],
                         name='params/entropy_loss_weight', only_mean=True)
    for param_name in params:
        self.output_as_stats(params[param_name],
                             name='params/' + param_name, only_mean=True)
    return [training_ops, advnorm.update_op]
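# A minimal standalone sketch (assumption: not from the original class) of
# the PPO-style clipped surrogate objective used above: take the pessimistic
# (elementwise minimum) of the unclipped and clipped ratio-weighted
# advantages.
import tensorflow.compat.v1 as tf

advantages = tf.constant([1.0, -2.0])
probability = tf.constant([0.30, 0.10])
old_probs = tf.constant([0.25, 0.25])
clip_param = 0.2
r = probability / old_probs
clipped_r = tf.clip_by_value(r, 1 - clip_param, 1 + clip_param)
policy_objective = tf.minimum(r * advantages, clipped_r * advantages)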
def build_bert_inputs(example):
    """Converts example <Tensor [30, 70]> into bert inputs."""
    k_size = FLAGS.k_size

    CLS_ID = tf.constant([101], dtype=tf.int64)  # pylint: disable=invalid-name
    SEP_ID = tf.constant([102], dtype=tf.int64)  # pylint: disable=invalid-name
    max_len = tf.constant([FLAGS.max_para_length])
    context_size = tf.constant([FLAGS.context_size])

    intermediate_examples_tensor = tf.reduce_sum(tf.abs(example), 1)
    examples_zero_vector = tf.zeros(shape=(1, 1), dtype=tf.int64)
    examples_bool_mask = tf.squeeze(
        tf.not_equal(intermediate_examples_tensor, examples_zero_vector))
    paragraph_len = tf.reduce_sum(tf.cast(examples_bool_mask, tf.int32))

    start = tf.random.uniform([1],
                              0,
                              tf.reshape(paragraph_len, []) -
                              tf.reshape(context_size, []) + 1,
                              dtype=tf.int32)

    # Slice the document into before, context, and after.
    # Discard the zero padding.
    sizes = tf.squeeze(
        tf.concat([[
            start, context_size, paragraph_len - context_size - start,
            max_len - paragraph_len
        ]], 0))
    before, context, after, _ = tf.split(example, sizes, axis=0)

    # Gather the context, removing zero padding at the end of sentences.
    non_zeros = tf.where(tf.not_equal(context, tf.zeros_like(context)))
    context_gathered = tf.gather_nd(context, non_zeros)

    # Flip `before` so we select the k_size sentences closest to the target.
    before = tf.reverse(before, axis=[0])

    # Pad both to longer than needed.
    paddings = tf.constant([[0, 8], [0, 0]])
    before = tf.pad(before, paddings)
    after = tf.pad(after, paddings)

    # Extend targets to 3 sentences.
    before_minus_one = before[1:][:k_size]
    before_minus_two = before[2:][:k_size]
    after_plus_one = after[1:][:k_size]
    after_plus_two = after[2:][:k_size]
    before = before[:k_size]
    after = after[:k_size]
    before = tf.concat([before_minus_two, before_minus_one, before], axis=1)
    after = tf.concat([after, after_plus_one, after_plus_two], axis=1)

    ###########################################################################

    # These sentences surround the target. Some are padding.
    targets = tf.concat([before, after], axis=0)

    # Remove the padding from the surrounding sentences, e.g. if the context
    # starts at the beginning of the paragraph, `before` is all padding.
    intermediate_tensor = tf.reduce_sum(tf.abs(targets), 1)
    zero_vector = tf.zeros(shape=(1, 1), dtype=tf.int64)
    bool_mask = tf.squeeze(tf.not_equal(intermediate_tensor, zero_vector))
    bool_mask.set_shape([None])
    targets = tf.boolean_mask(targets, bool_mask)

    # Randomly select k_size targets.
    # We will also select the label_types for each selected target.
    indices = tf.range(0, limit=tf.shape(targets)[0], dtype=tf.int32)
    shuffled_indices = tf.random.shuffle(indices)[:k_size]
    targets = tf.gather(targets, shuffled_indices)

    if k_size == 4:
        full_labels = tf.concat([tf.range(3, -1, -1), tf.range(4, 8)], axis=0)
    elif k_size == 3:
        full_labels = tf.concat([tf.range(2, -1, -1), tf.range(3, 6)], axis=0)
    elif k_size == 2:
        full_labels = tf.concat([tf.range(1, -1, -1), tf.range(2, 4)], axis=0)
    elif k_size == 1:
        full_labels = tf.concat([tf.range(0, -1, -1), tf.range(1, 2)], axis=0)
    label_types = tf.boolean_mask(full_labels, bool_mask)
    label_types = tf.gather(label_types, shuffled_indices)

    # Create inputs.
    bert_inputs = []
    input_masks = []
    segment_ids = []

    # Make the context input.
    ctx_segment_id = tf.concat([
        tf.zeros_like(CLS_ID, dtype=tf.int64),
        tf.zeros_like(context_gathered),
        tf.zeros_like(SEP_ID, dtype=tf.int64)
    ], axis=0)
    ctx_segment_id = pad_and_cut(ctx_segment_id, FLAGS.max_seq_length)
    segment_ids.append(ctx_segment_id)

    new_ctx_input = tf.concat([CLS_ID, context_gathered, SEP_ID], axis=0)
    ctx_input_mask = tf.ones_like(new_ctx_input)
    ctx_input_mask = pad_and_cut(ctx_input_mask, FLAGS.max_seq_length)
    input_masks.append(ctx_input_mask)
    padded_new_ctx_input = pad_and_cut(new_ctx_input, FLAGS.max_seq_length)
    bert_inputs.append(padded_new_ctx_input)

    for i in range(k_size):
        target_non_zero = tf.where(
            tf.not_equal(targets[i], tf.zeros_like(targets[i])))
        targets_stripped = tf.gather_nd(targets[i], target_non_zero)
        if FLAGS.include_context:
            segment_id = tf.concat([
                tf.zeros_like(CLS_ID, dtype=tf.int64),
                tf.zeros_like(context_gathered),
                tf.zeros_like(SEP_ID, dtype=tf.int64),
                tf.ones_like(targets_stripped),
                tf.ones_like(SEP_ID, dtype=tf.int64)
            ], axis=0)
        else:
            segment_id = tf.concat([
                tf.zeros_like(CLS_ID, dtype=tf.int64),
                tf.zeros_like(targets_stripped),
                tf.zeros_like(SEP_ID, dtype=tf.int64)
            ], axis=0)
        segment_id = pad_and_cut(segment_id, FLAGS.max_seq_length)
        segment_ids.append(segment_id)
        if FLAGS.include_context:
            new_input = tf.concat(
                [CLS_ID, context_gathered, SEP_ID, targets_stripped, SEP_ID],
                axis=0)
        else:
            new_input = tf.concat([CLS_ID, targets_stripped, SEP_ID], axis=0)
        input_mask = tf.ones_like(new_input)
        input_mask = pad_and_cut(input_mask, FLAGS.max_seq_length)
        input_masks.append(input_mask)
        padded_new_input = pad_and_cut(new_input, FLAGS.max_seq_length)
        bert_inputs.append(padded_new_input)

    bert_inputs = tf.stack(bert_inputs, axis=0)
    input_masks = tf.stack(input_masks, axis=0)
    segment_ids = tf.stack(segment_ids, axis=0)

    out = Outputs_And_Context(bert_inputs, input_masks, segment_ids,
                              label_types, context_gathered)
    return out
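# A small standalone sketch (assumption: illustrative only) of the
# tf.split-by-sizes pattern above: a paragraph tensor is cut into before /
# context / after / padding pieces whose sizes sum to its first dimension.
import tensorflow.compat.v1 as tf

example = tf.reshape(tf.range(12, dtype=tf.int64), [6, 2])  # 6 "sentences"
sizes = tf.constant([1, 2, 2, 1])  # start, context, rest, zero padding
before, context, after, pad = tf.split(example, sizes, axis=0)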
def detection_loss(cls_outputs, box_outputs, labels, params):
    """Computes total detection loss.

    Computes total detection loss including box and class loss from all
    levels.

    Args:
      cls_outputs: an OrderedDict with keys representing levels and values
        representing logits in [batch_size, height, width, num_anchors].
      box_outputs: an OrderedDict with keys representing levels and values
        representing box regression targets in
        [batch_size, height, width, num_anchors * 4].
      labels: the dictionary returned from the dataloader that includes
        groundtruth targets.
      params: the dictionary including training parameters specified in
        the default_hparams function in this file.

    Returns:
      total_loss: a float tensor representing total loss reduced from
        class and box losses from all levels.
      cls_loss: a float tensor representing total class loss.
      box_loss: a float tensor representing total box regression loss.
      box_iou_loss: a float tensor representing total box iou loss.
    """
    # Sum all positives in a batch for normalization and avoid zero
    # num_positives_sum, which would lead to inf loss during training.
    num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0
    levels = cls_outputs.keys()

    cls_losses = []
    box_losses = []
    box_iou_losses = []
    for level in levels:
        if params['data_format'] == 'channels_first':
            labels['cls_targets_%d' % level] = tf.transpose(
                labels['cls_targets_%d' % level], [0, 3, 1, 2])
            labels['box_targets_%d' % level] = tf.transpose(
                labels['box_targets_%d' % level], [0, 3, 1, 2])
        # One-hot encoding for classification labels.
        cls_targets_at_level = tf.one_hot(labels['cls_targets_%d' % level],
                                          params['num_classes'])
        if params['data_format'] == 'channels_first':
            bs, _, width, height, _ = (
                cls_targets_at_level.get_shape().as_list())
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, -1, width, height])
        else:
            bs, width, height, _, _ = (
                cls_targets_at_level.get_shape().as_list())
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, width, height, -1])
        box_targets_at_level = labels['box_targets_%d' % level]
        cls_loss = _classification_loss(cls_outputs[level],
                                        cls_targets_at_level,
                                        num_positives_sum,
                                        alpha=params['alpha'],
                                        gamma=params['gamma'])
        if params['data_format'] == 'channels_first':
            cls_loss = tf.reshape(
                cls_loss, [bs, -1, width, height, params['num_classes']])
        else:
            cls_loss = tf.reshape(
                cls_loss, [bs, width, height, -1, params['num_classes']])
        cls_loss *= tf.cast(
            tf.expand_dims(
                tf.not_equal(labels['cls_targets_%d' % level], -2), -1),
            tf.float32)
        cls_losses.append(tf.reduce_sum(cls_loss))
        box_losses.append(
            _box_loss(box_outputs[level],
                      box_targets_at_level,
                      num_positives_sum,
                      delta=params['delta']))
        if params['iou_loss_type']:
            box_iou_losses.append(
                _box_iou_loss(box_outputs[level], box_targets_at_level,
                              num_positives_sum, params['iou_loss_type']))

    # Sum per-level losses to total loss.
    cls_loss = tf.add_n(cls_losses)
    box_loss = tf.add_n(box_losses)
    box_iou_loss = tf.add_n(box_iou_losses) if box_iou_losses else 0
    total_loss = (cls_loss + params['box_loss_weight'] * box_loss +
                  params['iou_loss_weight'] * box_iou_loss)
    return total_loss, cls_loss, box_loss, box_iou_loss
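# A minimal standalone sketch (assumption: illustrative only) of the ignore
# mask applied above: anchors whose class target is -2 contribute no
# classification loss.
import tensorflow.compat.v1 as tf

cls_targets = tf.constant([[0, -2, 3]])
per_anchor_loss = tf.constant([[0.5, 0.7, 0.9]])
keep = tf.cast(tf.not_equal(cls_targets, -2), tf.float32)
masked_loss = per_anchor_loss * keep  # -> [[0.5, 0.0, 0.9]]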
def add_distance_loss_to_center(labels, logits, groundtruth_coords):
    """Adds a distance loss function for ClickRegression."""
    weights = tf.to_int32(
        tf.not_equal(
            labels,
            model_input.dataset_descriptors[FLAGS.dataset].ignore_label))
    labels *= weights

    # Use the GT box to get the center if it exists, since less computation
    # is required. Otherwise, calculate it from the label mask.
    if FLAGS.use_groundtruth_box:
        center_x = (groundtruth_coords['xmin'] +
                    groundtruth_coords['xmax']) / 2.0
        center_y = (groundtruth_coords['ymin'] +
                    groundtruth_coords['ymax']) / 2.0
        center = tf.stack([center_y, center_x], axis=1)
    else:
        # Make an array of coordinates (each row contains two coordinates).
        ii, jj = tf.meshgrid(tf.range(FLAGS.image_size),
                             tf.range(FLAGS.image_size),
                             indexing='ij')
        coords = tf.stack([tf.reshape(ii, (-1,)), tf.reshape(jj, (-1,))],
                          axis=-1)
        coords = tf.cast(coords, tf.int32)

        # Rearrange the input into one vector per volume.
        volumes_flat = tf.reshape(
            labels, [-1, FLAGS.image_size * FLAGS.image_size * 1, 1])

        # Compute the total mass for each volume. Add a small offset
        # (ZERO_DIV_OFFSET) to prevent division by 0.
        total_mass = tf.cast(tf.reduce_sum(volumes_flat, axis=1),
                             tf.float32) + ZERO_DIV_OFFSET

        # Compute the center of mass.
        center = tf.cast(tf.reduce_sum(volumes_flat * coords, axis=1),
                         tf.float32) / total_mass
        center = center / FLAGS.image_size

    # Normalize coordinates by the size of the image.
    logits = logits / FLAGS.image_size

    # Calculate the loss based on the distance metric specified.
    # The loss is added later in model_fn by tf.losses.get_total_loss().
    if FLAGS.distance_metric == 'mse':
        tf.losses.mean_squared_error(center, logits)
    elif FLAGS.distance_metric in [
            'euclidean', 'euclidean_sqrt', 'euclidean_iter'
    ]:
        distance_to_center = tf.sqrt(
            tf.reduce_sum(tf.square(logits - center), axis=-1) +
            ZERO_DIV_OFFSET)
        if FLAGS.ratio_box_distance:
            distance_to_box = calc_distance_to_edge(groundtruth_coords,
                                                    logits)
            box_distance_to_center = (tf.to_float(distance_to_center) -
                                      distance_to_box)
            loss = distance_to_center / (box_distance_to_center +
                                         ZERO_DIV_OFFSET)
        else:
            loss = distance_to_center
        if FLAGS.distance_metric == 'euclidean_sqrt':
            loss = tf.sqrt(loss)
        if FLAGS.distance_metric == 'euclidean_iter':
            iter_num = tf.to_float(tf.train.get_or_create_global_step())
            step = (iter_num // FLAGS.euclidean_step) + 1.0
            loss = tf.pow(loss, tf.to_float(1.0 / step))
        tf.losses.compute_weighted_loss(loss)
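# A minimal standalone sketch (assumption: not from the original repo) of the
# center-of-mass trick above: flatten the label mask, weight each pixel
# coordinate by its mask value, and divide by the total mass.
import tensorflow.compat.v1 as tf

size = 4
mask = tf.constant([[0, 0, 0, 0],
                    [0, 1, 1, 0],
                    [0, 1, 1, 0],
                    [0, 0, 0, 0]], dtype=tf.float32)
ii, jj = tf.meshgrid(tf.range(size), tf.range(size), indexing='ij')
coords = tf.cast(tf.stack([tf.reshape(ii, (-1,)), tf.reshape(jj, (-1,))],
                          axis=-1), tf.float32)   # [16, 2]
mask_flat = tf.reshape(mask, [-1, 1])             # [16, 1]
total_mass = tf.reduce_sum(mask_flat) + 1e-5
center = tf.reduce_sum(mask_flat * coords, axis=0) / total_mass
# center evaluates to approximately [1.5, 1.5].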
def map_fn(x):
    """Internal function to map over.

    Consumes a batch of input examples and produces a variable number of
    output examples.

    Args:
      x: a single example

    Returns:
      a dictionary of packed Tensors
    """
    partial = empty_example.copy()
    i = tf.zeros([], dtype=tf.int32)
    first_key, *_ = keys
    dynamic_batch_size = tf.shape(x[first_key])[0]
    outputs = {}
    for k in keys:
        outputs[k] = tf.TensorArray(tf.int32,
                                    size=0,
                                    dynamic_size=True,
                                    element_shape=[length[k]])
        outputs[k + "_position"] = tf.TensorArray(tf.int32,
                                                  size=0,
                                                  dynamic_size=True,
                                                  element_shape=[length[k]])

    def cond_fn(i, partial, outputs):
        del partial, outputs
        return i < dynamic_batch_size

    def body_fn(i, partial, outputs):
        """Body function for while_loop.

        Args:
          i: integer scalar
          partial: dictionary of Tensor (partially-constructed example)
          outputs: dictionary of TensorArray

        Returns:
          A triple containing the new values of the inputs.
        """
        can_append = True
        one_example = {}
        for k in keys:
            val = tf.cast(x[k][i], tf.int32)
            # Strip trailing padding (zeros).
            val = val[:tf.reduce_sum(tf.cast(tf.not_equal(val, 0), tf.int32))]
            one_example[k] = val
        for k in keys:
            can_append = tf.logical_and(
                can_append,
                tf.less_equal(
                    tf.size(partial[k]) + tf.size(one_example[k]),
                    length[k]))

        def false_fn():
            return write_packed_example(partial, outputs)

        def true_fn():
            return partial, outputs

        partial, outputs = tf.cond(can_append, true_fn, false_fn)
        new_partial = {}
        for k in keys:
            new_seq = one_example[k][:length[k]]
            new_seq_len = tf.size(new_seq)
            new_partial[k] = tf.concat([partial[k], new_seq], 0)
            new_partial[k + "_position"] = tf.concat(
                [partial[k + "_position"],
                 tf.range(new_seq_len, dtype=tf.int32)], 0)
        partial = new_partial
        return i + 1, partial, outputs

    i, partial, outputs = tf.while_loop(
        cond_fn,
        body_fn, (i, partial, outputs),
        back_prop=False,
        shape_invariants=(
            tf.TensorShape([]),
            {k: tf.TensorShape([None]) for k in keys_etc},
            {k: tf.TensorShape(None) for k in keys_etc},
        ))
    partial, outputs = write_packed_example(partial, outputs)
    packed = {k: outputs[k].stack() for k in keys_etc}
    for k in keys:
        packed[k + "_segmentation"] = (
            tf.cumsum(
                tf.cast(tf.equal(packed[k + "_position"], 0), tf.int32),
                axis=1) *
            tf.cast(tf.not_equal(packed[k], 0), tf.int32))
    return packed
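# A standalone sketch (assumption: illustrative only) of the segmentation
# computed at the end of map_fn: positions restart at 0 for each packed
# example, so a cumulative sum of (position == 0) numbers the segments, and
# multiplying by the nonpadding mask zeroes out padding.
import tensorflow.compat.v1 as tf

packed_tokens = tf.constant([[11, 12, 13, 21, 22, 0]])
positions = tf.constant([[0, 1, 2, 0, 1, 0]])
segmentation = (
    tf.cumsum(tf.cast(tf.equal(positions, 0), tf.int32), axis=1) *
    tf.cast(tf.not_equal(packed_tokens, 0), tf.int32))
# -> [[1, 1, 1, 2, 2, 0]]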
def call(self,
         input_tensor,
         label_ids,
         positions=None,
         label_weights=None,
         padding_token_id=None,
         mlm_is_entity_mask=None,
         mlm_is_not_entity_mask=None):
    """Gets loss and log probs for the masked LM."""
    if padding_token_id is not None:
        pad_mask = tf.cast(tf.not_equal(label_ids, padding_token_id),
                           tf.float32)
    if label_weights is not None:
        if padding_token_id is not None:
            label_weights *= pad_mask
    else:
        if padding_token_id is not None:
            label_weights = pad_mask
        else:
            label_weights = tf.ones_like(label_ids, tf.float32)

    if positions is not None:
        input_tensor = gather_indexes(input_tensor, positions)
    else:
        input_tensor = tf.reshape(input_tensor, [-1, self.hidden_size])
        input_tensor.set_shape([None, self.hidden_size])

    with tf.variable_scope('cls/predictions'):
        with tf.variable_scope('transform'):
            input_tensor = self.linear_fn(input_tensor)
            input_tensor = self.layer_norm(input_tensor)
        logits = tf.matmul(input_tensor, self.output_weights,
                           transpose_b=True)
        logits = tf.nn.bias_add(logits, self.output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        batch_size = tf.shape(label_ids)[0]
        mlm_labels_per_sample = tf.shape(label_ids)[1]
        label_ids_flattened = tf.reshape(label_ids, [-1])
        label_weights_flattened = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(label_ids_flattened,
                                    depth=self.vocab_size,
                                    dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is
        # too short to have the maximum number of predictions). The
        # `label_weights` tensor has a value of 1.0 for every real
        # prediction and 0.0 for the padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])
        mlm_predictions = tf.argmax(log_probs, axis=-1,
                                    output_type=tf.int32)
        loss = tf.reduce_sum(
            label_weights_flattened * per_example_loss) / (
                tf.reduce_sum(label_weights) + 1e-5)

        def weighted_sum_per_sample(values1, values2, weights):
            weights_per_sample = tf.reduce_sum(weights, 1)
            weights_denominator = weights_per_sample + 1e-5
            return (tf.reduce_sum(values1 * weights, 1) /
                    weights_denominator,
                    tf.reduce_sum(values2 * weights, 1) /
                    weights_denominator, weights_per_sample)

        mlm_loss = tf.reshape(per_example_loss,
                              [batch_size, mlm_labels_per_sample])
        mlm_accuracy = tf.reshape(
            tf.cast(tf.equal(mlm_predictions, label_ids_flattened),
                    tf.float32), [batch_size, mlm_labels_per_sample])
        (mlm_loss_per_sample, mlm_accuracy_per_sample,
         mlm_weight_per_sample) = weighted_sum_per_sample(
             mlm_loss, mlm_accuracy, label_weights)
        if mlm_is_entity_mask is not None:
            (mlm_loss_per_entity_sample, mlm_accuracy_per_entity_sample,
             mlm_weight_per_entity_sample) = weighted_sum_per_sample(
                 mlm_loss, mlm_accuracy,
                 label_weights * mlm_is_entity_mask)
        else:
            mlm_loss_per_entity_sample = None
            mlm_accuracy_per_entity_sample = None
            mlm_weight_per_entity_sample = None
        if mlm_is_not_entity_mask is not None:
            (mlm_loss_per_non_entity_sample,
             mlm_accuracy_per_non_entity_sample,
             mlm_weight_per_non_entity_sample) = weighted_sum_per_sample(
                 mlm_loss, mlm_accuracy,
                 label_weights * mlm_is_not_entity_mask)
        else:
            mlm_loss_per_non_entity_sample = None
            mlm_accuracy_per_non_entity_sample = None
            mlm_weight_per_non_entity_sample = None

    return LanguageModelOutput(
        loss=loss,
        mlm_predictions=mlm_predictions,
        mlm_loss_per_sample=mlm_loss_per_sample,
        mlm_accuracy_per_sample=mlm_accuracy_per_sample,
        mlm_weight_per_sample=mlm_weight_per_sample,
        mlm_loss_per_entity_sample=mlm_loss_per_entity_sample,
        mlm_accuracy_per_entity_sample=mlm_accuracy_per_entity_sample,
        mlm_weight_per_entity_sample=mlm_weight_per_entity_sample,
        mlm_loss_per_non_entity_sample=mlm_loss_per_non_entity_sample,
        mlm_accuracy_per_non_entity_sample=mlm_accuracy_per_non_entity_sample,
        mlm_weight_per_non_entity_sample=mlm_weight_per_non_entity_sample)
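# A small standalone sketch (assumption: illustrative only) of the weighted
# masked-LM reduction above: padded prediction slots carry weight 0.0 and a
# tiny epsilon keeps the division finite when nothing is predicted.
import tensorflow.compat.v1 as tf

per_example_loss = tf.constant([2.0, 4.0, 9.0])
label_weights = tf.constant([1.0, 1.0, 0.0])  # last slot is padding
loss = tf.reduce_sum(label_weights * per_example_loss) / (
    tf.reduce_sum(label_weights) + 1e-5)  # -> ~3.0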
def __init__(self, item_num, args, reuse=None):
    self.args = args
    self.is_training = tf.placeholder(tf.bool, shape=())
    self.input_seq = tf.placeholder(tf.int32, shape=(None, args.maxlen))
    self.pos = tf.placeholder(tf.int32, shape=None)
    self.exemplar_logits = tf.placeholder(tf.float32, shape=(None, None))
    self.exemplar_pos = tf.placeholder(tf.int32, shape=None)
    self.max_item = tf.placeholder(tf.int32, shape=())
    self.lr = tf.placeholder(tf.float32, shape=())
    self.dropout_rate = tf.placeholder(tf.float32, shape=())
    pos = self.pos
    mask = tf.expand_dims(tf.to_float(tf.not_equal(self.input_seq, 0)), -1)

    with tf.variable_scope("SASRec", reuse=reuse):
        # Sequence embedding and item embedding table.
        self.seq, item_emb_table = embedding(self.input_seq,
                                             vocab_size=item_num + 1,
                                             num_units=args.hidden_units,
                                             zero_pad=True,
                                             scale=True,
                                             l2_reg=args.l2_emb,
                                             scope="input_embeddings",
                                             with_t=True,
                                             reuse=reuse)

        # Positional encoding.
        t, pos_emb_table = embedding(
            tf.tile(tf.expand_dims(tf.range(tf.shape(self.input_seq)[1]), 0),
                    [tf.shape(self.input_seq)[0], 1]),
            vocab_size=args.maxlen,
            num_units=args.hidden_units,
            zero_pad=False,
            scale=False,
            l2_reg=args.l2_emb,
            scope="dec_pos",
            reuse=reuse,
            with_t=True)
        self.seq += t

        # Dropout.
        self.seq = tf.layers.dropout(
            self.seq,
            rate=self.dropout_rate,
            training=tf.convert_to_tensor(self.is_training),
            seed=args.random_seed)
        self.seq *= mask

        # Build blocks.
        for i in range(args.num_blocks):
            with tf.variable_scope("num_blocks_%d" % i):
                # Self-attention.
                self.seq = multihead_attention(
                    queries=normalize(self.seq),
                    keys=self.seq,
                    num_units=args.hidden_units,
                    num_heads=args.num_heads,
                    dropout_rate=self.dropout_rate,
                    seed=args.random_seed,
                    is_training=self.is_training,
                    causality=True,
                    scope="self_attention")

                # Feed forward.
                self.seq = feedforward(
                    normalize(self.seq),
                    num_units=[args.hidden_units, args.hidden_units],
                    dropout_rate=self.dropout_rate,
                    is_training=self.is_training,
                    seed=args.random_seed)
                self.seq *= mask

        self.seq = normalize(self.seq)

    # Find the representation (hidden state of the last position).
    self.rep = self.seq[:, -1, :]

    # Define loss.
    seq_emb = tf.reshape(self.rep,
                         [tf.shape(self.input_seq)[0], args.hidden_units])
    indices = pos - 1
    self.labels = tf.one_hot(indices, self.max_item)
    item_emb = tf.nn.embedding_lookup(item_emb_table,
                                      tf.range(1, self.max_item + 1))
    self.logits = tf.matmul(seq_emb, tf.transpose(item_emb))
    self.loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=self.labels,
                                                logits=self.logits))
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

    # Prediction.
    self.test_item = tf.placeholder(tf.int32, shape=None)
    self.test_item_emb = tf.nn.embedding_lookup(item_emb_table,
                                                self.test_item)
    self.test_logits = tf.matmul(seq_emb, tf.transpose(self.test_item_emb))
    self.test_logits = tf.reshape(
        self.test_logits,
        [tf.shape(self.input_seq)[0], tf.shape(self.test_item)[0]])
    self.pred_last = tf.argsort(tf.argsort(-self.test_logits))
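# A standalone sketch (assumption: illustrative only) of the double-argsort
# trick above: argsort of argsort turns scores into per-row ranks, so
# pred_last[i, j] is the rank of item j for sequence i (0 = highest score).
import tensorflow.compat.v1 as tf

logits = tf.constant([[0.1, 0.9, 0.5]])
ranks = tf.argsort(tf.argsort(-logits))
# -> [[2, 0, 1]]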
def c(i, j, k):
    # True when the XOR of the two comparisons equals (k >= i + j);
    # tf.not_equal on booleans is an elementwise XOR.
    return tf.equal(
        tf.not_equal(tf.less(i + j, 10), tf.less(j * k, 100)),
        tf.greater_equal(k, i + j))
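# A hedged usage sketch (assumption: `c` is meant as a loop predicate, e.g.
# for tf.while_loop, which requires a scalar boolean condition).
import tensorflow.compat.v1 as tf

def b(i, j, k):
    # Arbitrary body for illustration only.
    return i + 1, j + 2, k + 3

i0 = tf.constant(0)
j0 = tf.constant(1)
k0 = tf.constant(2)
result = tf.while_loop(c, b, loop_vars=[i0, j0, k0])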
def body(self,
         features,
         decode_step=None,
         cache=None,
         decoding_stats=None,
         add_summary=True):
    encoder_output = None
    extra_losses = []
    padding_bias = None
    if not self.hparams.fast_decode:
        decode_step = None
    if "inputs" in features:
        inputs = features["inputs"]
        # Remove the last two dimensions, which are always 1.
        inputs = tf.reshape(
            inputs, utils.shape_list(inputs)[:2] + [self.hidden_size])
        # Padding bias is only used for seq2seq models.
        padding_bias = utils.embedding_to_padding(inputs)
        # Mask random positions.
        shape = utils.shape_list(inputs)
        if self.hparams.input_dropout:
            inputs = tf.where(
                tf.random.uniform(shape) < self.hparams.input_dropout,
                tf.zeros_like(inputs), inputs)
        if self.hparams.add_timing_signal:
            inputs += utils.get_timing_signal_1d(self.hparams.max_length,
                                                 self.hidden_size)
        if cache is not None and -1 in cache:
            encoder_output = cache[-1]
        else:
            encoder_output = utils.transformer_encoder_layers(
                inputs=inputs,
                num_layers=self.num_encoder_layers,
                hparams=self.hparams,
                losses=extra_losses,
                name="encoder",
                token_bias=features.get("token_bias_inputs"),
                padding_bias=padding_bias)
        if cache is not None and -1 not in cache:
            cache[-1] = encoder_output

    targets = tf.to_int32(features["targets"])
    # Remove the last two dimensions, which are always 1.
    targets = tf.reshape(targets, utils.shape_list(targets)[:2])
    # Clamp targets to max_target_length.
    targets = targets[:, :self.hparams.max_target_length]
    if self.is_decode:
        targets = self.process_partial_targets_decoding(targets)
    decoder_input = self.prepare_decoder(targets)

    decoder_output = utils.transformer_decoder_layers(
        inputs=decoder_input,
        num_layers=self.num_decoder_layers,
        hparams=self.hparams,
        encoder_output=encoder_output,
        decode_step=decode_step,
        losses=extra_losses,
        cache=cache,
        name="decoder",
        decoding_stats=decoding_stats,
        token_bias_inputs=features.get("token_bias_inputs"),
        token_bias_targets=features.get("token_bias_targets"),
        padding_bias=padding_bias)
    logits = self.produce_output(decoder_output)

    # Return logits as-is in decoding mode.
    if self.is_decode:
        return logits

    # Add the cross-entropy loss.
    one_hot_targets = tf.one_hot(tf.cast(targets, dtype=tf.int32),
                                 self.vocab_size)
    x_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=one_hot_targets, logits=logits)
    weights = tf.to_float(tf.not_equal(targets, 0))
    loss = tf.reduce_sum(x_entropy * weights) / tf.reduce_sum(weights)
    if add_summary:
        tf.summary.scalar("losses/weight", tf.reduce_sum(weights))
        tf.summary.scalar("losses/x_entropy",
                          tf.reduce_sum(x_entropy * weights))

    loss_dict = {"training": loss}
    if extra_losses:
        loss_dict["extra_loss"] = tf.add_n(extra_losses)
    # Hack for T2T metrics.
    logits = tf.reshape(
        logits,
        utils.shape_list(logits)[:2] + [1, 1] +
        utils.shape_list(logits)[-1:])
    return logits, loss_dict
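# A standalone sketch (assumption: illustrative only) of the input-dropout
# masking above: positions drawn below the dropout rate are zeroed out.
import tensorflow.compat.v1 as tf

inputs = tf.ones([2, 4, 8])
input_dropout = 0.1
inputs = tf.where(
    tf.random.uniform(tf.shape(inputs)) < input_dropout,
    tf.zeros_like(inputs), inputs)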
def compute_loss(self, y_true, y_pred):
    """Computes multibox loss.

    # Arguments
        y_true: Ground truth targets,
            tensor of shape (?, num_boxes, 4 + num_classes + 8),
            priors in ground truth are fictitious,
            y_true[:, :, -8] has 1 if the prior should be penalized,
            or in other words is assigned to some ground truth box,
            y_true[:, :, -7:] are all 0.
        y_pred: Predicted logits,
            tensor of shape (?, num_boxes, 4 + num_classes + 8).

    # Returns
        loss: Loss for prediction, tensor of shape (?,).
    """
    batch_size = tf.shape(y_true)[0]
    num_boxes = tf.to_float(tf.shape(y_true)[1])

    # Loss for all priors.
    conf_loss = self._softmax_loss(y_true[:, :, 4:-8], y_pred[:, :, 4:-8])
    loc_loss = self._l1_smooth_loss(y_true[:, :, :4], y_pred[:, :, :4])

    # Get the positives loss.
    num_pos = tf.reduce_sum(y_true[:, :, -8], axis=-1)
    pos_loc_loss = tf.reduce_sum(loc_loss * y_true[:, :, -8], axis=1)
    pos_conf_loss = tf.reduce_sum(conf_loss * y_true[:, :, -8], axis=1)

    # Get the negatives loss; we penalize only confidence here.
    num_neg = tf.minimum(self.neg_pos_ratio * num_pos, num_boxes - num_pos)
    pos_num_neg_mask = tf.greater(num_neg, 0)
    has_min = tf.to_float(tf.reduce_any(pos_num_neg_mask))
    num_neg = tf.concat(
        axis=0, values=[num_neg, [(1 - has_min) * self.negatives_for_hard]])
    num_neg_batch = tf.reduce_min(
        tf.boolean_mask(num_neg, tf.greater(num_neg, 0)))
    num_neg_batch = tf.to_int32(num_neg_batch)
    confs_start = 4 + self.background_label_id + 1
    confs_end = confs_start + self.num_classes - 1
    max_confs = tf.reduce_max(y_pred[:, :, confs_start:confs_end], axis=2)
    _, indices = tf.nn.top_k(max_confs * (1 - y_true[:, :, -8]),
                             k=num_neg_batch)
    batch_idx = tf.expand_dims(tf.range(0, batch_size), 1)
    batch_idx = tf.tile(batch_idx, (1, num_neg_batch))
    full_indices = (tf.reshape(batch_idx, [-1]) * tf.to_int32(num_boxes) +
                    tf.reshape(indices, [-1]))
    # full_indices = tf.concat(2, [tf.expand_dims(batch_idx, 2),
    #                              tf.expand_dims(indices, 2)])
    # neg_conf_loss = tf.gather_nd(conf_loss, full_indices)
    neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]), full_indices)
    neg_conf_loss = tf.reshape(neg_conf_loss, [batch_size, num_neg_batch])
    neg_conf_loss = tf.reduce_sum(neg_conf_loss, axis=1)

    # The loss is the sum of positives and negatives.
    total_loss = pos_conf_loss + neg_conf_loss
    total_loss /= (num_pos + tf.to_float(num_neg_batch))
    num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos,
                       tf.ones_like(num_pos))
    total_loss += (self.alpha * pos_loc_loss) / num_pos
    return total_loss
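# A small standalone sketch (assumption: illustrative only) of the flat
# indexing trick above: (row, col) pairs become row * num_cols + col, so a
# single tf.gather on the flattened tensor replaces tf.gather_nd.
import tensorflow.compat.v1 as tf

conf_loss = tf.constant([[0.1, 0.2, 0.3],
                         [0.4, 0.5, 0.6]])
batch_idx = tf.constant([0, 1])
col_idx = tf.constant([2, 0])
flat = batch_idx * 3 + col_idx
picked = tf.gather(tf.reshape(conf_loss, [-1]), flat)  # -> [0.3, 0.4]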
def filter_random_lighting(sequence_dir):
    """Keeps sequences whose name does not start with 'random'."""
    sequence_name = tf.string_split([sequence_dir], '/').values[-1]
    lighting = tf.substr(sequence_name, 0, 6)
    return tf.not_equal(lighting, 'random')
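# A hedged usage sketch (assumption: the predicate is meant for
# tf.data.Dataset.filter over a dataset of sequence directory strings).
import tensorflow.compat.v1 as tf

sequence_dirs = tf.data.Dataset.from_tensor_slices(
    ['/data/random_01', '/data/indoor_01', '/data/random_02'])
filtered = sequence_dirs.filter(filter_random_lighting)
# `filtered` keeps only '/data/indoor_01'.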
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""
    logging.info("*** Model: Params ***")
    for name in sorted(params.keys()):
        logging.info("  %s = %s", name, params[name])
    logging.info("*** Model: Features ***")
    for name in sorted(features.keys()):
        logging.info("  name = %s, shape = %s", name, features[name].shape)

    model = modeling.ReadItTwiceBertModel(
        config=model_config, use_one_hot_embeddings=use_one_hot_embeddings)

    span_prediction_layer = modeling.SpanPredictionHead(
        intermediate_size=model_config.intermediate_size,
        dropout_rate=model_config.hidden_dropout_prob)

    # [batch_size, main_seq_length]
    token_ids = features["token_ids"]
    main_seq_length = tf.shape(token_ids)[1]
    block_ids = features["block_ids"]
    block_pos = features["block_pos"]
    answer_type = features["answer_type"]
    supporting_fact = features["is_supporting_fact"]

    annotation_begins = features.get("entity_annotation_begins")
    annotation_ends = features.get("entity_annotation_ends")
    annotation_labels = features.get("entity_annotation_labels")

    # Do not attend padding tokens.
    # [batch_size, main_seq_length, main_seq_length]
    att_mask = tf.tile(
        tf.expand_dims(tf.not_equal(token_ids, padding_token_id), 1),
        [1, main_seq_length, 1])
    att_mask = tf.cast(att_mask, dtype=tf.int32)

    main_output = model(
        token_ids=token_ids,
        training=(mode == tf.estimator.ModeKeys.TRAIN),
        block_ids=block_ids,
        block_pos=block_pos,
        att_mask=att_mask,
        annotation_begins=annotation_begins,
        annotation_ends=annotation_ends,
        annotation_labels=annotation_labels,
        enable_side_inputs=enable_side_inputs,
        num_replicas_concat=num_replicas_concat,
        cross_block_attention_mode=cross_block_attention_mode)

    span_logits = span_prediction_layer(
        hidden_states=main_output.final_hidden_states,
        token_ids=token_ids,
        padding_token_id=padding_token_id,
        ignore_prefix_length=features["prefix_length"],
        training=(mode == tf.estimator.ModeKeys.TRAIN))

    # The "pooler" converts the encoded sequence tensor of shape
    # [batch_size, seq_length, hidden_size] to a tensor of shape
    # [batch_size, hidden_size]. This is necessary for segment-level
    # (or segment-pair-level) classification tasks where we need a fixed
    # dimensional representation of the segment.
    with tf.variable_scope("pooler"):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token. We assume that this has been pre-trained.
        first_token_tensor = tf.squeeze(
            main_output.final_hidden_states[:, 0:1, :], axis=1)
        pooled_output = tf.layers.dense(
            first_token_tensor,
            model_config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=tf.truncated_normal_initializer(
                stddev=model_config.initializer_range))

    yesno_logits = yesno_model(pooled_output)
    supporting_fact_logits = supporting_fact_model(pooled_output)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        (assignment_map, initialized_variable_names
        ) = checkpoint_utils.get_assignment_map_from_checkpoint(
            tvars, init_checkpoint)
        if use_tpu:

            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint,
                                              assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                     init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        host_inputs = dict()

        span_prediction_loss = losses.BatchSpanCrossEntropyLoss()
        total_loss = 0
        qa_loss = span_prediction_loss(
            logits=span_logits,
            annotation_begins=features["answer_annotation_begins"],
            annotation_ends=features["answer_annotation_ends"],
            annotation_labels=features["answer_annotation_labels"],
            block_ids=block_ids,
            num_replicas=num_replicas_concat,
            eps=1e-5)
        host_inputs["train_metrics/qa_loss"] = tf.expand_dims(qa_loss, 0)
        total_loss += qa_loss

        # example_mask = tf.cast(tf.not_equal(block_ids, 0), tf.float32)
        # yesno_loss = compute_pooled_loss(yesno_logits, answer_type, 3,
        #                                  example_mask)
        # supporting_fact_loss = compute_supporting_facts_loss(
        #     supporting_fact_logits, supporting_fact, example_mask)
        hotpot_qa_loss = hotpot_qa_losses.BatchSpanCrossEntropyLoss()
        yesno_loss, supporting_fact_loss = hotpot_qa_loss(
            yesno_logits,
            answer_type,
            supporting_fact_logits,
            supporting_fact,
            block_ids,
            eps=1e-5)
        host_inputs["train_metrics/yesno_loss"] = tf.expand_dims(
            yesno_loss, 0)
        total_loss += yesno_loss
        host_inputs["train_metrics/supporting_fact_loss"] = tf.expand_dims(
            supporting_fact_loss, 0)
        total_loss += supporting_fact_loss

        # Add regularization losses.
        if model.losses:
            total_loss += tf.math.add_n(model.losses)

        train_op = optimization.create_optimizer(
            total_loss, learning_rate, num_train_steps, num_warmup_steps,
            use_tpu, optimizer, poly_power, start_warmup_step,
            learning_rate_schedule, reduce_loss_sum=True)

        host_inputs.update({
            "global_step":
                tf.expand_dims(tf.train.get_or_create_global_step(), 0),
            "train_metrics/loss":
                tf.expand_dims(total_loss, 0),
        })
        host_call = (functools.partial(
            record_summary_host_fn,
            metrics_dir=os.path.join(FLAGS.output_dir, "train_metrics")),
                     host_inputs)

        output_spec = tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn,
            host_call=host_call)
    elif mode == tf.estimator.ModeKeys.PREDICT:
        begin_logits_values, begin_logits_indices = tf.math.top_k(
            span_logits[:, :, 0], k=nbest_logits_for_eval)
        end_logits_values, end_logits_indices = tf.math.top_k(
            span_logits[:, :, 1], k=nbest_logits_for_eval)

        predictions = {
            "block_ids": tf.identity(block_ids),
            "begin_logits_values": begin_logits_values,
            "begin_logits_indices": begin_logits_indices,
            "end_logits_values": end_logits_values,
            "end_logits_indices": end_logits_indices,
            "token_ids": tf.identity(token_ids),
            "answer_type": answer_type,
            "yesno_logits": yesno_logits,
            "supporting_fact_logits": supporting_fact_logits,
            "is_supporting_fact": supporting_fact,
        }
        output_spec = tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    else:
        raise ValueError("Only TRAIN and PREDICT modes are supported: %s" %
                         mode)

    return output_spec
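# A minimal standalone sketch (assumption: illustrative only, TF1 graph mode
# as in the rest of this file) of the first-token "pooler" above.
import tensorflow.compat.v1 as tf

hidden = tf.random.normal([2, 5, 8])                 # [batch, seq, hidden]
first_token = tf.squeeze(hidden[:, 0:1, :], axis=1)  # [batch, hidden]
pooled = tf.layers.dense(first_token, 8, activation=tf.tanh)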
def build_graph(self, image, edgemap):
    image = image - tf.constant([104, 116, 122], dtype='float32')
    image = tf.transpose(image, [0, 3, 1, 2])
    edgemap = tf.expand_dims(edgemap, 3, name='edgemap4d')

    def branch(name, l, up):
        with tf.variable_scope(name):
            l = Conv2D('convfc', l, 1, kernel_size=1,
                       activation=tf.identity, use_bias=True,
                       kernel_initializer=tf.constant_initializer())
            while up != 1:
                l = CaffeBilinearUpSample('upsample{}'.format(up), l, 2)
                up = up // 2
            return l

    with argscope(Conv2D, kernel_size=3, activation=tf.nn.relu), \
            argscope([Conv2D, MaxPooling], data_format='NCHW'):
        l = Conv2D('conv1_1', image, 64)
        l = Conv2D('conv1_2', l, 64)
        b1 = branch('branch1', l, 1)
        l = MaxPooling('pool1', l, 2)

        l = Conv2D('conv2_1', l, 128)
        l = Conv2D('conv2_2', l, 128)
        b2 = branch('branch2', l, 2)
        l = MaxPooling('pool2', l, 2)

        l = Conv2D('conv3_1', l, 256)
        l = Conv2D('conv3_2', l, 256)
        l = Conv2D('conv3_3', l, 256)
        b3 = branch('branch3', l, 4)
        l = MaxPooling('pool3', l, 2)

        l = Conv2D('conv4_1', l, 512)
        l = Conv2D('conv4_2', l, 512)
        l = Conv2D('conv4_3', l, 512)
        b4 = branch('branch4', l, 8)
        l = MaxPooling('pool4', l, 2)

        l = Conv2D('conv5_1', l, 512)
        l = Conv2D('conv5_2', l, 512)
        l = Conv2D('conv5_3', l, 512)
        b5 = branch('branch5', l, 16)

    final_map = Conv2D('convfcweight',
                       tf.concat([b1, b2, b3, b4, b5], 1), 1,
                       kernel_size=1,
                       kernel_initializer=tf.constant_initializer(0.2),
                       use_bias=False,
                       activation=tf.identity)
    costs = []
    for idx, b in enumerate([b1, b2, b3, b4, b5, final_map]):
        b = tf.transpose(b, [0, 2, 3, 1])
        output = tf.nn.sigmoid(b, name='output{}'.format(idx + 1))
        xentropy = class_balanced_sigmoid_cross_entropy(
            b, edgemap, name='xentropy{}'.format(idx + 1))
        costs.append(xentropy)

    # Some magic threshold.
    pred = tf.cast(tf.greater(output, 0.5), tf.int32, name='prediction')
    wrong = tf.cast(tf.not_equal(pred, edgemap), tf.float32)
    wrong = tf.reduce_mean(wrong, name='train_error')

    wd_w = tf.train.exponential_decay(2e-4, get_global_step_var(),
                                      80000, 0.7, True)
    wd_cost = tf.multiply(wd_w, regularize_cost('.*/W', tf.nn.l2_loss),
                          name='wd_cost')
    costs.append(wd_cost)

    add_param_summary(('.*/W', ['histogram']))  # monitor W
    total_cost = tf.add_n(costs, name='cost')
    add_moving_summary(wrong, total_cost, *costs)
    return total_cost
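# A standalone sketch (assumption: illustrative only) of the train-error
# metric above: binarize sigmoid outputs at 0.5 and measure disagreement
# with the ground-truth edge map.
import tensorflow.compat.v1 as tf

output = tf.constant([[0.2, 0.7], [0.6, 0.4]])
edgemap = tf.constant([[0, 1], [1, 1]])
pred = tf.cast(tf.greater(output, 0.5), tf.int32)
wrong = tf.reduce_mean(tf.cast(tf.not_equal(pred, edgemap), tf.float32))
# -> 0.25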
def parse_train_data(self, data):
    """Parses data for ShapeMask training."""
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    masks = data['groundtruth_instance_masks']
    is_crowds = data['groundtruth_is_crowd']

    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training and self._is_training:
        num_groundtruths = tf.shape(classes)[0]
        with tf.control_dependencies([num_groundtruths, is_crowds]):
            indices = tf.cond(
                tf.greater(tf.size(is_crowds), 0),
                lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
        classes = tf.gather(classes, indices)
        boxes = tf.gather(boxes, indices)
        masks = tf.gather(masks, indices)

    # If not using category, makes all categories have id = 0.
    if not self._use_category:
        classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

    image = self.get_normalized_image(data)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
        image, boxes, masks = input_utils.random_horizontal_flip(
            image, boxes, masks)

    # Converts boxes from normalized coordinates to pixel coordinates.
    image_shape = tf.shape(image)[0:2]
    boxes = box_utils.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        self._output_size,
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    self._train_image_scale = image_info[2, :]
    self._train_offset = image_info[3, :]

    # Resizes and crops boxes and masks.
    boxes = input_utils.resize_and_crop_boxes(boxes,
                                              self._train_image_scale,
                                              image_info[1, :],
                                              self._train_offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    masks = tf.gather(masks, indices)

    # Assigns anchors.
    input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                 self._num_scales, self._aspect_ratios,
                                 self._anchor_size, self._output_size)
    anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                          self._match_threshold,
                                          self._unmatched_threshold)
    (cls_targets, box_targets,
     num_positives) = anchor_labeler.label_anchors(
         boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

    # Samples groundtruth masks/boxes/classes for the mask branch.
    num_masks = tf.shape(masks)[0]
    mask_shape = tf.shape(masks)[1:3]

    # Pads sampled boxes/masks/classes to a constant batch size.
    padded_boxes = input_utils.pad_to_fixed_size(boxes,
                                                 self._num_sampled_masks)
    padded_classes = input_utils.pad_to_fixed_size(classes,
                                                   self._num_sampled_masks)
    padded_masks = input_utils.pad_to_fixed_size(masks,
                                                 self._num_sampled_masks)

    # Randomly samples groundtruth masks for mask branch training. For an
    # image without groundtruth masks, it samples the dummy padded tensors.
    rand_indices = tf.random.shuffle(
        tf.range(tf.maximum(num_masks, self._num_sampled_masks)))
    rand_indices = tf.mod(rand_indices, tf.maximum(num_masks, 1))
    rand_indices = rand_indices[0:self._num_sampled_masks]
    rand_indices = tf.reshape(rand_indices, [self._num_sampled_masks])

    sampled_boxes = tf.gather(padded_boxes, rand_indices)
    sampled_classes = tf.gather(padded_classes, rand_indices)
    sampled_masks = tf.gather(padded_masks, rand_indices)

    # Jitters the sampled boxes to mimic noisy detections.
    sampled_boxes = box_utils.jitter_boxes(
        sampled_boxes, noise_scale=self._box_jitter_scale)
    sampled_boxes = box_utils.clip_boxes(sampled_boxes, self._output_size)

    # Computes mask targets in the feature crop. A feature crop fully
    # contains a sampled box.
    mask_outer_boxes = box_utils.compute_outer_boxes(
        sampled_boxes, tf.shape(image)[0:2], scale=self._outer_box_scale)
    mask_outer_boxes = box_utils.clip_boxes(mask_outer_boxes,
                                            self._output_size)

    # Compensates the offset of mask_outer_boxes to map it back to the
    # original image scale.
    mask_outer_boxes_ori = mask_outer_boxes
    mask_outer_boxes_ori += tf.tile(
        tf.expand_dims(self._train_offset, axis=0), [1, 2])
    mask_outer_boxes_ori /= tf.tile(
        tf.expand_dims(self._train_image_scale, axis=0), [1, 2])
    norm_mask_outer_boxes_ori = box_utils.normalize_boxes(
        mask_outer_boxes_ori, mask_shape)

    # Sets sampled_masks shape to [batch_size, height, width, 1].
    sampled_masks = tf.cast(tf.expand_dims(sampled_masks, axis=-1),
                            tf.float32)
    mask_targets = tf.image.crop_and_resize(
        sampled_masks,
        norm_mask_outer_boxes_ori,
        box_ind=tf.range(self._num_sampled_masks),
        crop_size=[self._mask_crop_size, self._mask_crop_size],
        method='bilinear',
        extrapolation_value=0,
        name='train_mask_targets')
    mask_targets = tf.where(tf.greater_equal(mask_targets, 0.5),
                            tf.ones_like(mask_targets),
                            tf.zeros_like(mask_targets))
    mask_targets = tf.squeeze(mask_targets, axis=-1)
    if self._up_sample_factor > 1:
        fine_mask_targets = tf.image.crop_and_resize(
            sampled_masks,
            norm_mask_outer_boxes_ori,
            box_ind=tf.range(self._num_sampled_masks),
            crop_size=[
                self._mask_crop_size * self._up_sample_factor,
                self._mask_crop_size * self._up_sample_factor
            ],
            method='bilinear',
            extrapolation_value=0,
            name='train_mask_targets')
        fine_mask_targets = tf.where(
            tf.greater_equal(fine_mask_targets, 0.5),
            tf.ones_like(fine_mask_targets),
            tf.zeros_like(fine_mask_targets))
        fine_mask_targets = tf.squeeze(fine_mask_targets, axis=-1)
    else:
        fine_mask_targets = mask_targets

    # If bfloat16 is used, casts the input image to tf.bfloat16.
    if self._use_bfloat16:
        image = tf.cast(image, dtype=tf.bfloat16)

    valid_image = tf.cast(tf.not_equal(num_masks, 0), tf.int32)
    if self._mask_train_class == 'all':
        mask_is_valid = valid_image * tf.ones_like(sampled_classes,
                                                   tf.int32)
    else:
        # Gets the intersection of sampled classes with training splits.
        mask_valid_classes = tf.cast(
            tf.expand_dims(
                class_utils.coco_split_class_ids(self._mask_train_class),
                1), sampled_classes.dtype)
        match = tf.reduce_any(
            tf.equal(tf.expand_dims(sampled_classes, 0),
                     mask_valid_classes), 0)
        mask_is_valid = valid_image * tf.cast(match, tf.int32)

    # Packs labels for model_fn outputs.
    labels = {
        'cls_targets': cls_targets,
        'box_targets': box_targets,
        'anchor_boxes': input_anchor.multilevel_boxes,
        'num_positives': num_positives,
        'image_info': image_info,
        # For ShapeMask.
        'mask_boxes': sampled_boxes,
        'mask_outer_boxes': mask_outer_boxes,
        'mask_targets': mask_targets,
        'fine_mask_targets': fine_mask_targets,
        'mask_classes': sampled_classes,
        'mask_is_valid': mask_is_valid,
    }
    return image, labels
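# A standalone sketch (assumption: illustrative only) of the sampling trick
# above: shuffle indices up to max(num_masks, num_sampled) and wrap with mod
# so images with few (or zero) masks still yield num_sampled valid indices.
import tensorflow.compat.v1 as tf

num_masks = tf.constant(3)
num_sampled = 5
rand_indices = tf.random.shuffle(
    tf.range(tf.maximum(num_masks, num_sampled)))
rand_indices = tf.mod(rand_indices, tf.maximum(num_masks, 1))
rand_indices = rand_indices[0:num_sampled]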
def call(self,
         item_states,
         item_ids,
         global_item_states,
         global_item_ids,
         labels_mask=None,
         labels_weight=None):
    """Calls the layer.

    Args:
      item_states: <float32>[batch_size, hidden_size]
      item_ids: <int32>[batch_size]
      global_item_states: <float32>[global_batch_size, hidden_size]
      global_item_ids: <int32>[global_batch_size]
      labels_mask: <bool>[batch_size, global_batch_size]
      labels_weight: <float32>[batch_size, global_batch_size]

    Returns:
      total_loss: <float>
    """
    # [batch_size, 1]
    item_ids_expanded = tf.expand_dims(item_ids, 1)
    # [1, global_batch_size]
    global_item_ids_expanded = tf.expand_dims(global_item_ids, 0)

    # Positive labels when IDs are the same.
    # [batch_size, global_batch_size]
    labels = tf.equal(item_ids_expanded, global_item_ids_expanded)
    if labels_mask is not None:
        labels = tf.logical_and(labels, labels_mask)

    # In two cases the loss is ignored (labels_weight is 0):
    # (1) either of the IDs is the padding ID, or
    # (2) the loss is computed when comparing a sample to itself.
    both_ids_are_not_padding = tf.logical_and(
        tf.not_equal(item_ids_expanded, self.padding_id),
        tf.not_equal(global_item_ids_expanded, self.padding_id))
    if labels_weight is None:
        labels_weight = tf.cast(both_ids_are_not_padding, tf.float32)
    else:
        labels_weight = labels_weight * tf.cast(both_ids_are_not_padding,
                                                tf.float32)

    # Hacky way to tell if samples are exactly the same --
    # their IDs are the same and their states are approximately the same.
    samples_are_the_same = tf.logical_and(
        tf.less(
            tf.norm(tf.expand_dims(item_states, 1) -
                    tf.expand_dims(global_item_states, 0), axis=2), 1e-5),
        labels)

    # [batch_size, global_batch_size]
    labels_weight = (labels_weight *
                     (1 - tf.cast(samples_are_the_same, tf.float32)))

    # [batch_size, global_batch_size]
    labels = tf.stop_gradient(tf.cast(labels, tf.float32))
    labels_weight = tf.stop_gradient(tf.cast(labels_weight, tf.float32))

    if self.apply_linear_layer:
        item_states = self.linear_fn(item_states)

    # [batch_size, global_batch_size]
    logits = tf.matmul(item_states, global_item_states, transpose_b=True)
    logits += self.bias_term

    # [batch_size, global_batch_size]
    loss_per_sample = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=labels, logits=logits)
    loss_per_sample *= labels_weight

    # Here we compute the mean, because otherwise the loss becomes too
    # large.
    loss_per_sample = tf.reduce_sum(loss_per_sample, 1)
    loss_per_sample /= (tf.reduce_sum(labels_weight, 1) + 1e-5)
    return tf.reduce_sum(loss_per_sample)
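# A minimal standalone sketch (assumption: illustrative only) of the label
# construction above: broadcasting expand_dims turns two ID vectors into a
# [batch, global_batch] matrix of positive pairs.
import tensorflow.compat.v1 as tf

item_ids = tf.constant([1, 2])
global_item_ids = tf.constant([2, 1, 0])
labels = tf.equal(tf.expand_dims(item_ids, 1),
                  tf.expand_dims(global_item_ids, 0))
# -> [[False, True, False], [True, False, False]]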
def compute_mel_filterbank_features(waveforms,
                                    sample_rate=16000,
                                    dither=1.0 / np.iinfo(np.int16).max,
                                    preemphasis=0.97,
                                    frame_length=25,
                                    frame_step=10,
                                    fft_length=None,
                                    window_fn=functools.partial(
                                        tf.signal.hann_window,
                                        periodic=True),
                                    lower_edge_hertz=80.0,
                                    upper_edge_hertz=7600.0,
                                    num_mel_bins=80,
                                    log_noise_floor=1e-3,
                                    apply_mask=True):
    """Implements mel-filterbank extraction using tf ops.

    Args:
      waveforms: float32 tensor with shape [batch_size, max_len]
      sample_rate: sampling rate of the waveform
      dither: stddev of Gaussian noise added to the waveform to prevent
        quantization artefacts
      preemphasis: waveform high-pass filtering constant
      frame_length: frame length in ms
      frame_step: frame step in ms
      fft_length: number of fft bins
      window_fn: windowing function
      lower_edge_hertz: lowest frequency of the filterbank
      upper_edge_hertz: highest frequency of the filterbank
      num_mel_bins: filterbank size
      log_noise_floor: clip small values to prevent numeric overflow in log
      apply_mask: when working on a batch of samples, set padding frames to
        zero

    Returns:
      filterbanks: a float32 tensor with shape [batch_size, len, num_bins, 1]
    """
    # `stfts` is a complex64 Tensor representing the short-time Fourier
    # Transform of each signal in `signals`. Its shape is
    # [batch_size, ?, fft_unique_bins]
    # where fft_unique_bins = fft_length // 2 + 1.

    # Find the wave length: the largest index for which the value is != 0.
    # Note that waveform samples that are exactly 0.0 are quite common, so
    # simply doing sum(waveforms != 0, axis=-1) will not work correctly.
    wav_lens = tf.reduce_max(
        tf.expand_dims(tf.range(tf.shape(waveforms)[1]), 0) *
        tf.to_int32(tf.not_equal(waveforms, 0.0)),
        axis=-1) + 1
    if dither > 0:
        waveforms += tf.random_normal(tf.shape(waveforms), stddev=dither)
    if preemphasis > 0:
        waveforms = waveforms[:, 1:] - preemphasis * waveforms[:, :-1]
        wav_lens -= 1
    frame_length = int(frame_length * sample_rate / 1e3)
    frame_step = int(frame_step * sample_rate / 1e3)
    if fft_length is None:
        fft_length = int(2**(np.ceil(np.log2(frame_length))))

    stfts = tf.contrib.signal.stft(waveforms,
                                   frame_length=frame_length,
                                   frame_step=frame_step,
                                   fft_length=fft_length,
                                   window_fn=window_fn,
                                   pad_end=True)

    stft_lens = (wav_lens + (frame_step - 1)) // frame_step
    masks = tf.to_float(
        tf.less_equal(tf.expand_dims(tf.range(tf.shape(stfts)[1]), 0),
                      tf.expand_dims(stft_lens, 1)))

    # An energy spectrogram is the magnitude of the complex-valued STFT.
    # A float32 Tensor of shape [batch_size, ?, 257].
    magnitude_spectrograms = tf.abs(stfts)

    # Warp the linear-scale magnitude spectrograms into the mel-scale.
    num_spectrogram_bins = magnitude_spectrograms.shape[-1].value
    linear_to_mel_weight_matrix = (
        tf.contrib.signal.linear_to_mel_weight_matrix(
            num_mel_bins, num_spectrogram_bins, sample_rate,
            lower_edge_hertz, upper_edge_hertz))
    mel_spectrograms = tf.tensordot(magnitude_spectrograms,
                                    linear_to_mel_weight_matrix, 1)
    # Note: Shape inference for tensordot does not currently handle this
    # case.
    mel_spectrograms.set_shape(
        magnitude_spectrograms.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))

    log_mel_sgram = tf.log(tf.maximum(log_noise_floor, mel_spectrograms))

    if apply_mask:
        log_mel_sgram *= tf.expand_dims(tf.to_float(masks), -1)

    return tf.expand_dims(log_mel_sgram, -1, name="mel_sgrams")
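# A hedged usage sketch: this assumes a TF 1.x runtime (the function above
# relies on tf.contrib.signal) and graph mode with a Session.
import numpy as np
import tensorflow as tf

waveforms_ph = tf.placeholder(tf.float32, shape=[None, None])
mel = compute_mel_filterbank_features(waveforms_ph, sample_rate=16000)
with tf.Session() as sess:
    batch = np.random.uniform(-1.0, 1.0, size=(2, 16000)).astype(np.float32)
    out = sess.run(mel, feed_dict={waveforms_ph: batch})
    print(out.shape)  # (2, num_frames, 80, 1)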
def prepare_encoder_input(features,
                          hparams,
                          embed_scope=None,
                          embed_token_fn=common_embed.embed_tokens):
  """Prepares the input for the screen encoder.

  Args:
    features: the feature dict.
    hparams: the hyperparameters.
    embed_scope: the embedding variable scope.
    embed_token_fn: the function for embedding tokens.
  Returns:
    object_embedding: a Tensor of shape
        [batch_size, num_steps, max_object_count, embed_depth]
    object_mask: a binary tensor of shape
        [batch_size, num_steps, max_object_count]
    att_bias: a Tensor of shape [batch_size, num_steps, max_object_count]
        with large negative values at padded positions.
  """
  with tf.control_dependencies(
      [tf.assert_equal(tf.rank(features["obj_text"]), 4)]):
    if hparams.get("synthetic_screen_noise", 0.) > 0.:
      num_objects = tf.shape(features["obj_text"])[2]
      # [batch, length, num_objects]
      target_obj_mask = tf.cast(
          tf.one_hot(features["objects"], depth=num_objects), tf.bool)
      num_tokens = tf.shape(features["obj_text"])[-1]
      target_obj_mask = tf.tile(
          tf.expand_dims(target_obj_mask, 3), [1, 1, 1, num_tokens])
      # Randomly keep tokens
      keep_mask = tf.greater_equal(
          tf.random_uniform(shape=tf.shape(features["obj_text"])),
          hparams.synthetic_screen_noise)
      # Keep paddings
      keep_mask = tf.logical_or(tf.equal(features["obj_text"], 0), keep_mask)
      # Keep targets
      target_obj_mask = tf.logical_or(target_obj_mask, keep_mask)
      features["obj_text"] = tf.where(
          target_obj_mask, features["obj_text"],
          tf.random_uniform(
              shape=tf.shape(features["obj_text"]),
              maxval=50000,
              dtype=tf.int32))
    text_embeddings, _ = embed_token_fn(
        features["obj_text"],
        hparams.task_vocab_size,
        hparams.hidden_size,
        hparams,
        embed_scope=embed_scope)
    with tf.variable_scope("obj_text_embed", reuse=tf.AUTO_REUSE):
      if hparams.obj_text_aggregation == "max":
        embed_bias = tf.cast(tf.less(features["obj_text"], 2),
                             tf.float32) * -1e7
        with tf.control_dependencies(
            [tf.assert_equal(tf.rank(embed_bias), 4)]):
          text_embeddings = tf.reduce_max(
              text_embeddings + tf.expand_dims(embed_bias, 4), -2)
          no_txt_embed = tf.get_variable(
              name="no_txt_embed", shape=[hparams.hidden_size])
          shape = common_layers.shape_list(text_embeddings)
          no_txt_embed = tf.tile(
              tf.reshape(no_txt_embed, [1, 1, 1, hparams.hidden_size]),
              [shape[0], shape[1], shape[2], 1])
          text_embeddings = tf.maximum(text_embeddings, no_txt_embed)
      elif hparams.obj_text_aggregation == "sum":
        # [batch, step, #max_obj, #max_token] 0 for padded tokens
        real_objects = tf.cast(
            tf.greater_equal(features["obj_text"], 2), tf.float32)
        # [batch, step, #max_obj, hidden] 0s for padded objects
        text_embeddings = tf.reduce_sum(
            text_embeddings * tf.expand_dims(real_objects, 4), -2)
      elif hparams.obj_text_aggregation == "mean":
        shape_list = common_layers.shape_list(text_embeddings)
        embeddings = tf.reshape(text_embeddings, [-1] + shape_list[3:])
        emb_sum = tf.reduce_sum(tf.abs(embeddings), axis=-1)
        non_paddings = tf.not_equal(emb_sum, 0.0)
        embeddings = common_embed.average_bag_of_embeds(
            embeddings,
            non_paddings,
            use_bigrams=True,
            bigram_embed_scope=embed_scope,
            append_start_end=True)
        text_embeddings = tf.reshape(
            embeddings, shape_list[:3] + [hparams.hidden_size])
      else:
        raise ValueError("Unrecognized token aggregation %s" %
                         hparams.obj_text_aggregation)
  with tf.control_dependencies([
      tf.assert_equal(tf.rank(features["obj_type"]), 3),
      tf.assert_equal(tf.rank(features["obj_clickable"]), 3)
  ]):
    with tf.variable_scope("encode_object_attr", reuse=tf.AUTO_REUSE):
      type_embedding = tf.nn.embedding_lookup(
          params=tf.get_variable(
              name="embed_type_w",
              shape=[hparams.get("num_types", 100), hparams.hidden_size]),
          ids=tf.maximum(features["obj_type"], 0))
      clickable_embedding = tf.nn.embedding_lookup(
          params=tf.get_variable(
              name="embed_clickable_w", shape=[2, hparams.hidden_size]),
          ids=features["obj_clickable"])
  with tf.control_dependencies(
      [tf.assert_equal(tf.rank(features["obj_screen_pos"]), 4)]):

    def _create_embed(feature_name, vocab_size, depth):
      """Embed a position feature."""
      pos_embedding_list = []
      with tf.variable_scope("encode_object_" + feature_name,
                             reuse=tf.AUTO_REUSE):
        num_features = common_layers.shape_list(features[feature_name])[-1]
        for i in range(num_features):
          pos_embedding_list.append(
              tf.nn.embedding_lookup(
                  params=tf.get_variable(
                      name=feature_name + "_embed_w_%d" % i,
                      shape=[vocab_size, depth]),
                  ids=features[feature_name][:, :, :, i]))
        pos_embedding = tf.add_n(pos_embedding_list)
        return pos_embedding

    pos_embedding = _create_embed("obj_screen_pos", hparams.max_pixel_pos,
                                  hparams.hidden_size)
  if "all" == hparams.screen_embedding_feature or (
      "dom" in hparams.screen_embedding_feature):
    dom_embedding = _create_embed("obj_dom_pos", hparams.max_dom_pos,
                                  hparams.hidden_size)
  object_embed = tf.zeros_like(text_embeddings, dtype=tf.float32)
  if hparams.screen_embedding_feature == "all":
    object_embed = (
        text_embeddings + type_embedding + pos_embedding + dom_embedding)
  elif "text" in hparams.screen_embedding_feature:
    object_embed += text_embeddings
  elif "type" in hparams.screen_embedding_feature:
    object_embed += type_embedding
  elif "pos" in hparams.screen_embedding_feature:
    object_embed += pos_embedding
  elif "dom" in hparams.screen_embedding_feature:
    object_embed += dom_embedding
  elif "click" in hparams.screen_embedding_feature:
    object_embed += clickable_embedding
  object_mask = tf.cast(tf.not_equal(features["obj_type"], -1), tf.float32)
  object_embed = object_embed * tf.expand_dims(object_mask, 3)
  att_bias = (1. - object_mask) * common_attention.large_compatible_negative(
      object_embed.dtype)
  return object_embed, object_mask, att_bias
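# Illustration (not from the original module) of the padding-bias convention
# prepare_encoder_input returns: padded objects are marked with obj_type == -1,
# and the bias puts a large negative number on their attention logits so a
# downstream softmax gives them near-zero weight. Here -1e9 stands in for
# common_attention.large_compatible_negative.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
obj_type = tf.constant([[3, 7, -1, -1]])  # one step, two padded objects
object_mask = tf.cast(tf.not_equal(obj_type, -1), tf.float32)
att_bias = (1. - object_mask) * -1e9
logits = tf.constant([[0.5, 1.0, 2.0, 3.0]]) + att_bias
with tf.Session() as sess:
  print(sess.run(tf.nn.softmax(logits)))  # padded positions get ~0 weight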
def main(unused_argv): FLAGS.comb_dropout_keep_prob = 1.0 FLAGS.image_keep_prob = 1.0 FLAGS.elements_keep_prob = 1.0 # Get dataset-dependent information. tf.gfile.MakeDirs(FLAGS.eval_logdir) tf.logging.info('Evaluating on %s set', FLAGS.split) with tf.Graph().as_default(): samples = model_input.get_input_fn(FLAGS)() # Get model segmentation predictions. num_classes = model_input.dataset_descriptors[ FLAGS.dataset].num_classes output_to_num_classes = model.get_output_to_num_classes(FLAGS) if tuple(FLAGS.eval_scales) == (1.0, ): tf.logging.info('Performing single-scale test.') predictions, probs = model.predict_labels( samples['image'], samples, FLAGS, outputs_to_num_classes=output_to_num_classes, image_pyramid=FLAGS.image_pyramid, merge_method=FLAGS.merge_method, atrous_rates=FLAGS.atrous_rates, add_image_level_feature=FLAGS.add_image_level_feature, aspp_with_batch_norm=FLAGS.aspp_with_batch_norm, aspp_with_separable_conv=FLAGS.aspp_with_separable_conv, multi_grid=FLAGS.multi_grid, depth_multiplier=FLAGS.depth_multiplier, output_stride=FLAGS.output_stride, decoder_output_stride=FLAGS.decoder_output_stride, decoder_use_separable_conv=FLAGS.decoder_use_separable_conv, crop_size=[FLAGS.image_size, FLAGS.image_size], logits_kernel_size=FLAGS.logits_kernel_size, model_variant=FLAGS.model_variant) else: tf.logging.info('Performing multi-scale test.') predictions, probs = model.predict_labels_multi_scale( samples['image'], samples, FLAGS, outputs_to_num_classes=output_to_num_classes, eval_scales=FLAGS.eval_scales, add_flipped_images=FLAGS.add_flipped_images, merge_method=FLAGS.merge_method, atrous_rates=FLAGS.atrous_rates, add_image_level_feature=FLAGS.add_image_level_feature, aspp_with_batch_norm=FLAGS.aspp_with_batch_norm, aspp_with_separable_conv=FLAGS.aspp_with_separable_conv, multi_grid=FLAGS.multi_grid, depth_multiplier=FLAGS.depth_multiplier, output_stride=FLAGS.output_stride, decoder_output_stride=FLAGS.decoder_output_stride, decoder_use_separable_conv=FLAGS.decoder_use_separable_conv, crop_size=[FLAGS.image_size, FLAGS.image_size], logits_kernel_size=FLAGS.logits_kernel_size, model_variant=FLAGS.model_variant) metric_map = {} for output in output_to_num_classes: output_predictions = predictions[output] output_probs = probs[output] if output == 'segment': output_predictions = tf.expand_dims(output_predictions, 3) if num_classes == 2: labels = samples['label'] iou, weights = model.foreground_iou( labels, output_predictions, FLAGS) soft_iou, _ = model.foreground_iou( labels, output_probs[:, :, :, 1:2], FLAGS) metric_map['mIOU'] = tf.metrics.mean(iou) metric_map['soft_mIOU'] = tf.metrics.mean(soft_iou) high_prob_overlaps = calc_high_prob_overlaps( labels, output_probs, weights) metric_map['highestOverlaps'] = tf.metrics.mean( high_prob_overlaps) output_probs *= weights else: output_predictions = tf.reshape(output_predictions, shape=[-1]) labels = tf.reshape(samples['label'], shape=[-1]) weights = tf.to_float( tf.not_equal( labels, model_input.dataset_descriptors[ FLAGS.dataset].ignore_label)) # Set ignore_label regions to label 0, because metrics.mean_iou # requires range of labels=[0, dataset.num_classes). # Note the ignore_label regions are not evaluated since # the corresponding regions contain weights=0. 
          labels = tf.where(
              tf.equal(
                  labels,
                  model_input.dataset_descriptors[FLAGS.dataset].ignore_label),
              tf.zeros_like(labels), labels)

          predictions_tag = 'mIOU'
          for eval_scale in FLAGS.eval_scales:
            predictions_tag += '_' + str(eval_scale)
          if FLAGS.add_flipped_images:
            predictions_tag += '_flipped'

          # Define the evaluation metric.
          metric_map[predictions_tag] = contrib_slim.metrics.mean_iou(
              output_predictions, labels, num_classes, weights=weights)

          def label_summary(labels, weights, name):
            tf.summary.image(
                name,
                tf.reshape(
                    tf.cast(
                        tf.to_float(labels * 255) / tf.to_float(num_classes),
                        tf.uint8) * tf.cast(weights, tf.uint8),
                    [-1, FLAGS.image_size, FLAGS.image_size, 1]), 8)

          label_summary(labels, weights, 'label')
          label_summary(output_predictions, weights, 'output_predictions')
          tf.summary.image('logits',
                           tf.expand_dims(output_probs[:, :, :, 1], 3))
      elif output == 'regression':
        labels = samples['label']
        ignore_mask = model.get_ignore_mask(labels, FLAGS)
        accurate = calc_accuracy_in_box(labels, output_probs, ignore_mask)
        metric_map['inBoxAccuracy'] = tf.metrics.mean(accurate)

    tf.summary.image('image', samples['image'], 8)

    metrics_to_values, metrics_to_updates = (
        contrib_slim.metrics.aggregate_metric_map(metric_map))

    for metric_name, metric_value in metrics_to_values.items():
      metric_value = tf.Print(metric_value, [metric_value], metric_name)
      tf.summary.scalar(metric_name, metric_value)

    num_batches = int(math.ceil(FLAGS.num_samples / float(FLAGS.batch_size)))

    tf.logging.info('Eval num images %d', FLAGS.num_samples)
    tf.logging.info('Eval batch size %d and num batch %d', FLAGS.batch_size,
                    num_batches)

    contrib_slim.evaluation.evaluation_loop(
        master='',
        checkpoint_dir=FLAGS.checkpoint_dir,
        logdir=FLAGS.eval_logdir,
        num_evals=num_batches,
        eval_op=list(metrics_to_updates.values()),
        summary_op=tf.summary.merge_all(),
        max_number_of_evaluations=None,
        eval_interval_secs=FLAGS.eval_interval_secs)
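# A small, self-contained sketch (illustrative values) of the ignore-label
# handling used in main() above: pixels whose label equals ignore_label get
# weight 0 and are remapped to class 0, so tf.metrics.mean_iou only ever sees
# labels in [0, num_classes) while the ignored pixels contribute nothing.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
ignore_label = 255
labels = tf.constant([0, 1, 255, 1])
preds = tf.constant([0, 1, 1, 0])
weights = tf.to_float(tf.not_equal(labels, ignore_label))
labels = tf.where(tf.equal(labels, ignore_label), tf.zeros_like(labels), labels)
miou, update_op = tf.metrics.mean_iou(labels, preds, num_classes=2,
                                      weights=weights)
with tf.Session() as sess:
  sess.run(tf.local_variables_initializer())
  sess.run(update_op)
  print(sess.run(miou))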
def detection_loss(cls_outputs, box_outputs, labels, params):
  """Computes total detection loss.

  Computes total detection loss including box and class loss from all levels.

  Args:
    cls_outputs: an OrderedDict with keys representing levels and values
      representing logits in [batch_size, height, width, num_anchors].
    box_outputs: an OrderedDict with keys representing levels and values
      representing box regression targets in
      [batch_size, height, width, num_anchors * 4].
    labels: the dictionary returned from the dataloader that includes
      groundtruth targets.
    params: the dictionary including training parameters specified in the
      default_hparams function in this file.

  Returns:
    total_loss: a float tensor representing total loss reduced from class and
      box losses from all levels.
    cls_loss: a float tensor representing total class loss.
    box_loss: a float tensor representing total box regression loss.
  """
  # Sum all positives in a batch for normalization and avoid zero
  # num_positives_sum, which would lead to inf loss during training
  num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0
  positives_momentum = params.get('positives_momentum', None) or 0
  if positives_momentum > 0:
    # normalize the num_positive_examples for training stability.
    moving_normalizer_var = tf.Variable(
        0.0,
        name='moving_normalizer',
        dtype=tf.float32,
        synchronization=tf.VariableSynchronization.ON_READ,
        trainable=False,
        aggregation=tf.VariableAggregation.MEAN)
    num_positives_sum = tf.keras.backend.moving_average_update(
        moving_normalizer_var,
        num_positives_sum,
        momentum=params['positives_momentum'])
  elif positives_momentum < 0:
    num_positives_sum = utils.cross_replica_mean(num_positives_sum)

  levels = cls_outputs.keys()
  cls_losses = []
  box_losses = []
  for level in levels:
    # Onehot encoding for classification labels.
    cls_targets_at_level = tf.one_hot(
        labels['cls_targets_%d' % level],
        params['num_classes'],
        dtype=cls_outputs[level].dtype)

    if params['data_format'] == 'channels_first':
      bs, _, width, height, _ = cls_targets_at_level.get_shape().as_list()
      cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                        [bs, -1, width, height])
    else:
      bs, width, height, _, _ = cls_targets_at_level.get_shape().as_list()
      cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                        [bs, width, height, -1])
    box_targets_at_level = labels['box_targets_%d' % level]

    cls_loss = focal_loss(
        cls_outputs[level],
        cls_targets_at_level,
        params['alpha'],
        params['gamma'],
        normalizer=num_positives_sum,
        label_smoothing=params['label_smoothing'])

    if params['data_format'] == 'channels_first':
      cls_loss = tf.reshape(cls_loss,
                            [bs, -1, width, height, params['num_classes']])
    else:
      cls_loss = tf.reshape(cls_loss,
                            [bs, width, height, -1, params['num_classes']])
    cls_loss *= tf.cast(
        tf.expand_dims(tf.not_equal(labels['cls_targets_%d' % level], -2), -1),
        cls_loss.dtype)
    cls_loss_sum = tf.reduce_sum(cls_loss)
    cls_losses.append(tf.cast(cls_loss_sum, tf.float32))

    if params['box_loss_weight']:
      box_losses.append(
          _box_loss(
              box_outputs[level],
              box_targets_at_level,
              num_positives_sum,
              delta=params['delta']))

  # Sum per level losses to total loss.
  cls_loss = tf.add_n(cls_losses)
  box_loss = tf.add_n(box_losses) if box_losses else tf.constant(0.)
  total_loss = (cls_loss + params['box_loss_weight'] * box_loss)
  return total_loss, cls_loss, box_loss
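# Sketch of the "-2 means ignore" convention in detection_loss (values are
# illustrative): per-anchor class losses are zeroed wherever the class target
# is -2 before the sum, so ignored anchors never contribute to cls_loss.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
cls_targets = tf.constant([[1, -2, 0]])          # -2 marks an ignored anchor
per_anchor_loss = tf.constant([[0.7, 0.9, 0.1]])
keep = tf.cast(tf.not_equal(cls_targets, -2), per_anchor_loss.dtype)
with tf.Session() as sess:
  print(sess.run(tf.reduce_sum(per_anchor_loss * keep)))  # 0.8, not 1.7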
def build_genie_model(feat_dict, cfg, batch_size, seq_len, is_training=True, seq_varlens=None, dtype=tf.float32): """Builds a Piano Genie model. Args: feat_dict: Dictionary containing input tensors. cfg: Configuration object. batch_size: Number of items in batch. seq_len: Length of each batch item. is_training: Set to False for evaluation. seq_varlens: If not None, a tensor with the batch sequence lengths. dtype: Model weight type. Returns: A dict containing tensors for relevant model config. """ out_dict = {} # Parse features pitches = util.demidify(feat_dict["midi_pitches"]) velocities = feat_dict["velocities"] pitches_scalar = ((tf.cast(pitches, tf.float32) / 87.) * 2.) - 1. # Create sequence lens if is_training and cfg.train_randomize_seq_len: seq_lens = tf.random_uniform( [batch_size], minval=cfg.train_seq_len_min, maxval=seq_len + 1, dtype=tf.int32) stp_varlen_mask = tf.sequence_mask( seq_lens, maxlen=seq_len, dtype=tf.float32) elif seq_varlens is not None: seq_lens = seq_varlens stp_varlen_mask = tf.sequence_mask( seq_varlens, maxlen=seq_len, dtype=tf.float32) else: seq_lens = tf.ones([batch_size], dtype=tf.int32) * seq_len stp_varlen_mask = None # Encode if (cfg.stp_emb_unconstrained or cfg.stp_emb_vq or cfg.stp_emb_iq or cfg.seq_emb_unconstrained or cfg.seq_emb_vae or cfg.lor_emb_unconstrained): # Build encoder features enc_feats = [] if cfg.enc_pitch_scalar: enc_feats.append(tf.expand_dims(pitches_scalar, axis=-1)) else: enc_feats.append(tf.one_hot(pitches, 88)) if "delta_times_int" in cfg.enc_aux_feats: enc_feats.append( tf.one_hot(feat_dict["delta_times_int"], cfg.data_max_discrete_times + 1)) if "velocities" in cfg.enc_aux_feats: enc_feats.append( tf.one_hot(velocities, cfg.data_max_discrete_velocities + 1)) enc_feats = tf.concat(enc_feats, axis=2) with tf.variable_scope("encoder"): enc_stp, enc_seq = simple_lstm_encoder( enc_feats, seq_lens, rnn_celltype=cfg.rnn_celltype, rnn_nlayers=cfg.rnn_nlayers, rnn_nunits=cfg.rnn_nunits, rnn_bidirectional=cfg.enc_rnn_bidirectional, dtype=dtype) latents = [] # Step embeddings (single vector per timestep) if cfg.stp_emb_unconstrained: with tf.variable_scope("stp_emb_unconstrained"): stp_emb_unconstrained = tf.layers.dense( enc_stp, cfg.stp_emb_unconstrained_embedding_dim) out_dict["stp_emb_unconstrained"] = stp_emb_unconstrained latents.append(stp_emb_unconstrained) # Quantized step embeddings with VQ-VAE if cfg.stp_emb_vq: import sonnet as snt # pylint:disable=g-import-not-at-top,import-outside-toplevel with tf.variable_scope("stp_emb_vq"): with tf.variable_scope("pre_vq"): # pre_vq_encoding is tf.float32 of [batch_size, seq_len, embedding_dim] pre_vq_encoding = tf.layers.dense(enc_stp, cfg.stp_emb_vq_embedding_dim) with tf.variable_scope("quantizer"): assert stp_varlen_mask is None vq_vae = snt.nets.VectorQuantizer( embedding_dim=cfg.stp_emb_vq_embedding_dim, num_embeddings=cfg.stp_emb_vq_codebook_size, commitment_cost=cfg.stp_emb_vq_commitment_cost) vq_vae_output = vq_vae(pre_vq_encoding, is_training=is_training) stp_emb_vq_quantized = vq_vae_output["quantize"] stp_emb_vq_discrete = tf.reshape( tf.argmax(vq_vae_output["encodings"], axis=1, output_type=tf.int32), [batch_size, seq_len]) stp_emb_vq_codebook = tf.transpose(vq_vae.embeddings) out_dict["stp_emb_vq_quantized"] = stp_emb_vq_quantized out_dict["stp_emb_vq_discrete"] = stp_emb_vq_discrete out_dict["stp_emb_vq_loss"] = vq_vae_output["loss"] out_dict["stp_emb_vq_codebook"] = stp_emb_vq_codebook out_dict["stp_emb_vq_codebook_ppl"] = vq_vae_output["perplexity"] 
latents.append(stp_emb_vq_quantized) # This tensor retrieves continuous embeddings from codebook. It should # *never* be used during training. out_dict["stp_emb_vq_quantized_lookup"] = tf.nn.embedding_lookup( stp_emb_vq_codebook, stp_emb_vq_discrete) # Integer-quantized step embeddings with straight-through if cfg.stp_emb_iq: with tf.variable_scope("stp_emb_iq"): with tf.variable_scope("pre_iq"): # pre_iq_encoding is tf.float32 of [batch_size, seq_len] pre_iq_encoding = tf.layers.dense(enc_stp, 1)[:, :, 0] def iqst(x, n): """Integer quantization with straight-through estimator.""" eps = 1e-7 s = float(n - 1) xp = tf.clip_by_value((x + 1) / 2.0, -eps, 1 + eps) xpp = tf.round(s * xp) xppp = 2 * (xpp / s) - 1 return xpp, x + tf.stop_gradient(xppp - x) with tf.variable_scope("quantizer"): # Pass rounded vals to decoder w/ straight-through estimator stp_emb_iq_discrete_f, stp_emb_iq_discrete_rescaled = iqst( pre_iq_encoding, cfg.stp_emb_iq_nbins) stp_emb_iq_discrete = tf.cast(stp_emb_iq_discrete_f + 1e-4, tf.int32) stp_emb_iq_discrete_f = tf.cast(stp_emb_iq_discrete, tf.float32) stp_emb_iq_quantized = tf.expand_dims( stp_emb_iq_discrete_rescaled, axis=2) # Determine which elements round to valid indices stp_emb_iq_inrange = tf.logical_and( tf.greater_equal(pre_iq_encoding, -1), tf.less_equal(pre_iq_encoding, 1)) stp_emb_iq_inrange_mask = tf.cast(stp_emb_iq_inrange, tf.float32) stp_emb_iq_valid_p = weighted_avg(stp_emb_iq_inrange_mask, stp_varlen_mask) # Regularize to encourage encoder to output in range stp_emb_iq_range_penalty = weighted_avg( tf.square(tf.maximum(tf.abs(pre_iq_encoding) - 1, 0)), stp_varlen_mask) # Regularize to correlate latent finite differences to input stp_emb_iq_dlatents = pre_iq_encoding[:, 1:] - pre_iq_encoding[:, :-1] if cfg.stp_emb_iq_contour_dy_scalar: stp_emb_iq_dnotes = pitches_scalar[:, 1:] - pitches_scalar[:, :-1] else: stp_emb_iq_dnotes = tf.cast(pitches[:, 1:] - pitches[:, :-1], tf.float32) if cfg.stp_emb_iq_contour_exp == 1: power_func = tf.identity elif cfg.stp_emb_iq_contour_exp == 2: power_func = tf.square else: raise NotImplementedError() if cfg.stp_emb_iq_contour_comp == "product": comp_func = tf.multiply elif cfg.stp_emb_iq_contour_comp == "quotient": comp_func = lambda x, y: tf.divide(x, y + 1e-6) else: raise NotImplementedError() stp_emb_iq_contour_penalty = weighted_avg( power_func( tf.maximum( cfg.stp_emb_iq_contour_margin - comp_func( stp_emb_iq_dnotes, stp_emb_iq_dlatents), 0)), None if stp_varlen_mask is None else stp_varlen_mask[:, 1:]) # Regularize to maintain note consistency stp_emb_iq_note_held = tf.cast( tf.equal(pitches[:, 1:] - pitches[:, :-1], 0), tf.float32) if cfg.stp_emb_iq_deviate_exp == 1: power_func = tf.abs elif cfg.stp_emb_iq_deviate_exp == 2: power_func = tf.square if stp_varlen_mask is None: mask = stp_emb_iq_note_held else: mask = stp_varlen_mask[:, 1:] * stp_emb_iq_note_held stp_emb_iq_deviate_penalty = weighted_avg( power_func(stp_emb_iq_dlatents), mask) # Calculate perplexity of discrete encoder posterior if stp_varlen_mask is None: mask = stp_emb_iq_inrange_mask else: mask = stp_varlen_mask * stp_emb_iq_inrange_mask stp_emb_iq_discrete_oh = tf.one_hot(stp_emb_iq_discrete, cfg.stp_emb_iq_nbins) stp_emb_iq_avg_probs = weighted_avg( stp_emb_iq_discrete_oh, mask, axis=[0, 1], expand_mask=True) stp_emb_iq_discrete_ppl = tf.exp(-tf.reduce_sum( stp_emb_iq_avg_probs * tf.log(stp_emb_iq_avg_probs + 1e-10))) out_dict["stp_emb_iq_quantized"] = stp_emb_iq_quantized out_dict["stp_emb_iq_discrete"] = stp_emb_iq_discrete 
out_dict["stp_emb_iq_valid_p"] = stp_emb_iq_valid_p out_dict["stp_emb_iq_range_penalty"] = stp_emb_iq_range_penalty out_dict["stp_emb_iq_contour_penalty"] = stp_emb_iq_contour_penalty out_dict["stp_emb_iq_deviate_penalty"] = stp_emb_iq_deviate_penalty out_dict["stp_emb_iq_discrete_ppl"] = stp_emb_iq_discrete_ppl latents.append(stp_emb_iq_quantized) # This tensor converts discrete values to continuous. # It should *never* be used during training. out_dict["stp_emb_iq_quantized_lookup"] = tf.expand_dims( 2. * (stp_emb_iq_discrete_f / (cfg.stp_emb_iq_nbins - 1.)) - 1., axis=2) # Sequence embedding (single vector per sequence) if cfg.seq_emb_unconstrained: with tf.variable_scope("seq_emb_unconstrained"): seq_emb_unconstrained = tf.layers.dense( enc_seq, cfg.seq_emb_unconstrained_embedding_dim) out_dict["seq_emb_unconstrained"] = seq_emb_unconstrained seq_emb_unconstrained = tf.stack([seq_emb_unconstrained] * seq_len, axis=1) latents.append(seq_emb_unconstrained) # Sequence embeddings (variational w/ reparameterization trick) if cfg.seq_emb_vae: with tf.variable_scope("seq_emb_vae"): seq_emb_vae = tf.layers.dense(enc_seq, cfg.seq_emb_vae_embedding_dim * 2) mean = seq_emb_vae[:, :cfg.seq_emb_vae_embedding_dim] stddev = 1e-6 + tf.nn.softplus( seq_emb_vae[:, cfg.seq_emb_vae_embedding_dim:]) seq_emb_vae = mean + stddev * tf.random_normal( tf.shape(mean), 0, 1, dtype=dtype) kl = tf.reduce_mean(0.5 * tf.reduce_sum( tf.square(mean) + tf.square(stddev) - tf.log(1e-8 + tf.square(stddev)) - 1, axis=1)) out_dict["seq_emb_vae"] = seq_emb_vae out_dict["seq_emb_vae_kl"] = kl seq_emb_vae = tf.stack([seq_emb_vae] * seq_len, axis=1) latents.append(seq_emb_vae) # Low-rate embeddings if cfg.lor_emb_unconstrained: assert seq_len % cfg.lor_emb_n == 0 with tf.variable_scope("lor_emb_unconstrained"): # Downsample step embeddings rnn_embedding_dim = int(enc_stp.get_shape()[-1]) enc_lor = tf.reshape(enc_stp, [ batch_size, seq_len // cfg.lor_emb_n, cfg.lor_emb_n * rnn_embedding_dim ]) lor_emb_unconstrained = tf.layers.dense( enc_lor, cfg.lor_emb_unconstrained_embedding_dim) out_dict["lor_emb_unconstrained"] = lor_emb_unconstrained # Upsample lo-rate embeddings for decoding lor_emb_unconstrained = tf.expand_dims(lor_emb_unconstrained, axis=2) lor_emb_unconstrained = tf.tile(lor_emb_unconstrained, [1, 1, cfg.lor_emb_n, 1]) lor_emb_unconstrained = tf.reshape( lor_emb_unconstrained, [batch_size, seq_len, cfg.lor_emb_unconstrained_embedding_dim]) latents.append(lor_emb_unconstrained) # Build decoder features dec_feats = latents if cfg.dec_autoregressive: # Retrieve pitch numbers curr_pitches = pitches last_pitches = curr_pitches[:, :-1] last_pitches = tf.pad( last_pitches, [[0, 0], [1, 0]], constant_values=-1) # Prepend <SOS> token out_dict["dec_last_pitches"] = last_pitches dec_feats.append(tf.one_hot(last_pitches + 1, 89)) if cfg.dec_pred_velocity: curr_velocities = velocities last_velocities = curr_velocities[:, :-1] last_velocities = tf.pad(last_velocities, [[0, 0], [1, 0]]) dec_feats.append( tf.one_hot(last_velocities, cfg.data_max_discrete_velocities + 1)) if "delta_times_int" in cfg.dec_aux_feats: dec_feats.append( tf.one_hot(feat_dict["delta_times_int"], cfg.data_max_discrete_times + 1)) if "velocities" in cfg.dec_aux_feats: assert not cfg.dec_pred_velocity dec_feats.append( tf.one_hot(feat_dict["velocities"], cfg.data_max_discrete_velocities + 1)) assert dec_feats dec_feats = tf.concat(dec_feats, axis=2) # Decode with tf.variable_scope("decoder"): dec_stp, dec_initial_state, dec_final_state = simple_lstm_decoder( 
dec_feats, seq_lens, batch_size, rnn_celltype=cfg.rnn_celltype, rnn_nlayers=cfg.rnn_nlayers, rnn_nunits=cfg.rnn_nunits) with tf.variable_scope("pitches"): dec_recons_logits = tf.layers.dense(dec_stp, 88) dec_recons_loss = weighted_avg( tf.nn.sparse_softmax_cross_entropy_with_logits( logits=dec_recons_logits, labels=pitches), stp_varlen_mask) out_dict["dec_initial_state"] = dec_initial_state out_dict["dec_final_state"] = dec_final_state out_dict["dec_recons_logits"] = dec_recons_logits out_dict["dec_recons_scores"] = tf.nn.softmax(dec_recons_logits, axis=-1) out_dict["dec_recons_preds"] = tf.argmax( dec_recons_logits, output_type=tf.int32, axis=-1) out_dict["dec_recons_midi_preds"] = util.remidify( out_dict["dec_recons_preds"]) out_dict["dec_recons_loss"] = dec_recons_loss if cfg.dec_pred_velocity: with tf.variable_scope("velocities"): dec_recons_velocity_logits = tf.layers.dense( dec_stp, cfg.data_max_discrete_velocities + 1) dec_recons_velocity_loss = weighted_avg( tf.nn.sparse_softmax_cross_entropy_with_logits( logits=dec_recons_velocity_logits, labels=velocities), stp_varlen_mask) out_dict["dec_recons_velocity_logits"] = dec_recons_velocity_logits out_dict["dec_recons_velocity_loss"] = dec_recons_velocity_loss # Stats if cfg.stp_emb_vq or cfg.stp_emb_iq: discrete = out_dict[ "stp_emb_vq_discrete" if cfg.stp_emb_vq else "stp_emb_iq_discrete"] dx = pitches[:, 1:] - pitches[:, :-1] dy = discrete[:, 1:] - discrete[:, :-1] contour_violation = tf.reduce_mean(tf.cast(tf.less(dx * dy, 0), tf.float32)) dx_hold = tf.equal(dx, 0) deviate_violation = weighted_avg( tf.cast(tf.not_equal(dy, 0), tf.float32), tf.cast(dx_hold, tf.float32)) out_dict["contour_violation"] = contour_violation out_dict["deviate_violation"] = deviate_violation return out_dict
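# Standalone copy of the iqst quantizer from the stp_emb_iq branch above, with
# a tiny smoke test (illustrative inputs). Forward, values in [-1, 1] are
# rounded to one of n bins; backward, the straight-through estimator makes the
# gradient pass through as if no rounding had happened.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

def iqst(x, n):
  """Integer quantization with straight-through estimator."""
  eps = 1e-7
  s = float(n - 1)
  xp = tf.clip_by_value((x + 1) / 2.0, -eps, 1 + eps)
  xpp = tf.round(s * xp)
  xppp = 2 * (xpp / s) - 1
  return xpp, x + tf.stop_gradient(xppp - x)

x = tf.constant([-0.9, 0.05, 0.8])
bins, quantized = iqst(x, 8)
grad = tf.gradients(tf.reduce_sum(quantized), x)[0]
with tf.Session() as sess:
  print(sess.run([bins, quantized, grad]))  # gradient is all ones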
def build(): """Builds the Tensorflow graph.""" inputs, labels, lengths = None, None, None if mode in ('train', 'eval'): if isinstance(no_event_label, numbers.Number): label_shape = [] else: label_shape = [len(no_event_label)] inputs, labels, lengths = magenta.common.get_padded_batch( sequence_example_file_paths, hparams.batch_size, input_size, label_shape=label_shape, shuffle=mode == 'train') elif mode == 'generate': inputs = tf.placeholder(tf.float32, [hparams.batch_size, None, input_size]) if isinstance(encoder_decoder, magenta.music.OneHotIndexEventSequenceEncoderDecoder): expanded_inputs = tf.one_hot( tf.cast(tf.squeeze(inputs, axis=-1), tf.int64), encoder_decoder.input_depth) else: expanded_inputs = inputs dropout_keep_prob = 1.0 if mode == 'generate' else hparams.dropout_keep_prob if hparams.use_cudnn: outputs, initial_state, final_state = make_cudnn( expanded_inputs, hparams.rnn_layer_sizes, hparams.batch_size, mode, dropout_keep_prob=dropout_keep_prob, residual_connections=hparams.residual_connections) else: cell = make_rnn_cell( hparams.rnn_layer_sizes, dropout_keep_prob=dropout_keep_prob, attn_length=hparams.attn_length, residual_connections=hparams.residual_connections) initial_state = cell.zero_state(hparams.batch_size, tf.float32) outputs, final_state = tf.nn.dynamic_rnn( cell, inputs, sequence_length=lengths, initial_state=initial_state, swap_memory=True) outputs_flat = magenta.common.flatten_maybe_padded_sequences( outputs, lengths) if isinstance(num_classes, numbers.Number): num_logits = num_classes else: num_logits = sum(num_classes) logits_flat = contrib_layers.linear(outputs_flat, num_logits) if mode in ('train', 'eval'): labels_flat = magenta.common.flatten_maybe_padded_sequences( labels, lengths) if isinstance(num_classes, numbers.Number): softmax_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=labels_flat, logits=logits_flat) predictions_flat = tf.argmax(logits_flat, axis=1) else: logits_offsets = np.cumsum([0] + num_classes) softmax_cross_entropy = [] predictions = [] for i in range(len(num_classes)): softmax_cross_entropy.append( tf.nn.sparse_softmax_cross_entropy_with_logits( labels=labels_flat[:, i], logits=logits_flat[ :, logits_offsets[i]:logits_offsets[i + 1]])) predictions.append( tf.argmax(logits_flat[ :, logits_offsets[i]:logits_offsets[i + 1]], axis=1)) predictions_flat = tf.stack(predictions, 1) correct_predictions = tf.to_float( tf.equal(labels_flat, predictions_flat)) event_positions = tf.to_float(tf.not_equal(labels_flat, no_event_label)) no_event_positions = tf.to_float(tf.equal(labels_flat, no_event_label)) # Compute the total number of time steps across all sequences in the # batch. For some models this will be different from the number of RNN # steps. 
def batch_labels_to_num_steps(batch_labels, lengths): num_steps = 0 for labels, length in zip(batch_labels, lengths): num_steps += encoder_decoder.labels_to_num_steps(labels[:length]) return np.float32(num_steps) num_steps = tf.py_func( batch_labels_to_num_steps, [labels, lengths], tf.float32) if mode == 'train': loss = tf.reduce_mean(softmax_cross_entropy) perplexity = tf.exp(loss) accuracy = tf.reduce_mean(correct_predictions) event_accuracy = ( tf.reduce_sum(correct_predictions * event_positions) / tf.reduce_sum(event_positions)) no_event_accuracy = ( tf.reduce_sum(correct_predictions * no_event_positions) / tf.reduce_sum(no_event_positions)) loss_per_step = tf.reduce_sum(softmax_cross_entropy) / num_steps perplexity_per_step = tf.exp(loss_per_step) optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate) train_op = contrib_slim.learning.create_train_op( loss, optimizer, clip_gradient_norm=hparams.clip_norm) tf.add_to_collection('train_op', train_op) vars_to_summarize = { 'loss': loss, 'metrics/perplexity': perplexity, 'metrics/accuracy': accuracy, 'metrics/event_accuracy': event_accuracy, 'metrics/no_event_accuracy': no_event_accuracy, 'metrics/loss_per_step': loss_per_step, 'metrics/perplexity_per_step': perplexity_per_step, } elif mode == 'eval': vars_to_summarize, update_ops = contrib_metrics.aggregate_metric_map({ 'loss': tf.metrics.mean(softmax_cross_entropy), 'metrics/accuracy': tf.metrics.accuracy(labels_flat, predictions_flat), 'metrics/per_class_accuracy': tf.metrics.mean_per_class_accuracy(labels_flat, predictions_flat, num_classes), 'metrics/event_accuracy': tf.metrics.recall(event_positions, correct_predictions), 'metrics/no_event_accuracy': tf.metrics.recall(no_event_positions, correct_predictions), 'metrics/loss_per_step': tf.metrics.mean( tf.reduce_sum(softmax_cross_entropy) / num_steps, weights=num_steps), }) for updates_op in update_ops.values(): tf.add_to_collection('eval_ops', updates_op) # Perplexity is just exp(loss) and doesn't need its own update op. vars_to_summarize['metrics/perplexity'] = tf.exp( vars_to_summarize['loss']) vars_to_summarize['metrics/perplexity_per_step'] = tf.exp( vars_to_summarize['metrics/loss_per_step']) for var_name, var_value in six.iteritems(vars_to_summarize): tf.summary.scalar(var_name, var_value) tf.add_to_collection(var_name, var_value) elif mode == 'generate': temperature = tf.placeholder(tf.float32, []) if isinstance(num_classes, numbers.Number): softmax_flat = tf.nn.softmax( tf.div(logits_flat, tf.fill([num_classes], temperature))) softmax = tf.reshape( softmax_flat, [hparams.batch_size, -1, num_classes]) else: logits_offsets = np.cumsum([0] + num_classes) softmax = [] for i in range(len(num_classes)): sm = tf.nn.softmax( tf.div( logits_flat[:, logits_offsets[i]:logits_offsets[i + 1]], tf.fill([num_classes[i]], temperature))) sm = tf.reshape(sm, [hparams.batch_size, -1, num_classes[i]]) softmax.append(sm) tf.add_to_collection('inputs', inputs) tf.add_to_collection('temperature', temperature) tf.add_to_collection('softmax', softmax) # Flatten state tuples for metagraph compatibility. for state in tf_nest.flatten(initial_state): tf.add_to_collection('initial_state', state) for state in tf_nest.flatten(final_state): tf.add_to_collection('final_state', state)
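# Sketch of the temperature-scaled softmax used in 'generate' mode above
# (single-class-set case, illustrative logits): dividing logits by a
# temperature below 1 sharpens the distribution, above 1 flattens it.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
logits = tf.constant([[2.0, 1.0, 0.1]])
temperature = tf.placeholder(tf.float32, [])
softmax = tf.nn.softmax(tf.div(logits, temperature))
with tf.Session() as sess:
  print(sess.run(softmax, {temperature: 1.0}))
  print(sess.run(softmax, {temperature: 0.5}))  # sharper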
def test_ne(self):
  input1 = tf.placeholder(shape=(4, 32, 32, 3), dtype=tf.float32)
  input2 = tf.placeholder(shape=(4, 32, 32, 3), dtype=tf.float32)
  output = tf.not_equal(input1, input2)
  self._test_conversion('ne', [input1, input2], [output])
def build_distractors(distractor_examples, context): """Create inputs with distractors.""" CLS_ID = tf.constant([101], dtype=tf.int64) # pylint: disable=invalid-name SEP_ID = tf.constant([102], dtype=tf.int64) # pylint: disable=invalid-name bert_inputs = [] input_masks = [] segment_ids = [] # for each distractor sample_size = int( (FLAGS.num_choices - FLAGS.k_size) / (FLAGS.data_window_size - 1)) for example in distractor_examples: # randomly sample 7 intermediate_examples_tensor = tf.reduce_sum(tf.abs(example), 1) examples_zero_vector = tf.zeros(shape=(1, 1), dtype=tf.int64) examples_bool_mask = tf.squeeze( tf.not_equal(intermediate_examples_tensor, examples_zero_vector)) paragraph_len = tf.reduce_sum(tf.cast(examples_bool_mask, tf.int32)) indices = tf.range(0, limit=paragraph_len, dtype=tf.int32) shuffled_indices = tf.random.shuffle(indices)[:sample_size] # extend examples / targets distractor_cand = example distractor_cand_plus_one = distractor_cand[1:] distractor_cand_plus_two = distractor_cand[2:] # pad extensions paddings_one = tf.constant([[0, 1], [0, 0]]) distractor_cand_plus_one = tf.pad(distractor_cand_plus_one, paddings_one) paddings_two = tf.constant([[0, 2], [0, 0]]) distractor_cand_plus_two = tf.pad(distractor_cand_plus_two, paddings_two) distractor_cand_ext = tf.concat( [distractor_cand, distractor_cand_plus_one, distractor_cand_plus_two], axis=1) distractors = tf.gather(distractor_cand_ext, shuffled_indices) for i in range(sample_size): distractors_non_zero = tf.where( tf.not_equal(distractors[i], tf.zeros_like(distractors[i]))) distractors_stripped = tf.gather_nd(distractors[i], distractors_non_zero) if FLAGS.include_context: segment_id = tf.concat([ tf.zeros_like(CLS_ID, dtype=tf.int64), tf.zeros_like(context), tf.zeros_like(SEP_ID, dtype=tf.int64), tf.ones_like(distractors_stripped), tf.ones_like(SEP_ID, dtype=tf.int64) ], axis=0) else: segment_id = tf.concat([ tf.zeros_like(CLS_ID, dtype=tf.int64), tf.zeros_like(distractors_stripped), tf.zeros_like(SEP_ID, dtype=tf.int64) ], axis=0) segment_id = pad_and_cut(segment_id, FLAGS.max_seq_length) segment_ids.append(segment_id) if FLAGS.include_context: new_input = tf.concat( [CLS_ID, context, SEP_ID, distractors_stripped, SEP_ID], axis=0) else: new_input = tf.concat([CLS_ID, distractors_stripped, SEP_ID], axis=0) input_mask = tf.ones_like(new_input) input_mask = pad_and_cut(input_mask, FLAGS.max_seq_length) input_masks.append(input_mask) padded_new_input = pad_and_cut(new_input, FLAGS.max_seq_length) bert_inputs.append(padded_new_input) bert_inputs = tf.stack(bert_inputs, axis=0) input_masks = tf.stack(input_masks, axis=0) segment_ids = tf.stack(segment_ids, axis=0) out = Outputs_And_Context(bert_inputs, input_masks, segment_ids, None, None) return out
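# Sketch of the padding-stripping idiom build_distractors applies to each
# sampled distractor (illustrative ids): tf.where over tf.not_equal finds the
# non-zero token positions and tf.gather_nd drops the zero padding before the
# [CLS]/[SEP] concatenation.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
tokens = tf.constant([17, 23, 5, 0, 0], dtype=tf.int64)  # zero-padded ids
nonzero = tf.where(tf.not_equal(tokens, tf.zeros_like(tokens)))
stripped = tf.gather_nd(tokens, nonzero)
with tf.Session() as sess:
  print(sess.run(stripped))  # [17 23  5]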
def create_mobile_mask(input_mask):
  return tf.reduce_all(tf.not_equal(0, input_mask), axis=2, keepdims=True)
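# Example for create_mobile_mask (illustrative input, assuming the function
# above is in scope): because of tf.reduce_all, a position counts as valid
# only if *every* element along axis 2 is non-zero.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
input_mask = tf.constant([[[1, 2], [0, 5], [0, 0]]])
with tf.Session() as sess:
  print(sess.run(create_mobile_mask(input_mask)))
  # [[[ True] [False] [False]]]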
def fn_not_eos():
  # Check whether the last predicted element is an EOS token.
  return tf.not_equal(
      tf.squeeze(result[:, -1, :, :]), text_encoder.EOS_ID)
def call(self,
         yesno_logits,
         yesno_labels,
         supporting_fact_logits,
         supporting_fact_labels,
         block_ids,
         num_replicas=None,
         eps=0):
  """Calls the layer.

  Args:
    yesno_logits: <float32>[batch_size, 3] Logits per position.
    yesno_labels: <int32>[batch_size] Labels in [0, 3) used to build the
      one-hot yes/no/span targets.
    supporting_fact_logits: <float32>[batch_size] Logits per position for
      supporting facts classification.
    supporting_fact_labels: [batch_size] Binary supporting-fact labels; cast
      to float32 internally.
    block_ids: <int32>[batch_size] Block IDs of every sample in the batch.
    num_replicas: Number of replicas to gather summaries from. If None
      (default) then cross-replicas summaries are not used.
    eps: <float> Small constant for numerical stability.

  Returns:
    A (yes_no_span_loss, supporting_facts_loss) tuple of <float> scalars.
  """
  batch_size = tf.shape(supporting_fact_logits)[0]
  supporting_fact_logits = tf.expand_dims(supporting_fact_logits, 1)
  supporting_fact_labels = tf.expand_dims(supporting_fact_labels, 1)
  example_mask = tf.cast(
      tf.expand_dims(tf.not_equal(block_ids, 0), 1), tf.float32)

  # (1) Aggregate block_ids across global batch. Compute cross block mask.
  all_block_ids = block_ids
  if num_replicas:
    all_block_ids = tpu_utils.cross_replica_concat(
        tensor=all_block_ids,
        num_replicas=num_replicas,
        name='block_ids_concat')

  # [batch_size, global_batch_size]
  cross_blocks_eq_mask = tf.cast(
      tf.equal(
          tf.expand_dims(block_ids, 1), tf.expand_dims(all_block_ids, 0)),
      tf.float32)

  # (2) Apply softmax over all positions in the (global) batch
  # across the blocks with the same `block_id`.

  # [batch_size, 3, 1]
  yes_no_span_probs = losses.cross_batch_softmax(
      tf.expand_dims(yesno_logits, 2), cross_blocks_eq_mask, num_replicas)
  yes_no_span_probs = tf.squeeze(yes_no_span_probs, 2)

  # [batch_size, 1]
  supporting_facts_probs = losses.cross_batch_softmax(
      tf.expand_dims(supporting_fact_logits, 2), cross_blocks_eq_mask,
      num_replicas)
  supporting_facts_probs = tf.squeeze(supporting_facts_probs, 2)

  # (3) Prepare one-hot labels based on annotation begins and ends
  supporting_fact_labels = tf.cast(supporting_fact_labels, tf.float32)

  # [batch_size, 3]
  yes_no_span_one_hot = tf.one_hot(yesno_labels, depth=3, dtype=tf.float32)
  yes_no_span_one_hot = yes_no_span_one_hot * supporting_fact_labels

  # (4) Compute the probability of the current begin / end positions across
  # the blocks with the same `block_id`.

  def mean_loss(all_losses):
    return tf.reduce_sum(all_losses * example_mask) / (
        tf.reduce_sum(example_mask) + eps)

  supporting_facts_loss = -mean_loss(
      tf.log(supporting_facts_probs * supporting_fact_labels + eps))

  yes_no_span_loss = -mean_loss(
      tf.log(yes_no_span_probs * yes_no_span_one_hot + eps))

  return yes_no_span_loss, supporting_facts_loss
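# Sketch of the cross-block mask built in step (1) of call() above
# (illustrative block ids): entry [i, j] is 1.0 exactly when examples i and j
# share a block_id, so the cross-batch softmax in step (2) normalizes jointly
# over all positions belonging to the same block.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
block_ids = tf.constant([7, 7, 9])
cross_blocks_eq_mask = tf.cast(
    tf.equal(tf.expand_dims(block_ids, 1), tf.expand_dims(block_ids, 0)),
    tf.float32)
with tf.Session() as sess:
  print(sess.run(cross_blocks_eq_mask))
  # [[1. 1. 0.]
  #  [1. 1. 0.]
  #  [0. 0. 1.]]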
def _create_inference(self):
  """Inference used for learning model parameters."""
  # Mapped embeddings for users (u^c, u^i and u^s)
  self.u_c = tf.nn.embedding_lookup(self.uw_c, self.input_u)
  self.u_c = tf.reshape(self.u_c, [-1, self.embedding_size])
  self.u_i = tf.nn.embedding_lookup(self.uw_i, self.input_u)
  self.u_i = tf.reshape(self.u_i, [-1, self.embedding_size])
  self.u_s = tf.nn.embedding_lookup(self.uw_s, self.input_u)
  self.u_s = tf.reshape(self.u_s, [-1, self.embedding_size])
  # Our contribution: a mapped embedding for the questionnaire domain. We have
  # not added a separate shared embedding u^c2; reusing the same u^c shares
  # knowledge between all domains, which may be favorable.
  # TODO: Test with sharing only between the item and questionnaire domains.
  self.u_q = tf.nn.embedding_lookup(self.uw_q, self.input_u)
  self.u_q = tf.reshape(self.u_q, [-1, self.embedding_size])

  # Attentive transferred embeddings for users (p^I_u and p^S_u)
  self.P_iu, self.item_w = self._item_attentive_transfer()
  self.P_su, self.social_w = self._social_attentive_transfer()
  # Our contribution (p^Q_u)
  self.P_qu, self.question_w = self._questionnaire_attentive_transfer()

  # Adding dropout on transferred embeddings to avoid overfitting
  self.P_iu = tf.nn.dropout(self.P_iu, self.dropout_keep_prob)
  self.P_su = tf.nn.dropout(self.P_su, self.dropout_keep_prob)
  self.P_qu = tf.nn.dropout(self.P_qu, self.dropout_keep_prob)

  # Looking up item embeddings from data
  self.pos_item = tf.nn.embedding_lookup(self.Q, self.input_ur)
  # Items used for this inference; ids equal to n_items are padding
  self.pos_n_ratings = tf.cast(tf.not_equal(self.input_ur, self.n_items),
                               'float32')
  # Zero out the embeddings of padded items
  self.pos_item = tf.einsum('ab,abc->abc', self.pos_n_ratings, self.pos_item)
  # Transferred embeddings for items multiplied with item embeddings
  self.pos_r = tf.einsum('ac,abc->abc', self.P_iu, self.pos_item)
  # Need to multiply with H_i as well
  self.pos_r = tf.einsum('ajk,kl->ajl', self.pos_r, self.H_i)
  self.pos_r = tf.reshape(self.pos_r, [-1, max_items])

  # Social embeddings lookup
  self.pos_friend = tf.nn.embedding_lookup(self.G, self.input_uf)
  # Social interactions used for this inference
  self.pos_n_friends = tf.cast(tf.not_equal(self.input_uf, self.n_users),
                               'float32')
  # Obtaining embeddings for socials used in this inference
  self.pos_friend = tf.einsum('ab,abc->abc', self.pos_n_friends,
                              self.pos_friend)
  # Multiplying with social attentive transferred user embeddings
  self.pos_f = tf.einsum('ac,abc->abc', self.P_su, self.pos_friend)
  # Need to multiply with H_s as well, mirroring the item branch
  self.pos_f = tf.einsum('abc,cd->abd', self.pos_f, self.H_s)
  self.pos_f = tf.reshape(self.pos_f, [-1, max_friends])

  # Questionnaire embeddings lookup
  self.pos_questions = tf.nn.embedding_lookup(self.V, self.input_uq)
  # Answered questions for this inference; consider whether this should use
  # the number of questions instead
  self.pos_n_questions = tf.cast(tf.not_equal(self.input_uq, self.n_items),
                                 'float32')
  # Obtaining embeddings for questions used in this inference
  self.pos_questions = tf.einsum('ab,abc->abc', self.pos_n_questions,
                                 self.pos_questions)
  # Multiplying with question attentive transferred user embeddings
  self.pos_q = tf.einsum('ac,abc->abc', self.P_qu, self.pos_questions)
  # Need to multiply with H_q as well, mirroring the item branch
  self.pos_q = tf.einsum('abc,cd->abd', self.pos_q, self.H_q)
  self.pos_q = tf.reshape(self.pos_q, [-1, self.max_questions])
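# Sketch of the masking idiom used throughout _create_inference (illustrative
# shapes and ids): ids equal to n_items are padding, the comparison yields a
# 0/1 float mask, and the 'ab,abc->abc' einsum zeroes the embeddings of the
# padded positions.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
n_items = 4
ids = tf.constant([[0, 2, 4, 4]])  # 4 == padding id
embeds = tf.ones([1, 4, 3])
mask = tf.cast(tf.not_equal(ids, n_items), 'float32')
with tf.Session() as sess:
  print(sess.run(tf.einsum('ab,abc->abc', mask, embeds)))  # rows 2, 3 zeroed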
def _loss_function(conf_gt, conf_logits, reg_gt, reg_logits, config):
  """Creates the PPN loss function.

  Args:
    conf_gt: Ground truth confidence, i.e. 1 for close anchors, 0 for anchors
      that are too far off and -1 for anchors to be ignored. Must have shape
      (?, fh, fw, k).
    conf_logits: PPN confidence output, must have shape (?, fh, fw, k).
    reg_gt: Ground truth point offsets, need only have valid values for the
      anchors with conf_gt of 1. Must have shape (?, fh, fw, 2k).
    reg_logits: PPN anchor offset output, must have shape (?, fh, fw, 2k).
    config: The configuration dictionary. See ppn.config.ppn_config.

  Returns:
    A (conf_loss, point_loss) tuple.
  """
  import tensorflow.compat.v1 as tf

  # Mask out the invalid anchors:
  # only penalize confidence of valid (i.e. not ignored) anchors,
  # only penalize points of positive anchors.
  valid_mask = tf.stop_gradient(tf.not_equal(conf_gt, -1))
  pos_mask = tf.stop_gradient(tf.equal(conf_gt, 1))
  num_valid = tf.stop_gradient(tf.count_nonzero(valid_mask, dtype=tf.int32))
  num_pos = tf.stop_gradient(tf.count_nonzero(pos_mask, dtype=tf.int32))
  valid_conf_gt = tf.boolean_mask(conf_gt, valid_mask)
  valid_conf_logits = tf.boolean_mask(conf_logits, valid_mask)
  pos_reg_gt = tf.boolean_mask(reg_gt, pos_mask)
  pos_reg_logits = tf.boolean_mask(reg_logits, pos_mask)

  if config['loss_function'] == 'crossentropy':
    # get the confidence loss using sigmoidal cross entropy
    conf_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.cast(valid_conf_gt, tf.float32), logits=valid_conf_logits)
  else:
    # get the confidence loss using focal loss
    conf_loss = _binary_focal_loss_with_logits(
        labels=tf.cast(valid_conf_gt, tf.float32),
        logits=valid_conf_logits,
        gamma=config['focal_gamma'],
        pos_weight=config['focal_pos_weight'])
    if config['focal_normalized']:
      # normalize according to the number of valid anchors
      conf_loss = conf_loss / tf.cast(num_valid, tf.float32)
  conf_loss = tf.reduce_sum(conf_loss)

  # get the point loss using MSE
  point_loss = tf.losses.mean_squared_error(
      labels=pos_reg_gt,
      predictions=pos_reg_logits,
      reduction=tf.losses.Reduction.SUM)

  # zero out the losses if there were no valid / positive anchors
  conf_loss = tf.where(
      tf.equal(num_valid, 0), 0.0, conf_loss, name='conf_loss')
  point_loss = tf.where(
      tf.equal(num_pos, 0), 0.0, point_loss, name='point_loss')

  # normalize losses so that both contribute on a comparable scale
  N_conf, N_reg = config['N_conf'], config['N_reg']
  return (1.0 / N_conf) * conf_loss, (1.0 / N_reg) * point_loss
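# Sketch of the anchor-masking convention in _loss_function (illustrative
# values): conf_gt uses 1 for positive, 0 for negative and -1 for ignored
# anchors; boolean_mask drops the ignored ones so only valid anchors enter
# the confidence loss.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()
conf_gt = tf.constant([1, 0, -1, 1])
conf_logits = tf.constant([2.0, -1.0, 5.0, 0.5])
valid_mask = tf.not_equal(conf_gt, -1)
conf_loss = tf.reduce_sum(
    tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.cast(tf.boolean_mask(conf_gt, valid_mask), tf.float32),
        logits=tf.boolean_mask(conf_logits, valid_mask)))
with tf.Session() as sess:
  print(sess.run(conf_loss))  # the ignored anchor's logit (5.0) plays no part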