def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
  """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary defines hyperparameters of model. The default
      settings are in default_hparams function in this file.
    model: the model outputs class logits and box regression outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training or evaluation. In PREDICT
      mode a plain tf.estimator.EstimatorSpec holding the input image and the
      per-level class/box outputs is returned instead.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set.
  """
  # Inputs arrive as NHWC; move channels to axis 1 when the model is
  # configured for channels_first.
  if params['data_format'] == 'channels_first':
    features = tf.transpose(features, [0, 3, 1, 2])

  def _model_outputs(inputs):
    return model(inputs, config=hparams_config.Config(params))

  cls_outputs, box_outputs = utils.build_model_with_precision(
      params['precision'], _model_outputs, features)

  # Losses and predictions are always computed in float32, whatever precision
  # the model was built with.
  levels = cls_outputs.keys()
  for level in levels:
    cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
    box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

  # First check if it is in PREDICT mode.
  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        'image': features,
    }
    for level in levels:
      predictions['cls_outputs_%d' % level] = cls_outputs[level]
      predictions['box_outputs_%d' % level] = box_outputs[level]
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  # Set up training loss and learning rate.
  update_learning_rate_schedule_parameters(params)
  global_step = tf.train.get_or_create_global_step()
  learning_rate = learning_rate_schedule(params, global_step)

  # cls_loss and box_loss are for logging. only total_loss is optimized.
  det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
      cls_outputs, box_outputs, labels, params)
  l2loss = reg_l2_loss(params['weight_decay'])
  total_loss = det_loss + l2loss

  if mode == tf.estimator.ModeKeys.TRAIN:
    utils.scalar('lrn_rate', learning_rate)
    utils.scalar('trainloss/cls_loss', cls_loss)
    utils.scalar('trainloss/box_loss', box_loss)
    utils.scalar('trainloss/box_iou_loss', box_iou_loss)
    utils.scalar('trainloss/det_loss', det_loss)
    utils.scalar('trainloss/l2_loss', l2loss)
    utils.scalar('trainloss/loss', total_loss)

  moving_average_decay = params['moving_average_decay']
  if moving_average_decay:
    ema = tf.train.ExponentialMovingAverage(
        decay=moving_average_decay, num_updates=global_step)
    ema_vars = utils.get_ema_vars()

  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.MomentumOptimizer(
        learning_rate, momentum=params['momentum'])
    if params['use_tpu']:
      optimizer = tf.tpu.CrossShardOptimizer(optimizer)

    # Batch norm requires update_ops to be added as a train_op dependency.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    var_list = tf.trainable_variables()
    if variable_filter_fn:
      var_list = variable_filter_fn(var_list)

    # A missing key and an explicit None both mean "no clipping"; comparing
    # None with 0 directly would raise TypeError on Python 3.
    clip_norm = params.get('clip_gradients_norm') or 0
    if clip_norm > 0:
      logging.info('clip gradients norm by %f', clip_norm)
      grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
      with tf.name_scope('clip'):
        grads = [gv[0] for gv in grads_and_vars]
        tvars = [gv[1] for gv in grads_and_vars]
        clipped_grads, gnorm = tf.clip_by_global_norm(grads, clip_norm)
        utils.scalar('gnorm', gnorm)
        grads_and_vars = list(zip(clipped_grads, tvars))

      with tf.control_dependencies(update_ops):
        train_op = optimizer.apply_gradients(grads_and_vars, global_step)
    else:
      with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(
            total_loss, global_step, var_list=var_list)

    if moving_average_decay:
      # The EMA update runs after the optimizer step and becomes the train op.
      with tf.control_dependencies([train_op]):
        train_op = ema.apply(ema_vars)
  else:
    train_op = None

  eval_metrics = None
  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(**kwargs):
      """Returns a dictionary that has the evaluation metrics."""
      batch_size = params['batch_size']
      if params['use_tpu']:
        batch_size = params['batch_size'] * params['num_shards']
      eval_anchors = anchors.Anchors(params['min_level'],
                                     params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     params['image_size'])
      anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                             params['num_classes'])
      cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
      box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])

      if params.get('testdev_dir', None):
        logging.info('Eval testdev_dir %s', params['testdev_dir'])
        coco_metrics = coco_metric_fn(
            batch_size,
            anchor_labeler,
            params['val_json_file'],
            testdev_dir=params['testdev_dir'],
            disable_pyfun=params.get('disable_pyfun', None),
            **kwargs)
      else:
        logging.info('Eval val with groudtruths %s.', params['val_json_file'])
        coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                      params['val_json_file'], **kwargs)

      # Add metrics to output.
      output_metrics = {
          'cls_loss': cls_loss,
          'box_loss': box_loss,
      }
      output_metrics.update(coco_metrics)
      return output_metrics

    # The scalar losses are tiled to [batch_size, 1] so they can be passed to
    # metric_fn alongside the other batched inputs.
    cls_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(cls_loss, 0), [params['batch_size'],]),
        [params['batch_size'], 1])
    box_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(box_loss, 0), [params['batch_size'],]),
        [params['batch_size'], 1])
    metric_fn_inputs = {
        'cls_loss_repeat': cls_loss_repeat,
        'box_loss_repeat': box_loss_repeat,
        'source_ids': labels['source_ids'],
        'groundtruth_data': labels['groundtruth_data'],
        'image_scales': labels['image_scales'],
    }
    add_metric_fn_inputs(params, cls_outputs, box_outputs, metric_fn_inputs)
    eval_metrics = (metric_fn, metric_fn_inputs)

  checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

  if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
    # Initialize the model from an EfficientDet or backbone checkpoint.
    if params.get('ckpt') and params.get('backbone_ckpt'):
      raise RuntimeError(
          '--backbone_ckpt and --checkpoint are mutually exclusive')

    if params.get('backbone_ckpt'):
      var_scope = params['backbone_name'] + '/'
      if params['ckpt_var_scope'] is None:
        # Use backbone name as default checkpoint scope.
        ckpt_scope = params['backbone_name'] + '/'
      else:
        ckpt_scope = params['ckpt_var_scope'] + '/'
    else:
      # Load every var in the given checkpoint
      var_scope = ckpt_scope = '/'

    def scaffold_fn():
      """Loads pretrained model through scaffold function."""
      logging.info('restore variables from %s', checkpoint)

      var_map = utils.get_ckpt_var_map(
          ckpt_path=checkpoint,
          ckpt_scope=ckpt_scope,
          var_scope=var_scope,
          var_exclude_expr=params.get('var_exclude_expr', None))

      tf.train.init_from_checkpoint(checkpoint, var_map)

      return tf.train.Scaffold()
  elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

    def scaffold_fn():
      """Load moving average variables for eval."""
      logging.info('Load EMA vars with ema_decay=%f', moving_average_decay)
      restore_vars_dict = ema.variables_to_restore(ema_vars)
      saver = tf.train.Saver(restore_vars_dict)
      return tf.train.Scaffold(saver=saver)
  else:
    scaffold_fn = None

  return tf.estimator.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op,
      eval_metrics=eval_metrics,
      host_call=utils.get_tpu_host_call(global_step, params),
      scaffold_fn=scaffold_fn)
def style_image_inputs(style_dataset_file, batch_size=None, image_size=None,
                       square_crop=False, shuffle=True):
  """Loads a batch of random style image given the path of tfrecord dataset.

  Args:
    style_dataset_file: str, path to the tfrecord dataset of style files.
      The dataset is produced via the create_style_dataset.py script and is
      made of Example protobufs with the following features:
      * 'image_raw': byte encoding of the JPEG string of the style image.
      * 'label': integer identifier of the style image in [0, N - 1], where
        N is the number of examples in the dataset.
      * 'vgg_16/<LAYER_NAME>': Gram matrix at layer <LAYER_NAME> of the VGG-16
        network (<LAYER_NAME> in {conv,pool}{1,2,3,4,5}) for the style image.
    batch_size: int. If provided, batches style images. Defaults to None.
    image_size: int. The images will be resized bilinearly so that the
      smallest side has size image_size. Defaults to None.
    square_crop: bool. If True, square-crops to [image_size, image_size].
      Defaults to False.
    shuffle: bool, whether to shuffle style files at random. Defaults to True.

  Returns:
    A tuple (image, label, gram_matrices):
      image: float tensor scaled by 1/255. With batch_size set it is the
        batched [batch_size, image_size, image_size, 3] tensor; otherwise it
        is a single image with a leading dimension of 1 added.
      label: the 'label' feature of the example(s); batched only when
        batch_size is set.
      gram_matrices: dict mapping each 'vgg_16/<LAYER_NAME>' key to the
        corresponding Gram matrix tensor; batched only when batch_size is set.

  Raises:
    ValueError: if center cropping is requested but no image size is provided,
      or if batch size is specified but center-cropping is not requested.
  """
  # (feature key, Gram matrix side length). Single source of truth for both
  # the parsing spec and the order of the returned Gram matrices.
  gram_matrix_sizes = (
      ('vgg_16/conv1', 64), ('vgg_16/pool1', 64),
      ('vgg_16/conv2', 128), ('vgg_16/pool2', 128),
      ('vgg_16/conv3', 256), ('vgg_16/pool3', 256),
      ('vgg_16/conv4', 512), ('vgg_16/pool4', 512),
      ('vgg_16/conv5', 512), ('vgg_16/pool5', 512),
  )
  vgg_layers = [layer for layer, _ in gram_matrix_sizes]

  if square_crop and image_size is None:
    raise ValueError('center-cropping requires specifying the image size.')

  # tf.train.batch needs statically shaped images, which only the square crop
  # guarantees (it calls set_shape below).
  if batch_size is not None and not square_crop:
    raise ValueError('batching requires center-cropping.')

  with tf.name_scope('style_image_processing'):
    filename_queue = tf.train.string_input_producer(
        [style_dataset_file], shuffle=False, capacity=1,
        name='filename_queue')
    if shuffle:
      examples_queue = tf.RandomShuffleQueue(
          capacity=64,
          min_after_dequeue=32,
          dtypes=[tf.string], name='random_examples_queue')
    else:
      examples_queue = tf.FIFOQueue(
          capacity=64,
          dtypes=[tf.string], name='fifo_examples_queue')

    # Records are read from the file queue and re-enqueued into the examples
    # queue, which is where the (optional) shuffling happens.
    reader = tf.TFRecordReader()
    _, value = reader.read(filename_queue)
    enqueue_ops = [examples_queue.enqueue([value])]
    tf.train.queue_runner.add_queue_runner(
        tf.train.queue_runner.QueueRunner(examples_queue, enqueue_ops))
    example_serialized = examples_queue.dequeue()

    feature_spec = {
        'label': tf.FixedLenFeature([], tf.int64),
        'image_raw': tf.FixedLenFeature([], tf.string),
    }
    for layer, size in gram_matrix_sizes:
      feature_spec[layer] = tf.FixedLenFeature([size, size], tf.float32)
    features = tf.parse_single_example(
        example_serialized, features=feature_spec)

    image = tf.image.decode_jpeg(features['image_raw'])
    label = features['label']
    gram_matrices = [features[vgg_layer] for vgg_layer in vgg_layers]
    image.set_shape([None, None, 3])

    if image_size:
      if square_crop:
        # Resize to two pixels more than the target, then crop the center.
        image = _aspect_preserving_resize(image, image_size + 2)
        image = _central_crop([image], image_size, image_size)[0]
        image.set_shape([image_size, image_size, 3])
      else:
        image = _aspect_preserving_resize(image, image_size)

    image = tf.to_float(image) / 255.0

    if batch_size is None:
      image = tf.expand_dims(image, 0)
    else:
      image_label_gram_matrices = tf.train.batch(
          [image, label] + gram_matrices, batch_size=batch_size)
      image, label = image_label_gram_matrices[:2]
      gram_matrices = image_label_gram_matrices[2:]

    gram_matrices = dict(zip(vgg_layers, gram_matrices))
    return image, label, gram_matrices
def _parse_example_proto(example_serialized):
  """Parses one serialized image Example proto into its training tensors.

  The Example protos are produced by the build_image_data.py preprocessing
  script. Each one carries the following fields:

    image/height: 462
    image/width: 581
    image/colorspace: 'RGB'
    image/channels: 3
    image/class/label: 615
    image/class/synset: 'n03623198'
    image/class/text: 'knee pad'
    image/object/bbox/xmin: 0.1
    image/object/bbox/xmax: 0.9
    image/object/bbox/ymin: 0.2
    image/object/bbox/ymax: 0.6
    image/object/bbox/label: 615
    image/format: 'JPEG'
    image/filename: 'ILSVRC2012_val_00041207.JPEG'
    image/encoded: <JPEG encoded string>

  Only the encoded image, the class label, the class text and the four
  bounding-box coordinate lists are read here.

  Args:
    example_serialized: scalar Tensor tf.string containing a serialized
      Example protocol buffer.

  Returns:
    image_buffer: Tensor tf.string containing the contents of a JPEG file.
    label: Tensor tf.int32 containing the label.
    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
      where each coordinate is [0, 1) and the coordinates are arranged as
      [ymin, xmin, ymax, xmax].
    text: Tensor tf.string containing the human-readable label.
  """
  bbox_keys = (
      'image/object/bbox/xmin',
      'image/object/bbox/ymin',
      'image/object/bbox/xmax',
      'image/object/bbox/ymax',
  )

  # Fixed-length (dense) entries first, then the variable-length box
  # coordinates, which all share a single sparse float32 spec.
  feature_map = {
      'image/encoded':
          tf.FixedLenFeature([], dtype=tf.string, default_value=''),
      'image/class/label':
          tf.FixedLenFeature([1], dtype=tf.int64, default_value=-1),
      'image/class/text':
          tf.FixedLenFeature([], dtype=tf.string, default_value=''),
  }
  sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
  for key in bbox_keys:
    feature_map[key] = sparse_float32

  parsed = tf.parse_single_example(example_serialized, feature_map)
  label = tf.cast(parsed['image/class/label'], dtype=tf.int32)

  # One [1, num_boxes] row per coordinate, built in the proto's x-before-y
  # key order.
  xmin, ymin, xmax, ymax = [
      tf.expand_dims(parsed[key].values, 0) for key in bbox_keys
  ]

  # Rows are stacked in (y, x) order, giving [coords, num_boxes]; a leading
  # axis is then added and the last two axes swapped to reach
  # [1, num_boxes, coords].
  stacked = tf.concat([ymin, xmin, ymax, xmax], 0)
  bbox = tf.transpose(tf.expand_dims(stacked, 0), [0, 2, 1])

  return parsed['image/encoded'], label, bbox, parsed['image/class/text']
def decode(self,
           decoder_input,
           encoder_output,
           encoder_decoder_attention_bias,
           decoder_self_attention_bias,
           hparams,
           cache=None,
           decode_loop_step=None,
           nonpadding=None,
           losses=None,
           **kwargs):
  """Runs the Universal Transformer decoder over the encoder representation.

  Mirrors "transformer.decode" but delegates to
  "universal_transformer_util.universal_transformer_decoder" rather than
  "transformer.transformer_decoder".

  Args:
    decoder_input: inputs to bottom of the model. [batch_size, decoder_length,
      hidden_dim]
    encoder_output: Encoder representation. [batch_size, input_length,
      hidden_dim]
    encoder_decoder_attention_bias: Bias and mask weights for encoder-decoder
      attention. [batch_size, input_length]
    decoder_self_attention_bias: Bias and mask weights for decoder
      self-attention. [batch_size, decoder_length]
    hparams: hyperparameters for model; layer_prepostprocess_dropout is read
      here.
    cache: Unimplemented; discarded.
    decode_loop_step: Unused.
    nonpadding: optional Tensor with shape [batch_size, decoder_length]
    losses: Unused.
    **kwargs: accepted for signature compatibility; not referenced in this
      method.

  Returns:
    Tuple of:
      Final decoder representation with an extra axis inserted at position 2.
        [batch_size, decoder_length, 1, hidden_dim]
      dec_extra_output: the second value returned by the universal transformer
        decoder (extra output used in some variants of the model, e.g. in ACT,
        to pass the ponder-time to body).
  """
  # No caching in Universal Transformers!
  # TODO(dehghani): enable caching.
  del cache, decode_loop_step, losses

  keep_prob = 1.0 - hparams.layer_prepostprocess_dropout
  dropped_input = tf.nn.dropout(decoder_input, keep_prob)

  decoder_result = universal_transformer_util.universal_transformer_decoder(
      dropped_input,
      encoder_output,
      decoder_self_attention_bias,
      encoder_decoder_attention_bias,
      hparams,
      nonpadding=nonpadding,
      save_weights_to=self.attention_weights)
  decoder_output, dec_extra_output = decoder_result

  # Expand since t2t expects 4d tensors.
  expanded_output = tf.expand_dims(decoder_output, axis=2)
  return expanded_output, dec_extra_output
def get_prediction_module(self, bert_model, features, is_training,
                          percent_done):
  """Builds the question-answering heads and the per-example loss.

  The loss is the sum of: the start/end span loss, the plausible-answer
  start/end loss, the answerability loss (only when
  config.answerable_classifier is set) weighted by config.answerable_weight,
  and 0.5 times the value returned by rl_loss.

  NOTE(review): several tf.layers.dense calls here are unnamed, so their
  variable names depend on creation order. Keep the statement order intact
  when editing.

  Args:
    bert_model: encoder object; only get_sequence_output() is used and its
      result must be rank 3 (batch, length, hidden).
    features: dict of input tensors. Keys read here: "input_mask",
      "segment_ids", and, prefixed with self.name, "_start_positions",
      "_end_positions", "_plau_answer_start", "_plau_answer_end", "_eid" and
      (when the answerable classifier is enabled) "_is_impossible".
    is_training: Python bool. With joint prediction it selects between
      conditioning the end head on the gold start position (training) and on
      the top-k predicted starts (otherwise).
    percent_done: not referenced in this method.

  Returns:
    A tuple (losses, outputs) where outputs is a dict holding the loss, the
    start/end logits, the answerable logit, the gold start/end positions, the
    top-k start/end log-probs and indices, and the example ids.
  """
  final_hidden = bert_model.get_sequence_output()
  final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
  batch_size = final_hidden_shape[0]
  seq_length = final_hidden_shape[1]

  # Positions allowed as answers: input_mask * segment_ids, plus position 0.
  # Presumably position 0 is the [CLS] token; verify against the data
  # pipeline.
  answer_mask = tf.cast(features["input_mask"], tf.float32)
  answer_mask *= tf.cast(features["segment_ids"], tf.float32)
  answer_mask += tf.one_hot(0, seq_length)

  start_logits = tf.squeeze(tf.layers.dense(final_hidden, 1), -1)

  # Zero placeholders for the beam outputs; they are only overwritten on the
  # joint-prediction path below.
  start_top_log_probs = tf.zeros([batch_size, self.config.beam_size])
  start_top_index = tf.zeros([batch_size, self.config.beam_size], tf.int32)
  end_top_log_probs = tf.zeros(
      [batch_size, self.config.beam_size, self.config.beam_size])
  end_top_index = tf.zeros(
      [batch_size, self.config.beam_size, self.config.beam_size], tf.int32)
  if self.config.joint_prediction:
    # Adding 1000 * (mask - 1) subtracts 1000 from logits wherever the mask
    # is 0, which effectively removes those positions after the softmax.
    start_logits += 1000.0 * (answer_mask - 1)
    start_log_probs = tf.nn.log_softmax(start_logits)
    start_top_log_probs, start_top_index = tf.nn.top_k(
        start_log_probs, k=self.config.beam_size)

    if not is_training:
      # batch, beam, length, hidden
      end_features = tf.tile(tf.expand_dims(final_hidden, 1),
                             [1, self.config.beam_size, 1, 1])
      # batch, beam, length
      start_index = tf.one_hot(start_top_index, depth=seq_length, axis=-1,
                               dtype=tf.float32)
      # batch, beam, hidden
      start_features = tf.reduce_sum(
          tf.expand_dims(final_hidden, 1) * tf.expand_dims(start_index, -1),
          axis=-2)
      # batch, beam, length, hidden
      start_features = tf.tile(tf.expand_dims(start_features, 2),
                               [1, 1, seq_length, 1])
    else:
      # Training: condition on the gold start position instead of a beam.
      start_index = tf.one_hot(features[self.name + "_start_positions"],
                               depth=seq_length, axis=-1, dtype=tf.float32)
      start_features = tf.reduce_sum(
          tf.expand_dims(start_index, -1) * final_hidden, axis=1)
      start_features = tf.tile(tf.expand_dims(start_features, 1),
                               [1, seq_length, 1])
      end_features = final_hidden

    final_repr = tf.concat([start_features, end_features], -1)
    final_repr = tf.layers.dense(final_repr, 512, activation=modeling.gelu,
                                 name="qa_hidden")
    # batch, beam, length (batch, length when training)
    end_logits = tf.squeeze(tf.layers.dense(final_repr, 1), -1,
                            name="qa_logits")
    if is_training:
      end_logits += 1000.0 * (answer_mask - 1)
    else:
      # The mask gets a beam axis so it broadcasts over the beam.
      end_logits += tf.expand_dims(1000.0 * (answer_mask - 1), 1)

    if not is_training:
      end_log_probs = tf.nn.log_softmax(end_logits)
      end_top_log_probs, end_top_index = tf.nn.top_k(
          end_log_probs, k=self.config.beam_size)
      # The beam-shaped end logits are replaced by zeros of shape
      # [batch, length], so the end-position losses below are computed on
      # zeros on this path.
      end_logits = tf.zeros([batch_size, seq_length])
  else:
    # Independent start/end heads.
    end_logits = tf.squeeze(tf.layers.dense(final_hidden, 1), -1)
    start_logits += 1000.0 * (answer_mask - 1)
    end_logits += 1000.0 * (answer_mask - 1)

  def compute_loss(logits, positions):
    # Negative log-likelihood of the gold position under softmax(logits).
    one_hot_positions = tf.one_hot(positions, depth=seq_length,
                                   dtype=tf.float32)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    loss = -tf.reduce_sum(one_hot_positions * log_probs, axis=-1)
    return loss

  start_positions = features[self.name + "_start_positions"]
  end_positions = features[self.name + "_end_positions"]

  start_loss = compute_loss(start_logits, start_positions)
  end_loss = compute_loss(end_logits, end_positions)

  losses = (start_loss + end_loss) / 2.0

  # plausible answer loss
  # One dense layer emits two channels per token; the channel axis is moved
  # to the front and split into start logits and end logits.
  plau_logits = tf.layers.dense(final_hidden, 2)
  plau_logits = tf.reshape(plau_logits, [batch_size, seq_length, 2])
  plau_logits = tf.transpose(plau_logits, [2, 0, 1])
  unstacked_logits = tf.unstack(plau_logits, axis=0)
  (plau_start_logits, plau_end_logits) = (unstacked_logits[0],
                                          unstacked_logits[1])
  plau_start_logits += 1000.0 * (answer_mask - 1)
  plau_end_logits += 1000.0 * (answer_mask - 1)
  plau_start_positions = features[self.name + "_plau_answer_start"]
  plau_end_positions = features[self.name + "_plau_answer_end"]
  plau_start_loss = compute_loss(plau_start_logits, plau_start_positions)
  plau_end_loss = compute_loss(plau_end_logits, plau_end_positions)
  losses += (plau_start_loss + plau_end_loss) / 2.0

  answerable_logit = tf.zeros([batch_size])
  if self.config.answerable_classifier:
    # Classify from the hidden state at position 0, optionally concatenated
    # with the start-probability-weighted sum of hidden states.
    final_repr = final_hidden[:, 0]
    if self.config.answerable_uses_start_logits:
      start_p = tf.nn.softmax(start_logits)
      start_feature = tf.reduce_sum(tf.expand_dims(start_p, -1) *
                                    final_hidden, axis=1)
      final_repr = tf.concat([final_repr, start_feature], -1)
      final_repr = tf.layers.dense(final_repr, 512,
                                   activation=modeling.gelu)
    answerable_logit = tf.squeeze(tf.layers.dense(final_repr, 1), -1)
    answerable_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.cast(features[self.name + "_is_impossible"], tf.float32),
        logits=answerable_logit)
    losses += answerable_loss * self.config.answerable_weight

  # Extra loss term from the project's rl_loss helper, weighted by 0.5. Its
  # semantics are not visible here; it is given the logits and gold positions.
  from finetune.qa.rl_loss import rl_loss
  loss_rl = rl_loss(start_logits, end_logits, start_positions, end_positions,
                    sample_num=4)
  losses += 0.5 * loss_rl

  return losses, dict(
      loss=losses,
      start_logits=start_logits,
      end_logits=end_logits,
      answerable_logit=answerable_logit,
      start_positions=features[self.name + "_start_positions"],
      end_positions=features[self.name + "_end_positions"],
      start_top_log_probs=start_top_log_probs,
      start_top_index=start_top_index,
      end_top_log_probs=end_top_log_probs,
      end_top_index=end_top_index,
      eid=features[self.name + "_eid"],
  )
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
  """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary defines hyperparameters of model. The default
      settings are in default_hparams function in this file. Note that
      params['nms_configs']['max_nms_inputs'] is overwritten in EVAL mode.
    model: the model outputs class logits and box regression outputs. Ignored
      when params['use_keras_model'] is set.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training or evaluation. In PREDICT
      mode a plain tf.estimator.EstimatorSpec holding the input image and the
      per-level class/box outputs is returned instead.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set.
    ValueError: if params['optimizer'] is neither 'sgd' nor 'adam' (TRAIN
      mode only).
  """
  utils.image('input_image', features)
  training_hooks = []
  if params['use_keras_model']:

    def model_fn(inputs):
      model = efficientdet_keras.EfficientDetNet(
          config=hparams_config.Config(params))
      cls_out_list, box_out_list = model(inputs, params['is_training_bn'])
      # The Keras model returns per-level lists; re-key them by feature level
      # so the rest of this function can treat both model kinds alike.
      cls_outputs, box_outputs = {}, {}
      for i in range(params['min_level'], params['max_level'] + 1):
        cls_outputs[i] = cls_out_list[i - params['min_level']]
        box_outputs[i] = box_out_list[i - params['min_level']]
      return cls_outputs, box_outputs
  else:
    model_fn = functools.partial(model, config=hparams_config.Config(params))

  precision = utils.get_precision(params['strategy'],
                                  params['mixed_precision'])
  cls_outputs, box_outputs = utils.build_model_with_precision(
      precision, model_fn, features, params['is_training_bn'])

  # Losses and predictions are always computed in float32.
  levels = cls_outputs.keys()
  for level in levels:
    cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
    box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

  # First check if it is in PREDICT mode.
  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        'image': features,
    }
    for level in levels:
      predictions['cls_outputs_%d' % level] = cls_outputs[level]
      predictions['box_outputs_%d' % level] = box_outputs[level]
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  # Set up training loss and learning rate.
  update_learning_rate_schedule_parameters(params)
  global_step = tf.train.get_or_create_global_step()
  learning_rate = learning_rate_schedule(params, global_step)

  # cls_loss and box_loss are for logging. only total_loss is optimized.
  det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
      cls_outputs, box_outputs, labels, params)
  reg_l2loss = reg_l2_loss(params['weight_decay'])
  total_loss = det_loss + reg_l2loss

  if mode == tf.estimator.ModeKeys.TRAIN:
    utils.scalar('lrn_rate', learning_rate)
    utils.scalar('trainloss/cls_loss', cls_loss)
    utils.scalar('trainloss/box_loss', box_loss)
    utils.scalar('trainloss/det_loss', det_loss)
    utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
    utils.scalar('trainloss/loss', total_loss)
    if params['iou_loss_type']:
      utils.scalar('trainloss/box_iou_loss', box_iou_loss)
    train_epochs = tf.cast(global_step,
                           tf.float32) / params['steps_per_epoch']
    utils.scalar('train_epochs', train_epochs)

  moving_average_decay = params['moving_average_decay']
  if moving_average_decay:
    ema = tf.train.ExponentialMovingAverage(
        decay=moving_average_decay, num_updates=global_step)
    ema_vars = utils.get_ema_vars()

  if params['strategy'] == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    # Scale the learning rate by the number of horovod workers. The
    # 'lrn_rate' summary above records the unscaled value.
    learning_rate = learning_rate * hvd.size()

  if mode == tf.estimator.ModeKeys.TRAIN:
    if params['optimizer'].lower() == 'sgd':
      optimizer = tf.train.MomentumOptimizer(
          learning_rate, momentum=params['momentum'])
    elif params['optimizer'].lower() == 'adam':
      optimizer = tf.train.AdamOptimizer(learning_rate)
    else:
      raise ValueError('optimizers should be adam or sgd')

    if params['strategy'] == 'tpu':
      optimizer = tf.tpu.CrossShardOptimizer(optimizer)
    elif params['strategy'] == 'horovod':
      optimizer = hvd.DistributedOptimizer(optimizer)
      training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

    # Batch norm requires update_ops to be added as a train_op dependency.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    var_list = tf.trainable_variables()
    if variable_filter_fn:
      var_list = variable_filter_fn(var_list)

    if params.get('clip_gradients_norm', None):
      logging.info('clip gradients norm by %f', params['clip_gradients_norm'])
      grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
      with tf.name_scope('clip'):
        grads = [gv[0] for gv in grads_and_vars]
        tvars = [gv[1] for gv in grads_and_vars]
        # First clip each variable's norm, then clip global norm.
        clip_norm = abs(params['clip_gradients_norm'])
        # compute_gradients yields None for variables the loss does not
        # depend on. tf.clip_by_norm cannot take None, so those entries are
        # passed through; the global-norm ops below ignore None entries.
        clipped_grads = [
            tf.clip_by_norm(g, clip_norm) if g is not None else None
            for g in grads
        ]
        clipped_grads, _ = tf.clip_by_global_norm(clipped_grads, clip_norm)
        utils.scalar('gradient_norm', tf.linalg.global_norm(clipped_grads))
        grads_and_vars = list(zip(clipped_grads, tvars))

      with tf.control_dependencies(update_ops):
        train_op = optimizer.apply_gradients(grads_and_vars, global_step)
    else:
      with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(
            total_loss, global_step, var_list=var_list)

    if moving_average_decay:
      # The EMA update runs after the optimizer step and becomes the train op.
      with tf.control_dependencies([train_op]):
        train_op = ema.apply(ema_vars)
  else:
    train_op = None

  eval_metrics = None
  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(**kwargs):
      """Returns a dictionary that has the evaluation metrics."""
      if params['nms_configs'].get('pyfunc', True):
        # NumPy NMS, run one image at a time through tf.numpy_function.
        detections_bs = []
        for index in range(kwargs['boxes'].shape[0]):
          nms_configs = params['nms_configs']
          detections = tf.numpy_function(
              functools.partial(nms_np.per_class_nms, nms_configs=nms_configs),
              [
                  kwargs['boxes'][index],
                  kwargs['scores'][index],
                  kwargs['classes'][index],
                  tf.slice(kwargs['image_ids'], [index], [1]),
                  tf.slice(kwargs['image_scales'], [index], [1]),
                  params['num_classes'],
                  nms_configs['max_output_size'],
              ], tf.float32)
          detections_bs.append(detections)
        detections_bs = postprocess.transform_detections(
            tf.stack(detections_bs))
      else:
        # These two branches should be equivalent, but currently they are not.
        # TODO(tanmingxing): enable the non_pyfun path after bug fix.
        nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
            params, kwargs['boxes'], kwargs['scores'], kwargs['classes'],
            kwargs['image_scales'])
        img_ids = tf.cast(
            tf.expand_dims(kwargs['image_ids'], -1), nms_scores.dtype)
        # Per-detection columns: image id, box column 1, box column 0,
        # (column 3 - column 1), (column 2 - column 0), score, class.
        detections_bs = [
            img_ids * tf.ones_like(nms_scores),
            nms_boxes[:, :, 1],
            nms_boxes[:, :, 0],
            nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
            nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
            nms_scores,
            nms_classes,
        ]
        detections_bs = tf.stack(detections_bs, axis=-1, name='detnections')

      if params.get('testdev_dir', None):
        logging.info('Eval testdev_dir %s', params['testdev_dir'])
        eval_metric = coco_metric.EvaluationMetric(
            testdev_dir=params['testdev_dir'])
        coco_metrics = eval_metric.estimator_metric_fn(detections_bs,
                                                       tf.zeros([1]))
      else:
        logging.info('Eval val with groudtruths %s.', params['val_json_file'])
        eval_metric = coco_metric.EvaluationMetric(
            filename=params['val_json_file'])
        coco_metrics = eval_metric.estimator_metric_fn(
            detections_bs, kwargs['groundtruth_data'], params['label_map'])

      # Add metrics to output.
      cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
      box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
      output_metrics = {
          'cls_loss': cls_loss,
          'box_loss': box_loss,
      }
      output_metrics.update(coco_metrics)
      return output_metrics

    # The scalar losses are tiled to [batch_size, 1] so they can be passed to
    # metric_fn alongside the other batched inputs.
    cls_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(cls_loss, 0), [
            params['batch_size'],
        ]), [params['batch_size'], 1])
    box_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(box_loss, 0), [
            params['batch_size'],
        ]), [params['batch_size'], 1])

    cls_outputs = postprocess.to_list(cls_outputs)
    box_outputs = postprocess.to_list(box_outputs)
    params['nms_configs']['max_nms_inputs'] = anchors.MAX_DETECTION_POINTS
    boxes, scores, classes = postprocess.pre_nms(params, cls_outputs,
                                                 box_outputs)
    metric_fn_inputs = {
        'cls_loss_repeat': cls_loss_repeat,
        'box_loss_repeat': box_loss_repeat,
        'image_ids': labels['source_ids'],
        'groundtruth_data': labels['groundtruth_data'],
        'image_scales': labels['image_scales'],
        'boxes': boxes,
        'scores': scores,
        'classes': classes,
    }
    eval_metrics = (metric_fn, metric_fn_inputs)

  checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

  if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
    # Initialize the model from an EfficientDet or backbone checkpoint.
    if params.get('ckpt') and params.get('backbone_ckpt'):
      raise RuntimeError(
          '--backbone_ckpt and --checkpoint are mutually exclusive')

    if params.get('backbone_ckpt'):
      var_scope = params['backbone_name'] + '/'
      if params['ckpt_var_scope'] is None:
        # Use backbone name as default checkpoint scope.
        ckpt_scope = params['backbone_name'] + '/'
      else:
        ckpt_scope = params['ckpt_var_scope'] + '/'
    else:
      # Load every var in the given checkpoint
      var_scope = ckpt_scope = '/'

    def scaffold_fn():
      """Loads pretrained model through scaffold function."""
      logging.info('restore variables from %s', checkpoint)

      var_map = utils.get_ckpt_var_map(
          ckpt_path=checkpoint,
          ckpt_scope=ckpt_scope,
          var_scope=var_scope,
          skip_mismatch=params['skip_mismatch'])

      tf.train.init_from_checkpoint(checkpoint, var_map)
      return tf.train.Scaffold()
  elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

    def scaffold_fn():
      """Load moving average variables for eval."""
      logging.info('Load EMA vars with ema_decay=%f', moving_average_decay)
      restore_vars_dict = ema.variables_to_restore(ema_vars)
      saver = tf.train.Saver(restore_vars_dict)
      return tf.train.Scaffold(saver=saver)
  else:
    scaffold_fn = None

  if params['strategy'] != 'tpu':
    # Profile every 1K steps.
    profile_hook = tf.train.ProfilerHook(
        save_steps=1000, output_dir=params['model_dir'])
    training_hooks.append(profile_hook)

    # Report memory allocation if OOM
    class OomReportingHook(tf.estimator.SessionRunHook):

      def before_run(self, run_context):
        return tf.estimator.SessionRunArgs(
            fetches=[],
            options=tf.RunOptions(report_tensor_allocations_upon_oom=True))

    training_hooks.append(OomReportingHook())

    logging_hook = tf.train.LoggingTensorHook(
        {
            'step': global_step,
            'det_loss': det_loss,
            'cls_loss': cls_loss,
            'box_loss': box_loss,
        },
        every_n_iter=params.get('iterations_per_loop', 100),
    )
    training_hooks.append(logging_hook)

  return tf.estimator.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op,
      eval_metrics=eval_metrics,
      host_call=utils.get_tpu_host_call(global_step, params),
      scaffold_fn=scaffold_fn,
      training_hooks=training_hooks)
def create_dual_approx(num_layers, batch_size, action_max, W_T_list, b_T_list,
                       action_tensor_center, return_full_info=False):
    """Builds the TF graph of a layer-by-layer dual bound for a ReLU network.

    NOTE(review): this looks like a dual (convex-relaxation) bound on the
    network output over actions within `action_max` of
    `action_tensor_center`; confirm against the derivation this was written
    from.

    Args:
      num_layers: layer count K. The recursion runs for i = 2..K-1 and
        `J_tilde` is only assigned on the last iteration (i == K-1), so this
        must be >= 3; with a smaller value the loop body never runs and the
        final `return` raises NameError.
      batch_size: used to size the initial dual and to tile `W_T_list[0]`.
      action_max: scalar multiplying the 1-norm of `Nu_hat_1` in every bound
        (presumably the perturbation radius -- TODO confirm).
      W_T_list: per-layer multiplicative weights, indexed from 0.
      b_T_list: per-layer bias weights, indexed from 0.
      action_tensor_center: raw input the bounds are centered on; it is
        right-multiplied by `W_T_list[0]`.
      return_full_info: if True, also return the intermediate lists/tensors.

    Returns:
      `-J_tilde`, or, when `return_full_info` is set, the tuple
      `(-J_tilde, l_list, u_list, D_list, Nu_list, gamma_list, psi, l_ip1,
      u_ip1, Nu_hat_1)`.
    """
    # Every list below starts with a placeholder entry at index 0 so that the
    # recursion can address layer quantities as `[i - 1]`.

    # List of bounds (l_i,u_i) for i = 2,...,K-1
    l_list = [tf.zeros_like(action_tensor_center)]
    u_list = [tf.zeros_like(action_tensor_center)]

    # List of transition matrices D_i for i = 2,...,K-1
    D_list = [tf.zeros_like(action_tensor_center)]

    # Indicators of spanning ReLu neurons for i = 2,...,K-1
    I_list = [tf.zeros_like(action_tensor_center)]

    # Indicators of active ReLu neurons for i = 2,...,K-1
    Ip_list = [tf.zeros_like(action_tensor_center)]

    # Final list of duals nu_i for i = 2,...,K-1
    Nu_list = [tf.zeros([batch_size, W_T_list[0].get_shape().as_list()[1], 1])]

    # Initialize Nu_K (the two negations cancel: this is eye(1) with a
    # trailing axis added).
    Nu_K = -tf.expand_dims(-tf.eye(1), axis=-1)

    # Final list of b_i'*nu_{i+1} for i = 1,...,K-1
    gamma_list = [b_T_list[0]]

    # Pre-compute bounds for layer 2
    # Initialize Nu_hat_1: W_T_list[0] replicated once per batch element.
    Nu_hat_1 = tf.tile(tf.expand_dims(W_T_list[0], axis=0), [batch_size, 1, 1])

    # Initialize bounds: center response -/+ action_max * 1-norm of Nu_hat_1.
    l_2 = tf.matmul(action_tensor_center, W_T_list[0]) + gamma_list[0] - action_max * tf.norm(
        Nu_hat_1, 1, axis=1, keepdims=False)
    u_2 = tf.matmul(action_tensor_center, W_T_list[0]) + gamma_list[0] + action_max * tf.norm(
        Nu_hat_1, 1, axis=1, keepdims=False)

    # Add to list (store in vector format)
    l_list.append(l_2)
    u_list.append(u_2)

    # Recursion
    for i in range(2, num_layers):
        # form Ip, I (get_I is defined elsewhere; it is fed the previous
        # layer's lower/upper bounds)
        Ip_i, I_i = get_I(l_list[i - 1], u_list[i - 1])
        I_list.append(I_i)
        Ip_list.append(Ip_i)

        # form D (get_D is defined elsewhere)
        D_i = get_D(l_list[i - 1], u_list[i - 1], Ip_i, I_i)
        D_list.append(D_i)

        # initialize nu_i
        Nu_list.append(tf.einsum('ij,jk->ijk', D_i, W_T_list[i - 1]))

        # initialize gamma_i
        gamma_list.append(b_T_list[i - 1])

        # if final iteration, update with Nu_K
        if i == num_layers - 1:
            Nu_K = tf.tile(Nu_K, [Nu_list[i - 1].get_shape().as_list()[0], 1, 1])
            Nu_list[i - 1] = tf.einsum('ijk,ikm->ijm', Nu_list[i - 1], Nu_K)
            gamma_list[i - 1] = tf.einsum('ij,ijm->im', gamma_list[i - 1], Nu_K)

        # initialize next layer bounds
        l_ip1 = tf.einsum('ij,ijm->im', l_list[i - 1] * I_list[i - 1],
                          tf.nn.relu(-Nu_list[i - 1]))
        u_ip1 = -tf.einsum('ij,ijm->im', l_list[i - 1] * I_list[i - 1],
                           tf.nn.relu(Nu_list[i - 1]))

        # update nu for layers i-1,...,2
        # Walks backwards so each Nu_list[j - 1] is rebuilt from the already
        # updated Nu_list[j]; the statement order here matters.
        for j in range(i - 1, 1, -1):
            Nu_hat_j = tf.einsum('jk,ikm->ijm', W_T_list[j - 1], Nu_list[j])

            Nu_list[j - 1] = tf.einsum('ij,ijk->ijk', D_list[j - 1], Nu_hat_j)

            l_ip1 = tf.add(
                l_ip1,
                tf.einsum('ij,ijm->im', l_list[j - 1] * I_list[j - 1],
                          tf.nn.relu(-Nu_list[j - 1])))
            u_ip1 = tf.subtract(
                u_ip1,
                tf.einsum('ij,ijm->im', l_list[j - 1] * I_list[j - 1],
                          tf.nn.relu(Nu_list[j - 1])))

        # update nu_hat_1
        Nu_hat_1 = tf.einsum('jk,ikm->ijm', W_T_list[0], Nu_list[1])

        # start sum
        psi = tf.einsum('ij,ijm->im', action_tensor_center,
                        Nu_hat_1) + gamma_list[i - 1]

        # update gamma for layers 1,...,i-1
        for j in range(1, i):
            gamma_list[j - 1] = tf.einsum('ij,ijm->im', b_T_list[j - 1],
                                          Nu_list[j])
            psi = tf.add(psi, gamma_list[j - 1])

        Nu_hat_1_norm = tf.norm(Nu_hat_1, 1, axis=1, keepdims=False)

        if i < num_layers - 1:
            # finalize bounds
            l_ip1 = tf.add(l_ip1, psi - action_max * Nu_hat_1_norm)
            u_ip1 = tf.add(u_ip1, psi + action_max * Nu_hat_1_norm)

            # add to list
            l_list.append(l_ip1)
            u_list.append(u_ip1)

        else:
            # compute J_tilde (only reached on the last iteration)
            J_tilde = -psi - action_max * Nu_hat_1_norm - u_ip1

    if return_full_info:
        return (-J_tilde, l_list, u_list, D_list, Nu_list, gamma_list, psi,
                l_ip1, u_ip1, Nu_hat_1)
    else:
        return -J_tilde
def read_png(filename):
    """Builds the ops that read an image file and decode it as one batch.

    The file contents are decoded with `tf.image.decode_image` and a leading
    axis of size 1 is prepended to the decoded tensor.
    """
    encoded = tf.io.read_file(filename)
    decoded = tf.image.decode_image(encoded)
    return tf.expand_dims(decoded, 0)
def resize_to_range(image,
                    masks=None,
                    min_dimension=None,
                    max_dimension=None,
                    method=tf.image.ResizeMethod.BILINEAR,
                    align_corners=False,
                    pad_to_max_dimension=False):
    """Rescales an image (and optional masks) into a bounded size range.

    Two cases determine the output size:
    1. If scaling the smaller side to `min_dimension` keeps the larger side
       within `max_dimension`, that scale is used.
    2. Otherwise the larger side is scaled to exactly `max_dimension`.

    Args:
      image: A 3D tensor of shape [height, width, channels].
      masks: (optional) rank 3 float32 tensor with shape
        [num_instances, height, width] containing instance masks.
      min_dimension: (optional) (scalar) desired size of the smaller image
        dimension.
      max_dimension: (optional) (scalar) maximum allowed size of the larger
        image dimension.
      method: (optional) interpolation method for the image resize. Defaults
        to BILINEAR. Masks are always resized with NEAREST_NEIGHBOR.
      align_corners: bool. If true, exactly align all 4 corners of the input
        and output. Defaults to False.
      pad_to_max_dimension: Whether to zero-pad the resized image (and masks,
        if given) on the bottom/right to [max_dimension, max_dimension].

    Returns:
      A list whose layout depends on whether masks were supplied:
      `[resized_image, resized_image_shape]` without masks, or
      `[resized_image, resized_masks, resized_image_shape]` with masks.
      resized_image: A 3D tensor of shape [new_height, new_width, channels].
      resized_masks: A 3D tensor of shape
        [num_instances, new_height, new_width].
      resized_image_shape: A 1D tensor of shape [3] containing the shape of
        the resized image.

    Raises:
      ValueError: if the image is not a 3D tensor.
    """
    if len(image.get_shape()) != 3:
        raise ValueError('Image should be 3D tensor')

    with tf.name_scope('ResizeToRange', values=[image, min_dimension]):
        # A fully known static shape lets the target size be computed in
        # Python; otherwise it has to be computed inside the graph.
        size_fn = (_compute_new_static_size
                   if image.get_shape().is_fully_defined() else
                   _compute_new_dynamic_size)
        new_size = size_fn(image, min_dimension, max_dimension)

        resized_image = tf.image.resize_images(
            image, new_size[:-1], method=method, align_corners=align_corners)
        if pad_to_max_dimension:
            resized_image = tf.image.pad_to_bounding_box(
                resized_image, 0, 0, max_dimension, max_dimension)

        if masks is None:
            return [resized_image, new_size]

        # resize_images expects a channel axis, so add one and strip it again.
        masks_with_channel = tf.expand_dims(masks, 3)
        masks_with_channel = tf.image.resize_images(
            masks_with_channel,
            new_size[:-1],
            method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
            align_corners=align_corners)
        resized_masks = tf.squeeze(masks_with_channel, 3)
        if pad_to_max_dimension:
            resized_masks = tf.image.pad_to_bounding_box(
                resized_masks, 0, 0, max_dimension, max_dimension)

        return [resized_image, resized_masks, new_size]
def define_loss(self, features, outputs):
    """Computes the training loss as the mean of seven per-task losses.

    Each loss is also written as a scalar summary under its own name.

    Args:
      features: dict of label/mask tensors read by key below.
      outputs: dict of logit tensors read by key below.

    Returns:
      The sum of the seven losses divided by seven.
    """

    def _flat_softmax_loss(onehot, logits, weights, depth):
        # Collapse every leading axis so each slot is scored as one example.
        return tf.losses.softmax_cross_entropy(
            tf.reshape(onehot, [-1, depth]),
            tf.reshape(logits, [-1, depth]),
            weights=tf.reshape(weights, [-1]))

    # --- Intents ---------------------------------------------------------
    # Logits: (batch_size, max_num_intents + 1).
    # Labels: (batch_size, max_num_intents).
    intent_logits = outputs["logit_intent_status"]
    intent_labels = features["intent_status"]
    # Prepend a column for the NONE intent: one minus the sum of the labels.
    active_count = tf.expand_dims(
        tf.reduce_sum(intent_labels, axis=1), axis=1)
    none_label = tf.ones_like(active_count) - active_count
    intent_targets = tf.concat([none_label, intent_labels], axis=1)
    intent_loss = tf.losses.softmax_cross_entropy(
        intent_targets, intent_logits, weights=features["is_real_example"])

    # --- Requested slots -------------------------------------------------
    # Shape: (batch_size, max_num_slots).
    req_logits = outputs["logit_req_slot_status"]
    req_labels = features["req_slot_status"]
    req_width = req_labels.get_shape().as_list()[-1]
    req_mask = tf.sequence_mask(features["req_slot_num"], maxlen=req_width)
    # Sigmoid rather than softmax: several slots may be requested at once.
    requested_slot_loss = tf.losses.sigmoid_cross_entropy(
        req_labels, req_logits, weights=req_mask)

    # --- Categorical slot status -----------------------------------------
    # Shape: (batch_size, max_num_cat_slots, 3).
    cat_status_logits = outputs["logit_cat_slot_status"]
    cat_status_labels = features["cat_slot_status"]
    cat_width = cat_status_labels.get_shape().as_list()[-1]
    cat_status_onehot = tf.one_hot(cat_status_labels, 3, dtype=tf.int32)
    cat_mask = tf.sequence_mask(
        features["cat_slot_num"], maxlen=cat_width, dtype=tf.float32)
    cat_slot_status_loss = _flat_softmax_loss(
        cat_status_onehot, cat_status_logits, cat_mask, 3)

    # --- Categorical slot values -----------------------------------------
    # Shape: (batch_size, max_num_cat_slots, max_num_slot_values).
    cat_value_logits = outputs["logit_cat_slot_value"]
    cat_value_labels = features["cat_slot_value"]
    value_depth = cat_value_logits.get_shape().as_list()[-1]
    cat_value_onehot = tf.one_hot(
        cat_value_labels, value_depth, dtype=tf.int32)
    # Only slots whose status label is ACTIVE contribute to the value loss.
    cat_is_active = tf.cast(
        tf.equal(cat_status_labels, data_utils.STATUS_ACTIVE), tf.float32)
    cat_slot_value_loss = _flat_softmax_loss(
        cat_value_onehot, cat_value_logits, cat_mask * cat_is_active,
        value_depth)

    # --- Non-categorical slot status -------------------------------------
    # Shape: (batch_size, max_num_noncat_slots, 3).
    noncat_status_logits = outputs["logit_noncat_slot_status"]
    noncat_status_labels = features["noncat_slot_status"]
    noncat_width = noncat_status_labels.get_shape().as_list()[-1]
    noncat_status_onehot = tf.one_hot(
        noncat_status_labels, 3, dtype=tf.int32)
    noncat_mask = tf.sequence_mask(
        features["noncat_slot_num"], maxlen=noncat_width, dtype=tf.float32)
    # Logits for padded (invalid) values are already masked.
    noncat_slot_status_loss = _flat_softmax_loss(
        noncat_status_onehot, noncat_status_logits, noncat_mask, 3)

    # --- Non-categorical slot spans --------------------------------------
    # Shape: (batch_size, max_num_noncat_slots, max_num_tokens).
    start_logits = outputs["logit_noncat_slot_start"]
    start_labels = features["noncat_slot_value_start"]
    token_depth = start_logits.get_shape().as_list()[-1]
    start_onehot = tf.one_hot(start_labels, token_depth, dtype=tf.int32)
    end_logits = outputs["logit_noncat_slot_end"]
    end_labels = features["noncat_slot_value_end"]
    end_onehot = tf.one_hot(end_labels, token_depth, dtype=tf.int32)
    # Only slots whose status label is ACTIVE contribute to the span losses.
    noncat_is_active = tf.cast(
        tf.equal(noncat_status_labels, data_utils.STATUS_ACTIVE), tf.float32)
    span_start_loss = _flat_softmax_loss(
        start_onehot, start_logits, noncat_mask * noncat_is_active,
        token_depth)
    span_end_loss = _flat_softmax_loss(
        end_onehot, end_logits, noncat_mask * noncat_is_active, token_depth)

    losses = {
        "intent_loss": intent_loss,
        "requested_slot_loss": requested_slot_loss,
        "cat_slot_status_loss": cat_slot_status_loss,
        "cat_slot_value_loss": cat_slot_value_loss,
        "noncat_slot_status_loss": noncat_slot_status_loss,
        "span_start_loss": span_start_loss,
        "span_end_loss": span_end_loss,
    }
    for summary_name, value in losses.items():
        tf.summary.scalar(summary_name, value)
    total = sum(losses.values())
    return total / len(losses)
def __init__(self,
             network_name,
             initializer,
             regularizer,
             vocab_size,
             embedding_size,
             n_class,
             batch_size,
             filter_heights,
             num_filters,
             num_units,
             layers=3,
             *args,
             **kwargs):
    """Builds the CNN + bidirectional-LSTM text classification graph.

    For every filter height a VALID convolution is run over the embedded
    tokens, its output is fed to a stacked bidirectional LSTM, and the forward
    output at the last valid step is concatenated with the backward output at
    step 0. The per-filter results are concatenated, passed through dropout
    and a fully connected layer, and loss/accuracy ops are attached.

    Args:
      network_name: name of the outermost variable scope.
      initializer: default initializer for that variable scope.
      regularizer: default regularizer for that variable scope.
      vocab_size: number of rows of the embedding matrix.
      embedding_size: width of each embedding vector.
      n_class: number of output classes.
      batch_size: default value of the `batch_size` placeholder.
      filter_heights: iterable of convolution filter heights (in tokens).
      num_filters: either an int (same filter count for every height) or a
        list with one filter count per entry of `filter_heights`.
      num_units: hidden size of every LSTM cell.
      layers: number of stacked LSTM cells per direction.
      *args: unused.
      **kwargs: unused.

    Raises:
      Exception: if `num_filters` is a list whose length differs from
        `filter_heights`, or is neither a list nor an int.
    """
    self.network_name = network_name
    self.initializer = initializer
    self.regularizer = regularizer
    self.vocab_size = vocab_size
    self.n_class = n_class
    self.batch_size = batch_size
    self.filter_heights = filter_heights
    # Normalise num_filters to one filter count per filter height.
    if isinstance(num_filters, list):
        if len(self.filter_heights) != len(num_filters):
            raise Exception("filter_heights和num_filters必须长度一致")
        else:
            self.num_filters = num_filters
    elif isinstance(num_filters, int):
        self.num_filters = [num_filters for _ in self.filter_heights]
    else:
        raise Exception("参数num_filters只能是list列表或者int类型的数字!!!")
    self.embedding_size = embedding_size
    self.num_units = num_units
    self.layers = layers

    with tf.variable_scope(self.network_name,
                           initializer=self.initializer,
                           regularizer=self.regularizer):
        # 1. Placeholders for input, output, dropout, batch_size
        with tf.variable_scope("placeholders"):
            self.input = tf.placeholder(tf.int32, [None, None],
                                        name='input_x')
            self.output = tf.placeholder(tf.int32, [None], name='input_y')
            self.dropout_keep_prob = tf.placeholder_with_default(
                1.0, shape=[], name='dropout_keep_prob')
            self.batch_size = tf.placeholder_with_default(
                self.batch_size, shape=[], name='batch_size')

            # Length of every sequence in the batch, counted as the number
            # of non-zero ids (padding is filled with 0).
            # [N,T] -> [N,T] -> [N,T] -> [N,]
            self.lengths = tf.reduce_sum(tf.sign(tf.abs(self.input)),
                                         axis=-1)

        # 1.5 Embedding Layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.embedding = tf.Variable(
                # Uniform initialisation in [-1, 1).
                tf.random_uniform([self.vocab_size, self.embedding_size],
                                  -1.0, 1.0),
                name="W")
            # [batch_size, sequence_length, embedding_size], i.e. [N, T, E]
            self.embedded_chars = tf.nn.embedding_lookup(
                self.embedding, self.input)
            # conv2d needs a 4-D input, so append a channel axis:
            # [batch_size, sequence_length, embedding_size, 1], [N, T, E, 1]
            self.embedded_chars_expanded = tf.expand_dims(
                self.embedded_chars, -1)

        # 2. Build CNN + LSTM output
        outputs = []
        print(filter_heights, num_filters)
        with tf.variable_scope("cnn-rnn"):
            for idx, filter_height in enumerate(self.filter_heights):
                with tf.variable_scope("conv-%s" % idx):
                    # Convolution Layer
                    # Each filter spans `filter_height` tokens and the full
                    # embedding width, with 1 input channel and
                    # self.num_filters[idx] output channels.
                    filter_shape = [
                        filter_height, self.embedding_size, 1,
                        self.num_filters[idx]
                    ]
                    # Truncated-normal initialisation.
                    print(filter_shape)
                    W = tf.Variable(tf.truncated_normal(filter_shape,
                                                        stddev=0.01),
                                    name="W")
                    print(W)
                    # Constant 0.1 bias, one entry per output channel. The
                    # size must come from the normalised per-height list:
                    # the raw `num_filters` argument may itself be a list,
                    # which is not a valid dimension.
                    b = tf.Variable(tf.constant(
                        0.1, shape=[self.num_filters[idx]]),
                                    name="b")
                    print(b)
                    conv = tf.nn.conv2d(
                        self.embedded_chars_expanded,
                        W,
                        strides=[1, 1, 1, 1],
                        padding="VALID",  # no padding
                        name="conv")
                    # Apply nonlinearity: [N, H, W, C]
                    # N: batch size
                    # H: height after the convolution,
                    #    h = length - filter_height + 1
                    # W: 1
                    # C: self.num_filters[idx]
                    h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")

                with tf.variable_scope("lstm-%s" % idx):
                    # 0. LSTM input and lengths. The VALID convolution
                    # shortens every sequence by filter_height - 1.
                    lengths = self.lengths - filter_height + 1
                    cell_inputs = tf.squeeze(h,
                                             axis=2)  # [B,T,1,D] -> [B,T,D]

                    # 1. Build the RNN cells
                    def cell(units):
                        return tf.nn.rnn_cell.BasicLSTMCell(units)

                    cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells=[
                        cell(self.num_units) for _ in range(self.layers)
                    ])
                    cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells=[
                        cell(self.num_units) for _ in range(self.layers)
                    ])

                    # 2. Build the dynamic bidirectional RNN
                    (output_fw,
                     output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                         cell_fw=cell_fw,  # forward cell
                         cell_bw=cell_bw,  # backward cell
                         inputs=cell_inputs,  # must be [B,T,D]
                         sequence_length=lengths,  # must be [B,]
                         dtype=cell_inputs.dtype  # dtype of initial state
                     )

                    # 3. Pick and concatenate the summary outputs. For the
                    # backward LSTM the last time step is of no use, so its
                    # first time step is taken instead.
                    num_rows = tf.shape(output_fw)[0]  # dynamic batch size
                    indices_fw = tf.concat(
                        [
                            # sample index, 0..N-1
                            tf.reshape(tf.range(num_rows), shape=(-1, 1)),
                            # index of each sample's last valid time step
                            tf.reshape(lengths - 1, shape=(-1, 1))
                        ],
                        axis=-1)
                    indices_bw = tf.concat(
                        [
                            # sample index, 0..N-1
                            tf.reshape(tf.range(num_rows), shape=(-1, 1)),
                            # backward direction: time step 0
                            tf.reshape(tf.zeros_like(lengths - 1),
                                       shape=(-1, 1))
                        ],
                        axis=-1)
                    # Gather both [B,U] slices and join them.
                    output = tf.concat(
                        (
                            # forward output at the last valid step
                            tf.gather_nd(output_fw, indices_fw),
                            # backward output at the first step
                            tf.gather_nd(output_bw, indices_bw)),
                        axis=-1)
                    outputs.append(output)

            # Merge the per-filter results.
            output = tf.concat(outputs, -1)

            # Dropout
            h_drop = tf.nn.dropout(output, keep_prob=self.dropout_keep_prob)

        # 3. Build FC output
        with tf.variable_scope("fc"):
            in_units = h_drop.get_shape()[-1]
            w = tf.get_variable(name='w', shape=[in_units, self.n_class])
            b = tf.get_variable(name='b', shape=[self.n_class])
            self.scores = tf.nn.xw_plus_b(h_drop,
                                          weights=w,
                                          biases=b,
                                          name='scores')
            self.predictions = tf.argmax(self.scores,
                                         axis=1,
                                         name='predictions')

        # 4. Build Loss
        with tf.variable_scope("loss"):
            self.losses = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.output, logits=self.scores))
            tf.losses.add_loss(self.losses)
            self.total_loss = tf.losses.get_total_loss(name='total_loss')
            tf.summary.scalar('total_loss', self.total_loss)
            tf.summary.scalar('loss', self.losses)

        # 5. Build Estimate eval
        with tf.variable_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions,
                                           tf.cast(self.output, tf.int64))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                   tf.float32),
                                           name='accuracy')
            tf.summary.scalar('accuracy', self.accuracy)
def regular_log_prob_fn(params):
    """Returns the log joint (likelihood + priors) of the regression model.

    `params` holds the regression weights; for the hierarchical variants its
    last two columns are the hyper-parameters (second-to-last is passed
    through softplus to obtain tau_0, last is mu_0). Everything else
    (`x`, `y`, `batch_size`, `regression_hier_type`, ...) is read from the
    enclosing scope.

    Raises:
      ValueError: if `regression_hier_type` is not one of "none", "centered"
        or "non_centered".
    """
    if regression_hier_type not in ("none", "centered", "non_centered"):
        raise ValueError("Unknown regression_hier_type:" +
                         regression_hier_type)

    hierarchical = regression_hier_type != "none"
    if hierarchical:
        mu_0 = params[Ellipsis, -1]
        tau_0 = tf.nn.softplus(params[Ellipsis, -2])
        beta = params[Ellipsis, :-2]
    else:
        beta = params

    # Only the non-centered parameterisation rescales/shifts the weights
    # before they enter the likelihood.
    beta_scaled = beta
    if regression_hier_type == "non_centered":
        beta_scaled = beta / tf.expand_dims(tau_0, -1) + tf.expand_dims(
            mu_0, -1)

    def _data_log_lik(x_part, y_part):
        # Categorical log-likelihood of one slab of data, summed over rows.
        likelihood = tfd.Categorical(
            logits=tf.einsum(
                "ij,kjm->kim", x_part,
                tf.reshape(beta_scaled, [-1, num_features, num_classes])))
        return tf.reduce_sum(likelihood.log_prob(y_part), -1)

    if batch_size:
        # Walk the data in slabs of `batch_size` rows and add up the
        # per-slab terms; the scan accumulator itself is ignored.
        def body(_, i):
            return _data_log_lik(x[i:i + batch_size], y[i:i + batch_size])

        per_slab = tf.scan(
            body,
            tf.range(0, x.shape[0], batch_size),
            initializer=tf.zeros(tf.shape(params)[:1]),
            parallel_iterations=1)
        log_prob = tf.reduce_sum(per_slab, 0)
    else:
        log_prob = _data_log_lik(x, y)

    def make_beta_dist(loc, scale):
        if regression_beta_prior == "normal":
            return tfd.Normal(loc=loc, scale=scale)
        # Student-t prior: scalar loc/scale are first expanded to
        # [num_chains, num_features * num_classes].
        if tf.convert_to_tensor(loc).shape.ndims == 0:
            loc = tf.fill(
                tf.stack([tf.shape(params)[0], num_features * num_classes]),
                loc)
        if tf.convert_to_tensor(scale).shape.ndims == 0:
            scale = tf.fill(
                tf.stack([tf.shape(params)[0], num_features * num_classes]),
                scale)
        scale = tf.linalg.LinearOperatorDiag(scale)
        return tfd.MultivariateStudentTLinearOperator(
            loc=loc, scale=scale, df=t_dof)

    if not hierarchical:
        beta_dist = make_beta_dist(loc=0.0, scale=10.0)
    else:
        mu_0_dist = tfd.Normal(loc=0.0, scale=10.0)
        tau_0_dist = tfd.Gamma(2.0, 1.0)
        log_prob = log_prob + (
            mu_0_dist.log_prob(mu_0) + tau_0_dist.log_prob(tau_0))

        if regression_hier_type == "centered":
            # Broadcast the hyper-parameters to one entry per weight.
            mu_0 = tf.tile(
                tf.expand_dims(mu_0, -1), [1, num_features * num_classes])
            tau_0 = tf.tile(
                tf.expand_dims(tau_0, -1), [1, num_features * num_classes])
            beta_dist = make_beta_dist(loc=mu_0, scale=1.0 / tau_0)
        else:
            # non_centered
            beta_dist = make_beta_dist(loc=0.0, scale=1.0)

    log_prob = log_prob + tf.reduce_sum(beta_dist.log_prob(beta), -1)
    return log_prob
def multilevel_roi_align(features,
                         boxes,
                         box_levels,
                         output_size,
                         num_samples_per_cell_y=1,
                         num_samples_per_cell_x=1,
                         align_corners=False,
                         extrapolation_value=0.0,
                         scope=None):
    """Applies RoI Align op and returns feature for boxes.

    Given multiple features maps indexed by different levels, and a set of
    boxes where each box is mapped to a certain level, this function
    selectively crops and resizes boxes from the corresponding feature maps.

    We follow the RoI Align technique in
    https://arxiv.org/pdf/1703.06870.pdf figure 3. Specifically, each box is
    subdivided uniformly into a grid consisting of
    output_size[0] x output_size[1] rectangular cells. Within each cell we
    select `num_samples_per_cell_y * num_samples_per_cell_x` points uniformly
    and compute feature values using bilinear interpolation. Finally, we
    average pool the interpolated values in each cell to obtain a
    [output_size[0], output_size[1], channels] feature.

    If `align_corners` is true, sampling points are uniformly spread such
    that corner points exactly overlap corners of the boxes.

    In this function we also follow the convention of treating feature pixels
    as point objects with no spatial extent.

    Args:
      features: A list of 4D float tensors of shape [batch_size, max_height,
        max_width, channels] containing features. Note that each feature map
        must have the same number of channels.
      boxes: A 3D float tensor of shape [batch_size, num_boxes, 4] containing
        boxes of the form [ymin, xmin, ymax, xmax] in normalized coordinates.
      box_levels: A 2D int32 tensor of shape [batch_size, num_boxes]
        representing the feature level index for each box.
      output_size: A list of two integers [size_y, size_x] indicating the
        output feature size for each box.
      num_samples_per_cell_y: Number of grid points to sample along y axis in
        each cell.
      num_samples_per_cell_x: Number of grid points to sample along x axis in
        each cell.
      align_corners: Whether to align the corner grid points exactly with box
        corners.
      extrapolation_value: a float value to use for extrapolation.
      scope: Scope name to use for this op.

    Returns:
      A 5D float tensor of shape [batch_size, num_boxes, output_size[0],
      output_size[1], channels] representing the cropped features.
    """
    with tf.name_scope(scope, 'MultiLevelRoIAlign'):
        # Stack all levels into one padded tensor; the indexing below reads
        # axis 1 as levels, axes 2/3 as padded height/width, axis 4 as
        # channels.
        features, true_feature_shapes = pad_to_max_size(features)
        batch_size = tf.shape(features)[0]
        num_levels = features.get_shape().as_list()[1]
        max_feature_height = tf.shape(features)[2]
        max_feature_width = tf.shape(features)[3]
        num_filters = features.get_shape().as_list()[4]
        num_boxes = tf.shape(boxes)[1]

        # Convert boxes to absolute co-ordinates.
        true_feature_shapes = tf.cast(true_feature_shapes, dtype=boxes.dtype)
        true_feature_shapes = tf.gather(true_feature_shapes, box_levels)
        boxes *= tf.concat([true_feature_shapes - 1] * 2, axis=-1)

        size_y = output_size[0] * num_samples_per_cell_y
        size_x = output_size[1] * num_samples_per_cell_x
        box_grid_y, box_grid_x = box_grid_coordinate_vectors(
            boxes, size_y=size_y, size_x=size_x, align_corners=align_corners)
        (feature_grid_y0, feature_grid_x0, feature_grid_y1,
         feature_grid_x1) = feature_grid_coordinate_vectors(
             box_grid_y, box_grid_x)
        # Interleave the two neighbours of every sample point so each point
        # owns a 2x2 patch in the gathered feature grid.
        feature_grid_y = tf.reshape(
            tf.stack([feature_grid_y0, feature_grid_y1], axis=3),
            [batch_size, num_boxes, -1])
        feature_grid_x = tf.reshape(
            tf.stack([feature_grid_x0, feature_grid_x1], axis=3),
            [batch_size, num_boxes, -1])
        feature_coordinates = ravel_indices(feature_grid_y, feature_grid_x,
                                            num_levels, max_feature_height,
                                            max_feature_width, box_levels)
        # Coordinates flagged invalid are replaced by -1 before the gather.
        valid_indices = _valid_indicator(feature_grid_y, feature_grid_x,
                                         true_feature_shapes)
        feature_coordinates = tf.where(
            valid_indices, feature_coordinates,
            -1 * tf.ones_like(feature_coordinates))
        flattened_features = tf.reshape(features, [-1, num_filters])
        flattened_feature_values = _gather_valid_indices(
            flattened_features, feature_coordinates, extrapolation_value)
        features_per_box = tf.reshape(
            flattened_feature_values,
            [batch_size, num_boxes, size_y * 2, size_x * 2, num_filters])

        # Cast tensors into dtype of features.
        box_grid_y = tf.cast(box_grid_y, dtype=features_per_box.dtype)
        box_grid_x = tf.cast(box_grid_x, dtype=features_per_box.dtype)
        feature_grid_y0 = tf.cast(
            feature_grid_y0, dtype=features_per_box.dtype)
        feature_grid_x0 = tf.cast(
            feature_grid_x0, dtype=features_per_box.dtype)

        # RoI Align operation is a bilinear interpolation of four
        # neighboring feature points f0, f1, f2, and f3 onto point y, x given
        # by
        # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
        #                       [f10, f11]]
        #
        # Unrolling the matrix multiplies gives us:
        # f(y, x) = (hy * hx) f00 + (hy * lx) f01 + (ly * hx) f10
        #           + (lx * ly) f11
        # f(y, x) = w00 * f00 + w01 * f01 + w10 * f10 + w11 * f11
        #
        # This can be computed by applying pointwise multiplication and
        # sum_pool in a 2x2 window.
        ly = box_grid_y - feature_grid_y0
        lx = box_grid_x - feature_grid_x0
        hy = 1.0 - ly
        hx = 1.0 - lx

        kernel_y = tf.reshape(
            tf.stack([hy, ly], axis=3), [batch_size, num_boxes, size_y * 2, 1])

        kernel_x = tf.reshape(
            tf.stack([hx, lx], axis=3), [batch_size, num_boxes, 1, size_x * 2])

        # Multiplier 4 is to make tf.nn.avg_pool behave like sum_pool.
        interpolation_kernel = kernel_y * kernel_x * 4

        # Interpolate the gathered features with computed interpolation
        # kernels. The multiplier must be the tensor itself: wrapping it in a
        # 1-tuple (stray trailing comma) gets packed into an extra leading
        # dimension and leaves this intermediate 6-D.
        features_per_box *= tf.expand_dims(interpolation_kernel, axis=4)
        features_per_box = tf.reshape(
            features_per_box,
            [batch_size * num_boxes, size_y * 2, size_x * 2, num_filters])

        # This combines the two pooling operations - sum_pool to perform
        # bilinear interpolation and avg_pool to pool the values in each bin.
        features_per_box = tf.nn.avg_pool(
            features_per_box,
            [1, num_samples_per_cell_y * 2, num_samples_per_cell_x * 2, 1],
            [1, num_samples_per_cell_y * 2, num_samples_per_cell_x * 2, 1],
            'VALID')
        features_per_box = tf.reshape(
            features_per_box,
            [batch_size, num_boxes, output_size[0], output_size[1],
             num_filters])

        return features_per_box
def simulate(self, action):
    """Builds the ops that advance the simulated environment by one step.

    The model is asked for a single target frame given the frame history and
    `action` repeated `self._num_frames` times. The predicted frame is stored
    in `self._observ` and pushed into the history buffer, and the
    reset-model flag is cleared, before the returned tensors are evaluated.

    Args:
      action: batch of actions; a time axis is inserted at position 1 and
        the action is repeated along it.

    Returns:
      Tuple `(reward, done)`; `done` is constant False for every batch
      element.

    Raises:
      ValueError: if intrinsic rewards are enabled but the model output has
        no "targets_logits" entry.
    """
    with tf.name_scope("environment/simulate"):
        actions = tf.concat(
            [tf.expand_dims(action, axis=1)] * self._num_frames, axis=1)
        history = self.history_buffer.get_all_elements()
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            # We only need 1 target frame here, set it. The model's hparams
            # are mutated in place, so restore them in `finally`: otherwise
            # an exception raised by infer() would leave the model
            # configured for a single target frame.
            hparams_target_frames = (
                self._model.hparams.video_num_target_frames)
            self._model.hparams.video_num_target_frames = 1
            try:
                model_output = self._model.infer({
                    "inputs": history,
                    "input_action": actions,
                    "reset_internal_states": self._reset_model.read_value()
                })
            finally:
                self._model.hparams.video_num_target_frames = (
                    hparams_target_frames)

        # Drop the (size-1) target-frame axis.
        observ = tf.cast(
            tf.squeeze(model_output["targets"], axis=1), self.observ_dtype)

        reward = tf.to_float(model_output["target_reward"])
        reward = tf.reshape(
            reward, shape=(self.batch_size,)) + self._min_reward

        if self._intrinsic_reward_scale:
            # Use the model's uncertainty about its prediction as an
            # intrinsic reward. The uncertainty is measured by the log
            # probability of the predicted pixel value.
            if "targets_logits" not in model_output:
                raise ValueError(
                    "The use of intrinsic rewards requires access to "
                    "the logits. Ensure that model.infer returns "
                    "'targets_logits'")
            uncertainty_reward = compute_uncertainty_reward(
                model_output["targets_logits"], model_output["targets"])
            # Cap the scaled bonus at 1.
            uncertainty_reward = tf.minimum(
                1., self._intrinsic_reward_scale * uncertainty_reward)
            uncertainty_reward = tf.Print(
                uncertainty_reward, [uncertainty_reward],
                message="uncertainty_reward",
                first_n=1,
                summarize=8)
            reward += uncertainty_reward

        done = tf.constant(False, tf.bool, shape=(self.batch_size,))

        with tf.control_dependencies([observ]):
            dump_frame_op = tf.cond(
                self._video_condition,
                lambda: tf.py_func(
                    self._video_dump_frame,  # pylint: disable=g-long-lambda
                    [observ, reward],
                    []),
                tf.no_op)
            # State updates must run before the reset flag is cleared and
            # before the outputs are read.
            with tf.control_dependencies([
                    self._observ.assign(observ),
                    self.history_buffer.move_by_one_element(observ),
                    dump_frame_op
            ]):
                clear_reset_model_op = tf.assign(self._reset_model,
                                                 tf.constant(0.0))
                with tf.control_dependencies([clear_reset_model_op]):
                    return tf.identity(reward), tf.identity(done)
def _single_column_cell_selection_loss(token_logits, column_logits, label_ids,
                                       cell_index, col_index, cell_mask):
    """Cell-selection loss where all selected cells must share one column.

    The loss is a hierarchical log-likelihood: a column is predicted first,
    then cells are selected inside that column (conditioned on it). Cells in
    any other column are never selected.

    Args:
      token_logits: <float>[batch_size, seq_length] Logits per token.
      column_logits: <float>[batch_size, max_num_cols] Logits per column.
      label_ids: <int32>[batch_size, seq_length] Labels per token.
      cell_index: segmented_tensor.IndexMap [batch_size, seq_length] Index
        that groups tokens into cells.
      col_index: segmented_tensor.IndexMap [batch_size, seq_length] Index
        that groups tokens into columns.
      cell_mask: <float>[batch_size, max_num_rows * max_num_cols] Input mask
        per cell, 1 for cells that exists in the example and 0 for padding.

    Returns:
      selection_loss_per_example: <float>[batch_size] Loss for each example.
      logits: <float>[batch_size, seq_length] New logits which may only
        select cells in a single column. Logits outside the most likely
        column according to `column_logits` are set to a very low value
        (such that the probabilities are 0).
    """
    # The gold column is the one holding the most selected tokens.
    label_count_per_column = segmented_tensor.reduce_sum(
        tf.cast(label_ids, tf.float32), col_index)[0]
    gold_column = tf.argmax(
        label_count_per_column, axis=-1, output_type=tf.int32)
    # With nothing selected, the target is the special column id 0, which
    # means "select nothing".
    nothing_selected = tf.equal(
        tf.reduce_max(label_count_per_column, axis=-1), 0)
    gold_column = tf.where(nothing_selected, tf.zeros_like(gold_column),
                           gold_column)

    column_loss_per_example = -tfp.distributions.Categorical(
        logits=column_logits).log_prob(gold_column)

    # Reduce the labels and logits to per-cell from per-token.
    logits_per_cell = segmented_tensor.reduce_mean(token_logits,
                                                   cell_index)[0]
    labels_per_cell, labels_index = segmented_tensor.reduce_max(
        tf.cast(label_ids, tf.int32), cell_index)

    # 1.0 for every cell that lies in the gold column.
    column_id_for_cells = cell_index.project_inner(labels_index).indices
    in_gold_column = tf.cast(
        tf.equal(column_id_for_cells, tf.expand_dims(gold_column, axis=1)),
        tf.float32)

    # Bernoulli log-likelihood per cell, restricted to the gold column and
    # to cells that actually exist.
    cell_log_prob = tfp.distributions.Bernoulli(
        logits=logits_per_cell).log_prob(labels_per_cell)
    cell_loss = -tf.reduce_sum(
        cell_log_prob * in_gold_column * cell_mask, axis=1)
    # Normalize by the number of cells in the column.
    cells_in_column = tf.reduce_sum(in_gold_column * cell_mask, axis=1)
    cell_loss = cell_loss / (cells_in_column + _EPSILON_ZERO_DIVISION)

    # Examples with no selected cell only pay the column loss.
    selection_loss_per_example = column_loss_per_example + tf.where(
        nothing_selected, tf.zeros_like(column_loss_per_example), cell_loss)

    # Set the probs outside the column chosen by the *model* to 0. This
    # keeps backwards compatibility with models that select cells from
    # multiple columns.
    predicted_column = tf.argmax(
        column_logits, axis=-1, output_type=tf.int32)
    in_predicted_column = tf.cast(
        tf.equal(column_id_for_cells,
                 tf.expand_dims(predicted_column, axis=-1)), tf.float32)
    # Never select cells with the special column id 0.
    in_predicted_column = tf.where(
        tf.equal(column_id_for_cells, 0),
        tf.zeros_like(in_predicted_column), in_predicted_column)
    logits_per_cell = logits_per_cell + _CLOSE_ENOUGH_TO_LOG_ZERO * (
        1.0 - cell_mask * in_predicted_column)
    logits = segmented_tensor.gather(logits_per_cell, cell_index)

    return selection_loss_per_example, logits
def _encoder_preprocessor(
        self, position_sequence, n_node, global_context, particle_types):
    """Builds the input `GraphsTuple` from a sequence of particle positions.

    Node features are the normalized, flattened velocity history, the clipped
    distances to the domain boundaries and (when more than one particle type
    is configured) a learned particle-type embedding. Edge features are the
    radius-normalized relative displacement between connected particles and
    its norm.

    Args:
      position_sequence: particle positions; axis 1 is indexed with -1 to get
        the most recent position and differenced by `time_diff`.
      n_node: number of nodes per graph, forwarded to the connectivity helper
        and to the returned `GraphsTuple`.
      global_context: optional global features; normalized with the "context"
        statistics when not None.
      particle_types: indices looked up in the particle-type embedding table.

    Returns:
      A `gn.graphs.GraphsTuple` with nodes, edges, globals and connectivity.
    """
    radius = self._connectivity_radius

    # Current positions and finite-difference velocities.
    current_position = position_sequence[:, -1]
    velocities = time_diff(position_sequence)

    # Graph connectivity between particles within the connectivity radius.
    senders, receivers, n_edge = (
        connectivity_utils.compute_connectivity_for_batch_pyfunc(
            current_position, n_node, radius))

    # --- Node features ---
    # Normalized velocity history with axes 1 and 2 merged into one.
    velocity_stats = self._normalization_stats["velocity"]
    normalized_velocities = (
        velocities - velocity_stats.mean) / velocity_stats.std
    node_features = [
        snt.MergeDims(start=1, size=2)(normalized_velocities)]

    # Distances to the lower/upper boundaries, scaled by the radius and
    # clipped to [-1, 1]. `boundaries` has shape [num_dimensions, 2] where the
    # second axis holds the lower/upper bound.
    boundaries = tf.constant(self._boundaries, dtype=tf.float32)
    to_lower = current_position - tf.expand_dims(boundaries[:, 0], 0)
    to_upper = tf.expand_dims(boundaries[:, 1], 0) - current_position
    boundary_distances = tf.concat([to_lower, to_upper], axis=1)
    node_features.append(
        tf.clip_by_value(boundary_distances / radius, -1., 1.))

    # Particle-type embedding, only needed when types can differ.
    if self._num_particle_types > 1:
        node_features.append(
            tf.nn.embedding_lookup(
                self._particle_type_embedding, particle_types))

    # --- Edge features ---
    # Relative displacement (sender minus receiver) and its length, both in
    # units of the connectivity radius.
    relative_displacements = (
        tf.gather(current_position, senders) -
        tf.gather(current_position, receivers)) / radius
    relative_distances = tf.norm(
        relative_displacements, axis=-1, keepdims=True)
    edge_features = [relative_displacements, relative_distances]

    # --- Globals ---
    if global_context is not None:
        context_stats = self._normalization_stats["context"]
        # Context in some datasets is all zero, so clamp the std from below
        # for numerical stability.
        global_context = (
            global_context - context_stats.mean) / tf.math.maximum(
                context_stats.std, STD_EPSILON)

    return gn.graphs.GraphsTuple(
        nodes=tf.concat(node_features, axis=-1),
        edges=tf.concat(edge_features, axis=-1),
        globals=global_context,  # self._graph_net will append this to nodes.
        n_node=n_node,
        n_edge=n_edge,
        senders=senders,
        receivers=receivers,
    )
def detection_loss(cls_outputs, box_outputs, labels, params):
    """Computes total detection loss.

    Computes total detection loss including box and class loss from all levels.

    Args:
      cls_outputs: an OrderedDict with keys representing levels and values
        representing logits in [batch_size, height, width, num_anchors].
      box_outputs: an OrderedDict with keys representing levels and values
        representing box regression targets in [batch_size, height, width,
        num_anchors * 4].
      labels: the dictionary returned from the dataloader that includes
        groundtruth targets.
      params: the dictionary including training parameters specified in the
        default_hparams function in this file. `positives_momentum` is
        optional; a missing or None value is treated as 0 (no moving average
        and no cross-replica mean).

    Returns:
      total_loss: a scalar tensor with the total loss reduced from class and
        box losses from all levels.
      cls_loss: a scalar float tensor with the total class loss.
      box_loss: the total box regression loss, or the Python int 0 when
        `params['box_loss_weight']` is falsy.
      box_iou_loss: the total box IoU loss, or the Python int 0 when
        `params['iou_loss_type']` is falsy.
    """
    # Sum all positives in a batch for normalization and avoid zero
    # num_positives_sum, which would lead to inf loss during training.
    num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0

    # Read the optional hyperparameter once so that a missing key behaves the
    # same in both branches instead of raising KeyError in the `elif`.
    positives_momentum = params.get('positives_momentum', None) or 0
    if positives_momentum > 0:
        # Normalize the num_positive_examples for training stability.
        moving_normalizer_var = tf.Variable(
            0.0,
            name='moving_normalizer',
            dtype=tf.float32,
            synchronization=tf.VariableSynchronization.ON_READ,
            trainable=False,
            aggregation=tf.VariableAggregation.MEAN)
        num_positives_sum = tf.keras.backend.moving_average_update(
            moving_normalizer_var,
            num_positives_sum,
            momentum=positives_momentum)
    elif positives_momentum < 0:
        num_positives_sum = utils.cross_replica_mean(num_positives_sum)

    levels = cls_outputs.keys()
    channels_first = params['data_format'] == 'channels_first'
    num_classes = params['num_classes']

    cls_losses = []
    box_losses = []
    for level in levels:
        cls_targets = labels['cls_targets_%d' % level]

        # Onehot encoding for classification labels, then fold the class axis
        # into the anchor axis so targets line up with the logits.
        cls_targets_at_level = tf.one_hot(cls_targets, num_classes)
        if channels_first:
            bs, _, width, height, _ = (
                cls_targets_at_level.get_shape().as_list())
            cls_targets_at_level = tf.reshape(
                cls_targets_at_level, [bs, -1, width, height])
        else:
            bs, width, height, _, _ = (
                cls_targets_at_level.get_shape().as_list())
            cls_targets_at_level = tf.reshape(
                cls_targets_at_level, [bs, width, height, -1])
        box_targets_at_level = labels['box_targets_%d' % level]

        cls_loss = focal_loss(
            cls_outputs[level],
            cls_targets_at_level,
            params['alpha'],
            params['gamma'],
            normalizer=num_positives_sum,
            label_smoothing=params['label_smoothing'])

        # Restore the separate class axis so the per-anchor mask broadcasts.
        if channels_first:
            cls_loss = tf.reshape(
                cls_loss, [bs, -1, width, height, num_classes])
        else:
            cls_loss = tf.reshape(
                cls_loss, [bs, width, height, -1, num_classes])
        # Anchors whose target equals -2 contribute nothing to the loss.
        cls_loss *= tf.cast(
            tf.expand_dims(tf.not_equal(cls_targets, -2), -1), tf.float32)
        cls_losses.append(
            tf.clip_by_value(tf.reduce_sum(cls_loss), 0.0, 2.0))

        if params['box_loss_weight']:
            box_losses.append(
                _box_loss(
                    box_outputs[level],
                    box_targets_at_level,
                    num_positives_sum,
                    delta=params['delta']))

    if params['iou_loss_type']:
        input_anchors = anchors.Anchors(
            params['min_level'], params['max_level'], params['num_scales'],
            params['aspect_ratios'], params['anchor_scale'],
            params['image_size'])
        # Flatten every level to [-1, 4] and concatenate. Local names are
        # used so the `box_outputs` argument is not overwritten.
        flat_outputs = tf.concat(
            [tf.reshape(box_outputs[i], [-1, 4]) for i in levels], axis=0)
        flat_targets = tf.concat(
            [tf.reshape(labels['box_targets_%d' % i], [-1, 4])
             for i in levels],
            axis=0)
        anchor_boxes = tf.tile(input_anchors.boxes, [params['batch_size'], 1])
        decoded_outputs = anchors.decode_box_outputs(
            flat_outputs, anchor_boxes)
        decoded_targets = anchors.decode_box_outputs(
            flat_targets, anchor_boxes)
        box_iou_loss = _box_iou_loss(
            decoded_outputs, decoded_targets, num_positives_sum,
            params['iou_loss_type'])
    else:
        box_iou_loss = 0

    # Sum per level losses to total loss.
    cls_loss = tf.add_n(cls_losses)
    box_loss = tf.add_n(box_losses) if box_losses else 0
    total_loss = (
        cls_loss + params['box_loss_weight'] * box_loss +
        params['iou_loss_weight'] * box_iou_loss)
    return total_loss, cls_loss, box_loss, box_iou_loss
def calc_center_bb(binary_class_mask,
                   fallback_center=(160.0, 160.0),
                   fallback_crop_size=100.0):
    """Computes the bounding box of a binary mask, its center and its size.

    Note that the returned center is the midpoint of the axis-aligned bounding
    box around all mask pixels equal to 1, not the center of mass.

    Args:
      binary_class_mask: mask tensor with a static shape of rank 3
        ([Batch, Width, Height]) or rank 4 with a trailing singleton axis.
        Pixels equal to 1 (after casting to int32) are considered set.
      fallback_center: pair of floats used as the center whenever the
        computed center is not finite (e.g. no pixel is set in the mask).
      fallback_crop_size: float used as the crop size whenever the computed
        crop size is not finite.

    Returns:
      center: float tensor [Batch, 2] with the bounding-box midpoint.
      bb: float tensor [Batch, 2, 2] laid out as
        [[x_min, x_max], [y_min, y_max]] per sample. It is NOT replaced by a
        fallback, so it may contain non-finite values.
      crop_size: float tensor [Batch, 1] with the longer bounding-box side.
    """
    with tf.variable_scope('calc_center_bb'):
        binary_class_mask = tf.cast(binary_class_mask, tf.int32)
        binary_class_mask = tf.equal(binary_class_mask, 1)
        s = binary_class_mask.get_shape().as_list()
        if len(s) == 4:
            binary_class_mask = tf.squeeze(binary_class_mask, [3])
        s = binary_class_mask.get_shape().as_list()
        assert len(s) == 3, "binary_class_mask must be 3D."
        # Heuristic layout check: the batch axis is expected to be the
        # smallest one.
        assert (s[0] < s[1]) and (
            s[0] < s[2]), "binary_class_mask must be [Batch, Width, Height]"

        # Coordinate grids: X[i, j] == i and Y[i, j] == j.
        x_range = tf.expand_dims(tf.range(s[1]), 1)
        y_range = tf.expand_dims(tf.range(s[2]), 0)
        X = tf.tile(x_range, [1, s[2]])
        Y = tf.tile(y_range, [s[1], 1])

        bb_list = []
        center_list = []
        crop_size_list = []
        for i in range(s[0]):
            # Coordinates of all set pixels of sample i.
            X_masked = tf.cast(
                tf.boolean_mask(X, binary_class_mask[i, :, :]), tf.float32)
            Y_masked = tf.cast(
                tf.boolean_mask(Y, binary_class_mask[i, :, :]), tf.float32)

            x_min = tf.reduce_min(X_masked)
            x_max = tf.reduce_max(X_masked)
            y_min = tf.reduce_min(Y_masked)
            y_max = tf.reduce_max(Y_masked)

            start = tf.stack([x_min, y_min])
            end = tf.stack([x_max, y_max])
            bb = tf.stack([start, end], 1)
            bb_list.append(bb)

            center_x = 0.5 * (x_max + x_min)
            center_y = 0.5 * (y_max + y_min)
            center = tf.stack([center_x, center_y], 0)
            # Replace a non-finite center by the configured fallback.
            center = tf.cond(
                tf.reduce_all(tf.is_finite(center)),
                lambda: center,
                lambda: tf.constant(list(fallback_center), dtype=tf.float32))
            center.set_shape([2])
            center_list.append(center)

            crop_size_x = x_max - x_min
            crop_size_y = y_max - y_min
            crop_size = tf.expand_dims(
                tf.maximum(crop_size_x, crop_size_y), 0)
            # Replace a non-finite crop size by the configured fallback.
            crop_size = tf.cond(
                tf.reduce_all(tf.is_finite(crop_size)),
                lambda: crop_size,
                lambda: tf.constant([fallback_crop_size], dtype=tf.float32))
            crop_size.set_shape([1])
            crop_size_list.append(crop_size)

        bb = tf.stack(bb_list)
        center = tf.stack(center_list)
        crop_size = tf.stack(crop_size_list)

        return center, bb, crop_size
def metric_fn(**kwargs):
    """Builds the evaluation metric dictionary.

    Runs per-class NMS on the raw predictions (through a numpy function by
    default, or through the in-graph implementation when
    `params['nms_configs']['pyfunc']` is falsy), feeds the detections to the
    COCO evaluation metric and adds the mean class/box losses.

    Args:
      **kwargs: tensors keyed by 'boxes', 'scores', 'classes', 'image_ids',
        'image_scales', 'cls_loss_repeat', 'box_loss_repeat' and, when
        evaluating against groundtruth, 'groundtruth_data'.

    Returns:
      A dict mapping metric names to metric ops.
    """
    boxes = kwargs['boxes']
    scores = kwargs['scores']
    classes = kwargs['classes']
    image_ids = kwargs['image_ids']
    image_scales = kwargs['image_scales']

    def _pyfunc_nms(index):
        # Per-image NMS executed in numpy.
        nms_configs = params['nms_configs']
        return tf.numpy_function(
            functools.partial(nms_np.per_class_nms, nms_configs=nms_configs),
            [
                boxes[index],
                scores[index],
                classes[index],
                tf.slice(image_ids, [index], [1]),
                tf.slice(image_scales, [index], [1]),
                params['num_classes'],
                nms_configs['max_output_size'],
            ], tf.float32)

    if params['nms_configs'].get('pyfunc', True):
        per_image = [_pyfunc_nms(index) for index in range(boxes.shape[0])]
        detections_bs = postprocess.transform_detections(tf.stack(per_image))
    else:
        # These two branches should be equivalent, but currently they are not.
        # TODO(tanmingxing): enable the non_pyfun path after bug fix.
        nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
            params, boxes, scores, classes, image_scales)
        img_ids = tf.cast(tf.expand_dims(image_ids, -1), nms_scores.dtype)
        # Columns: image id, then box columns 1, 0, (3 - 1), (2 - 0),
        # then score and class.
        columns = [
            img_ids * tf.ones_like(nms_scores),
            nms_boxes[:, :, 1],
            nms_boxes[:, :, 0],
            nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
            nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
            nms_scores,
            nms_classes,
        ]
        detections_bs = tf.stack(columns, axis=-1, name='detnections')

    testdev_dir = params.get('testdev_dir', None)
    if testdev_dir:
        logging.info('Eval testdev_dir %s', testdev_dir)
        eval_metric = coco_metric.EvaluationMetric(testdev_dir=testdev_dir)
        coco_metrics = eval_metric.estimator_metric_fn(
            detections_bs, tf.zeros([1]))
    else:
        val_json_file = params['val_json_file']
        logging.info('Eval val with groudtruths %s.', val_json_file)
        eval_metric = coco_metric.EvaluationMetric(filename=val_json_file)
        coco_metrics = eval_metric.estimator_metric_fn(
            detections_bs, kwargs['groundtruth_data'], params['label_map'])

    # Add the loss metrics, then the COCO metrics, to the output.
    output_metrics = {
        'cls_loss': tf.metrics.mean(kwargs['cls_loss_repeat']),
        'box_loss': tf.metrics.mean(kwargs['box_loss_repeat']),
    }
    output_metrics.update(coco_metrics)
    return output_metrics
def _parse_train_data(self, data):
    """Parses data for training.

    The pipeline is: optionally drop crowd annotations, normalize the image,
    optionally flip it, resize/crop image and boxes, drop empty boxes, crop
    and resize the instance masks (when enabled), and assign RPN anchor
    targets.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following
        describes {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image
          and the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          ...]; this method additionally reads row 2 as the image scale and
          row 3 as the crop offset.
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, 4] representing anchor boxes at each
          level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location]. The height_l and
          width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location * 4]. The height_l
          and width_l represent the dimension of bounding box regression
          output at l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is represented
          in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
          image that is fed to the network. The tensor is padded with -1 to
          the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: groundtruth masks cropped by the bounding box and
          resized to a fixed size determined by mask_crop_size. Only present
          when masks are included.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    if self._include_mask:
        masks = data['groundtruth_instance_masks']

    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training and self._is_training:
        num_groundtrtuhs = tf.shape(classes)[0]
        with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
            # When `is_crowds` is empty, keep every annotation; otherwise keep
            # the indices of the non-crowd ones.
            indices = tf.cond(
                tf.greater(tf.size(is_crowds), 0),
                lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
        classes = tf.gather(classes, indices)
        boxes = tf.gather(boxes, indices)
        if self._include_mask:
            masks = tf.gather(masks, indices)

    # Gets original image and its size.
    image = data['image']
    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = input_utils.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
        if self._include_mask:
            image, boxes, masks = input_utils.random_horizontal_flip(
                image, boxes, masks)
        else:
            image, boxes = input_utils.random_horizontal_flip(
                image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    # Now the coordinates of boxes are w.r.t. the original image.
    boxes = box_utils.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = input_utils.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=input_utils.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = input_utils.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_utils.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if self._include_mask:
        masks = tf.gather(masks, indices)
        # Transfer boxes to the original image space and do normalization.
        # The masks are still in the original image frame, so the crop/scale
        # applied to the boxes is undone before cropping the masks.
        cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
        cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
        cropped_boxes = box_utils.normalize_boxes(cropped_boxes, image_shape)
        num_masks = tf.shape(masks)[0]
        # Each mask is treated as its own single-channel image and cropped
        # by its own box.
        masks = tf.image.crop_and_resize(
            tf.expand_dims(masks, axis=-1),
            cropped_boxes,
            box_indices=tf.range(num_masks, dtype=tf.int32),
            crop_size=[self._mask_crop_size, self._mask_crop_size],
            method='bilinear')
        masks = tf.squeeze(masks, axis=-1)

    # Assigns anchor targets.
    # Note that after the target assignment, box targets are absolute pixel
    # offsets w.r.t. the scaled image.
    input_anchor = anchor.Anchor(
        self._min_level,
        self._max_level,
        self._num_scales,
        self._aspect_ratios,
        self._anchor_size,
        (image_height, image_width))
    anchor_labeler = anchor.RpnAnchorLabeler(
        input_anchor,
        self._rpn_match_threshold,
        self._rpn_unmatched_threshold,
        self._rpn_batch_size_per_im,
        self._rpn_fg_fraction)
    rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
        boxes, tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

    # If bfloat16 is used, casts input image to tf.bfloat16.
    if self._use_bfloat16:
        image = tf.cast(image, dtype=tf.bfloat16)

    # Packs labels for model_fn outputs.
    labels = {
        'anchor_boxes': input_anchor.multilevel_boxes,
        'image_info': image_info,
        'rpn_score_targets': rpn_score_targets,
        'rpn_box_targets': rpn_box_targets,
    }
    # Groundtruth tensors are padded with -1 up to self._max_num_instances.
    labels['gt_boxes'] = input_utils.pad_to_fixed_size(
        boxes, self._max_num_instances, -1)
    labels['gt_classes'] = input_utils.pad_to_fixed_size(
        classes, self._max_num_instances, -1)
    if self._include_mask:
        labels['gt_masks'] = input_utils.pad_to_fixed_size(
            masks, self._max_num_instances, -1)

    return image, labels
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Builds the ReadItTwice BERT model with a span prediction head, optionally
    initializes variables from a checkpoint, and returns a `TPUEstimatorSpec`
    for TRAIN or PREDICT.

    NOTE(review): this function reads many names it does not define
    (`model_config`, `init_checkpoint`, `use_tpu`, `padding_token_id`,
    `nbest_logits_for_eval`, the optimizer settings, ...); presumably it is
    nested inside a builder that supplies them -- confirm against the caller.

    Args:
      features: dict of input tensors. Reads "token_ids", "block_ids",
        "block_pos", "prefix_length", the optional "entity_annotation_*"
        entries and, in TRAIN mode, the "answer_annotation_*" entries.
      labels: unused.
      mode: a `tf.estimator.ModeKeys` value; only TRAIN and PREDICT are
        supported.
      params: dict of parameters; only logged here.

    Returns:
      A `tf.estimator.tpu.TPUEstimatorSpec`.

    Raises:
      ValueError: if `mode` is neither TRAIN nor PREDICT.
    """
    logging.info("*** Model: Params ***")
    for name in sorted(params.keys()):
        logging.info(" %s = %s", name, params[name])
    logging.info("*** Model: Features ***")
    for name in sorted(features.keys()):
        logging.info(" name = %s, shape = %s", name, features[name].shape)

    model = modeling.ReadItTwiceBertModel(
        config=model_config, use_one_hot_embeddings=use_one_hot_embeddings)

    span_prediction_layer = modeling.SpanPredictionHead(
        intermediate_size=model_config.intermediate_size,
        dropout_rate=model_config.hidden_dropout_prob)

    # [batch_size, main_seq_length]
    token_ids = features["token_ids"]
    main_seq_length = tf.shape(token_ids)[1]
    block_ids = features["block_ids"]
    block_pos = features["block_pos"]

    # Entity annotations are optional; `.get` yields None when absent.
    annotation_begins = features.get("entity_annotation_begins")
    annotation_ends = features.get("entity_annotation_ends")
    annotation_labels = features.get("entity_annotation_labels")

    # Do not attend padding tokens
    # [batch_size, main_seq_length, main_seq_length]
    att_mask = tf.tile(
        tf.expand_dims(tf.not_equal(token_ids, padding_token_id), 1),
        [1, main_seq_length, 1])
    att_mask = tf.cast(att_mask, dtype=tf.int32)

    main_output = model(
        token_ids=token_ids,
        training=(mode == tf.estimator.ModeKeys.TRAIN),
        block_ids=block_ids,
        block_pos=block_pos,
        att_mask=att_mask,
        annotation_begins=annotation_begins,
        annotation_ends=annotation_ends,
        annotation_labels=annotation_labels,
        enable_side_inputs=enable_side_inputs,
        num_replicas_concat=num_replicas_concat,
        cross_block_attention_mode=cross_block_attention_mode,
    ).final_hidden_states

    span_logits = span_prediction_layer(
        hidden_states=main_output,
        token_ids=token_ids,
        padding_token_id=padding_token_id,
        ignore_prefix_length=features["prefix_length"],
        training=(mode == tf.estimator.ModeKeys.TRAIN))

    tvars = tf.trainable_variables()

    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        (assignment_map, initialized_variable_names
        ) = checkpoint_utils.get_assignment_map_from_checkpoint(
            tvars, init_checkpoint)
        if use_tpu:
            # On TPU the checkpoint initialization is deferred to the
            # scaffold function handed to the TPUEstimatorSpec.
            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    # Log every trainable variable and whether it came from the checkpoint.
    logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                     init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        span_prediction_loss = losses.BatchSpanCrossEntropyLoss()

        total_loss = span_prediction_loss(
            logits=span_logits,
            annotation_begins=features["answer_annotation_begins"],
            annotation_ends=features["answer_annotation_ends"],
            annotation_labels=features["answer_annotation_labels"],
            block_ids=block_ids,
            num_replicas=num_replicas_concat,
            eps=1e-5)

        # Add regularization losses.
        if model.losses:
            total_loss += tf.math.add_n(model.losses)

        train_op = optimization.create_optimizer(
            total_loss,
            learning_rate,
            num_train_steps,
            num_warmup_steps,
            use_tpu,
            optimizer,
            poly_power,
            start_warmup_step,
            learning_rate_schedule,
            reduce_loss_sum=True)

        # Scalars are expanded to rank 1 before being passed to the host
        # call that records training summaries.
        host_inputs = {
            "global_step":
                tf.expand_dims(tf.train.get_or_create_global_step(), 0),
            "train_metrics/loss":
                tf.expand_dims(total_loss, 0),
        }

        host_call = (functools.partial(
            record_summary_host_fn,
            metrics_dir=os.path.join(FLAGS.output_dir, "train_metrics")),
                     host_inputs)

        output_spec = tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn,
            host_call=host_call)
    elif mode == tf.estimator.ModeKeys.PREDICT:
        # Keep only the top-k begin (channel 0) and end (channel 1) logits
        # together with their positions.
        begin_logits_values, begin_logits_indices = tf.math.top_k(
            span_logits[:, :, 0],
            k=nbest_logits_for_eval,
        )
        end_logits_values, end_logits_indices = tf.math.top_k(
            span_logits[:, :, 1],
            k=nbest_logits_for_eval,
        )

        predictions = {
            "block_ids": tf.identity(block_ids),
            "begin_logits_values": begin_logits_values,
            "begin_logits_indices": begin_logits_indices,
            "end_logits_values": end_logits_values,
            "end_logits_indices": end_logits_indices,
            "token_ids": tf.identity(token_ids),
        }
        output_spec = tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    else:
        raise ValueError("Only TRAIN and PREDICT modes is supported: %s" %
                         (mode))

    return output_spec
def body(self, features):
    """Encodes image and question, then decodes a single query vector.

    The image features (embedded from raw images or taken as given) and the
    question are combined and passed through the recurrent transformer
    "encoder"; a single learned query vector then attends to the encoder
    output through the recurrent transformer "decoder".

    Args:
      features: dict with "inputs" (image or precomputed image features) and
        "question".

    Returns:
      The decoder output with one extra axis inserted at position 1.
    """
    hp = self.hparams
    keep_prob = 1. - hp.layer_prepostprocess_dropout

    # Image features: embed raw images, or use the inputs directly.
    image_feat = features["inputs"]
    if hp.image_input_type == "image":
        # NOTE(review): `eval` resolves the model function from an hparams
        # string; it must never receive untrusted input.
        # pylint: disable=eval-used
        image_feat = vqa_layers.image_embedding(
            image_feat,
            model_fn=eval(hp.image_model_fn),
            trainable=hp.train_resnet,
            is_training=hp.mode == tf_estimator.ModeKeys.TRAIN)

    image_feat = common_layers.flatten4d3d(image_feat)
    image_feat = common_layers.dense(image_feat, hp.hidden_size)
    utils.collect_named_outputs(
        "norms", "image_feat_after_proj", tf.norm(image_feat, axis=-1))

    question = common_layers.flatten4d3d(features["question"])
    utils.collect_named_outputs(
        "norms", "question_embedding", tf.norm(question, axis=-1))

    prepared = prepare_image_question_encoder(image_feat, question, hp)
    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = prepared
    encoder_input = tf.nn.dropout(encoder_input, keep_prob=keep_prob)

    encoder_output, _ = recurrent_transformer_decoder(
        encoder_input, None, encoder_self_attention_bias, None,
        hp, name="encoder")
    utils.collect_named_outputs(
        "norms", "encoder_output", tf.norm(encoder_output, axis=-1))

    # Learned query, scaled by sqrt(hidden_size) and tiled over the batch to
    # shape [batch_size, 1, hidden_size].
    query = tf.get_variable("query", [hp.hidden_size]) * hp.hidden_size**0.5
    query = tf.expand_dims(tf.expand_dims(query, axis=0), axis=0)
    batch_size = common_layers.shape_list(encoder_input)[0]
    query = tf.tile(query, [batch_size, 1, 1])
    query = tf.nn.dropout(query, keep_prob=keep_prob)

    decoder_output, _ = recurrent_transformer_decoder(
        query, encoder_output, None, encoder_decoder_attention_bias,
        hp, name="decoder")
    utils.collect_named_outputs(
        "norms", "decoder_output", tf.norm(decoder_output, axis=-1))

    vqa_layers.summarize_tensors(
        utils.convert_collection_to_dict("norms"), tag="norms/")

    # Insert one extra axis at position 1.
    return tf.expand_dims(decoder_output, axis=1)
def compress(args):
    """Compresses the image at `args.input_file` into `args.output_file`.

    Restores the latest checkpoint from `args.checkpoint_dir`, writes the
    packed bit string plus shape information, and, when `args.verbose` is
    set, prints rate and distortion figures.

    Args:
      args: namespace with `input_file`, `output_file`, `num_filters`,
        `checkpoint_dir` and `verbose`.
    """
    # Load input image and add batch dimension.
    image = tf.expand_dims(read_png(args.input_file), 0)
    image.set_shape([1, None, None, 3])
    image_shape = tf.shape(image)

    # Instantiate model.
    analysis_transform = AnalysisTransform(args.num_filters)
    entropy_bottleneck = tfc.EntropyBottleneck()
    synthesis_transform = SynthesisTransform(args.num_filters)

    # Transform and compress the image.
    latent = analysis_transform(image)
    bitstring = entropy_bottleneck.compress(latent)

    # Transform the quantized image back (if requested) and crop the
    # reconstruction to the input's spatial size.
    latent_hat, likelihoods = entropy_bottleneck(latent, training=False)
    reconstruction = synthesis_transform(latent_hat)
    reconstruction = reconstruction[
        :, :image_shape[1], :image_shape[2], :]

    pixel_count = tf.cast(
        tf.reduce_prod(tf.shape(image)[:-1]), dtype=tf.float32)

    # Total number of bits divided by number of pixels.
    bpp_estimate = tf.reduce_sum(
        tf.log(likelihoods)) / (-np.log(2) * pixel_count)

    # Bring both images back to 0..255 range.
    image *= 255
    reconstruction = tf.clip_by_value(reconstruction, 0, 1)
    reconstruction = tf.round(reconstruction * 255)

    mse_op = tf.reduce_mean(tf.squared_difference(image, reconstruction))
    psnr_op = tf.squeeze(tf.image.psnr(reconstruction, image, 255))
    msssim_op = tf.squeeze(
        tf.image.ssim_multiscale(reconstruction, image, 255))

    with tf.Session() as sess:
        # Load the latest model checkpoint, get the compressed string and the
        # tensor shapes.
        checkpoint = tf.train.latest_checkpoint(
            checkpoint_dir=args.checkpoint_dir)
        tf.train.Saver().restore(sess, save_path=checkpoint)
        tensors = [
            bitstring, tf.shape(image)[1:-1], tf.shape(latent)[1:-1]]
        arrays = sess.run(tensors)

        # Write a binary file with the shape information and the compressed
        # string.
        packed = tfc.PackedTensors()
        packed.pack(tensors, arrays)
        with open(args.output_file, "wb") as f:
            f.write(packed.string)

        # If requested, transform the quantized image back and measure
        # performance.
        if args.verbose:
            eval_bpp, mse, psnr, msssim, num_pixels = sess.run(
                [bpp_estimate, mse_op, psnr_op, msssim_op, pixel_count])

            # The actual bits per pixel including overhead.
            bpp = len(packed.string) * 8 / num_pixels

            print("Mean squared error: {:0.4f}".format(mse))
            print("PSNR (dB): {:0.2f}".format(psnr))
            print("Multiscale SSIM: {:0.4f}".format(msssim))
            print("Multiscale SSIM (dB): {:0.2f}".format(
                -10 * np.log10(1 - msssim)))
            print("Information content in bpp: {:0.4f}".format(eval_bpp))
            print("Actual bits per pixel: {:0.4f}".format(bpp))
def compress(args):
    """Compresses an image with the hyperprior model and writes the result.

    Restores the latest checkpoint from `args.checkpoint_dir`, writes the
    packed bit strings plus shape information to `args.output_file`, and,
    when `args.verbose` is set, prints rate and distortion figures.

    Args:
      args: namespace with `input_file`, `output_file`, `num_filters`,
        `checkpoint_dir` and `verbose`.
    """
    # Only the image size is needed here; close the file handle right away.
    with Image.open(args.input_file) as img:
        w, h = img.size

    # Load input image and add batch dimension.
    x = read_png(args.input_file)
    x = tf.expand_dims(x, 0)
    x.set_shape([1, h, w, 3])
    x_shape = tf.shape(x)

    # Instantiate model.
    analysis_transform = AnalysisTransform(args.num_filters)
    synthesis_transform = SynthesisTransform(args.num_filters)
    hyper_analysis_transform = HyperAnalysisTransform(args.num_filters)
    hyper_synthesis_transform = HyperSynthesisTransform(args.num_filters)
    entropy_bottleneck = tfc.EntropyBottleneck()
    entropy_bottleneck1 = tfc.EntropyBottleneck()
    ftransform = FTransform(args.num_filters)

    # Transform and compress the image.
    y = analysis_transform(x)
    y_shape = tf.shape(y)
    y_prime, y_prime_likelihoods = entropy_bottleneck1(y, training=False)

    z = hyper_analysis_transform(abs(y_prime))
    z_hat, z_likelihoods = entropy_bottleneck(z, training=False)

    # Crop the hyper-synthesis output to the spatial size of `y`.
    c_prime = hyper_synthesis_transform(z_hat)
    c_prime = c_prime[:, :y_shape[1], :y_shape[2], :]
    mean, sigma = get_sigma_mu(y_prime, c_prime, ftransform)

    scale_table = np.exp(
        np.linspace(np.log(SCALES_MIN), np.log(SCALES_MAX), SCALES_LEVELS))
    conditional_bottleneck = tfc.GaussianConditional(
        sigma, scale_table, mean=mean)
    side_string = entropy_bottleneck.compress(z)
    string = conditional_bottleneck.compress(y)
    y_string = entropy_bottleneck1.compress(y)

    # Transform the quantized image back (if requested).
    y_hat, y_likelihoods = conditional_bottleneck(y, training=False)
    x_hat = synthesis_transform(y_hat)
    x_hat = x_hat[:, :x_shape[1], :x_shape[2], :]

    num_pixels = tf.cast(tf.reduce_prod(tf.shape(x)[:-1]), dtype=tf.float32)

    # Total number of bits divided by number of pixels.
    eval_bpp = (tf.reduce_sum(tf.log(y_likelihoods)) + tf.reduce_sum(
        tf.log(z_likelihoods))) / (-np.log(2) * num_pixels)

    # Bring both images back to 0..255 range.
    x *= 255
    x_hat = tf.clip_by_value(x_hat, 0, 1)  # Clamp every element to [0, 1].
    x_hat = tf.round(x_hat * 255)

    mse = tf.reduce_mean(tf.squared_difference(x, x_hat))
    psnr = tf.squeeze(tf.image.psnr(x_hat, x, 255))
    msssim = tf.squeeze(tf.image.ssim_multiscale(x_hat, x, 255))

    with tf.Session() as sess:
        # Load the latest model checkpoint, get the compressed string and the
        # tensor shapes.
        latest = tf.train.latest_checkpoint(checkpoint_dir=args.checkpoint_dir)
        tf.train.Saver().restore(sess, save_path=latest)
        tensors = [
            string, side_string,
            tf.shape(x)[1:-1],
            tf.shape(y)[1:-1],
            tf.shape(z)[1:-1], y_string
        ]
        arrays = sess.run(tensors)
        # Diagnostic output kept from the original implementation.
        print(sess.run([tf.shape(sigma), tf.shape(y), y_hat]))

        # Write a binary file with the shape information and the compressed
        # string.
        packed = tfc.PackedTensors()
        packed.pack(tensors, arrays)
        with open(args.output_file, "wb") as f:
            f.write(packed.string)

        # If requested, transform the quantized image back and measure
        # performance.
        if args.verbose:
            # Rebind the names to the evaluated values. `msssim` must be
            # rebound too, otherwise the prints below would try to format
            # the symbolic tensor.
            eval_bpp, mse, psnr, msssim, num_pixels = sess.run(
                [eval_bpp, mse, psnr, msssim, num_pixels])

            # The actual bits per pixel including overhead.
            bpp = len(packed.string) * 8 / num_pixels

            print("Mean squared error: {:0.4f}".format(mse))
            print("PSNR (dB): {:0.2f}".format(psnr))
            print("Multiscale SSIM: {:0.4f}".format(msssim))
            print("Multiscale SSIM (dB): {:0.2f}".format(
                -10 * np.log10(1 - msssim)))
            print("Information content in bpp: {:0.4f}".format(eval_bpp))
            print("Actual bits per pixel: {:0.4f}".format(bpp))
def GlobalToGenerator(inputs, channels):
    """Projects `inputs` with a fully connected layer and adds two axes.

    Args:
      inputs: tensor fed to the `fullyConnected` helper.
      channels: output size passed to the `fullyConnected` helper.

    Returns:
      The projected tensor with two singleton axes inserted at position 1
      (presumably [batch, 1, 1, channels] -- TODO confirm the helper's
      output shape).
    """
    with tf.variable_scope("GlobalToGenerator1"):
        # The last positional argument is 0.01 -- "Why so low ?" (question
        # kept from the original author).
        projected = fullyConnected(
            inputs, channels, False, "fullyConnected_global_to_unet", 0.01)
    with_first_axis = tf.expand_dims(projected, axis=1)
    return tf.expand_dims(with_first_axis, axis=1)
def prepare_processing_graph(self, flags):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, resamples it, scales
    the volume, shifts it in time, adds in background noise, and then either
    returns the raw audio or calculates a spectrogram and builds a
    fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal
        stretching/squeezing. It is always part of the graph; a factor of 1.0
        resizes the signal to its own length.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      flags: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
    with tf.get_default_graph().name_scope('data'):
        desired_samples = flags.desired_samples
        self.wav_filename_placeholder_ = tf.placeholder(
            tf.string, [], name='wav_filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = tf.audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)

        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')

        # Signal resampling to generate more training data: it will stretch
        # or squeeze the input signal proportionally to this factor.
        self.foreground_resampling_placeholder_ = tf.placeholder(
            tf.float32, [])

        # The resampling path is built unconditionally. The previous
        # `if placeholder != 1.0` guard compared a symbolic tensor with a
        # Python float at graph-construction time: in legacy graph mode that
        # is an identity test (always True, so the alternative branch was
        # unreachable), and with tensor equality enabled it raises because a
        # symbolic tensor cannot be used as a Python bool.
        # The audio is treated as a [1, samples, 1, channels] image so that
        # the image resize op can stretch it along the time axis.
        image = tf.expand_dims(wav_decoder.audio, 0)
        image = tf.expand_dims(image, 2)
        shape = tf.shape(wav_decoder.audio)
        image_resized = tf.image.resize(
            images=image,
            size=(tf.cast((tf.cast(shape[0], tf.float32) *
                           self.foreground_resampling_placeholder_),
                          tf.int32), 1),
            preserve_aspect_ratio=False)
        # Crop or zero-pad back to the expected number of samples.
        image_resized_cropped = tf.image.resize_with_crop_or_pad(
            image_resized,
            target_height=desired_samples,
            target_width=1,
        )
        image_resized_cropped = tf.squeeze(image_resized_cropped, axis=[0, 3])
        scaled_foreground = tf.multiply(
            image_resized_cropped, self.foreground_volume_placeholder_)

        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_offset_placeholder_ = tf.placeholder(
            tf.int32, [2], name='time_shift_offset')
        padded_foreground = tf.pad(
            tensor=scaled_foreground,
            paddings=self.time_shift_padding_placeholder_,
            mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])

        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

        if flags.preprocess == 'raw':
            # return raw audio
            self.output_ = background_clamp
            tf.summary.image(
                'input_audio',
                tf.expand_dims(tf.expand_dims(background_clamp, -1), -1),
                max_outputs=1)
        else:
            # Run the spectrogram and MFCC ops to get a 2D audio 'fingerprint'
            spectrogram = audio_ops.audio_spectrogram(
                background_clamp,
                window_size=flags.window_size_samples,
                stride=flags.window_stride_samples,
                magnitude_squared=True)
            tf.summary.image(
                'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)

            # The number of buckets in each FFT row in the spectrogram will
            # depend on how many input samples there are in each window. This
            # can be quite large, with a 160 sample window producing 127
            # buckets for example. We don't need this level of detail for
            # classification, so we often want to shrink them down to produce
            # a smaller result. That's what this section implements. One
            # method is to use average pooling to merge adjacent buckets, but
            # a more sophisticated approach is to apply the MFCC algorithm to
            # shrink the representation.
            if flags.preprocess == 'average':
                self.output_ = tf.nn.pool(
                    input=tf.expand_dims(spectrogram, -1),
                    window_shape=[1, flags.average_window_width],
                    strides=[1, flags.average_window_width],
                    pooling_type='AVG',
                    padding='SAME')
                tf.summary.image(
                    'shrunk_spectrogram', self.output_, max_outputs=1)
            elif flags.preprocess == 'mfcc':
                self.output_ = audio_ops.mfcc(
                    spectrogram,
                    wav_decoder.sample_rate,
                    dct_coefficient_count=flags.fingerprint_width)
                tf.summary.image(
                    'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
            elif flags.preprocess == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when running'
                        ' TensorFlow directly from Python, you need to build and run'
                        ' through Bazel')
                sample_rate = flags.sample_rate
                window_size_ms = (
                    flags.window_size_samples * 1000) / sample_rate
                window_step_ms = (
                    flags.window_stride_samples * 1000) / sample_rate
                # Scale the [-1, 1] float signal by 32768 and cast to int16.
                int16_input = tf.cast(
                    tf.multiply(background_clamp, 32768), tf.int16)
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=sample_rate,
                    window_size=window_size_ms,
                    window_step=window_step_ms,
                    num_channels=flags.fingerprint_width,
                    out_scale=1,
                    out_type=tf.float32)
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
                tf.summary.image(
                    'micro',
                    tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
                    max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc", '
                    ' "average", or "micro")' % (flags.preprocess))

        # Merge all the summaries and write them out to /tmp/retrain_logs (by
        # default)
        self.merged_summaries_ = tf.summary.merge_all(scope='data')
        if flags.summaries_dir:
            self.summary_writer_ = tf.summary.FileWriter(
                flags.summaries_dir + '/data', tf.get_default_graph())
def arbitrary_style_image_inputs(style_dataset_file,
                                 batch_size=None,
                                 image_size=None,
                                 center_crop=True,
                                 shuffle=True,
                                 augment_style_images=False,
                                 random_style_image_size=False,
                                 min_rand_image_size=128,
                                 max_rand_image_size=300):
  """Loads a batch of random style image given the path of tfrecord dataset.

  This method does not return pre-computed Gram matrices for the images like
  style_image_inputs, but it can provide data augmentation. If
  augment_style_images is True, the style images are randomly modified
  (brightness, saturation, hue, flips, random resize and random crop). If
  random_style_image_size is True, the (possibly batched) style image is
  resized to a random size.

  Args:
    style_dataset_file: str, path to the tfrecord dataset of style files.
    batch_size: int. If provided, batches style images. Defaults to None.
    image_size: int. The images will be resized bilinearly so that the smallest
        side has size image_size. Defaults to None.
    center_crop: bool. If True, center-crops to [image_size, image_size].
        Defaults to True.
    shuffle: bool, whether to shuffle style files at random. Defaults to True.
    augment_style_images: bool. Whether to augment style images or not.
        Requires image_size, because the augmented image is randomly cropped
        to [image_size, image_size].
    random_style_image_size: bool. If this value is True, then the style
        image(s) are resized to a random size between min_rand_image_size and
        max_rand_image_size. Only the first returned image is resized; the
        unaugmented copy is left as is.
    min_rand_image_size: int. If random_style_image_size is True, this value
        specifies the minimum image size.
    max_rand_image_size: int. If random_style_image_size is True, this value
        specifies the maximum image size.

  Returns:
    A tuple (image, label, image_orig):
      image: the style image with pixel values divided by 255 (with random
        changes if augment_style_images is True). It has a leading batch
        dimension: size 1 when batch_size is None, batch_size otherwise.
      label: the style label read from the 'label' feature of the record.
      image_orig: the style image without random augmentation, pixel values
        divided by 255. NOTE: when batch_size is None this tensor does NOT
        get a leading batch dimension (only `image` is expanded); it is
        batched together with `image` when batch_size is given.

  Raises:
    ValueError: if center cropping is requested but no image size is provided,
        or if both augment-style-images and center-cropping are requested,
        or if augment-style-images is requested but no image size is provided,
        or if batch size is specified but neither center-cropping nor
        augment-style-images is requested.
  """
  if center_crop and image_size is None:
    raise ValueError('center-cropping requires specifying the image size.')
  if center_crop and augment_style_images:
    raise ValueError(
        'When augment_style_images is true images will be randomly cropped.')
  if augment_style_images and image_size is None:
    # Augmentation resizes to image_size + [2, 200) and crops to image_size,
    # so it cannot run without a target size. Fail early with a clear message
    # instead of crashing later on an unbound local.
    raise ValueError(
        'augment_style_images requires specifying the image size.')
  if batch_size is not None and not center_crop and not augment_style_images:
    raise ValueError(
        'batching requires same image sizes (Set center-cropping or '
        'augment_style_images to true)')

  with tf.name_scope('style_image_processing'):
    # Force all input processing onto CPU in order to reserve the GPU for the
    # forward inference and back-propagation.
    with tf.device('/cpu:0'):
      filename_queue = tf.train.string_input_producer(
          [style_dataset_file],
          shuffle=False,
          capacity=1,
          name='filename_queue')
      if shuffle:
        examples_queue = tf.RandomShuffleQueue(
            capacity=64,
            min_after_dequeue=32,
            dtypes=[tf.string],
            name='random_examples_queue')
      else:
        examples_queue = tf.FIFOQueue(
            capacity=64, dtypes=[tf.string], name='fifo_examples_queue')
      reader = tf.TFRecordReader()
      _, value = reader.read(filename_queue)
      enqueue_ops = [examples_queue.enqueue([value])]
      tf.train.queue_runner.add_queue_runner(
          tf.train.queue_runner.QueueRunner(examples_queue, enqueue_ops))
      example_serialized = examples_queue.dequeue()
      features = tf.parse_single_example(
          example_serialized,
          features={
              'label': tf.FixedLenFeature([], tf.int64),
              'image_raw': tf.FixedLenFeature([], tf.string)
          })
      image = tf.image.decode_jpeg(features['image_raw'])
      image.set_shape([None, None, 3])
      label = features['label']

      # Default for the "no resizing requested" path; every branch below that
      # resizes/crops overrides it. Without this default, image_size=None
      # left image_orig unbound.
      image_orig = image

      if image_size is not None:
        image_channels = image.shape[2].value
        if augment_style_images:
          # image_orig keeps pointing at the decoded image while `image` is
          # rebound to the augmented version.
          image = tf.image.random_brightness(image, max_delta=0.8)
          image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
          image = tf.image.random_hue(image, max_delta=0.2)
          image = tf.image.random_flip_left_right(image)
          image = tf.image.random_flip_up_down(image)
          random_larger_image_size = tf.random_uniform(
              [],
              minval=image_size + 2,
              maxval=image_size + 200,
              dtype=tf.int32)
          image = _aspect_preserving_resize(image, random_larger_image_size)
          image = tf.random_crop(
              image, size=[image_size, image_size, image_channels])
          image.set_shape([image_size, image_size, image_channels])

          image_orig = _aspect_preserving_resize(image_orig, image_size + 2)
          image_orig = _central_crop([image_orig], image_size, image_size)[0]
          image_orig.set_shape([image_size, image_size, 3])
        elif center_crop:
          image = _aspect_preserving_resize(image, image_size + 2)
          image = _central_crop([image], image_size, image_size)[0]
          image.set_shape([image_size, image_size, image_channels])
          image_orig = image
        else:
          image = _aspect_preserving_resize(image, image_size)
          image_orig = image

      image = tf.to_float(image) / 255.0
      image_orig = tf.to_float(image_orig) / 255.0

      if batch_size is None:
        # NOTE(review): only `image` is expanded here; image_orig stays
        # unbatched. Kept as is because callers may rely on it.
        image = tf.expand_dims(image, 0)
      else:
        [image, image_orig, label] = tf.train.batch(
            [image, image_orig, label], batch_size=batch_size)

      if random_style_image_size:
        # Selects a random size for the style images and resizes all the images
        # in the batch to that size.
        image = _aspect_preserving_resize(
            image,
            tf.random_uniform(
                [],
                minval=min_rand_image_size,
                maxval=max_rand_image_size,
                dtype=tf.int32))

      return image, label, image_orig
def _calculate_expected_result(dist_per_cell, numeric_values,
                               numeric_values_scale, input_mask_float,
                               logits_aggregation, config):
  """Calculate the expected result given cell and aggregation probabilities.

  Computes soft SUM, AVERAGE and COUNT values from per-cell selection
  probabilities and mixes them with the aggregation-operator probabilities.

  Args:
    dist_per_cell: distribution over cell selection. Must provide
      `logits_parameter()` (used when config.use_gumbel_for_cells is set) and
      be accepted by `_get_probs`.
    numeric_values: numeric value per position; NaN entries are treated as
      non-numeric and replaced with zero. Presumably
      <float32>[batch_size, seq_length] -- confirm against caller.
    numeric_values_scale: divisor applied to the cell probabilities.
    input_mask_float: multiplicative mask applied to the cell probabilities.
    logits_aggregation: aggregation logits. Column 0 is dropped
      (`[:, 1:]`); the remaining columns must line up with the
      [sum, average, count] results, in that order.
    config: configuration object providing `use_gumbel_for_cells`,
      `temperature`, `average_approximation_function`,
      `use_gumbel_for_agg` and `agg_temperature`.

  Returns:
    The expected result per example: the [sum, average, count] values weighted
    by the aggregation probabilities and summed over axis 1.

  Raises:
    ValueError: if config.average_approximation_function is not one of the
      supported AverageApproximationFunction values. (Previously this case was
      only logged and then failed later on an unbound `average_result`.)
  """
  if config.use_gumbel_for_cells:
    gumbel_dist = tfp.distributions.RelaxedBernoulli(
        # The token logits were already divided by the temperature and used
        # for computing cell selection errors so we need to multiply it again
        # here.
        config.temperature,
        logits=dist_per_cell.logits_parameter() * config.temperature)
    scaled_probability_per_cell = gumbel_dist.sample()
  else:
    scaled_probability_per_cell = _get_probs(dist_per_cell)

  # <float32>[batch_size, seq_length]
  scaled_probability_per_cell = (scaled_probability_per_cell /
                                 numeric_values_scale) * input_mask_float
  count_result = tf.reduce_sum(scaled_probability_per_cell, axis=1)
  numeric_values_masked = tf.where(
      tf.is_nan(numeric_values), tf.zeros_like(numeric_values),
      numeric_values)  # Mask non-numeric table values to zero.
  sum_result = tf.reduce_sum(
      scaled_probability_per_cell * numeric_values_masked, axis=1)

  avg_approximation = config.average_approximation_function
  if avg_approximation == AverageApproximationFunction.RATIO:
    average_result = sum_result / (count_result + _EPSILON_ZERO_DIVISION)
  elif avg_approximation == AverageApproximationFunction.FIRST_ORDER:
    # For each cell: the sum of the probabilities of all *other* cells, plus 1.
    ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) \
        - scaled_probability_per_cell + 1
    average_result = tf.reduce_sum(
        numeric_values_masked * scaled_probability_per_cell / ex, axis=1)
  elif avg_approximation == AverageApproximationFunction.SECOND_ORDER:
    # For each cell: the sum of the probabilities of all *other* cells, plus 1.
    ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) \
        - scaled_probability_per_cell + 1
    pointwise_var = scaled_probability_per_cell * \
        (1 - scaled_probability_per_cell)
    # Same "all other cells" reduction, applied to the p * (1 - p) terms.
    var = tf.reduce_sum(pointwise_var, axis=1, keepdims=True) - pointwise_var
    multiplier = (var / tf.math.square(ex) + 1) / ex
    average_result = tf.reduce_sum(
        numeric_values_masked * scaled_probability_per_cell * multiplier,
        axis=1)
  else:
    # Fail loudly: without a valid mode `average_result` would be undefined.
    raise ValueError("Invalid average_approximation_function: %s" %
                     (avg_approximation,))

  if config.use_gumbel_for_agg:
    gumbel_dist = tfp.distributions.RelaxedOneHotCategorical(
        config.agg_temperature, logits=logits_aggregation[:, 1:])
    # <float32>[batch_size, num_aggregation_labels - 1]
    aggregation_op_only_probs = gumbel_dist.sample()
  else:
    # <float32>[batch_size, num_aggregation_labels - 1]
    aggregation_op_only_probs = tf.nn.softmax(
        logits_aggregation[:, 1:] / config.agg_temperature, axis=-1)

  all_results = tf.concat([
      tf.expand_dims(sum_result, axis=1),
      tf.expand_dims(average_result, axis=1),
      tf.expand_dims(count_result, axis=1)
  ], axis=1)

  expected_result = tf.reduce_sum(
      all_results * aggregation_op_only_probs, axis=1)
  return expected_result
def calculate_influence_ood(params):
  """Computes and saves influence quantities for a trained model with OOD data.

  For a window of validation, test and OOD examples, every example is paired
  with each in-distribution label in turn. For each (example, label) pair the
  function runs `get_parameter_influence` and a second
  `get_ihvp_conjugate_gradient` pass, then stores the stacked results, split
  into 'valid_*', 'test_*' and 'ood_*' tensors, via `utils.save_tensors`.

  Args:
    params (dict): contains a number of params - as loaded from flags.
      Should contain:
      seed (int) - random seed for Tensorflow and Numpy initialization.
      training_results_dir (str) - parent directory of the pre-trained model.
      clf_name (str) - the name of the pre-trained model's directory.
      n_test_infl (int) - number of examples to run influence functions for.
      start_ix_test_infl (int) - index to start loading examples from.
      cg_maxiter (int) - max number of iterations for conjugate gradient.
      squared (bool) - whether to calculate squared Hessian directly.
      tol (float) - tolerance for conjugate gradient.
      lam (float) - L2 regularization amount for Hessian.
      hvp_samples (int) - number of samples to take in HVP estimation.
      output_dir (str) - where results should be written; when falsy, results
        go under training_results_dir/clf_name/influence_results.
      tname (str) - extra string to add to saved tensor names; can be ''.
      preloaded_model (model or None) - if None, the model is loaded from the
        checkpoint directory; otherwise this model is used directly.
      preloaded_itr (Iterator or None) - if None, a training data iterator is
        loaded for every example; otherwise this iterator is used.
  """
  seed = params['seed']
  tf.set_random_seed(seed)
  np.random.seed(seed)

  # Locate the trained classifier and read its stored hyperparameters.
  model_root = os.path.join(params['training_results_dir'],
                            params['clf_name'])
  hparams = utils.load_json(os.path.join(model_root, 'params.json'))

  model = params['preloaded_model']
  if model is None:

    def _int_list(key):
      # Hyperparameters are stored as comma-separated strings.
      return [int(tok) for tok in hparams[key].split(',')]

    checkpoint = os.path.join(model_root, 'ckpts/bestmodel-1')
    model = utils.load_model(
        checkpoint, classifier.CNN, {
            'conv_dims': _int_list('conv_dims'),
            'conv_sizes': _int_list('conv_sizes'),
            'dense_sizes': _int_list('dense_sizes'),
            'n_classes': hparams['n_classes'],
            'onehot': True
        })

  # Saved example tensors for the three evaluation splits.
  tensor_root = os.path.join(model_root, 'tensors')
  valid_examples = utils.load_tensor(
      os.path.join(tensor_root, 'valid_x_infl.npy'))
  test_examples = utils.load_tensor(
      os.path.join(tensor_root, 'test_x_infl.npy'))
  ood_examples = utils.load_tensor(
      os.path.join(tensor_root, 'ood_x_infl.npy'))

  # Split the label set into in-distribution and out-of-distribution classes.
  num_labels = hparams['n_classes']
  if 'ood_classes' in hparams:
    held_out_classes = [int(tok) for tok in hparams['ood_classes'].split(',')]
  else:
    held_out_classes = []
  in_dist_classes = [
      cls for cls in range(num_labels) if cls not in held_out_classes
  ]
  num_in_dist = len(in_dist_classes)

  # Label noise used when (re)loading the training iterator.
  if 'label_noise' in hparams:
    noise_level = hparams['label_noise']
  else:
    noise_level = 0.

  # Only a window of each split is processed, for computational reasons.
  num_examples = params['n_test_infl']
  first_ix = params['start_ix_test_infl']
  last_ix = first_ix + num_examples
  valid_window = valid_examples[first_ix:last_ix]
  test_window = test_examples[first_ix:last_ix]
  ood_window = ood_examples[first_ix:last_ix]

  # Every example is evaluated once per in-distribution label: build one
  # block of one-hot labels per class, then repeat it for the three splits.
  label_blocks = []
  for cls in in_dist_classes:
    label_blocks.append(
        tf.one_hot(tf.fill((num_examples,), cls), depth=num_labels))
  labels_per_split = tf.concat(label_blocks, axis=0)
  y_all = tf.concat([labels_per_split] * 3, axis=0)

  valid_tiled = tf.concat([valid_window] * num_in_dist, axis=0)
  test_tiled = tf.concat([test_window] * num_in_dist, axis=0)
  ood_tiled = tf.concat([ood_window] * num_in_dist, axis=0)
  x_all = tf.concat([valid_tiled, test_tiled, ood_tiled], axis=0)

  cg_settings = {
      'maxiter': params['cg_maxiter'],
      'squared': params['squared'],
      'tol': params['tol'],
      'hvp_samples': params['hvp_samples']
  }

  # Conjugate gradient is run one example at a time; per-example outputs:
  ihvp_list = []  # H^{-1}g
  laplace_list = []  # gH^{-1}g
  ihvp_sq_list = []  # H^{-2}g
  grad_list = []  # g
  # Success flags reported by the solver for the H^{-1}g and H^{-2}g solves.
  flag_list = []
  flag_sq_list = []

  for idx in range(x_all.shape[0]):
    logging.info('Example {:d}'.format(idx))
    tic = time.time()
    example_x = tf.expand_dims(x_all[idx], 0)
    example_y = tf.expand_dims(y_all[idx], 0)

    train_itr = params['preloaded_itr']
    if train_itr is None:
      train_itr, _, _, _ = dataset_utils.load_dataset_ood_supervised_onehot(
          in_dist_classes, held_out_classes, label_noise=noise_level)

    ihvp_i, grad_i, flag_i = get_parameter_influence(
        model,
        example_x,
        example_y,
        train_itr,
        approx_params=cg_settings,
        damping=params['lam'])
    toc = time.time()
    logging.info('IHVP calculation took {:.3f} seconds'.format(toc - tic))

    laplace_i = tf.multiply(ihvp_i, grad_i)

    # Second solve: feed H^{-1}g (reshaped like the weights) back in.
    ihvp_weight_shaped = tensor_utils.reshape_vector_as(model.weights, ihvp_i)
    loss_fn = calculate_influence.make_loss_fn(model, params['lam'])
    grad_fn = calculate_influence.make_grad_fn(model)
    map_grad_fn = calculate_influence.make_map_grad_fn(model)
    tic = time.time()
    ihvp_sq_i, flag_sq_i = get_ihvp_conjugate_gradient(
        ihvp_weight_shaped,
        train_itr,
        loss_fn,
        grad_fn,
        map_grad_fn,
        approx_params=cg_settings)
    toc = time.time()
    logging.info('Second IHVP calculation took {:.3f} seconds'.format(
        toc - tic))

    ihvp_list.append(ihvp_i)
    laplace_list.append(laplace_i)
    ihvp_sq_list.append(ihvp_sq_i)
    grad_list.append(grad_i)
    flag_list.append(tf.expand_dims(flag_i, 0))
    flag_sq_list.append(tf.expand_dims(flag_sq_i, 0))

  infl_value = tf.concat(ihvp_list, axis=0)
  infl_laplace = tf.concat(laplace_list, axis=0)
  infl_deriv = tf.concat(ihvp_sq_list, axis=0)
  grads = tf.concat(grad_list, axis=0)
  warning_flags = tf.concat(flag_list, axis=0)
  warning_flags_deriv = tf.concat(flag_sq_list, axis=0)

  # Rows are ordered [validation | test | ood], each split holding
  # num_examples * num_in_dist rows.
  rows_per_split = num_examples * num_in_dist
  stacked_outputs = [
      (infl_value, 'infl'),
      (infl_deriv, 'deriv'),
      (infl_laplace, 'laplace'),
      (grads, 'grads'),
      (warning_flags, 'warnflags'),
      (warning_flags_deriv, 'warnflags_deriv'),
  ]
  res = {}
  for stacked, suffix in stacked_outputs:
    res['valid_{}'.format(suffix)] = stacked[:rows_per_split]
    res['test_{}'.format(suffix)] = stacked[rows_per_split:2 * rows_per_split]
    res['ood_{}'.format(suffix)] = stacked[2 * rows_per_split:]

  # Save the results of these calculations.
  if params['output_dir']:
    out_dir = utils.make_subdir(params['output_dir'], 'influence_results')
  else:
    out_dir = utils.make_subdir(model_root, 'influence_results')

  name_template = '{}{}-inv_hvp-cg-ix{:d}-ninfl{:d}' + (
      '_squared' if params['squared'] else '')
  named_tensors = []
  for key, tensor in res.items():
    tensor_name = name_template.format(params['tname'], key, first_ix,
                                       num_examples)
    named_tensors.append((tensor_name, tensor))
  utils.save_tensors(named_tensors, out_dir)
def add_metric_fn_inputs(params, cls_outputs, box_outputs, metric_fn_inputs,
                         max_detection_points=anchors.MAX_DETECTION_POINTS):
  """Selects top-k predictions and adds the selected to metric_fn_inputs.

  The per-level outputs are read but not modified: the channels-first
  transpose is applied to local tensors only, so the caller's `cls_outputs`
  and `box_outputs` dictionaries keep their original layout.

  Args:
    params: a parameter dictionary that includes `min_level`, `max_level`,
      `batch_size`, `num_classes`, and `data_format`.
    cls_outputs: an OrderedDict with keys representing levels and values
      representing logits in [batch_size, height, width, num_anchors]
      (channels-last), or with the channel axis at position 1 when
      `params['data_format']` is 'channels_first'.
    box_outputs: an OrderedDict with keys representing levels and values
      representing box regression targets in [batch_size, height, width,
      num_anchors * 4], with the same layout convention as `cls_outputs`.
    metric_fn_inputs: a dictionary that will hold the top-k selections. The
      keys `cls_outputs_all`, `box_outputs_all`, `indices_all` and
      `classes_all` are written.
    max_detection_points: an integer specifying the maximum detection points
      to keep before NMS. Keep all anchors if max_detection_points <= 0.
  """
  batch_size = params['batch_size']
  num_classes = params['num_classes']
  channels_first = params['data_format'] == 'channels_first'

  cls_outputs_all = []
  box_outputs_all = []
  # Concatenates class and box of all levels into one tensor.
  for level in range(params['min_level'], params['max_level'] + 1):
    cls_level = cls_outputs[level]
    box_level = box_outputs[level]
    if channels_first:
      # Move channels last on local copies; do not write back into the
      # caller's dictionaries.
      cls_level = tf.transpose(cls_level, [0, 2, 3, 1])
      box_level = tf.transpose(box_level, [0, 2, 3, 1])

    cls_outputs_all.append(
        tf.reshape(cls_level, [batch_size, -1, num_classes]))
    box_outputs_all.append(tf.reshape(box_level, [batch_size, -1, 4]))
  cls_outputs_all = tf.concat(cls_outputs_all, 1)
  box_outputs_all = tf.concat(box_outputs_all, 1)

  if max_detection_points > 0:
    # Prune anchors and detections to only keep max_detection_points.
    # Due to some issues, top_k is currently slow in graph mode.
    cls_outputs_all_reshape = tf.reshape(cls_outputs_all, [batch_size, -1])
    _, cls_topk_indices = tf.math.top_k(
        cls_outputs_all_reshape, k=max_detection_points, sorted=False)
    # Flattened index = anchor_index * num_classes + class_index.
    indices = cls_topk_indices // num_classes
    classes = cls_topk_indices % num_classes
    cls_indices = tf.stack([indices, classes], axis=2)

    cls_outputs_all_after_topk = tf.gather_nd(
        cls_outputs_all, cls_indices, batch_dims=1)
    box_outputs_all_after_topk = tf.gather_nd(
        box_outputs_all, tf.expand_dims(indices, 2), batch_dims=1)
  else:
    # Keep all anchors, but for each anchor, just keep the max probability
    # over classes.
    cls_outputs_idx = tf.math.argmax(cls_outputs_all, axis=-1)
    num_anchors = cls_outputs_all.shape[1]

    classes = cls_outputs_idx
    indices = tf.tile(
        tf.expand_dims(tf.range(num_anchors), axis=0), [batch_size, 1])
    cls_outputs_all_after_topk = tf.reduce_max(cls_outputs_all, -1)
    box_outputs_all_after_topk = box_outputs_all

  metric_fn_inputs['cls_outputs_all'] = cls_outputs_all_after_topk
  metric_fn_inputs['box_outputs_all'] = box_outputs_all_after_topk
  metric_fn_inputs['indices_all'] = indices
  metric_fn_inputs['classes_all'] = classes