def train(flags):
    """Training entry point."""
    log_dir = flags.log_dir
    flags.pretrained_model_dir = log_dir
    log_dir = os.path.join(log_dir, 'train')
    flags.eval_interval_secs = 0
    with tf.Graph().as_default():
        global_step = tf.Variable(
            0, trainable=False, name='global_step', dtype=tf.int64)
        global_step_confidence = tf.Variable(
            0, trainable=False, name='global_step_confidence', dtype=tf.int64)

        model = build_model(flags)
        images_query_pl, labels_query_pl, \
            images_support_pl, labels_support_pl = \
            build_episode_placeholder(flags)

        # Augments the input.
        if flags.dataset == 'cifar10' or flags.dataset == 'cifar100':
            images_query_pl_aug = data_loader.augment_cifar(
                images_query_pl, is_training=True)
            images_support_pl_aug = data_loader.augment_cifar(
                images_support_pl, is_training=True)
        elif flags.dataset == 'tinyimagenet':
            images_query_pl_aug = data_loader.augment_tinyimagenet(
                images_query_pl, is_training=True)
            images_support_pl_aug = data_loader.augment_tinyimagenet(
                images_support_pl, is_training=True)

        logits, logits_z = build_proto_train_graph(
            images_query=images_query_pl_aug,
            images_support=images_support_pl_aug,
            flags=flags,
            is_training=True,
            model=model)

        # Losses and optimizer.
        # Classification loss.
        loss_classification = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=logits,
                labels=tf.one_hot(labels_query_pl, flags.num_classes_train)))

        # Confidence loss.
        _, top_k_indices = tf.nn.top_k(logits, k=1)
        pred = tf.squeeze(top_k_indices)
        incorrect_mask = tf.math.logical_not(tf.math.equal(pred, labels_query_pl))
        incorrect_logits_z = tf.boolean_mask(logits_z, incorrect_mask)
        incorrect_labels_z = tf.boolean_mask(labels_query_pl, incorrect_mask)
        signal_variance = tf.math.reduce_sum(tf.cast(incorrect_mask, tf.int32))
        loss_variance_incorrect = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=incorrect_logits_z,
                labels=tf.one_hot(incorrect_labels_z, flags.num_classes_train)))
        loss_variance_zero = 0.0
        loss_confidence = tf.cond(
            tf.greater(signal_variance, 0),
            lambda: loss_variance_incorrect,
            lambda: loss_variance_zero)

        regu_losses = tf.losses.get_regularization_losses()
        loss = tf.add_n([loss_classification] + regu_losses)

        # Learning rate.
        if flags.lr_anneal == 'const':
            learning_rate = flags.init_learning_rate
        elif flags.lr_anneal == 'pwc':
            learning_rate = get_pwc_learning_rate(global_step, flags)
        elif flags.lr_anneal == 'exp':
            lr_decay_step = flags.number_of_steps // flags.n_lr_decay
            learning_rate = tf.train.exponential_decay(
                flags.init_learning_rate,
                global_step,
                lr_decay_step,
                1.0 / flags.lr_decay_rate,
                staircase=True)
        else:
            raise Exception('Not implemented')

        # Optimizer.
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=learning_rate, momentum=0.9)
        optimizer_confidence = tf.train.MomentumOptimizer(
            learning_rate=learning_rate, momentum=0.9)

        train_op = contrib_slim.learning.create_train_op(
            total_loss=loss,
            optimizer=optimizer,
            global_step=global_step,
            clip_gradient_norm=flags.clip_gradient_norm)
        variable_variance = []
        for v in tf.trainable_variables():
            if 'fc_variance' in v.name:
                variable_variance.append(v)
        train_op_confidence = contrib_slim.learning.create_train_op(
            total_loss=loss_confidence,
            optimizer=optimizer_confidence,
            global_step=global_step_confidence,
            clip_gradient_norm=flags.clip_gradient_norm,
            variables_to_train=variable_variance)

        tf.summary.scalar('loss', loss)
        tf.summary.scalar('loss_classification', loss_classification)
        tf.summary.scalar('loss_variance', loss_confidence)
        tf.summary.scalar('regu_loss', tf.add_n(regu_losses))
        tf.summary.scalar('learning_rate', learning_rate)
        # Merges all summaries except for pretrain.
        summary = tf.summary.merge(
            tf.get_collection('summaries', scope='(?!pretrain).*'))

        # Gets datasets.
        few_shot_data_train, test_dataset, train_dataset = get_train_datasets(
            flags)
        # Defines session and logging.
        summary_writer_train = tf.summary.FileWriter(log_dir, flush_secs=1)
        saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)
        print(saver.saver_def.filename_tensor_name)
        print(saver.saver_def.restore_op_name)
        # pylint: disable=unused-variable
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        supervisor = tf.train.Supervisor(
            logdir=log_dir,
            init_feed_dict=None,
            summary_op=None,
            init_op=tf.global_variables_initializer(),
            summary_writer=summary_writer_train,
            saver=saver,
            global_step=global_step,
            save_summaries_secs=flags.save_summaries_secs,
            save_model_secs=0)

        with supervisor.managed_session() as sess:
            checkpoint_step = sess.run(global_step)
            if checkpoint_step > 0:
                checkpoint_step += 1
            eval_interval_steps = flags.eval_interval_steps
            for step in range(checkpoint_step, flags.number_of_steps):
                # Computes the classification loss using a batch of data.
                images_query, labels_query, \
                    images_support, labels_support = \
                    few_shot_data_train.next_few_shot_batch(
                        query_batch_size_per_task=flags.train_batch_size,
                        num_classes_per_task=flags.num_classes_train,
                        num_supports_per_class=flags.num_shots_train,
                        num_tasks=flags.num_tasks_per_batch)

                feed_dict = {
                    images_query_pl: images_query.astype(dtype=np.float32),
                    labels_query_pl: labels_query,
                    images_support_pl: images_support.astype(dtype=np.float32),
                    labels_support_pl: labels_support
                }

                t_batch = time.time()
                dt_batch = time.time() - t_batch

                t_train = time.time()
                loss, loss_confidence = sess.run(
                    [train_op, train_op_confidence], feed_dict=feed_dict)
                dt_train = time.time() - t_train

                if step % 100 == 0:
                    summary_str = sess.run(summary, feed_dict=feed_dict)
                    summary_writer_train.add_summary(summary_str, step)
                    summary_writer_train.flush()
                    logging.info(
                        'step %d, loss : %.4g, dt: %.3gs, dt_batch: %.3gs',
                        step, loss, dt_train, dt_batch)
                if float(step) / flags.number_of_steps > 0.5:
                    eval_interval_steps = flags.eval_interval_fine_steps
                if eval_interval_steps > 0 and step % eval_interval_steps == 0:
                    saver.save(sess, os.path.join(log_dir, 'model'),
                               global_step=step)
                    eval(flags=flags, train_dataset=train_dataset,
                         test_dataset=test_dataset)
                if float(step) > (0.5 * flags.number_of_steps +
                                  flags.number_of_steps_to_early_stop):
                    break
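# Hypothetical invocation sketch (not part of the original code): `train` only
# reads attributes off `flags`, so any namespace carrying the fields used
# above will do. Values are illustrative, not tuned.
def _example_train_flags():
    from types import SimpleNamespace
    return SimpleNamespace(
        log_dir='/tmp/proto_train', dataset='cifar10',
        lr_anneal='exp', init_learning_rate=0.1, n_lr_decay=3,
        lr_decay_rate=10.0, clip_gradient_norm=1.0,
        number_of_steps=30000, number_of_steps_to_early_stop=15000,
        train_batch_size=32, num_classes_train=5, num_shots_train=1,
        num_tasks_per_batch=2, save_summaries_secs=300,
        eval_interval_steps=1000, eval_interval_fine_steps=250)
# train(_example_train_flags()) would then drive the loop above.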
def build(self):
    self.lr = tf.placeholder(tf.float32, shape=None, name='learning_rate')

    # Inputs
    self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim,
                            name='state')
    self.a = tf.placeholder(tf.int32, shape=(None,), name='action')
    self.returns = tf.placeholder(tf.float32, shape=(None,), name='return')

    # Build network
    self.pi = dense_nn(self.s, self.layer_sizes + [self.act_size],
                       name='pi_network')
    self.sampled_actions = tf.squeeze(tf.multinomial(self.pi, 1))
    self.pi_vars = self.scope_vars('pi_network')

    if self.baseline:
        # State value estimation as the baseline
        self.v = dense_nn(self.s, self.layer_sizes + [1], name='v_network')
        self.target = self.returns - self.v  # advantage

        with tf.variable_scope('v_optimize'):
            self.loss_v = tf.reduce_mean(
                tf.squared_difference(self.v, self.returns))
            self.optim_v = tf.train.AdamOptimizer(self.lr).minimize(
                self.loss_v, name='adam_optim_v')
    else:
        self.target = tf.identity(self.returns)

    with tf.variable_scope('pi_optimize'):
        self.loss_pi = tf.reduce_mean(
            tf.stop_gradient(self.target) *
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.pi, labels=self.a),
            name='loss_pi')
        # self.optim_pi = tf.train.AdamOptimizer(self.lr)
        # self.grads_pi = self.optim_pi.compute_gradients(self.loss_pi,
        #                                                 self.pi_vars)
        # self.train_pi_op = self.optim_pi.apply_gradients(self.grads_pi)
        self.optim_pi = tf.train.AdamOptimizer(self.lr).minimize(
            self.loss_pi, name='adam_optim_pi')

    with tf.variable_scope('summary'):
        self.loss_pi_summ = tf.summary.scalar('loss_pi', self.loss_pi)
        self.ep_reward = tf.placeholder(tf.float32, name='episode_reward')
        self.ep_reward_summ = tf.summary.scalar('episode_reward',
                                                self.ep_reward)
        summ_list = [self.loss_pi_summ, self.ep_reward_summ]

        if self.baseline:
            self.loss_v_summ = tf.summary.scalar('loss_v', self.loss_v)
            summ_list.append(self.loss_v_summ)
        self.merged_summary = tf.summary.merge(summ_list)

    if self.baseline:
        self.train_ops = [self.optim_pi, self.optim_v]
    else:
        self.train_ops = [self.optim_pi]

    self.sess.run(tf.global_variables_initializer())
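# Hypothetical training-step sketch (assumes `agent` is an instance whose
# build() above has already run, and that states/actions/returns/episode_reward
# come from one finished episode). With baseline=True, the v-network turns raw
# returns into advantages via target = returns - v.
def _example_policy_gradient_step(agent, states, actions, returns,
                                  episode_reward, lr=1e-3):
    feed = {agent.s: states, agent.a: actions, agent.returns: returns,
            agent.lr: lr, agent.ep_reward: episode_reward}
    _, summ = agent.sess.run([agent.train_ops, agent.merged_summary], feed)
    return summ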
def transformer_ffn_layer(x,
                          hparams,
                          pad_remover=None,
                          conv_padding="LEFT",
                          nonpadding_mask=None,
                          losses=None,
                          cache=None,
                          decode_loop_step=None,
                          readout_d_ff=0,
                          layer_collection=None):
    """Feed-forward layer in the transformer.

    Args:
      x: a Tensor of shape [batch_size, length, hparams.model_d]
      hparams: hyperparameters for model
      pad_remover: an expert_utils.PadRemover object tracking the padding
        positions. If provided, when using convolutional settings, the padding
        is removed before applying the convolution, and restored afterward.
        This can give a significant speedup.
      conv_padding: a string - either "LEFT" or "SAME".
      nonpadding_mask: an optional Tensor with shape [batch_size, length].
        Needed for convolutional layers with "SAME" padding. Contains 1.0 in
        positions corresponding to nonpadding.
      losses: optional list onto which to append extra training losses
      cache: dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
      decode_loop_step: An integer, step number of the decoding loop. Only
        used for inference on TPU.
      readout_d_ff: if it's greater than 0, then it will be used instead of
        d_ff
      layer_collection: A tensorflow_kfac.LayerCollection. Only used by the
        KFAC optimizer. Default is None.

    Returns:
      a Tensor of shape [batch_size, length, hparams.model_d]

    Raises:
      ValueError: If losses arg is None, but layer generates extra losses.
    """
    ffn_layer = hparams.ffn_layer
    relu_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "relu_dropout_broadcast_dims", "")))
    if ffn_layer == "conv_hidden_relu":
        # Backwards compatibility.
        ffn_layer = "dense_relu_dense"
    if ffn_layer == "dense_relu_dense":
        # In simple convolution mode, use `pad_remover` to speed up processing.
        mlperf_log.transformer_print(
            key=mlperf_log.MODEL_HP_FFN_FILTER_DENSE,
            value={
                "d_ff": hparams.d_ff,
                "use_bias": "True",
                "activation": mlperf_log.RELU
            })
        mlperf_log.transformer_print(
            key=mlperf_log.MODEL_HP_FFN_OUTPUT_DENSE,
            value={
                "model_d": hparams.model_d,
                "use_bias": "True",
            })
        mlperf_log.transformer_print(
            key=mlperf_log.MODEL_HP_RELU_DROPOUT, value=hparams.relu_dropout)
        if pad_remover:
            original_shape = common_layers.shape_list(x)
            # Collapse `x` across examples, and remove padding positions.
            x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0))
            x = tf.expand_dims(pad_remover.remove(x), axis=0)
        conv_output = common_layers.dense_relu_dense(
            x,
            hparams.d_ff,
            hparams.model_d,
            dropout=hparams.relu_dropout,
            dropout_broadcast_dims=relu_dropout_broadcast_dims,
            layer_collection=layer_collection)
        if pad_remover:
            # Restore `conv_output` to the original shape of `x`, including
            # padding.
            conv_output = tf.reshape(
                pad_remover.restore(tf.squeeze(conv_output, axis=0)),
                original_shape)
        return conv_output
    elif ffn_layer == "conv_relu_conv":
        return common_layers.conv_relu_conv(
            x,
            readout_d_ff or hparams.d_ff,
            hparams.model_d,
            first_kernel_size=hparams.conv_first_kernel,
            second_kernel_size=1,
            padding=conv_padding,
            nonpadding_mask=nonpadding_mask,
            dropout=hparams.relu_dropout,
            cache=cache,
            decode_loop_step=decode_loop_step)
    elif ffn_layer == "parameter_attention":
        return common_attention.parameter_attention(
            x,
            hparams.parameter_attention_key_channels or hparams.model_d,
            hparams.parameter_attention_value_channels or hparams.model_d,
            hparams.model_d,
            readout_d_ff or hparams.d_ff,
            hparams.num_heads,
            hparams.attention_dropout)
    elif ffn_layer == "conv_hidden_relu_with_sepconv":
        return common_layers.conv_hidden_relu(
            x,
            readout_d_ff or hparams.d_ff,
            hparams.model_d,
            kernel_size=(3, 1),
            second_kernel_size=(31, 1),
            padding="LEFT",
            dropout=hparams.relu_dropout)
    elif ffn_layer == "sru":
        return common_layers.sru(x)
    elif ffn_layer == "local_moe_tpu":
        overhead = hparams.moe_overhead_eval
        if hparams.mode == tf.estimator.ModeKeys.TRAIN:
            overhead = hparams.moe_overhead_train
        ret, loss = expert_utils.local_moe_tpu(
            x,
            hparams.d_ff // 2,
            hparams.model_d,
            hparams.moe_num_experts,
            overhead=overhead,
            loss_coef=hparams.moe_loss_coef)
        losses.append(loss)
        return ret
    elif ffn_layer == "local_moe":
        overhead = hparams.moe_overhead_eval
        if hparams.mode == tf.estimator.ModeKeys.TRAIN:
            overhead = hparams.moe_overhead_train
        ret, loss = expert_utils.local_moe(
            x,
            True,
            expert_utils.ffn_expert_fn(hparams.model_d, [hparams.d_ff],
                                       hparams.model_d),
            hparams.moe_num_experts,
            k=hparams.moe_k,
            hparams=hparams)
        losses.append(loss)
        return ret
    else:
        assert ffn_layer == "none"
        return x
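# Hypothetical call sketch (not from the original file): with
# ffn_layer="dense_relu_dense" the layer reduces to a position-wise
# dense -> relu -> dense block. The namespace below mirrors only the hparams
# attributes read above; values are illustrative.
def _example_ffn_call():
    from types import SimpleNamespace
    hparams = SimpleNamespace(
        ffn_layer="dense_relu_dense", d_ff=2048, model_d=512,
        relu_dropout=0.1, relu_dropout_broadcast_dims="")
    x = tf.zeros([8, 32, 512])  # [batch, length, model_d]
    return transformer_ffn_layer(x, hparams)  # -> [8, 32, 512]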
def resize(x):
    x["user_id"] = tf.squeeze(x["user_id"], axis=[-1])
    x["item_id"] = tf.squeeze(x["item_id"], axis=[-1])
    return x
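# Hypothetical usage sketch (not part of the original code): `resize` drops
# the trailing singleton dimension from batched id features inside a tf.data
# pipeline. Names and values are illustrative.
def _example_resize_usage():
    ds = tf.data.Dataset.from_tensor_slices(
        {"user_id": [[1], [2]], "item_id": [[7], [8]]})
    # After mapping, each element carries scalar user/item ids.
    return ds.map(resize)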
def crop_mask_in_target_box(masks, boxes, target_boxes, output_size):
    """Crop masks in target boxes.

    Args:
      masks: A tensor with a shape of [batch_size, num_masks, height, width].
      boxes: a float tensor representing box coordinates that tightly enclose
        masks with a shape of [batch_size, num_masks, 4] in un-normalized
        coordinates. A box is represented by [ymin, xmin, ymax, xmax].
      target_boxes: a float tensor representing target box coordinates for
        masks with a shape of [batch_size, num_masks, 4] in un-normalized
        coordinates. A box is represented by [ymin, xmin, ymax, xmax].
      output_size: A scalar to indicate the output crop size. It currently
        only supports square output crops.

    Returns:
      A 4-D tensor representing feature crop of shape
      [batch_size, num_boxes, output_size, output_size].
    """
    with tf.name_scope('crop_mask_in_target_box'):
        batch_size, num_masks, height, width = masks.get_shape().as_list()
        masks = tf.reshape(masks, [batch_size * num_masks, height, width, 1])
        # Pad zeros on the boundary of masks.
        masks = tf.image.pad_to_bounding_box(masks, 2, 2, height + 4,
                                             width + 4)
        masks = tf.reshape(masks,
                           [batch_size, num_masks, height + 4, width + 4, 1])

        # Projects target box locations and sizes to corresponding cropped
        # mask coordinates.
        gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
            value=boxes, num_or_size_splits=4, axis=2)
        bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
            value=target_boxes, num_or_size_splits=4, axis=2)
        y_transform = (bb_y_min - gt_y_min) * height / (
            gt_y_max - gt_y_min + _EPSILON) + 2
        x_transform = (bb_x_min - gt_x_min) * width / (
            gt_x_max - gt_x_min + _EPSILON) + 2
        h_transform = (bb_y_max - bb_y_min) * height / (
            gt_y_max - gt_y_min + _EPSILON)
        w_transform = (bb_x_max - bb_x_min) * width / (
            gt_x_max - gt_x_min + _EPSILON)

        boundaries = tf.concat(
            [tf.to_float(tf.ones_like(y_transform) * ((height + 4) - 1)),
             tf.to_float(tf.ones_like(x_transform) * ((width + 4) - 1))],
            axis=-1)

        # Reshape tensors to have the right shape for
        # selective_crop_and_resize.
        transformed_boxes = tf.concat(
            [y_transform, x_transform, h_transform, w_transform], -1)
        levels = tf.tile(tf.reshape(tf.range(num_masks), [1, num_masks]),
                         [batch_size, 1])

        cropped_masks = selective_crop_and_resize(
            masks,
            transformed_boxes,
            levels,
            boundaries,
            output_size,
            sample_offset=0)
        cropped_masks = tf.squeeze(cropped_masks, axis=-1)

    return cropped_masks
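# Hypothetical shape sketch (illustrative values only): re-crop 28x28
# instance masks from their tight boxes into slightly shifted target boxes.
def _example_mask_recrop():
    masks = tf.random.uniform([2, 5, 28, 28])  # [batch, num_masks, H, W]
    boxes = tf.tile(tf.constant([[[4., 4., 24., 24.]]]), [2, 5, 1])
    target_boxes = boxes + 2.0  # same [ymin, xmin, ymax, xmax] format
    # Result shape: [2, 5, 28, 28].
    return crop_mask_in_target_box(masks, boxes, target_boxes, output_size=28)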
def _update_block_mask(self, weights, threshold, mask):
    """Performs block-granular masking of the weights.

    Block pruning occurs only if the block_height or block_width is > 1 and
    if the weight tensor, when squeezed, has ndims = 2. Otherwise, elementwise
    pruning occurs.

    Args:
      weights: The weight tensor that needs to be masked.
      threshold: The current threshold value. The function will compute a new
        threshold and return the exponential moving average using the current
        value of threshold.
      mask: The mask from the previous pruning update.

    Returns:
      new_threshold: The new value of the threshold based on weights, and
        sparsity at the current global_step.
      new_mask: A tensor of the same size and shape as weights containing
        0 or 1 to indicate which of the values in weights falls below the
        threshold.

    Raises:
      ValueError: if block pooling function is not AVG or MAX.
    """
    squeezed_weights = tf.squeeze(weights)
    if squeezed_weights.get_shape().ndims != 2 or self._block_dim == [1, 1]:
        if self._pruning_method == 'threshold':
            return self._update_mask(weights, threshold)
        # 'random_cumulative' removes at random, taking previous random
        # modifications into account; 'random_independent' simply removes at
        # random.
        elif self._pruning_method in ['random_independent',
                                      'random_cumulative']:
            return self._update_random_mask(weights, mask)
        else:
            raise ValueError('Unknown pruning method: %s' %
                             self._pruning_method)

    if self._block_pooling_function not in ['AVG', 'MAX']:
        raise ValueError('Unknown pooling function for block sparsity: %s' %
                         self._block_pooling_function)

    with tf.name_scope(weights.op.name + '_pruning_ops'):
        abs_weights = tf.abs(squeezed_weights)

        pool_window = [self._block_dim[0], self._block_dim[1]]
        pool_fn = pruning_utils.factorized_pool

        if not self._use_tpu:
            pool_fn = tf.pool
            abs_weights = tf.reshape(
                abs_weights,
                [1, abs_weights.get_shape()[0], abs_weights.get_shape()[1], 1])

        pooled_weights = pool_fn(
            abs_weights,
            window_shape=pool_window,
            pooling_type=self._block_pooling_function,
            strides=pool_window,
            padding='SAME',
            name=weights.op.name + '_pooled')

        if pooled_weights.get_shape().ndims != 2:
            pooled_weights = tf.squeeze(pooled_weights)

        if self._pruning_method == 'threshold':
            smoothed_threshold, new_mask = self._update_mask(
                pooled_weights, threshold)
        elif self._pruning_method in ['random_independent',
                                      'random_cumulative']:
            smoothed_threshold, new_mask = self._update_random_mask(
                pooled_weights, mask)
        else:
            raise ValueError('Unknown pruning method: %s' %
                             self._pruning_method)

        # Expand the block-level mask back to the full weight shape.
        updated_mask = pruning_utils.kronecker_product(
            new_mask, tf.ones(self._block_dim))
        sliced_mask = tf.slice(
            updated_mask, [0, 0],
            [squeezed_weights.get_shape()[0], squeezed_weights.get_shape()[1]])

    return smoothed_threshold, tf.reshape(sliced_mask, tf.shape(weights))
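# Worked sketch of the block-mask expansion step above (standalone, no class
# state): growing a block-level mask back to the weight shape is what
# pruning_utils.kronecker_product(new_mask, tf.ones(block_dim)) computes.
def _example_block_mask_expansion():
    block_mask = tf.constant([[1., 0.], [0., 1.]])  # one entry per 2x2 block
    # Equivalent to a Kronecker product with tf.ones([2, 2]): every block
    # entry is replicated into a 2x2 tile, giving a 4x4 elementwise mask.
    expanded = tf.reshape(
        tf.tile(block_mask[:, None, :, None], [1, 2, 1, 2]), [4, 4])
    return expanded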
def call(self,
         inputs,
         training=True,
         features_only=None,
         pooled_features_only=False):
    """Implementation of call().

    Args:
      inputs: input tensors.
      training: boolean, whether the model is constructed for training.
      features_only: build the base feature network only.
      pooled_features_only: build the base network for features extraction
        (after 1x1 conv layer and global pooling, but before dropout and fc
        head).

    Returns:
      output tensors.
    """
    outputs = None
    self.endpoints = {}
    reduction_idx = 0

    # Calls Stem layers.
    with tf.name_scope('stem'):
        outputs = self._relu_fn(
            self._bn0(self._conv_stem(inputs), training=training))
    logging.info('Built stem layers with output shape: %s', outputs.shape)
    self.endpoints['stem'] = outputs

    # Calls blocks.
    for idx, block in enumerate(self._blocks):
        is_reduction = False  # reduction flag for blocks after the stem layer
        # If the first block has super-pixel (space-to-depth) layer, then stem
        # is the first reduction point.
        if (block.block_args().super_pixel == 1 and idx == 0):
            reduction_idx += 1
            self.endpoints['reduction_%s' % reduction_idx] = outputs
        elif ((idx == len(self._blocks) - 1) or
              self._blocks[idx + 1].block_args().strides[0] > 1):
            is_reduction = True
            reduction_idx += 1

        with tf.name_scope('blocks_%s' % idx):
            survival_prob = self._global_params.survival_prob
            if survival_prob:
                drop_rate = 1.0 - survival_prob
                survival_prob = 1.0 - drop_rate * float(idx) / len(self._blocks)
                logging.info('block_%s survival_prob: %s', idx, survival_prob)
            outputs = block.call(
                outputs, training=training, survival_prob=survival_prob)
            self.endpoints['block_%s' % idx] = outputs
            if is_reduction:
                self.endpoints['reduction_%s' % reduction_idx] = outputs
            if block.endpoints:
                for k, v in six.iteritems(block.endpoints):
                    self.endpoints['block_%s/%s' % (idx, k)] = v
                    if is_reduction:
                        self.endpoints['reduction_%s/%s' %
                                       (reduction_idx, k)] = v
    self.endpoints['features'] = outputs

    if not features_only:
        # Calls final layers and returns logits.
        with tf.name_scope('head'):
            outputs = self._relu_fn(
                self._bn1(self._conv_head(outputs), training=training))
            self.endpoints['head_1x1'] = outputs

            if self._global_params.local_pooling:
                shape = outputs.get_shape().as_list()
                kernel_size = [
                    1, shape[self._spatial_dims[0]],
                    shape[self._spatial_dims[1]], 1
                ]
                outputs = tf.nn.avg_pool(
                    outputs, ksize=kernel_size, strides=[1, 1, 1, 1],
                    padding='VALID')
                self.endpoints['pooled_features'] = outputs
                if not pooled_features_only:
                    if self._dropout:
                        outputs = self._dropout(outputs, training=training)
                    self.endpoints['global_pool'] = outputs
                    if self._fc:
                        outputs = tf.squeeze(outputs, self._spatial_dims)
                        outputs = self._fc(outputs)
                    self.endpoints['head'] = outputs
            else:
                outputs = self._avg_pooling(outputs)
                self.endpoints['pooled_features'] = outputs
                if not pooled_features_only:
                    if self._dropout:
                        outputs = self._dropout(outputs, training=training)
                    self.endpoints['global_pool'] = outputs
                    if self._fc:
                        outputs = self._fc(outputs)
                    self.endpoints['head'] = outputs
    return outputs
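# Hypothetical usage sketch (model construction elided; `model` is an
# instance of the class defining call() above): the same entry point serves
# classification, feature extraction, and pooled-feature extraction.
def _example_efficientnet_heads(model):
    images = tf.zeros([1, 224, 224, 3])
    logits = model.call(images, training=False)  # full head
    feats = model.call(images, training=False, features_only=True)
    pooled = model.call(images, training=False, pooled_features_only=True)
    # Intermediate activations live in model.endpoints,
    # e.g. model.endpoints['reduction_3'].
    return logits, feats, pooled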
def position_sensitive_crop_regions(image,
                                    boxes,
                                    crop_size,
                                    num_spatial_bins,
                                    global_pool):
    """Position-sensitive crop and pool rectangular regions from a feature grid.

    The output crops are split into `spatial_bins_y` vertical bins and
    `spatial_bins_x` horizontal bins. For each intersection of a vertical and
    a horizontal bin the output values are gathered by performing
    `tf.image.crop_and_resize` (bilinear resampling) on a separate subset of
    channels of the image. This reduces `depth` by a factor of
    `(spatial_bins_y * spatial_bins_x)`.

    When global_pool is True, this function implements a differentiable
    version of position-sensitive RoI pooling used in
    [R-FCN detection system](https://arxiv.org/abs/1605.06409).

    When global_pool is False, this function implements a differentiable
    version of position-sensitive assembling operation used in
    [instance FCN](https://arxiv.org/abs/1603.08678).

    Args:
      image: A `Tensor`. Must be one of the following types: `uint8`, `int8`,
        `int16`, `int32`, `int64`, `half`, `float32`, `float64`. A 3-D tensor
        of shape `[image_height, image_width, depth]`. Both `image_height`
        and `image_width` need to be positive.
      boxes: A `Tensor` of type `float32`. A 2-D tensor of shape
        `[num_boxes, 4]`. Each box is specified in normalized coordinates
        `[y1, x1, y2, x2]`. A normalized coordinate value of `y` is mapped to
        the image coordinate at `y * (image_height - 1)`, so the `[0, 1]`
        interval of normalized image height is mapped to
        `[0, image_height - 1]` in image height coordinates. We do allow
        y1 > y2, in which case the sampled crop is an up-down flipped version
        of the original image. The width dimension is treated similarly.
      crop_size: A list of two integers `[crop_height, crop_width]`. All
        cropped image patches are resized to this size. The aspect ratio of
        the image content is not preserved. Both `crop_height` and
        `crop_width` need to be positive.
      num_spatial_bins: A list of two integers
        `[spatial_bins_y, spatial_bins_x]`. Represents the number of
        position-sensitive bins in y and x directions. Both values should be
        >= 1. `crop_height` should be divisible by `spatial_bins_y`, and
        similarly for width. The number of image channels should be divisible
        by (spatial_bins_y * spatial_bins_x). Suggested value from R-FCN
        paper: [3, 3].
      global_pool: A boolean variable. If True, we perform average global
        pooling on the features assembled from the position-sensitive score
        maps. If False, we keep the position-pooled features without global
        pooling over the spatial coordinates. Note that using
        global_pool=True is equivalent to but more efficient than running the
        function with global_pool=False and then performing global average
        pooling.

    Returns:
      position_sensitive_features: A 4-D tensor of shape
        `[num_boxes, K, K, crop_channels]`, where
        `crop_channels = depth / (spatial_bins_y * spatial_bins_x)`, where
        K = 1 when global_pool is True (average-pooled cropped regions), and
        K = crop_size when global_pool is False.

    Raises:
      ValueError: Raised in four situations:
        `num_spatial_bins` is not >= 1;
        `num_spatial_bins` does not divide `crop_size`;
        `(spatial_bins_y * spatial_bins_x)` does not divide `depth`;
        `bin_crop_size` is not square when global_pool=False due to the
        constraint in function space_to_depth.
    """
    total_bins = 1
    bin_crop_size = []

    for (num_bins, crop_dim) in zip(num_spatial_bins, crop_size):
        if num_bins < 1:
            raise ValueError('num_spatial_bins should be >= 1')

        if crop_dim % num_bins != 0:
            raise ValueError('crop_size should be divisible by '
                             'num_spatial_bins')

        total_bins *= num_bins
        bin_crop_size.append(crop_dim // num_bins)

    if not global_pool and bin_crop_size[0] != bin_crop_size[1]:
        raise ValueError('Only support square bin crop size for now.')

    ymin, xmin, ymax, xmax = tf.unstack(boxes, axis=1)
    spatial_bins_y, spatial_bins_x = num_spatial_bins

    # Split each box into spatial_bins_y * spatial_bins_x bins.
    position_sensitive_boxes = []
    for bin_y in range(spatial_bins_y):
        step_y = (ymax - ymin) / spatial_bins_y
        for bin_x in range(spatial_bins_x):
            step_x = (xmax - xmin) / spatial_bins_x
            box_coordinates = [
                ymin + bin_y * step_y,
                xmin + bin_x * step_x,
                ymin + (bin_y + 1) * step_y,
                xmin + (bin_x + 1) * step_x,
            ]
            position_sensitive_boxes.append(
                tf.stack(box_coordinates, axis=1))

    image_splits = tf.split(value=image, num_or_size_splits=total_bins, axis=2)

    image_crops = []
    for (split, box) in zip(image_splits, position_sensitive_boxes):
        if split.shape.is_fully_defined() and box.shape.is_fully_defined():
            crop = tf.squeeze(
                matmul_crop_and_resize(
                    tf.expand_dims(split, axis=0),
                    tf.expand_dims(box, axis=0),
                    bin_crop_size),
                axis=0)
        else:
            crop = tf.image.crop_and_resize(
                tf.expand_dims(split, 0), box,
                tf.zeros(tf.shape(boxes)[0], dtype=tf.int32), bin_crop_size)
        image_crops.append(crop)

    if global_pool:
        # Average over all bins.
        position_sensitive_features = tf.add_n(image_crops) / len(image_crops)
        # Then average over spatial positions within the bins.
        position_sensitive_features = tf.reduce_mean(
            position_sensitive_features, [1, 2], keepdims=True)
    else:
        # Reorder height/width to depth channel.
        block_size = bin_crop_size[0]
        if block_size >= 2:
            image_crops = [
                tf.space_to_depth(crop, block_size=block_size)
                for crop in image_crops
            ]

        # Pack image_crops so that first dimension is for position-sensitive
        # boxes.
        position_sensitive_features = tf.stack(image_crops, axis=0)

        # Unroll the position-sensitive boxes to spatial positions.
        position_sensitive_features = tf.squeeze(
            tf.batch_to_space_nd(
                position_sensitive_features,
                block_shape=[1] + num_spatial_bins,
                crops=tf.zeros((3, 2), dtype=tf.int32)),
            axis=[0])

        # Reorder back the depth channel.
        if block_size >= 2:
            position_sensitive_features = tf.depth_to_space(
                position_sensitive_features, block_size=block_size)

    return position_sensitive_features
def __init__(self,
             config,
             is_training,
             input_ids,
             input_mask=None,
             attention_mask=None,
             token_weights=None,
             custom_attention_layer=None,
             token_type_ids=None,
             extra_embeddings=None,
             use_position_embeddings=True,
             reset_position_index_per_cell=False,
             scope=None):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model.
        Controls whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      attention_mask: (optional) float32 Tensor of shape
        [batch_size, seq_length, seq_length].
      token_weights: (optional) float32 Tensor of shape
        [batch_size, seq_length] in [0, 1].
      custom_attention_layer: (optional) function with the same signature as
        `attention_layer` in order to replace it for sparse alternatives.
      token_type_ids: (optional) nested structure of int32 Tensors of shape
        [batch_size, seq_length].
      extra_embeddings: (optional) float32 Tensor of shape
        [batch_size, seq_len, embedding_dim]. Additional embeddings
        concatenated with all the other embeddings.
      use_position_embeddings: (optional) bool. Whether to use position
        embeddings.
      reset_position_index_per_cell: bool. Whether to restart position index
        when a new cell starts.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
        input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
    if token_weights is not None:
        input_mask = token_weights * tf.cast(input_mask, dtype=tf.float32)

    if token_type_ids is None:
        token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                  dtype=tf.int32)

    with tf.variable_scope(scope, default_name="bert"):
        with tf.variable_scope("embeddings"):
            # Perform embedding lookup on the word ids.
            (self.embedding_output, self.embedding_table) = embedding_lookup(
                input_ids=input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings")

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = embedding_postprocessor(
                input_tensor=self.embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=use_position_embeddings,
                reset_position_index_per_cell=reset_position_index_per_cell,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                extra_embeddings=extra_embeddings,
                dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope("encoder"):
            # This converts a 2D mask of shape [batch_size, seq_length] to a
            # 3D mask of shape [batch_size, seq_length, seq_length] which is
            # used for the attention scores.
            if attention_mask is None:
                attention_mask = create_attention_mask_from_input_mask(
                    input_ids, input_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers, self.all_attention_probs = transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                custom_attention_layer=custom_attention_layer,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True,
                do_return_attention_probs=True,
                softmax_temperature=config.softmax_temperature)

        self.sequence_output = self.all_encoder_layers[-1]
        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.variable_scope("pooler"):
            # We "pool" the model by simply taking the hidden state
            # corresponding to the first token. We assume that this has been
            # pre-trained.
            first_token_tensor = tf.squeeze(
                self.sequence_output[:, 0:1, :], axis=1)
            self.pooled_output = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=create_initializer(
                    config.initializer_range))
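# Hypothetical usage sketch (standard BERT-style invocation; `BertModel` is
# assumed to be the class owning the constructor above, and `config` a
# BertConfig-like object built elsewhere):
def _example_bert_forward(config):
    input_ids = tf.placeholder(tf.int32, shape=[None, 128])
    model = BertModel(config=config, is_training=False, input_ids=input_ids)
    # pooled_output: [batch_size, hidden_size];
    # sequence_output: [batch_size, seq_length, hidden_size].
    return model.pooled_output, model.sequence_output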
def call(self, inputs):
    out = tf.squeeze(self.dense(inputs), axis=-1)
    return out
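# Minimal standalone sketch of the squeeze-head pattern above: a Dense(1)
# projection whose trailing singleton axis is dropped, giving one scalar per
# example. Class name and shapes are illustrative, not from the original.
class _ExampleScalarHead(tf.keras.layers.Layer):

    def __init__(self):
        super(_ExampleScalarHead, self).__init__()
        self.dense = tf.keras.layers.Dense(1)

    def call(self, inputs):
        # [batch, features] -> [batch, 1] -> [batch]
        return tf.squeeze(self.dense(inputs), axis=-1)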
def direction_net_translation(src_img, trt_img, rotation_gt, translation_gt,
                              fov_gt, rotation_pred, derotate_both=False):
    """Build the computation graph to train the DirectionNet-T.

    Args:
      src_img: [BATCH, HEIGHT, WIDTH, 3] input source images.
      trt_img: [BATCH, HEIGHT, WIDTH, 3] input target images.
      rotation_gt: [BATCH, 3, 3] ground truth rotation matrices.
      translation_gt: [BATCH, 3] ground truth translation directions.
      fov_gt: [BATCH] the ground truth field of view (degrees) of input
        images.
      rotation_pred: [BATCH, 3, 3] estimated rotations from DirectionNet-R.
      derotate_both: (bool) transform both input images to a middle frame by
        half the relative rotation between them to cancel out the rotation if
        true. Otherwise, only derotate the target image to the source image's
        frame.

    Returns:
      A collection of tensors including training ops, loss, and global step
      count.
    """
    net = model.DirectionNet(1)
    global_step = tf.train.get_or_create_global_step()

    perturbed_rotation = tf.cond(
        tf.less(tf.random_uniform([], 0, 1.0), 0.5),
        lambda: util.perturb_rotation(rotation_gt, [10., 5., 10.]),
        lambda: rotation_pred)

    (transformed_src, transformed_trt) = util.derotation(
        src_img,
        trt_img,
        perturbed_rotation,
        fov_gt,
        FLAGS.transformed_fov,
        [FLAGS.transformed_height, FLAGS.transformed_width],
        derotate_both)
    (transformed_src_gt, transformed_trt_gt) = util.derotation(
        src_img,
        trt_img,
        rotation_gt,
        fov_gt,
        FLAGS.transformed_fov,
        [FLAGS.transformed_height, FLAGS.transformed_width],
        derotate_both)

    half_derotation = util.half_rotation(perturbed_rotation)
    translation_gt = tf.squeeze(
        tf.matmul(half_derotation,
                  tf.expand_dims(translation_gt, -1),
                  transpose_a=True), -1)
    translation_gt = tf.expand_dims(translation_gt, 1)
    distribution_gt = util.spherical_normalization(
        util.von_mises_fisher(
            translation_gt,
            tf.constant(FLAGS.kappa, tf.float32),
            [FLAGS.distribution_height, FLAGS.distribution_width]),
        rectify=False)

    pred = net(transformed_src, transformed_trt, training=True)
    directions, expectation, distribution_pred = util.distributions_to_directions(
        pred)

    direction_loss = losses.direction_loss(directions, translation_gt)
    distribution_loss = tf.constant(
        FLAGS.alpha, tf.float32) * losses.distribution_loss(
            distribution_pred, distribution_gt)
    spread_loss = tf.cast(FLAGS.beta, tf.float32) * losses.spread_loss(
        expectation)
    direction_error = tf.reduce_mean(
        tf.acos(
            tf.clip_by_value(
                tf.reduce_sum(directions * translation_gt, -1), -1., 1.)))
    loss = direction_loss + distribution_loss + spread_loss

    tf.summary.scalar('loss', loss)
    tf.summary.scalar('distribution_loss', distribution_loss)
    tf.summary.scalar('spread_loss', spread_loss)
    tf.summary.scalar('direction_error',
                      util.radians_to_degrees(direction_error))
    tf.summary.image('distribution/translation/ground_truth',
                     distribution_gt, max_outputs=4)
    tf.summary.image('distribution/translation/prediction',
                     distribution_pred, max_outputs=4)
    tf.summary.image('source_image', src_img, max_outputs=4)
    tf.summary.image('target_image', trt_img, max_outputs=4)
    tf.summary.image('transformed_source_image', transformed_src,
                     max_outputs=4)
    tf.summary.image('transformed_target_image', transformed_trt,
                     max_outputs=4)
    tf.summary.image('transformed_source_image_gt', transformed_src_gt,
                     max_outputs=4)
    tf.summary.image('transformed_target_image_gt', transformed_trt_gt,
                     max_outputs=4)

    optimizer = tf.train.GradientDescentOptimizer(FLAGS.lr)
    train_op = optimizer.minimize(loss, global_step=global_step, name='train')
    update_op = net.updates
    return Computation(tf.group([train_op, update_op]), loss, global_step)
def train_step(self):

    def step_fn(inputs):
        """Step function.

        Args:
          inputs: inputs from data iterator

        Returns:
          a set of variables want to observe in Tensorboard
        """
        net = self.net
        (all_images, labels), (self.probe_images, self.probe_labels) = inputs
        assert len(all_images.shape) == 5
        images, self.aug_images = all_images[:, 0], all_images[:, 1]
        self.images, self.labels = images, labels
        batch_size = int(self.batch_size / self.strategy.num_replicas_in_sync)

        logits = net(images, name='model', reuse=tf.AUTO_REUSE, training=True)
        self.logits = logits

        # Other losses; initialized first so self.guessed_label is available
        # for the meta step.
        xe_loss, cs_loss = self.unsupervised_loss()

        # Meta optimization.
        weight, eps, meta_loss, meta_acc = self.meta_optimize()

        # Losses w.r.t. the new weight and loss.
        onehot_labels = tf.one_hot(labels, self.dataset.num_classes)
        onehot_labels = tf.cast(onehot_labels, tf.float32)
        eps_k = tf.reshape(eps, [batch_size, 1])

        mixed_labels = tf.math.add(
            eps_k * onehot_labels, (1 - eps_k) * self.guessed_label,
            name='mixed_labels')
        net_cost = tf.losses.softmax_cross_entropy(
            mixed_labels, logits, reduction=tf.losses.Reduction.NONE)
        # Loss with initial weight.
        net_loss1 = tf.reduce_mean(net_cost)
        # Loss with initial eps.
        init_eps = tf.constant([FLAGS.grad_eps_init] * batch_size,
                               dtype=tf.float32)
        init_eps = tf.reshape(init_eps, (-1, 1))
        init_mixed_labels = tf.math.add(
            init_eps * onehot_labels, (1 - init_eps) * self.guessed_label,
            name='init_mixed_labels')

        net_cost2 = tf.losses.softmax_cross_entropy(
            init_mixed_labels, logits, reduction=tf.losses.Reduction.NONE)
        net_loss2 = tf.reduce_sum(tf.math.multiply(net_cost2, weight))

        net_loss = (net_loss1 + net_loss2) / 2
        net_loss = net_loss + tf.add_n([xe_loss, cs_loss])
        net_loss += net.regularization_loss
        net_loss /= self.strategy.num_replicas_in_sync  # rescale by gpus

        with tf.control_dependencies(net.updates):
            net_grads = tf.gradients(net_loss, net.trainable_variables)
            minimizer_op = self.optimizer.apply_gradients(
                zip(net_grads, net.trainable_variables),
                global_step=self.global_step)

        with tf.control_dependencies([minimizer_op]):
            train_op = self.ema.apply(net.trainable_variables)

        acc_op, acc_update_op = self.acc_func(labels,
                                              tf.argmax(logits, axis=1))

        with tf.control_dependencies([train_op, acc_update_op]):
            return (tf.identity(net_loss), tf.identity(xe_loss),
                    tf.identity(cs_loss), tf.identity(meta_loss),
                    tf.identity(meta_acc), tf.identity(acc_op),
                    tf.identity(weight), tf.identity(labels))

    # End of parallel.
    (pr_net_loss, pr_xe_loss, pr_cs_loss, pr_metaloss, pr_metaacc, pr_acc,
     pr_weight, pr_labels) = self.strategy.run(
         step_fn,
         args=((next(self.train_input_iterator),
                next(self.probe_input_iterator)),))

    # Collect device variables.
    weights = self.strategy.unwrap(pr_weight)
    weights = tf.concat(weights, axis=0)
    labels = self.strategy.unwrap(pr_labels)
    labels = tf.concat(labels, axis=0)

    mean_acc = self.strategy.reduce(tf.distribute.ReduceOp.MEAN, pr_acc)
    mean_metaacc = self.strategy.reduce(tf.distribute.ReduceOp.MEAN,
                                        pr_metaacc)
    net_loss = self.strategy.reduce(tf.distribute.ReduceOp.MEAN, pr_net_loss)
    xe_loss = self.strategy.reduce(tf.distribute.ReduceOp.MEAN, pr_xe_loss)
    cs_loss = self.strategy.reduce(tf.distribute.ReduceOp.MEAN, pr_cs_loss)
    meta_loss = self.strategy.reduce(tf.distribute.ReduceOp.MEAN, pr_metaloss)

    # The following add variables for tensorboard visualization.
    merges = []
    merges.append(tf.summary.scalar('acc/train', mean_acc))
    merges.append(tf.summary.scalar('loss/xemin', xe_loss))
    merges.append(tf.summary.scalar('loss/consistency', cs_loss))
    merges.append(tf.summary.scalar('loss/net', net_loss))
    merges.append(tf.summary.scalar('loss/meta', meta_loss))
    merges.append(tf.summary.scalar('acc/meta', mean_metaacc))

    zw_inds = tf.squeeze(
        tf.where(tf.less_equal(weights, 0), name='zero_weight_index'))
    merges.append(
        tf.summary.scalar(
            'weights/zeroratio',
            tf.math.divide(
                tf.cast(tf.size(zw_inds), tf.float32),
                tf.cast(tf.size(weights), tf.float32))))

    self.epoch_var = tf.cast(
        self.global_step / self.iter_epoch, tf.float32, name='epoch')
    merges.append(tf.summary.scalar('epoch', self.epoch_var))
    merges.append(tf.summary.scalar('learningrate', self.learning_rate))
    summary = tf.summary.merge(merges)

    return [
        net_loss, meta_loss, xe_loss, cs_loss, mean_acc, mean_metaacc,
        summary, weights
    ]
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""
    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf.logging.info("  name = %s, shape = %s" %
                        (name, features[name].shape))

    input_ids = features["input_ids"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    model = GroverModel(
        config=config,
        is_training=is_training,
        input_ids=input_ids,
        pad_token_id=config.pad_token_id,
        chop_off_last_token=True,
    )

    total_loss = model.lm_loss()

    if is_training:
        train_op, train_metrics = optimization_adafactor.create_optimizer(
            total_loss, learning_rate, num_train_steps, num_warmup_steps,
            use_tpu)
        tvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    else:
        train_op = None
        train_metrics = {}
        tvars = tf.trainable_variables()

    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        (assignment_map,
         initialized_variable_names) = get_assignment_map_from_checkpoint(
             tvars, init_checkpoint)
        if use_tpu:

            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                        init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        if use_tpu:
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                host_call=construct_scalar_host_call(
                    metric_dict=train_metrics,
                    model_dir=params['model_dir'],
                    prefix='training/'),
                scaffold_fn=scaffold_fn)
        else:
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                training_hooks=[
                    tf.train.LoggingTensorHook(
                        {'loss': tf.metrics.mean(total_loss)[1]},
                        every_n_iter=100)
                ],
                scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(total_loss):
            loss = tf.metrics.mean(values=total_loss)
            return {
                "eval_loss": loss,
            }

        eval_metrics = (metric_fn, [total_loss])
        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=eval_metrics,
            scaffold_fn=scaffold_fn)
    else:
        gt_logprobs = tf.squeeze(
            tf.batch_gather(model.log_probs, model.target_ids[:, :, None]),
            axis=2)

        # `top_p_required` is the probability mass assigned to tokens scored
        # above the ground-truth token, i.e. the top-p threshold needed for
        # top-p sampling to include the ground truth.
        better_than_gt = model.log_probs > gt_logprobs[:, :, None]
        top_p_required = tf.reduce_sum(
            tf.cast(better_than_gt, tf.float32) * tf.exp(model.log_probs),
            axis=2)

        # No top-p sampling for now, since this seems to be too slow on TPUs.
        if use_tpu:
            predictions = tf.reshape(
                tf.random.categorical(logits=model.logits_flat,
                                      num_samples=1),
                get_shape_list(model.target_ids),
            )
        else:
            # Argmax:
            # predictions = tf.math.argmax(model.log_probs, axis=-1,
            #                              output_type=tf.int32)
            predictions = tf.reshape(
                _top_p_sample(model.logits_flat, num_samples=1,
                              p=0.99)['sample'],
                get_shape_list(model.target_ids),
            )
        pred_logprobs = tf.squeeze(
            tf.batch_gather(model.log_probs, predictions[:, :, None]), axis=2)

        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            predictions={
                'gt_logprobs': gt_logprobs,
                'top_p_required': top_p_required,
                'predictions': predictions,
                'pred_logprobs': pred_logprobs,
                'labels': input_ids
            },
            scaffold_fn=scaffold_fn)
    return output_spec
def _slow_greedy_infer_guess_and_check(self, features, decode_length):
    assert self._hparams.block_size > 0
    assert self._hparams.force_full_predict
    assert self._hparams.sampling_method == "argmax"
    assert self._decode_hparams.batch_size == 1
    assert self._decode_hparams.block_size > 0
    assert self._decode_hparams.block_size <= self._hparams.block_size
    assert self._decode_hparams.guess_and_check_top_k > 0

    inputs_old = features["inputs"]
    assert "targets" not in features
    assert len(features["inputs"].shape) in [3, 4]
    if len(features["inputs"].shape) < 4:
        features["inputs"] = tf.expand_dims(features["inputs"], 2)

    block_size = self._decode_hparams.block_size
    decode_length += tf.shape(features["inputs"])[1]

    def while_exit_cond(result, length):  # pylint: disable=unused-argument
        return tf.logical_and(
            length < decode_length,
            tf.reduce_all(
                tf.not_equal(result[:, :length, :, :], text_encoder.EOS_ID)))

    def infer_step(result, length):
        """Inference step."""

        def print_info(result, length, new_length):
            vocab = self.problem_hparams.vocabulary["targets"]
            tf.logging.info(
                "length=%s new_length=%s length_diff=%s new_suffix=%s",
                length,
                new_length,
                new_length - length,
                str([
                    vocab._subtoken_id_to_subtoken_string(index)  # pylint: disable=protected-access
                    for index in result[0, -block_size:, 0,
                                        0][:new_length - length]
                ]).decode("unicode-escape"),
            )

        features["targets"] = tf.pad(result,
                                     [[0, 0], [0, 1], [0, 0], [0, 0]])
        samples, logits, losses = self.sample(features)  # pylint: disable=unused-variable

        _, top_k_indices = tf.nn.top_k(
            logits[:, :-1, :1, :, :],
            k=self._decode_hparams.guess_and_check_top_k)
        in_top_k = tf.reduce_any(
            tf.equal(tf.to_int64(top_k_indices), tf.expand_dims(result, 4)),
            axis=4)

        eos_cumsum = tf.cumsum(
            tf.to_int32(tf.equal(result, text_encoder.EOS_ID)), axis=1)
        after_eos = tf.greater(common_layers.shift_right(eos_cumsum), 0)

        correct = tf.logical_and(in_top_k, tf.logical_not(after_eos))
        correct_cumsum = tf.cumsum(tf.to_int32(correct), axis=1)
        perfect_cumsum = 1 + tf.range(tf.shape(correct)[1])
        for axis in [0, 2, 3]:
            perfect_cumsum = tf.expand_dims(perfect_cumsum, axis=axis)

        new_length = tf.reduce_sum(
            tf.to_int32(tf.equal(correct_cumsum, perfect_cumsum)), axis=1)
        new_length = tf.squeeze(new_length, axis=[0, 1, 2])
        new_length = tf.minimum(new_length, decode_length)

        new_result = tf.concat(
            [result[:, :new_length, :, :],
             tf.reshape(samples[:, new_length, :block_size, :],
                        [1, block_size, 1, 1])],
            axis=1)

        with tf.control_dependencies(
            [tf.py_func(print_info, [result, length, new_length], [])]):
            new_result = tf.identity(new_result)

        return new_result, new_length

    result = tf.zeros((1, 0, 1, 1), dtype=tf.int64)
    length = tf.squeeze(tf.zeros(1, dtype=tf.int32))

    result, length = tf.while_loop(
        while_exit_cond,
        infer_step, [result, length],
        shape_invariants=[
            tf.TensorShape([1, None, 1, 1]),
            tf.TensorShape([]),
        ],
        back_prop=False,
        parallel_iterations=1)

    result = result[:, :length, :, :]

    features["inputs"] = inputs_old

    return {
        "outputs": result,
        "scores": None,
    }
def mtf_model_fn(self, features, mesh):
    features = copy.copy(features)
    tf.logging.info("features = %s" % features)
    hparams = self._hparams
    activation_dtype = self.set_activation_type()
    is_training = hparams.mode == tf.estimator.ModeKeys.TRAIN

    # Declare all the dimensions.
    batch_dim = mtf.Dimension("batch", hparams.batch_size)
    hidden_dim = mtf.Dimension("hidden", hparams.hidden_size)
    filter_dim = mtf.Dimension("filters", hparams.filter_sizes[0])
    rows_dim = mtf.Dimension("rows_size", hparams.rows_size)
    cols_dim = mtf.Dimension("cols_size", hparams.cols_size)
    row_blocks_dim = mtf.Dimension("row_blocks", hparams.row_blocks)
    col_blocks_dim = mtf.Dimension("col_blocks", hparams.col_blocks)
    classes_dim = mtf.Dimension("classes", 10)
    channels_dim = mtf.Dimension("channels", 3)
    one_channel_dim = mtf.Dimension("one_channel", 1)

    inputs = features["inputs"]
    x = mtf.import_tf_tensor(
        mesh,
        tf.reshape(inputs, [
            hparams.batch_size,
            hparams.row_blocks,
            hparams.rows_size // hparams.row_blocks,
            hparams.col_blocks,
            hparams.num_channels * hparams.cols_size // hparams.col_blocks,
            hparams.num_channels
        ]),
        mtf.Shape([
            batch_dim, row_blocks_dim, rows_dim, col_blocks_dim, cols_dim,
            channels_dim
        ]))
    x = mtf.transpose(x, [
        batch_dim, row_blocks_dim, col_blocks_dim, rows_dim, cols_dim,
        channels_dim
    ])

    x = mtf.to_float(x)
    x = mtf.layers.conv2d_with_blocks(
        x,
        filter_dim,
        filter_size=[3, 3],
        strides=[1, 1],
        padding="SAME",
        h_blocks_dim=None,
        w_blocks_dim=col_blocks_dim,
        name="initial_filter")
    x = batch_norm_relu(x, is_training)

    # Conv blocks:
    # [block - strided block layer - strided block layer] x n
    for layer in range(hparams.num_layers):
        layer_name = "block_layer_%d" % layer
        with tf.variable_scope(layer_name):
            # Residual block layer.
            x = block_layer(
                inputs=x,
                filters=hparams.filter_sizes[0],
                blocks=hparams.layer_sizes[0],
                strides=[1, 1],
                is_training=is_training,
                name="block_layer1",
                row_blocks_dim=None,
                col_blocks_dim=None)
            x = block_layer(
                inputs=x,
                filters=hparams.filter_sizes[1],
                blocks=hparams.layer_sizes[1],
                strides=[1, 1],
                is_training=is_training,
                name="block_layer2",
                row_blocks_dim=None,
                col_blocks_dim=None)
            x = block_layer(
                inputs=x,
                filters=hparams.filter_sizes[2],
                blocks=hparams.layer_sizes[2],
                strides=[1, 1],
                is_training=is_training,
                name="block_layer3",
                row_blocks_dim=None,
                col_blocks_dim=None)

    # Calculate the logits and loss.
    out = x
    outputs = mtf.layers.dense(
        out,
        hidden_dim,
        reduced_dims=out.shape.dims[-5:],
        activation=mtf.relu,
        name="dense")

    # We assume fixed vocab size for targets.
    labels = tf.squeeze(tf.to_int32(features["targets"]), [2, 3])
    labels = mtf.import_tf_tensor(
        mesh, tf.reshape(labels, [hparams.batch_size]),
        mtf.Shape([batch_dim]))

    logits = mtf.layers.dense(outputs, classes_dim, name="logits")
    soft_targets = mtf.one_hot(labels, classes_dim, dtype=activation_dtype)
    loss = mtf.layers.softmax_cross_entropy_with_logits(
        logits, soft_targets, classes_dim)

    # Reshape logits so it doesn't break inside t2t.
    logits = mtf.reshape(
        logits, mtf.Shape([batch_dim, one_channel_dim, classes_dim]))
    loss = mtf.reduce_mean(loss)
    return logits, loss
def _plot(self, data, res, name=None):
    img = self._img(data)
    label = self._label(data)
    if label is not None:
        label_one_hot = tf.one_hot(label, depth=self._n_classes)

    _render_activations = functools.partial(  # pylint:disable=invalid-name
        plot.render_activations,
        height=int(img.shape[1]),
        pixels_per_caps=3,
        cmap='viridis')

    mass_explained_by_capsule = tf.reduce_sum(res.posterior_mixing_probs, 1)
    normalized_mass_explained_by_capsule = (
        mass_explained_by_capsule /
        tf.reduce_max(mass_explained_by_capsule, -1, keepdims=True))

    posterior_caps_activation = _render_activations(
        normalized_mass_explained_by_capsule)
    prior_caps_activation = _render_activations(res.caps_presence_prob)

    is_from_capsule = snt.BatchApply(_render_activations)(
        res.posterior_mixing_probs)

    green = res.top_down_rec
    rec_red = res.rec_mode
    rec_green = green.pdf.mode()

    flat_per_caps_rec = res.top_down_per_caps_rec.pdf.mode()
    shape = res.vote.shape[:2].concatenate(flat_per_caps_rec.shape[1:])
    per_caps_rec = tf.reshape(flat_per_caps_rec, shape)
    per_caps_rec = plot.concat_images(
        tf.unstack(per_caps_rec, axis=1), 1, vertical=False)

    one_image = tf.reduce_mean(
        self._img(data, self._prep), axis=-1, keepdims=True)
    one_rec = tf.reduce_mean(rec_red, axis=-1, keepdims=True)
    diff = tf.concat([one_image, one_rec, tf.zeros_like(one_image)], -1)

    used_templates = tf.reduce_mean(res.used_templates, axis=-1,
                                    keepdims=True)
    green_templates = tf.reduce_mean(
        green.transformed_templates, axis=-1, keepdims=True)
    templates = tf.concat(
        [used_templates, green_templates, tf.zeros_like(used_templates)], -1)
    templates = tf.concat(
        [templates, tf.ones_like(templates[:, :, :, :1]), is_from_capsule], 3)

    all_imgs = [
        img, rec_red, rec_green, diff, prior_caps_activation,
        tf.zeros_like(rec_red[:, :, :1]), posterior_caps_activation,
        per_caps_rec
    ] + list(tf.unstack(templates, axis=1))

    for i, img in enumerate(all_imgs):
        if img.shape[-1] == 1:
            all_imgs[i] = tf.image.grayscale_to_rgb(img)

    img_with_templates = plot.concat_images(all_imgs, 1, vertical=False)

    def render_corr(x, y):
        corr = abs(plot.correlation(x, y))
        rendered_corr = tf.expand_dims(_render_activations(corr), 0)
        return plot.concat_images(
            tf.unstack(rendered_corr, axis=1), 3, vertical=False)

    if label is not None:
        posterior_label_corr = render_corr(
            normalized_mass_explained_by_capsule, label_one_hot)
        prior_label_corr = render_corr(res.caps_presence_prob, label_one_hot)
        label_corr = plot.concat_images(
            [prior_label_corr, posterior_label_corr], 3, vertical=True)
    else:
        label_corr = tf.zeros_like(img)

    n_examples = min(int(shape[0]), 16)
    plot_params = dict(
        img_with_templates=dict(
            grid_height=n_examples,
            zoom=3.,
        ))

    templates = res.templates
    if len(templates.shape) == 5:
        if templates.shape[0] == 1:
            templates = tf.squeeze(templates, 0)
        else:
            templates = templates[:n_examples]
            templates = plot.concat_images(
                tf.unstack(templates, axis=1), 1, vertical=False)
            plot_params['templates'] = dict(grid_height=n_examples)

    plot_dict = dict(
        templates=templates,
        img_with_templates=img_with_templates[:n_examples],
        label_corr=label_corr,
    )

    return plot_dict, plot_params
def pc_encoder(point_cloud, nsamples, is_training, bn_decay=None):
    batch_size = point_cloud.get_shape()[0].value
    num_point = point_cloud.get_shape()[1].value
    point_dim = point_cloud.get_shape()[2].value

    with tf.variable_scope('transform_net1') as sc:
        transform = input_transform_net(point_cloud, is_training, bn_decay,
                                        K=3)
    point_cloud_transformed = tf.matmul(point_cloud, transform)
    point_cloud_transformed = tf.expand_dims(point_cloud_transformed, -1)
    nn_dis, idx_batch = tf_util.get_knn(point_cloud, 12)

    # Encoder
    net = tf_util.conv2d(point_cloud_transformed, 64, [1, point_dim],
                         padding='VALID', stride=[1, 1],
                         bn=True, is_training=is_training,
                         scope='conv1', bn_decay=bn_decay)
    net = tf_util.conv2d(net, 64, [1, 1],
                         padding='VALID', stride=[1, 1],
                         bn=True, is_training=is_training,
                         scope='conv2', bn_decay=bn_decay)
    point_feat_1 = tf_util.conv2d(net, 128, [1, 1],
                                  padding='VALID', stride=[1, 1],
                                  bn=True, is_training=is_training,
                                  scope='conv3', bn_decay=bn_decay)

    print('------------ convPN_1 ------------')
    point_feat = tf_util.conv2d(point_feat_1, 256, [1, 1],
                                padding='VALID', stride=[1, 1],
                                bn=True, is_training=is_training,
                                scope='conv4', bn_decay=bn_decay)
    point_feat = tf_util.conv2d(point_feat, 256, [1, 1],
                                padding='VALID', stride=[1, 1],
                                bn=True, is_training=is_training,
                                scope='conv5', bn_decay=bn_decay)
    feature = tf.squeeze(point_feat, axis=2)
    knn_feat = tf_util.cuda_maxpooling(feature, idx_batch)
    knn_feat = tf.expand_dims(knn_feat, axis=2)
    point_feat_2 = tf.concat([point_feat, knn_feat], axis=-1)  # 32 256 1 256

    print('------------ convPN_2 ------------')
    point_feat = tf_util.conv2d(point_feat_2, 256, [1, 1],
                                padding='VALID', stride=[1, 1],
                                bn=True, is_training=is_training,
                                scope='conv6', bn_decay=bn_decay)
    point_feat = tf_util.conv2d(point_feat, 256, [1, 1],
                                padding='VALID', stride=[1, 1],
                                bn=True, is_training=is_training,
                                scope='conv7', bn_decay=bn_decay)
    feature = tf.squeeze(point_feat, axis=2)
    knn_feat = tf_util.cuda_maxpooling(feature, idx_batch)
    knn_feat = tf.expand_dims(knn_feat, axis=2)
    point_feat_3 = tf.concat([point_feat, knn_feat], axis=-1)  # 32 256 1 512

    mix_feature = tf.concat([point_feat_1, point_feat_2, point_feat_3],
                            axis=-1)

    # ----------- maxpooling --------------
    global_feature = tf_util.max_pool2d(mix_feature, [num_point, 1],
                                        padding='VALID', scope='maxpool_1')
    net = tf.reshape(global_feature, [batch_size, -1])
    net = tf_util.fully_connected(net, 512, bn=True, is_training=is_training,
                                  scope='fc00', bn_decay=bn_decay)
    net = tf_util.fully_connected(net, 512, bn=True, is_training=is_training,
                                  scope='fc01', bn_decay=bn_decay)
    net = tf_util.fully_connected(net, 512, bn=True, is_training=is_training,
                                  scope='fc02', bn_decay=bn_decay)
    net = tf.reshape(net, [batch_size, -1])
    return net
def body(self, features):
    # Remove dropout if not training.
    hparams = self._hparams
    ps_devices = self._ps_devices
    assert hparams.num_model_shards % len(ps_devices) == 0
    shards_per_device = hparams.num_model_shards // len(ps_devices)
    model_devices = [
        ps_devices[i // shards_per_device]
        for i in range(hparams.num_model_shards)
    ]
    print("model_devices = %s" % model_devices)
    mp = expert_utils.Parallelism(model_devices, reuse=False)
    vocab_size = self._problem_hparams.vocabulary["targets"].vocab_size
    # Squeeze out channels, heights.
    targets = features["targets_raw"]
    targets = tf.squeeze(targets, 3)
    targets = tf.squeeze(targets, 2)
    shifted_targets = common_layers.shift_right_2d(targets)

    # Bypass the symbol modality and use a different embedding on each shard.
    decoder_input = mp(
        common_layers.embedding,
        shifted_targets,
        vocab_size,
        hparams.hidden_size,
        multiplier=hparams.hidden_size**0.5,
        symbol_dropout_rate=hparams.symbol_dropout)

    decoder_self_attention_bias = mp(
        common_attention.attention_bias_lower_triangle,
        tf.shape(targets)[1])
    if "targets_segmentation" in features:
        # "Packed" dataset - keep the examples from seeing each other.
        targets_segmentation = features["targets_segmentation"]
        targets_position = features["targets_position"]
        decoder_self_attention_bias = mp(
            tf.add, decoder_self_attention_bias,
            mp(common_attention.attention_bias_same_segment,
               targets_segmentation, targets_segmentation))
    else:
        targets_position = None

    if hparams.pos == "timing":
        if targets_position is None:
            decoder_input = mp(common_attention.add_timing_signal_1d,
                               decoder_input)
        else:
            decoder_input = mp(
                common_attention.add_timing_signal_1d_given_position,
                decoder_input, targets_position)

    decoder_input = mp(
        tf.nn.dropout, decoder_input,
        1.0 - hparams.layer_prepostprocess_dropout)
    decoder_output, extra_loss = _super_stack(
        decoder_input, decoder_self_attention_bias, hparams, mp)

    # Bypass the symbol modality and compute logits directly.
    # We compute a different set of logits on each shard, and sum them.
    logits = mp(tf.layers.dense, decoder_output, vocab_size, name="logits")
    logits = expert_utils.all_reduce_ring(logits, mp)
    logits = mp(tf.multiply, logits, mp.n**-0.5)

    # We now have identical logits on all shards.
    # Shard 0 gets returned to the estimator.
    logits_shard_0 = logits[0]
    logits_shard_0 = tf.expand_dims(logits_shard_0, 2)
    logits_shard_0 = tf.expand_dims(logits_shard_0, 3)

    # On each device, we compute the loss for a part of the batch.
    # This is faster than computing the whole loss on one shard.
    mp, logits = expert_utils.reduce_by_device(mp, logits, lambda l: l[0])

    def _loss_for_shard(logits, targets, shard):
        if mp.n > 1:
            logits = common_layers.approximate_split(logits, mp.n, 0)[shard]
            targets = common_layers.approximate_split(targets, mp.n, 0)[shard]
        return common_layers.padded_cross_entropy(
            logits, targets, hparams.label_smoothing)

    num, denom = mp(_loss_for_shard, logits, targets, range(mp.n))
    # Override training loss so that it is not computed externally.
    losses = {"training": tf.add_n(num) / tf.add_n(denom)}
    if extra_loss is not None:
        losses["extra"] = extra_loss
    return logits_shard_0, losses
def build():
  """Builds the TensorFlow graph."""
  inputs, labels, lengths = None, None, None

  if mode in ('train', 'eval'):
    if isinstance(no_event_label, numbers.Number):
      label_shape = []
    else:
      label_shape = [len(no_event_label)]
    inputs, labels, lengths = magenta.common.get_padded_batch(
        sequence_example_file_paths, hparams.batch_size, input_size,
        label_shape=label_shape, shuffle=mode == 'train')
  elif mode == 'generate':
    inputs = tf.placeholder(tf.float32,
                            [hparams.batch_size, None, input_size])

  if isinstance(encoder_decoder,
                magenta.music.OneHotIndexEventSequenceEncoderDecoder):
    expanded_inputs = tf.one_hot(
        tf.cast(tf.squeeze(inputs, axis=-1), tf.int64),
        encoder_decoder.input_depth)
  else:
    expanded_inputs = inputs

  dropout_keep_prob = 1.0 if mode == 'generate' else hparams.dropout_keep_prob

  cell = make_rnn_cell(
      hparams.rnn_layer_sizes,
      dropout_keep_prob=dropout_keep_prob,
      attn_length=hparams.attn_length,
      residual_connections=hparams.residual_connections)

  initial_state = cell.zero_state(hparams.batch_size, tf.float32)

  outputs, final_state = tf.nn.dynamic_rnn(
      cell, expanded_inputs, sequence_length=lengths,
      initial_state=initial_state, swap_memory=True)

  outputs_flat = magenta.common.flatten_maybe_padded_sequences(
      outputs, lengths)
  if isinstance(num_classes, numbers.Number):
    num_logits = num_classes
  else:
    num_logits = sum(num_classes)
  logits_flat = tf_slim.layers.linear(outputs_flat, num_logits)

  if mode in ('train', 'eval'):
    labels_flat = magenta.common.flatten_maybe_padded_sequences(
        labels, lengths)

    if isinstance(num_classes, numbers.Number):
      softmax_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=labels_flat, logits=logits_flat)
      predictions_flat = tf.argmax(logits_flat, axis=1)
    else:
      logits_offsets = np.cumsum([0] + num_classes)
      softmax_cross_entropy = []
      predictions = []
      for i in range(len(num_classes)):
        softmax_cross_entropy.append(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels_flat[:, i],
                logits=logits_flat[:, logits_offsets[i]:
                                   logits_offsets[i + 1]]))
        predictions.append(
            tf.argmax(
                logits_flat[:, logits_offsets[i]:logits_offsets[i + 1]],
                axis=1))
      predictions_flat = tf.stack(predictions, 1)

    correct_predictions = tf.to_float(
        tf.equal(labels_flat, predictions_flat))
    event_positions = tf.to_float(
        tf.not_equal(labels_flat, no_event_label))
    no_event_positions = tf.to_float(
        tf.equal(labels_flat, no_event_label))

    # Compute the total number of time steps across all sequences in the
    # batch. For some models this will be different from the number of RNN
    # steps.
    def batch_labels_to_num_steps(batch_labels, lengths):
      num_steps = 0
      for labels, length in zip(batch_labels, lengths):
        num_steps += encoder_decoder.labels_to_num_steps(labels[:length])
      return np.float32(num_steps)

    num_steps = tf.py_func(batch_labels_to_num_steps,
                           [labels, lengths], tf.float32)

    if mode == 'train':
      loss = tf.reduce_mean(softmax_cross_entropy)
      perplexity = tf.exp(loss)
      accuracy = tf.reduce_mean(correct_predictions)
      event_accuracy = (
          tf.reduce_sum(correct_predictions * event_positions) /
          tf.reduce_sum(event_positions))
      no_event_accuracy = (
          tf.reduce_sum(correct_predictions * no_event_positions) /
          tf.reduce_sum(no_event_positions))

      loss_per_step = tf.reduce_sum(softmax_cross_entropy) / num_steps
      perplexity_per_step = tf.exp(loss_per_step)

      optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)

      train_op = tf_slim.learning.create_train_op(
          loss, optimizer, clip_gradient_norm=hparams.clip_norm)
      tf.add_to_collection('train_op', train_op)

      vars_to_summarize = {
          'loss': loss,
          'metrics/perplexity': perplexity,
          'metrics/accuracy': accuracy,
          'metrics/event_accuracy': event_accuracy,
          'metrics/no_event_accuracy': no_event_accuracy,
          'metrics/loss_per_step': loss_per_step,
          'metrics/perplexity_per_step': perplexity_per_step,
      }
    elif mode == 'eval':
      vars_to_summarize, update_ops = tf_slim.metrics.aggregate_metric_map({
          'loss': tf.metrics.mean(softmax_cross_entropy),
          'metrics/accuracy': tf.metrics.accuracy(
              labels_flat, predictions_flat),
          'metrics/per_class_accuracy': tf.metrics.mean_per_class_accuracy(
              labels_flat, predictions_flat, num_classes),
          'metrics/event_accuracy': tf.metrics.recall(
              event_positions, correct_predictions),
          'metrics/no_event_accuracy': tf.metrics.recall(
              no_event_positions, correct_predictions),
          'metrics/loss_per_step': tf.metrics.mean(
              tf.reduce_sum(softmax_cross_entropy) / num_steps,
              weights=num_steps),
      })
      for updates_op in update_ops.values():
        tf.add_to_collection('eval_ops', updates_op)

      # Perplexity is just exp(loss) and doesn't need its own update op.
      vars_to_summarize['metrics/perplexity'] = tf.exp(
          vars_to_summarize['loss'])
      vars_to_summarize['metrics/perplexity_per_step'] = tf.exp(
          vars_to_summarize['metrics/loss_per_step'])

    for var_name, var_value in vars_to_summarize.items():
      tf.summary.scalar(var_name, var_value)
      tf.add_to_collection(var_name, var_value)
  elif mode == 'generate':
    temperature = tf.placeholder(tf.float32, [])
    if isinstance(num_classes, numbers.Number):
      softmax_flat = tf.nn.softmax(
          tf.div(logits_flat, tf.fill([num_classes], temperature)))
      softmax = tf.reshape(
          softmax_flat, [hparams.batch_size, -1, num_classes])
    else:
      logits_offsets = np.cumsum([0] + num_classes)
      softmax = []
      for i in range(len(num_classes)):
        sm = tf.nn.softmax(
            tf.div(
                logits_flat[:, logits_offsets[i]:logits_offsets[i + 1]],
                tf.fill([num_classes[i]], temperature)))
        sm = tf.reshape(sm, [hparams.batch_size, -1, num_classes[i]])
        softmax.append(sm)

    tf.add_to_collection('inputs', inputs)
    tf.add_to_collection('temperature', temperature)
    tf.add_to_collection('softmax', softmax)

    # Flatten state tuples for metagraph compatibility.
    for state in tf.nest.flatten(initial_state):
      tf.add_to_collection('initial_state', state)
    for state in tf.nest.flatten(final_state):
      tf.add_to_collection('final_state', state)
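# Worked sketch of the logits_offsets bookkeeping used in build() when
# num_classes is a list: with illustrative per-stream counts [4, 3], columns
# [0, 4) of logits_flat feed the first softmax and columns [4, 7) the second.
def _logits_offsets_sketch():
  num_classes = [4, 3]
  logits_offsets = np.cumsum([0] + num_classes)  # -> array([0, 4, 7])
  return [(logits_offsets[i], logits_offsets[i + 1])
          for i in range(len(num_classes))]  # [(0, 4), (4, 7)]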
def build_cnn18(self):
  x = self.placeholders['img_inp']
  x = tf.expand_dims(x, 0)
  # 224 224
  x = tflearn.layers.conv.conv_2d(x, 16, (3, 3), strides=1, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  x = tflearn.layers.conv.conv_2d(x, 16, (3, 3), strides=1, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  x0 = x
  x = tflearn.layers.conv.conv_2d(x, 32, (3, 3), strides=2, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  # 112 112
  x = tflearn.layers.conv.conv_2d(x, 32, (3, 3), strides=1, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  x = tflearn.layers.conv.conv_2d(x, 32, (3, 3), strides=1, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  x1 = x
  x = tflearn.layers.conv.conv_2d(x, 64, (3, 3), strides=2, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  # 56 56
  x = tflearn.layers.conv.conv_2d(x, 64, (3, 3), strides=1, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  x = tflearn.layers.conv.conv_2d(x, 64, (3, 3), strides=1, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  x2 = x
  x = tflearn.layers.conv.conv_2d(x, 128, (3, 3), strides=2, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  # 28 28
  x = tflearn.layers.conv.conv_2d(x, 128, (3, 3), strides=1, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  x = tflearn.layers.conv.conv_2d(x, 128, (3, 3), strides=1, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  x3 = x
  x = tflearn.layers.conv.conv_2d(x, 256, (5, 5), strides=2, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  # 14 14
  x = tflearn.layers.conv.conv_2d(x, 256, (3, 3), strides=1, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  x = tflearn.layers.conv.conv_2d(x, 256, (3, 3), strides=1, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  x4 = x
  x = tflearn.layers.conv.conv_2d(x, 512, (5, 5), strides=2, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  # 7 7
  x = tflearn.layers.conv.conv_2d(x, 512, (3, 3), strides=1, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  x = tflearn.layers.conv.conv_2d(x, 512, (3, 3), strides=1, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  x = tflearn.layers.conv.conv_2d(x, 512, (3, 3), strides=1, activation='relu',
                                  weight_decay=1e-5, regularizer='L2')
  x5 = x

  # update image feature
  self.placeholders.update({
      'img_feat': [tf.squeeze(x2), tf.squeeze(x3), tf.squeeze(x4),
                   tf.squeeze(x5)]
  })
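# For reference, the stride-2 convolutions in build_cnn18 halve the spatial
# resolution five times, so for a 224x224 input the stashed skip features
# x1..x5 sit at 112, 56, 28, 14, and 7 pixels per side, matching the inline
# shape comments above.
def _cnn18_resolutions():
  size, sizes = 224, []
  for _ in range(5):
    size //= 2
    sizes.append(size)
  return sizes  # [112, 56, 28, 14, 7]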
def attention(query, attend_in, single_dot_in, elements_mask, do_softmax,
              attention_method, flags):
  """Returns the attention mask using the method described by attention_method.

  Args:
    query: Query vector. Shape: [batch_size, query_size]
    attend_in: Values for each item to use for attention.
      [batch_size * elements_per_query, attend_size]
    single_dot_in: Values for each item to use for attention in single dot
      mode. [batch_size * elements_per_query, single_dot_attend_size]
      single_dot_attend_size must be greater than query_size
    elements_mask: Mask for which element items exist in the input.
    do_softmax: Whether to put the output through softmax.
    attention_method: The attention method to use.
    flags: The input flags. (Currently unused)

  Returns:
    The attention mask.
  """
  del flags
  elements_item_size = attend_in.shape[1]

  if attention_method == 'sepDotAtten':
    # Use different weights for the DNNs on top of the referring expression
    # and the elements.
    elements_enc_attend = tf.layers.dense(attend_in, elements_item_size)
    query_attend = tf.layers.dense(query, elements_item_size)
    attention_mask = atten_metric(elements_enc_attend, query_attend,
                                  elements_mask, do_softmax)
  elif attention_method == 'singDotAtten':
    # Use the same weights for the DNN on top of the referring expression
    # and the elements.
    elements_enc_attend = single_dot_in
    query_attend = tf.concat([
        query,
        tf.zeros([
            tf.shape(query)[0],
            tf.shape(single_dot_in)[1] - tf.shape(query)[1]
        ])
    ], 1)
    # Concat along the batch dim, so the same weights are used for each.
    all_attend = tf.concat([elements_enc_attend, query_attend], 0)
    all_attend = tf.layers.dense(all_attend, elements_item_size, tf.nn.relu)
    all_attend = tf.layers.dense(all_attend, elements_item_size)
    elements_enc_attend, query_attend = tf.split(
        all_attend,
        [tf.shape(elements_enc_attend)[0], tf.shape(query_attend)[0]])
    attention_mask = atten_metric(elements_enc_attend, query_attend,
                                  elements_mask, do_softmax)
  elif attention_method == 'combAtten':
    # Combine the referring expression and the elements before input to the
    # DNN.
    query_tile = tile_ref_enc_to_elements(query, elements_mask)
    attention_mask = tf.concat([attend_in, query_tile], 1)
    attention_mask = tf.layers.dense(attention_mask, elements_item_size,
                                     tf.nn.relu)
    attention_mask = tf.layers.dense(attention_mask, 1)
    attention_mask = tf.squeeze(attention_mask, 1)
    if do_softmax:
      attention_mask = atten_softmax(attention_mask, elements_mask)
  else:
    raise ValueError('Unknown attention_method: %s' % attention_method)

  tf.summary.histogram('attention_mask', attention_mask)
  return attention_mask
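# Hedged usage sketch for attention() in 'combAtten' mode; the shapes below
# are guesses for illustration, and atten_metric / atten_softmax /
# tile_ref_enc_to_elements are assumed to come from the surrounding module.
# single_dot_in may be None here because only 'singDotAtten' reads it.
def _attention_usage_sketch():
  batch_size, elements_per_query = 8, 10
  query = tf.placeholder(tf.float32, [batch_size, 64])
  attend_in = tf.placeholder(
      tf.float32, [batch_size * elements_per_query, 128])
  elements_mask = tf.placeholder(
      tf.float32, [batch_size, elements_per_query])
  return attention(query, attend_in, single_dot_in=None,
                   elements_mask=elements_mask, do_softmax=True,
                   attention_method='combAtten', flags=None)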
def test_squeeze(self):
  input = tf.placeholder(shape=(4, 32, 32, 1), dtype=tf.float32)
  output = tf.squeeze(input, axis=[3])
  self._test_conversion('squeeze', [input], [output])
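# For reference, the conversion test above exercises the shape change that
# tf.squeeze performs: dropping the size-1 channel axis turns (4, 32, 32, 1)
# into (4, 32, 32).
def _squeeze_shape_sketch():
  x = tf.zeros([4, 32, 32, 1])
  return tf.squeeze(x, axis=[3])  # shape (4, 32, 32)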
def mtf_model_fn(self, features, mesh):
  features = copy.copy(features)
  tf.logging.info("features = %s" % features)
  hparams = self._hparams
  activation_dtype = self.activation_type

  # We assume fixed vocab size for targets
  targets = tf.to_int32(features["targets"])

  # Image preprocessing, reshape into a 1D sequence and shift right.
  length = hparams.img_len * hparams.img_len * hparams.num_channels
  targets = tf.reshape(targets, [hparams.batch_size, length])
  shifted_targets = common_layers.shift_right_2d(targets)

  # Declare all the dimensions
  batch_dim = mtf.Dimension("batch", hparams.batch_size)

  def import_to_batch_by_length(x, name):
    return mtf.import_tf_tensor(
        mesh, x, mtf.Shape([batch_dim, self.length_dim]), name=name)

  targets = import_to_batch_by_length(targets, "targets")
  shifted_targets = import_to_batch_by_length(shifted_targets,
                                              "shifted_targets")

  extra_losses = []

  # Create targets content and position embeddings.
  # Create embedding var for targets and positions and do a gather.
  targets_embedding_var = mtf.get_variable(
      mesh, "targets_embedding",
      mtf.Shape([self.targets_vocab_dim, self.model_dim]),
      initializer=tf.random_normal_initializer(),
      activation_dtype=activation_dtype)

  x = mtf.gather(targets_embedding_var, shifted_targets,
                 self.targets_vocab_dim)

  # Add positional embeddings
  x += mtf.reshape(self.create_positional_emb_2d(targets),
                   [self.length_dim, self.model_dim])

  # If conditional and input is given, add the input embedding to the target.
  # TODO(nikip): Verify conditional.
  if self.has_input and not hparams.unconditional:
    inputs = tf.squeeze(tf.to_int32(features["inputs"]), [2, 3])
    inputs = import_to_batch_by_length(inputs, "inputs")

    # Input embeddings
    inputs_embedding_var = mtf.layers.embedding(
        mesh, "input_embedding",
        mtf.Shape([self.inputs_vocab_dim, self.model_dim]),
        activation_dtype=activation_dtype)
    inputs_emb = mtf.gather(
        inputs_embedding_var, inputs, self.inputs_vocab_dim)
    x += inputs_emb

  # Image Transformer Decoder
  # [ self attention - ffn - residual + dropout] x n
  if hparams.attention_type == "local1d_spatial":
    decoder_output = local_attention1d_spatial_decoder(
        x, self.kv_dim, self.heads_dim, self.feedforward_dim, hparams)
  elif hparams.attention_type == "local2d_spatial":
    decoder_output = local_attention2d_spatial_decoder(
        x, self.kv_dim, self.heads_dim, self.feedforward_dim, hparams)
  elif hparams.attention_type == "local1d":
    decoder_output = local_attention1d_masked_decoder(
        x, self.kv_dim, self.heads_dim, self.feedforward_dim, hparams)
  else:
    raise ValueError("Invalid attention type.")

  # Calculate the logits and loss.
  logits = mtf.layers.dense(
      decoder_output, self.outputs_vocab_dim, name="logits")
  # Need a reshape for logits
  logits = mtf.reshape(
      logits, mtf.Shape([batch_dim, self.length_dim, self.outputs_vocab_dim]))

  soft_targets = mtf.one_hot(
      targets, self.outputs_vocab_dim, dtype=activation_dtype)
  loss = mtf.layers.softmax_cross_entropy_with_logits(
      logits, soft_targets, self.outputs_vocab_dim)
  loss = mtf.reduce_mean(loss)
  for l in extra_losses:
    loss += l

  # Reshape logits to original target shape.
  logits = mtf.reshape(
      logits,
      mtf.Shape([batch_dim, self.rows_dim, self.orig_cols_dim,
                 self.channels_dim, self.outputs_vocab_dim]))

  return logits, loss
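# Minimal Mesh TensorFlow sketch of the import pattern used by
# import_to_batch_by_length above, assuming the mesh_tensorflow package;
# the graph/mesh names and sizes are illustrative placeholders.
def _mtf_import_sketch():
  import mesh_tensorflow as mtf
  graph = mtf.Graph()
  mesh = mtf.Mesh(graph, 'demo_mesh')
  batch_dim = mtf.Dimension('batch', 8)
  length_dim = mtf.Dimension('length', 16)
  x_tf = tf.zeros([8, 16], dtype=tf.int32)
  return mtf.import_tf_tensor(
      mesh, x_tf, mtf.Shape([batch_dim, length_dim]), name='x')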
def _generate_detections_tf(cls_outputs,
                            box_outputs,
                            anchor_boxes,
                            indices,
                            classes,
                            image_id,
                            image_scale,
                            num_classes,
                            min_score_thresh=0.2,
                            max_boxes_to_draw=50,
                            soft_nms_sigma=0.0,
                            iou_threshold=0.5,
                            use_native_nms=False):
  """Generates detections with model outputs and anchors.

  Args:
    cls_outputs: a tensor with shape [N, 1], which has the highest class
      scores on all feature levels. The N is the number of selected top-K
      total anchors on all levels. (k being MAX_DETECTION_POINTS)
    box_outputs: a tensor with shape [N, 4], which stacks box regression
      outputs on all feature levels. The N is the number of selected top-k
      total anchors on all levels. (k being MAX_DETECTION_POINTS)
    anchor_boxes: a tensor with shape [N, 4], which stacks anchors on all
      feature levels. The N is the number of selected top-k total anchors on
      all levels.
    indices: a tensor with shape [N], which is the indices from top-k
      selection.
    classes: a tensor with shape [N], which represents the class prediction
      on all selected anchors from top-k selection.
    image_id: an integer number to specify the image id.
    image_scale: a float tensor representing the scale between the original
      image and the input image for the detector. It is used to rescale
      detections for evaluating with the original groundtruth annotations.
    num_classes: an integer that indicates the number of classes.
    min_score_thresh: a float representing the threshold for deciding when to
      remove boxes based on score.
    max_boxes_to_draw: max number of boxes to draw.
    soft_nms_sigma: a scalar float representing the Soft NMS sigma parameter;
      see Bodla et al. (https://arxiv.org/abs/1704.04503). When
      `soft_nms_sigma=0.0` (the default), we fall back to standard (hard) NMS.
    iou_threshold: a float representing the threshold for deciding whether
      boxes overlap too much with respect to IOU.
    use_native_nms: a bool that indicates whether to use native nms.

  Returns:
    detections: detection results in a tensor with each row representing
      [image_id, y, x, height, width, score, class]
  """
  anchor_boxes = tf.gather(anchor_boxes, indices)

  scores = tf.math.sigmoid(cls_outputs)
  # Apply bounding box regression to anchors.
  boxes = decode_box_outputs_tf(
      tf.transpose(box_outputs, [1, 0]), tf.transpose(anchor_boxes, [1, 0]))

  def _else(detections, class_id, indices):
    """Else branch for generating detections."""
    boxes_cls = tf.gather(boxes, indices)
    scores_cls = tf.gather(scores, indices)
    # Select top-scoring boxes in each class and apply non-maximum suppression
    # (nms) for boxes in the same class. The selected boxes from each class
    # are then concatenated for the final detection outputs.
    if use_native_nms:
      top_detection_idx, scores_cls = tf.image.non_max_suppression_with_scores(
          boxes_cls,
          scores_cls,
          max_boxes_to_draw,
          iou_threshold=iou_threshold,
          score_threshold=min_score_thresh,
          soft_nms_sigma=soft_nms_sigma)
      scores_cls = tf.expand_dims(scores_cls, axis=1)
      boxes_cls = tf.gather(boxes_cls, top_detection_idx)
      top_detections_cls = tf.concat([boxes_cls, scores_cls], axis=1)
    else:
      scores_cls = tf.expand_dims(scores_cls, axis=1)
      all_detections_cls = tf.concat([boxes_cls, scores_cls], axis=1)
      top_detection_idx = nms_tf(all_detections_cls, iou_threshold)
      top_detections_cls = tf.gather(all_detections_cls, top_detection_idx)

    # Convert [ymin, xmin, ymax, xmax] to [y, x, height, width] and rescale
    # to the original image coordinates.
    height = top_detections_cls[:, 2] - top_detections_cls[:, 0]
    width = top_detections_cls[:, 3] - top_detections_cls[:, 1]
    top_detections_cls = tf.stack([
        top_detections_cls[:, 0] * image_scale,
        top_detections_cls[:, 1] * image_scale,
        height * image_scale,
        width * image_scale,
        top_detections_cls[:, 4]
    ], axis=-1)

    top_detections_cls = tf.stack([
        tf.cast(tf.repeat(image_id, tf.size(top_detection_idx)), tf.float32),
        *tf.unstack(top_detections_cls, 5, axis=1),
        tf.repeat(class_id + 1.0, tf.size(top_detection_idx))
    ], axis=1)

    detections = tf.concat([detections, top_detections_cls], axis=0)
    return detections

  detections = tf.constant([], tf.float32, [0, 7])
  for c in range(num_classes):
    indices_cls = tf.squeeze(tf.where_v2(tf.equal(classes, c)), axis=-1)
    # Skip classes with no candidate anchors. Note the check is on
    # indices_cls (per-class candidates), not the global top-k indices.
    detections = tf.cond(
        tf.equal(tf.size(indices_cls), 0),
        lambda: detections,
        lambda id=c, id_cls=indices_cls: _else(detections, id, id_cls))
  indices_final = tf.argsort(detections[:, -2], direction='DESCENDING')
  detections = tf.gather(
      detections, indices_final[:max_boxes_to_draw], name='detection')
  return detections
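# Hedged usage sketch for _generate_detections_tf: all tensors are random
# dummies with shapes following the docstring (scores passed as a rank-1
# tensor), and decode_box_outputs_tf / nms_tf are assumed to be defined in
# the surrounding module. Requires a TF version that provides tf.repeat and
# tf.image.non_max_suppression_with_scores.
def _generate_detections_usage_sketch():
  n, num_classes = 100, 90
  cls_outputs = tf.random.normal([n])
  box_outputs = tf.random.normal([n, 4])
  anchor_boxes = tf.random.uniform([n, 4])
  indices = tf.range(n)
  classes = tf.random.uniform([n], maxval=num_classes, dtype=tf.int32)
  return _generate_detections_tf(
      cls_outputs, box_outputs, anchor_boxes, indices, classes,
      image_id=0, image_scale=1.0, num_classes=num_classes,
      use_native_nms=True)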