def _calculate(self):
    """Builds the multi-scale training losses from `self._endpoints`.

    Per-frame endpoint tensors are concatenated along the batch axis, both in
    the given order and in reversed ("flipped") order, and turned into
    pyramids with `self._params.num_scales` levels. For every level the
    depth, motion and consistency losses are computed and accumulated into
    `self._losses`, weighted by `1 / 2**level`. Afterwards every loss named
    in `self._weights` is multiplied by twice its weight, and the sum of the
    graph's regularization losses is stored when that collection is
    non-empty.

    Side effects:
      * Updates `self._losses` in place. Entries updated with `+=` are
        expected to exist already (presumably initialized by the caller or a
        default-valued mapping -- confirm against the class constructor).
      * Writes `disparity`, `depth_proximity_weight`, `trans` and
        `inv_trans` into `self._output_endpoints`. Scales are visited in
        reverse order, so the values left behind are those of scale 0.
    """
    # On tpu we strive to stack tensors together and perform ops once on the
    # entire stack, to save time HBM memory. We thus stack the
    # batch-of-first-frames and the batch-of-second frames, for both depth
    # and RGB. The batch dimension of rgb_stack and gt_depth_stack are thus
    # twice the original batch size.

    # Create stacks for features that need to be scaled into pyramids for
    # multi-scale training.
    rgb_stack_ = tf.concat(self._endpoints['rgb'], axis=0)
    flipped_rgb_stack_ = tf.concat(self._endpoints['rgb'][::-1], axis=0)
    predicted_depth_stack_ = tf.concat(
        self._endpoints['predicted_depth'], axis=0)
    flipped_predicted_depth_stack_ = tf.concat(
        self._endpoints['predicted_depth'][::-1], axis=0)
    residual_translation_ = tf.concat(
        self._endpoints['residual_translation'], axis=0)
    flipped_residual_translation_ = tf.concat(
        self._endpoints['residual_translation'][::-1], axis=0)
    intrinsics_mat_ = tf.concat(self._endpoints['intrinsics_mat'], axis=0)

    # Create pyramids from each stack to support multi-scale training.
    num_scales = self._params.num_scales
    rgb_pyramid = _get_pyramid(rgb_stack_, num_scales=num_scales)
    flipped_rgb_pyramid = _get_pyramid(
        flipped_rgb_stack_, num_scales=num_scales)
    predicted_depth_pyramid = _get_pyramid(
        predicted_depth_stack_, num_scales=num_scales)
    flipped_predicted_depth_pyramid = _get_pyramid(
        flipped_predicted_depth_stack_, num_scales=num_scales)
    residual_translation_pyramid = _get_pyramid(
        residual_translation_, num_scales=num_scales)
    flipped_residual_translation_pyramid = _get_pyramid(
        flipped_residual_translation_, num_scales=num_scales)
    intrinsics_mat_pyramid = _get_intrinsics_mat_pyramid(
        intrinsics_mat_, num_scales=num_scales)

    # The validity mask is optional; without it every scale gets `None`.
    validity_mask_ = self._endpoints.get('validity_mask')
    if validity_mask_ is not None:
        validity_mask_ = tf.concat(validity_mask_, axis=0)
        validity_mask_pyramid = _get_pyramid(
            validity_mask_, num_scales, _min_pool2d)
    else:
        validity_mask_pyramid = [None] * num_scales

    if 'groundtruth_depth' in self._endpoints:
        gt_depth_stack_ = tf.concat(
            self._endpoints['groundtruth_depth'], axis=0)
        gt_depth_pyramid = _get_pyramid(
            gt_depth_stack_, num_scales=num_scales)
        if 'groundtruth_depth_weight' in self._endpoints:
            gt_depth_weight_stack_ = tf.concat(
                self._endpoints['groundtruth_depth_weight'], axis=0)
        else:
            # Default weight map: 1 where the groundtruth depth exceeds 0.2,
            # 0 elsewhere.
            gt_depth_weight_stack_ = tf.cast(
                tf.greater(gt_depth_stack_, 0.2), tf.float32)
        gt_depth_weight_pyramid = _get_pyramid(
            gt_depth_weight_stack_, num_scales=num_scales)

        if 'groundtruth_depth_filter' in self._endpoints:
            depth_filter_ = tf.concat(
                self._endpoints['groundtruth_depth_filter'], axis=0)
            depth_filter_ = tf.cast(depth_filter_, tf.float32)
            # Build the pyramid from the filter itself. Using the depth
            # stack here would discard the filter and weigh the supervision
            # term by the groundtruth depth values instead.
            depth_filter_pyramid = _get_pyramid(
                depth_filter_, num_scales=num_scales)

    # Calculate losses at each scale. Iterate in reverse so that the final
    # output values are set at scale 0.
    for s in reversed(range(self._params.num_scales)):
        # Weight applied to all losses at this scale.
        scale_w = 1.0 / 2**s

        rgb_stack = rgb_pyramid[s]
        predicted_depth_stack = predicted_depth_pyramid[s]
        flipped_predicted_depth_stack = flipped_predicted_depth_pyramid[s]

        if 'groundtruth_depth' in self._endpoints:
            gt_depth_stack = gt_depth_pyramid[s]
            depth_error = tf.abs(gt_depth_stack - predicted_depth_stack)

            # Weigh the spatial loss if a weight map is provided. Otherwise,
            # revert to original behavior.
            gt_depth_weight_stack = gt_depth_weight_pyramid[s]
            depth_error = depth_error * gt_depth_weight_stack

            # Optionally filter the depth map if a boolean depth filter is
            # provided. We use a TPU-friendly equivalent of tf.boolean_mask.
            depth_filter = tf.ones_like(depth_error, tf.float32)
            if 'groundtruth_depth_filter' in self._endpoints:
                depth_filter = depth_filter_pyramid[s]

            # NOTE(review): a filter that is zero everywhere makes the
            # denominator zero; confirm callers never supply one.
            self._losses['depth_supervision'] += scale_w * tf.reduce_mean(
                depth_error * depth_filter) / tf.reduce_mean(depth_filter)

        # In theory, the training losses should be agnostic to the global
        # scale of the predicted depth. However in reality second order
        # effects can lead to diverging modes
        # (https://en.wikipedia.org/wiki/Von_Neumann_stability_analysis).
        # For some reason this happens when training on TPU. Since the scale
        # is immaterial anyway, we normalize it out, and the training
        # stabilizes.
        #
        # Note that the depth supervision term, which is sensitive to the
        # scale, was applied before this normalization. Therefore the scale
        # of the depth is learned.
        mean_depth = tf.reduce_mean(predicted_depth_stack)

        # When training starts, the depth sometimes tends to collapse to a
        # constant value, which seems to be a fixed point where the training
        # can get stuck. To discourage this collapse, we penalize the
        # reciprocal of the variance with a tiny weight. Note that the mean
        # of predicted_depth / mean_depth is one, hence we subtract 1.0.
        depth_var = tf.reduce_mean(
            tf.square(predicted_depth_stack / mean_depth - 1.0))
        # NOTE(review): this is an assignment, not an accumulation, so only
        # the last iteration (scale 0) survives -- confirm this is intended.
        self._losses['depth_variance'] = scale_w * 1.0 / depth_var

        if self._params.scale_normalization:
            predicted_depth_stack /= mean_depth
            flipped_predicted_depth_stack /= mean_depth

        disp = 1.0 / predicted_depth_stack

        mean_disp = tf.reduce_mean(disp, axis=[1, 2, 3], keep_dims=True)
        self._losses['depth_smoothing'] += (
            scale_w * regularizers.joint_bilateral_smoothing(
                disp / mean_disp, rgb_stack))
        self._output_endpoints['disparity'] = disp

        flipped_rgb_stack = flipped_rgb_pyramid[s]

        # Background translation and rotation are not scaled into pyramids;
        # they are re-stacked from the endpoints at every scale.
        background_translation = tf.concat(
            self._endpoints['background_translation'], axis=0)
        flipped_background_translation = tf.concat(
            self._endpoints['background_translation'][::-1], axis=0)
        residual_translation = residual_translation_pyramid[s]
        flipped_residual_translation = (
            flipped_residual_translation_pyramid[s])
        if self._params.scale_normalization:
            background_translation /= mean_depth
            flipped_background_translation /= mean_depth
            residual_translation /= mean_depth
            flipped_residual_translation /= mean_depth
        translation = residual_translation + background_translation
        flipped_translation = (
            flipped_residual_translation + flipped_background_translation)

        rotation = tf.concat(self._endpoints['rotation'], axis=0)
        flipped_rotation = tf.concat(
            self._endpoints['rotation'][::-1], axis=0)
        intrinsics_mat = intrinsics_mat_pyramid[s]
        intrinsics_mat_inv = intrinsics_utils.invert_intrinsics_matrix(
            intrinsics_mat)
        validity_mask = validity_mask_pyramid[s]

        transformed_depth = transform_depth_map.using_motion_vector(
            tf.squeeze(predicted_depth_stack, axis=-1), translation,
            rotation, intrinsics_mat, intrinsics_mat_inv)
        flipped_predicted_depth_stack = tf.squeeze(
            flipped_predicted_depth_stack, axis=-1)
        if self._params.target_depth_stop_gradient:
            flipped_predicted_depth_stack = tf.stop_gradient(
                flipped_predicted_depth_stack)

        # The first and second halves of the batch now contain Frame1's and
        # Frame2's depths transformed onto Frame2 and Frame1 respectively.
        # To demand consistency, we need to `flip` `predicted_depth` as
        # well.
        loss_endpoints = (
            consistency_losses.rgbd_and_motion_consistency_loss(
                transformed_depth,
                rgb_stack,
                flipped_predicted_depth_stack,
                flipped_rgb_stack,
                rotation,
                translation,
                flipped_rotation,
                flipped_translation,
                validity_mask=validity_mask))

        normalized_trans = regularizers.normalize_motion_map(
            residual_translation, translation)
        self._losses['motion_smoothing'] += (
            scale_w * regularizers.l1smoothness(
                normalized_trans, self._weights.motion_drift == 0))
        self._losses['motion_drift'] += (
            scale_w * regularizers.sqrt_sparsity(normalized_trans))
        self._losses['depth_consistency'] += (
            scale_w * loss_endpoints['depth_error'])
        self._losses['rgb_consistency'] += (
            scale_w * loss_endpoints['rgb_error'])
        self._losses['ssim'] += scale_w * 0.5 * loss_endpoints['ssim_error']

        self._losses['rotation_cycle_consistency'] += (
            scale_w * loss_endpoints['rotation_error'])
        self._losses['translation_cycle_consistency'] += (
            scale_w * loss_endpoints['translation_error'])

        self._output_endpoints['depth_proximity_weight'] = loss_endpoints[
            'depth_proximity_weight']
        self._output_endpoints['trans'] = translation
        self._output_endpoints['inv_trans'] = flipped_translation

    for k, w in self._weights.as_dict().items():
        # multiply by 2 to match the scale of the old code.
        self._losses[k] *= w * 2

    if tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
        self._losses[tf.GraphKeys.REGULARIZATION_LOSSES] = tf.add_n(
            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
def box_matching(boxes, gt_boxes, gt_classes):
    """Assigns every box the groundtruth box it overlaps most.

    The match is the argmax over the IoU between each box and all
    groundtruth boxes. A box whose best IoU is not positive is treated as
    background.

    Args:
      boxes: a tensor of shape [batch_size, N, 4] with the coordinates of the
        boxes to be matched to groundtruth boxes.
      gt_boxes: a tensor of shape [batch_size, MAX_INSTANCES, 4] with the
        groundtruth box coordinates. It is padded with -1s to indicate the
        invalid boxes.
      gt_classes: a tensor of shape [batch_size, MAX_INSTANCES] with the
        groundtruth box classes. It is padded with -1s to indicate the
        invalid classes.

    Returns:
      matched_gt_boxes: a tensor of shape [batch_size, N, 4] with the matched
        groundtruth box coordinates for each input box. Boxes that do not
        overlap any groundtruth box get all 0s.
      matched_gt_classes: a tensor of shape [batch_size, N] with the matched
        groundtruth class for each input box. Boxes that do not overlap any
        groundtruth box get 0, which corresponds to the background class.
      matched_gt_indices: a tensor of shape [batch_size, N] with the index of
        the matched groundtruth box in the original gt_boxes tensor. Boxes
        that do not overlap any groundtruth box get -1.
      matched_iou: a tensor of shape [batch_size, N] with the maximum IoU
        between each box and all the groundtruth boxes.
      iou: a tensor of shape [batch_size, N, K] holding the IoU matrix
        between boxes and groundtruth boxes. The IoU against an invalid
        groundtruth box (coordinates [-1, -1, -1, -1]) is -1.
    """
    # iou: [batch_size, N, K]
    iou = box_utils.bbox_overlap(boxes, gt_boxes)

    # matched_iou: [batch_size, N]. A value of 0.0 means the box overlaps no
    # groundtruth box; -1.0 means it was only compared to padding.
    matched_iou = tf.reduce_max(iou, axis=-1)
    # is_background: bool, [batch_size, N]
    is_background = tf.less_equal(matched_iou, 0.0)

    best_gt = tf.argmax(iou, axis=-1, output_type=tf.int32)
    best_gt_shape = tf.shape(best_gt)

    # Pair each argmax index with its batch index so gather_nd can address
    # the [batch, instance] entry directly.
    batch_column = tf.expand_dims(tf.range(best_gt_shape[0]), axis=-1)
    ones_row = tf.ones([1, best_gt_shape[-1]], dtype=tf.int32)
    batch_ids = batch_column * ones_row
    gather_index = tf.stack([batch_ids, best_gt], axis=-1)

    gathered_boxes = tf.gather_nd(gt_boxes, gather_index)
    is_background_per_coord = tf.tile(
        tf.expand_dims(is_background, axis=-1), [1, 1, 4])
    matched_gt_boxes = tf.where(
        is_background_per_coord,
        tf.zeros_like(gathered_boxes, dtype=tf.float32),
        gathered_boxes)

    gathered_classes = tf.gather_nd(gt_classes, gather_index)
    matched_gt_classes = tf.where(
        is_background, tf.zeros_like(gathered_classes), gathered_classes)

    matched_gt_indices = tf.where(
        is_background, -tf.ones_like(best_gt), best_gt)

    return (matched_gt_boxes, matched_gt_classes, matched_gt_indices,
            matched_iou, iou)
def decode(self, tf_example_string_tensor):
    """Parses one serialized tf.Example into a dictionary of tensors.

    Args:
      tf_example_string_tensor: a string tensor holding a serialized
        tensorflow example proto.

    Returns:
      A dictionary of the following tensors.
      fields.InputDataFields.image - 3D uint8 tensor of shape
        [None, None, 3] containing image.
      fields.InputDataFields.original_image_spatial_shape - 1D int32 tensor
        of shape [2] containing shape of the image.
      fields.InputDataFields.source_id - string tensor containing original
        image id.
      fields.InputDataFields.key - string tensor with unique sha256 hash
        key.
      fields.InputDataFields.filename - string tensor with original dataset
        filename.
      fields.InputDataFields.groundtruth_boxes - 2D float32 tensor of shape
        [None, 4] containing box corners.
      fields.InputDataFields.groundtruth_classes - 1D int64 tensor of shape
        [None] containing classes for the boxes.
      fields.InputDataFields.groundtruth_weights - 1D float32 tensor of
        shape [None] indicating the weights of groundtruth boxes.
      fields.InputDataFields.groundtruth_area - 1D float32 tensor of shape
        [None] containing object mask area in pixel squared.
      fields.InputDataFields.groundtruth_is_crowd - 1D bool tensor of shape
        [None] indicating if the boxes enclose a crowd.

    Optional:
      fields.InputDataFields.groundtruth_image_confidences - 1D float tensor
        of shape [None] indicating if a class is present in the image (1.0)
        or a class is not present in the image (0.0).
      fields.InputDataFields.image_additional_channels - 3D uint8 tensor of
        shape [None, None, num_additional_channels]. 1st dim is height; 2nd
        dim is width; 3rd dim is the number of additional channels.
      fields.InputDataFields.groundtruth_difficult - 1D bool tensor of shape
        [None] indicating if the boxes represent `difficult` instances.
      fields.InputDataFields.groundtruth_group_of - 1D bool tensor of shape
        [None] indicating if the boxes represent `group_of` instances.
      fields.InputDataFields.groundtruth_keypoints - 3D float32 tensor of
        shape [None, num_keypoints, 2] containing keypoints, where the
        coordinates of the keypoints are ordered (y, x).
      fields.InputDataFields.groundtruth_keypoint_visibilities - 2D bool
        tensor of shape [None, num_keypoints] containing keypoint
        visibilites.
      fields.InputDataFields.groundtruth_instance_masks - 3D float32 tensor
        of shape [None, None, None] containing instance masks.
      fields.InputDataFields.groundtruth_image_classes - 1D int64 of shape
        [None] containing classes for the boxes.
      fields.InputDataFields.multiclass_scores - 1D float32 tensor of shape
        [None * num_classes] containing flattened multiclass scores for
        groundtruth boxes.
      fields.InputDataFields.context_features - 1D float32 tensor of shape
        [context_feature_length * num_context_features]
      fields.InputDataFields.context_feature_length - int32 tensor
        specifying the length of each feature in context_features
    """
    input_fields = fields.InputDataFields

    # Decode every item the slim decoder knows about from the scalar string.
    serialized_example = tf.reshape(tf_example_string_tensor, shape=[])
    decoder = slim_example_decoder.TFExampleDecoder(
        self.keys_to_features, self.items_to_handlers)
    keys = decoder.list_items()
    tensors = decoder.decode(serialized_example, items=keys)
    tensor_dict = dict(zip(keys, tensors))

    crowd_key = input_fields.groundtruth_is_crowd
    tensor_dict[crowd_key] = tf.cast(tensor_dict[crowd_key], dtype=tf.bool)

    image_key = input_fields.image
    tensor_dict[image_key].set_shape([None, None, 3])
    tensor_dict[input_fields.original_image_spatial_shape] = tf.shape(
        tensor_dict[image_key])[:2]

    extra_channels_key = input_fields.image_additional_channels
    if extra_channels_key in tensor_dict:
        # Drop axis 3, then move the leading axis to the end.
        extra_channels = tf.squeeze(tensor_dict[extra_channels_key], axis=3)
        extra_channels = tf.transpose(extra_channels, perm=[1, 2, 0])
        tensor_dict[extra_channels_key] = extra_channels

    weights_key = input_fields.groundtruth_weights

    def _unit_weights():
        """Returns one weight of 1.0 per groundtruth box."""
        num_boxes = tf.shape(tensor_dict[input_fields.groundtruth_boxes])[0]
        return tf.ones([num_boxes], dtype=tf.float32)

    # Keep the decoded weights when present, otherwise fall back to ones.
    has_weights = tf.greater(tf.shape(tensor_dict[weights_key])[0], 0)
    tensor_dict[weights_key] = tf.cond(
        has_weights, lambda: tensor_dict[weights_key], _unit_weights)

    keypoints_key = input_fields.groundtruth_keypoints
    if keypoints_key in tensor_dict:
        # Set all keypoints that are not labeled to NaN.
        visibility_key = input_fields.groundtruth_keypoint_visibilities
        visible = tf.tile(
            tf.expand_dims(tensor_dict[visibility_key], -1), [1, 1, 2])
        tensor_dict[keypoints_key] = tf.where(
            visible,
            tensor_dict[keypoints_key],
            np.nan * tf.ones_like(tensor_dict[keypoints_key]))

    if self._expand_hierarchy_labels:
        image_classes, image_confidences = self._expand_image_label_hierarchy(
            tensor_dict[input_fields.groundtruth_image_classes],
            tensor_dict[input_fields.groundtruth_image_confidences])
        tensor_dict[input_fields.groundtruth_image_classes] = image_classes
        tensor_dict[input_fields.groundtruth_image_confidences] = (
            image_confidences)

        box_fields = [
            input_fields.groundtruth_group_of,
            input_fields.groundtruth_is_crowd,
            input_fields.groundtruth_difficult,
            input_fields.groundtruth_area,
            input_fields.groundtruth_boxes,
            input_fields.groundtruth_weights,
        ]

        def expand_field(field_name):
            return self._expansion_box_field_labels(
                tensor_dict[input_fields.groundtruth_classes],
                tensor_dict[field_name])

        # Per-box fields are expanded using the not-yet-expanded classes, so
        # the classes themselves must be expanded last.
        for field in box_fields:
            if field not in tensor_dict:
                continue
            tensor_dict[field] = tf.cond(
                tf.size(tensor_dict[field]) > 0,
                lambda field=field: expand_field(field),
                lambda field=field: tensor_dict[field])

        tensor_dict[input_fields.groundtruth_classes] = (
            self._expansion_box_field_labels(
                tensor_dict[input_fields.groundtruth_classes],
                tensor_dict[input_fields.groundtruth_classes], True))

    group_of_key = input_fields.groundtruth_group_of
    if group_of_key in tensor_dict:
        tensor_dict[group_of_key] = tf.cast(
            tensor_dict[group_of_key], dtype=tf.bool)

    dp_num_points_key = input_fields.groundtruth_dp_num_points
    if dp_num_points_key in tensor_dict:
        dp_part_ids_key = input_fields.groundtruth_dp_part_ids
        tensor_dict[dp_num_points_key] = tf.cast(
            tensor_dict[dp_num_points_key], dtype=tf.int32)
        tensor_dict[dp_part_ids_key] = tf.cast(
            tensor_dict[dp_part_ids_key], dtype=tf.int32)

    track_ids_key = input_fields.groundtruth_track_ids
    if track_ids_key in tensor_dict:
        tensor_dict[track_ids_key] = tf.cast(
            tensor_dict[track_ids_key], dtype=tf.int32)

    return tensor_dict
def to_homogenous(tensor):
    """Appends a trailing entry of ones along the last axis of `tensor`."""
    # A width-1 slice of the last axis fixes the shape and dtype of the ones.
    ones_column = tf.ones_like(tensor[..., :1])
    return tf.concat([tensor, ones_column], axis=-1)
def metric_fn(**kwargs):
    """Builds the evaluation metric dictionary from the model outputs.

    Reads `boxes`, `scores`, `classes`, `image_ids`, `image_scales`,
    `cls_loss_repeat` and `box_loss_repeat` from `kwargs`, plus
    `groundtruth_data` when no `testdev_dir` is configured in the enclosing
    `params`.

    Returns:
      A dict with the `cls_loss` and `box_loss` mean metrics, updated with
      the COCO metrics returned by the evaluation metric.
    """
    nms_configs = params['nms_configs']

    if nms_configs.get('pyfunc', True):
        # Run per-class NMS in numpy, one image of the batch at a time.
        nms_fn = functools.partial(
            nms_np.per_class_nms, nms_configs=nms_configs)

        def _detect(index):
            nms_inputs = [
                kwargs['boxes'][index],
                kwargs['scores'][index],
                kwargs['classes'][index],
                tf.slice(kwargs['image_ids'], [index], [1]),
                tf.slice(kwargs['image_scales'], [index], [1]),
                params['num_classes'],
                nms_configs['max_output_size'],
            ]
            return tf.numpy_function(nms_fn, nms_inputs, tf.float32)

        batch_size = kwargs['boxes'].shape[0]
        per_image = [_detect(index) for index in range(batch_size)]
        detections_bs = postprocess.transform_detections(tf.stack(per_image))
    else:
        # These two branches should be equivalent, but currently they are
        # not.
        # TODO(tanmingxing): enable the non_pyfun path after bug fix.
        nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
            params, kwargs['boxes'], kwargs['scores'], kwargs['classes'],
            kwargs['image_scales'])
        img_ids = tf.cast(
            tf.expand_dims(kwargs['image_ids'], -1), nms_scores.dtype)
        # Columns: image id, box columns 1 and 0, the differences
        # (column 3 - column 1) and (column 2 - column 0), score, class.
        columns = [
            img_ids * tf.ones_like(nms_scores),
            nms_boxes[:, :, 1],
            nms_boxes[:, :, 0],
            nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
            nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
            nms_scores,
            nms_classes,
        ]
        detections_bs = tf.stack(columns, axis=-1, name='detnections')

    testdev_dir = params.get('testdev_dir', None)
    if testdev_dir:
        logging.info('Eval testdev_dir %s', testdev_dir)
        eval_metric = coco_metric.EvaluationMetric(testdev_dir=testdev_dir)
        groundtruth = tf.zeros([1])
    else:
        logging.info('Eval val with groudtruths %s.',
                     params['val_json_file'])
        eval_metric = coco_metric.EvaluationMetric(
            filename=params['val_json_file'], label_map=params['label_map'])
        groundtruth = kwargs['groundtruth_data']
    coco_metrics = eval_metric.estimator_metric_fn(detections_bs, groundtruth)

    # Add metrics to output.
    output_metrics = {
        'cls_loss': tf.metrics.mean(kwargs['cls_loss_repeat']),
        'box_loss': tf.metrics.mean(kwargs['box_loss_repeat']),
    }
    output_metrics.update(coco_metrics)
    return output_metrics
def build_inpaint_net(self, x, mask, reuse=False,
                      training=True, padding='SAME', name='inpaint_net'):
    """Builds the two-stage inpainting network.

    Stage 1 produces a first prediction. Stage 2 pastes that prediction into
    the masked region of the input and refines it with two parallel branches
    (a dilated-convolution branch and a contextual-attention branch) whose
    outputs are concatenated and decoded.

    Args:
        x: incomplete image, [-1, 1]. Indexed as a rank-4 tensor whose first
            three channels on the last axis are used as the image.
        mask: mask region {0, 1}.
        reuse: passed to the variable scope as its `reuse` flag.
        training: forwarded to every `gen_conv` / `gen_deconv` call.
        padding: forwarded to every `gen_conv` / `gen_deconv` call.
        name: name of the variable scope holding all layers.

    Returns:
        A tuple `(x_stage1, x_stage2, offset_flow)`: the tanh outputs of the
        first and second stage ([-1, 1] as predicted image) and the second
        value returned by `contextual_attention`.
    """
    xin = x
    offset_flow = None
    # Append a channel of ones and a channel equal to ones * mask to the
    # input along the last axis.
    ones_x = tf.ones_like(x)[:, :, :, 0:1]
    x = tf.concat([x, ones_x, ones_x*mask], axis=3)

    # two stage network
    # cnum is the base channel width; layers below use multiples of it.
    cnum = 48
    with tf.compat.v1.variable_scope(name, reuse=reuse), \
            arg_scope([gen_conv, gen_deconv],
                      training=training, padding=padding):
        # stage1
        x = gen_conv(x, cnum, 5, 1, name='conv1')
        x = gen_conv(x, 2*cnum, 3, 2, name='conv2_downsample')
        x = gen_conv(x, 2*cnum, 3, 1, name='conv3')
        x = gen_conv(x, 4*cnum, 3, 2, name='conv4_downsample')
        x = gen_conv(x, 4*cnum, 3, 1, name='conv5')
        x = gen_conv(x, 4*cnum, 3, 1, name='conv6')
        # Resize the mask to match the feature map at this point; it is
        # consumed by the attention branch of stage 2.
        mask_s = resize_mask_like(mask, x)
        x = gen_conv(x, 4*cnum, 3, rate=2, name='conv7_atrous')
        x = gen_conv(x, 4*cnum, 3, rate=4, name='conv8_atrous')
        x = gen_conv(x, 4*cnum, 3, rate=8, name='conv9_atrous')
        x = gen_conv(x, 4*cnum, 3, rate=16, name='conv10_atrous')
        x = gen_conv(x, 4*cnum, 3, 1, name='conv11')
        x = gen_conv(x, 4*cnum, 3, 1, name='conv12')
        x = gen_deconv(x, 2*cnum, name='conv13_upsample')
        x = gen_conv(x, 2*cnum, 3, 1, name='conv14')
        x = gen_deconv(x, cnum, name='conv15_upsample')
        x = gen_conv(x, cnum//2, 3, 1, name='conv16')
        x = gen_conv(x, 3, 3, 1, activation=None, name='conv17')
        x = tf.nn.tanh(x)
        x_stage1 = x

        # stage2, paste result as input
        # Use the stage-1 prediction where mask is 1 and the original image
        # where mask is 0.
        x = x*mask + xin[:, :, :, 0:3]*(1.-mask)
        x.set_shape(xin[:, :, :, 0:3].get_shape().as_list())
        # conv branch
        # xnow = tf.concat([x, ones_x, ones_x*mask], axis=3)
        xnow = x
        x = gen_conv(xnow, cnum, 5, 1, name='xconv1')
        x = gen_conv(x, cnum, 3, 2, name='xconv2_downsample')
        x = gen_conv(x, 2*cnum, 3, 1, name='xconv3')
        x = gen_conv(x, 2*cnum, 3, 2, name='xconv4_downsample')
        x = gen_conv(x, 4*cnum, 3, 1, name='xconv5')
        x = gen_conv(x, 4*cnum, 3, 1, name='xconv6')
        x = gen_conv(x, 4*cnum, 3, rate=2, name='xconv7_atrous')
        x = gen_conv(x, 4*cnum, 3, rate=4, name='xconv8_atrous')
        x = gen_conv(x, 4*cnum, 3, rate=8, name='xconv9_atrous')
        x = gen_conv(x, 4*cnum, 3, rate=16, name='xconv10_atrous')
        x_hallu = x
        # attention branch
        # Starts again from xnow, i.e. the same input as the conv branch.
        x = gen_conv(xnow, cnum, 5, 1, name='pmconv1')
        x = gen_conv(x, cnum, 3, 2, name='pmconv2_downsample')
        x = gen_conv(x, 2*cnum, 3, 1, name='pmconv3')
        x = gen_conv(x, 4*cnum, 3, 2, name='pmconv4_downsample')
        x = gen_conv(x, 4*cnum, 3, 1, name='pmconv5')
        x = gen_conv(x, 4*cnum, 3, 1, name='pmconv6',
                     activation=tf.nn.relu)
        # The same feature map is passed as both of the first two arguments.
        x, offset_flow = contextual_attention(x, x, mask_s, 3, 1, rate=2)
        x = gen_conv(x, 4*cnum, 3, 1, name='pmconv9')
        x = gen_conv(x, 4*cnum, 3, 1, name='pmconv10')
        pm = x
        # Merge both branches along the channel axis before decoding.
        x = tf.concat([x_hallu, pm], axis=3)

        x = gen_conv(x, 4*cnum, 3, 1, name='allconv11')
        x = gen_conv(x, 4*cnum, 3, 1, name='allconv12')
        x = gen_deconv(x, 2*cnum, name='allconv13_upsample')
        x = gen_conv(x, 2*cnum, 3, 1, name='allconv14')
        x = gen_deconv(x, cnum, name='allconv15_upsample')
        x = gen_conv(x, cnum//2, 3, 1, name='allconv16')
        x = gen_conv(x, 3, 3, 1, activation=None, name='allconv17')
        x = tf.nn.tanh(x)
        x_stage2 = x
    return x_stage1, x_stage2, offset_flow
b_real, phase_in=phase, scope='b2a') # generate fake-unfixed-BF using real-fixed BF fake_b_dis = discriminator.discriminator(fake_b, training=phase, scope='b') fake_a_dis = discriminator.discriminator(fake_a, training=phase, scope='a') rec_a = generator.generator( fake_b, phase_in=phase, scope='b2a') # reconstructing unfixed-BF from fake-fixed-BF rec_b = generator.generator( fake_a, phase_in=phase, scope='a2b') # reconstructing fixed-BF from fake-unfixed-BF gen_a2b_loss = tf.reduce_mean( tf.losses.mean_squared_error(fake_b_dis, tf.ones_like(fake_b_dis))) gen_b2a_loss = tf.reduce_mean( tf.losses.mean_squared_error(fake_a_dis, tf.ones_like(fake_a_dis))) # cycle_loss_unfixed = tf.reduce_mean(tf.abs(a_real - rec_a)) # cycle_loss_fixed = tf.reduce_mean(tf.abs(b_real - rec_b)) cycle_loss_unfixed = tf.reduce_mean(tf.losses.mean_squared_error( a_real, rec_a)) cycle_loss_fixed = tf.reduce_mean(tf.losses.mean_squared_error(b_real, rec_b)) # final generator loss g_loss = (gen_a2b_loss + gen_b2a_loss) + 10 * ( cycle_loss_unfixed + cycle_loss_fixed) # FOR deepDeconv # g_loss = (gen_a2b_loss + gen_b2a_loss) + 10 * (cycle_loss_unfixed + cycle_loss_fixed) # FOR das_despeckle # g_loss = (gen_a2b_loss + gen_b2a_loss) + 20 * (cycle_loss_unfixed + cycle_loss_fixed) # FOR deconv_despeckle
def retrieve(features, retriever_beam_size, mode, params):
    """Retrieves the highest-scoring evidence blocks for one question.

    The question is tokenized, wrapped in [CLS] / [SEP], embedded with the
    retriever hub module and used to search the pre-encoded block
    embeddings. The selected blocks are re-scored with an inner product
    against the question embedding and their raw records are looked up.

    Args:
      features: dict with a "question" entry; a leading batch axis of size 1
        is added to it before tokenization.
      retriever_beam_size: number of blocks to retrieve.
      mode: estimator mode key; the hub module is loaded with the "train"
        tag only when this is `tf_estimator.ModeKeys.TRAIN`.
      params: dict providing "retriever_module_path", "block_records_path"
        and "num_block_records".

    Returns:
      A `RetrieverOutputs` with `logits` of shape [retriever_beam_size] and
      `blocks`, the block records gathered at the retrieved ids.
    """
    tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer(
        params["retriever_module_path"])

    question_token_ids = tokenizer.tokenize(
        tf.expand_dims(features["question"], 0))
    question_token_ids = tf.cast(
        question_token_ids.merge_dims(1, 2).to_tensor(), tf.int32)
    cls_token_id = vocab_lookup_table.lookup(tf.constant("[CLS]"))
    sep_token_id = vocab_lookup_table.lookup(tf.constant("[SEP]"))
    question_token_ids = tf.concat(
        [[[tf.cast(cls_token_id, tf.int32)]],
         question_token_ids,
         [[tf.cast(sep_token_id, tf.int32)]]], -1)

    retriever_module = hub.Module(
        params["retriever_module_path"],
        tags={"train"} if mode == tf_estimator.ModeKeys.TRAIN else {},
        trainable=True)

    # [1, projection_size]
    question_emb = retriever_module(
        inputs=dict(
            input_ids=question_token_ids,
            input_mask=tf.ones_like(question_token_ids),
            segment_ids=tf.zeros_like(question_token_ids)),
        signature="projected")

    block_emb, searcher = scann_utils.load_scann_searcher(
        var_name="block_emb",
        checkpoint_path=os.path.join(
            params["retriever_module_path"], "encoded", "encoded.ckpt"),
        num_neighbors=retriever_beam_size)

    # [1, retriever_beam_size]
    retrieved_block_ids, _ = searcher.search_batched(question_emb)

    # [1, retriever_beam_size, projection_size]
    retrieved_block_emb = tf.gather(block_emb, retrieved_block_ids)

    # Drop only the leading batch axis. An unqualified squeeze would also
    # remove the beam axis when retriever_beam_size == 1 and break the
    # shapes expected below.
    # [retriever_beam_size]
    retrieved_block_ids = tf.squeeze(retrieved_block_ids, axis=0)

    # [retriever_beam_size, projection_size]
    retrieved_block_emb = tf.squeeze(retrieved_block_emb, axis=0)

    # [1, retriever_beam_size]
    retrieved_logits = tf.matmul(
        question_emb, retrieved_block_emb, transpose_b=True)

    # [retriever_beam_size]
    retrieved_logits = tf.squeeze(retrieved_logits, 0)

    # Load all block records as a single batch and hold them in a local
    # variable so they can be gathered by id.
    blocks_dataset = tf.data.TFRecordDataset(
        params["block_records_path"], buffer_size=512 * 1024 * 1024)
    blocks_dataset = blocks_dataset.batch(
        params["num_block_records"], drop_remainder=True)
    blocks = tf.get_local_variable(
        "blocks",
        initializer=tf.data.experimental.get_single_element(blocks_dataset))
    retrieved_blocks = tf.gather(blocks, retrieved_block_ids)
    return RetrieverOutputs(logits=retrieved_logits, blocks=retrieved_blocks)
def sinc(inputs):
    """Returns sin(x) / x elementwise.

    Entries whose magnitude is below `epsilon` are replaced by `epsilon`
    before the division, which keeps the quotient finite at zero.
    """
    too_small = tf.abs(inputs) < epsilon
    floor_value = epsilon * tf.ones_like(inputs)
    safe_inputs = tf.where(too_small, floor_value, inputs)
    return tf.sin(safe_inputs) / safe_inputs
def _compute_word_overlap(context_ids, context_len, question_ids,
                          question_len, reduce_type, weighted, vocab_df):
    """Compute word overlap between question and context ids.

    Args:
      context_ids: <int32> [batch_size, num_contexts, max_context_len]
      context_len: <int32> [batch_size, num_contexts]
      question_ids: <int32> [batch_size, max_question_len]
      question_len: <int32> [batch_size]
      reduce_type: String for reduce type when computing overlap. Choices
        are:
          max - Allows at most one match per question word.
          sum - Sums over all matches for each question word.
      weighted: Boolean indicating whether or not to weight the overlap by
        IDF.
      vocab_df: Tensor of shape [vocab_size] for word frequency. Computed
        from the contexts of the current batch if not given. Zero entries
        are treated as one so the IDF division is always finite.

    Returns:
      overlap: <float32> [batch_size, num_contexts]

    Raises:
      ValueError: If invalid reduce_type is provided. (`ValueError` is an
        `Exception`, so existing `except Exception` handlers still apply.)
    """
    # Reject a bad reduce_type before any ops are added to the graph.
    if reduce_type not in ("max", "sum"):
        raise ValueError("Reduce type %s is invalid." % reduce_type)

    # <float> [batch_size, num_contexts, question_len, context_len]
    overlap = tf.to_float(
        _word_overlap_helper(
            question_ids=question_ids, context_ids=context_ids))

    # <float> [batch_size, question_len]
    question_mask = tf.sequence_mask(
        question_len, tf.shape(question_ids)[1], dtype=tf.float32)

    # <float> [batch_size, num_contexts, context_len]
    context_mask = tf.sequence_mask(
        context_len, tf.shape(context_ids)[2], dtype=tf.float32)

    # Zero out matches that involve padding on either side.
    overlap *= tf.expand_dims(tf.expand_dims(question_mask, 1), -1)
    overlap *= tf.expand_dims(context_mask, 2)

    if weighted:
        if vocab_df is None:
            # Use document-level IDF computed with respect to the current
            # batch.
            flat_context_ids = tf.to_int32(tf.reshape(context_ids, [-1]))

            # <float> [number of unique words]. minlength guarantees that
            # every question id is a valid index into the counts.
            vocab_df = tf.bincount(
                flat_context_ids,
                minlength=tf.reduce_max(question_ids) + 1,
                dtype=tf.float32)

        # Replace all zeros with ones. This applies to a caller-supplied
        # vocab_df as well, so the division below never yields inf / NaN.
        vocab_df = tf.where(
            tf.equal(vocab_df, 0), x=tf.ones_like(vocab_df), y=vocab_df)

        # <float> [batch_size, question_len] expanded to
        # <float> [batch_size, 1, question_len, 1]
        question_df = tf.gather(vocab_df, question_ids)
        question_df = tf.expand_dims(tf.expand_dims(question_df, 1), -1)

        # <float> [batch_size, num_contexts, question_len, context_len]
        overlap = tf.divide(overlap, question_df)

    if reduce_type == "max":
        # <float> [batch_size, num_contexts]
        return tf.reduce_sum(tf.reduce_max(overlap, axis=[3]), axis=[2])

    # reduce_type == "sum"
    # <float> [batch_size, num_contexts]
    return tf.reduce_sum(overlap, axis=[2, 3])
def clip_boxes(self, boxes):
    """Clamps box coordinates to the range [0, self._output_size - 1]."""
    upper_bound = self._output_size - 1

    # Raise negative coordinates to zero.
    is_negative = tf.less(boxes, 0)
    boxes = tf.where(is_negative, tf.zeros_like(boxes), boxes)

    # Lower coordinates beyond the last valid index to that index.
    is_too_large = tf.greater(boxes, upper_bound)
    boxes = tf.where(is_too_large, upper_bound * tf.ones_like(boxes), boxes)
    return boxes
def make_global_local_transformer_side_inputs(
    long_paragraph_breakpoints: tf.Tensor,
    long_paragraph_ids: tf.Tensor,
    long_sentence_ids: tf.Tensor,
    global_paragraph_breakpoints: tf.Tensor,
    local_radius: int,
    relative_pos_max_distance: int,
    use_hard_g2l_mask: bool = False,
    ignore_hard_g2l_mask: Optional[tf.Tensor] = None,
    use_hard_l2g_mask: bool = False,
    ignore_hard_l2g_mask: Optional[tf.Tensor] = None,
    flat_sequence: bool = False,
    l2g_linked_ids: Optional[tf.Tensor] = None,
    name: Optional[Text] = None
) -> input_utils.GlobalLocalTransformerSideInputs:
  """Makes attention masks and relative ids for l2l, l2g, g2g, g2l for QA tasks.

  When `use_hard_g2l_mask=True` and `use_hard_l2g_mask=False`, the resulting
  attention pattern is similar to Figure 3b of the paper for representing
  a set of (unordered) contexts ("paragraphs" here), except instead of
  defining a new relative position label between a global paragraph token and
  its global sentence tokens, we just place each global paragraph token as
  the first token before subsequent global sentence tokens belonging to it.

  Note: This function assumes that we don't pack multiple examples into a single
  example, which is only done for pre-training.

  See `GlobalLocalTransformerLayers.call()` in `layers/transformer.py` for a
  description of the 8 side inputs.

  Args:
    long_paragraph_breakpoints: <int32>[batch_size, long_seq_len] Tensor of
      `0`s and `1`s indicating paragraph boundaries in the long input. The
      sequence length (axis 1) must be statically known.
    long_paragraph_ids: <int32>[batch_size, long_seq_len] Tensor of ids
      indicating the paragraph each token belongs to.
    long_sentence_ids: <int32>[batch_size, long_seq_len] Tensor of ids
      indicating which sentence each token belongs to.
    global_paragraph_breakpoints: <int32>[batch_size, global_seq_len] Tensor of
      `0`s and `1`s indicating paragraph boundaries in the global input. The
      sequence length (axis 1) must be statically known.
    local_radius: How many tokens to the left/right for input tokens to locally
      self-attend to. For example, a value of 1 would allow each token to only
      attend to 1 token to the left and 1 token to the right of it.
    relative_pos_max_distance: Maximum distance to use for relative position
      representations. All larger distances will be clipped to this value. Use
      0 to skip relative position representations entirely.
    use_hard_g2l_mask: If True, global tokens only attend to tokens of the
      corresponding sentences in the long input. If False, global tokens attend
      to all sentences within the corresponding global example.
    ignore_hard_g2l_mask: <int32>[batch_size, global_seq_len] Tensor of `0`s and
      `1`s indicating the indices in the global input which should ignore the
      `use_hard_g2l_mask`. `1` is for ignoring the hard mask and these tokens
      essentially attend to everything (except for padding tokens) in the long
      input. This can be useful to force some tokens (e.g, CLS) to attend to
      everything in the long input even though they don't necessarily map to
      anything in the long input via sentence / paragraph ids etc. This tensor
      will be applicable only when `use_hard_g2l_mask` is enabled. Defaults to
      all zeros when not given.
    use_hard_l2g_mask: If True, long tokens only attend to tokens of the
      corresponding global tokens. If False, long tokens attend to all the
      global tokens within the corresponding global example.
    ignore_hard_l2g_mask: <int32>[batch_size, long_seq_len] Tensor of `0`s and
      `1`s indicating the indices in the long input which should ignore the
      `use_hard_l2g_mask`. `1` is for ignoring the hard mask and these tokens
      essentially attend to everything (except for padding tokens) in the global
      input. This can be useful to force some tokens (e.g, query tokens) to
      attend to everything in the global input even though they don't
      necessarily map to anything in the global input via sentence / paragraph
      ids etc. This tensor will be applicable only when `use_hard_l2g_mask` is
      enabled. Defaults to all zeros when not given.
    flat_sequence: If True, the attention masks / relative attention ids would
      be computed assuming the default ETC setting where there is not any
      structure (except for having the notion of a "sentence"). In this mode
      `long_paragraph_ids`, `ignore_hard_g2l_mask`, `ignore_hard_l2g_mask` and
      `l2g_linked_ids` are not used.
    l2g_linked_ids: <int32>[batch_size, long_seq_len] Tensor specifying the long
      tokens which should be linked to the global tokens. If the input is
      [[-1, -1, 0, 1, 1, -1]], then 2nd long token would be linked to 0-th
      global token and 3rd, 4-th long tokens would be linked to the 1st global
      token. Only consulted when `relative_pos_max_distance > 0`.
    name: A name for the operation (optional).

  Returns:
    A `GlobalLocalTransformerSideInputs` with all relevant tensors set. The
    four relative attention id tensors are `None` when
    `relative_pos_max_distance` is 0 (non-flat mode).
  """
  with tf.name_scope(name or 'make_global_local_transformer_side_inputs'):

    # 1 at and before the last breakpoint of each row, 0 after it (presumably
    # the padding). Only consumed by the `flat_sequence` branch below.
    long_input_mask = tf.minimum(
        tf.cumsum(long_paragraph_breakpoints, axis=-1, reverse=True), 1)
    global_input_mask = tf.minimum(
        tf.cumsum(global_paragraph_breakpoints, axis=-1, reverse=True), 1)

    if flat_sequence:
      # Here we don't use any structure in the input i.e it falls back to
      # the default ETC setting where:
      # a) everything in the long can attend to everything in the global and
      #    vice-versa.
      # b) everything in global attends to everything in global.
      # c) everything in long can attend to everything in long that is within
      #    the local radius
      #
      # Note that there is a small caveat here: The paragraph / cls level tokens
      # in the global input would be orphaned (i.e they wouldn't be linked to
      # anything in the long), but that should be probably
      # okay as they still attend to everything in the global.
      #
      # We don't have any packing here, so there is only one example per row.
      # The input masks computed above are handed over as the example ids:
      # every position up to the last breakpoint shares id `1` and everything
      # after it gets id `0`.
      return (input_utils.
              make_global_local_transformer_side_inputs_from_example_ids(
                  long_example_ids=long_input_mask,
                  global_example_ids=global_input_mask,
                  sentence_ids=long_sentence_ids,
                  local_radius=local_radius,
                  relative_pos_max_distance=relative_pos_max_distance,
                  use_hard_g2l_mask=use_hard_g2l_mask,
                  use_hard_l2g_mask=use_hard_l2g_mask))

    # Make paragraphs not attend to other paragraphs in the long input.
    long_paragraph_breakpoints = tf.convert_to_tensor(
        long_paragraph_breakpoints)
    # The reverse cumulative sum gives every token of a paragraph the same
    # value, which serves as that paragraph's segment id.
    long_paragraph_breakpoint_segments = tf.cumsum(
        long_paragraph_breakpoints, axis=-1, reverse=True)

    l2l_att_mask = feature_utils.make_local_segmented_att_mask(
        long_paragraph_breakpoint_segments, local_radius)

    global_paragraph_breakpoints = tf.convert_to_tensor(
        global_paragraph_breakpoints)
    global_paragraph_breakpoint_segments = tf.cumsum(
        global_paragraph_breakpoints, axis=-1, reverse=True)

    # For g2l, g2g and l2g, we can have everything attend everything else.
    # So we can have attention tokens as all `1`s and account for padding via
    # a mask.
    def _make_input_mask_from_breakpoints(
        breakpoint_segments: tf.Tensor) -> tf.Tensor:
      # Clamps segment ids to at most 1: any position with a nonzero segment id
      # becomes `1`, positions with segment id `0` stay `0`.
      return tf.minimum(
          tf.cast(1, dtype=breakpoint_segments.dtype), breakpoint_segments)

    long_attention_tokens = _make_input_mask_from_breakpoints(
        long_paragraph_breakpoint_segments)

    # Ignore the padding tokens.
    global_attention_tokens = _make_input_mask_from_breakpoints(
        global_paragraph_breakpoint_segments)

    g2g_att_mask = feature_utils.make_segmented_att_mask(
        global_attention_tokens)
    # [batch_size, long_seq_len, global_seq_len]: 1 where the long and global
    # positions carry the same 0/1 attention token.
    l2g_att_mask = tf.cast(
        tf.equal(long_attention_tokens[:, :, tf.newaxis],
                 global_attention_tokens[:, tf.newaxis, :]), tf.int32)
    g2l_att_mask = tf.transpose(l2g_att_mask, perm=[0, 2, 1])

    # Sequence lengths must be static; the batch size is taken dynamically.
    long_seq_len = long_paragraph_breakpoints.shape.as_list()[1]
    assert long_seq_len is not None

    global_seq_len = global_paragraph_breakpoints.shape.as_list()[1]
    assert global_seq_len is not None

    batch_size = tf.shape(long_paragraph_breakpoints)[0]
    assert batch_size is not None

    # Index of every global position, compared below against the sentence /
    # paragraph / linked ids carried by the long tokens.
    global_range = tf.range(global_seq_len, dtype=long_sentence_ids.dtype)
    long_ones = tf.ones_like(long_sentence_ids)
    global_ones = tf.ones_like(global_paragraph_breakpoints)

    if use_hard_g2l_mask:
      if ignore_hard_g2l_mask is None:
        ignore_hard_g2l_mask = tf.zeros_like(global_paragraph_breakpoints)
      else:
        ignore_hard_g2l_mask = tf.convert_to_tensor(ignore_hard_g2l_mask)

      # Have each global token attend to just one sentence instead of having
      # it attend to all the sentences within a global example.
      sentence_hard_g2l_att_mask = tf.equal(
          global_range[tf.newaxis, :, tf.newaxis],
          long_sentence_ids[:, tf.newaxis, :])

      # Also have paragraph global tokens attend to the corresponding long
      # paragraphs.
      paragraph_hard_g2l_att_mask = tf.equal(
          global_range[tf.newaxis, :, tf.newaxis],
          long_paragraph_ids[:, tf.newaxis, :])

      # True for the whole row of every global token flagged with `1` in
      # `ignore_hard_g2l_mask`.
      ignore_hard_g2l_att_mask = tf.equal(
          ignore_hard_g2l_mask[:, :, tf.newaxis],
          long_ones[:, tf.newaxis, :])

      # It's possible that certain global tokens, although linked to a long
      # sentence, might still be present in `ignore_hard_g2l_mask`. Such tokens
      # should also attend to everything in the long.
      hard_g2l_att_mask = tf.math.logical_or(
          tf.math.logical_or(sentence_hard_g2l_att_mask,
                             paragraph_hard_g2l_att_mask),
          ignore_hard_g2l_att_mask)

      hard_g2l_att_mask = tf.cast(hard_g2l_att_mask, dtype=tf.int32)
      # Intersect with the soft mask so padding stays masked out.
      g2l_att_mask *= hard_g2l_att_mask

    if use_hard_l2g_mask:
      if ignore_hard_l2g_mask is None:
        ignore_hard_l2g_mask = tf.zeros_like(long_sentence_ids)
      else:
        ignore_hard_l2g_mask = tf.convert_to_tensor(ignore_hard_l2g_mask)

      # Have each long token attend to just the corresponding global token
      # instead of having it attend to all the global tokens within a
      # global example.
      sentence_hard_l2g_att_mask = tf.equal(
          long_sentence_ids[:, :, tf.newaxis],
          global_range[tf.newaxis, tf.newaxis, :])

      # Also have long tokens attend to the global token whose index equals
      # their paragraph id.
      paragraph_hard_l2g_att_mask = tf.equal(
          long_paragraph_ids[:, :, tf.newaxis],
          global_range[tf.newaxis, tf.newaxis, :])

      # True for the whole row of every long token flagged with `1` in
      # `ignore_hard_l2g_mask`.
      ignore_hard_l2g_att_mask = tf.equal(
          ignore_hard_l2g_mask[:, :, tf.newaxis],
          global_ones[:, tf.newaxis, :])

      # It's possible that certain long tokens, although linked to global tokens
      # might still be present in `ignore_hard_l2g_mask`. Such tokens
      # should also attend to everything in the global.
      hard_l2g_att_mask = tf.math.logical_or(
          tf.math.logical_or(sentence_hard_l2g_att_mask,
                             paragraph_hard_l2g_att_mask),
          ignore_hard_l2g_att_mask)

      hard_l2g_att_mask = tf.cast(hard_l2g_att_mask, dtype=tf.int32)
      # Intersect with the soft mask so padding stays masked out.
      l2g_att_mask *= hard_l2g_att_mask

    # Relative attention ids stay `None` unless relative positions are enabled.
    l2l_relative_att_ids = None
    g2g_relative_att_ids = None
    l2g_relative_att_ids = None
    g2l_relative_att_ids = None

    if relative_pos_max_distance > 0:
      relative_pos_generator = feature_utils.RelativePositionGenerator(
          relative_pos_max_distance)

      l2l_relative_att_ids = relative_pos_generator.make_local_relative_att_ids(
          seq_len=long_seq_len,
          local_radius=local_radius,
          batch_size=batch_size)

      # True where a long token's sentence id equals the global position.
      sentence_l2g_relative_att_ids = tf.equal(
          long_sentence_ids[:, :, tf.newaxis],
          global_range[tf.newaxis, tf.newaxis, :])

      # Add relative att ids for global paragraph level tokens.
      paragraph_l2g_relative_att_ids = tf.equal(
          global_range[tf.newaxis, tf.newaxis, :],
          long_paragraph_ids[:, :, tf.newaxis])

      if l2g_linked_ids is None:
        # No explicit links: an all-False tensor of the same shape.
        l2g_linked_relative_att_ids = tf.zeros_like(
            paragraph_l2g_relative_att_ids)
      else:
        l2g_linked_ids = tf.convert_to_tensor(l2g_linked_ids)
        l2g_linked_relative_att_ids = tf.equal(
            global_range[tf.newaxis, tf.newaxis, :],
            l2g_linked_ids[:, :, tf.newaxis])

      # 1 where the long token is tied to the global token by sentence id,
      # paragraph id or an explicit link; 0 otherwise.
      l2g_relative_att_ids = tf.cast(
          tf.math.logical_or(
              l2g_linked_relative_att_ids,
              tf.math.logical_or(sentence_l2g_relative_att_ids,
                                 paragraph_l2g_relative_att_ids)),
          dtype=tf.int32)

      g2l_relative_att_ids = tf.transpose(l2g_relative_att_ids, perm=[0, 2, 1])

      # For fused attention, l2l and l2g share the same relative vocabulary, as
      # do g2g and g2l, so we add an offset for l2g and g2l so their original
      # 0/1 ids don't collide with l2l and g2g relative position ids.
      l2g_relative_att_ids += relative_pos_generator.relative_vocab_size
      g2l_relative_att_ids += relative_pos_generator.relative_vocab_size

      g2g_relative_att_ids = relative_pos_generator.make_relative_att_ids(
          seq_len=global_seq_len, batch_size=batch_size)

      # We used up 2 ids to account for the collision in fused attention as
      # mentioned above. Hence the +2.
      g2g_max_rel_id = relative_pos_generator.relative_vocab_size + 2
      g2g_relative_att_ids = (
          feature_utils.overwrite_relative_att_ids_outside_segments(
              rel_att_ids=g2g_relative_att_ids,
              segment_ids=global_paragraph_breakpoint_segments,
              overwrite_value=g2g_max_rel_id))

    return input_utils.GlobalLocalTransformerSideInputs(
        l2l_att_mask=l2l_att_mask,
        g2g_att_mask=g2g_att_mask,
        l2g_att_mask=l2g_att_mask,
        g2l_att_mask=g2l_att_mask,
        l2l_relative_att_ids=l2l_relative_att_ids,
        g2g_relative_att_ids=g2g_relative_att_ids,
        l2g_relative_att_ids=l2g_relative_att_ids,
        g2l_relative_att_ids=g2l_relative_att_ids)
def main(argv):
  """Fits a cubic spline to the log-partition function and saves it to disk.

  The script first tabulates the partition function on a dense grid of x
  values, then fits spline values and tangents (one pair per knot) by
  minimizing the maximum absolute residual against the tabulated
  log-partition, and finally writes `x_scale`, the fitted values and the
  fitted tangents to './data/partition_spline.npz'. Progress and final errors
  are printed to stdout.

  Args:
    argv: Command-line arguments; anything beyond the program name is rejected.

  Raises:
    app.UsageError: If extra command-line arguments are given.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  # Parameters governing how the x coordinate of the spline will be laid out.
  # We will construct a spline with knots at
  #   [0 : 1 / x_scale : x_max],
  # by fitting it to values sampled at
  #   [0 : 1 / (x_scale * redundancy) : x_max]
  x_max = 12
  x_scale = 1024
  redundancy = 4  # Must be >= 2 for the spline to be useful.

  spline_spacing = 1. / (x_scale * redundancy)
  x_knots = np.arange(
      0, x_max + spline_spacing, spline_spacing, dtype=np.float64)
  table = []
  with tf.Session() as sess:
    x_knot_ph = tf.placeholder(dtype=tf.float64, shape=())
    alpha_ph = distribution.inv_partition_spline_curve(x_knot_ph)
    partition_ph = numerical_base_partition_function(alpha_ph)
    # We iterate over knots, and for each knot recover the alpha value
    # corresponding to that knot with inv_partition_spline_curve(), and then
    # with that alpha we accurately approximate its partition function using
    # numerical_base_partition_function().
    for x_knot in x_knots:
      alpha, partition = sess.run((alpha_ph, partition_ph),
                                  {x_knot_ph: x_knot})
      table.append((x_knot, alpha, partition))
      print(table[-1])

  # Columns: x, alpha, partition.
  table = np.array(table)
  x = table[:, 0]
  alpha = table[:, 1]
  y_gt = np.log(table[:, 2])

  # We grab the values from the true log-partition table that correspond to
  # knots, by looking for where x * x_scale is an integer.
  mask = np.abs(np.round(x * x_scale) - (x * x_scale)) <= 1e-8
  values = y_gt[mask]

  # Initialize `tangents` using a central differencing scheme. The table is
  # padded at both ends by linear extrapolation so the end knots also get a
  # central difference.
  values_pad = np.concatenate([[values[0] - values[1] + values[0]], values,
                               [values[-1] - values[-2] + values[-1]]], 0)
  tangents = (values_pad[2:] - values_pad[:-2]) / 2.

  # Construct the spline's value and tangent TF variables, constraining the last
  # knot to have a fixed value Z(infinity) and a tangent of zero.
  n = len(values)
  is_last_knot = np.arange(n) == (n - 1)
  # `dtype` must be given by keyword: the second positional parameter of
  # tf.Variable is `trainable`, not the dtype.
  tangents = tf.Variable(tangents, dtype=tf.float64)
  tangents = tf.where(is_last_knot, tf.zeros_like(tangents), tangents)

  values = tf.Variable(values, dtype=tf.float64)
  values = tf.where(
      is_last_knot, tf.ones_like(tangents) * 0.70526025442689566, values)

  # Interpolate into the spline.
  y = cubic_spline.interpolate1d(x * x_scale, values, tangents)

  # We minimize the maximum residual, which makes for a very ugly optimization
  # problem but appears to work in practice, and is what we most care about.
  loss = tf.reduce_max(tf.abs(y - y_gt))

  # Fit the spline.
  num_iters = 10001

  with tf.Session() as sess:
    global_step = tf.Variable(0, trainable=False)

    opt = tf.train.MomentumOptimizer(learning_rate=1e-9, momentum=0.99)
    step = opt.minimize(loss, global_step=global_step)
    sess.run(tf.global_variables_initializer())

    for ii in range(num_iters):
      # The fetched values of the final iteration are the ones reported and
      # saved below.
      _, i_loss, i_values, i_tangents, i_y = sess.run(
          [step, loss, values, tangents, y])
      if (ii % 200) == 0:
        print('%5d: %e' % (ii, i_loss))

  mask = alpha <= 4
  print('Max Error (a <= 4): %e' % np.max(np.abs(i_y[mask] - y_gt[mask])))
  print('Max Error: %e' % np.max(np.abs(i_y - y_gt)))

  # Save the spline to disk.
  np.savez(
      './data/partition_spline.npz',
      x_scale=x_scale,
      values=i_values,
      tangents=i_tangents)
def call(self, x):
    """Computes the detection loss for one output scale.

    The loss is the sum of squared xy, wh and confidence errors plus a
    softmax cross-entropy class term, each reduced over every axis except the
    batch axis. Several running statistics (average objectness, IoU, recall,
    ...) are attached to the loss through `tf.Print`.

    Reads `self.cell_grid`, `self.anchors`, `self.ignore_thresh`,
    `self.warmup_batches`, `self.xywh_scale`, `self.obj_scale`,
    `self.noobj_scale`, `self.class_scale` and `self.grid_scale`.

    Args:
        x: Sequence `(input_image, y_pred, y_true, true_boxes)`.
            `y_pred` is reshaped to
            [batch, grid_h, grid_w, 3, 4+1+nb_class]; `y_true` is indexed with
            the same layout. `true_boxes` presumably holds all ground-truth
            boxes of the image in a shape broadcastable against the
            predictions expanded at axis 4 -- TODO confirm against the caller.

    Returns:
        The per-example loss (one value per batch element) multiplied by
        `self.grid_scale`.
    """
    input_image, y_pred, y_true, true_boxes = x

    # adjust the shape of the y_predict [batch, grid_h, grid_w, 3, 4+1+nb_class]
    y_pred = tf.reshape(
        y_pred,
        tf.concat([tf.shape(input=y_pred)[:3], tf.constant([3, -1])], axis=0))

    # initialize the masks: channel 4 of the ground truth is the objectness flag
    object_mask = tf.expand_dims(y_true[..., 4], 4)

    # the variable to keep track of number of batches processed
    # NOTE(review): a new variable is created every time call() runs --
    # confirm that call() is only traced once per layer.
    batch_seen = tf.Variable(0.)

    # compute grid factor and net factor (both ordered as [width, height])
    grid_h = tf.shape(input=y_true)[1]
    grid_w = tf.shape(input=y_true)[2]
    grid_factor = tf.reshape(tf.cast([grid_w, grid_h], tf.float32),
                             [1, 1, 1, 1, 2])

    net_h = tf.shape(input=input_image)[1]
    net_w = tf.shape(input=input_image)[2]
    net_factor = tf.reshape(tf.cast([net_w, net_h], tf.float32),
                            [1, 1, 1, 1, 2])

    """ Adjust prediction """
    pred_box_xy = (self.cell_grid[:, :grid_h, :grid_w, :, :] +
                   tf.sigmoid(y_pred[..., :2]))  # sigma(t_xy) + c_xy
    pred_box_wh = y_pred[..., 2:4]  # t_wh
    pred_box_conf = tf.expand_dims(tf.sigmoid(y_pred[..., 4]),
                                   4)  # objectness squashed into (0, 1)
    pred_box_class = y_pred[..., 5:]  # raw class logits (softmax is applied in the loss)

    """ Adjust ground truth """
    true_box_xy = y_true[..., 0:2]  # (sigma(t_xy) + c_xy)
    true_box_wh = y_true[..., 2:4]  # t_wh
    true_box_conf = tf.expand_dims(y_true[..., 4], 4)
    true_box_class = tf.argmax(input=y_true[..., 5:], axis=-1)  # class index from the per-class scores

    """ Compare each predicted box to all true boxes """
    # initially, drag all objectness of all boxes to 0
    conf_delta = pred_box_conf - 0

    # then, ignore the boxes which have good overlap with some true box
    # (xy is divided by the grid size, wh by the network input size)
    true_xy = true_boxes[..., 0:2] / grid_factor
    true_wh = true_boxes[..., 2:4] / net_factor

    true_wh_half = true_wh / 2.
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half

    # extra axis 4 so every prediction is compared with every true box
    pred_xy = tf.expand_dims(pred_box_xy / grid_factor, 4)
    pred_wh = tf.expand_dims(
        tf.exp(pred_box_wh) * self.anchors / net_factor, 4)

    pred_wh_half = pred_wh / 2.
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half

    intersect_mins = tf.maximum(pred_mins, true_mins)
    intersect_maxes = tf.minimum(pred_maxes, true_maxes)

    intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    true_areas = true_wh[..., 0] * true_wh[..., 1]
    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = tf.truediv(intersect_areas, union_areas)

    # keep the no-object penalty only where the best IoU is below the threshold
    best_ious = tf.reduce_max(input_tensor=iou_scores, axis=4)
    conf_delta *= tf.expand_dims(
        tf.cast(best_ious < self.ignore_thresh, dtype=tf.float32), 4)

    """ Compute some online statistics """
    # IoU between each prediction and the ground-truth box assigned to the
    # same cell/anchor slot
    true_xy = true_box_xy / grid_factor
    true_wh = tf.exp(true_box_wh) * self.anchors / net_factor

    true_wh_half = true_wh / 2.
    true_mins = true_xy - true_wh_half
    true_maxes = true_xy + true_wh_half

    pred_xy = pred_box_xy / grid_factor
    pred_wh = tf.exp(pred_box_wh) * self.anchors / net_factor

    pred_wh_half = pred_wh / 2.
    pred_mins = pred_xy - pred_wh_half
    pred_maxes = pred_xy + pred_wh_half

    intersect_mins = tf.maximum(pred_mins, true_mins)
    intersect_maxes = tf.minimum(pred_maxes, true_maxes)
    intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    true_areas = true_wh[..., 0] * true_wh[..., 1]
    pred_areas = pred_wh[..., 0] * pred_wh[..., 1]

    union_areas = pred_areas + true_areas - intersect_areas
    iou_scores = tf.truediv(intersect_areas, union_areas)
    iou_scores = object_mask * tf.expand_dims(iou_scores, 4)

    # the 1e-3 in the denominators below guards against empty counts
    count = tf.reduce_sum(input_tensor=object_mask)
    count_noobj = tf.reduce_sum(input_tensor=1 - object_mask)
    detect_mask = tf.cast((pred_box_conf * object_mask) >= 0.5,
                          dtype=tf.float32)
    class_mask = tf.expand_dims(
        tf.cast(tf.equal(tf.argmax(input=pred_box_class, axis=-1),
                         true_box_class),
                dtype=tf.float32), 4)
    recall50 = tf.reduce_sum(
        input_tensor=tf.cast(iou_scores >= 0.5, dtype=tf.float32) *
        detect_mask * class_mask) / (count + 1e-3)
    recall75 = tf.reduce_sum(
        input_tensor=tf.cast(iou_scores >= 0.75, dtype=tf.float32) *
        detect_mask * class_mask) / (count + 1e-3)
    avg_iou = tf.reduce_sum(input_tensor=iou_scores) / (count + 1e-3)
    avg_obj = tf.reduce_sum(input_tensor=pred_box_conf *
                            object_mask) / (count + 1e-3)
    avg_noobj = tf.reduce_sum(input_tensor=pred_box_conf *
                              (1 - object_mask)) / (count_noobj + 1e-3)
    avg_cat = tf.reduce_sum(input_tensor=object_mask *
                            class_mask) / (count + 1e-3)

    """ Warm-up training """
    batch_seen = tf.assign_add(batch_seen, 1.)

    # During warm-up, cells without an object get a target at the cell centre
    # and the xy/wh loss is applied to every cell; afterwards only cells
    # that contain an object contribute.
    true_box_xy, true_box_wh, xywh_mask = tf.cond(
        pred=tf.less(batch_seen, self.warmup_batches + 1),
        true_fn=lambda: [
            true_box_xy +
            (0.5 + self.cell_grid[:, :grid_h, :grid_w, :, :]) *
            (1 - object_mask),
            true_box_wh + tf.zeros_like(true_box_wh) * (1 - object_mask),
            tf.ones_like(object_mask)
        ],
        false_fn=lambda: [true_box_xy, true_box_wh, object_mask])

    """ Compare each true box to all anchor boxes """
    wh_scale = tf.exp(true_box_wh) * self.anchors / net_factor
    wh_scale = tf.expand_dims(
        2 - wh_scale[..., 0] * wh_scale[..., 1],
        axis=4)  # the smaller the box, the bigger the scale

    xy_delta = xywh_mask * (pred_box_xy - true_box_xy) * wh_scale * self.xywh_scale
    wh_delta = xywh_mask * (pred_box_wh - true_box_wh) * wh_scale * self.xywh_scale
    # object cells are pulled towards the true confidence; the remaining cells
    # use the (possibly ignored) no-object delta computed above
    conf_delta = object_mask * (
        pred_box_conf - true_box_conf) * self.obj_scale + (
            1 - object_mask) * conf_delta * self.noobj_scale
    class_delta = object_mask * \
        tf.expand_dims(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class), 4) * \
        self.class_scale

    # reduce over grid_h, grid_w, anchor and channel axes; keep the batch axis
    loss_xy = tf.reduce_sum(input_tensor=tf.square(xy_delta),
                            axis=list(range(1, 5)))
    loss_wh = tf.reduce_sum(input_tensor=tf.square(wh_delta),
                            axis=list(range(1, 5)))
    loss_conf = tf.reduce_sum(input_tensor=tf.square(conf_delta),
                              axis=list(range(1, 5)))
    loss_class = tf.reduce_sum(input_tensor=class_delta,
                               axis=list(range(1, 5)))

    loss = loss_xy + loss_wh + loss_conf + loss_class

    # chain the statistics onto the loss so they are printed whenever the loss
    # is evaluated
    loss = tf.Print(loss, [grid_h, avg_obj],
                    message='avg_obj \t\t',
                    summarize=1000)
    loss = tf.Print(loss, [grid_h, avg_noobj],
                    message='avg_noobj \t\t',
                    summarize=1000)
    loss = tf.Print(loss, [grid_h, avg_iou],
                    message='avg_iou \t\t',
                    summarize=1000)
    loss = tf.Print(loss, [grid_h, avg_cat],
                    message='avg_cat \t\t',
                    summarize=1000)
    loss = tf.Print(loss, [grid_h, recall50],
                    message='recall50 \t',
                    summarize=1000)
    loss = tf.Print(loss, [grid_h, recall75],
                    message='recall75 \t',
                    summarize=1000)
    loss = tf.Print(loss, [grid_h, count],
                    message='count \t',
                    summarize=1000)
    loss = tf.Print(loss, [
        grid_h,
        tf.reduce_sum(input_tensor=loss_xy),
        tf.reduce_sum(input_tensor=loss_wh),
        tf.reduce_sum(input_tensor=loss_conf),
        tf.reduce_sum(input_tensor=loss_class)
    ],
                    message='loss xy, wh, conf, class: \t',
                    summarize=1000)

    return loss * self.grid_scale
def multilevel_roi_align(features, boxes, box_levels, output_size,
                         num_samples_per_cell_y=1, num_samples_per_cell_x=1,
                         align_corners=False, extrapolation_value=0.0,
                         scope=None):
  """Applies RoI Align op and returns feature for boxes.

  Given multiple features maps indexed by different levels, and a set of boxes
  where each box is mapped to a certain level, this function selectively crops
  and resizes boxes from the corresponding feature maps.

  We follow the RoI Align technique in https://arxiv.org/pdf/1703.06870.pdf
  figure 3. Specifically, each box is subdivided uniformly into a grid
  consisting of output_size[0] x output_size[1] rectangular cells. Within each
  cell we select `num_samples_per_cell_y * num_samples_per_cell_x` points
  uniformly and compute feature values using bilinear interpolation. Finally,
  we average pool the interpolated values in each cell to obtain a
  [output_size[0], output_size[1], channels] feature.

  If `align_corners` is true, sampling points are uniformly spread such that
  corner points exactly overlap corners of the boxes.

  In this function we also follow the convention of treating feature pixels as
  point objects with no spatial extent.

  Args:
    features: A list of 4D float tensors of shape [batch_size, max_height,
      max_width, channels] containing features. Note that each feature map must
      have the same number of channels.
    boxes: A 3D float tensor of shape [batch_size, num_boxes, 4] containing
      boxes of the form [ymin, xmin, ymax, xmax] in normalized coordinates.
    box_levels: A 2D int32 tensor of shape [batch_size, num_boxes]
      representing the feature level index for each box.
    output_size: A list of two integers [size_y, size_x] indicating the output
      feature size for each box.
    num_samples_per_cell_y: Number of grid points to sample along y axis in each
      cell.
    num_samples_per_cell_x: Number of grid points to sample along x axis in each
      cell.
    align_corners: Whether to align the corner grid points exactly with box
      corners.
    extrapolation_value: a float value to use for extrapolation.
    scope: Scope name to use for this op.

  Returns:
    A 5D float tensor of shape [batch_size, num_boxes, output_size[0],
    output_size[1], channels] representing the cropped features.
  """
  with tf.name_scope(scope, 'MultiLevelRoIAlign'):
    # Stack the per-level maps into one padded tensor; the level and channel
    # counts are read statically, the remaining sizes dynamically.
    features, true_feature_shapes = pad_to_max_size(features)
    batch_size = tf.shape(features)[0]
    num_levels = features.get_shape().as_list()[1]
    max_feature_height = tf.shape(features)[2]
    max_feature_width = tf.shape(features)[3]
    num_filters = features.get_shape().as_list()[4]
    num_boxes = tf.shape(boxes)[1]

    # Convert boxes to absolute co-ordinates.
    true_feature_shapes = tf.cast(true_feature_shapes, dtype=boxes.dtype)
    true_feature_shapes = tf.gather(true_feature_shapes, box_levels)
    boxes *= tf.concat([true_feature_shapes - 1] * 2, axis=-1)

    size_y = output_size[0] * num_samples_per_cell_y
    size_x = output_size[1] * num_samples_per_cell_x
    box_grid_y, box_grid_x = box_grid_coordinate_vectors(
        boxes, size_y=size_y, size_x=size_x, align_corners=align_corners)
    (feature_grid_y0, feature_grid_x0, feature_grid_y1,
     feature_grid_x1) = feature_grid_coordinate_vectors(box_grid_y, box_grid_x)
    # Interleave the two neighbouring feature coordinates of every sample
    # point, giving 2 * size entries per axis.
    feature_grid_y = tf.reshape(
        tf.stack([feature_grid_y0, feature_grid_y1], axis=3),
        [batch_size, num_boxes, -1])
    feature_grid_x = tf.reshape(
        tf.stack([feature_grid_x0, feature_grid_x1], axis=3),
        [batch_size, num_boxes, -1])
    feature_coordinates = ravel_indices(feature_grid_y, feature_grid_x,
                                        num_levels, max_feature_height,
                                        max_feature_width, box_levels)
    # Coordinates flagged invalid are replaced by -1 before the gather.
    valid_indices = _valid_indicator(feature_grid_y, feature_grid_x,
                                     true_feature_shapes)
    feature_coordinates = tf.where(valid_indices, feature_coordinates,
                                   -1 * tf.ones_like(feature_coordinates))
    flattened_features = tf.reshape(features, [-1, num_filters])
    flattened_feature_values = _gather_valid_indices(flattened_features,
                                                     feature_coordinates,
                                                     extrapolation_value)
    features_per_box = tf.reshape(
        flattened_feature_values,
        [batch_size, num_boxes, size_y * 2, size_x * 2, num_filters])

    # Cast tensors into dtype of features.
    box_grid_y = tf.cast(box_grid_y, dtype=features_per_box.dtype)
    box_grid_x = tf.cast(box_grid_x, dtype=features_per_box.dtype)
    feature_grid_y0 = tf.cast(feature_grid_y0, dtype=features_per_box.dtype)
    feature_grid_x0 = tf.cast(feature_grid_x0, dtype=features_per_box.dtype)

    # RoI Align operation is a bilinear interpolation of four
    # neighboring feature points f0, f1, f2, and f3 onto point y, x given by
    # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
    #                       [f10, f11]]
    #
    # Unrolling the matrix multiplies gives us:
    # f(y, x) = (hy * hx) f00 + (hy * lx) f01 + (ly * hx) f10 + (lx * ly) f11
    # f(y, x) = w00 * f00 + w01 * f01 + w10 * f10 + w11 * f11
    #
    # This can be computed by applying pointwise multiplication and sum_pool in
    # a 2x2 window.
    ly = box_grid_y - feature_grid_y0
    lx = box_grid_x - feature_grid_x0
    hy = 1.0 - ly
    hx = 1.0 - lx

    kernel_y = tf.reshape(
        tf.stack([hy, ly], axis=3), [batch_size, num_boxes, size_y * 2, 1])

    kernel_x = tf.reshape(
        tf.stack([hx, lx], axis=3), [batch_size, num_boxes, 1, size_x * 2])

    # Multiplier 4 is to make tf.nn.avg_pool behave like sum_pool.
    interpolation_kernel = kernel_y * kernel_x * 4

    # Interpolate the gathered features with computed interpolation kernels.
    # The kernel is a plain tensor broadcast over the channel axis; wrapping it
    # in a tuple would prepend a spurious leading axis to the product.
    features_per_box *= tf.expand_dims(interpolation_kernel, axis=4)
    features_per_box = tf.reshape(
        features_per_box,
        [batch_size * num_boxes, size_y * 2, size_x * 2, num_filters])

    # This combines the two pooling operations - sum_pool to perform bilinear
    # interpolation and avg_pool to pool the values in each bin.
    features_per_box = tf.nn.avg_pool(
        features_per_box,
        [1, num_samples_per_cell_y * 2, num_samples_per_cell_x * 2, 1],
        [1, num_samples_per_cell_y * 2, num_samples_per_cell_x * 2, 1], 'VALID')
    features_per_box = tf.reshape(
        features_per_box,
        [batch_size, num_boxes, output_size[0], output_size[1], num_filters])

    return features_per_box
def get_stage_1_loss(pred_labels_key_p, pred_labels_direction, pred_regression_direction, pred_regression_position,
                     pred_labels_type, labels_key_p, labels_direction, regression_direction, regression_position, labels_type,
                     simmat_pl, neg_simmat_pl, pred_simmat, pred_conf_logits):
    """Builds the stage-1 multi-task loss and its monitoring metrics.

    Six task losses are combined into one weighted total. Apart from task 1,
    the per-point tasks are averaged only over points whose `labels_key_p`
    is non-zero (the key-point mask). The total loss is also written to the
    'all loss' scalar summary and appended to the 'losses' collection.

    Args:
        pred_labels_key_p: Key-point logits; the class axis is axis 2 and the
            size of axis 1 (number of points) must be statically known.
        pred_labels_direction: Direction-class logits (class axis 2).
        pred_regression_direction: Predicted direction regression values.
        pred_regression_position: Predicted position regression values.
        pred_labels_type: Type-class logits (class axis 2).
        labels_key_p: Integer key-point labels; cast to float they form the
            mask of positive points.
        labels_direction: Integer direction labels.
        regression_direction: Direction regression targets.
        regression_position: Position regression targets.
        labels_type: Integer type labels.
        simmat_pl: Similarity-matrix target weighting the "pull together" term.
        neg_simmat_pl: Weight of the hinge ("push apart") term.
        pred_simmat: Predicted pairwise distance matrix.
        pred_conf_logits: Predicted confidence; axis 2 is squeezed, so it is
            expected to have size 1 there.

    Returns:
        Tuple `(task_1_loss, task_1_recall, task_1_acc, task_2_1_loss,
        task_2_1_acc, task_2_2_loss, task_3_loss, task_4_loss, task_4_acc,
        task_5_loss, task_6_loss, loss)`.
    """
    num_point = pred_labels_key_p.get_shape()[1].value

    mask = tf.cast(labels_key_p, tf.float32)
    neg_mask = tf.ones_like(mask) - mask
    # Per-sample count of positive points; reused by every masked average.
    # NOTE(review): a sample with no positive point makes the divisions below
    # divide by zero -- confirm the data always contains one.
    num_pos = tf.reduce_sum(mask, axis=1)
    Np = tf.expand_dims(num_pos, 1)
    Ng = tf.expand_dims(tf.reduce_sum(neg_mask, axis=1), 1)

    def _masked_mean(per_point):
        # Mean over positive points of each sample, then mean over the batch.
        return tf.reduce_mean(tf.reduce_sum(per_point * mask, axis=1) / num_pos)

    def _hits(logits, labels):
        # 1.0 where the arg-max class equals the label, 0.0 elsewhere.
        predicted = tf.argmax(logits, axis=2, output_type=tf.int32)
        return tf.cast(tf.equal(predicted, labels), tf.float32)

    def _cross_entropy(logits, labels):
        return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)

    # loss:task1 -- key-point classification; positives are up-weighted by the
    # negative/positive ratio to counter class imbalance.
    task_1_loss = tf.reduce_mean(
        _cross_entropy(pred_labels_key_p, labels_key_p) * (mask * (Ng / Np) + 1))
    key_p_hits = _hits(pred_labels_key_p, labels_key_p)
    task_1_recall = _masked_mean(key_p_hits)
    task_1_acc = tf.reduce_mean(tf.reduce_sum(key_p_hits, axis=1) / num_point)

    # loss:task2_1 -- direction classification
    task_2_1_loss = _masked_mean(_cross_entropy(pred_labels_direction, labels_direction))
    task_2_1_acc = _masked_mean(_hits(pred_labels_direction, labels_direction))

    # loss:task2_2 -- direction regression
    task_2_2_loss = _masked_mean(
        tf.reduce_mean(smooth_l1_dist(pred_regression_direction - regression_direction), axis=2))

    # loss:task3 -- position regression
    task_3_loss = _masked_mean(
        tf.reduce_mean(smooth_l1_dist(pred_regression_position - regression_position), axis=2))

    # loss:task4 -- type classification
    task_4_loss = _masked_mean(_cross_entropy(pred_labels_type, labels_type))
    task_4_acc = _masked_mean(_hits(pred_labels_type, labels_type))

    # loss: task_5 -- similarity matrix: pull pairs selected by `simmat_pl`
    # together, push pairs selected by `neg_simmat_pl` apart up to margin 80.
    pos = pred_simmat * simmat_pl
    neg = tf.maximum(80 - pred_simmat, 0) * neg_simmat_pl
    task_5_loss = tf.reduce_mean(pos + neg)

    # loss: task_6 -- regress the confidence onto the per-point IoU between the
    # predicted (distance < 80) and target (> 0.5) groupings.
    ng_label = tf.greater(simmat_pl, 0.5)
    ng = tf.less(pred_simmat, 80)
    epsilon = 1e-6  # keeps the IoU denominator non-zero
    pts_iou = tf.reduce_sum(tf.cast(tf.logical_and(ng, ng_label), tf.float32), axis=2) / \
        (tf.reduce_sum(tf.cast(tf.logical_or(ng, ng_label), tf.float32), axis=2) + epsilon)
    task_6_loss = tf.reduce_mean(
        tf.squared_difference(pts_iou, tf.squeeze(pred_conf_logits, [2])))

    # Task weights.
    w1 = 1
    w2_1 = 1
    w2_2 = 100
    w3 = 100
    w4 = 1
    w5 = 1
    w6 = 100

    loss = task_1_loss * w1 + task_2_1_loss * w2_1 + task_2_2_loss * w2_2 + task_3_loss * w3 + task_4_loss * w4 + task_5_loss * w5 + task_6_loss * w6

    tf.summary.scalar('all loss', loss)
    tf.add_to_collection('losses', loss)
    return task_1_loss, task_1_recall, task_1_acc, task_2_1_loss, task_2_1_acc, task_2_2_loss, task_3_loss, task_4_loss, task_4_acc, task_5_loss, task_6_loss, loss
def losses(generator_fn, discriminator_fn, real_data, z, disc_params, flags):
    """Returns loss variables for the generator and discriminator.

    Args:
      generator_fn: callable mapping `z` to generated data.
      discriminator_fn: callable mapping data to discriminator outputs. When
        called with `return_acts=True` it must return `(outputs, acts)`, where
        the real and fake `acts` sequences can be joined with `+`.
      real_data: batch of real examples.
      z: generator input.
      disc_params: iterable of discriminator variables; those whose name
        contains 'weights' are L2-regularized when `flags.l2_reg_d > 0`.
      flags: object providing `algorithm`, `acts_loss` and `l2_reg_d`. The
        'wgan-gp', 'r1' and 'r1-ns' algorithms also read `wgangp_lambda`;
        'wgan-gp' additionally reads `wgangp_compressive_loss` and
        `wgangp_minimax`.

    Returns:
      A `(gen_cost, disc_cost, divergence)` tuple.

    Raises:
      ValueError: if `flags.algorithm` is not one of 'vanilla',
        'vanilla_minimax', 'wgan-gp', 'r1' or 'r1-ns'.
    """
    fake_data = generator_fn(z)
    if flags.acts_loss > 0.:
        disc_real, disc_real_acts = discriminator_fn(
            real_data, return_acts=True)
        disc_fake, disc_fake_acts = discriminator_fn(
            fake_data, return_acts=True)
    else:
        disc_real = discriminator_fn(real_data)
        disc_fake = discriminator_fn(fake_data)

    # Running L2 sum and element count of the discriminator activations.
    # The count starts at 1 and stays a plain float when acts_loss is off.
    acts_l2_loss = 0.
    acts_count = 1.
    if flags.acts_loss > 0.:
        for act in disc_real_acts + disc_fake_acts:
            acts_l2_loss += tf.nn.l2_loss(act)
            acts_count += tf.reduce_sum(tf.ones_like(act))

    l2_reg_d_cost = 0.
    if flags.l2_reg_d > 0:
        for p in disc_params:
            if 'weights' in p.name:
                l2_reg_d_cost += tf.nn.l2_loss(p)
        l2_reg_d_cost *= flags.l2_reg_d

    def cn(x):
        """compressive nonlinearity."""
        return tf.asinh(4. * x) / 4.

    def mean_sigmoid_ce(logits, labels_like):
        """Mean sigmoid cross-entropy against all-ones or all-zeros labels."""
        return tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logits, labels=labels_like(logits)))

    def logistic_disc_cost():
        """Standard discriminator cost: fake -> 0 plus real -> 1."""
        cost = mean_sigmoid_ce(disc_fake, tf.zeros_like)
        cost += mean_sigmoid_ce(disc_real, tf.ones_like)
        return cost

    def r1_penalty():
        """Half the mean squared gradient norm of sigmoid(D(real)) wrt real."""
        input_ndim = len(real_data.get_shape())
        gradients = tf.gradients(tf.nn.sigmoid(disc_real), [real_data])[0]
        slopes = tf.sqrt(
            1e-8 + tf.reduce_sum(
                tf.square(gradients),
                reduction_indices=list(range(1, input_ndim))))
        return 0.5 * tf.reduce_mean(slopes**2)

    def add_regularizers(cost, acts_l2, acts_n):
        """Adds the weight-L2 term and the activation-L2 term to `cost`."""
        cost += l2_reg_d_cost
        cost += flags.acts_loss * (acts_l2 / (1e-2 + acts_n))
        return cost

    if flags.algorithm == 'vanilla':
        gen_cost = mean_sigmoid_ce(disc_fake, tf.ones_like)
        disc_cost = logistic_disc_cost()
        divergence = gen_cost
        disc_cost = add_regularizers(disc_cost, acts_l2_loss, acts_count)

    elif flags.algorithm == 'vanilla_minimax':
        disc_cost = logistic_disc_cost()
        gen_cost = -disc_cost
        divergence = ((-disc_cost) + tf.log(4.)) / 2.
        disc_cost = add_regularizers(disc_cost, acts_l2_loss, acts_count)

    elif flags.algorithm == 'wgan-gp':
        input_ndim = len(real_data.get_shape())
        if flags.wgangp_compressive_loss:
            disc_fake = cn(disc_fake)
            disc_real = cn(disc_real)
        wgan_disc_cost = tf.reduce_mean(disc_fake) - tf.reduce_mean(disc_real)

        # One interpolation coefficient per example, broadcast over the rest.
        alpha = tf.random_uniform(
            shape=[tf.shape(real_data)[0]] + [1] * (input_ndim - 1),
            minval=0.,
            maxval=1.)
        differences = fake_data - real_data
        interpolates = real_data + (alpha * differences)
        if flags.acts_loss > 0.:
            disc_interps, disc_interp_acts = discriminator_fn(
                interpolates, return_acts=True)
        else:
            disc_interps = discriminator_fn(interpolates)
        gradients = tf.gradients(disc_interps, [interpolates])[0]
        slopes = tf.sqrt(
            1e-8 + tf.reduce_sum(
                tf.square(gradients),
                reduction_indices=list(range(1, input_ndim))))
        gradient_penalty = tf.reduce_mean((slopes - 1.)**2)
        disc_cost = wgan_disc_cost + (flags.wgangp_lambda * gradient_penalty)

        if flags.acts_loss > 0.:
            for act in disc_interp_acts:
                # NOTE(review): these terms are scaled by flags.acts_loss here
                # and again in add_regularizers, unlike the real/fake
                # activations above. Kept as-is to preserve training
                # behavior -- confirm whether the double scaling is intended.
                acts_l2_loss += flags.acts_loss * tf.nn.l2_loss(act)
                acts_count += tf.reduce_sum(tf.ones_like(act))
        disc_cost = add_regularizers(disc_cost, acts_l2_loss, acts_count)

        if flags.wgangp_minimax:
            gen_cost = -disc_cost
            divergence = -disc_cost
        else:
            gen_cost = -tf.reduce_mean(disc_fake)
            divergence = -wgan_disc_cost

    elif flags.algorithm == 'r1':
        disc_cost = logistic_disc_cost()
        gen_cost = -disc_cost
        divergence = ((-disc_cost) + tf.log(4.)) / 2.
        disc_cost += flags.wgangp_lambda * r1_penalty()
        disc_cost = add_regularizers(disc_cost, acts_l2_loss, acts_count)

    elif flags.algorithm == 'r1-ns':
        disc_cost = logistic_disc_cost()
        divergence = ((-disc_cost) + tf.log(4.)) / 2.
        gen_cost = mean_sigmoid_ce(disc_fake, tf.ones_like)
        disc_cost += flags.wgangp_lambda * r1_penalty()
        disc_cost = add_regularizers(disc_cost, acts_l2_loss, acts_count)

    else:
        # Previously an unknown name fell through every branch and surfaced
        # as an UnboundLocalError at the return statement.
        raise ValueError(
            'Unknown GAN algorithm: {!r}'.format(flags.algorithm))

    return gen_cost, disc_cost, divergence
def build_distractors(distractor_examples, context):
    """Pair `context` with sentence windows sampled from other paragraphs.

    For every paragraph in `distractor_examples`, rows are widened to a
    three-row window (the row plus its next two rows, zero padded at the
    end), a fixed number of rows with content is sampled at random, and each
    sample is packed as `[CLS] context [SEP] distractor [SEP]`.

    Args:
      distractor_examples: iterable of 2-D integer paragraph tensors whose
        all-zero rows are treated as padding.
      context: 1-D tensor of context token ids.

    Returns:
      An `Outputs_And_Context` holding the stacked input ids, input masks and
      segment ids; its last two fields are None.
    """
    cls_id = tf.constant([101], dtype=tf.int64)
    sep_id = tf.constant([102], dtype=tf.int64)

    packed_ids = []
    packed_masks = []
    packed_segments = []

    # Number of distractors drawn from each paragraph.
    per_paragraph = int(
        (FLAGS.num_choices - 4) / (FLAGS.data_window_size - 1))

    for paragraph in distractor_examples:
        # Rows whose absolute sum is non-zero carry tokens; sample among them.
        row_mass = tf.reduce_sum(tf.abs(paragraph), 1)
        has_tokens = tf.squeeze(
            tf.not_equal(row_mass, tf.zeros(shape=(1, 1), dtype=tf.int64)))
        num_rows = tf.reduce_sum(tf.cast(has_tokens, tf.int32))
        picked = tf.random.shuffle(
            tf.range(0, limit=num_rows, dtype=tf.int32))[:per_paragraph]

        # Shift the paragraph up by one and two rows, padding the tail with
        # zeros so that all three views keep the same number of rows.
        shifted_one = tf.pad(paragraph[1:], tf.constant([[0, 1], [0, 0]]))
        shifted_two = tf.pad(paragraph[2:], tf.constant([[0, 2], [0, 0]]))
        windows = tf.concat([paragraph, shifted_one, shifted_two], axis=1)
        distractors = tf.gather(windows, picked)

        for i in range(per_paragraph):
            # Drop the zero padding inside the sampled window.
            nonzero_at = tf.where(
                tf.not_equal(distractors[i], tf.zeros_like(distractors[i])))
            tokens = tf.gather_nd(distractors[i], nonzero_at)

            # Segment 0 covers [CLS] context [SEP]; segment 1 the distractor
            # and the closing [SEP].
            segments = tf.concat([
                tf.zeros_like(cls_id, dtype=tf.int64),
                tf.zeros_like(context),
                tf.zeros_like(sep_id, dtype=tf.int64),
                tf.ones_like(tokens),
                tf.ones_like(sep_id, dtype=tf.int64)
            ], axis=0)
            packed_segments.append(
                pad_and_cut(segments, FLAGS.max_seq_length))

            sequence = tf.concat(
                [cls_id, context, sep_id, tokens, sep_id], axis=0)
            packed_masks.append(
                pad_and_cut(tf.ones_like(sequence), FLAGS.max_seq_length))
            packed_ids.append(pad_and_cut(sequence, FLAGS.max_seq_length))

    return Outputs_And_Context(
        tf.stack(packed_ids, axis=0),
        tf.stack(packed_masks, axis=0),
        tf.stack(packed_segments, axis=0),
        None,
        None)
def model_fn(features, labels, mode):
    """AdversarialReweightingModel model_fn.

    Builds a two-hidden-layer primary classifier and a one-hidden-layer
    adversary that produces per-example weights, then returns the
    EstimatorSpec for the requested mode. `self` is not a parameter: it is
    taken from the enclosing scope.

    Args:
      features: `Tensor` or `dict` of `Tensor`.
      labels: A `dict` of `Tensor` Objects. Expects to have a key/value pair
        for the key self._label_column_name.
      mode: Defines whether this is training, evaluation or prediction.
        See `ModeKeys`. Currently PREDICT mode is not implemented.

    Returns:
      An instance of `tf.estimator.EstimatorSpec`, which encapsulates the
      `mode`, `predictions`, `loss` and the `train_op`. Note that here
      `predictions` is a `dict` of `Tensor` objects, representing the
      prediction of the binary classification model.
      `loss` is a scalar containing the loss of the step and `train_op` is
      the op for training.
    """
    # NOTE(review): `estimator_spec` is only assigned in the EVAL and TRAIN
    # branches below, so any other mode fails before or at the return.

    # Instantiates a tensor with weight for positive class examples only
    pos_weights = tf.cast(tf.equal(labels[self._label_column_name], 1),
                          dtype=tf.float32)

    # Instantiates a tensor with true class labels
    class_labels = labels[self._label_column_name]

    # Initialize a global step variable used for alternate training
    current_step = self._get_or_create_global_step_var()

    if mode == tf.estimator.ModeKeys.EVAL:
        tf.logging.info('model_fn: EVAL, {}'.format(mode))
    elif mode == tf.estimator.ModeKeys.TRAIN:
        tf.logging.info('model_fn: TRAIN, {}'.format(mode))

    # Creates a DNN architecture for primary binary classification task
    with tf.name_scope('primary_NN'):
        with tf.variable_scope('primary'):
            input_layer = tf.feature_column.input_layer(
                features, self._feature_columns)
            h1 = tf.layers.Dense(
                self._primary_hidden_units[0],
                activation=self._activation)(input_layer)
            h2 = tf.layers.Dense(self._primary_hidden_units[1],
                                 activation=self._activation)(h1)
            # Single output unit: one logit per example.
            logits = tf.layers.Dense(1)(h2)
            sigmoid_output = tf.nn.sigmoid(logits, name='sigmoid')
            # Hard 0/1 prediction obtained by thresholding at 0.5.
            class_predictions = tf.cast(
                tf.greater(sigmoid_output, 0.5), tf.float32)
            tf.summary.histogram('class_predictions', class_predictions)

    # Creates a network architecture for the adversarial regression task
    with tf.name_scope('adversary_NN'):
        with tf.variable_scope('adversary'):
            # Gets adversary features and features columns
            adversarial_features, adversary_feature_columns = self._get_adversary_features_and_feature_columns(features, labels)  # pylint: disable=line-too-long
            adv_input_layer = tf.feature_column.input_layer(
                adversarial_features, adversary_feature_columns)
            # The adversary's hidden layer is built without an activation.
            adv_h1 = tf.layers.Dense(
                self._adversary_hidden_units[0])(adv_input_layer)
            adv_output_layer = tf.layers.Dense(1, use_bias=True)(adv_h1)
            # Until `current_step` exceeds the pretraining budget every
            # example gets weight 1; afterwards the adversary's output is
            # turned into weights.
            example_weights = tf.cond(
                tf.greater(current_step, self._pretrain_steps),
                true_fn=lambda: self._compute_example_weights(
                    adv_output_layer),
                false_fn=lambda: tf.ones_like(class_labels))

    # Adds summary variables to tensorboard
    with tf.name_scope('example_weights'):
        tf.summary.histogram('example_weights', example_weights)
        tf.summary.histogram('label', class_labels)

    # Initializes Loss Functions
    primary_loss = self._primary_loss(class_labels, logits, example_weights)
    adversary_loss = self._adversary_loss(class_labels, logits, pos_weights,
                                          example_weights,
                                          self._adversary_loss_type)

    # Sets up dictionaries used for computing performance metrics
    # NOTE: the first two keys are (label_column_name, kind) tuples, while
    # the parenthesised 'example_weights' is a plain string key.
    predictions = {
        (self._label_column_name, 'class_ids'):
            tf.reshape(class_predictions, [-1]),
        (self._label_column_name, 'logistic'):
            tf.reshape(sigmoid_output, [-1]),
        ('example_weights'):
            tf.reshape(example_weights, [-1])
    }
    class_id_kwargs = {
        'labels': class_labels,
        'predictions': class_predictions
    }
    logistics_kwargs = {
        'labels': class_labels,
        'predictions': sigmoid_output
    }

    # EVAL Mode
    if mode == tf.estimator.ModeKeys.EVAL:
        with tf.name_scope('eval_metrics'):
            # Threshold-based metrics use the hard predictions; the AUC
            # metrics use the sigmoid scores.
            eval_metric_ops = {
                'accuracy': tf.metrics.accuracy(**class_id_kwargs),
                'precision': tf.metrics.precision(**class_id_kwargs),
                'recall': tf.metrics.recall(**class_id_kwargs),
                'fp': tf.metrics.false_positives(**class_id_kwargs),
                'fn': tf.metrics.false_negatives(**class_id_kwargs),
                'tp': tf.metrics.true_positives(**class_id_kwargs),
                'tn': tf.metrics.true_negatives(**class_id_kwargs),
                'fpr': contrib_metrics.streaming_false_positive_rate(
                    **class_id_kwargs),  # pylint: disable=line-too-long
                'fnr': contrib_metrics.streaming_false_negative_rate(
                    **class_id_kwargs),  # pylint: disable=line-too-long
                'auc': tf.metrics.auc(curve='ROC', **logistics_kwargs),
                'aucpr': tf.metrics.auc(curve='PR', **logistics_kwargs)
            }

            # EstimatorSpec object for evaluation
            estimator_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions,
                loss=primary_loss,
                eval_metric_ops=eval_metric_ops)

    # TRAIN Mode
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Filters trainable variables for each task
        # (split by substring match on the variable's op name).
        all_trainable_vars = tf.trainable_variables()
        primary_trainable_vars = [
            v for v in all_trainable_vars if 'primary' in v.op.name
        ]
        adversary_trainable_vars = [
            v for v in all_trainable_vars if 'adversary' in v.op.name
        ]

        # TRAIN_OP for adversary DNN
        train_op_adversary = contrib_layers.optimize_loss(
            loss=adversary_loss,
            variables=adversary_trainable_vars,
            global_step=contrib_framework.get_global_step(),
            learning_rate=self._adversary_learning_rate,
            optimizer=self._optimizer)

        # TRAIN_OP for primary DNN
        train_op_primary = contrib_layers.optimize_loss(
            loss=primary_loss,
            variables=primary_trainable_vars,
            global_step=contrib_framework.get_global_step(),
            learning_rate=self._primary_learning_rate,
            optimizer=self._optimizer)

        # Upto ``pretrain_steps'' trains primary only.
        # Beyond ``pretrain_steps'' alternates between primary and adversary.
        # NOTE(review): both train ops are created outside the tf.cond
        # branches that group them -- confirm the adversary update is
        # really skipped during pretraining.
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            loss=primary_loss + adversary_loss,
            train_op=tf.cond(
                tf.greater(current_step, self._pretrain_steps),
                true_fn=lambda: tf.group(
                    [train_op_primary, train_op_adversary]),  # pylint: disable=line-too-long
                false_fn=lambda: tf.group([train_op_primary])))

    return estimator_spec
def build_bert_inputs(example):
    """Turn one padded paragraph into four BERT context/target pairs.

    A random window of `FLAGS.context_size` rows is used as the context. Up
    to four rows before and four rows after the window (each widened to three
    consecutive rows) are the candidate targets; four of the non-empty
    candidates are sampled and packed as `[CLS] context [SEP] target [SEP]`.

    Args:
      example: 2-D integer paragraph tensor with `FLAGS.max_para_length`
        rows; all-zero rows are treated as padding.

    Returns:
      An `Outputs_And_Context` with the stacked input ids, input masks,
      segment ids, the position label of each sampled target and the
      flattened context tokens.
    """
    cls_id = tf.constant([101], dtype=tf.int64)
    sep_id = tf.constant([102], dtype=tf.int64)
    padded_rows = tf.constant([FLAGS.max_para_length])
    window = tf.constant([FLAGS.context_size])

    # Number of rows that contain at least one non-zero token.
    row_mass = tf.reduce_sum(tf.abs(example), 1)
    has_tokens = tf.squeeze(
        tf.not_equal(row_mass, tf.zeros(shape=(1, 1), dtype=tf.int64)))
    num_rows = tf.reduce_sum(tf.cast(has_tokens, tf.int32))

    # Random first row of the context window.
    first = tf.random.uniform(
        [1],
        0,
        tf.reshape(num_rows, []) - tf.reshape(window, []) + 1,
        dtype=tf.int32)

    # Cut the paragraph into rows before the window, the window itself, rows
    # after it, and the trailing zero padding (discarded).
    split_sizes = tf.squeeze(
        tf.concat([[
            first, window, num_rows - window - first, padded_rows - num_rows
        ]], 0))
    preceding, context, following, _ = tf.split(example, split_sizes, axis=0)

    # Flatten the context and drop the zero padding inside its rows.
    context_tokens = tf.gather_nd(
        context, tf.where(tf.not_equal(context, tf.zeros_like(context))))

    # Reverse the preceding rows so that the row closest to the context
    # comes first.
    preceding = tf.reverse(preceding, axis=[0])

    # Pad both sides with more empty rows than are ever sliced below.
    tail_pad = tf.constant([[0, 8], [0, 0]])
    preceding = tf.pad(preceding, tail_pad)
    following = tf.pad(following, tail_pad)

    # Widen every candidate to three consecutive rows.
    prev_shift_one = preceding[1:][:4]
    prev_shift_two = preceding[2:][:4]
    next_shift_one = following[1:][:4]
    next_shift_two = following[2:][:4]
    preceding = preceding[:4]
    following = following[:4]
    prev_block = tf.concat(
        [prev_shift_two, prev_shift_one, preceding], axis=1)
    next_block = tf.concat(
        [following, next_shift_one, next_shift_two], axis=1)

    # Eight candidate targets; some of them may consist of padding only.
    candidates = tf.concat([prev_block, next_block], axis=0)

    # Discard candidates that are entirely padding, e.g. all preceding ones
    # when the context starts at the beginning of the paragraph.
    candidate_mass = tf.reduce_sum(tf.abs(candidates), 1)
    keep = tf.squeeze(
        tf.not_equal(candidate_mass, tf.zeros(shape=(1, 1), dtype=tf.int64)))
    keep.set_shape([None])
    candidates = tf.boolean_mask(candidates, keep)

    # Sample four targets, and carry along the position label of each one.
    order = tf.random.shuffle(
        tf.range(0, limit=tf.shape(candidates)[0], dtype=tf.int32))[:4]
    targets = tf.gather(candidates, order)
    # Candidate rows are labelled 3, 2, 1, 0 (preceding) then 4..7.
    slot_ids = tf.concat([tf.range(3, -1, -1), tf.range(4, 8)], axis=0)
    label_types = tf.gather(tf.boolean_mask(slot_ids, keep), order)

    packed_ids = []
    packed_masks = []
    packed_segments = []
    for i in range(4):
        # Drop the zero padding inside the sampled target.
        nonzero_at = tf.where(
            tf.not_equal(targets[i], tf.zeros_like(targets[i])))
        tokens = tf.gather_nd(targets[i], nonzero_at)

        # Segment 0 covers [CLS] context [SEP]; segment 1 the target and the
        # closing [SEP].
        segments = tf.concat([
            tf.zeros_like(cls_id, dtype=tf.int64),
            tf.zeros_like(context_tokens),
            tf.zeros_like(sep_id, dtype=tf.int64),
            tf.ones_like(tokens),
            tf.ones_like(sep_id, dtype=tf.int64)
        ], axis=0)
        packed_segments.append(pad_and_cut(segments, FLAGS.max_seq_length))

        sequence = tf.concat(
            [cls_id, context_tokens, sep_id, tokens, sep_id], axis=0)
        packed_masks.append(
            pad_and_cut(tf.ones_like(sequence), FLAGS.max_seq_length))
        packed_ids.append(pad_and_cut(sequence, FLAGS.max_seq_length))

    return Outputs_And_Context(
        tf.stack(packed_ids, axis=0),
        tf.stack(packed_masks, axis=0),
        tf.stack(packed_segments, axis=0),
        label_types,
        context_tokens)
def safe_log(tensor, eps=1e-16):
    """Element-wise log that returns -1e8 wherever `tensor` is below `eps`.

    Args:
      tensor: input tensor.
      eps: entries strictly smaller than this are not passed to the log.

    Returns:
      A tensor holding `log(tensor)` where `tensor >= eps` and -1e8 elsewhere.
    """
    below_eps = tf.less(tensor, eps)
    # Substitute 1 for the small entries so that tf.log never sees them; the
    # substituted positions are overwritten by the floor value below.
    clamped = tf.where(below_eps, tf.ones_like(tensor), tensor)
    floor_value = tf.zeros_like(clamped) - 1e8
    return tf.where(below_eps, floor_value, tf.log(clamped))
def _decode_record(record, name_to_features, vocab_table):
    """Decodes a window of serialized paragraphs into one training example.

    `record[0]` is the target paragraph and `record[1]` .. `record[4]` are
    the distractor paragraphs. When the enclosing scope's `add_masking` is
    true, token masking is applied and the masking targets are added to the
    returned features.

    Args:
      record: indexable collection of at least five serialized tf.Examples.
      name_to_features: feature spec passed to `tf.parse_single_example`; the
        "sents" feature is read.
      vocab_table: lookup table mapping token strings to ids (only used for
        masking).

    Returns:
      A dict of feature tensors in which every int64 tensor has been cast to
      int32.
    """
    paragraph_shape = [FLAGS.max_para_length, FLAGS.max_sent_length]

    def parse_paragraph(serialized):
        """Parses one serialized paragraph into a [rows, tokens] tensor."""
        parsed = tf.parse_single_example(serialized, name_to_features)
        return tf.reshape(parsed["sents"], paragraph_shape)

    target_paragraph = parse_paragraph(record[0])
    # The distractors are addressed by constant index on purpose: the source
    # notes that looping over `record[1:]` itself triggers a TF error.
    distractor_paragraphs = [parse_paragraph(record[k]) for k in range(1, 5)]

    positives = build_bert_inputs(target_paragraph)
    negatives = build_distractors(distractor_paragraphs, positives.context)

    example = {
        "input_ids":
            tf.concat([positives.input_ids, negatives.input_ids], axis=0),
        "input_mask":
            tf.concat([positives.input_mask, negatives.input_mask], axis=0),
        "segment_ids":
            tf.concat([positives.segment_ids, negatives.segment_ids], axis=0),
        "label_types":
            positives.label_types,
    }

    if add_masking:
        mask_rate = FLAGS.mask_rate
        max_predictions_per_seq = int(
            math.ceil(FLAGS.max_seq_length * mask_rate))

        # [CLS] and [SEP] must never be selected for masking.
        never_mask_ids = tf.to_int32(
            vocab_table.lookup(tf.constant(["[CLS]", "[SEP]"])))
        mask_token_id = tf.to_int32(
            vocab_table.lookup(tf.constant("[MASK]")))

        input_ids = tf.to_int32(example["input_ids"])

        def sample_row(row_ids):
            return ip.sample_mask_indices(row_ids, mask_rate, never_mask_ids,
                                          max_predictions_per_seq)

        mask_indices = tf.map_fn(sample_row, input_ids, dtype=tf.int32)

        # tf.map_fn is given a single tensor per call, so the per-row
        # arguments are concatenated along the last axis and split again
        # inside the mapped function.
        def targets_for_row(row):
            ids_len = tf.shape(input_ids)[-1]
            row_ids = row[:ids_len]
            row_mask_indices = row[ids_len:]
            return ip.get_target_tokens_for_apply(row_ids, row_mask_indices)

        ids_and_indices = tf.concat([input_ids, mask_indices], -1)
        target_token_ids = tf.map_fn(targets_for_row, ids_and_indices)

        def mask_row(row):
            ids_len = tf.shape(input_ids)[-1]
            indices_len = tf.shape(mask_indices)[-1]
            row_ids = row[:ids_len]
            row_mask_indices = row[ids_len:ids_len + indices_len]
            row_targets = row[ids_len + indices_len:]
            return ip.apply_masking(row_ids, row_targets, row_mask_indices,
                                    mask_token_id, 1000)

        ids_indices_targets = tf.concat(
            [input_ids, mask_indices, target_token_ids], -1)
        token_ids_masked = tf.map_fn(mask_row,
                                     tf.to_int64(ids_indices_targets))

        # Weight 1 for real targets and 0 where the target id is 0.
        all_ones = tf.ones_like(target_token_ids, dtype=tf.float32)
        is_pad = tf.where(
            tf.equal(target_token_ids, 0),
            tf.ones_like(target_token_ids, dtype=tf.float32),
            tf.zeros_like(target_token_ids, dtype=tf.float32))

        example["target_token_weights"] = all_ones - is_pad
        example["target_token_ids"] = target_token_ids
        example["input_ids"] = token_ids_masked
        example["mask_indices"] = mask_indices

        # Set shape explicitly for TPU
        for key in ("target_token_weights", "target_token_ids",
                    "mask_indices"):
            example[key].set_shape(
                [FLAGS.num_choices, max_predictions_per_seq])

    # Set shape explicitly for TPU
    for key in ("input_ids", "input_mask", "segment_ids"):
        example[key].set_shape([FLAGS.num_choices, FLAGS.max_seq_length])
    example["label_types"].set_shape([4])

    # Scatter the target index 0..3 into the slot given by its label type.
    example["label_ids"] = tf.scatter_nd(
        tf.reshape(example["label_types"], [4, 1]), tf.range(4), [8])

    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for name in list(example.keys()):  # pylint: disable=g-builtin-op
        if example[name].dtype == tf.int64:
            example[name] = tf.to_int32(example[name])

    return example
def GAN(X, experiment_name, NtoGenerate, z_size=100, g_hidden_size=128,
        d_hidden_size=128, alpha=0.01, smooth=0.1, learning_rate=0.0002,
        epochs=100):
    """Train a GAN on `X` and return `NtoGenerate` generated observations.

    Should be used on each of the binary classes separately.

    Side effects: resets the default TF graph, saves the generator variables
    to './checkpoints/Checkpoint_<experiment_name>.ckpt' after every epoch
    and pickles the per-epoch generator samples to 'train_samples.pkl'.

    Args:
      X: array of training windows; rows are sliced into batches of 100 and
        fed to a discriminator whose input size is fixed at 475.
      experiment_name: suffix of the checkpoint file name.
      NtoGenerate: number of observations to generate after training.
      z_size: size of the latent vector fed to the generator.
      g_hidden_size: hidden units of the generator.
      d_hidden_size: hidden units of the discriminator.
      alpha: leak factor passed to both networks.
      smooth: label smoothing applied to the "real" labels.
      learning_rate: Adam learning rate for both optimizers.
      epochs: number of passes over `X`.

    Returns:
      The generator's first output for `NtoGenerate` random latent vectors,
      mapped from [-1, 1] back to the [min(X), max(X)] range.
    """
    import os  # local import: only needed to create the checkpoint directory

    checkpoint_name = "Checkpoint_" + experiment_name
    checkpoint_dir = './checkpoints/'
    checkpoint_path = checkpoint_dir + checkpoint_name + '.ckpt'

    # Size of input image to discriminator (size of each window).
    input_size = 475

    tf.reset_default_graph()
    # Create our input placeholders
    input_real, input_z = model_inputs(input_size, z_size)

    # Generator network. The complete return value is kept so that the same
    # ops can be fetched when sampling; calling generator() again for every
    # sample added new ops to the graph each epoch and silently fell back to
    # the generator's default leak factor instead of `alpha`.
    g_outputs = generator(input_z, input_size, n_units=g_hidden_size,
                          reuse=False, alpha=alpha)
    g_model, g_logits = g_outputs  # g_model is the generator output

    # Discriminator network, applied to real and to generated data.
    _, d_logits_real = discriminator(input_real, n_units=d_hidden_size,
                                     reuse=False, alpha=alpha)
    _, d_logits_fake = discriminator(g_model, n_units=d_hidden_size,
                                     reuse=True, alpha=alpha)

    # Calculate losses
    d_loss_real = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=d_logits_real,
            labels=tf.ones_like(d_logits_real) * (1 - smooth)))
    # The labels must be shaped like the fake logits they are compared with.
    d_loss_fake = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=d_logits_fake, labels=tf.zeros_like(d_logits_fake)))
    d_loss = d_loss_real + d_loss_fake
    g_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=d_logits_fake, labels=tf.ones_like(d_logits_fake)))

    # Get the trainable_variables, split into G and D parts
    t_vars = tf.trainable_variables()
    g_vars = [var for var in t_vars if var.name.startswith('generator')]
    d_vars = [var for var in t_vars if var.name.startswith('discriminator')]

    d_train_opt = tf.train.AdamOptimizer(learning_rate).minimize(
        d_loss, var_list=d_vars)
    g_train_opt = tf.train.AdamOptimizer(learning_rate).minimize(
        g_loss, var_list=g_vars)

    # TRAINING
    # TODO: Might have to be different size according to the size of the
    # class (This determines amount of generated data)
    batch_size = 100
    samples = []
    saver = tf.train.Saver(var_list=g_vars)

    # Saver.save does not create missing parent directories.
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for e in range(epochs):
            for ii in range(len(X) // batch_size):
                batch = X[batch_size * ii:batch_size * (ii + 1)]

                # Rescale the batch to [-1, 1], as tanh works best.
                batch_min = np.min(batch)
                batch_max = np.max(batch)
                batch_images = ((batch - batch_min) / (batch_max - batch_min)
                                * (1 - (-1)) + -1)

                # Sample random noise for G
                batch_z = np.random.uniform(-1, 1, size=(batch_size, z_size))

                # Run optimizers
                sess.run(d_train_opt,
                         feed_dict={input_real: batch_images,
                                    input_z: batch_z})
                sess.run(g_train_opt, feed_dict={input_z: batch_z})

            # Sample from generator as we're training for viewing afterwards
            sample_z = np.random.uniform(-1, 1, size=(16, z_size))
            gen_samples = sess.run(g_outputs, feed_dict={input_z: sample_z})
            samples.append(gen_samples)
            saver.save(sess, checkpoint_path)

    # Save training generator samples
    with open('train_samples.pkl', 'wb') as f:
        pkl.dump(samples, f)

    # Generating the new observations after training:
    with tf.Session() as sess:
        saver.restore(sess, checkpoint_path)
        sample_z = np.random.uniform(-1, 1, size=(NtoGenerate, z_size))
        gen_samples = sess.run(g_outputs, feed_dict={input_z: sample_z})

    # Scaling back to normal. 0.5 is used instead of `1 / 2`, which is 0
    # under Python 2 integer division.
    x_max = np.max(X)
    x_min = np.min(X)
    generated = gen_samples[0]
    gen_samples = (0.5 * generated * x_max - 0.5 * generated * x_min
                   + 0.5 * x_max + 0.5 * x_min)
    return gen_samples
tf.shape( tensor_dict[fields.InputDataFields.groundtruth_weights])[0], 0), lambda: tensor_dict[fields.InputDataFields.groundtruth_weights], default_groundtruth_weights) if fields.InputDataFields.groundtruth_keypoints in tensor_dict: # Set all keypoints that are not labeled to NaN. gt_kpt_fld = fields.InputDataFields.groundtruth_keypoints gt_kpt_vis_fld = fields.InputDataFields.groundtruth_keypoint_visibilities visibilities_tiled = tf.tile( tf.expand_dims(tensor_dict[gt_kpt_vis_fld], -1), [1, 1, 2]) tensor_dict[gt_kpt_fld] = tf.where( visibilities_tiled, tensor_dict[gt_kpt_fld], np.nan * tf.ones_like(tensor_dict[gt_kpt_fld])) if self._expand_hierarchy_labels: input_fields = fields.InputDataFields image_classes, image_confidences = self._expand_image_label_hierarchy( tensor_dict[input_fields.groundtruth_image_classes], tensor_dict[input_fields.groundtruth_image_confidences]) tensor_dict[input_fields.groundtruth_image_classes] = image_classes tensor_dict[input_fields.groundtruth_image_confidences] = ( image_confidences) box_fields = [ fields.InputDataFields.groundtruth_group_of, fields.InputDataFields.groundtruth_is_crowd, fields.InputDataFields.groundtruth_difficult, fields.InputDataFields.groundtruth_area,
def build_train_graph(self, inputs, min_depth, max_depth, num_mpi_planes,
                      learning_rate=0.0002, beta1=0.9, vgg_model_file=None,
                      global_step=0):
    """Construct the training computation graph.

    Randomly picks a downsampling divisor and a number of MPI planes, infers
    an initial and a refined MPI from the (downsampled) source images,
    renders both into the target view and minimizes the sum of their VGG
    losses with Adam. Image and scalar summaries are registered along the
    way.

    Args:
      inputs: dictionary of tensors (see 'input_data' below) needed for
        training. NOTE: its "intrinsics" entry is overwritten with the
        rescaled intrinsics.
      min_depth: minimum depth for the PSV and MPI planes
      max_depth: maximum depth for the PSV and MPI planes
      num_mpi_planes: number of MPI planes to infer. NOTE: the passed value
        is not used; it is overwritten by the randomly selected plane count.
      learning_rate: learning rate
      beta1: hyperparameter for Adam
      vgg_model_file: path to vgg weights (needed when vgg loss is used)
      global_step: current optimization step (not referenced in the body)

    Returns:
      A one-element list holding the apply-gradients op used for training.
    """
    print("starting to build graph")
    with tf.name_scope("input_size_randomization"):
        # Each row is [resolution divisor, number of MPI planes]; one row is
        # drawn at random by shuffling and taking the first.
        dim_choices = tf.constant([[1, 16], [2, 32], [4, 32], [4, 64],
                                   [4, 128], [8, 32], [8, 64], [8, 128]],
                                  dtype=tf.int32)
        rand_dim = tf.random_shuffle(dim_choices)[0, :]
        # Height and width share the same divisor.
        height_div = rand_dim[0]
        width_div = rand_dim[0]
        num_mpi_planes = rand_dim[1]
        tf.summary.scalar("num_mpi_planes", num_mpi_planes)

    with tf.name_scope("setup"):
        mpi_planes = self.inv_depths(min_depth, max_depth, num_mpi_planes)

    with tf.name_scope("input_data"):
        raw_tgt_image = inputs["tgt_image"]
        raw_ref_image = inputs["ref_image"]
        raw_src_images = inputs["src_images"]

        # Static full-resolution size, divided by the random divisor.
        _, img_height, img_width, _ = raw_src_images.get_shape().as_list(
        )
        img_height = img_height // height_div
        img_width = img_width // width_div

        raw_tgt_image = tf.image.convert_image_dtype(
            raw_tgt_image, dtype=tf.float32)
        raw_ref_image = tf.image.convert_image_dtype(
            raw_ref_image, dtype=tf.float32)
        raw_src_images = tf.image.convert_image_dtype(
            raw_src_images, dtype=tf.float32)
        raw_tgt_image = tf.image.resize_area(raw_tgt_image,
                                             [img_height, img_width])
        raw_ref_image = tf.image.resize_area(raw_ref_image,
                                             [img_height, img_width])
        raw_src_images = tf.image.resize_area(raw_src_images,
                                              [img_height, img_width])

        tgt_pose = inputs["tgt_pose"]
        ref_pose = inputs["ref_pose"]
        src_poses = inputs["src_poses"]
        intrinsics = inputs["intrinsics"]

        # Scale intrinsics based on size randomization
        # (row 0 by the width divisor, row 1 by the height divisor, row 2
        # unchanged).
        intrinsics = tf.concat([
            intrinsics[:, 0:1, :] / tf.to_float(width_div),
            intrinsics[:, 1:2, :] / tf.to_float(height_div),
            intrinsics[:, 2:3, :]
        ], axis=1)
        inputs["intrinsics"] = intrinsics

        _, num_source, _, _ = src_poses.get_shape().as_list()

    with tf.name_scope("inference"):
        print("setting up MPI inference")
        # From here on the plane count is read back from the planes tensor.
        num_mpi_planes = tf.shape(mpi_planes)[0]
        pred = self.infer_mpi(raw_src_images, raw_ref_image, ref_pose,
                              src_poses, intrinsics, num_mpi_planes,
                              mpi_planes)
        rgba_layers = pred["rgba_layers"]
        rgba_layers_refine = pred["rgba_layers_refine"]
        stuff_behind = pred["stuff_behind"]
        refine_input_mpi = pred["refine_input_mpi"]
        psv = pred["psv"]

    with tf.name_scope("synthesis"):
        print("setting up rendering")
        # Pose of the target view relative to the reference view.
        rel_pose = tf.matmul(tgt_pose, tf.matrix_inverse(ref_pose))
        output_image, output_layers = self.mpi_render_view(
            rgba_layers, rel_pose, mpi_planes, intrinsics)
        output_alpha = output_layers[Ellipsis, -1]
        output_image_refine, _ = self.mpi_render_view(
            rgba_layers_refine, rel_pose, mpi_planes, intrinsics)

    with tf.name_scope("loss"):
        print("computing losses")
        # Mask loss for pixels outside reference frustum
        # (a pixel is masked out when at least one rendered layer sums to
        # exactly zero over its channels).
        loss_mask = tf.where(
            tf.equal(
                tf.reduce_min(
                    tf.abs(tf.reduce_sum(output_layers, axis=-1)),
                    axis=3,
                    keep_dims=True), 0.0),
            tf.zeros_like(output_alpha[:, :, :, 0:1]),
            tf.ones_like(output_alpha[:, :, :, 0:1]))
        loss_mask = tf.stop_gradient(loss_mask)
        tf.summary.image("loss_mask", loss_mask)

        # Helper functions for loss
        def compute_error(real, fake, mask):
            # Mean of the masked absolute difference.
            return tf.reduce_mean(mask * tf.abs(fake - real))

        # Normalized VGG loss (from
        # https://github.com/CQFIO/PhotographicImageSynthesis)

        # Average-pools the mask by a factor of `ds`.
        downsample = lambda tensor, ds: tf.nn.avg_pool(tensor, [1, ds, ds, 1],
                                                       [1, ds, ds, 1], "SAME")

        def vgg_loss(raw_tgt_image, output_image, loss_mask):
            """Compute VGG loss."""
            vgg_real = build_vgg19(raw_tgt_image * 255.0, vgg_model_file)
            # presumably output_image is in [-1, 1] and is mapped to
            # [0, 255] here -- TODO confirm against mpi_render_view.
            rescaled_output_image = (output_image + 1.)/2. * 255.0
            vgg_fake = build_vgg19(
                rescaled_output_image, vgg_model_file, reuse=True)
            # Per-layer errors with fixed normalization constants; the mask
            # is pooled by the factor passed to `downsample` for each layer.
            p0 = compute_error(vgg_real["input"], vgg_fake["input"],
                               loss_mask)
            p1 = compute_error(vgg_real["conv1_2"], vgg_fake["conv1_2"],
                               loss_mask)/2.6
            p2 = compute_error(vgg_real["conv2_2"], vgg_fake["conv2_2"],
                               downsample(loss_mask, 2))/4.8
            p3 = compute_error(vgg_real["conv3_2"], vgg_fake["conv3_2"],
                               downsample(loss_mask, 4))/3.7
            p4 = compute_error(vgg_real["conv4_2"], vgg_fake["conv4_2"],
                               downsample(loss_mask, 8))/5.6
            p5 = compute_error(vgg_real["conv5_2"], vgg_fake["conv5_2"],
                               downsample(loss_mask, 16))*10/1.5
            total_loss = p0+p1+p2+p3+p4+p5
            return total_loss, vgg_real, vgg_fake

        vgg_loss_initial, _, _ = vgg_loss(raw_tgt_image, output_image,
                                          loss_mask)
        tf.summary.scalar("vgg_loss_initial", vgg_loss_initial)
        total_loss = vgg_loss_initial

        vgg_loss_refine, _, _ = vgg_loss(raw_tgt_image, output_image_refine,
                                         loss_mask)
        tf.summary.scalar("vgg_loss_refine", vgg_loss_refine)
        total_loss += vgg_loss_refine

    with tf.name_scope("train_op"):
        print("setting up train op")
        train_vars = [var for var in tf.trainable_variables()]
        optim = tf.train.AdamOptimizer(learning_rate, beta1)
        grads_and_vars = optim.compute_gradients(total_loss,
                                                 var_list=train_vars)
        # Wrapped in a list: callers receive [op], not a bare op.
        train_op = [optim.apply_gradients(grads_and_vars)]

    # Summaries
    tf.summary.scalar("total_loss", total_loss)
    # Source images
    # (each source occupies three consecutive channels of raw_src_images).
    for i in range(num_source):
        src_image = raw_src_images[:, :, :, i*3:(i+1)*3]
        tf.summary.image("src_image_%d" % i, src_image)
    # Output image
    tf.summary.image("output_image", self.deprocess_image(output_image))
    # Refined output image
    tf.summary.image("output_image_refine",
                     self.deprocess_image(output_image_refine))
    # Target image
    tf.summary.image("tgt_image", raw_tgt_image)
    # Ref image
    tf.summary.image("ref_image", raw_ref_image)
    # Predicted color and alpha layers, and PSV
    num_summ = 16  # Number of plane summaries to show in tensorboard
    for i in range(num_summ):
        # Evenly spaced plane indices across the randomly chosen plane count.
        ind = tf.to_int32(i * num_mpi_planes/num_summ)
        rgb = rgba_layers[:, :, :, ind, :3]
        alpha = rgba_layers[:, :, :, ind, -1:]
        ref_plane = psv[:, :, :, ind, 3:6]
        source_plane = psv[:, :, :, ind, :3]
        output_rgb = output_layers[:, :, :, ind, :3]
        tf.summary.image("rgb_layer_%d" % i, self.deprocess_image(rgb))
        tf.summary.image("alpha_layer_%d" % i, alpha)
        tf.summary.image("rgba_layer_%d" % i,
                         self.deprocess_image(rgb * alpha))
        tf.summary.image("psv_avg_%d" % i,
                         (self.deprocess_image(0.5*ref_plane +
                                               0.5*source_plane)))
        tf.summary.image("output_rgb_%d" % i,
                         self.deprocess_image(output_rgb))
        tf.summary.image("psv_ref_%d" % i, self.deprocess_image(ref_plane))
        tf.summary.image("psv_source_%d" % i,
                         self.deprocess_image(source_plane))

    # Cumulative rendered images and refined MPI
    for i in range(num_summ):
        ind = tf.to_int32(i * num_mpi_planes/num_summ)
        rgb = rgba_layers_refine[:, :, :, ind, :3]
        alpha = rgba_layers_refine[:, :, :, ind, 3:]
        render = stuff_behind[:, :, :, ind, :3]
        input_colors = refine_input_mpi[:, :, :, ind, :3]
        tf.summary.image("rgb_layer_refine_%d" % i,
                         self.deprocess_image(rgb))
        tf.summary.image("alpha_layer_refine_%d" % i, alpha)
        tf.summary.image("rgba_layer_refine_%d" % i,
                         self.deprocess_image(rgb * alpha))
        tf.summary.image("cumulative_render_%d" % i,
                         self.deprocess_image(render))
        tf.summary.image("input_colors_refine_%d" % i,
                         self.deprocess_image(input_colors))

    return train_op
def get_losses(pointclouds_pl, end_points, dir_labels_pc_cam, offset_labels_pc,
               grasp_success_labels_pc, approach_labels_pc_cam, global_config):
    """
    Computes loss terms from pointclouds, network predictions and labels

    Six scalar losses are built: direction and approach cosine losses, an
    offset (grasp width) loss, a contact confidence cross-entropy and two
    symmetric control-point distance ("ADD-S") losses, one measured from every
    predicted grasp to its nearest positive label and one from every label to
    its nearest predicted grasp.

    Arguments:
        pointclouds_pl {tf.placeholder} -- bxNx3 input point clouds
        end_points {dict[str:tf.variable]} -- endpoints of the network containing predictions.
            Keys read here: 'grasp_dir_head', 'grasp_offset_head', 'approach_dir_head',
            'binary_seg_pred', 'binary_seg_head'
        dir_labels_pc_cam {tf.variable} -- base direction labels in camera coordinates (bxNx3)
        offset_labels_pc {tf.variable} -- grasp width labels (bxNx1)
        grasp_success_labels_pc {tf.variable} -- contact success labels. NOTE(review): this was
            documented as bxNx1, but the code sums it over axis 1, multiplies it with bxN
            tensors and expands it at axis 2 itself, i.e. it is used as bxN -- confirm with caller
        approach_labels_pc_cam {tf.variable} -- approach direction labels in camera coordinates (bxNx3)
        global_config {dict} -- config dict. Keys read here: DATA.labels.bin_weights,
            MODEL.bin_offsets, OPTIMIZER.batch_size, LOSS.offset_loss_type and the optional
            LOSS.min_geom_loss_divisor, LOSS.too_small_offset_pred_bin_factor, LOSS.topk_confidence

    Returns:
        [dir_cosine_loss, bin_ce_loss, offset_loss, approach_cosine_loss, adds_loss,
        adds_loss_gt2pred] -- All losses (not all are used for training)
    """

    grasp_dir_head = end_points['grasp_dir_head']
    grasp_offset_head = end_points['grasp_offset_head']
    approach_dir_head = end_points['approach_dir_head']

    bin_weights = global_config['DATA']['labels']['bin_weights']
    tf_bin_weights = tf.constant(bin_weights)

    # Lower bound for the per-sample normaliser below; defaults to 1 so that a
    # sample without any positive contact never divides by zero.
    min_geom_loss_divisor = tf.constant(
        float(global_config['LOSS']['min_geom_loss_divisor'])
    ) if 'min_geom_loss_divisor' in global_config['LOSS'] else tf.constant(1.)
    # Sum of positive contact labels per batch element, clamped from below.
    pos_grasps_in_view = tf.math.maximum(
        tf.reduce_sum(grasp_success_labels_pc, axis=1), min_geom_loss_divisor)

    ### ADS Gripper PC Loss
    if global_config['MODEL']['bin_offsets']:
        # Offsets are given per bin: pick the value of the arg-max bin for
        # both the prediction and the label.
        thickness_pred = tf.gather_nd(
            get_bin_vals(global_config),
            tf.expand_dims(tf.argmax(grasp_offset_head, axis=2), axis=2))
        thickness_gt = tf.gather_nd(
            get_bin_vals(global_config),
            tf.expand_dims(tf.argmax(offset_labels_pc, axis=2), axis=2))
    else:
        # Offsets are regressed directly; use the single channel.
        thickness_pred = grasp_offset_head[:, :, 0]
        thickness_gt = offset_labels_pc[:, :, 0]
    pred_grasps = build_6d_grasp(approach_dir_head, grasp_dir_head,
                                 pointclouds_pl, thickness_pred,
                                 use_tf=True)  # b x num_point x 4 x 4
    gt_grasps_proj = build_6d_grasp(approach_labels_pc_cam, dir_labels_pc_cam,
                                    pointclouds_pl, thickness_gt,
                                    use_tf=True)  # b x num_point x 4 x 4
    # Keep label grasps only where the contact label is non-zero; every other
    # entry becomes 100000 (presumably so such grasps can never be selected as
    # a nearest neighbour in the distance search below).
    pos_gt_grasps_proj = tf.where(
        tf.broadcast_to(
            tf.expand_dims(
                tf.expand_dims(tf.cast(grasp_success_labels_pc, tf.bool), 2),
                3), gt_grasps_proj.shape), gt_grasps_proj,
        tf.ones_like(gt_grasps_proj) * 100000)
    # pos_gt_grasps_proj = tf.reshape(pos_gt_grasps_proj, (global_config['OPTIMIZER']['batch_size'], -1, 4, 4))

    gripper = mesh_utils.create_gripper('panda')
    gripper_control_points = gripper.get_control_point_tensor(
        global_config['OPTIMIZER']['batch_size'])  # b x 5 x 3
    sym_gripper_control_points = gripper.get_control_point_tensor(
        global_config['OPTIMIZER']['batch_size'], symmetric=True)

    # Append a column of ones so the control points can be multiplied with the
    # 4 x 4 grasp matrices.
    gripper_control_points_homog = tf.concat([
        gripper_control_points,
        tf.ones((global_config['OPTIMIZER']['batch_size'],
                 gripper_control_points.shape[1], 1))
    ], axis=2)  # b x 5 x 4
    sym_gripper_control_points_homog = tf.concat([
        sym_gripper_control_points,
        tf.ones((global_config['OPTIMIZER']['batch_size'],
                 gripper_control_points.shape[1], 1))
    ], axis=2)  # b x 5 x 4

    # only use per point pred grasps but not per point gt grasps
    control_points = tf.keras.backend.repeat_elements(
        tf.expand_dims(gripper_control_points_homog, 1),
        gt_grasps_proj.shape[1],
        axis=1)  # b x num_point x 5 x 4
    sym_control_points = tf.keras.backend.repeat_elements(
        tf.expand_dims(sym_gripper_control_points_homog, 1),
        gt_grasps_proj.shape[1],
        axis=1)  # b x num_point x 5 x 4
    # Control points multiplied with the transposed grasp matrices; the last
    # (homogeneous) coordinate is dropped.
    pred_control_points = tf.matmul(
        control_points, tf.transpose(
            pred_grasps, perm=[0, 1, 3, 2]))[:, :, :, :3]  # b x num_point x 5 x 3

    ### Pred Grasp to GT Grasp ADD-S Loss
    gt_control_points = tf.matmul(
        control_points,
        tf.transpose(pos_gt_grasps_proj, perm=[0, 1, 3, 2
                                               ]))[:, :, :, :3]  # b x num_pos_grasp_point x 5 x 3
    sym_gt_control_points = tf.matmul(
        sym_control_points,
        tf.transpose(pos_gt_grasps_proj, perm=[0, 1, 3, 2
                                               ]))[:, :, :, :3]  # b x num_pos_grasp_point x 5 x 3

    # Pairwise squared control-point distance between every predicted grasp
    # (axis 1) and every label grasp (axis 2); the 5 x 3 control-point axes are
    # summed out, leaving b x num_point x num_pos_grasp_point.
    squared_add = tf.reduce_sum(
        (tf.expand_dims(pred_control_points, 2) -
         tf.expand_dims(gt_control_points, 1))**2,
        axis=(3, 4))
    sym_squared_add = tf.reduce_sum(
        (tf.expand_dims(pred_control_points, 2) -
         tf.expand_dims(sym_gt_control_points, 1))**2,
        axis=(3, 4))

    # symmetric ADD-S
    neg_squared_adds = -tf.concat(
        [squared_add, sym_squared_add],
        axis=2)  # b x num_point x 2num_pos_grasp_point
    # top_k with k=1 on the negated distances selects the smallest distance
    # per predicted grasp; the result keeps a trailing axis of size 1.
    neg_squared_adds_k = tf.math.top_k(neg_squared_adds, k=1,
                                       sorted=False)[0]  # b x num_point x 1

    # If any pos grasp exists
    # The first factor is min(sum of labels, 1) per batch element, so samples
    # whose labels sum to zero contribute zero distance.
    min_adds = tf.minimum(
        tf.reduce_sum(grasp_success_labels_pc, axis=1, keepdims=True),
        tf.ones_like(neg_squared_adds_k[:, :, 0])
    ) * tf.sqrt(
        -neg_squared_adds_k[:, :, 0]
    )  #tf.minimum(tf.sqrt(-neg_squared_adds_k), tf.ones_like(neg_squared_adds_k)) # b x num_point
    # Distances are weighted with the predicted contact confidence.
    adds_loss = tf.reduce_mean(end_points['binary_seg_pred'][:, :, 0] * min_adds)

    ### GT Grasp to pred Grasp ADD-S Loss
    # NOTE: gt_control_points / sym_gt_control_points are rebound here, now
    # from the unmasked label grasps (masking happens after the search).
    gt_control_points = tf.matmul(
        control_points,
        tf.transpose(gt_grasps_proj, perm=[0, 1, 3, 2
                                           ]))[:, :, :, :3]  # b x num_pos_grasp_point x 5 x 3
    sym_gt_control_points = tf.matmul(
        sym_control_points,
        tf.transpose(gt_grasps_proj, perm=[0, 1, 3, 2
                                           ]))[:, :, :, :3]  # b x num_pos_grasp_point x 5 x 3

    # Axes are swapped relative to the block above: label grasps on axis 1,
    # predicted grasps on axis 2.
    neg_squared_adds = -tf.reduce_sum(
        (tf.expand_dims(pred_control_points, 1) -
         tf.expand_dims(gt_control_points, 2))**2,
        axis=(3, 4))
    neg_squared_adds_sym = -tf.reduce_sum(
        (tf.expand_dims(pred_control_points, 1) -
         tf.expand_dims(sym_gt_control_points, 2))**2,
        axis=(3, 4))

    neg_squared_adds_k_gt2pred, pred_grasp_idcs = tf.math.top_k(
        neg_squared_adds, k=1, sorted=False)  # b x num_pos_grasp_point
    neg_squared_adds_k_sym_gt2pred, pred_grasp_sym_idcs = tf.math.top_k(
        neg_squared_adds_sym, k=1, sorted=False)  # b x num_pos_grasp_point
    # Values are negated distances, so "less" means farther away: take the
    # index from the symmetric search wherever it found the closer grasp.
    pred_grasp_idcs_joined = tf.where(
        neg_squared_adds_k_gt2pred < neg_squared_adds_k_sym_gt2pred,
        pred_grasp_sym_idcs, pred_grasp_idcs)
    # Note: unlike min_adds above, no square root is taken here.
    min_adds_gt2pred = tf.minimum(
        -neg_squared_adds_k_gt2pred,
        -neg_squared_adds_k_sym_gt2pred)  # b x num_pos_grasp_point x 1
    # min_adds_gt2pred = tf.math.exp(-min_adds_gt2pred)
    masked_min_adds_gt2pred = tf.multiply(min_adds_gt2pred[:, :, 0],
                                          grasp_success_labels_pc)

    # Build (batch index, point index) pairs to look up the confidence of the
    # predicted grasp selected for every label; batch_idcs[1] holds the batch
    # index at every position.
    batch_idcs = tf.meshgrid(tf.range(pred_grasp_idcs_joined.shape[1]),
                             tf.range(pred_grasp_idcs_joined.shape[0]))
    gather_idcs = tf.stack((batch_idcs[1], pred_grasp_idcs_joined[:, :, 0]),
                           axis=2)
    nearest_pred_grasp_confidence = tf.gather_nd(
        end_points['binary_seg_pred'][:, :, 0], gather_idcs)
    adds_loss_gt2pred = tf.reduce_mean(
        tf.reduce_sum(nearest_pred_grasp_confidence * masked_min_adds_gt2pred,
                      axis=1) / pos_grasps_in_view)

    ### Grasp baseline Loss
    # 1 - dot product of label and predicted direction (a cosine distance if
    # both are unit vectors -- not enforced here).
    cosine_distance = tf.constant(1.) - tf.reduce_sum(
        tf.multiply(dir_labels_pc_cam, grasp_dir_head), axis=2)
    # only pass loss where we have labeled contacts near pc points
    masked_cosine_loss = tf.multiply(cosine_distance, grasp_success_labels_pc)
    dir_cosine_loss = tf.reduce_mean(
        tf.reduce_sum(masked_cosine_loss, axis=1) / pos_grasps_in_view)

    ### Grasp Approach Loss
    # Remove the component of the approach label along the *predicted* base
    # direction and re-normalise before comparing with the predicted approach.
    approach_labels_orthog = tf.math.l2_normalize(
        approach_labels_pc_cam -
        tf.reduce_sum(tf.multiply(grasp_dir_head, approach_labels_pc_cam),
                      axis=2,
                      keepdims=True) * grasp_dir_head,
        axis=2)
    cosine_distance_approach = tf.constant(1.) - tf.reduce_sum(
        tf.multiply(approach_labels_orthog, approach_dir_head), axis=2)
    masked_approach_loss = tf.multiply(cosine_distance_approach,
                                       grasp_success_labels_pc)
    approach_cosine_loss = tf.reduce_mean(
        tf.reduce_sum(masked_approach_loss, axis=1) / pos_grasps_in_view)

    ### Grasp Offset/Thickness Loss
    if global_config['MODEL']['bin_offsets']:
        if global_config['LOSS'][
                'offset_loss_type'] == 'softmax_cross_entropy':
            offset_loss = tf.losses.softmax_cross_entropy(
                offset_labels_pc, grasp_offset_head)
        else:
            offset_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=offset_labels_pc, logits=grasp_offset_head)

            if 'too_small_offset_pred_bin_factor' in global_config[
                    'LOSS'] and global_config['LOSS'][
                        'too_small_offset_pred_bin_factor']:
                too_small_offset_pred_bin_factor = tf.constant(
                    global_config['LOSS']['too_small_offset_pred_bin_factor'],
                    tf.float32)
                # The reverse cumulative sum over the bin axis is non-zero for
                # the labelled bin and all bins before it, so those bins get a
                # weight larger than 1.
                collision_weight = tf.math.cumsum(
                    offset_labels_pc, axis=2, reverse=True
                ) * too_small_offset_pred_bin_factor + tf.constant(1.)
                offset_loss = tf.multiply(collision_weight, offset_loss)

            # Per-bin weighting, then average over the bin axis.
            offset_loss = tf.reduce_mean(tf.multiply(
                tf.reshape(tf_bin_weights, (1, 1, -1)), offset_loss),
                                         axis=2)
    else:
        # Plain squared error on the regressed offset.
        offset_loss = (grasp_offset_head[:, :, 0] -
                       offset_labels_pc[:, :, 0])**2
    masked_offset_loss = tf.multiply(offset_loss, grasp_success_labels_pc)
    offset_loss = tf.reduce_mean(
        tf.reduce_sum(masked_offset_loss, axis=1) / pos_grasps_in_view)

    ### Grasp Confidence Loss
    bin_ce_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.expand_dims(grasp_success_labels_pc, axis=2),
        logits=end_points['binary_seg_head'])
    if 'topk_confidence' in global_config['LOSS'] and global_config['LOSS'][
            'topk_confidence']:
        # Keep only the k largest per-point losses. NOTE(review): tf.squeeze
        # without an axis also drops a batch axis of size 1; the mean below is
        # unaffected by that.
        bin_ce_loss, _ = tf.math.top_k(
            tf.squeeze(bin_ce_loss), k=global_config['LOSS']['topk_confidence'])
    bin_ce_loss = tf.reduce_mean(bin_ce_loss)

    return dir_cosine_loss, bin_ce_loss, offset_loss, approach_cosine_loss, adds_loss, adds_loss_gt2pred
def evaluate(self, params, tpts):
    """Return a tensor shaped like `tpts` whose entries all equal `params[0]`.

    Args:
      params: indexable collection; only its first element is used.
      tpts: tensor whose shape and dtype determine the output.

    Returns:
      `params[0]` multiplied with a tensor of ones that matches `tpts`.
    """
    constant_value = params[0]
    ones_template = tf.ones_like(tpts)
    return constant_value * ones_template
def compute_labels(pos_contact_pts_mesh, pos_contact_dirs_mesh,
                   pos_contact_approaches_mesh, pos_finger_diffs, pc_cam_pl,
                   camera_pose_pl, global_config):
    """
    Project grasp labels defined on meshes onto rendered point cloud from a camera pose via nearest neighbor contacts within a maximum radius.
    All points without nearby successful grasp contacts are considered negative contact points.

    Arguments:
        pos_contact_pts_mesh {tf.constant} -- positive contact points on the mesh scene (Mx3)
        pos_contact_dirs_mesh {tf.constant} -- respective contact base directions in the mesh scene (Mx3)
        pos_contact_approaches_mesh {tf.constant} -- respective contact approach directions in the mesh scene (Mx3)
        pos_finger_diffs {tf.constant} -- respective grasp widths in the mesh scene (Mx1)
        pc_cam_pl {tf.placeholder} -- bxNx3 rendered point clouds
        camera_pose_pl {tf.placeholder} -- bx4x4 camera poses
        global_config {dict} -- global config; reads DATA.labels.{k, max_radius, filter_z, z_val}

    Returns:
        [dir_labels_pc_cam, offset_labels_pc, grasp_success_labels_pc, approach_labels_pc_cam] -- Per-point contact success labels and per-contact pose labels in rendered point cloud
    """
    label_config = global_config['DATA']['labels']

    nsample = label_config['k']
    radius = label_config['max_radius']
    filter_z = label_config['filter_z']
    z_val = label_config['z_val']

    xyz_cam = pc_cam_pl[:, :, :3]
    pad_homog = tf.ones((xyz_cam.shape[0], xyz_cam.shape[1], 1))
    # Only the static batch dimension of pc_mesh is used below.
    pc_mesh = tf.matmul(
        tf.concat([xyz_cam, pad_homog], 2),
        tf.transpose(tf.linalg.inv(camera_pose_pl), perm=[0, 2, 1]))[:, :, :3]

    # Replicate the per-contact quantities along a new batch axis.
    contact_point_offsets_batch = tf.keras.backend.repeat_elements(
        tf.expand_dims(pos_finger_diffs, 0), pc_mesh.shape[0], axis=0)

    pad_homog2 = tf.ones((pc_mesh.shape[0], pos_contact_dirs_mesh.shape[0], 1))
    # Directions are multiplied with the upper-left 3x3 block of the pose only
    # (no translation); points use the full 4x4 pose.
    contact_point_dirs_batch = tf.keras.backend.repeat_elements(
        tf.expand_dims(pos_contact_dirs_mesh, 0), pc_mesh.shape[0], axis=0)
    contact_point_dirs_batch_cam = tf.matmul(
        contact_point_dirs_batch,
        tf.transpose(camera_pose_pl[:, :3, :3], perm=[0, 2, 1]))[:, :, :3]

    pos_contact_approaches_batch = tf.keras.backend.repeat_elements(
        tf.expand_dims(pos_contact_approaches_mesh, 0),
        pc_mesh.shape[0],
        axis=0)
    pos_contact_approaches_batch_cam = tf.matmul(
        pos_contact_approaches_batch,
        tf.transpose(camera_pose_pl[:, :3, :3], perm=[0, 2, 1]))[:, :, :3]

    contact_point_batch_mesh = tf.keras.backend.repeat_elements(
        tf.expand_dims(pos_contact_pts_mesh, 0), pc_mesh.shape[0], axis=0)
    contact_point_batch_cam = tf.matmul(
        tf.concat([contact_point_batch_mesh, pad_homog2], 2),
        tf.transpose(camera_pose_pl, perm=[0, 2, 1]))[:, :, :3]

    if filter_z:
        # Contacts whose camera-frame base direction has z <= z_val are moved
        # far away so they can never be among the nearest neighbours.
        # The mask must be applied to the camera-frame points that feed the
        # distance computation below; applying it to the mesh-frame copy
        # (as was done previously) had no effect because that tensor is not
        # read again.
        dir_filter_passed = tf.keras.backend.repeat_elements(
            tf.math.greater(contact_point_dirs_batch_cam[:, :, 2:3],
                            tf.constant([z_val])),
            3,
            axis=2)
        contact_point_batch_cam = tf.where(
            dir_filter_passed, contact_point_batch_cam,
            tf.ones_like(contact_point_batch_cam) * 100000)

    # Squared distance between every point-cloud point (axis 1) and every
    # contact point (axis 2).
    squared_dists_all = tf.reduce_sum(
        (tf.expand_dims(contact_point_batch_cam, 1) -
         tf.expand_dims(xyz_cam, 2))**2,
        axis=3)
    # top_k on the negated distances yields the nsample closest contacts.
    neg_squared_dists_k, close_contact_pt_idcs = tf.math.top_k(
        -squared_dists_all, k=nsample, sorted=False)
    squared_dists_k = -neg_squared_dists_k

    # Nearest neighbor mapping
    # A point is positive when the mean squared distance to its nsample
    # closest contacts is below radius^2.
    grasp_success_labels_pc = tf.cast(
        tf.less(tf.reduce_mean(squared_dists_k, axis=2), radius * radius),
        tf.float32)  # (batch_size, num_point)

    grouped_dirs_pc_cam = group_point(contact_point_dirs_batch_cam,
                                      close_contact_pt_idcs)
    grouped_approaches_pc_cam = group_point(pos_contact_approaches_batch_cam,
                                            close_contact_pt_idcs)
    grouped_offsets = group_point(
        tf.expand_dims(contact_point_offsets_batch, 2), close_contact_pt_idcs)

    # Average the grouped neighbour labels; directions are re-normalised.
    dir_labels_pc_cam = tf.math.l2_normalize(
        tf.reduce_mean(grouped_dirs_pc_cam, axis=2),
        axis=2)  # (batch_size, num_point, 3)
    approach_labels_pc_cam = tf.math.l2_normalize(
        tf.reduce_mean(grouped_approaches_pc_cam, axis=2),
        axis=2)  # (batch_size, num_point, 3)
    offset_labels_pc = tf.reduce_mean(grouped_offsets, axis=2)

    return dir_labels_pc_cam, offset_labels_pc, grasp_success_labels_pc, approach_labels_pc_cam
def meta_optimize(self):
    """Meta optimization step.

    Builds the graph for one meta step: a one-step look-ahead copy of the
    network weights is formed from the per-example weighted training loss,
    the look-ahead model is scored on the probe batch, and the gradient of
    that probe loss with respect to the per-example weight (`target`) and the
    per-example label-mixing coefficient (`eps`) is used to derive new values
    for both.

    Reads from `self`: probe_images, probe_labels, labels, net, logits,
    batch_size, strategy, optimizer, dataset.num_classes, guessed_label.

    Returns:
      A tuple `(weight, new_eps, meta_loss, meta_acc)`:
        weight: per-example weights, non-negative and normalised by their sum
          (gradient stopped).
        new_eps: per-example coefficient, 1 where the gradient w.r.t. `eps`
          is negative and 0 otherwise (gradient stopped).
        meta_loss: scalar probe loss including the meta net's regularization
          term.
        meta_acc: accuracy value tensor from `tf.metrics.accuracy` on the
          probe batch.
    """

    probe_images, probe_labels = self.probe_images, self.probe_labels
    labels = self.labels
    net = self.net
    logits = self.logits
    gate_gradients = 1

    # Per-replica batch size; every example starts with weight 1/batch_size.
    batch_size = int(self.batch_size / self.strategy.num_replicas_in_sync)
    init_eps_val = float(1) / batch_size

    meta_net = networks.MetaImage(self.net, name='meta_model')

    if FLAGS.meta_momentum and not self.optimizer.variables():
        # Initializing momentum state of optimizer for meta momentum update.
        # It is a hacky implementation
        logging.info('Pre-initialize optimizer momentum states.')
        idle_net_cost = tf.losses.sparse_softmax_cross_entropy(
            self.labels, logits)
        tmp_var_grads = self.optimizer.compute_gradients(
            tf.reduce_mean(idle_net_cost), net.trainable_variables)
        # The returned op is discarded; the call is made only for its side
        # effect of creating the optimizer's variables.
        self.optimizer.apply_gradients(tmp_var_grads)

    with tf.name_scope('coefficient'):
        # Data weight coefficient
        target = tf.constant([init_eps_val] * batch_size,
                             shape=(batch_size, ),
                             dtype=np.float32,
                             name='weight')
        # Data re-labeling coefficient
        eps = tf.constant([FLAGS.grad_eps_init] * batch_size,
                          shape=(batch_size, ),
                          dtype=tf.float32,
                          name='eps')

    onehot_labels = tf.one_hot(labels, self.dataset.num_classes)
    onehot_labels = tf.cast(onehot_labels, tf.float32)
    eps_k = tf.reshape(eps, [batch_size, 1])

    # Convex combination of the given one-hot label and self.guessed_label,
    # controlled per example by eps.
    mixed_labels = eps_k * onehot_labels + (1 - eps_k) * self.guessed_label
    # raw softmax loss
    log_softmax = tf.nn.log_softmax(logits)
    net_cost = -tf.reduce_sum(mixed_labels * log_softmax, 1)

    # Per-example losses weighted by `target`, plus the net's regularization.
    lookahead_loss = tf.reduce_sum(tf.multiply(target, net_cost))
    lookahead_loss = lookahead_loss + net.regularization_loss

    with tf.control_dependencies([lookahead_loss]):
        train_vars = net.trainable_variables
        var_grads = tf.gradients(lookahead_loss,
                                 train_vars,
                                 gate_gradients=gate_gradients)

        # static_vars[i] = train_vars[i] - meta_stepsize * grad: the weights
        # after one look-ahead gradient step, kept as tensors so that they
        # stay differentiable w.r.t. `target` and `eps`.
        static_vars = []
        for i in range(len(train_vars)):
            if FLAGS.meta_momentum > 0:
                actual_grad = self.meta_momentum_update(
                    var_grads[i], train_vars[i].name, self.optimizer)
                static_vars.append(
                    tf.math.subtract(train_vars[i],
                                     FLAGS.meta_stepsize * actual_grad))
            else:
                static_vars.append(
                    tf.math.subtract(train_vars[i],
                                     FLAGS.meta_stepsize * var_grads[i]))
            # new style
            # Registers the look-ahead tensor under the original variable's
            # name (presumably so meta_net uses it in place of that variable
            # -- verify against networks.MetaImage).
            meta_net.add_variable_alias(static_vars[-1],
                                        var_name=train_vars[i].name)

        for uv in net.updates_variables:
            meta_net.add_variable_alias(uv,
                                        var_name=uv.name,
                                        var_type='updates_variables')
        meta_net.verbose()

    with tf.control_dependencies(static_vars):
        g_logits = meta_net(probe_images,
                            name='meta_model',
                            reuse=True,
                            training=True)

        desired_y = tf.one_hot(probe_labels, self.dataset.num_classes)
        meta_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            desired_y, g_logits)
        meta_loss = tf.reduce_mean(meta_loss, name='meta_loss')
        meta_loss = meta_loss + meta_net.get_regularization_loss(net.wd)
        meta_acc, meta_acc_op = tf.metrics.accuracy(
            probe_labels, tf.argmax(g_logits, axis=1))

    # Depending on meta_acc_op ensures the accuracy metric is updated
    # whenever the gradients below are evaluated.
    with tf.control_dependencies([meta_loss] + [meta_acc_op]):
        meta_train_vars = meta_net.trainable_variables
        grad_meta_vars = tf.gradients(meta_loss,
                                      meta_train_vars,
                                      gate_gradients=gate_gradients)
        # Back-propagate the probe-loss gradients through the look-ahead step
        # by passing them as grad_ys for static_vars.
        # NOTE(review): this requires meta_train_vars to line up one-to-one
        # with static_vars -- confirm in networks.MetaImage.
        grad_target, grad_eps = tf.gradients(static_vars, [target, eps],
                                             grad_ys=grad_meta_vars,
                                             gate_gradients=gate_gradients)
    # updates weight
    # `target` is the constant init_eps_val, so after the two statements
    # below raw_weight equals -grad_target.
    raw_weight = target - grad_target
    raw_weight = raw_weight - init_eps_val
    # Negative weights are clipped to zero before normalisation.
    unorm_weight = tf.clip_by_value(raw_weight,
                                    clip_value_min=0,
                                    clip_value_max=float('inf'))
    norm_c = tf.reduce_sum(unorm_weight)
    # The small constant avoids division by zero when every weight is clipped.
    weight = tf.divide(unorm_weight, norm_c + 0.00001)

    # gets new lambda by the sign of gradient
    new_eps = tf.where(grad_eps < 0,
                       x=tf.ones_like(eps),
                       y=tf.zeros_like(eps))

    return tf.stop_gradient(weight), tf.stop_gradient(
        new_eps), meta_loss, meta_acc
def crop_mask_in_target_box(masks,
                            boxes,
                            target_boxes,
                            output_size,
                            sample_offset=0,
                            use_einsum=True):
    """Crop masks in target boxes.

    Each mask is zero-padded by 2 pixels on every side, the target box is
    expressed in the pixel coordinates of that padded mask, and the result is
    resampled to a square crop with `selective_crop_and_resize`.

    Args:
      masks: A tensor with a shape of [batch_size, num_masks, height, width].
        All four dimensions must be statically known.
      boxes: a float tensor representing box coordinates that tightly enclose
        masks with a shape of [batch_size, num_masks, 4] in un-normalized
        coordinates. A box is represented by [ymin, xmin, ymax, xmax].
      target_boxes: a float tensor representing target box coordinates for
        masks with a shape of [batch_size, num_masks, 4] in un-normalized
        coordinates. A box is represented by [ymin, xmin, ymax, xmax].
      output_size: A scalar to indicate the output crop size. It currently only
        supports to output a square shape outputs.
      sample_offset: a float number in [0, 1] indicates the subpixel sample
        offset from grid point.
      use_einsum: Use einsum to replace gather in selective_crop_and_resize.

    Returns:
      A 4-D tensor representing feature crop of shape
      [batch_size, num_masks, output_size, output_size].
    """
    with tf.name_scope('crop_mask_in_target_box'):
        batch_size, num_masks, height, width = masks.get_shape().as_list()
        masks = tf.reshape(masks, [batch_size * num_masks, height, width, 1])
        # Pad zeros on the boundary of masks.
        masks = tf.image.pad_to_bounding_box(masks, 2, 2, height + 4,
                                             width + 4)
        masks = tf.reshape(masks,
                           [batch_size, num_masks, height + 4, width + 4, 1])

        # Projects target box locations and sizes to corresponding cropped
        # mask coordinates.
        gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
            value=boxes, num_or_size_splits=4, axis=2)
        bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
            value=target_boxes, num_or_size_splits=4, axis=2)

        # Extent of the enclosing box; _EPSILON guards against zero-size boxes.
        gt_height = gt_y_max - gt_y_min + _EPSILON
        gt_width = gt_x_max - gt_x_min + _EPSILON

        # Vertical quantities scale with the mask height and horizontal ones
        # with the mask width. (The x offset used to be scaled by `height` and
        # the box height by `width`; that is only equivalent for square
        # masks.) The "+ 2" accounts for the zero padding added above.
        y_transform = (bb_y_min - gt_y_min) * height / gt_height + 2
        x_transform = (bb_x_min - gt_x_min) * width / gt_width + 2
        h_transform = (bb_y_max - bb_y_min) * height / gt_height
        w_transform = (bb_x_max - bb_x_min) * width / gt_width

        # Largest valid row / column index of the padded masks.
        boundaries = tf.concat([
            tf.to_float(tf.ones_like(y_transform) * ((height + 4) - 1)),
            tf.to_float(tf.ones_like(x_transform) * ((width + 4) - 1))
        ],
                               axis=-1)

        # Reshape tensors to have the right shape for
        # selective_crop_and_resize.
        transformed_boxes = tf.concat(
            [y_transform, x_transform, h_transform, w_transform], -1)
        # Mask i is treated as "level" i, so every box samples its own mask.
        levels = tf.tile(tf.reshape(tf.range(num_masks), [1, num_masks]),
                         [batch_size, 1])

        cropped_masks = selective_crop_and_resize(
            masks,
            transformed_boxes,
            levels,
            boundaries,
            output_size,
            sample_offset=sample_offset,
            use_einsum_gather=use_einsum)
        cropped_masks = tf.squeeze(cropped_masks, axis=-1)

    return cropped_masks