def build_loss(self):
  """Adds ops for computing loss."""
  with tf.name_scope('compute_loss'):
    self.reconstr_loss = 0
    self.smooth_loss = 0
    self.ssim_loss = 0
    self.icp_transform_loss = 0
    self.icp_residual_loss = 0

    # self.images is organized by ...[scale][B, h, w, seq_len * 3].
    self.images = [None for _ in range(NUM_SCALES)]
    # Following nested lists are organized by ...[scale][source-target].
    self.warped_image = [{} for _ in range(NUM_SCALES)]
    self.warp_mask = [{} for _ in range(NUM_SCALES)]
    self.warp_error = [{} for _ in range(NUM_SCALES)]
    self.ssim_error = [{} for _ in range(NUM_SCALES)]
    self.icp_transform = [{} for _ in range(NUM_SCALES)]
    self.icp_residual = [{} for _ in range(NUM_SCALES)]

    self.middle_frame_index = util.get_seq_middle(self.seq_length)

    # Compute losses at each scale.
    for s in range(NUM_SCALES):
      # Scale image stack.
      if s == 0:  # Just as a precaution. TF often has interpolation bugs.
        self.images[s] = self.image_stack
      else:
        height_s = int(self.img_height / (2**s))
        width_s = int(self.img_width / (2**s))
        self.images[s] = tf.image.resize_bilinear(
            self.image_stack, [height_s, width_s], align_corners=True)

      # Smoothness.
      if self.smooth_weight > 0:
        for i in range(self.seq_length):
          # When computing minimum loss, use the depth map from the middle
          # frame only.
          if not self.compute_minimum_loss or i == self.middle_frame_index:
            disp_smoothing = self.disp[i][s]
            if self.depth_normalization:
              # Perform depth normalization, dividing by the mean.
              mean_disp = tf.reduce_mean(disp_smoothing, axis=[1, 2, 3],
                                         keep_dims=True)
              disp_input = disp_smoothing / mean_disp
            else:
              disp_input = disp_smoothing
            scaling_f = (1.0 if self.equal_weighting else 1.0 / (2**s))
            self.smooth_loss += scaling_f * self.depth_smoothness(
                disp_input, self.images[s][:, :, :, 3 * i:3 * (i + 1)])

      self.debug_all_warped_image_batches = []
      for i in range(self.seq_length):
        for j in range(self.seq_length):
          if i == j:
            continue

          # When computing minimum loss, only consider the middle frame as
          # target.
          if self.compute_minimum_loss and j != self.middle_frame_index:
            continue
          # We only consider adjacent frames, unless either
          # compute_minimum_loss is on (where the middle frame is matched
          # with all other frames) or exhaustive_mode is on (where all
          # frames are matched with each other).
          if (not self.compute_minimum_loss and not self.exhaustive_mode and
              abs(i - j) != 1):
            continue

          selected_scale = 0 if self.depth_upsampling else s
          source = self.images[selected_scale][:, :, :, 3 * i:3 * (i + 1)]
          target = self.images[selected_scale][:, :, :, 3 * j:3 * (j + 1)]

          if self.depth_upsampling:
            target_depth = self.depth_upsampled[j][s]
          else:
            target_depth = self.depth[j][s]

          key = '%d-%d' % (i, j)

          if self.handle_motion:
            # self.seg_stack of shape (B, H, W, 9).
            # target_depth corresponds to middle frame, of shape
            # (B, H, W, 1).

            # Now incorporate the other warping results, performed according
            # to the object motion network's predictions.
            # self.object_masks: batch_size elements of (N, H, W, 9).
            # self.object_masks_warped: batch_size elements of (N, H, W, 9).
            # self.object_transforms: batch_size elements of (N, 2, 6).
            self.all_batches = []
            for batch_s in range(self.batch_size):
              # To warp i into j, first take the base warping (this is the
              # full image i warped into j using only the egomotion
              # estimate).
              base_warping = self.warped_seq[s][i][batch_s]
              transform_matrices_thisbatch = tf.map_fn(
                  lambda transform: project.get_transform_mat(
                      tf.expand_dims(transform, axis=0), i, j)[0],
                  self.object_transforms[0][batch_s])

              def inverse_warp_wrapper(matrix):
                """Wrapper for the inverse warping method."""
                warp_image, _ = (
                    project.inverse_warp(
                        tf.expand_dims(base_warping, axis=0),
                        tf.expand_dims(target_depth[batch_s], axis=0),
                        tf.expand_dims(matrix, axis=0),
                        tf.expand_dims(
                            self.intrinsic_mat[batch_s, selected_scale, :, :],
                            axis=0),
                        tf.expand_dims(
                            self.intrinsic_mat_inv[
                                batch_s, selected_scale, :, :],
                            axis=0)))
                return warp_image

              warped_images_thisbatch = tf.map_fn(
                  inverse_warp_wrapper, transform_matrices_thisbatch,
                  dtype=tf.float32)
              warped_images_thisbatch = warped_images_thisbatch[:, 0, :, :, :]
              # warped_images_thisbatch is now of shape (N, H, W, 3).

              # Combine warped frames into a single one, using the object
              # masks. Result should be (1, 128, 416, 3).
              # Essentially, we here want to sum them all up, filtered by
              # the respective object masks.
              mask_base_valid_source = tf.equal(
                  self.seg_stack[batch_s, :, :, i*3:(i+1)*3],
                  tf.constant(0, dtype=tf.uint8))
              mask_base_valid_target = tf.equal(
                  self.seg_stack[batch_s, :, :, j*3:(j+1)*3],
                  tf.constant(0, dtype=tf.uint8))
              mask_valid = tf.logical_and(mask_base_valid_source,
                                          mask_base_valid_target)
              self.base_warping = base_warping * tf.to_float(mask_valid)
              background = tf.expand_dims(self.base_warping, axis=0)

              def construct_const_filter_tensor(obj_id):
                return tf.fill(
                    dims=[self.img_height, self.img_width, 3],
                    value=tf.sign(obj_id)) * tf.to_float(
                        tf.equal(self.seg_stack[batch_s, :, :, 3:6],
                                 tf.cast(obj_id, dtype=tf.uint8)))

              filter_tensor = tf.map_fn(
                  construct_const_filter_tensor,
                  tf.to_float(self.object_ids[s][batch_s]))
              filter_tensor = tf.stack(filter_tensor, axis=0)
              objects_to_add = tf.reduce_sum(
                  tf.multiply(warped_images_thisbatch, filter_tensor),
                  axis=0, keepdims=True)
              combined = background + objects_to_add
              self.all_batches.append(combined)
            # Now of shape (B, 128, 416, 3).
            self.warped_image[s][key] = tf.concat(self.all_batches, axis=0)
          else:
            # Don't handle motion, classic model formulation.
            egomotion_mat_i_j = project.get_transform_mat(
                self.egomotion, i, j)
            # Inverse warp the source image to the target image frame for
            # photometric consistency loss.
            self.warped_image[s][key], self.warp_mask[s][key] = (
                project.inverse_warp(
                    source,
                    target_depth,
                    egomotion_mat_i_j,
                    self.intrinsic_mat[:, selected_scale, :, :],
                    self.intrinsic_mat_inv[:, selected_scale, :, :]))

          # Reconstruction loss.
          self.warp_error[s][key] = tf.abs(self.warped_image[s][key] - target)
          if not self.compute_minimum_loss:
            self.reconstr_loss += tf.reduce_mean(
                self.warp_error[s][key] * self.warp_mask[s][key])
          # SSIM.
          if self.ssim_weight > 0:
            self.ssim_error[s][key] = self.ssim(self.warped_image[s][key],
                                                target)
            # TODO(rezama): This should be min_pool2d().
            if not self.compute_minimum_loss:
              ssim_mask = slim.avg_pool2d(self.warp_mask[s][key], 3, 1,
                                          'VALID')
              self.ssim_loss += tf.reduce_mean(
                  self.ssim_error[s][key] * ssim_mask)

      # If the minimum loss should be computed, the loss calculation has
      # been postponed until here.
      if self.compute_minimum_loss:
        for frame_index in range(self.middle_frame_index):
          key1 = '%d-%d' % (frame_index, self.middle_frame_index)
          key2 = '%d-%d' % (self.seq_length - frame_index - 1,
                            self.middle_frame_index)
          logging.info('computing min error between %s and %s', key1, key2)
          min_error = tf.minimum(self.warp_error[s][key1],
                                 self.warp_error[s][key2])
          self.reconstr_loss += tf.reduce_mean(min_error)
          if self.ssim_weight > 0:  # Also compute the minimum SSIM loss.
            min_error_ssim = tf.minimum(self.ssim_error[s][key1],
                                        self.ssim_error[s][key2])
            self.ssim_loss += tf.reduce_mean(min_error_ssim)

    # Build the total loss as composed of L1 reconstruction, SSIM, smoothing
    # and object size constraint loss as appropriate.
    self.reconstr_loss *= self.reconstr_weight
    self.total_loss = self.reconstr_loss
    if self.smooth_weight > 0:
      self.smooth_loss *= self.smooth_weight
      self.total_loss += self.smooth_loss
    if self.ssim_weight > 0:
      self.ssim_loss *= self.ssim_weight
      self.total_loss += self.ssim_loss
    if self.size_constraint_weight > 0:
      self.inf_loss *= self.size_constraint_weight
      self.total_loss += self.inf_loss
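
# The TODO above notes that the SSIM validity mask should really be
# min-pooled, so that a 3x3 window only counts as valid when every pixel in
# it is valid. TF-Slim ships no min_pool2d(); a minimal sketch of one (an
# assumption, not part of this codebase) is max-pooling the negated input:
def min_pool2d(inputs, kernel_size, stride, padding):
  """Hypothetical min-pooling built from slim.max_pool2d."""
  return -slim.max_pool2d(-inputs, kernel_size, stride, padding)

# It would then drop in for the avg_pool2d call above, e.g.:
#   ssim_mask = min_pool2d(self.warp_mask[s][key], 3, 1, 'VALID')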
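
# A minimal NumPy sketch (illustrative only; names are assumptions) of the
# per-object compositing performed in build_loss() above: each warped object
# image is gated by a filter that is 1 on that object's middle-frame pixels
# and 0 elsewhere (tf.sign(obj_id) zeroes out the background id 0), and the
# gated images are summed onto the background-masked base warping.
import numpy as np

def composite_objects(base_warping, warped_objs, seg_middle, object_ids):
  """base_warping: (H, W, 3); warped_objs: (N, H, W, 3);
  seg_middle: (H, W, 3) uint8 ids; object_ids: (N,) floats."""
  combined = base_warping.copy()
  for obj_img, obj_id in zip(warped_objs, object_ids):
    gate = np.sign(obj_id) * (seg_middle == obj_id).astype(np.float32)
    combined += obj_img * gate
  return combined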
def build_inference_for_training(self):
  """Invokes depth and ego-motion networks and computes clouds if needed."""
  (self.image_stack, self.image_stack_norm, self.seg_stack,
   self.intrinsic_mat, self.intrinsic_mat_inv) = self.reader.read_data()
  with tf.variable_scope('depth_prediction'):
    # Organized by ...[i][scale]. Note that the order is flipped in
    # variables in build_loss() below.
    self.disp = {}
    self.depth = {}
    self.depth_upsampled = {}
    self.inf_loss = 0.0
    # Organized by [i].
    disp_bottlenecks = [None] * self.seq_length

    if self.icp_weight > 0:
      self.cloud = {}
    for i in range(self.seq_length):
      image = self.image_stack_norm[:, :, :, 3 * i:3 * (i + 1)]

      multiscale_disps_i, disp_bottlenecks[i] = nets.disp_net(
          self.architecture, image, self.use_skip, self.weight_reg, True)
      multiscale_depths_i = [1.0 / d for d in multiscale_disps_i]
      self.disp[i] = multiscale_disps_i
      self.depth[i] = multiscale_depths_i
      if self.depth_upsampling:
        self.depth_upsampled[i] = []
        # Upsample low-resolution depth maps using differentiable bilinear
        # interpolation.
        for s in range(len(multiscale_depths_i)):
          self.depth_upsampled[i].append(tf.image.resize_bilinear(
              multiscale_depths_i[s], [self.img_height, self.img_width],
              align_corners=True))

      if self.icp_weight > 0:
        multiscale_clouds_i = [
            project.get_cloud(d,
                              self.intrinsic_mat_inv[:, s, :, :],
                              name='cloud%d_%d' % (s, i))
            for (s, d) in enumerate(multiscale_depths_i)
        ]
        self.cloud[i] = multiscale_clouds_i
      # Reuse the same depth graph for all images.
      tf.get_variable_scope().reuse_variables()

  if self.handle_motion:
    # Define the egomotion network. This network can see the whole scene
    # except for any moving objects as indicated by the provided
    # segmentation masks. To avoid the network getting clues about motion
    # by tracking those masks, we define the segmentation masks as the
    # temporal union over the sequence.
    with tf.variable_scope('egomotion_prediction'):
      base_input = self.image_stack_norm  # (B, H, W, 9)
      seg_input = self.seg_stack  # (B, H, W, 9)
      ref_zero = tf.constant(0, dtype=tf.uint8)
      # Motion model is currently defined for three-frame sequences.
      object_mask1 = tf.equal(seg_input[:, :, :, 0], ref_zero)
      object_mask2 = tf.equal(seg_input[:, :, :, 3], ref_zero)
      object_mask3 = tf.equal(seg_input[:, :, :, 6], ref_zero)
      mask_complete = tf.expand_dims(tf.logical_and(  # (B, H, W, 1)
          tf.logical_and(object_mask1, object_mask2), object_mask3),
          axis=3)
      mask_complete = tf.tile(mask_complete, (1, 1, 1, 9))  # (B, H, W, 9)
      # Now mask out base_input.
      self.mask_complete = tf.to_float(mask_complete)
      self.base_input_masked = base_input * self.mask_complete
      self.egomotion = nets.egomotion_net(
          image_stack=self.base_input_masked,
          disp_bottleneck_stack=None,
          joint_encoder=False,
          seq_length=self.seq_length,
          weight_reg=self.weight_reg)

    # Define the object motion network for refinement. This network only
    # sees one object at a time over the whole sequence, and tries to
    # estimate its motion. The sequence of images are the respective
    # warped frames.

    # For each scale, contains batch_size elements of shape (N, 2, 6).
    self.object_transforms = {}
    # For each scale, contains batch_size elements of shape (N, H, W, 9).
    self.object_masks = {}
    self.object_masks_warped = {}
    # For each scale, contains batch_size elements of size N.
    self.object_ids = {}

    self.egomotions_seq = {}
    self.warped_seq = {}
    self.inputs_objectmotion_net = {}
    with tf.variable_scope('objectmotion_prediction'):
      # First, warp raw images according to overall egomotion.
      for s in range(NUM_SCALES):
        self.warped_seq[s] = []
        self.egomotions_seq[s] = []
        for source_index in range(self.seq_length):
          egomotion_mat_i_1 = project.get_transform_mat(
              self.egomotion, source_index, 1)
          warped_image_i_1, _ = (
              project.inverse_warp(
                  self.image_stack[
                      :, :, :, source_index*3:(source_index+1)*3],
                  self.depth_upsampled[1][s],
                  egomotion_mat_i_1,
                  self.intrinsic_mat[:, 0, :, :],
                  self.intrinsic_mat_inv[:, 0, :, :]))
          self.warped_seq[s].append(warped_image_i_1)
          self.egomotions_seq[s].append(egomotion_mat_i_1)

        # Second, for every object in the segmentation mask, take its mask
        # and warp it according to the egomotion estimate. Then threshold
        # the warped result to binarize it. Use this mask to mask out the
        # background and other objects, and pass the filtered image to the
        # object motion network.
        self.object_transforms[s] = []
        self.object_masks[s] = []
        self.object_ids[s] = []
        self.object_masks_warped[s] = []
        self.inputs_objectmotion_net[s] = {}

        for i in range(self.batch_size):
          seg_sequence = self.seg_stack[i]  # (H, W, 9=3*3)
          object_ids = tf.unique(tf.reshape(seg_sequence, [-1]))[0]
          self.object_ids[s].append(object_ids)
          color_stack = []
          mask_stack = []
          mask_stack_warped = []
          for j in range(self.seq_length):
            current_image = self.warped_seq[s][j][i]  # (H, W, 3)
            current_seg = seg_sequence[:, :, j * 3:(j+1) * 3]  # (H, W, 3)

            def process_obj_mask_warp(obj_id):
              """Performs warping of the individual object masks."""
              obj_mask = tf.to_float(tf.equal(current_seg, obj_id))
              # Warp obj_mask according to overall egomotion.
              obj_mask_warped, _ = (
                  project.inverse_warp(
                      tf.expand_dims(obj_mask, axis=0),
                      # Middle frame, highest scale, batch element i:
                      tf.expand_dims(self.depth_upsampled[1][s][i],
                                     axis=0),
                      # Matrix for warping j into middle frame, batch
                      # element i:
                      tf.expand_dims(self.egomotions_seq[s][j][i], axis=0),
                      tf.expand_dims(self.intrinsic_mat[i, 0, :, :],
                                     axis=0),
                      tf.expand_dims(self.intrinsic_mat_inv[i, 0, :, :],
                                     axis=0)))
              obj_mask_warped = tf.squeeze(obj_mask_warped)
              obj_mask_binarized = tf.greater(  # Threshold to binarize.
                  obj_mask_warped, tf.constant(0.5))
              return tf.to_float(obj_mask_binarized)

            def process_obj_mask(obj_id):
              """Returns the individual object masks separately."""
              return tf.to_float(tf.equal(current_seg, obj_id))
            object_masks = tf.map_fn(  # (N, H, W, 3)
                process_obj_mask, object_ids, dtype=tf.float32)

            if self.size_constraint_weight > 0:
              # The object segmentation masks are all in object_masks.
              # We need to measure the height of each of them, and get the
              # approximate distance.

              # self.depth_upsampled of shape (seq_length, scale, B, H, W).
              depth_pred = self.depth_upsampled[j][s][i]  # (H, W, 1)

              def get_losses(obj_mask):
                """Gets the motion constraint loss."""
                # Find the height of the segment.
                coords = tf.where(tf.greater(  # Shape (num_true, 2=yx)
                    obj_mask[:, :, 0],
                    tf.constant(0.5, dtype=tf.float32)))
                y_max = tf.reduce_max(coords[:, 0])
                y_min = tf.reduce_min(coords[:, 0])
                seg_height = y_max - y_min
                f_y = self.intrinsic_mat[i, 0, 1, 1]
                approx_depth = ((f_y * self.global_scale_var) /
                                tf.to_float(seg_height))
                reference_pred = tf.boolean_mask(
                    depth_pred, tf.greater(
                        tf.reshape(obj_mask[:, :, 0],
                                   (self.img_height, self.img_width, 1)),
                        tf.constant(0.5, dtype=tf.float32)))

                # Establish a loss on approx_depth, a scalar, and
                # reference_pred, our dense prediction. Normalize both to
                # prevent degenerate depth shrinking.
                global_mean_depth_pred = tf.reduce_mean(depth_pred)
                reference_pred /= global_mean_depth_pred
                approx_depth /= global_mean_depth_pred
                spatial_err = tf.abs(reference_pred - approx_depth)
                mean_spatial_err = tf.reduce_mean(spatial_err)
                return mean_spatial_err

              losses = tf.map_fn(
                  get_losses, object_masks, dtype=tf.float32)
              self.inf_loss += tf.reduce_mean(losses)
            object_masks_warped = tf.map_fn(  # (N, H, W, 3)
                process_obj_mask_warp, object_ids, dtype=tf.float32)
            filtered_images = tf.map_fn(
                lambda mask: current_image * mask, object_masks_warped,
                dtype=tf.float32)  # (N, H, W, 3)
            color_stack.append(filtered_images)
            mask_stack.append(object_masks)
            mask_stack_warped.append(object_masks_warped)

          # For this batch element, if there are N moving objects,
          # color_stack, mask_stack and mask_stack_warped each contain
          # seq_length elements of shape (N, H, W, 3).
          # We can now concatenate them on the last axis, creating a
          # tensor of (N, H, W, 3*3 = 9) and, assuming N does not get too
          # large so that we have enough memory, pass them in a single
          # batch to the object motion network.
          mask_stack = tf.concat(mask_stack, axis=3)  # (N, H, W, 9)
          mask_stack_warped = tf.concat(mask_stack_warped, axis=3)
          color_stack = tf.concat(color_stack, axis=3)  # (N, H, W, 9)
          all_transforms = nets.objectmotion_net(
              # We cut the gradient flow here as the object motion
              # gradient should have no say in how the egomotion network
              # behaves. One could try just stopping the gradient for
              # egomotion, but not for the depth prediction network.
              image_stack=tf.stop_gradient(color_stack),
              disp_bottleneck_stack=None,
              joint_encoder=False,  # Joint encoder not supported.
              seq_length=self.seq_length,
              weight_reg=self.weight_reg)
          # all_transforms of shape (N, 2, 6).
          self.object_transforms[s].append(all_transforms)
          self.object_masks[s].append(mask_stack)
          self.object_masks_warped[s].append(mask_stack_warped)
          self.inputs_objectmotion_net[s][i] = color_stack
          tf.get_variable_scope().reuse_variables()
  else:
    # Don't handle motion, classic model formulation.
    with tf.name_scope('egomotion_prediction'):
      if self.joint_encoder:
        # Re-arrange disp_bottleneck_stack to be of shape
        # [B, h_hid, w_hid, c_hid * seq_length]. Currently, it is a list
        # with seq_length elements, each of dimension
        # [B, h_hid, w_hid, c_hid].
        disp_bottleneck_stack = tf.concat(disp_bottlenecks, axis=3)
      else:
        disp_bottleneck_stack = None
      self.egomotion = nets.egomotion_net(
          image_stack=self.image_stack_norm,
          disp_bottleneck_stack=disp_bottleneck_stack,
          joint_encoder=self.joint_encoder,
          seq_length=self.seq_length,
          weight_reg=self.weight_reg)
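
# The egomotion input in the function above is masked by the temporal union
# of the per-frame background masks, so a pixel survives only if it is
# background in all three frames. A minimal NumPy sketch of the same idea
# (illustrative only; not part of this codebase):
import numpy as np

def union_background_mask(seg_stack):
  """seg_stack: (B, H, W, 9) uint8 ids. Returns a (B, H, W, 9) float mask."""
  background = ((seg_stack[..., 0] == 0) &
                (seg_stack[..., 3] == 0) &
                (seg_stack[..., 6] == 0))  # (B, H, W), True on background.
  return np.tile(background[..., None], (1, 1, 1, 9)).astype(np.float32)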
def build_loss(self):
  """Adds ops for computing loss."""
  with tf.name_scope('compute_loss'):
    self.reconstr_loss = 0
    self.smooth_loss = 0
    self.ssim_loss = 0
    self.icp_transform_loss = 0
    self.icp_residual_loss = 0

    # self.images is organized by ...[scale][B, h, w, seq_len * 3].
    self.images = [{} for _ in range(NUM_SCALES)]
    # Following nested lists are organized by ...[scale][source-target].
    self.warped_image = [{} for _ in range(NUM_SCALES)]
    self.warp_mask = [{} for _ in range(NUM_SCALES)]
    self.warp_error = [{} for _ in range(NUM_SCALES)]
    self.ssim_error = [{} for _ in range(NUM_SCALES)]
    self.icp_transform = [{} for _ in range(NUM_SCALES)]
    self.icp_residual = [{} for _ in range(NUM_SCALES)]

    self.middle_frame_index = util.get_seq_middle(self.seq_length)

    # Compute losses at each scale.
    for s in range(NUM_SCALES):
      # Scale image stack.
      height_s = int(self.img_height / (2**s))
      width_s = int(self.img_width / (2**s))
      self.images[s] = tf.image.resize_area(self.image_stack,
                                            [height_s, width_s])

      # Smoothness.
      if self.smooth_weight > 0:
        for i in range(self.seq_length):
          # In legacy mode, use the depth map from the middle frame only.
          if not self.legacy_mode or i == self.middle_frame_index:
            self.smooth_loss += 1.0 / (2**s) * self.depth_smoothness(
                self.disp[i][s], self.images[s][:, :, :, 3 * i:3 * (i + 1)])

      for i in range(self.seq_length):
        for j in range(self.seq_length):
          # Only consider adjacent frames.
          if i == j or abs(i - j) != 1:
            continue
          # In legacy mode, only consider the middle frame as target.
          if self.legacy_mode and j != self.middle_frame_index:
            continue
          source = self.images[s][:, :, :, 3 * i:3 * (i + 1)]
          target = self.images[s][:, :, :, 3 * j:3 * (j + 1)]
          target_depth = self.depth[j][s]
          key = '%d-%d' % (i, j)

          # Extract ego-motion from i to j.
          egomotion_index = min(i, j)
          egomotion_mult = 1
          if i > j:
            # Need to invert the egomotion when going back in the sequence.
            egomotion_mult *= -1
          # For compatibility with SfMLearner, interpret all egomotion
          # vectors as pointing toward the middle frame. Note that unlike
          # SfMLearner, each vector captures the motion to/from its next
          # frame, and not the center frame. Although with seq_length == 3,
          # there is no difference.
          if self.legacy_mode:
            if egomotion_index >= self.middle_frame_index:
              egomotion_mult *= -1
          egomotion = egomotion_mult * self.egomotion[:, egomotion_index, :]

          # Inverse warp the source image to the target image frame for
          # photometric consistency loss.
          self.warped_image[s][key], self.warp_mask[s][key] = (
              project.inverse_warp(source,
                                   target_depth,
                                   egomotion,
                                   self.intrinsic_mat[:, s, :, :],
                                   self.intrinsic_mat_inv[:, s, :, :]))

          # Reconstruction loss.
          self.warp_error[s][key] = tf.abs(self.warped_image[s][key] -
                                           target)
          self.reconstr_loss += tf.reduce_mean(
              self.warp_error[s][key] * self.warp_mask[s][key])
          # SSIM.
          if self.ssim_weight > 0:
            self.ssim_error[s][key] = self.ssim(self.warped_image[s][key],
                                                target)
            # TODO(rezama): This should be min_pool2d().
            ssim_mask = slim.avg_pool2d(self.warp_mask[s][key], 3, 1,
                                        'VALID')
            self.ssim_loss += tf.reduce_mean(
                self.ssim_error[s][key] * ssim_mask)
          # 3D loss.
          if self.icp_weight > 0:
            cloud_a = self.cloud[j][s]
            cloud_b = self.cloud[i][s]
            self.icp_transform[s][key], self.icp_residual[s][key] = icp(
                cloud_a, egomotion, cloud_b)
            self.icp_transform_loss += 1.0 / (2**s) * tf.reduce_mean(
                tf.abs(self.icp_transform[s][key]))
            self.icp_residual_loss += 1.0 / (2**s) * tf.reduce_mean(
                tf.abs(self.icp_residual[s][key]))

    self.total_loss = self.reconstr_weight * self.reconstr_loss
    if self.smooth_weight > 0:
      self.total_loss += self.smooth_weight * self.smooth_loss
    if self.ssim_weight > 0:
      self.total_loss += self.ssim_weight * self.ssim_loss
    if self.icp_weight > 0:
      self.total_loss += self.icp_weight * (self.icp_transform_loss +
                                            self.icp_residual_loss)
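
# Worked example of the sign logic above for seq_length == 3
# (middle_frame_index == 1), the only configuration legacy mode targets:
#   (i=0, j=1): egomotion_index = 0; i < j, so no flip; 0 < middle, so no
#               legacy flip -> uses +self.egomotion[:, 0, :].
#   (i=2, j=1): egomotion_index = 1; i > j flips the sign to -1, then
#               1 >= middle flips it back -> uses +self.egomotion[:, 1, :].
# Both adjacent pairs therefore use their stored vector unchanged, which is
# consistent with interpreting each vector as pointing toward the middle
# frame.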
def build_inference_for_training(self):
  """Invokes depth and ego-motion networks."""
  if self.is_training:
    (self.image_stack, self.image_stack_norm, self.seg_stack,
     self.intrinsic_mat, self.intrinsic_mat_inv) = self.reader.read_data()
  with tf.variable_scope('depth_prediction'):
    # Organized by ...[i][scale]. Note that the order is flipped in
    # variables in build_loss() below.
    self.disp = {}
    self.depth = {}
    self.depth_upsampled = {}
    self.object_depth_loss = 0.0
    # Organized by [i].
    disp_bottlenecks = [None] * self.seq_length

    for i in range(self.seq_length):
      image = self.image_stack_norm[:, :, :, 3 * i:3 * (i + 1)]
      multiscale_disps_i, disp_bottlenecks[i] = nets.disp_net(
          self.architecture, image, self.use_skip, self.weight_reg, True)
      multiscale_depths_i = [1.0 / d for d in multiscale_disps_i]
      self.disp[i] = multiscale_disps_i
      self.depth[i] = multiscale_depths_i
      if self.depth_upsampling:
        self.depth_upsampled[i] = []
        # Upsample low-resolution depth maps using differentiable bilinear
        # interpolation.
        for s in range(len(multiscale_depths_i)):
          self.depth_upsampled[i].append(tf.image.resize_bilinear(
              multiscale_depths_i[s], [self.img_height, self.img_width],
              align_corners=True))
      # Reuse the same depth graph for all images.
      tf.get_variable_scope().reuse_variables()

  if self.handle_motion:
    # Define the egomotion network. This network can see the whole scene
    # except for any moving objects as indicated by the provided
    # segmentation masks. To avoid the network getting clues about motion
    # by tracking those masks, we define the segmentation masks as the
    # temporal union over the sequence.
    with tf.variable_scope('egomotion_prediction'):
      base_input = self.image_stack_norm  # (B, H, W, 9)
      seg_input = self.seg_stack  # (B, H, W, 9)
      ref_zero = tf.constant(0, dtype=tf.uint8)
      # Motion model is currently defined for three-frame sequences.
      object_mask1 = tf.equal(seg_input[:, :, :, 0], ref_zero)
      object_mask2 = tf.equal(seg_input[:, :, :, 3], ref_zero)
      object_mask3 = tf.equal(seg_input[:, :, :, 6], ref_zero)
      mask_complete = tf.expand_dims(tf.logical_and(  # (B, H, W, 1)
          tf.logical_and(object_mask1, object_mask2), object_mask3),
          axis=3)
      mask_complete = tf.tile(mask_complete, (1, 1, 1, 9))  # (B, H, W, 9)
      # Now mask out base_input.
      self.mask_complete = tf.to_float(mask_complete)
      self.base_input_masked = base_input * self.mask_complete  # (B, H, W, 9)
      self.egomotion = nets.egomotion_net(
          image_stack=self.base_input_masked,
          disp_bottleneck_stack=None,
          joint_encoder=False,
          seq_length=self.seq_length,
          weight_reg=self.weight_reg,
          same_trans_rot_scaling=self.same_trans_rot_scaling)

    # Define the region deformer network for refinement. This network only
    # sees one object at a time over the whole sequence, and tries to
    # estimate its motion. The sequence of images are the respective
    # warped frames.

    # For each scale, contains batch_size elements of shape (N, 2, 32).
    self.object_transforms = {}
    # For each scale, contains batch_size elements of shape (N, H, W, 9).
    self.object_masks = {}
    self.object_masks_warped = {}
    # For each scale, contains batch_size elements of size N.
    self.object_ids = {}

    self.egomotions_seq = {}
    self.warped_seq = {}
    # For each scale, contains 3 elements of shape [B, H, W, 2].
    self.rigid_flow_seq = {}
    self.inputs_region_deformer_net = {}
    with tf.variable_scope('objectmotion_prediction'):
      # First, warp raw images according to overall egomotion.
      for s in range(NUM_SCALES):
        self.warped_seq[s] = []
        self.rigid_flow_seq[s] = []
        self.egomotions_seq[s] = []
        for source_index in range(self.seq_length):
          egomotion_mat_i_1 = project.get_transform_mat(
              self.egomotion, source_index, 1,
              use_axis_angle=self.use_axis_angle)

          # The gradient of the egomotion network should come only from
          # the background, so stop the gradient coming from objects.
          if self.stop_egomotion_gradient:
            current_seg = self.seg_stack[
                :, :, :, source_index * 3]  # [B, H, W]
            background_mask = tf.equal(
                current_seg, tf.constant(0, dtype=tf.uint8))  # [B, H, W]
            background_mask = tf.tile(
                tf.expand_dims(background_mask, axis=3),
                (1, 1, 1, 3))  # [B, H, W, 3]
            background_mask = tf.to_float(background_mask)
            background_mask_warped, _ = (
                project.inverse_warp(
                    background_mask,
                    self.depth_upsampled[1][s],
                    egomotion_mat_i_1,
                    self.intrinsic_mat[:, 0, :, :],
                    self.intrinsic_mat_inv[:, 0, :, :]))
            # Stop gradient for the mask.
            background_mask_warped = tf.stop_gradient(
                background_mask_warped)

            background_warped, _ = (
                project.inverse_warp(
                    self.image_stack[
                        :, :, :, source_index * 3:(source_index + 1) * 3],
                    self.depth_upsampled[1][s],
                    egomotion_mat_i_1,
                    self.intrinsic_mat[:, 0, :, :],
                    self.intrinsic_mat_inv[:, 0, :, :]))
            obj_warped, _ = (
                project.inverse_warp(
                    self.image_stack[
                        :, :, :, source_index * 3:(source_index + 1) * 3],
                    self.depth_upsampled[1][s],
                    # Stop gradient coming from objects.
                    tf.stop_gradient(egomotion_mat_i_1),
                    self.intrinsic_mat[:, 0, :, :],
                    self.intrinsic_mat_inv[:, 0, :, :]))
            warped_image_i_1 = (
                background_warped * background_mask_warped +
                obj_warped * (1.0 - background_mask_warped))

            background_rigid_flow = project.compute_rigid_flow(
                self.depth_upsampled[1][s],
                egomotion_mat_i_1,
                self.intrinsic_mat[:, 0, :, :],
                self.intrinsic_mat_inv[:, 0, :, :])  # [B, H, W, 2]
            obj_rigid_flow = project.compute_rigid_flow(
                self.depth_upsampled[1][s],
                # Stop gradient coming from objects.
                tf.stop_gradient(egomotion_mat_i_1),
                self.intrinsic_mat[:, 0, :, :],
                self.intrinsic_mat_inv[:, 0, :, :])
            rigid_flow_i_1 = (
                background_rigid_flow * background_mask[:, :, :, :2] +
                obj_rigid_flow * (1.0 - background_mask[:, :, :, :2]))
          else:
            warped_image_i_1, _ = (
                project.inverse_warp(
                    self.image_stack[
                        :, :, :, source_index * 3:(source_index + 1) * 3],
                    self.depth_upsampled[1][s],
                    egomotion_mat_i_1,
                    self.intrinsic_mat[:, 0, :, :],
                    self.intrinsic_mat_inv[:, 0, :, :]))
            rigid_flow_i_1 = project.compute_rigid_flow(
                self.depth_upsampled[1][s],
                egomotion_mat_i_1,
                self.intrinsic_mat[:, 0, :, :],
                self.intrinsic_mat_inv[:, 0, :, :])
          self.warped_seq[s].append(warped_image_i_1)
          self.rigid_flow_seq[s].append(rigid_flow_i_1)
          self.egomotions_seq[s].append(egomotion_mat_i_1)

        # Second, for every object in the segmentation mask, take its mask
        # and warp it according to the egomotion estimate. Then threshold
        # the warped result to binarize it. Use this mask to mask out the
        # background and other objects, and pass the filtered image to the
        # region deformer network.
        self.object_transforms[s] = []
        self.object_masks[s] = []
        self.object_ids[s] = []
        self.object_masks_warped[s] = []
        self.inputs_region_deformer_net[s] = {}

        for i in range(self.batch_size):
          seg_sequence = self.seg_stack[i]  # (H, W, 9=3*3)
          # Background is 0 and is included here.
          object_ids = tf.unique(tf.reshape(seg_sequence, [-1]))[0]
          self.object_ids[s].append(object_ids)
          color_stack = []
          mask_stack = []
          mask_stack_warped = []
          for j in range(self.seq_length):
            current_image = self.warped_seq[s][j][i]  # (H, W, 3)
            current_seg = seg_sequence[:, :, j * 3:(j + 1) * 3]  # (H, W, 3)
            # When enforcing the object depth prior, exclude objects when
            # computing the neighboring mask.
            background = tf.equal(current_seg[:, :, 0],
                                  tf.constant(0, dtype=tf.uint8))  # [H, W]

            def process_obj_mask_warp(obj_id):
              """Performs warping of the individual object masks."""
              obj_mask = tf.to_float(tf.equal(current_seg, obj_id))
              # Warp obj_mask according to overall egomotion.
              obj_mask_warped, _ = (
                  project.inverse_warp(
                      tf.expand_dims(obj_mask, axis=0),
                      # Middle frame, highest scale, batch element i:
                      tf.expand_dims(self.depth_upsampled[1][s][i],
                                     axis=0),
                      # Matrix for warping j into middle frame, batch
                      # element i:
                      tf.expand_dims(self.egomotions_seq[s][j][i], axis=0),
                      tf.expand_dims(self.intrinsic_mat[i, 0, :, :],
                                     axis=0),
                      tf.expand_dims(self.intrinsic_mat_inv[i, 0, :, :],
                                     axis=0)))
              obj_mask_warped = tf.squeeze(obj_mask_warped, axis=0)
              obj_mask_binarized = tf.greater(  # Threshold to binarize.
                  obj_mask_warped, tf.constant(0.5))
              return tf.to_float(obj_mask_binarized)  # [H, W, 3]

            def process_obj_mask(obj_id):
              """Returns the individual object masks separately."""
              return tf.to_float(tf.equal(current_seg, obj_id))
            object_masks = tf.map_fn(  # (N, H, W, 3)
                process_obj_mask, object_ids, dtype=tf.float32)

            if self.object_depth_weight > 0:
              # The inverse depth of a moving object should be greater
              # than or equal to that of its horizontal surrounding
              # environment.
              depth_pred = self.depth_upsampled[j][s][i]  # [H, W, 1]

              def get_obj_losses(obj_mask):
                """Object depth prior loss; obj_mask includes background."""
                # Find the extent of the segment.
                coords = tf.where(tf.greater(  # [num_true, 2]
                    obj_mask[:, :, 0],
                    tf.constant(0.5, dtype=tf.float32)))
                y_max = tf.to_int32(tf.reduce_max(coords[:, 0]))
                y_min = tf.to_int32(tf.reduce_min(coords[:, 0]))
                x_max = tf.to_int32(tf.reduce_max(coords[:, 1]))
                x_min = tf.to_int32(tf.reduce_min(coords[:, 1]))

                neighbor_pixel = 10  # Empirical value.
                id_x_min = tf.maximum(0, x_min - neighbor_pixel)
                id_x_max = tf.minimum(self.img_width - 1,
                                      x_max + neighbor_pixel)

                slice1 = tf.zeros([y_min, self.img_width])
                slice2_1 = tf.zeros([y_max - y_min + 1, id_x_min])
                slice2_2 = tf.ones([y_max - y_min + 1,
                                    id_x_max - id_x_min + 1])  # Neighbor.
                slice2_3 = tf.zeros([y_max - y_min + 1,
                                     self.img_width - 1 - id_x_max])
                slice2 = tf.concat([slice2_1, slice2_2, slice2_3],
                                   axis=1)  # [y_max - y_min + 1, W]
                slice3 = tf.zeros([self.img_height - 1 - y_max,
                                   self.img_width])
                neighbor_mask = tf.concat([slice1, slice2, slice3],
                                          axis=0)  # [H, W]
                # Exclude the object's own pixels from its neighborhood.
                neighbor_mask = neighbor_mask * tf.to_float(
                    tf.less(obj_mask[:, :, 0],
                            tf.constant(0.5, dtype=tf.float32)))
                # Handle overlapping objects.
                if self.exclude_object_mask:
                  neighbor_mask = neighbor_mask * tf.to_float(
                      background)  # [H, W]

                neighbor_depth = tf.boolean_mask(
                    depth_pred, tf.greater(
                        tf.reshape(neighbor_mask,
                                   (self.img_height, self.img_width, 1)),
                        tf.constant(0.5, dtype=tf.float32)))
                reference_depth = tf.boolean_mask(
                    depth_pred, tf.greater(
                        tf.reshape(obj_mask[:, :, 0],
                                   (self.img_height, self.img_width, 1)),
                        tf.constant(0.5, dtype=tf.float32)))
                neighbor_mean = tf.reduce_mean(neighbor_depth)
                reference_mean = tf.reduce_mean(reference_depth)
                # Soft constraint.
                loss = tf.maximum(
                    reference_mean - neighbor_mean -
                    self.object_depth_threshold,
                    tf.constant(0.0, dtype=tf.float32))
                return loss

              losses = tf.map_fn(get_obj_losses, object_masks,
                                 dtype=tf.float32)
              # Remove the background, whose id is 0.
              self.object_depth_loss += tf.reduce_mean(
                  tf.sign(tf.to_float(object_ids)) * losses)

            object_masks_warped = tf.map_fn(  # (N, H, W, 3)
                process_obj_mask_warp, object_ids, dtype=tf.float32)
            # When warping the object mask, stop the gradients of depth
            # and egomotion.
            if self.stop_egomotion_gradient:
              object_masks_warped = tf.stop_gradient(object_masks_warped)
            filtered_images = tf.map_fn(
                lambda mask: current_image * mask, object_masks_warped,
                dtype=tf.float32)  # (N, H, W, 3)
            color_stack.append(filtered_images)
            mask_stack.append(object_masks)
            mask_stack_warped.append(object_masks_warped)

          # For this batch element, if there are N moving objects,
          # color_stack, mask_stack and mask_stack_warped each contain
          # seq_length elements of shape (N, H, W, 3).
          # We can now concatenate them on the last axis, creating a
          # tensor of (N, H, W, 3*3 = 9) and, assuming N does not get too
          # large so that we have enough memory, pass them in a single
          # batch to the region deformer network.
          mask_stack = tf.concat(mask_stack, axis=3)  # (N, H, W, 9)
          mask_stack_warped = tf.concat(mask_stack_warped, axis=3)
          color_stack = tf.concat(color_stack, axis=3)  # (N, H, W, 9)
          if self.stop_egomotion_gradient:
            # The gradient has already been stopped above.
            image_stack = color_stack
          else:
            image_stack = tf.stop_gradient(color_stack)
          all_transforms = nets.region_deformer_net(
              image_stack=image_stack,
              disp_bottleneck_stack=None,
              joint_encoder=False,  # Joint encoder not supported.
              seq_length=self.seq_length,
              weight_reg=self.weight_reg,
              trans_params_size=self.trans_params_size,
              region_deformer_scaling=self.region_deformer_scaling)
          # all_transforms of shape (N, 2, 32).
          self.object_transforms[s].append(all_transforms)
          self.object_masks[s].append(mask_stack)
          self.object_masks_warped[s].append(mask_stack_warped)
          self.inputs_region_deformer_net[s][i] = color_stack
          tf.get_variable_scope().reuse_variables()
  else:
    # Don't handle motion, classic model formulation.
    with tf.name_scope('egomotion_prediction'):
      if self.joint_encoder:
        # Re-arrange disp_bottleneck_stack to be of shape
        # [B, h_hid, w_hid, c_hid * seq_length]. Currently, it is a list
        # with seq_length elements, each of dimension
        # [B, h_hid, w_hid, c_hid].
        disp_bottleneck_stack = tf.concat(disp_bottlenecks, axis=3)
      else:
        disp_bottleneck_stack = None
      self.egomotion = nets.egomotion_net(
          image_stack=self.image_stack_norm,
          disp_bottleneck_stack=disp_bottleneck_stack,
          joint_encoder=self.joint_encoder,
          seq_length=self.seq_length,
          weight_reg=self.weight_reg,
          same_trans_rot_scaling=self.same_trans_rot_scaling)
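
# The slice-and-concat construction in get_obj_losses() above builds a
# binary mask covering the object's bounding-box rows, extended horizontally
# by neighbor_pixel columns on each side. A minimal NumPy sketch of the same
# mask (illustrative only; the standalone form and names are assumptions):
import numpy as np

def neighbor_band_mask(h, w, y_min, y_max, x_min, x_max, pad=10):
  """Returns an (h, w) float mask over the padded bounding-box band."""
  mask = np.zeros((h, w), dtype=np.float32)
  x_lo = max(0, x_min - pad)
  x_hi = min(w - 1, x_max + pad)
  mask[y_min:y_max + 1, x_lo:x_hi + 1] = 1.0
  return mask

# The graph code then zeroes the object's own pixels (and, with
# exclude_object_mask, all non-background pixels) inside this band before
# comparing mean depths.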