def control_point_l1_loss(pred_control_points,
                          gt_control_points,
                          confidence=None,
                          confidence_weight=None):
    """
    L1 loss between predicted and ground-truth gripper control points.

    Absolute differences are summed over the last axis, averaged over the
    axis before it, and finally averaged down to a single value.

    Args:
      pred_control_points: tensor of predicted control points.
      gt_control_points: tensor of ground-truth control points, with a shape
        that can be subtracted from pred_control_points.
      confidence: optional tensor that scales the error before the final
        averaging.
      confidence_weight: weight applied to the mean log-confidence. Must be
        given whenever confidence is given.

    Returns:
      The mean error when confidence is None, otherwise the tuple
      (mean error, negated weighted mean log-confidence).
    """
    # Created unconditionally, exactly like the original graph.
    confidence_term = tf.constant(0, dtype=tf.float32)
    print('control_point_l1_loss', get_shape(pred_control_points),
          get_shape(gt_control_points))

    abs_diff = tf.abs(pred_control_points - gt_control_points)
    l1_error = tf.reduce_mean(tf.reduce_sum(abs_diff, -1), -1)

    if confidence is None:
        print('l1_error = {}'.format(get_shape(l1_error)))
        return tf.reduce_mean(l1_error)

    assert (confidence_weight is not None)
    # NOTE(review): the product relies on broadcasting between the error and
    # confidence shapes -- confirm the shapes line up as intended.
    l1_error = l1_error * confidence
    # Clamp before the log so a zero confidence cannot produce -inf.
    log_confidence = tf.log(tf.maximum(confidence, 1e-10))
    confidence_term = tf.reduce_mean(log_confidence) * confidence_weight
    print('confidence_term = ', get_shape(confidence_term))
    print('l1_error = {}'.format(get_shape(l1_error)))
    return tf.reduce_mean(l1_error), -confidence_term
def scaled_dot_product_attention(Q, K, V, dropout_rate=0.0):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(depth)) V.

    Args:
      Q: query tensor; its axis 2 is used as the depth for the scaling.
      K: key tensor, multiplied with Q with its last two axes transposed.
      V: value tensor, combined using the attention weights.
      dropout_rate: fraction of attention weights to drop.

    Returns:
      Tensor holding the attention-weighted combination of V.
    """
    query_depth = tf_utils.get_shape(Q)[2]
    scaler = tf.rsqrt(tf.to_float(query_depth))
    scores = tf.matmul(Q, K, transpose_b=True) * scaler
    attention = tf.nn.softmax(scores)
    # The second positional argument is the keep probability, not the rate.
    keep_prob = 1.0 - dropout_rate
    attention = tf.nn.dropout(attention, keep_prob)
    return tf.matmul(attention, V)
def control_point_l1_loss_better_than_threshold(pred_control_points,
                                                gt_control_points,
                                                confidence,
                                                confidence_threshold):
    """
    L1 control point loss restricted to sufficiently confident predictions.

    Args:
      pred_control_points: tensor of predicted control points.
      gt_control_points: tensor of ground-truth control points.
      confidence: confidence tensor, compared against the threshold and then
        tiled along axis 1 to cover every control point.
      confidence_threshold: entries with confidence >= this value are kept.

    Returns:
      (loss, ratio): the control_point_l1_loss over the kept entries and the
      fraction of confidence entries that passed the threshold.
    """
    num_control_points = get_shape(pred_control_points)[1]
    is_confident = tf.greater_equal(confidence, confidence_threshold)
    kept_ratio = tf.reduce_mean(tf.to_float(is_confident))

    # Repeat the mask along axis 1 so it selects whole sets of control points.
    point_mask = tf.tile(is_confident, [1, num_control_points])
    kept_pred = tf.boolean_mask(pred_control_points, point_mask)
    kept_gt = tf.boolean_mask(gt_control_points, point_mask)
    return control_point_l1_loss(kept_pred, kept_gt), kept_ratio
def merge_pc_and_gripper_pc(pc,
                            gripper_pc,
                            instance_mode=0,
                            pc_latent=None,
                            gripper_pc_latent=None):
    """
    Merges the object point cloud and the gripper point cloud and appends a
    binary auxiliary feature telling the two apart (1 for object points,
    0 for gripper points).

    Args:
      pc: rank-3 tensor holding the object point cloud.
      gripper_pc: rank-3 tensor holding the gripper point cloud; its first
        dimension must equal that of pc.
      instance_mode: when 1, the latents are concatenated along axis 1 and
        inserted between the coordinates and the label feature.
      pc_latent: latent tensor for the object points (instance_mode == 1).
      gripper_pc_latent: latent tensor for the gripper points
        (instance_mode == 1).

    Returns:
      (l0_xyz, l0_points): the two clouds concatenated along axis 1, and the
      same tensor with the extra feature channel(s) appended on the last axis.

    Raises:
      ValueError: if instance_mode is 1 and either latent tensor is missing.
    """
    pc_shape = get_shape(pc)
    gripper_shape = get_shape(gripper_pc)
    assert (len(pc_shape) == 3)
    assert (len(gripper_shape) == 3)
    assert (pc_shape[0] == gripper_shape[0])

    # Dynamic batch size, so the labels can be tiled for any batch.
    batch_size = tf.shape(pc)[0]
    use_latents = instance_mode == 1

    if use_latents:
        assert pc_shape[-1] == 3
        # Fail with a clear message instead of an opaque tf.concat error.
        if pc_latent is None or gripper_pc_latent is None:
            raise ValueError(
                'instance_mode=1 requires both pc_latent and '
                'gripper_pc_latent')
        latent_dist = tf.concat([pc_latent, gripper_pc_latent], 1)

    l0_xyz = tf.concat((pc, gripper_pc), 1)

    labels = tf.concat([
        tf.ones((pc_shape[1], 1), dtype=tf.float32),
        tf.zeros((gripper_shape[1], 1), dtype=tf.float32),
    ], 0)
    labels = tf.expand_dims(labels, 0)
    labels = tf.tile(labels, [batch_size, 1, 1])

    if use_latents:
        l0_points = tf.concat([l0_xyz, latent_dist, labels], -1)
    else:
        l0_points = tf.concat([l0_xyz, labels], -1)

    return l0_xyz, l0_points
def get_decoded_features_depth(raw_depth_and_masks, encoded_depth,
                               reg_constant):
    """
    Builds the decoder side of the depth feature pyramid.

    The last encoder level is passed through a 196-filter convolution. Then,
    for encoder levels 4 down to 1, the previous decoder output is convolved
    to the channel count of that level, upsampled, added to the encoder
    features and refined by another convolution.

    Args:
      raw_depth_and_masks: not used by this function.
      encoded_depth: indexable collection of encoder levels, each one a
        (features, confidence) pair.
      reg_constant: regularisation constant forwarded to every
        normal_convolution.

    Returns:
      List of [features, confidence] pairs: the converted top level followed
      by the refined outputs for levels 4, 3, 2 and 1.
    """
    top_level = encoded_depth[-1]
    top_conv = normal_convolution(num_filters=196,
                                  reg_constant=reg_constant,
                                  input_shape=get_shape(top_level[-1]),
                                  num='_pyr_top')
    top_norm, top_conf = top_conv(top_level[0], top_level[1])
    decoder = [[top_norm, top_conf]]

    for level in (4, 3, 2, 1):
        skip_norm, skip_conf = encoded_depth[level]
        num_channels = skip_norm.get_shape().as_list()[-1]

        # Bring the previous (coarser) decoder output up to this level.
        with tf.variable_scope('upsampling{}to{}'.format(level + 1, level)):
            to_level_conv = normal_convolution(
                num_filters=num_channels,
                reg_constant=reg_constant,
                input_shape=get_shape(decoder[-1][1]),
                num='{}to{}'.format(level + 2, level + 1))
            up_norm, up_conf = to_level_conv(decoder[-1][0], decoder[-1][1])
            up_norm = upsample(up_norm)
            up_conf = upsample(up_conf)

        # Fuse with the encoder features of this level and refine.
        with tf.variable_scope('refine{}'.format(level)):
            fused_norm = up_norm + skip_norm
            fused_conf = up_conf + skip_conf
            refine_conv = normal_convolution(
                num_filters=num_channels,
                reg_constant=reg_constant,
                input_shape=get_shape(fused_norm),
                num='refinement')
            refined_norm, refined_conf = refine_conv(fused_norm, fused_conf)
            decoder.append([refined_norm, refined_conf])

    return decoder
def get_features_encoder_depth(depth, masks, reg_constant):
    """
    Builds the six-level encoder pyramid for depth features.

    Every level pools the output of the previous level (the raw depth and
    masks for level 0) with max_pool_normalized and then applies a
    normal_convolution with 16, 32, 64, 96, 128 and 196 filters respectively.

    Args:
      depth: input depth tensor.
      masks: confidence/mask tensor accompanying depth.
      reg_constant: regularisation constant forwarded to every
        normal_convolution.

    Returns:
      List with one [features, confidence] pair per level.
    """
    filters_by_level = [16, 32, 64, 96, 128, 196]
    pyramid = []

    for level, num_filters in enumerate(filters_by_level):
        if level == 0:
            source_data, source_conf = depth, masks
        else:
            source_data, source_conf = pyramid[-1]

        pooled_data, pooled_conf = max_pool_normalized(source_data,
                                                       source_conf)
        level_conv = normal_convolution(num_filters=num_filters,
                                        num=str(level) + 'b',
                                        input_shape=get_shape(pooled_data),
                                        reg_constant=reg_constant)
        level_data, level_conf = level_conv(pooled_data, pooled_conf)
        pyramid.append([level_data, level_conf])

    return pyramid
def verify_tensor_size(t, expected_shape):
    """
    Raises if tensor t does not have exactly the expected shape.

    Args:
      t: input tensor.
      expected_shape: list of int giving the expected size of every axis.

    Raises:
      ValueError: if the number of axes or any axis size differs.
    """
    actual_shape = get_shape(t)
    rank_differs = len(actual_shape) != len(expected_shape)
    # The element-wise comparison is only evaluated when the ranks agree.
    if rank_differs or np.any(
            np.asarray(actual_shape) != np.asarray(expected_shape)):
        raise ValueError('shape do not match : {} != {}'.format(
            actual_shape, expected_shape))
def accuracy_better_than_threshold(pred_success_logits, gt, confidence,
                                   confidence_threshold):
    """
    Class-balanced accuracy over predictions with confidence >= threshold.

    Accuracy is computed separately for samples whose label is 1 and for
    samples whose label is 0 (only confident samples count), and the two
    values are averaged.

    Args:
      pred_success_logits: logits; the arg-max over the last axis is the
        predicted class.
      gt: integer ground-truth labels, compared with the predicted classes.
      confidence: confidence tensor whose last axis is squeezed away.
      confidence_threshold: minimum confidence for a sample to be counted.

    Returns:
      (balanced accuracy, number of confident samples divided by the size of
      the first dimension of gt).
    """
    predicted = tf.cast(tf.argmax(pred_success_logits, -1), tf.int32)
    is_correct = tf.to_float(tf.equal(predicted, gt))

    is_confident = tf.to_float(
        tf.greater_equal(confidence, confidence_threshold))
    is_confident = tf.squeeze(is_confident, -1)

    gt = tf.to_float(gt)
    is_negative = 1. - gt

    # Denominators are clamped to 1 to avoid dividing by zero when a class
    # has no confident samples.
    positive_hits = tf.reduce_sum(is_correct * is_confident * gt)
    positive_count = tf.maximum(tf.reduce_sum(is_confident * gt), 1.)
    positive_acc = positive_hits / positive_count

    negative_hits = tf.reduce_sum(is_correct * is_confident * is_negative)
    negative_count = tf.maximum(tf.reduce_sum(is_confident * is_negative),
                                1.)
    negative_acc = negative_hits / negative_count

    confident_ratio = tf.reduce_sum(is_confident) / get_shape(gt)[0]
    return 0.5 * (positive_acc + negative_acc), confident_ratio
def build_vae_ops(data_dict, args, scope='vae'):
    """
    Builds the vae operations that are required for training/inference.

    In training mode the latent sample comes from the encoder (or from a
    uniform random tensor when args.gan is set). In inference mode it is read
    from data_dict['<scope>_pred/samples'].

    Args:
      data_dict: dict, contains the tensors for the input to the model.
      args: arguments that are set for training.
      scope: string, variable scope and prefix of the data_dict keys.

    Returns:
      train_op, summary_op, data_dict, logger_dict, global_step
      train_op: tf op for running training.
      summary_op: tf summary op that needs to be run for populating the
        summaries.
      data_dict: dictionary of tensors. Keys are tensor names and values
        are tensors. New keys and tensors will be added to the input
        data_dict.
      logger_dict: dictionary of tensors for printing.
      global_step: tf.Step that keeps the step number of the training.
      Everything except data_dict is None when args.is_training is False.
    """
    losses = None
    summaries = None
    train_op = None
    logger_dict = None
    summary_op = None
    global_step = None
    first_dimension = args.num_objects_per_batch * args.num_grasps_per_object
    is_training = args.is_training

    with tf.variable_scope(scope):
        if is_training:
            # The key has to be formatted with the scope; the bare template
            # string can never be a key of data_dict, so the check was a
            # no-op before.
            assert '{}_pred/samples'.format(scope) not in data_dict
            input_pcs = data_dict['{}_pc'.format(scope)]
            losses = {}
            summaries = {}

            gt_control_points = tf_utils.transform_control_points(
                data_dict['{}_grasp_rt'.format(scope)],
                first_dimension,
                mode='rt')
            # Keep only the first three coordinates of every control point.
            gt_control_points = tf.slice(gt_control_points, [0, 0, 0],
                                         [-1, -1, 3])
            data_dict['{}_gt_control_point'.format(scope)] = gt_control_points
            pc_input = tf.slice(input_pcs, [0, 0, 0], [-1, -1, 3])

            if not args.gan:
                # Create Encoder.
                latent_input = data_dict['{}_grasp_rt'.format(scope)]
                batch_size = get_shape(pc_input)[0]
                npoints = get_shape(pc_input)[1]
                # Flatten the grasp per sample and repeat it for every point.
                latent_input = tf.tile(
                    tf.reshape(latent_input, [batch_size, 1, -1]),
                    [1, npoints, 1])
                with tf.variable_scope('encoder'):
                    latent_mean_std = models.model.model_with_confidence(
                        pc_input,
                        latent_input,
                        is_training=tf.constant(is_training),
                        bn_decay=None,
                        is_encoder=True,
                        latent_size=args.latent_size,
                        scale=args.model_scale,
                        merge_pcs=args.merge_pcs_in_vae_encoder,
                        pointnet_radius=args.pointnet_radius,
                        pointnet_nclusters=args.pointnet_nclusters)
                # First latent_size columns are the mean, the next
                # latent_size columns the second statistic.
                latent_mean = tf.slice(latent_mean_std, [0, 0],
                                       [-1, args.latent_size])
                latent_std = tf.slice(latent_mean_std, [0, args.latent_size],
                                      [-1, args.latent_size])

                with tf.variable_scope('sample_from_latent'):
                    # Reparameterised sample; latent_std is used as a
                    # log-variance here (scale = exp(latent_std / 2)).
                    samples = latent_mean + tf.exp(
                        latent_std / 2.0) * tf.random_normal(
                            latent_mean.shape, 0, 1, dtype=tf.float32)
                data_dict['{}_pred/samples'.format(scope)] = samples
                kl_loss = models.model.kl_divergence(latent_mean, latent_std)
                kl_loss = tf.reduce_mean(kl_loss)
                losses['kl_loss'] = kl_loss * args.kl_loss_weight
                summaries['unscaled_kl_loss'] = kl_loss
            else:
                # For gan just sample random latents.
                samples = tf.random.uniform(
                    [first_dimension, args.latent_size], name='gan_latents')
        else:
            input_pcs = data_dict['{}_pc'.format(scope)]
            samples = data_dict['{}_pred/samples'.format(scope)]

        with tf.variable_scope('decoder'):
            pc_input = tf.slice(input_pcs, [0, 0, 0], [-1, -1, 3])
            latent_input = samples
            batch_size = get_shape(pc_input)[0]
            npoints = get_shape(pc_input)[1]
            latent_input = tf.tile(
                tf.reshape(latent_input, [batch_size, 1, -1]),
                [1, npoints, 1])
            q, t, confidence = models.model.model_with_confidence(
                pc_input,
                latent_input,
                tf.constant(is_training),
                bn_decay=None,
                is_encoder=False,
                latent_size=None,
                scale=args.model_scale,
                pointnet_radius=args.pointnet_radius,
                pointnet_nclusters=args.pointnet_nclusters)
            predicted_qt = tf.concat((q, t), -1)

        data_dict['{}_pred/grasp_qt'.format(scope)] = predicted_qt
        data_dict['{}_pred/confidence'.format(scope)] = confidence
        cp = tf_utils.transform_control_points(
            predicted_qt,
            get_shape(data_dict['{}_pc'.format(scope)])[0],
            scope='transform_predicted_qt')
        data_dict['{}_pred/cps'.format(scope)] = cp

        if is_training:
            if args.gan:
                loss_fn = models.model.min_distance_loss
                summary_fn = models.model.min_distance_better_than_threshold
            else:
                loss_fn = models.model.control_point_l1_loss
                summary_fn = (
                    models.model.control_point_l1_loss_better_than_threshold)

            loss_term, confidence_term = loss_fn(
                cp,
                gt_control_points,
                confidence=confidence,
                confidence_weight=args.confidence_weight)
            data_dict['{}_loss'.format(scope)] = loss_term
            loss_name = 'gan_min_dist' if args.gan else 'L1_grasp_reconstruction'
            losses[loss_name] = loss_term
            losses['confidence'] = confidence_term

            for c in CONFIDENCES:
                qkey = 'quality_at_confidence/{}'.format(c)
                rkey = 'ratio_at_confidence/{}'.format(c)
                summaries[qkey], summaries[rkey] = summary_fn(
                    cp, gt_control_points, confidence, c)

            global_step = tf.train.get_or_create_global_step()
            total_loss = tf.reduce_sum(tf.stack(list(losses.values())))
            summaries['total_loss'] = total_loss
            learning_rate = tf.constant(args.lr, dtype=tf.float32)

            if args.ngpus > 1:
                # Scale the learning rate with the number of workers.
                optimizer = tf.train.AdamOptimizer(learning_rate * hvd.size())
                optimizer = hvd.DistributedOptimizer(optimizer)
            else:
                optimizer = tf.train.AdamOptimizer(learning_rate)

            train_op = optimizer.minimize(total_loss, global_step=global_step)
            summaries['global_step'] = global_step
            for k in losses:
                summaries['loss/{}'.format(k)] = losses[k]

            # logger_dict keeps the raw tensors; the summary ops are built
            # from it without rewriting a dict that is being iterated.
            logger_dict = dict(summaries)
            scalar_summaries = [
                tf.summary.scalar(k, v) for k, v in logger_dict.items()
            ]
            summary_op = tf.summary.merge(scalar_summaries)

    return train_op, summary_op, data_dict, logger_dict, global_step
def build_evaluator_ops(data_dict, args, scope='evaluator', npoints=-1):
    """
    Builds all the tf ops necessary for training/evaluating the evaluator
    network.

    Two modes are selected by the contents of data_dict:
      * Refinement (inference): '<scope>_grasp_eulers' is present. The
        gripper points are rotated by the euler angles and shifted by
        '<scope>_grasp_translations', and the gradient of the predicted
        success with respect to those inputs is stored in data_dict.
      * Training: '<scope>_grasp_eulers' is absent. The gripper points are
        transformed by '<scope>_grasp_rt' and the classification loss is
        optimised.

    Args:
      data_dict: dict, contains all the tensors for input and will be
        populated with more intermediate tensors.
      args: arguments that are set for training.
      scope: string, variable scope and prefix of the data_dict keys.
      npoints: expected number of points in the point cloud, checked in
        refinement mode; args.npoints is used when this is not positive.

    Returns:
      train_op, summary_op, data_dict, logger_dict, global_step
      train_op: tf op for running training.
      summary_op: tf summary op that needs to be run for populating the
        summaries.
      data_dict: dictionary of tensors. Keys are tensor names and values
        are tensors. New keys and tensors will be added to the input
        data_dict.
      logger_dict: dictionary of tensors for printing.
      global_step: tf.Step that keeps the step number of the training.
      train_op, summary_op and logger_dict are None when not training.
    """
    logger_dict = {}
    summary_dict = {}
    global_step = None
    pc = data_dict['{}_pc'.format(scope)]
    # No latent features are supplied to the evaluator model.
    gripper_pc_latent = None
    pc_latent = None

    # Control points with an extra column of ones appended.
    gt_cps = tf_utils.get_control_point_tensor(get_shape(pc)[0])
    ones = tf.ones((get_shape(gt_cps)[0], get_shape(gt_cps)[1], 1),
                   dtype=tf.float32)
    gt_cps = tf.concat((gt_cps, ones), -1)  # B x N x 4
    data_dict['{}_gt_cps'.format(scope)] = gt_cps

    if args.gripper_pc_npoints == -1:
        # Use a pre-defined set of points on the gripper. 5 points. Used in
        # the paper
        grasp_pc_o = gt_cps
    else:
        grasp_pc_o = tf_utils.get_gripper_pc(
            get_shape(pc)[0], args.gripper_pc_npoints)

    if '{}_grasp_eulers'.format(scope) in data_dict:
        # Refinement
        assert args.is_training == False
        assert '{}_grasp_translations'.format(scope) in data_dict
        assert isinstance(data_dict['{}_grasp_eulers'.format(scope)], list)
        assert len(data_dict['{}_grasp_eulers'.format(scope)]) == 3
        sample_batch_size = get_shape(pc)[0]
        sample_rotation = data_dict['{}_grasp_eulers'.format(scope)]
        sample_translation = data_dict['{}_grasp_translations'.format(scope)]

        # Check the input shapes before building the transformation.
        verify_tensor_size(
            pc,
            [sample_batch_size, npoints if npoints > 0 else args.npoints, 3])
        for i in range(3):
            verify_tensor_size(sample_rotation[i], [sample_batch_size])
        verify_tensor_size(sample_translation, [sample_batch_size, 3])

        # Rotate the control points, then translate them.
        # NOTE(review): this branch always uses the control point tensor and
        # ignores grasp_pc_o / args.gripper_pc_npoints -- confirm intended.
        rot = tf_utils.tf_rotation_matrix(*sample_rotation, batched=True)
        grasp_pc = tf_utils.get_control_point_tensor(sample_batch_size)
        grasp_pc = tf.matmul(grasp_pc,
                             rot,
                             transpose_a=False,
                             transpose_b=True)
        grasp_pc += tf.expand_dims(sample_translation, 1)
    else:
        # Training grasp generation
        assert args.is_training
        # NOTE(review): the statements below repeat the gt_cps / grasp_pc_o
        # setup done before the branch; it looks redundant -- confirm before
        # removing, since it changes the ops added to the graph.
        gt_cps = tf_utils.get_control_point_tensor(
            get_shape(pc)[0])  # Samples of the 3d points on the gripper
        ones = tf.ones((get_shape(gt_cps)[0], get_shape(gt_cps)[1], 1),
                       dtype=tf.float32)
        gt_cps = tf.concat((gt_cps, ones), -1)  # B x N x 4
        data_dict['{}_gt_cps'.format(scope)] = gt_cps

        if args.gripper_pc_npoints == -1:
            # Use a pre-defined set of points on the gripper. 5 points. Used
            # in the paper
            grasp_pc_o = gt_cps
        else:
            grasp_pc_o = tf_utils.get_gripper_pc(
                get_shape(pc)[0], args.gripper_pc_npoints)

        grasp_pc = tf.matmul(
            grasp_pc_o,
            data_dict['{}_grasp_rt'.format(scope)],
            transpose_a=False,
            transpose_b=True)  # apply the transformation to the gripper pc
        grasp_pc = tf.slice(grasp_pc, [0, 0, 0],
                            [-1, -1, 3])  # remove last dimension; B x N x 3
        data_dict['{}_grasp_pc'.format(scope)] = grasp_pc
        label = data_dict['{}_label'.format(scope)]

    with tf.variable_scope(scope):
        pc_input = tf.slice(pc, [0, 0, 0], [-1, -1, 3])
        # confidence: confidence of the prediction; Not used now, i.e.
        # confidence==1 (by setting the weight of the confidence loss to a
        # large number).
        success_logit, confidence = models.model.evaluator_model(
            pc_input,
            grasp_pc,
            # May be buggy with the batchnorm with evaluator. Disabled.
            # Right now the evaluator model does not work with batch norm,
            # and I don't know why. VAE is fine with batch norm.
            is_training=tf.constant(False),
            bn_decay=None,
            scale=1,
            pc_latent=pc_latent,
            gripper_pc_latent=gripper_pc_latent)

        data_dict['{}_pred/evaluator'.format(scope)] = tf.nn.softmax(
            success_logit)  # Predicted success
        data_dict['{}_pred/confidence'.format(scope)] = confidence

        if args.is_training:
            global_step = tf.train.get_or_create_global_step()
            loss, confidence_term = models.model.classification_with_confidence_loss(
                success_logit, label, confidence, args.confidence_weight)
            total_loss = loss + confidence_term
            learning_rate = tf.constant(args.lr, tf.float32)

            if args.ngpus == 1:
                optimizer = tf.train.AdamOptimizer(learning_rate)
            else:
                # Scale the learning rate with the number of workers.
                optimizer = tf.train.AdamOptimizer(learning_rate * hvd.size())
                optimizer = hvd.DistributedOptimizer(optimizer)

            # with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(total_loss,
                                          global_step=global_step,
                                          var_list=tf.global_variables())

            # Accuracy and coverage at several confidence thresholds.
            confidences = [0.2, 0.4, 0.6, 0.8]
            for c in confidences:
                acc_at_confidence, ratio_at_confidence = models.model.accuracy_better_than_threshold(
                    success_logit, label, confidence, c)
                summary_dict['ratio_at_each_confidence/' +
                             str(c)] = ratio_at_confidence
                summary_dict['acc_at_Each_confidence/' +
                             str(c)] = acc_at_confidence

            summary_dict['losses/classification_loss'] = loss
            summary_dict['losses/confidence_loss'] = confidence_term
            summary_dict['losses/total_loss'] = total_loss
            summary_dict['step'] = global_step

            logger_dict['predictions'] = tf.math.argmax(success_logit, -1)
            # logger_dict keeps the raw tensors; summary_dict values are
            # replaced by their scalar summary ops.
            for k in summary_dict:
                logger_dict[k] = summary_dict[k]
                summary_dict[k] = tf.summary.scalar(k, summary_dict[k])

            summary_op = tf.summary.merge(list(summary_dict.values()))
        else:
            train_op = None
            summary_op = None
            logger_dict = None
            tf_success = tf.slice(data_dict['{}_pred/evaluator'.format(scope)],
                                  [0, 1], [-1, 1])  # Got the success column
            data_dict['{}_pred/success'.format(scope)] = tf_success
            # Gradient of the success probability with respect to the grasp
            # translation and each of the three euler angles.
            data_dict['{}_gradient'.format(scope)] = tf.gradients(
                tf_success, [
                    data_dict['{}_grasp_translations'.format(scope)],
                    data_dict['{}_grasp_eulers'.format(scope)][0],
                    data_dict['{}_grasp_eulers'.format(scope)][1],
                    data_dict['{}_grasp_eulers'.format(scope)][2]
                ])

    return train_op, summary_op, data_dict, logger_dict, global_step
def model_with_confidence(pc,
                          latent,
                          is_training,
                          bn_decay,
                          is_encoder,
                          latent_size=None,
                          scale=1,
                          merge_pcs=False,
                          pointnet_radius=0.02,
                          pointnet_nclusters=128):
    """
    Builds either the encoder or the decoder head on top of base_network.

    If is_encoder=True, the model outputs the latent mean and log-variance
    concatenated on the last axis. If is_encoder=False, it outputs a
    normalised quaternion, a translation and a sigmoid confidence, i.e. the
    confidence of the network in its prediction.

    Args:
      pc: point cloud tensor.
      latent: per-point features concatenated to pc on the last axis, or,
        when merge_pcs is set, the grasp transforms of shape (B, 4, 4).
      is_training: training flag forwarded to the network; must not be a
        python bool.
      bn_decay: batch norm decay forwarded to the network.
      is_encoder: selects the encoder (True) or decoder (False) head.
      latent_size: size of the latent space; required for the encoder.
      scale: width multiplier of the network.
      merge_pcs: encoder only. Transform the gripper point cloud with the
        grasp and merge it with pc instead of concatenating latent.
      pointnet_radius: radius of the first set abstraction layer.
      pointnet_nclusters: number of clusters of the first set abstraction
        layer.

    Returns:
      Encoder: tensor with mean and log-variance concatenated.
      Decoder: tuple (q, t, confidence).

    Raises:
      AssertionError: if is_training is a python bool, or if latent_size is
        None in encoder mode.
      ValueError: if merge_pcs is set outside encoder mode, or if the grasp
        transform does not have shape (B, 4, 4).
    """
    # `~isinstance(...)` is -1 or -2 and therefore always truthy; `not` is
    # what makes this check effective.
    assert not isinstance(is_training, bool), (
        'is_training must not be a python bool')

    if merge_pcs and not is_encoder:
        raise ValueError(
            'unless in encoder mode, merge_pcs should be False!!!')

    if is_encoder and merge_pcs:
        grasp_rt = latent
        grasp_shape = get_shape(grasp_rt)
        print('encoder: merge_pc: grasp_shape: ', grasp_shape)
        if len(grasp_shape) != 3 or grasp_shape[1] != 4 or grasp_shape[2] != 4:
            # The placeholder was missing, so the shape never made it into
            # the message.
            raise ValueError('invalid grasp shape {}'.format(grasp_shape))

        # Transform the canonical gripper points with the grasp and keep
        # only the first three coordinates.
        gripper_pc = tf.matmul(tf_utils.get_gripper_pc(get_shape(pc)[0], -1),
                               grasp_rt,
                               transpose_a=False,
                               transpose_b=True)
        gripper_pc = tf.slice(gripper_pc, [0, 0, 0], [-1, -1, 3])
        print('gripper_pc = {}, pc = {}'.format(get_shape(gripper_pc),
                                                get_shape(pc)))
        l0_xyz, l0_points = merge_pc_and_gripper_pc(pc, gripper_pc)
        print('l0_xyz = {} l0_points = {}'.format(get_shape(l0_xyz),
                                                  get_shape(l0_points)))
    else:
        # Only concatenate the latent when it is per-point features; in
        # merge mode it is a (B, 4, 4) transform whose result was discarded.
        l0_xyz = pc
        l0_points = tf.concat([l0_xyz, latent], -1)

    net = base_network(l0_xyz, l0_points, is_training, bn_decay, scale,
                       pointnet_radius, pointnet_nclusters)

    if is_encoder:
        assert (latent_size is not None)
        mean = tf_util.fully_connected(net,
                                       latent_size,
                                       activation_fn=None,
                                       scope='fc_mean')
        logvar = tf_util.fully_connected(net,
                                         latent_size,
                                         activation_fn=None,
                                         scope='fc_var')
        return tf.concat((mean, logvar), -1)

    q = tf_util.fully_connected(net, 4, activation_fn=None, scope='fc_q')
    # Unit-normalise the quaternion.
    q = tf.nn.l2_normalize(q, -1)
    t = tf_util.fully_connected(net, 3, activation_fn=None, scope='fc_t')
    confidence = tf_util.fully_connected(net,
                                         1,
                                         activation_fn=None,
                                         scope='fc_conf')
    confidence = tf.nn.sigmoid(confidence)
    return q, t, confidence
def base_network(l0_xyz,
                 l0_points,
                 is_training,
                 bn_decay,
                 scale,
                 pointnet_radius=0.02,
                 pointnet_nclusters=128):
    """
    Backbone shared by the encoder, decoder and evaluator.

    Three set abstraction layers (the last one grouping all points) are
    followed by two fully connected layers with batch norm.

    Args:
      l0_xyz: point coordinates; its first dimension is the batch size.
      l0_points: per-point features.
      is_training: training flag forwarded to every layer.
      bn_decay: batch norm decay forwarded to every layer.
      scale: multiplier applied to every layer width.
      pointnet_radius: radius of the first set abstraction layer.
      pointnet_nclusters: number of points kept by the first set
        abstraction layer.

    Returns:
      Output of the second fully connected layer (1024 * scale units).
    """
    # Arguments shared by all three set abstraction layers.
    sa_common = dict(mlp2=None, is_training=is_training, bn_decay=bn_decay)

    l1_xyz, l1_points, _ = pointnet_sa_module(
        l0_xyz,
        l0_points,
        npoint=pointnet_nclusters,
        radius=pointnet_radius,
        nsample=64,
        mlp=[64 * scale, 64 * scale, 128 * scale],
        group_all=False,
        scope='ssg-layer1',
        **sa_common)
    l2_xyz, l2_points, _ = pointnet_sa_module(
        l1_xyz,
        l1_points,
        npoint=32,
        radius=0.04,
        nsample=128,
        mlp=[128 * scale, 128 * scale, 256 * scale],
        group_all=False,
        scope='ssg-layer2',
        **sa_common)
    _, l3_points, _ = pointnet_sa_module(
        l2_xyz,
        l2_points,
        npoint=None,
        radius=None,
        nsample=None,
        mlp=[256 * scale, 256 * scale, 512 * scale],
        group_all=True,
        scope='ssg-layer3',
        **sa_common)

    # Fully connected layers
    batch_size = get_shape(l0_xyz)[0]
    net = tf.reshape(l3_points, [batch_size, -1])
    for fc_scope in ('fc1', 'fc2'):
        net = tf_util.fully_connected(net,
                                      1024 * scale,
                                      bn=True,
                                      is_training=is_training,
                                      scope=fc_scope,
                                      bn_decay=bn_decay)
    return net
def min_distance_loss(
        pred_control_points,
        gt_control_points,
        confidence=None,
        confidence_weight=None,
        threshold=None,
):
    """
    Computes the minimum distance (L1 distance) between each gt control point
    set and any of the predicted control point sets.

    Args:
      pred_control_points: rank-3 tensor (N_pred, M, C). N_pred is the number
        of predicted grasps, M the number of points on the gripper.
      gt_control_points: rank-3 tensor (N_gt, M, C); all dimensions but the
        first must match pred_control_points.
      confidence: tensor of N_pred, the confidence of each prediction.
      confidence_weight: float, the weight for confidence loss.
      threshold: not used by this function.

    Returns:
      (mean of the per-gt minimum distance, negated confidence term). The
      confidence term is 0. when confidence is None.

    Raises:
      ValueError: if an input is not rank 3 or the trailing dimensions of
        the two inputs differ.
    """
    pred_shape = get_shape(pred_control_points)
    gt_shape = get_shape(gt_control_points)

    if len(pred_shape) != 3:
        raise ValueError(
            "pred_control_point should have len of 3. {}".format(pred_shape))
    if len(gt_shape) != 3:
        raise ValueError(
            "gt_control_point should have len of 3. {}".format(gt_shape))
    # Only the first dimension (number of grasps) is allowed to differ.
    if any(p != g for p, g in zip(pred_shape[1:], gt_shape[1:])):
        raise ValueError("shapes do no match {} != {}".format(
            pred_shape, gt_shape))

    # Pairwise differences: (N_pred, N_gt, M, C)
    pairwise = tf.expand_dims(pred_control_points, 1) - tf.expand_dims(
        gt_control_points, 0)
    pairwise = tf.reduce_sum(tf.abs(pairwise), -1)  # L1: (N_pred, N_gt, M)
    pairwise = tf.reduce_mean(pairwise, -1)  # mean over points: (N_pred, N_gt)
    # Closest prediction for every gt grasp: (N_gt)
    min_distance_error = tf.reduce_min(pairwise, 0)

    if confidence is None:
        confidence_term = 0.
    else:
        closest_index = tf.argmin(pairwise, 0)  # (N_gt)
        # One-hot selection of the closest prediction: (N_gt, N_pred)
        closest_confidence = tf.one_hot(closest_index,
                                        axis=-1,
                                        depth=pred_shape[0])
        # NOTE(review): this picks the confidence of the closest prediction
        # only if confidence has shape (N_pred,); a (N_pred, 1) tensor would
        # broadcast differently -- confirm against the callers.
        closest_confidence = closest_confidence * tf.expand_dims(
            confidence, 0)
        closest_confidence = tf.reduce_sum(closest_confidence, -1)  # N_gt
        min_distance_error = min_distance_error * closest_confidence
        # Clamp before the log so a zero confidence cannot produce -inf.
        confidence_term = tf.reduce_mean(
            tf.log(tf.maximum(confidence, 1e-4))) * confidence_weight

    return tf.reduce_mean(min_distance_error), -confidence_term