def execute(configs):
    """Train and evaluate a VQ-VAE with a 2-D grid of embeddings.

    Builds a TF1 graph (convolutional encoder, nearest-embedding quantisation
    on a ``som_dim x som_dim`` grid, upsampling decoder), trains it on every
    dataset named in ``configs["DATASETS"]`` and prints the test
    reconstruction MSE plus NMI and purity of the test cluster assignments.

    Args:
        configs: dict read for the keys ``random_state``, ``latent_dim``,
            ``som_dim``, ``conv_size``, ``batch_size``, ``n_epochs``,
            ``DATASETS`` (iterable of ``"mnist"`` / ``"fashion"``) and
            ``debug_mode`` (when falsy, a TSV with the results of the first
            run is written under ``../results/``).

    Raises:
        ValueError: if a dataset name is neither ``"mnist"`` nor ``"fashion"``.
    """
    tf.reset_default_graph()
    random.seed(configs["random_state"])
    nprand.seed(configs["random_state"])

    DECAY_FACTOR = 0.80
    decay_steps = 1000
    latent_dim = configs["latent_dim"]
    som_dim = [configs["som_dim"], configs["som_dim"]]
    conv_size = configs["conv_size"]

    global_step = tf.Variable(0, trainable=False, name="global_step")
    embeddings = tf.get_variable(
        "embeddings", som_dim + [latent_dim],
        initializer=tf.truncated_normal_initializer(stddev=0.05))

    x = tf.placeholder(tf.float32, shape=[None, 784])
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    # Not used by the unsupervised objective; kept so the set of graph
    # placeholders is unchanged.
    y = tf.placeholder(tf.int32, shape=[None])
    # Fed on every run but currently not consumed by any op in the graph.
    train = tf.placeholder(tf.bool, name="train")
    batch_size = tf.shape(x)[0]

    with tf.variable_scope("encoder"):
        h_conv1 = tf.nn.relu(
            conv2d(x_image, [4, 4, 1, conv_size], "conv1"))
        h_pool1 = max_pool_2x2(h_conv1)
        h_conv2 = tf.nn.relu(
            conv2d(h_pool1, [4, 4, conv_size, conv_size], "conv2"))
        h_pool2 = max_pool_2x2(h_conv2)
        flat_size = 7 * 7 * conv_size
        h_flat = tf.reshape(h_pool2, [batch_size, flat_size])
        z_e = tf.keras.layers.Dense(latent_dim)(h_flat)

    # Squared distance of every encoding to every grid embedding, then the
    # flat index of the closest one, unravelled into (row, col) grid indices.
    z_dist = tf.squared_difference(
        tf.expand_dims(tf.expand_dims(z_e, 1), 1),
        tf.expand_dims(embeddings, 0))
    z_dist_red = tf.reduce_sum(z_dist, axis=-1)
    z_dist_flat = tf.reshape(z_dist_red, [batch_size, -1])
    k = tf.argmin(z_dist_flat, axis=-1)
    k_1 = k // som_dim[1]
    k_2 = k % som_dim[1]
    k_stacked = tf.stack([k_1, k_2], axis=1)
    z_q = tf.gather_nd(embeddings, k_stacked)

    def decoder(z_tensor):
        """Decode latent vectors back to sigmoid-activated images."""
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            h_flat_dec = tf.keras.layers.Dense(flat_size)(z_tensor)
            h_reshaped = tf.reshape(h_flat_dec, tf.shape(h_pool2))
            h_unpool1 = tf.keras.layers.UpSampling2D((2, 2))(h_reshaped)
            h_deconv1 = tf.nn.relu(
                conv2d(h_unpool1, [4, 4, conv_size, conv_size], "deconv1"))
            h_unpool2 = tf.keras.layers.UpSampling2D((2, 2))(h_deconv1)
            h_deconv2 = tf.nn.sigmoid(
                conv2d(h_unpool2, [4, 4, conv_size, 1], "deconv2"))
            return h_deconv2

    x_hat = decoder(z_q)

    beta = 0.25
    loss_rec_mse = tf.losses.mean_squared_error(x_image, x_hat)
    loss_vq = tf.reduce_mean(
        tf.squared_difference(tf.stop_gradient(z_e), z_q))
    loss_commit = tf.reduce_mean(
        tf.squared_difference(z_e, tf.stop_gradient(z_q)))
    loss = loss_rec_mse + loss_vq + beta * loss_commit

    learning_rate = tf.placeholder_with_default(0.001, [])
    lr_decay = tf.train.exponential_decay(
        learning_rate, global_step, decay_steps, DECAY_FACTOR, staircase=True)

    decoder_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, "decoder")
    decoder_grads = list(zip(tf.gradients(loss, decoder_vars), decoder_vars))

    # The quantisation step has no gradient, so the reconstruction gradient
    # w.r.t. z_q is applied at z_e and back-propagated into the encoder,
    # together with the commitment term.
    encoder_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, "encoder")
    grad_z = tf.gradients(loss_rec_mse, z_q)
    encoder_grads = [
        (tf.gradients(z_e, var, grad_z)[0]
         + beta * tf.gradients(loss_commit, var)[0], var)
        for var in encoder_vars
    ]
    embed_grads = list(zip(tf.gradients(loss_vq, embeddings), [embeddings]))

    optimizer = tf.train.AdamOptimizer(lr_decay)
    # global_step must be handed to apply_gradients, otherwise it stays at 0
    # and the exponential decay defined above never takes effect.
    train_step = optimizer.apply_gradients(
        decoder_grads + encoder_grads + embed_grads,
        global_step=global_step)

    BATCH_SIZE = configs["batch_size"]
    EPOCHS = configs["n_epochs"]
    NUM_TESTS = 1
    EVAL_BATCH = 100

    for data_set in configs["DATASETS"]:
        if data_set == "mnist":
            ds_train, ds_test = tf.keras.datasets.mnist.load_data()
        elif data_set == "fashion":
            ds_train, ds_test = tf.keras.datasets.fashion_mnist.load_data()
        else:
            raise ValueError(
                "unknown dataset {!r}; expected 'mnist' or 'fashion'".format(
                    data_set))

        # NOTE(review): pixels are fed exactly as returned by the loader while
        # the decoder ends in a sigmoid — confirm whether the inputs should be
        # rescaled to [0, 1] before training.
        data_train = ds_train[0]
        data_train = np.reshape(
            data_train,
            (data_train.shape[0], data_train.shape[1] * data_train.shape[2]))
        data_test = ds_test[0]
        data_test = np.reshape(
            data_test,
            (data_test.shape[0], data_test.shape[1] * data_test.shape[2]))
        labels_test = ds_test[1]
        labels_train = ds_train[1]

        aggregated_mses = []
        aggregated_NMIs = []
        aggregated_purities = []

        for _ in range(NUM_TESTS):
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                indices_unsup = np.arange(data_train.shape[0])
                batches_per_epoch = indices_unsup.shape[0] // BATCH_SIZE
                with tqdm(total=EPOCHS * batches_per_epoch) as pbar:
                    for epoch in range(EPOCHS):
                        np.random.shuffle(indices_unsup)
                        test_mse = sess.run(
                            loss_rec_mse,
                            feed_dict={x: data_test[:100], train: False})
                        for i in range(batches_per_epoch):
                            batch_data = data_train[
                                indices_unsup[BATCH_SIZE * i:
                                              BATCH_SIZE * (i + 1)]]
                            # Refresh the displayed training metrics every
                            # 100 batches (always on the first one).
                            if i % 100 == 0:
                                train_mse, train_commit = sess.run(
                                    [loss_rec_mse, loss_commit],
                                    feed_dict={x: batch_data, train: False})
                            train_step.run(
                                feed_dict={x: batch_data, train: True})
                            pbar.set_postfix(
                                epoch=epoch, train_mse=train_mse,
                                train_commit=train_commit, test_mse=test_mse,
                                refresh=False)
                            pbar.update(1)

                test_k_all = []
                test_x_hat_all = []
                for i in trange(data_test.shape[0] // EVAL_BATCH):
                    batch_data = data_test[EVAL_BATCH * i:
                                           EVAL_BATCH * (i + 1)]
                    # One forward pass yields both the cluster index and the
                    # reconstruction.
                    batch_k, batch_x_hat = sess.run(
                        [k, x_hat],
                        feed_dict={x: batch_data, train: False})
                    test_k_all.extend(batch_k)
                    test_x_hat_all.extend(batch_x_hat)

                test_x_hat_all = np.array(test_x_hat_all)
                test_k_all = np.array(test_k_all)
                # Only whole evaluation batches are processed, so compare
                # against exactly the samples that were reconstructed.
                num_eval = len(test_k_all)
                aggregated_mses.append(
                    mean_squared_error(
                        data_test[:num_eval],
                        np.reshape(test_x_hat_all, [num_eval, -1])))
                aggregated_NMIs.append(
                    normalized_mutual_info_score(
                        test_k_all, labels_test[:num_eval]))
                aggregated_purities.append(
                    cluster_purity(test_k_all, labels_test[:num_eval]))

        print("Results for {}".format(data_set))
        print("Test MSE: {} +- {}\nTest NMI: {} +- {}\nTest purity: {} +- {}".
              format(np.mean(aggregated_mses),
                     np.std(aggregated_mses) / np.sqrt(NUM_TESTS),
                     np.mean(aggregated_NMIs),
                     np.std(aggregated_NMIs) / np.sqrt(NUM_TESTS),
                     np.mean(aggregated_purities),
                     np.std(aggregated_purities) / np.sqrt(NUM_TESTS)))

        if not configs["debug_mode"]:
            with open(
                    "../results/vqvae_{}_{}_somdim_{}.tsv".format(
                        data_set, configs["random_state"],
                        configs["som_dim"]), 'w') as fp:
                csv_fp = csv.writer(fp, delimiter='\t')
                csv_fp.writerow(["model", "mse", "nmi", "purity"])
                csv_fp.writerow([
                    "vqvae",
                    str(aggregated_mses[0]),
                    str(aggregated_NMIs[0]),
                    str(aggregated_purities[0])
                ])
def fuse_features(nodes, weight_method):
    """Fuse features from different resolutions into one weighted sum.

    Args:
      nodes: a list of tensorflow features at different levels.
      weight_method: feature fusion method. One of:
        - "attn" - softmax over one learned scalar weight per input.
        - "fastattn" - fast normalized fusion: ReLU of one learned scalar
          weight per input, divided by the sum of those weights.
        - "channel_attn" - like "attn", with one learned weight per channel.
        - "channel_fastattn" - like "fastattn", with one learned weight per
          channel.
        - "sum" - a plain sum of inputs.

    Returns:
      A tensor denoting the fused feature.

    Raises:
      ValueError: if `weight_method` is none of the methods above.
    """
    dtype = nodes[0].dtype

    if weight_method == 'attn':
        # One scalar variable per input, normalised with a softmax.
        scalar_weights = [
            tf.cast(tf.Variable(1.0, name='WSM'), dtype=dtype) for _ in nodes
        ]
        softmax_weights = tf.nn.softmax(tf.stack(scalar_weights))
        stacked = tf.stack(nodes, axis=-1)
        return tf.reduce_sum(stacked * softmax_weights, -1)

    if weight_method == 'fastattn':
        # ReLU keeps the weights non-negative; the small constant guards the
        # division when all of them are zero.
        scalar_weights = [
            tf.nn.relu(tf.cast(tf.Variable(1.0, name='WSM'), dtype=dtype))
            for _ in nodes
        ]
        total = tf.add_n(scalar_weights)
        scaled = [
            node * weight / (total + 0.0001)
            for node, weight in zip(nodes, scalar_weights)
        ]
        return tf.add_n(scaled)

    if weight_method == 'channel_attn':
        num_filters = int(nodes[0].shape[-1])
        channel_weights = [
            tf.cast(
                tf.Variable(lambda: tf.ones([num_filters]), name='WSM'),
                dtype=dtype) for _ in nodes
        ]
        softmax_weights = tf.nn.softmax(
            tf.stack(channel_weights, -1), axis=-1)
        stacked = tf.stack(nodes, axis=-1)
        return tf.reduce_sum(stacked * softmax_weights, -1)

    if weight_method == 'channel_fastattn':
        num_filters = int(nodes[0].shape[-1])
        channel_weights = [
            tf.nn.relu(
                tf.cast(
                    tf.Variable(lambda: tf.ones([num_filters]), name='WSM'),
                    dtype=dtype)) for _ in nodes
        ]
        total = tf.add_n(channel_weights)
        scaled = [
            node * weight / (total + 0.0001)
            for node, weight in zip(nodes, channel_weights)
        ]
        return tf.add_n(scaled)

    if weight_method == 'sum':
        return tf.add_n(nodes)

    raise ValueError('unknown weight_method {}'.format(weight_method))
def build_act_enjoy(make_obs_ph, q_func, num_actions, noisy=False,
                    scope="deepq", reuse=None, attack=None, model_path=''):
    """Build the acting function and, optionally, an adversarial-input crafter.

    Args:
        make_obs_ph: callable taking a name and returning the observation
            placeholder (wrapped with ``U.ensure_tf_input``).
        q_func: model builder called as
            ``q_func(obs, num_actions, scope=..., ...)``.
        num_actions: number of discrete actions.
        noisy: forwarded to ``q_func`` as ``noisy``.
        scope: variable scope everything is built under.
        reuse: forwarded to ``tf.variable_scope``.
        attack: ``None`` (no attack graph), ``'fgsm'``, ``'iterative'`` or
            ``'cwl2'``.
        model_path: checkpoint path handed to ``U.load_state`` before the
            attack graph is built; a failed load is logged and ignored.

    Returns:
        ``act`` when ``attack`` is ``None``; otherwise the tuple
        ``(act, craft_adv_obs)`` where ``craft_adv_obs`` maps observations to
        adversarial observations scaled by 255.

    Raises:
        ValueError: if ``attack`` is not one of the supported values.
    """
    import logging  # local import: the rest of the module may not need it

    # Fail early, before any graph is built, instead of hitting an unbound
    # `adv_observations` further down.
    if attack not in (None, 'fgsm', 'iterative', 'cwl2'):
        raise ValueError(
            "unknown attack {!r}; expected None, 'fgsm', 'iterative' or "
            "'cwl2'".format(attack))

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable(
            "eps", (), initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions,
                          scope="q_func", noisy=noisy)
        q_values = q_values.get_logits(observations_ph.get())

        deterministic_actions = tf.argmax(q_values, axis=1)

        # Epsilon-greedy: per sample, replace the greedy action by a uniformly
        # random one with probability eps.
        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=num_actions,
            dtype=tf.int64)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1,
            dtype=tf.float32) < eps
        stochastic_actions = tf.where(
            chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(
            stochastic_ph,
            lambda: stochastic_actions,
            lambda: deterministic_actions)
        # A negative update_eps leaves eps untouched.
        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))

        act = U.function(
            inputs=[observations_ph, stochastic_ph, update_eps_ph],
            outputs=output_actions,
            givens={update_eps_ph: -1.0, stochastic_ph: True},
            updates=[update_eps_expr])

        # Load model before attacks graph construction so that TF won't
        # complain can't load parameters for attack
        try:
            U.load_state(model_path)
        except Exception as err:  # best effort: keep going without weights
            logging.warning(
                "could not load model state from %r: %s", model_path, err)

        if attack is None:
            return act

        if attack == 'fgsm':
            def wrapper(x):
                return q_func(x, num_actions, scope="q_func", reuse=True,
                              concat_softmax=True, noisy=noisy)
            adversary = FastGradientMethod(
                CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
            adv_observations = adversary.generate(
                observations_ph.get(), eps=1.0/255.0,
                clip_min=0, clip_max=1.0) * 255.0
        elif attack == 'iterative':
            # NOTE(review): unlike 'fgsm', `noisy` is not forwarded here or in
            # the 'cwl2' branch — confirm this is intended for noisy nets.
            def wrapper(x):
                return q_func(x, num_actions, scope="q_func", reuse=True,
                              concat_softmax=True)
            adversary = BasicIterativeMethod(
                CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
            adv_observations = adversary.generate(
                observations_ph.get(), eps=1.0/255.0,
                clip_min=0, clip_max=1.0) * 255.0
        else:  # attack == 'cwl2'
            def wrapper(x):
                return q_func(x, num_actions, scope="q_func", reuse=True)
            adversary = CarliniWagnerL2(
                CallableModelWrapper(wrapper, 'logits'), sess=U.get_session())
            cw_params = {'binary_search_steps': 1,
                         'max_iterations': 100,
                         'learning_rate': 0.1,
                         'initial_const': 10,
                         'clip_min': 0,
                         'clip_max': 1.0}
            adv_observations = adversary.generate(
                observations_ph.get(), **cw_params) * 255.0

        craft_adv_obs = U.function(
            inputs=[observations_ph, stochastic_ph, update_eps_ph],
            outputs=adv_observations,
            givens={update_eps_ph: -1.0, stochastic_ph: True},
            updates=[update_eps_expr])

        return act, craft_adv_obs
def mode(self):
    """Return the mode of every categorical, stacked on the last axis as int32."""
    per_categorical_modes = [p.mode() for p in self.categoricals]
    stacked_modes = tf.stack(per_categorical_modes, axis=-1)
    return tf.cast(stacked_modes, tf.int32)
def single_level_feature_crop(features, level_boxes, detection_prior_levels,
                              min_mask_level, mask_crop_size):
    """Gather a square feature crop for each detection from its assigned level.

    Args:
      features: a float tensor of shape [batch_size, num_levels,
        max_feature_size, max_feature_size, num_downsample_channels].
      level_boxes: a float Tensor of the level boxes to crop from.
        [batch_size, num_instances, 4]. Only the first two coordinates are
        read: index 0 is the y start and index 1 the x start of the crop.
      detection_prior_levels: an int Tensor of instance assigned level of shape
        [batch_size, num_instances].
      min_mask_level: minimum FPN level to crop mask feature from.
      mask_crop_size: an int of mask crop size.

    Returns:
      crop_features: a float Tensor of shape [batch_size * num_instances,
        mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
        instance feature crop.
    """
    (batch_size, num_levels, max_feature_size,
     _, num_downsample_channels) = features.get_shape().as_list()
    _, num_of_instances, _ = level_boxes.get_shape().as_list()
    level_boxes = tf.cast(level_boxes, tf.int32)
    assert num_of_instances == detection_prior_levels.get_shape().as_list()[1]

    x_start_indices = level_boxes[:, :, 1]
    y_start_indices = level_boxes[:, :, 0]

    # Expand each start index into the full run of mask_crop_size indices.
    crop_offsets = range(mask_crop_size)
    x_indices = tf.stack(
        [x_start_indices + offset for offset in crop_offsets], axis=2)
    y_indices = tf.stack(
        [y_start_indices + offset for offset in crop_offsets], axis=2)

    levels = detection_prior_levels - min_mask_level

    # Strides of the flattened [batch, level, height, width] layout.
    height_dim_size = max_feature_size
    level_dim_size = max_feature_size * height_dim_size
    batch_dim_size = num_levels * level_dim_size

    # TODO(weicheng) change this to gather_nd for better readability.
    batch_term = tf.tile(
        tf.reshape(
            tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
        [1, num_of_instances, mask_crop_size, mask_crop_size])
    level_term = tf.tile(
        tf.reshape(levels * level_dim_size,
                   [batch_size, num_of_instances, 1, 1]),
        [1, 1, mask_crop_size, mask_crop_size])
    row_term = tf.tile(
        tf.reshape(y_indices * height_dim_size,
                   [batch_size, num_of_instances, mask_crop_size, 1]),
        [1, 1, 1, mask_crop_size])
    col_term = tf.tile(
        tf.reshape(x_indices,
                   [batch_size, num_of_instances, 1, mask_crop_size]),
        [1, 1, mask_crop_size, 1])
    flat_indices = tf.reshape(
        batch_term + level_term + row_term + col_term, [-1])

    flat_features = tf.reshape(features, [-1, num_downsample_channels])
    gathered = tf.gather(flat_features, flat_indices)
    crop_shape = [
        batch_size * num_of_instances, mask_crop_size, mask_crop_size,
        num_downsample_channels
    ]
    return tf.reshape(gathered, crop_shape)
def _createStackBidirectionalDynamicRNN(self,
                                        use_gpu,
                                        use_shape,
                                        use_state_tuple,
                                        initial_states_fw=None,
                                        initial_states_bw=None,
                                        scope=None):
    """Build a two-layer stacked bidirectional dynamic RNN test fixture.

    `use_gpu` and `use_state_tuple` are accepted but ignored. The forward and
    backward cell lists are stored on `self.cells_fw` / `self.cells_bw`, the
    layer sizes on `self.layers`, and the static output shape is asserted.

    Returns:
      A tuple (input_value, inputs, outputs, st_fw, st_bw, sequence_length).
    """
    del use_gpu
    del use_state_tuple

    self.layers = [2, 3]
    input_size = 5
    batch_size = 2
    max_length = 8

    initializer = tf.random_uniform_initializer(-0.01, 0.01, seed=self._seed)
    sequence_length = tf.placeholder(tf.int64)

    def build_cells():
        # One LSTM cell per entry of self.layers.
        return [
            rnn_cell.LSTMCell(
                units,
                input_size,
                initializer=initializer,
                state_is_tuple=False) for units in self.layers
        ]

    self.cells_fw = build_cells()
    self.cells_bw = build_cells()

    # The same placeholder object is repeated for every time step.
    static_batch = batch_size if use_shape else None
    step_input = tf.placeholder(
        tf.float32, shape=(static_batch, input_size))
    inputs = [step_input] * max_length

    # Stack to time-major, then switch to batch-major.
    inputs_c = tf.transpose(tf.stack(inputs), [1, 0, 2])

    outputs, st_fw, st_bw = contrib_rnn.stack_bidirectional_dynamic_rnn(
        self.cells_fw,
        self.cells_bw,
        inputs_c,
        initial_states_fw=initial_states_fw,
        initial_states_bw=initial_states_bw,
        dtype=tf.float32,
        sequence_length=sequence_length,
        scope=scope)

    # Outputs has shape (batch_size, max_length, 2 * layers[-1]).
    expected_shape = [static_batch, max_length, 2 * self.layers[-1]]
    self.assertAllEqual(outputs.get_shape().as_list(), expected_shape)

    input_value = np.random.randn(batch_size, input_size)
    return input_value, inputs, outputs, st_fw, st_bw, sequence_length
def iou_loss(pred_boxes: FloatType,
             target_boxes: FloatType,
             iou_type: Text = 'iou') -> tf.Tensor:
    """A unified interface for computing various IoU losses.

    Let B denote the predicted box and B_gt the target (ground truth) box:

      IoU = |B & B_gt| / |B | B_gt|

      GIoU = IoU - |C - B U B_gt| / C, where C is the smallest box covering B
      and B_gt.

      DIoU = IoU - E(B, B_gt)^2 / c^2, where E is the Euclidean distance of
      the center points of B and B_gt, and c is the diagonal length of the
      smallest box covering the two boxes.

      CIoU = DIoU - a * v, where a is a positive trade-off parameter, and
      v measures the consistency of aspect ratio:
        v = (arctan(w_gt / h_gt) - arctan(w / h))^2 * 4 / pi^2
      where (w_gt, h_gt) and (w, h) are the width and height of the target
      and predicted box respectively.

    The returned loss is computed as 1 - one of {IoU, GIoU, DIoU, CIoU}.
    Targets whose area is zero are treated as padding and contribute a loss
    of exactly zero.

    Args:
      pred_boxes: predicted boxes, with coordinate [y_min, x_min, y_max,
        x_max]*. It can be multiple anchors, with each anchor box has four
        coordinates.
      target_boxes: target boxes, with coordinate [y_min, x_min, y_max,
        x_max]*. It can be multiple anchors, with each anchor box has four
        coordinates.
      iou_type: one of ['iou', 'ciou', 'diou', 'giou'].

    Returns:
      IoU loss float `Tensor`; with several anchors the per-anchor losses are
      summed.

    Raises:
      ValueError: if `iou_type` is unknown, or if the last dimensions of the
        two inputs differ or are not a multiple of 4.
    """
    if iou_type not in ('iou', 'ciou', 'diou', 'giou'):
        raise ValueError(
            'Unknown loss_type {}, not iou/ciou/diou/giou'.format(iou_type))

    pred_boxes = tf.convert_to_tensor(pred_boxes, tf.float32)
    target_boxes = tf.cast(target_boxes, pred_boxes.dtype)

    # t_ denotes target boxes and p_ denotes predicted boxes: (y, x, y_max, x_max)
    pred_boxes_list = tf.unstack(pred_boxes, None, axis=-1)
    target_boxes_list = tf.unstack(target_boxes, None, axis=-1)
    if len(pred_boxes_list) != len(target_boxes_list):
        raise ValueError(
            'pred_boxes and target_boxes must have the same last dimension, '
            'got {} and {}'.format(
                len(pred_boxes_list), len(target_boxes_list)))
    if len(pred_boxes_list) % 4 != 0:
        raise ValueError(
            'last dimension must be a multiple of 4, got {}'.format(
                len(pred_boxes_list)))

    iou_loss_list = []
    for i in range(0, len(pred_boxes_list), 4):
        anchor_pred = pred_boxes_list[i:i + 4]
        anchor_target = target_boxes_list[i:i + 4]

        # Compute mask: 1 for real targets, 0 for zero-area (padding) targets.
        t_ymin, t_xmin, t_ymax, t_xmax = anchor_target
        mask = tf.not_equal((t_ymax - t_ymin) * (t_xmax - t_xmin), 0)
        mask = tf.cast(mask, t_ymin.dtype)

        anchor_pred = [b * mask for b in anchor_pred]
        iou = tf.squeeze(_iou_per_anchor(anchor_pred, anchor_target, iou_type))
        # Loss is mask * (1 - iou). Writing it as `mask - iou` would leave a
        # `-iou` contribution for masked-out targets.
        iou_loss_list.append(mask * (1 - iou))

    if len(iou_loss_list) == 1:
        return iou_loss_list[0]
    return tf.reduce_sum(tf.stack(iou_loss_list), 0)
def build_bifpn_layer(feats, fpn_name, fpn_config, is_training, input_size,
                      fpn_num_filters, min_level, max_level, separable_conv,
                      apply_bn_for_resampling, conv_after_downsample,
                      use_native_resize_op, conv_bn_relu_pattern,
                      pooling_type):
    """Builds a feature pyramid given previous feature pyramid and config.

    For every node in the config, the input features are resampled to the
    node's width, fused with `fuse_features` according to
    `config.weight_method`, and passed through a 3x3 (optionally separable)
    convolution and batch norm.

    Args:
      feats: list of input feature tensors. Every newly built node is appended
        to this list in place.
      fpn_name: name used to look up the config when `fpn_config` is falsy.
      fpn_config: config exposing `nodes` and `weight_method`; when falsy,
        `get_fpn_config(fpn_name)` is used.
      is_training: forwarded to resampling and batch norm.
      input_size: input image size; a node's width is
        `int(width_ratio * input_size)`.
      fpn_num_filters: number of filters of every fused node.
      min_level: lowest pyramid level to return.
      max_level: highest pyramid level to return (inclusive).
      separable_conv: use a depthwise-separable conv instead of a regular one.
      apply_bn_for_resampling: forwarded to `resample_feature_map`.
      conv_after_downsample: forwarded to `resample_feature_map`.
      use_native_resize_op: forwarded to `resample_feature_map`.
      conv_bn_relu_pattern: if truthy, use conv (no bias) -> bn -> relu;
        otherwise relu -> conv (with bias) -> bn.
      pooling_type: forwarded to `resample_feature_map`.

    Returns:
      A dict mapping each level in [min_level, max_level] for which a node
      with matching width ratio exists to the last such node built.

    Raises:
      ValueError: propagated from `fuse_features` for an unknown
        `config.weight_method`.
    """
    config = fpn_config or get_fpn_config(fpn_name)

    for i, fnode in enumerate(config.nodes):
        with tf.variable_scope('fnode{}'.format(i)):
            logging.info('fnode %d : %s', i, fnode)
            new_node_width = int(fnode['width_ratio'] * input_size)
            nodes = [
                resample_feature_map(
                    feats[input_offset],
                    '{}_{}_{}'.format(idx, input_offset, len(feats)),
                    new_node_width, fpn_num_filters,
                    apply_bn_for_resampling, is_training,
                    conv_after_downsample, use_native_resize_op,
                    pooling_type)
                for idx, input_offset in enumerate(fnode['inputs_offsets'])
            ]

            # Combine all nodes with the shared fusion helper so both code
            # paths support the same set of weight methods.
            new_node = fuse_features(nodes, config.weight_method)

            with tf.variable_scope('op_after_combine{}'.format(len(feats))):
                if not conv_bn_relu_pattern:
                    new_node = utils.relu_fn(new_node)

                if separable_conv:
                    conv_op = functools.partial(
                        tf.layers.separable_conv2d, depth_multiplier=1)
                else:
                    conv_op = tf.layers.conv2d

                new_node = conv_op(
                    new_node,
                    filters=fpn_num_filters,
                    kernel_size=(3, 3),
                    padding='same',
                    use_bias=not conv_bn_relu_pattern,
                    name='conv')

                new_node = utils.batch_norm_relu(
                    new_node,
                    is_training_bn=is_training,
                    relu=bool(conv_bn_relu_pattern),
                    data_format='channels_last',
                    name='bn')

            feats.append(new_node)

    # For every requested level pick the most recently built node whose
    # width ratio matches that level.
    output_feats = {}
    for level in range(min_level, max_level + 1):
        for i, fnode in enumerate(reversed(config.nodes)):
            if fnode['width_ratio'] == F(level):
                output_feats[level] = feats[-1 - i]
                break
    return output_feats
def generate_trips(self, min_gap=1, max_gap=5):
    """Generate a tf Dataset of frame groups sampled at fixed offsets.

    For every stride in [min_gap, max_gap] and every valid start index i, one
    sample stacks four frames along axis 1, in this order: frame i,
    frame i + 2 * stride, the middle frame i + stride, and a randomly chosen
    frame within [-40, 40) of the middle frame (clipped to the sequence).

    Args:
      min_gap: (int) the minimum offset between two neighbouring frames of a
        sample. Must be at least 1.
      max_gap: (int) the maximum offset between two neighbouring frames of a
        sample. `2 * max_gap` must not exceed the sequence length.

    Returns:
      A tf.data.Dataset of ViewTrip objects built from the stacked
      timestamps, rgb, pano, depth, normal and pose entries.

    Raises:
      ValueError: if `min_gap < 1` or `max_gap < min_gap`.
    """
    if min_gap < 1 or max_gap < min_gap:
        raise ValueError(
            'expected 1 <= min_gap <= max_gap, got min_gap={} max_gap={}'
            .format(min_gap, max_gap))

    def mapper(timestamp_trips, rgb_trips, pano_trips, depth_trips,
               normal_trips, pose_trips):
        """A function mapping a data tuple to ViewTrip."""
        return ViewTrip(self.scene_id, self.sequence_id, timestamp_trips,
                        rgb_trips, pano_trips, depth_trips, normal_trips,
                        tf.zeros([1]), pose_trips, self.intrinsics[0],
                        self.resolution[0])

    length = self.length()
    # Each stride yields `length - 2 * stride` samples, so the largest stride
    # needs 2 * max_gap <= length; a weaker check lets a negative sample
    # count through to tf.random.uniform.
    with tf.control_dependencies([
        tf.Assert(tf.less_equal(2 * max_gap, length), [max_gap, length])
    ]):
        # Order matches the argument order of `mapper`.
        sequences = (self.timestamp, self.rgb, self.pano, self.depth,
                     self.normal, self.pose)
        stacked_per_sequence = [[] for _ in sequences]

        # generate samples with an offset that ranges
        # from 'min_gap' to 'max_gap'.
        for stride in range(min_gap, max_gap + 1):
            inds = tf.range(stride, length - stride)
            inds_jitter = tf.random.uniform(
                minval=-40,
                maxval=40,
                shape=[length - 2 * stride],
                dtype=tf.int32)
            rand_inds = tf.minimum(
                tf.maximum(inds + inds_jitter, 0), length - 1)

            # The same random indices are shared by all six sequences so the
            # modalities of one sample stay aligned.
            for stacked, seq in zip(stacked_per_sequence, sequences):
                stacked.append(
                    tf.stack([
                        seq[:-2 * stride],
                        seq[2 * stride:],
                        seq[stride:-stride],
                        tf.gather(seq, rand_inds)
                    ], axis=1))

        trips = tuple(
            tf.concat(stacked, 0) for stacked in stacked_per_sequence)
        dataset = tf.data.Dataset.from_tensor_slices(trips)
        return dataset.map(mapper)
def get(self):
    """Provides input data to the graph.

    Reads fixed-length records from `self.path_to_db`. Each record holds, in
    this order, float32 keypoint xyz coordinates, float32 keypoint uv
    coordinates with visibility flags, and a uint8 RGB image. The method
    derives normalised / local / canonical keypoint coordinates, an optional
    hand crop with adapted camera intrinsics, and keypoint scoremaps, then
    batches everything.

    Returns:
        dict mapping item names to batched tensors. Unless
        `self.random_crop_to_size` is set, the keys are 'keypoint_xyz21',
        'keypoint_vis21', 'keypoint_uv21', 'image', 'cam_mat', 'hand_side',
        'keypoint_scale', 'keypoint_xyz21_normed', 'keypoint_xyz21_local',
        'keypoint_xyz21_can', 'rot_mat', 'scoremap' and, when
        `self.hand_crop` is set, 'crop_scale' and 'image_crop'.
    """
    # calculate size of each record (this lists what is contained in the db and how many bytes are occupied)
    record_bytes = 0

    encoding_bytes = 4  # every keypoint entry is a float32
    kp_xyz_entries = 3 * self.num_kp
    record_bytes += encoding_bytes*kp_xyz_entries

    kp_uv_entries = 2 * self.num_kp
    record_bytes += encoding_bytes*kp_uv_entries

    kp_vis_entries = self.num_kp
    record_bytes += encoding_bytes*kp_vis_entries

    image_bytes = self.image_size[0] * self.image_size[1] * 3
    record_bytes += image_bytes

    # ---- READ DATA ITEMS ----
    # Start reader
    reader = tf.FixedLengthRecordReader(header_bytes=0, record_bytes=record_bytes)
    _, value = reader.read(tf.train.string_input_producer([self.path_to_db]))

    # decode to floats
    bytes_read = 0
    data_dict = dict()
    record_bytes_float32 = tf.decode_raw(value, tf.float32)

    # 1. Read keypoint xyz
    keypoint_xyz21 = tf.reshape(tf.slice(record_bytes_float32, [bytes_read//4], [kp_xyz_entries]), [self.num_kp, 3])
    bytes_read += encoding_bytes*kp_xyz_entries
    keypoint_xyz21 /= 1000.0  # scale to meters
    keypoint_xyz21 = self.convert_kp(keypoint_xyz21)

    # calculate wrist coord: extrapolate from keypoint 16 through keypoint 0
    if self.use_wrist_coord:
        wrist_xyz = keypoint_xyz21[16, :] + 2.0*(keypoint_xyz21[0, :] - keypoint_xyz21[16, :])
        keypoint_xyz21 = tf.concat([tf.expand_dims(wrist_xyz, 0), keypoint_xyz21[1:, :]], 0)

    data_dict['keypoint_xyz21'] = keypoint_xyz21

    # 2. Read keypoint uv AND VIS
    keypoint_uv_vis21 = tf.reshape(tf.slice(record_bytes_float32, [bytes_read//4], [kp_uv_entries+kp_vis_entries]), [self.num_kp, 3])
    bytes_read += encoding_bytes*(kp_uv_entries+kp_vis_entries)
    keypoint_uv_vis21 = self.convert_kp(keypoint_uv_vis21)
    keypoint_uv21 = keypoint_uv_vis21[:, :2]
    keypoint_vis21 = tf.equal(keypoint_uv_vis21[:, 2], 1.0)

    # calculate wrist vis
    if self.use_wrist_coord:
        wrist_vis = tf.logical_or(keypoint_vis21[16], keypoint_vis21[0])
        keypoint_vis21 = tf.concat([tf.expand_dims(wrist_vis, 0), keypoint_vis21[1:]], 0)

        wrist_uv = keypoint_uv21[16, :] + 2.0*(keypoint_uv21[0, :] - keypoint_uv21[16, :])
        keypoint_uv21 = tf.concat([tf.expand_dims(wrist_uv, 0), keypoint_uv21[1:, :]], 0)

    data_dict['keypoint_vis21'] = keypoint_vis21

    if self.coord_uv_noise:
        # Draw the noise with the shape of the tensor it is added to instead
        # of a hard-coded [42, 2]. The static shape is preferred so the
        # result keeps a fully defined shape.
        static_uv_shape = keypoint_uv21.get_shape()
        if static_uv_shape.is_fully_defined():
            noise_shape = static_uv_shape.as_list()
        else:
            noise_shape = tf.shape(keypoint_uv21)
        noise = tf.truncated_normal(noise_shape, mean=0.0, stddev=self.coord_uv_noise_sigma)
        keypoint_uv21 += noise

    data_dict['keypoint_uv21'] = keypoint_uv21

    # decode to uint8
    record_bytes_uint8 = tf.decode_raw(value, tf.uint8)

    # 4. Read image
    image = tf.reshape(tf.slice(record_bytes_uint8, [bytes_read], [image_bytes]),
                       [self.image_size[0], self.image_size[1], 3])
    image = tf.cast(image, tf.float32)
    bytes_read += image_bytes

    # subtract mean
    image = image / 255.0 - 0.5
    if self.hue_aug:
        image = tf.image.random_hue(image, self.hue_aug_max)
    data_dict['image'] = image

    # ---- CONSTANTS ----
    # Camera intrinsics
    sx = 822.79041
    sy = 822.79041
    tx = 318.47345
    ty = 250.31296
    data_dict['cam_mat'] = tf.constant([[sx, 0.0, tx], [0.0, sy, ty], [0.0, 0.0, 1.0]])

    # Hand side: this dataset only contains left hands
    data_dict['hand_side'] = tf.one_hot(tf.constant(0, dtype=tf.int32), depth=2, on_value=1.0, off_value=0.0, dtype=tf.float32)

    # Sanity check that every byte of the record layout has been consumed.
    assert bytes_read == record_bytes, "Doesnt add up."

    # ---- DEPENDENT DATA ITEMS: XYZ representations ----
    # make coords relative to root joint
    kp_coord_xyz_root = keypoint_xyz21[0, :]  # this is the palm coord
    kp_coord_xyz21_rel = keypoint_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
    index_root_bone_length = tf.sqrt(tf.reduce_sum(tf.square(kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :])))
    data_dict['keypoint_scale'] = index_root_bone_length
    data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11

    # calculate local coordinates
    kp_coord_xyz21_local = bone_rel_trafo(data_dict['keypoint_xyz21_normed'])
    kp_coord_xyz21_local = tf.squeeze(kp_coord_xyz21_local)
    data_dict['keypoint_xyz21_local'] = kp_coord_xyz21_local

    # calculate viewpoint and coords in canonical coordinates
    kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(data_dict['keypoint_xyz21_normed'])
    kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
    data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
    data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

    # ---- DEPENDENT DATA ITEMS: HAND CROP ----
    if self.hand_crop:
        # keypoint 12 in (row, col) order is the crop center
        crop_center = keypoint_uv21[12, ::-1]

        # catch problem, when no valid kp available (happens almost never)
        crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)), lambda: crop_center,
                              lambda: tf.constant([0.0, 0.0]))
        crop_center.set_shape([2, ])

        if self.crop_center_noise:
            noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_center_noise_sigma)
            crop_center += noise

        crop_scale_noise = tf.constant(1.0)
        if self.crop_scale_noise:
            crop_scale_noise = tf.squeeze(tf.random_uniform([1], minval=1.0, maxval=1.2))

        # The crop extent always uses the extrapolated wrist position, even
        # when the returned keypoints keep the original first keypoint.
        if not self.use_wrist_coord:
            wrist_uv = keypoint_uv21[16, :] + 2.0*(keypoint_uv21[0, :] - keypoint_uv21[16, :])
            keypoint_uv21 = tf.concat([tf.expand_dims(wrist_uv, 0), keypoint_uv21[1:, :]], 0)

        # select visible coords only
        kp_coord_h = tf.boolean_mask(keypoint_uv21[:, 1], keypoint_vis21)
        kp_coord_w = tf.boolean_mask(keypoint_uv21[:, 0], keypoint_vis21)
        kp_coord_hw = tf.stack([kp_coord_h, kp_coord_w], 1)

        # determine size of crop (measure spatial extend of hw coords first)
        min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
        max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0), self.image_size)

        # find out larger distance wrt the center of crop
        crop_size_best = 2*tf.maximum(max_coord - crop_center, crop_center - min_coord)
        crop_size_best = tf.reduce_max(crop_size_best)
        crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0), 500.0)

        # catch problem, when no valid kp available
        crop_size_best = tf.cond(tf.reduce_all(tf.is_finite(crop_size_best)), lambda: crop_size_best,
                                 lambda: tf.constant(200.0))
        crop_size_best.set_shape([])

        # calculate necessary scaling
        scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
        scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
        scale *= crop_scale_noise
        data_dict['crop_scale'] = scale

        if self.crop_offset_noise:
            noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_offset_noise_sigma)
            crop_center += noise

        # Crop image
        img_crop = crop_image_from_xy(tf.expand_dims(image, 0), crop_center, self.crop_size, scale)
        data_dict['image_crop'] = tf.squeeze(img_crop)

        # Modify uv21 coordinates (taken from data_dict, i.e. without the
        # crop-only wrist replacement above)
        crop_center_float = tf.cast(crop_center, tf.float32)
        keypoint_uv21_u = (data_dict['keypoint_uv21'][:, 0] - crop_center_float[1]) * scale + self.crop_size // 2
        keypoint_uv21_v = (data_dict['keypoint_uv21'][:, 1] - crop_center_float[0]) * scale + self.crop_size // 2
        keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
        data_dict['keypoint_uv21'] = keypoint_uv21

        # Modify camera intrinsics
        scale = tf.reshape(scale, [1, ])
        scale_matrix = tf.dynamic_stitch([[0], [1], [2],
                                          [3], [4], [5],
                                          [6], [7], [8]], [scale, [0.0], [0.0],
                                                           [0.0], scale, [0.0],
                                                           [0.0], [0.0], [1.0]])
        scale_matrix = tf.reshape(scale_matrix, [3, 3])

        trans1 = crop_center_float[0] * scale - self.crop_size // 2
        trans2 = crop_center_float[1] * scale - self.crop_size // 2
        trans1 = tf.reshape(trans1, [1, ])
        trans2 = tf.reshape(trans2, [1, ])
        trans_matrix = tf.dynamic_stitch([[0], [1], [2],
                                          [3], [4], [5],
                                          [6], [7], [8]], [[1.0], [0.0], -trans2,
                                                           [0.0], [1.0], -trans1,
                                                           [0.0], [0.0], [1.0]])
        trans_matrix = tf.reshape(trans_matrix, [3, 3])

        data_dict['cam_mat'] = tf.matmul(trans_matrix, tf.matmul(scale_matrix, data_dict['cam_mat']))

    # ---- DEPENDENT DATA ITEMS: Scoremap from the SUBSET of 21 keypoints ----
    # create scoremaps from the subset of 2D annotation
    keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1)

    scoremap_size = self.image_size

    if self.hand_crop:
        scoremap_size = (self.crop_size, self.crop_size)

    scoremap = self.create_multiple_gaussian_map(keypoint_hw21,
                                                 scoremap_size,
                                                 self.sigma,
                                                 valid_vec=keypoint_vis21)

    if self.scoremap_dropout:
        # Drop whole keypoint channels, then multiply by the keep probability
        # again.
        scoremap = tf.nn.dropout(scoremap, self.scoremap_dropout_prob,
                                 noise_shape=[1, 1, 21])
        scoremap *= self.scoremap_dropout_prob

    data_dict['scoremap'] = scoremap

    if self.random_crop_to_size:
        # NOTE(review): 'hand_parts' and 'hand_mask' are never written to
        # data_dict in this method, so this branch raises KeyError as written
        # — confirm whether it is meant to be supported here.
        tensor_stack = tf.concat([data_dict['image'],
                                  tf.expand_dims(tf.cast(data_dict['hand_parts'], tf.float32), -1),
                                  tf.cast(data_dict['hand_mask'], tf.float32)], 2)
        s = tensor_stack.get_shape().as_list()
        tensor_stack_cropped = tf.random_crop(tensor_stack,
                                              [self.random_crop_size, self.random_crop_size, s[2]])
        data_dict = dict()  # delete everything else because the random cropping makes the data invalid anyway
        data_dict['image'], data_dict['hand_parts'], data_dict['hand_mask'] = tensor_stack_cropped[:, :, :3],\
                                                                              tf.cast(tensor_stack_cropped[:, :, 3], tf.int32),\
                                                                              tf.cast(tensor_stack_cropped[:, :, 4:], tf.int32)

    names, tensors = zip(*data_dict.items())

    if self.shuffle:
        tensors = tf.train.shuffle_batch_join([tensors],
                                              batch_size=self.batch_size,
                                              capacity=100,
                                              min_after_dequeue=50,
                                              enqueue_many=False)
    else:
        tensors = tf.train.batch_join([tensors],
                                      batch_size=self.batch_size,
                                      capacity=100,
                                      enqueue_many=False)

    return dict(zip(names, tensors))
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    """Builds a stack of Transformer encoder layers and exposes attention maps.

    Each layer runs self-attention, a dense projection with dropout and a
    residual + layer-norm, then a feed-forward block (intermediate dense with
    `intermediate_act_fn`, down-projection, dropout, residual + layer-norm).
    Architecture reference: "Attention is All You Need",
    https://arxiv.org/abs/1706.03762

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
      attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
        seq_length], with 1 for positions that can be attended to and 0 in
        positions that should not be.
      hidden_size: int. Hidden size of the Transformer.
      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
      num_attention_heads: int. Number of attention heads in the Transformer.
      intermediate_size: int. The size of the "intermediate" (a.k.a., feed
        forward) layer.
      intermediate_act_fn: function. The non-linear activation applied to the
        output of the intermediate/feed-forward layer.
      hidden_dropout_prob: float. Dropout probability for the hidden layers.
      attention_probs_dropout_prob: float. Dropout probability of the
        attention probabilities.
      initializer_range: float. Range of the initializer (stddev of truncated
        normal).
      do_return_all_layers: Whether to return every layer's output instead of
        only the last one.

    Returns:
      A 2-tuple `(hidden, attn_maps)`.
        hidden: when `do_return_all_layers` is false, the final layer output
          reshaped back to the shape of `input_tensor`; otherwise all layer
          outputs, each reshaped that way, stacked along a new axis 0.
        attn_maps: the second value returned by `attention_layer` for every
          layer, stacked along a new axis 0 (presumably the attention
          probabilities -- confirm against `attention_layer`).

    Raises:
      ValueError: `hidden_size` is not a multiple of `num_attention_heads`, or
        the last dimension of `input_tensor` differs from `hidden_size`.
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))
    size_per_head = int(hidden_size / num_attention_heads)

    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # Residual sums require the input width to equal the hidden size.
    if input_width != hidden_size:
        raise ValueError(
            "The width of the input tensor (%d) != hidden size (%d)" %
            (input_width, hidden_size))

    # Work on a 2D matrix throughout so no per-layer 3D <-> 2D reshapes are
    # needed; the result is reshaped back once at the end.
    hidden = reshape_to_matrix(input_tensor)

    attn_maps = []
    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        layer_input = hidden
        with tf.variable_scope("layer_%d" % layer_idx):
            with tf.variable_scope("attention"):
                with tf.variable_scope("self"):
                    context, probs = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=size_per_head,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                attn_maps.append(probs)

                # Linear projection to `hidden_size`, then residual with the
                # layer input.
                with tf.variable_scope("output"):
                    projected = tf.layers.dense(
                        context,
                        hidden_size,
                        kernel_initializer=create_initializer(initializer_range))
                    projected = dropout(projected, hidden_dropout_prob)
                    attention_output = layer_norm(projected + layer_input)

            # The activation is only applied to the "intermediate" layer.
            with tf.variable_scope("intermediate"):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=create_initializer(initializer_range))

            # Down-project back to `hidden_size`, then add the residual.
            with tf.variable_scope("output"):
                feed_forward = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=create_initializer(initializer_range))
                feed_forward = dropout(feed_forward, hidden_dropout_prob)
                hidden = layer_norm(feed_forward + attention_output)
        all_layer_outputs.append(hidden)

    stacked_attn_maps = tf.stack(attn_maps, 0)
    if do_return_all_layers:
        reshaped_layers = [
            reshape_from_matrix(layer, input_shape)
            for layer in all_layer_outputs
        ]
        return tf.stack(reshaped_layers, 0), stacked_attn_maps
    return reshape_from_matrix(hidden, input_shape), stacked_attn_maps
def _generate_detections_tf(cls_outputs,
                            box_outputs,
                            anchor_boxes,
                            indices,
                            classes,
                            image_id,
                            image_scale,
                            min_score_thresh=0.2,
                            max_boxes_to_draw=50,
                            soft_nms_sigma=0.0,
                            iou_threshold=0.5,
                            use_native_nms=True):
    """Decodes boxes, applies NMS and packs the survivors into detection rows.

    Args:
      cls_outputs: class logits of the N anchors selected by top-k; a sigmoid
        is applied here. The custom-NMS branch expands them into a column and
        concatenates with the [N, 4] boxes, so a rank-1 [N] tensor is expected
        there.
      box_outputs: [N, 4] box regression outputs of the selected anchors.
      anchor_boxes: anchors over all feature levels; rows are picked with
        `indices`.
      indices: [N] indices from the top-k selection.
      classes: [N] class prediction of every selected anchor.
      image_id: an integer number to specify the image id.
      image_scale: scale between original image and detector input; box
        coordinates and sizes are multiplied by it.
      min_score_thresh: score threshold passed to the native NMS. Not used by
        the custom NMS branch.
      max_boxes_to_draw: maximum number of boxes kept by the native NMS. Not
        used by the custom NMS branch.
      soft_nms_sigma: Soft-NMS sigma (Bodla et al,
        https://arxiv.org/abs/1704.04503) passed to the native NMS; 0.0 (the
        default) means standard hard NMS. Not used by the custom NMS branch.
      iou_threshold: IOU above which two boxes are considered overlapping.
      use_native_nms: if true use
        `tf.image.non_max_suppression_with_scores`, otherwise `nms_tf`.

    Returns:
      detections: a tensor with one row per kept box, laid out as
        [image_id, y, x, height, width, score, class], where class is the
        predicted class index plus one.
    """
    logging.info('Using tf version of post-processing.')
    selected_anchors = tf.gather(anchor_boxes, indices)
    probabilities = tf.math.sigmoid(cls_outputs)

    # The decoder is fed transposed (coordinate-major) tensors.
    decoded_boxes = decode_box_outputs_tf(
        tf.transpose(box_outputs, [1, 0]),
        tf.transpose(selected_anchors, [1, 0]))

    if use_native_nms:
        logging.info('Using native nms.')
        keep_idx, kept_scores = tf.image.non_max_suppression_with_scores(
            decoded_boxes,
            probabilities,
            max_boxes_to_draw,
            iou_threshold=iou_threshold,
            score_threshold=min_score_thresh,
            soft_nms_sigma=soft_nms_sigma)
        kept_boxes = tf.gather(decoded_boxes, keep_idx)
    else:
        logging.info('Using customized nms.')
        score_column = tf.expand_dims(probabilities, axis=1)
        boxes_with_scores = tf.concat([decoded_boxes, score_column], axis=1)
        keep_idx = nms_tf(boxes_with_scores, iou_threshold)
        kept_rows = tf.gather(boxes_with_scores, keep_idx)
        kept_scores = kept_rows[:, 4]
        kept_boxes = kept_rows[:, :4]

    box_height = kept_boxes[:, 2] - kept_boxes[:, 0]
    box_width = kept_boxes[:, 3] - kept_boxes[:, 1]

    image_id_column = tf.cast(
        tf.repeat(image_id, tf.size(keep_idx)), tf.float32)
    y_column = kept_boxes[:, 0] * image_scale
    x_column = kept_boxes[:, 1] * image_scale
    height_column = box_height * image_scale
    width_column = box_width * image_scale
    # Reported class ids are shifted by one relative to the predictions.
    class_column = tf.cast(tf.gather(classes, keep_idx) + 1, tf.float32)

    return tf.stack([
        image_id_column,
        y_column,
        x_column,
        height_column,
        width_column,
        kept_scores,
        class_column,
    ], axis=1)
def _generate_detections_tf(cls_outputs,
                            box_outputs,
                            anchor_boxes,
                            indices,
                            classes,
                            image_id,
                            image_scale,
                            image_size,
                            min_score_thresh=MIN_SCORE_THRESH,
                            max_boxes_to_draw=MAX_DETECTIONS_PER_IMAGE,
                            soft_nms_sigma=0.25,
                            iou_threshold=0.5):
    """Decodes boxes, runs (soft) NMS and returns clipped corner-format rows.

    Args:
      cls_outputs: class logits of the N anchors selected by top-k; a sigmoid
        is applied here before NMS.
      box_outputs: [N, 4] box regression outputs of the selected anchors.
      anchor_boxes: anchors over all feature levels; rows are picked with
        `indices`.
      indices: [N] indices from the top-k selection.
      classes: [N] class prediction of every selected anchor.
      image_id: image identifier; it is tiled once per kept box, so it must
        be something `tf.tile` accepts (e.g. a one-element list).
      image_scale: scale between original image and detector input; the
        clipped coordinates are multiplied by it.
      image_size: a tuple (height, width) or an integer for image size; must
        be non-empty. Parsed with `utils.parse_image_size`.
      min_score_thresh: score threshold used by NMS to drop boxes.
      max_boxes_to_draw: maximum number of boxes kept by NMS.
      soft_nms_sigma: Soft-NMS sigma (Bodla et al,
        https://arxiv.org/abs/1704.04503); defaults to 0.25 here. A value of
        0.0 falls back to standard (hard) NMS.
      iou_threshold: IOU above which two boxes are considered overlapping.

    Returns:
      detections: a tensor with one row per kept box, laid out as
        [image_id, ymin, xmin, ymax, xmax, score, class], where class is the
        predicted class index plus one.

    Raises:
      ValueError: `image_size` is empty/falsy.
    """
    if not image_size:
        raise ValueError(
            'tf version generate_detection needs non-empty image_size')

    logging.info('Using tf version of post-processing.')
    selected_anchors = tf.gather(anchor_boxes, indices)
    probabilities = tf.math.sigmoid(cls_outputs)
    # Apply bounding box regression to anchors.
    decoded_boxes = decode_box_outputs_tf(box_outputs, selected_anchors)

    # TF API is slightly different from paper, here we follow the paper value:
    # https://github.com/tensorflow/tensorflow/issues/40253.
    keep_idx, kept_scores = tf.image.non_max_suppression_with_scores(
        decoded_boxes,
        probabilities,
        max_boxes_to_draw,
        iou_threshold=iou_threshold,
        score_threshold=min_score_thresh,
        soft_nms_sigma=soft_nms_sigma)
    kept_boxes = tf.gather(decoded_boxes, keep_idx)

    parsed_size = utils.parse_image_size(image_size)
    image_id_column = tf.cast(
        tf.tile(image_id, tf.shape(keep_idx)), tf.float32)

    # Columns 0 and 2 are limited by parsed_size[0], columns 1 and 3 by
    # parsed_size[1]; each is clipped first and rescaled afterwards.
    upper_limits = (parsed_size[0], parsed_size[1],
                    parsed_size[0], parsed_size[1])
    coordinate_columns = [
        tf.clip_by_value(kept_boxes[:, column], 0, limit) * image_scale
        for column, limit in enumerate(upper_limits)
    ]

    # Reported class ids are shifted by one relative to the predictions.
    class_column = tf.cast(tf.gather(classes, keep_idx) + 1, tf.float32)

    return tf.stack(
        [image_id_column] + coordinate_columns + [kept_scores, class_column],
        axis=1)
def rnn(cell,
        inputs,
        sequence_length=None,
        initial_state=None,
        ff_keep_prob=1.,
        recur_keep_prob=1.,
        enforce_dropout=False,
        dtype=tf.float32,
        scope=None):
    """Unrolls `cell` over time with `tf.while_loop`, with optional dropout.

    Args:
      cell: recurrent cell; called as `cell(input_t, state)` and expected to
        expose `output_size`, `state_size` and `zero_state(batch_size, dtype)`.
      inputs: batch-major input tensor (B, T, D).
      sequence_length: optional per-example lengths, cast to int32. Rows whose
        length has been reached emit zeros and keep their previous state.
      initial_state: optional starting state; when None, `cell.zero_state` is
        used.
      ff_keep_prob: keep probability for input dropout; only applied when < 1.
        The noise shape is (1, B, D), i.e. one mask shared by all time steps.
      recur_keep_prob: keep probability for the recurrent-state mask; only
        applied when < 1. One mask is drawn before the loop and reused at
        every step.
      enforce_dropout: when not None it is passed as the `training` argument
        of `tf.layers.dropout`; when None, `tf.nn.dropout` is used instead.
      dtype: dtype for the zero state when `initial_state` is None.
      scope: variable scope name; defaults to 'RNN'.

    Returns:
      A tuple `(outputs, final_state)` where `outputs` is batch-major
      (B, T, cell output) and `final_state` is the loop's final state.

    Raises:
      ValueError: neither `initial_state` nor `dtype` is provided.
    """
    inputs = tf.transpose(inputs, [1, 0, 2])  # (B,T,D) => (T,B,D)

    parallel_iterations = 32
    if sequence_length is not None:
        sequence_length = tf.to_int32(sequence_length)

    with tf.variable_scope(scope or 'RNN'):
        dynamic_shape = tf.shape(inputs)
        time_steps, batch_size, _ = tf.unstack(dynamic_shape, 3)
        _, _, static_depth = inputs.get_shape().as_list()

        if initial_state is None:
            if not dtype:
                raise ValueError(
                    'If no initial_state is provided, dtype must be.')
            state = cell.zero_state(batch_size, dtype)
        else:
            state = initial_state

        blank_output = tf.zeros(tf.stack([batch_size, cell.output_size]),
                                inputs.dtype)

        if sequence_length is not None:
            min_sequence_length = tf.reduce_min(sequence_length)
            max_sequence_length = tf.reduce_max(sequence_length)

        time = tf.constant(0, dtype=tf.int32, name='time')

        output_ta = tf.TensorArray(dtype=inputs.dtype,
                                   size=time_steps,
                                   tensor_array_name='dynamic_rnn_output')
        input_ta = tf.TensorArray(dtype=inputs.dtype,
                                  size=time_steps,
                                  tensor_array_name='dynamic_rnn_input')

        # Input (feed-forward) dropout.
        if ff_keep_prob < 1:
            ff_noise_shape = tf.stack([1, batch_size, static_depth])
            if enforce_dropout is None:
                inputs = tf.nn.dropout(inputs, ff_keep_prob,
                                       noise_shape=ff_noise_shape)
            else:
                inputs = tf.layers.dropout(inputs, 1 - ff_keep_prob,
                                           noise_shape=ff_noise_shape,
                                           training=enforce_dropout)

        # Recurrent dropout mask; the plain integer 1 means "no dropout".
        if recur_keep_prob < 1:
            ones = tf.ones(tf.stack([batch_size, cell.output_size]))
            if enforce_dropout is None:
                recurrent_mask = tf.nn.dropout(ones, recur_keep_prob)
            else:
                recurrent_mask = tf.layers.dropout(ones, 1 - recur_keep_prob,
                                                   training=enforce_dropout)
            # Only the trailing `output_size` columns of the state are
            # dropped; the leading columns are multiplied by ones.
            untouched_blocks = cell.state_size // cell.output_size - 1
            recurrent_mask = tf.concat(
                [ones] * untouched_blocks + [recurrent_mask], 1)
        else:
            recurrent_mask = 1

        input_ta = input_ta.unstack(inputs)

        def _time_step(step, prev_state, outputs_ta):
            """Runs one loop iteration and records its output."""
            step_input = input_ta.read(step)

            def _run_cell():
                return cell(step_input, prev_state * recurrent_mask)

            def _skip_all():
                # Every sequence has ended: emit zeros, carry the state.
                return blank_output, prev_state

            def _run_and_mask_finished():
                cell_output, cell_state = _run_cell()

                def _keep_all():
                    return cell_output, cell_state

                def _mask_finished():
                    # Rows past their own length emit zeros and keep the
                    # previous state.
                    masked_output = tf.where(step >= sequence_length,
                                             blank_output, cell_output)
                    masked_state = tf.where(step >= sequence_length,
                                            prev_state, cell_state)
                    return masked_output, masked_state

                return tf.cond(step < min_sequence_length,
                               _keep_all, _mask_finished)

            if sequence_length is None:
                step_output, next_state = _run_cell()
            else:
                step_output, next_state = tf.cond(
                    step >= max_sequence_length,
                    _skip_all,
                    _run_and_mask_finished)

            outputs_ta = outputs_ta.write(step, step_output)
            return (step + 1, next_state, outputs_ta)

        _, final_state, output_final_ta = tf.while_loop(
            cond=lambda step, _state, _outputs: step < time_steps,
            body=_time_step,
            loop_vars=(time, state, output_ta),
            parallel_iterations=parallel_iterations)

        time_major_outputs = output_final_ta.stack()
        # (T,B,D) => (B,T,D)
        outputs = tf.transpose(time_major_outputs, [1, 0, 2])
        return outputs, final_state
def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
                                  max_number_of_boxes):
    """Collects the model's groundtruth lists into batched tensors for eval.

    Args:
      detection_model: A `DetectionModel` object; queried through
        `groundtruth_lists` and `groundtruth_has_field`.
      class_agnostic: Whether the detections are class_agnostic. If so, the
        one-hot classes are replaced by a single all-ones column.
      max_number_of_boxes: Max number of groundtruth boxes.

    Returns:
      A dictionary keyed by `InputDataFields` attributes. Always present:
        groundtruth_boxes: stacked per-image boxes.
        groundtruth_classes: argmax over the one-hot class axis plus the
          label id offset of 1 (i.e. 1-indexed classes).
        num_groundtruth_boxes: `max_number_of_boxes` repeated once per batch
          element.
      Present only when the model provides the corresponding field:
        groundtruth_instance_masks, groundtruth_is_crowd, groundtruth_area,
        groundtruth_keypoints, groundtruth_keypoint_visibilities,
        groundtruth_group_of: the stacked per-image tensors.
        groundtruth_labeled_classes: 1-indexed ids of the labeled classes
          per image; when there is more than one image each row is
          zero-padded to the number of classes before stacking.
    """
    input_data_fields = fields.InputDataFields()

    groundtruth_boxes = tf.stack(
        detection_model.groundtruth_lists(fields.BoxListFields.boxes))
    groundtruth_boxes_shape = tf.shape(groundtruth_boxes)

    # For class-agnostic models, groundtruth one-hot encodings collapse to
    # all ones.
    if class_agnostic:
        groundtruth_classes_one_hot = tf.ones(
            [groundtruth_boxes_shape[0], groundtruth_boxes_shape[1], 1])
    else:
        groundtruth_classes_one_hot = tf.stack(
            detection_model.groundtruth_lists(fields.BoxListFields.classes))

    label_id_offset = 1  # Applying label id offset (b/63711816)
    groundtruth_classes = (
        tf.argmax(groundtruth_classes_one_hot, axis=2) + label_id_offset)

    groundtruth = {
        input_data_fields.groundtruth_boxes: groundtruth_boxes,
        input_data_fields.groundtruth_classes: groundtruth_classes
    }

    def _copy_if_present(source_field, output_attr):
        """Stacks `source_field` into the result when the model has it."""
        if detection_model.groundtruth_has_field(source_field):
            groundtruth[getattr(input_data_fields, output_attr)] = tf.stack(
                detection_model.groundtruth_lists(source_field))

    _copy_if_present(fields.BoxListFields.masks,
                     'groundtruth_instance_masks')
    _copy_if_present(fields.BoxListFields.is_crowd,
                     'groundtruth_is_crowd')
    _copy_if_present(input_data_fields.groundtruth_area,
                     'groundtruth_area')
    _copy_if_present(fields.BoxListFields.keypoints,
                     'groundtruth_keypoints')
    _copy_if_present(fields.BoxListFields.keypoint_visibilities,
                     'groundtruth_keypoint_visibilities')
    _copy_if_present(fields.BoxListFields.group_of,
                     'groundtruth_group_of')

    if detection_model.groundtruth_has_field(
            fields.InputDataFields.groundtruth_labeled_classes):
        labeled_classes_list = detection_model.groundtruth_lists(
            fields.InputDataFields.groundtruth_labeled_classes)
        # Turn each indicator vector into the 1-indexed ids of its non-zero
        # entries.
        labeled_classes = [
            tf.where(x)[:, 0] + label_id_offset for x in labeled_classes_list
        ]
        if len(labeled_classes) > 1:
            # Rows may differ in length; zero-pad each one to the number of
            # classes so they can be stacked.
            num_classes = labeled_classes_list[0].shape[0]
            rows_to_stack = [
                tf.pad(x, [[0, num_classes - tf.shape(x)[0]]])
                for x in labeled_classes
            ]
        else:
            rows_to_stack = labeled_classes
        groundtruth[input_data_fields.groundtruth_labeled_classes] = tf.stack(
            rows_to_stack)

    groundtruth[input_data_fields.num_groundtruth_boxes] = (
        tf.tile([max_number_of_boxes], multiples=[groundtruth_boxes_shape[0]]))
    return groundtruth
def add_metric_fn_inputs(params,
                         cls_outputs,
                         box_outputs,
                         metric_fn_inputs,
                         max_detection_points=anchors.MAX_DETECTION_POINTS):
    """Selects top-k predictions and stores them in `metric_fn_inputs`.

    Note: when `params['data_format']` is 'channels_first', the entries of
    `cls_outputs` and `box_outputs` are replaced in place by their
    channels-last transposes, so the caller's dictionaries are modified.

    Args:
      params: a parameter dictionary; `min_level`, `max_level`, `batch_size`,
        `num_classes` and `data_format` are read.
      cls_outputs: an OrderDict with keys representing levels and values
        representing logits in [batch_size, height, width, num_anchors].
      box_outputs: an OrderDict with keys representing levels and values
        representing box regression targets in
        [batch_size, height, width, num_anchors * 4].
      metric_fn_inputs: a dictionary that receives the selections under the
        keys 'cls_outputs_all', 'box_outputs_all', 'indices_all' and
        'classes_all'.
      max_detection_points: an integer specifying the maximum detection
        points to keep before NMS. Keep all anchors if
        max_detection_points <= 0.
    """
    batch_size = params['batch_size']
    num_classes = params['num_classes']

    # Flatten every level to [batch, anchors, ...] and join the levels.
    per_level_cls = []
    per_level_box = []
    for level in range(params['min_level'], params['max_level'] + 1):
        if params['data_format'] == 'channels_first':
            cls_outputs[level] = tf.transpose(cls_outputs[level], [0, 2, 3, 1])
            box_outputs[level] = tf.transpose(box_outputs[level], [0, 2, 3, 1])

        per_level_cls.append(
            tf.reshape(cls_outputs[level], [batch_size, -1, num_classes]))
        per_level_box.append(
            tf.reshape(box_outputs[level], [batch_size, -1, 4]))
    all_cls = tf.concat(per_level_cls, 1)
    all_box = tf.concat(per_level_box, 1)

    if max_detection_points > 0:
        # Prune anchors and detections to only keep max_detection_points.
        # Due to some issues, top_k is currently slow in graph mode.
        flat_cls = tf.reshape(all_cls, [batch_size, -1])
        _, topk_flat_indices = tf.math.top_k(
            flat_cls, k=max_detection_points, sorted=False)
        # A flat index encodes (anchor, class) as anchor * num_classes + class.
        indices = topk_flat_indices // num_classes
        classes = topk_flat_indices % num_classes
        anchor_class_pairs = tf.stack([indices, classes], axis=2)
        selected_cls = tf.gather_nd(all_cls, anchor_class_pairs, batch_dims=1)
        selected_box = tf.gather_nd(
            all_box, tf.expand_dims(indices, 2), batch_dims=1)
    else:
        # Keep all anchors, but for each anchor keep only its best class.
        best_class = tf.math.argmax(all_cls, axis=-1, output_type=tf.int32)
        num_anchors = all_cls.shape[1]
        classes = best_class
        indices = tf.tile(
            tf.expand_dims(tf.range(num_anchors), axis=0), [batch_size, 1])
        selected_cls = tf.reduce_max(all_cls, -1)
        selected_box = all_box

    metric_fn_inputs['cls_outputs_all'] = selected_cls
    metric_fn_inputs['box_outputs_all'] = selected_box
    metric_fn_inputs['indices_all'] = indices
    metric_fn_inputs['classes_all'] = classes
def det_post_process(params: Dict[Any, Any], cls_outputs: Dict[int, tf.Tensor],
                     box_outputs: Dict[int, tf.Tensor], scales: List[float],
                     min_score_thresh, max_boxes_to_draw):
    """Post-processes the box/class predictions into per-image detections.

    Args:
      params: a parameter dictionary; `batch_size`, `min_level`, `max_level`,
        `num_scales`, `aspect_ratios`, `anchor_scale`, `image_size` and
        `num_classes` are read here, plus the optional `disable_pyfun`.
      cls_outputs: an OrderDict with keys representing levels and values
        representing logits in [batch_size, height, width, num_anchors].
      box_outputs: an OrderDict with keys representing levels and values
        representing box regression targets in
        [batch_size, height, width, num_anchors * 4].
      scales: a list of float values indicating image scale, one per image.
      min_score_thresh: A float representing the threshold for deciding when
        to remove boxes based on score.
      max_boxes_to_draw: Max number of boxes to draw.

    Returns:
      detections_batch: a batch of detection results. Each detection is a
        tensor with each row as
        [image_id, ymin, xmin, ymax, xmax, score, class]. When the batch size
        is greater than one, every image's rows are zero-padded to
        `max_boxes_to_draw` so they can be stacked.
    """
    batch_size = params['batch_size']
    if not batch_size:
        # Use combined version for dynamic batch size.
        return det_post_process_combined(params, cls_outputs, box_outputs,
                                         scales, min_score_thresh,
                                         max_boxes_to_draw)

    # TODO(tanmingxing): refactor the code to make it more explicit.
    # Placeholders that add_metric_fn_inputs overwrites; -1 keeps all anchors.
    outputs = {
        'cls_outputs_all': [None],
        'box_outputs_all': [None],
        'indices_all': [None],
        'classes_all': [None]
    }
    det_model_fn.add_metric_fn_inputs(params, cls_outputs, box_outputs,
                                      outputs, -1)

    # Create anchor_label for picking top-k predictions.
    eval_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                   params['num_scales'],
                                   params['aspect_ratios'],
                                   params['anchor_scale'],
                                   params['image_size'])
    anchor_labeler = anchors.AnchorLabeler(eval_anchors, params['num_classes'])

    # Add all detections for each input image.
    per_image_detections = []
    for image_index in range(batch_size):
        image_detections = anchor_labeler.generate_detections(
            outputs['cls_outputs_all'][image_index],
            outputs['box_outputs_all'][image_index],
            outputs['indices_all'][image_index],
            outputs['classes_all'][image_index],
            image_id=[image_index],
            image_scale=[scales[image_index]],
            image_size=params['image_size'],
            min_score_thresh=min_score_thresh,
            max_boxes_to_draw=max_boxes_to_draw,
            disable_pyfun=params.get('disable_pyfun'))
        if batch_size > 1:
            # Pad to fixed length if batch size > 1.
            missing_rows = max_boxes_to_draw - tf.shape(image_detections)[0]
            image_detections = tf.pad(image_detections,
                                      [[0, missing_rows], [0, 0]])
        per_image_detections.append(image_detections)

    return tf.stack(per_image_detections, name='detections')
def _stitch(features):
    """Stitch features on the first dimension.

    Joins the per-step 'task' token rows of every sequence into one row,
    inserting a randomly chosen connector after each step that is followed by
    a non-empty step, and collapses the per-step screen/action features
    accordingly.

    Throughout, entries greater than 1 are treated as real content; the value
    1 is written as the end-of-sequence marker and smaller values act as
    padding.

    Note: `features` is modified in place ('task' is overwritten, and several
    per-step features are replaced by their squeezed versions).

    Args:
      features: dict of tensors. 'task' is used as a rank-3 tensor
        [num_sequences, num_steps, tokens]; 'verb_refs' is indexed with four
        axes; 'verbs' must be present; 'raw_task' is optional.

    Returns:
      A new dict with the stitched features.
    """
    full_mask = tf.greater(features['task'], 1)
    # True for every step that contains at least one content token.
    step_mask = tf.reduce_any(full_mask, axis=-1)
    # Shift the step mask left by one: entry i is True iff step i + 1 has
    # content, i.e. a connector is needed after step i.
    step_mask_exclude_last = tf.pad(step_mask,
                                    [[0, 0], [0, 1]],
                                    constant_values=False)[:, 1:]
    num_sequences = common_layers.shape_list(features['task'])[0]
    num_steps = common_layers.shape_list(features['task'])[1]
    connectors = tf.constant(PADDED_CONCATENATORS)
    # Select connectors
    # One random connector index per (sequence, step) pair.
    connector_indices = tf.random.uniform(
        [num_sequences * num_steps], minval=0,
        maxval=len(PADDED_CONCATENATORS), dtype=tf.int32)
    selected_connectors = tf.reshape(
        tf.gather(connectors, connector_indices),
        [num_sequences, num_steps, len(PADDED_CONCATENATORS[0])])
    # Zero out connectors that are not followed by a non-empty step.
    selected_connectors = tf.multiply(
        selected_connectors,
        tf.expand_dims(tf.to_int32(step_mask_exclude_last), 2),
        name='connector_mask')
    # Append each step's connector tokens after that step's task tokens.
    features['task'] = tf.concat([features['task'], selected_connectors],
                                 axis=-1)
    # Number of content tokens (task + connector) preceding each step; added
    # to the '*_refs' features below.
    ref_offsets = tf.expand_dims(
        tf.cumsum(tf.reduce_sum(tf.to_int32(tf.greater(features['task'], 1)), -1),
                  exclusive=True, axis=-1), 2)
    # Flatten all steps of a sequence into one row.
    features['task'] = tf.reshape(features['task'], [num_sequences, -1])
    full_mask = tf.greater(features['task'], 1)
    full_mask_int = tf.to_int32(full_mask)
    # Left-compact the content tokens of each row: destination positions are
    # 0..count-1 per row, values are the content tokens in row-major order.
    indices = tf.where(tf.sequence_mask(lengths=tf.reduce_sum(full_mask_int, -1)))
    values = tf.boolean_mask(tf.reshape(features['task'], [-1]),
                             tf.reshape(full_mask, [-1]))
    sparse_task = tf.sparse.SparseTensor(
        indices=indices, values=values,
        dense_shape=tf.to_int64(tf.shape(features['task'])))
    # Stitch task and raw_task
    stitched_features = {}
    stitched_features['task'] = tf.sparse_tensor_to_dense(sparse_task)
    # Trim the columns to the longest compacted row.
    max_len = tf.reduce_max(
        tf.reduce_sum(tf.to_int32(tf.greater(stitched_features['task'], 1)), -1))
    stitched_features['task'] = stitched_features['task'][:, :max_len]
    if 'raw_task' in features:
        # String counterpart: the same connector indices pick the connector
        # strings, masked to '' where no connector is needed.
        connector_strs = tf.reshape(
            tf.gather(tf.constant(CONCATENATORS_STR), connector_indices),
            [num_sequences, num_steps])
        masked_connector_strs = tf.where(
            step_mask_exclude_last,
            connector_strs, tf.fill(tf.shape(connector_strs), ''))
        # Join each step's raw text with its connector, then join all steps.
        stitched_features['raw_task'] = tf.strings.reduce_join(
            tf.strings.reduce_join(tf.concat([
                tf.expand_dims(features['raw_task'], 2),
                tf.expand_dims(masked_connector_strs, 2)], axis=2), axis=-1), -1)
    # Stitch screen sequences
    # A step counts as an action when its verb reference end exceeds its
    # start.
    action_lengths = tf.reduce_sum(tf.to_int32(
        tf.greater(features['verb_refs'][:, :, 0, 1],
                   features['verb_refs'][:, :, 0, 0])), -1)
    max_action_length = tf.reduce_max(action_lengths)

    def _pad(tensor, padding_value=0):
        """Truncates axis 1 to `max_action_length` and appends one slot."""
        shape_list = common_layers.shape_list(tensor)
        assert len(shape_list) >= 2
        padding_list = [[0, 0], [0, 1]] + [[0, 0]] * (len(shape_list) - 2)
        return tf.pad(tensor[:, :max_action_length],
                      padding_list, constant_values=padding_value)

    for key in features.keys():
        if key.endswith('_refs'):
            features[key] = tf.squeeze(features[key], 2)
            # Only shift references whose two entries differ; equal entries
            # are zeroed out.
            ref_mask = tf.expand_dims(tf.to_int32(
                tf.not_equal(features[key][:, :, 0],
                             features[key][:, :, 1])), 2)
            stitched_features[key] = tf.multiply(
                (features[key] + ref_offsets), ref_mask, name='ref_mask')
            stitched_features[key] = _pad(stitched_features[key])
        # NOTE(review): the '*_refs' names in this list are already handled
        # by the branch above, so they never reach this one.
        elif key in ['verbs', 'objects', 'consumed', 'obj_dom_pos',
                     'obj_text', 'obj_type', 'obj_clickable', 'obj_screen_pos',
                     'verb_refs', 'obj_refs', 'input_refs', 'obj_dom_dist']:
            features[key] = tf.squeeze(features[key], 2)
            stitched_features[key] = features[key]
            # 'obj_type' pads with -1, every other feature with 0.
            stitched_features[key] = _pad(
                stitched_features[key],
                padding_value=-1 if key == 'obj_type' else 0)
        elif key not in ['task', 'raw_task']:
            # Any other feature: keep only its first step.
            stitched_features[key] = features[key][:, 0]
    # Append eos to 'task'
    stitched_features['task'] = tf.pad(stitched_features['task'],
                                       [[0, 0], [0, 1]])
    task_mask = tf.to_int32(tf.greater(stitched_features['task'], 1))
    # The mask shifted right by one (with a leading 1) minus the mask is 1
    # exactly at the first non-content position, where the eos id 1 is added.
    task_eos_mask = tf.pad(task_mask, [[0, 0], [1, 0]], constant_values=1)[:, :-1]
    stitched_features['task'] = stitched_features['task'] + (
        task_eos_mask - task_mask)
    # Append eos
    # Same shifted-mask construction, applied to 'verbs'.
    verb_mask = tf.to_int32(tf.greater(stitched_features['verbs'], 1))
    verb_eos_mask = tf.pad(verb_mask, [[0, 0], [1, 0]], constant_values=1)[:, :-1]
    verb_eos = verb_eos_mask - verb_mask
    stitched_features['verbs'] = stitched_features['verbs'] + verb_eos
    # Append last step refs to 'verb_refs'
    # Column of the eos marker (value 1) in each task row.
    task_lengths = tf.where(tf.equal(stitched_features['task'], 1))[:, 1]
    eos_pos = tf.to_int32(tf.stack([task_lengths, task_lengths + 1], axis=1))
    action_mask = tf.to_int32(
        tf.sequence_mask(action_lengths, max_action_length + 1))
    action_and_eos_mask = tf.pad(action_mask, [[0, 0], [1, 0]],
                                 constant_values=1)[:, :-1]
    # 1 only at the slot right after the last action of each sequence.
    verb_ref_eos = action_and_eos_mask - action_mask
    eos_refs = tf.multiply(
        tf.tile(tf.expand_dims(eos_pos, 1), [1, max_action_length + 1, 1]),
        tf.expand_dims(verb_ref_eos, 2), name='verb_ref_eos')
    stitched_features['verb_refs'] += eos_refs
    return stitched_features
def add_input_distortions(flip_left_right, random_crop, random_scale,
                          random_brightness):
    """Creates the operations to apply the specified distortions.

    Builds a small graph that decodes a JPEG string, enlarges it, takes a
    random crop of the model's input size, optionally mirrors it and finally
    scales its brightness. Running training images through such distortions
    exposes the model to the kind of variation expected in real data.

    Cropping and scaling
    ~~~~~~~~~~~~~~~~~~~~
    The decoded image is first resized (bilinear) to the model input size
    multiplied by `margin * scale`, where

      margin = 1 + random_crop / 100
      scale  = uniform random value in [1, 1 + random_scale / 100]

    and a window of exactly the model input size is then cut out at a random
    position. With both percentages at zero the resize target equals the
    model input size and the crop keeps the whole image.

    Args:
      flip_left_right: Boolean whether to randomly mirror images horizontally.
      random_crop: Integer percentage setting the total margin used around
        the crop box.
      random_scale: Integer percentage of how much to vary the scale by.
      random_brightness: Integer percentage; pixel values are multiplied by a
        uniform random factor in
        [1 - random_brightness / 100, 1 + random_brightness / 100].

    Returns:
      The jpeg input layer (a string placeholder named 'DistortJPGInput') and
      the distorted result tensor (named 'DistortResult', with a leading
      batch axis of size one).
    """
    jpeg_data = tf.placeholder(tf.string, name='DistortJPGInput')
    image_uint = tf.image.decode_jpeg(jpeg_data, channels=MODEL_INPUT_DEPTH)
    image_float = tf.cast(image_uint, dtype=tf.float32)
    image_batch = tf.expand_dims(image_float, 0)

    # Enlarge by the fixed crop margin times a random extra scale.
    margin_scale = 1.0 + (random_crop / 100.0)
    resize_scale = 1.0 + (random_scale / 100.0)
    margin_factor = tf.constant(margin_scale)
    random_resize_factor = tf.random_uniform(tensor_shape.scalar(),
                                             minval=1.0,
                                             maxval=resize_scale)
    total_scale = tf.multiply(margin_factor, random_resize_factor)
    enlarged_width = tf.multiply(total_scale, MODEL_INPUT_WIDTH)
    enlarged_height = tf.multiply(total_scale, MODEL_INPUT_HEIGHT)
    enlarged_shape = tf.cast(tf.stack([enlarged_height, enlarged_width]),
                             dtype=tf.int32)
    enlarged_batch = tf.image.resize_bilinear(image_batch, enlarged_shape)
    enlarged_image = tf.squeeze(enlarged_batch, squeeze_dims=[0])

    # Cut a model-sized window out of the enlarged image.
    cropped_image = tf.random_crop(
        enlarged_image,
        [MODEL_INPUT_HEIGHT, MODEL_INPUT_WIDTH, MODEL_INPUT_DEPTH])

    maybe_flipped = (tf.image.random_flip_left_right(cropped_image)
                     if flip_left_right else cropped_image)

    brightness_delta = random_brightness / 100.0
    brightness_factor = tf.random_uniform(tensor_shape.scalar(),
                                          minval=1.0 - brightness_delta,
                                          maxval=1.0 + brightness_delta)
    brightened_image = tf.multiply(maybe_flipped, brightness_factor)

    distort_result = tf.expand_dims(brightened_image, 0, name='DistortResult')
    return jpeg_data, distort_result
def __init__(self,
             session,
             player_id,
             state_representation_size,
             num_actions,
             hidden_layers_sizes=128,
             replay_buffer_capacity=10000,
             batch_size=128,
             replay_buffer_class=ReplayBuffer,
             learning_rate=0.01,
             update_target_network_every=1000,
             learn_every=10,
             discount_factor=1.0,
             min_buffer_size_to_learn=1000,
             epsilon_start=1.0,
             epsilon_end=0.1,
             epsilon_decay_duration=int(1e6),
             optimizer_str="sgd",
             loss_str="mse"):
    """Initialize the DQN agent and build its TensorFlow graph.

    Stores the hyperparameters, creates the replay buffer, builds the
    Q-network and target Q-network (both `snt.nets.MLP`), the loss and the
    training op, and finally calls `self._initialize()`.

    Args:
      session: stored as `self._session`.
      player_id: stored as `self.player_id`.
      state_representation_size: width of the info-state placeholders.
      num_actions: number of actions; also the size of the network output.
      hidden_layers_sizes: an int or a list of ints with the hidden layer
        sizes of the MLP.
      replay_buffer_capacity: int passed to `replay_buffer_class`.
      batch_size: stored as `self._batch_size`.
      replay_buffer_class: callable building the replay buffer.
      learning_rate: learning rate handed to the optimizer.
      update_target_network_every: stored on the instance.
      learn_every: stored on the instance.
      discount_factor: multiplier of the next-state value in the target.
      min_buffer_size_to_learn: stored on the instance.
      epsilon_start: stored on the instance.
      epsilon_end: stored on the instance.
      epsilon_decay_duration: stored on the instance.
      optimizer_str: "adam" or "sgd".
      loss_str: "mse" or "huber".

    Raises:
      ValueError: if `replay_buffer_capacity` is not an int, or if
        `loss_str` / `optimizer_str` is not one of the supported values.
    """
    # locals() has to be evaluated before any other local is bound so that
    # it holds exactly the constructor arguments; this lets the instance be
    # copied with no hyperparameter change.
    self._kwargs = locals()

    self.player_id = player_id
    self._session = session
    self._num_actions = num_actions
    if isinstance(hidden_layers_sizes, int):
        hidden_layers_sizes = [hidden_layers_sizes]
    self._layer_sizes = hidden_layers_sizes + [num_actions]

    self._batch_size = batch_size
    self._update_target_network_every = update_target_network_every
    self._learn_every = learn_every
    self._min_buffer_size_to_learn = min_buffer_size_to_learn
    self._discount_factor = discount_factor

    self._epsilon_start = epsilon_start
    self._epsilon_end = epsilon_end
    self._epsilon_decay_duration = epsilon_decay_duration

    # TODO(author6) Allow for optional replay buffer config.
    if not isinstance(replay_buffer_capacity, int):
        raise ValueError("Replay buffer capacity not an integer.")
    self._replay_buffer = replay_buffer_class(replay_buffer_capacity)
    self._prev_timestep = None
    self._prev_action = None

    # Counts steps; starts at zero.
    self._step_counter = 0

    # Loss of the most recent update step (none so far).
    self._last_loss_value = None

    # Placeholders feeding the Q-network update.
    state_shape = [None, state_representation_size]
    self._info_state_ph = tf.placeholder(
        shape=state_shape, dtype=tf.float32, name="info_state_ph")
    self._action_ph = tf.placeholder(
        shape=[None], dtype=tf.int32, name="action_ph")
    self._reward_ph = tf.placeholder(
        shape=[None], dtype=tf.float32, name="reward_ph")
    self._is_final_step_ph = tf.placeholder(
        shape=[None], dtype=tf.float32, name="is_final_step_ph")
    self._next_info_state_ph = tf.placeholder(
        shape=state_shape, dtype=tf.float32, name="next_info_state_ph")
    self._legal_actions_mask_ph = tf.placeholder(
        shape=[None, num_actions],
        dtype=tf.float32,
        name="legal_actions_mask_ph")

    # Online network, evaluated on the current state.
    self._q_network = snt.nets.MLP(output_sizes=self._layer_sizes)
    self._q_values = self._q_network(self._info_state_ph)

    # Target network, evaluated on the next state. Gradients are stopped so
    # that learning never updates the target network.
    self._target_q_network = snt.nets.MLP(output_sizes=self._layer_sizes)
    self._target_q_values = tf.stop_gradient(
        self._target_q_network(self._next_info_state_ph))

    self._update_target_network = self._create_target_network_update_op(
        self._q_network, self._target_q_network)

    # Loss. ILLEGAL_ACTION_LOGITS_PENALTY is added to the target value of
    # every illegal action before the max is taken, so that illegal actions
    # are not picked as the bootstrap target.
    illegal_mask = 1 - self._legal_actions_mask_ph
    penalties = illegal_mask * ILLEGAL_ACTION_LOGITS_PENALTY
    best_next_q = tf.reduce_max(
        tf.math.add(tf.stop_gradient(self._target_q_values), penalties),
        axis=-1)
    not_final = 1 - self._is_final_step_ph
    target = (
        self._reward_ph + not_final * self._discount_factor * best_next_q)

    # Select Q(s, a) for the action taken in each row of the batch.
    row_ids = tf.range(tf.shape(self._q_values)[0])
    taken = tf.stack([row_ids, self._action_ph], axis=-1)
    predictions = tf.gather_nd(self._q_values, taken)

    if loss_str == "mse":
        loss_fn = tf.losses.mean_squared_error
    elif loss_str == "huber":
        loss_fn = tf.losses.huber_loss
    else:
        raise ValueError("Not implemented, choose from 'mse', 'huber'.")
    self._loss = tf.reduce_mean(
        loss_fn(labels=target, predictions=predictions))

    if optimizer_str == "adam":
        optimizer_cls = tf.train.AdamOptimizer
    elif optimizer_str == "sgd":
        optimizer_cls = tf.train.GradientDescentOptimizer
    else:
        raise ValueError("Not implemented, choose from 'adam' and 'sgd'.")
    self._optimizer = optimizer_cls(learning_rate=learning_rate)

    self._learn_step = self._optimizer.minimize(self._loss)
    self._initialize()
def _iou_per_anchor(pred_boxes: FloatType,
                    target_boxes: FloatType,
                    iou_type: Text = 'iou') -> tf.Tensor:
    """Computes the IoU (or its GIoU/DIoU/CIoU variant) for a single anchor.

    All quantities are computed element-wise over the coordinate tensors,
    so the IoU, GIoU and DIoU results have the same shape as each
    coordinate (the CIoU result additionally depends on `_get_v`).

    Args:
      pred_boxes: predicted boxes, given as the four coordinate tensors
        [y_min, x_min, y_max, x_max].
      target_boxes: target boxes, given as the four coordinate tensors
        [y_min, x_min, y_max, x_max].
      iou_type: one of ['iou', 'ciou', 'diou', 'giou'].

    Returns:
      A float `Tensor` with the requested IoU value (not a loss).
    """
    # t_ denotes target boxes and p_ denotes predicted boxes.
    t_ymin, t_xmin, t_ymax, t_xmax = target_boxes
    p_ymin, p_xmin, p_ymax, p_xmax = pred_boxes

    # Negative extents (degenerate boxes) are clamped to zero.
    zero = tf.convert_to_tensor(0.0, t_ymin.dtype)
    p_width = tf.maximum(zero, p_xmax - p_xmin)
    p_height = tf.maximum(zero, p_ymax - p_ymin)
    t_width = tf.maximum(zero, t_xmax - t_xmin)
    t_height = tf.maximum(zero, t_ymax - t_ymin)
    p_area = p_width * p_height
    t_area = t_width * t_height

    intersect_ymin = tf.maximum(p_ymin, t_ymin)
    intersect_xmin = tf.maximum(p_xmin, t_xmin)
    intersect_ymax = tf.minimum(p_ymax, t_ymax)
    intersect_xmax = tf.minimum(p_xmax, t_xmax)
    intersect_width = tf.maximum(zero, intersect_xmax - intersect_xmin)
    intersect_height = tf.maximum(zero, intersect_ymax - intersect_ymin)
    intersect_area = intersect_width * intersect_height

    union_area = p_area + t_area - intersect_area
    # divide_no_nan yields 0 when the union is empty.
    iou_v = tf.math.divide_no_nan(intersect_area, union_area)
    if iou_type == 'iou':
        return iou_v  # iou is the simplest form.

    # Smallest box enclosing both the prediction and the target.
    enclose_ymin = tf.minimum(p_ymin, t_ymin)
    enclose_xmin = tf.minimum(p_xmin, t_xmin)
    enclose_ymax = tf.maximum(p_ymax, t_ymax)
    enclose_xmax = tf.maximum(p_xmax, t_xmax)

    assert iou_type in ('giou', 'diou', 'ciou')
    if iou_type == 'giou':  # giou is the generalized iou.
        enclose_width = tf.maximum(zero, enclose_xmax - enclose_xmin)
        enclose_height = tf.maximum(zero, enclose_ymax - enclose_ymin)
        enclose_area = enclose_width * enclose_height
        giou_v = iou_v - tf.math.divide_no_nan(
            (enclose_area - union_area), enclose_area)
        return giou_v

    assert iou_type in ('diou', 'ciou')
    # Stack the (y, x) components on a new last axis and reduce over that
    # axis only. Without an explicit axis tf.linalg.norm reduces over the
    # whole tensor, which would merge the distances of all boxes into a
    # single scalar whenever the coordinates are not scalars.
    p_center = tf.stack([(p_ymin + p_ymax) / 2, (p_xmin + p_xmax) / 2],
                        axis=-1)
    t_center = tf.stack([(t_ymin + t_ymax) / 2, (t_xmin + t_xmax) / 2],
                        axis=-1)
    euclidean = tf.linalg.norm(t_center - p_center, axis=-1)
    diag_length = tf.linalg.norm(
        tf.stack(
            [enclose_ymax - enclose_ymin, enclose_xmax - enclose_xmin],
            axis=-1),
        axis=-1)
    diou_v = iou_v - tf.math.divide_no_nan(euclidean**2, diag_length**2)
    if iou_type == 'diou':  # diou is the distance iou.
        return diou_v

    assert iou_type == 'ciou'
    v = _get_v(p_height, p_width, t_height, t_width)
    alpha = tf.math.divide_no_nan(v, ((1 - iou_v) + v))
    return diou_v - alpha * v  # the last one is ciou.
def _using_motion_vector_with_distortion(depth,
                                         translation,
                                         rotation_angles,
                                         intrinsic_mat,
                                         distortion_coeff=0.0):
    """A helper for using_motion_vector. See docstring therein.

    Args:
      depth: rank-3 tensor, unpacked below as [batch, height, width].
      translation: rank-2 tensor (one 3-vector per image) or rank-4 tensor
        whose last dimension is 3 (transposed below from
        [batch, height, width, 3] to channel-first).
      rotation_angles: rank-2 or rank-4 tensor passed to
        `transform_utils.matrix_from_angles`.
      intrinsic_mat: batch of matrices contracted as 'bij' in the einsums
        below; it is inverted with `tf.linalg.inv`.
      distortion_coeff: coefficient handed to the distortion-scale helpers.

    Returns:
      A tuple (x, y, z) of [batch, height, width] tensors: the projected
      pixel coordinates and the transformed depth.

    Raises:
      ValueError: if `translation` does not have rank 2 or 4, if its last
        dimension is not 3, or if `rotation_angles` does not have rank 2
        or 4.
    """
    if translation.shape.ndims not in (2, 4):
        raise ValueError('\'translation\' should have rank 2 or 4, not %d' %
                         translation.shape.ndims)
    if translation.shape[-1] != 3:
        # Report the dimension that was actually checked (the last one).
        raise ValueError('translation\'s last dimension should be 3, not %d' %
                         translation.shape[-1])
    if rotation_angles.shape.ndims not in (2, 4):
        # Without this check an unsupported rank would only surface later
        # as an unbound-variable error on `pcoords`.
        raise ValueError('\'rotation_angles\' should have rank 2 or 4, not %s' %
                         rotation_angles.shape.ndims)
    if translation.shape.ndims == 2:
        # Per-image translation: add singleton height and width dimensions
        # so that it broadcasts over the pixel grid.
        translation = tf.expand_dims(tf.expand_dims(translation, 1), 1)

    _, height, width = tf.unstack(tf.shape(depth))
    # Homogeneous pixel grid (x, y, 1).
    grid = tf.squeeze(
        tf.stack(tf.meshgrid(tf.range(width), tf.range(height), (1,))),
        axis=3)  # 3 x height x width
    grid = tf.to_float(grid)
    intrinsic_mat_inv = tf.linalg.inv(intrinsic_mat)

    normalized_grid = tf.einsum('bij,jhw->bihw', intrinsic_mat_inv, grid)
    radii_squared = tf.reduce_sum(
        tf.square(normalized_grid[:, :2, :, :]), axis=1)

    undistortion_factor = quadratic_inverse_distortion_scale(
        distortion_coeff, radii_squared)
    # Only the x and y components are rescaled; the homogeneous component
    # is multiplied by one.
    undistortion_factor = tf.stack([
        undistortion_factor, undistortion_factor,
        tf.ones_like(undistortion_factor)
    ],
                                   axis=1)
    normalized_grid *= undistortion_factor

    rot_mat = transform_utils.matrix_from_angles(rotation_angles)
    # We have to treat separately the case of a per-image rotation vector and
    # a per-image rotation field, because the broadcasting capabilities of
    # einsum are limited.
    if rotation_angles.shape.ndims == 2:
        # The calculation here is identical to the one in inverse_warp
        # above. However, we use einsum for better clarity. Under the hood,
        # einsum performs the reshaping and invocation of BatchMatMul,
        # instead of doing it manually, as in inverse_warp.
        pcoords = tf.einsum('bij,bjhw,bhw->bihw', rot_mat, normalized_grid,
                            depth)
    else:  # rotation_angles.shape.ndims == 4, validated above.
        # We push the H and W dimensions to the end, and transpose the
        # rotation matrix elements (as noted above).
        rot_mat = tf.transpose(rot_mat, [0, 3, 4, 1, 2])
        pcoords = tf.einsum('bijhw,bjhw,bhw->bihw', rot_mat, normalized_grid,
                            depth)

    pcoords += tf.transpose(translation, [0, 3, 1, 2])

    # Perspective division, then re-apply the lens distortion.
    x, y, z = tf.unstack(pcoords, axis=1)
    x /= z
    y /= z
    scale = quadraric_distortion_scale(distortion_coeff,
                                       tf.square(x) + tf.square(y))
    x *= scale
    y *= scale

    # Back to pixel coordinates.
    pcoords = tf.einsum('bij,bjhw->bihw', intrinsic_mat,
                        tf.stack([x, y, tf.ones_like(x)], axis=1))
    x, y, _ = tf.unstack(pcoords, axis=1)

    return x, y, z
def multilevel_crop_and_resize(features, boxes, output_size=7):
    """Crops and resizes boxes from a multilevel feature pyramid.

    Each box is assigned to a pyramid level according to its size, rescaled
    to that level's coordinates, and then cropped and resized to
    (output_size, output_size) by `selective_crop_and_resize`.

    Args:
      features: A dictionary with key as pyramid level and value as
        features. The features are in shape of
        [batch_size, height_l, width_l, num_filters].
      boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row
        represents a box with [y1, x1, y2, x2] in un-normalized coordinates.
      output_size: A scalar to indicate the output crop size.

    Returns:
      A 5-D tensor representing feature crop of shape
      [batch_size, num_boxes, output_size, output_size, num_filters].
    """
    with tf.name_scope('multilevel_crop_and_resize'):
        min_level = min(features)
        max_level = max(features)
        _, padded_height, padded_width, _ = (
            features[min_level].get_shape().as_list())

        # Pad every level to the size of the finest one and stack them into
        # [batch_size, levels, height, width, num_filters].
        features_all = tf.stack(
            [
                tf.image.pad_to_bounding_box(
                    features[level], 0, 0, padded_height, padded_width)
                for level in range(min_level, max_level + 1)
            ],
            axis=1)

        # Pick a level per box from the square root of its area, then clamp
        # it to the levels that are available.
        widths = boxes[:, :, 3] - boxes[:, :, 1]
        heights = boxes[:, :, 2] - boxes[:, :, 0]
        areas_sqrt = tf.sqrt(heights * widths)
        raw_levels = tf.floordiv(
            tf.log(tf.div(areas_sqrt, 224.0)), tf.log(2.0)) + 4.0
        levels = tf.cast(raw_levels, dtype=tf.int32)
        levels = tf.minimum(max_level, tf.maximum(levels, min_level))

        # Express box position and size in the units of the assigned level.
        scale_to_level = tf.cast(
            tf.pow(tf.constant(2.0), tf.cast(levels, tf.float32)),
            dtype=boxes.dtype)
        boxes /= tf.expand_dims(scale_to_level, axis=2)
        widths = widths / scale_to_level
        heights = heights / scale_to_level
        # Boxes become [y, x, height, width].
        boxes = tf.concat(
            [
                boxes[:, :, 0:2],
                tf.expand_dims(heights, -1),
                tf.expand_dims(widths, -1),
            ],
            axis=-1)

        # Make the level index zero-based: [0, max_level - min_level].
        levels -= min_level
        level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32))
        # Last valid (y, x) index of each box's feature map.
        y_limit = tf.expand_dims(
            [[tf.cast(padded_height, tf.float32)]] / level_strides - 1,
            axis=-1)
        x_limit = tf.expand_dims(
            [[tf.cast(padded_width, tf.float32)]] / level_strides - 1,
            axis=-1)
        boundary = tf.cast(
            tf.concat([y_limit, x_limit], axis=-1), boxes.dtype)

        return selective_crop_and_resize(
            features_all, boxes, levels, boundary, output_size)
def _add_seq2seq(self):
    """Add the whole sequence-to-sequence model to the graph.

    Builds, in this order: the shared embedding lookup, the encoder, the
    reduced initial decoder state, the decoder, the output projection and
    the per-step vocabulary distributions. In 'train' / 'eval' mode the loss
    (plus the coverage and total losses when `hps.coverage` is set) is
    added; in 'decode' mode the top-k ids and log-probabilities of the
    single decoder step are added instead.

    The results are stored on `self` (`_enc_states`, `_dec_in_state`,
    `_dec_out_state`, `attn_dists`, `p_gens`, `coverage`, `_loss`,
    `_coverage_loss`, `_total_loss`, `_topk_ids`, `_topk_log_probs`);
    nothing is returned.
    """
    hps = self._hps
    vsize = self._vocab.size()  # size of the vocabulary

    with tf.variable_scope('seq2seq'):
        # Some initializers
        self.rand_unif_init = tf.random_uniform_initializer(
            -hps.rand_unif_init_mag, hps.rand_unif_init_mag, seed=123)
        self.trunc_norm_init = tf.truncated_normal_initializer(
            stddev=hps.trunc_norm_init_std)

        # Add embedding matrix (shared by the encoder and decoder inputs)
        with tf.variable_scope('embedding'):
            embedding = tf.get_variable('embedding', [vsize, hps.emb_dim],
                                        dtype=tf.float32,
                                        initializer=self.trunc_norm_init)
            if hps.mode == "train":
                self._add_emb_vis(embedding)  # add to tensorboard
            # tensor with shape (batch_size, max_enc_steps, emb_size)
            emb_enc_inputs = tf.nn.embedding_lookup(
                embedding, self._enc_batch
            )
            # list length max_dec_steps containing shape
            # (batch_size, emb_size)
            emb_dec_inputs = [
                tf.nn.embedding_lookup(embedding, x)
                for x in tf.unstack(self._dec_batch, axis=1)
            ]

        # Add the encoder.
        enc_outputs, fw_st, bw_st = self._add_encoder(
            emb_enc_inputs, self._enc_lens)
        self._enc_states = enc_outputs

        # Our encoder is bidirectional and our decoder is unidirectional so
        # we need to reduce the final encoder hidden state to the right size
        # to be the initial decoder hidden state
        self._dec_in_state = self._reduce_states(fw_st, bw_st)

        # Add the decoder.
        with tf.variable_scope('decoder'):
            decoder_outputs, self._dec_out_state, self.attn_dists, self.p_gens, self.coverage = self._add_decoder(
                emb_dec_inputs)

        # Add the output projection to obtain the vocabulary distribution
        with tf.variable_scope('output_projection'):
            w = tf.get_variable('w', [hps.hidden_dim, vsize],
                                dtype=tf.float32,
                                initializer=self.trunc_norm_init)
            # NOTE(review): w_t is not referenced again in this method.
            w_t = tf.transpose(w)
            v = tf.get_variable('v', [vsize],
                                dtype=tf.float32,
                                initializer=self.trunc_norm_init)
            # vocab_scores is the vocabulary distribution before applying
            # softmax. Each entry on the list corresponds to one decoder
            # step
            vocab_scores = []
            for i, output in enumerate(decoder_outputs):
                if i > 0:
                    # Share the variables of this scope across decoder
                    # steps.
                    tf.get_variable_scope().reuse_variables()
                vocab_scores.append(tf.nn.xw_plus_b(
                    output, w, v))  # apply the linear layer

            # The vocabulary distributions. List length max_dec_steps of
            # (batch_size, vsize) arrays. The words are in the order they
            # appear in the vocabulary file.
            vocab_dists = [
                tf.nn.softmax(s) for s in vocab_scores
            ]

        # For pointer-generator model, calc final distribution from copy
        # distribution and vocabulary distribution
        if FLAGS.pointer_gen:
            final_dists = self._calc_final_dist(vocab_dists, self.attn_dists)
        else:  # final distribution is just vocabulary distribution
            final_dists = vocab_dists

        if hps.mode in ['train', 'eval']:
            # Calculate the loss
            with tf.variable_scope('loss'):
                if FLAGS.pointer_gen:
                    # Calculate the loss per step
                    # This is fiddly; we use tf.gather_nd to pick out the
                    # probabilities of the gold target words
                    # will be list length max_dec_steps containing shape
                    # (batch_size)
                    loss_per_step = []
                    batch_nums = tf.range(
                        0, limit=hps.batch_size)  # shape (batch_size)
                    for dec_step, dist in enumerate(final_dists):
                        # The indices of the target words. shape
                        # (batch_size)
                        targets = self._target_batch[:, dec_step]
                        indices = tf.stack((batch_nums, targets),
                                           axis=1)  # shape (batch_size, 2)
                        # shape (batch_size). prob of correct words on this
                        # step
                        gold_probs = tf.gather_nd(
                            dist, indices
                        )
                        # Negative log-likelihood of the gold word.
                        losses = -tf.log(gold_probs)
                        loss_per_step.append(losses)

                    # Apply dec_padding_mask and get loss
                    self._loss = _mask_and_avg(loss_per_step,
                                               self._dec_padding_mask)

                else:  # baseline model
                    # this applies softmax internally
                    self._loss = tf.contrib.seq2seq.sequence_loss(
                        tf.stack(vocab_scores, axis=1), self._target_batch,
                        self._dec_padding_mask
                    )

                tf.summary.scalar('loss', self._loss)

                # Calculate coverage loss from the attention distributions
                if hps.coverage:
                    with tf.variable_scope('coverage_loss'):
                        self._coverage_loss = _coverage_loss(
                            self.attn_dists, self._dec_padding_mask)
                        tf.summary.scalar('coverage_loss',
                                          self._coverage_loss)
                    # Total loss = main loss + weighted coverage loss.
                    self._total_loss = self._loss + hps.cov_loss_wt * self._coverage_loss
                    tf.summary.scalar('total_loss', self._total_loss)

    if hps.mode == "decode":
        # We run decode beam search mode one decoder step at a time
        # final_dists is a singleton list containing shape
        # (batch_size, extended_vsize)
        assert len(
            final_dists
        ) == 1
        final_dists = final_dists[0]
        # take the k largest probs. note batch_size=beam_size in decode mode
        topk_probs, self._topk_ids = tf.nn.top_k(
            final_dists, hps.batch_size * 2
        )
        self._topk_log_probs = tf.log(topk_probs)
def selective_crop_and_resize(features, boxes, box_levels, boundaries,
                              output_size=7, sample_offset=0.5):
    """Crop and resize boxes on a set of feature maps.

    Given multiple features maps indexed by different levels, and a set of
    boxes where each box is mapped to a certain level, it selectively crops
    and resizes boxes from the corresponding feature maps to generate the box
    features.

    We follow the ROIAlign technique (see
    https://arxiv.org/pdf/1703.06870.pdf, figure 3 for reference).
    Specifically, for each feature map, we select an
    (output_size, output_size) set of pixels corresponding to the box
    location, and then use bilinear interpolation to select the feature value
    for each pixel.

    For performance, we perform the gather and interpolation on all layers as
    a single operation. In this op the multi-level features are first stacked
    and gathered into [2*output_size, 2*output_size] feature points. Then
    bilinear interpolation is performed on the gathered feature points to
    generate [output_size, output_size] RoIAlign feature map.

    Here is the step-by-step algorithm:
      1. The multi-level features are gathered into a
         [batch_size, num_boxes, output_size*2, output_size*2, num_filters]
         Tensor. The Tensor contains four neighboring feature points for each
         vertex in the output grid.
      2. Compute the interpolation kernel of shape
         [batch_size, num_boxes, output_size*2, output_size*2]. The last 2
         axes can be seen as stacking 2x2 interpolation kernels for all
         vertices in the output grid.
      3. Element-wise multiply the gathered features and interpolation
         kernel. Then apply 2x2 average pooling to reduce spatial dimension
         to output_size.

    Args:
      features: a 5-D tensor of shape
        [batch_size, num_levels, max_height, max_width, num_filters] where
        cropping and resizing are based. Its shape is read statically via
        `get_shape().as_list()`.
      boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
        information of each box w.r.t. the corresponding feature map.
        boxes[:, :, 0:2] are the grid position in (y, x) (float) of the
        top-left corner of each box. boxes[:, :, 2:4] are the box sizes in
        (h, w) (float) in terms of the number of pixels of the corresponding
        feature map size.
      box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1]
        representing the 0-based corresponding feature level index of each
        box. It is reshaped to [batch_size, num_boxes, 1, 1] below.
      boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2]
        representing the boundary (in (y, x)) of the corresponding feature
        map for each box. Any resampled grid points that go beyond the
        boundary will be clipped.
      output_size: a scalar indicating the output crop size.
      sample_offset: a float number in [0, 1] indicates the subpixel sample
        offset from grid point.

    Returns:
      features_per_box: a 5-D tensor of shape
        [batch_size, num_boxes, output_size, output_size, num_filters]
        representing the cropped features.
    """
    (batch_size, num_levels, max_feature_height, max_feature_width,
     num_filters) = features.get_shape().as_list()
    _, num_boxes, _ = boxes.get_shape().as_list()

    # Compute the grid position w.r.t. the corresponding feature map.
    # One sample coordinate per output cell along each axis.
    box_grid_x = []
    box_grid_y = []
    for i in range(output_size):
        box_grid_x.append(boxes[:, :, 1] +
                          (i + sample_offset) * boxes[:, :, 3] / output_size)
        box_grid_y.append(boxes[:, :, 0] +
                          (i + sample_offset) * boxes[:, :, 2] / output_size)
    box_grid_x = tf.stack(box_grid_x, axis=2)
    box_grid_y = tf.stack(box_grid_y, axis=2)

    # Compute indices for gather operation.
    # The lower neighbor is the floor of the sample position, clamped at 0.
    box_grid_y0 = tf.floor(box_grid_y)
    box_grid_x0 = tf.floor(box_grid_x)
    box_grid_x0 = tf.maximum(0., box_grid_x0)
    box_grid_y0 = tf.maximum(0., box_grid_y0)
    # Pair each lower neighbor with the next index, both clipped to the
    # per-box boundary.
    box_gridx0x1 = tf.stack(
        [tf.minimum(box_grid_x0, tf.expand_dims(boundaries[:, :, 1], -1)),
         tf.minimum(box_grid_x0 + 1, tf.expand_dims(boundaries[:, :, 1], -1))],
        axis=3)
    box_gridy0y1 = tf.stack(
        [tf.minimum(box_grid_y0, tf.expand_dims(boundaries[:, :, 0], -1)),
         tf.minimum(box_grid_y0 + 1, tf.expand_dims(boundaries[:, :, 0], -1))],
        axis=3)

    # Flatten the (cell, neighbor) pairs into a single axis of length
    # output_size * 2.
    x_indices = tf.cast(
        tf.reshape(box_gridx0x1,
                   [batch_size, num_boxes, output_size * 2]), dtype=tf.int32)
    y_indices = tf.cast(
        tf.reshape(box_gridy0y1,
                   [batch_size, num_boxes, output_size * 2]), dtype=tf.int32)

    # Strides of the row, level and batch dimensions in the flattened
    # [batch * levels * height * width, num_filters] feature tensor.
    height_dim_offset = max_feature_width
    level_dim_offset = max_feature_height * height_dim_offset
    batch_dim_offset = num_levels * level_dim_offset
    # Flat index = batch offset + level offset + row offset + column.
    indices = tf.reshape(
        tf.tile(tf.reshape(tf.range(batch_size) * batch_dim_offset,
                           [batch_size, 1, 1, 1]),
                [1, num_boxes, output_size * 2, output_size * 2]) +
        tf.tile(tf.reshape(box_levels * level_dim_offset,
                           [batch_size, num_boxes, 1, 1]),
                [1, 1, output_size * 2, output_size * 2]) +
        tf.tile(tf.reshape(y_indices * height_dim_offset,
                           [batch_size, num_boxes, output_size * 2, 1]),
                [1, 1, 1, output_size * 2]) +
        tf.tile(tf.reshape(x_indices,
                           [batch_size, num_boxes, 1, output_size * 2]),
                [1, 1, output_size * 2, 1]), [-1])

    features = tf.reshape(features, [-1, num_filters])
    features_per_box = tf.reshape(
        tf.gather(features, indices),
        [batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])

    # The RoIAlign feature f can be computed by bilinear interpolation of four
    # neighboring feature points f0, f1, f2, and f3.
    # f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
    #                       [f10, f11]]
    # f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
    # f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
    ly = box_grid_y - box_grid_y0
    lx = box_grid_x - box_grid_x0
    hy = 1.0 - ly
    hx = 1.0 - lx
    kernel_x = tf.reshape(tf.stack([hx, lx], axis=3),
                          [batch_size, num_boxes, 1, output_size*2])
    kernel_y = tf.reshape(tf.stack([hy, ly], axis=3),
                          [batch_size, num_boxes, output_size*2, 1])
    # Uses implicit broadcast to generate the interpolation kernel. The
    # multiplier `4` is for avg pooling: the 2x2 average below divides the
    # weighted sum by 4.
    interpolation_kernel = kernel_y * kernel_x * 4

    # Interpolates the gathered features with computed interpolation kernels.
    features_per_box *= tf.cast(
        tf.expand_dims(interpolation_kernel, axis=4),
        dtype=features_per_box.dtype)
    # Merge batch and box dimensions so that avg_pool sees a 4-D input.
    features_per_box = tf.reshape(
        features_per_box,
        [batch_size * num_boxes, output_size*2, output_size*2, num_filters])
    features_per_box = tf.nn.avg_pool(
        features_per_box, [1, 2, 2, 1], [1, 2, 2, 1], 'VALID')
    features_per_box = tf.reshape(
        features_per_box,
        [batch_size, num_boxes, output_size, output_size, num_filters])

    return features_per_box
def __init__(self, num_emb, batch_size, emb_dim, hidden_dim, sequence_length, start_token, mid_layer, learning_rate=0.005, l2_reg_lambda=0):
    """Builds the recurrent model, its reward loss and its update op.

    Args:
      num_emb: number of rows of the embedding matrix; also the depth of
        the one-hot encoding of the input tokens.
      batch_size: static batch size used for all placeholders.
      emb_dim: embedding dimension.
      hidden_dim: hidden state size.
      sequence_length: static sequence length; the recurrence runs for this
        many steps.
      start_token: token id replicated `batch_size` times as first input.
      mid_layer: passed to `self.create_output_unit`.
      learning_rate: NOTE(review): not read by this constructor;
        `self.learning_rate` is a placeholder that must be fed at run time.
      l2_reg_lambda: weight of the L2 penalty on all parameters in
        `self.r_params` except the embeddings.
    """
    self.num_emb = num_emb
    self.batch_size = batch_size
    self.emb_dim = emb_dim
    self.hidden_dim = hidden_dim
    self.sequence_length = sequence_length
    # self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    # Parameter list; filled by the embedding below and handed to the
    # create_*_unit helpers.
    self.r_params = []
    # Global-norm threshold used when clipping the gradients.
    self.grad_clip = 5.0
    self.mid_layer = mid_layer

    self.start_token = tf.constant([start_token] * self.batch_size, dtype=tf.int32)

    self.expected_reward = tf.Variable(tf.zeros([self.sequence_length]))

    with tf.variable_scope('generator'):
        self.r_embeddings = tf.Variable(self.init_matrix([self.num_emb, self.emb_dim]))
        self.r_params.append(self.r_embeddings)
        self.r_recurrent_unit = self.create_recurrent_unit(self.r_params)  # maps h_tm1 to h_t for generator
        self.r_output_unit = self.create_output_unit(self.r_params, self.mid_layer)  # maps h_t to o_t (output token logits)

    # placeholder definition
    self.x = tf.placeholder(tf.int32, shape=[self.batch_size, self.sequence_length])  # sequence of tokens generated by generator
    self.weight = tf.placeholder(tf.float32, shape=[self.batch_size])
    self.temperature = tf.placeholder(tf.float32, name='temperature')
    self.learning_rate = tf.placeholder(tf.float32, name="lr")
    self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    self.rewards = tf.placeholder(tf.float32, shape=[self.batch_size, self.sequence_length])  # get from rollout policy and discriminator

    # processed for batch
    # The first half of the batch gets positive softmax weights, the second
    # half negative, temperature-scaled softmax weights.
    self.pos_weight = tf.nn.softmax(self.weight[:self.batch_size//2])
    self.neg_weight = -1.0 * tf.nn.softmax(self.weight[self.batch_size//2:] / self.temperature)
    self.f_weight = tf.concat([self.pos_weight, self.neg_weight], axis=0)

    with tf.device("/cpu:0"):
        self.word = tf.nn.dropout(tf.nn.embedding_lookup(self.r_embeddings, self.x), self.dropout_keep_prob)
        self.processed_x = tf.transpose(self.word, perm=[1, 0, 2])  # seq_length x batch_size x emb_dim

    # Initial states
    self.h0 = tf.zeros([self.batch_size, self.hidden_dim])
    # Two stacked zero tensors; presumably the hidden and cell state of the
    # recurrent unit -- TODO confirm against create_recurrent_unit.
    self.h0 = tf.stack([self.h0, self.h0])
    # self.avg_h0 = tf.zeros([self.batch_size, self.hidden_dim])

    gen_h = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.sequence_length,
                                         dynamic_size=False, infer_shape=True)

    # supervised pretraining for generator
    r_predictions = tensor_array_ops.TensorArray(
        dtype=tf.float32, size=self.sequence_length,
        dynamic_size=False, infer_shape=True)

    ta_emb_x = tensor_array_ops.TensorArray(
        dtype=tf.float32, size=self.sequence_length)
    ta_emb_x = ta_emb_x.unstack(self.processed_x)

    def _pretrain_recurrence(i, x_t, h_tm1, r_predictions, gen_h):
        # Record the first component of the incoming state, advance the
        # recurrence, store the output, and feed the embedded input token
        # of position i as the next step's input.
        gen_h = gen_h.write(i, tf.unstack(h_tm1)[0])
        h_t = self.r_recurrent_unit(x_t, h_tm1)
        o_t = self.r_output_unit(h_t)
        r_predictions = r_predictions.write(i, o_t)  # batch x vocab_size
        x_tp1 = ta_emb_x.read(i)
        return i + 1, x_tp1, h_t, r_predictions, gen_h

    _, _, _, self.r_predictions, self.gen_h = control_flow_ops.while_loop(
        cond=lambda i, _1, _2, _3, _4: i < self.sequence_length,
        body=_pretrain_recurrence,
        loop_vars=(tf.constant(0, dtype=tf.int32),
                   tf.nn.embedding_lookup(self.r_embeddings, self.start_token),
                   self.h0, r_predictions, gen_h))

    self.r_predictions = tf.transpose(self.r_predictions.stack(), perm=[1, 0, 2])  # batch_size x seq_length x vocab_size

    # clip_reward & log_pred : batch*seq x vocab_size
    # The one-hot mask keeps only the prediction at each input token's
    # index; predictions are clipped to [1e-20, 1.0] first.
    self.clipped_reward = tf.one_hot(tf.to_int32(tf.reshape(self.x, [-1])), self.num_emb, 1.0, 0.0) * \
        tf.clip_by_value(tf.reshape(self.r_predictions, [-1, self.num_emb]), 1e-20, 1.0)
    self.reward_per_step_snt = tf.reshape(tf.reduce_sum(self.clipped_reward, -1), [self.batch_size, self.sequence_length])
    # Per-sequence score: sum of the per-step values over the time axis.
    self.sent_reward = tf.reduce_sum(self.reward_per_step_snt, axis=1)
    # Weighted negative score plus L2 regularisation of every parameter
    # except the embedding matrix.
    self.reward_loss = -tf.reduce_sum(self.sent_reward * self.f_weight) + \
        l2_reg_lambda * (tf.add_n([tf.nn.l2_loss(var) for var in self.r_params if var not in [self.r_embeddings]]))
    reward_opt = self.optimizer(self.learning_rate)

    # Clip gradients by global norm before applying them.
    self.reward_grad, _ = tf.clip_by_global_norm(tf.gradients(self.reward_loss, self.r_params), self.grad_clip)
    self.reward_updates = reward_opt.apply_gradients(zip(self.reward_grad, self.r_params))
def sample(self):
    """Draws one sample from every categorical and offsets it by `self.low`.

    Returns:
      `self.low` plus the int32 samples of `self.categoricals`, stacked
      along a new last axis.
    """
    offset = self.low
    draws = [categorical.sample() for categorical in self.categoricals]
    stacked = tf.stack(draws, axis=-1)
    return offset + tf.cast(stacked, tf.int32)
def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True):
    """Splits every batched tensor in `tensor_dict` along dimension 0.

    Each value of the returned dictionary is the list of per-example
    tensors obtained with `tf.unstack`. Tensors in `tensor_dict` are
    expected to have one of these shapes:
      1. [batch_size]
      2. [batch_size, height, width, channels]
      3. [batch_size, num_boxes, d1, d2, ... dn]

    When `unpad_groundtruth_tensors` is true, the per-example tensors of the
    groundtruth fields listed below (form 3) are additionally sliced along
    their `num_boxes` dimension to the length given by
    fields.InputDataFields.num_groundtruth_boxes.

    Note that the list of padded fields is static and has to be kept in
    sync with the InputDataFields defined in core/standard_fields.py.

    Args:
      tensor_dict: A dictionary of batched groundtruth tensors.
      unpad_groundtruth_tensors: Whether to remove padding along `num_boxes`
        dimension of the groundtruth tensors.

    Returns:
      A dictionary with the keys of `tensor_dict` whose values are lists of
      unstacked (optionally unpadded) tensors.

    Raises:
      ValueError: If `unpad_groundtruth_tensors` is True and `tensor_dict`
        does not contain the `num_groundtruth_boxes` tensor.
    """
    unbatched = {
        name: tf.unstack(batched) for name, batched in tensor_dict.items()
    }
    if not unpad_groundtruth_tensors:
        return unbatched

    num_boxes_key = fields.InputDataFields.num_groundtruth_boxes
    if num_boxes_key not in unbatched:
        raise ValueError('`num_groundtruth_boxes` not found in tensor_dict. '
                         'Keys available: {}'.format(unbatched.keys()))

    # Input data fields that are padded along the num_boxes dimension. This
    # list has to be kept in sync with InputDataFields in
    # standard_fields.py.
    padded_fields = {
        fields.InputDataFields.groundtruth_instance_masks,
        fields.InputDataFields.groundtruth_classes,
        fields.InputDataFields.groundtruth_boxes,
        fields.InputDataFields.groundtruth_keypoints,
        fields.InputDataFields.groundtruth_keypoint_visibilities,
        fields.InputDataFields.groundtruth_group_of,
        fields.InputDataFields.groundtruth_difficult,
        fields.InputDataFields.groundtruth_is_crowd,
        fields.InputDataFields.groundtruth_area,
        fields.InputDataFields.groundtruth_weights,
    }

    unpadded = {}
    for key in padded_fields & set(unbatched.keys()):
        trimmed = []
        for num_gt, padded in zip(unbatched[num_boxes_key], unbatched[key]):
            dims = shape_utils.combined_static_and_dynamic_shape(padded)
            begin = tf.zeros([len(dims)], dtype=tf.int32)
            # Keep the first num_gt entries and everything in the
            # remaining dimensions (-1 where the size is unknown).
            trailing = [-1 if dim is None else dim for dim in dims[1:]]
            size = tf.stack([num_gt] + trailing)
            trimmed.append(tf.slice(padded, begin, size))
        unpadded[key] = trimmed
    unbatched.update(unpadded)

    return unbatched
def metric_fn(**kwargs):
    """Returns a dictionary that has the evaluation metrics.

    Runs per-class NMS on the predictions (through a numpy function by
    default, or through the `postprocess` implementation when
    `nms_configs['pyfunc']` is false), feeds the detections to the COCO
    evaluation metric and adds the mean classification and box losses.

    Args:
      **kwargs: must provide 'boxes', 'scores', 'classes', 'image_ids',
        'image_scales', 'cls_loss_repeat' and 'box_loss_repeat', plus
        'groundtruth_data' when no `testdev_dir` is configured.

    Returns:
      A dictionary with 'cls_loss', 'box_loss' and the COCO metrics.
    """
    nms_configs = params['nms_configs']
    if nms_configs.get('pyfunc', True):
        nms_fn = functools.partial(
            nms_np.per_class_nms, nms_configs=nms_configs)
        # One numpy NMS call per image in the batch.
        per_image = [
            tf.numpy_function(
                nms_fn,
                [
                    kwargs['boxes'][index],
                    kwargs['scores'][index],
                    kwargs['classes'][index],
                    tf.slice(kwargs['image_ids'], [index], [1]),
                    tf.slice(kwargs['image_scales'], [index], [1]),
                    params['num_classes'],
                    nms_configs['max_output_size'],
                ],
                tf.float32)
            for index in range(kwargs['boxes'].shape[0])
        ]
        detections_bs = postprocess.transform_detections(tf.stack(per_image))
    else:
        # These two branches should be equivalent, but currently they are
        # not.
        # TODO(tanmingxing): enable the non_pyfun path after bug fix.
        nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
            params, kwargs['boxes'], kwargs['scores'], kwargs['classes'],
            kwargs['image_scales'])
        img_ids = tf.cast(
            tf.expand_dims(kwargs['image_ids'], -1), nms_scores.dtype)
        # Columns: image id, x, y, width, height, score, class.
        detection_columns = [
            img_ids * tf.ones_like(nms_scores),
            nms_boxes[:, :, 1],
            nms_boxes[:, :, 0],
            nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
            nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
            nms_scores,
            nms_classes,
        ]
        detections_bs = tf.stack(
            detection_columns, axis=-1, name='detnections')

    if params.get('testdev_dir', None):
        logging.info('Eval testdev_dir %s', params['testdev_dir'])
        evaluator = coco_metric.EvaluationMetric(
            testdev_dir=params['testdev_dir'])
        groundtruth = tf.zeros([1])
    else:
        logging.info('Eval val with groudtruths %s.', params['val_json_file'])
        evaluator = coco_metric.EvaluationMetric(
            filename=params['val_json_file'], label_map=params['label_map'])
        groundtruth = kwargs['groundtruth_data']
    coco_metrics = evaluator.estimator_metric_fn(detections_bs, groundtruth)

    # Add metrics to output.
    output_metrics = {
        'cls_loss': tf.metrics.mean(kwargs['cls_loss_repeat']),
        'box_loss': tf.metrics.mean(kwargs['box_loss_repeat']),
    }
    output_metrics.update(coco_metrics)
    return output_metrics
def _log_prob(self, data, num_samples=1):
    """Compute a lower bound on the log likelihood of `data`.

    The normalizer estimate combines the energy of each data point with the
    energies of ``self.K - 1`` samples drawn from the proposal. The bound
    returned is ``proposal.log_prob(data) + log_energy(data) - Z_hat``.

    Args:
      data: batch of observations; its leading dimension is used as the
        batch size.
      num_samples: forwarded to ``self.proposal.log_prob`` when that method
        accepts a ``num_samples`` keyword. The estimate computed here always
        uses a single sample set.

    Returns:
      The lower bound, one value per element of the batch.
    """
    # Due to memory issues, the estimate below always uses one sample set;
    # the caller's value is only passed on to the proposal.
    proposal_num_samples = num_samples
    num_samples = 1
    batch_size = tf.shape(data)[0]

    # Draw the "unseen" samples from the proposal. They are shared across
    # the batch dimension.
    num_unseen = self.K - 1
    unseen = self.proposal.sample(num_samples * num_unseen)
    if not self.reparameterize_proposal_samples:
        unseen = tf.stop_gradient(unseen)

    # [num_samples, K - 1]
    flat_unseen = tf.reshape(unseen, [-1] + self.data_dim)
    log_energy_proposal = tf.reshape(
        self.energy_fn(flat_unseen), [num_samples, num_unseen])
    tf.summary.histogram("log_energy_proposal", log_energy_proposal)
    tf.summary.scalar("min_log_energy_proposal",
                      tf.reduce_min(log_energy_proposal))
    tf.summary.scalar("max_log_energy_proposal",
                      tf.reduce_max(log_energy_proposal))

    # [num_samples] -> [batch_size, num_samples]
    proposal_lse = tf.reduce_logsumexp(log_energy_proposal, axis=1)
    proposal_lse_tiled = tf.tile(proposal_lse[tf.newaxis, :], [batch_size, 1])

    # Energy of the observed data, [batch_size].
    log_energy_data = tf.reshape(self.energy_fn(data), [batch_size])
    tf.summary.histogram("log_energy_data", log_energy_data)
    tf.summary.scalar("min_log_energy_data", tf.reduce_min(log_energy_data))
    tf.summary.scalar("max_log_energy_data", tf.reduce_max(log_energy_data))

    # [batch_size, num_samples]
    data_energy_tiled = tf.tile(log_energy_data[:, tf.newaxis],
                                [1, num_samples])

    # Combine the data term with the proposal term, then average over the
    # K contributions (in log space). [batch_size, num_samples]
    stacked_terms = tf.stack([data_energy_tiled, proposal_lse_tiled], axis=-1)
    log_z_hat = tf.reduce_logsumexp(stacked_terms, axis=-1)
    log_z_hat = log_z_hat - tf.log(tf.to_float(self.K))

    # Log-sum-exp reduction over the sample axis for IWAE. [batch_size]
    log_z_hat = tf.reduce_logsumexp(log_z_hat, axis=1) - tf.log(
        tf.to_float(num_samples))

    try:
        # Give the proposal's own lower bound num_samples if it can use it.
        proposal_lp = self.proposal.log_prob(
            data, num_samples=proposal_num_samples)
    except TypeError:
        proposal_lp = self.proposal.log_prob(data)

    return proposal_lp + log_energy_data - log_z_hat