def _compute_inner_update_scinol(self, var, grad, state):
  """Builds the inner betting-fraction update ops for a single variable.

  Reads the per-variable slots from `state`, clips the incoming gradient to
  the previously recorded maximum gradient, zeroes gradient entries according
  to the [-betting_domain, betting_domain] truncation, accumulates the reward
  and gradient statistics, and assigns a new betting fraction.

  Args:
    var: variable whose slots are read and updated.
    grad: gradient tensor for the inner update of `var`.
    state: optimizer state object providing `get_hyper` and `get_slot`.

  Returns:
    A pair `(clipped_betting_fraction, update_op)`: the newly assigned betting
    fraction clipped to [-betting_domain, betting_domain], and a `tf.group`
    of all assignment ops created by this method.
  """
  update_ops = []

  betting_domain = tf.cast(
      state.get_hyper(BETTING_DOMAIN), var.dtype.base_dtype)

  reward = state.get_slot(var, INNER_REWARD)
  betting_fraction = state.get_slot(var, OUTER_BETTING_FRACTION)
  sum_grad_squared = state.get_slot(var, INNER_SUM_GRAD_SQUARED)
  sum_grad = state.get_slot(var, INNER_SUM_GRAD)
  inner_maximum_gradient = state.get_slot(var, INNER_MAXIMUM_GRADIENT)

  # Clip inner gradient to respect previous inner_maximum_gradient value.
  # This introduces at most an additive constant overhead in the regret
  # since the inner betting fraction lies in a bounded domain.
  clipped_grad = tf.clip_by_value(grad, -inner_maximum_gradient,
                                  inner_maximum_gradient)

  # The control dependency forces the clip (which reads the old maximum) to be
  # computed before the maximum is overwritten.
  with tf.control_dependencies([clipped_grad]):
    inner_maximum_gradient_updated = self._assign(
        inner_maximum_gradient,
        tf.maximum(inner_maximum_gradient, tf.abs(grad)))
  update_ops.append(inner_maximum_gradient_updated)

  clipped_old_betting_fraction = tf.clip_by_value(betting_fraction,
                                                  -betting_domain,
                                                  betting_domain)

  # Process grad to respect truncation to [-betting_domain, betting_domain].
  # zeros_like keeps the dtype of the gradient, so both branches of tf.where
  # agree for non-float32 variables as well.
  truncated_grad = tf.where(
      tf.greater_equal(
          clipped_grad * (betting_fraction - clipped_old_betting_fraction),
          0.0),
      clipped_grad,
      tf.zeros_like(clipped_grad))

  reward_delta = -betting_fraction * truncated_grad
  reward_updated = self._assign_add(reward, reward_delta)
  update_ops.append(reward_updated)

  sum_grad_squared_updated = self._assign_add(sum_grad_squared,
                                              tf.square(truncated_grad))
  update_ops.append(sum_grad_squared_updated)

  sum_grad_updated = self._assign_add(sum_grad, truncated_grad)
  update_ops.append(sum_grad_updated)

  # The second term in this minimum, self.eta / inner_maximum_gradient_updated,
  # is a hack to force the betting fraction to not be too big at first.
  scaling = tf.minimum(
      tf.rsqrt(sum_grad_squared_updated +
               tf.square(inner_maximum_gradient_updated)),
      self.eta / inner_maximum_gradient_updated)
  theta = -sum_grad_updated * scaling

  # rescale_inner is a hack that rescales epsilon_v by the maximum inner
  # gradient.
  if self.rescale_inner:
    epsilon_scaling = inner_maximum_gradient_updated
  else:
    epsilon_scaling = 1.0

  inner_betting_fraction = (
      tf.sign(theta) * tf.minimum(tf.abs(theta), 1.0) * scaling / 2.0)
  new_betting_fraction = inner_betting_fraction * (
      reward_updated + epsilon_scaling * self.epsilon_v)

  betting_fraction_updated = self._assign(betting_fraction,
                                          new_betting_fraction)
  update_ops.append(betting_fraction_updated)

  clipped_betting_fraction = tf.clip_by_value(betting_fraction_updated,
                                              -betting_domain, betting_domain)

  if self.output_summaries:
    mean_unclipped_betting_fraction_summary = tf.reduce_mean(
        tf.abs(betting_fraction_updated))
    max_unclipped_betting_fraction_summary = tf.reduce_max(
        tf.abs(betting_fraction_updated))

    mean_clipped_betting_fraction_summary = tf.reduce_mean(
        tf.abs(clipped_betting_fraction))
    max_clipped_betting_fraction_summary = tf.reduce_max(
        tf.abs(clipped_betting_fraction))

    max_abs_gradient = tf.reduce_max(tf.abs(grad))
    max_truncated_grad = tf.reduce_max(tf.abs(truncated_grad))

    tf.summary.scalar(self._name + "/mean_unclipped_bet/" + var.name,
                      mean_unclipped_betting_fraction_summary)
    tf.summary.scalar(self._name + "/max_unclipped_bet/" + var.name,
                      max_unclipped_betting_fraction_summary)
    tf.summary.scalar(self._name + "/mean_clipped_bet/" + var.name,
                      mean_clipped_betting_fraction_summary)
    tf.summary.scalar(self._name + "/max_clipped_bet/" + var.name,
                      max_clipped_betting_fraction_summary)
    tf.summary.scalar(self._name + "/max_abs_inner_grad/" + var.name,
                      max_abs_gradient)
    tf.summary.scalar(
        self._name + "/max_abs_truncated_inner_grad/" + var.name,
        max_truncated_grad)

  return clipped_betting_fraction, tf.group(*update_ops)
def _process_rico_sca(feature_dict, max_range, max_dom_pos,
                      load_dom_dist=False, load_extra=False,
                      load_screen=True):
  """Turns one parsed example into a feature dict for a single random phrase.

  The per-phrase sequences in `feature_dict` are reshaped so that the leading
  axis indexes phrases, then one phrase index is drawn uniformly at random and
  every per-phrase feature is sliced at that index.

  Args:
    feature_dict: dictionary of parsed tensors for one example.
    max_range: forwarded to `_bound_refs`.
    max_dom_pos: upper bound (exclusive) applied to the dom position values.
    load_dom_dist: whether to load the dom distance feature.
    load_extra: whether to load the extra data for debugging.
    load_screen: whether to load the screen features; when False,
      `_load_fake_screen` fills the screen features instead.

  Returns:
    The processed feature dictionary.
  """
  num_phrases = tf.size(feature_dict['obj_desc_position_seq']) // 2
  span_shape = [num_phrases, 1, 2]

  feature = {}
  feature['task'] = tf.reshape(feature_dict['instruction_word_id_seq'],
                               [num_phrases, NUM_TOKENS_PER_SYN])
  feature['input_refs'] = tf.reshape(feature_dict['input_str_position_seq'],
                                     span_shape)
  feature['obj_refs'] = tf.reshape(feature_dict['obj_desc_position_seq'],
                                   span_shape)
  feature['verb_refs'] = tf.reshape(feature_dict['verb_str_position_seq'],
                                    span_shape)
  feature['rule'] = tf.reshape(feature_dict['instruction_rule_id'],
                               [num_phrases])

  # Pick one phrase and keep only its row in every per-phrase feature.
  chosen = tf.random_uniform(
      shape=(), minval=0, maxval=num_phrases, dtype=tf.int32)
  for name in feature:
    per_phrase = feature[name]
    feature[name] = per_phrase[chosen]

  if load_extra:
    raw_tasks = tf.reshape(feature_dict['instruction_str'], [num_phrases])
    feature['raw_task'] = raw_tasks[chosen]
    feature['task_id'] = tf.constant('empty_task_id', dtype=tf.string)

  if not load_screen:
    _load_fake_screen(feature, load_extra, load_dom_dist)
  else:
    verb_ids = tf.reshape(feature_dict['verb_id_seq'], [num_phrases, 1])
    feature['verbs'] = verb_ids[chosen]
    target_ids = tf.reshape(feature_dict['ui_target_id_seq'],
                            [num_phrases, 1])
    feature['objects'] = target_ids[chosen]

    feature['obj_text'] = tf.reshape(feature_dict['ui_obj_word_id_seq'],
                                     [1, -1, NUM_TOKENS_PER_OBJ])
    feature['obj_type'] = tf.reshape(
        feature_dict['ui_obj_type_id_seq'], [1, -1])
    feature['obj_clickable'] = tf.reshape(
        feature_dict['ui_obj_clickable_seq'], [1, -1])

    def _make_obj_screen_pos():
      # x and y coordinate pairs concatenated along the last axis.
      x_cords = tf.reshape(feature_dict['ui_obj_cord_x_seq'], [1, -1, 2])
      y_cords = tf.reshape(feature_dict['ui_obj_cord_y_seq'], [1, -1, 2])
      return tf.concat([x_cords, y_cords], 2)

    def _zero_obj_screen_pos():
      # Used when no x coordinates are present: zeros, one row per object.
      return tf.fill([1, tf.shape(feature['obj_type'])[1], 4], 0.)

    has_no_cords = tf.equal(tf.size(feature_dict['ui_obj_cord_x_seq']), 0)
    feature['obj_screen_pos'] = tf.cond(
        has_no_cords, _zero_obj_screen_pos, _make_obj_screen_pos)

    dom_pos = tf.reshape(feature_dict['ui_obj_dom_location_seq'], [1, -1, 3])
    feature['obj_dom_pos'] = tf.minimum(dom_pos, max_dom_pos - 1)

    if load_dom_dist:
      # The distance feature is reshaped to a square matrix; its side is the
      # square root of the flat size.
      flat_size = tf.to_float(tf.size(feature_dict['ui_obj_dom_distance']))
      num_ui_obj = tf.to_int32(tf.sqrt(flat_size))
      feature['obj_dom_dist'] = tf.reshape(
          feature_dict['ui_obj_dom_distance'], [1, num_ui_obj, num_ui_obj])
    if load_extra:
      feature['obj_raw_text'] = tf.reshape(feature_dict['ui_obj_str_seq'],
                                           [1, -1])

  _bound_refs(feature, max_range)
  feature['data_source'] = tf.constant(0, dtype=tf.int32)
  feature['agreement_count'] = tf.constant(100, dtype=tf.int32)
  return feature
def _iou_per_anchor(pred_boxes: FloatType,
                    target_boxes: FloatType,
                    iou_type: Text = 'iou') -> tf.Tensor:
  """Computes an IoU-family score between predicted and target boxes.

  Args:
    pred_boxes: predicted boxes, unpacked as [y_min, x_min, y_max, x_max].
    target_boxes: target boxes, unpacked as [y_min, x_min, y_max, x_max].
    iou_type: one of ['iou', 'ciou', 'diou', 'giou'].

  Returns:
    A float `Tensor` holding the requested IoU variant.
  """
  # t_* are target coordinates, p_* are predicted coordinates.
  t_ymin, t_xmin, t_ymax, t_xmax = target_boxes
  p_ymin, p_xmin, p_ymax, p_xmax = pred_boxes

  zero = tf.convert_to_tensor(0.0, t_ymin.dtype)

  def _extent(upper, lower):
    # Side length clamped at zero so degenerate boxes get zero area.
    return tf.maximum(zero, upper - lower)

  p_width = _extent(p_xmax, p_xmin)
  p_height = _extent(p_ymax, p_ymin)
  t_width = _extent(t_xmax, t_xmin)
  t_height = _extent(t_ymax, t_ymin)

  # Overlap rectangle of the two boxes.
  overlap_height = _extent(tf.minimum(p_ymax, t_ymax),
                           tf.maximum(p_ymin, t_ymin))
  overlap_width = _extent(tf.minimum(p_xmax, t_xmax),
                          tf.maximum(p_xmin, t_xmin))
  intersect_area = overlap_width * overlap_height

  union_area = p_width * p_height + t_width * t_height - intersect_area
  iou_v = tf.math.divide_no_nan(intersect_area, union_area)
  if iou_type == 'iou':
    return iou_v  # Plain IoU needs nothing else.

  # Smallest axis-aligned box enclosing both inputs.
  enclose_ymin = tf.minimum(p_ymin, t_ymin)
  enclose_xmin = tf.minimum(p_xmin, t_xmin)
  enclose_ymax = tf.maximum(p_ymax, t_ymax)
  enclose_xmax = tf.maximum(p_xmax, t_xmax)

  assert iou_type in ('giou', 'diou', 'ciou')
  if iou_type == 'giou':
    # Generalized IoU: subtract the share of the enclosing box that is not
    # covered by the union.
    enclose_area = (_extent(enclose_xmax, enclose_xmin) *
                    _extent(enclose_ymax, enclose_ymin))
    uncovered = tf.math.divide_no_nan(enclose_area - union_area, enclose_area)
    return iou_v - uncovered

  assert iou_type in ('diou', 'ciou')
  # Distance IoU: subtract the squared center distance normalized by the
  # squared diagonal of the enclosing box.
  p_center = tf.stack([(p_ymin + p_ymax) / 2, (p_xmin + p_xmax) / 2])
  t_center = tf.stack([(t_ymin + t_ymax) / 2, (t_xmin + t_xmax) / 2])
  center_distance = tf.linalg.norm(t_center - p_center)
  enclose_diagonal = tf.linalg.norm(
      [enclose_ymax - enclose_ymin, enclose_xmax - enclose_xmin])
  diou_v = iou_v - tf.math.divide_no_nan(center_distance**2,
                                         enclose_diagonal**2)
  if iou_type == 'diou':
    return diou_v

  assert iou_type == 'ciou'
  # Complete IoU: additionally subtract alpha * v, with v from _get_v.
  v = _get_v(p_height, p_width, t_height, t_width)
  alpha = tf.math.divide_no_nan(v, ((1 - iou_v) + v))
  return diou_v - alpha * v
def generate_trips(self, min_gap=1, max_gap=5):
  """Generate a tf Dataset of training tuples built around frame offsets.

  For every stride in [min_gap, max_gap] and every valid center frame, four
  frames are stacked along axis 1 in this order: the frame `stride` before the
  center, the frame `stride` after the center, the center frame itself, and a
  frame at the center index plus a random jitter drawn from [-40, 40) and
  clamped to the valid index range.

  Args:
    min_gap: (int) the minimum offset between two frames of a sampled tuple.
    max_gap: (int) the maximum offset between two frames of a sampled tuple.
      The sequence must contain at least `2 * max_gap` frames.

  Returns:
    A tf.data.Dataset whose elements are `ViewTrip`s built from the stacked
    timestamp, rgb, pano, depth, normal and pose entries.
  """

  def mapper(timestamp_trips, rgb_trips, pano_trips, depth_trips,
             normal_trips, pose_trips):
    """A function mapping a data tuple to ViewTrip."""
    return ViewTrip(self.scene_id, self.sequence_id, timestamp_trips,
                    rgb_trips, pano_trips, depth_trips, normal_trips,
                    tf.zeros([1]), pose_trips, self.intrinsics[0],
                    self.resolution[0])

  length = self.length()

  # Every stride s below builds tf.range(s, length - s) and a jitter tensor of
  # shape [length - 2 * s]; both are only valid when length >= 2 * s, so the
  # guard must cover twice the largest stride rather than max_gap alone.
  gap_fits = tf.Assert(
      tf.less_equal(2 * max_gap, length), [max_gap, length])

  with tf.control_dependencies([gap_fits]):
    sequences = (self.timestamp, self.rgb, self.pano, self.depth, self.normal,
                 self.pose)
    # One list of per-stride tensors for each entry of `sequences`.
    collected = [[] for _ in sequences]

    # Generate tuples with an offset that ranges from 'min_gap' to 'max_gap'.
    for stride in range(min_gap, max_gap + 1):
      inds = tf.range(stride, length - stride)
      inds_jitter = tf.random.uniform(
          minval=-40,
          maxval=40,
          shape=[length - 2 * stride],
          dtype=tf.int32)
      # Clamp the jittered center index so the gather stays in bounds.
      rand_inds = tf.minimum(tf.maximum(inds + inds_jitter, 0), length - 1)

      for bucket, seq in zip(collected, sequences):
        bucket.append(
            tf.stack([
                seq[:-2 * stride],
                seq[2 * stride:],
                seq[stride:-stride],
                tf.gather(seq, rand_inds),
            ], axis=1))

    trips = tuple(tf.concat(bucket, 0) for bucket in collected)
    dataset = tf.data.Dataset.from_tensor_slices(trips)
    return dataset.map(mapper)
def _compute_model_loss(self, input_sequence, output_sequence,
                        sequence_length, control_sequence):
  """Builds the train/eval loss and stores it on `self.loss`.

  Sequences are truncated to at most `hparams.max_seq_len` steps. When
  `hparams.z_size` is set, a latent is sampled from the encoder output and its
  KL divergence to a standard normal prior is added to the reconstruction
  loss; otherwise the decoder is run without a latent and the KL term is zero.

  Args:
    input_sequence: encoder input batch; cast to float.
    output_sequence: decoder target batch; cast to float.
    sequence_length: per-example lengths, capped at the truncated length.
    control_sequence: optional control batch, or None.

  Returns:
    A pair `(metric_map, scalars_to_summarize)`.
  """
  hparams = self.hparams
  batch_size = hparams.batch_size

  input_sequence = tf.to_float(input_sequence)
  output_sequence = tf.to_float(output_sequence)

  seq_len_cap = tf.minimum(
      tf.shape(output_sequence)[1], hparams.max_seq_len)

  input_sequence = input_sequence[:, :seq_len_cap]

  if control_sequence is not None:
    control_depth = control_sequence.shape[-1]
    control_sequence = tf.to_float(control_sequence)[:, :seq_len_cap]
    # Shouldn't be necessary, but the slice loses shape information when
    # control depth is zero.
    control_sequence.set_shape([batch_size, None, control_depth])

  # Expected decoder outputs.
  x_target = output_sequence[:, :seq_len_cap]
  # Decoder inputs: the targets shifted right by one step, with a zero frame
  # padded in front as the initial input.
  shifted_targets = output_sequence[:, :seq_len_cap - 1]
  x_input = tf.pad(shifted_targets, [(0, 0), (1, 0), (0, 0)])
  x_length = tf.minimum(sequence_length, seq_len_cap)

  if not hparams.z_size:
    # Unconditional, decoder-only generation.
    kl_div = tf.zeros([batch_size, 1], dtype=tf.float32)
    z = None
  else:
    # VAE mode: sample z from the encoder output.
    q_z = self.encode(input_sequence, x_length, control_sequence)
    z = q_z.sample()

    # Standard normal prior.
    prior = ds.MultivariateNormalDiag(
        loc=[0.] * hparams.z_size, scale_diag=[1.] * hparams.z_size)

    # KL divergence (nats).
    kl_div = ds.kl_divergence(q_z, prior)

  decoder_outputs = self.decoder.reconstruction_loss(
      x_input, x_target, x_length, z, control_sequence)
  r_loss, metric_map = decoder_outputs[0:2]

  # KL below the free-bits allowance (converted to nats) is not penalized.
  free_nats = hparams.free_bits * tf.math.log(2.0)
  kl_cost = tf.maximum(kl_div - free_nats, 0)

  # beta = (1 - beta_rate ** global_step) * max_beta.
  annealing = 1.0 - tf.pow(hparams.beta_rate, tf.to_float(self.global_step))
  beta = annealing * hparams.max_beta
  self.loss = tf.reduce_mean(r_loss) + beta * tf.reduce_mean(kl_cost)

  scalars_to_summarize = {
      'loss': self.loss,
      'losses/r_loss': r_loss,
      'losses/kl_loss': kl_cost,
      'losses/kl_bits': kl_div / tf.math.log(2.0),
      'losses/kl_beta': beta,
  }
  return metric_map, scalars_to_summarize
def BuildNet(self):
    """Builds the encoder, the two-layer decoder, the loss and the train op.

    The encoder projects `self.node_feat`, then runs `max_bp_iter` rounds of
    neighbor aggregation through `self.n2nsum_param`, combining each round
    according to the module-level `combineID` (0: add the input message,
    1: concatenate with a transform of the current layer, 2: gated update).
    The module-level `JK` selects how the per-round layers are merged
    (1: element-wise max, 2: element-wise min, 3: mean, 4: gated recurrence
    over the layers followed by attention weighting). The decoder predicts
    one score per node, and the loss is `self.pairwise_ranking_loss` over the
    score differences of the node pairs in `self.pair_ids_src` /
    `self.pair_ids_tgt`.

    Returns:
        A tuple `(loss, trainStep, betw_pred, embed_s_a, variables)`: the
        loss tensor, the Adam minimize op, the per-node predictions, the
        node embeddings, and `tf.trainable_variables()`.
    """
    embed_dim = self.embedding_size

    def _weight(shape, name):
        """Creates a trainable float32 variable with truncated-normal init."""
        # The second positional parameter of tf.Variable is `trainable`, not
        # the dtype, so both are passed by keyword here.
        return tf.Variable(
            tf.truncated_normal(shape, stddev=initialization_stddev),
            trainable=True, dtype=tf.float32, name=name)

    # [node_feat_dim, embed_dim]
    w_n2l = _weight([node_feat_dim, embed_dim], "w_n2l")
    # [embed_dim, embed_dim]
    p_node_conv = _weight([embed_dim, embed_dim], "p_node_conv")

    if combineID == 1:  # 'graphsage'
        # [embed_dim, embed_dim]
        p_node_conv2 = _weight([embed_dim, embed_dim], "p_node_conv2")
        # [2*embed_dim, embed_dim]
        p_node_conv3 = _weight([2 * embed_dim, embed_dim], "p_node_conv3")
    elif combineID == 2:  # GRU
        w_r = _weight([embed_dim, embed_dim], "w_r")
        u_r = _weight([embed_dim, embed_dim], "u_r")
        w_z = _weight([embed_dim, embed_dim], "w_z")
        u_z = _weight([embed_dim, embed_dim], "u_z")
        w = _weight([embed_dim, embed_dim], "w")
        u = _weight([embed_dim, embed_dim], "u")

    # [embed_dim, reg_hidden]
    h1_weight = _weight([embed_dim, self.reg_hidden], "h1_weight")
    # [reg_hidden+aux_feat_dim, 1]
    h2_weight = _weight([self.reg_hidden + aux_feat_dim, 1], "h2_weight")
    last_w = h2_weight

    node_size = tf.shape(self.n2nsum_param)[0]
    # [node_cnt, node_feat_dim]
    node_input = self.node_feat

    # [node_cnt, embed_dim]
    input_message = tf.matmul(tf.cast(node_input, tf.float32), w_n2l)

    # [node_cnt, embed_dim], no sparse
    cur_message_layer = self.activation(input_message)
    cur_message_layer = tf.nn.l2_normalize(cur_message_layer, axis=1)

    # JK: 1: max_pooling; 2: min_pooling; 3: mean_pooling;
    # 4: gated recurrence over the layers with attention.
    cur_message_layer_JK = cur_message_layer if JK else None
    # Layers collected for JK == 4; starts with the initial layer.
    cur_message_layer_list = [cur_message_layer]
    if JK == 4:
        w_r_JK = _weight([embed_dim, embed_dim], "w_r_JK")
        u_r_JK = _weight([embed_dim, embed_dim], "u_r_JK")
        w_z_JK = _weight([embed_dim, embed_dim], "w_z_JK")
        u_z_JK = _weight([embed_dim, embed_dim], "u_z_JK")
        w_JK = _weight([embed_dim, embed_dim], "w_JK")
        u_JK = _weight([embed_dim, embed_dim], "u_JK")
        # attention matrix
        JK_attention = _weight([embed_dim, 1], "JK_attention")
        JK_attention_list = []
        JK_Hidden_list = []
        # Random (non-variable) initial hidden state for the recurrence.
        JK_Hidden = tf.truncated_normal(tf.shape(cur_message_layer),
                                        stddev=initialization_stddev)

    def _jk_accumulate(pooled, layer):
        """Folds `layer` into the running JK state and returns that state."""
        if JK == 1:
            return tf.maximum(pooled, layer)
        if JK == 2:
            return tf.minimum(pooled, layer)
        if JK == 3:
            return tf.add(pooled, layer)
        if JK == 4:
            cur_message_layer_list.append(layer)
        return pooled

    # max_bp_iter steps of neighbor propagation
    for _ in range(max_bp_iter):
        # [node_cnt, node_cnt]*[node_cnt, embed_dim] = [node_cnt, embed_dim]
        n2npool = tf.sparse_tensor_dense_matmul(
            tf.cast(self.n2nsum_param, tf.float64),
            tf.cast(cur_message_layer, tf.float64))
        n2npool = tf.cast(n2npool, tf.float32)

        # [node_cnt, embed_dim] * [embed_dim, embed_dim] = [node_cnt, embed_dim]
        node_linear = tf.matmul(n2npool, p_node_conv)

        if combineID == 0:  # 'structure2vec'
            # [node_cnt, embed_dim] + [node_cnt, embed_dim]
            merged_linear = tf.add(node_linear, input_message)
            cur_message_layer = self.activation(merged_linear)
            cur_message_layer_JK = _jk_accumulate(cur_message_layer_JK,
                                                  cur_message_layer)
        elif combineID == 1:  # 'graphsage'
            # [node_cnt, embed_dim] * [embed_dim, embed_dim]
            cur_message_layer_linear = tf.matmul(
                tf.cast(cur_message_layer, tf.float32), p_node_conv2)
            # [node_cnt, 2*embed_dim]
            merged_linear = tf.concat(
                [node_linear, cur_message_layer_linear], 1)
            # [node_cnt, 2*embed_dim]*[2*embed_dim, embed_dim]
            cur_message_layer = self.activation(
                tf.matmul(merged_linear, p_node_conv3))
            cur_message_layer_JK = _jk_accumulate(cur_message_layer_JK,
                                                  cur_message_layer)
        elif combineID == 2:  # gru
            r_t = tf.nn.relu(
                tf.add(tf.matmul(node_linear, w_r),
                       tf.matmul(cur_message_layer, u_r)))
            z_t = tf.nn.relu(
                tf.add(tf.matmul(node_linear, w_z),
                       tf.matmul(cur_message_layer, u_z)))
            h_t = tf.nn.tanh(
                tf.add(tf.matmul(node_linear, w),
                       tf.matmul(r_t * cur_message_layer, u)))
            cur_message_layer = (1 - z_t) * cur_message_layer + z_t * h_t
            cur_message_layer = tf.nn.l2_normalize(cur_message_layer, axis=1)
            cur_message_layer_JK = _jk_accumulate(cur_message_layer_JK,
                                                  cur_message_layer)

        cur_message_layer = tf.nn.l2_normalize(cur_message_layer, axis=1)

    if JK == 1 or JK == 2:
        cur_message_layer = cur_message_layer_JK
    elif JK == 3:
        # Mean over the initial layer plus max_bp_iter propagated layers.
        cur_message_layer = cur_message_layer_JK / (max_bp_iter + 1)
    elif JK == 4:
        for X_value in cur_message_layer_list:
            # [node_cnt, embed_size]
            r_t_JK = tf.nn.relu(
                tf.add(tf.matmul(X_value, w_r_JK),
                       tf.matmul(JK_Hidden, u_r_JK)))
            z_t_JK = tf.nn.relu(
                tf.add(tf.matmul(X_value, w_z_JK),
                       tf.matmul(JK_Hidden, u_z_JK)))
            h_t_JK = tf.nn.tanh(
                tf.add(tf.matmul(X_value, w_JK),
                       tf.matmul(r_t_JK * JK_Hidden, u_JK)))
            JK_Hidden = (1 - z_t_JK) * h_t_JK + z_t_JK * JK_Hidden
            JK_Hidden = tf.nn.l2_normalize(JK_Hidden, axis=1)
            JK_Hidden_list.append(JK_Hidden)
            # [node_cnt, embed_size]*[embed_size, 1] = [node_cnt, 1]
            attention = tf.nn.tanh(tf.matmul(JK_Hidden, JK_attention))
            JK_attention_list.append(attention)

        # [max_bp_iter+1, node_cnt, 1]
        JK_attentions = tf.reshape(JK_attention_list,
                                   [max_bp_iter + 1, node_size, 1])
        # Softmax across the layer axis gives per-node layer weights.
        cofficient = tf.nn.softmax(JK_attentions, axis=0)
        # [max_bp_iter+1, node_cnt, embed_size]
        JK_Hidden_list = tf.reshape(
            JK_Hidden_list, [max_bp_iter + 1, node_size, embed_dim])
        result = cofficient * JK_Hidden_list
        cur_message_layer = tf.reduce_sum(result, 0)
        cur_message_layer = tf.reshape(cur_message_layer,
                                       [node_size, embed_dim])

    cur_message_layer = tf.nn.l2_normalize(cur_message_layer, axis=1)

    # node embedding, [node_cnt, embed_dim]
    embed_s_a = cur_message_layer

    # decoder, two-layer MLP
    hidden = tf.matmul(embed_s_a, h1_weight)
    last_output = self.activation(hidden)
    last_output = tf.concat([last_output, self.aux_feat], axis=1)
    betw_pred = tf.matmul(last_output, last_w)

    # [pair_size, 1]: label and prediction differences per (src, tgt) pair.
    labels = tf.nn.embedding_lookup(
        self.label, self.pair_ids_src) - tf.nn.embedding_lookup(
            self.label, self.pair_ids_tgt)
    preds = tf.nn.embedding_lookup(
        betw_pred, self.pair_ids_src) - tf.nn.embedding_lookup(
            betw_pred, self.pair_ids_tgt)

    loss = self.pairwise_ranking_loss(preds, labels)
    trainStep = tf.train.AdamOptimizer(self.learning_rate).minimize(loss)

    return loss, trainStep, betw_pred, embed_s_a, tf.trainable_variables()
def mask(config: configure_pretraining.PretrainingConfig,
         inputs: pretrain_data.Inputs, mask_prob, proposal_distribution=1.0,
         disallow_from_mask=None, already_masked=None):
  """Samples positions to mask and replaces most of them with [MASK].

  For each example, up to `config.max_predictions_per_seq` positions are
  sampled from the allowed candidates; the original token ids and synonym ids
  at those positions are recorded, and each sampled position is replaced by
  the [MASK] id when an independent uniform draw falls below 0.85.

  The optional arguments aren't needed for BERT/ELECTRA and are from early
  experiments in "strategically" masking out tokens instead of uniformly at
  random.

  Args:
    config: configure_pretraining.PretrainingConfig
    inputs: pretrain_data.Inputs containing input input_ids/input_mask
    mask_prob: fraction of tokens to mask
    proposal_distribution: for non-uniform masking can be a [B, L] tensor
      of scores for masking each position.
    disallow_from_mask: a boolean tensor of [B, L] of positions that should
      not be masked out
    already_masked: a boolean tensor of [B, N] of already masked-out tokens
      for multiple rounds of masking

  Returns:
    a pretrain_data.Inputs with masking added
  """
  # Max masked-out tokens per example, batch size and sequence length.
  max_predictions = config.max_predictions_per_seq
  batch_size, seq_len = modeling.get_shape_list(inputs.input_ids)

  # Positions where masking out a token is allowed.
  tokenizer = tokenization.FullTokenizer(
      config.vocab_file, do_lower_case=config.do_lower_case)
  vocab = tokenizer.vocab
  candidates_mask = _get_candidates_mask(inputs, vocab, disallow_from_mask)

  # Number of tokens to mask per example: round(num_tokens * mask_prob),
  # kept between 1 and max_predictions.
  num_tokens = tf.cast(tf.reduce_sum(inputs.input_mask, -1), tf.float32)
  rounded_target = tf.cast(tf.round(num_tokens * mask_prob), tf.int32)
  num_to_predict = tf.maximum(1, tf.minimum(max_predictions, rounded_target))
  masked_lm_weights = tf.cast(
      tf.sequence_mask(num_to_predict, max_predictions), tf.float32)
  if already_masked is not None:
    masked_lm_weights = masked_lm_weights * (1 - already_masked)

  # Per-position masking probability, normalized over the sequence axis.
  candidate_mask_float = tf.cast(candidates_mask, tf.float32)
  sample_prob = proposal_distribution * candidate_mask_float
  sample_prob = sample_prob / tf.reduce_sum(
      sample_prob, axis=-1, keepdims=True)

  # Sample the positions to mask out; unused slots are zeroed by the weights.
  sample_prob = tf.stop_gradient(sample_prob)
  sample_logits = tf.log(sample_prob)
  int_weights = tf.cast(masked_lm_weights, tf.int32)
  masked_lm_positions = tf.random.categorical(
      sample_logits, max_predictions, dtype=tf.int32)
  masked_lm_positions = masked_lm_positions * int_weights

  # Look up the ids at the sampled positions via offsets into the flattened
  # [B * L] inputs.
  row_offsets = tf.expand_dims(seq_len * tf.range(batch_size), -1)
  flat_positions = tf.reshape(masked_lm_positions + row_offsets, [-1, 1])
  flat_input_ids = tf.reshape(inputs.input_ids, [-1])
  masked_lm_ids = tf.gather_nd(flat_input_ids, flat_positions)
  masked_lm_ids = tf.reshape(masked_lm_ids, [batch_size, -1])
  masked_lm_ids = masked_lm_ids * tf.cast(masked_lm_weights, tf.int32)

  flat_synonym_ids = tf.reshape(inputs.synonym_ids,
                                [batch_size * seq_len, -1])
  masked_synonym_ids = tf.gather_nd(flat_synonym_ids, flat_positions)
  masked_synonym_ids = tf.reshape(masked_synonym_ids,
                                  [batch_size, max_predictions, -1])
  masked_synonym_ids = masked_synonym_ids * tf.expand_dims(
      tf.cast(masked_lm_weights, tf.int32), -1)

  # Update the input ids: a sampled position keeps its index when the uniform
  # draw is below 0.85 and is otherwise multiplied down to 0.
  keep_for_mask = tf.cast(
      tf.less(tf.random.uniform([batch_size, max_predictions]), 0.85),
      tf.int32)
  replace_with_mask_positions = masked_lm_positions * keep_for_mask
  mask_token_ids = tf.fill([batch_size, max_predictions], vocab["[MASK]"])
  inputs_ids, _ = scatter_update(inputs.input_ids, mask_token_ids,
                                 replace_with_mask_positions)

  return pretrain_data.get_updated_inputs(
      inputs,
      input_ids=tf.stop_gradient(inputs_ids),
      masked_lm_positions=masked_lm_positions,
      masked_lm_ids=masked_lm_ids,
      masked_synonym_ids=masked_synonym_ids,
      masked_lm_weights=masked_lm_weights)
def build_train_graph(self, inputs, min_depth, max_depth, cube_res, theta_res,
                      phi_res, r_res, scale_factors, num_mpi_planes,
                      learning_rate=0.0001, vgg_model_weights=None,
                      global_step=0, depth_clip=20.0):
  """Construct the training computation graph.

  Builds, in order: plane depths, MPI inference, target-view rendering,
  lighting-volume prediction and environment-map rendering, the rendering /
  envmap / adversarial losses, the two optimizer ops, and the TensorBoard
  summaries.

  Args:
    inputs: dictionary of tensors (see 'input_data' below) needed for training
    min_depth: minimum depth for the PSV and MPI planes
    max_depth: maximum depth for the PSV and MPI planes
    cube_res: per-side cube resolution
    theta_res: environment map width
    phi_res: environment map height
    r_res: number of radii to use when sampling spheres for rendering
    scale_factors: downsampling factors of cubes relative to the coarsest
    num_mpi_planes: number of MPI planes to infer
    learning_rate: learning rate
    vgg_model_weights: vgg weights (needed when vgg loss is used)
    global_step: training iteration
    depth_clip: maximum depth for coarsest resampled volumes

  Returns:
    A train_op to be used for training: a list holding the op that applies
    the clipped gradients to the non-discriminator variables, followed by the
    op that minimizes the discriminator loss.
  """
  with tf.name_scope('setup'):
    # Both plane sets are built from the same arguments.
    psv_planes = pj.inv_depths(min_depth, max_depth, num_mpi_planes)
    mpi_planes = pj.inv_depths(min_depth, max_depth, num_mpi_planes)

  with tf.name_scope('input_data'):
    tgt_image = inputs['tgt_image']
    ref_image = inputs['ref_image']
    src_images = inputs['src_images']
    env_image = inputs['env_image']
    ref_depth = inputs['ref_depth']
    tgt_pose = inputs['tgt_pose']
    ref_pose = inputs['ref_pose']
    src_poses = inputs['src_poses']
    env_pose = inputs['env_pose']
    intrinsics = inputs['intrinsics']
    # The last static dimension of src_poses is used as the source count.
    _, _, _, num_source = src_poses.get_shape().as_list()

  with tf.name_scope('inference'):
    # From here on num_mpi_planes is a tensor (the argument is shadowed).
    num_mpi_planes = tf.shape(mpi_planes)[0]
    pred = self.infer_mpi(src_images, ref_image, ref_pose, src_poses,
                          intrinsics, psv_planes)
    rgba_layers = pred['rgba_layers']
    psv = pred['psv']

  with tf.name_scope('synthesis'):
    output_image, output_alpha_acc, _ = self.mpi_render_view(
        rgba_layers, ref_pose, tgt_pose, mpi_planes, intrinsics)

  with tf.name_scope('environment_rendering'):
    # MPI built from the reference image and reference depth; it is rendered
    # for the 'image_output_Gt' summary and for the 'envmap_gt' summaries.
    mpi_gt = self.img2mpi(ref_image, ref_depth, mpi_planes)
    output_image_gt, _, _ = self.mpi_render_view(mpi_gt, ref_pose, tgt_pose,
                                                 mpi_planes, intrinsics)

    lightvols_gt, _, _, _, _ = self.predict_lighting_vol(
        mpi_gt,
        mpi_planes,
        intrinsics,
        cube_res,
        scale_factors,
        depth_clip=depth_clip)

    lightvols, lightvol_centers, \
    lightvol_side_lengths, \
    cube_rel_shapes, \
    cube_nest_inds = self.predict_lighting_vol(rgba_layers, mpi_planes,
                                               intrinsics, cube_res,
                                               scale_factors,
                                               depth_clip=depth_clip)

    lightvols_out = nets.cube_net_multires(lightvols, cube_rel_shapes,
                                           cube_nest_inds)

    # The three envmaps below share the geometry returned with the predicted
    # volumes and differ only in the volumes being rendered.
    gt_envmap, gt_shells = self.render_envmap(lightvols_gt, lightvol_centers,
                                              lightvol_side_lengths,
                                              cube_rel_shapes, cube_nest_inds,
                                              ref_pose, env_pose, theta_res,
                                              phi_res, r_res)

    prenet_envmap, prenet_shells = self.render_envmap(
        lightvols, lightvol_centers, lightvol_side_lengths, cube_rel_shapes,
        cube_nest_inds, ref_pose, env_pose, theta_res, phi_res, r_res)

    output_envmap, output_shells = self.render_envmap(
        lightvols_out, lightvol_centers, lightvol_side_lengths,
        cube_rel_shapes, cube_nest_inds, ref_pose, env_pose, theta_res,
        phi_res, r_res)

  with tf.name_scope('loss'):
    # mask loss for pixels outside reference frustum
    # (mask is 0 where the accumulated alpha is exactly 0, else 1)
    loss_mask = tf.where(
        tf.equal(output_alpha_acc[Ellipsis, tf.newaxis], 0.0),
        tf.zeros_like(output_image[:, :, :, 0:1]),
        tf.ones_like(output_image[:, :, :, 0:1]))
    loss_mask = tf.stop_gradient(loss_mask)
    tf.summary.image('loss_mask', loss_mask)

    # helper functions for loss
    def compute_error(real, fake, mask):
      # Masked mean absolute error; the epsilon guards an all-zero mask.
      mask = tf.ones_like(real) * mask
      return tf.reduce_sum(mask * tf.abs(fake - real)) / (
          tf.reduce_sum(mask) + 1.0e-8)

    # Normalized VGG loss
    def downsample(tensor, ds):
      # Average-pools by a factor of ds with 'SAME' padding.
      return tf.nn.avg_pool(tensor, [1, ds, ds, 1], [1, ds, ds, 1], 'SAME')

    def vgg_loss(tgt_image, output_image, loss_mask, vgg_weights):
      """VGG activation loss definition."""

      vgg_real = nets.build_vgg19(tgt_image * 255.0, vgg_weights)
      rescaled_output_image = output_image * 255.0
      vgg_fake = nets.build_vgg19(rescaled_output_image, vgg_weights)
      # One masked error per layer, each divided by a fixed per-layer
      # constant; the mask is downsampled for the deeper layers.
      p0 = compute_error(vgg_real['input'], vgg_fake['input'], loss_mask)
      p1 = compute_error(vgg_real['conv1_2'], vgg_fake['conv1_2'],
                         loss_mask) / 2.6
      p2 = compute_error(vgg_real['conv2_2'], vgg_fake['conv2_2'],
                         downsample(loss_mask, 2)) / 4.8
      p3 = compute_error(vgg_real['conv3_2'], vgg_fake['conv3_2'],
                         downsample(loss_mask, 4)) / 3.7
      p4 = compute_error(vgg_real['conv4_2'], vgg_fake['conv4_2'],
                         downsample(loss_mask, 8)) / 5.6
      p5 = compute_error(vgg_real['conv5_2'], vgg_fake['conv5_2'],
                         downsample(loss_mask, 16)) * 10 / 1.5
      total_loss = p0 + p1 + p2 + p3 + p4 + p5
      return total_loss

    # rendered image loss
    render_loss = vgg_loss(tgt_image, output_image, loss_mask,
                           vgg_model_weights) / 100.0
    total_loss = render_loss

    # rendered envmap loss
    envmap_loss = vgg_loss(env_image, output_envmap[Ellipsis, :3],
                           tf.ones_like(env_image[Ellipsis, 0:1]),
                           vgg_model_weights) / 100.0

    # set envmap loss to 0 when only training mpi network (see paper)
    envmap_loss = tf.where(tf.greater(global_step, 240000), envmap_loss, 0.0)

    total_loss += envmap_loss

    # adversarial loss for envmap
    # Both calls use the same 'discriminator' scope string.
    real_logit = nets.discriminator(env_image, scope='discriminator')
    fake_logit = nets.discriminator(
        output_envmap[Ellipsis, :3], scope='discriminator')
    # Generator term: negated mean of the last entry of each fake output,
    # weighted by 0.1.
    adv_loss_list = []
    for i in range(len(fake_logit)):
      adv_loss_list.append(0.1 * -1.0 * tf.reduce_mean(fake_logit[i][-1]))
    adv_loss = tf.reduce_mean(adv_loss_list)
    # Discriminator terms: -mean(min(logit - 1, 0)) on real inputs and
    # -mean(min(-logit - 1, 0)) on fake inputs.
    real_loss_list = []
    fake_loss_list = []
    for i in range(len(fake_logit)):
      real_loss_list.append(
          -1.0 * tf.reduce_mean(tf.minimum(real_logit[i][-1] - 1, 0.0)))
      fake_loss_list.append(
          -1.0 *
          tf.reduce_mean(tf.minimum(-1.0 * fake_logit[i][-1] - 1, 0.0)))
    real_loss = tf.reduce_mean(real_loss_list)
    fake_loss = tf.reduce_mean(fake_loss_list)
    disc_loss = real_loss + fake_loss

    # set adv/disc losses to 0 until end of training
    adv_loss = tf.where(tf.greater(global_step, 690000), adv_loss, 0.0)
    disc_loss = tf.where(tf.greater(global_step, 690000), disc_loss, 0.0)

    tf.summary.scalar('loss_disc', disc_loss)
    tf.summary.scalar('loss_disc_real', real_loss)
    tf.summary.scalar('loss_disc_fake', fake_loss)
    tf.summary.scalar('loss_adv', adv_loss)

    total_loss += adv_loss

  with tf.name_scope('train_op'):
    # Generator step: every trainable variable whose name does not contain
    # 'discriminator'.
    train_variables = [
        var for var in tf.trainable_variables()
        if 'discriminator' not in var.name
    ]
    optim = tf.train.AdamOptimizer(learning_rate, epsilon=1e-4)
    grads_and_variables = optim.compute_gradients(
        total_loss, var_list=train_variables)
    grads = [gv[0] for gv in grads_and_variables]
    variables = [gv[1] for gv in grads_and_variables]

    def denan(x):
      # Replaces NaN entries with zeros.
      return tf.where(tf.is_nan(x), tf.zeros_like(x), x)

    grads_clipped = [denan(g) for g in grads]
    grads_clipped, _ = tf.clip_by_global_norm(grads_clipped, 100.0)
    train_op = [optim.apply_gradients(zip(grads_clipped, variables))]
    tf.summary.scalar('gradient global norm', tf.linalg.global_norm(grads))
    tf.summary.scalar('clipped gradient global norm',
                      tf.linalg.global_norm(grads_clipped))

    # Discriminator step: a separate Adam optimizer over the variables whose
    # name contains 'discriminator'.
    d_variables = [
        var for var in tf.trainable_variables() if 'discriminator' in var.name
    ]
    optim_d = tf.train.AdamOptimizer(learning_rate, beta1=0.0)
    train_op.append(optim_d.minimize(disc_loss, var_list=d_variables))

  with tf.name_scope('envmap_gt'):
    tf.summary.image('envmap', gt_envmap)
    tf.summary.image('envmap_alpha', gt_envmap[Ellipsis, -1:])
    for i in range(len(gt_shells)):
      i_envmap = pj.over_composite(gt_shells[i])
      tf.summary.image('envmap_level_' + str(i), i_envmap)
  with tf.name_scope('envmap_prenet'):
    tf.summary.image('envmap', prenet_envmap)
    tf.summary.image('envmap_alpha', prenet_envmap[Ellipsis, -1:])
    for i in range(len(prenet_shells)):
      i_envmap = pj.over_composite(prenet_shells[i])
      tf.summary.image('envmap_level_' + str(i), i_envmap)
  with tf.name_scope('envmap_output'):
    tf.summary.image('envmap', output_envmap)
    tf.summary.image('envmap_alpha', output_envmap[Ellipsis, -1:])
    for i in range(len(output_shells)):
      i_envmap = pj.over_composite(output_shells[i])
      tf.summary.image('envmap_level_' + str(i), i_envmap)

  tf.summary.scalar('loss_total', total_loss)
  tf.summary.scalar('loss_render', render_loss)
  tf.summary.scalar('loss_envmap', envmap_loss)
  tf.summary.scalar('min_depth', min_depth)
  tf.summary.scalar('max_depth', max_depth)

  with tf.name_scope('level_stats'):
    for i in range(len(lightvols)):
      tf.summary.scalar('cube_side_length_' + str(i),
                        lightvol_side_lengths[i])
      tf.summary.scalar('cube_center_' + str(i), lightvol_centers[i][0, -1])

  # Source images
  # Each source occupies 3 consecutive channels of src_images.
  for i in range(num_source):
    src_image = src_images[:, :, :, i * 3:(i + 1) * 3]
    tf.summary.image('image_src_%d' % i, src_image)
  # Output image
  tf.summary.image('image_output', output_image)
  tf.summary.image('image_output_Gt', output_image_gt)
  # Target image
  tf.summary.image('image_tgt', tgt_image)
  tf.summary.image('envmap_tgt', env_image)
  # Ref image
  tf.summary.image('image_ref', ref_image)
  # Predicted color and alpha layers, and PSV
  num_summ = 8  # number of plane summaries to show in tensorboard
  for i in range(num_summ):
    # Evenly spaced plane indices; num_mpi_planes is the tensor bound in the
    # 'inference' scope above.
    ind = tf.to_int32(i * num_mpi_planes / num_summ)
    rgb = rgba_layers[:, :, :, ind, :3]
    alpha = rgba_layers[:, :, :, ind, -1:]
    ref_plane = psv[:, :, :, ind, :3]
    source_plane = psv[:, :, :, ind, 3:6]
    tf.summary.image('layer_rgb_%d' % i, rgb)
    tf.summary.image('layer_alpha_%d' % i, alpha)
    tf.summary.image('layer_rgba_%d' % i, rgba_layers[:, :, :, ind, :])
    tf.summary.image('psv_avg_%d' % i, 0.5 * ref_plane + 0.5 * source_plane)
    tf.summary.image('psv_ref_%d' % i, ref_plane)
    tf.summary.image('psv_source_%d' % i, source_plane)

  return train_op
def resize_and_crop_image_v2(image,
                             short_side,
                             long_side,
                             padded_size,
                             aug_scale_min=1.0,
                             aug_scale_max=1.0,
                             seed=1,
                             method=tf.image.ResizeMethod.BILINEAR):
    """Resizes, optionally jitters/crops, and pads an image (Faster R-CNN style).

    Preprocessing steps:
    1. Keep the aspect ratio and rescale the image so that its short side
       equals `short_side`.
    2. If the long side of the result of step 1 exceeds `long_side`, rescale
       the original image instead so that its long side equals `long_side`.
    3. If scale jittering is enabled (`aug_scale_min` or `aug_scale_max`
       differs from 1.0), multiply the size from steps 1-2 by a random factor,
       resize to that size and crop a window of (at most) the un-jittered
       size at a random offset.
    4. Pad the result to `padded_size`, keeping the image at the top-left.

    Args:
      image: a `Tensor` of shape [height, width, 3] representing an image.
      short_side: a scalar `Tensor` or `int` representing the desired short
        side to be rescaled to.
      long_side: a scalar `Tensor` or `int` representing the desired long side
        to be rescaled to.
      padded_size: a `Tensor` or `int` list/tuple of two elements representing
        [height, width] of the padded output image size. Padding is applied
        after scaling (and cropping) the image.
      aug_scale_min: a `float` with range between [0, 1.0] representing the
        minimum random scale applied for training scale jittering.
      aug_scale_max: a `float` with range between [1.0, inf] representing the
        maximum random scale applied for training scale jittering.
      seed: seed for the random scale and the random crop offset.
      method: resize method passed to `tf.image.resize_images`.

    Returns:
      output_image: the processed image `Tensor`, padded to
        [padded_size[0], padded_size[1]].
      image_info: a float `Tensor` of shape [4, 2] in the format
        [[original_height, original_width],
         [desired_height, desired_width],
         [y_scale, x_scale],
         [y_offset, x_offset]],
        where [desired_height, desired_width] is the un-jittered scaled size
        from steps 1-2, [y_scale, x_scale] is the ratio of the (jittered)
        scaled size to the original size, and the offset is the top-left
        corner of the crop (zero when jittering is disabled).
    """
    with tf.name_scope('resize_and_crop_image_v2'):
        image_size = tf.cast(tf.shape(image)[0:2], tf.float32)

        scale_using_short_side = (
            short_side / tf.minimum(image_size[0], image_size[1]))
        scale_using_long_side = (
            long_side / tf.maximum(image_size[0], image_size[1]))

        # Prefer the short-side scale; fall back to the long-side scale when
        # the short-side scale would make the long side too large.
        scaled_size = tf.round(image_size * scale_using_short_side)
        scaled_size = tf.where(
            tf.greater(tf.maximum(scaled_size[0], scaled_size[1]), long_side),
            tf.round(image_size * scale_using_long_side),
            scaled_size)
        desired_size = scaled_size

        random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)

        if random_jittering:
            random_scale = tf.random_uniform(
                [], aug_scale_min, aug_scale_max, seed=seed)
            scaled_size = tf.round(random_scale * scaled_size)

        # Computes 2D image_scale.
        image_scale = scaled_size / image_size

        # Selects non-zero random offset (x, y) if scaled image is larger
        # than desired_size.
        if random_jittering:
            max_offset = scaled_size - desired_size
            max_offset = tf.where(
                tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
            offset = max_offset * tf.random_uniform([2], 0, 1, seed=seed)
            offset = tf.cast(offset, tf.int32)
        else:
            offset = tf.zeros((2,), tf.int32)

        scaled_image = tf.image.resize_images(
            image, tf.cast(scaled_size, tf.int32), method=method)

        if random_jittering:
            # `desired_size` is float32 while `offset` is int32; slice bounds
            # must be integers of a single dtype, so cast before adding.
            crop_size = tf.cast(desired_size, tf.int32)
            scaled_image = scaled_image[
                offset[0]:offset[0] + crop_size[0],
                offset[1]:offset[1] + crop_size[1], :]

        output_image = tf.image.pad_to_bounding_box(
            scaled_image, 0, 0, padded_size[0], padded_size[1])

        image_info = tf.stack([
            image_size,
            tf.cast(desired_size, dtype=tf.float32),
            image_scale,
            tf.cast(offset, tf.float32),
        ])
        return output_image, image_info
def _clip_by_global_norm(t_list, clip_norm, use_norm, name=None): """Clips values of multiple tensors by the ratio of the sum of their norms. Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`, this operation returns a list of clipped tensors `list_clipped` and the global norm (`global_norm`) of all tensors in `t_list`. The global norm is expected to be pre-computed and passed as use_norm. To perform the clipping, the values `t_list[i]` are set to: t_list[i] * clip_norm / max(global_norm, clip_norm) where: global_norm = sqrt(sum([l2norm(t)**2 for t in t_list])) If `clip_norm > global_norm` then the entries in `t_list` remain as they are, otherwise they're all shrunk by the global ratio. Any of the entries of `t_list` that are of type `None` are ignored. This is the correct way to perform gradient clipping (for example, see [Pascanu et al., 2012](http://arxiv.org/abs/1211.5063) ([pdf](http://arxiv.org/pdf/1211.5063.pdf))). However, it is slower than `clip_by_norm()` because all the parameters must be ready before the clipping operation can be performed. Args: t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None. clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio. use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global norm to use. If not provided, `global_norm()` is used to compute the norm. name: A name for the operation (optional). Returns: list_clipped: A list of `Tensors` of the same type as `list_t`. global_norm: A 0-D (scalar) `Tensor` representing the global norm. Raises: TypeError: If `t_list` is not a sequence. 
""" if not isinstance(t_list, collections.Sequence) or isinstance( t_list, six.string_types): raise TypeError('t_list should be a sequence') t_list = list(t_list) # Removed as use_norm should always be passed # if use_norm is None: # use_norm = global_norm(t_list, name) with tf.name_scope(name, 'clip_by_global_norm', t_list + [clip_norm]) as name: # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm scale = clip_norm * tf.minimum( 1.0 / use_norm, tf.ones([1], dtype=use_norm.dtype) / clip_norm) values = [ tf.cast( tf.convert_to_tensor( t.values if isinstance(t, tf.IndexedSlices) else t, name='t_%d' % i, ), dtype=tf.float32, ) if t is not None else t for i, t in enumerate(t_list) ] values_clipped = [] for i, v in enumerate(values): if v is None: values_clipped.append(None) else: with tf.colocate_with(v): values_clipped.append( tf.identity(v * scale, name='%s_%d' % (name, i))) list_clipped = [ tf.IndexedSlices(c_v, t.indices, t.dense_shape) if isinstance( t, tf.IndexedSlices) else c_v for (c_v, t) in zip(values_clipped, t_list) ] return list_clipped, use_norm
def predict_lighting_vol(self,
                         mpi,
                         planes,
                         intrinsics,
                         cube_res,
                         scale_factors,
                         depth_clip=20.0):
    """Resample an MPI into a multi-resolution set of lighting cubes.

    One cube is produced for the coarsest level plus one per entry of
    `scale_factors`; every cube is sampled with `pj.mpi_resample_cube`.

    Args:
      mpi: input mpi
      planes: input mpi plane depths
      intrinsics: ref camera intrinsics
      cube_res: resolution of cube volume for lighting prediction
      scale_factors: scales for multiresolution cube sampling
      depth_clip: farthest depth (sets limits of coarsest cube)

    Returns:
      A 5-tuple `(cube_list, cube_centers, cube_side_lengths,
      cube_rel_shapes, cube_nest_inds)`: the resampled cubes, their centers
      and side lengths (one per level, coarsest first), and for every entry
      of `scale_factors` the footprint size and start indices of that cube
      inside the next coarser one.
    """
    batchsize = tf.shape(mpi)[0]
    max_depth = tf.minimum(planes[0], depth_clip)

    # Coarsest cube spans 2 * max_depth; finer cubes shrink by their scale.
    cube_side_lengths = [2.0 * max_depth]
    cube_side_lengths.extend(
        2.0 * max_depth / scale for scale in scale_factors)

    # shape of each cube's footprint within the next coarser volume
    cube_rel_shapes = [
        cube_res // scale if level == 0
        else (cube_res * scale_factors[level - 1]) // scale
        for level, scale in enumerate(scale_factors)
    ]

    # The coarsest cube is centered at the origin; finer cubes are shifted
    # along the last coordinate only.
    cube_centers = [tf.zeros([batchsize, 3])]
    for parent_side, rel_shape in zip(cube_side_lengths, cube_rel_shapes):
        center_depth = (parent_side / (cube_res - 1)) * (rel_shape // 2)
        cube_centers.append(
            tf.concat([
                tf.zeros([batchsize, 2]),
                center_depth * tf.ones([batchsize, 1])
            ], axis=1))

    cube_nest_inds = []
    for level, rel_shape in enumerate(cube_rel_shapes):
        lateral_start = (cube_res - rel_shape) // 2
        if level == 0:
            depth_start = cube_res // 2 - rel_shape
        else:
            depth_start = cube_res - rel_shape
        cube_nest_inds.append([lateral_start, lateral_start, depth_start])

    cube_list = []
    for center, side_length in zip(cube_centers, cube_side_lengths):
        cube, _ = pj.mpi_resample_cube(mpi, center, intrinsics, planes,
                                       side_length, cube_res)
        cube_list.append(cube)

    return (cube_list, cube_centers, cube_side_lengths, cube_rel_shapes,
            cube_nest_inds)
def post_process_gradients(grads_and_vars, summaries, lr, clip_gradients,
                           larc_params):
    """Applies post processing to gradients, i.e. clipping, LARC, summaries.

    Args:
      grads_and_vars: list of `(gradient, variable)` pairs. A gradient may be
        a tensor, `tf.IndexedSlices` or `None`.
      summaries: container of summary tags to emit. Tags read here are
        'global_gradient_norm', 'gradients', 'gradient_norm', 'variables',
        'variable_norm' and 'larc_summaries'.
      lr: learning rate; used by LARC in 'clip' mode and in LARC summaries.
      clip_gradients: `None` to disable clipping, otherwise the value
        forwarded to `_clip_gradients_by_norm`.
      larc_params: `None` to disable LARC, otherwise a dict with required key
        'larc_eta' and optional keys 'larc_mode' ('clip' or 'scale', default
        'clip'), 'min_update' (default 1e-7) and 'epsilon' (default 1e-7).

    Returns:
      The processed list of `(gradient, variable)` pairs.
    """
    if 'global_gradient_norm' in summaries:
        tf.summary.scalar('global_gradient_norm',
                          _global_norm_with_cast(grads_and_vars))

    # Optionally clip gradients by global norm.
    if clip_gradients is not None:
        grads_and_vars = _clip_gradients_by_norm(grads_and_vars,
                                                 clip_gradients)

    # Add histograms for variables, gradients and gradient norms.
    # NOTE(review): the per-variable summaries below are only produced when
    # 'global_gradient_norm' is also requested -- confirm this is intended.
    if 'global_gradient_norm' in summaries:
        for gradient, variable in grads_and_vars:
            if isinstance(gradient, tf.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient

            if isinstance(variable, tf.IndexedSlices):
                var_values = variable.values
            else:
                var_values = variable

            if grad_values is not None:
                var_name = variable.name.replace(':', '_')
                if 'gradients' in summaries:
                    # need to mask nans for automatic loss scaling
                    tf.summary.histogram('gradients/%s' % var_name,
                                         mask_nans(grad_values))
                if 'gradient_norm' in summaries:
                    tf.summary.scalar('gradient_norm/%s' % var_name,
                                      tf.norm(grad_values))
                if 'variables' in summaries:
                    tf.summary.histogram('variables/%s' % var_name,
                                         var_values)
                if 'variable_norm' in summaries:
                    tf.summary.scalar('variable_norm/%s' % var_name,
                                      tf.norm(var_values))

    if clip_gradients is not None and 'global_gradient_norm' in summaries:
        tf.summary.scalar(
            'global_clipped_gradient_norm',
            _global_norm_with_cast(grads_and_vars),
        )

    # LARC gradient re-scaling
    if larc_params is not None:
        check_params(
            config=larc_params,
            required_dict={'larc_eta': float},
            optional_dict={
                'larc_mode': ['clip', 'scale'],
                'min_update': float,
                'epsilon': float,
            },
        )
        larc_eta = larc_params['larc_eta']
        larc_mode = larc_params.get('larc_mode', 'clip')
        min_update = larc_params.get('min_update', 1e-7)
        eps = larc_params.get('epsilon', 1e-7)

        grads_and_vars_larc = [None] * len(grads_and_vars)
        for idx, (g, v) in enumerate(grads_and_vars):
            if g is None:
                # Variables without a gradient cannot be rescaled; pass the
                # pair through untouched, as the summary loop above does.
                grads_and_vars_larc[idx] = (g, v)
                continue

            var_dtype = v.dtype
            v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
            g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

            if larc_mode == 'clip':
                # Local rate relative to the global lr, capped at 1 so LARC
                # can only shrink the update in this mode.
                larc_grad_update = tf.maximum(
                    larc_eta * v_norm / (lr * (g_norm + eps)), min_update)
                if 'larc_summaries' in summaries:
                    tf.summary.scalar(
                        'larc_clip_on/{}'.format(v.name),
                        tf.cast(tf.less(larc_grad_update, 1.0), tf.int32),
                    )
                larc_grad_update = tf.minimum(larc_grad_update, 1.0)
            else:
                larc_grad_update = tf.maximum(
                    larc_eta * v_norm / (g_norm + eps), min_update)

            larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype)
            grads_and_vars_larc[idx] = (larc_grad_update * g, v)

            # adding additional summary
            if 'larc_summaries' in summaries:
                tf.summary.scalar('larc_grad_update/{}'.format(v.name),
                                  larc_grad_update)
                tf.summary.scalar(
                    'larc_final_lr/{}'.format(v.name),
                    tf.cast(lr, var_dtype) * larc_grad_update,
                )
        grads_and_vars = grads_and_vars_larc

    return grads_and_vars
def call(self, x):
    """Builds the detection loss for one output scale.

    Args:
      x: sequence `(input_image, y_pred, y_true, true_boxes)`. `y_pred` is
        reshaped to [batch, grid_h, grid_w, 3, 4 + 1 + nb_class]. In the last
        axis of `y_true`, entries 0:2 are read as box xy, 2:4 as box wh, 4 as
        the object indicator and 5: as class scores.
        NOTE(review): `true_boxes` is divided by the grid size (xy) and the
        network input size (wh) -- confirm its expected units and shape.

    Returns:
      The sum of the xy, wh, confidence and class losses, reduced over axes
      1-4 and multiplied by `self.grid_scale`.
    """
    input_image, y_pred, y_true, true_boxes = x

    def box_iou(xy_a, wh_a, xy_b, wh_b):
        # IoU of boxes given as center (xy) and size (wh); the last axis
        # holds the two coordinates and inputs broadcast against each other.
        half_a = wh_a / 2.
        half_b = wh_b / 2.
        overlap_wh = tf.maximum(
            tf.minimum(xy_a + half_a, xy_b + half_b) -
            tf.maximum(xy_a - half_a, xy_b - half_b), 0.)
        overlap = overlap_wh[..., 0] * overlap_wh[..., 1]
        area_a = wh_a[..., 0] * wh_a[..., 1]
        area_b = wh_b[..., 0] * wh_b[..., 1]
        return tf.truediv(overlap, area_a + area_b - overlap)

    def cell_offsets():
        # Cell grid cropped to the grid size of this batch.
        return self.cell_grid[:, :grid_h, :grid_w, :, :]

    # adjust the shape of the y_predict
    # [batch, grid_h, grid_w, 3, 4+1+nb_class]
    y_pred = tf.reshape(
        y_pred,
        tf.concat([tf.shape(y_pred)[:3], tf.constant([3, -1])], axis=0))

    # initialize the masks
    object_mask = tf.expand_dims(y_true[..., 4], 4)

    # the variable to keep track of number of batches processed
    batch_seen = tf.Variable(0.)

    # compute grid factor and net factor
    grid_h = tf.shape(y_true)[1]
    grid_w = tf.shape(y_true)[2]
    grid_factor = tf.reshape(
        tf.cast([grid_w, grid_h], tf.float32), [1, 1, 1, 1, 2])

    net_h = tf.shape(input_image)[1]
    net_w = tf.shape(input_image)[2]
    net_factor = tf.reshape(
        tf.cast([net_w, net_h], tf.float32), [1, 1, 1, 1, 2])

    # ---- Adjust prediction ----
    pred_box_xy = cell_offsets() + tf.sigmoid(y_pred[..., :2])  # sigma(t_xy) + c_xy
    pred_box_wh = y_pred[..., 2:4]  # t_wh
    pred_box_conf = tf.expand_dims(tf.sigmoid(y_pred[..., 4]), 4)
    pred_box_class = y_pred[..., 5:]

    # ---- Adjust ground truth ----
    true_box_xy = y_true[..., 0:2]  # (sigma(t_xy) + c_xy)
    true_box_wh = y_true[..., 2:4]  # t_wh
    true_box_conf = tf.expand_dims(y_true[..., 4], 4)
    true_box_class = tf.argmax(y_true[..., 5:], -1)

    # ---- Compare each predicted box to all true boxes ----
    # initially, drag all objectness of all boxes to 0
    conf_delta = pred_box_conf - 0

    # then, ignore the boxes which have good overlap with some true box
    best_ious = tf.reduce_max(
        box_iou(
            tf.expand_dims(pred_box_xy / grid_factor, 4),
            tf.expand_dims(
                tf.exp(pred_box_wh) * self.anchors / net_factor, 4),
            true_boxes[..., 0:2] / grid_factor,
            true_boxes[..., 2:4] / net_factor),
        axis=4)
    conf_delta *= tf.expand_dims(
        tf.to_float(best_ious < self.ignore_thresh), 4)

    # ---- Compute some online statistics ----
    iou_scores = box_iou(
        pred_box_xy / grid_factor,
        tf.exp(pred_box_wh) * self.anchors / net_factor,
        true_box_xy / grid_factor,
        tf.exp(true_box_wh) * self.anchors / net_factor)
    iou_scores = object_mask * tf.expand_dims(iou_scores, 4)

    count = tf.reduce_sum(object_mask)
    count_noobj = tf.reduce_sum(1 - object_mask)
    detect_mask = tf.to_float((pred_box_conf * object_mask) >= 0.5)
    class_mask = tf.expand_dims(
        tf.to_float(
            tf.equal(tf.argmax(pred_box_class, -1), true_box_class)), 4)

    def recall_at(iou_threshold):
        hits = tf.to_float(iou_scores >= iou_threshold)
        return tf.reduce_sum(hits * detect_mask * class_mask) / (count + 1e-3)

    recall50 = recall_at(0.5)
    recall75 = recall_at(0.75)
    avg_iou = tf.reduce_sum(iou_scores) / (count + 1e-3)
    avg_obj = tf.reduce_sum(pred_box_conf * object_mask) / (count + 1e-3)
    avg_noobj = tf.reduce_sum(
        pred_box_conf * (1 - object_mask)) / (count_noobj + 1e-3)
    avg_cat = tf.reduce_sum(object_mask * class_mask) / (count + 1e-3)

    # ---- Warm-up training ----
    batch_seen = tf.assign_add(batch_seen, 1.)

    def warmup_targets():
        # During warm-up every cell contributes to the xy/wh loss; cells
        # without an object get the cell position plus 0.5 as xy target.
        return [
            true_box_xy + (0.5 + cell_offsets()) * (1 - object_mask),
            true_box_wh + tf.zeros_like(true_box_wh) * (1 - object_mask),
            tf.ones_like(object_mask)
        ]

    def regular_targets():
        return [true_box_xy, true_box_wh, object_mask]

    true_box_xy, true_box_wh, xywh_mask = tf.cond(
        tf.less(batch_seen, self.warmup_batches + 1),
        warmup_targets,
        regular_targets)

    # ---- Compare each true box to all anchor boxes ----
    wh_scale = tf.exp(true_box_wh) * self.anchors / net_factor
    # the smaller the box, the bigger the scale
    wh_scale = tf.expand_dims(
        2 - wh_scale[..., 0] * wh_scale[..., 1], axis=4)

    xy_delta = (xywh_mask * (pred_box_xy - true_box_xy) * wh_scale *
                self.xywh_scale)
    wh_delta = (xywh_mask * (pred_box_wh - true_box_wh) * wh_scale *
                self.xywh_scale)
    conf_delta = (
        object_mask * (pred_box_conf - true_box_conf) * self.obj_scale +
        (1 - object_mask) * conf_delta * self.noobj_scale)
    class_delta = (
        object_mask *
        tf.expand_dims(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=true_box_class, logits=pred_box_class), 4) *
        self.class_scale)

    non_batch_axes = list(range(1, 5))
    loss_xy = tf.reduce_sum(tf.square(xy_delta), non_batch_axes)
    loss_wh = tf.reduce_sum(tf.square(wh_delta), non_batch_axes)
    loss_conf = tf.reduce_sum(tf.square(conf_delta), non_batch_axes)
    loss_class = tf.reduce_sum(class_delta, non_batch_axes)

    loss = loss_xy + loss_wh + loss_conf + loss_class

    if debug:
        labelled_stats = [
            ('avg_obj \t\t', avg_obj),
            ('avg_noobj \t\t', avg_noobj),
            ('avg_iou \t\t', avg_iou),
            ('avg_cat \t\t', avg_cat),
            ('recall50 \t', recall50),
            ('recall75 \t', recall75),
            ('count \t', count),
        ]
        for label, stat in labelled_stats:
            loss = tf.Print(loss, [grid_h, stat], message=label,
                            summarize=1000)
        loss = tf.Print(
            loss, [
                grid_h,
                tf.reduce_sum(loss_xy),
                tf.reduce_sum(loss_wh),
                tf.reduce_sum(loss_conf),
                tf.reduce_sum(loss_class)
            ],
            message='loss xy, wh, conf, class: \t',
            summarize=1000)

    return loss * self.grid_scale
def minimum(arg1, arg2):
    """Return the result of `tf.minimum` applied to `arg1` and `arg2`."""
    smaller = tf.minimum(arg1, arg2)
    return smaller
def test_min(self):
    """Runs the 'min' conversion check on `tf.minimum` of two placeholders."""
    input_shape = (4, 32, 32, 3)
    lhs = tf.placeholder(shape=input_shape, dtype=tf.float32)
    rhs = tf.placeholder(shape=input_shape, dtype=tf.float32)
    smaller = tf.minimum(lhs, rhs)
    self._test_conversion('min', [lhs, rhs], [smaller])
def call(self, inputs, prev_state):
    """Evaluates one timestep of the current neural stack cell.

    See section 3.4 of Grefenstette et al., 2015.

    Args:
      inputs: The inputs to the neural stack cell should be a tf.float32
        tensor with shape [batch_size, embedding_size]
      prev_state: The NeuralStackState from the previous timestep.

    Returns:
      A tuple of the output of the stack as well as the new NeuralStackState.
    """
    batch_size = tf.shape(inputs)[0]

    # Call the controller and get controller interface values.
    with tf.control_dependencies([prev_state.read_strengths]):
        controller_output = self.call_controller(
            inputs, prev_state.read_values, prev_state.controller_state,
            batch_size)

    # Always write input values to memory regardless of push strength.
    # See Equation-1 in Grefenstette et al., 2015.
    written_values = tf.reduce_sum(
        tf.expand_dims(controller_output.write_values, axis=2) *
        prev_state.write_strengths,
        axis=1)
    new_memory_values = prev_state.memory_values + written_values

    # Attenuate the read strengths of existing memory values depending on
    # the current pop strength. Heads are processed from last to first, each
    # one seeing the strengths left over by the previous iteration.
    # See Equation-2 in Grefenstette et al., 2015.
    new_read_strengths = prev_state.read_strengths
    for h in range(self._num_read_heads - 1, -1, -1):
        head_pop_strength = tf.slice(controller_output.pop_strengths,
                                     [0, h, 0, 0], [-1, 1, -1, -1])
        masked_strength_sum = tf.expand_dims(
            tf.reduce_sum(
                new_read_strengths * self.get_read_mask(h), axis=2),
            axis=3)
        new_read_strengths = tf.nn.relu(
            new_read_strengths -
            tf.nn.relu(head_pop_strength - masked_strength_sum))

    # Combine all write heads and their associated push values into a single
    # set of read weights. `keepdims` replaces the deprecated `keep_dims`
    # spelling of the same argument.
    new_read_strengths += tf.reduce_sum(
        controller_output.push_strengths * prev_state.write_strengths,
        axis=1,
        keepdims=True)

    # Calculate the "top" value of the stack by looking at read strengths.
    # See Equation-3 in Grefenstette et al., 2015.
    all_read_masks = tf.concat(
        [self.get_read_mask(h) for h in range(self._num_read_heads)],
        axis=1)
    remaining_strength = tf.nn.relu(1 - tf.expand_dims(
        tf.reduce_sum(new_read_strengths * all_read_masks, axis=2),
        axis=3))
    new_read_values = tf.reduce_sum(
        tf.minimum(new_read_strengths, remaining_strength) *
        tf.expand_dims(new_memory_values, axis=1),
        axis=2)

    # Temporarily split write strengths apart so they can be shifted in
    # different directions.
    write_strengths_by_head = tf.split(
        prev_state.write_strengths, self._num_write_heads, axis=1)

    # Shift the write strengths for each write head in the direction
    # indicated by get_write_head_offset().
    new_write_strengths = tf.concat([
        tf.roll(write_strength,
                shift=self.get_write_head_offset(h),
                axis=2)
        for h, write_strength in enumerate(write_strengths_by_head)
    ], axis=1)

    return (controller_output.outputs,
            NeuralStackState(controller_state=controller_output.state,
                             read_values=new_read_values,
                             memory_values=new_memory_values,
                             read_strengths=new_read_strengths,
                             write_strengths=new_write_strengths))
def features_to_nonpadding(features, inputs_or_targets="inputs"):
    """See transformer.features_to_nonpadding.

    Returns `None` when `features` is empty/None or has no
    "<inputs_or_targets>_segmentation" entry; otherwise that entry converted
    to float and capped at 1.0.
    """
    segmentation_key = inputs_or_targets + "_segmentation"
    if not features or segmentation_key not in features:
        return None
    segmentation = tf.to_float(features[segmentation_key])
    return tf.minimum(segmentation, 1.0)
def __init__(self,
             session,
             state_spec,
             action_spec,
             hidden_layers,
             learning_rate,
             learning_rate_action,
             learning_rate_ga,
             batch_size,
             action_maximization_iterations,
             name,
             l2_loss_flag=False,
             simple_lambda_flag=True,
             solver=None,
             sufficient_ascent_flag=False,
             initial_lambda=10.0,
             lambda_max=5e3):
    """Creates CAQL networks.

    Builds the placeholders, the Q / lambda / action function networks,
    their losses and Adam train ops, and the dual max-Q tensors.

    Args:
      session: TF session.
      state_spec: tf_agents.specs.array_spec.ArraySpec. Specification for
        state.
      action_spec: tf_agents.specs.array_spec.ArraySpec. Specification for
        action.
      hidden_layers: list of integers. Number of hidden units for each hidden
        layer.
      learning_rate: float on Q function learning rate.
      learning_rate_action: float on action function learning rate.
      learning_rate_ga: float. Learning rate for gradient ascent optimizer.
      batch_size: int on batch size for training.
      action_maximization_iterations: int on CEM/gradient ascent iterations.
      name: string on name of network.
      l2_loss_flag: bool on using l2 loss.
      simple_lambda_flag: bool on using lambda hinge loss.
      solver: string on inner max optimizer. Supported optimizers are
        "gradient_ascent", "cross_entropy", "ails", "mip".
      sufficient_ascent_flag: bool on using sufficient ascent.
      initial_lambda: float on initial lambda (only for simple_lambda_flag).
      lambda_max: float on lambda upper-bound.

    Raises:
      ValueError: if `solver` is "ails" or "mip".
    """
    self._session = session
    self.state_spec = state_spec
    self.action_spec = action_spec
    self.state_dim = state_spec.shape[0]
    self.action_dim = action_spec.shape[0]
    self.action_max = action_spec.maximum
    self.action_min = action_spec.minimum
    self.hidden_layers = hidden_layers
    self.learning_rate = learning_rate
    self.learning_rate_action = learning_rate_action
    self.learning_rate_ga = learning_rate_ga
    self.batch_size = batch_size
    self.action_maximization_iterations = action_maximization_iterations
    self.name = name
    self.lambda_max = lambda_max

    if solver in ("ails", "mip"):
        raise ValueError("AILS and MIP solvers are not supported yet.")

    def float_placeholder(ph_name, shape):
        return tf.placeholder(dtype=tf.float32, name=ph_name, shape=shape)

    def scoped(suffix):
        # Scope names are "<network name>_<suffix>".
        return "{}_{}".format(self.name, suffix)

    state_shape = (None, self.state_dim)
    column_shape = (None, 1)

    # define placeholders
    self._state_tensor = float_placeholder("state_tensor", state_shape)
    self._state_deviation_tensor = float_placeholder(
        "state_deviation_tensor", state_shape)
    self._action_tensor = float_placeholder("action_tensor",
                                            (None, self.action_dim))
    self._next_state_tensor = float_placeholder("next_state_tensor",
                                                state_shape)
    self._reward_tensor = float_placeholder("reward_tensor", column_shape)
    self._done_tensor = tf.placeholder(
        dtype=tf.bool, name="done_tensor", shape=column_shape)
    self._discount_factor = float_placeholder("discounting_factor", ())
    self._maxq_label = float_placeholder("maxq_label", column_shape)

    # One-step backup: r + (1 - done) * discount * max-Q label.
    self._backup_tensor = self._reward_tensor + (
        1.0 - tf.to_float(self._done_tensor)
    ) * self._discount_factor * self._maxq_label

    self._true_label = float_placeholder("true_label", column_shape)

    self.q_function_network = self._build_q_function_net(
        self._state_tensor, self._action_tensor)

    # First-order estimate of Q at the perturbed state: Q plus the per-row
    # inner product of dQ/dstate with the state deviation.
    q_state_gradient = tf.gradients(self.q_function_network,
                                    self._state_tensor)[0]
    self.state_perturbed_q_function_network = (
        self.q_function_network + tf.expand_dims(
            tf.einsum("ij,ij->i", q_state_gradient,
                      self._state_deviation_tensor),
            axis=-1))

    td_target = self._reward_tensor + (
        1.0 - tf.to_float(self._done_tensor)
    ) * self._discount_factor * self._maxq_label
    self._td_rmse = tf.sqrt(
        tf.losses.mean_squared_error(td_target, self.q_function_network))

    if simple_lambda_flag:
        with tf.variable_scope(scoped("lambda_function")):
            lambda_var = tf.Variable(initial_value=initial_lambda,
                                     trainable=True,
                                     name="lambda_var")
            # Single scalar lambda projected onto [0, lambda_max] and
            # repeated for every row of the batch.
            projected_lambda = tf.minimum(lambda_max,
                                          tf.maximum(0.0, lambda_var),
                                          name="lambda_proj")
            self.lambda_function_network = tf.tile(
                tf.reshape(projected_lambda, (-1, 1)),
                (self.batch_size, 1))
    else:
        self.lambda_function_network = self._build_lambda_function_net(
            self._state_tensor, self._action_tensor)

    # define loss
    if l2_loss_flag:
        self._q_function_loss = tf.losses.mean_squared_error(
            self._true_label, self.q_function_network)
    else:
        hinge = tf.maximum(0.0, self._true_label - self.q_function_network)
        self._q_function_loss = tf.reduce_mean(
            self.q_function_network + self.lambda_function_network * hinge)

    self._lambda_function_loss = tf.reduce_mean(
        -self.lambda_function_network *
        (self._true_label - self.q_function_network))

    # Action network to learn argmax of Q
    self._best_q_label = float_placeholder("best_q_label", column_shape)

    # create network placeholders
    self._create_network_var_ph()

    self.action_function_network = self._build_action_function_net(
        self._state_tensor)
    self.dummy_q_function_network = self._build_q_function_net(
        self._state_tensor, self.action_function_network)

    self._action_function_loss = tf.losses.mean_squared_error(
        self._best_q_label, self.dummy_q_function_network)

    # optimizer
    # NOTE: Increment this by one by including it only in main_q trainer.
    # NOTE(review): all three optimizers below use `self.learning_rate`;
    # `learning_rate_action` is stored but not used here -- confirm intent.
    global_step = tf.Variable(0,
                              name="{}_global_step".format(self.name),
                              trainable=False)
    with tf.variable_scope(scoped("optimizer")):
        action_optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate)
        self._action_function_optimizer = action_optimizer.minimize(
            self._action_function_loss,
            var_list=tf.trainable_variables(scoped("action_function")))

        q_optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate)
        self._q_function_optimizer = q_optimizer.minimize(
            self._q_function_loss,
            global_step=global_step,
            var_list=tf.trainable_variables(scoped("q_function")))

        lambda_optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate)
        self._lambda_function_optimizer = lambda_optimizer.minimize(
            self._lambda_function_loss,
            var_list=tf.trainable_variables(scoped("lambda_function")))

    # Tensors for dual solvers
    self._create_dual_maxq_label_tensor()
    self._create_dual_active_constraint_condition_tensor()

    self.solver = solver
    self.sufficient_ascent_flag = sufficient_ascent_flag
def resize_crop_pad(image,
                    desired_output_size,
                    stride,
                    aug_scale_min=1.0,
                    aug_scale_max=1.0,
                    boxes=None,
                    classes=None,
                    attributes=None,
                    masks=None,
                    crop_mask_size=112):
    """Resize, crop and pad images, boxes and masks (RetinaNet style).

    Preprocessing steps:
    1. Keep the aspect ratio and rescale the image to the largest size that
       fits inside `desired_output_size`.
    2. With scale jittering enabled (`aug_scale_min` or `aug_scale_max`
       differs from 1.0), apply an extra random scale and crop a window of
       the step-1 size at a random offset.
    3. Pad the image so that height and width become the smallest multiples
       of `stride` that are larger than or equal to the desired output size.

    Args:
      image: an image tensor of shape [original_height, original_width, 3].
      desired_output_size: a tuple of two integers indicating the desired
        output image size. Note that the actual output size could be
        different from this.
      stride: the stride of the backbone network. Each of the output image
        sides must be the multiple of this.
      aug_scale_min: a `float` with range between [0, 1.0] representing
        minimum random scale applied for training scale jittering.
      aug_scale_max: a `float` with range between [1.0, inf] representing
        maximum random scale applied for training scale jittering.
      boxes: (Optional) a tensor of shape [num_boxes, 4] representing the box
        corners in normalized coordinates. Requires `classes`.
      classes: (Optional) a tensor of shape [num_boxes] representing the box
        classes.
      attributes: (Optional) per-box tensor; when jittering, it is filtered
        with the same indices as `boxes` and `classes`.
      masks: (Optional) a tensor of shape [num_boxes, image_height,
        image_width] representing the instance masks which have the same
        shape as the input image.
      crop_mask_size: an integer indicating the size of the cropped mask.

    Returns:
      image: the processed image tensor after being resized and padded.
      image_info: a tensor of shape [5] holding the scaled height and width
        from step 1, `1 / scale` (including any jitter factor), and the
        original height and width.
      boxes: None or the processed box tensor in absolute coordinates w.r.t.
        the scaled image.
      classes: None or the processed class tensor after boxes being resized
        and filtered.
      attributes: None or the (possibly filtered) attributes tensor.
      masks: None or the processed mask tensor after being resized.
    """
    if boxes is not None:
        assert classes is not None

    def crop_instance_masks(mask_stack, crop_boxes, box_source):
        # Crop each mask to its box and resize it to the square mask size.
        instance_count = tf.shape(box_source)[0]
        crops = tf.image.crop_and_resize(
            image=tf.expand_dims(mask_stack, axis=-1),
            boxes=crop_boxes,
            box_indices=tf.range(instance_count, dtype=tf.int32),
            crop_size=[crop_mask_size, crop_mask_size],
            method='bilinear')
        return tf.squeeze(crops, axis=-1)

    def as_box_row(pair):
        # [a, b] -> [[a, b, a, b]], broadcastable against [num_boxes, 4].
        return tf.tile(tf.expand_dims(pair, axis=0), [1, 2])

    image_shape = tf.shape(image)
    input_height = tf.cast(image_shape[0], dtype=tf.float32)
    input_width = tf.cast(image_shape[1], dtype=tf.float32)
    desired_height, desired_width = desired_output_size

    # Find the scale factor such that the scaled image is surrounded by the
    # rectangle of shape of desired_output_size.
    scale = tf.minimum(desired_height / input_height,
                       desired_width / input_width)
    desired_scaled_size = tf.stack(
        [scale * input_height, scale * input_width], axis=0)

    random_jittering = aug_scale_min != 1.0 or aug_scale_max != 1.0
    if random_jittering:
        random_scale = tf.random_uniform([], aug_scale_min, aug_scale_max)
        scale = random_scale * scale
        scaled_size = tf.round(random_scale * desired_scaled_size)
    else:
        scaled_size = desired_scaled_size
    scaled_size_int = tf.cast(scaled_size, dtype=tf.int32)
    desired_scaled_size_int = tf.cast(desired_scaled_size, dtype=tf.int32)

    image = tf.image.resize_images(
        image, scaled_size_int, method=tf.image.ResizeMethod.BILINEAR)

    if boxes is not None:
        normalized_boxes = boxes
        # Convert the normalized coordinates to the coordinates w.r.t.
        # the scaled image.
        boxes = boxes * as_box_row(scaled_size)

        if masks is not None and not random_jittering:
            masks = crop_instance_masks(masks, normalized_boxes, boxes)

    if random_jittering:
        max_offset = scaled_size - desired_scaled_size
        max_offset = tf.where(
            tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
        offset = tf.cast(
            max_offset * tf.random_uniform((2,), 0, 1), dtype=tf.int32)
        image = image[
            offset[0]:offset[0] + desired_scaled_size_int[0],
            offset[1]:offset[1] + desired_scaled_size_int[1], :]

        if boxes is not None:
            box_offsets = tf.cast(as_box_row(offset), dtype=tf.float32)
            boxes -= box_offsets
            boxes = box_utils.clip_boxes(boxes, desired_scaled_size_int[0],
                                         desired_scaled_size_int[1])

            # Keep only boxes that still have positive extent in both
            # dimensions after clipping to the crop window.
            keep = tf.where(
                tf.logical_and(
                    tf.greater(boxes[:, 2] - boxes[:, 0], 0),
                    tf.greater(boxes[:, 3] - boxes[:, 1], 0)))[:, 0]
            boxes = tf.gather(boxes, keep)
            classes = tf.gather(classes, keep)
            if attributes is not None:
                attributes = tf.gather(attributes, keep)

            if masks is not None:
                masks = tf.gather(masks, keep)
                # Convert the processed boxes back to the normalized
                # coordinates w.r.t. the original image in order to crop and
                # resize the instance masks.
                cropped_boxes = boxes + box_offsets
                cropped_boxes /= as_box_row(scaled_size)
                masks = crop_instance_masks(masks, cropped_boxes, boxes)

    # Pad image such that its height and width are the closest multiple of
    # stride.
    padded_height = int(math.ceil(desired_height * 1.0 / stride) * stride)
    padded_width = int(math.ceil(desired_width * 1.0 / stride) * stride)
    image = tf.image.pad_to_bounding_box(image, 0, 0, padded_height,
                                         padded_width)
    image.set_shape([padded_height, padded_width, 3])

    # desired_scaled_size is the actual image size. Pixels beyond this are
    # from padding.
    image_info = tf.stack([
        desired_scaled_size[0],
        desired_scaled_size[1],
        1.0 / scale,
        input_height,
        input_width,
    ])
    return image, image_info, boxes, classes, attributes, masks
def _create_dual_maxq_label_tensor(self, method="duality_based"):
    """Build the dual approximation of the max-Q label.

    Walks ``self._vars_tf`` in order, treating even positions as
    multiplicative weights and odd positions as additive weights, and looks
    up the placeholder of each one in ``self._dummy_network_var_ph``. The
    first weight is split at ``self.state_dim`` along its first axis: the
    leading slice is multiplied with ``self._next_state_tensor`` and added to
    the first (tiled) bias, the remaining slice is kept as the first weight.
    The resulting tensor is stored in ``self.dual_maxq_tensor``.

    Args:
      method: "duality_based" uses ``dual_method.create_dual_approx``, "ibp"
        uses ``dual_ibp_method.create_dual_ibp_approx``; any other value
        builds both and keeps their minimum.
    """
    batch_tiling = [self.batch_size, 1]
    weights = []
    biases = []
    layer_count = 1

    for position, variable in enumerate(self._vars_tf):
        placeholder = self._dummy_network_var_ph["{}_ph".format(
            variable.name)]

        if position % 2:
            # Odd position: additive weights, repeated for each batch entry.
            tiled_bias = tf.tile(
                tf.expand_dims(placeholder, axis=0), batch_tiling)
            if position == 1:
                # Fold the state part of the first layer into its bias.
                tiled_bias = tiled_bias + tf.matmul(
                    self._next_state_tensor, state_rows)
            biases.append(tiled_bias)
            continue

        # Even position: multiplicative weights.
        if position == 0:
            state_rows = placeholder[:self.state_dim, :]
            weights.append(placeholder[self.state_dim:, :])
        else:
            weights.append(placeholder)
        layer_count += 1

    center = tf.zeros(shape=[self.batch_size, self.action_dim])
    norm_bound = np.max(self.action_max)
    solver_args = (layer_count, self.batch_size, norm_bound, weights, biases,
                   center)

    if method == "duality_based":
        self.dual_maxq_tensor = dual_method.create_dual_approx(*solver_args)
        return

    if method == "ibp":
        # ibp dual solver
        self.dual_maxq_tensor = dual_ibp_method.create_dual_ibp_approx(
            *solver_args)
        return

    # Mixed method: minimum of the two upper bounds.
    duality_bound = dual_method.create_dual_approx(*solver_args)
    ibp_bound = dual_ibp_method.create_dual_ibp_approx(*solver_args)
    self.dual_maxq_tensor = tf.minimum(duality_bound, ibp_bound)
def get(self):
    """Provide input data to the graph.

    Reads fixed-length records from ``self.path_to_db``, decodes the keypoint
    annotations and the image of each record, derives the dependent items
    (root-relative / normalized / local / canonical coordinates, the optional
    hand crop with adapted camera matrix, and the keypoint score maps) and
    batches everything.

    Returns:
        dict mapping item names (e.g. 'image', 'keypoint_uv21', 'scoremap')
        to the batched tensors returned by ``tf.train.shuffle_batch_join``
        (when ``self.shuffle`` is set) or ``tf.train.batch_join``.
    """
    # Size of each record: three groups of 4-byte values (xyz, uv and
    # visibility per keypoint) followed by the image with 3 bytes per pixel.
    encoding_bytes = 4
    kp_xyz_entries = 3 * self.num_kp
    kp_uv_entries = 2 * self.num_kp
    kp_vis_entries = self.num_kp
    image_bytes = self.image_size[0] * self.image_size[1] * 3
    record_bytes = (
        encoding_bytes * (kp_xyz_entries + kp_uv_entries + kp_vis_entries)
        + image_bytes)

    # ---- READ DATA ITEMS ----
    # Start reader
    reader = tf.FixedLengthRecordReader(header_bytes=0,
                                        record_bytes=record_bytes)
    _, value = reader.read(tf.train.string_input_producer([self.path_to_db]))

    # decode to floats
    bytes_read = 0
    data_dict = dict()
    record_bytes_float32 = tf.decode_raw(value, tf.float32)

    # 1. Read keypoint xyz
    keypoint_xyz21 = tf.reshape(
        tf.slice(record_bytes_float32, [bytes_read // 4], [kp_xyz_entries]),
        [self.num_kp, 3])
    bytes_read += encoding_bytes * kp_xyz_entries
    keypoint_xyz21 /= 1000.0  # scale to meters
    keypoint_xyz21 = self.convert_kp(keypoint_xyz21)

    # calculate wrist coord: keypoint 0 is replaced by the point obtained by
    # extrapolating from keypoint 16 through keypoint 0.
    if self.use_wrist_coord:
        wrist_xyz = keypoint_xyz21[16, :] + 2.0 * (
            keypoint_xyz21[0, :] - keypoint_xyz21[16, :])
        keypoint_xyz21 = tf.concat(
            [tf.expand_dims(wrist_xyz, 0), keypoint_xyz21[1:, :]], 0)

    data_dict['keypoint_xyz21'] = keypoint_xyz21

    # 2. Read keypoint uv AND VIS (read together as one [num_kp, 3] block)
    keypoint_uv_vis21 = tf.reshape(
        tf.slice(record_bytes_float32, [bytes_read // 4],
                 [kp_uv_entries + kp_vis_entries]),
        [self.num_kp, 3])
    bytes_read += encoding_bytes * (kp_uv_entries + kp_vis_entries)
    keypoint_uv_vis21 = self.convert_kp(keypoint_uv_vis21)
    keypoint_uv21 = keypoint_uv_vis21[:, :2]
    keypoint_vis21 = tf.equal(keypoint_uv_vis21[:, 2], 1.0)

    # calculate wrist vis / uv, same construction as for xyz above
    if self.use_wrist_coord:
        wrist_vis = tf.logical_or(keypoint_vis21[16], keypoint_vis21[0])
        keypoint_vis21 = tf.concat(
            [tf.expand_dims(wrist_vis, 0), keypoint_vis21[1:]], 0)

        wrist_uv = keypoint_uv21[16, :] + 2.0 * (
            keypoint_uv21[0, :] - keypoint_uv21[16, :])
        keypoint_uv21 = tf.concat(
            [tf.expand_dims(wrist_uv, 0), keypoint_uv21[1:, :]], 0)

    data_dict['keypoint_vis21'] = keypoint_vis21

    if self.coord_uv_noise:
        # The noise must have the same shape as the coordinates it is added
        # to; a hard-coded row count only works for one keypoint layout.
        noise = tf.truncated_normal(tf.shape(keypoint_uv21), mean=0.0,
                                    stddev=self.coord_uv_noise_sigma)
        keypoint_uv21 += noise

    data_dict['keypoint_uv21'] = keypoint_uv21

    # decode to uint8
    record_bytes_uint8 = tf.decode_raw(value, tf.uint8)

    # 4. Read image
    image = tf.reshape(
        tf.slice(record_bytes_uint8, [bytes_read], [image_bytes]),
        [self.image_size[0], self.image_size[1], 3])
    image = tf.cast(image, tf.float32)
    bytes_read += image_bytes

    # rescale to [-0.5, 0.5]
    image = image / 255.0 - 0.5
    if self.hue_aug:
        image = tf.image.random_hue(image, self.hue_aug_max)
    data_dict['image'] = image

    # ---- CONSTANTS ----
    # Camera intrinsics
    sx = 822.79041
    sy = 822.79041
    tx = 318.47345
    ty = 250.31296
    data_dict['cam_mat'] = tf.constant([[sx, 0.0, tx],
                                        [0.0, sy, ty],
                                        [0.0, 0.0, 1.0]])

    # Hand side: this dataset only contains left hands
    data_dict['hand_side'] = tf.one_hot(tf.constant(0, dtype=tf.int32),
                                        depth=2, on_value=1.0, off_value=0.0,
                                        dtype=tf.float32)

    # Sanity check of the bookkeeping above (plain Python ints).
    assert bytes_read == record_bytes, "Doesnt add up."

    # ---- DEPENDENT DATA ITEMS: XYZ representations ----
    # make coords relative to root joint (keypoint 0)
    kp_coord_xyz_root = keypoint_xyz21[0, :]
    kp_coord_xyz21_rel = keypoint_xyz21 - kp_coord_xyz_root
    # normalize by the distance between keypoints 12 and 11
    index_root_bone_length = tf.sqrt(tf.reduce_sum(tf.square(
        kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :])))
    data_dict['keypoint_scale'] = index_root_bone_length
    data_dict['keypoint_xyz21_normed'] = (
        kp_coord_xyz21_rel / index_root_bone_length)

    # calculate local coordinates
    kp_coord_xyz21_local = bone_rel_trafo(data_dict['keypoint_xyz21_normed'])
    kp_coord_xyz21_local = tf.squeeze(kp_coord_xyz21_local)
    data_dict['keypoint_xyz21_local'] = kp_coord_xyz21_local

    # calculate viewpoint and coords in canonical coordinates
    kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(
        data_dict['keypoint_xyz21_normed'])
    kp_coord_xyz21_rel_can = tf.squeeze(kp_coord_xyz21_rel_can)
    rot_mat = tf.squeeze(rot_mat)
    data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
    data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

    # ---- DEPENDENT DATA ITEMS: HAND CROP ----
    if self.hand_crop:
        # Crop is centered on keypoint 12, stored in (v, u) order.
        crop_center = keypoint_uv21[12, ::-1]

        # catch problem, when no valid kp available (happens almost never)
        crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)),
                              lambda: crop_center,
                              lambda: tf.constant([0.0, 0.0]))
        crop_center.set_shape([2, ])

        if self.crop_center_noise:
            noise = tf.truncated_normal([2], mean=0.0,
                                        stddev=self.crop_center_noise_sigma)
            crop_center += noise

        crop_scale_noise = tf.constant(1.0)
        if self.crop_scale_noise:
            crop_scale_noise = tf.squeeze(
                tf.random_uniform([1], minval=1.0, maxval=1.2))

        # The crop extent is always measured with the wrist construction
        # applied; this only changes the local tensor, not data_dict.
        if not self.use_wrist_coord:
            wrist_uv = keypoint_uv21[16, :] + 2.0 * (
                keypoint_uv21[0, :] - keypoint_uv21[16, :])
            keypoint_uv21 = tf.concat(
                [tf.expand_dims(wrist_uv, 0), keypoint_uv21[1:, :]], 0)

        # select visible coords only
        kp_coord_h = tf.boolean_mask(keypoint_uv21[:, 1], keypoint_vis21)
        kp_coord_w = tf.boolean_mask(keypoint_uv21[:, 0], keypoint_vis21)
        kp_coord_hw = tf.stack([kp_coord_h, kp_coord_w], 1)

        # determine size of crop (measure spatial extent of hw coords first)
        min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
        max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0),
                               self.image_size)

        # find out larger distance wrt the center of crop
        crop_size_best = 2 * tf.maximum(max_coord - crop_center,
                                        crop_center - min_coord)
        crop_size_best = tf.reduce_max(crop_size_best)
        crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0), 500.0)

        # catch problem, when no valid kp available
        crop_size_best = tf.cond(tf.reduce_all(tf.is_finite(crop_size_best)),
                                 lambda: crop_size_best,
                                 lambda: tf.constant(200.0))
        crop_size_best.set_shape([])

        # calculate necessary scaling
        scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
        scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
        scale *= crop_scale_noise
        data_dict['crop_scale'] = scale

        if self.crop_offset_noise:
            noise = tf.truncated_normal([2], mean=0.0,
                                        stddev=self.crop_offset_noise_sigma)
            crop_center += noise

        # Crop image
        img_crop = crop_image_from_xy(tf.expand_dims(image, 0), crop_center,
                                      self.crop_size, scale)
        data_dict['image_crop'] = tf.squeeze(img_crop)

        # Modify uv21 coordinates so they refer to the crop
        crop_center_float = tf.cast(crop_center, tf.float32)
        keypoint_uv21_u = (
            (data_dict['keypoint_uv21'][:, 0] - crop_center_float[1]) * scale
            + self.crop_size // 2)
        keypoint_uv21_v = (
            (data_dict['keypoint_uv21'][:, 1] - crop_center_float[0]) * scale
            + self.crop_size // 2)
        keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
        data_dict['keypoint_uv21'] = keypoint_uv21

        # Modify camera intrinsics: scale first, then shift by the crop origin
        stitch_indices = [[0], [1], [2], [3], [4], [5], [6], [7], [8]]
        scale = tf.reshape(scale, [1, ])
        scale_matrix = tf.dynamic_stitch(
            stitch_indices,
            [scale, [0.0], [0.0],
             [0.0], scale, [0.0],
             [0.0], [0.0], [1.0]])
        scale_matrix = tf.reshape(scale_matrix, [3, 3])

        trans1 = crop_center_float[0] * scale - self.crop_size // 2
        trans2 = crop_center_float[1] * scale - self.crop_size // 2
        trans1 = tf.reshape(trans1, [1, ])
        trans2 = tf.reshape(trans2, [1, ])
        trans_matrix = tf.dynamic_stitch(
            stitch_indices,
            [[1.0], [0.0], -trans2,
             [0.0], [1.0], -trans1,
             [0.0], [0.0], [1.0]])
        trans_matrix = tf.reshape(trans_matrix, [3, 3])

        data_dict['cam_mat'] = tf.matmul(
            trans_matrix, tf.matmul(scale_matrix, data_dict['cam_mat']))

    # ---- DEPENDENT DATA ITEMS: Scoremap from the SUBSET of 21 keypoints ----
    # create scoremaps from the subset of 2D annotation
    keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1)

    scoremap_size = self.image_size
    if self.hand_crop:
        scoremap_size = (self.crop_size, self.crop_size)

    scoremap = self.create_multiple_gaussian_map(keypoint_hw21,
                                                 scoremap_size,
                                                 self.sigma,
                                                 valid_vec=keypoint_vis21)

    if self.scoremap_dropout:
        # Drops whole keypoint channels, then undoes the rescaling that
        # tf.nn.dropout applies to the kept ones.
        scoremap = tf.nn.dropout(scoremap, self.scoremap_dropout_prob,
                                 noise_shape=[1, 1, 21])
        scoremap *= self.scoremap_dropout_prob

    data_dict['scoremap'] = scoremap

    if self.random_crop_to_size:
        # NOTE(review): 'hand_parts' and 'hand_mask' are not written to
        # data_dict anywhere in this method, so this branch looks like it
        # raises KeyError as written -- confirm whether it is still needed.
        tensor_stack = tf.concat(
            [data_dict['image'],
             tf.expand_dims(tf.cast(data_dict['hand_parts'], tf.float32), -1),
             tf.cast(data_dict['hand_mask'], tf.float32)], 2)
        s = tensor_stack.get_shape().as_list()
        tensor_stack_cropped = tf.random_crop(
            tensor_stack,
            [self.random_crop_size, self.random_crop_size, s[2]])
        # delete everything else because the random cropping makes the data
        # invalid anyway
        data_dict = dict()
        data_dict['image'] = tensor_stack_cropped[:, :, :3]
        data_dict['hand_parts'] = tf.cast(tensor_stack_cropped[:, :, 3],
                                          tf.int32)
        data_dict['hand_mask'] = tf.cast(tensor_stack_cropped[:, :, 4:],
                                         tf.int32)

    names, tensors = zip(*data_dict.items())

    if self.shuffle:
        tensors = tf.train.shuffle_batch_join([tensors],
                                              batch_size=self.batch_size,
                                              capacity=100,
                                              min_after_dequeue=50,
                                              enqueue_many=False)
    else:
        tensors = tf.train.batch_join([tensors],
                                      batch_size=self.batch_size,
                                      capacity=100,
                                      enqueue_many=False)

    return dict(zip(names, tensors))
def vae_model_fn(features, labels, mode, params):
    """Model function that trains or evaluates a ``DiscreteVAE``.

    Builds the model from ``params``, runs its forward pass on ``features``
    to obtain the loss and the reconstruction, creates an Adam training op
    wrapped in a cross-shard optimizer, and returns a ``TPUEstimatorSpec``
    carrying a host call (training only) and an evaluation metric function
    that write image summaries to ``params['model_path']``.

    Args:
        features: input batch handed to ``model.forward``.
        labels: not used by this function.
        mode: a ``tf.estimator.ModeKeys`` value; PREDICT is not supported.
        params: dict of hyperparameters and paths.

    Returns:
        A ``tpu_estimator.TPUEstimatorSpec``.

    Raises:
        NotImplementedError: if ``mode`` is PREDICT.
    """
    mode_keys = tf.estimator.ModeKeys
    image_size = params["dataset"]["image_size"]  # TODO: check equal
    mode_str = mode_to_str(mode)
    # Not used further down; kept so that a missing key still fails here.
    batch_size = params[f"{mode_str}_batch_size"]
    use_bf16 = params.get("use_bf16", False)

    model = DiscreteVAE(
        num_tokens=params["num_tokens"],
        dim=params["n_embd"],
        hidden_dim=params["hidden_dim"],
        input_channels=params.get("input_channels", 3),
        convblocks=params.get("convblocks", [(3, 64), (3, 128), (3, 256)]),
        recompute_grad=params.get("recompute_grad", False),
        use_bf16=use_bf16,
        stack_factor=params.get("stack_factor", 1),
        dimensions=image_size)

    if mode == mode_keys.PREDICT:
        raise NotImplementedError

    train_gumbel = params.get("train_gumbel_hard", True)
    eval_gumbel = params.get("eval_gumbel_hard", True)

    # We're not predicting, so we better be training or evaluating
    assert mode in (mode_keys.TRAIN, mode_keys.EVAL)

    if mode == mode_keys.TRAIN:
        gumbel = train_gumbel
    else:
        gumbel = eval_gumbel

    # Temperature: linear interpolation from temp_start to temp over
    # temp_anneal_steps when annealing is configured, constant otherwise.
    if params.get("temp_anneal_steps", None):
        anneal_frac = (tf.cast(tf.train.get_global_step(), tf.float32)
                       / params["temp_anneal_steps"])
        anneal_frac = tf.minimum(anneal_frac, tf.constant(1.0))
        temp = params["temp_start"] - anneal_frac * (
            params["temp_start"] - params["temp"])
    else:
        temp = params.get("temp", 1.0)

    def run_forward():
        with tf.variable_scope("vae"):
            return model.forward(features,
                                 return_recon_loss=True,
                                 temperature=temp,
                                 hard_gumbel=gumbel)

    # TODO: add back in microbatching
    if use_bf16:
        with tf.tpu.bfloat16_scope():
            loss, reconstruction = run_forward()
            loss = tf.cast(loss, tf.float32)
            reconstruction = tf.cast(reconstruction, tf.float32)
    else:
        loss, reconstruction = run_forward()

    optimizer = tf.tpu.CrossShardOptimizer(
        tf.train.AdamOptimizer(learning_rate=params["lr"]))

    global_step = tf.train.get_or_create_global_step()
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_op = optimizer.minimize(loss, global_step)

    def to_unit_range(x):
        return (x + 1) / 2

    def host_call_fn(gs, loss, images, reconstruction):
        step = gs[0]
        mean_loss = tf.math.reduce_mean(loss)
        writer = tf2.summary.create_file_writer(params['model_path'])
        with writer.as_default():
            tf2.summary.scalar('loss', mean_loss, step=step)
            tf2.summary.image('input_image', to_unit_range(images),
                              step=step)
            tf2.summary.image('reconstruction_image',
                              to_unit_range(reconstruction), step=step)
            return tf.summary.all_v2_summary_ops()

    def metric_fn(gs, loss, images, reconstruction):
        step = gs[0]
        mean_loss = tf.math.reduce_mean(loss)
        writer = tf2.summary.create_file_writer(params['model_path'])
        with writer.as_default():
            loss_op = tf.metrics.mean(mean_loss)

            with tf2.summary.record_if(loss_op[0] < tf.constant(1e-9)):
                tf2.summary.image('eval/input_image', to_unit_range(images),
                                  step=step)
                tf2.summary.image('eval/reconstruction_image',
                                  to_unit_range(reconstruction), step=step)

            # The dummy metric's update op depends on the summary ops.
            with tf.control_dependencies(tf.summary.all_v2_summary_ops()):
                dummy_op = tf.no_op()

            return {"_loss": loss_op,
                    "zzz_dummy": (tf.constant(0), dummy_op)}

    # To log the loss, current learning rate, and epoch for Tensorboard, the
    # summary op needs to be run on the host CPU via host_call. host_call
    # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
    # dimension. These Tensors are implicitly concatenated to
    # [params['batch_size']].
    host_args = [
        tf.reshape(global_step, [1]),
        tf.reshape(loss, [1]),
        features,
        reconstruction,
    ]
    host_call = (host_call_fn, host_args)
    metric = (metric_fn, list(host_args))

    return tpu_estimator.TPUEstimatorSpec(
        mode,
        loss=loss,
        host_call=host_call if mode == mode_keys.TRAIN else None,
        train_op=train_op,
        eval_metrics=metric)
def _get_classification_outputs(
    config,
    is_training,
    output_layer,
    output_layer_aggregation,
    label_ids,
    input_mask,
    table_mask,
    aggregation_function_id,
    answer,
    numeric_values,
    numeric_values_scale,
    row_ids,
    column_ids,
    classification_class_index,
):
    """Creates a classification model.

    Computes per-token cell-selection logits (optionally per-column logits),
    aggregation and classification logits, and sums the losses enabled by
    `config` into a single total loss.

    Args:
      config: Configuration for Tapas model.
      is_training: Whether the model is training.
      output_layer: float32 [batch_size, seq_length, hidden_size]
      output_layer_aggregation: float32 [batch_size, hidden_size]
      label_ids: int32 [batch_size, seq_length]
      input_mask: int32 [batch_size, seq_length]
      table_mask: int32 [batch_size, seq_length]
      aggregation_function_id: int32 [batch_size]
      answer: float32 [batch_size]
      numeric_values: float32 [batch_size, seq_length]
      numeric_values_scale: float32 [batch_size, seq_length]
      row_ids: int32 [batch_size, seq_length]
      column_ids: int32 [batch_size, seq_length]
      classification_class_index: int32 [batch]

    Returns:
      Outputs with `total_loss`, `logits`, `probs`, `logits_aggregation`,
      `logits_cls`, `span_indexes` and `span_logits`.
    """
    if is_training:
        # I.e., 0.1 dropout
        output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    # Construct indices for the table. Ids beyond the configured maximum are
    # clamped to the last row / column.
    row_index = segmented_tensor.IndexMap(
        indices=tf.minimum(row_ids, config.max_num_rows - 1),
        num_segments=config.max_num_rows,
        batch_dims=1)
    col_index = segmented_tensor.IndexMap(
        indices=tf.minimum(column_ids, config.max_num_columns - 1),
        num_segments=config.max_num_columns,
        batch_dims=1)
    cell_index = segmented_tensor.ProductIndexMap(row_index, col_index)

    # Masks.
    # float32 [batch_size, seq_length]
    input_mask_float = tf.cast(input_mask, tf.float32)
    table_mask_float = tf.cast(table_mask, tf.float32)
    # Mask for cells that exist in the table (i.e. that are not padding).
    cell_mask, _ = segmented_tensor.reduce_mean(input_mask_float, cell_index)

    # Compute logits per token. These are used to select individual cells.
    logits = compute_token_logits(
        output_layer=output_layer,
        temperature=config.temperature,
        init_cell_selection_weights_to_zero=(
            config.init_cell_selection_weights_to_zero))

    # Compute logits per column. These are used to select a column.
    if config.select_one_column:
        column_logits = utils.compute_column_logits(
            output_layer=output_layer,
            cell_index=cell_index,
            cell_mask=cell_mask,
            init_cell_selection_weights_to_zero=(
                config.init_cell_selection_weights_to_zero),
            allow_empty_column_selection=config.allow_empty_column_selection)

    # TODO(pawelnow): Extract this into a function.
    # Compute aggregation function logits.
    do_model_aggregation = config.num_aggregation_labels > 0
    if do_model_aggregation:
        hidden_size_agg = output_layer_aggregation.shape[-1].value
        output_weights_agg = tf.get_variable(
            "output_weights_agg",
            shape=[config.num_aggregation_labels, hidden_size_agg],
            initializer=_classification_initializer())
        output_bias_agg = tf.get_variable(
            "output_bias_agg",
            shape=[config.num_aggregation_labels],
            initializer=tf.zeros_initializer())

    do_model_classification = config.num_classification_labels > 0
    logits_cls = None
    if do_model_classification:
        logits_cls = compute_classification_logits(
            config.num_classification_labels, output_layer_aggregation)

    with tf.variable_scope("loss"):
        total_loss = 0.0
        is_supervised = (not do_model_aggregation
                         or not config.use_answer_as_supervision)

        ### Semi-supervised cell selection in case of no aggregation
        #############################################################

        # If the answer (the denotation) appears directly in the table we
        # might select the answer without applying any aggregation function.
        # There are some ambiguous cases, see _calculate_aggregate_mask for
        # more info. `aggregate_mask` is 1 for examples where we chose to
        # aggregate and 0 for examples where we chose to select the answer
        # directly. `label_ids` encodes the positions of the answer appearing
        # in the table.
        if is_supervised:
            aggregate_mask = None
        else:
            # float32 [batch_size]
            aggregate_mask = _calculate_aggregate_mask(
                answer=answer,
                output_layer_aggregation=output_layer_aggregation,
                output_bias_agg=output_bias_agg,
                output_weights_agg=output_weights_agg,
                cell_select_pref=config.cell_select_pref,
                label_ids=label_ids)

        ### Cell selection log-likelihood
        ###################################

        if config.average_logits_per_cell:
            logits_per_cell, _ = segmented_tensor.reduce_mean(
                logits, cell_index)
            logits = segmented_tensor.gather(logits_per_cell, cell_index)
        dist_per_token = tfp.distributions.Bernoulli(logits=logits)

        if not config.select_one_column:
            # Use tf.equal rather than `==`: when Tensor equality overloading
            # is disabled, `label_ids == 0` is a plain Python bool and every
            # token would silently receive `positive_weight`.
            unit_weight = tf.ones_like(label_ids, dtype=tf.float32)
            weight = tf.where(
                tf.equal(label_ids, 0),
                unit_weight,
                config.positive_weight * unit_weight)
            selection_loss_per_token = (
                -dist_per_token.log_prob(label_ids) * weight)
            # Mean over the tokens selected by the input mask.
            selection_loss_per_example = (
                tf.reduce_sum(selection_loss_per_token * input_mask_float,
                              axis=1)
                / (tf.reduce_sum(input_mask_float, axis=1)
                   + _EPSILON_ZERO_DIVISION))
        else:
            selection_loss_per_example, logits = (
                _single_column_cell_selection_loss(
                    token_logits=logits,
                    column_logits=column_logits,
                    label_ids=label_ids,
                    cell_index=cell_index,
                    col_index=col_index,
                    cell_mask=cell_mask))
            dist_per_token = tfp.distributions.Bernoulli(logits=logits)

        ### Logits for the aggregation function
        #########################################

        logits_aggregation = None
        if do_model_aggregation:
            logits_aggregation = _calculate_aggregation_logits(
                output_layer_aggregation, output_weights_agg, output_bias_agg)

        ### Classification loss
        ###############################

        if do_model_classification:
            one_hot_labels = tf.one_hot(
                classification_class_index,
                depth=config.num_classification_labels,
                dtype=tf.float32)
            log_probs = tf.nn.log_softmax(logits_cls, axis=-1)
            # float32 [batch_size]
            per_example_classification_intermediate = -tf.reduce_sum(
                one_hot_labels * log_probs, axis=-1)
            cls_loss = tf.reduce_mean(per_example_classification_intermediate)
            total_loss += cls_loss

        ### Supervised cell selection
        ###############################

        span_indexes = None
        span_logits = None
        if config.span_prediction != SpanPredictionMode.NONE:
            (
                span_indexes,
                span_logits,
                span_loss,
            ) = span_prediction_utils.get_span_logits_by_mode(
                config.span_prediction,
                output_layer,
                label_ids,
                column_ids,
                row_ids,
                max_span_length=10,
            )
            total_loss += span_loss
        elif config.disable_per_token_loss:
            pass
        elif is_supervised:
            total_loss += tf.reduce_mean(selection_loss_per_example)
        else:
            # For the not supervised case, do not assign loss for cell
            # selection to examples where we chose to aggregate.
            total_loss += tf.reduce_mean(
                selection_loss_per_example * (1.0 - aggregate_mask))

        ### Semi-supervised regression loss and supervised loss for
        ### aggregations
        #####################################################################

        if do_model_aggregation:
            # Note that `aggregate_mask` is None if the setting is supervised.
            per_example_additional_loss = _calculate_aggregation_loss(
                logits_aggregation, aggregate_mask, aggregation_function_id,
                config)

            if config.use_answer_as_supervision:
                # Add regression loss for numeric answers which require
                # aggregation.
                answer_loss, large_answer_loss_mask = (
                    _calculate_regression_loss(
                        answer, aggregate_mask, dist_per_token,
                        numeric_values, numeric_values_scale,
                        table_mask_float, logits_aggregation, config))
                per_example_additional_loss += answer_loss
                # Zero loss for examples with answer_loss > cutoff.
                per_example_additional_loss *= large_answer_loss_mask

            total_loss += tf.reduce_mean(per_example_additional_loss)

        return Outputs(
            total_loss=total_loss,
            logits=logits,
            probs=_get_probs(dist_per_token) * input_mask_float,
            logits_aggregation=logits_aggregation,
            logits_cls=logits_cls,
            span_indexes=span_indexes,
            span_logits=span_logits,
        )
def _generate_detections_per_image(boxes,
                                   scores,
                                   attributes,
                                   max_total_size=100,
                                   nms_iou_threshold=0.3,
                                   score_threshold=0.05,
                                   pre_nms_num_boxes=5000):
    """Generate the final detections per image given the model outputs.

    Runs padded non-max suppression independently for every class, then
    merges the per-class results and keeps the `max_total_size` best scoring
    detections overall.

    Args:
      boxes: a tensor with shape [N, num_classes, 4] or [N, 1, 4], which
        stacks box predictions on all feature levels. The N is the number of
        total anchors on all levels.
      scores: a tensor with shape [N, num_classes], which stacks class
        scores on all feature levels. The num_classes is the number of
        classes predicted by the model.
      attributes: a tensor with shape [N, num_attributes], which stacks
        attribute probability on all feature levels.
      max_total_size: a scalar representing maximum number of boxes retained
        over all classes.
      nms_iou_threshold: a float representing the threshold for deciding
        whether boxes overlap too much with respect to IOU.
      score_threshold: a float representing the threshold for deciding when
        to remove boxes based on score.
      pre_nms_num_boxes: an int number of top candidate detections per class
        before NMS.

    Returns:
      nmsed_boxes: Tensor of shape [max_total_size, 4] with the top detected
        boxes.
      nmsed_scores: Tensor of shape [max_total_size] with the scores sorted
        in decreasing order; padded entries have score -1.
      nmsed_classes: `int` Tensor of shape [max_total_size] with the class of
        each detection.
      nmsed_attributes: Tensor of shape [max_total_size, num_attributes] with
        the attributes of each detection.
      valid_detections: `int` scalar Tensor, the number of entries whose
        score is greater than -1.
    """
    box_class_count = boxes.get_shape().as_list()[1]
    class_count = scores.get_shape().as_list()[1]

    per_class_boxes = []
    per_class_scores = []
    per_class_ids = []
    per_class_attributes = []

    for class_id in range(class_count):
        # When `boxes` has a single class column it is shared by all classes.
        box_column = min(box_class_count - 1, class_id)
        class_boxes = boxes[:, box_column]
        class_scores = scores[:, class_id]

        # Obtains pre_nms_num_boxes before running NMS.
        candidate_count = tf.minimum(tf.shape(class_scores)[-1],
                                     pre_nms_num_boxes)
        class_scores, candidate_idx = tf.nn.top_k(class_scores,
                                                  k=candidate_count)
        class_boxes = tf.gather(class_boxes, candidate_idx)
        class_attributes = tf.gather(attributes, candidate_idx)

        kept_idx, kept_count = tf.image.non_max_suppression_padded(
            tf.cast(class_boxes, tf.float32),
            tf.cast(class_scores, tf.float32),
            max_total_size,
            iou_threshold=nms_iou_threshold,
            score_threshold=score_threshold,
            pad_to_max_output_size=True,
            name='nms_detections_' + str(class_id))

        kept_boxes = tf.gather(class_boxes, kept_idx)
        kept_scores = tf.gather(class_scores, kept_idx)
        kept_attributes = tf.gather(class_attributes, kept_idx)

        # Sets scores of invalid boxes to -1.
        is_valid = tf.less(tf.range(max_total_size), [kept_count])
        kept_scores = tf.where(is_valid, kept_scores,
                               -tf.ones_like(kept_scores))

        per_class_boxes.append(kept_boxes)
        per_class_scores.append(kept_scores)
        per_class_ids.append(tf.fill([max_total_size], class_id))
        per_class_attributes.append(kept_attributes)

    # Concats results from all classes and sort them.
    all_boxes = tf.concat(per_class_boxes, axis=0)
    all_scores = tf.concat(per_class_scores, axis=0)
    all_classes = tf.concat(per_class_ids, axis=0)
    all_attributes = tf.concat(per_class_attributes, axis=0)

    top_scores, top_idx = tf.nn.top_k(all_scores, k=max_total_size,
                                      sorted=True)
    top_boxes = tf.gather(all_boxes, top_idx)
    top_classes = tf.gather(all_classes, top_idx)
    top_attributes = tf.gather(all_attributes, top_idx)

    valid_detections = tf.reduce_sum(
        tf.cast(tf.greater(top_scores, -1), tf.int32))
    return (top_boxes, top_scores, top_classes, top_attributes,
            valid_detections)
def dot_product_area_attention(q,
                               k,
                               v,
                               bias,
                               dropout_rate=0.0,
                               image_shapes=None,
                               name=None,
                               attention_image_summary=None,
                               save_weights_to=None,
                               dropout_broadcast_dims=None,
                               max_area_width=1,
                               max_area_height=1,
                               memory_height=1,
                               area_key_mode="mean",
                               area_value_mode="sum",
                               top_k_areas=0,
                               area_temperature=1.0,
                               training=True):
    """Dot-product area attention.

    Args:
      q: Tensor with shape [..., length_q, depth_k].
      k: Tensor with shape [..., length_kv, depth_k]. Leading dimensions must
        match with q.
      v: Tensor with shape [..., length_kv, depth_v] Leading dimensions must
        match with q.
      bias: bias Tensor (see attention_bias())
      dropout_rate: a float.
      image_shapes: optional tuple of integer scalars.
        see comments for attention_image_summary()
      name: an optional string
      attention_image_summary: the callback for making image summary of
        attention.
      save_weights_to: an optional dictionary to capture attention weights
        for visualization; the weights are stored under the variable scope
        name and the logits under that name plus "/logits".
      dropout_broadcast_dims: an optional list of integers less than rank of
        q. Specifies in which dimensions to broadcast the dropout decisions.
      max_area_width: the max width allowed for an area.
      max_area_height: the max height allowed for an area.
      memory_height: the height of the memory.
      area_key_mode: the mode for computing area keys, which can be "mean",
        "concat", "sum", "sample_concat", and "sample_sum".
      area_value_mode: the mode for computing area values, which can be
        "mean", "max", or "sum".
      top_k_areas: Use the top key areas for attention.
      area_temperature: the temperature for attention softmax.
      training: indicating if it is in the training mode.

    Returns:
      Tensor with shape [..., length_q, depth_v].

    Raises:
      ValueError: if `area_value_mode` is not one of the supported modes.
    """
    tf.logging.info(
        "dot_product_area_attention: "
        "area_h=%d, area_w=%d, mem_h=%d, "
        "area_key_mode=%s, area_value_mode=%s, "
        "area_temperature=%f", max_area_height, max_area_width, memory_height,
        area_key_mode, area_value_mode, area_temperature)
    with tf.variable_scope(name,
                           default_name="dot_product_area_attention",
                           values=[q, k, v]) as scope:
        mem_shape = common_layers.shape_list(k)
        batch_size = mem_shape[0]
        head_size = mem_shape[1]
        length = mem_shape[2]
        depth = mem_shape[3]
        # Values carry their own feature depth (depth_v), which need not
        # equal the key depth; flattening them with the key depth would
        # fail or scramble the features whenever the two differ.
        value_depth = common_layers.shape_list(v)[-1]

        k_area = compute_area_key(tf.reshape(k, [-1, length, depth]),
                                  max_area_width=max_area_width,
                                  max_area_height=max_area_height,
                                  height=memory_height,
                                  mode=area_key_mode,
                                  training=training)

        if area_value_mode == "mean":
            v_area, _, _, _, _ = compute_area_features(
                tf.reshape(v, [-1, length, value_depth]),
                max_area_width=max_area_width,
                max_area_height=max_area_height,
                height=memory_height)
        elif area_value_mode == "max":
            v_area, _, _ = basic_pool(
                tf.reshape(v, [-1, length, value_depth]),
                max_area_width=max_area_width,
                max_area_height=max_area_height,
                height=memory_height,
                fn=tf.reduce_max)
        elif area_value_mode == "sum":
            _, _, v_area, _, _ = compute_area_features(
                tf.reshape(v, [-1, length, value_depth]),
                max_area_width=max_area_width,
                max_area_height=max_area_height,
                height=memory_height)
        else:
            raise ValueError("Unsupported area value mode=%s" %
                             area_value_mode)

        # Restore the [batch, heads, num_areas, depth] layout.
        k = tf.reshape(k_area, [batch_size, head_size, -1, depth])
        v = tf.reshape(v_area, [batch_size, head_size, -1, value_depth])
        logits = tf.matmul(q, k, transpose_b=True)  # [..., length_q, length_kv]

        if bias is not None:
            bias = common_layers.cast_like(bias, logits)
            with tf.name_scope("compute_area_att_bias", values=[bias]):
                bias_shape = common_layers.shape_list(bias)
                mem_length = bias_shape[-1]
                # 1.0 where the incoming bias is below -1, 0.0 elsewhere.
                bias_values = tf.reshape(tf.to_float(tf.less(bias, -1)),
                                         [-1, mem_length, 1])
                _, _, padding_sum, _, _ = compute_area_features(
                    bias_values,
                    max_area_width=max_area_width,
                    max_area_height=max_area_height,
                    height=memory_height)
                # An area is masked out as soon as its sum is non-zero.
                bias = tf.where(
                    tf.cast(tf.to_int32(padding_sum), tf.bool),
                    tf.fill(tf.shape(padding_sum), -np.inf),
                    tf.zeros_like(padding_sum, dtype=tf.float32))
                bias = tf.reshape(
                    bias, [bias_shape[0], bias_shape[1], bias_shape[2], -1])
            logits += bias

        logits = logits / area_temperature
        weights = tf.nn.softmax(logits, name="attention_weights")

        if top_k_areas > 0:
            tf.logging.info("area_attention top_k_areas=%d", top_k_areas)
            top_k = tf.minimum(
                common_layers.shape_list(weights)[-1], top_k_areas)
            top_weights, _ = tf.nn.top_k(weights, k=top_k)
            # Zero every weight below the k-th largest, then renormalize.
            min_values = tf.reduce_min(top_weights, -1, keepdims=True)
            weights = tf.where(tf.greater_equal(weights, min_values),
                               weights, tf.zeros_like(weights))
            weights = tf.div(weights,
                             tf.reduce_sum(weights, -1, keepdims=True))

        if save_weights_to is not None:
            save_weights_to[scope.name] = weights
            save_weights_to[scope.name + "/logits"] = logits

        # Drop out attention links for each head.
        weights = common_layers.dropout_with_broadcast_dims(
            weights, 1.0 - dropout_rate,
            broadcast_dims=dropout_broadcast_dims)
        if (common_layers.should_generate_summaries()
                and attention_image_summary):
            attention_image_summary(weights, image_shapes)
        return tf.matmul(weights, v)
def _resource_apply_dense(self, grad, var):
    """Build the dense update op for one variable.

    Updates the first and second moment slots ('m', 'v', and 'vhat' when
    `self.amsgrad` is set) from `grad`, scales the bias-corrected first
    moment by the rectification factor `r_t` over the corrected second
    moment once `sma_t` exceeds 5 (otherwise the corrected first moment is
    used alone), optionally adds weight decay, and subtracts the result
    times the learning rate from `var`.

    When `self._initial_total_steps` is positive, the learning rate is
    additionally warmed up linearly over the first `warmup_steps` and then
    decayed linearly to zero over the remaining `decay_steps`.

    Args:
        grad: gradient tensor for `var`.
        var: the resource variable to update.

    Returns:
        An op grouping the variable update and all slot updates.
    """
    var_dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(var_dtype)
    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    beta_1_t = self._get_hyper('beta_1', var_dtype)
    beta_2_t = self._get_hyper('beta_2', var_dtype)
    epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
    local_step = math_ops.cast(self.iterations + 1, var_dtype)
    beta_1_power = math_ops.pow(beta_1_t, local_step)
    beta_2_power = math_ops.pow(beta_2_t, local_step)

    if self._initial_total_steps > 0:
        total_steps = self._get_hyper('total_steps', var_dtype)
        warmup_steps = total_steps * self._get_hyper(
            'warmup_proportion', var_dtype)
        decay_steps = total_steps - warmup_steps
        # Decay progress is counted from the end of warm-up. Counting from
        # step 0 makes the rate jump down at the warm-up boundary and reach
        # zero at `decay_steps` instead of at `total_steps`.
        steps_into_decay = tf.minimum(local_step - warmup_steps, decay_steps)
        lr_t = tf.where(
            local_step <= warmup_steps,
            lr_t * (local_step / warmup_steps),
            lr_t * (1.0 - steps_into_decay / decay_steps),
        )

    # Length of the approximated simple moving average (limit and current).
    sma_inf = 2.0 / (1.0 - beta_2_t) - 1.0
    sma_t = sma_inf - 2.0 * local_step * beta_2_power / (1.0 - beta_2_power)

    m_t = state_ops.assign(m,
                           beta_1_t * m + (1.0 - beta_1_t) * grad,
                           use_locking=self._use_locking)
    m_corr_t = m_t / (1.0 - beta_1_power)

    v_t = state_ops.assign(v,
                           beta_2_t * v
                           + (1.0 - beta_2_t) * math_ops.square(grad),
                           use_locking=self._use_locking)
    if self.amsgrad:
        # Keep the element-wise maximum of all second moments seen so far.
        vhat = self.get_slot(var, 'vhat')
        vhat_t = state_ops.assign(vhat,
                                  math_ops.maximum(vhat, v_t),
                                  use_locking=self._use_locking)
        v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta_2_power) + epsilon_t)
    else:
        v_corr_t = math_ops.sqrt(v_t / (1.0 - beta_2_power) + epsilon_t)

    r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
                        (sma_t - 2.0) / (sma_inf - 2.0) *
                        sma_inf / sma_t)

    # Fall back to the plain corrected first moment while sma_t is small.
    var_t = tf.where(sma_t > 5.0, r_t * m_corr_t / v_corr_t, m_corr_t)

    if self._initial_weight_decay > 0.0:
        var_t += self._get_hyper('weight_decay', var_dtype) * var

    var_update = state_ops.assign_sub(var,
                                      lr_t * var_t,
                                      use_locking=self._use_locking)

    updates = [var_update, m_t, v_t]
    if self.amsgrad:
        updates.append(vhat_t)
    return control_flow_ops.group(*updates)
def _finish(self, state):
    """Build the grouped op that applies one update to every variable.

    For each variable in ``self.vars`` this updates the running maximum
    gradient norm, the sum of squared gradients, the corrected gradient
    estimate, the sum of squared estimates and the stored previous iterate,
    then moves the variable by ``-eta * estimate``.

    Args:
        state: Optimizer state object providing ``get_slot``; also passed
            to ``self._recompute_gradients``.

    Returns:
        A single op grouping every assignment created here.
    """
    pending_ops = []

    # Presumably the gradients re-evaluated at the previous iterate (the
    # original local was named `grads_at_prev_iterate`) -- confirm against
    # `_recompute_gradients`.
    prev_grads = self._recompute_gradients(state)

    name_prefix = self._name
    for var, grad, prev_grad in zip(self.vars, self.grads, prev_grads):
        grad_sq_sum = state.get_slot(var, SUM_GRAD_SQUARED)
        prev_iterate = state.get_slot(var, PREVIOUS_ITERATE)
        max_grad = state.get_slot(var, MAXIMUM_GRADIENT)
        estimate_sq_sum = state.get_slot(var, SUM_ESTIMATES_SQUARED)

        # Running maximum of the gradient norm; used as the clip bound below.
        max_grad_new = tf.assign(
            max_grad, tf.maximum(max_grad, tf.norm(grad)))
        pending_ops.append(max_grad_new)

        grad_sq_sum_new = tf.assign_add(grad_sq_sum,
                                        tf.pow(tf.abs(grad), 2.0))
        pending_ops.append(grad_sq_sum_new)

        # Only reported in summaries; the small constant avoids dividing by
        # zero when the iterate has not moved.
        smoothness = tf.norm(grad - prev_grad) / (
            0.0001 + tf.norm(var - prev_iterate))

        # Step size shrinks with the cube root of the accumulated squared
        # gradients; beta is capped at 1.
        eta = self.lr * tf.pow(self.eta + grad_sq_sum_new, -1.0 / 3.0)
        beta = tf.minimum(1.0, self.momentum * tf.square(eta))

        estimate = state.get_slot(var, GRAD_ESTIMATE)

        # Current gradient plus a damped correction from the old estimate,
        # clipped element-wise to the largest gradient norm seen so far.
        corrected = grad + (1.0 - beta) * (estimate - prev_grad)
        corrected = tf.clip_by_value(corrected, -max_grad_new, max_grad_new)

        if self.output_summaries:
            var_name = var.name
            tf.summary.scalar(name_prefix + "/smoothness/" + var_name,
                              smoothness)
            tf.summary.scalar(name_prefix + "/max_grad/" + var_name,
                              max_grad_new)
            tf.summary.scalar(name_prefix + "/average_beta/" + var_name,
                              tf.reduce_mean(beta))
            tf.summary.scalar(name_prefix + "/iterate_diff/" + var_name,
                              tf.norm(var - prev_iterate))
            tf.summary.scalar(name_prefix + "/grad_diff/" + var_name,
                              tf.norm(grad - prev_grad))
            tf.summary.scalar(
                name_prefix + "/vr_grad_estimate_norm/" + var_name,
                tf.norm(corrected))
            tf.summary.scalar(name_prefix + "/grad_norm/" + var_name,
                              tf.norm(grad))

        estimate_new = tf.assign(estimate, corrected)
        pending_ops.append(estimate_new)

        estimate_sq_sum_new = tf.assign_add(estimate_sq_sum,
                                            tf.square(corrected))
        pending_ops.append(estimate_sq_sum_new)

        # Snapshot the variable only after the previous-iterate gradient has
        # been computed.
        with tf.control_dependencies([prev_grad]):
            prev_iterate_new = tf.assign(prev_iterate, var)
        pending_ops.append(prev_iterate_new)

        delta = -eta * estimate_new

        # Move the variable only after its old value has been snapshotted.
        with tf.control_dependencies([prev_iterate_new]):
            var_new = tf.assign_add(var, delta)
        pending_ops.append(var_new)

    return tf.group(*pending_ops)
def main(argv):
    """Train a Bayesian CNN on CIFAR-10 (or fake data) and print metrics.

    Recreates ``FLAGS.model_dir``, builds the input pipeline and the model
    selected by ``FLAGS.architecture``, minimises the negative ELBO with a
    KL term annealed from 0 to 1, and prints training and held-out metrics
    at fixed step intervals.

    Args:
        argv: Unused command-line arguments.
    """
    del argv  # unused

    model_dir = FLAGS.model_dir
    if tf.io.gfile.exists(model_dir):
        tf.compat.v1.logging.warning(
            "Warning: deleting old log directory at {}".format(model_dir))
        tf.io.gfile.rmtree(model_dir)
    tf.io.gfile.makedirs(model_dir)

    dataset = (build_fake_data() if FLAGS.fake_data
               else tf.keras.datasets.cifar10.load_data())
    (x_train, y_train), (x_test, y_test) = dataset

    (images, labels, handle,
     training_iterator, heldout_iterator) = build_input_pipeline(
         x_train, x_test, y_train, y_test, FLAGS.batch_size, 500)

    build_model = (bayesian_resnet if FLAGS.architecture == "resnet"
                   else bayesian_vgg)
    model = build_model(
        IMAGE_SHAPE,
        num_classes=10,
        kernel_posterior_scale_mean=FLAGS.kernel_posterior_scale_mean,
        kernel_posterior_scale_constraint=(
            FLAGS.kernel_posterior_scale_constraint))
    logits = model(images)
    labels_distribution = tfd.Categorical(logits=logits)

    # KL annealing: `t` counts training steps and the KL weight grows
    # linearly with it. The best number of annealing steps depends on the
    # dataset and the architecture.
    t = tf.compat.v2.Variable(0.0)
    kl_regularizer = t / (FLAGS.kl_annealing * len(x_train) /
                          FLAGS.batch_size)

    # Loss is the negative ELBO; the KL weight is capped at 1 once the
    # epochs given by the kl_annealing flag have elapsed.
    log_likelihood = labels_distribution.log_prob(labels)
    neg_log_likelihood = -tf.reduce_mean(input_tensor=log_likelihood)
    kl = sum(model.losses) / len(x_train) * tf.minimum(1.0, kl_regularizer)
    loss = neg_log_likelihood + kl

    # Evaluation metrics use a single forward pass through the
    # probabilistic layers: cheap, but noisy.
    predictions = tf.argmax(input=logits, axis=1)
    with tf.compat.v1.name_scope("train"):
        train_accuracy, train_accuracy_update_op = (
            tf.compat.v1.metrics.accuracy(labels=labels,
                                          predictions=predictions))
        opt = tf.compat.v1.train.AdamOptimizer(FLAGS.learning_rate)
        train_op = opt.minimize(loss)
        update_step_op = tf.compat.v1.assign(t, t + 1)

    with tf.compat.v1.name_scope("valid"):
        valid_accuracy, valid_accuracy_update_op = (
            tf.compat.v1.metrics.accuracy(labels=labels,
                                          predictions=predictions))

    init_op = tf.group(tf.compat.v1.global_variables_initializer(),
                       tf.compat.v1.local_variables_initializer())

    # Local variables in the "valid/" scope are collected so the validation
    # metric can be reset after each evaluation.
    stream_vars_valid = [
        v for v in tf.compat.v1.local_variables() if "valid/" in v.name
    ]
    reset_valid_op = tf.compat.v1.variables_initializer(stream_vars_valid)

    with tf.compat.v1.Session() as sess:
        sess.run(init_op)

        # Run the training loop.
        train_handle = sess.run(training_iterator.string_handle())
        heldout_handle = sess.run(heldout_iterator.string_handle())
        train_feed = {handle: train_handle}
        heldout_feed = {handle: heldout_handle}

        training_steps = int(
            round(FLAGS.epochs * (len(x_train) / FLAGS.batch_size)))
        for step in range(training_steps):
            sess.run([train_op, train_accuracy_update_op, update_step_op],
                     feed_dict=train_feed)

            # Manually print the frequency.
            if step % 100 == 0:
                loss_value, accuracy_value, kl_value = sess.run(
                    [loss, train_accuracy, kl], feed_dict=train_feed)
                print("Step: {:>3d} Loss: {:.3f} Accuracy: {:.3f} KL: {:.3f}".
                      format(step, loss_value, accuracy_value, kl_value))

            if (step + 1) % FLAGS.eval_freq != 0:
                continue

            # Compute log prob of heldout set by averaging draws from the
            # model:
            #   p(heldout | train) = int_model p(heldout|model) p(model|train)
            #                     ~= 1/n * sum_{i=1}^n p(heldout | model_i)
            # where model_i is a draw from the posterior p(model|train).
            # NOTE(review): every sess.run pulls a fresh batch from the
            # held-out iterator, so this presumably relies on that iterator
            # repeating one fixed batch -- confirm in build_input_pipeline.
            probs = np.asarray([
                sess.run((labels_distribution.probs), feed_dict=heldout_feed)
                for _ in range(FLAGS.num_monte_carlo)
            ])
            mean_probs = np.mean(probs, axis=0)

            _, label_vals = sess.run((images, labels),
                                     feed_dict=heldout_feed)
            row_index = np.arange(mean_probs.shape[0])
            heldout_lp = np.mean(
                np.log(mean_probs[row_index, label_vals.flatten()]))
            print(" ... Held-out nats: {:.3f}".format(heldout_lp))

            # Calculate validation accuracy.
            for _ in range(20):
                sess.run(valid_accuracy_update_op, feed_dict=heldout_feed)
            valid_value = sess.run(valid_accuracy, feed_dict=heldout_feed)
            print(" ... Validation Accuracy: {:.3f}".format(valid_value))

            sess.run(reset_valid_op)
def get_warmed_up_lr(max_lr, warmup, global_step):
    """Return ``max_lr`` scaled by a linear warmup factor.

    Args:
        max_lr: Learning rate reached once warmup has finished.
        warmup: Number of warmup steps; ``0`` disables warmup.
        global_step: Current step; cast to float32 before use.

    Returns:
        ``max_lr`` itself when ``warmup`` is 0, otherwise ``max_lr``
        multiplied by ``min(global_step / warmup, 1.0)``.
    """
    if warmup == 0:
        # No warmup requested: skip building any ops.
        return max_lr

    step_as_float = tf.cast(global_step, tf.float32)
    warmup_fraction = tf.minimum(step_as_float / float(warmup), 1.0)
    return max_lr * warmup_fraction
def ae_transformer_internal(inputs, targets, target_space, hparams, cache=None):
    """Main step used for training.

    Encodes ``inputs``, compresses ``targets``, obtains dense latents (via
    the discrete bottleneck outside PREDICT mode, or from sampled/cached
    codes in PREDICT mode), decompresses them, mixes them with the real
    targets through a random mask, and runs the decoder.

    Args:
        inputs: Input tensor; flattened with ``flatten4d3d`` before encoding.
        targets: Target tensor; padded before compression.
        target_space: Passed through to ``encode``.
        hparams: Hyperparameter object; read for ``mode``,
            ``num_compress_steps``, ``bottleneck_bits``, ``hidden_size`` and
            ``mask_startup_steps``.
        cache: Optional discrete latent codes for PREDICT mode; sampled with
            ``ae_latent_sample_beam`` when ``None``.

    Returns:
        A tuple ``(res, losses, cache)``: the decoder output, a dict with
        keys ``"extra"`` and ``"latent_pred"``, and the (possibly newly
        sampled) cache.
    """
    is_predict = hparams.mode == tf.estimator.ModeKeys.PREDICT
    n_compress = hparams.num_compress_steps

    # Encoder.
    inputs = common_layers.flatten4d3d(inputs)
    inputs, ed = encode(inputs, target_space, hparams, "input_enc")

    # Autoencoding.
    losses = {"extra": tf.constant(0.0), "latent_pred": tf.constant(0.0)}

    # Pad targets against a reference twice as long as the inputs, to a
    # length divisible by the total compression factor.
    length_reference = tf.concat([inputs, inputs], axis=1)
    targets, _ = common_layers.pad_to_same_length(
        targets,
        length_reference,
        final_length_divisible_by=2**n_compress)
    compressed = compress(targets, hparams, "compress")

    if is_predict:
        latent_len = common_layers.shape_list(compressed)[1]
        embed = functools.partial(vq_discrete_unbottleneck, hparams=hparams)
        latents_dense = tf.zeros_like(compressed[:, :latent_len, :, :])
        if cache is None:
            cache = ae_latent_sample_beam(latents_dense, inputs, ed, embed,
                                          hparams)
        cache_hot = tf.one_hot(cache, depth=2**hparams.bottleneck_bits)
        latents_dense = embed(cache_hot)
    else:
        # Compress and bottleneck.
        codes_hot, extra_loss = vq_discrete_bottleneck(
            x=compressed, hparams=hparams)
        latents_dense = vq_discrete_unbottleneck(codes_hot, hparams=hparams)
        # Forward value is the quantized latent; the stop_gradient makes the
        # backward pass treat it as `compressed`.
        latents_dense = compressed + tf.stop_gradient(
            latents_dense - compressed)
        codes = tf.argmax(codes_hot, axis=-1)
        tf.summary.histogram("codes", tf.reshape(codes[:, 0, :], [-1]))
        losses["extra"] = extra_loss

        # Extra loss predicting latent code from input.
        latents_pred = decode_transformer(inputs, ed, latents_dense, hparams,
                                          "extra")
        latent_pred_loss = get_latent_pred_loss(latents_pred, codes_hot,
                                                hparams)
        losses["latent_pred"] = tf.reduce_mean(latent_pred_loss)

    # Postprocess.
    d = latents_dense
    pos = tf.get_variable("pos", [1, 1000, 1, hparams.hidden_size])
    pos = pos[:, :common_layers.shape_list(latents_dense)[1] + 1, :, :]
    # NOTE(review): this padded, position-augmented tensor is not read again
    # in this function (decompression starts from `d`, captured above). It is
    # kept so the "pos" variable and these ops are still created.
    latents_dense = tf.pad(latents_dense,
                           [[0, 0], [1, 0], [0, 0], [0, 0]]) + pos

    # Decompressing the dense latents
    for i, j in enumerate(reversed(range(n_compress))):
        d = residual_conv(d, 1, (3, 1), hparams, "decompress_rc_%d" % j)
        d = decompress_step(d, hparams, i > 0, "decompress_%d" % j)

    masking = common_layers.inverse_lin_decay(hparams.mask_startup_steps)
    masking *= common_layers.inverse_exp_decay(
        hparams.mask_startup_steps // 4)  # Not much at start.
    masking = tf.minimum(tf.maximum(masking, 0.0), 1.0)
    if is_predict:
        masking = 1.0
    # Positions where the random draw exceeds `masking` keep the real target;
    # all other positions take the decompressed latent.
    mask = tf.less(masking,
                   tf.random_uniform(common_layers.shape_list(targets)[:-1]))
    mask = tf.expand_dims(tf.to_float(mask), 3)

    # targets is always [batch, length, 1, depth]
    targets = mask * targets + (1.0 - mask) * d

    res = decode_transformer(inputs, ed, targets, hparams, "decoder")

    # Zero the latent-prediction loss until the global step exceeds
    # mask_startup_steps.
    latent_time = tf.less(hparams.mask_startup_steps,
                          tf.to_int32(tf.train.get_global_step()))
    losses["latent_pred"] *= tf.to_float(latent_time)
    return res, losses, cache