def _interpolate(im, x, y, out_size):
    """Bilinearly sample `im` at the sample positions (`x`, `y`).

    Args:
        im: image tensor indexed as [batch, height, width, channels].
        x: horizontal sample positions (cast to float32 here).
        y: vertical sample positions (cast to float32 here).
            NOTE(review): `x` and `y` are presumably flat vectors with
            batch * out_height * out_width entries, since they are added to
            the per-batch base offsets below -- confirm against `_repeat`.
        out_size: pair `(out_height, out_width)`; their product is the
            repeat count handed to `_repeat` for every batch offset.

    Returns:
        float32 tensor with one row per sample position and `channels`
        columns, holding the weighted sum of the four neighbouring pixels.
    """
    with tf.variable_scope('_interpolate'):
        # constants
        im_shape = tf.shape(im)
        num_batch = im_shape[0]
        height = im_shape[1]
        width = im_shape[2]
        channels = im_shape[3]

        x = tf.cast(x, 'float32')
        y = tf.cast(y, 'float32')
        half_width = tf.cast(width, 'float32') / 2.0
        half_height = tf.cast(height, 'float32') / 2.0
        samples_per_image = out_size[0] * out_size[1]

        lower = tf.zeros([], dtype='int32')
        max_x = width - 1
        max_y = height - 1

        # Shift the coordinates by half the image extent, written as
        # (v / half + 1) * half exactly as the original expression did.
        x = ((x / half_width) + 1.0) * half_width
        y = ((y / half_height) + 1.0) * half_height

        # Integer corners of the cell around each sample, clamped to the
        # valid pixel range.
        x_floor = tf.cast(tf.floor(x), 'int32')
        y_floor = tf.cast(tf.floor(y), 'int32')
        x0 = tf.clip_by_value(x_floor, lower, max_x)
        x1 = tf.clip_by_value(x_floor + 1, lower, max_x)
        y0 = tf.clip_by_value(y_floor, lower, max_y)
        y1 = tf.clip_by_value(y_floor + 1, lower, max_y)

        # Offsets into the image flattened to [batch * height * width, channels].
        row_stride = width
        image_stride = width * height
        base = _repeat(tf.range(num_batch) * image_stride, samples_per_image)
        row0 = base + y0 * row_stride
        row1 = base + y1 * row_stride
        corner_indices = (row0 + x0, row1 + x0, row0 + x1, row1 + x1)

        # Look the four corners up in the flat image; the channel dimension
        # is kept as the last axis.
        im_flat = tf.cast(tf.reshape(im, tf.stack([-1, channels])), 'float32')
        Ia, Ib, Ic, Id = [tf.gather(im_flat, idx) for idx in corner_indices]

        # Interpolation weights, computed from the *clamped* corners.
        x0_f = tf.cast(x0, 'float32')
        x1_f = tf.cast(x1, 'float32')
        y0_f = tf.cast(y0, 'float32')
        y1_f = tf.cast(y1, 'float32')
        dx_hi = x1_f - x
        dx_lo = x - x0_f
        dy_hi = y1_f - y
        dy_lo = y - y0_f
        wa = tf.expand_dims(dx_hi * dy_hi, 1)
        wb = tf.expand_dims(dx_hi * dy_lo, 1)
        wc = tf.expand_dims(dx_lo * dy_hi, 1)
        wd = tf.expand_dims(dx_lo * dy_lo, 1)

        return tf.add_n([wa * Ia, wb * Ib, wc * Ic, wd * Id])
def add_contrastive_loss(hidden,
                         hidden_norm=True,
                         temperature=1.0,
                         tpu_context=None,
                         weights=1.0):
    """Compute the contrastive loss over two views stacked along axis 0.

    Args:
        hidden: hidden vector (`Tensor`) of shape (2 * bsz, dim); it is split
            into two halves along axis 0.
        hidden_norm: whether to L2-normalize `hidden` along its last axis.
        temperature: a floating number every similarity matrix is divided by.
        tpu_context: context information for TPU; when not None both halves
            are gathered with `tpu_cross_replica_concat` and labels are
            offset by the replica id.
        weights: a weighting number or vector forwarded to
            `tf.losses.softmax_cross_entropy`.

    Returns:
        A tuple `(loss, logits_ab, labels)`: the loss scalar, the
        first-half-vs-second-half logits and the one-hot labels of the
        contrastive prediction task.
    """
    if hidden_norm:
        hidden = tf.math.l2_normalize(hidden, -1)
    view_a, view_b = tf.split(hidden, 2, 0)
    batch_size = tf.shape(view_a)[0]

    if tpu_context is None:
        # Single replica: the candidates are just the local batch.
        all_a = view_a
        all_b = view_b
        local_rows = tf.range(batch_size)
        labels = tf.one_hot(local_rows, batch_size * 2)
        masks = tf.one_hot(local_rows, batch_size)
    else:
        # Gather both views across replicas and build labels pointing at
        # this replica's slice of the enlarged batch.
        all_a = tpu_cross_replica_concat(view_a, tpu_context)
        all_b = tpu_cross_replica_concat(view_b, tpu_context)
        enlarged_batch_size = tf.shape(all_a)[0]
        # TODO(iamtingchen): more elegant way to convert u32 to s32 for
        # replica_id.
        replica_id = tf.cast(tf.cast(xla.replica_id(), tf.uint32), tf.int32)
        labels_idx = tf.range(batch_size) + replica_id * batch_size
        labels = tf.one_hot(labels_idx, enlarged_batch_size * 2)
        masks = tf.one_hot(labels_idx, enlarged_batch_size)

    def _scaled_similarity(queries, keys):
        return tf.matmul(queries, keys, transpose_b=True) / temperature

    # Within-view similarities; the masked (self) entries are pushed down by
    # LARGE_NUM.
    logits_aa = _scaled_similarity(view_a, all_a) - masks * LARGE_NUM
    logits_bb = _scaled_similarity(view_b, all_b) - masks * LARGE_NUM
    # Cross-view similarities.
    logits_ab = _scaled_similarity(view_a, all_b)
    logits_ba = _scaled_similarity(view_b, all_a)

    loss_a = tf.losses.softmax_cross_entropy(
        labels, tf.concat([logits_ab, logits_aa], 1), weights=weights)
    loss_b = tf.losses.softmax_cross_entropy(
        labels, tf.concat([logits_ba, logits_bb], 1), weights=weights)
    return loss_a + loss_b, logits_ab, labels
def _tf_fn():
    """Return the grid of index tuples over `sshape`, cast to `self._dtype`.

    One `tf.range` is built per entry of `sshape`; the 'ij'-indexed meshgrid
    of those ranges is stacked along a new last axis.
    """
    axis_ranges = [tf.range(extent) for extent in sshape]
    grids = tf.meshgrid(*axis_ranges, indexing='ij')
    index_tuples = tf.stack(grids, axis=-1)
    return tf.cast(index_tuples, dtype=self._dtype)
def __init__(self, sess, model, batch_size=1, confidence=CONFIDENCE,
             targeted=TARGETED, learning_rate=LEARNING_RATE,
             binary_search_steps=BINARY_SEARCH_STEPS,
             max_iterations=MAX_ITERATIONS, abort_early=ABORT_EARLY,
             initial_const=INITIAL_CONST, boxmin=-0.5, boxmax=0.5,
             x_window=0, y_window=0, window_size=-1):
    """
    The L_2 optimized attack, with the perturbation limited to a window.

    This attack is the most efficient and should be used as the primary
    attack to evaluate potential defenses.

    Builds the TensorFlow graph for the attack: the perturbation variable,
    the staging variables for image/label/constant, the loss and the Adam
    training op.

    sess: TensorFlow session stored on the instance.
    model: object providing `image_size`, `num_channels`, `num_labels` and
      `predict(images)`; `predict` is expected to return pre-softmax logits.
    confidence: Confidence of adversarial examples: higher produces examples
      that are farther away, but more strongly classified as adversarial.
    batch_size: Number of attacks to run simultaneously.
    targeted: True if we should perform a targeted attack, False otherwise.
    learning_rate: The learning rate for the attack algorithm. Smaller values
      produce better results but are slower to converge.
    binary_search_steps: The number of times we perform binary search to
      find the optimal tradeoff-constant between distance and confidence.
    max_iterations: The maximum number of iterations. Larger values are more
      accurate; setting too small will require a large learning rate and will
      produce poor results.
    abort_early: If true, allows early aborts if gradient descent gets stuck.
    initial_const: The initial tradeoff-constant to use to tune the relative
      importance of distance and confidence. If binary_search_steps is large,
      the initial constant is not important.
    boxmin: Minimum pixel value (default -0.5).
    boxmax: Maximum pixel value (default 0.5).
    x_window: Offset added to the axis-2 indices of the perturbation when it
      is placed into the full image.
    y_window: Offset added to the axis-1 indices of the perturbation when it
      is placed into the full image.
    window_size: Side length of the square perturbation window; -1 (default)
      means the whole image (`model.image_size`).
    """
    if window_size == -1:
        window_size = model.image_size

    image_size = model.image_size
    num_channels = model.num_channels
    num_labels = model.num_labels

    self.sess = sess
    self.TARGETED = targeted
    self.LEARNING_RATE = learning_rate
    self.MAX_ITERATIONS = max_iterations
    self.BINARY_SEARCH_STEPS = binary_search_steps
    self.ABORT_EARLY = abort_early
    self.CONFIDENCE = confidence
    self.initial_const = initial_const
    self.batch_size = batch_size

    self.repeat = binary_search_steps >= 10

    self.I_KNOW_WHAT_I_AM_DOING_AND_WANT_TO_OVERRIDE_THE_PRESOFTMAX_CHECK = False

    # The perturbation only covers the window, while the image it is added
    # to is always full-size.
    window_shape = (batch_size, window_size, window_size, num_channels)
    image_shape = (batch_size, image_size, image_size, num_channels)

    # the variable we're going to optimize over; it is sized to the window
    # so that only that portion of the image can be modified
    modifier = tf.Variable(np.zeros(window_shape, dtype=np.float32))

    # these are variables to be more efficient in sending data to tf.
    # `timg` must have the full image shape: it is added element-wise to the
    # full-size scattered perturbation below, so a window-sized `timg` would
    # not match whenever window_size != image_size.
    self.timg = tf.Variable(np.zeros(image_shape), dtype=tf.float32)
    self.tlab = tf.Variable(np.zeros((batch_size, num_labels)),
                            dtype=tf.float32)
    self.const = tf.Variable(np.zeros(batch_size), dtype=tf.float32)

    # and here's what we use to assign them
    self.assign_timg = tf.placeholder(tf.float32, image_shape)
    self.assign_tlab = tf.placeholder(tf.float32, (batch_size, num_labels))
    self.assign_const = tf.placeholder(tf.float32, [batch_size])

    # the resulting image, tanh'd to keep bounded from boxmin to boxmax
    self.boxmul = (boxmax - boxmin) / 2.
    self.boxplus = (boxmin + boxmax) / 2.

    # Place the window-sized modifier into an all-zero, full-size tensor at
    # offset (y_window, x_window).
    mask = tf.zeros(image_shape, tf.float32)
    modifier_shape = tf.shape(modifier)
    # Index grid covering every element of the modifier.
    oo, ii, jj, kk = tf.meshgrid(tf.range(modifier_shape[0]),
                                 tf.range(modifier_shape[1]),
                                 tf.range(modifier_shape[2]),
                                 tf.range(modifier_shape[3]),
                                 indexing='ij')
    # Shift the spatial indices to the window position.
    ii += y_window
    jj += x_window
    # Scatter the modifier values into the zero tensor.
    mask_to_apply = tf.tensor_scatter_nd_update(
        mask, tf.stack([oo, ii, jj, kk], axis=-1), modifier)
    self.newimg = tf.tanh(mask_to_apply + self.timg) * self.boxmul + self.boxplus

    # prediction BEFORE-SOFTMAX of the model
    self.output = model.predict(self.newimg)

    # distance to the input data
    self.l2dist = tf.reduce_sum(
        tf.square(self.newimg -
                  (tf.tanh(self.timg) * self.boxmul + self.boxplus)),
        [1, 2, 3])

    # compute the probability of the label class versus the maximum other
    real = tf.reduce_sum((self.tlab) * self.output, 1)
    other = tf.reduce_max(
        (1 - self.tlab) * self.output - (self.tlab * 10000), 1)

    if self.TARGETED:
        # if targeted, optimize for making the other class most likely
        loss1 = tf.maximum(0.0, other - real + self.CONFIDENCE)
    else:
        # if untargeted, optimize for making this class least likely.
        loss1 = tf.maximum(0.0, real - other + self.CONFIDENCE)

    # sum up the losses
    self.loss2 = tf.reduce_sum(self.l2dist)
    self.loss1 = tf.reduce_sum(self.const * loss1)
    self.loss = self.loss1 + self.loss2

    # Setup the adam optimizer and keep track of variables we're creating
    start_vars = set(x.name for x in tf.global_variables())
    optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE)
    self.train = optimizer.minimize(self.loss, var_list=[modifier])
    end_vars = tf.global_variables()
    new_vars = [x for x in end_vars if x.name not in start_vars]

    # these are the variables to initialize when we run
    self.setup = []
    self.setup.append(self.timg.assign(self.assign_timg))
    self.setup.append(self.tlab.assign(self.assign_tlab))
    self.setup.append(self.const.assign(self.assign_const))

    # Re-initialise the perturbation together with the optimizer's own
    # variables. `mask` is a plain tensor (tf.zeros), not a variable, so it
    # cannot be passed to variables_initializer; `modifier` is the variable
    # that has to be reset.
    self.init = tf.variables_initializer(var_list=[modifier] + new_vars)
def __init__(self,
             session,
             player_id,
             state_representation_size,
             num_actions,
             hidden_layers_sizes=128,
             replay_buffer_capacity=10000,
             batch_size=128,
             replay_buffer_class=ReplayBuffer,
             learning_rate=0.01,
             update_target_network_every=1000,
             learn_every=10,
             discount_factor=1.0,
             min_buffer_size_to_learn=1000,
             epsilon_start=1.0,
             epsilon_end=0.1,
             epsilon_decay_duration=int(1e6),
             optimizer_str="sgd",
             loss_str="mse"):
    """Initialize the DQN agent and build its TensorFlow graph.

    Stores the hyperparameters, creates the replay buffer, the placeholders,
    the Q-network and target network, the loss and the learning op, then
    calls `self._initialize()`.

    Raises:
        ValueError: if `replay_buffer_capacity` is not an int, or if
            `loss_str` / `optimizer_str` is not one of the supported names.
    """
    # Must stay the first statement: it captures the arguments this instance
    # was built with, so it can be copied with no hyperparameter change.
    self._kwargs = locals()

    self.player_id = player_id
    self._session = session
    self._num_actions = num_actions
    # A single int is accepted as shorthand for one hidden layer.
    if isinstance(hidden_layers_sizes, int):
        hidden_layers_sizes = [hidden_layers_sizes]
    self._layer_sizes = hidden_layers_sizes
    self._batch_size = batch_size
    self._update_target_network_every = update_target_network_every
    self._learn_every = learn_every
    self._min_buffer_size_to_learn = min_buffer_size_to_learn
    self._discount_factor = discount_factor

    self._epsilon_start = epsilon_start
    self._epsilon_end = epsilon_end
    self._epsilon_decay_duration = epsilon_decay_duration

    # TODO(author6) Allow for optional replay buffer config.
    if not isinstance(replay_buffer_capacity, int):
        raise ValueError("Replay buffer capacity not an integer.")
    self._replay_buffer = replay_buffer_class(replay_buffer_capacity)
    self._prev_timestep = None
    self._prev_action = None

    # Counts steps for learning, epsilon decay and target network updates.
    self._step_counter = 0

    # Last training loss obtained in an update step.
    self._last_loss_value = None

    # Placeholders feeding the Q-network update.
    state_shape = [None, state_representation_size]
    self._info_state_ph = tf.placeholder(
        tf.float32, shape=state_shape, name="info_state_ph")
    self._action_ph = tf.placeholder(
        tf.int32, shape=[None], name="action_ph")
    self._reward_ph = tf.placeholder(
        tf.float32, shape=[None], name="reward_ph")
    self._is_final_step_ph = tf.placeholder(
        tf.float32, shape=[None], name="is_final_step_ph")
    self._next_info_state_ph = tf.placeholder(
        tf.float32, shape=state_shape, name="next_info_state_ph")
    self._legal_actions_mask_ph = tf.placeholder(
        tf.float32, shape=[None, num_actions], name="legal_actions_mask_ph")

    # Online network, evaluated on the current state.
    self._q_network = simple_nets.MLP(
        state_representation_size, self._layer_sizes, num_actions)
    self._q_values = self._q_network(self._info_state_ph)

    # Target network, evaluated on the next state.
    self._target_q_network = simple_nets.MLP(
        state_representation_size, self._layer_sizes, num_actions)
    # Stop gradient to prevent updates to the target network while learning.
    self._target_q_values = tf.stop_gradient(
        self._target_q_network(self._next_info_state_ph))

    self._update_target_network = self._create_target_network_update_op(
        self._q_network, self._target_q_network)

    # Loss operations. A large negative constant is added to the values of
    # illegal actions before taking the max, which prevents them from being
    # considered as target.
    illegal_mask = 1 - self._legal_actions_mask_ph
    illegal_penalty = illegal_mask * ILLEGAL_ACTION_LOGITS_PENALTY
    penalized_next_q = tf.math.add(
        tf.stop_gradient(self._target_q_values), illegal_penalty)
    max_next_q = tf.reduce_max(penalized_next_q, axis=-1)
    not_final = 1 - self._is_final_step_ph
    target = (
        self._reward_ph + not_final * self._discount_factor * max_next_q)

    # Pick, for each row of the batch, the Q-value of the action taken.
    batch_rows = tf.range(tf.shape(self._q_values)[0])
    action_indices = tf.stack([batch_rows, self._action_ph], axis=-1)
    predictions = tf.gather_nd(self._q_values, action_indices)

    if loss_str == "huber":
        loss_class = tf.losses.huber_loss
    elif loss_str == "mse":
        loss_class = tf.losses.mean_squared_error
    else:
        raise ValueError("Not implemented, choose from 'mse', 'huber'.")
    self._loss = tf.reduce_mean(
        loss_class(labels=target, predictions=predictions))

    if optimizer_str == "sgd":
        self._optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=learning_rate)
    elif optimizer_str == "adam":
        self._optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    else:
        raise ValueError("Not implemented, choose from 'adam' and 'sgd'.")

    self._learn_step = self._optimizer.minimize(self._loss)
    self._initialize()
def _scan_initial_state(self):
    """Create the TensorArrays and index that track bin assignment.

    availability: TensorArray[queue_size, num_sequences]
      Number of tokens still available in the ith bin; every entry starts
      at `packed_length`. See the implementation note below.

    contents: TensorArray[queue_size, num_sequences * 2]
      Holds the actual contents of the packed strings as well as a bit mask
      indicating where sequences begin. It is stored flat (one element per
      token position over all bins) and is accessed in offsets of
      packed_length.

    top_index: scalar [0, queue_size)
      Integer tensor indicating which index is the "top" bin. See the
      implementation note below.

    IMPLEMENTATION_NOTE:
      The FFD algorithm periodically pops the topmost queue and pushes a new
      one to replace it. To replicate those semantics with a fixed size
      TensorArray, indexing operations are shifted by top_index. For
      example, instead of:
        `queue_available.read(i)`

      a read is instead performed as:
        `queue_available.read((i - top_index) % queue_size)`

      to account for the fact that the "ith" logical FFD queue is stored at
      position j. The pop / push update can then be performed by simply
      incrementing top_index (and zeroing the old top_index position).

    Returns:
      The state for the binning scan: `(availability, contents, top_index)`.
    """
    queue_size = self._queue_size
    num_sequences = self._num_sequences
    packed_length = self._packed_length

    # Every bin starts with its full capacity available.
    initial_capacity = tf.ones(
        (queue_size, num_sequences), dtype=INDEX_DTYPE) * packed_length
    availability = tf.TensorArray(
        dtype=INDEX_DTYPE,
        size=queue_size,
        dynamic_size=False,
        clear_after_read=False,
        element_shape=(num_sequences,),
    ).scatter(tf.range(queue_size, dtype=INDEX_DTYPE), initial_capacity)

    # One (initially zero) row per token slot across all bins.
    total_size = packed_length * queue_size
    slot_indices = tf.range(total_size, dtype=INDEX_DTYPE)
    blank_rows = tf.zeros(
        (total_size, num_sequences * 2), dtype=self._token_dtype)
    contents = tf.TensorArray(
        dtype=self._token_dtype,
        size=total_size,
        dynamic_size=False,
        clear_after_read=False,
        element_shape=(num_sequences * 2,),
    ).scatter(slot_indices, blank_rows)

    # Which index should be considered the "top" bucket for the purpose of
    # the first-fit descending algorithm.
    top_index = tf.zeros((), dtype=INDEX_DTYPE)

    return availability, contents, top_index
def compute_mel_filterbank_features(
        waveforms,
        sample_rate=16000,
        dither=1.0 / np.iinfo(np.int16).max,
        preemphasis=0.97,
        frame_length=25,
        frame_step=10,
        fft_length=None,
        window_fn=functools.partial(tf.signal.hann_window, periodic=True),
        lower_edge_hertz=80.0,
        upper_edge_hertz=7600.0,
        num_mel_bins=80,
        log_noise_floor=1e-3,
        apply_mask=True):
    """Implement mel-filterbank extraction using tf ops.

    Args:
        waveforms: float32 tensor with shape [batch_size, max_len]
        sample_rate: sampling rate of the waveform
        dither: stddev of Gaussian noise added to waveform to prevent
            quantization artefacts; skipped when not > 0
        preemphasis: waveform high-pass filtering constant; skipped when
            not > 0
        frame_length: frame length in ms
        frame_step: frame step in ms
        fft_length: number of fft bins; defaults to the smallest power of two
            that is >= the frame length in samples
        window_fn: windowing function
        lower_edge_hertz: lowest frequency of the filterbank
        upper_edge_hertz: highest frequency of the filterbank
        num_mel_bins: filterbank size
        log_noise_floor: lower clip applied to the mel energies before the log
        apply_mask: When working on a batch of samples, set padding frames to
            zero

    Returns:
        filterbanks: a float32 tensor with shape [batch_size, len, num_bins, 1]
    """
    # Find the wave length: the largest index for which the value is != 0.
    # Note that waveform samples that are exactly 0.0 are quite common, so
    # simply doing sum(waveforms != 0, axis=-1) will not work correctly.
    sample_positions = tf.expand_dims(tf.range(tf.shape(waveforms)[1]), 0)
    is_nonzero = tf.to_int32(tf.not_equal(waveforms, 0.0))
    wav_lens = tf.reduce_max(sample_positions * is_nonzero, axis=-1) + 1

    if dither > 0:
        waveforms += tf.random_normal(tf.shape(waveforms), stddev=dither)
    if preemphasis > 0:
        # First-order difference filter; it shortens every signal by one.
        waveforms = waveforms[:, 1:] - preemphasis * waveforms[:, :-1]
        wav_lens -= 1

    # Convert the frame geometry from milliseconds to samples.
    frame_length = int(frame_length * sample_rate / 1e3)
    frame_step = int(frame_step * sample_rate / 1e3)
    if fft_length is None:
        fft_length = int(2**(np.ceil(np.log2(frame_length))))

    # `stfts` is a complex64 Tensor representing the short-time Fourier
    # Transform of each signal in `waveforms`. Its shape is
    # [batch_size, ?, fft_unique_bins]
    # where fft_unique_bins = fft_length // 2 + 1
    stfts = tf.contrib.signal.stft(
        waveforms,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
        window_fn=window_fn,
        pad_end=True)

    # Frame-level mask derived from the per-example waveform lengths.
    stft_lens = (wav_lens + (frame_step - 1)) // frame_step
    frame_positions = tf.expand_dims(tf.range(tf.shape(stfts)[1]), 0)
    masks = tf.to_float(
        tf.less_equal(frame_positions, tf.expand_dims(stft_lens, 1)))

    # An energy spectrogram is the magnitude of the complex-valued STFT.
    # A float32 Tensor of shape [batch_size, ?, 257] (with the default frame
    # settings).
    magnitude_spectrograms = tf.abs(stfts)

    # Warp the linear-scale, magnitude spectrograms into the mel-scale.
    num_spectrogram_bins = magnitude_spectrograms.shape[-1].value
    linear_to_mel_weight_matrix = (
        tf.contrib.signal.linear_to_mel_weight_matrix(
            num_mel_bins, num_spectrogram_bins, sample_rate,
            lower_edge_hertz, upper_edge_hertz))
    mel_spectrograms = tf.tensordot(
        magnitude_spectrograms, linear_to_mel_weight_matrix, 1)
    # Note: Shape inference for tensordot does not currently handle this case.
    leading_dims = magnitude_spectrograms.shape[:-1]
    mel_spectrograms.set_shape(
        leading_dims.concatenate(linear_to_mel_weight_matrix.shape[-1:]))

    log_mel_sgram = tf.log(tf.maximum(log_noise_floor, mel_spectrograms))

    if apply_mask:
        log_mel_sgram *= tf.expand_dims(tf.to_float(masks), -1)

    return tf.expand_dims(log_mel_sgram, -1, name="mel_sgrams")
def real_svg_top(body_output, unused_targets, model_hparams,
                 unused_vocab_size, hard=False):
    """Applies the Mixture Density Network on top of the LSTM outputs.

    Args:
        body_output: outputs from LSTM with shape
            [batch, seqlen, 1, hidden_size]
        unused_targets: what the ground truth SVG outputted should be
            (unused).
        model_hparams: hyper-parameters, should include num_mixture,
            mix_temperature, and gauss_temperature.
        unused_vocab_size: unused
        hard: whether to force predict mode functionality, or return all MDN
            components

    Returns:
        The MDN output. Could be shape [batch, seqlen, 1, 10] if in predict
        mode (or hard=True) or shape [batch, seqlen, 1, 4 + 6 * num_mix * 3],
        in train.
    """
    # mixture of gaussians for 6 args plus 4 extra states for cmds
    num_mix = model_hparams.num_mixture
    nout = 4 + 6 * num_mix * 3

    def _tempered_probs(logits, temperature):
        # Temperature-scaled, max-shifted softmax over the last axis.
        scaled = tf.identity(logits) / temperature
        shifted = tf.exp(
            scaled - tf.reduce_max(scaled, axis=[-1], keepdims=True))
        return shifted / tf.reduce_sum(shifted, axis=[-1], keepdims=True)

    # the 'hard' option is meant to be used if 'top' is called within body
    with tf.variable_scope('real_top', reuse=tf.AUTO_REUSE):
        ret = tf.layers.dense(body_output, nout, name='top')
        batch_size = common_layers.shape_list(ret)[0]

        sample_output = (
            hard or model_hparams.mode == tf.estimator.ModeKeys.PREDICT)
        if sample_output:
            temperature = model_hparams.mix_temperature

            # Sample one of the 4 commands from the tempered softmax. This
            # is the same as get_pi_idx, and already returns not soft prob.
            command_probs = _tempered_probs(ret[:, :, :, :4], temperature)
            command = tf.distributions.Categorical(
                probs=command_probs).sample()
            # this is now [batch, seq, 1], need to make it one_hot
            command = tf.one_hot(command, 4)

            # args are [batch, seq, 1, 6*3*num_mix].
            # want [batch * seq * 6, 3*num_mix]
            arguments = tf.reshape(ret[:, :, :, 4:], [-1, 3 * num_mix])

            # these are [batch*seq*6, num_mix]
            out_logmix, out_mean, out_logstd = _get_mdn_coef(arguments)

            # apply temp to logmix, then pick one component per row
            # (get_pi_idx)
            mix_probs = _tempered_probs(out_logmix, temperature)
            component = tf.distributions.Categorical(probs=mix_probs).sample()
            component = tf.reshape(tf.cast(component, tf.int32), [-1])

            # prepare (row, component) pairs for gather
            picks = tf.stack(
                [tf.range(tf.size(component)), component], axis=-1)
            chosen_mean = tf.gather_nd(out_mean, picks)
            chosen_logstd = tf.gather_nd(out_logstd, picks)

            # sample from the chosen gaussian
            rand_gaussian = (tf.random.normal(tf.shape(chosen_mean)) *
                             tf.sqrt(model_hparams.gauss_temperature))
            arguments = chosen_mean + tf.exp(chosen_logstd) * rand_gaussian
            arguments = tf.reshape(arguments, [batch_size, -1, 1, 6])

            # concat with the command we picked
            ret = tf.concat([command, arguments], axis=-1)

        return ret
def _generate_detections_per_image(boxes,
                                   scores,
                                   max_total_size=100,
                                   nms_iou_threshold=0.3,
                                   score_threshold=0.05,
                                   pre_nms_num_boxes=5000):
    """Generate the final detections per image given the model outputs.

    Runs per-class padded NMS on the top-scoring candidates of each class,
    then keeps the `max_total_size` best detections over all classes.

    Args:
        boxes: a tensor with shape [N, num_classes, 4] or [N, 1, 4], which
            box predictions on all feature levels. The N is the number of
            total anchors on all levels.
        scores: a tensor with shape [N, num_classes], which stacks class
            probability on all feature levels. The N is the number of total
            anchors on all levels. The num_classes is the number of classes
            predicted by the model. Note that the class_outputs here is the
            raw score.
        max_total_size: a scalar representing maximum number of boxes
            retained over all classes.
        nms_iou_threshold: a float representing the threshold for deciding
            whether boxes overlap too much with respect to IOU.
        score_threshold: a float representing the threshold for deciding when
            to remove boxes based on score.
        pre_nms_num_boxes: an int number of top candidate detections per
            class before NMS.

    Returns:
        nms_boxes: `float` Tensor of shape [max_total_size, 4] representing
            top detected boxes in [y1, x1, y2, x2].
        nms_scores: `float` Tensor of shape [max_total_size] representing
            sorted confidence scores for detected boxes. The values are
            between [0, 1]; padded (invalid) slots carry -1.
        nms_classes: `int` Tensor of shape [max_total_size] representing
            classes for detected boxes.
        valid_detections: `int` Tensor of shape [1] only the top
            `valid_detections` boxes are valid detections.
    """
    per_class_boxes = []
    per_class_scores = []
    per_class_ids = []

    num_box_classes = boxes.get_shape().as_list()[1]
    num_classes = scores.get_shape().as_list()[1]

    for class_id in range(num_classes):
        # With class-agnostic boxes ([N, 1, 4]) every class reuses column 0.
        box_column = min(num_box_classes - 1, class_id)
        class_boxes = boxes[:, box_column]
        class_scores = scores[:, class_id]

        # Obtains pre_nms_num_boxes before running NMS.
        num_candidates = tf.minimum(
            tf.shape(class_scores)[-1], pre_nms_num_boxes)
        class_scores, top_indices = tf.nn.top_k(class_scores, k=num_candidates)
        class_boxes = tf.gather(class_boxes, top_indices)

        kept_indices, num_kept = tf.image.non_max_suppression_padded(
            tf.cast(class_boxes, tf.float32),
            tf.cast(class_scores, tf.float32),
            max_total_size,
            iou_threshold=nms_iou_threshold,
            score_threshold=score_threshold,
            pad_to_max_output_size=True,
            name='nms_detections_' + str(class_id))
        kept_boxes = tf.gather(class_boxes, kept_indices)
        kept_scores = tf.gather(class_scores, kept_indices)

        # Sets scores of invalid boxes to -1.
        is_valid = tf.less(tf.range(max_total_size), [num_kept])
        kept_scores = tf.where(
            is_valid, kept_scores, -tf.ones_like(kept_scores))

        per_class_boxes.append(kept_boxes)
        per_class_scores.append(kept_scores)
        per_class_ids.append(tf.fill([max_total_size], class_id))

    # Concats results from all classes and sort them.
    all_boxes = tf.concat(per_class_boxes, axis=0)
    all_scores = tf.concat(per_class_scores, axis=0)
    all_classes = tf.concat(per_class_ids, axis=0)

    nmsed_scores, best = tf.nn.top_k(all_scores, k=max_total_size, sorted=True)
    nmsed_boxes = tf.gather(all_boxes, best)
    nmsed_classes = tf.gather(all_classes, best)
    valid_detections = tf.reduce_sum(
        tf.cast(tf.greater(nmsed_scores, -1), tf.int32))
    return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
def _local_perm(inputs, targets, is_masked, perm_size, seq_len):
    """Samples a permutation of the factorization order, and creates masks.

    Args:
        inputs: int64 Tensor in shape [seq_len], input ids.
        targets: int64 Tensor in shape [seq_len], target ids.
        is_masked: bool Tensor in shape [seq_len]. True means being selected
            for partial prediction.
        perm_size: the length of longest permutation. Could be set to be
            reuse_len. Should not be larger than reuse_len or there will be
            data leaks.
        seq_len: int, sequence length.

    Returns:
        A tuple `(perm_mask, new_targets, target_mask, inputs_k, inputs_q)`:
        the permutation mask, the new targets, the target mask, `inputs`
        itself as `inputs_k`, and the target mask again as `inputs_q`.
    """
    # Generate permutation indices: positions are laid out in rows of
    # `perm_size`, transposed, shuffled along the first axis, and flattened
    # back.
    positions = tf.range(seq_len, dtype=tf.int64)
    grid = tf.transpose(tf.reshape(positions, [-1, perm_size]))
    grid = tf.random_shuffle(grid)
    index = tf.reshape(tf.transpose(grid), [-1])

    # Token categories used by `perm_mask` and `target_mask`.
    is_func_token = tf.logical_or(
        tf.equal(inputs, SEP_ID), tf.equal(inputs, CLS_ID))
    non_func_tokens = tf.logical_not(is_func_token)
    non_mask_tokens = tf.logical_and(
        tf.logical_not(is_masked), non_func_tokens)
    masked_or_func_tokens = tf.logical_not(non_mask_tokens)

    # Set the permutation indices of non-masked (& non-functional) tokens to
    # the smallest index (-1):
    # (1) they can be seen by all other positions
    # (2) they cannot see masked positions, so there won't be information
    #     leak
    smallest_index = -tf.ones([seq_len], dtype=tf.int64)
    rev_index = tf.where(non_mask_tokens, smallest_index, index)

    # `target_mask`: non-functional and masked tokens
    # 1: use mask as input and have loss
    # 0: use token (or [SEP], [CLS]) as input and do not have loss
    target_tokens = tf.logical_and(masked_or_func_tokens, non_func_tokens)
    target_mask = tf.cast(target_tokens, tf.float32)

    # `perm_mask`: `target_tokens` cannot see themselves, every other token
    # can (its own index is bumped by one).
    self_rev_index = tf.where(target_tokens, rev_index, rev_index + 1)

    # 1: cannot attend if i <= j and j is not non-masked
    #    (masked_or_func_tokens)
    # 0: can attend if i > j or j is non-masked
    not_later = self_rev_index[:, None] <= rev_index[None, :]
    perm_mask = tf.cast(
        tf.logical_and(not_later, masked_or_func_tokens), tf.float32)

    # new target: [next token] for LM and [curr token] (self) for PLM
    new_targets = tf.concat([inputs[0:1], targets[:-1]], axis=0)

    # inputs_k is the unchanged input; inputs_q is the target mask.
    return perm_mask, new_targets, target_mask, inputs, target_mask
def parser(record):
    """Parse one serialized tfrecord into the feature dict used for training.

    The sequence is split at `reuse_len`; each part gets its own local
    permutation, and the results are stitched back together. When
    `num_predict` is not None, targets are compacted to `num_predict`
    entries (zero-padded) and a one-hot `target_mapping` is added.
    """
    feature_spec = {
        "input": tf.FixedLenFeature([seq_len], tf.int64),
        "target": tf.FixedLenFeature([seq_len], tf.int64),
        "seg_id": tf.FixedLenFeature([seq_len], tf.int64),
        "label": tf.FixedLenFeature([1], tf.int64),
        "is_masked": tf.FixedLenFeature([seq_len], tf.int64),
    }

    # retrieve serialized example
    example = tf.parse_single_example(
        serialized=record, features=feature_spec)

    inputs = example.pop("input")
    target = example.pop("target")
    is_masked = tf.cast(example.pop("is_masked"), tf.bool)

    non_reuse_len = seq_len - reuse_len
    assert perm_size <= reuse_len and perm_size <= non_reuse_len

    # Permute the reused prefix and the remaining suffix independently.
    head = _local_perm(
        inputs[:reuse_len],
        target[:reuse_len],
        is_masked[:reuse_len],
        perm_size,
        reuse_len)
    tail = _local_perm(
        inputs[reuse_len:],
        target[reuse_len:],
        is_masked[reuse_len:],
        perm_size,
        non_reuse_len)
    perm_mask_0, target_0, target_mask_0, input_k_0, input_q_0 = head
    perm_mask_1, target_1, target_mask_1, input_k_1, input_q_1 = tail

    # Widen both masks to the full sequence: the prefix rows get ones for
    # the suffix columns, the suffix rows get zeros for the prefix columns.
    perm_mask_0 = tf.concat(
        [perm_mask_0, tf.ones([reuse_len, non_reuse_len])], axis=1)
    perm_mask_1 = tf.concat(
        [tf.zeros([non_reuse_len, reuse_len]), perm_mask_1], axis=1)
    perm_mask = tf.concat([perm_mask_0, perm_mask_1], axis=0)

    target = tf.concat([target_0, target_1], axis=0)
    target_mask = tf.concat([target_mask_0, target_mask_1], axis=0)
    input_k = tf.concat([input_k_0, input_k_1], axis=0)
    input_q = tf.concat([input_q_0, input_q_1], axis=0)

    if num_predict is None:
        example["target"] = tf.reshape(target, [seq_len])
        example["target_mask"] = tf.reshape(target_mask, [seq_len])
    else:
        # Positions that actually carry a prediction target.
        bool_target_mask = tf.cast(target_mask, tf.bool)
        indices = tf.boolean_mask(
            tf.range(seq_len, dtype=tf.int64), bool_target_mask)

        # extra padding due to CLS/SEP introduced after prepro
        actual_num_predict = tf.shape(indices)[0]
        pad_len = num_predict - actual_num_predict

        # target_mapping: one one-hot row per target, zero rows as padding
        target_mapping = tf.one_hot(indices, seq_len, dtype=tf.float32)
        mapping_pad = tf.zeros([pad_len, seq_len], dtype=target_mapping.dtype)
        target_mapping = tf.concat([target_mapping, mapping_pad], axis=0)
        example["target_mapping"] = tf.reshape(
            target_mapping, [num_predict, seq_len])

        # target: selected ids followed by zero padding
        target = tf.boolean_mask(target, bool_target_mask)
        target_pad = tf.zeros([pad_len], dtype=target.dtype)
        target = tf.concat([target, target_pad], axis=0)
        example["target"] = tf.reshape(target, [num_predict])

        # target mask: ones for real targets, zeros for the padding
        target_mask = tf.concat(
            [tf.ones([actual_num_predict], dtype=tf.float32),
             tf.zeros([pad_len], dtype=tf.float32)],
            axis=0)
        example["target_mask"] = tf.reshape(target_mask, [num_predict])

    # reshape back to fixed shape
    example["perm_mask"] = tf.reshape(perm_mask, [seq_len, seq_len])
    example["input_k"] = tf.reshape(input_k, [seq_len])
    example["input_q"] = tf.reshape(input_q, [seq_len])

    _convert_example(example, use_bfloat16)

    for k, v in example.items():
        logging.info("%s: %s", k, v)

    return example
def _get_final_index(sequence_length, time_major=True):
    """Build index pairs addressing the last step of each sequence.

    Each row pairs `max(0, sequence_length - 1)` with the sequence's position
    in the batch. The step index comes first when `time_major` is true,
    otherwise the batch position comes first.
    """
    last_step = tf.maximum(0, sequence_length - 1)
    batch_position = tf.range(sequence_length.shape[0])
    if time_major:
        columns = [last_step, batch_position]
    else:
        columns = [batch_position, last_step]
    return tf.stack(columns, axis=1)
def sum_python(N):
    """Return the sum of the squares of 0, 1, ..., N - 1, using NumPy."""
    squares = np.arange(N)**2
    return np.sum(squares)


#%%
sum_python(10**5)

#%%
# Tensorflow teaser: the same computation expressed as a graph.
# The parameter is a placeholder that is fed when the result is evaluated.
N = tf.placeholder('int64', name='input_to_fun')
# A recipe for how to produce the result.
result = tf.reduce_sum(tf.range(N)**2)
result

#%%
result.eval({N: 10**5})
# Logger for TensorBoard.
writer = tf.summary.FileWriter('Tensorboard_logs', graph=sess.graph)

#%%
with tf.name_scope('Placeholder_examples'):
    # Default placeholder that can be an arbitrary float32
    # scalar, vector, matrix etc.
    arbitrary_input = tf.placeholder('float32')
    # Input vector of arbitrary length.
    input_vector = tf.placeholder('float32', shape=(None, ))
    # input vector that must have 10 elements and integer type
def _build_train_op(self):
    """Builds the op that performs one step of quantile-Huber training.

    Returns:
      A `(train_op, loss)` tuple: the optimizer step that minimizes the
      batch mean of the loss, and the per-example loss tensor itself
      (already importance-weighted when the replay scheme is
      'prioritized').
    """
    target_quantiles = tf.stop_gradient(self._build_target_distribution())

    # Pair each batch row number (batch_size x 1) with the action replayed
    # for that row, giving batch_size x 2 coordinates for gather_nd.
    row_ids = tf.range(tf.shape(self._replay_net_outputs.logits)[0])[:, None]
    row_action_pairs = tf.concat(
        [row_ids, self._replay.actions[:, None]], 1)
    # For each element of the batch, fetch the logits of its chosen action.
    chosen_logits = tf.gather_nd(self._replay_net_outputs.logits,
                                 row_action_pairs)

    # Input `u' of Eq. 9: every target value against every predicted one.
    errors = target_quantiles[:, None, :] - chosen_logits[:, :, None]

    # Eq. 9 of paper: quadratic within kappa of zero, linear beyond it.
    quadratic_part = (
        tf.to_float(tf.abs(errors) <= self.kappa) * 0.5 * errors**2)
    linear_part = (
        tf.to_float(tf.abs(errors) > self.kappa) * self.kappa *
        (tf.abs(errors) - 0.5 * self.kappa))
    huber = quadratic_part + linear_part

    # Quantile midpoints. See Lemma 2 of paper.
    midpoints = (
        (tf.range(self._num_atoms, dtype=tf.float32) + 0.5) /
        self._num_atoms)

    # Eq. 10 of paper.
    quantile_huber = (
        tf.abs(midpoints[None, :, None] - tf.to_float(errors < 0)) * huber)

    # Sum over tau dimension, average over target value dimension.
    loss = tf.reduce_sum(tf.reduce_mean(quantile_huber, 2), 1)

    if self._replay_scheme != 'prioritized':
        update_priorities_op = tf.no_op()
    else:
        priorities = self._replay.tf_get_priority(self._replay.indices)
        # The original prioritized experience replay uses a linear exponent
        # schedule 0.4 -> 1.0. Comparing the schedule to a fixed exponent of
        # 0.5 on 5 games (Asterix, Pong, Q*Bert, Seaquest, Space Invaders)
        # suggested a fixed exponent actually performs better, except on
        # Pong.
        importance = 1.0 / tf.sqrt(priorities + 1e-10)
        importance /= tf.reduce_max(importance)

        # Rainbow and prioritized replay are parametrized by an exponent
        # alpha, but in both cases it is set to 0.5 - for simplicity's sake
        # we leave it as is here, using the more direct tf.sqrt(). Taking
        # the square root "makes sense", as we are dealing with a squared
        # loss. Add a small nonzero value to the loss to avoid 0 priority
        # items. While technically this may be okay, setting all items to 0
        # priority will cause troubles, and also result in 1.0 / 0.0 = NaN
        # correction terms.
        update_priorities_op = self._replay.tf_set_priority(
            self._replay.indices, tf.sqrt(loss + 1e-10))

        # Weight loss by inverse priorities.
        loss = importance * loss

    # Everything created below waits for the priority update.
    with tf.control_dependencies([update_priorities_op]):
        if self.summary_writer is not None:
            with tf.variable_scope('Losses'):
                tf.summary.scalar('QuantileLoss', tf.reduce_mean(loss))
        return self.optimizer.minimize(tf.reduce_mean(loss)), loss
def naive_log_likelihood(x, presence=None):
    """Mixture log-likelihood of `x` under the votes, plus winner bookkeeping.

    Implementation from original repo ripped wholesale.

    Reads `_votes`, `_scales`, `_vote_presence_prob`, `_n_votes` and
    `_get_pdf` from the enclosing scope. One extra "dummy" component with a
    constant log-probability of `-2 * log(10)` is appended to both the vote
    log-probabilities and the mixing logits; it is dropped again before the
    argmax and in `posterior_mixing_probs`.

    Args:
      x: points to score; the first two static dimensions are read as
        (batch_size, n_input_points).
      presence: optional per-point weights; when given, each point's
        log-likelihood is multiplied by it (after a cast to float).

    Returns:
      An `OutputTuple`; `log_prob` is the batch mean of the per-example
      summed log-likelihood. `soft_winner` and `soft_winner_pres` are
      all-zero placeholders.
    """
    batch_size, n_input_points = x.shape[:2].as_list()

    # Generate gaussian mixture pdfs...
    # [B, 1, n_votes, n_input_dims]
    expanded_votes = tf.expand_dims(_votes, 1)
    expanded_scale = tf.expand_dims(tf.expand_dims(_scales, 1), -1)
    vote_component_pdf = _get_pdf(expanded_votes, expanded_scale)

    # For each part, evaluates all capsule, vote mixture likelihoods
    # [B, n_points, n_caps x n_votes, n_input_dims]
    expanded_x = tf.expand_dims(x, 2)
    vote_log_prob_per_dim = vote_component_pdf.log_prob(expanded_x)

    # Compressing mixture likelihood across all part dimension (ie. 2d point)
    # [B, n_points, n_caps x n_votes]
    vote_log_prob = tf.reduce_sum(vote_log_prob_per_dim, -1)
    dummy_vote_log_prob = tf.zeros([batch_size, n_input_points, 1])
    dummy_vote_log_prob -= 2. * tf.log(10.)

    # adding extra [B, n_points, n_caps x n_votes] to end. WHY?
    vote_log_prob = tf.concat([vote_log_prob, dummy_vote_log_prob], 2)

    # [B, n_points, n_caps x n_votes]
    # CONDITIONAL LOGIT a_(k,n)
    mixing_logits = math_ops.safe_log(_vote_presence_prob)
    dummy_logit = tf.zeros([batch_size, 1]) - 2. * tf.log(10.)
    mixing_logits = tf.concat([mixing_logits, dummy_logit], 1)

    #
    # Following seems relevant only towards compressing ll for loss.
    # REDUNDANCY
    #
    # mixing_logits -> presence (a)
    # vote_log_prob -> Gaussian value (one per vote) for each coordinate

    # BAD -> vote presence / summed vote presence
    # (log-space normalisation: subtract the logsumexp over axis 1)
    mixing_log_prob = mixing_logits - tf.reduce_logsumexp(mixing_logits, 1,
                                                          keepdims=True)

    # BAD -> mixing presence (above) * each vote gaussian prob
    expanded_mixing_logits = tf.expand_dims(mixing_log_prob, 1)

    # Reduce to loglikelihood given k,n combination (capsule, vote)
    mixture_log_prob_per_component\
        = tf.reduce_logsumexp(expanded_mixing_logits + vote_log_prob, 2)

    if presence is not None:
        presence = tf.to_float(presence)
        mixture_log_prob_per_component *= presence

    # Reduce votes to single capsule
    # ^ Misleading, reducing across all parts, multiplying log
    # likelihoods for each part _wrt all capsules_.
    mixture_log_prob_per_example\
        = tf.reduce_sum(mixture_log_prob_per_component, 1)

    # Same as above but across all compressed part likelihoods in a batch.
    mixture_log_prob_per_batch = tf.reduce_mean(
        mixture_log_prob_per_example)

    #
    # Back from compression to argmax (routing to proper k)
    #
    # [B, n_points, n_votes]
    posterior_mixing_logits_per_point = expanded_mixing_logits + vote_log_prob

    # [B, n_points]
    # The trailing dummy component is excluded from the argmax.
    winning_vote_idx = tf.argmax(
        posterior_mixing_logits_per_point[:, :, :-1], 2)

    # Pair every winning vote index with its batch row for gather_nd.
    batch_idx = tf.expand_dims(tf.range(batch_size, dtype=tf.int64), -1)
    batch_idx = snt.TileByDim([1], [winning_vote_idx.shape[-1]])(batch_idx)
    idx = tf.stack([batch_idx, winning_vote_idx], -1)
    winning_vote = tf.gather_nd(_votes, idx)
    winning_pres = tf.gather_nd(_vote_presence_prob, idx)

    # A vote counts as present when its logit beats the dummy logit.
    vote_presence = tf.greater(mixing_logits[:, :-1], mixing_logits[:, -1:])

    # the first four votes belong to the square
    # Just assuming the votes are ordered by capsule...
    is_from_capsule = winning_vote_idx // _n_votes

    posterior_mixing_probs = tf.nn.softmax(
        posterior_mixing_logits_per_point, -1)[Ellipsis, :-1]

    assert winning_vote.shape == x.shape

    return OutputTuple(
        log_prob=mixture_log_prob_per_batch,
        vote_presence=tf.to_float(vote_presence),
        winner=winning_vote,
        winner_pres=winning_pres,
        is_from_capsule=is_from_capsule,
        mixing_logits=mixing_logits,
        mixing_log_prob=mixing_log_prob,
        # TODO(adamrk): this is broken
        soft_winner=tf.zeros_like(winning_vote),
        soft_winner_pres=tf.zeros_like(winning_pres),
        posterior_mixing_probs=posterior_mixing_probs,
    )
def _next_power_of_two(n):
    """Returns the smallest power of two that is >= `n` (and at least 1).

    Uses integer bit arithmetic only, so the result is exact for every size.
    The floating-point form `math.ceil(math.log(n, 2))` can land one step
    too high when `n` is itself a large power of two, which would discard
    one more mantissa bit than necessary.
    """
    return 1 << max(int(n) - 1, 0).bit_length()


def _create_make_unique(inputs):
    """Replaces the lower bits of each element with iota.

    The iota is used to derive the index, and also serves the purpose to
    make each element unique to break ties.

    Args:
      inputs: A tensor with rank of 2 and dtype of tf.float32.
        [batch_size, original_size]. Both dimensions must be statically
        known.

    Returns:
      A tensor after element wise transformation, with dtype the same as
      inputs. [batch_size, original_size].

    Raises:
      ValueError: If the rank of the input tensor does not equal 2.
    """
    if inputs.shape.ndims != 2:
        raise ValueError("Input of top_k_with_unique must be rank-2 "
                         "but got: %s" % inputs.shape)

    height = inputs.shape[0]
    width = inputs.shape[1]
    zeros = tf.zeros([height, width], dtype=tf.int32)

    # Count_mask is used to mask away the low order bits to ensure that every
    # element is distinct. It clears exactly as many low bits as are needed
    # to hold any column index in [0, width).
    next_power_of_two = _next_power_of_two(width)
    count_mask = ~(next_power_of_two - 1)
    count_mask_r0 = tf.constant(count_mask, dtype=tf.int32)
    count_mask_r2 = tf.fill([height, width], count_mask_r0)

    # Smallest_normal is the bit representation of the smallest positive
    # normal floating point number. The sign is zero, exponent is one, and
    # the fraction is zero.
    smallest_normal = 1 << 23
    smallest_normal_r0 = tf.constant(smallest_normal, dtype=tf.int32)
    smallest_normal_r2 = tf.fill([height, width], smallest_normal_r0)

    # Low_bit_mask is used to mask away the sign bit when computing the
    # absolute value.
    low_bit_mask = ~(1 << 31)
    low_bit_mask_r0 = tf.constant(low_bit_mask, dtype=tf.int32)
    low_bit_mask_r2 = tf.fill([height, width], low_bit_mask_r0)

    # Column index of every element, repeated for each row.
    iota = tf.tile(tf.expand_dims(tf.range(width, dtype=tf.int32), 0),
                   [height, 1])

    # Compare the absolute value with positive zero to handle negative zero.
    input_r2 = tf.bitcast(inputs, tf.int32)
    abs_r2 = tf.bitwise.bitwise_and(input_r2, low_bit_mask_r2)
    if_zero_r2 = tf.equal(abs_r2, zeros)
    # Zeros are nudged to the smallest normal of the same sign so that the
    # iota written into the low bits does not produce a denormal.
    smallest_normal_preserving_sign_r2 = tf.bitwise.bitwise_or(
        input_r2, smallest_normal_r2)
    input_no_zeros_r2 = tf.where(
        if_zero_r2, smallest_normal_preserving_sign_r2, input_r2)

    # Discard the low-order bits and replace with iota.
    and_r2 = tf.bitwise.bitwise_and(input_no_zeros_r2, count_mask_r2)
    or_r2 = tf.bitwise.bitwise_or(and_r2, iota)
    return tf.bitcast(or_r2, tf.float32)
def _scan_step_fn(state, example, packed_length, queue_size, spacing,
                  num_sequences, token_dtype):  # pylint: disable=g-doc-args
    """Transform function used by tf.data.experimental.scan to process an example.

    This is written as a stateless function rather than a class method
    because we trace it with AutoGraph (in order to simplify the
    conditional), and this way we don't have to worry about handling
    re-tracing semantics.

    Args:
      See the SequenceDatasetPacker class.

    Returns:
      The updated queue state, and either a packed example or a dummy
      sequence which will be filtered out downstream. The second element is
      a pair `(emitted, output_contents)` where `emitted` is True exactly
      when no queue slot could fit the example and the top slot was flushed.
    """
    # Convert TensorArray tuples to lists since we'll need to replace them.
    availability, contents, top_index = state

    # One length per sequence of the example.
    lengths = tf.concat([tf.shape(i) for i in example], axis=0)
    start_availability = availability.stack()
    # A queue slot fits only if every sequence of the example fits in it.
    can_fit = tf.reduce_all(tf.greater_equal(start_availability, lengths),
                            axis=1)
    any_can_fit = tf.reduce_any(can_fit, axis=0)

    # AutoGraph will convert this block to a tf.cond
    if any_can_fit:
        # This indicates where in the FFD queue rotation a given index sits
        shifted_range = (tf.range(queue_size, dtype=INDEX_DTYPE) -
                         top_index) % queue_size

        # Mark any indices which cannot accommodate the current example.
        exclusion_mask = tf.cast(tf.logical_not(can_fit),
                                 INDEX_DTYPE) * queue_size

        # Index in [0, queue_size) in which to place the sample. Note, this
        # index is the position in the actual TensorArray, not the index of
        # the FFD queue.
        queue_index = (tf.reduce_min(shifted_range + exclusion_mask) +
                       top_index) % queue_size

        # NOTE(taylorrobie): We emit a non-empty Tensor for downstream checks.
        output_contents = -tf.ones((1, num_sequences), dtype=token_dtype)

    else:
        # Nothing fits: emit the slot at the top of the queue and reuse it.
        index_range = top_index * packed_length + tf.range(packed_length)
        output_contents = contents.gather(index_range)

        # Reset the queue state.
        availability = availability.write(
            top_index,
            packed_length * tf.ones((num_sequences, ), dtype=INDEX_DTYPE))
        empty_contents = tf.zeros((packed_length, num_sequences * 2),
                                  dtype=token_dtype)
        contents = contents.scatter(index_range, empty_contents)

        queue_index = top_index
        top_index = (top_index + 1) % queue_size

    # Charge the chosen slot for the example plus the inter-example spacing.
    pre_assign_availability = availability.read(queue_index)
    space_left = pre_assign_availability - lengths - spacing
    availability = availability.write(queue_index, space_left)

    # ==========================================================================
    # == Update contents =======================================================
    # ==========================================================================
    # Consider the following case for a seq-to-seq packing:
    #   (padding is represented as underscores)
    #
    #   Queue starting state:
    #     [1, 3, 2, 4, 6, 1, _, _, _, _, _, ...]
    #     [5, 9, _, _, _, _, _, _, _, _, _, ...]
    #
    #   Examples:
    #     [4, 2, 4], [3]
    #
    #   Desired new queue state:
    #     [1, 3, 2, 4, 6, 1, _, _, 4, 2, 4, _, _, ...]
    #     [5, 9, _, _, 3, _, _, _, _, _, _, _, _, ...]
    #
    # This could be accomplished by creating a TensorArray for each of the
    # two sequences, and scattering into the respective arrays. However
    # TensorArray writes are extremely expensive relative to other
    # operations. So instead we store the contents in a single TensorArray of
    # shape (packed_length, 2), and we pad and concatenate the examples such
    # that they can be added in a single assign:
    #
    #              [_, _, _, _, 4, 2, 4]
    #              [3, _, _, _, _, _, _]
    #                        +
    #  [1, 3, 2, 4, 6, 1, _, _, _, _, _, ...]
    #  [5, 9, _, _, _, _, _, _, _, _, _, ...]
    #
    # And in practice, the extra work of padding is negligible compared to
    # the gain from vectorizing the TensorArray assign. We also store a bit
    # mask denoting where sequences start which is used to compute segment
    # and position metadata:
    #
    #              [_, _, _, _, 1, _, _]
    #              [1, _, _, _, _, _, _]
    #                        +
    #  [1, _, _, _, _, _, _, _, _, _, _, ...]
    #  [1, _, _, _, _, _, _, _, _, _, _, ...]
    #
    # Both the contents and the mask are concatenated in the same TensorArray
    # for performance.

    # Per-sequence write window inside the slot, and its common bounding box.
    start_index = packed_length - pre_assign_availability
    end_index = start_index + lengths
    leftmost = tf.reduce_min(start_index, axis=0)
    rightmost = tf.reduce_max(end_index, axis=0)
    delta = rightmost - leftmost

    # Pad every sequence to the bounding box so they stack into one update.
    pad_indices = [
        tf.stack((start_index[i] - leftmost, rightmost - end_index[i]))
        for i in range(num_sequences)
    ]
    padded_examples = [
        tf.pad(ex, padding[tf.newaxis, :])
        for ex, padding in zip(example, pad_indices)
    ]
    padded_examples = tf.transpose(tf.stack(padded_examples))
    mask_update = tf.one_hot(start_index - leftmost, delta,
                             dtype=contents.dtype, axis=0)

    content_update = tf.concat([padded_examples, mask_update], axis=1)

    index_range = (
        queue_index * packed_length +  # Offset into the right section.
        tf.range(delta, dtype=INDEX_DTYPE) + leftmost)
    contents = contents.scatter(index_range,
                                contents.gather(index_range) + content_update)

    state = (availability, contents, top_index)
    return state, (tf.logical_not(any_can_fit), output_contents)
def __init__(self, item_num, args, reuse=None):
    """Builds the model graph: placeholders, encoder, loss and prediction ops.

    Args:
      item_num: number of items; the item embedding table is created with
        `item_num + 1` rows.
      args: configuration object; this method reads `maxlen`,
        `hidden_units`, `l2_emb`, `random_seed`, `num_blocks` and
        `num_heads` from it.
      reuse: forwarded to the "SASRec" variable scope and to the embedding
        helpers.
    """
    self.args = args

    # Inputs fed at run time.
    self.is_training = tf.placeholder(tf.bool, shape=())
    self.input_seq = tf.placeholder(tf.int32, shape=(None, args.maxlen))
    self.pos = tf.placeholder(tf.int32, shape=None)
    self.exemplar_logits = tf.placeholder(tf.float32, shape=(None, None))
    self.exemplar_pos = tf.placeholder(tf.int32, shape=None)
    self.max_item = tf.placeholder(tf.int32, shape=())
    self.lr = tf.placeholder(tf.float32, shape=())
    self.dropout_rate = tf.placeholder(tf.float32, shape=())
    pos = self.pos

    # 1.0 where the input id is non-zero, 0.0 where it is 0; the trailing
    # axis lets it broadcast over the hidden dimension.
    mask = tf.expand_dims(tf.to_float(tf.not_equal(self.input_seq, 0)), -1)

    # NOTE(review): the extent of this variable scope was reconstructed
    # (it is taken to end after the final `normalize`) -- confirm.
    with tf.variable_scope("SASRec", reuse=reuse):
        # sequence embedding, item embedding table
        self.seq, item_emb_table = embedding(self.input_seq,
                                             vocab_size=item_num + 1,
                                             num_units=args.hidden_units,
                                             zero_pad=True,
                                             scale=True,
                                             l2_reg=args.l2_emb,
                                             scope="input_embeddings",
                                             with_t=True,
                                             reuse=reuse
                                             )

        # Positional Encoding: every row is embedded with the position ids
        # 0..seq_len-1, tiled over the batch.
        t, pos_emb_table = embedding(
            tf.tile(tf.expand_dims(tf.range(tf.shape(self.input_seq)[1]), 0),
                    [tf.shape(self.input_seq)[0], 1]),
            vocab_size=args.maxlen,
            num_units=args.hidden_units,
            zero_pad=False,
            scale=False,
            l2_reg=args.l2_emb,
            scope="dec_pos",
            reuse=reuse,
            with_t=True
        )
        self.seq += t

        # Dropout
        self.seq = tf.layers.dropout(self.seq,
                                     rate=self.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training),
                                     seed=args.random_seed)
        # Zero out the positions whose input id is 0.
        self.seq *= mask

        # Build blocks
        for i in range(args.num_blocks):
            with tf.variable_scope("num_blocks_%d" % i):
                # Self-attention
                self.seq = multihead_attention(queries=normalize(self.seq),
                                               keys=self.seq,
                                               num_units=args.hidden_units,
                                               num_heads=args.num_heads,
                                               dropout_rate=self.dropout_rate,
                                               seed=args.random_seed,
                                               is_training=self.is_training,
                                               causality=True,
                                               scope="self_attention")

                # Feed forward
                self.seq = feedforward(normalize(self.seq),
                                       num_units=[args.hidden_units, args.hidden_units],
                                       dropout_rate=self.dropout_rate,
                                       is_training=self.is_training,
                                       seed=args.random_seed)
                self.seq *= mask

        self.seq = normalize(self.seq)

    # find representation: the encoder output at the last sequence position
    self.rep = self.seq[:, -1, :]

    # define loss
    seq_emb = tf.reshape(self.rep, [tf.shape(self.input_seq)[0], args.hidden_units])
    # `pos` is shifted by one to index the one-hot labels, matching the
    # embedding rows 1..max_item looked up below.
    indices = pos - 1
    self.labels = tf.one_hot(indices, self.max_item)
    item_emb = tf.nn.embedding_lookup(item_emb_table, tf.range(1, self.max_item + 1))
    self.logits = tf.matmul(seq_emb, tf.transpose(item_emb))
    self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.labels,
                                                                       logits=self.logits))
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

    # prediction
    self.test_item = tf.placeholder(tf.int32, shape=None)
    self.test_item_emb = tf.nn.embedding_lookup(item_emb_table, self.test_item)
    self.test_logits = tf.matmul(seq_emb, tf.transpose(self.test_item_emb))
    self.test_logits = tf.reshape(self.test_logits,
                                  [tf.shape(self.input_seq)[0], tf.shape(self.test_item)[0]])
    # Double argsort of the negated scores gives each test item's rank
    # (0 = highest score).
    self.pred_last = tf.argsort(tf.argsort(-self.test_logits))
def dropblock(net, is_training, keep_prob, dropblock_size,
              data_format='channels_first'):
    """Applies DropBlock, a structured form of dropout for conv feature maps.

    Instead of dropping independent units, DropBlock drops contiguous square
    regions of the feature map, which suits convolutional activations
    because neighbouring units are spatially correlated.
    See https://arxiv.org/pdf/1810.12890.pdf for details.

    Args:
      net: `Tensor` input tensor with a fully defined static shape.
      is_training: `bool` for whether the model is training.
      keep_prob: `float` or `Tensor` keep_prob parameter of DropBlock. "None"
        means no DropBlock.
      dropblock_size: `int` size of blocks to be dropped by DropBlock;
        clipped to the spatial size of `net`.
      data_format: `str` either "channels_first" for
        `[batch, channels, height, width]` or "channels_last" for
        `[batch, height, width, channels]`.

    Returns:
      A version of input tensor with DropBlock applied, rescaled by the
      fraction of units that were kept. `net` itself is returned when not
      training or when `keep_prob` is None.

    Raises:
      ValueError: if width and height of the input tensor are not equal.
    """
    if not is_training or keep_prob is None:
        return net

    tf.logging.info(
        'Applying DropBlock: dropblock_size {}, net.shape {}'.format(
            dropblock_size, net.shape))

    channels_last = data_format == 'channels_last'
    static_shape = net.get_shape().as_list()
    if channels_last:
        _, width, height, _ = static_shape
    else:
        _, _, width, height = static_shape
    if width != height:
        raise ValueError('Input tensor with width!=height is not supported.')

    dropblock_size = min(dropblock_size, width)
    # seed_drop_rate is the gamma parameter of DropBlock.
    seed_drop_rate = (1.0 - keep_prob) * width**2 / dropblock_size**2 / (
        width - dropblock_size + 1)**2

    # Forces the block to be inside the feature map: a position may seed a
    # block only if the whole block around it stays within bounds.
    lowest_center = int(dropblock_size // 2)
    center_limit = width - (dropblock_size - 1) // 2
    w_i, h_i = tf.meshgrid(tf.range(width), tf.range(width))
    w_in_bounds = tf.logical_and(w_i >= lowest_center, w_i < center_limit)
    h_in_bounds = tf.logical_and(h_i >= lowest_center, h_i < center_limit)
    valid_block_center = tf.logical_and(w_in_bounds, h_in_bounds)

    # Add the batch axis, then the channel axis, for broadcasting.
    valid_block_center = tf.expand_dims(valid_block_center, 0)
    channel_axis = -1 if channels_last else 0
    valid_block_center = tf.expand_dims(valid_block_center, channel_axis)

    randnoise = tf.random_uniform(net.shape, dtype=tf.float32)
    # True where the unit is kept; False marks the seed of a dropped block.
    keep_seed = (
        1 - tf.cast(valid_block_center, dtype=tf.float32) + tf.cast(
            (1 - seed_drop_rate), dtype=tf.float32) + randnoise) >= 1
    block_pattern = tf.cast(keep_seed, dtype=tf.float32)

    if dropblock_size == width:
        # The block covers the whole map: one seed drops the entire map.
        spatial_axes = [1, 2] if channels_last else [2, 3]
        block_pattern = tf.reduce_min(
            block_pattern, axis=spatial_axes, keepdims=True)
    else:
        if channels_last:
            ksize = [1, dropblock_size, dropblock_size, 1]
            pool_format = 'NHWC'
        else:
            ksize = [1, 1, dropblock_size, dropblock_size]
            pool_format = 'NCHW'
        # Min-pooling (negated max-pool) grows every seed into a full block.
        block_pattern = -tf.nn.max_pool(
            -block_pattern,
            ksize=ksize,
            strides=[1, 1, 1, 1],
            padding='SAME',
            data_format=pool_format)

    # Rescale by the kept fraction so the expected activation is unchanged.
    percent_ones = tf.cast(tf.reduce_sum(
        (block_pattern)), tf.float32) / tf.cast(tf.size(block_pattern),
                                                tf.float32)

    net = net / tf.cast(percent_ones, net.dtype) * tf.cast(
        block_pattern, net.dtype)
    return net
def compute_knowledge_selection_and_loss(self, features, encoder_output,
                                         fact_embedding, fact_lengths, margin,
                                         num_negative_samples):
    """Compute knowledge selection and loss.

    Args:
      features: features dict; reads "inputs", "triple_labels",
        "subject_mask", "predicate_mask" and "object_mask".
      encoder_output: <tf.float32>[batch_size, input_length, hidden_dim]
      fact_embedding: <tf.float32>[batch_size*triple_num, max_triple_length,
        emb_dim]
      fact_lengths: # <tf.int32>[batch_size*triple_num]
      margin: integer value for max margin in TransE loss,
      num_negative_samples: shuffle and sample multiple negative examples for
        the TransE loss. Must be at least 1: the negative loss is averaged
        over `2 * num_negative_samples` terms.

    Returns:
      A 4-tuple:
        triple_logits: relevance score of every triple for its context.
        avg_triple_loss: mean weighted cross-entropy of the logits against
          `features["triple_labels"]`; stays 0.0 in PREDICT mode.
        original_knowledge_encoder_output: the averaged fact embeddings,
          reshaped to [batch_size, triple_num, hidden_dim].
        transe_loss: `margin + positive_loss - negative_loss`, clipped to
          [0, 100].

    Raises:
      ValueError: if `hparams.similarity_fuction` is neither "dot_product"
        nor "bilinear".
    """
    hparams = self._hparams

    # Fail early with a clear message; otherwise an unknown value would leave
    # `triple_logits` unbound and surface much later as an
    # UnboundLocalError.
    if hparams.similarity_fuction not in ("dot_product", "bilinear"):
        raise ValueError(
            "Unsupported similarity_fuction: %r (expected 'dot_product' or "
            "'bilinear')" % (hparams.similarity_fuction,))

    encoder_output_shape = common_layers.shape_list(encoder_output)
    encoder_hidden_dim = encoder_output_shape[-1]
    inputs = features["inputs"]
    # <tf.float32>[batch_size, input_length, emb_dim]
    inputs = tf.squeeze(inputs, 2)
    # <tf.float32>[batch_size, input_length]
    context_padding = common_attention.embedding_to_padding(inputs)
    # <tf.float32>[batch_size]
    context_lens = tf.to_float(
        common_attention.padding_to_length(context_padding))
    # <tf.float32>[batch_size, 1]
    context_lens = tf.expand_dims(context_lens, -1)

    # Compute context vector summary.
    # <tf.float32>[batch_size, hidden_dim]
    context_vector_summary = compute_summary_embedding(
        encoder_output, context_lens, hparams)
    knowledge_encoder_output = compute_average_embedding(
        fact_embedding, fact_lengths)
    # <tf.float32>[batch_size, triple_num, emb_dim]
    knowledge_encoder_output = tf.reshape(
        knowledge_encoder_output, [-1, self.triple_num, encoder_hidden_dim])
    original_knowledge_encoder_output = knowledge_encoder_output

    if hparams.similarity_fuction == "dot_product":
        triple_logits = tf.squeeze(
            tf.matmul(knowledge_encoder_output,
                      tf.expand_dims(context_vector_summary, 2)), -1)
    elif hparams.similarity_fuction == "bilinear":
        # Tile the context vector summary.
        # <tf.float32>[batch_size, triple_num*hidden_dim]
        tiled_context_vector = tf.tile(context_vector_summary,
                                       [1, self.triple_num])
        # <tf.float32>[batch_size, triple_num, hidden_dim]
        context_vector = tf.reshape(
            tiled_context_vector, [-1, self.triple_num, encoder_hidden_dim])
        # compute outer product
        context_vector = tf.expand_dims(context_vector, -1)
        knowledge_encoder_output = tf.expand_dims(knowledge_encoder_output, 2)
        # <tf.float32>[batch_size, triple_num, hidden_dim, hidden_dim]
        outer_product = tf.matmul(context_vector, knowledge_encoder_output)
        outer_product = tf.reshape(
            outer_product,
            [-1, self.triple_num, encoder_hidden_dim * encoder_hidden_dim])
        # The layer name is kept verbatim: it is part of the variable names.
        triple_logits = tf.squeeze(
            tf.layers.dense(outer_product, 1, name="knolwedge_final_mlp"), -1)

    avg_triple_loss = 0.0
    triple_labels = features["triple_labels"]

    # Flatten each role mask to one row per triple.
    subject_mask = tf.reshape(
        features["subject_mask"],
        [-1, self.triple_num, hparams.max_triple_length])
    subject_mask = tf.reshape(subject_mask, [-1, hparams.max_triple_length])
    predicate_mask = tf.reshape(
        features["predicate_mask"],
        [-1, self.triple_num, hparams.max_triple_length])
    predicate_mask = tf.reshape(predicate_mask,
                                [-1, hparams.max_triple_length])
    object_mask = tf.reshape(
        features["object_mask"],
        [-1, self.triple_num, hparams.max_triple_length])
    object_mask = tf.reshape(object_mask, [-1, hparams.max_triple_length])

    # mask : [bs, max_seq_len, triple_num]
    # the below operation will result in [bs*triple_num,emb_dim]
    subject_length = tf.cast(
        tf.expand_dims(tf.reduce_sum(subject_mask, -1), 1),
        tf.float32)  # [bs*tn]
    object_length = tf.cast(
        tf.expand_dims(tf.reduce_sum(object_mask, -1), 1), tf.float32)
    predicate_length = tf.cast(
        tf.expand_dims(tf.reduce_sum(predicate_mask, -1), 1), tf.float32)

    # expand dimension 2 to be able to broadcast
    subject_mask = tf.cast(tf.expand_dims(subject_mask, 2), tf.float32)
    predicate_mask = tf.cast(tf.expand_dims(predicate_mask, 2), tf.float32)
    object_mask = tf.cast(tf.expand_dims(object_mask, 2), tf.float32)

    # Masked mean of the token embeddings for each role; the 1e-5 keeps the
    # division finite when a role has no tokens.
    subject_vect = tf.reduce_sum(tf.multiply(
        fact_embedding, subject_mask), 1) / (
            subject_length +
            tf.broadcast_to(tf.constant([1e-5]), tf.shape(subject_length)))
    object_vect = tf.reduce_sum(tf.multiply(
        fact_embedding, object_mask), 1) / (
            object_length +
            tf.broadcast_to(tf.constant([1e-5]), tf.shape(object_length)))
    predicate_vect = tf.reduce_sum(
        tf.multiply(fact_embedding, predicate_mask), 1) / (
            predicate_length +
            tf.broadcast_to(tf.constant([1e-5]), tf.shape(predicate_length)))

    # Shuffled rows to generate adversarial samples
    shuffled_subject_vect = []
    shuffled_object_vect = []
    for _ in range(num_negative_samples):
        shuffled_subject_vect.append(
            tf.gather(
                subject_vect,
                tf.random.shuffle(tf.range(
                    tf.shape(subject_vect)[0]))))  # [bs*tn,d]
        shuffled_object_vect.append(
            tf.gather(
                object_vect,
                tf.random.shuffle(tf.range(
                    tf.shape(object_vect)[0]))))  # [bs*tn,d]

    # KB pretraining loss
    positive_loss = tf.reduce_mean(
        tf.squared_difference(subject_vect + predicate_vect, object_vect))
    negative_loss = 0
    for n_adv in range(num_negative_samples):
        # Corrupt the subject, then the object, of every triple.
        negative_loss += tf.reduce_mean(
            tf.squared_difference(
                shuffled_subject_vect[n_adv] + predicate_vect, object_vect))
        negative_loss += tf.reduce_mean(
            tf.squared_difference(subject_vect + predicate_vect,
                                  shuffled_object_vect[n_adv]))

    # TransE Loss
    negative_loss = negative_loss / (2 * num_negative_samples)
    transe_loss = tf.clip_by_value(margin + positive_loss - negative_loss,
                                   clip_value_min=0,
                                   clip_value_max=100)

    if hparams.mode != tf.estimator.ModeKeys.PREDICT:
        triple_losses = tf.nn.weighted_cross_entropy_with_logits(
            labels=triple_labels,
            logits=triple_logits,
            pos_weight=hparams.pos_weight)
        avg_triple_loss = tf.reduce_mean(triple_losses)
        tf.summary.scalar("triple_loss", avg_triple_loss)

    return (triple_logits, avg_triple_loss,
            original_knowledge_encoder_output, transe_loss)
def make_ordered_one_hot_vectors(num, num_tokens):
    """Makes one hot vectors of size [num, num_tokens].

    The rows are ordered by token id: each id is repeated
    `ceil(num / num_tokens)` times in a row before moving on to the next
    id, and the resulting sequence is cut off after `num` rows.
    """
    copies_per_token = int(np.ceil(num / float(num_tokens)))
    token_ids = tf.range(num_tokens)
    # [copies, num_tokens] -> transpose so equal ids become adjacent.
    tiled_ids = tf.stack([token_ids] * copies_per_token)
    grouped_ids = tf.reshape(tf.transpose(tiled_ids), [-1])
    selected_ids = grouped_ids[0:num]
    return tf.one_hot(selected_ids, depth=num_tokens)
def positions_for(tokens, past_length):
    """Position ids for `tokens`, continuing after `past_length` steps.

    The ids `past_length + [0, 1, ..., nsteps - 1]` are computed once, where
    `nsteps` is the size of the second axis of `tokens`, and handed to
    `expand_tile` together with the size of the first axis.
    """
    num_rows = tf.shape(tokens)[0]
    num_steps = tf.shape(tokens)[1]
    step_positions = past_length + tf.range(num_steps)
    return expand_tile(step_positions, num_rows)
def _build_sampler(self):
    """Build the sampler ops and the log_prob ops.

    For every layer the controller first samples which earlier layer to
    connect to (an attention over the hidden states seen so far), then
    samples a function id from the embedding table `self.w_emb`. Sets
    `self.sample_arc`, `self.sample_log_probs`, `self.ppl`,
    `self.sample_entropy` and `self.all_h`.
    """
    params = self.params
    hidden_size = params.controller_hidden_size
    num_layers = params.controller_num_layers

    decisions = []
    decision_log_probs = []
    decision_entropies = []

    def _rescale(raw_logits):
        """Applies the optional temperature and tanh squashing."""
        if params.controller_temperature:
            raw_logits /= params.controller_temperature
        if params.controller_tanh_constant:
            raw_logits = params.controller_tanh_constant * tf.tanh(raw_logits)
        return raw_logits

    def _sample_and_record(final_logits):
        """Samples one decision and records it, its log-prob and entropy."""
        choice = tf.multinomial(final_logits, 1)
        choice = tf.to_int32(choice)
        choice = tf.reshape(choice, [1])
        decisions.append(choice)

        log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=final_logits, labels=choice)
        decision_log_probs.append(log_prob)

        entropy = log_prob * tf.exp(-log_prob)
        decision_entropies.append(tf.stop_gradient(entropy))
        return choice

    all_h = [tf.zeros([1, hidden_size], dtype=tf.float32)]
    all_h_w = [tf.zeros([1, hidden_size], dtype=tf.float32)]

    # sampler ops
    prev_c = tf.zeros([1, hidden_size], dtype=tf.float32)
    prev_h = tf.zeros([1, hidden_size], dtype=tf.float32)
    inputs = self.g_emb

    for layer_id in range(1, num_layers + 1):
        # --- choose which previous layer feeds this one ---
        next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        all_h.append(next_h)
        all_h_w.append(tf.matmul(next_h, self.attn_w_1))

        # Attention of the current state over all earlier states.
        query = tf.matmul(next_h, self.attn_w_2)
        query = query + tf.concat(all_h_w[:-1], axis=0)
        query = tf.tanh(query)
        skip_logits = tf.matmul(query, self.attn_v)
        skip_logits = tf.reshape(skip_logits, [1, layer_id])
        skip_logits = _rescale(skip_logits)

        # Penalize far-away layers by their squared distance.
        diff = tf.to_float(layer_id - tf.range(0, layer_id)) ** 2
        skip_logits -= tf.reshape(diff, [1, layer_id]) / 6.0

        skip_index = _sample_and_record(skip_logits)

        inputs = tf.nn.embedding_lookup(
            tf.concat(all_h[:-1], axis=0), skip_index)
        inputs /= (0.1 + tf.to_float(layer_id - skip_index))

        # --- choose the function applied at this layer ---
        next_c, next_h = _lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        func_logits = tf.matmul(next_h, self.w_emb, transpose_b=True)
        func_logits = _rescale(func_logits)

        func = _sample_and_record(func_logits)
        inputs = tf.nn.embedding_lookup(self.w_emb, func)

    self.sample_arc = tf.concat(decisions, axis=0)

    self.sample_log_probs = tf.concat(decision_log_probs, axis=0)
    self.ppl = tf.exp(tf.reduce_mean(self.sample_log_probs))

    self.sample_entropy = tf.reduce_sum(
        tf.concat(decision_entropies, axis=0))

    self.all_h = all_h
def initialize(shape, dtype):
    """Returns a tensor of `shape` filled with 1, 2, 3, ... in row-major order."""
    num_elements = np.prod(shape)
    counting = 1 + tf.range(num_elements, dtype=dtype)
    return tf.reshape(counting, shape)
def create_cm_sketch(topk_obj_ids, topk_obj_weights, all_entity_sketches,
                     cm_width):
    """Create cm sketches for a set of weighted entities.

    Args:
      topk_obj_ids: batch_size, topk
      topk_obj_weights: batch_size, topk
      all_entity_sketches: num_entities, depth
      cm_width: width of count-min sketch

    Returns:
      k hot dense vectors: batch_size, depth, width
    """
    # Hash buckets of every selected entity: batch_size, topk, depth
    bucket_ids = tf.gather(all_entity_sketches, topk_obj_ids, axis=0)
    num_rows = tf.shape(bucket_ids)[0]
    num_objs = tf.shape(bucket_ids)[1]
    num_hashes = tf.shape(bucket_ids)[2]

    # The sketch is assembled as a sparse tensor and densified at the end,
    # which is cheaper than building k one-hot vectors and merging them into
    # one k-hot vector. Every (batch, topk, depth) triple yields one entry;
    # all per-entry arrays below are flattened to batch_size * topk * depth.

    # Entry coordinate along the width axis.
    flat_bucket_ids = tf.reshape(bucket_ids, shape=[-1])

    # Entry value: the entity weight, repeated for each hash function.
    tiled_weights = tf.tile(tf.expand_dims(topk_obj_weights, axis=2),
                            multiples=[1, 1, num_hashes])
    flat_weights = tf.reshape(tiled_weights, shape=[-1])

    # Entry coordinate along the batch axis.
    row_ids = tf.range(num_rows)  # batch_size,
    row_ids = tf.expand_dims(tf.expand_dims(row_ids, axis=1),
                             axis=2)  # batch_size, 1, 1
    row_ids = tf.tile(row_ids,
                      multiples=[1, num_objs, num_hashes])
    flat_row_ids = tf.reshape(row_ids, shape=[-1])

    # Entry coordinate along the depth (hash function) axis.
    hash_fn_ids = tf.range(num_hashes)  # depth,
    hash_fn_ids = tf.expand_dims(tf.expand_dims(hash_fn_ids, axis=0),
                                 axis=1)  # 1, 1, depth
    hash_fn_ids = tf.tile(hash_fn_ids,
                          multiples=[num_rows, num_objs, 1])
    flat_hash_fn_ids = tf.reshape(hash_fn_ids, shape=[-1])

    coordinates = tf.cast(tf.stack(
        [flat_row_ids, flat_hash_fn_ids, flat_bucket_ids], axis=1),
                          dtype=tf.int64)

    # Order the entries by ascending weight. If several entries share a
    # coordinate, the later (larger) value is the one that is kept.
    ascending = tf.argsort(flat_weights,
                           direction='ASCENDING',
                           stable=True)
    sorted_weights = tf.gather(flat_weights, ascending)
    sorted_coordinates = tf.gather(coordinates, ascending)

    # Indices are deliberately not validated: duplicate coordinates are
    # expected, and keeping the last (largest) value is by design.
    sparse_sketch = tf.SparseTensor(
        indices=sorted_coordinates,
        values=sorted_weights,
        dense_shape=[num_rows, num_hashes, cm_width])
    # batch_size, cm_depth, cm_width
    return tf.sparse.to_dense(sparse_sketch, validate_indices=False)
def __init__(self,
             session,
             player_id,
             info_state_size,
             num_actions,
             loss_str="a2c",
             loss_class=None,
             hidden_layers_sizes=(128, ),
             batch_size=16,
             critic_learning_rate=0.01,
             pi_learning_rate=0.001,
             entropy_cost=0.01,
             num_critic_before_pi=8,
             additional_discount_factor=1.0,
             max_global_gradient_norm=None,
             optimizer_str="sgd"):
    """Build the policy-gradient agent: networks, losses and train ops.

    Note that users must provide *only one of* `loss_str` or `loss_class`.

    Args:
      session: Tensorflow session.
      player_id: int, player identifier. Usually its position in the game.
      info_state_size: int, info_state vector size.
      num_actions: int, number of actions per info state.
      loss_str: string or None. If string, must be one of ["rpg", "qpg",
        "rm", "a2c"] and defined in `_get_loss_class`. If None, a loss class
        must be passed through `loss_class`. Defaults to "a2c".
      loss_class: Class or None. If Class, it must define the policy gradient
        loss. If None a loss class in a string format must be passed through
        `loss_str`. Defaults to None.
      hidden_layers_sizes: iterable, defines the neural network layers.
        Defaults to (128,), which produces a NN:
        [INPUT] -> [128] -> ReLU -> [OUTPUT].
      batch_size: int, batch size to use for Q and Pi learning.
        Defaults to 16.
      critic_learning_rate: float, learning rate used for Critic (Q or V).
        Defaults to 0.01.
      pi_learning_rate: float, learning rate used for Pi. Defaults to 0.001.
      entropy_cost: float, entropy cost used to multiply the entropy loss.
        Can be set to None to skip entropy computation. Defaults to 0.01.
      num_critic_before_pi: int, number of Critic (Q or V) updates before
        each Pi update. Defaults to 8 (every 8th critic learning step, Pi
        also learns).
      additional_discount_factor: float, additional discount to compute
        returns. Defaults to 1.0, in which case, no extra discount is
        applied.
      max_global_gradient_norm: float or None, maximum global norm of a
        gradient to which the gradient is shrunk if its value is larger.
      optimizer_str: String defining which optimizer to use. Supported
        values are {sgd, adam}.

    Raises:
      AssertionError: if both or neither of `loss_str` / `loss_class` are
        truthy.
      ValueError: if `optimizer_str` is neither "adam" nor "sgd".
    """
    assert bool(loss_str) ^ bool(
        loss_class), "Please provide only one option."
    # Snapshot of the call arguments; taken before any other local is bound.
    self._kwargs = locals()

    if not loss_class:
        loss_class = self._get_loss_class(loss_str)
    self._loss_class = loss_class

    # Plain configuration.
    self.player_id = player_id
    self._session = session
    self._num_actions = num_actions
    self._layer_sizes = hidden_layers_sizes
    self._batch_size = batch_size
    self._extra_discount = additional_discount_factor
    self._num_critic_before_pi = num_critic_before_pi

    # Experience buffers and the previous-step memory.
    self._episode_data = []
    self._dataset = collections.defaultdict(list)
    self._prev_time_step = None
    self._prev_action = None

    # Step counters
    self._step_counter = 0
    self._episode_counter = 0
    self._num_learn_steps = 0

    # Keep track of the last training loss achieved in an update step.
    self._last_loss_value = None

    # Placeholders
    self._info_state_ph = tf.placeholder(
        shape=[None, info_state_size], dtype=tf.float32, name="info_state_ph")
    self._action_ph = tf.placeholder(
        shape=[None], dtype=tf.int32, name="action_ph")
    self._return_ph = tf.placeholder(
        shape=[None], dtype=tf.float32, name="return_ph")

    # Network
    # The torso keeps its final activation because the policy head and the
    # critic head are attached on top of it.
    self._net_torso = snt.nets.MLP(
        output_sizes=self._layer_sizes, activate_final=True)
    torso_out = self._net_torso(self._info_state_ph)
    self._policy_logits_layer = snt.Linear(
        output_size=self._num_actions, name="policy_head")
    self.policy_logits_network = snt.Sequential(
        [self._net_torso, self._policy_logits_layer])
    self._policy_logits = self._policy_logits_layer(torso_out)
    self._policy_probs = tf.nn.softmax(self._policy_logits)

    self._savers = []

    # A2C trains a state-value baseline; every other loss trains Q-values.
    uses_baseline = loss_class.__name__ == "BatchA2CLoss"

    # Critic head and critic loss.
    if uses_baseline:
        self._baseline_layer = snt.Linear(output_size=1, name="baseline")
        self._baseline = tf.squeeze(
            self._baseline_layer(torso_out), axis=1)
        critic_predictions = self._baseline
    else:
        self._q_values_layer = snt.Linear(
            output_size=self._num_actions, name="q_values_head")
        self._q_values = self._q_values_layer(torso_out)
        # Select Q(s, a) for the action actually taken in each row.
        row_ids = tf.range(tf.shape(self._q_values)[0])
        action_indices = tf.stack([row_ids, self._action_ph], axis=-1)
        critic_predictions = tf.gather_nd(self._q_values, action_indices)
    self._critic_loss = tf.reduce_mean(
        tf.losses.mean_squared_error(
            labels=self._return_ph, predictions=critic_predictions))

    # Both optimizers are of the same kind, chosen once here.
    if optimizer_str == "adam":
        optimizer_cls = tf.train.AdamOptimizer
    elif optimizer_str == "sgd":
        optimizer_cls = tf.train.GradientDescentOptimizer
    else:
        raise ValueError("Not implemented, choose from 'adam' and 'sgd'.")
    self._critic_optimizer = optimizer_cls(learning_rate=critic_learning_rate)

    def minimize_with_clipping(optimizer, loss):
        # Apply the gradients of `loss`, clipped by global norm if requested.
        grads_and_vars = optimizer.compute_gradients(loss)
        if max_global_gradient_norm is None:
            return optimizer.apply_gradients(grads_and_vars)
        grads, variables = zip(*grads_and_vars)
        clipped, _ = tf.clip_by_global_norm(grads, max_global_gradient_norm)
        return optimizer.apply_gradients(list(zip(clipped, variables)))

    self._critic_learn_step = minimize_with_clipping(
        self._critic_optimizer, self._critic_loss)

    # Pi loss
    pg_class = loss_class(entropy_cost=entropy_cost)
    if uses_baseline:
        self._pi_loss = pg_class.loss(
            policy_logits=self._policy_logits,
            baseline=self._baseline,
            actions=self._action_ph,
            returns=self._return_ph)
    else:
        self._pi_loss = pg_class.loss(
            policy_logits=self._policy_logits, action_values=self._q_values)
    self._pi_optimizer = optimizer_cls(learning_rate=pi_learning_rate)
    self._pi_learn_step = minimize_with_clipping(
        self._pi_optimizer, self._pi_loss)

    self._loss_str = loss_str
    self._initialize()
def _build_single_q_network(self, observations, head, state_t, state_tp1,
                            done_mask, reward_t, error_weight):
    """Builds the computational graph for a single Q network.

    Briefly, this part is calculating the following two quantities:
    1. q_value = q_fn(observations)
    2. td_error = q_fn(state_t) - reward_t - gamma * q_fn(state_tp1)
    The optimization target is to minimize the td_error.

    Args:
      observations: shape = [batch_size, hparams.fingerprint_length].
        The input of the Q function.
      head: shape = [1].
        The index of the head chosen for decision in bootstrap DQN.
      state_t: shape = [batch_size, hparams.fingerprint_length].
        The state at time step t.
      state_tp1: a list of tensors, with total number of batch_size,
        each has shape = [num_actions, hparams.fingerprint_length].
        Note that the num_actions can be different for each tensor.
        The state at time step t+1, tp1 is short for t plus 1.
      done_mask: shape = [batch_size, 1]
        Whether state_tp1 is the terminal state.
      reward_t: shape = [batch_size, 1]
        the reward at time step t.
      error_weight: shape = [batch_size, 1]
        weight for the loss.

    Returns:
      q_values: Tensor of [batch_size, 1]. The q values for the observations.
      td_error: Tensor of [batch_size, 1]. The TD error.
      weighted_error: Scalar tensor. The mean over the batch of the Huber
        loss of the TD error, weighted by error_weight.
      q_fn_vars: List of tf.Variables. The variables of q_fn when computing
        the q_values of state_t
      q_tp1_vars: List of tf.Variables. The variables of q_fn when computing
        the q_values of state_tp1
    """
    with tf.variable_scope('q_fn'):
        # q_value have shape [batch_size, 1].
        q_values = tf.gather(self.q_fn(observations), head, axis=-1)

    # calculating q_fn(state_t)
    # The Q network shares parameters with the action graph.
    with tf.variable_scope('q_fn', reuse=True):
        q_t = self.q_fn(state_t, reuse=True)
    q_fn_vars = tf.trainable_variables(
        scope=tf.get_variable_scope().name + '/q_fn')

    # calculating q_fn(state_tp1)
    with tf.variable_scope('q_tp1', reuse=tf.AUTO_REUSE):
        q_tp1 = [self.q_fn(s_tp1, reuse=tf.AUTO_REUSE) for s_tp1 in state_tp1]
    q_tp1_vars = tf.trainable_variables(
        scope=tf.get_variable_scope().name + '/q_tp1')

    if self.double_q:
        with tf.variable_scope('q_fn', reuse=True):
            q_tp1_online = [
                self.q_fn(s_tp1, reuse=True) for s_tp1 in state_tp1
            ]
        num_heads = self.num_bootstrap_heads or 1
        # determine the action to choose based on online Q estimator.
        q_tp1_online_idx = [
            tf.stack(
                [tf.argmax(q, axis=0),
                 tf.range(num_heads, dtype=tf.int64)],
                axis=1) for q in q_tp1_online
        ]
        # use the index from max online q_values to compute the value
        # function
        v_tp1 = tf.stack(
            [tf.gather_nd(q, idx) for q, idx in zip(q_tp1, q_tp1_online_idx)],
            axis=0)
    else:
        v_tp1 = tf.stack([tf.reduce_max(q) for q in q_tp1], axis=0)

    # if s_{t+1} is the terminal state, we do not evaluate the Q value of
    # the state.
    q_tp1_masked = (1.0 - done_mask) * v_tp1

    q_t_target = reward_t + self.gamma * q_tp1_masked

    # stop gradient from flowing to the computing graph which computes
    # the Q value of s_{t+1}.
    # td_error has shape [batch_size, 1]
    td_error = q_t - tf.stop_gradient(q_t_target)

    # If use bootstrap, each head is trained with a different subset of the
    # training sample. Like the idea of dropout.
    if self.num_bootstrap_heads:
        head_mask = tf.keras.backend.random_binomial(
            shape=(1, self.num_bootstrap_heads), p=0.6)
        td_error = tf.reduce_mean(td_error * head_mask, axis=1)

    # Huber loss with threshold 1.0: quadratic (0.5 * e^2) for small errors
    # and linear (|e| - 0.5) once the td error gets larger than 1.0, so that
    # outliers do not dominate the way they do under a pure l2 loss.
    # The 0.5 factor on the quadratic branch is what makes the two pieces
    # meet at |e| == 1.0 (both equal 0.5) with matching slope; without it the
    # loss jumps from 1.0 down to 0.5 at the threshold.
    abs_td_error = tf.abs(td_error)
    errors = tf.where(
        abs_td_error < 1.0,
        0.5 * tf.square(td_error),
        1.0 * (abs_td_error - 0.5))
    weighted_error = tf.reduce_mean(error_weight * errors)
    return q_values, td_error, weighted_error, q_fn_vars, q_tp1_vars
def naive_mcmc_ll(x, presence=None):
    """Assign every input point to its single best-scoring vote.

    The simplest of the optimisation schemes: instead of combining the
    closed-form probability of a part given _all_ data, each part is
    represented by the vote at the argmax of its posterior mixing logits.

    Relies on `_votes`, `_scales`, `_vote_presence_prob`, `_n_votes`,
    `_get_pdf` and `OutputTuple` from the enclosing scope.

    Args:
      x: input points; the first two static dimensions are read as
        (batch, number of points).
      presence: not used by this implementation.

    Returns:
      An `OutputTuple`; `log_prob` is None and the two `soft_winner*` fields
      are zero placeholders.
    """
    batch_size, n_input_points = x.shape[:2].as_list()

    # Per-vote component distributions.
    # [B, 1, n_votes, n_input_dims]
    votes_with_point_axis = tf.expand_dims(_votes, 1)
    scales_with_point_axis = tf.expand_dims(tf.expand_dims(_scales, 1), -1)
    vote_component_pdf = _get_pdf(votes_with_point_axis,
                                  scales_with_point_axis)
    print("vote_component_pdf: ", vote_component_pdf)

    # Evaluate every point under every vote component.
    # [B, n_points, n_caps x n_votes, n_input_dims]
    expanded_x = tf.expand_dims(x, 2)
    print("expanded_x: ", expanded_x.shape)
    vote_log_prob_per_dim = vote_component_pdf.log_prob(expanded_x)
    print("vote_log_prob_dim: ", vote_log_prob_per_dim.shape)

    # Sum the log-likelihood over the coordinates of each point.
    # [B, n_points, n_caps x n_votes]
    vote_log_prob = tf.reduce_sum(vote_log_prob_per_dim, -1)
    print("vote_log_prob: ", vote_log_prob.shape)

    # One extra "dummy" slot per point with constant value -2 * log(10).
    dummy_vote_log_prob = (
        tf.zeros([batch_size, n_input_points, 1]) - 2. * tf.log(10.))
    print("dummy_vote: ", dummy_vote_log_prob.shape)

    # The dummy slot goes last on the vote axis (the original author's note
    # here questions why it is needed).
    vote_log_prob = tf.concat([vote_log_prob, dummy_vote_log_prob], 2)
    print("cat vote_log_prob: ", vote_log_prob.shape)

    # Conditional logit a_(k,n): log presence of each vote plus a dummy entry.
    mixing_logits = math_ops.safe_log(_vote_presence_prob)
    dummy_logit = tf.zeros([batch_size, 1]) - 2. * tf.log(10.)
    mixing_logits = tf.concat([mixing_logits, dummy_logit], 1)
    print("mixing_logits : ", mixing_logits.shape)

    # Normalise over votes (flagged "BAD" by the original author: vote
    # presence divided by summed vote presence).
    mixing_log_prob = mixing_logits - tf.reduce_logsumexp(
        mixing_logits, 1, keepdims=True)
    print("mixing_log_prob : ", mixing_log_prob.shape)

    # [B, n_points, n_votes (+ dummy)]
    posterior_mixing_logits_per_point = (
        tf.expand_dims(mixing_log_prob, 1) + vote_log_prob)
    print("posterior_mixing_per_point: ",
          posterior_mixing_logits_per_point.shape)

    # Best real vote per point; the trailing dummy slot is excluded.
    # [B, n_points]
    winning_vote_idx = tf.argmax(
        posterior_mixing_logits_per_point[:, :, :-1], 2)
    print("winning_vote_idx: ", winning_vote_idx.shape)

    # Pair each winning index with its batch row for gather_nd.
    batch_idx = tf.expand_dims(tf.range(batch_size, dtype=tf.int64), -1)
    batch_idx = snt.TileByDim([1], [winning_vote_idx.shape[-1]])(batch_idx)
    gather_idx = tf.stack([batch_idx, winning_vote_idx], -1)

    winning_vote = tf.gather_nd(_votes, gather_idx)
    print("winning_vote: ", winning_vote.shape)
    winning_pres = tf.gather_nd(_vote_presence_prob, gather_idx)
    print("winning_pres: ", winning_pres.shape)

    # A vote counts as present when its logit beats the dummy logit.
    vote_presence = tf.greater(mixing_logits[:, :-1], mixing_logits[:, -1:])
    print("vote_presence: ", vote_presence.shape)

    # Assumes the votes are laid out capsule by capsule, `_n_votes` each.
    is_from_capsule = winning_vote_idx // _n_votes
    print("is_from_capsule: ", is_from_capsule.shape)

    posterior_mixing_probs = tf.nn.softmax(
        posterior_mixing_logits_per_point, -1)[..., :-1]

    assert winning_vote.shape == x.shape

    return OutputTuple(
        log_prob=None,
        vote_presence=tf.to_float(vote_presence),
        winner=winning_vote,
        winner_pres=winning_pres,
        is_from_capsule=is_from_capsule,
        mixing_logits=mixing_logits,
        mixing_log_prob=mixing_log_prob,
        # TODO(adamrk): this is broken
        soft_winner=tf.zeros_like(winning_vote),
        soft_winner_pres=tf.zeros_like(winning_pres),
        posterior_mixing_probs=posterior_mixing_probs,
    )
def _compute_object_logits(hparams, object_hidden, screen_encoding,
                           screen_encoding_bias):
    """Output layer for one domain: score screen objects per step.

    The alignment between the per-step hidden state and every screen object
    is computed with the scheme named by `hparams.alignment`. The encoding of
    the best-scoring object is then combined with the hidden state to produce
    a two-way "consumed" prediction.

    Args:
      hparams: hyper-parameters; reads `alignment`, `hidden_size` and, for
        "mlp_attention", `norm_type` and `norm_epsilon`.
      object_hidden: per-step hidden state. Assumed to be
        [batch_size, num_steps, hidden] (TODO confirm with the caller).
      screen_encoding: per-object encodings. Assumed to be
        [batch_size, num_steps, num_objects, depth] (TODO confirm).
      screen_encoding_bias: bias added to the object logits.

    Returns:
      A pair (obj_logits, consumed_logits).

    Raises:
      ValueError: if `hparams.alignment` is not a supported scheme.
    """
    alignment = hparams.alignment
    hidden_size = hparams.hidden_size

    def _project(tensor):
        # Dense projection to the model's hidden size.
        return tf.layers.dense(tensor, units=hidden_size)

    def _cosine_scores(screen, query):
        # Dot product of the l2-normalised screen encodings and query.
        unit_screen = tf.math.l2_normalize(screen, axis=-1)
        unit_query = tf.math.l2_normalize(query, axis=-1)
        return tf.matmul(unit_screen, tf.expand_dims(unit_query, 3))

    with tf.variable_scope("compute_object_logits", reuse=tf.AUTO_REUSE):
        if alignment == "cosine_similarity":
            object_hidden = _project(object_hidden)
            screen_encoding = _project(screen_encoding)
            align_logits = _cosine_scores(screen_encoding, object_hidden)
        elif alignment == "scaled_cosine_similarity":
            object_hidden = _project(object_hidden)
            # Make the last dimension static before the dense projection.
            static_shape = (
                common_layers.shape_list(screen_encoding)[:-1] +
                [hidden_size])
            screen_encoding = tf.reshape(screen_encoding, static_shape)
            screen_encoding = _project(screen_encoding)
            dot_products = _cosine_scores(screen_encoding, object_hidden)
            # Learned scale and offset applied to the cosine score.
            align_logits = tf.layers.dense(dot_products, units=1)
        elif alignment == "dot_product_attention":
            object_hidden = _project(object_hidden)
            align_logits = tf.matmul(screen_encoding,
                                     tf.expand_dims(object_hidden, 3))
        elif alignment == "mlp_attention":
            screen_shape = tf.shape(screen_encoding)
            batch_size = screen_shape[0]
            num_steps = screen_shape[1]
            num_objects = screen_shape[2]
            # Pair the hidden state with every object encoding.
            tiled_object_hidden = tf.tile(
                tf.expand_dims(object_hidden, 2), [1, 1, num_objects, 1])
            align_feature = tf.reshape(
                tf.concat([tiled_object_hidden, screen_encoding], axis=-1),
                [batch_size, num_steps, num_objects, hidden_size * 2])
            with tf.variable_scope("align", reuse=tf.AUTO_REUSE):
                align_hidden = _project(align_feature)
                align_hidden = common_layers.apply_norm(
                    align_hidden,
                    hparams.norm_type,
                    hidden_size,
                    epsilon=hparams.norm_epsilon)
                align_hidden = tf.nn.tanh(align_hidden)
                align_logits = tf.layers.dense(align_hidden, units=1)
        else:
            raise ValueError("Unsupported alignment: %s" % alignment)

        obj_logits = tf.squeeze(align_logits, [3]) + screen_encoding_bias

        # [batch_size, num_steps]
        batch_size, num_steps = common_layers.shape_list(obj_logits)[:2]

        # Coordinates (batch, step, argmax object) of the best object at each
        # step; every column is [batch_size * num_steps, 1].
        batch_grid = tf.tile(
            tf.expand_dims(tf.range(batch_size), 1), [1, num_steps])
        batch_indices = tf.to_int64(tf.reshape(batch_grid, [-1, 1]))
        step_grid = tf.tile(
            tf.expand_dims(tf.range(num_steps), 0), [batch_size, 1])
        step_indices = tf.to_int64(tf.reshape(step_grid, [-1, 1]))
        object_indices = tf.reshape(tf.argmax(obj_logits, -1), [-1, 1])
        indices = tf.concat([batch_indices, step_indices, object_indices], -1)

        # [batch_size, num_steps, depth]
        depth = tf.shape(screen_encoding)[-1]
        best_logits = tf.reshape(
            tf.gather_nd(screen_encoding, indices=indices),
            [batch_size, num_steps, depth])

        consumed_input = tf.reshape(
            tf.concat([object_hidden, best_logits], -1),
            [batch_size, num_steps, hidden_size * 2])
        consumed_logits = tf.layers.dense(consumed_input, 2)

        # NOTE(review): with reduce_all this assertion only fails when
        # *every* element is NaN; confirm whether reduce_any was intended.
        nan_check = tf.assert_equal(
            tf.reduce_all(tf.math.is_nan(consumed_logits)),
            False,
            data=[
                tf.shape(best_logits), best_logits,
                tf.constant("screen_encoding"), screen_encoding,
                tf.constant("indices"), indices
            ],
            summarize=10000,
            message="consumed_logits_nan")
        with tf.control_dependencies([nan_check]):
            consumed_logits = tf.identity(consumed_logits)
        return obj_logits, consumed_logits
def add_distance_loss_to_center(labels, logits, groundtruth_coords):
    """Register a distance-to-centre loss for ClickRegression.

    The target centre comes either from the ground-truth box or from the
    centre of mass of the label mask. Nothing is returned: the loss is added
    through `tf.losses`, to be picked up later by
    `tf.losses.get_total_loss()` in model_fn.

    Args:
      labels: label mask; entries equal to the dataset's ignore label are
        zeroed before use.
      logits: predicted coordinates in pixels; divided by
        `FLAGS.image_size` here.
      groundtruth_coords: dict with 'xmin', 'xmax', 'ymin' and 'ymax'
        entries. Presumably already normalised by the image size; verify
        against the caller.
    """
    ignore_label = model_input.dataset_descriptors[FLAGS.dataset].ignore_label
    weights = tf.to_int32(tf.not_equal(labels, ignore_label))
    labels *= weights

    # Prefer the ground-truth box when available: it needs less computation
    # than deriving the centre from the label mask.
    if FLAGS.use_groundtruth_box:
        center_x = (
            groundtruth_coords['xmin'] + groundtruth_coords['xmax']) / 2.0
        center_y = (
            groundtruth_coords['ymin'] + groundtruth_coords['ymax']) / 2.0
        center = tf.stack([center_y, center_x], axis=1)
    else:
        # One (row, column) coordinate pair per pixel.
        pixel_range = tf.range(FLAGS.image_size)
        ii, jj = tf.meshgrid(pixel_range, tf.range(FLAGS.image_size),
                             indexing='ij')
        coords = tf.cast(
            tf.stack([tf.reshape(ii, (-1, )), tf.reshape(jj, (-1, ))],
                     axis=-1),
            tf.int32)

        # One flat vector of label values per image.
        volumes_flat = tf.reshape(
            labels, [-1, FLAGS.image_size * FLAGS.image_size * 1, 1])

        # Total mass per image; the offset guards against division by zero.
        total_mass = tf.cast(
            tf.reduce_sum(volumes_flat, axis=1), tf.float32) + ZERO_DIV_OFFSET

        # Centre of mass in pixels, then normalised by the image size.
        weighted_coords = tf.reduce_sum(volumes_flat * coords, axis=1)
        center = tf.cast(weighted_coords, tf.float32) / total_mass
        center = center / FLAGS.image_size

    # Normalize coordinates by size of image
    logits = logits / FLAGS.image_size

    metric = FLAGS.distance_metric
    if metric == 'mse':
        tf.losses.mean_squared_error(center, logits)
        return
    if metric not in ('euclidean', 'euclidean_sqrt', 'euclidean_iter'):
        # Any other metric registers no loss.
        return

    squared_distance = tf.reduce_sum(tf.square(logits - center), axis=-1)
    distance_to_center = tf.sqrt(squared_distance + ZERO_DIV_OFFSET)

    loss = distance_to_center
    if FLAGS.ratio_box_distance:
        # Divide the distance by (distance to centre - distance to edge).
        distance_to_box = calc_distance_to_edge(groundtruth_coords, logits)
        box_distance_to_center = (
            tf.to_float(distance_to_center) - distance_to_box)
        loss = distance_to_center / (box_distance_to_center + ZERO_DIV_OFFSET)

    if metric == 'euclidean_sqrt':
        loss = tf.sqrt(loss)
    elif metric == 'euclidean_iter':
        # The exponent 1 / step shrinks as the global step grows.
        iter_num = tf.to_float(tf.train.get_or_create_global_step())
        step = (iter_num // FLAGS.euclidean_step) + 1.0
        loss = tf.pow(loss, tf.to_float(1.0 / step))

    tf.losses.compute_weighted_loss(loss)