def _calc_logpz(z, z_mu, z_log_sigma, **_):
    # Calculate log densities for sampled latent vectors in a standard
    # Gaussian distribution (zero mean, unit variance)
    logpz = _calc_gaussian_log_density(z, K.zeros_like(z_mu),
                                       K.zeros_like(z_log_sigma))
    logpz = K.sum(logpz, axis=1)
    return logpz
def masked_loss(y_true, y_pred):
    max_args = argmax(y_true)
    mask = cast(not_equal(max_args, zeros_like(max_args)), dtype='float32')
    loss = switch(mask,
                  categorical_crossentropy(y_true, y_pred, from_logits=True),
                  zeros_like(mask, dtype=floatx()))
    return sum(loss) / (cast(sum(mask), dtype='float32') + epsilon())
def masked_accuracy(y_true, y_pred):
    max_args = argmax(y_true)
    mask = cast(not_equal(max_args, zeros_like(max_args)), dtype='float32')
    points = switch(
        mask,
        cast(equal(argmax(y_true, -1), argmax(y_pred, -1)), dtype='float32'),
        zeros_like(mask, dtype=floatx()))
    return sum(points) / cast(sum(mask), dtype='float32')
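# Hedged usage sketch for the two masked metrics above (illustrative only; the toy
# model, vocabulary size and sequence length are made-up assumptions, and the metrics
# are assumed to receive one-hot targets in which class id 0 marks padding).
import tensorflow as tf
from tensorflow.keras import layers, models

VOCAB_SIZE = 100   # assumed vocabulary size; id 0 is padding and is ignored by the mask
SEQ_LEN = 20       # assumed sequence length

model = models.Sequential([
    layers.Embedding(VOCAB_SIZE, 64, input_length=SEQ_LEN),
    layers.LSTM(64, return_sequences=True),
    layers.Dense(VOCAB_SIZE),  # raw logits, matching from_logits=True in masked_loss
])
model.compile(optimizer='adam', loss=masked_loss, metrics=[masked_accuracy])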
def exact_matched_accuracy(y_true, y_pred, mask_id):
    true_ids = bk.argmax(y_true, axis=-1)
    pred_ids = bk.argmax(y_pred, axis=-1)
    mask_bool = bk.not_equal(true_ids, mask_id)
    mask_int64 = bk.cast(mask_bool, 'int64')
    # Count the non-masked positions where prediction and target differ;
    # a sequence counts as an exact match only when that count is zero.
    diff = (true_ids - pred_ids) * mask_int64
    matches = bk.cast(bk.not_equal(diff, bk.zeros_like(diff)), 'int64')
    matches = bk.sum(matches, axis=-1)
    matches = bk.cast(bk.equal(matches, bk.zeros_like(matches)), bk.floatx())
    return bk.mean(matches)
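# Hedged sanity check for exact_matched_accuracy (assumptions: eager execution, and
# class id 0 acts as the padding id passed in mask_id; the sequences are made up).
import tensorflow as tf

# Two sequences over 3 classes; the first sequence has one wrong, unmasked position.
y_true = tf.one_hot([[1, 2, 0], [2, 2, 0]], depth=3)
y_pred = tf.one_hot([[1, 1, 0], [2, 2, 1]], depth=3)
print(exact_matched_accuracy(y_true, y_pred, mask_id=0).numpy())  # 0.5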
def adversarial_loss(net_d, real, fake_abgr, distorted,
                     gan_training="mixup_LSGAN", **weights):
    """ Adversarial Loss Function from Shoanlu GAN """
    alpha = Lambda(lambda x: x[:, :, :, :1])(fake_abgr)
    fake_bgr = Lambda(lambda x: x[:, :, :, 1:])(fake_abgr)
    fake = alpha * fake_bgr + (1 - alpha) * distorted

    if gan_training == "mixup_LSGAN":
        dist = Beta(0.2, 0.2)
        lam = dist.sample()
        mixup = lam * concatenate([real, distorted]) + (1 - lam) * concatenate(
            [fake, distorted])
        pred_fake = net_d(concatenate([fake, distorted]))
        pred_mixup = net_d(mixup)
        loss_d = calc_loss(pred_mixup, lam * K.ones_like(pred_mixup), "l2")
        loss_g = weights['w_D'] * calc_loss(pred_fake, K.ones_like(pred_fake), "l2")

        mixup2 = lam * concatenate(
            [real, distorted]) + (1 - lam) * concatenate([fake_bgr, distorted])
        pred_fake_bgr = net_d(concatenate([fake_bgr, distorted]))
        pred_mixup2 = net_d(mixup2)
        loss_d += calc_loss(pred_mixup2, lam * K.ones_like(pred_mixup2), "l2")
        loss_g += weights['w_D'] * calc_loss(pred_fake_bgr,
                                             K.ones_like(pred_fake_bgr), "l2")
    elif gan_training == "relativistic_avg_LSGAN":
        real_pred = net_d(concatenate([real, distorted]))
        fake_pred = net_d(concatenate([fake, distorted]))
        loss_d = K.mean(K.square(real_pred - K.ones_like(fake_pred))) / 2
        loss_d += K.mean(K.square(fake_pred - K.zeros_like(fake_pred))) / 2
        loss_g = weights['w_D'] * K.mean(
            K.square(fake_pred - K.ones_like(fake_pred)))

        fake_pred2 = net_d(concatenate([fake_bgr, distorted]))
        loss_d += K.mean(
            K.square(real_pred - K.mean(fake_pred2, axis=0) -
                     K.ones_like(fake_pred2))) / 2
        loss_d += K.mean(
            K.square(fake_pred2 - K.mean(real_pred, axis=0) -
                     K.zeros_like(fake_pred2))) / 2
        loss_g += weights['w_D'] * K.mean(
            K.square(real_pred - K.mean(fake_pred2, axis=0) -
                     K.zeros_like(fake_pred2))) / 2
        loss_g += weights['w_D'] * K.mean(
            K.square(fake_pred2 - K.mean(real_pred, axis=0) -
                     K.ones_like(fake_pred2))) / 2
    else:
        raise ValueError(
            f"Received an unknown GAN training method: {gan_training}")

    return loss_d, loss_g
def recursion(self, input_energy, mask=None, go_backwards=False,
              return_sequences=True, return_logZ=True, input_length=None):
    """Forward (alpha) or backward (beta) recursion.

    If `return_logZ = True`, compute logZ, the normalization constant:

    \[
    Z = \sum_{y1, y2, y3} exp(-E)  # E is the energy
      = \sum_{y1, y2, y3} exp(-(u1' y1 + y1' W y2 + u2' y2 + y2' W y3 + u3' y3))
      = \sum_{y2, y3} (exp(-(u2' y2 + y2' W y3 + u3' y3))
                       \sum_{y1} exp(-(u1' y1 + y1' W y2)))
    \]

    Denote:

    \[ S(y2) := \sum_{y1} exp(-(u1' y1 + y1' W y2)), \]
    \[ Z = \sum_{y2, y3} exp(log S(y2) - (u2' y2 + y2' W y3 + u3' y3)), \]
    \[ log S(y2) = log_sum_exp(-(u1' y1 + y1' W y2)). \]

    Note that the yi's are one-hot vectors, and the boundary energies have been
    merged into u1 and u3.

    If `return_logZ = False`, compute the Viterbi best-path lookup table instead.
    """
    chain_energy = self.chain_kernel
    # shape=(1, F, F): F=num of output features. 1st F is for t-1, 2nd F for t
    chain_energy = K.expand_dims(chain_energy, 0)
    # shape=(B, F), dtype=float32
    prev_target_val = K.zeros_like(input_energy[:, 0, :])

    if go_backwards:
        input_energy = K.reverse(input_energy, 1)
        if mask is not None:
            mask = K.reverse(mask, 1)

    initial_states = [prev_target_val, K.zeros_like(prev_target_val[:, :1])]
    constants = [chain_energy]

    if mask is not None:
        mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1),
                       K.floatx())
        constants.append(mask2)

    def _step(input_energy_i, states):
        return self.step(input_energy_i, states, return_logZ)

    target_val_last, target_val_seq, _ = K.rnn(_step, input_energy,
                                               initial_states,
                                               constants=constants,
                                               input_length=input_length,
                                               unroll=self.unroll)

    if return_sequences:
        if go_backwards:
            target_val_seq = K.reverse(target_val_seq, 1)
        return target_val_seq
    else:
        return target_val_last
def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    shapes = [K.shape(p) for p in params]
    alphas = [
        K.variable(K.ones(shape) * self.init_alpha) for shape in shapes
    ]
    old_grads = [K.zeros(shape) for shape in shapes]
    prev_weight_deltas = [K.zeros(shape) for shape in shapes]
    self.weights = alphas + old_grads
    self.updates = []

    for param, grad, old_grad, prev_weight_delta, alpha in zip(
            params, grads, old_grads, prev_weight_deltas, alphas):
        # equation 4
        new_alpha = K.switch(
            K.greater(grad * old_grad, 0),
            K.minimum(alpha * self.scale_up, self.max_alpha),
            K.switch(K.less(grad * old_grad, 0),
                     K.maximum(alpha * self.scale_down, self.min_alpha),
                     alpha))
        # equation 5
        new_delta = K.switch(
            K.greater(grad, 0), -new_alpha,
            K.switch(K.less(grad, 0), new_alpha, K.zeros_like(new_alpha)))
        # equation 7
        weight_delta = K.switch(K.less(grad * old_grad, 0),
                                -prev_weight_delta, new_delta)
        # equation 6
        new_param = param + weight_delta

        # reset gradient_{t-1} to 0 if gradient sign changed (so that we do
        # not "double punish", see paragraph after equation 7)
        grad = K.switch(K.less(grad * old_grad, 0), K.zeros_like(grad), grad)

        # Apply constraints
        # if param in constraints:
        #     c = constraints[param]
        #     new_param = c(new_param)

        self.updates.append(K.update(param, new_param))
        self.updates.append(K.update(alpha, new_alpha))
        self.updates.append(K.update(old_grad, grad))
        self.updates.append(K.update(prev_weight_delta, weight_delta))

    return self.updates
def get_psp(self, output_spikes):
    new_spiketimes = tf.where(k.greater(output_spikes, 0),
                              k.ones_like(output_spikes) * self.time,
                              self.last_spiketimes)
    new_spiketimes = tf.where(k.less(output_spikes, 0),
                              k.zeros_like(output_spikes) * self.time,
                              new_spiketimes)
    assign_new_spiketimes = tf.assign(self.last_spiketimes, new_spiketimes)
    with tf.control_dependencies([assign_new_spiketimes]):
        last_spiketimes = self.last_spiketimes + 0  # Dummy op
        # psp = k.maximum(0., tf.divide(self.dt, last_spiketimes))
        psp = tf.where(k.greater(last_spiketimes, 0),
                       k.ones_like(output_spikes) * self.dt,
                       k.zeros_like(output_spikes))
    return psp
def call(self, u_vecs):
    if self.share_weights:
        u_hat_vecs = K.conv1d(u_vecs, self.W)
    else:
        u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

    batch_size = K.shape(u_vecs)[0]
    input_num_capsule = K.shape(u_vecs)[1]
    u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                        self.num_capsule, self.dim_capsule))
    u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
    # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

    b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
    for i in range(self.routings):
        c = softmax(b, 1)
        # o = K.batch_dot(c, u_hat_vecs, [2, 2])
        o = tf.einsum('bin,binj->bij', c, u_hat_vecs)
        if K.backend() == 'theano':
            o = K.sum(o, axis=1)
        if i < self.routings - 1:
            o = K.l2_normalize(o, -1)
            # b = K.batch_dot(o, u_hat_vecs, [2, 3])
            b = tf.einsum('bij,binj->bin', o, u_hat_vecs)
            if K.backend() == 'theano':
                b = K.sum(b, axis=1)

    return self.activation(o)
def get_updates(self, loss, params):
    sync_cond = K.equal((self.iterations + 1) // self.sync_period *
                        self.sync_period, (self.iterations + 1))
    if TF_KERAS:
        slow_params = [K.variable(K.get_value(p), name='sp_{}'.format(i))
                       for i, p in enumerate(params)]
        self.updates = self.optimizer.get_updates(loss, params)
        slow_updates = []
        for p, sp in zip(params, slow_params):
            sp_t = sp + self.slow_step * (p - sp)
            slow_updates.append(K.update(sp, K.switch(
                sync_cond,
                sp_t,
                sp,
            )))
            slow_updates.append(K.update_add(p, K.switch(
                sync_cond,
                sp_t - p,
                K.zeros_like(p),
            )))
    else:
        slow_params = {p.name: K.variable(K.get_value(p), name='sp_{}'.format(i))
                       for i, p in enumerate(params)}
        update_names = ['update', 'update_add', 'update_sub']
        original_updates = [getattr(K, name) for name in update_names]
        setattr(K, 'update', lambda x, new_x: ('update', x, new_x))
        setattr(K, 'update_add', lambda x, new_x: ('update_add', x, new_x))
        setattr(K, 'update_sub', lambda x, new_x: ('update_sub', x, new_x))
        self.updates = self.optimizer.get_updates(loss, params)
        for name, original_update in zip(update_names, original_updates):
            setattr(K, name, original_update)
        slow_updates = []
        for i, update in enumerate(self.updates):
            if isinstance(update, tuple):
                name, x, new_x, adjusted = update + (update[-1],)
                update_func = getattr(K, name)
                if name == 'update_add':
                    adjusted = x + new_x
                if name == 'update_sub':
                    adjusted = x - new_x
                if x.name not in slow_params:
                    self.updates[i] = update_func(x, new_x)
                else:
                    slow_param = slow_params[x.name]
                    slow_param_t = slow_param + \
                        self.slow_step * (adjusted - slow_param)
                    slow_updates.append(K.update(slow_param, K.switch(
                        sync_cond,
                        slow_param_t,
                        slow_param,
                    )))
                    self.updates[i] = K.update(x, K.switch(
                        sync_cond,
                        slow_param_t,
                        adjusted,
                    ))
        slow_params = list(slow_params.values())
    self.updates += slow_updates
    self.weights = self.optimizer.weights + slow_params
    return self.updates
def decorator(self, x):
    # Only call layer if there are input spikes. This is to prevent
    # accumulation of bias.
    self.impulse = tf.cond(k.any(k.not_equal(x, 0)),
                           lambda: call(self, x),
                           lambda: k.zeros_like(self.mem))
    return self.update_neurons()
def dummy_weighted_categorical_loss(y_true, y_pred):
    y_pred /= K.sum(y_pred, axis=-1, keepdims=True)
    y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
    loss = y_true * K.log(y_pred)
    loss = -K.sum(loss, -1)
    condition = K.greater(K.sum(y_true), 0)
    return K.switch(condition, loss, K.zeros_like(loss))
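# Hedged sketch of the switch behaviour in dummy_weighted_categorical_loss (assumptions:
# `K` is tensorflow.keras.backend and eager execution is on so K.eval returns values).
from tensorflow.keras import backend as K

y_pred = K.constant([[0.7, 0.2, 0.1]])
y_true = K.constant([[0.0, 1.0, 0.0]])
print(K.eval(dummy_weighted_categorical_loss(y_true, y_pred)))  # ~[1.609]

# With an all-zero target batch the condition K.sum(y_true) > 0 fails and the loss is zeroed.
print(K.eval(dummy_weighted_categorical_loss(K.zeros_like(y_true), y_pred)))  # [0.]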
def call(self, inputs):
    if self.share_weights:
        hat_inputs = K.conv1d(inputs, self.kernel)
    else:
        hat_inputs = K.local_conv1d(inputs, self.kernel, [1], [1])

    batch_size = K.shape(inputs)[0]
    input_num_capsule = K.shape(inputs)[1]
    hat_inputs = K.reshape(hat_inputs,
                           (batch_size, input_num_capsule,
                            self.num_capsule, self.dim_capsule))
    hat_inputs = K.permute_dimensions(hat_inputs, (0, 2, 1, 3))

    b = K.zeros_like(hat_inputs[:, :, :, 0])
    for i in range(self.routings):
        c = softmax(b, 1)
        o = self.activation(keras.backend.batch_dot(c, hat_inputs, [2, 2]))
        if i < self.routings - 1:
            b = keras.backend.batch_dot(o, hat_inputs, [2, 3])
            if K.backend() == 'theano':
                o = K.sum(o, axis=1)

    return o
def call(self, inputs):
    mu, std = inputs
    var_dist = tfp.MultivariateNormalDiag(loc=mu, scale_diag=std)
    pri_dist = tfp.MultivariateNormalDiag(loc=K.zeros_like(mu),
                                          scale_diag=K.ones_like(std))
    kl_loss = self.lamb_kl * K.mean(tfp.kl_divergence(var_dist, pri_dist))
    return kl_loss
def call(self, inputs):
    """Following the routing algorithm from Hinton's paper, but replace
    b = b + <u,v> with b = <u,v>.

    This change can improve the feature representation of the Capsule.
    However, you can replace
        b = K.batch_dot(outputs, hat_inputs, [2, 3])
    with
        b += K.batch_dot(outputs, hat_inputs, [2, 3])
    to realize a standard routing.
    """
    if self.share_weights:
        hat_inputs = K.conv1d(inputs, self.kernel)
    else:
        hat_inputs = K.local_conv1d(inputs, self.kernel, [1], [1])

    batch_size = K.shape(inputs)[0]
    input_num_capsule = K.shape(inputs)[1]
    hat_inputs = K.reshape(hat_inputs,
                           (batch_size, input_num_capsule,
                            self.num_capsule, self.dim_capsule))
    hat_inputs = K.permute_dimensions(hat_inputs, (0, 2, 1, 3))

    b = K.zeros_like(hat_inputs[:, :, :, 0])
    for i in range(self.routings):
        c = softmax(b, 1)
        o = self.activation(caps_batch_dot(c, hat_inputs))
        if i < self.routings - 1:
            b = caps_batch_dot(o, hat_inputs)
            if K.backend() == 'theano':
                o = K.sum(o, axis=1)

    return o
def yolo_det_loss(y, p):
    # X/Y values
    y_xy = y[..., 0:2]
    p_xy = p[..., 0:2]

    # Width/Height values
    y_wh = y[..., 2:4]
    p_wh = p[..., 2:4]

    # Object confidence
    y_conf = y[..., 4]
    p_conf = p[..., 4]

    # Intersection over Union
    intersect_wh = K.maximum(K.zeros_like(p_wh),
                             (p_wh + y_wh) / 2 - K.square(p_xy - y_xy))
    I = intersect_wh[..., 0] * intersect_wh[..., 1]
    true_area = y_wh[..., 0] * y_wh[..., 1]
    pred_area = p_wh[..., 0] * p_wh[..., 1]
    U = pred_area + true_area - I
    iou = I / U

    # Calculate individual errors
    e_xy = K.sum(K.sum(K.square(y_xy - p_xy), axis=-1) * y_conf, axis=-1)
    e_wh = K.sum(K.sum(K.square(K.sqrt(y_wh) - K.sqrt(p_wh)), axis=-1) * y_conf,
                 axis=-1)
    e_conf = K.sum(K.square(y_conf * iou - p_conf), axis=-1)

    # Sum all errors
    e = e_xy + e_wh + 10 * e_conf
    return e
def heteroscedastic_crossentropy(y_true, logits_log_var):
    def monte_carlo(T, logits, gaussian):
        T_softmax = K.zeros_like(logits)
        n_classes = logits.shape[-1]
        for i in range(T):
            # (?, K) <- (K, ?) <- (K, ?, 1)
            noise = K.transpose(K.squeeze(gaussian.sample(n_classes), axis=-1))  # draw a sample per logit
            # noise = gaussian.sample()  # draw sample from multivariate, for all logits at once
            T_softmax += K.softmax(logits + noise)  # (?, K)
        return (1 / T) * T_softmax

    n_classes = logits_log_var.shape[-1] - 1  # 10
    # n_classes = logits_log_var.shape[-1] // 2  # 10
    std = K.sqrt(K.exp(logits_log_var[:, n_classes:]))  # log_var to std

    # get T softmax Monte Carlo simulations
    y_hat = monte_carlo(
        T=100,  # number of simulations
        logits=logits_log_var[:, :n_classes],  # logits
        gaussian=tf.distributions.Normal(loc=K.zeros_like(std), scale=std))

    y_hat = K.clip(y_hat, 1e-11, 1 - 1e-11)  # prevent nans

    # beta = 1.
    # gamma = .1
    # H = -K.sum(y_hat * K.log(y_hat), -1)  # entropy term to punish confident predictions
    nll = -K.sum(y_true * K.log(y_hat), -1)  # negative log likelihood
    return nll
def accfun(y0, y1):
    x_pos = K.ones_like(x_r)
    x_neg = K.zeros_like(x_r)
    loss_r = K.mean(tf.keras.metrics.binary_accuracy(x_pos, x_r))
    loss_f = K.mean(tf.keras.metrics.binary_accuracy(x_neg, x_f))
    loss_p = K.mean(tf.keras.metrics.binary_accuracy(x_neg, x_p))
    return (1.0 / 3.0) * (loss_r + loss_p + loss_f)
def zero_loss(y_true, y_pred):
    """Return an all-zero loss tensor with the same shape as ``y_true``.

    args:
        y_true: target tensor (only its shape is used)
        y_pred: prediction tensor (ignored)
    """
    return K.zeros_like(y_true)
def _get_weight_matrix(self, freq_true: tf.Tensor, freq_pred: tf.Tensor) -> tf.Tensor:
    """ Calculate a continuous, dynamic weight matrix based on current Euclidean distance.

    Parameters
    ----------
    freq_true: :class:`tf.Tensor`
        The real and imaginary DFT frequencies for the true batch of images
    freq_pred: :class:`tf.Tensor`
        The real and imaginary DFT frequencies for the predicted batch of images

    Returns
    -------
    :class:`tf.Tensor`
        The weights matrix for prioritizing hard frequencies
    """
    weights = K.square(freq_pred - freq_true)
    weights = K.sqrt(weights[..., 0] + weights[..., 1])
    weights = K.pow(weights, self._alpha)

    if self._log_matrix:  # adjust the spectrum weight matrix by logarithm
        weights = K.log(weights + 1.0)

    if self._batch_matrix:  # calculate the spectrum weight matrix using batch-based statistics
        weights = weights / K.max(weights)
    else:
        weights = weights / K.max(K.max(weights, axis=-2), axis=-2)[..., None, None, :]

    weights = K.switch(tf.math.is_nan(weights), K.zeros_like(weights), weights)
    weights = K.clip(weights, min_value=0.0, max_value=1.0)

    return weights
def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    shapes = [K.get_variable_shape(p) for p in params]
    alphas = [
        K.variable(K.ones(shape) * self.init_alpha) for shape in shapes
    ]
    old_grads = [K.zeros(shape) for shape in shapes]
    self.weights = alphas + old_grads
    self.updates = []

    for p, grad, old_grad, alpha in zip(params, grads, old_grads, alphas):
        grad = K.sign(grad)
        new_alpha = K.switch(
            K.greater(grad * old_grad, 0),
            K.minimum(alpha * self.scale_up, self.max_alpha),
            K.switch(K.less(grad * old_grad, 0),
                     K.maximum(alpha * self.scale_down, self.min_alpha),
                     alpha))
        grad = K.switch(K.less(grad * old_grad, 0), K.zeros_like(grad), grad)
        new_p = p - grad * new_alpha

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
        self.updates.append(K.update(alpha, new_alpha))
        self.updates.append(K.update(old_grad, grad))

    return self.updates
def lossfun(self, y_real, y_fake_f, y_fake_p):
    y_pos = K.ones_like(y_real)
    y_neg = K.zeros_like(y_real)
    loss_real = tf.keras.losses.binary_crossentropy(y_pos, y_real)
    loss_fake_f = tf.keras.losses.binary_crossentropy(y_neg, y_fake_f)
    loss_fake_p = tf.keras.losses.binary_crossentropy(y_neg, y_fake_p)
    return K.mean(loss_real + loss_fake_f + loss_fake_p)
def _init_cel(self, A_graph, b_graph, c_graph, y):
    # Sanity check
    y = tf.check_numerics(y, 'Problem with input y')

    # Find intersection points between Ax - b and the line joining c and y
    Ac = tf.reduce_sum(A_graph * tf.expand_dims(c_graph, axis=-2), axis=-1)
    bMinusAc = b_graph - Ac
    yMinusc = y - c_graph
    ADotyMinusc = tf.reduce_sum((A_graph * tf.expand_dims(yMinusc, -2)), axis=-1)
    intersection_alphas = bMinusAc / (ADotyMinusc + K.epsilon())

    # Enforce intersection_alpha > 0 because the point must lie on the ray from c to y
    less_equal_0 = K.less_equal(intersection_alphas,
                                K.zeros_like(intersection_alphas))
    candidate_alpha = K.switch(
        less_equal_0,
        K.ones_like(intersection_alphas) * tf.constant(np.inf, dtype='float32'),
        intersection_alphas)

    # Find the intersection point closest to the interior point to get the projection point
    intersection_alpha = K.min(candidate_alpha, axis=-1, keepdims=True)

    # If it is an interior point, y itself is the projection point
    is_interior_point = K.greater_equal(intersection_alpha,
                                        K.ones_like(intersection_alpha))
    alpha = K.switch(is_interior_point, K.ones_like(intersection_alpha),
                     intersection_alpha)

    # Return z = \alpha.y + (1 - \alpha).c
    z = alpha * y + ((1 - alpha) * c_graph)

    return z
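# Hedged numpy re-derivation of the ray projection computed by _init_cel (the polytope
# below, the unit box |x_i| <= 1, and the points c and y are made-up illustrative values).
import numpy as np

A = np.array([[1., 0.], [-1., 0.], [0., 1.], [0., -1.]])
b = np.array([1., 1., 1., 1.])
c = np.array([0., 0.])   # interior point
y = np.array([2., 0.])   # point outside the polytope

alphas = (b - A @ c) / (A @ (y - c) + 1e-7)  # ray/constraint intersections
alphas[alphas <= 0] = np.inf                 # keep only intersections ahead of c
alpha = min(alphas.min(), 1.0)               # alpha >= 1 means y is already interior
z = alpha * y + (1 - alpha) * c
print(z)                                     # ~[1. 0.], where the ray from c to y exits the box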
def call(self, u_vecs):
    if self.share_weights:
        u_hat_vecs = K.conv1d(u_vecs, self.W)
    else:
        u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

    batch_size = K.shape(u_vecs)[0]
    input_num_capsule = K.shape(u_vecs)[1]
    u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                        self.num_capsule, self.dim_capsule))
    u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
    # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

    b = K.zeros_like(
        u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
    for i in range(self.routings):
        b = K.permute_dimensions(
            b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
        c = K.softmax(b)
        c = K.permute_dimensions(c, (0, 2, 1))
        b = K.permute_dimensions(b, (0, 2, 1))
        outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
        if i < self.routings - 1:
            b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

    return outputs
def dropped_mask():
    drop_mask = K.switch(
        K.random_uniform(K.shape(inputs)) < self.drop_rate,
        K.ones_like(inputs, K.floatx()),
        K.zeros_like(inputs, K.floatx()),
    )
    return target_mask * drop_mask
def saliency_map(input_x, input_y, logits, conv_output):
    """ Compute the saliency map for a text input """
    # shape output (batch_size, sequence_length, num_filters)
    # shape grads_val (batch_size, sequence_length, num_filters)
    _, grads_val = get_gradients(input_x, input_y, logits, conv_output)

    # get the maximum gradient for each gram of words
    # shape (batch_size, sequence_length)
    s_maps = K.max(grads_val, axis=2)

    # Process s_maps (tensors do not support item assignment, so collect the
    # per-sample maps in a list and stack them at the end)
    new_s_maps = []
    for i in range(s_maps.shape[0]):
        # Distance from the mean
        s_map_ = s_maps[i] - K.mean(s_maps[i])
        # Keep only positive values
        s_map_ = K.maximum(s_map_, 0)
        # Normalize, guarding against an all-zero map
        max_val = K.max(s_map_)
        s_map_ = K.switch(K.not_equal(max_val, 0), s_map_ / max_val, s_map_)
        new_s_maps.append(s_map_)
    return K.stack(new_s_maps)
def make_readout_decode_model(self, max_output_len=32):
    src_seq_input = Input(shape=(None,), dtype='int32')
    tgt_start_input = Input(shape=(1,), dtype='int32')
    src_seq = src_seq_input
    enc_mask = Lambda(lambda x: K.cast(K.greater(x, 0), 'float32'))(src_seq)

    src_emb = self.i_word_emb(src_seq)
    if self.pos_emb:
        src_emb = add_layer([src_emb, self.pos_emb(src_seq)])
    src_emb = self.emb_dropout(src_emb)
    enc_output = self.encoder(src_emb, src_seq)

    tgt_emb = self.o_word_emb(tgt_start_input)
    tgt_seq = Lambda(lambda x: K.repeat_elements(x, max_output_len, 1))(tgt_start_input)
    rep_input = Lambda(lambda x: K.repeat_elements(x, max_output_len, 1))(tgt_emb)

    cell = ReadoutDecoderCell(self.o_word_emb, self.pos_emb, self.decoder,
                              self.target_layer)
    final_output = InferRNN(cell, return_sequences=True)(
        rep_input,
        initial_state=[tgt_start_input, K.ones_like(tgt_start_input),
                       K.zeros_like(tgt_seq)] +
                      [rep_input for _ in self.decoder.layers],
        constants=[enc_output, enc_mask])
    final_output = Lambda(lambda x: K.squeeze(x, -1))(final_output)

    self.readout_model = Model([src_seq_input, tgt_start_input], final_output)
def grad_cam(input_x, input_y, logits, conv_output):
    """ Compute the grad-cam for a text input """
    # shape output (batch_size, sequence_length, num_filters)
    # shape grads_val (batch_size, sequence_length, num_filters)
    output, grads_val = get_gradients(input_x, input_y, logits, conv_output)

    # get the maximum gradient for each gram of words
    # shape (batch_size, sequence_length)
    weights = K.max(grads_val, axis=2)

    # shape cams (batch_size, sequence_length)
    cams = tf.einsum('ijk,ij->ij', output, weights)

    # Process cams (tensors do not support item assignment, so collect the
    # per-sample maps in a list and stack them at the end)
    new_cams = []
    for i in range(cams.shape[0]):
        # Distance from the mean
        cam_ = cams[i] - K.mean(cams[i])
        # Keep only positive values
        cam_ = K.maximum(cam_, 0)
        # Normalize, guarding against an all-zero map
        max_val = K.max(cam_)
        cam_ = K.switch(K.not_equal(max_val, 0), cam_ / max_val, cam_)
        new_cams.append(cam_)
    return K.stack(new_cams)
def masked_softmax(vector, mask):
    """
    `K.softmax(vector)` does not work if some elements of `vector` should be
    masked. This performs a softmax on just the non-masked portions of
    `vector` (passing None in for the mask is also acceptable; you'll just get
    a regular softmax).

    We assume that both `vector` and `mask` (if given) have shape
    (batch_size, vector_dim).

    In the case that the input vector is completely masked, this function
    returns an array of ``0.0``. This behavior may cause ``NaN`` if this is
    used as the last layer of a model that uses categorical cross-entropy loss.
    """
    # We calculate masked softmax in a numerically stable fashion, as done
    # in https://github.com/rkadlec/asreader/blob/master/asreader/custombricks/softmax_mask_bricks.py
    if mask is not None:
        # Here we get normalized log probabilities for enhanced numerical stability.
        mask = K.cast(mask, "float32")
        input_masked = mask * vector
        shifted = mask * (input_masked - K.max(input_masked, axis=1, keepdims=True))
        # We add epsilon to avoid numerical instability when the sum in the log yields 0.
        normalization_constant = K.log(
            K.sum(mask * K.exp(shifted), axis=1, keepdims=True) + K.epsilon())
        normalized_log_probabilities = mask * (shifted - normalization_constant)
        unmasked_probabilities = K.exp(normalized_log_probabilities)
        return switch(mask, unmasked_probabilities,
                      K.zeros_like(unmasked_probabilities))
    else:
        # There is no mask, so we use the provided ``K.softmax`` function.
        return K.softmax(vector)
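# Hedged toy example for masked_softmax (assumptions: `K` is the Keras backend, the
# `switch` helper used above is in scope, and eager execution so K.eval returns values).
from tensorflow.keras import backend as K

vector = K.constant([[1.0, 2.0, 3.0, 4.0]])
mask = K.constant([[1.0, 1.0, 0.0, 0.0]])  # last two positions are padding
# Masked positions get probability 0; the visible entries renormalize among themselves.
print(K.eval(masked_softmax(vector, mask)))  # ~[[0.269, 0.731, 0.0, 0.0]]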
def create_inital_state(inputs, hidden_size):
    # We are not using initial states, but need to pass something to the K.rnn function
    fake_state = K.zeros_like(inputs)  # <= (batch_size, enc_seq_len, latent_dim)
    fake_state = K.sum(fake_state, axis=[1, 2])  # <= (batch_size)
    fake_state = K.expand_dims(fake_state)  # <= (batch_size, 1)
    fake_state = K.tile(fake_state, [1, hidden_size])  # <= (batch_size, latent_dim)
    return fake_state
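# Hedged shape check for create_inital_state (assumptions: TF2 eager execution; the
# encoder output below is a made-up placeholder tensor, not from any real model).
import tensorflow as tf

encoder_outputs = tf.zeros((4, 10, 32))  # (batch_size, enc_seq_len, latent_dim)
fake_state = create_inital_state(encoder_outputs, hidden_size=32)
print(fake_state.shape)  # (4, 32): one zero vector of size hidden_size per batch entry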