def step(self, x, states):
    r_tm1, V_tm1, s_tm1, time = states[:4]
    h_tm1 = states[4:]

    op_t, h_t = _update_controller(self, T.concatenate([x, r_tm1], axis=-1),
                                   h_tm1)
    # op_t = op_t + print_name_shape("W_d", self.W_d.get_value())
    # op_t = op_t[:, 0, :]

    d_t = K.sigmoid(K.dot(op_t, self.W_d) + self.b_d)
    u_t = K.sigmoid(K.dot(op_t, self.W_u) + self.b_u)
    v_t = K.tanh(K.dot(op_t, self.W_v) + self.b_v)
    o_t = K.tanh(K.dot(op_t, self.W_o) + self.b_o)

    time = time + 1
    V_t, s_t, r_t = _update_neural_stack(self, V_tm1, s_tm1, d_t[:, 0],
                                         u_t[:, 0], v_t, time[0],
                                         stack=self.stack)

    return o_t, [r_t, V_t, s_t, time] + h_t
def call(self, x, mask=None):
    # x[0]: (batch_size, input_length, input_dim)
    # x[1]: (batch_size, 1) indices of prepositions
    # Optional: x[2]: (batch_size, input_length - 2)
    assert isinstance(x, (list, tuple))
    encoded_sentence = x[0]
    prep_indices = K.squeeze(x[1], axis=-1)  # (batch_size,)
    batch_indices = K.arange(K.shape(encoded_sentence)[0])  # (batch_size,)
    if self.with_attachment_probs:
        # We're essentially doing K.argmax(x[2]) here, but argmax is not differentiable!
        head_probs = x[2]
        head_probs_padding = K.zeros_like(x[2])[:, :2]  # (batch_size, 2)
        # (batch_size, input_length)
        padded_head_probs = K.concatenate([head_probs, head_probs_padding])
        # (batch_size, 1)
        max_head_probs = K.expand_dims(K.max(padded_head_probs, axis=1))
        # (batch_size, input_length, 1)
        max_head_prob_indices = K.expand_dims(K.equal(padded_head_probs, max_head_probs))
        # (batch_size, input_length, input_dim)
        masked_head_encoding = K.switch(max_head_prob_indices, encoded_sentence,
                                        K.zeros_like(encoded_sentence))
        # (batch_size, input_dim)
        head_encoding = K.sum(masked_head_encoding, axis=1)
    else:
        head_indices = prep_indices - 1  # (batch_size,)
        head_encoding = encoded_sentence[batch_indices, head_indices, :]  # (batch_size, input_dim)
    prep_encoding = encoded_sentence[batch_indices, prep_indices, :]  # (batch_size, input_dim)
    child_encoding = encoded_sentence[batch_indices, prep_indices + 1, :]  # (batch_size, input_dim)
    '''
    prep_indices = x[1]
    sentence_mask = mask[0]
    if sentence_mask is not None:
        if K.ndim(sentence_mask) > 2:
            # This means this layer came after a Bidirectional layer. Keras has this bug which
            # concatenates input masks instead of output masks.
            # TODO: Fix Bidirectional instead.
            sentence_mask = K.any(sentence_mask, axis=(-2, -1))
    head_encoding, prep_encoding, child_encoding = self.get_split_averages(
        encoded_sentence, sentence_mask, prep_indices)
    '''
    head_projection = K.dot(head_encoding, self.proj_head)  # (batch_size, proj_dim)
    prep_projection = K.dot(prep_encoding, self.proj_prep)  # (batch_size, proj_dim)
    child_projection = K.dot(child_encoding, self.proj_child)  # (batch_size, proj_dim)
    # (batch_size, proj_dim)
    if self.composition_type == 'HPCT':
        composed_projection = K.tanh(head_projection + prep_projection + child_projection)
    elif self.composition_type == 'HPC':
        prep_child_projection = K.tanh(prep_projection + child_projection)  # (batch_size, proj_dim)
        composed_projection = K.tanh(head_projection + prep_child_projection)
    else:
        # Composition type is HC
        composed_projection = K.tanh(head_projection + child_projection)
    for hidden_layer in self.hidden_layers:
        composed_projection = K.tanh(K.dot(composed_projection, hidden_layer))  # (batch_size, proj_dim)
    # (batch_size, num_classes)
    class_scores = K.dot(composed_projection, self.scorer)
    label_probabilities = K.softmax(class_scores)
    return label_probabilities
def staircase_loss(y_true, y_pred, var_a=16.0, cnst=1.0 / 255.0):
    """Keras staircase loss."""
    height = cnst
    width = cnst
    var_x = K.clip(K.abs(y_true - y_pred) - 0.5 * cnst, 0.0, 1.0)
    loss = height * (K.tanh(var_a * ((var_x / width) - tf.floor(var_x / width) - 0.5)) /
                     (2.0 * K.tanh(var_a / 2.0)) + 0.5 + tf.floor(var_x / width))
    loss += 1e-10
    return K.mean(loss, axis=-1)
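# A minimal usage sketch for staircase_loss above. The model and data here are
# hypothetical placeholders (not from the original code), and the snippet assumes
# a TensorFlow-backed Keras, since staircase_loss calls tf.floor directly.
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

demo_model = Sequential([Dense(8, input_shape=(8,), activation='sigmoid')])
demo_model.compile(optimizer='adam', loss=staircase_loss)
demo_x = np.random.rand(16, 8).astype('float32')
demo_model.fit(demo_x, demo_x, epochs=1, verbose=0)  # autoencoder-style toy fit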
def logtanh(x, a=1):
    """
    log * tanh

    See Also: arcsinh
    """
    return K.tanh(x) * K.log(2 + a * abs(x))
def call(self, x, mask=None):
    # eij = K.dot(x, self.W)  # the TF backend doesn't support this dot
    # features_dim = self.W.shape[0]
    # step_dim = x._keras_shape[1]
    features_dim = self.features_dim
    step_dim = self.step_dim

    eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                          K.reshape(self.W, (features_dim, 1))),
                    (-1, step_dim))
    if self.bias:
        eij += self.b
    eij = K.tanh(eij)

    a = K.exp(eij)
    # apply mask after the exp; will be re-normalized next
    if mask is not None:
        # Cast the mask to floatX to avoid float64 upcasting in theano
        a *= K.cast(mask, K.floatx())
    # in some cases, especially in the early stages of training, the sum may be almost zero
    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

    a = K.expand_dims(a)
    weighted_input = x * a
    # print(weighted_input.shape)
    return K.sum(weighted_input, axis=1)
def call(self, x, mask=None):
    eij = dot_product(x, self.W)
    if self.bias:
        eij += self.b
    eij = K.tanh(eij)

    a = K.exp(eij)
    # apply mask after the exp; will be re-normalized next
    if mask is not None:
        # Cast the mask to floatX to avoid float64 upcasting in theano
        a *= K.cast(mask, K.floatx())
    # in some cases, especially in the early stages of training, the sum may be
    # almost zero and this results in NaN's. A workaround is to add a very small
    # positive number ε to the sum.
    # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

    weighted_input = x * K.expand_dims(a)
    result = K.sum(weighted_input, axis=1)
    if self.return_attention:
        return [result, a]
    return result
def call(self, x):
    assert K.backend() == 'tensorflow'
    temp = K.permute_dimensions(x, (0, 2, 1))
    for i in range(0, self.attention_depth):
        temp = K.sigmoid(K.dot(temp, self.Ws[i]) + self.bs[i])
    temp = K.permute_dimensions(temp, (0, 2, 1))
    estimated_weight = K.squeeze(K.dot(temp, K.expand_dims(self.Wf, -1)), -1)
    biased_weight = estimated_weight + self.bias
    non_linear_weight = K.tanh(biased_weight)

    # For each hidden state, calculate how much it should contribute to the
    # context vector. This is the main part of attention. To convert the weights
    # to "probabilities", use a softmax: exp(x) / sum(exp(xi)).
    prob = K.exp(non_linear_weight)
    # Compute the total sum for each batch.
    total_sum = K.sum(prob, axis=1, keepdims=True)
    prob /= K.cast(total_sum, K.floatx())

    # Enable this if you want access to the internal probabilities.
    # Should only be used for testing that Attention works as expected.
    # return prob

    # Multiply each hidden value by the corresponding probability.
    prob = K.expand_dims(prob, -1)
    new_hidden_values = x * prob
    return K.sum(new_hidden_values, axis=1)
def step(self, x, states):
    r_tm1, V_tm1, s_tm1, time = states[:4]
    h_tm1 = states[4:]

    def print_name_shape(name, x):
        # Debug helper: prints a tensor's shape at run time and contributes 0.
        return T.cast(K.sum(theano.printing.Print(name)(x.shape)) * 0, "float32")

    r_tm1 = r_tm1 + print_name_shape("out\nr_tm1", r_tm1) + \
                    print_name_shape("V_tm1", V_tm1) + \
                    print_name_shape("s_tm1", s_tm1) + \
                    print_name_shape("x", x) + \
                    print_name_shape("h_tm1_0", h_tm1[0]) + \
                    print_name_shape("h_tm1_1", h_tm1[1])

    op_t, h_t = self._update_controller(T.concatenate([x, r_tm1], axis=-1),
                                        h_tm1)

    # op_t = op_t + print_name_shape("W_d", self.W_d.get_value())
    op_t = op_t + print_name_shape("afterop_t", op_t)
    # op_t = op_t[:, 0, :]

    ao = K.dot(op_t, self.W_d)
    ao = ao + print_name_shape("ao", ao)
    d_t = K.sigmoid(ao + self.b_d) + print_name_shape("afterop2_t", op_t)
    u_t = K.sigmoid(K.dot(op_t, self.W_u) + self.b_u) + print_name_shape("d_t", op_t)
    v_t = K.tanh(K.dot(op_t, self.W_v) + self.b_v) + print_name_shape("u_t", u_t)
    o_t = K.tanh(K.dot(op_t, self.W_o) + self.b_o) + print_name_shape("v_t", v_t)
    o_t = o_t + print_name_shape("afterbulk_t", o_t)

    time = time + 1
    V_t, s_t, r_t = _update_neural_stack(self, V_tm1, s_tm1, d_t[:, 0],
                                         u_t[:, 0], v_t, time[0],
                                         stack=self.stack)
    # V_t, s_t, r_t = V_tm1, s_tm1, T.sum(V_tm1, axis=1)

    V_t = V_t + print_name_shape("o_t", o_t) + \
                print_name_shape("r_t", r_t) + \
                print_name_shape("V_t", V_t) + \
                print_name_shape("s_t", s_t)
    # T.cast(theano.printing.Print("time")(time[0]), "float32")
    # time = T.set_subtensor(time[0], time[0] +)

    return o_t, [r_t, V_t, s_t, time] + h_t
def call(self, x, mask=None):
    eij = K.tanh(K.dot(x, self.W))
    ai = K.exp(eij)
    weights = ai / K.sum(ai, axis=1).dimshuffle(0, 'x')
    weighted_input = x * weights.dimshuffle(0, 1, 'x')
    return weighted_input.sum(axis=1)
def step(self, x, states):
    r_tm1, V_tm1, s_tm1, time = states[:4]
    h_tm1 = states[4:]

    op_t, h_t = _update_controller(self, T.concatenate([x, r_tm1], axis=-1),
                                   h_tm1)

    d_t = K.sigmoid(K.dot(op_t, self.W_d) + self.b_d)
    u_t = K.sigmoid(K.dot(op_t, self.W_u) + self.b_u)
    v_t = K.tanh(K.dot(op_t, self.W_v) + self.b_v)
    o_t = K.tanh(K.dot(op_t, self.W_o) + self.b_o)

    time = time + 1
    V_t, s_t, r_t = _update_neural_stack(self, V_tm1, s_tm1, d_t[:, 0],
                                         u_t[:, 0], v_t, time[0],
                                         stack=self.stack)

    return o_t, [r_t, V_t, s_t, time] + h_t
def tanh(x):
    """
    Tanh activation function.

    >>> tanh(0)
    0.0
    """
    return K.eval(K.tanh(K.variable(x))).tolist()
def get_similarity(self):
    '''
    Specify similarity in configuration under 'similarity_params' -> 'mode'.
    If a parameter is needed for the model, specify it in 'similarity_params'.

    Example configuration:

    config = {
        ... other parameters ...
        'similarity_params': {
            'mode': 'gesd',
            'gamma': 1,
            'c': 1,
        }
    }

    cosine:      dot(a, b) / sqrt(dot(a, a) * dot(b, b))
    polynomial:  (gamma * dot(a, b) + c) ^ d
    sigmoid:     tanh(gamma * dot(a, b) + c)
    rbf:         exp(-gamma * l2_norm(a - b) ^ 2)
    euclidean:   1 / (1 + l2_norm(a - b))
    exponential: exp(-gamma * l2_norm(a - b))
    gesd:        euclidean * sigmoid
    aesd:        (euclidean + sigmoid) / 2
    '''
    params = self.similarity_params
    similarity = params['mode']

    axis = lambda a: len(a._keras_shape) - 1
    dot = lambda a, b: K.batch_dot(a, b, axes=axis(a))
    l2_norm = lambda a, b: K.sqrt(K.sum((a - b) ** 2, axis=axis(a), keepdims=True))
    l1_norm = lambda a, b: K.sum(K.abs(a - b), axis=axis(a), keepdims=True)

    if similarity == 'cosine':
        return lambda x: dot(x[0], x[1]) / K.sqrt(dot(x[0], x[0]) * dot(x[1], x[1]))
    elif similarity == 'polynomial':
        return lambda x: (params['gamma'] * dot(x[0], x[1]) + params['c']) ** params['d']
    elif similarity == 'sigmoid':
        return lambda x: K.tanh(params['gamma'] * dot(x[0], x[1]) + params['c'])
    elif similarity == 'rbf':
        return lambda x: K.exp(-1 * params['gamma'] * l2_norm(x[0], x[1]) ** 2)
    elif similarity == 'euclidean':
        return lambda x: 1 / (1 + l2_norm(x[0], x[1]))
    elif similarity == 'l1':
        return lambda x: -l1_norm(x[0], x[1])
    elif similarity == 'exponential':
        return lambda x: K.exp(-1 * params['gamma'] * l2_norm(x[0], x[1]))
    elif similarity == 'gesd':
        euclidean = lambda x: 1 / (1 + l2_norm(x[0], x[1]))
        sigmoid = lambda x: 1 / (1 + K.exp(-1 * params['gamma'] * (dot(x[0], x[1]) + params['c'])))
        return lambda x: euclidean(x) * sigmoid(x)
    elif similarity == 'aesd':
        euclidean = lambda x: 0.5 / (1 + l2_norm(x[0], x[1]))
        sigmoid = lambda x: 0.5 / (1 + K.exp(-1 * params['gamma'] * (dot(x[0], x[1]) + params['c'])))
        return lambda x: euclidean(x) + sigmoid(x)
    else:
        raise Exception('Invalid similarity: {}'.format(similarity))
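# A hedged usage sketch for get_similarity: `DemoConfig` below is a hypothetical
# host object providing only the `similarity_params` attribute the method reads.
# The returned lambda expects tensors carrying `_keras_shape`, so it is wired up
# through a Lambda layer over Input tensors (old-style Keras, matching the code above).
from keras.layers import Input, Lambda

class DemoConfig(object):
    similarity_params = {'mode': 'sigmoid', 'gamma': 1.0, 'c': 1.0}
    get_similarity = get_similarity  # reuse the function above as a method

question = Input(shape=(128,))
answer = Input(shape=(128,))
# score = tanh(gamma * dot(question, answer) + c)
score = Lambda(DemoConfig().get_similarity())([question, answer])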
def call(self, x, mask=None):
    # x: (batch_size, input_length, input_dim) where input_length = head_size + 2
    head_encoding = x[:, :-2, :]  # (batch_size, head_size, input_dim)
    prep_encoding = x[:, -2, :]   # (batch_size, input_dim)
    child_encoding = x[:, -1, :]  # (batch_size, input_dim)
    if self.composition_type == 'HPCD':
        # TODO: The following line may not work with TF.
        # (batch_size, head_size, input_dim, 1) * (1, head_size, input_dim, proj_dim)
        head_proj_prod = K.expand_dims(head_encoding) * K.expand_dims(self.dist_proj_head, dim=0)
        head_projection = K.sum(head_proj_prod, axis=2)  # (batch_size, head_size, proj_dim)
    else:
        head_projection = K.dot(head_encoding, self.proj_head)  # (batch_size, head_size, proj_dim)
    prep_projection = K.expand_dims(K.dot(prep_encoding, self.proj_prep), dim=1)  # (batch_size, 1, proj_dim)
    child_projection = K.expand_dims(K.dot(child_encoding, self.proj_child), dim=1)  # (batch_size, 1, proj_dim)
    # (batch_size, head_size, proj_dim)
    if self.composition_type == 'HPCT':
        composed_projection = K.tanh(head_projection + prep_projection + child_projection)
    elif self.composition_type == 'HPC' or self.composition_type == 'HPCD':
        prep_child_projection = K.tanh(prep_projection + child_projection)  # (batch_size, 1, proj_dim)
        composed_projection = K.tanh(head_projection + prep_child_projection)
    else:
        # Composition type is HC
        composed_projection = K.tanh(head_projection + child_projection)
    for hidden_layer in self.hidden_layers:
        composed_projection = K.tanh(K.dot(composed_projection, hidden_layer))  # (batch_size, head_size, proj_dim)
    # (batch_size, head_size)
    head_word_scores = K.squeeze(K.dot(composed_projection, self.scorer), axis=-1)
    if mask is None:
        attachment_probabilities = K.softmax(head_word_scores)  # (batch_size, head_size)
    else:
        if K.ndim(mask) > 2:
            # This means this layer came after a Bidirectional layer. Keras has this bug which
            # concatenates input masks instead of output masks.
            # TODO: Fix Bidirectional instead.
            mask = K.any(mask, axis=(-2, -1))
        # We need to do a masked softmax.
        exp_scores = K.exp(head_word_scores)  # (batch_size, head_size)
        head_mask = mask[:, :-2]  # (batch_size, head_size)
        # (batch_size, head_size)
        masked_exp_scores = switch(head_mask, exp_scores, K.zeros_like(head_encoding[:, :, 0]))
        # (batch_size, 1). Adding epsilon to avoid division by 0. But epsilon is float64.
        exp_sum = K.cast(K.expand_dims(K.sum(masked_exp_scores, axis=1) + K.epsilon()), 'float32')
        attachment_probabilities = masked_exp_scores / exp_sum  # (batch_size, head_size)
    return attachment_probabilities
def step(self, x, states):
    h_tild_tm1 = states[0]
    B_U = states[1]
    B_W = states[2]

    if self.consume_less == 'cpu':
        x_i = x[:, :self.output_dim]
        x_f = x[:, self.output_dim: 2 * self.output_dim]
        x_c = x[:, 2 * self.output_dim: 3 * self.output_dim]
        x_o = x[:, 3 * self.output_dim: 4 * self.output_dim]
        x_new = x[:, 4 * self.output_dim:]
    else:
        x_i = K.dot(x * B_W[0], self.W_i) + self.b_i
        x_f = K.dot(x * B_W[1], self.W_f) + self.b_f
        x_c = K.dot(x * B_W[2], self.W_c) + self.b_c
        x_o = K.dot(x * B_W[3], self.W_o) + self.b_o
        x_new = x

    # self.C_tape -> BT, t-1, k
    # self.H_tape -> BT, t-1, k
    # x -> BT, k
    # h_tild_tm1 -> BT, k
    if self.H_tape is None:
        self.H_tape = K.zeros_like(h_tild_tm1).dimshuffle((0, 'x', 1))
        self.C_tape = K.zeros_like(h_tild_tm1).dimshuffle((0, 'x', 1))

    # s_t -> BT, t-1, 1
    t = K.shape(self.C_tape)[1]
    sum1 = K.dot(self.H_tape, self.W_h)
    sum2 = K.dot(K.repeat_elements(x_new.dimshuffle((0, 'x', 1)), t, axis=1), self.W_x)
    sum3 = K.dot(K.repeat_elements(h_tild_tm1.dimshuffle((0, 'x', 1)), t, axis=1), self.W_h_tilde)
    tanhed_sum = K.tanh(sum1 + sum2 + sum3)
    a_t = K.dot(tanhed_sum, self.v)[:, :, 0]
    s_t = K.softmax(a_t)

    h_tilde_t = T.batched_dot(self.H_tape.dimshuffle((0, 2, 1)), s_t.dimshuffle((0, 1, 'x')))[:, :, 0]
    c_tilde_t = T.batched_dot(self.C_tape.dimshuffle((0, 2, 1)), s_t.dimshuffle((0, 1, 'x')))[:, :, 0]

    i = self.inner_activation(x_i + K.dot(h_tilde_t * B_U[0], self.U_i))
    f = self.inner_activation(x_f + K.dot(h_tilde_t * B_U[1], self.U_f))
    c_t = f * c_tilde_t + i * self.activation(x_c + K.dot(h_tilde_t * B_U[2], self.U_c))
    o = self.inner_activation(x_o + K.dot(h_tilde_t * B_U[3], self.U_o))
    h_t = o * self.activation(c_t)

    # Add to tape
    self.C_tape = K.concatenate([self.C_tape, c_t.dimshuffle((0, 'x', 1))], axis=1)
    self.H_tape = K.concatenate([self.H_tape, h_t.dimshuffle((0, 'x', 1))], axis=1)

    return h_t, [h_tilde_t]
def _additive_similarity(self, source, query):
    concatenation = K.concatenate([source, query], axis=2)
    nonlinearity = K.tanh(K.dot(concatenation, self._weights["w_a"]))
    # tile the weight vector (1, 1, dim) for each time step and each element
    # of the batch -> (bs, T, dim)
    source_shape = K.shape(source)
    vaeff = K.tile(K.expand_dims(self._weights["v_a"], 0),
                   [source_shape[0], source_shape[1], 1])
    similarity = K.batch_dot(K.permute_dimensions(vaeff, [0, 2, 1]),
                             nonlinearity, axes=[1, 2])
    return similarity
def call(self, x, mask=None):
    e = K.dot(x, self.W)
    if self.bias:
        e += self.b
    e = K.tanh(e)
    e = K.reshape(K.dot(e, self.U), (-1, self.timesteps))
    a = K.exp(e)
    if mask is not None:
        a *= K.cast(mask, K.floatx())
    a_weights = a / K.cast(K.sum(a, axis=-1, keepdims=True) + K.epsilon(), K.floatx())
    weighted_output = x * K.expand_dims(a_weights, axis=-1)
    return [K.mean(weighted_output, axis=1), a_weights]
def get_w(self, x, mask=None):
    input_shape = K.int_shape(x)
    features_dim = self.features_dim
    step_dim = input_shape[1]
    eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                          K.reshape(self.W, (features_dim, 1))),
                    (-1, step_dim))
    if self.bias:
        eij += self.b[:input_shape[1]]
    eij = K.tanh(eij)
    a = K.exp(eij)
    if mask is not None:
        a *= K.cast(mask, K.floatx())
    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
    return a
def call(self, x, mask=None):
    x_transpose = K.permute_dimensions(x, (0, 2, 1))
    e = K.dot(x_transpose, self.W)
    if self.bias:
        e += self.b
    e = K.tanh(e)
    if not self.simple:
        e = K.permute_dimensions(e, (0, 2, 1))
        e = K.reshape(K.dot(e, self.V), (-1, self.timesteps))
    else:
        e = K.mean(e, axis=1)
    a = K.exp(e)
    if mask is not None:
        a *= K.cast(mask, K.floatx())
    a_weights = a / K.cast(K.sum(a, axis=-1, keepdims=True) + K.epsilon(), K.floatx())
    weighted_output = x * K.expand_dims(a_weights, axis=-1)
    return [K.sum(weighted_output, axis=1), a_weights]
def call(self, x, mask=None):
    # size of x: [batch_size, seq_len, attention_dim]
    # size of u: [batch_size, attention_dim]
    # uit = tanh(xW + b)
    uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
    ait = K.dot(uit, self.u)
    ait = K.squeeze(ait, -1)
    ait = K.exp(ait)
    if mask is not None:
        # Cast the mask to floatX to avoid float64 upcasting in theano
        ait *= K.cast(mask, K.floatx())
    ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
    ait = K.expand_dims(ait)
    weighted_input = x * ait
    output = K.sum(weighted_input, axis=1)
    return output
def call(self, inputs, mask=None):
    if not isinstance(inputs, list) or len(inputs) <= 1:
        raise Exception('BilinearTensorLayer must be called on a list of tensors '
                        '(at least 2). Got: ' + str(inputs))
    e1 = inputs[0]
    e2 = inputs[1]
    batch_size = K.shape(e1)[0]
    k = self.output_dim
    # print([e1, e2])
    feed_forward_product = K.dot(K.concatenate([e1, e2]), self.V)
    # print(feed_forward_product)
    bilinear_tensor_products = [K.sum((e2 * K.dot(e1, self.W[0])) + self.b, axis=1)]
    # print(bilinear_tensor_products)
    for i in range(1, k):
        btp = K.sum((e2 * K.dot(e1, self.W[i])) + self.b, axis=1)
        bilinear_tensor_products.append(btp)
    result = K.tanh(K.reshape(K.concatenate(bilinear_tensor_products, axis=0),
                              (batch_size, k)) + feed_forward_product)
    # print(result)
    return result
def call(self, x, mask=None):
    mean = super(IntraAttention, self).call(x, mask)
    # x: (batch_size, input_length, input_dim)
    # mean: (batch_size, input_dim)
    ones = K.expand_dims(K.mean(K.ones_like(x), axis=(0, 2)), dim=0)  # (1, input_length)
    # (batch_size, input_length, input_dim)
    tiled_mean = K.permute_dimensions(K.dot(K.expand_dims(mean), ones), (0, 2, 1))
    if mask is not None:
        if K.ndim(mask) > K.ndim(x):
            # Assuming this is because of the bug in Bidirectional. Temporary fix follows.
            # TODO: Fix Bidirectional.
            mask = K.any(mask, axis=(-2, -1))
        if K.ndim(mask) < K.ndim(x):
            mask = K.expand_dims(mask)
        x = switch(mask, x, K.zeros_like(x))
    # (batch_size, input_length, proj_dim)
    projected_combination = K.tanh(K.dot(x, self.vector_projector) +
                                   K.dot(tiled_mean, self.mean_projector))
    scores = K.dot(projected_combination, self.scorer)  # (batch_size, input_length)
    weights = K.softmax(scores)  # (batch_size, input_length)
    attended_x = K.sum(K.expand_dims(weights) * x, axis=1)  # (batch_size, input_dim)
    return attended_x
def call(self, inputs, mask=None):
    if not isinstance(inputs, list) or len(inputs) <= 1:
        raise TypeError('Attention must be called on a list of tensors '
                        '(at least 2). Got: ' + str(inputs))
    # (None(batch), MaxLen(time), spec_dim, embed_dim)
    mix_embed_l = inputs[0]
    # (None(batch), embed_dim)
    spk_embed_l = inputs[1]
    energy = None
    if self.mode == 'dot':
        # (batch, time, spec_dim, embed_dim) batch_dot(3,1) (batch, embed_dim)
        # -> (batch, time, spec_dim)
        energy = K.batch_dot(mix_embed_l, spk_embed_l, axes=(3, 1))
    elif self.mode == 'align':
        # (batch, time, spec_dim, embed_dim) dot (embed_dim, align_hidden)
        # -> (batch, time, spec_dim, align_hidden)
        hUa = K.dot(mix_embed_l, self.U_align)
        # (batch, embed_dim) dot (embed_dim, align_hidden)
        # -> (batch, align_hidden)
        sWa = K.dot(spk_embed_l, self.W_align)
        sWa = sWa.dimshuffle(0, 'x', 'x', 1)
        # -> (batch, time, spec_dim, align_hidden)
        tanh_sWahUa = K.tanh(sWa + hUa)
        # -> (batch, time, spec_dim, align_hidden) dot (align_hidden, 1)
        # -> (batch, time, spec_dim, 1)
        energy = K.dot(tanh_sWahUa, self.v_align)
        # -> (batch, time, spec_dim)
        energy = K.reshape(energy, (-1, self.time_step, self.spec_dim))
    else:
        raise ValueError('Unknown merge mode.')

    if self.nonlinearity == 'sigmoid':
        alpha = K.sigmoid(energy)
    elif self.nonlinearity == 'linear':
        alpha = energy
    else:
        raise Exception('Unknown nonlinearity mode for attention: ' + self.nonlinearity)
    # (batch, time, spec_dim)
    return alpha
def call(self, x):
    assert K.backend() == 'tensorflow'
    # The model is described by the following equations:
    #   estimated_weight_i = dot_product(hidden_state_i, W)
    #   biased_weight = estimated_weight + bias
    #   non_linear_weight = tanh(biased_weight)
    estimated_weight = K.squeeze(K.dot(x, K.expand_dims(self.W, -1)), -1)
    biased_weight = estimated_weight + self.bias
    non_linear_weight = K.tanh(biased_weight)

    # For each hidden state, calculate how much it should contribute to the
    # context vector. This is the main part of attention. To convert the weights
    # to "probabilities", use a softmax: exp(x) / sum(exp(xi)).
    prob = K.exp(non_linear_weight)
    # Compute the total sum for each batch.
    total_sum = K.sum(prob, axis=1, keepdims=True)
    prob /= K.cast(total_sum, K.floatx())

    # Multiply each hidden value by the corresponding probability.
    prob = K.expand_dims(prob, -1)
    new_hidden_values = x * prob
    return K.sum(new_hidden_values, axis=1)
def attention_step(self, x, args):
    '''
    Attention step function.

    # Arguments
        args: [h_, context, projected_context,
               W_h_prj, w_prj_att, b_att,
               W_x_h, U_h_h, W_ctx_h, b_h,
               W_x_p, W_h_p, W_ctx_p, b_p]
        h_: (batch_size, dim_hidden)
        context: (batch_size, nb_context, dim_context)
        projected_context: (batch_size, nb_context, dim_projected_context)
            projected_context = context dot W_ctx_prj + b_ctx_prj,
            calculated before the step.
        W_h_prj: (dim_hidden, dim_projected_context)
        w_prj_att: (dim_projected_context, 1)
        b_att: (1,)
        W_x_h: (dim_embedding, dim_hidden)
        U_h_h: (dim_hidden, dim_hidden)
        W_ctx_h: (dim_context, dim_hidden)
        b_h: (dim_hidden,)
    '''
    assert len(args) == 1 + len(self.contexts) + len(self.params)
    [h_, context, projected_context,
     W_h_prj, w_prj_att, b_att,
     W_x_h, U_h_h, W_ctx_h, b_h] = args

    projected = K.expand_dims(K.dot(h_, W_h_prj), 1) + projected_context
    e = K.dot(K.tanh(projected), w_prj_att) + b_att
    alpha = K.softmax(K.flatten(e))
    weighted_context = K.sum((context * K.expand_dims(alpha)), 1)
    pre_act = K.dot(x, W_x_h) + K.dot(h_, U_h_h) + K.dot(weighted_context, W_ctx_h) + b_h
    h = K.sigmoid(pre_act)
    return h, [alpha, weighted_context], [h]
def call(self, x):
    return self.alpha * K.tanh(self.beta * x)
def scaled_tanh(x):
    return K.tanh(x) * 640
def tanhNorm(x):
    square_sum = K.sum(K.square(x), axis=-1, keepdims=True)
    dist = K.sqrt(K.maximum(square_sum, K.epsilon()))
    tanh = K.tanh(dist)
    scale = tanh / dist
    return x * scale
def sigtan(x: tf.Tensor):
    # Split the feature axis in half: the first half gates (sigmoid), the
    # second half carries content (tanh), in the style of a GLU gate.
    feat_len = x.shape.as_list()[-1] // 2
    xsig = x[..., :feat_len]
    xtan = x[..., feat_len:]
    return K.sigmoid(xsig) * K.tanh(xtan)
def __init__(self, model, intensity_range, regularization, input_shape,
             init_cost, steps, mini_batch, lr, num_classes,
             channels_first=CHANNELS_FIRST, upsample_size=UPSAMPLE_SIZE,
             attack_succ_threshold=ATTACK_SUCC_THRESHOLD, patience=PATIENCE,
             cost_multiplier=COST_MULTIPLIER, reset_cost_to_zero=RESET_COST_TO_ZERO,
             mask_min=MASK_MIN, mask_max=MASK_MAX,
             color_min=COLOR_MIN, color_max=COLOR_MAX, img_color=IMG_COLOR,
             shuffle=SHUFFLE, batch_size=BATCH_SIZE, verbose=VERBOSE,
             return_logs=RETURN_LOGS, save_last=SAVE_LAST,
             epsilon=EPSILON, early_stop=EARLY_STOP,
             early_stop_threshold=EARLY_STOP_THRESHOLD,
             early_stop_patience=EARLY_STOP_PATIENCE,
             save_tmp=SAVE_TMP, tmp_dir=TMP_DIR,
             raw_input_flag=RAW_INPUT_FLAG):

    assert intensity_range in {'imagenet', 'inception', 'mnist', 'raw'}
    assert regularization in {None, 'l1', 'l2'}

    self.model = model
    self.intensity_range = intensity_range
    self.regularization = regularization
    self.input_shape = input_shape
    self.channels_first = channels_first
    self.init_cost = init_cost
    self.steps = steps
    self.mini_batch = mini_batch
    self.lr = lr
    self.num_classes = num_classes
    self.upsample_size = upsample_size
    self.attack_succ_threshold = attack_succ_threshold
    self.patience = patience
    self.cost_multiplier_up = cost_multiplier
    self.cost_multiplier_down = cost_multiplier ** 1.5
    self.reset_cost_to_zero = reset_cost_to_zero
    self.mask_min = mask_min
    self.mask_max = mask_max
    self.color_min = color_min
    self.color_max = color_max
    self.img_color = img_color
    self.shuffle = shuffle
    self.batch_size = batch_size
    self.verbose = verbose
    self.return_logs = return_logs
    self.save_last = save_last
    self.epsilon = epsilon
    self.early_stop = early_stop
    self.early_stop_threshold = early_stop_threshold
    self.early_stop_patience = early_stop_patience
    self.save_tmp = save_tmp
    self.tmp_dir = tmp_dir
    self.raw_input_flag = raw_input_flag

    if self.channels_first:
        mask_size = np.ceil(np.array(input_shape[1:], dtype=float) / upsample_size)
    else:
        mask_size = np.ceil(np.array(input_shape[0:2], dtype=float) / upsample_size)
    mask_size = mask_size.astype(int)
    self.mask_size = mask_size
    mask = np.zeros(self.mask_size)
    pattern = np.zeros(input_shape)
    if self.channels_first:
        mask = np.expand_dims(mask, axis=0)
    else:
        mask = np.expand_dims(mask, axis=2)

    mask_tanh = np.zeros_like(mask)
    pattern_tanh = np.zeros_like(pattern)

    # prepare mask related tensors
    self.mask_tanh_tensor = K.variable(mask_tanh)
    mask_tensor_unrepeat = (K.tanh(self.mask_tanh_tensor) /
                            (2 - self.epsilon) + 0.5)
    if self.channels_first:
        mask_tensor_unexpand = K.repeat_elements(mask_tensor_unrepeat,
                                                 rep=self.img_color, axis=0)
    else:
        mask_tensor_unexpand = K.repeat_elements(mask_tensor_unrepeat,
                                                 rep=self.img_color, axis=2)
    self.mask_tensor = K.expand_dims(mask_tensor_unexpand, axis=0)
    upsample_layer = UpSampling2D(size=(self.upsample_size, self.upsample_size))
    mask_upsample_tensor_uncrop = upsample_layer(self.mask_tensor)
    uncrop_shape = K.int_shape(mask_upsample_tensor_uncrop)[1:]
    if self.channels_first:
        cropping_layer = Cropping2D(
            cropping=((0, uncrop_shape[1] - self.input_shape[1]),
                      (0, uncrop_shape[2] - self.input_shape[2])))
    else:
        cropping_layer = Cropping2D(
            cropping=((0, uncrop_shape[0] - self.input_shape[0]),
                      (0, uncrop_shape[1] - self.input_shape[1])))
    self.mask_upsample_tensor = cropping_layer(mask_upsample_tensor_uncrop)
    reverse_mask_tensor = (K.ones_like(self.mask_upsample_tensor) -
                           self.mask_upsample_tensor)

    def keras_preprocess(x_input, intensity_range):
        if intensity_range == 'raw':
            x_preprocess = x_input
        elif intensity_range == 'imagenet':
            # 'RGB' -> 'BGR'
            x_tmp = x_input[..., ::-1]
            # Zero-center by mean pixel
            mean = K.constant([[[103.939, 116.779, 123.68]]])
            x_preprocess = x_tmp - mean
        elif intensity_range == 'inception':
            x_preprocess = (x_input / 255.0 - 0.5) * 2.0
        elif intensity_range == 'mnist':
            x_preprocess = x_input / 255.0
        else:
            raise Exception('unknown intensity_range %s' % intensity_range)
        return x_preprocess

    def keras_reverse_preprocess(x_input, intensity_range):
        if intensity_range == 'raw':
            x_reverse = x_input
        elif intensity_range == 'imagenet':
            # Zero-center by mean pixel
            mean = K.constant([[[103.939, 116.779, 123.68]]])
            x_reverse = x_input + mean
            # 'BGR' -> 'RGB'
            x_reverse = x_reverse[..., ::-1]
        elif intensity_range == 'inception':
            x_reverse = (x_input / 2 + 0.5) * 255.0
        elif intensity_range == 'mnist':
            x_reverse = x_input * 255.0
        else:
            raise Exception('unknown intensity_range %s' % intensity_range)
        return x_reverse

    # prepare pattern related tensors
    self.pattern_tanh_tensor = K.variable(pattern_tanh)
    self.pattern_raw_tensor = (
        (K.tanh(self.pattern_tanh_tensor) / (2 - self.epsilon) + 0.5) * 255.0)

    # prepare input image related tensors
    # ignore clip operation here
    # assume input image is already clipped into valid color range
    input_tensor = K.placeholder(model.input_shape)
    if self.raw_input_flag:
        input_raw_tensor = input_tensor
    else:
        input_raw_tensor = keras_reverse_preprocess(input_tensor,
                                                    self.intensity_range)

    # IMPORTANT: MASK OPERATION IN RAW DOMAIN
    X_adv_raw_tensor = (reverse_mask_tensor * input_raw_tensor +
                        self.mask_upsample_tensor * self.pattern_raw_tensor)

    X_adv_tensor = keras_preprocess(X_adv_raw_tensor, self.intensity_range)

    output_tensor = model(X_adv_tensor)
    y_true_tensor = K.placeholder(model.output_shape)

    # TODO: remove quick fixes.
    # output of mnist is (batch, 1, 10), so oddly we need to squeeze.
    # The output is also before softmax.
    if self.intensity_range == 'mnist':
        output_tensor = K.softmax(output_tensor)
        output_tensor = K.squeeze(output_tensor, axis=1)
        y_true_tensor = K.squeeze(y_true_tensor, axis=1)
        print(output_tensor.shape)
        print(y_true_tensor.shape)

    self.loss_acc = categorical_accuracy(output_tensor, y_true_tensor)
    self.loss_ce = categorical_crossentropy(output_tensor, y_true_tensor)

    if self.regularization is None:
        self.loss_reg = K.constant(0)
    elif self.regularization == 'l1':
        self.loss_reg = K.sum(K.abs(self.mask_upsample_tensor)) / self.img_color
    elif self.regularization == 'l2':
        self.loss_reg = K.sqrt(K.sum(K.square(self.mask_upsample_tensor)) /
                               self.img_color)

    cost = self.init_cost
    self.cost_tensor = K.variable(cost)
    self.loss = self.loss_ce + self.loss_reg * self.cost_tensor

    self.opt = Adam(lr=self.lr, beta_1=0.5, beta_2=0.9)
    self.updates = self.opt.get_updates(
        params=[self.pattern_tanh_tensor, self.mask_tanh_tensor],
        loss=self.loss)
    self.train = K.function(
        [input_tensor, y_true_tensor],
        [self.loss_ce, self.loss_reg, self.loss, self.loss_acc],
        updates=self.updates)
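# Note on the reparameterization used above: tanh(v) / (2 - epsilon) + 0.5 maps any
# real-valued v into (0, 1), so the optimizer can search mask and pattern values in
# an unconstrained space. A quick standalone check (illustrative values only):
import numpy as np
from keras import backend as K

v = K.variable(np.array([-50.0, 0.0, 50.0]))
print(K.eval(K.tanh(v) / (2 - K.epsilon()) + 0.5))  # ~[0.0, 0.5, 1.0]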
def triplet_tanh_pn_loss(y_true, y_pred):
    return K.mean(K.tanh(y_pred[:, 0, 0]) +
                  ((K.constant(1) - K.tanh(y_pred[:, 1, 0])) +
                   (K.constant(1) - K.tanh(y_pred[:, 2, 0]))) / K.constant(2))
def __init__(
    self,
    model: KERAS_MODEL_TYPE,
    use_logits: bool = False,
    channels_first: bool = False,
    clip_values: Optional["CLIP_VALUES_TYPE"] = None,
    preprocessing_defences: Union["Preprocessor", List["Preprocessor"], None] = None,
    postprocessing_defences: Union["Postprocessor", List["Postprocessor"], None] = None,
    preprocessing: "PREPROCESSING_TYPE" = (0.0, 1.0),
    input_layer: int = 0,
    output_layer: int = 0,
    steps: int = 1000,
    init_cost: float = 1e-3,
    norm: Union[int, float] = 2,
    learning_rate: float = 0.1,
    attack_success_threshold: float = 0.99,
    patience: int = 5,
    early_stop: bool = True,
    early_stop_threshold: float = 0.99,
    early_stop_patience: int = 10,
    cost_multiplier: float = 1.5,
    batch_size: int = 32,
):
    """
    Create a Neural Cleanse classifier.

    :param model: Keras model, neural network or other.
    :param use_logits: True if the output of the model are logits; false for probabilities or any other type of
           outputs. Logits output should be favored when possible to ensure attack efficiency.
    :param channels_first: Set channels first or last.
    :param clip_values: Tuple of the form `(min, max)` of floats or `np.ndarray` representing the minimum and
           maximum values allowed for features. If floats are provided, these will be used as the range of all
           features. If arrays are provided, each value will be considered the bound for a feature, thus the
           shape of clip values needs to match the total number of features.
    :param preprocessing_defences: Preprocessing defence(s) to be applied by the classifier.
    :param postprocessing_defences: Postprocessing defence(s) to be applied by the classifier.
    :param preprocessing: Tuple of the form `(subtrahend, divisor)` of floats or `np.ndarray` of values to be
           used for data preprocessing. The first value will be subtracted from the input. The input will then
           be divided by the second one.
    :param input_layer: The index of the layer to consider as input for models with multiple input layers. The
           layer with this index will be considered for computing gradients. For models with only one input
           layer this value is not required.
    :param output_layer: Which layer to consider as the output when the model has multiple output layers. The
           layer with this index will be considered for computing gradients. For models with only one output
           layer this value is not required.
    :param steps: The maximum number of steps to run the Neural Cleanse optimization.
    :param init_cost: The initial value for the cost tensor in the Neural Cleanse optimization.
    :param norm: The norm to use for the Neural Cleanse optimization, can be 1, 2, or np.inf.
    :param learning_rate: The learning rate for the Neural Cleanse optimization.
    :param attack_success_threshold: The threshold at which the generated backdoor is successful enough to stop
           the Neural Cleanse optimization.
    :param patience: How long to wait for changing the cost multiplier in the Neural Cleanse optimization.
    :param early_stop: Whether or not to allow early stopping in the Neural Cleanse optimization.
    :param early_stop_threshold: How close values need to come to max value to start counting early stop.
    :param early_stop_patience: How long to wait to determine early stopping in the Neural Cleanse optimization.
    :param cost_multiplier: How much to change the cost in the Neural Cleanse optimization.
    :param batch_size: The batch size for optimizations in the Neural Cleanse optimization.
    """
    import keras.backend as K
    from keras.losses import categorical_crossentropy
    from keras.metrics import categorical_accuracy

    super().__init__(
        model=model,
        use_logits=use_logits,
        channels_first=channels_first,
        clip_values=clip_values,
        preprocessing_defences=preprocessing_defences,
        postprocessing_defences=postprocessing_defences,
        preprocessing=preprocessing,
        input_layer=input_layer,
        output_layer=output_layer,
        steps=steps,
        init_cost=init_cost,
        norm=norm,
        learning_rate=learning_rate,
        attack_success_threshold=attack_success_threshold,
        early_stop=early_stop,
        early_stop_threshold=early_stop_threshold,
        early_stop_patience=early_stop_patience,
        patience=patience,
        cost_multiplier=cost_multiplier,
        batch_size=batch_size,
    )

    mask = np.random.uniform(size=super().input_shape)
    pattern = np.random.uniform(size=super().input_shape)

    self.epsilon = K.epsilon()

    # Normalize mask between [0, 1]
    self.mask_tensor_raw = K.variable(mask)
    # self.mask_tensor = K.expand_dims(K.tanh(self.mask_tensor_raw) / (2 - self.epsilon) + 0.5, axis=0)
    self.mask_tensor = K.tanh(self.mask_tensor_raw) / (2 - self.epsilon) + 0.5

    # Normalize pattern between [0, 1]
    self.pattern_tensor_raw = K.variable(pattern)
    self.pattern_tensor = K.expand_dims(K.tanh(self.pattern_tensor_raw) / (2 - self.epsilon) + 0.5, axis=0)

    reverse_mask_tensor = K.ones_like(self.mask_tensor) - self.mask_tensor
    input_tensor = K.placeholder(model.input_shape)
    x_adv_tensor = reverse_mask_tensor * input_tensor + self.mask_tensor * self.pattern_tensor

    output_tensor = self.model(x_adv_tensor)
    y_true_tensor = K.placeholder(model.outputs[0].shape.as_list())

    self.loss_acc = categorical_accuracy(output_tensor, y_true_tensor)
    self.loss_ce = categorical_crossentropy(output_tensor, y_true_tensor)

    if self.norm == 1:
        # TODO: change 3 to dynamically set img_color
        self.loss_reg = K.sum(K.abs(self.mask_tensor)) / 3
    elif self.norm == 2:
        self.loss_reg = K.sqrt(K.sum(K.square(self.mask_tensor)) / 3)

    self.cost = self.init_cost
    self.cost_tensor = K.variable(self.cost)
    self.loss_combined = self.loss_ce + self.loss_reg * self.cost_tensor

    try:
        from keras.optimizers import Adam

        self.opt = Adam(lr=self.learning_rate, beta_1=0.5, beta_2=0.9)
    except ImportError:
        from keras.optimizers import adam_v2

        self.opt = adam_v2.Adam(lr=self.learning_rate, beta_1=0.5, beta_2=0.9)
    self.updates = self.opt.get_updates(
        params=[self.pattern_tensor_raw, self.mask_tensor_raw], loss=self.loss_combined
    )
    self.train = K.function(
        [input_tensor, y_true_tensor],
        [self.loss_ce, self.loss_reg, self.loss_combined, self.loss_acc],
        updates=self.updates,
    )
def hierarchical_containment_layer_sum_activation(x):
    return K.tanh(K.sum(x, axis=-1, keepdims=True))
def call(self, x):
    e_t = kb.squeeze(kb.tanh(kb.dot(x, self.w) + self.b), axis=-1)
    a_t = kb.softmax(e_t)
    a_t = kb.expand_dims(a_t, axis=-1)
    return kb.sum(x * a_t, axis=1)
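# A standalone sketch of the attention pooling that call() above computes, with
# explicit placeholder weights (hypothetical shapes: T timesteps, D features):
import numpy as np
from keras import backend as kb

T, D = 5, 4
x = kb.variable(np.random.rand(2, T, D))  # (batch, T, D)
w = kb.variable(np.random.rand(D, 1))     # scoring vector, plays the role of self.w
b = kb.variable(np.zeros((T, 1)))         # per-step bias, plays the role of self.b

e_t = kb.squeeze(kb.tanh(kb.dot(x, w) + b), axis=-1)       # (batch, T) scores
a_t = kb.softmax(e_t)                                      # attention weights
pooled = kb.sum(x * kb.expand_dims(a_t, axis=-1), axis=1)  # (batch, D)
print(kb.eval(pooled).shape)  # (2, 4)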
def call(self, inputs):
    x = K.permute_dimensions(inputs, (0, 2, 1))
    a = K.softmax(K.tanh(K.dot(x, self.W) + self.b))
    outputs = K.permute_dimensions(a * x, (0, 2, 1))
    outputs = K.sum(outputs, axis=1)
    return outputs
def call(self, inputs, **kwargs):
    pos = keras.activations.relu(inputs)
    neg = self.alpha * K.tanh(-self.beta * keras.activations.relu(-inputs))
    return pos + neg
def mean_distance(y_true, y_pred):
    return K.sqrt(K.mean(K.pow(y_true[:, 2] - K.tanh(y_pred[:, 2]), 2)))
def mixed_loss(target, output):
    loss1 = K.binary_crossentropy(K.sigmoid(output[:, 0]), target[:, 0])
    loss2 = K.binary_crossentropy(K.sigmoid(output[:, 1]), target[:, 1])
    loss3 = mean_squared_error(K.tanh(output[:, 2]), target[:, 2])
    return loss1 + loss2 + loss3
def call(self, x):
    et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
    at = K.softmax(et)
    at = K.expand_dims(at, axis=-1)
    output = x * at
    return K.sum(output, axis=1)
def NewTanh(x):
    return K.tanh(x)
def call(self, P, **kwargs):
    """
    :param P: inputs
    :return: encoding of inputs P
    """
    ''' Paper notations in the code '''
    # P = P_hw
    # itr_attn = P_itrAtt
    # encoding = P_enc
    # The paper takes inputs to be P(_hw) as an example and then computes the same
    # thing for H, therefore we'll name our inputs P too.

    # Input of encoding is P with shape (batch, p, d). It would be (batch, h, d)
    # for the hypothesis.
    # Construct alphaP of shape (batch, p, 3*d, p)
    # A = dot(w_itr_att, alphaP)

    # alphaP consists of 3*d rows along the 2nd axis:
    # 1. up   -> first d items represent P[i]
    # 2. mid  -> second d items represent P[j]
    # 3. down -> final items represent alpha(P[i], P[j]), which is the element-wise
    #            product of P[i] and P[j] = P[i] * P[j]

    # If we look at one slice of alphaP we'll see that it has the following elements:
    # ----------------------------------------
    # P[i][0], P[i][0], P[i][0], ... P[i][0]   ▲
    # P[i][1], P[i][1], P[i][1], ... P[i][1]   |
    # P[i][2], P[i][2], P[i][2], ... P[i][2]   |
    # ...                            ...       | up
    # ...                            ...       |
    # P[i][d], P[i][d], P[i][d], ... P[i][d]   ▼
    # ----------------------------------------
    # P[0][0], P[1][0], P[2][0], ... P[p][0]   ▲
    # P[0][1], P[1][1], P[2][1], ... P[p][1]   |
    # P[0][2], P[1][2], P[2][2], ... P[p][2]   |
    # ...                            ...       | mid
    # ...                            ...       |
    # P[0][d], P[1][d], P[2][d], ... P[p][d]   ▼
    # ----------------------------------------
    #                                          ▲
    #            up * mid                      | down
    #      element-wise product                |
    #                                          ▼
    # ----------------------------------------

    # For every slice(i) the up part changes its P[i] values.
    # The middle part is repeated p times in depth (for every i).
    # So we can get the middle part by doing the following:
    #   mid = broadcast(P) -> to get a tensor of shape (batch, p, d, p)
    # As we can notice, up is the same as mid but with swapped axes, so to obtain
    # up from mid we can do:
    #   up = swap_axes(mid, axis1=0, axis2=2)

    ''' Alpha '''
    # P                                                     # (batch, p, d)
    mid = broadcast_last_axis(P)                            # (batch, p, d, p)
    up = K.permute_dimensions(mid, pattern=(0, 3, 2, 1))    # (batch, p, d, p)
    alphaP = K.concatenate([up, mid, up * mid], axis=2)     # (batch, p, 3d, p)
    A = K.dot(self.w_itr_att, alphaP)                       # (batch, p, p)

    ''' Self-attention '''
    # P_itr_attn[i] = sum over j = 1...p of:
    #   s = sum(for k = 1...p: e^A[k][j])
    #   (e^A[i][j] / s) * P[j]  --> P[j] is the j-th row, while the first part is a number
    # So P_itr_attn is the weighted sum of P.
    # SA is column-wise soft-max applied on A.
    # P_itr_attn[i] is the sum of all rows of P scaled by the i-th row of SA.
    SA = softmax(A, axis=2)         # (batch, p, p)
    itr_attn = K.batch_dot(SA, P)   # (batch, p, d)

    ''' Fuse gate '''
    # These layers are considered linear in the official implementation,
    # therefore we apply dropout on each input.
    P_concat = K.concatenate([P, itr_attn], axis=2)                       # (batch, p, 2d)
    z = K.tanh(K.dot(DecayingDropout()(P_concat), self.w1) + self.b1)     # (batch, p, d)
    r = K.sigmoid(K.dot(DecayingDropout()(P_concat), self.w2) + self.b2)  # (batch, p, d)
    f = K.sigmoid(K.dot(DecayingDropout()(P_concat), self.w3) + self.b3)  # (batch, p, d)

    encoding = r * P + f * z    # (batch, p, d)
    return encoding             # (batch, p, d)
def call(self, inputs):
    W = K.tanh(self.W_hat) * K.sigmoid(self.M_hat)
    return K.dot(inputs, W)
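# The weight construction above matches the Neural Accumulator (NAC) of Trask et
# al. (2018): tanh(W_hat) * sigmoid(M_hat) saturates toward {-1, 0, 1}, biasing
# the layer to add or subtract inputs. A small numeric check with hypothetical values:
import numpy as np
from keras import backend as K

W_hat = K.variable(np.array([[10.0, -10.0, 0.0]]))
M_hat = K.variable(np.array([[10.0, 10.0, -10.0]]))
print(K.eval(K.tanh(W_hat) * K.sigmoid(M_hat)))  # ~[[1.0, -1.0, 0.0]]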
def gelu_tanh(x):
    """GELU computed with the tanh approximation."""
    cdf = 0.5 * (1.0 + K.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * K.pow(x, 3))))
    return x * cdf
def call(self, x):
    prog = K.tanh(self.W)
    out = x + prog
    return out
def call(self, inputs, **kwargs):
    W = K.tanh(self.W_hat) * K.sigmoid(self.M_hat)
    a = K.dot(inputs, W)
    return a
def mish(x):
    return x * K.tanh(K.softplus(x))
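# Quick numeric check of mish(x) = x * tanh(softplus(x)); mish(0) is exactly 0 and
# large positive inputs pass through almost unchanged. Values below are illustrative:
import numpy as np
from keras import backend as K

xs = K.variable(np.array([-2.0, 0.0, 2.0]))
print(K.eval(mish(xs)))  # ~[-0.2525, 0.0, 1.944]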
def scaled_hyperbolic_tangent(x):
    return K.tanh((2 / 3) * x) * 1.7159
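# This appears to be the scaled tanh recommended in LeCun et al., "Efficient
# BackProp": f(x) = 1.7159 * tanh(2x/3), with constants chosen so that f(±1) ≈ ±1.
# Quick check:
import numpy as np
from keras import backend as K

print(K.eval(scaled_hyperbolic_tangent(K.variable(np.array([-1.0, 1.0])))))  # ~[-1.0, 1.0]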
def call(self, inputs):
    return inputs * K.tanh(K.softplus(inputs))
def call(self, x):
    weights = K.transpose(K.tanh(K.dot(self.u, self.tinyW)))
    return K.dot(x, weights)
def call(self, inputs):
    a = K.dot(inputs[0], self.kernel)
    y_trans = K.permute_dimensions(inputs[1], (0, 2, 1))
    b = K.batch_dot(a, y_trans, axes=[2, 1])
    print(K.int_shape(b))
    return K.tanh(b)
def gelu(x):
    c = math.sqrt(2 / math.pi)
    return 0.5 * x * (1 + K.tanh(c * (x + 0.044715 * K.pow(x, 3))))
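# gelu() above is the tanh approximation of GELU. A hedged sanity check against
# the exact erf form, 0.5 * x * (1 + erf(x / sqrt(2))), using scipy (illustrative only):
import numpy as np
from scipy.special import erf
from keras import backend as K

xs = np.linspace(-3.0, 3.0, 7)
exact = 0.5 * xs * (1 + erf(xs / np.sqrt(2)))
approx = K.eval(gelu(K.variable(xs)))
print(np.max(np.abs(exact - approx)))  # small, on the order of 1e-3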
def gelu_tanh(x):
    cdf = 0.5 * (1.0 + K.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * K.pow(x, 3))))
    return x * cdf
def call(self, x):
    u_it = K.tanh(K.dot(x, self.W) + self.b)
    a_it = K.dot(u_it, self.u)
    a_it = K.squeeze(a_it, -1)
    a_it = K.softmax(a_it)
    return a_it
def mean_length_error(y_true, y_pred):
    y_true_f = K.sum(K.round(K.flatten(y_true)))
    y_pred_f = K.sum(K.round(K.flatten(y_pred)))
    delta = (y_pred_f - y_true_f)
    return K.mean(K.tanh(delta))
def call(self, inputs, **kwargs):
    results = inputs * K.tanh(K.softplus(inputs))
    return results
def call(self, x):
    tmp = K.tanh(K.dot(x[0], self.w1) + K.dot(x[1], self.w2))
    z = K.sigmoid(K.dot(tmp, self.w3))
    x_new = z * x[0] + (1 - z) * x[1]
    return x_new