def step(self, x, states):
    """Run one decoder step of the attention GRU.

    Args:
        x: Step input; it is not read by the computation below.
        states: Pair ``[ytm, stm]`` with the previous output and the
            previous hidden state.

    Returns:
        ``(output, [yt, st])``. ``output`` is the attention weights when
        ``self.return_probabilities`` is truthy, otherwise the softmax
        output ``yt``.
    """
    prev_output, prev_state = states

    # Score every position: tile the previous state over the sequence,
    # project it with W_a, add the cached term self._uxpb and reduce with V_a.
    tiled_state = K.repeat(prev_state, self.timesteps)
    projected_state = K.dot(tiled_state, self.W_a)
    energies = K.dot(activations.tanh(projected_state + self._uxpb),
                     K.expand_dims(self.V_a))

    # Normalise exp(energies) over axis 1 to obtain the attention weights
    # (documented upstream as a (batchsize, timesteps, 1) tensor).
    exp_energies = K.exp(energies)
    normaliser = K.repeat(K.sum(exp_energies, axis=1), self.timesteps)
    attention = exp_energies / normaliser

    # Context vector: attention-weighted combination of self.x_seq.
    context = K.squeeze(K.batch_dot(attention, self.x_seq, axes=1), axis=1)

    def affine(state, w, u, c, b):
        # Shared pre-activation used by both gates, the proposal and the output.
        return (K.dot(prev_output, w) + K.dot(state, u)
                + K.dot(context, c) + b)

    # Reset gate, update gate and proposal state of the GRU update.
    reset = activations.sigmoid(
        affine(prev_state, self.W_r, self.U_r, self.C_r, self.b_r))
    update = activations.sigmoid(
        affine(prev_state, self.W_z, self.U_z, self.C_z, self.b_z))
    proposal = activations.tanh(
        affine(reset * prev_state, self.W_p, self.U_p, self.C_p, self.b_p))

    # New hidden state: interpolate between the old state and the proposal.
    new_state = (1 - update) * prev_state + update * proposal

    # NOTE(review): the output is computed from the *previous* state, not
    # from new_state -- confirm this is intended.
    new_output = activations.softmax(
        affine(prev_state, self.W_o, self.U_o, self.C_o, self.b_o))

    emitted = attention if self.return_probabilities else new_output
    return emitted, [new_output, new_state]
def _step(self, xi_t, xf_t, xc_t, xo_t, h_tm1, c_tm1, u_i, u_f, u_o, u_c):
    """Advance an LSTM by one time step.

    Args:
        xi_t, xf_t, xc_t, xo_t: Input-side terms that are added to the
            recurrent projection for the input gate, forget gate, cell
            candidate and output gate respectively.
        h_tm1: Previous hidden state.
        c_tm1: Previous cell state.
        u_i, u_f, u_o, u_c: Recurrent weights applied to ``h_tm1`` for each
            of the four terms.

    Returns:
        Tuple ``(h_t, c_t)`` with the new hidden and cell states.
    """
    input_gate = hard_sigmoid(xi_t + T.dot(h_tm1, u_i))
    forget_gate = hard_sigmoid(xf_t + T.dot(h_tm1, u_f))
    candidate = tanh(xc_t + T.dot(h_tm1, u_c))

    # Keep part of the old cell state and write part of the candidate.
    c_t = forget_gate * c_tm1 + input_gate * candidate

    output_gate = hard_sigmoid(xo_t + T.dot(h_tm1, u_o))
    h_t = output_gate * tanh(c_t)
    return h_t, c_t
def test_tanh():
    """Check that ``activations.tanh`` agrees with ``np.tanh``."""
    samples = get_standard_values()

    # Build a backend function that evaluates tanh on a 2-D placeholder.
    placeholder = K.placeholder(ndim=2)
    tanh_fn = K.function([placeholder], [activations.tanh(placeholder)])

    outputs = tanh_fn([samples])
    assert_allclose(outputs[0], np.tanh(samples), rtol=1e-05)
def step_backward(inputs, states):
    """Run one backward-direction LSTM step.

    This is a closure: ``self`` is taken from the enclosing scope.

    Args:
        inputs: Step input; its axis 2 is contracted with the input kernels.
        states: ``[h_tm1, c_tm1]`` -- previous memory state and previous
            carry state.

    Returns:
        ``(h, [h, c])`` with the new memory state and carry state.
    """
    prev_h = states[0]  # previous memory state
    prev_c = states[1]  # previous carry state

    def contract(tensor, weights):
        # Contract axis 2 of `tensor` with axis 0 of `weights`.
        return tf.tensordot(tensor, weights, axes=[[2], [0]])

    # Input projections for the four gates ...
    in_i = contract(inputs, self.kernel_i_backward)
    in_f = contract(inputs, self.kernel_f_backward)
    in_c = contract(inputs, self.kernel_c_backward)
    in_o = contract(inputs, self.kernel_o_backward)

    # ... followed by their biases.
    in_i = K.bias_add(in_i, self.bias_i_backward)
    in_f = K.bias_add(in_f, self.bias_f_backward)
    in_c = K.bias_add(in_c, self.bias_c_backward)
    in_o = K.bias_add(in_o, self.bias_o_backward)

    input_gate = activations.hard_sigmoid(
        in_i + contract(prev_h, self.recurrent_kernel_i_backward))
    forget_gate = activations.hard_sigmoid(
        in_f + contract(prev_h, self.recurrent_kernel_f_backward))
    candidate = activations.tanh(
        in_c + contract(prev_h, self.recurrent_kernel_c_backward))
    c = forget_gate * prev_c + input_gate * candidate
    output_gate = activations.hard_sigmoid(
        in_o + contract(prev_h, self.recurrent_kernel_o_backward))

    h = output_gate * activations.tanh(c)
    return h, [h, c]
def get_initial_state(self, inputs):
    """Build the initial decoder states ``[y0, s0]`` from ``inputs``.

    Args:
        inputs: Input tensor, commented upstream as
            (samples, timesteps, input_dims).

    Returns:
        List ``[y0, s0]``: ``y0`` is an all-zero tensor with
        ``self.output_dim`` columns per sample and ``s0`` is
        ``tanh(inputs[:, 0] . W_s)``.
    """
    # Apply W_s to the first time step to get the initial hidden state s0.
    s0 = activations.tanh(K.dot(inputs[:, 0], self.W_s))

    # Derive the zero output from `inputs` (pattern taken from
    # keras.layers.recurrent) so it follows the batch size of the input.
    y0 = K.zeros_like(inputs)  # (samples, timesteps, input_dims)
    y0 = K.sum(y0, axis=(1, 2))  # (samples, )
    y0 = K.expand_dims(y0)  # (samples, 1)
    y0 = K.tile(y0, [1, self.output_dim])

    return [y0, s0]
def step(self, x, states):
    """Run one decoder step of the attention GRU.

    Args:
        x: Step input; it is not read by the computation below.
        states: Pair ``[y_prev, s_prev]`` with the previous output and the
            previous hidden state.

    Returns:
        ``(output, [y_current, s_current])``. ``output`` is the attention
        weights when ``self.return_attention_weights`` is truthy, otherwise
        the sigmoid output ``y_current``.
    """
    y_prev, s_prev = states

    # Alignment energies: tile the previous state over the sequence,
    # project it with W_a, add the cached term self.uh and reduce with V_a.
    s_all = K.repeat(s_prev, self.timesteps)
    Wa_s_all = K.dot(s_all, self.W_a)
    et = K.dot(activations.tanh(Wa_s_all + self.uh),
               K.expand_dims(self.V_a))

    # Attention weights, shape (batch_size, timestep, 1).
    # The softmax must run over the time axis (axis 1). A softmax over the
    # last axis, which has size 1 here, yields 1.0 for every position and so
    # no attention at all. Subtracting the per-sample maximum keeps exp()
    # well-behaved without changing the result.
    exp_et = K.exp(et - K.max(et, axis=1, keepdims=True))
    a_current = exp_et / K.sum(exp_et, axis=1, keepdims=True)

    # Context vector: attention-weighted combination of self.x_seq.
    context = K.squeeze(K.batch_dot(a_current, self.x_seq, axes=1), axis=1)

    # Reset gate.
    r_current = activations.sigmoid(
        K.dot(y_prev, self.W_r)
        + K.dot(s_prev, self.U_r)
        + K.dot(context, self.C_r)
        + self.b_r)

    # Update gate.
    z_current = activations.sigmoid(
        K.dot(y_prev, self.W_z)
        + K.dot(s_prev, self.U_z)
        + K.dot(context, self.C_z)
        + self.b_z)

    # Candidate state (s tilde).
    s_tilde = activations.tanh(
        K.dot(y_prev, self.W_c)
        + K.dot((r_current * s_prev), self.U_c)
        + K.dot(context, self.C_c)
        + self.b_c)

    # New hidden state: interpolate between the old state and the candidate.
    s_current = (1 - z_current) * s_prev + z_current * s_tilde

    # Output, computed from the new hidden state.
    y_current = activations.sigmoid(
        K.dot(y_prev, self.W_o)
        + K.dot(s_current, self.U_o)
        + K.dot(context, self.C_o)
        + self.b_o)

    if self.return_attention_weights:
        return a_current, [y_current, s_current]
    else:
        return y_current, [y_current, s_current]
def get_initial_state(self, inputs):
    """Build the initial decoder states ``[y0, s0, t]`` from ``inputs``.

    Args:
        inputs: Input tensor, commented upstream as
            (samples, timesteps, input_dims).

    Returns:
        List ``[y0, s0, t]``: an all-zero output with ``self.output_dim``
        columns per sample, the hidden state ``tanh(inputs[:, 0] . W_s)``
        and an int32 step counter starting at 0.
    """
    # Initial hidden state from the first time step.
    first_step = inputs[:, 0]
    s0 = activations.tanh(K.dot(first_step, self.W_s))

    # Zero output derived from `inputs` (pattern from keras.layers.recurrent):
    # (samples, timesteps, input_dims) -> (samples, ) -> (samples, 1)
    # -> tiled self.output_dim times along the last axis.
    zeros_per_sample = K.sum(K.zeros_like(inputs), axis=(1, 2))
    y0 = K.tile(K.expand_dims(zeros_per_sample), [1, self.output_dim])

    # Counter of the decoding timestep (for enforcing causality).
    t = K.variable(0, name='decode_t', dtype='int32')

    return [y0, s0, t]
def step(self, x_input, states):
    """Run one pointer-network step on top of the parent LSTM step.

    Args:
        x_input: Input for this step, forwarded to the parent ``step``.
        states: Recurrent states; the last entry is the encoder sequence and
            the remaining entries are passed to the parent ``step``.

    Returns:
        ``(pointer, [h, c])`` where ``pointer`` is the softmax over the
        scores ``vt * tanh(W1*e + W2*d)``.
    """
    input_shape = self.input_spec[0].shape
    encoder_seq = states[-1]
    _, [h, c] = super(PointerLSTM, self).step(x_input, states[:-1])

    # Tile the new hidden state so it lines up with the encoder sequence.
    decoder_seq = K.repeat(h, input_shape[1])

    # vt * tanh(W1*e + W2*d)
    encoder_term = time_distributed_dense(encoder_seq, self.W1, output_dim=1)
    decoder_term = time_distributed_dense(decoder_seq, self.W2, output_dim=1)
    scores = K.squeeze(self.vt * tanh(encoder_term + decoder_term), 2)

    # Turn the scores into a probability tensor.
    return softmax(scores), [h, c]
def _compute_energy(self, stm):
    """Compute the "concat" attention energies for the state ``stm``.

    Upstream formula: energy_i = g * V / |V| * tanh([stm, h_i] * W + b) + r.
    The ``g * V / |V|`` scaling is applied only when
    ``self.normalize_energy`` is truthy and the offset ``r`` only when
    ``self.is_monotonic`` is truthy.

    Args:
        stm: Previous decoder hidden state.

    Returns:
        The energy tensor ``et``.
    """
    projected_state = K.dot(stm, self.W_a)

    if self.normalize_energy:
        energy_vector = self.Energy_g * K.l2_normalize(self.V_a)
    else:
        energy_vector = self.V_a

    # Insert an axis at position 1 so the projected state combines with the
    # cached term self._uxpb.
    hidden = activations.tanh(
        K.expand_dims(projected_state, axis=1) + self._uxpb)
    et = K.dot(hidden, K.expand_dims(energy_vector))

    if self.is_monotonic:
        et += self.Energy_r
    return et
def calc_reduced_value(self, values):
    """Compute the new value for the REDUCE operation.

    The value is obtained from the last two vectors on the stack,
    ``values['stack_current']`` and ``values['stack_prev']``. Each stack
    vector is sliced into its first ``self.hidden_dim`` columns and the
    remaining columns; the result is laid out the same way, as ``[c, h]``.

    Args:
        values: Mapping providing ``'stack_current'`` and ``'stack_prev'``.

    Returns:
        The concatenation ``[c, h]`` along axis 1.
    """
    dim = self.hidden_dim
    current = values['stack_current']
    previous = values['stack_prev']

    # Joint projection of the trailing parts of both stack entries.
    joined = K.concatenate([current[:, dim:], previous[:, dim:]], axis=1)
    q = K.dot(joined, self.W_R) + self.b_R

    # The first 4 * dim columns are sigmoid gates, the rest is a tanh candidate.
    gates = sigmoid(q[:, :4 * dim])
    candidate = tanh(q[:, 4 * dim:])

    # Gate 1 scales the current leading part, gate 2 the previous leading
    # part and gate 0 the candidate.
    c = (gates[:, dim:2 * dim] * current[:, :dim]
         + gates[:, 2 * dim:3 * dim] * previous[:, :dim]
         + gates[:, :dim] * candidate)

    # Gate 3 scales the new c to give h.
    h = gates[:, 3 * dim:] * c

    return K.concatenate([c, h], axis=1)
def call(self, inputs):
    """Pool two sequences with attention weights derived from each other.

    A score matrix ``tanh((inputs[0] . kernel) @ inputs[1]^T)`` is averaged
    along each of its two trailing axes, each average is exp-normalised, and
    the resulting weights are used to pool the corresponding input.

    Args:
        inputs: List whose first two entries are the tensors to pool.

    Returns:
        ``[output0, output1]``, the pooled versions of ``inputs[0]`` and
        ``inputs[1]``.
    """
    assert isinstance(inputs, list)
    first = inputs[0]
    second = inputs[1]

    # Pairwise scores between the positions of both inputs.
    projected = K.dot(first, self.kernel)
    scores = tf.matmul(projected, tf.transpose(second, perm=[0, 2, 1]))
    scores = activations.tanh(scores)

    # Mean score along each axis of the score matrix.
    first_logits = K.mean(scores, axis=-1, keepdims=True)
    second_logits = K.mean(scores, axis=1, keepdims=True)

    # Exp-normalise each set of mean scores along its own axis.
    first_weights = K.exp(first_logits)
    second_weights = K.exp(second_logits)
    first_weights = first_weights / K.sum(first_weights, axis=1,
                                          keepdims=True)
    second_weights = second_weights / K.sum(second_weights, axis=-1,
                                            keepdims=True)

    # Weighted pooling of each input, then swap the last two axes back.
    pooled_first = tf.matmul(tf.transpose(first, perm=[0, 2, 1]),
                             first_weights)
    pooled_second = tf.matmul(tf.transpose(second, perm=[0, 2, 1]),
                              tf.transpose(second_weights, perm=[0, 2, 1]))
    output0 = tf.transpose(pooled_first, perm=[0, 2, 1])
    output1 = tf.transpose(pooled_second, perm=[0, 2, 1])

    return [output0, output1]
def multiway_soft_attention_alignment(input_1, input_2, max_len, dim):
    """Align text representation with neural soft attention.

    Four score tensors are computed between ``input_1`` and ``input_2``
    ("bilinear", "minus", "dot" and "concat"). Each is passed to
    ``weighted`` to obtain one aligned version of each input, and the four
    aligned versions per input are concatenated.

    Args:
        input_1: First sequence tensor.
        input_2: Second sequence tensor.
        max_len: Repeat count handed to ``RepeatVector``.
        dim: Feature size handed to ``RepeatVector`` as part of ``shape``.

    Returns:
        Tuple ``(in1_aligned, in2_aligned)``.
    """

    def align(scores):
        # Turn a score tensor into the pair of aligned inputs.
        return weighted([scores, input_1, input_2])

    def align_pairwise(pair_features):
        # Reduce per-pair features to one score with Dense(1), drop the
        # trailing singleton axis, then align.
        scores = Dense(1)(pair_features)
        scores = Lambda(lambda x: K.squeeze(x, axis=-1),
                        output_shape=squeeze_output_shape)(scores)
        return align(scores)

    # ----- Bilinear attention ----- #
    # Scores are the plain dot product over the last axis; no Dense
    # projection of input_2 is applied.
    bilinear_in1_aligned, bilinear_in2_aligned = align(
        Dot(axes=-1)([input_1, input_2]))

    # ----- Pairwise expansion shared by the remaining variants ----- #
    # NOTE(review): RepeatVector is called with `axis`/`shape` arguments, so
    # it is presumably a project-specific layer -- confirm its output layout.
    x1 = RepeatVector(n=max_len, axis=2, shape=[-1, max_len, dim])(input_1)
    x2 = RepeatVector(n=max_len, axis=1, shape=[-1, max_len, dim])(input_2)

    # ----- Minus attention ----- #
    minus_in1_aligned, minus_in2_aligned = align_pairwise(
        Subtract()([x1, x2]))

    # ----- Dot attention ----- #
    dot_in1_aligned, dot_in2_aligned = align_pairwise(
        Multiply()([x1, x2]))

    # ----- Concat attention ----- #
    # tanh of the element-wise sum, without a Dense projection of x1 / x2.
    summed = Lambda(lambda x: tanh(x),
                    output_shape=unchanged_shape)(Add()([x1, x2]))
    concat_in1_aligned, concat_in2_aligned = align_pairwise(summed)

    in1_aligned = Concatenate()([dot_in1_aligned,
                                 bilinear_in1_aligned,
                                 minus_in1_aligned,
                                 concat_in1_aligned])
    in2_aligned = Concatenate()([dot_in2_aligned,
                                 bilinear_in2_aligned,
                                 minus_in2_aligned,
                                 concat_in2_aligned])
    return in1_aligned, in2_aligned
def step(self, x_input, states):
    """Run one pointer-network step using the wrapped LSTM cell.

    Args:
        x_input: Input for this step, forwarded to ``self.cell.call``.
        states: Recurrent states; the last entry is the encoder sequence and
            the remaining entries are passed to the cell.

    Returns:
        ``(pointer, [h, c])`` where ``pointer`` is the softmax over the
        scores ``vt * tanh(W1*e + W2*d)``.
    """
    input_shape = self.input_spec[0].shape
    en_seq = states[-1]
    _, [h, c] = self.cell.call(x_input, states[:-1])

    # Tile the new hidden state so it lines up with the encoder sequence.
    dec_seq = K.repeat(h, input_shape[1])

    # vt * tanh(W1*e + W2*d)
    projected_encoder = _time_distributed_dense(en_seq, self.W1, output_dim=1)
    projected_decoder = _time_distributed_dense(dec_seq, self.W2, output_dim=1)
    energies = self.vt * tanh(projected_encoder + projected_decoder)
    energies = K.squeeze(energies, 2)

    # Turn the energies into a probability tensor.
    pointer = softmax(energies)
    return pointer, [h, c]
def step(self, x, states):
    """Compute the attention weights for one decoder step.

    Args:
        x: Step input; it is not read by the computation below.
        states: Pair ``[ytm, stm]``; only the hidden state ``stm`` is used.

    Returns:
        The attention weights, documented upstream as a
        (batchsize, timesteps, 1) tensor.
    """
    _, prev_state = states

    # Tile the hidden state over the sequence and project it with W_a.
    tiled_state = K.repeat(prev_state, self.timesteps)
    projected_state = K.dot(tiled_state, self.W_a)

    # Energies relating every timestep to the current state.
    energies = K.dot(activations.tanh(projected_state + self._uxpb),
                     K.expand_dims(self.V_a))

    # Normalise exp(energies) over axis 1.
    exp_energies = K.exp(energies)
    normaliser = K.repeat(K.sum(exp_energies, axis=1), self.timesteps)
    return exp_energies / normaliser
def keras_linear_kernel(args, normalize=True, tanh_activation=False):
    """
    Linear kernel: $k(x, y) = x^Ty$

    :param args: list of size 2 containing x and y
    :param normalize: if True, l2-normalize both inputs along the last axis
        before computing the kernel function
    :param tanh_activation: if True apply tanh activation to the output
    :return: The linear kernel between args[0] and args[1]
    """
    X, Y = args[0], args[1]

    if normalize:
        X = K.l2_normalize(X, axis=-1)
        Y = K.l2_normalize(Y, axis=-1)

    gram = K.dot(X, K.transpose(Y))
    return tanh(gram) if tanh_activation else gram
def step(self, x_input, states):
    """Run one pointer-network step on top of the parent LSTM step.

    Args:
        x_input: Input for this step, forwarded to the parent ``step``.
        states: Recurrent states; the last entry is the encoder sequence and
            the remaining entries are passed to the parent ``step``.

    Returns:
        ``(pointer, [h, c])`` where ``pointer`` is the softmax over the
        scores ``vt * tanh(W1*e + W2*d)``.
    """
    input_shape = self.input_spec[0].shape
    en_seq = states[-1]
    _, [h, c] = super(PointerLSTM, self).step(x_input, states[:-1])

    # Tile the new hidden state so it lines up with the encoder sequence.
    dec_seq = K.repeat(h, input_shape[1])

    # vt * tanh(W1*e + W2*d)
    Eij = time_distributed_dense(en_seq, self.W1, output_dim=1)
    Dij = time_distributed_dense(dec_seq, self.W2, output_dim=1)
    U = self.vt * tanh(Eij + Dij)
    U = K.squeeze(U, 2)

    # Turn the scores into a probability tensor.
    pointer = softmax(U)
    return pointer, [h, c]
def get_mixture_coef(self, out_tensor):
    """Parse the output tensor into mixture density coefficients.

    This uses eqns 18 -> 23 of http://arxiv.org/abs/1308.0850.

    Args:
        out_tensor: Tensor indexed on three axes. Along the last axis, the
            first 3 entries are pen-state logits, followed by six
            consecutive groups of ``M = self.hps['num_mixture']`` entries.

    Returns:
        List ``[z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr, z_pen,
        z_pen_logits]``.
    """
    M = self.hps['num_mixture']

    # Pen states occupy the first three channels.
    z_pen_logits = out_tensor[:, :, 0:3]

    def group(index):
        # `index`-th block of M channels after the pen logits.
        return out_tensor[:, :, (3 + M * index):(3 + M * (index + 1))]

    z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr = [
        group(index) for index in range(6)]

    # Softmax all the pi's and pen states.
    z_pi = softmax(z_pi)
    z_pen = softmax(z_pen_logits)

    # Exponentiate the sigmas and squash corr between -1 and 1.
    z_sigma1 = exponential(z_sigma1)
    z_sigma2 = exponential(z_sigma2)
    z_corr = tanh(z_corr)

    return [z_pi, z_mu1, z_mu2, z_sigma1, z_sigma2, z_corr,
            z_pen, z_pen_logits]
def keras_chi_square_CPD(args, epsilon=None, tanh_activation=True, normalize=False):
    """
    Chi square kernel (equivalent to `additive_chi2_kernel` in scikit-learn):
    $k(x, y) = -Sum [(x - y)^2 / (x + y)]$

    :param args: list of size 2 containing x and y
    :param epsilon: very small value to add to the denominator so that we do
        not have zeros there; nothing is added when it is None
    :param tanh_activation: if True apply tanh activation to the output
    :param normalize: if True, l2-normalize both inputs along the last axis
        before computing the kernel function
    :return: The chi square kernel between args[0] and args[1]
    """
    X, Y = args[0], args[1]

    if normalize:
        X = K.l2_normalize(X, axis=-1)
        Y = K.l2_normalize(Y, axis=-1)

    # X gets a new axis 1 (drawn, it looks like a wall) and Y a new axis 0
    # (a floor), so arithmetic between them pairs every row of X with every
    # row of Y.
    wall = K.expand_dims(X, axis=1)
    floor = K.expand_dims(Y, axis=0)

    squared_difference = K.square((wall - floor))
    total = wall + floor
    if epsilon is not None:
        total = total + epsilon

    # Whatever NaNs the division produced are handled by replace_nan.
    terms = replace_nan(squared_difference / total)
    kernel = -K.sum(terms, axis=2)

    return tanh(kernel) if tanh_activation else kernel
# Activation functions to plot, in order, each paired with its plot title.
for _act_fn, _plot_title in (
        (kact.linear, 'linear activation function'),
        (kact.relu, 'rectified linear (relu)'),
        (kact.sigmoid, 'sigmoid'),
        (kact.hard_sigmoid, 'hard sigmoid'),
        (kact.tanh, 'tanh'),
        (kact.softsign, 'softsign'),
):
    acttf = _act_fn(nettf)
    # TensorFlow tensors have no values until they are "run", so eval() is
    # called to convert them to numpy arrays before plotting.
    plt_act(nettf.eval(), acttf.eval(), _plot_title)

# close the TensorFlow session
session.close()

# done
print('Done!')
'celu': Lambda(lambda x: tf.nn.crelu(x) * 1.270926833152771), 'elu': Lambda(lambda x: elu(x) * 1.2716004848480225), 'gelu': Lambda(lambda x: gelu(x) * 1.7015043497085571), # 'glu': lambda x: jax.nn.glu(x) * 1.8484294414520264, 'leaky_relu': Lambda(lambda x: tf.nn.leaky_relu(x) * 1.70590341091156), 'log_sigmoid': Lambda(lambda x: tf.math.log(tf.nn.sigmoid(x)) * 1.9193484783172607), 'log_softmax': Lambda(lambda x: tf.math.log(tf.nn.softmax(x)) * 1.0002083778381348), 'relu': Lambda(lambda x: relu(x) * 1.7139588594436646), 'relu6': Lambda(lambda x: tf.nn.relu6(x) * 1.7131484746932983), 'selu': Lambda(lambda x: selu(x) * 1.0008515119552612), 'sigmoid': Lambda(lambda x: sigmoid(x) * 4.803835391998291), 'silu': Lambda(lambda x: tf.nn.silu(x) * 1.7881293296813965), 'soft_sign': Lambda(lambda x: tf.nn.softsign(x) * 2.338853120803833), 'softplus': Lambda(lambda x: softplus(x) * 1.9203323125839233), 'tanh': Lambda(lambda x: tanh(x) * 1.5939117670059204), }
def step_backward(X, states):
    """Run one backward-direction step of a tanh recurrent unit.

    This is a closure: ``self`` is taken from the enclosing scope.

    Args:
        X: Step input; its axis 2 is contracted with the encoder weights.
        states: List whose first entry is the previous state; its axis 2 is
            contracted with the recurrent weights.

    Returns:
        ``(new_state, [new_state])``.
    """
    input_term = tf.tensordot(X, self.encoder_weight_backward,
                              axes=[[2], [0]])
    recurrent_term = tf.tensordot(states[0], self.recurrent_weight_backward,
                                  axes=[[2], [0]])
    new_state = activations.tanh(input_term + recurrent_term)
    return new_state, [new_state]
def step(self, x, states):
    """Run one decoder step of the attention GRU with an embedded input.

    Args:
        x: Step input; it is not read by the computation below.
        states: ``[ytm, stm, timestep]``, plus ``previous_attention`` as a
            fourth entry when ``self.is_monotonic`` is truthy. ``ytm`` is
            cast to int32 and passed through ``self.embedding_sublayer``.

    Returns:
        ``(output, next_states)``. ``output`` is the attention weights when
        ``self.return_probabilities`` is truthy, otherwise the softmax
        output. ``next_states`` is ``[ys, st, timestep + 1]`` with the
        attention weights appended in the monotonic case.
    """
    if self.is_monotonic:
        ytm, stm, timestep, previous_attention = states
    else:
        ytm, stm, timestep = states

    # The previous output is stored as an index; embed it before use.
    ytm = self.embedding_sublayer(K.cast(ytm, 'int32'))

    # Dropout on the state and the embedded output, training phase only.
    rate = self.recurrent_dropout
    if rate is not None and 0. < rate < 1.:
        stm = K.in_train_phase(K.dropout(stm, rate), stm)
        ytm = K.in_train_phase(K.dropout(ytm, rate), ytm)

    # Attention energies and probabilities; the monotonic variant also
    # receives the previous attention.
    et = self._compute_energy(stm)
    if self.is_monotonic:
        probability_args = (et, previous_attention)
    else:
        probability_args = (et,)
    at = self._compute_probabilities(*probability_args)

    # Context vector: attention-weighted combination of self.x_seq.
    context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)

    def affine(state, w, u, c, b):
        # Shared pre-activation used by both gates, the proposal and the output.
        return K.dot(ytm, w) + K.dot(state, u) + K.dot(context, c) + b

    # ~~~> calculate new hidden state
    reset = activations.sigmoid(
        affine(stm, self.W_r, self.U_r, self.C_r, self.b_r))
    update = activations.sigmoid(
        affine(stm, self.W_z, self.U_z, self.C_z, self.b_z))
    proposal = activations.tanh(
        affine(reset * stm, self.W_p, self.U_p, self.C_p, self.b_p))
    st = (1 - update) * stm + update * proposal

    # Output distribution, computed from the new hidden state.
    yt = activations.softmax(
        affine(st, self.W_o, self.U_o, self.C_o, self.b_o))

    # Value fed back as the next "previous output": the argmax prediction,
    # replaced by the ground truth at this timestep during training when
    # teacher forcing is on.
    predicted = K.argmax(yt, axis=-1)
    if self.use_teacher_forcing:
        fed_back = K.in_train_phase(self.y_true[:, timestep[0]], predicted)
    else:
        fed_back = predicted
    ys = K.flatten(fed_back)

    output = at if self.return_probabilities else yt

    next_states = [ys, st, timestep + 1]
    if self.is_monotonic:
        next_states.append(at)
    return output, next_states
def step(self, x, states):
    """Run one decoder step of the attention GRU (equations 1-8).

    Args:
        x: Step input; it is not read by the computation below.
        states: Pair ``[ytm, stm]`` with the elements of the previous time
            step: the previous output and the previous hidden state.

    Returns:
        ``(output, [yt, st])``. ``output`` is the attention weights when
        ``self.return_probabilities`` is truthy (useful for visualisation),
        otherwise the softmax output ``yt``.
    """
    prev_y, prev_s = states

    # -- equation 1: alignment energies --------------------------------
    # Tile the hidden state over the sequence, multiply by W_a, add the
    # cached term self._uxpb and reduce with V_a.
    repeated_s = K.repeat(prev_s, self.timesteps)
    weighted_s = K.dot(repeated_s, self.W_a)
    energies = K.dot(activations.tanh(weighted_s + self._uxpb),
                     K.expand_dims(self.V_a))

    # -- equation 2: attention probabilities ---------------------------
    # exp-normalise over axis 1; documented upstream as a
    # (batchsize, timesteps, 1) tensor.
    numerators = K.exp(energies)
    denominators = K.repeat(K.sum(numerators, axis=1), self.timesteps)
    at = numerators / denominators

    # -- equation 3: context vector ------------------------------------
    context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)

    # -- equation 4: reset gate ----------------------------------------
    reset_input = (K.dot(prev_y, self.W_r) + K.dot(prev_s, self.U_r)
                   + K.dot(context, self.C_r) + self.b_r)
    rt = activations.sigmoid(reset_input)

    # -- equation 5: update gate ---------------------------------------
    update_input = (K.dot(prev_y, self.W_z) + K.dot(prev_s, self.U_z)
                    + K.dot(context, self.C_z) + self.b_z)
    zt = activations.sigmoid(update_input)

    # -- equation 6: proposal state ------------------------------------
    proposal_input = (K.dot(prev_y, self.W_p)
                      + K.dot((rt * prev_s), self.U_p)
                      + K.dot(context, self.C_p) + self.b_p)
    s_tp = activations.tanh(proposal_input)

    # -- equation 7: new hidden state ----------------------------------
    st = (1 - zt) * prev_s + zt * s_tp

    # -- equation 8: probability of each output class ------------------
    # NOTE(review): this uses the previous state rather than st -- confirm
    # this is intended.
    output_input = (K.dot(prev_y, self.W_o) + K.dot(prev_s, self.U_o)
                    + K.dot(context, self.C_o) + self.b_o)
    yt = activations.softmax(output_input)

    # Switch so that the attention can be returned for visualisations.
    if self.return_probabilities:
        return at, [yt, st]
    return yt, [yt, st]
def SummaRuNNer():
    """Build and compile the sentence-scoring Keras model.

    Reads its configuration from names defined outside this function
    (``args``, ``vocab``, ``vocab_sz``, ``word_embed_dim``, ``rel_segments``).
    For every sentence position it sums five Dense(2) scores (content,
    salience, novelty, absolute position, relative position) and applies a
    sigmoid.

    Returns:
        The compiled ``Model`` (SGD optimizer, binary cross-entropy loss,
        accuracy metric).
    """
    # initialize embedding layers
    embed_layer = TimeDistributed(
        Embedding(vocab_sz,
                  word_embed_dim,
                  embeddings_initializer=Constant(vocab.embedding_matrix),
                  input_length=args.sent_len,
                  trainable=False))
    abs_embed_layer = Embedding(args.doc_len,
                                args.pos_embed_dim,
                                input_length=1,
                                trainable=True)
    rel_embed_layer = Embedding(rel_segments,
                                args.pos_embed_dim,
                                input_length=1,
                                trainable=True)
    # input shape [bs, doc length, sentence length]
    doc_input = Input(shape=(int(args.doc_len), int(args.sent_len)),
                      name='doc_input')
    # word embedding
    word_emb_seq = embed_layer(doc_input)
    # bidirectional LSTM over the words of each sentence - returns sequences
    word_LSTM = TimeDistributed(
        Bidirectional(LSTM(args.hidden_sz, return_sequences=True)))
    enc_words = word_LSTM(word_emb_seq)
    # average the word encodings of each sentence into one vector
    avg_pooler = TimeDistributed(AveragePooling1D(args.sent_len))
    pooled_words = Reshape((args.doc_len, 2 * args.hidden_sz),
                           name='sent_pooler')(avg_pooler(enc_words))
    # run another LSTM over the pooled vectors so that each sentence is
    # represented by a single vector
    sent_LSTM = Bidirectional(LSTM(args.hidden_sz, return_sequences=True))
    enc_sents = sent_LSTM(pooled_words)
    # create single vector for document
    doc_pooler = AveragePooling1D(args.doc_len)
    doc = Flatten(name='flatten_doc')(doc_pooler(enc_sents))
    d = Dense(int(2 * args.hidden_sz), activation='tanh',
              name='dense_doc')(doc)
    # novelty tracker: starts as zeros shaped like d and accumulates
    # probability-weighted sentence encodings inside the loop below
    s = Lambda(lambda x: K.zeros_like(x), name='s_tensor')(d)  # [?, 2*h]
    probs = []
    # placeholder: a column of ones that is scaled by a position index below
    # to produce embedding lookups
    T = Lambda(lambda x: (K.ones_like(x[:, 0:1], name='T_tensor')))(s)
    # unstack the sentence encodings along axis 1 into doc_len tensors
    split_sentences = Lambda(
        lambda tensor, doc_len: tf.unstack(tensor, doc_len, 1),
        arguments={'doc_len': args.doc_len})(enc_sents)
    # run every sentence through classification layer and store probability
    for pos in range(len(split_sentences)):
        sent = Lambda(lambda sentences, pos: sentences[pos],
                      arguments={'pos': pos})(split_sentences)
        # run the absolute embedding
        abs_idx = Lambda(lambda T, pos: T * pos, arguments={'pos': pos})(T)
        abs_emb = Reshape((args.pos_embed_dim, ),
                          name='abs_' + str(pos))(abs_embed_layer(abs_idx))
        """ get relative position and run through relative embedding refers to a quantized representation that divides each document into a fixed number of segments and computes the segment ID of a given sentence. """
        rel_idx = math.floor(((pos + (rel_segments) / 2) / args.doc_len) * 10)
        # only works for rel_segments = 10
        # NOTE(review): when the floor above is 0 the lookup index below is
        # -1 (e.g. pos = 0 with rel_segments = 10 and doc_len > 50) --
        # confirm the embedding tolerates that.
        rel_idx = Lambda(lambda T, rel_idx: T * (rel_idx - 1),
                         arguments={'rel_idx': rel_idx})(T)
        rel_emb = Reshape((args.pos_embed_dim, ),
                          name='rel_' + str(pos))(rel_embed_layer(rel_idx))
        # classifier layer: five Dense(2) terms, summed under a sigmoid
        content = Dense(2, name='content_' + str(pos))(sent)
        # salience: element-wise product of the sentence and document vectors
        salience = Dense(2, name='salience_' + str(pos))(
            Lambda(lambda x: x[0] * x[1])([sent, d]))
        # novelty: element-wise product of the sentence and tanh of the
        # running summary s
        novelty = Dense(2, name='novelty_' + str(pos))(
            Lambda(lambda x: x[0] * tanh(x[1]))([sent, s]))
        abs_pos = Dense(2, name='abs_pos_' + str(pos))(abs_emb)
        rel_pos = Dense(2, name='rel_pos_' + str(pos))(rel_emb)
        p = Lambda(lambda x: sigmoid(x[0] + x[1] + x[2] + x[3] + x[4]))(
            [content, salience, novelty, abs_pos, rel_pos])
        probs.append(p)
        # extract just the probability of label = 1
        p1 = Lambda(lambda p: p[:, 1:])(p)
        # weighted summation of all sentence encodings until now
        # weight = probability that sentences was part of summary
        s = Lambda(lambda x: x[0] + (x[1] * x[2]))([s, p1, sent])
    # join the per-sentence outputs on the last axis and reshape to
    # (doc_len, 2) per sample
    output = Reshape((args.doc_len, 2),
                     name='prob_reshape')(concatenate(probs, -1))
    model = Model(inputs=doc_input, outputs=output)
    model.compile(optimizer='sgd',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
def call(self, a, states):
    """Run one time step of the stacked predictive cell.

    Args:
        a: Actual input for this step. It is replaced by the previous
            prediction while extrapolating.
        states: Flat list holding, in order, ``nb_layers`` cell states,
            ``nb_layers`` error tensors and ``nb_layers`` representation
            tensors. When ``self.extrap_start_time`` is not None it is
            followed by the previous prediction and the step counter.

    Returns:
        ``(output, states)``. ``output`` is the layer-0 prediction when
        ``self.output_mode == 'prediction'``, the per-layer mean errors when
        it is ``'error'``, and their concatenation otherwise. ``states`` is
        the new state list in the same layout as the input.
    """
    c_tm1 = states[:self.nb_layers]
    e_tm1 = states[self.nb_layers:2 * self.nb_layers]
    r_tm1 = states[2 * self.nb_layers:3 * self.nb_layers]

    if self.extrap_start_time is not None:
        t = states[-1]
        # The previous prediction will be treated as the actual if t between t_extrap_start and t_extrap_end
        # NOTE(review): the guard reads `extrap_start_time` while the window
        # uses `t_extrap_start` / `t_extrap_end` -- confirm all three
        # attributes are set consistently.
        a = K.switch(
            tf.logical_and(t >= self.t_extrap_start, t < self.t_extrap_end),
            states[-2], a)

    c = []
    r = []
    e = []

    # Update R units starting from the top
    for l in reversed(range(self.nb_layers)):
        inputs = [r_tm1[l], e_tm1[l]]
        if l < self.nb_layers - 1:
            # _r still holds the representation just computed for layer
            # l + 1 in the previous loop iteration (top-down feedback)
            inputs.append(_r)

        inputs = K.concatenate(inputs)
        i = self.layers['i'][l].call(inputs)
        f = self.layers['f'][l].call(inputs)
        o = self.layers['o'][l].call(inputs)
        _c = f * c_tm1[l] + i * self.layers['c'][l].call(inputs)
        # layer 0 skips the tanh on the cell state; all other layers apply it
        if l == 0:
            _r = o * _c
        else:
            _r = o * activations.tanh(_c)

        # insert at the front so the lists end up ordered by layer index
        c.insert(0, _c)
        r.insert(0, _r)

    # Update feed-forward path starting from the bottom
    for l in range(self.nb_layers):
        ahat = self.layers['ahat'][l].call(r[l])
        if l == 0:
            prediction = ahat

        # compute errors: rectified positive and negative parts, concatenated
        e_up = activations.relu(ahat - a)
        e_down = activations.relu(a - ahat)
        e.append(K.concatenate([e_up, e_down]))

        if l < self.nb_layers - 1:
            # the target for the next layer is computed from this layer's error
            a = self.layers['a'][l].call(e[l])

    if self.output_mode == 'prediction':
        output = prediction
    else:
        # one mean error per layer, concatenated in layer order
        for l in range(self.nb_layers):
            layer_error = K.mean(K.batch_flatten(e[l]), axis=-1, keepdims=True)
            all_error = layer_error if l == 0 else K.concatenate(
                [all_error, layer_error])
        if self.output_mode == 'error':
            output = all_error
        else:
            output = K.concatenate([prediction, all_error])

    # same ordering as the slices read at the top: cells, errors, representations
    states = c + e + r
    if self.extrap_start_time is not None:
        states += [prediction, t + 1]
    return output, states
def call(self, inputs, states, training=None):
    """Run one step of the attention LSTM cell.

    Attention weights over ``self._seq_input`` are computed from the
    previous hidden state, and a context vector is derived from them. With
    ``self.implementation == 1`` the context vector feeds the LSTM gates.

    Args:
        inputs: Step input.
        states: ``[h_tm1, c_tm1]`` -- previous memory state and previous
            carry state.
        training: Forwarded to the dropout mask generator; when it is None
            and any dropout is active, the output is flagged as using the
            learning phase.

    Returns:
        ``(h, [h, c])``. The attention weights of this step are also
        appended to ``self.attention_weight``.
    """
    # Dropout masks are created once and cached on the instance.
    # NOTE(review): the input mask is generated from `inputs` but, in the
    # implementation == 1 branch, applied to `context` -- confirm both have
    # the same last dimension.
    if 0 < self.dropout < 1 and self._dropout_mask is None:
        self._dropout_mask = _generate_dropout_mask(
            K.ones_like(inputs),
            self.dropout,
            training=training,
            count=4)
    if (0 < self.recurrent_dropout < 1 and
            self._recurrent_dropout_mask is None):
        self._recurrent_dropout_mask = _generate_dropout_mask(
            K.ones_like(states[0]),
            self.recurrent_dropout,
            training=training,
            count=4)

    # dropout matrices for input units
    dp_mask = self._dropout_mask
    # dropout matrices for recurrent units
    rec_dp_mask = self._recurrent_dropout_mask

    h_tm1 = states[0]  # previous memory state
    c_tm1 = states[1]  # previous carry state

    # repeat the hidden state to the length of the sequence
    _htm1 = K.repeat(h_tm1, self.seq_len)
    _Whtm1 = K.dot(_htm1, self.W_a)
    _Uinpt = K.dot(self._seq_input, self.U_a)

    # calculate the attention probabilities (exp-normalised over axis 1)
    et = K.dot(activations.tanh(_Whtm1 + _Uinpt), K.expand_dims(self.V_a))
    at = K.exp(et)
    at_sum = K.sum(at, axis=1)
    at_sum_repeated = K.repeat(at_sum, self.seq_len)
    at /= at_sum_repeated  # (batch_size, seq_len, 1)

    # calculate the context vector
    context = K.squeeze(K.batch_dot(at, self._seq_input, axes=1), axis=1)

    if self.implementation == 1:
        # per-gate kernels; the gates read the context vector, not `inputs`
        if 0 < self.dropout < 1.:
            inputs_i = context * dp_mask[0]
            inputs_f = context * dp_mask[1]
            inputs_c = context * dp_mask[2]
            inputs_o = context * dp_mask[3]
        else:
            inputs_i = context
            inputs_f = context
            inputs_c = context
            inputs_o = context
        x_i = K.dot(inputs_i, self.kernel_i)
        x_f = K.dot(inputs_f, self.kernel_f)
        x_c = K.dot(inputs_c, self.kernel_c)
        x_o = K.dot(inputs_o, self.kernel_o)
        if self.use_bias:
            x_i = K.bias_add(x_i, self.bias_i)
            x_f = K.bias_add(x_f, self.bias_f)
            x_c = K.bias_add(x_c, self.bias_c)
            x_o = K.bias_add(x_o, self.bias_o)

        if 0 < self.recurrent_dropout < 1.:
            h_tm1_i = h_tm1 * rec_dp_mask[0]
            h_tm1_f = h_tm1 * rec_dp_mask[1]
            h_tm1_c = h_tm1 * rec_dp_mask[2]
            h_tm1_o = h_tm1 * rec_dp_mask[3]
        else:
            h_tm1_i = h_tm1
            h_tm1_f = h_tm1
            h_tm1_c = h_tm1
            h_tm1_o = h_tm1
        i = self.recurrent_activation(x_i + K.dot(h_tm1_i,
                                                  self.recurrent_kernel_i))
        f = self.recurrent_activation(x_f + K.dot(h_tm1_f,
                                                  self.recurrent_kernel_f))
        c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1_c,
                                                        self.recurrent_kernel_c))
        o = self.recurrent_activation(x_o + K.dot(h_tm1_o,
                                                  self.recurrent_kernel_o))
    else:
        # fused kernel: one projection, sliced into the four gate blocks
        # NOTE(review): this branch uses `inputs` and never reads `context`,
        # unlike the implementation == 1 branch -- confirm this is intended.
        if 0. < self.dropout < 1.:
            inputs *= dp_mask[0]
        z = K.dot(inputs, self.kernel)
        if 0. < self.recurrent_dropout < 1.:
            h_tm1 *= rec_dp_mask[0]
        z += K.dot(h_tm1, self.recurrent_kernel)
        if self.use_bias:
            z = K.bias_add(z, self.bias)

        z0 = z[:, :self.units]
        z1 = z[:, self.units: 2 * self.units]
        z2 = z[:, 2 * self.units: 3 * self.units]
        z3 = z[:, 3 * self.units:]

        i = self.recurrent_activation(z0)
        f = self.recurrent_activation(z1)
        c = f * c_tm1 + i * self.activation(z2)
        o = self.recurrent_activation(z3)

    h = o * self.activation(c)
    if 0 < self.dropout + self.recurrent_dropout:
        if training is None:
            h._uses_learning_phase = True

    # side effect: the list gains one entry every time this method runs
    self.attention_weight.append(at)
    return h, [h, c]
def _split_and_apply_activations(self, controller_output):
    """Split the controller output and activate each part.

    The controller output is split along axis 1 into the NTM output, the
    read-head instructions and the write-head instructions. Each head's
    instructions are split further into the addressing fields
    ``(k, beta, g, shift, gamma)`` with widths ``(m_depth, 1, 1, 3, 1)``;
    write heads additionally carry an erase vector and an add vector, each
    of width ``m_depth``.

    Activations applied:
        * NTM output: ``self.activation``
        * ``k`` (key) and the add vector: tanh
        * ``beta`` (content-based similarity): softplus
        * ``g`` (interpolation) and the erase vector: sigmoid
        * ``shift`` (shift filter): softmax
        * ``gamma`` (focus sharpening): ``1 + softplus``

    An earlier variant used ``hard_sigmoid(beta) + 0.5``,
    ``1 + 9 * sigmoid(gamma)`` and ``hard_sigmoid`` for the erase vector.

    Args:
        controller_output: Tensor that is split along axis 1.

    Returns:
        Triple ``(ntm_output, controller_instructions_read,
        controller_instructions_write)``. The two instruction entries are
        lists with one tuple of activated fields per head.
    """
    # --- splitting ---------------------------------------------------
    section_sizes = np.asarray([
        self.output_dim,
        self.read_heads * self.controller_read_head_emitting_dim,
        self.write_heads * self.controller_write_head_emitting_dim])
    ntm_output, read_section, write_section = tf.split(
        controller_output, section_sizes, axis=1)

    # One chunk per head ...
    per_read_head = tf.split(read_section, self.read_heads, axis=1)
    per_write_head = tf.split(write_section, self.write_heads, axis=1)

    # ... and one tensor per field within each head.
    read_fields = [
        tf.split(head, np.asarray([self.m_depth, 1, 1, 3, 1]), axis=1)
        for head in per_read_head]
    write_fields = [
        tf.split(head,
                 np.asarray([self.m_depth, 1, 1, 3, 1,
                             self.m_depth, self.m_depth]),
                 axis=1)
        for head in per_write_head]

    # --- activation --------------------------------------------------
    ntm_output = self.activation(ntm_output)

    def activate_addressing(k, beta, g, shift, gamma):
        # Activations shared by read and write heads.
        return (
            tanh(k),  # key
            softplus(beta),  # beta, content based similarity
            sigmoid(g),  # interpolation
            softmax(shift),  # shift filter
            1 + softplus(gamma),  # gamma, focus sharpening
        )

    controller_instructions_read = [
        activate_addressing(k, beta, g, shift, gamma)
        for (k, beta, g, shift, gamma) in read_fields]

    controller_instructions_write = [
        activate_addressing(k, beta, g, shift, gamma) + (
            sigmoid(erase_vector),  # erase
            tanh(add_vector),  # add
        )
        for (k, beta, g, shift, gamma, erase_vector, add_vector)
        in write_fields]

    return (ntm_output, controller_instructions_read,
            controller_instructions_write)
def step(self, x, states):
    """Run one decoder step of the attention GRU with optional causal masking.

    Args:
        x: Step input; it is not read by the computation below.
        states: Triple ``[ytm, stm, t]`` with the previous output, the
            previous hidden state and the decoding step counter.

    Returns:
        ``(output, [yt, st, t])`` with the counter advanced by one.
        ``output`` is the attention weights when
        ``self.return_probabilities`` is truthy, otherwise ``yt``.
    """
    ytm, stm, t = states

    # Tile the hidden state over the sequence, multiply by W_a, add the
    # cached term self._uxpb and reduce with V_a to get the energies.
    tiled_state = K.repeat(stm, self.timesteps)
    projected_state = K.dot(tiled_state, self.W_a)
    et = K.dot(activations.tanh(projected_state + self._uxpb),
               K.expand_dims(self.V_a))

    if self.causal:
        # Positions later than t receive a very large negative energy so the
        # softmax assigns them (practically) zero weight.
        mask = K.cast(K.greater(self._input_t, t), 'float32') * -10e9
        if self.use_attention_horizon:
            # Positions older than the attention horizon are masked too.
            beyond_horizon = K.less(self._input_t, t - self.attn_horizon)
            mask = mask + K.cast(beyond_horizon, 'float32') * -10e9
        et = et + K.expand_dims(K.expand_dims(mask, -1), 0)

    at = K.softmax(et, axis=1)

    # Context vector: attention-weighted combination of self.x_seq.
    context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)

    def affine(state, w, u, c, b):
        # Shared pre-activation used by both gates, the proposal and the output.
        return K.dot(ytm, w) + K.dot(state, u) + K.dot(context, c) + b

    # ~~~> calculate new hidden state
    reset = activations.sigmoid(
        affine(stm, self.W_r, self.U_r, self.C_r, self.b_r))
    update = activations.sigmoid(
        affine(stm, self.W_z, self.U_z, self.C_z, self.b_z))
    proposal = activations.tanh(
        affine(reset * stm, self.W_p, self.U_p, self.C_p, self.b_p))
    st = (1 - update) * stm + update * proposal

    # NOTE(review): the output is computed from the *previous* state, not
    # from st -- confirm this is intended.
    yt = self.activation(
        affine(stm, self.W_o, self.U_o, self.C_o, self.b_o))

    t += 1

    output = at if self.return_probabilities else yt
    return output, [yt, st, t]
def step(self, x, states):
    """Run one decoder step of the attention GRU.

    Args:
        x: Step input; it is not read by the computation below.
        states: Pair ``[ytm, stm]`` with the previous output and the
            previous hidden state.

    Returns:
        ``(output, [yt, st])``. ``output`` is the attention weights when
        ``self.return_probabilities`` is truthy, otherwise the softmax
        output ``yt``.
    """
    last_y, last_s = states

    # Attention energies: the hidden state is tiled over the sequence,
    # multiplied by W_a, combined with the cached term self._uxpb and
    # reduced with V_a.
    energies = K.dot(
        activations.tanh(
            K.dot(K.repeat(last_s, self.timesteps), self.W_a) + self._uxpb),
        K.expand_dims(self.V_a))

    # Attention probabilities: exp-normalise over axis 1 (documented
    # upstream as a (batchsize, timesteps, 1) tensor).
    unnormalised = K.exp(energies)
    totals = K.sum(unnormalised, axis=1)
    at = unnormalised / K.repeat(totals, self.timesteps)

    # Context vector: attention-weighted combination of self.x_seq.
    context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)

    def pre_activation(hidden, w_y, w_s, w_c, bias):
        # Linear combination of previous output, a hidden term and the context.
        return (K.dot(last_y, w_y) + K.dot(hidden, w_s)
                + K.dot(context, w_c) + bias)

    # "r" gate, "z" gate and proposal hidden state.
    rt = activations.sigmoid(
        pre_activation(last_s, self.W_r, self.U_r, self.C_r, self.b_r))
    zt = activations.sigmoid(
        pre_activation(last_s, self.W_z, self.U_z, self.C_z, self.b_z))
    s_tp = activations.tanh(
        pre_activation(rt * last_s, self.W_p, self.U_p, self.C_p, self.b_p))

    # New hidden state.
    st = (1 - zt) * last_s + zt * s_tp

    # NOTE(review): the output is computed from the *previous* state, not
    # from st -- confirm this is intended.
    yt = activations.softmax(
        pre_activation(last_s, self.W_o, self.U_o, self.C_o, self.b_o))

    if self.return_probabilities:
        return at, [yt, st]
    return yt, [yt, st]
#dec_seq = Reshape((-1,1,latent_dim))(dec_seq) #dec_seq = K.squeeze(dec_seq,0) print ("dec_seq") print (dec_seq) blendW1 = TimeDistributed(Dense(latent_dim))(en_seq) #blendW1 = TimeDistributed(Dense(latent_dim)(en_seq) #?,input_seq_length,latent_dim print ('blendW1') print (blendW1) #blendW2 = TimeDistributed(Dense(latent_dim),ouput_dim=1)(dec_seq) blendW2 = TimeDistributed(Dense(latent_dim))(dec_seq) print ('blendW2') print (blendW2) blend3 = tanh(blendW1+blendW2) print ("blend3") print (blend3) #blend3 = K.squeeze(blend3,0) #print ("blend3 squeezed") #print (blend3) U = dot([blend3,vt],(0,1)) print ('U') print (U) U = K.squeeze(U, 0) print ('U squeezed') print (U) # make probability tensor decoder_dense = Dense(num_encoder_tokens, activation='softmax') outputs = decoder_dense(U)
def call(self, inputs, states, training=None):
    """Run one step of the attention-augmented LSTM cell.

    A context vector is computed by attending over ``self.annotations``
    using the previous hidden state, concatenated to ``inputs``, and the
    result is fed through LSTM gate equations.

    Arguments:
        inputs: input tensor for the current step.
        states: sequence whose first two entries are the previous hidden
            state and the previous carry state.
        training: forwarded to the dropout-mask generator; when it is
            ``None`` and any dropout is active, the output is flagged with
            ``_uses_learning_phase``.

    Returns:
        ``(h, [h, c])``: the new hidden state and the new state list.
    """
    # Dropout masks are created once and cached on the cell.  The input
    # mask is sized for the input *plus* the context vector that gets
    # concatenated below.
    if 0 < self.dropout < 1 and self._dropout_mask is None:
        self._dropout_mask = _generate_dropout_mask(
            _generate_dropout_ones(
                inputs, K.shape(inputs)[-1] + self.annotation_units),
            self.dropout,
            training=training,
            count=4)
    if (0 < self.recurrent_dropout < 1 and
            self._recurrent_dropout_mask is None):
        self._recurrent_dropout_mask = _generate_dropout_mask(
            _generate_dropout_ones(inputs, self.units),
            self.recurrent_dropout,
            training=training,
            count=4)

    # dropout matrices for input units
    dp_mask = self._dropout_mask
    # dropout matrices for recurrent units
    rec_dp_mask = self._recurrent_dropout_mask

    h_tm1 = states[0]  # previous memory state
    c_tm1 = states[1]  # previous carry state

    # attention mechanism

    # repeat the hidden state to the length of the sequence
    _stm = K.repeat(h_tm1, self.annotation_timesteps)

    # multiply the weight matrix with the repeated (current) hidden state
    _Wxstm = K.dot(_stm, self.kernel_w)

    # calculate the attention scores
    et = K.dot(activations.tanh(_Wxstm + self._uh),
               K.expand_dims(self.kernel_v))

    # Softmax over the annotation axis.  Subtracting the per-sample
    # maximum first leaves the result mathematically unchanged but keeps
    # ``exp`` from overflowing (inf / inf -> NaN) when scores are large.
    et = et - K.max(et, axis=1, keepdims=True)
    at = K.exp(et)
    at_sum = K.sum(at, axis=1)
    at_sum_repeated = K.repeat(at_sum, self.annotation_timesteps)
    at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)

    # calculate the context vector
    context = K.squeeze(K.batch_dot(at, self.annotations, axes=1), axis=1)

    # append the context vector to the inputs
    inputs = K.concatenate([inputs, context])

    if self.implementation == 1:
        # One matrix product per gate, each with its own dropout mask.
        if 0 < self.dropout < 1.:
            inputs_i = inputs * dp_mask[0]
            inputs_f = inputs * dp_mask[1]
            inputs_c = inputs * dp_mask[2]
            inputs_o = inputs * dp_mask[3]
        else:
            inputs_i = inputs
            inputs_f = inputs
            inputs_c = inputs
            inputs_o = inputs
        x_i = K.dot(inputs_i, self.kernel_i)
        x_f = K.dot(inputs_f, self.kernel_f)
        x_c = K.dot(inputs_c, self.kernel_c)
        x_o = K.dot(inputs_o, self.kernel_o)
        if self.use_bias:
            x_i = K.bias_add(x_i, self.bias_i)
            x_f = K.bias_add(x_f, self.bias_f)
            x_c = K.bias_add(x_c, self.bias_c)
            x_o = K.bias_add(x_o, self.bias_o)

        if 0 < self.recurrent_dropout < 1.:
            h_tm1_i = h_tm1 * rec_dp_mask[0]
            h_tm1_f = h_tm1 * rec_dp_mask[1]
            h_tm1_c = h_tm1 * rec_dp_mask[2]
            h_tm1_o = h_tm1 * rec_dp_mask[3]
        else:
            h_tm1_i = h_tm1
            h_tm1_f = h_tm1
            h_tm1_c = h_tm1
            h_tm1_o = h_tm1
        i = self.recurrent_activation(
            x_i + K.dot(h_tm1_i, self.recurrent_kernel_i))
        f = self.recurrent_activation(
            x_f + K.dot(h_tm1_f, self.recurrent_kernel_f))
        c = f * c_tm1 + i * self.activation(
            x_c + K.dot(h_tm1_c, self.recurrent_kernel_c))
        o = self.recurrent_activation(
            x_o + K.dot(h_tm1_o, self.recurrent_kernel_o))
    else:
        # Fused path: one product against the stacked kernels, then the
        # result is sliced into the four gate pre-activations.  Only the
        # first dropout mask is used here.
        if 0. < self.dropout < 1.:
            inputs *= dp_mask[0]
        z = K.dot(inputs, self.kernel)
        if 0. < self.recurrent_dropout < 1.:
            h_tm1 *= rec_dp_mask[0]
        z += K.dot(h_tm1, self.recurrent_kernel)
        if self.use_bias:
            z = K.bias_add(z, self.bias)

        z0 = z[:, :self.units]
        z1 = z[:, self.units: 2 * self.units]
        z2 = z[:, 2 * self.units: 3 * self.units]
        z3 = z[:, 3 * self.units:]

        i = self.recurrent_activation(z0)
        f = self.recurrent_activation(z1)
        c = f * c_tm1 + i * self.activation(z2)
        o = self.recurrent_activation(z3)

    h = o * self.activation(c)
    if 0 < self.dropout + self.recurrent_dropout:
        if training is None:
            # Mark the output as learning-phase dependent when the caller
            # did not specify the phase explicitly.
            h._uses_learning_phase = True
    return h, [h, c]
def rating(Goz, discriminator):
    """Return ``tanh`` of the generator loss for ``Goz``.

    ``Goz`` is passed through ``discriminator``, the result is handed to
    ``AGAN.loss_G``, and the loss is squashed with ``tanh``.
    """
    return tanh(AGAN.loss_G(discriminator(Goz)))