def preprocess_input(self, x):
    '''
    Pre-compute the GRU gate input projections while keeping attention aside.

    This preprocessing step is overridden because, when
    ``self.consume_less == 'cpu'``, the weight-input multiplications in the
    internals of the GRU are done as separate, smaller matrix
    multiplications whose results are concatenated afterwards. The
    attention values live in slot 0 of the last axis of ``x``; they are
    split off before the projections and put back in their original place
    afterwards.

    For any other ``consume_less`` setting ``x`` is returned untouched.
    '''
    if self.consume_less != 'cpu':
        return x

    # Slot 0 of the last axis is the attention, the remainder is the input.
    attention = x[:, :, 0]  # Shape: (samples, knowledge_length)
    inputs = x[:, :, 1:]  # Shape: (samples, knowledge_length, word_dim)

    spec_shape = self.input_spec[0].shape
    # The attention slot is not an input feature, hence the "- 1".
    input_dim = spec_shape[2] - 1
    timesteps = spec_shape[1]

    # One projection per gate, in z, r, h order.
    gate_params = (
        (self.W_z, self.b_z),
        (self.W_r, self.b_r),
        (self.W_h, self.b_h),
    )
    projections = [
        time_distributed_dense(inputs, weights, bias, self.dropout_W,
                               input_dim, self.output_dim, timesteps)
        for weights, bias in gate_params
    ]

    # Add attention back on to its original place (front of axis 2).
    return K.concatenate([K.expand_dims(attention, 2)] + projections, axis=2)
def preprocess_input(self, x, train=False):
    '''
    Pre-compute the four gate input projections when in 'cpu' mode.

    When ``self.consume_less`` is not ``'cpu'`` the input is returned
    unchanged. Otherwise ``x`` goes through ``time_distributed_dense`` once
    per gate (i, f, c, o, in that order) and the four results are
    concatenated along axis 2.

    ``self.dropout_W`` is only forwarded as the dropout argument when
    ``train`` is truthy and the rate lies strictly between 0 and 1;
    in every other case 0 is forwarded.
    '''
    if self.consume_less != 'cpu':
        return x

    dropout = self.dropout_W if train and (0 < self.dropout_W < 1) else 0

    spec_shape = self.input_spec[0].shape
    input_dim = spec_shape[2]
    timesteps = spec_shape[1]

    gate_params = (
        (self.W_i, self.b_i),
        (self.W_f, self.b_f),
        (self.W_c, self.b_c),
        (self.W_o, self.b_o),
    )
    projections = [
        time_distributed_dense(x, weights, bias, dropout,
                               input_dim, self.output_dim, timesteps)
        for weights, bias in gate_params
    ]
    return K.concatenate(projections, axis=2)
def call(self, x, mask=None):
    '''
    Collapse the time axis of ``x`` with learned attention weights.

    x: batch_size * time_steps * input_dim
    mask: accepted for interface compatibility; not used in this method.

    Returns the attention-weighted sum over axis 1 (batch_size * input_dim),
    passed through ``self.element_wise_output_transformer`` when that
    attribute is truthy.
    '''
    check_and_throw_if_fail(K.ndim(x) == 3, "x")
    time_steps = shape(x)[1]
    input_dim = shape(x)[2]

    # batch_size, time_steps, attention_weight_vector_dim
    hidden = time_distributed_dense(
        x, self.Ws, self.bs,
        input_dim=input_dim,
        output_dim=self.attention_weight_vector_dim,
        timesteps=time_steps)
    ui = K.tanh(hidden)

    # batch_size, time_steps, 1
    scores = time_distributed_dense(
        ui, K.expand_dims(self.us, 1),
        input_dim=self.attention_weight_vector_dim,
        output_dim=1,
        timesteps=time_steps)
    exp_scores = K.exp(scores)

    # Normalise over the time axis: the denominator is batch_size, 1, 1.
    normaliser = K.sum(exp_scores, 1, keepdims=True)
    weights = exp_scores / normaliser  # batch_size * time_steps * 1

    # batch_size * time_steps * input_dim -> batch_size * input_dim
    output = K.sum(weights * x, 1)

    if not self.element_wise_output_transformer:
        return output
    return self.element_wise_output_transformer(output)
def preprocess_input(self, x):
    '''
    Pre-compute the f and h input projections when in 'cpu' mode.

    With ``self.consume_less == 'cpu'`` the input is run through
    ``time_distributed_dense`` once with (W_f, b_f) and once with
    (W_h, b_h); the two results are concatenated along axis 2. The input
    dimension and number of timesteps are read from ``K.int_shape(x)``.

    For any other ``consume_less`` value ``x`` is returned as is.
    '''
    if self.consume_less != 'cpu':
        return x

    static_shape = K.int_shape(x)
    input_dim = static_shape[2]
    timesteps = static_shape[1]

    projections = [
        time_distributed_dense(x, weights, bias, self.dropout_W,
                               input_dim, self.output_dim, timesteps)
        for weights, bias in ((self.W_f, self.b_f), (self.W_h, self.b_h))
    ]
    return K.concatenate(projections, axis=2)
def step(self, x_input, states):
    '''
    Run one decoder step and return a pointer distribution.

    The last entry of ``states`` is taken as the encoder sequence; the
    remaining entries are handed to the parent ``step`` to obtain the new
    ``h`` and ``c``. The parent's own output is discarded.

    Returns ``(pointer, [h, c])`` where ``pointer`` is
    ``softmax(vt * tanh(W1 * e + W2 * d))`` after squeezing axis 2.
    '''
    spec_shape = self.input_spec[0].shape
    encoder_seq = states[-1]
    lstm_states = states[:-1]

    _, (h, c) = super(PointerLSTM, self).step(x_input, lstm_states)

    # vt * tanh(W1 * e + W2 * d)
    # h is repeated spec_shape[1] times so it can be added to the encoder
    # projection -- presumably one copy per encoder timestep (TODO confirm).
    decoder_seq = K.repeat(h, spec_shape[1])
    encoder_term = time_distributed_dense(encoder_seq, self.W1, output_dim=1)
    decoder_term = time_distributed_dense(decoder_seq, self.W2, output_dim=1)
    scores = K.squeeze(self.vt * tanh(encoder_term + decoder_term), 2)

    # Turn the scores into a probability tensor.
    pointer = softmax(scores)
    return pointer, [h, c]
def preprocess_input(self, x):
    '''
    Apply the input projection ahead of the recurrence when in 'cpu' mode.

    With ``self.consume_less == 'cpu'`` the result of
    ``time_distributed_dense`` on ``x`` with ``self.W`` / ``self.b`` is
    returned; the input dimension and timestep count come from
    ``self.input_spec[0].shape``. Any other mode returns ``x`` unchanged.
    '''
    if self.consume_less != 'cpu':
        return x

    spec_shape = self.input_spec[0].shape
    input_dim, timesteps = spec_shape[2], spec_shape[1]
    return time_distributed_dense(x, self.W, self.b, self.dropout_W,
                                  input_dim, self.output_dim, timesteps)
def preprocess_input(self, x):
    '''
    Apply the input projection ahead of the recurrence when in 'cpu' mode.

    With ``self.consume_less == 'cpu'`` the input is passed through
    ``time_distributed_dense`` using ``self.W`` / ``self.b``, with
    ``self.hidden_recurrent_dim`` as the output dimension. The input
    dimension and timestep count are read from ``K.int_shape(x)``.
    Any other mode returns ``x`` unchanged.
    '''
    if self.consume_less != 'cpu':
        return x

    static_shape = K.int_shape(x)
    timesteps = static_shape[1]
    input_dim = static_shape[2]
    projected = time_distributed_dense(
        x, self.W, self.b, self.dropout_W,
        input_dim, self.hidden_recurrent_dim, timesteps)
    return projected
def preprocess_input(self, x):
    '''
    Project the whole input sequence up front when ``consume_less == 'cpu'``.

    In 'cpu' mode ``x`` is fed to ``time_distributed_dense`` together with
    ``self.W``, ``self.b`` and ``self.dropout_W``; dimensions are taken from
    ``self.input_spec[0].shape`` (index 2 is the input dimension, index 1
    the number of timesteps). In every other mode ``x`` is passed through.
    '''
    cpu_mode = self.consume_less == 'cpu'
    if not cpu_mode:
        return x

    declared_shape = self.input_spec[0].shape
    timesteps = declared_shape[1]
    input_dim = declared_shape[2]
    return time_distributed_dense(
        x, self.W, self.b, self.dropout_W,
        input_dim, self.output_dim, timesteps)
def step(self, x_input, states):
    '''
    Run one decoder step and return a pointer distribution.

    The last entry of ``states`` is taken as the encoder sequence; the
    remaining entries are handed to the parent ``step`` to obtain the new
    ``h`` and ``c``. The parent's own output is discarded.

    Returns ``(pointer, [h, c])`` where ``pointer`` is
    ``softmax(vt * tanh(W1 * e + W2 * d))`` after squeezing axis 2.

    Debug ``print`` calls that used to dump the intermediate tensors to
    stdout have been removed; they had no effect on the returned values.
    '''
    input_shape = self.input_spec[0].shape
    en_seq = states[-1]
    _, [h, c] = super(PointerLSTM, self).step(x_input, states[:-1])

    # vt * tanh(W1 * e + W2 * d)
    # h is repeated input_shape[1] times so it can be added to the encoder
    # projection -- presumably one copy per encoder timestep (TODO confirm).
    dec_seq = K.repeat(h, input_shape[1])
    Eij = time_distributed_dense(en_seq, self.W1, output_dim=1)
    Dij = time_distributed_dense(dec_seq, self.W2, output_dim=1)
    U = self.vt * tanh(Eij + Dij)
    U = K.squeeze(U, 2)

    # Turn the scores into a probability tensor.
    pointer = softmax(U)
    return pointer, [h, c]
def step(self, x, states):
    '''
    Compute an attention context over ``H`` and feed it to the parent step.

    ``states`` must unpack into exactly six entries; only the second
    (``c_tm1``) and the last (``H``) are used here. The ``x`` argument is
    not used: the parent ``step`` receives the attention-weighted sum of
    ``H`` instead, together with all states except the last one.

    Returns the ``(y, new_states)`` pair produced by the parent ``step``.
    '''
    _, c_tm1, _, _, _, H = states

    # Project the previous cell state and repeat it input_length times so
    # it can be added to H.
    query = K.dot(c_tm1, self.W_h) + self.b_h
    query = K.repeat(query, self.input_length)

    # One score per position, normalised with softmax.
    energy = K.squeeze(time_distributed_dense(query + H, self.W_a, self.b_a), 2)
    alpha = K.softmax(energy)

    # Repeat the weights input_dim times and swap the last two axes so they
    # can be multiplied element-wise with H.
    alpha = K.permute_dimensions(K.repeat(alpha, self.input_dim), (0, 2, 1))

    context = K.sum(H * alpha, axis=1)
    y, new_states = super(AttentionDecoder, self).step(context, states[:-1])
    return y, new_states
def preprocess_input(self, x):
    '''
    Pre-process ``x`` and prepend ``self.X`` along the last axis.

    In 'cpu' mode the z, r and h input projections are computed with
    ``time_distributed_dense`` and concatenated along axis 2; in any other
    mode ``x`` itself is used. Either way the result is concatenated after
    ``self.X`` on axis -1 and returned.

    NOTE: resetting the stack and pointer when new data arrives was
    sketched here at some point but is currently disabled.
    '''
    processed = x
    if self.consume_less == 'cpu':
        static_shape = K.int_shape(x)
        input_dim = static_shape[2]
        timesteps = static_shape[1]

        gate_params = (
            (self.W_z, self.b_z),
            (self.W_r, self.b_r),
            (self.W_h, self.b_h),
        )
        projections = [
            time_distributed_dense(x, weights, bias, self.dropout_W,
                                   input_dim, self.output_dim, timesteps)
            for weights, bias in gate_params
        ]
        processed = K.concatenate(projections, axis=2)

    return K.concatenate([self.X, processed], axis=-1)
def step(self, x, states):
    '''
    One decoding step: attend over ``H``, then delegate to the parent step.

    ``states`` is unpacked into six entries, of which only ``c_tm1``
    (second) and ``H`` (last) are read. ``x`` is ignored; the parent
    ``step`` is called with the attention-weighted sum of ``H`` and with
    ``states[:-1]``.

    Returns the parent's ``(y, new_states)``.
    '''
    _h_tm1, c_tm1, _y_tm1, _B, _U, H = states

    # Previous cell state, projected and repeated input_length times.
    projected_cell = K.repeat(K.dot(c_tm1, self.W_h) + self.b_h,
                              self.input_length)

    # Attention scores, squeezed on axis 2 and normalised with softmax.
    scores = time_distributed_dense(projected_cell + H, self.W_a, self.b_a)
    attention = K.softmax(K.squeeze(scores, 2))

    # Repeat input_dim times and permute to (0, 2, 1) to line up with H.
    attention = K.repeat(attention, self.input_dim)
    attention = K.permute_dimensions(attention, (0, 2, 1))

    v = K.sum(H * attention, axis=1)
    y, new_states = super(AttentionDecoder, self).step(v, states[:-1])
    return y, new_states
def preprocess_input(self, x):
    '''
    Compute and concatenate the four gate input projections.

    The projection is done unconditionally (there is no ``consume_less``
    check). ``x`` is passed through ``time_distributed_dense`` once per
    gate in i, f, c, o order and the results are concatenated on axis 2.

    ``self.dropout_W`` is forwarded as the dropout argument only when it
    lies strictly between 0 and 1; otherwise 0 is forwarded.
    '''
    dropout = self.dropout_W if 0 < self.dropout_W < 1 else 0

    spec_shape = self.input_spec[0].shape
    input_dim = spec_shape[2]
    timesteps = spec_shape[1]

    gate_params = (
        (self.W_i, self.b_i),
        (self.W_f, self.b_f),
        (self.W_c, self.b_c),
        (self.W_o, self.b_o),
    )
    projections = [
        time_distributed_dense(x, weights, bias, dropout,
                               input_dim, self.output_dim, timesteps)
        for weights, bias in gate_params
    ]
    return K.concatenate(projections, axis=2)
def preprocess_input(self, x):
    '''
    Apply the ``W_out`` / ``b_out`` projection up front when in 'cpu' mode.

    With ``self.consume_less == 'cpu'`` the input is passed through
    ``time_distributed_dense`` using ``self.W_out`` and ``self.b_out``;
    dimensions come from ``self.input_spec[0].shape``. In any other mode
    ``x`` is returned unchanged.

    NOTE: additional r / h projections were sketched here at some point
    but are disabled; only the single projection above is applied.
    '''
    if self.consume_less != 'cpu':
        return x

    spec_shape = self.input_spec[0].shape
    input_dim, timesteps = spec_shape[2], spec_shape[1]
    return time_distributed_dense(x, self.W_out, self.b_out, self.dropout_W,
                                  input_dim, self.output_dim, timesteps)