def step(input_n, cell_previous, hid_previous, r_previous, *args):
    if not self.precompute_input:
        input_n = T.dot(input_n, W_in_stacked) + b_stacked

    # Calculate gates pre-activations and slice
    gates = input_n + T.dot(hid_previous, W_hid_stacked)

    # Clip gradients
    if self.grad_clipping:
        gates = theano.gradient.grad_clip(
            gates, -self.grad_clipping, self.grad_clipping)

    # Extract the pre-activation gate values
    ingate = slice_w(gates, 0)
    forgetgate = slice_w(gates, 1)
    cell_input = slice_w(gates, 2)
    outgate = slice_w(gates, 3)

    if self.peepholes:
        # Compute peephole connections
        ingate += cell_previous*self.W_cell_to_ingate
        forgetgate += cell_previous*self.W_cell_to_forgetgate

    # Apply nonlinearities
    ingate = self.nonlinearity_ingate(ingate)
    forgetgate = self.nonlinearity_forgetgate(forgetgate)
    cell_input = self.nonlinearity_cell(cell_input)

    # Compute new cell value
    cell = forgetgate*cell_previous + ingate*cell_input

    if self.peepholes:
        outgate += cell*self.W_cell_to_outgate
    outgate = self.nonlinearity_outgate(outgate)

    # Compute new hidden unit activation
    hid = outgate*self.nonlinearity(cell)

    r = r_previous
    if self.attention:
        if self.wordbyword:
            M_partial = T.dot(hid, self.W_h) + T.dot(r_previous, self.W_r)
            M_partial = M_partial.dimshuffle(0, 'x', 1)
            M = T.dot(encoder_hs, self.W_y) + M_partial
            M = nonlinearities.tanh(M)
            alpha = T.dot(M, self.w)
            alpha = T.flatten(alpha, 2)
            alpha = T.nnet.softmax(alpha)
            alpha = alpha.dimshuffle(0, 1, 'x')
            r = T.sum(encoder_hs*alpha, axis=1) + \
                nonlinearities.tanh(T.dot(r_previous, self.W_t))
    return [cell, hid, r]
def Ep_Gate(c, m, q, Wb, W1, W2, b1, b2):
    z = T.concatenate([
        c, m, q,
        c * q, c * m,
        T.abs_(c - q), T.abs_(c - m),
        c * Wb * q, c * Wb * m
    ], axis=2)
    # g = (T.dot(W2, nonlin.tanh(T.dot(z, W1) + b1)) + b2) <- (big mistake :)
    g = (T.dot(nonlin.tanh(T.dot(W1, z) + b1), W2) + b2)
    return g
def get_output_for(self, inputs, **kwargs):
    input = inputs[0]
    mask = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]

    (d1, d2, d3) = input.shape

    # out = T.tensordot(input, self.W, axes=[[2], [0]])
    # b_shuffled = self.b.dimshuffle('x', 'x', 0)
    # out += b_shuffled
    # out = tanh(out)
    # out *= mask.dimshuffle(0, 1, 'x')
    # out = T.batched_dot(out, out.dimshuffle(0, 2, 1))

    q = T.tensordot(input, self.W1, axes=[[2], [0]])
    b1_shuffled = self.b1.dimshuffle('x', 'x', 0)
    q += b1_shuffled
    q = tanh(q)

    # k = T.tensordot(input, self.W2, axes=[[2], [0]])
    # b2_shuffled = self.b2.dimshuffle('x', 'x', 0)
    # k += b2_shuffled
    # k = tanh(k)

    q *= mask.dimshuffle(0, 1, 'x')
    # k *= mask.dimshuffle(0, 1, 'x')

    out = T.batched_dot(q, q.dimshuffle(0, 2, 1))
    # out /= np.sqrt(self.nu)
    # out *= 0.1
    out *= (1 - T.eye(d2, d2))
    matrix = softmax(out.reshape((d1 * d2, d2))).reshape((d1, d2, d2))
    matrix *= mask.dimshuffle(0, 1, 'x')
    matrix *= mask.dimshuffle(0, 'x', 1)
    return matrix
def step(input_n, cell_previous, hid_previous, previous_r, *args):
    if not self.precompute_input:
        input_n = T.dot(input_n, W_in_stacked) + b_stacked

    # Calculate gates pre-activations and slice
    gates = input_n + T.dot(hid_previous, W_hid_stacked)

    # Clip gradients
    if self.grad_clipping:
        gates = theano.gradient.grad_clip(gates, -self.grad_clipping,
                                          self.grad_clipping)

    # Extract the pre-activation gate values
    ingate = slice_w(gates, 0)
    forgetgate = slice_w(gates, 1)
    cell_input = slice_w(gates, 2)
    outgate = slice_w(gates, 3)

    if self.peepholes:
        # Compute peephole connections
        ingate += cell_previous * self.W_cell_to_ingate
        forgetgate += cell_previous * self.W_cell_to_forgetgate

    # Apply nonlinearities
    ingate = self.nonlinearity_ingate(ingate)
    forgetgate = self.nonlinearity_forgetgate(forgetgate)
    cell_input = self.nonlinearity_cell(cell_input)

    # Compute new cell value
    cell = forgetgate * cell_previous + ingate * cell_input

    if self.peepholes:
        outgate += cell * self.W_cell_to_outgate
    outgate = self.nonlinearity_outgate(outgate)

    # Compute new hidden unit activation
    hid = outgate * self.nonlinearity(cell)

    r = previous_r
    if self.attention and self.word_by_word:
        mh = T.dot(hid, self.W_h_attend) + T.dot(previous_r, self.W_r_attend)
        # mh is (n_batch, 1, n_features)
        mh = mh.dimshuffle(0, 'x', 1)
        M = T.dot(encoder_hs, self.W_y_attend) + mh
        # (n_batch, n_time_steps, n_features)
        M = nonlinearities.tanh(M)
        # alpha is (n_batch, n_time_steps, 1)
        alpha = T.dot(M, self.w_attend)
        # now is (n_batch, n_time_steps)
        alpha = T.flatten(alpha, 2)
        # 0 after softmax is not 0, f**k, my mistake.
        # when i > encoder_seq_len, fill alpha_i to -np.inf
        # alpha = T.switch(encoder_mask, alpha, -np.inf)
        alpha = T.nnet.softmax(alpha)
        # apply encoder_mask to alpha
        # encoder_mask is (n_batch, n_time_steps)
        # when i > encoder_seq_len, alpha_i should be 0.
        # actually not need mask, but in case of error
        # alpha = alpha * encoder_mask
        alpha = alpha.dimshuffle(0, 1, 'x')
        weighted_encoder = T.sum(encoder_hs * alpha, axis=1)
        r = weighted_encoder + nonlinearities.tanh(
            T.dot(previous_r, self.W_t_attend))

    return [cell, hid, r]
def step(input_i, cell_previous, hid_previous, *args):
    # word-by-word attention
    mh = T.dot(hid_previous, self.W_a_pointer)
    mh += self.b_a_pointer
    # mh is (n_batch, 1, n_features)
    mh = mh.dimshuffle(0, 'x', 1)
    M = T.dot(passage, self.V_pointer) + mh
    # (n_batch, passage_seq_len, n_features)
    M = nonlinearities.tanh(M)
    # alpha is (n_batch, passage_seq_len, 1)
    alpha = T.dot(M, self.v_pointer)
    # now is (n_batch, passage_seq_len)
    alpha = T.flatten(alpha, 2)
    alpha += self.c_pointer
    # 0 after softmax is not 0, f**k, my mistake.
    # when i >= passage_seq_len, fill alpha_i to -np.inf
    # apply passage_mask to alpha
    # passage_mask is (n_batch, passage_seq_len)
    alpha = T.switch(mask, alpha, -np.inf)
    alpha = T.nnet.softmax(alpha)
    # when i >= passage_seq_len, alpha_i should be 0.
    # actually not need mask, but in case of error
    # alpha = alpha * mask
    alpha = alpha.dimshuffle(0, 1, 'x')
    weighted_passage = T.sum(passage * alpha, axis=1)
    # (n_batch, n_features)
    input_n = weighted_passage
    if not self.precompute_input:
        input_n = T.dot(input_n, W_in_stacked) + b_stacked

    # Calculate gates pre-activations and slice
    gates = input_n + T.dot(hid_previous, W_hid_stacked)

    # Clip gradients
    if self.grad_clipping:
        gates = theano.gradient.grad_clip(gates, -self.grad_clipping,
                                          self.grad_clipping)

    # Extract the pre-activation gate values
    ingate = slice_w(gates, 0)
    forgetgate = slice_w(gates, 1)
    cell_input = slice_w(gates, 2)
    outgate = slice_w(gates, 3)

    if self.peepholes:
        # Compute peephole connections
        ingate += cell_previous * self.W_cell_to_ingate
        forgetgate += cell_previous * self.W_cell_to_forgetgate

    # Apply nonlinearities
    ingate = self.nonlinearity_ingate(ingate)
    forgetgate = self.nonlinearity_forgetgate(forgetgate)
    cell_input = self.nonlinearity_cell(cell_input)

    # Compute new cell value
    cell = forgetgate * cell_previous + ingate * cell_input

    if self.peepholes:
        outgate += cell * self.W_cell_to_outgate
    outgate = self.nonlinearity_outgate(outgate)

    # Compute new hidden unit activation
    hid = outgate * self.nonlinearity(cell)

    return [cell, hid, alpha]
def tanh_temperature(x, temperature=1):
    from lasagne.nonlinearities import tanh
    return tanh(x * temperature)
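A minimal usage sketch (not from the original source): assuming Lasagne is installed, `tanh_temperature` can be bound to a fixed temperature with `functools.partial` and passed anywhere Lasagne expects a nonlinearity callable; the layer shapes below are placeholders.

from functools import partial
import lasagne

# Hypothetical shapes; any layer that accepts a `nonlinearity` callable works.
l_in = lasagne.layers.InputLayer(shape=(None, 100))
l_hid = lasagne.layers.DenseLayer(
    l_in, num_units=50,
    nonlinearity=partial(tanh_temperature, temperature=3))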
def step(input_n, cell_previous, hid_previous, avg_previous, *args):
    x = input_n
    if not self.precompute_input:
        input_n = T.dot(input_n, W_in_stacked) + b_stacked

    # Calculate gates pre-activations and slice
    gates = input_n + T.dot(hid_previous, W_hid_stacked)

    # Clip gradients
    if self.grad_clipping:
        gates = theano.gradient.grad_clip(gates, -self.grad_clipping,
                                          self.grad_clipping)

    # Extract the pre-activation gate values
    ingate = slice_w(gates, 0)
    forgetgate = slice_w(gates, 1)
    cell_input = slice_w(gates, 2)
    outgate = slice_w(gates, 3)

    if self.peepholes:
        # Compute peephole connections
        ingate += cell_previous * self.W_cell_to_ingate
        forgetgate += cell_previous * self.W_cell_to_forgetgate

    # Apply nonlinearities
    ingate = self.nonlinearity_ingate(ingate)
    forgetgate = self.nonlinearity_forgetgate(forgetgate)
    cell_input = self.nonlinearity_cell(cell_input)

    # Compute new cell value
    cell = forgetgate * cell_previous + ingate * cell_input

    if self.peepholes:
        outgate += cell * self.W_cell_to_outgate
    outgate = self.nonlinearity_outgate(outgate)

    # Compute new hidden unit activation
    hid = outgate * self.nonlinearity(cell)

    avg_input = T.dot(x, self.W_avg1) + T.dot(hid, self.W_avg2) + self.b_avg
    if self.model_type == 1:
        avg = x * nonlinearities.sigmoid(avg_input)
    elif self.model_type == 6:
        avg = nonlinearities.tanh(avg_input)
    elif self.model_type == 7:
        avg_input = T.dot(x, self.W_avg1) * T.dot(hid, self.W_avg2) + self.b_avg
        avg = x * nonlinearities.sigmoid(avg_input)
    elif self.model_type == 2:
        avg = hid * nonlinearities.sigmoid(avg_input)
    elif self.model_type == 3:
        avg_input2 = T.dot(x, self.W_avg12) + T.dot(hid, self.W_avg22) + self.b_avg2
        g1 = nonlinearities.sigmoid(avg_input)
        g2 = nonlinearities.sigmoid(avg_input2)
        avg = avg_previous * g1 + x * g2
    elif self.model_type == 4:
        avg_input = T.dot(x, self.W_avg1) + T.dot(hid, self.W_avg2) + T.dot(
            avg_previous, self.W_avg3) + self.b_avg
        avg_input2 = T.dot(x, self.W_avg12) + T.dot(hid, self.W_avg22) + T.dot(
            avg_previous, self.W_avg32) + self.b_avg2
        g1 = nonlinearities.sigmoid(avg_input)
        g2 = nonlinearities.sigmoid(avg_input2)
        avg = avg_previous * g1 + x * g2
    elif self.model_type == 5:
        avg_input2 = T.dot(x, self.W_avg12) + T.dot(hid, self.W_avg22) + self.b_avg2
        g1 = nonlinearities.sigmoid(avg_input)
        g2 = nonlinearities.sigmoid(avg_input2)
        avg = x * g1
        havg = hid * g2
        avg = avg + havg

    return [cell, hid, avg]
def normalize(x):
    return tanh(x / 4.0)
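A quick sanity check of the scaling (a sketch, assuming Theano is available and that `tanh` in `normalize` comes from `lasagne.nonlinearities`): dividing by 4 gives a much gentler squashing than plain tanh while still bounding values to (-1, 1).

import numpy as np
import theano
import theano.tensor as T
from lasagne.nonlinearities import tanh  # assumed source of `tanh` used above

x = T.vector('x')
f = theano.function([x], normalize(x))
print(f(np.array([-8.0, 0.0, 8.0], dtype=theano.config.floatX)))
# roughly [-0.96, 0.0, 0.96]; plain tanh would already saturate at +/-1 here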
def get_output_for(self, inputs, **kwargs):
    """
    Compute this layer's output function given a symbolic input variable

    Parameters
    ----------
    inputs : list of theano.TensorType
        `inputs[0]` should always be the symbolic input variable.  When
        this layer has a mask input (i.e. was instantiated with
        `mask_input != None`, indicating that the lengths of sequences in
        each batch vary), `inputs` should have length 2, where `inputs[1]`
        is the `mask`.  The `mask` should be supplied as a Theano variable
        denoting whether each time step in each sequence in the batch is
        part of the sequence or not.  `mask` should be a matrix of shape
        ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when
        ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when
        ``j > (length of sequence i)``.  When the hidden state of this
        layer is to be pre-filled (i.e. was set to a :class:`Layer`
        instance) `inputs` should have length at least 2, and `inputs[-1]`
        is the hidden state to prefill with.  When the cell state of this
        layer is to be pre-filled (i.e. was set to a :class:`Layer`
        instance) `inputs` should have length at least 2, and `inputs[-1]`
        is the hidden state to prefill with.  When both the cell state and
        the hidden state are being pre-filled `inputs[-2]` is the hidden
        state, while `inputs[-1]` is the cell state.

    Returns
    -------
    layer_output : theano.TensorType
        Symbolic output variable.
    """
    # Retrieve the layer input
    input = inputs[0]
    # Retrieve the mask when it is supplied
    mask = None
    hid_init = None
    cell_init = None
    encoder_hs = None
    encoder_mask = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]
    if self.hid_init_incoming_index > 0:
        hid_init = inputs[self.hid_init_incoming_index]
    if self.encoder_mask_incoming_index > 0:
        # (n_batch, n_time_steps)
        encoder_mask = inputs[self.encoder_mask_incoming_index]
        encoder_mask = encoder_mask.astype('float32')

    cell_init = inputs[self.cell_init_incoming_index]
    if self.attention:
        # (n_batch, n_time_steps, n_features)
        encoder_hs = cell_init[0]
        # encoder_mask is (n_batch, n_time_steps, 1)
        encoder_hs = encoder_hs * encoder_mask.dimshuffle(0, 1, 'x')
        cell_init = cell_init[1]

    # Treat all dimensions after the second as flattened feature dimensions
    if input.ndim > 3:
        input = T.flatten(input, 3)

    # Because scan iterates over the first dimension we dimshuffle to
    # (n_time_steps, n_batch, n_features)
    input = input.dimshuffle(1, 0, 2)
    seq_len, num_batch, _ = input.shape

    # Stack input weight matrices into a (num_inputs, 4*num_units)
    # matrix, which speeds up computation
    W_in_stacked = T.concatenate(
        [self.W_in_to_ingate, self.W_in_to_forgetgate,
         self.W_in_to_cell, self.W_in_to_outgate], axis=1)

    # Same for hidden weight matrices
    W_hid_stacked = T.concatenate(
        [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
         self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

    # Stack biases into a (4*num_units) vector
    b_stacked = T.concatenate(
        [self.b_ingate, self.b_forgetgate,
         self.b_cell, self.b_outgate], axis=0)

    if self.precompute_input:
        # Because the input is given for all time steps, we can
        # precompute_input the inputs dot weight matrices before scanning.
        # W_in_stacked is (n_features, 4*num_units). input is then
        # (n_time_steps, n_batch, 4*num_units).
        input = T.dot(input, W_in_stacked) + b_stacked

    # At each call to scan, input_n will be (n_time_steps, 4*num_units).
    # We define a slicing function that extract the input to each LSTM gate
    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, cell_previous, hid_previous, previous_r, *args):
        if not self.precompute_input:
            input_n = T.dot(input_n, W_in_stacked) + b_stacked

        # Calculate gates pre-activations and slice
        gates = input_n + T.dot(hid_previous, W_hid_stacked)

        # Clip gradients
        if self.grad_clipping:
            gates = theano.gradient.grad_clip(
                gates, -self.grad_clipping, self.grad_clipping)

        # Extract the pre-activation gate values
        ingate = slice_w(gates, 0)
        forgetgate = slice_w(gates, 1)
        cell_input = slice_w(gates, 2)
        outgate = slice_w(gates, 3)

        if self.peepholes:
            # Compute peephole connections
            ingate += cell_previous*self.W_cell_to_ingate
            forgetgate += cell_previous*self.W_cell_to_forgetgate

        # Apply nonlinearities
        ingate = self.nonlinearity_ingate(ingate)
        forgetgate = self.nonlinearity_forgetgate(forgetgate)
        cell_input = self.nonlinearity_cell(cell_input)

        # Compute new cell value
        cell = forgetgate*cell_previous + ingate*cell_input

        if self.peepholes:
            outgate += cell*self.W_cell_to_outgate
        outgate = self.nonlinearity_outgate(outgate)

        # Compute new hidden unit activation
        hid = outgate*self.nonlinearity(cell)

        r = previous_r
        if self.attention and self.word_by_word:
            mh = T.dot(hid, self.W_h_attend) + \
                T.dot(previous_r, self.W_r_attend)
            # mh is (n_batch, 1, n_features)
            mh = mh.dimshuffle(0, 'x', 1)
            M = T.dot(encoder_hs, self.W_y_attend) + mh
            # (n_batch, n_time_steps, n_features)
            M = nonlinearities.tanh(M)
            # alpha is (n_batch, n_time_steps, 1)
            alpha = T.dot(M, self.w_attend)
            # now is (n_batch, n_time_steps)
            alpha = T.flatten(alpha, 2)
            # 0 after softmax is not 0, f**k, my mistake.
            # when i > encoder_seq_len, fill alpha_i to -np.inf
            # alpha = T.switch(encoder_mask, alpha, -np.inf)
            alpha = T.nnet.softmax(alpha)
            # apply encoder_mask to alpha
            # encoder_mask is (n_batch, n_time_steps)
            # when i > encoder_seq_len, alpha_i should be 0.
            # actually not need mask, but in case of error
            # alpha = alpha * encoder_mask
            alpha = alpha.dimshuffle(0, 1, 'x')
            weighted_encoder = T.sum(encoder_hs * alpha, axis=1)
            r = weighted_encoder + nonlinearities.tanh(
                T.dot(previous_r, self.W_t_attend))

        return [cell, hid, r]

    def step_masked(input_n, mask_n, cell_previous, hid_previous,
                    previous_r, *args):
        cell, hid, r = step(input_n, cell_previous, hid_previous,
                            previous_r, *args)

        # Skip over any input with mask 0 by copying the previous
        # hidden state; proceed normally for any input with mask 1.
        cell = T.switch(mask_n, cell, cell_previous)
        hid = T.switch(mask_n, hid, hid_previous)
        r = T.switch(mask_n, r, previous_r)

        return [cell, hid, r]

    if mask is not None:
        # mask is given as (batch_size, seq_len). Because scan iterates
        # over first dimension, we dimshuffle to (seq_len, batch_size) and
        # add a broadcastable dimension
        mask = mask.dimshuffle(1, 0, 'x')
        sequences = [input, mask]
        step_fun = step_masked
    else:
        sequences = input
        step_fun = step

    ones = T.ones((num_batch, 1))
    if not isinstance(self.hid_init, Layer):
        # Dot against a 1s vector to repeat to shape (num_batch, num_units)
        hid_init = T.dot(ones, self.hid_init)

    # The hidden-to-hidden weight matrix is always used in step
    non_seqs = [W_hid_stacked]
    # The "peephole" weight matrices are only used when self.peepholes=True
    if self.peepholes:
        non_seqs += [self.W_cell_to_ingate,
                     self.W_cell_to_forgetgate,
                     self.W_cell_to_outgate]

    # When we aren't precomputing the input outside of scan, we need to
    # provide the input weights and biases to the step function
    if not self.precompute_input:
        non_seqs += [W_in_stacked, b_stacked]

    r_init = T.dot(ones, self.r_init)
    if self.attention and self.word_by_word:
        non_seqs += [self.W_y_attend,
                     self.W_h_attend,
                     self.W_r_attend,
                     self.w_attend,
                     self.W_t_attend,
                     encoder_hs,
                     # encoder_mask
                     ]

    # Scan op iterates over first dimension of input and repeatedly
    # applies the step function
    cell_out, hid_out, r_out = theano.scan(
        fn=step_fun,
        sequences=sequences,
        outputs_info=[cell_init, hid_init, r_init],
        go_backwards=self.backwards,
        truncate_gradient=self.gradient_steps,
        non_sequences=non_seqs,
        strict=True)[0]

    # (n_batch, n_features)
    hid_N = hid_out[-1]
    out = hid_N
    if self.attention:
        if self.word_by_word:
            r_N = r_out[-1]
        else:
            mh = T.dot(hid_N, self.W_h_attend)
            mh = mh.dimshuffle(0, 'x', 1)
            M = T.dot(encoder_hs, self.W_y_attend) + mh
            # (n_batch, n_time_steps, n_features)
            M = nonlinearities.tanh(M)
            alpha = T.dot(M, self.w_attend)
            # (n_batch, n_time_steps)
            alpha = T.flatten(alpha, 2)
            # when i > encoder_seq_len, fill alpha_i to -np.inf
            # alpha = T.switch(encoder_mask, alpha, -np.inf)
            alpha = T.nnet.softmax(alpha)
            # apply encoder_mask to alpha
            # encoder_mask is (n_batch, n_time_steps)
            # when i > encoder_seq_len, alpha_i should be 0.
            # actually not need mask, but in case of error
            # alpha = alpha * encoder_mask
            alpha = alpha.dimshuffle(0, 1, 'x')
            # (n_batch, n_features)
            r_N = T.sum(encoder_hs * alpha, axis=1)
        out = nonlinearities.tanh(T.dot(r_N, self.W_p_attend) +
                                  T.dot(hid_N, self.W_x_attend))

    return out
def get_output_for(self, inputs, **kwargs):
    num_batch, _, _ = inputs.shape

    # add padded zeros in front of sequence
    padded_input = T.concatenate(
        [T.zeros((num_batch, self.filter_width - 1, self.original_features)),
         inputs], axis=1)

    # reshape input to include 1 filter dimension
    rs = padded_input.dimshuffle([0, 'x', 1, 2])

    # apply convolutions for all "gates"
    # (output = (n_batch, n_filters, n_time_steps, 1))
    Z = nonlinearities.tanh(T.nnet.conv2d(
        rs, self.Z_W,
        input_shape=(None, 1, self.internal_seq_len, self.original_features),
        filter_shape=(self.num_units, 1, self.filter_width,
                      self.original_features)))
    F = nonlinearities.sigmoid(T.nnet.conv2d(
        rs, self.F_W,
        input_shape=(None, 1, self.internal_seq_len, self.original_features),
        filter_shape=(self.num_units, 1, self.filter_width,
                      self.original_features)))
    if self.pooling == 'fo' or self.pooling == 'ifo':
        O = nonlinearities.sigmoid(T.nnet.conv2d(
            rs, self.O_W,
            input_shape=(None, 1, self.internal_seq_len,
                         self.original_features),
            filter_shape=(self.num_units, 1, self.filter_width,
                          self.original_features)))
    if self.pooling == 'ifo':
        I = nonlinearities.sigmoid(T.nnet.conv2d(
            rs, self.I_W,
            input_shape=(None, 1, self.internal_seq_len,
                         self.original_features),
            filter_shape=(self.num_units, 1, self.filter_width,
                          self.original_features)))

    # Because scan iterates over the first dimension we dimshuffle to
    # (n_time_steps, n_batch, n_features)
    Z = Z.flatten(ndim=3)
    Z = Z.dimshuffle([2, 0, 1])
    F = F.flatten(ndim=3)
    F = F.dimshuffle([2, 0, 1])
    if self.pooling == 'fo' or self.pooling == 'ifo':
        O = O.flatten(ndim=3)
        O = O.dimshuffle([2, 0, 1])
    if self.pooling == 'ifo':
        I = I.flatten(ndim=3)
        I = I.dimshuffle([2, 0, 1])

    # Dot against a 1s vector to repeat to shape (num_batch, num_units)
    ones = T.ones((num_batch, 1))
    hid_init = T.dot(ones, self.hid_init)

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input: (n_batch, n_features)
    def step_f(forget_n, z_n, hid_previous, *args):
        return forget_n * hid_previous + (1.0 - forget_n) * z_n

    def step_fo(forget_n, z_n, o_n, hid_previous, cell_previous, *args):
        cell_current = forget_n * cell_previous + (1.0 - forget_n) * z_n
        hid_current = o_n * cell_current
        return [hid_current, cell_current]

    def step_ifo(forget_n, z_n, o_n, i_n, hid_previous, cell_previous, *args):
        cell_current = forget_n * cell_previous + i_n * z_n
        hid_current = o_n * cell_current
        return [hid_current, cell_current]

    if self.pooling == 'f':
        step = step_f
        sequences = [F, Z]
        outputs_info = [hid_init]
    if self.pooling == 'fo':
        step = step_fo
        sequences = [F, Z, O]
        # Note that, below, we use hid_init as the initial /cell/ state!
        # That way we only need to declare one set of weights
        outputs_info = [T.zeros((num_batch, self.num_units)), hid_init]
    if self.pooling == 'ifo':
        step = step_ifo
        sequences = [F, Z, O, I]
        outputs_info = [T.zeros((num_batch, self.num_units)), hid_init]

    outputs = theano.scan(
        fn=step,
        sequences=sequences,
        outputs_info=outputs_info,
        strict=True)[0]

    hid_out = outputs
    if self.pooling == 'fo' or self.pooling == 'ifo':
        hid_out = outputs[0]

    # Shuffle back to (n_batch, n_time_steps, n_features)
    hid_out = hid_out.dimshuffle([1, 0, 2])

    return hid_out
def tanh_add(x, y):
    return tanh(T.add(x, y))
def step(input_n, hid_previous_total, *args):
    hid_previous_facts = hid_previous_total[0:self.num_hidden_units_h]
    hid_previous_brain = hid_previous_total[self.num_hidden_units_h:]

    self.cur_sequence_idx += 1  # Updates where we are at in the sequence

    # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
    hid_input_facts = T.dot(hid_previous_facts, W_hid_stacked)

    if self.grad_clipping:
        input_n = theano.gradient.grad_clip(
            input_n, -self.grad_clipping, self.grad_clipping)
        hid_input_facts = theano.gradient.grad_clip(
            hid_input_facts, -self.grad_clipping, self.grad_clipping)

    if not self.precompute_input:
        # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
        # DS Note: accomplishes the multiplication AND adds bias
        input_n = T.dot(input_n, W_in_stacked) + b_stacked

    # Reset and update gates
    resetgate = slice_w_h(hid_input_facts, 0) + slice_w_h(input_n, 0)
    updategate = slice_w_h(hid_input_facts, 1) + slice_w_h(input_n, 1)
    resetgate = self.nonlinearity_resetgate(resetgate)
    updategate = self.nonlinearity_updategate(updategate)

    # DS Edit: DynamMemNet modifiers
    m_dmn = hid_previous_brain  # Note that this should have size
    c_dmn = input_n  # This is a TensorType<float64, row>
    q_dmn = self.question_layer  # This is a lasagne recurrent GRU layer

    z_dmn = T.concatenate(
        [c_dmn, m_dmn, q_dmn, c_dmn * q_dmn,
         abs(c_dmn - q_dmn), abs(c_dmn - m_dmn),
         T.dot(c_dmn.T, T.dot(self.W_dmn_b, q_dmn)),
         T.dot(c_dmn.T, T.dot(self.W_dmn_b, m_dmn))], axis=1)
    G_dmn = nonlinearities.sigmoid(
        T.dot(self.W_dmn_2,
              nonlinearities.tanh(T.dot(self.W_dmn_1, z_dmn)) + self.b_dmn_1)
        + self.b_dmn_2)
    # Note, you also need W_b for the c and q elements.

    # something_else = T.dot(hid_previous_facts, W_hid_stacked)
    hidden_update_in = slice_w_h(input_n, 2)
    hidden_update_hid = slice_w_h(hid_input_facts, 2)
    hidden_update_facts = hidden_update_in + resetgate * hidden_update_hid

    if self.grad_clipping:
        hidden_update_facts = theano.gradient.grad_clip(
            hidden_update_facts, -self.grad_clipping, self.grad_clipping)
    hidden_update_facts = self.nonlinearity_hid(hidden_update_facts)

    # Compute (1 - u_t)h_{t - 1} + u_t c_t
    hid = (1 - updategate) * hid_previous_facts + \
        updategate * hidden_update_facts  # This is the GRU_fact output

    # output_dmn = G_dmn * hid + (1 - G_dmn) * hid_previous_facts
    # ^ This is the output of the Dynamic Memory Net modified GRU, Eq. (5)
    output_dmn = hid

    # if self.cur_sequence_idx == self.max_seqlen:
    #     hid_input_brain = T.dot(hid_previous_brain, W_brain_hid_stacked)
    #
    #     if self.grad_clipping:
    #         input_to_brain = theano.gradient.grad_clip(
    #             output_dmn, -self.grad_clipping, self.grad_clipping)
    #         hid_input_brain = theano.gradient.grad_clip(
    #             hid_input_brain, -self.grad_clipping, self.grad_clipping)
    #     else:
    #         input_to_brain = output_dmn
    #
    #     if not self.precompute_input:
    #         # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
    #         # DS Note: accomplishes the multiplication AND adds bias
    #         input_to_brain = T.dot(input_to_brain, W_brain_in_stacked) + b_brain_stacked
    #
    #     # Reset and update gates
    #     resetgate_brain = slice_w_m(hid_input_brain, 0) + slice_w_m(input_to_brain, 0)
    #     updategate_brain = slice_w_m(hid_input_brain, 1) + slice_w_m(input_to_brain, 1)
    #     resetgate_brain = self.nonlinearity_brain_resetgate(resetgate_brain)
    #     updategate_brain = self.nonlinearity_brain_updategate(updategate_brain)
    #
    #     hidden_update_in_brain = slice_w_m(input_to_brain, 2)
    #     hidden_update_brain = slice_w_m(hid_input_brain, 2)
    #     hidden_update_brain = hidden_update_in_brain + resetgate_brain * hidden_update_brain
    #
    #     if self.grad_clipping:
    #         hidden_update_brain = theano.gradient.grad_clip(
    #             hidden_update_brain, -self.grad_clipping, self.grad_clipping)
    #     hidden_update_brain = self.nonlinearity_brain_hid_update(hidden_update_brain)
    #
    #     hid_brain = (1 - updategate_brain) * hid_previous_brain + updategate_brain * hidden_update_brain
    #
    # else:
    #     hid_brain = hid_previous_brain

    # The brain update above is disabled, so carry the previous brain state
    # forward unchanged (otherwise `hid_brain` would be undefined below).
    hid_brain = hid_previous_brain

    return T.concatenate([output_dmn, hid_brain], axis=1)
def step(input_n, hid_previous_total, *args):
    print("317 into step")
    print(" type input n: ", type(input_n))
    hid_previous_facts = hid_previous_total[0:self.num_hidden_units_h]
    hid_previous_brain = hid_previous_total[self.num_hidden_units_h:]

    self.cur_sequence_idx += 1  # Updates where we are at in the sequence

    # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
    hid_input_facts = T.dot(hid_previous_facts, W_hid_stacked)

    if self.grad_clipping:
        input_n = theano.gradient.grad_clip(
            input_n, -self.grad_clipping, self.grad_clipping)
        hid_input_facts = theano.gradient.grad_clip(
            hid_input_facts, -self.grad_clipping, self.grad_clipping)

    if not self.precompute_input:
        # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
        # DS Note: accomplishes the multiplication AND adds bias
        input_n = T.dot(input_n, W_in_stacked) + b_stacked

    # Reset and update gates
    resetgate = slice_w_h(hid_input_facts, 0) + slice_w_h(input_n, 0)
    updategate = slice_w_h(hid_input_facts, 1) + slice_w_h(input_n, 1)
    resetgate = self.nonlinearity_resetgate(resetgate)
    updategate = self.nonlinearity_updategate(updategate)

    # DS Edit: DynamMemNet modifiers
    m_dmn = hid_previous_brain  # Note that this should have size
    c_dmn = input_n  # This is a TensorType<float64, row>
    q_dmn = self.question_layer  # This is a lasagne recurrent GRU layer

    print(" entering 344")
    # DS Note: I believe this has size 9 x size(m_dmn)==size(cdmn)
    # z_dmn = [c_dmn, m_dmn, q_dmn, c_dmn * q_dmn, abs(c_dmn - q_dmn),
    #          abs(c_dmn - m_dmn), T.dot(c_dmn.T, T.dot(self.W_dmn_b, q_dmn)),
    #          T.dot(c_dmn.T, T.dot(self.W_dmn_b, m_dmn))]
    z_dmn = T.concatenate(
        [c_dmn, m_dmn, q_dmn, c_dmn * q_dmn,
         abs(c_dmn - q_dmn), abs(c_dmn - m_dmn),
         T.dot(c_dmn.T, T.dot(self.W_dmn_b, q_dmn)),
         T.dot(c_dmn.T, T.dot(self.W_dmn_b, m_dmn))], axis=1)
    G_dmn = nonlinearities.sigmoid(
        T.dot(self.W_dmn_2,
              nonlinearities.tanh(T.dot(self.W_dmn_1, z_dmn)) + self.b_dmn_1)
        + self.b_dmn_2)
    # Note, you also need W_b for the c and q elements.

    # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
    hidden_update_in = slice_w_h(input_n, 2)
    hidden_update_hid = slice_w_h(hid_input_facts, 2)
    hidden_update_facts = hidden_update_in + resetgate * hidden_update_hid

    if self.grad_clipping:
        hidden_update_facts = theano.gradient.grad_clip(
            hidden_update_facts, -self.grad_clipping, self.grad_clipping)
    hidden_update_facts = self.nonlinearity_hid(hidden_update_facts)

    # Compute (1 - u_t)h_{t - 1} + u_t c_t
    hid = (1 - updategate) * hid_previous_facts + \
        updategate * hidden_update_facts  # This is the GRU_fact output

    # This is the output of the Dynamic Memory Net modified GRU, Eq. (5)
    output_dmn = G_dmn * hid + (1 - G_dmn) * hid_previous_facts

    # UPDATE THE BRAIN
    # We update the brain parameters if the current idx is equal to the sent len
    if self.cur_sequence_idx == self.max_seqlen:
        hid_input_brain = T.dot(hid_previous_brain, W_brain_hid_stacked)

        if self.grad_clipping:
            input_to_brain = theano.gradient.grad_clip(
                output_dmn, -self.grad_clipping, self.grad_clipping)
            hid_input_brain = theano.gradient.grad_clip(
                hid_input_brain, -self.grad_clipping, self.grad_clipping)
        else:
            input_to_brain = output_dmn

        if not self.precompute_input:
            # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
            # DS Note: accomplishes the multiplication AND adds bias
            input_to_brain = T.dot(input_to_brain,
                                   W_brain_in_stacked) + b_brain_stacked

        # Reset and update gates
        resetgate_brain = slice_w_m(hid_input_brain, 0) + \
            slice_w_m(input_to_brain, 0)
        updategate_brain = slice_w_m(hid_input_brain, 1) + \
            slice_w_m(input_to_brain, 1)
        resetgate_brain = self.nonlinearity_brain_resetgate(resetgate_brain)
        updategate_brain = self.nonlinearity_brain_updategate(updategate_brain)

        hidden_update_in_brain = slice_w_m(input_to_brain, 2)
        hidden_update_brain = slice_w_m(hid_input_brain, 2)
        hidden_update_brain = hidden_update_in_brain + \
            resetgate_brain * hidden_update_brain

        if self.grad_clipping:
            hidden_update_brain = theano.gradient.grad_clip(
                hidden_update_brain, -self.grad_clipping, self.grad_clipping)
        hidden_update_brain = self.nonlinearity_brain_hid_update(
            hidden_update_brain)

        hid_brain = (1 - updategate_brain) * hid_previous_brain + \
            updategate_brain * hidden_update_brain
    else:
        hid_brain = hid_previous_brain

    # TODO: DS: ERROR IS HERE
    output_dmn = T.concatenate([output_dmn, hid_brain], axis=1)
    print(" 412 out of step")
    return output_dmn
def action_nonlinearity(x):
    return self.scale_action * tanh(x)