def take_one_step(self, nn_input_bf, hid_out=None):
    # Sometimes you don't want to unroll all t steps of a recurrence,
    # but rather advance just one forward step.
    num_batch = nn_input_bf.shape[0]

    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Single recurrent computation step.
    # input_n is the input vector for the current step.
    def step(input_n, cell_previous, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        input_n = cgt.broadcast("+", cgt.dot(input_n, W_in_stacked), b_stacked, "xx,1x")

        # Calculate gate pre-activations and slice
        gates = input_n + cgt.dot(hid_previous, W_hid_stacked)

        # Extract the pre-activation gate values
        ingate = slice_w(gates, 0)
        forgetgate = slice_w(gates, 1)
        cell_input = slice_w(gates, 2)
        outgate = slice_w(gates, 3)

        # Apply nonlinearities
        ingate = self.nonlinearity_ingate(ingate)
        forgetgate = self.nonlinearity_forgetgate(forgetgate)
        cell_input = self.nonlinearity_cell(cell_input)
        outgate = self.nonlinearity_outgate(outgate)

        # Compute new cell value
        cell = forgetgate*cell_previous + ingate*cell_input

        # Compute new hidden unit activation
        hid = outgate*self.nonlinearity(cell)
        return [cell, hid]

    # Initialize the cached cell and hidden states on the first call.
    if self.cell_prev is None:
        self.cell_prev = cgt.dot(cgt.ones((num_batch, 1)), self.cell_init)
    if self.hid_prev is None:
        self.hid_prev = cgt.dot(cgt.ones((num_batch, 1)), self.hid_init)
    # If no hidden state was passed in, continue from the cached one.
    if hid_out is None:
        hid_out = self.hid_prev

    # Note: step expects (input, cell_previous, hid_previous, ...).
    one_step_out = step(nn_input_bf, self.cell_prev, hid_out,
                        self.W_hid_stacked, self.W_in_stacked, self.b_stacked)
    self.cell_prev = one_step_out[0]
    self.hid_prev = one_step_out[1]
    return self.hid_prev
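# The gate arithmetic in the step above can be checked against a plain NumPy
# reference. This is an illustrative sketch, not part of the library: the names
# lstm_step_reference, W_in, W_hid and b are hypothetical, and sigmoid/tanh stand
# in for the layer's configurable nonlinearities.
import numpy as np

def _sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step_reference(x_bf, cell_prev_bh, hid_prev_bh, W_in, W_hid, b, num_units):
    """One LSTM step on a batch, mirroring the gate order i, f, c, o above."""
    gates = x_bf.dot(W_in) + hid_prev_bh.dot(W_hid) + b      # (batch, 4*num_units)
    i, f, c, o = [gates[:, k*num_units:(k+1)*num_units] for k in range(4)]
    cell = _sigmoid(f) * cell_prev_bh + _sigmoid(i) * np.tanh(c)
    hid = _sigmoid(o) * np.tanh(cell)
    return cell, hid

# Example: batch of 2, 3 input features, 5 hidden units.
x = np.random.randn(2, 3)
W_in, W_hid, b = np.random.randn(3, 20), np.random.randn(5, 20), np.zeros(20)
cell, hid = lstm_step_reference(x, np.zeros((2, 5)), np.zeros((2, 5)), W_in, W_hid, b, 5)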
def to_one_hot(y, nb_class, dtype=None):
    """
    Return a matrix where each row corresponds to the one hot encoding of
    each element in y.

    Parameters
    ----------
    y
        A vector of integer values between 0 and nb_class - 1.
    nb_class : int
        The number of classes in y.
    dtype : data-type
        The dtype of the returned matrix. Default floatX.

    Returns
    -------
    object
        A matrix of shape (y.shape[0], nb_class), where each row ``i`` is
        the one hot encoding of the corresponding ``y[i]`` value.
    """
    fill_vals = cgt.ones((y.shape[0],))
    ret = cgt.zeros((y.shape[0], nb_class), dtype)
    d1 = cgt.arange(y.shape[0])
    d2 = cgt.cast(y, 'i1')
    ret = cgt.inc_subtensor(ret, [d1, d2], fill_vals)
    return ret
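# For reference, the same encoding in plain NumPy. This is an illustrative
# sketch of the expected semantics, not the cgt graph version above; the name
# to_one_hot_reference is hypothetical.
import numpy as np

def to_one_hot_reference(y, nb_class, dtype='float64'):
    """Row i is all zeros except a 1 at column y[i]."""
    ret = np.zeros((len(y), nb_class), dtype=dtype)
    ret[np.arange(len(y)), y] = 1.0
    return ret

# to_one_hot_reference([1, 0, 2], 3) ->
# [[0., 1., 0.],
#  [1., 0., 0.],
#  [0., 0., 1.]]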
def take_one_step(self, input_bf, hid_out=None):
    num_batch = input_bf.shape[0]

    def step(input_bh, hid_previous_bh):
        hid_pre_bh = self.hid_to_hid(hid_previous_bh)
        hid_pre_bh += self.in_to_hid(input_bh)
        return self.activation(hid_pre_bh)

    # Initialize the cached hidden state on the first call.
    if self.prev_out is None:
        self.prev_out = cgt.dot(cgt.ones((num_batch, 1)), self.hid_init)
    # If no hidden state was passed in, continue from the cached one.
    if hid_out is None:
        hid_out = self.prev_out

    self.prev_out = step(input_bf, hid_out)
    return self.prev_out
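# A plain NumPy reference for the step above (illustrative only; the name
# rnn_step_reference and the explicit W_in/W_hid/b parameters are hypothetical,
# and tanh stands in for self.activation and the in_to_hid/hid_to_hid sub-layers).
import numpy as np

def rnn_step_reference(x_bf, hid_prev_bh, W_in, W_hid, b):
    """h_t = tanh(x_t W_in + h_{t-1} W_hid + b)."""
    return np.tanh(x_bf.dot(W_in) + hid_prev_bh.dot(W_hid) + b)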
def __call__(self, x):
    input_btf = x
    input_tbf = cgt.dimshuffle(input_btf, [1, 0, 2])
    seq_len, num_batch = input_tbf.shape[0], input_tbf.shape[1]

    def step(input_bh, hid_previous_bh):
        hid_pre_bh = self.hid_to_hid(hid_previous_bh)
        hid_pre_bh += self.in_to_hid(input_bh)
        return self.activation(hid_pre_bh)

    hid_init_bh = cgt.dot(cgt.ones((num_batch, 1)), self.hid_init)
    hid_out_tbf = unroll_recurrence(
        step_function=step,
        input_to_unroll_tbf=input_tbf,
        hid_init=[hid_init_bh],
        go_backwards=self.backwards,
        n_steps=self.timesteps)

    hid_out_btf = cgt.dimshuffle(hid_out_tbf, [1, 0, 2])
    if self.backwards:
        hid_out_btf = cgt.flip(hid_out_btf, [1])
    return hid_out_btf
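# Conceptually, unroll_recurrence applies the step function once per time step,
# threading the hidden state through and stacking the outputs. A minimal NumPy
# sketch of that behaviour, assuming the forward direction (go_backwards=False);
# the names unroll_reference and step_fn are hypothetical.
import numpy as np

def unroll_reference(step_fn, input_tbf, hid_init_bh):
    """Run step_fn over the leading (time) axis, collecting hidden states."""
    hid = hid_init_bh
    outputs = []
    for x_bf in input_tbf:          # iterate over time steps
        hid = step_fn(x_bf, hid)
        outputs.append(hid)
    return np.stack(outputs)        # (time, batch, units)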
def take_one_step(self, nn_input_bf, hid_out=None):
    # NOTE: this single-step path was flagged as possibly buggy; review before relying on it.
    self.num_batches = cgt.infer_shape(nn_input_bf)[0]

    # Stack input weight matrices into a (num_inputs, 3*num_units)
    # matrix, which speeds up computation
    W_in_stacked = cgt.concatenate(
        [self.W_in_to_resetgate, self.W_in_to_updategate,
         self.W_in_to_hidden_update], axis=1)

    # Same for hidden weight matrices
    W_hid_stacked = cgt.concatenate(
        [self.W_hid_to_resetgate, self.W_hid_to_updategate,
         self.W_hid_to_hidden_update], axis=1)

    # Stack gate biases into a (3*num_units) vector
    b_stacked = cgt.concatenate(
        [self.b_resetgate, self.b_updategate,
         self.b_hidden_update], axis=1)

    # After the input projection, input_n will be (n_batch, 3*num_units).
    # We define a slicing function that extracts the input to each GRU gate.
    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
        hid_input = cgt.dot(hid_previous, W_hid_stacked)

        # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
        input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")

        # Reset and update gates
        resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
        updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
        resetgate = self.nonlinearity_resetgate(resetgate)
        updategate = self.nonlinearity_updategate(updategate)

        # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
        hidden_update_in = slice_w(input_n, 2)
        hidden_update_hid = slice_w(hid_input, 2)
        hidden_update = hidden_update_in + resetgate*hidden_update_hid

        # Compute (1 - u_t)h_{t - 1} + u_t c_t
        hid = (1 - updategate)*hid_previous + updategate*hidden_update
        # Adding this non-linearity seems to help stability.
        return self.nonlinearity_hid(hid)

    # Initialize the cached hidden state on the first call and use it
    # whenever no hidden state is passed in.
    if hid_out is None:
        if self.hid_out is None:
            self.hid_out = cgt.dot(cgt.ones((self.num_batches, 1)), self.hid_init)
        hid_out = self.hid_out

    # hid_out is (n_batch, num_units); there is no time axis here, so no
    # backwards flip is needed for a single step.
    hid_out = step(nn_input_bf, hid_out, W_hid_stacked, W_in_stacked, b_stacked)
    self.hid_out = hid_out
    return hid_out
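# A plain NumPy reference for the GRU step above (illustrative only; the names
# gru_step_reference, W_in, W_hid and b are hypothetical, sigmoid stands in for
# the reset/update nonlinearities, and, like the layer above, tanh is applied
# to the final state rather than to the candidate).
import numpy as np

def _sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def gru_step_reference(x_bf, hid_prev_bh, W_in, W_hid, b, num_units):
    """One GRU step on a batch, mirroring the gate order r, u, c above."""
    hid_in = hid_prev_bh.dot(W_hid)                  # (batch, 3*num_units)
    x_in = x_bf.dot(W_in) + b                        # (batch, 3*num_units)
    sl = lambda m, k: m[:, k*num_units:(k+1)*num_units]
    r = _sigmoid(sl(hid_in, 0) + sl(x_in, 0))        # reset gate
    u = _sigmoid(sl(hid_in, 1) + sl(x_in, 1))        # update gate
    c = sl(x_in, 2) + r * sl(hid_in, 2)              # candidate hidden update
    hid = (1.0 - u) * hid_prev_bh + u * c
    return np.tanh(hid)                              # stands in for self.nonlinearity_hid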
def __call__(self, input_btf):
    # (n_time_steps, n_batch, n_features)
    input_tbf = cgt.dimshuffle(input_btf, [1, 0, 2])
    self.num_batches = cgt.infer_shape(input_tbf)[1]

    # Stack input weight matrices into a (num_inputs, 3*num_units)
    # matrix, which speeds up computation
    W_in_stacked = cgt.concatenate(
        [self.W_in_to_resetgate, self.W_in_to_updategate,
         self.W_in_to_hidden_update], axis=1)

    # Same for hidden weight matrices
    W_hid_stacked = cgt.concatenate(
        [self.W_hid_to_resetgate, self.W_hid_to_updategate,
         self.W_hid_to_hidden_update], axis=1)

    # Stack gate biases into a (3*num_units) vector
    b_stacked = cgt.concatenate(
        [self.b_resetgate, self.b_updategate,
         self.b_hidden_update], axis=1)

    # At each loop iteration, input_n will be (n_batch, 3*num_units).
    # We define a slicing function that extracts the input to each GRU gate.
    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
        hid_input = cgt.dot(hid_previous, W_hid_stacked)

        # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
        input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")

        # Reset and update gates
        resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
        updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
        resetgate = self.nonlinearity_resetgate(resetgate)
        updategate = self.nonlinearity_updategate(updategate)

        # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
        hidden_update_in = slice_w(input_n, 2)
        hidden_update_hid = slice_w(hid_input, 2)
        hidden_update = hidden_update_in + resetgate*hidden_update_hid

        # Compute (1 - u_t)h_{t - 1} + u_t c_t
        hid = (1 - updategate)*hid_previous + updategate*hidden_update
        return hid

    sequences = [input_tbf]
    step_fun = step
    hid_init = cgt.dot(cgt.ones((self.num_batches, 1)), self.hid_init)

    # The hidden-to-hidden weight matrix is always used in step, and since the
    # input is not precomputed outside the unroll, the input weights and
    # biases must be passed to the step function as well.
    non_seqs = [W_hid_stacked, W_in_stacked, b_stacked]

    hid_out = unroll_lstm(
        fn=step_fun,
        sequences=sequences,
        outputs_info=[hid_init],
        go_backwards=self.backwards,
        non_sequences=non_seqs,
        n_steps=self.timesteps)[0]

    # dimshuffle back to (n_batch, n_time_steps, n_features)
    hid_out = cgt.dimshuffle(hid_out, [1, 0, 2])

    # If the unroll ran backwards, reverse the output along the time axis.
    if self.backwards:
        hid_out = cgt.flip(hid_out, [1])

    return hid_out
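# Why stacking the per-gate weight matrices works: one matmul against the
# concatenated (num_inputs, 3*num_units) matrix gives the same result as three
# separate matmuls, and slice_w recovers each gate's block. A small NumPy check
# (all names here are hypothetical):
import numpy as np

num_in, num_units, batch = 4, 3, 2
rng = np.random.RandomState(0)
W_r, W_u, W_c = (rng.randn(num_in, num_units) for _ in range(3))
x = rng.randn(batch, num_in)

W_stacked = np.concatenate([W_r, W_u, W_c], axis=1)    # (num_in, 3*num_units)
proj = x.dot(W_stacked)                                 # one matmul for all gates

def slice_w(m, n):
    return m[:, n*num_units:(n+1)*num_units]

assert np.allclose(slice_w(proj, 0), x.dot(W_r))
assert np.allclose(slice_w(proj, 1), x.dot(W_u))
assert np.allclose(slice_w(proj, 2), x.dot(W_c))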
def __call__(self, nn_input_btf):
    # Because the unroll iterates over the first dimension, we dimshuffle to
    # (n_time_steps, n_batch, n_features)
    nn_input_tbf = cgt.dimshuffle(nn_input_btf, [1, 0, 2])
    seq_len, num_batch = nn_input_tbf.shape[0], nn_input_tbf.shape[1]

    def slice_w(x, n):
        return x[:, n*self.num_units:(n+1)*self.num_units]

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, cell_previous, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
        input_n = cgt.broadcast("+", cgt.dot(input_n, W_in_stacked), b_stacked, "xx,1x")

        # Calculate gate pre-activations and slice
        gates = input_n + cgt.dot(hid_previous, W_hid_stacked)

        # Extract the pre-activation gate values
        ingate = slice_w(gates, 0)
        forgetgate = slice_w(gates, 1)
        cell_input = slice_w(gates, 2)
        outgate = slice_w(gates, 3)

        # Apply nonlinearities
        ingate = self.nonlinearity_ingate(ingate)
        forgetgate = self.nonlinearity_forgetgate(forgetgate)
        cell_input = self.nonlinearity_cell(cell_input)
        outgate = self.nonlinearity_outgate(outgate)

        # Compute new cell value
        cell = forgetgate*cell_previous + ingate*cell_input

        # Compute new hidden unit activation
        hid = outgate*self.nonlinearity(cell)
        return [cell, hid]

    sequences = nn_input_tbf
    step_fun = step
    ones = cgt.ones((num_batch, 1))
    cell_init = cgt.dot(ones, self.cell_init)
    hid_init = cgt.dot(ones, self.hid_init)

    # The hidden-to-hidden weight matrix is always used in step; the input
    # weights and biases are passed along as non-sequences as well.
    non_seqs = [self.W_hid_stacked, self.W_in_stacked, self.b_stacked]

    cell_out, hid_out = unroll_lstm(
        fn=step_fun,
        sequences=sequences,
        outputs_info=[cell_init, hid_init],
        go_backwards=self.backwards,
        non_sequences=non_seqs,
        n_steps=self.timesteps)

    # dimshuffle back to (n_batch, n_time_steps, n_features)
    hid_out = cgt.dimshuffle(hid_out, [1, 0, 2])

    # If the unroll ran backwards, reverse the output along the time axis.
    if self.backwards:
        hid_out = cgt.flip(hid_out, [1])

    return hid_out
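# Why the output is flipped when go_backwards is set: the unroll visits time
# steps last-to-first, so row 0 of its output corresponds to the final input
# step; flipping the time axis restores the original ordering. A toy NumPy
# illustration using a running sum as the "recurrence" (names hypothetical):
import numpy as np

def unroll_sum_backwards(input_tbf):
    """Accumulate a running sum over reversed time, like go_backwards=True."""
    hid = np.zeros_like(input_tbf[0])
    outs = []
    for x in input_tbf[::-1]:       # visit time steps last-to-first
        hid = hid + x
        outs.append(hid)
    return np.stack(outs)           # row 0 corresponds to the LAST input step

x = np.arange(6, dtype=float).reshape(3, 1, 2)   # (time=3, batch=1, feat=2)
out = unroll_sum_backwards(x)
out_aligned = out[::-1]             # flip the time axis to realign with the input order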
def ones(shape):
    return cgt.ones(shape)