def get_output_for(self, input, **kwargs):
    assert input.ndim == 2
    activation = T.dot(input, self.C)
    if self.b is not None:
        activation = activation + self.b.dimshuffle('x', 0)
    return self.nonlinearity_final(
        nonlinearities.sigmoid(activation).dot(self.M))
def __init__(self, GenerativeParams, xDim, yDim):
    super(SigmoidGenerative, self).__init__(GenerativeParams, xDim, yDim)
    layer_size = GenerativeParams['layer_size']
    self.un_base_bias = theano.shared(
        value=np.ones([1, xDim]).astype(theano.config.floatX))
    self.base_bias = sigmoid(self.un_base_bias)
    sbn_nn = lasagne.layers.InputLayer((None, xDim))
    for ls in layer_size:
        sbn_nn = SigmoidBernoulli(sbn_nn, ls)
    self.sbn_nn = SigmoidBernoulli(sbn_nn, yDim)
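# A minimal NumPy sketch (added; shapes and the `final` argument are
# hypothetical) of the forward pass a SigmoidBernoulli layer above
# performs: an affine map squashed to per-unit Bernoulli means, then
# mixed through M and passed to the final nonlinearity. The
# SigmoidGenerative __init__ simply chains such layers from xDim through
# each entry of layer_size down to yDim.
import numpy as np

def sigmoid_bernoulli_forward(x, C, b, M, final=lambda z: z):
    p = 1.0 / (1.0 + np.exp(-(x @ C + b)))  # per-unit Bernoulli means
    return final(p @ M)

# x: (batch, n_in), C: (n_in, n_hid), b: (n_hid,), M: (n_hid, n_out);
# chaining these calls reproduces the layer_size stack built in __init__.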
def get_output_for(self, input, only_at_anchor=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    # calculate attention anchor position based on atw, atb and input x
    # at_anchor = nonlinearities.rectify(T.dot(input, self.atw) + self.atb[0])
    # at_anchor = T.minimum(at_anchor, 1)
    at_anchor = nonlinearities.sigmoid(T.dot(input, self.atw) + self.atb[0])
    at_anchor *= self.num_units
    self.at_anchor = at_anchor  # for printing
    # print_op = printing.Print('attention')
    # at_anchor = print_op(at_anchor)
    if only_at_anchor:
        return at_anchor

    # normal dense layer activation output
    activation = T.dot(input, self.W)
    if self.b is not None:
        activation = activation + self.b.dimshuffle('x', 0)
    out = self.nonlinearity(activation)

    # multiply activation with attention weight
    attention = T.exp(
        self.at_decay * (
            T.arange(0, self.num_units).dimshuffle('x', 0)
            - at_anchor.dimshuffle(0, 'x')
        ) ** 2)

    # truncation
    if self.hard_threshold:
        attention = T.maximum(attention - self.hard_threshold, 0)
    out *= attention
    return out
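# A minimal NumPy sketch (added; not part of the original layer) of the
# attention window computed above: each unit's weight decays as a
# Gaussian of its distance from the anchor. Assumes at_decay is negative
# so the exponential stays in (0, 1].
import numpy as np

def attention_window(anchor, num_units, at_decay=-0.01, hard_threshold=0.0):
    positions = np.arange(num_units)[None, :]                  # (1, num_units)
    attention = np.exp(at_decay * (positions - anchor[:, None]) ** 2)
    if hard_threshold:
        attention = np.maximum(attention - hard_threshold, 0)  # truncate tails
    return attention

# anchor = sigmoid(...) * num_units lands in [0, num_units]; e.g.
# attention_window(np.array([5.0]), 10) peaks at unit 5.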
def get_output_for(self, input, **kwargs):
    # if the input has more than two dimensions, flatten it into a
    # batch of feature vectors.
    input_reshape = input.flatten(2) if input.ndim > 2 else input

    activation = T.dot(input_reshape, self.W_h)
    if self.b_h is not None:
        activation = activation + self.b_h.dimshuffle('x', 0)
    activation = self.nonlinearity(activation)

    transform = T.dot(input_reshape, self.W_t)
    if self.b_t is not None:
        transform = transform + self.b_t.dimshuffle('x', 0)
    transform = nonlinearities.sigmoid(transform)

    carry = 1.0 - transform
    output = activation * transform + input_reshape * carry

    # reshape output back to original input_shape
    if input.ndim > 2:
        output = T.reshape(output, input.shape)
    return output
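# A minimal NumPy sketch (added; not from the original source) of the
# highway combination computed above: output = H(x) * T(x) + x * (1 - T(x)).
# When the transform gate saturates at 0 the layer is an identity map,
# which is what makes very deep highway stacks trainable.
import numpy as np

def np_sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def highway(x, W_h, b_h, W_t, b_t, nonlinearity=np.tanh):
    h = nonlinearity(x @ W_h + b_h)   # candidate activation H(x)
    t = np_sigmoid(x @ W_t + b_t)     # transform gate T(x)
    return h * t + x * (1.0 - t)      # carry gate is 1 - T(x)

# With b_t strongly negative (t ~ 0), highway(x, ...) ~ x.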
def get_output_for(self, input, **kwargs):
    activation = T.dot(input, self.C)
    if self.b is not None:
        activation = activation + self.b.dimshuffle('x', 0)
    return nonlinearities.sigmoid(activation)
def main():
    input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
    target_var = T.ivector(name='targets')

    layer_input = lasagne.layers.InputLayer(shape=(None, LENGTH, 1),
                                            input_var=input_var, name='input')
    layer_rnn = RecurrentLayer(layer_input, NUM_UNITS,
                               nonlinearity=nonlinearities.tanh,
                               only_return_final=True,
                               W_in_to_hid=lasagne.init.Constant(1),
                               W_hid_to_hid=lasagne.init.Constant(2),
                               b=None, name='RNN')
    W = layer_rnn.W_hid_to_hid
    U = layer_rnn.W_in_to_hid

    output = lasagne.layers.get_output(layer_rnn)
    output = output.mean(axis=1)
    prediction = T.switch(T.gt(output, 0), 1, -1)
    acc = T.eq(prediction, target_var)
    acc = acc.sum()

    # get the output before activation function tanh
    epsilon = 1e-6
    prob = 0.5 * T.log((1 + output + epsilon) / (1 - output + epsilon))
    prob = nonlinearities.sigmoid(prob)
    loss = -0.5 * ((1 + target_var) * T.log(prob)
                   + (1 - target_var) * T.log(1 - prob))
    loss = loss.sum()

    batch_size = 100
    learning_rate = 0.01
    steps_per_epoch = 1000

    params = lasagne.layers.get_all_params(layer_rnn, trainable=True)
    updates = lasagne.updates.sgd(loss, params=params,
                                  learning_rate=learning_rate)
    train_fn = theano.function([input_var, target_var],
                               [loss, acc, W, U, output], updates=updates)

    for epoch in range(3):
        print('Epoch %d (learning rate=%.4f)' % (epoch, learning_rate))
        loss = 0.0
        correct = 0.0
        num_back = 0
        for step in range(steps_per_epoch):
            x, y = get_batch(batch_size)
            err, corr, w, u, pred = train_fn(x, y)
            # print(x)
            # print(y)
            # print(pred)
            loss += err
            correct += corr
            num_inst = (step + 1) * batch_size
            # update log
            sys.stdout.write("\b" * num_back)
            log_info = 'inst: %d loss: %.4f, corr: %d, acc: %.2f%%, W: %.6f, U: %.6f' % (
                num_inst, loss / num_inst, correct, correct * 100 / num_inst,
                w.sum(), u.sum())
            sys.stdout.write(log_info)
            num_back = len(log_info)
            # raw_input()

        # update training log after each epoch
        sys.stdout.write("\b" * num_back)
        assert num_inst == batch_size * steps_per_epoch
        print('inst: %d loss: %.4f, corr: %d, acc: %.2f%%' % (
            num_inst, loss / num_inst, correct, correct * 100 / num_inst))
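# The script above assumes a get_batch helper plus LENGTH / NUM_UNITS
# constants that are not shown. Below is a hypothetical sketch of
# get_batch matching the shapes the script expects ((batch, LENGTH, 1)
# float inputs, {-1, +1} int32 targets): the toy task labels a sequence
# by the sign of its sum, which a sum-like RNN can fit.
import numpy as np

LENGTH = 10  # assumed value

def get_batch(batch_size):
    x = np.random.uniform(-1.0, 1.0,
                          size=(batch_size, LENGTH, 1)).astype('float32')
    y = np.where(x.sum(axis=(1, 2)) > 0, 1, -1).astype('int32')
    return x, y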
def step(input_n, cell_previous, hid_previous, avg_previous, *args):
    x = input_n
    if not self.precompute_input:
        input_n = T.dot(input_n, W_in_stacked) + b_stacked

    # Calculate gates pre-activations and slice
    gates = input_n + T.dot(hid_previous, W_hid_stacked)

    # Clip gradients
    if self.grad_clipping:
        gates = theano.gradient.grad_clip(gates, -self.grad_clipping,
                                          self.grad_clipping)

    # Extract the pre-activation gate values
    ingate = slice_w(gates, 0)
    forgetgate = slice_w(gates, 1)
    cell_input = slice_w(gates, 2)
    outgate = slice_w(gates, 3)

    if self.peepholes:
        # Compute peephole connections
        ingate += cell_previous * self.W_cell_to_ingate
        forgetgate += cell_previous * self.W_cell_to_forgetgate

    # Apply nonlinearities
    ingate = self.nonlinearity_ingate(ingate)
    forgetgate = self.nonlinearity_forgetgate(forgetgate)
    cell_input = self.nonlinearity_cell(cell_input)

    # Compute new cell value
    cell = forgetgate * cell_previous + ingate * cell_input

    if self.peepholes:
        outgate += cell * self.W_cell_to_outgate
    outgate = self.nonlinearity_outgate(outgate)

    # Compute new hidden unit activation
    hid = outgate * self.nonlinearity(cell)

    avg_input = T.dot(x, self.W_avg1) + T.dot(hid, self.W_avg2) + self.b_avg
    if self.model_type == 1:
        avg = x * nonlinearities.sigmoid(avg_input)
    elif self.model_type == 6:
        avg = nonlinearities.tanh(avg_input)
    elif self.model_type == 7:
        avg_input = T.dot(x, self.W_avg1) * T.dot(hid, self.W_avg2) + self.b_avg
        avg = x * nonlinearities.sigmoid(avg_input)
    elif self.model_type == 2:
        avg = hid * nonlinearities.sigmoid(avg_input)
    elif self.model_type == 3:
        avg_input2 = T.dot(x, self.W_avg12) + T.dot(hid, self.W_avg22) + self.b_avg2
        g1 = nonlinearities.sigmoid(avg_input)
        g2 = nonlinearities.sigmoid(avg_input2)
        avg = avg_previous * g1 + x * g2
    elif self.model_type == 4:
        avg_input = (T.dot(x, self.W_avg1) + T.dot(hid, self.W_avg2)
                     + T.dot(avg_previous, self.W_avg3) + self.b_avg)
        avg_input2 = (T.dot(x, self.W_avg12) + T.dot(hid, self.W_avg22)
                      + T.dot(avg_previous, self.W_avg32) + self.b_avg2)
        g1 = nonlinearities.sigmoid(avg_input)
        g2 = nonlinearities.sigmoid(avg_input2)
        avg = avg_previous * g1 + x * g2
    elif self.model_type == 5:
        avg_input2 = T.dot(x, self.W_avg12) + T.dot(hid, self.W_avg22) + self.b_avg2
        g1 = nonlinearities.sigmoid(avg_input)
        g2 = nonlinearities.sigmoid(avg_input2)
        avg = x * g1 + hid * g2
    return [cell, hid, avg]
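# Summary (added for reference, inferred from the branches above) of how
# each model_type forms the auxiliary "avg" output from input x, hidden
# state hid, and the previous avg:
#   1: avg = x   * sigmoid(W1 x + W2 h + b)    -- gated input
#   2: avg = hid * sigmoid(W1 x + W2 h + b)    -- gated hidden state
#   3: avg = avg_prev * g1 + x * g2            -- two-gate running mix
#   4: like 3, but both gates also see avg_prev
#   5: avg = x * g1 + hid * g2                 -- gated sum of x and hid
#   6: avg = tanh(W1 x + W2 h + b)
#   7: like 1, with a multiplicative (bilinear) pre-activation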
def step(input_n, hid_previous_total, *args):
    print("317 into step")
    print(" type input n: ", type(input_n))
    hid_previous_facts = hid_previous_total[0:self.num_hidden_units_h]
    hid_previous_brain = hid_previous_total[self.num_hidden_units_h:]
    self.cur_sequence_idx += 1  # Updates where we are at in the sequence

    # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
    hid_input_facts = T.dot(hid_previous_facts, W_hid_stacked)

    if self.grad_clipping:
        input_n = theano.gradient.grad_clip(
            input_n, -self.grad_clipping, self.grad_clipping)
        hid_input_facts = theano.gradient.grad_clip(
            hid_input_facts, -self.grad_clipping, self.grad_clipping)

    if not self.precompute_input:
        # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
        # DS Note: accomplishes the multiplication AND adds bias
        input_n = T.dot(input_n, W_in_stacked) + b_stacked

    # Reset and update gates
    resetgate = slice_w_h(hid_input_facts, 0) + slice_w_h(input_n, 0)
    updategate = slice_w_h(hid_input_facts, 1) + slice_w_h(input_n, 1)
    resetgate = self.nonlinearity_resetgate(resetgate)
    updategate = self.nonlinearity_updategate(updategate)

    # DS Edit: DynamMemNet modifiers
    m_dmn = hid_previous_brain  # Note that this should have size
    c_dmn = input_n  # This is a TensorType<float64, row>
    q_dmn = self.question_layer  # This is a lasagne recurrent GRU layer
    print(" entering 344")
    # DS Note: I believe this has size 9 x size(m_dmn)==size(c_dmn)
    z_dmn = T.concatenate([c_dmn, m_dmn, q_dmn, c_dmn * q_dmn,
                           abs(c_dmn - q_dmn), abs(c_dmn - m_dmn),
                           T.dot(c_dmn.T, T.dot(self.W_dmn_b, q_dmn)),
                           T.dot(c_dmn.T, T.dot(self.W_dmn_b, m_dmn))],
                          axis=1)
    G_dmn = nonlinearities.sigmoid(
        T.dot(self.W_dmn_2,
              nonlinearities.tanh(T.dot(self.W_dmn_1, z_dmn)) + self.b_dmn_1)
        + self.b_dmn_2)
    # Note, you also need W_b for the c and q elements.

    # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
    hidden_update_in = slice_w_h(input_n, 2)
    hidden_update_hid = slice_w_h(hid_input_facts, 2)
    hidden_update_facts = hidden_update_in + resetgate * hidden_update_hid
    if self.grad_clipping:
        hidden_update_facts = theano.gradient.grad_clip(
            hidden_update_facts, -self.grad_clipping, self.grad_clipping)
    hidden_update_facts = self.nonlinearity_hid(hidden_update_facts)

    # Compute (1 - u_t)h_{t - 1} + u_t c_t
    # This is the GRU_fact output
    hid = (1 - updategate) * hid_previous_facts + updategate * hidden_update_facts
    # Output of the Dynamic Memory Net modified GRU, Eq. (5)
    output_dmn = G_dmn * hid + (1 - G_dmn) * hid_previous_facts

    # UPDATE THE BRAIN
    # We update the brain parameters if the current idx is equal to the sent len
    if self.cur_sequence_idx == self.max_seqlen:
        hid_input_brain = T.dot(hid_previous_brain, W_brain_hid_stacked)

        if self.grad_clipping:
            input_to_brain = theano.gradient.grad_clip(
                output_dmn, -self.grad_clipping, self.grad_clipping)
            hid_input_brain = theano.gradient.grad_clip(
                hid_input_brain, -self.grad_clipping, self.grad_clipping)
        else:
            input_to_brain = output_dmn

        if not self.precompute_input:
            # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
            # DS Note: accomplishes the multiplication AND adds bias
            input_to_brain = T.dot(input_to_brain, W_brain_in_stacked) + b_brain_stacked

        # Reset and update gates
        resetgate_brain = slice_w_m(hid_input_brain, 0) + slice_w_m(input_to_brain, 0)
        updategate_brain = slice_w_m(hid_input_brain, 1) + slice_w_m(input_to_brain, 1)
        resetgate_brain = self.nonlinearity_brain_resetgate(resetgate_brain)
        updategate_brain = self.nonlinearity_brain_updategate(updategate_brain)

        hidden_update_in_brain = slice_w_m(input_to_brain, 2)
        hidden_update_brain = slice_w_m(hid_input_brain, 2)
        hidden_update_brain = hidden_update_in_brain + resetgate_brain * hidden_update_brain

        if self.grad_clipping:
            hidden_update_brain = theano.gradient.grad_clip(
                hidden_update_brain, -self.grad_clipping, self.grad_clipping)
        hidden_update_brain = self.nonlinearity_brain_hid_update(hidden_update_brain)

        hid_brain = (1 - updategate_brain) * hid_previous_brain + updategate_brain * hidden_update_brain
    else:
        hid_brain = hid_previous_brain  # TODO: DS: ERROR IS HERE

    output_dmn = T.concatenate([output_dmn, hid_brain], axis=1)
    print(" 412 out of step")
    return output_dmn
def swish(x):
    """Swish activation: x * sigmoid(x)."""
    return x * nl.sigmoid(x)
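# Quick sanity check (added; assumes nl is lasagne.nonlinearities and the
# argument is a Theano tensor): swish is ~linear for large positive x,
# ~0 for large negative x, with a smooth dip just below zero.
import numpy as np
import theano
import theano.tensor as T

xv = T.vector('xv')
swish_fn = theano.function([xv], swish(xv))
print(swish_fn(np.array([-5.0, 0.0, 5.0], dtype=theano.config.floatX)))
# approx. [-0.033, 0.0, 4.967]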
def get_output_for(self, input, **kwargs):
    ps = nonlinearities.sigmoid(input)
    sum_p_r_benign = T.sum(ps, axis=1)
    sum_log = T.sum(T.log(1 - ps + 1.e-12), axis=1)
    return T.concatenate([sum_log, sum_p_r_benign])
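# Added note: sum_log above is the log of a noisy-OR style product, since
# sum_i log(1 - p_i) = log(prod_i (1 - p_i)), so 1 - exp(sum_log) is the
# probability that at least one unit fires. A NumPy check:
import numpy as np

p = np.array([0.1, 0.2, 0.3])
sum_log = np.sum(np.log(1 - p + 1e-12))
print(1 - np.exp(sum_log))   # 0.496
print(1 - np.prod(1 - p))    # 0.496, same quantity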
def safe_sigmoid(x, eps=1e-6):
    return T.clip(sigmoid(x), eps, 1 - eps)
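# Added usage sketch: the clipping matters when the sigmoid saturates and
# a cross-entropy loss takes its log. Assumes T is theano.tensor and that
# the sigmoid used by safe_sigmoid is theano.tensor.nnet.sigmoid.
import theano
import theano.tensor as T
from theano.tensor.nnet import sigmoid

logits = T.vector('logits')
targets = T.vector('targets')
p = safe_sigmoid(logits)
bce = -(targets * T.log(p) + (1 - targets) * T.log(1 - p))
loss_fn = theano.function([logits, targets], bce.mean())
# For a confident wrong prediction (logit 30, target 0) the loss stays
# finite; without clipping, log(1 - sigmoid(30)) underflows to -inf in
# float32.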
def get_output_for(self, input, **kwargs):
    ps = nonlinearities.sigmoid(input)
    powd = ps ** self.exp
    tmean = T.mean(powd, axis=(1, 2))
    return tmean
def get_output_for(self, input, **kwargs):
    if self.apply_nl:
        ps = nonlinearities.sigmoid(input)
    else:
        ps = input  # assume the input already holds probabilities
    prod = T.prod(ps, axis=(1, 2))
    output = 1 - prod
    return output
def step(input_n, hid_previous_total, *args):
    hid_previous_facts = hid_previous_total[0:self.num_hidden_units_h]
    hid_previous_brain = hid_previous_total[self.num_hidden_units_h:]
    self.cur_sequence_idx += 1  # Updates where we are at in the sequence

    # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
    hid_input_facts = T.dot(hid_previous_facts, W_hid_stacked)

    if self.grad_clipping:
        input_n = theano.gradient.grad_clip(
            input_n, -self.grad_clipping, self.grad_clipping)
        hid_input_facts = theano.gradient.grad_clip(
            hid_input_facts, -self.grad_clipping, self.grad_clipping)

    if not self.precompute_input:
        # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
        # DS Note: accomplishes the multiplication AND adds bias
        input_n = T.dot(input_n, W_in_stacked) + b_stacked

    # Reset and update gates
    resetgate = slice_w_h(hid_input_facts, 0) + slice_w_h(input_n, 0)
    updategate = slice_w_h(hid_input_facts, 1) + slice_w_h(input_n, 1)
    resetgate = self.nonlinearity_resetgate(resetgate)
    updategate = self.nonlinearity_updategate(updategate)

    # DS Edit: DynamMemNet modifiers
    m_dmn = hid_previous_brain  # Note that this should have size
    c_dmn = input_n  # This is a TensorType<float64, row>
    q_dmn = self.question_layer  # This is a lasagne recurrent GRU layer
    z_dmn = T.concatenate([c_dmn, m_dmn, q_dmn, c_dmn * q_dmn,
                           abs(c_dmn - q_dmn), abs(c_dmn - m_dmn),
                           T.dot(c_dmn.T, T.dot(self.W_dmn_b, q_dmn)),
                           T.dot(c_dmn.T, T.dot(self.W_dmn_b, m_dmn))],
                          axis=1)
    G_dmn = nonlinearities.sigmoid(
        T.dot(self.W_dmn_2,
              nonlinearities.tanh(T.dot(self.W_dmn_1, z_dmn)) + self.b_dmn_1)
        + self.b_dmn_2)
    # Note, you also need W_b for the c and q elements.
    # something_else = T.dot(hid_previous_facts, W_hid_stacked)

    hidden_update_in = slice_w_h(input_n, 2)
    hidden_update_hid = slice_w_h(hid_input_facts, 2)
    hidden_update_facts = hidden_update_in + resetgate * hidden_update_hid
    if self.grad_clipping:
        hidden_update_facts = theano.gradient.grad_clip(
            hidden_update_facts, -self.grad_clipping, self.grad_clipping)
    hidden_update_facts = self.nonlinearity_hid(hidden_update_facts)

    # Compute (1 - u_t)h_{t - 1} + u_t c_t
    # This is the GRU_fact output
    hid = (1 - updategate) * hid_previous_facts + updategate * hidden_update_facts
    # output_dmn = G_dmn * hid + (1 - G_dmn) * hid_previous_facts
    # (output of the Dynamic Memory Net modified GRU, Eq. (5))
    output_dmn = hid

    # The brain update below is disabled in this variant, so the previous
    # brain state is carried through unchanged.
    hid_brain = hid_previous_brain
    # if self.cur_sequence_idx == self.max_seqlen:
    #     hid_input_brain = T.dot(hid_previous_brain, W_brain_hid_stacked)
    #
    #     if self.grad_clipping:
    #         input_to_brain = theano.gradient.grad_clip(
    #             output_dmn, -self.grad_clipping, self.grad_clipping)
    #         hid_input_brain = theano.gradient.grad_clip(
    #             hid_input_brain, -self.grad_clipping, self.grad_clipping)
    #     else:
    #         input_to_brain = output_dmn
    #
    #     if not self.precompute_input:
    #         input_to_brain = T.dot(input_to_brain, W_brain_in_stacked) + b_brain_stacked
    #
    #     resetgate_brain = slice_w_m(hid_input_brain, 0) + slice_w_m(input_to_brain, 0)
    #     updategate_brain = slice_w_m(hid_input_brain, 1) + slice_w_m(input_to_brain, 1)
    #     resetgate_brain = self.nonlinearity_brain_resetgate(resetgate_brain)
    #     updategate_brain = self.nonlinearity_brain_updategate(updategate_brain)
    #
    #     hidden_update_in_brain = slice_w_m(input_to_brain, 2)
    #     hidden_update_brain = slice_w_m(hid_input_brain, 2)
    #     hidden_update_brain = hidden_update_in_brain + resetgate_brain * hidden_update_brain
    #
    #     if self.grad_clipping:
    #         hidden_update_brain = theano.gradient.grad_clip(
    #             hidden_update_brain, -self.grad_clipping, self.grad_clipping)
    #     hidden_update_brain = self.nonlinearity_brain_hid_update(hidden_update_brain)
    #
    #     hid_brain = (1 - updategate_brain) * hid_previous_brain + updategate_brain * hidden_update_brain
    # else:
    #     hid_brain = hid_previous_brain

    return T.concatenate([output_dmn, hid_brain], axis=1)
def output_layer_nonlinearity(x):
    return T.clip(sigmoid(x), 1e-5, 1.0 - 1e-4)
def get_output_for(self, inputs, **kwargs):
    num_batch, _, _ = inputs.shape

    # add padded zeros in front of sequence
    padded_input = T.concatenate(
        [T.zeros((num_batch, self.filter_width - 1, self.original_features)),
         inputs], axis=1)

    # reshape input to include 1 filter dimension
    rs = padded_input.dimshuffle([0, 'x', 1, 2])

    # apply convolutions for all "gates"
    # (output = (n_batch, n_filters, n_time_steps, 1))
    Z = nonlinearities.tanh(T.nnet.conv2d(
        rs, self.Z_W,
        input_shape=(None, 1, self.internal_seq_len, self.original_features),
        filter_shape=(self.num_units, 1, self.filter_width,
                      self.original_features)))
    F = nonlinearities.sigmoid(T.nnet.conv2d(
        rs, self.F_W,
        input_shape=(None, 1, self.internal_seq_len, self.original_features),
        filter_shape=(self.num_units, 1, self.filter_width,
                      self.original_features)))
    if self.pooling in ('fo', 'ifo'):
        O = nonlinearities.sigmoid(T.nnet.conv2d(
            rs, self.O_W,
            input_shape=(None, 1, self.internal_seq_len,
                         self.original_features),
            filter_shape=(self.num_units, 1, self.filter_width,
                          self.original_features)))
    if self.pooling == 'ifo':
        I = nonlinearities.sigmoid(T.nnet.conv2d(
            rs, self.I_W,
            input_shape=(None, 1, self.internal_seq_len,
                         self.original_features),
            filter_shape=(self.num_units, 1, self.filter_width,
                          self.original_features)))

    # Because scan iterates over the first dimension we dimshuffle to
    # (n_time_steps, n_batch, n_features)
    Z = Z.flatten(ndim=3).dimshuffle([2, 0, 1])
    F = F.flatten(ndim=3).dimshuffle([2, 0, 1])
    if self.pooling in ('fo', 'ifo'):
        O = O.flatten(ndim=3).dimshuffle([2, 0, 1])
    if self.pooling == 'ifo':
        I = I.flatten(ndim=3).dimshuffle([2, 0, 1])

    # Dot against a 1s vector to repeat to shape (num_batch, num_units)
    ones = T.ones((num_batch, 1))
    hid_init = T.dot(ones, self.hid_init)

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input: (n_batch, n_features)
    def step_f(forget_n, z_n, hid_previous, *args):
        return forget_n * hid_previous + (1.0 - forget_n) * z_n

    def step_fo(forget_n, z_n, o_n, hid_previous, cell_previous, *args):
        cell_current = forget_n * cell_previous + (1.0 - forget_n) * z_n
        hid_current = o_n * cell_current
        return [hid_current, cell_current]

    def step_ifo(forget_n, z_n, o_n, i_n, hid_previous, cell_previous, *args):
        cell_current = forget_n * cell_previous + i_n * z_n
        hid_current = o_n * cell_current
        return [hid_current, cell_current]

    if self.pooling == 'f':
        step = step_f
        sequences = [F, Z]
        outputs_info = [hid_init]
    if self.pooling == 'fo':
        step = step_fo
        sequences = [F, Z, O]
        # Note that, below, we use hid_init as the initial /cell/ state!
        # That way we only need to declare one set of weights
        outputs_info = [T.zeros((num_batch, self.num_units)), hid_init]
    if self.pooling == 'ifo':
        step = step_ifo
        sequences = [F, Z, O, I]
        outputs_info = [T.zeros((num_batch, self.num_units)), hid_init]

    outputs = theano.scan(
        fn=step,
        sequences=sequences,
        outputs_info=outputs_info,
        strict=True)[0]

    hid_out = outputs
    if self.pooling in ('fo', 'ifo'):
        hid_out = outputs[0]

    # Shuffle back to (n_batch, n_time_steps, n_features)
    hid_out = hid_out.dimshuffle([1, 0, 2])
    return hid_out
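# Added sketch (NumPy, not part of the original layer) of the simplest
# pooling rule above, 'f'-pooling: the convolutions produce all gate
# activations in parallel, and the only sequential work left is this
# cheap elementwise recurrence.
import numpy as np

def f_pool(F, Z, h0):
    # F, Z: (seq_len, batch, units) forget gates and candidates;
    # h0: (batch, units) initial hidden state.
    h = h0
    out = []
    for f_t, z_t in zip(F, Z):
        h = f_t * h + (1.0 - f_t) * z_t   # same rule as step_f above
        out.append(h)
    return np.stack(out)                  # (seq_len, batch, units)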
def get_output_for(self, inputs, deterministic=False, **kwargs):
    input = inputs[0]
    time_input = inputs[self.time_incoming_idx]
    event_input = inputs[self.event_incoming_idx]
    mask = None
    hid_init = None
    cell_init = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]
    if self.hid_init_incoming_index > 0:
        hid_init = inputs[self.hid_init_incoming_index]
    if self.cell_init_incoming_index > 0:
        cell_init = inputs[self.cell_init_incoming_index]

    if self.bn:
        input = self.bn.get_output_for(input)

    input = input.dimshuffle(1, 0, 2)
    time_input = time_input.dimshuffle(1, 0)
    seq_len, num_batch, _ = input.shape

    # Stack input weight matrices into a (num_inputs, 4*num_units)
    # matrix, which speeds up computation
    W_in_stacked = T.concatenate([
        self.W_in_to_ingate, self.W_in_to_forgetgate,
        self.W_in_to_cell, self.W_in_to_outgate], axis=1)

    # Same for hidden weight matrices
    W_hid_stacked = T.concatenate([
        self.W_hid_to_ingate, self.W_hid_to_forgetgate,
        self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

    # Stack biases into a (4*num_units) vector
    b_stacked = T.concatenate([
        self.b_ingate, self.b_forgetgate,
        self.b_cell, self.b_outgate], axis=0)

    input = T.dot(input, W_in_stacked) + b_stacked

    # PHASED LSTM: If test time, off-phase means really shut.
    if deterministic:
        print('Using true off for testing.')
        off_slope = 0.0
    else:
        print('Using {} for off_slope.'.format(self.off_alpha))
        off_slope = self.off_alpha

    if self.model != 'LSTM':
        # PHASED LSTM: Pregenerate broadcast vars.
        # Same neuron in different batches has same shift and period.
        # Also precalculate the middle (on_mid) and end (on_end) of the
        # open-phase ramp.
        shift_broadcast = self.shift_timegate.dimshuffle(['x', 0])
        period_broadcast = T.abs_(self.period_timegate.dimshuffle(['x', 0]))
        on_mid_broadcast = T.abs_(
            self.on_end_timegate.dimshuffle(['x', 0])) * 0.5 * period_broadcast
        on_end_broadcast = T.abs_(
            self.on_end_timegate.dimshuffle(['x', 0])) * period_broadcast

    if self.model == 'HELSTM':
        event_W = self.event_w_timegate
        event_b = T.shape_padleft(self.event_b_timegate, 2)
        out_W = self.out_w_timegate
        out_b = T.shape_padleft(self.out_b_timegate, 2)
        hid_attention = nonlinearities.leaky_rectify(
            T.dot(event_input, event_W) + event_b)
        out_attention = nonlinearities.sigmoid(
            T.dot(hid_attention, out_W) + out_b)
        out_attention = out_attention.dimshuffle(1, 0, 2)

    def slice_w(x, n):
        return x[:, n * self.num_units:(n + 1) * self.num_units]

    def step(input_n, cell_previous, hid_previous, *args):
        gates = input_n + T.dot(hid_previous, W_hid_stacked)

        # Clip gradients
        if self.grad_clipping:
            gates = theano.gradient.grad_clip(
                gates, -self.grad_clipping, self.grad_clipping)

        # Extract the pre-activation gate values
        ingate = slice_w(gates, 0)
        forgetgate = slice_w(gates, 1)
        cell_input = slice_w(gates, 2)
        outgate = slice_w(gates, 3)

        if self.peepholes:
            # Compute peephole connections
            ingate += cell_previous * self.W_cell_to_ingate
            forgetgate += cell_previous * self.W_cell_to_forgetgate

        # Apply nonlinearities
        ingate = self.nonlinearity_ingate(ingate)
        forgetgate = self.nonlinearity_forgetgate(forgetgate)
        cell_input = self.nonlinearity_cell(cell_input)

        # Mix in new stuff
        cell = forgetgate * cell_previous + ingate * cell_input

        if self.peepholes:
            outgate += cell * self.W_cell_to_outgate
        outgate = self.nonlinearity_outgate(outgate)

        # Compute new hidden unit activation
        hid = outgate * self.nonlinearity(cell)
        return [cell, hid]

    # PHASED LSTM: The actual calculation of the time gate
    def calc_time_gate(time_input_n):
        # Broadcast the time across all units
        t_broadcast = time_input_n.dimshuffle([0, 'x'])
        # Get the time within the period
        in_cycle_time = T.mod(t_broadcast + shift_broadcast, period_broadcast)
        # Find the phase
        is_up_phase = T.le(in_cycle_time, on_mid_broadcast)
        is_down_phase = T.gt(in_cycle_time, on_mid_broadcast) * T.le(
            in_cycle_time, on_end_broadcast)
        # Set the mask
        sleep_wake_mask = T.switch(
            is_up_phase, in_cycle_time / on_mid_broadcast,
            T.switch(is_down_phase,
                     (on_end_broadcast - in_cycle_time) / on_mid_broadcast,
                     off_slope * (in_cycle_time / period_broadcast)))
        return sleep_wake_mask

    # HELSTM: Mask the updates based on the time phase and event attention
    def step_masked(input_n, time_input_n, event_input_n, mask_n,
                    cell_previous, hid_previous, *args):
        cell, hid = step(input_n, cell_previous, hid_previous, *args)

        if self.model != 'LSTM':
            # Get time gate openness
            sleep_wake_mask = calc_time_gate(time_input_n)
            if self.model == 'HELSTM':
                sleep_wake_mask = event_input_n * sleep_wake_mask
            # Sleep if off, otherwise stay a bit on
            cell = sleep_wake_mask * cell + (1. - sleep_wake_mask) * cell_previous
            hid = sleep_wake_mask * hid + (1. - sleep_wake_mask) * hid_previous

        # Skip over any input with mask 0 by copying the previous
        # hidden state; proceed normally for any input with mask 1.
        cell = T.switch(mask_n, cell, cell_previous)
        hid = T.switch(mask_n, hid, hid_previous)
        return [cell, hid]

    if mask is not None:
        # mask is given as (batch_size, seq_len). Because scan iterates
        # over first dimension, we dimshuffle to (seq_len, batch_size) and
        # add a broadcastable dimension
        mask = mask.dimshuffle(1, 0, 'x')
    else:
        mask = T.ones_like(time_input).dimshuffle(0, 1, 'x')

    if self.model != 'HELSTM':
        # outside HELSTM, out_attention is unused, but it still needs a
        # value to complete the scan sequences
        out_attention = event_input

    sequences = [input, time_input, out_attention, mask]
    step_fun = step_masked

    ones = T.ones((num_batch, 1))
    if not isinstance(self.cell_init, Layer):
        # Dot against a 1s vector to repeat to shape (num_batch, num_units)
        cell_init = T.dot(ones, self.cell_init)
    if not isinstance(self.hid_init, Layer):
        # Dot against a 1s vector to repeat to shape (num_batch, num_units)
        hid_init = T.dot(ones, self.hid_init)

    # Scan op iterates over first dimension of input and repeatedly
    # applies the step function
    cell_out, hid_out = theano.scan(
        fn=step_fun,
        sequences=sequences,
        outputs_info=[cell_init, hid_init],
        go_backwards=self.backwards)[0]

    # When it is requested that we only return the final sequence step,
    # we need to slice it out immediately after scan is applied
    if self.only_return_final:
        hid_out = hid_out[-1]
    else:
        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, 2)
        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]
    return hid_out
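# Added sketch (NumPy; parameter names are illustrative) of the
# calc_time_gate logic above. For a unit with period tau and open
# fraction r_on, the gate ramps 0 -> 1 over the first half of the open
# window, 1 -> 0 over the second half, and leaks with slope off_slope
# elsewhere in the cycle.
import numpy as np

def time_gate(t, period, shift, r_on, off_slope=1e-3):
    in_cycle = np.mod(t + shift, period)
    on_mid = 0.5 * r_on * period
    on_end = r_on * period
    up = in_cycle <= on_mid
    down = (in_cycle > on_mid) & (in_cycle <= on_end)
    return np.where(up, in_cycle / on_mid,
                    np.where(down, (on_end - in_cycle) / on_mid,
                             off_slope * in_cycle / period))

# time_gate(np.linspace(0, 10, 50), period=10.0, shift=0.0, r_on=0.2)
# is open only near the start of each 10-unit cycle.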