def __call__(self, x, prev_h):
    """
    x is the input
    prev_h is the hidden state from the previous timestep
    Returns next_h. For the GRU the output to the next timestep and
    to the next layer is one and the same. Copy it first!
    """
    reset_gate = cgt.sigmoid(x.dot(self.W_xr) + prev_h.dot(self.W_hr))
    update_gate = cgt.sigmoid(x.dot(self.W_xz) + prev_h.dot(self.W_hz))
    # the elementwise multiplication here decides how much of the previous
    # hidden state we should forget
    forget_gate = reset_gate * prev_h
    # this part is very similar to a vanilla RNN: the candidate state is
    # computed from the input and the gated hidden state
    # (W_hc is the recurrent candidate weight, named analogously to W_hr/W_hz)
    h_candidate = cgt.tanh(x.dot(self.W_xc) + forget_gate.dot(self.W_hc))
    # this isn't super clear in the paper, but it is an elementwise
    # interpolation between the previous state and the candidate
    next_h = (1. - update_gate) * prev_h + update_gate * h_candidate
    # In a standard GRU cell we only have one output.
    # However, it should be copied and fed to
    # both the next timestep and the next layer.
    return next_h
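# A minimal sketch (not from this repo) of unrolling a GRU step like the one
# above over T timesteps. GRUCell, its weight names, and all sizes below are
# assumptions made for illustration; only cgt.sigmoid / cgt.tanh / cgt.shared /
# cgt.matrix / cgt.function mirror the calls already used in these examples.
import cgt
import numpy as np

class GRUCell(object):
    """Hypothetical container for the W_x*/W_h* weights used by __call__ above."""
    def __init__(self, size_x, size_h):
        init = lambda nin, nout: cgt.shared(0.02 * np.random.randn(nin, nout))
        self.W_xr, self.W_hr = init(size_x, size_h), init(size_h, size_h)
        self.W_xz, self.W_hz = init(size_x, size_h), init(size_h, size_h)
        self.W_xc, self.W_hc = init(size_x, size_h), init(size_h, size_h)

    def __call__(self, x, prev_h):
        r = cgt.sigmoid(x.dot(self.W_xr) + prev_h.dot(self.W_hr))
        z = cgt.sigmoid(x.dot(self.W_xz) + prev_h.dot(self.W_hz))
        h_cand = cgt.tanh(x.dot(self.W_xc) + (r * prev_h).dot(self.W_hc))
        return (1. - z) * prev_h + z * h_cand

# Unroll for T steps: the same next_h feeds the next timestep and is also
# collected as this layer's per-timestep output (the "copy" the docstring mentions).
T, batch, size_x, size_h = 5, 16, 32, 64
cell = GRUCell(size_x, size_h)
xs = [cgt.matrix(fixed_shape=(batch, size_x)) for _ in range(T)]
h0 = cgt.matrix(fixed_shape=(batch, size_h))
h, hs = h0, []
for t in range(T):
    h = cell(xs[t], h)
    hs.append(h)
step_fn = cgt.function(xs + [h0], hs)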
def __call__(self, x, prev_c, prev_h):
    """
    x is the input
    prev_h is the hidden state from the previous timestep
    prev_c is the previous memory context
    Returns (next_c, next_h). next_h should be cloned since it is fed into
    both the next layer and the next timestep.
    """
    forget_gate = cgt.sigmoid(x.dot(self.W_xf) + prev_h.dot(self.W_hf))
    input_gate = cgt.sigmoid(x.dot(self.W_xi) + prev_h.dot(self.W_hi))
    output_gate = cgt.sigmoid(x.dot(self.W_xo) + prev_h.dot(self.W_ho))
    candidate_values = cgt.tanh(x.dot(self.W_xc) + prev_h.dot(self.W_hc))
    # new cell state
    next_c = forget_gate * prev_c + input_gate * candidate_values
    # hidden state for the next timestep (and the next layer)
    next_h = output_gate * cgt.tanh(next_c)
    # NOTE: we feed next_h into the next layer and the next timestep,
    # so we should clone the next_h output.
    return next_c, next_h
def make_ff_controller(opt):
    b, h, m, p, k = opt.b, opt.h, opt.m, opt.p, opt.k
    H = 2*h
    in_size = k + h*m
    out_size = H*m + H + H + H*3 + H + h*m + h*m + p

    # Previous reads
    r_bhm = cgt.tensor3("r", fixed_shape=(b, h, m))
    # External inputs
    X_bk = cgt.matrix("x", fixed_shape=(b, k))
    r_b_hm = r_bhm.reshape([r_bhm.shape[0], r_bhm.shape[1]*r_bhm.shape[2]])
    # Input to controller
    inp_bq = cgt.concatenate([X_bk, r_b_hm], axis=1)

    hid_sizes = opt.ff_hid_sizes
    activation = cgt.tanh

    layer_out_sizes = [in_size] + hid_sizes + [out_size]
    last_out = inp_bq
    # feedforward part. we could simplify a bit by using nn.Affine
    for i in xrange(len(layer_out_sizes)-1):
        indim = layer_out_sizes[i]
        outdim = layer_out_sizes[i+1]
        W = cgt.shared(.02*nr.randn(indim, outdim), name="W%i" % i, fixed_shape_mask="all")
        bias = cgt.shared(.02*nr.randn(1, outdim), name="b%i" % i, fixed_shape_mask="all")
        last_out = cgt.broadcast("+", last_out.dot(W), bias, "xx,1x")
        # Don't apply nonlinearity at the last layer
        if i != len(layer_out_sizes)-2:
            last_out = activation(last_out)

    idx = 0
    k_bHm = last_out[:, idx:idx+H*m];   idx += H*m;   k_bHm = k_bHm.reshape([b, H, m])
    beta_bH = last_out[:, idx:idx+H];   idx += H
    g_bH = last_out[:, idx:idx+H];      idx += H
    s_bH3 = last_out[:, idx:idx+3*H];   idx += 3*H;   s_bH3 = s_bH3.reshape([b, H, 3])
    gamma_bH = last_out[:, idx:idx+H];  idx += H
    e_bhm = last_out[:, idx:idx+h*m];   idx += h*m;   e_bhm = e_bhm.reshape([b, h, m])
    a_bhm = last_out[:, idx:idx+h*m];   idx += h*m;   a_bhm = a_bhm.reshape([b, h, m])
    y_bp = last_out[:, idx:idx+p];      idx += p

    k_bHm = cgt.tanh(k_bHm)
    beta_bH = nn.softplus(beta_bH)
    g_bH = cgt.sigmoid(g_bH)
    s_bH3 = sum_normalize2(cgt.exp(s_bH3))
    gamma_bH = cgt.sigmoid(gamma_bH) + 1
    e_bhm = cgt.sigmoid(e_bhm)
    a_bhm = cgt.tanh(a_bhm)
    # y_bp = y_bp

    assert infer_shape(k_bHm) == (b, H, m)
    assert infer_shape(beta_bH) == (b, H)
    assert infer_shape(g_bH) == (b, H)
    assert infer_shape(s_bH3) == (b, H, 3)
    assert infer_shape(gamma_bH) == (b, H)
    assert infer_shape(e_bhm) == (b, h, m)
    assert infer_shape(a_bhm) == (b, h, m)
    assert infer_shape(y_bp) == (b, p)

    return nn.Module([r_bhm, X_bk],
                     [k_bHm, beta_bH, g_bH, s_bH3, gamma_bH, e_bhm, a_bhm, y_bp])
def make_deep_gru(size_input, size_mem, n_layers, size_output, size_batch):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    for i_layer in xrange(n_layers):
        # note that inputs[0] is the external input, so we add 1
        prev_h = inputs[i_layer + 1]
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        update_gate = cgt.sigmoid(
            nn.Affine(size_x, size_mem, name="i2u")(x)
            + nn.Affine(size_mem, size_mem, name="h2u")(prev_h))
        reset_gate = cgt.sigmoid(
            nn.Affine(size_x, size_mem, name="i2r")(x)
            + nn.Affine(size_mem, size_mem, name="h2r")(prev_h))
        gated_hidden = reset_gate * prev_h
        p2 = nn.Affine(size_mem, size_mem)(gated_hidden)
        p1 = nn.Affine(size_x, size_mem)(x)
        hidden_target = cgt.tanh(p1 + p2)
        next_h = (1.0 - update_gate) * prev_h + update_gate * hidden_target
        outputs.append(next_h)
    category_activations = nn.Affine(size_mem, size_output, name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)
    return nn.Module(inputs, outputs)
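# A small usage sketch (sizes are assumptions) of the builder above: one
# recurrence step of a 2-layer GRU. Following the nn.Module call convention
# used elsewhere in these examples, the module maps
# [x, h_layer1, h_layer2] -> [next_h_layer1, next_h_layer2, logprobs].
network = make_deep_gru(size_input=64, size_mem=128, n_layers=2,
                        size_output=10, size_batch=32)
x, h1, h2 = cgt.matrix("x"), cgt.matrix("h1"), cgt.matrix("h2")
next_h1, next_h2, logprobs = network([x, h1, h2])
step = cgt.function([x, h1, h2], [next_h1, next_h2, logprobs])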
def __call__(self, M, *inputs):
    assert len(inputs) == len(self.Wizs)
    n = M.shape[0]
    summands = [Xi.dot(Wiz) for (Xi, Wiz) in zip(inputs, self.Wizs)] \
        + [M.dot(self.Wmz), cgt.repeat(self.bz, n, axis=0)]
    z = cgt.sigmoid(cgt.add_multi(summands))

    summands = [Xi.dot(Wir) for (Xi, Wir) in zip(inputs, self.Wirs)] \
        + [M.dot(self.Wmr), cgt.repeat(self.br, n, axis=0)]
    r = cgt.sigmoid(cgt.add_multi(summands))

    summands = [Xi.dot(Wim) for (Xi, Wim) in zip(inputs, self.Wims)] \
        + [(r*M).dot(self.Wmm), cgt.repeat(self.bm, n, axis=0)]
    Mtarg = cgt.tanh(cgt.add_multi(summands))  # pylint: disable=E1111

    Mnew = (1 - z)*M + z*Mtarg
    return Mnew
def __init__(self, x, n_in, n_hid, n_out, nlayers=1, y=None, eps=None):
    super(GaussianMLP, self).__init__(x, n_in, n_hid, nlayers=nlayers,
                                      prefix="GaussianMLP_hidden")
    self.mu_layer = HiddenLayer(
        input=self.hidden_layers[-1].output,
        n_in=self.hidden_layers[-1].n_out,
        n_out=n_out,
        activation=None,
        prefix="GaussianMLP_mu"
    )
    # log(sigma^2)
    self.logvar_layer = HiddenLayer(
        input=self.hidden_layers[-1].output,
        n_in=self.hidden_layers[-1].n_out,
        n_out=n_out,
        activation=None,
        prefix="GaussianMLP_logvar"
    )
    self.mu = self.mu_layer.output
    self.var = cgt.exp(self.logvar_layer.output)
    self.sigma = cgt.sqrt(self.var)
    self.params = self.params + self.mu_layer.params + \
        self.logvar_layer.params
    # for use as encoder
    if eps is not None:
        assert y is None
        self.out = self.mu + self.sigma * eps
    # for use as decoder
    if y is not None:
        assert eps is None
        self.out = cgt.sigmoid(self.mu)
        self.cost = -cgt.sum(log_diag_mvn(self.out, self.var)(y))
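# A brief usage sketch of the class above as the encoder of a VAE-style model.
# Sizes and variable names are assumptions; HiddenLayer, the MLP base class,
# and log_diag_mvn are expected to come from the same module as GaussianMLP.
x = cgt.matrix("x")      # minibatch of inputs
eps = cgt.matrix("eps")  # standard-normal noise for the reparameterization trick
enc = GaussianMLP(x, n_in=784, n_hid=500, n_out=20, nlayers=1, eps=eps)
z = enc.out              # z = mu + sigma * eps, a sample from q(z|x)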
def make_deep_lstm(size_input, size_mem, n_layers, size_output, size_batch):
    # inputs[0] is the external input; each layer then contributes a (c, h) pair
    inputs = [cgt.matrix(fixed_shape=(size_batch, size_input))]
    for _ in xrange(2 * n_layers):
        inputs.append(cgt.matrix(fixed_shape=(size_batch, size_mem)))
    outputs = []
    for i_layer in xrange(n_layers):
        # note that inputs[0] is the external input, so the per-layer states start at 1
        prev_c = inputs[i_layer * 2 + 1]
        prev_h = inputs[i_layer * 2 + 2]
        if i_layer == 0:
            x = inputs[0]
            size_x = size_input
        else:
            # feed the previous layer's hidden state (not its cell state) upward
            x = outputs[(i_layer - 1) * 2 + 1]
            size_x = size_mem
        input_sums = nn.Affine(size_x, 4 * size_mem)(x) + \
            nn.Affine(size_mem, 4 * size_mem)(prev_h)
        sigmoid_chunk = cgt.sigmoid(input_sums[:, 0:3 * size_mem])
        in_gate = sigmoid_chunk[:, 0:size_mem]
        forget_gate = sigmoid_chunk[:, size_mem:2 * size_mem]
        out_gate = sigmoid_chunk[:, 2 * size_mem:3 * size_mem]
        in_transform = cgt.tanh(input_sums[:, 3 * size_mem:4 * size_mem])
        next_c = forget_gate * prev_c + in_gate * in_transform
        next_h = out_gate * cgt.tanh(next_c)
        outputs.append(next_c)
        outputs.append(next_h)
    category_activations = nn.Affine(size_mem, size_output)(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)
    return nn.Module(inputs, outputs)
def hybrid_network(size_in, size_out, num_units, num_stos, dbg_out={}):
    assert len(num_units) == len(num_stos)
    net_in = cgt.matrix("X", fixed_shape=(None, size_in))
    prev_num_units, prev_out = size_in, net_in
    dbg_out['NET~in'] = net_in
    curr_layer = 1
    for (curr_num_units, curr_num_sto) in zip(num_units, num_stos):
        assert curr_num_units >= curr_num_sto >= 0
        prev_out = combo_layer(
            prev_out, prev_num_units, curr_num_units,
            (curr_num_sto,),
            s_funcs=s_func_ip,
            o_funcs=(lambda x: cgt.bernoulli(cgt.sigmoid(x)), cgt.nn.rectify),
            name=str(curr_layer), dbg_out=dbg_out)
        dbg_out['L%d~out' % curr_layer] = prev_out
        prev_num_units = curr_num_units
        curr_layer += 1
    net_out = nn.Affine(prev_num_units, size_out,
                        name="InnerProd(%d->%d)" % (prev_num_units, size_out)
                        )(prev_out)
    dbg_out['NET~out'] = net_out
    return net_in, net_out
def hybrid_layer(X, size_in, size_out, size_random, dbg_out=[]):
    assert size_out >= size_random >= 0
    out = cgt.sigmoid(nn.Affine(
        size_in, size_out, name="InnerProd(%d->%d)" % (size_in, size_out)
    )(X))
    dbg_out.append(out)
    if size_random == 0:
        return out
    if size_random == size_out:
        out_s = cgt.bernoulli(out)
        return out_s
    out_s = cgt.bernoulli(out[:, :size_random])
    out = cgt.concatenate([out_s, out[:, size_random:]], axis=1)
    return out
def test_get_context():
    batch_size = 32
    feat_t_steps = 3
    feat_num_features = 30
    state_num_features = 20
    num_out_classes = 28
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    prev_out = cgt.matrix(fixed_shape=(batch_size, state_num_features))
    sigmoided = cgt.sigmoid(prev_out)
    s = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes,
                          feature_size=feat_num_features, decoder_size=state_num_features)
    mm = cgt.infer_shape(s.features_post_mlp_btf)
    assert mm == (batch_size, feat_t_steps, feat_num_features)
    context_out = s.get_context(sigmoided)
    out_fun = cgt.function([feats, prev_out], [context_out])
    tau = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_t_steps*feat_num_features),
                     (batch_size, feat_t_steps, feat_num_features))
    tau2 = np.reshape(np.random.normal(0.1, 0.2, batch_size*state_num_features),
                      (batch_size, state_num_features))
    m = out_fun(tau, tau2)[0]
    assert m.shape == (batch_size, feat_num_features)
    assert np.mean(m) < 1
def get_context_backup(self, prev_state_bf):
    state_step_bf = cgt.sigmoid(self.states_mlp_bf(prev_state_bf))
    # attention scores: inner product of the projected state with the
    # features at each of the (here hard-coded) 3 timesteps
    product_list = []
    for time_step in range(0, 3):
        inner_product = cgt.sum(state_step_bf * self.features_post_mlp_btf[:, time_step, :], axis=1)
        product_list.append(inner_product)
    st = cgt.stack(product_list)
    st = cgt.dimshuffle(st, [1, 0])
    softmax_weights = softmax(st)
    # context vector: softmax-weighted sum of the features over time
    context = None
    for time_step in range(0, 3):
        softmax_t_step = cgt.dimshuffle(softmax_weights[:, time_step], [0, 'x'])
        weighted = cgt.broadcast('*', softmax_t_step,
                                 self.features_post_mlp_btf[:, time_step, :], 'x1,xx')
        context = weighted if context is None else context + weighted
    return context
def make_funcs(opt, ntm, total_time, loss_timesteps):
    x_tbk = cgt.tensor3("x", fixed_shape=(total_time, opt.b, opt.k))
    y_tbp = cgt.tensor3("y", fixed_shape=(total_time, opt.b, opt.p))
    loss_timesteps = set(loss_timesteps)

    initial_states = make_ntm_initial_states(opt)
    params = ntm.get_parameters() + get_parameters(initial_states)
    # params = ntm.get_parameters()

    lossCE = 0
    loss01 = 0

    state_arrs = initial_states
    for t in xrange(total_time):
        tmp = ntm([x_tbk[t]] + state_arrs)
        raw_pred = tmp[0]
        state_arrs = tmp[1:4]

        if t in loss_timesteps:
            p_pred = cgt.sigmoid(raw_pred)
            ce = bernoulli_crossentropy(y_tbp[t], p_pred).sum()  # cross-entropy of bernoulli distribution
            lossCE = lossCE + ce
            loss01 = loss01 + cgt.cast(cgt.equal(y_tbp[t], round01(p_pred)), cgt.floatX).sum()

    lossCE = lossCE / (len(loss_timesteps) * opt.p * opt.b) / np.log(2)
    loss01 = loss01 / (len(loss_timesteps) * opt.p * opt.b)
    gradloss = cgt.grad(lossCE, params)

    flatgrad = flatcat(gradloss)

    f_loss = cgt.function([x_tbk, y_tbp], lossCE)
    f_loss_and_grad = cgt.function([x_tbk, y_tbp], [lossCE, loss01, flatgrad])

    print "number of nodes in computation graph:", core.count_nodes([lossCE, loss01, flatgrad])

    return f_loss, f_loss_and_grad, params
def lstm_block(h_prev, c_prev, x_curr, size_x, size_c, name=''):
    """
    Construct an LSTM cell block of the specified number of cells

    :param h_prev: self activations at previous time step
    :param c_prev: self memory state at previous time step
    :param x_curr: inputs from previous layer at current time step
    :param size_x: size of inputs
    :param size_c: size of both c and h
    :return: c and h at current time step
    :rtype: (cgt node, cgt node)
    """
    input_sums = nn.Affine(size_x, 4 * size_c, name=name + '*x')(x_curr) + \
                 nn.Affine(size_c, 4 * size_c, name=name + '*h')(h_prev)
    c_new = cgt.tanh(input_sums[:, 3 * size_c:])
    sigmoid_chunk = cgt.sigmoid(input_sums[:, :3 * size_c])
    in_gate = sigmoid_chunk[:, :size_c]
    forget_gate = sigmoid_chunk[:, size_c:2 * size_c]
    out_gate = sigmoid_chunk[:, 2 * size_c:3 * size_c]
    c_curr = forget_gate * c_prev + in_gate * c_new
    h_curr = out_gate * cgt.tanh(c_curr)
    return c_curr, h_curr
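# A small usage sketch (assumed sizes) wiring two of the blocks above into a
# single timestep of a 2-layer LSTM; lstm_block is the function defined above.
import cgt

batch, size_x, size_c = 16, 48, 64
x = cgt.matrix(fixed_shape=(batch, size_x))
h1, c1 = cgt.matrix(fixed_shape=(batch, size_c)), cgt.matrix(fixed_shape=(batch, size_c))
h2, c2 = cgt.matrix(fixed_shape=(batch, size_c)), cgt.matrix(fixed_shape=(batch, size_c))

c1_new, h1_new = lstm_block(h1, c1, x, size_x, size_c, name='layer1')
c2_new, h2_new = lstm_block(h2, c2, h1_new, size_c, size_c, name='layer2')

step = cgt.function([x, h1, c1, h2, c2], [c1_new, h1_new, c2_new, h2_new])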
def sigmoid(x):
    return cgt.sigmoid(x)
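# Tiny usage check for the wrapper above: compile the symbolic sigmoid into a
# callable with cgt.function and evaluate it numerically (values are approximate).
import cgt
import numpy as np

x = cgt.vector("x")
f = cgt.function([x], sigmoid(x))
print f(np.array([-2.0, 0.0, 2.0]))  # roughly [0.12, 0.5, 0.88]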