Example #1
    def __call__(self, x, prev_h):
        """
        x is the input
        prev_h is the input from the previous timestep

        Returns next_h. For the GRU the output to the next timestep
        and next layer is one and the same. Copy it first!
        """

        reset_gate = cgt.sigmoid(x.dot(self.W_xr) + prev_h.dot(self.W_hr))
        update_gate = cgt.sigmoid(x.dot(self.W_xz) + prev_h.dot(self.W_hz))

        # the elementwise multiplication here determines how much of the
        # previous hidden state to forget.
        forget_gate = reset_gate * prev_h

        # this part is very similar to the vanilla RNN update
        h_candidate = cgt.tanh(x.dot(self.W_xc) + forget_gate.dot(self.W_hc))
        # the paper doesn't make this obvious, but it's an elementwise multiplication here
        next_h = (1. - update_gate) * prev_h + update_gate * h_candidate

        # In a standard GRU cell we only have one output.
        # However, it should be copied and fed to
        # both the next timestep and the next layer.
        return next_h
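For reference, this cell implements the standard GRU update (biases omitted, as in the code above; \odot is elementwise multiplication, z the update gate, r the reset gate):

    z_t = \sigma(x_t W_{xz} + h_{t-1} W_{hz})
    r_t = \sigma(x_t W_{xr} + h_{t-1} W_{hr})
    \tilde{h}_t = \tanh(x_t W_{xc} + (r_t \odot h_{t-1}) W_{hc})
    h_t = (1 - z_t) \odot h_{t-1} + z_t \odot \tilde{h}_t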
Example #2
    def __call__(self, x, prev_c, prev_h):
        """
        x is the input
        prev_h is the hidden state from the previous timestep
        prev_c is the memory cell from the previous timestep

        Returns (next_c, next_h).

        next_h should be cloned since it's fed into both the next layer and
        the next timestep.
        """

        forget_gate = cgt.sigmoid(x.dot(self.W_xf) + prev_h.dot(self.W_hf))
        input_gate = cgt.sigmoid(x.dot(self.W_xi) + prev_h.dot(self.W_hi))
        output_gate = cgt.sigmoid(x.dot(self.W_xo) + prev_h.dot(self.W_ho))
        candidate_values = cgt.tanh(x.dot(self.W_xc) + prev_h.dot(self.W_hc))

        # new cell state
        next_c = forget_gate * prev_c + input_gate * candidate_values
        # input for next timestep
        next_h = output_gate * cgt.tanh(next_c)

        # NOTE: we feed next_h into the next layer and the next timestep
        # so we should clone the next_h output.
        return next_c, next_h
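This is the standard LSTM update (biases omitted, as in the code above; f, i, o are the forget, input, and output gates, \odot is elementwise multiplication):

    f_t = \sigma(x_t W_{xf} + h_{t-1} W_{hf})
    i_t = \sigma(x_t W_{xi} + h_{t-1} W_{hi})
    o_t = \sigma(x_t W_{xo} + h_{t-1} W_{ho})
    \tilde{c}_t = \tanh(x_t W_{xc} + h_{t-1} W_{hc})
    c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c}_t
    h_t = o_t \odot \tanh(c_t)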
Example #4
def make_ff_controller(opt):

    b, h, m, p, k = opt.b, opt.h, opt.m, opt.p, opt.k

    H = 2*h
    in_size = k + h*m
    out_size = H*m + H + H + H*3 + H + h*m + h*m + p

    # Previous reads
    r_bhm = cgt.tensor3("r", fixed_shape = (b,h,m))
    # External inputs
    X_bk = cgt.matrix("x", fixed_shape = (b,k))
    r_b_hm = r_bhm.reshape([r_bhm.shape[0], r_bhm.shape[1]*r_bhm.shape[2]])
    # Input to controller
    inp_bq = cgt.concatenate([X_bk, r_b_hm], axis=1)

    hid_sizes = opt.ff_hid_sizes
    activation = cgt.tanh

    layer_out_sizes = [in_size] + hid_sizes + [out_size]
    last_out = inp_bq
    # feedforward part. we could simplify a bit by using nn.Affine
    for i in xrange(len(layer_out_sizes)-1):
        indim = layer_out_sizes[i]
        outdim = layer_out_sizes[i+1]        
        W = cgt.shared(.02*nr.randn(indim, outdim), name="W%i"%i, fixed_shape_mask="all")
        bias = cgt.shared(.02*nr.randn(1, outdim), name="b%i"%i, fixed_shape_mask="all")
        last_out = cgt.broadcast("+",last_out.dot(W),bias,"xx,1x")
        # Don't apply nonlinearity at the last layer
        if i != len(layer_out_sizes)-2: last_out = activation(last_out)

    # Split the controller output into the NTM head parameters
    # (key k, key strength beta, interpolation gate g, shift weights s,
    # sharpening factor gamma, erase vector e, add vector a) and the prediction y.
    idx = 0
    k_bHm = last_out[:,idx:idx+H*m];      idx += H*m;         k_bHm = k_bHm.reshape([b,H,m])
    beta_bH = last_out[:,idx:idx+H];      idx += H
    g_bH = last_out[:,idx:idx+H];         idx += H
    s_bH3 = last_out[:,idx:idx+3*H];      idx += 3*H;         s_bH3 = s_bH3.reshape([b,H,3])
    gamma_bH = last_out[:,idx:idx+H];     idx += H
    e_bhm = last_out[:,idx:idx+h*m];      idx += h*m;         e_bhm = e_bhm.reshape([b,h,m])
    a_bhm = last_out[:,idx:idx+h*m];      idx += h*m;         a_bhm = a_bhm.reshape([b,h,m])
    y_bp = last_out[:,idx:idx+p];         idx += p

    k_bHm = cgt.tanh(k_bHm)
    beta_bH = nn.softplus(beta_bH)
    g_bH = cgt.sigmoid(g_bH)
    s_bH3 = sum_normalize2(cgt.exp(s_bH3))
    gamma_bH = cgt.sigmoid(gamma_bH)+1
    e_bhm = cgt.sigmoid(e_bhm)
    a_bhm = cgt.tanh(a_bhm)
    # y_bp = y_bp

    assert infer_shape(k_bHm) == (b,H,m)
    assert infer_shape(beta_bH) == (b,H)
    assert infer_shape(g_bH) == (b,H)
    assert infer_shape(s_bH3) == (b,H,3)
    assert infer_shape(gamma_bH) == (b,H)
    assert infer_shape(e_bhm) == (b,h,m)
    assert infer_shape(a_bhm) == (b,h,m)
    assert infer_shape(y_bp) == (b,p)

    return nn.Module([r_bhm, X_bk], [k_bHm, beta_bH, g_bH, s_bH3, gamma_bH, e_bhm, a_bhm, y_bp])
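The slicing of last_out above only works because out_size is exactly the total width of the chunks carved off afterwards. A minimal sanity check of that arithmetic in plain Python (the values of h, m, and p are hypothetical, only to make the check concrete):

# hypothetical sizes: read heads, memory width, output size
h, m, p = 4, 10, 8
H = 2 * h                       # read heads plus write heads
# widths of k, beta, g, s, gamma, e, a, y in the order they are sliced off
chunks = [H * m, H, H, 3 * H, H, h * m, h * m, p]
out_size = H * m + H + H + H * 3 + H + h * m + h * m + p
assert sum(chunks) == out_size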
Example #6
def make_deep_gru(size_input, size_mem, n_layers, size_output, size_batch):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    for i_layer in xrange(n_layers):
        # note that inputs[0] is the external input, so we add 1
        prev_h = inputs[i_layer + 1]
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        update_gate = cgt.sigmoid(
            nn.Affine(size_x, size_mem, name="i2u")(x) +
            nn.Affine(size_mem, size_mem, name="h2u")(prev_h))
        reset_gate = cgt.sigmoid(
            nn.Affine(size_x, size_mem, name="i2r")(x) +
            nn.Affine(size_mem, size_mem, name="h2r")(prev_h))
        gated_hidden = reset_gate * prev_h
        p2 = nn.Affine(size_mem, size_mem)(gated_hidden)
        p1 = nn.Affine(size_x, size_mem)(x)
        hidden_target = cgt.tanh(p1 + p2)
        next_h = (1.0 - update_gate) * prev_h + update_gate * hidden_target
        outputs.append(next_h)
    category_activations = nn.Affine(size_mem, size_output,
                                     name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    return nn.Module(inputs, outputs)
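The module built above maps [x, h_1, ..., h_n] to [h_1', ..., h_n', logprobs], so running it over a sequence means feeding the hidden outputs back in at every step. A rough sketch of that loop (the helper name unroll_gru and its argument names are ours, not part of the original code; the call pattern mirrors how the NTM module is stepped in make_funcs further down):

def unroll_gru(network, xs, init_hiddens):
    # xs: list of per-timestep input matrices
    # init_hiddens: list of n_layers initial hidden-state matrices
    hiddens = list(init_hiddens)
    logprob_seq = []
    for x_t in xs:
        outs = network([x_t] + hiddens)
        hiddens = outs[:-1]             # updated hidden state of every layer
        logprob_seq.append(outs[-1])    # per-timestep log-probabilities
    return logprob_seq, hiddens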
Example #7
    def __call__(self,M,*inputs):
        assert len(inputs) == len(self.Wizs)
        n = M.shape[0]
        # update gate
        summands = [Xi.dot(Wiz) for (Xi,Wiz) in zip(inputs,self.Wizs)] + [M.dot(self.Wmz),cgt.repeat(self.bz,n, axis=0)]
        z = cgt.sigmoid(cgt.add_multi(summands))

        # reset gate
        summands = [Xi.dot(Wir) for (Xi,Wir) in zip(inputs,self.Wirs)] + [M.dot(self.Wmr),cgt.repeat(self.br,n, axis=0)]
        r = cgt.sigmoid(cgt.add_multi(summands))

        # candidate memory, computed from the reset-gated current memory
        summands = [Xi.dot(Wim) for (Xi,Wim) in zip(inputs,self.Wims)] + [(r*M).dot(self.Wmm),cgt.repeat(self.bm,n, axis=0)]
        Mtarg = cgt.tanh(cgt.add_multi(summands)) #pylint: disable=E1111

        # convex combination of old and candidate memory
        Mnew = (1-z)*M + z*Mtarg
        return Mnew
    def __init__(self, x, n_in, n_hid, n_out, nlayers=1, y=None, eps=None):
        super(GaussianMLP, self).__init__(x, n_in, n_hid, nlayers=nlayers, prefix="GaussianMLP_hidden")
        self.mu_layer = HiddenLayer(
            input=self.hidden_layers[-1].output,
            n_in=self.hidden_layers[-1].n_out,
            n_out=n_out,
            activation=None,
            prefix="GaussianMLP_mu"
        )
        # log(sigma^2)
        self.logvar_layer = HiddenLayer(
            input=self.hidden_layers[-1].output,
            n_in=self.hidden_layers[-1].n_out,
            n_out=n_out,
            activation=None,
            prefix="GaussianMLP_logvar"
        )
        self.mu = self.mu_layer.output
        self.var = cgt.exp(self.logvar_layer.output)
        self.sigma = cgt.sqrt(self.var)
        self.params = self.params + self.mu_layer.params + \
            self.logvar_layer.params
        # for use as encoder: sample via the reparameterization trick
        if eps is not None:
            assert y is None
            self.out = self.mu + self.sigma * eps
        # for use as decoder
        if y is not None:
            assert eps is None
            self.out = cgt.sigmoid(self.mu)
            self.cost = -cgt.sum(log_diag_mvn(self.out, self.var)(y))
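The encoder/decoder split above is the usual VAE reparameterization: the encoder samples with the reparameterization trick, and, assuming log_diag_mvn(mu, var) evaluates the log-density of a Gaussian with diagonal covariance, the decoder scores y under such a Gaussian centered at sigmoid(\mu):

    z = \mu + \sigma \odot \epsilon, \qquad \epsilon \sim \mathcal{N}(0, I)
    \log \mathcal{N}(y; m, \mathrm{diag}(\sigma^2)) = -\tfrac{1}{2} \sum_i \left( \log(2\pi\sigma_i^2) + \frac{(y_i - m_i)^2}{\sigma_i^2} \right), \quad m = \mathrm{sigmoid}(\mu)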
Example #9
def make_deep_lstm(size_input, size_mem, n_layers, size_output, size_batch):
    inputs = [cgt.matrix(fixed_shape=(size_batch, size_input))]
    for _ in xrange(2 * n_layers):
        inputs.append(cgt.matrix(fixed_shape=(size_batch, size_mem)))
    outputs = []
    for i_layer in xrange(n_layers):
        # inputs[0] is the external input; per-layer states follow as (c, h) pairs
        prev_c = inputs[i_layer * 2 + 1]
        prev_h = inputs[i_layer * 2 + 2]
        if i_layer == 0:
            x = inputs[0]
            size_x = size_input
        else:
            x = outputs[(i_layer - 1) * 2 + 1]  # previous layer's next_h
            size_x = size_mem
        input_sums = (nn.Affine(size_x, 4 * size_mem)(x) +
                      nn.Affine(size_mem, 4 * size_mem)(prev_h))
        sigmoid_chunk = cgt.sigmoid(input_sums[:, 0:3 * size_mem])
        in_gate = sigmoid_chunk[:, 0:size_mem]
        forget_gate = sigmoid_chunk[:, size_mem:2 * size_mem]
        out_gate = sigmoid_chunk[:, 2 * size_mem:3 * size_mem]
        in_transform = cgt.tanh(input_sums[:, 3 * size_mem:4 * size_mem])
        next_c = forget_gate * prev_c + in_gate * in_transform
        next_h = out_gate * cgt.tanh(next_c)
        outputs.append(next_c)
        outputs.append(next_h)

    category_activations = nn.Affine(size_mem, size_output)(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    return nn.Module(inputs, outputs)
Example #10
def make_deep_lstm(size_input, size_mem, n_layers, size_output, size_batch):
    inputs = [cgt.matrix(fixed_shape=(size_batch, size_input))]
    for _ in xrange(2*n_layers):
        inputs.append(cgt.matrix(fixed_shape=(size_batch, size_mem)))
    outputs = []
    for i_layer in xrange(n_layers):
        # inputs[0] is the external input; per-layer states follow as (c, h) pairs
        prev_c = inputs[i_layer*2+1]
        prev_h = inputs[i_layer*2+2]
        if i_layer==0:
            x = inputs[0]
            size_x = size_input
        else:
            x = outputs[(i_layer-1)*2+1]  # previous layer's next_h
            size_x = size_mem
        input_sums = nn.Affine(size_x, 4*size_mem)(x) + nn.Affine(size_mem, 4*size_mem)(prev_h)
        sigmoid_chunk = cgt.sigmoid(input_sums[:,0:3*size_mem])
        in_gate = sigmoid_chunk[:,0:size_mem]
        forget_gate = sigmoid_chunk[:,size_mem:2*size_mem]
        out_gate = sigmoid_chunk[:,2*size_mem:3*size_mem]
        in_transform = cgt.tanh(input_sums[:,3*size_mem:4*size_mem])
        next_c = forget_gate*prev_c + in_gate * in_transform
        next_h = out_gate*cgt.tanh(next_c)
        outputs.append(next_c)
        outputs.append(next_h)

    category_activations = nn.Affine(size_mem, size_output)(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    return nn.Module(inputs, outputs)
Example #11
def hybrid_network(size_in, size_out, num_units, num_stos, dbg_out=None):
    assert len(num_units) == len(num_stos)
    if dbg_out is None:  # avoid a shared mutable default argument
        dbg_out = {}
    net_in = cgt.matrix("X", fixed_shape=(None, size_in))
    prev_num_units, prev_out = size_in, net_in
    dbg_out['NET~in'] = net_in
    curr_layer = 1
    for (curr_num_units, curr_num_sto) in zip(num_units, num_stos):
        assert curr_num_units >= curr_num_sto >= 0
        prev_out = combo_layer(
            prev_out,
            prev_num_units,
            curr_num_units, (curr_num_sto, ),
            s_funcs=s_func_ip,
            o_funcs=(lambda x: cgt.bernoulli(cgt.sigmoid(x)), cgt.nn.rectify),
            name=str(curr_layer),
            dbg_out=dbg_out)
        dbg_out['L%d~out' % curr_layer] = prev_out
        prev_num_units = curr_num_units
        curr_layer += 1
    net_out = nn.Affine(prev_num_units,
                        size_out,
                        name="InnerProd(%d->%d)" %
                        (prev_num_units, size_out))(prev_out)
    dbg_out['NET~out'] = net_out
    return net_in, net_out
Example #12
    def __call__(self, M, *inputs):
        assert len(inputs) == len(self.Wizs)
        n = M.shape[0]
        summands = [Xi.dot(Wiz) for (Xi, Wiz) in zip(inputs, self.Wizs)] + [
            M.dot(self.Wmz), cgt.repeat(self.bz, n, axis=0)
        ]
        z = cgt.sigmoid(cgt.add_multi(summands))

        summands = [Xi.dot(Wir) for (Xi, Wir) in zip(inputs, self.Wirs)] + [
            M.dot(self.Wmr), cgt.repeat(self.br, n, axis=0)
        ]
        r = cgt.sigmoid(cgt.add_multi(summands))

        summands = [Xi.dot(Wim) for (Xi, Wim) in zip(inputs, self.Wims)] + [
            (r * M).dot(self.Wmm), cgt.repeat(self.bm, n, axis=0)
        ]
        Mtarg = cgt.tanh(cgt.add_multi(summands))  #pylint: disable=E1111

        Mnew = (1 - z) * M + z * Mtarg
        return Mnew
Example #13
def hybrid_layer(X, size_in, size_out, size_random, dbg_out=None):
    if dbg_out is None:  # avoid a shared mutable default argument
        dbg_out = []
    assert size_out >= size_random >= 0
    out = cgt.sigmoid(nn.Affine(
        size_in, size_out, name="InnerProd(%d->%d)" % (size_in, size_out)
    )(X))
    dbg_out.append(out)
    if size_random == 0:
        return out
    if size_random == size_out:
        out_s = cgt.bernoulli(out)
        return out_s
    out_s = cgt.bernoulli(out[:, :size_random])
    out = cgt.concatenate([out_s, out[:, size_random:]], axis=1)
    return out
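In other words, the first size_random units of the layer are sampled as Bernoulli variables whose means are the sigmoid activations, while the remaining units pass their activations through deterministically:

    a = \sigma(X W + b)
    h_j \sim \mathrm{Bernoulli}(a_j) \;\text{ for } j < \text{size\_random}, \qquad h_j = a_j \;\text{ otherwise}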
Example #14
def make_deep_gru(size_input, size_mem, n_layers, size_output, size_batch):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers+1)]
    outputs = []
    for i_layer in xrange(n_layers):
        prev_h = inputs[i_layer+1] # note that inputs[0] is the external input, so we add 1
        x = inputs[0] if i_layer==0 else outputs[i_layer-1]
        size_x = size_input if i_layer==0 else size_mem
        update_gate = cgt.sigmoid(
            nn.Affine(size_x, size_mem,name="i2u")(x)
            + nn.Affine(size_mem, size_mem, name="h2u")(prev_h))
        reset_gate = cgt.sigmoid(
            nn.Affine(size_x, size_mem,name="i2r")(x)
            + nn.Affine(size_mem, size_mem, name="h2r")(prev_h))
        gated_hidden = reset_gate * prev_h
        p2 = nn.Affine(size_mem, size_mem)(gated_hidden)
        p1 = nn.Affine(size_x, size_mem)(x)
        hidden_target = cgt.tanh(p1+p2)
        next_h = (1.0-update_gate)*prev_h + update_gate*hidden_target
        outputs.append(next_h)
    category_activations = nn.Affine(size_mem, size_output,name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    return nn.Module(inputs, outputs)
Example #15
def hybrid_layer(X, size_in, size_out, size_random, dbg_out=None):
    if dbg_out is None:  # avoid a shared mutable default argument
        dbg_out = []
    assert size_out >= size_random >= 0
    out = cgt.sigmoid(
        nn.Affine(size_in,
                  size_out,
                  name="InnerProd(%d->%d)" % (size_in, size_out))(X))
    dbg_out.append(out)
    if size_random == 0:
        return out
    if size_random == size_out:
        out_s = cgt.bernoulli(out)
        return out_s
    out_s = cgt.bernoulli(out[:, :size_random])
    out = cgt.concatenate([out_s, out[:, size_random:]], axis=1)
    return out
Example #16
def test_get_context():
    batch_size = 32
    feat_t_steps = 3
    feat_num_features = 30
    state_num_features = 20
    num_out_classes = 28
    feats = cgt.tensor3(fixed_shape=(batch_size, feat_t_steps, feat_num_features))
    prev_out = cgt.matrix(fixed_shape=(batch_size, state_num_features))
    sigmoided = cgt.sigmoid(prev_out)
    s = nnbuilder.Seq2Seq(nn_input_btf=feats, num_out_classes=num_out_classes, feature_size=feat_num_features, decoder_size=state_num_features)
    mm = cgt.infer_shape(s.features_post_mlp_btf)
    assert mm == (batch_size, feat_t_steps, feat_num_features)
    context_out = s.get_context(sigmoided)
    out_fun = cgt.function([feats, prev_out], [context_out])
    tau = np.reshape(np.random.normal(0.1, 0.2, batch_size*feat_t_steps*feat_num_features), (batch_size, feat_t_steps, feat_num_features))
    tau2 = np.reshape(np.random.normal(0.1, 0.2, batch_size*state_num_features), (batch_size, state_num_features))
    m = out_fun(tau, tau2)[0]
    assert m.shape == (batch_size, feat_num_features)
    assert np.mean(m) < 1
Example #17
    def get_context_backup(self, prev_state_bf):
        state_step_bf = cgt.sigmoid(self.states_mlp_bf(prev_state_bf))

        # NOTE: the number of timesteps (3) is hardcoded here
        product_list = []
        for time_step in range(0, 3):
            inner_product = cgt.sum(state_step_bf*self.features_post_mlp_btf[:, time_step, :], axis=1)
            product_list.append(inner_product)
        st = cgt.stack(product_list)
        st = cgt.dimshuffle(st, [1, 0])
        softmax_weights = softmax(st)

        # softmax-weighted sum of the feature vectors
        context_sum = None

        for time_step in range(0, 3):
            softmax_t_step = cgt.dimshuffle(softmax_weights[:, time_step], [0, 'x'])
            if context_sum is None:
                context_sum = cgt.broadcast('*', softmax_t_step, self.features_post_mlp_btf[:, time_step, :], 'x1,xx')
            else:
                context_sum += cgt.broadcast('*', softmax_t_step, self.features_post_mlp_btf[:, time_step, :], 'x1,xx')

        return context_sum
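This is soft attention over the (hard-coded) three timesteps: each feature vector f_t is scored by an inner product with the transformed state, the scores are softmax-normalized, and the context is the weighted sum of the feature vectors (g denotes the states MLP applied to the previous state):

    s_t = \langle \sigma(g(h_{\mathrm{prev}})), f_t \rangle
    \alpha = \mathrm{softmax}(s_1, s_2, s_3)
    c = \sum_t \alpha_t f_t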
Example #18
def hybrid_network(size_in, size_out, num_units, num_stos, dbg_out=None):
    assert len(num_units) == len(num_stos)
    if dbg_out is None:  # avoid a shared mutable default argument
        dbg_out = {}
    net_in = cgt.matrix("X", fixed_shape=(None, size_in))
    prev_num_units, prev_out = size_in, net_in
    dbg_out['NET~in'] = net_in
    curr_layer = 1
    for (curr_num_units, curr_num_sto) in zip(num_units, num_stos):
        assert curr_num_units >= curr_num_sto >= 0
        prev_out = combo_layer(prev_out, prev_num_units, curr_num_units,
                               (curr_num_sto,),
                               s_funcs=s_func_ip,
                               o_funcs=(lambda x: cgt.bernoulli(cgt.sigmoid(x)), cgt.nn.rectify),
                               name=str(curr_layer), dbg_out=dbg_out)
        dbg_out['L%d~out' % curr_layer] = prev_out
        prev_num_units = curr_num_units
        curr_layer += 1
    net_out = nn.Affine(prev_num_units, size_out,
                        name="InnerProd(%d->%d)" % (prev_num_units, size_out)
                        )(prev_out)
    dbg_out['NET~out'] = net_out
    return net_in, net_out
def make_funcs(opt, ntm, total_time, loss_timesteps):
    x_tbk = cgt.tensor3("x", fixed_shape=(total_time, opt.b, opt.k))
    y_tbp = cgt.tensor3("y", fixed_shape=(total_time, opt.b, opt.p))
    loss_timesteps = set(loss_timesteps)

    initial_states = make_ntm_initial_states(opt)
    params = ntm.get_parameters() + get_parameters(initial_states)
    # params = ntm.get_parameters()

    lossCE = 0
    loss01 = 0

    state_arrs = initial_states
    for t in xrange(total_time):
        tmp = ntm([x_tbk[t]] + state_arrs)
        raw_pred = tmp[0]
        state_arrs = tmp[1:4]

        if t in loss_timesteps:
            p_pred = cgt.sigmoid(raw_pred)
            ce = bernoulli_crossentropy(
                y_tbp[t],
                p_pred).sum()  # cross-entropy of bernoulli distribution
            lossCE = lossCE + ce
            loss01 = loss01 + cgt.cast(cgt.equal(y_tbp[t], round01(p_pred)),
                                       cgt.floatX).sum()

    # average over loss timesteps, outputs, and batch; divide by log(2) to report bits
    lossCE = lossCE / (len(loss_timesteps) * opt.p * opt.b) / np.log(2)
    # note: loss01 is really the fraction of correctly rounded output bits
    loss01 = loss01 / (len(loss_timesteps) * opt.p * opt.b)
    gradloss = cgt.grad(lossCE, params)

    flatgrad = flatcat(gradloss)

    f_loss = cgt.function([x_tbk, y_tbp], lossCE)
    f_loss_and_grad = cgt.function([x_tbk, y_tbp], [lossCE, loss01, flatgrad])

    print "number of nodes in computation graph:", core.count_nodes(
        [lossCE, loss01, flatgrad])

    return f_loss, f_loss_and_grad, params
Example #20
def lstm_block(h_prev, c_prev, x_curr, size_x, size_c, name=''):
    """
    Construct a LSTM cell block of specified number of cells

    :param h_prev: self activations at previous time step
    :param c_prev: self memory state at previous time step
    :param x_curr: inputs from previous layer at current time step
    :param size_x: size of inputs
    :param size_c: size of both c and h
    :return: c and h at current time step
    :rtype:
    """
    input_sums = nn.Affine(size_x, 4 * size_c, name=name+'*x')(x_curr) + \
                 nn.Affine(size_c, 4 * size_c, name=name+'*h')(h_prev)
    c_new = cgt.tanh(input_sums[:, 3 * size_c:])
    sigmoid_chunk = cgt.sigmoid(input_sums[:, :3 * size_c])
    in_gate = sigmoid_chunk[:, :size_c]
    forget_gate = sigmoid_chunk[:, size_c:2 * size_c]
    out_gate = sigmoid_chunk[:, 2 * size_c:3 * size_c]
    c_curr = forget_gate * c_prev + in_gate * c_new
    h_curr = out_gate * cgt.tanh(c_curr)
    return c_curr, h_curr
Example #21
def lstm_block(h_prev, c_prev, x_curr, size_x, size_c, name=''):
    """
    Construct a LSTM cell block of specified number of cells

    :param h_prev: self activations at previous time step
    :param c_prev: self memory state at previous time step
    :param x_curr: inputs from previous layer at current time step
    :param size_x: size of inputs
    :param size_c: size of both c and h
    :return: c and h at current time step
    :rtype:
    """
    input_sums = nn.Affine(size_x, 4 * size_c, name=name+'*x')(x_curr) + \
                 nn.Affine(size_c, 4 * size_c, name=name+'*h')(h_prev)
    c_new = cgt.tanh(input_sums[:, 3*size_c:])
    sigmoid_chunk = cgt.sigmoid(input_sums[:, :3*size_c])
    in_gate = sigmoid_chunk[:, :size_c]
    forget_gate = sigmoid_chunk[:, size_c:2*size_c]
    out_gate = sigmoid_chunk[:, 2*size_c:3*size_c]
    c_curr = forget_gate * c_prev + in_gate * c_new
    h_curr = out_gate * cgt.tanh(c_curr)
    return c_curr, h_curr
def make_funcs(opt, ntm, total_time, loss_timesteps):    
    x_tbk = cgt.tensor3("x", fixed_shape=(total_time, opt.b, opt.k))
    y_tbp = cgt.tensor3("y", fixed_shape=(total_time, opt.b, opt.p))
    loss_timesteps = set(loss_timesteps)

    initial_states = make_ntm_initial_states(opt)
    params = ntm.get_parameters() + get_parameters(initial_states)
    # params = ntm.get_parameters()

    lossCE = 0
    loss01 = 0

    state_arrs = initial_states
    for t in xrange(total_time):
        tmp = ntm([x_tbk[t]] + state_arrs)
        raw_pred = tmp[0]
        state_arrs = tmp[1:4]

        if t in loss_timesteps:
            p_pred = cgt.sigmoid(raw_pred)
            ce = bernoulli_crossentropy(y_tbp[t], p_pred).sum() # cross-entropy of bernoulli distribution
            lossCE = lossCE + ce
            loss01 = loss01 + cgt.cast(cgt.equal(y_tbp[t], round01(p_pred)),cgt.floatX).sum()


    lossCE = lossCE / (len(loss_timesteps) * opt.p * opt.b) / np.log(2)
    loss01 = loss01 / (len(loss_timesteps) * opt.p * opt.b)
    gradloss = cgt.grad(lossCE, params)

    flatgrad = flatcat(gradloss)

    f_loss = cgt.function([x_tbk, y_tbp], lossCE)
    f_loss_and_grad = cgt.function([x_tbk, y_tbp], [lossCE, loss01, flatgrad])

    print "number of nodes in computation graph:", core.count_nodes([lossCE, loss01, flatgrad])

    return f_loss, f_loss_and_grad, params
Example #23
def sigmoid(x):
    return cgt.sigmoid(x)
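This thin wrapper just exposes the logistic sigmoid:

    \sigma(x) = \frac{1}{1 + e^{-x}}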