Example #1
def make_deep_lstm(size_input, size_mem, n_layers, size_output, size_batch):
    inputs = [cgt.matrix(fixed_shape=(size_batch, size_input))]
    for _ in xrange(2 * n_layers):
        inputs.append(cgt.matrix(fixed_shape=(size_batch, size_mem)))
    outputs = []
    for i_layer in xrange(n_layers):
        prev_h = inputs[i_layer * 2]
        prev_c = inputs[i_layer * 2 + 1]
        if i_layer == 0:
            x = inputs[0]
            size_x = size_input
        else:
            x = outputs[(i_layer - 1) * 2]
            size_x = size_mem
        input_sums = nn.Affine(size_x, 4 * size_mem)(x) + nn.Affine(
            size_x, 4 * size_mem)(prev_h)
        sigmoid_chunk = cgt.sigmoid(input_sums[:, 0:3 * size_mem])
        in_gate = sigmoid_chunk[:, 0:size_mem]
        forget_gate = sigmoid_chunk[:, size_mem:2 * size_mem]
        out_gate = sigmoid_chunk[:, 2 * size_mem:3 * size_mem]
        in_transform = cgt.tanh(input_sums[:, 3 * size_mem:4 * size_mem])
        next_c = forget_gate * prev_c + in_gate * in_transform
        next_h = out_gate * cgt.tanh(next_c)
        outputs.append(next_c)
        outputs.append(next_h)

    category_activations = nn.Affine(size_mem, size_output)(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    return nn.Module(inputs, outputs)
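
A minimal usage sketch for the module above, following the call patterns of Examples #7 and #8 (the sizes and zero initial states here are illustrative assumptions, not part of the original):

size_input, size_mem, n_layers, size_output, size_batch = 16, 32, 2, 10, 4
lstm = make_deep_lstm(size_input, size_mem, n_layers, size_output, size_batch)

# Fresh symbolic input plus the 2*n_layers state matrices for a single step.
X = cgt.matrix(fixed_shape=(size_batch, size_input))
states = [cgt.matrix(fixed_shape=(size_batch, size_mem)) for _ in xrange(2 * n_layers)]
step_outputs = lstm([X] + states)   # per-layer next states, log-probabilities last
f_logprobs = cgt.function([X] + states, step_outputs[-1])

x0 = nr.randn(size_batch, size_input)
s0 = [np.zeros((size_batch, size_mem)) for _ in xrange(2 * n_layers)]
print f_logprobs(x0, *s0).shape     # (size_batch, size_output)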
Example #2
def make_deep_gru(size_input, size_mem, n_layers, size_output, size_batch):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    for i_layer in xrange(n_layers):
        prev_h = inputs[
            i_layer +
            1]  # note that inputs[0] is the external input, so we add 1
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        update_gate = cgt.sigmoid(
            nn.Affine(size_x, size_mem, name="i2u")(x) +
            nn.Affine(size_mem, size_mem, name="h2u")(prev_h))
        reset_gate = cgt.sigmoid(
            nn.Affine(size_x, size_mem, name="i2r")(x) +
            nn.Affine(size_mem, size_mem, name="h2r")(prev_h))
        gated_hidden = reset_gate * prev_h
        p2 = nn.Affine(size_mem, size_mem)(gated_hidden)
        p1 = nn.Affine(size_x, size_mem)(x)
        hidden_target = cgt.tanh(p1 + p2)
        next_h = (1.0 - update_gate) * prev_h + update_gate * hidden_target
        outputs.append(next_h)
    category_activations = nn.Affine(size_mem, size_output,
                                     name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    return nn.Module(inputs, outputs)
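
As a brief follow-up sketch (sizes illustrative), the GRU module's parameters can be pulled out and initialised in one flat vector, mirroring the ParamCollection pattern used in Example #8 below:

gru = make_deep_gru(size_input=16, size_mem=32, n_layers=2, size_output=10, size_batch=4)
params = gru.get_parameters()       # all nn.Affine weights and biases
pc = ParamCollection(params)
pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(),)))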
Example #3
def make_ff_controller(opt):

    b, h, m, p, k = opt.b, opt.h, opt.m, opt.p, opt.k

    H = 2*h
    in_size = k + h*m
    out_size = H*m + H + H + H*3 + H + h*m + h*m + p

    # Previous reads
    r_bhm = cgt.tensor3("r", fixed_shape = (b,h,m))
    # External inputs
    X_bk = cgt.matrix("x", fixed_shape = (b,k))
    r_b_hm = r_bhm.reshape([r_bhm.shape[0], r_bhm.shape[1]*r_bhm.shape[2]])
    # Input to controller
    inp_bq = cgt.concatenate([X_bk, r_b_hm], axis=1)

    hid_sizes = opt.ff_hid_sizes
    activation = cgt.tanh

    layer_out_sizes = [in_size] + hid_sizes + [out_size]
    last_out = inp_bq
    # feedforward part. we could simplify a bit by using nn.Affine
    for i in xrange(len(layer_out_sizes)-1):
        indim = layer_out_sizes[i]
        outdim = layer_out_sizes[i+1]        
        W = cgt.shared(.02*nr.randn(indim, outdim), name="W%i"%i, fixed_shape_mask="all")
        bias = cgt.shared(.02*nr.randn(1, outdim), name="b%i"%i, fixed_shape_mask="all")
        last_out = cgt.broadcast("+",last_out.dot(W),bias,"xx,1x")
        # Don't apply nonlinearity at the last layer
        if i != len(layer_out_sizes)-2: last_out = activation(last_out)

    idx = 0
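    # Slice the flat controller output into the per-head quantities, in the
    # same order used to build out_size above.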
    k_bHm = last_out[:,idx:idx+H*m];      idx += H*m;         k_bHm = k_bHm.reshape([b,H,m])
    beta_bH = last_out[:,idx:idx+H];      idx += H
    g_bH = last_out[:,idx:idx+H];         idx += H
    s_bH3 = last_out[:,idx:idx+3*H];      idx += 3*H;         s_bH3 = s_bH3.reshape([b,H,3])
    gamma_bH = last_out[:,idx:idx+H];     idx += H
    e_bhm = last_out[:,idx:idx+h*m];      idx += h*m;         e_bhm = e_bhm.reshape([b,h,m])
    a_bhm = last_out[:,idx:idx+h*m];      idx += h*m;         a_bhm = a_bhm.reshape([b,h,m])
    y_bp = last_out[:,idx:idx+p];         idx += p

    k_bHm = cgt.tanh(k_bHm)
    beta_bH = nn.softplus(beta_bH)
    g_bH = cgt.sigmoid(g_bH)
    s_bH3 = sum_normalize2(cgt.exp(s_bH3))
    gamma_bH = cgt.sigmoid(gamma_bH)+1
    e_bhm = cgt.sigmoid(e_bhm)
    a_bhm = cgt.tanh(a_bhm)
    # y_bp is left linear (no squashing)

    assert infer_shape(k_bHm) == (b,H,m)
    assert infer_shape(beta_bH) == (b,H)
    assert infer_shape(g_bH) == (b,H)
    assert infer_shape(s_bH3) == (b,H,3)
    assert infer_shape(gamma_bH) == (b,H)
    assert infer_shape(e_bhm) == (b,h,m)
    assert infer_shape(a_bhm) == (b,h,m)
    assert infer_shape(y_bp) == (b,p)

    return nn.Module([r_bhm, X_bk], [k_bHm, beta_bH, g_bH, s_bH3, gamma_bH, e_bhm, a_bhm, y_bp])
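
The controller only reads a handful of fields from opt; a hypothetical option bundle (argparse.Namespace and the sizes below are purely illustrative, not from the original) might look like:

from argparse import Namespace
opt = Namespace(b=8, h=2, m=20, p=10, k=6, ff_hid_sizes=[64, 64])
controller = make_ff_controller(opt)   # nn.Module with 2 inputs and 8 outputs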
Example #4
def make_ntm(opt):
    Mprev_bnm = cgt.tensor3("M", fixed_shape=(opt.b, opt.n, opt.m))
    X_bk = cgt.matrix("X", fixed_shape=(opt.b, opt.k))
    wprev_bHn = cgt.tensor3("w", fixed_shape=(opt.b, opt.h*2, opt.n))
    rprev_bhm = cgt.tensor3("r", fixed_shape=(opt.b, opt.h, opt.m))
    controller = make_ff_controller(opt)
    M_bnm, w_bHn, r_bhm, y_bp = ntm_step(opt, Mprev_bnm, X_bk, wprev_bHn, rprev_bhm, controller)
    # in this form it looks like a standard seq-to-seq model
    # external input and output are first elements
    ntm = nn.Module([X_bk, Mprev_bnm, wprev_bHn, rprev_bhm], [y_bp, M_bnm, w_bHn, r_bhm])
    return ntm
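
A sketch of how this step module might be unrolled over a short sequence, in the spirit of Example #7 (T and the initial state M0, w0, r0 are assumed to be given; they are not defined here):

ntm = make_ntm(opt)
Xs = [cgt.matrix(fixed_shape=(opt.b, opt.k), name="X%d" % t) for t in xrange(T)]
M, w, r = M0, w0, r0                   # assumed initial memory, head weightings, reads
ys = []
for x in Xs:
    y, M, w, r = ntm([x, M, w, r])     # same input/output order as the Module above
    ys.append(y)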
Example #5
def make_deep_rrnn_rot_relu(size_input, size_mem, n_layers, size_output,
                            size_batch_in, k_in, k_h):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    print 'input_size: ', size_input
    for i_layer in xrange(n_layers):
        prev_h = inputs[
            i_layer +
            1]  # note that inputs[0] is the external input, so we add 1
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        size_batch = prev_h.shape[0]

        xform_h_param = nn.TensorParam((2 * k_h, size_mem), name="rotxform")
        xform_h_non = xform_h_param.weight
        xform_h_non.props["is_rotation"] = True

        xform_h_norm = cgt.norm(xform_h_non, axis=1, keepdims=True)
        xform_h = cgt.broadcast('/', xform_h_non, xform_h_norm, "xx,x1")

        add_in_lin = nn.Affine(size_x, size_mem)(x)
        add_in_relu = nn.rectify(add_in_lin)

        prev_h_scaled = nn.scale_mag(prev_h)

        h_in_added = prev_h_scaled + add_in_relu
        inters_h = [h_in_added]

        colon = slice(None, None, None)

        for i in xrange(2 * k_h):
            inter_in = inters_h[-1]
            r_cur = xform_h[i, :]
            #r_cur = cgt.subtensor(xform_h, [i, colon])
            r_cur_2_transpose = cgt.reshape(r_cur, (size_mem, 1))
            r_cur_2 = cgt.reshape(r_cur, (1, size_mem))
            ref_cur = cgt.dot(cgt.dot(inter_in, r_cur_2_transpose), r_cur_2)
            inter_out = inter_in - 2 * ref_cur
            inters_h.append(inter_out)
        next_h = inters_h[-1]
        outputs.append(next_h)

    category_activations = nn.Affine(size_mem, size_output,
                                     name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    #print 'len outputs:', len(outputs)
    #print 'len inputs:', len(inputs)

    return nn.Module(inputs, outputs)
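
The inner loop above applies Householder-style reflections, v' = v - 2 (v . r) r, with each row r of xform_h normalised to unit length; a quick plain-NumPy check (illustrative values only) that such a reflection preserves the norm:

v = nr.randn(32)
r = nr.randn(32); r /= np.linalg.norm(r)         # unit reflection direction
v_ref = v - 2 * v.dot(r) * r
print np.linalg.norm(v), np.linalg.norm(v_ref)   # equal up to rounding error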
Example #6
def make_updater_fc_parallel():
    X = cgt.matrix("X", fixed_shape=(None, 28 * 28))
    y = cgt.vector("y", dtype='i8')
    stepsize = cgt.scalar("stepsize")

    loss = build_fc_return_loss(X, y)
    params = nn.get_parameters(loss)
    m = nn.Module([X, y], [loss])
    split_loss = 0
    for start in xrange(0, batch_size, batch_size // 4):
        sli = slice(start, start + batch_size // 4)
        split_loss += m([X[sli], y[sli]])[0]
    split_loss /= 4
    gparams = cgt.grad(split_loss, params)
    updates2 = [(p, p - stepsize * gp) for (p, gp) in zip(params, gparams)]
    return cgt.function([X, y, stepsize], split_loss, updates=updates2)
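
A hypothetical driver loop for the updater (build_fc_return_loss, batch_size and the minibatch iterator are assumed to come from the surrounding script; the stepsize is illustrative):

f_update = make_updater_fc_parallel()
for x_batch, y_batch in minibatches:        # assumed iterator of (X, y) arrays
    loss = f_update(x_batch, y_batch, 1e-3)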
Example #7
def lstm_network(T, size_in, size_out, num_units, num_mems, dbg_out={}):
    assert T > 0
    x, y, c_in, h_in, c_out, h_out = lstm_network_t(size_in, size_out,
                                                    num_units, num_mems,
                                                    dbg_out)
    f_lstm_t = nn.Module([x] + c_in + h_in, [y] + c_out + h_out)
    Xs = [
        cgt.matrix(fixed_shape=x.get_fixed_shape(), name="X%d" % t)
        for t in range(T)
    ]
    C_0 = [cgt.matrix(fixed_shape=_c.get_fixed_shape()) for _c in c_in]
    H_0 = [cgt.matrix(fixed_shape=_h.get_fixed_shape()) for _h in h_in]
    loss, C_t, H_t, Ys = [], C_0, H_0, []
    for t, x in enumerate(Xs):
        _out = f_lstm_t([x] + C_t + H_t)
        y, C_t, H_t = _out[0], _out[1:len(C_t) + 1], _out[1 + len(C_t):]
        Ys.append(y)
        if t == 0: C_1, H_1 = C_t, H_t
    C_T, H_T = C_t, H_t
    params = f_lstm_t.get_parameters()
    return params, Xs, Ys, C_0, H_0, C_T, H_T, C_1, H_1
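
Assuming lstm_network has been called and its return values are in scope, the unrolled graph can be compiled into an ordinary callable, e.g. (a sketch only):

# Map the T input frames plus the initial cell/hidden states to the final output.
f_last = cgt.function(Xs + C_0 + H_0, Ys[-1])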
Example #8
r_vec = nn.Affine(size_x, 2 * k_in * size_mem)(x)
r_non = cgt.reshape(r_vec, (size_batch, 2 * k_in, size_mem))
r_norm = cgt.norm(r_non, axis=2, keepdims=True)
r = cgt.broadcast('/', r_non, r_norm, "xxx,xx1")
prev_h_3 = cgt.reshape(prev_h, (size_batch, size_mem, 1))
inters = [prev_h_3]

for i in xrange(k_in * 2):
    inter_in = inters[-1]
    r_cur = r[:, i, :]
    r_cur_3_transpose = cgt.reshape(r_cur, (size_batch, 1, size_mem))
    r_cur_3 = cgt.reshape(r_cur, (size_batch, size_mem, 1))
    ref_cur = cgt.batched_matmul(
        r_cur_3, cgt.batched_matmul(r_cur_3_transpose, inter_in))
    inter_out = inter_in - ref_cur
    inters.append(inter_out)
h = inters[-1]

r_nn = nn.Module([x], [h])

params = r_nn.get_parameters()
pc = ParamCollection(params)
pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(), )))
func = cgt.function([x, prev_h], h)

x_in = nr.uniform(-.1, .1,
                  size=(size_batch * size_x)).reshape(size_batch, size_x)
h_in = np.zeros((size_batch, size_mem))
h_in[:, 0] = np.ones(size_batch)
h = func(x_in, h_in)
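
Note that, unlike Example #5, the update here subtracts the projection only once, v' = v - (v . r) r, so each step removes the component of the state along r rather than reflecting it; a quick NumPy check of that property (illustrative values only):

v = nr.randn(5)
r = nr.randn(5); r /= np.linalg.norm(r)   # unit direction, as after the cgt.norm division above
print np.dot(v - v.dot(r) * r, r)         # ~0: the result is orthogonal to r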