def ntm_address(opt, wprev_bhn, M_bnm, k_bhm, beta_bh, g_bh, s_bh3, gamma_bh):

    # Content addressing

    # Cosine similarity
    # take inner product along memory axis k * M
    numer_bhn = cgt.einsum("bhm,bnm->bhn", k_bhm, M_bnm) 
    # compute denominator |k| * |m|
    denom_bhn = cgt.broadcast("*",
        cgt.norm(k_bhm, axis=2, keepdims=True), # -> shape bh1
        cgt.norm(M_bnm, axis=2, keepdims=True).transpose([0,2,1]), # -> bn1 -> b1n
        "xx1,x1x"
    )
    csim_bhn =  numer_bhn / denom_bhn
    assert infer_shape(csim_bhn) == (opt.b, 2*opt.h, opt.n)
    # scale by beta
    tmp_bhn = cgt.broadcast("*", beta_bh[:,:,None], csim_bhn, "xx1,xxx")
    wc_bhn = sum_normalize2(cgt.exp( tmp_bhn ))
    # Interpolation
    g_bh1 = g_bh[:,:,None]
    wg_bhn = cgt.broadcast("*", wprev_bhn, (1 - g_bh1), "xxx,xx1") \
            + cgt.broadcast("*", wc_bhn, g_bh1, "xxx,xx1")
    # Shift
    wtil_bhn = circ_conv_1d(wg_bhn, s_bh3, axis=2)
    # Sharpening
    wfin_bhn = sum_normalize2(cgt.broadcast("**", wtil_bhn, gamma_bh.reshape([opt.b,2*opt.h,1]), "xxx,xx1"))

    b,h,n = opt.b, 2*opt.h, opt.n
    assert infer_shape(wtil_bhn) == (b,h,n)
    assert infer_shape(gamma_bh) == (b,h)
    assert infer_shape(gamma_bh[:,:,None]) == (b,h,1)
    return wfin_bhn
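For reference, the plain-NumPy sketch below mirrors what the graph above computes (content addressing, interpolation, circular shift with a width-3 filter, sharpening). The function name and shapes are hypothetical; it only illustrates the math and is not part of the CGT model.

import numpy as np

def ntm_address_np(wprev_bhn, M_bnm, k_bhm, beta_bh, g_bh, s_bh3, gamma_bh):
    # Content addressing: cosine similarity between keys and memory rows
    numer_bhn = np.einsum("bhm,bnm->bhn", k_bhm, M_bnm)
    denom_bhn = (np.linalg.norm(k_bhm, axis=2, keepdims=True)
                 * np.linalg.norm(M_bnm, axis=2)[:, None, :])
    wc_bhn = np.exp(beta_bh[:, :, None] * numer_bhn / denom_bhn)
    wc_bhn /= wc_bhn.sum(axis=2, keepdims=True)
    # Interpolation with the previous weighting
    g_bh1 = g_bh[:, :, None]
    wg_bhn = g_bh1 * wc_bhn + (1 - g_bh1) * wprev_bhn
    # Circular shift (offsets -1, 0, +1), same effect as circ_conv_1d below
    wtil_bhn = (s_bh3[:, :, 0:1] * np.roll(wg_bhn, 1, axis=2)
                + s_bh3[:, :, 1:2] * wg_bhn
                + s_bh3[:, :, 2:3] * np.roll(wg_bhn, -1, axis=2))
    # Sharpening
    wfin_bhn = wtil_bhn ** gamma_bh[:, :, None]
    return wfin_bhn / wfin_bhn.sum(axis=2, keepdims=True)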
Example #2
    def __call__(self, Y, U):
        if Y.ndim > (self.axis + 1):
            Y = Y.reshape(Y.shape[:self.axis] +
                          [cgt.mul_multi(Y.shape[self.axis:])])

        outer_YU = cgt.broadcast(
            '*', Y.dimshuffle(range(Y.ndim) + ['x']),
            U.dimshuffle([0] + ['x'] * self.axis + [1]),
            ''.join(['x'] * Y.ndim + ['1', ',', 'x'] + ['1'] * self.axis +
                    ['x']))
        bilinear = cgt.dot(
            outer_YU.reshape(
                (outer_YU.shape[0], cgt.mul_multi(outer_YU.shape[1:]))),
            self.M.reshape((self.y_dim, self.y_dim * self.u_dim)).T)
        if self.axis > 1:
            bilinear = bilinear.reshape((-1, ) + self.y_shape[:self.axis - 1] +
                                        (self.y_dim, ))
        linear = cgt.dot(U, self.N.T)
        if self.axis > 1:
            linear = linear.dimshuffle([0] + ['x'] * (self.axis - 1) + [1])
        activation = bilinear + linear
        if self.b is not None:
            activation = cgt.broadcast(
                '+', activation, self.b.dimshuffle(['x'] * self.axis + [0]),
                ''.join(['x'] * activation.ndim + [','] + ['1'] *
                        (activation.ndim - 1) + ['x']))
        activation = activation.reshape((-1, ) + self.y_shape)
        return activation
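As a rough NumPy sketch of the simplest case (self.axis == 1, 2-D Y), the layer above computes a bilinear term in Y and U plus a linear term in U and a bias. The names and shapes below are hypothetical and only illustrate the shape bookkeeping.

import numpy as np

def bilinear_forward_2d(Y, U, M, N, b):
    # Y: (batch, y_dim), U: (batch, u_dim)
    # M: (y_dim, y_dim, u_dim), N: (y_dim, u_dim), b: (y_dim,)
    batch, y_dim = Y.shape
    u_dim = U.shape[1]
    outer_YU = Y[:, :, None] * U[:, None, :]              # (batch, y_dim, u_dim)
    bilinear = outer_YU.reshape(batch, -1).dot(
        M.reshape(y_dim, y_dim * u_dim).T)                # (batch, y_dim)
    linear = U.dot(N.T)                                   # (batch, y_dim)
    return bilinear + linear + b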
Example #3
    def get_context(self, prev_state_bf):

        state_step_bf = self.states_mlp_bf(prev_state_bf)
        state_step_b1f = cgt.dimshuffle(state_step_bf, [0, 'x', 1])

        # Compute the inner product <phi(s_i), psi(h_u)> where phi and psi are MLPs.
        # The below line computes the pointwise product of phi(s_i) and psi(h_u) and then sums to get the inner product.
        # scalar_energies_vec_bt = cgt.sqrt(cgt.sum(cgt.broadcast('*', state_step_b1f, self.features_post_mlp_btf, 'x1x,xxx'), axis=2))

        # Compute tau=tanh(h_u*W + s_i*V), broadcasting to do all h_u mults at once.
        scalar_energies_vec_btf = cgt.tanh(cgt.broadcast('+', self.features_post_mlp_btf, state_step_b1f, 'xxx,x1x'))

        # The next two lines compute w^T*(tau) with a pointwise product and then a sum.
        scalar_energies_vec_btf = cgt.broadcast('*', self.mixing_vec_w, scalar_energies_vec_btf, '11x,xxx')
        scalar_energies_vec_bt = cgt.sum(scalar_energies_vec_btf, axis=2)

        # Softmax over the time dimension gives the blending weights.
        softmax_weights_bt = nn.softmax(scalar_energies_vec_bt, axis=1)

        # This weight multiplies all features.
        extended_softmax_bt1 = cgt.dimshuffle(softmax_weights_bt, [0, 1, 'x'])
        # Weight the features by their time-dependent softmax weights.
        pre_blended = cgt.broadcast('*', extended_softmax_bt1, self.features_post_mlp_btf, 'xx1,xxx')
        # Integrate out time.
        blended_features_bf = cgt.sum(pre_blended, axis=1)

        return blended_features_bf
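A NumPy sketch of the same additive attention step (the features and the state are assumed to have already passed through their MLPs); the helper names below are hypothetical.

import numpy as np

def softmax_np(x, axis):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def attention_context_np(features_btf, state_bf, w_f):
    # features_btf: psi(h_u) per time step (b, t, f); state_bf: phi(s_i) (b, f); w_f: mixing vector (f,)
    tau_btf = np.tanh(features_btf + state_bf[:, None, :])      # broadcast the state over time
    energies_bt = (tau_btf * w_f).sum(axis=2)                   # w^T tanh(h_u*W + s_i*V)
    weights_bt = softmax_np(energies_bt, axis=1)                # softmax over time
    return (weights_bt[:, :, None] * features_btf).sum(axis=1)  # blended features (b, f)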
Example #5
def make_deep_rrnn(size_input, size_mem, n_layers, size_output, size_batch_in, k_in, k_h):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers+1)]
    outputs = []
    print 'input_size: ', size_input
    for i_layer in xrange(n_layers):
        prev_h = inputs[i_layer+1] # note that inputs[0] is the external input, so we add 1
        x = inputs[0] if i_layer==0 else outputs[i_layer-1]
        size_x = size_input if i_layer==0 else size_mem
        size_batch = prev_h.shape[0]

        xform_h_param = nn.TensorParam((2 * k_h, size_mem), name="rotxform")
        xform_h_non = xform_h_param.weight
        xform_h_non.props["is_rotation"] = True
        xform_h_norm = cgt.norm(xform_h_non, axis=1, keepdims=True)
        xform_h = cgt.broadcast('/', xform_h_non, xform_h_norm, "xx,x1")

        r_vec = nn.Affine(size_x, 2 * k_in * size_mem)(x)
        r_non = cgt.reshape(r_vec, (size_batch, 2 * k_in, size_mem))
        r_norm = cgt.norm(r_non, axis=2, keepdims=True)
        r = cgt.broadcast('/', r_non, r_norm, "xxx,xx1")
        prev_h_3 = cgt.reshape(prev_h, (size_batch, size_mem, 1))
        inters_in = [prev_h_3]

        colon = slice(None, None, None)

        for i in xrange(2 * k_in):
            inter_in = inters_in[-1]
            r_cur = cgt.subtensor(r, [colon, i, colon])
            r_cur_3_transpose = cgt.reshape(r_cur, (size_batch, 1, size_mem))
            r_cur_3 = cgt.reshape(r_cur, (size_batch, size_mem, 1))
            ref_cur = cgt.batched_matmul(r_cur_3, cgt.batched_matmul(r_cur_3_transpose, inter_in))
            inter_out = inter_in - 2 * ref_cur
            inters_in.append(inter_out)

        h_in_rot = cgt.reshape(inters_in[-1], (size_batch, size_mem))
        inters_h = [h_in_rot]

        for i in xrange(2 * k_h):
            inter_in = inters_h[-1]
            r_cur = cgt.subtensor(xform_h, [i, colon])
            r_cur_2_transpose = cgt.reshape(r_cur, (size_mem, 1))
            r_cur_2 = cgt.reshape(r_cur, (1, size_mem))
            ref_cur = cgt.dot(cgt.dot(inter_in, r_cur_2_transpose), r_cur_2)
            inter_out = inter_in - 2 * ref_cur
            inters_h.append(inter_out)
        next_h = inters_h[-1]
        outputs.append(next_h)


    category_activations = nn.Affine(size_mem, size_output,name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    #print 'len outputs:', len(outputs)
    #print 'len inputs:', len(inputs)

    return nn.Module(inputs, outputs)
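The inner loops above repeatedly apply inter_out = inter_in - 2*r*(r^T inter_in) with a unit-norm r, i.e. a Householder reflection, which is orthogonal and therefore preserves the norm of the hidden state. A small NumPy check with hypothetical sizes:

import numpy as np

rng = np.random.RandomState(0)
size_mem = 4
h = rng.randn(size_mem)                 # one hidden-state vector
r = rng.randn(size_mem)
r /= np.linalg.norm(r)                  # unit reflection direction

h_reflected = h - 2 * r * r.dot(h)      # Householder reflection
assert np.isclose(np.linalg.norm(h_reflected), np.linalg.norm(h))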
def circ_conv_1d(wg_bhn, s_bh3, axis=2):
    "VERY inefficient way to implement circular convolution for the special case of filter size 3"
    assert axis == 2
    n = cgt.size(wg_bhn,2)
    wback = cgt.concatenate([wg_bhn[:,:,n-1:n], wg_bhn[:,:,:n-1]], axis=2)
    w = wg_bhn
    wfwd = cgt.concatenate([wg_bhn[:,:,1:n], wg_bhn[:,:,0:1]], axis=2)
    return cgt.broadcast("*", s_bh3[:,:,0:1] , wback, "xx1,xxx")\
     + cgt.broadcast("*", s_bh3[:,:,1:2] , w, "xx1,xxx")\
     + cgt.broadcast("*", s_bh3[:,:,2:3] , wfwd, "xx1,xxx")
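Since the filter width is fixed at 3, the concatenations above just build the two circular shifts of w. An explicit (and equally inefficient) NumPy reference with modular indexing, hypothetical shapes:

import numpy as np

def circ_conv_1d_np(w_bhn, s_bh3):
    b, h, n = w_bhn.shape
    out = np.zeros_like(w_bhn)
    for j in range(n):
        for d, off in enumerate((-1, 0, 1)):          # filter taps s[..., 0:3]
            out[:, :, j] += s_bh3[:, :, d] * w_bhn[:, :, (j + off) % n]
    return out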
def make_ff_controller(opt):

    b, h, m, p, k = opt.b, opt.h, opt.m, opt.p, opt.k

    H = 2*h
    in_size = k + h*m
    out_size = H*m + H + H + H*3 + H + h*m + h*m + p

    # Previous reads
    r_bhm = cgt.tensor3("r", fixed_shape = (b,h,m))
    # External inputs
    X_bk = cgt.matrix("x", fixed_shape = (b,k))
    r_b_hm = r_bhm.reshape([r_bhm.shape[0], r_bhm.shape[1]*r_bhm.shape[2]])
    # Input to controller
    inp_bq = cgt.concatenate([X_bk, r_b_hm], axis=1)

    hid_sizes = opt.ff_hid_sizes
    activation = cgt.tanh

    layer_out_sizes = [in_size] + hid_sizes + [out_size]
    last_out = inp_bq
    # feedforward part. we could simplify a bit by using nn.Affine
    for i in xrange(len(layer_out_sizes)-1):
        indim = layer_out_sizes[i]
        outdim = layer_out_sizes[i+1]        
        W = cgt.shared(.02*nr.randn(indim, outdim), name="W%i"%i, fixed_shape_mask="all")
        bias = cgt.shared(.02*nr.randn(1, outdim), name="b%i"%i, fixed_shape_mask="all")
        last_out = cgt.broadcast("+",last_out.dot(W),bias,"xx,1x")
        # Don't apply nonlinearity at the last layer
        if i != len(layer_out_sizes)-2: last_out = activation(last_out)

    idx = 0
    k_bHm = last_out[:,idx:idx+H*m];      idx += H*m;         k_bHm = k_bHm.reshape([b,H,m])
    beta_bH = last_out[:,idx:idx+H];      idx += H
    g_bH = last_out[:,idx:idx+H];         idx += H
    s_bH3 = last_out[:,idx:idx+3*H];      idx += 3*H;         s_bH3 = s_bH3.reshape([b,H,3])
    gamma_bH = last_out[:,idx:idx+H];     idx += H
    e_bhm = last_out[:,idx:idx+h*m];      idx += h*m;         e_bhm = e_bhm.reshape([b,h,m])
    a_bhm = last_out[:,idx:idx+h*m];      idx += h*m;         a_bhm = a_bhm.reshape([b,h,m])
    y_bp = last_out[:,idx:idx+p];         idx += p

    k_bHm = cgt.tanh(k_bHm)
    beta_bH = nn.softplus(beta_bH)
    g_bH = cgt.sigmoid(g_bH)
    s_bH3 = sum_normalize2(cgt.exp(s_bH3))
    gamma_bH = cgt.sigmoid(gamma_bH)+1
    e_bhm = cgt.sigmoid(e_bhm)
    a_bhm = cgt.tanh(a_bhm)
    # y_bp = y_bp

    assert infer_shape(k_bHm) == (b,H,m)
    assert infer_shape(beta_bH) == (b,H)
    assert infer_shape(g_bH) == (b,H)
    assert infer_shape(s_bH3) == (b,H,3)
    assert infer_shape(gamma_bH) == (b,H)
    assert infer_shape(e_bhm) == (b,h,m)
    assert infer_shape(a_bhm) == (b,h,m)
    assert infer_shape(y_bp) == (b,p)

    return nn.Module([r_bhm, X_bk], [k_bHm, beta_bH, g_bH, s_bH3, gamma_bH, e_bhm, a_bhm, y_bp])
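The controller emits one flat vector of width out_size per batch element and slices it into the addressing and write parameters. A quick sanity check, with hypothetical sizes, that the slice widths tile out_size exactly:

h, m, p, k = 2, 8, 4, 6                      # hypothetical head/memory/output/input sizes
H = 2 * h                                    # read heads + write heads
widths = [H*m, H, H, 3*H, H, h*m, h*m, p]    # k, beta, g, s, gamma, e, a, y
out_size = H*m + H + H + H*3 + H + h*m + h*m + p
assert sum(widths) == out_size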
Example #10
    def __call__(self, x):
        """
        x is the input

        Returns the output to feed as the input into the next layer.
        """

        return cgt.broadcast("+", x.dot(self.W), self.b, "xx,1x")
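The "xx,1x" pattern says the right operand is a row vector whose singleton first axis is broadcast over the batch; the NumPy equivalent (with hypothetical shapes) is an ordinary broadcasted bias add:

import numpy as np

x = np.random.randn(5, 3)        # (batch, features)   -> "xx"
W = np.random.randn(3, 4)
b = np.random.randn(1, 4)        # row vector          -> "1x"
out = x.dot(W) + b               # b is broadcast across the batch axis
assert out.shape == (5, 4)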
Example #12
    def get_context_backup(self, prev_state_bf):
        state_step_bf = cgt.sigmoid(self.states_mlp_bf(prev_state_bf))

        product_list = []
        for time_step in range(0, 3):
            inner_product = cgt.sum(state_step_bf*self.features_post_mlp_btf[:, time_step, :], axis=1)
            product_list.append(inner_product)
        st = cgt.stack(product_list)
        st = cgt.dimshuffle(st, [1, 0])
        softmax_weights = softmax(st)

        sum = None

        for time_step in range(0, 3):
            softmax_t_step = cgt.dimshuffle(softmax_weights[:, time_step], [0, 'x'])
            if sum is None:
                sum = cgt.broadcast('*', softmax_t_step, self.features_post_mlp_btf[:, time_step, :], 'x1,xx')
            else:
                sum += cgt.broadcast('*', softmax_t_step, self.features_post_mlp_btf[:, time_step, :], 'x1,xx')

        return sum
Example #13
def make_deep_rrnn_rot_relu(size_input, size_mem, n_layers, size_output,
                            size_batch_in, k_in, k_h):
    inputs = [cgt.matrix() for i_layer in xrange(n_layers + 1)]
    outputs = []
    print 'input_size: ', size_input
    for i_layer in xrange(n_layers):
        prev_h = inputs[i_layer + 1]  # note that inputs[0] is the external input, so we add 1
        x = inputs[0] if i_layer == 0 else outputs[i_layer - 1]
        size_x = size_input if i_layer == 0 else size_mem
        size_batch = prev_h.shape[0]

        xform_h_param = nn.TensorParam((2 * k_h, size_mem), name="rotxform")
        xform_h_non = xform_h_param.weight
        xform_h_non.props["is_rotation"] = True

        xform_h_norm = cgt.norm(xform_h_non, axis=1, keepdims=True)
        xform_h = cgt.broadcast('/', xform_h_non, xform_h_norm, "xx,x1")

        add_in_lin = nn.Affine(size_x, size_mem)(x)
        add_in_relu = nn.rectify(add_in_lin)

        prev_h_scaled = nn.scale_mag(prev_h)

        h_in_added = prev_h_scaled + add_in_relu
        inters_h = [h_in_added]

        colon = slice(None, None, None)

        for i in xrange(2 * k_h):
            inter_in = inters_h[-1]
            r_cur = xform_h[i, :]
            #r_cur = cgt.subtensor(xform_h, [i, colon])
            r_cur_2_transpose = cgt.reshape(r_cur, (size_mem, 1))
            r_cur_2 = cgt.reshape(r_cur, (1, size_mem))
            ref_cur = cgt.dot(cgt.dot(inter_in, r_cur_2_transpose), r_cur_2)
            inter_out = inter_in - 2 * ref_cur
            inters_h.append(inter_out)
        next_h = inters_h[-1]
        outputs.append(next_h)

    category_activations = nn.Affine(size_mem, size_output,
                                     name="pred")(outputs[-1])
    logprobs = nn.logsoftmax(category_activations)
    outputs.append(logprobs)

    #print 'len outputs:', len(outputs)
    #print 'len inputs:', len(inputs)

    return nn.Module(inputs, outputs)
def ntm_write(M_bnm, w_bhn, e_bhm, a_bhm):

    if False: # Here's the version that's faithful to the paper
        # weighted erases                  bhn1                bh1m
        # ideally we wouldn't create this big 4-tensor but this operation 
        # requires a more general kind of contraction than is provided by einsum
        we_bhmn = cgt.broadcast("*", w_bhn[:,:,:,None], e_bhm[:,:,None,:], "xxx1,xx1x")
        # take product of erasing factors
        mult_bmn = (1 - we_bhmn).prod(axis=1)
        M_bnm = M_bnm * mult_bmn # Equation 3 http://arxiv.org/pdf/1410.5401v2.pdf
    else: # This version just does a regular contraction
        erase_bnm = cgt.einsum( "bhn,bhm->bnm", w_bhn, e_bhm)
        M_bnm = M_bnm*(1-erase_bnm)

    # Now do the same thing with adds
    # But now it's just a regular contraction since we are adding rather than taking product
    add_bnm = cgt.einsum( "bhn,bhm->bnm", w_bhn, a_bhm)
    M_bnm = M_bnm + add_bnm

    return M_bnm
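A NumPy sketch of the same write step (names hypothetical). The faithful branch is Equation 3 of the NTM paper, a product over heads of per-head erase factors; the einsum branch replaces that product with a single contraction, which is exact for one write head and a first-order approximation otherwise.

import numpy as np

def ntm_write_np(M_bnm, w_bhn, e_bhm, a_bhm, faithful=True):
    if faithful:
        we_bhnm = w_bhn[:, :, :, None] * e_bhm[:, :, None, :]    # (b, h, n, m)
        M_bnm = M_bnm * (1 - we_bhnm).prod(axis=1)                # product over heads
    else:
        M_bnm = M_bnm * (1 - np.einsum("bhn,bhm->bnm", w_bhn, e_bhm))
    return M_bnm + np.einsum("bhn,bhm->bnm", w_bhn, a_bhm)        # Equation 4: adds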
Example #16
        def step(input_n, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
            hid_input = cgt.dot(hid_previous, W_hid_stacked)

            # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
            input_n = cgt.broadcast("+", input_n.dot(W_in_stacked), b_stacked, "xx,1x")

            # Reset and update gates
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
            hidden_update_in = slice_w(input_n, 2)
            hidden_update_hid = slice_w(hid_input, 2)
            hidden_update = hidden_update_in + resetgate*hidden_update_hid

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate)*hid_previous + updategate*hidden_update
            return self.nonlinearity_hid(hid)  # adding this non-linearity seems to help stability.
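slice_w is defined elsewhere in the class this step method comes from; a hypothetical stand-in consistent with the stacked weight layout (num_units columns per gate) would be:

def make_slice_w(num_units):
    def slice_w(x, n):
        # columns [n*num_units, (n+1)*num_units) of the stacked pre-activations
        return x[:, n * num_units:(n + 1) * num_units]
    return slice_w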
    def __init__(self, input, n_in, n_out, W=None, b=None,
                 activation=cgt.tanh, prefix=""):
        self.n_in = n_in
        self.n_out = n_out

        if W is None:
            # XXX replace with nn init
            W_values = np.asarray(
                rng.uniform(
                    low=-np.sqrt(6. / (n_in + n_out)),
                    high=np.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=cgt.floatX
            )
            if activation == cgt.sigmoid:
                W_values *= 4

            W = cgt.shared(W_values, name=prefix+"_W")

        if b is None:
            b_values = np.zeros((n_out,), dtype=cgt.floatX)
            b = cgt.shared(b_values, name=prefix+"_b")

        self.W = W
        self.b = b

        # XXX broadcast api may change
        lin_output = cgt.broadcast("+", cgt.dot(input, self.W),
                cgt.dimshuffle(self.b, ["x", 0]), "xx,1x")
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )
        # parameters of the model
        self.params = [self.W, self.b]
Example #18
        def step(input_n, cell_previous, hid_previous, W_hid_stacked, W_in_stacked, b_stacked):

            input_n = cgt.broadcast("+", cgt.dot(input_n, W_in_stacked), b_stacked, "xx,1x")

            # Calculate gates pre-activations and slice
            gates = input_n + cgt.dot(hid_previous, W_hid_stacked)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input
            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity(cell)
            return [cell, hid]
Example #19
File: nn.py Project: x724/cgt
 def __call__(self, x):
     tmp = conv2d(x, self.weight, self.kernelshape, self.pad, self.stride)
     return cgt.broadcast("+", tmp, self.bias, "xxxx,1x11")
Example #20
File: nn.py Project: x724/cgt
 def __call__(self, x):
     return cgt.broadcast("+", x.dot(self.weight), self.bias, "xx,1x")
Example #21
def softmax(x,axis=1):
    # x = cgt.broadcast("-", x, x.max(axis=1,keepdims=True),"xx,x1")
    out = cgt.exp(x)
    out = cgt.broadcast("/", out, out.sum(axis=axis,keepdims=True), "xx,x1")
    return out
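The commented-out line is the usual max-subtraction trick; softmax is shift-invariant, so re-enabling it changes nothing mathematically but keeps the exp from overflowing. A NumPy sketch of the stable version:

import numpy as np

def softmax_stable(x, axis=1):
    z = x - x.max(axis=axis, keepdims=True)   # shift-invariant
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)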
Example #24
import cgt
from cgt import nn, utils
import numpy as np, numpy.random as nr
from numpy.linalg import norm
from param_collection import ParamCollection

k_in = 1
size_x = 3
size_mem = 4
size_batch = 4

x = cgt.matrix(fixed_shape=(size_batch, size_x))
prev_h = cgt.matrix(fixed_shape=(size_batch, size_mem))
r_vec = nn.Affine(size_x, 2 * k_in * size_mem)(x)
r_non = cgt.reshape(r_vec, (size_batch, 2 * k_in, size_mem))
r_norm = cgt.norm(r_non, axis=2, keepdims=True)
r = cgt.broadcast('/', r_non, r_norm, "xxx,xx1")
prev_h_3 = cgt.reshape(prev_h, (size_batch, size_mem, 1))
inters = [prev_h_3]

for i in xrange(k_in * 2):
    inter_in = inters[-1]
    r_cur = r[:, i, :]
    r_cur_3_transpose = cgt.reshape(r_cur, (size_batch, 1, size_mem))
    r_cur_3 = cgt.reshape(r_cur, (size_batch, size_mem, 1))
    ref_cur = cgt.batched_matmul(r_cur_3, cgt.batched_matmul(r_cur_3_transpose, inter_in))
    inter_out = inter_in - ref_cur
    inters.append(inter_out)
h = inters[-1]
    
r_nn = nn.Module([x], [h])
Example #25
def broadcast(x, a, b, bcpat):
    return cgt.broadcast(x, a, b, bcpat)
Example #26
     X = inputs[0]
     param = layer.convolution_param
     kh,kw = (param.kernel_size, param.kernel_size) if param.HasField("kernel_size")\
         else (param.kernel_h, param.kernel_w)
     nchanin = infer_shape(X)[1]  # X is NCHW, so input channels are at index 1
     Wshape = (param.num_output, nchanin, kh, kw)
     Wname = layer.param[0].name or layer.name+":W"
     Wval = np.empty(Wshape, dtype=cgt.floatX)
     W = name2node[Wname] = cgt.shared(Wval, name=Wname, fixed_shape_mask="all")
     bshape = (1, param.num_output, 1, 1)
     bname = layer.param[1].name or layer.name+":b"
     bval = np.empty(bshape, dtype=cgt.floatX)
     b = name2node[bname] = cgt.shared(bval, name=bname, fixed_shape_mask="all")
     sh,sw = (param.stride, param.stride) if param.HasField("stride")\
         else (param.stride_h, param.stride_w)
     output = [cgt.broadcast("+",nn.conv2d(X, W, subsample=(sh,sw)), b, "xxxx,1x11")]
 elif layer.type == "Pooling":
     param = layer.pooling_param
     X = inputs[0]
     pool_type = {param.MAX : "max", param.AVE : "mean"}[param.pool]
     height_in,width_in = infer_shape(X)[2:4]
     kernel = (param.kernel_size, param.kernel_size) if param.HasField("kernel_size")\
         else (param.kernel_h, param.kernel_w)
     stride = (param.stride, param.stride) if param.HasField("stride")\
         else (param.stride_h, param.stride_w)
     pad = (param.pad, param.pad) if param.HasField("pad")\
         else (param.pad_h, param.pad_w)
     output = [nn.pool(pool_type, X, stride, kernel, pad)]
 elif layer.type == "InnerProduct":
     X = inputs[0]
     if X.ndim == 4:
Example #27
def normalize(var):
    return cgt.broadcast("/", var, cgt.sum(var,axis=2,keepdims=True), "xxx,xx1")
def sum_normalize2(x):
    return cgt.broadcast("/", x, x.sum(axis=2,keepdims=True), "xxx,xx1")
Example #30
def broadcast(opname,x,y,bcpat):
    return cgt.broadcast(opname, x, y, bcpat) if isinstance(x, core.Node) else eval("x %s y"%opname)