def ada_grad(cost, params, emb=None, sub_emb=None, w=None, lr=0.1, eps=1.):
    updates = OrderedDict()

    """update sub-tensor of embeddings"""
    if emb is not None:
        p = emb
        g = T.grad(cost, sub_emb)
        r = build_shared_zeros(p.get_value(True).shape)
        r_sub = r[w]
        r_sub_t = r_sub + T.sqr(g)
        r_t = T.set_subtensor(r_sub, r_sub_t)
        p_t = T.inc_subtensor(sub_emb, - (lr / (T.sqrt(r_sub_t) + eps)) * g)
        updates[r] = r_t
        updates[p] = p_t

    """update parameters"""
    grads0 = T.grad(cost, params[0])
    for p, g in zip(params[0], grads0):
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t

    """update parameters"""
    grads1 = T.grad(cost, params[1])
    for p, g in zip(params[1], grads1):
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t
    return updates
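
All of these snippets rely on a build_shared_zeros helper that allocates the optimizer's accumulator state but is not shown on this page. A minimal sketch of such a helper, plus a hypothetical example of feeding the returned updates dictionary to theano.function (x_sym, y_sym, cost, and params stand in for whatever the surrounding model defines), might look like this:

import numpy as np
import theano
import theano.tensor as T

def build_shared_zeros(shape):
    """Shared variable of zeros, used as accumulator state by the optimizers on this page."""
    return theano.shared(np.zeros(shape, dtype=theano.config.floatX))

# Hypothetical wiring: compile one training step that applies the AdaGrad updates.
# updates = ada_grad(cost, params, lr=0.1)
# train = theano.function(inputs=[x_sym, y_sym], outputs=cost, updates=updates)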
Example 2
def ada_grad(cost, params, emb=None, sub_emb=None, w=None, lr=0.1, eps=1.):
    updates = OrderedDict()
    """update sub-tensor of embeddings"""
    if emb is not None:
        p = emb
        g = T.grad(cost, sub_emb)
        r = build_shared_zeros(p.get_value(True).shape)
        r_sub = r[w]
        r_sub_t = r_sub + T.sqr(g)
        r_t = T.set_subtensor(r_sub, r_sub_t)
        p_t = T.inc_subtensor(sub_emb, -(lr / (T.sqrt(r_sub_t) + eps)) * g)
        updates[r] = r_t
        updates[p] = p_t
    """update parameters"""
    grads0 = T.grad(cost, params[0])
    for p, g in zip(params[0], grads0):
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t
    """update parameters"""
    grads1 = T.grad(cost, params[1])
    for p, g in zip(params[1], grads1):
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t
    return updates
Example 3
def adam(cost, params, emb, x, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    i = theano.shared(np.float32(0))
    i_t = i + 1.

    """update sub-tensor of embeddings"""
#    p = build_shared_zeros(emb.get_value(True).shape)
#    p_sub = p[w]
#    i_p = build_shared_zeros(emb.get_value(True).shape)

#    i_p_sub = i_p[w]
#    updates[i_p] = T.inc_subtensor(i_p_sub, 1.)

#    g = T.grad(cost, x)

#    v = build_shared_zeros(emb.get_value(True).shape)
#    r = build_shared_zeros(emb.get_value(True).shape)
#    v_sub = v[w]
#    r_sub = r[w]

#    v_t = ((1. - b1) * g) + (b1 ** (i_t - i_p_sub) * v_sub)
#    r_t = ((1. - b2) * T.sqr(g)) + (b2 ** (i_t - i_p_sub) * r_sub)

#    r_hat = lr / (T.sqrt(r_t / (1 - b2 ** i_t)) + e)
#    v_hat = v_t / (1 - b1 ** i_t)

#    p_t = p_sub - r_hat * v_hat
#    updates[v] = T.set_subtensor(v_sub, v_t)
#    updates[r] = T.set_subtensor(r_sub, r_t)
#    updates[p] = T.set_subtensor(p_sub, p_t)

    """update sub-tensor of embeddings"""
    lr_emb = theano.shared(np.float32(0.1))
    updates[emb] = T.inc_subtensor(x, -lr_emb * T.grad(cost, x))

    for p, g in zip(params, grads):
        v = build_shared_zeros(p.get_value(True).shape)
        r = build_shared_zeros(p.get_value(True).shape)

        v_t = (b1 * v) + (1. - b1) * g
        r_t = (b2 * r) + (1. - b2) * T.sqr(g)

        r_hat = lr / (T.sqrt(r_t / (1 - b2 ** i_t)) + e)
        v_hat = v_t / (1 - b1 ** i_t)  # bias-corrected first moment

        p_t = p - r_hat * v_hat
        updates[v] = v_t
        updates[r] = r_t
        updates[p] = p_t

    updates[i] = i_t
    return updates
Example 4
def adam(cost, params, emb, x, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    i = theano.shared(np.float32(0))
    i_t = i + 1.
    """update sub-tensor of embeddings"""
    #    p = build_shared_zeros(emb.get_value(True).shape)
    #    p_sub = p[w]
    #    i_p = build_shared_zeros(emb.get_value(True).shape)

    #    i_p_sub = i_p[w]
    #    updates[i_p] = T.inc_subtensor(i_p_sub, 1.)

    #    g = T.grad(cost, x)

    #    v = build_shared_zeros(emb.get_value(True).shape)
    #    r = build_shared_zeros(emb.get_value(True).shape)
    #    v_sub = v[w]
    #    r_sub = r[w]

    #    v_t = ((1. - b1) * g) + (b1 ** (i_t - i_p_sub) * v_sub)
    #    r_t = ((1. - b2) * T.sqr(g)) + (b2 ** (i_t - i_p_sub) * r_sub)

    #    r_hat = lr / (T.sqrt(r_t / (1 - b2 ** i_t)) + e)
    #    v_hat = v_t / (1 - b1 ** i_t)

    #    p_t = p_sub - r_hat * v_hat
    #    updates[v] = T.set_subtensor(v_sub, v_t)
    #    updates[r] = T.set_subtensor(r_sub, r_t)
    #    updates[p] = T.set_subtensor(p_sub, p_t)
    """update sub-tensor of embeddings"""
    lr_emb = theano.shared(np.float32(0.1))
    updates[emb] = T.inc_subtensor(x, -lr_emb * T.grad(cost, x))

    for p, g in zip(params, grads):
        v = build_shared_zeros(p.get_value(True).shape)
        r = build_shared_zeros(p.get_value(True).shape)

        v_t = (b1 * v) + (1. - b1) * g
        r_t = (b2 * r) + (1. - b2) * T.sqr(g)

        r_hat = lr / (T.sqrt(r_t / (1 - b2**i_t)) + e)
        v_hat = v_t / (1 - b1**i_t)  # bias-corrected first moment

        p_t = p - r_hat * v_hat
        updates[v] = v_t
        updates[r] = r_t
        updates[p] = p_t

    updates[i] = i_t
    return updates
Example 5
    def __init__(self,
                 parameters,
                 alpha=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 eps=1e-8):
        super(AdamOptimizer, self).__init__(parameters)
        # TODO: really?
        self.t = theano.shared(np.float32(1))
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.m = [build_shared_zeros(p.shape.eval()) for p in self.parameters]
        self.v = [build_shared_zeros(p.shape.eval()) for p in self.parameters]
Example 6
def ada_delta(grads, params, b=0.999, eps=1e-8):
    updates = OrderedDict()

    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)
        v = build_shared_zeros(p.get_value(True).shape)
        s = build_shared_zeros(p.get_value(True).shape)
        r_t = b * r + (1 - b) * T.sqr(g)
        v_t = (T.sqrt(s) + eps) / (T.sqrt(r) + eps) * g
        s_t = b * s + (1 - b) * T.sqr(v_t)
        p_t = p - v_t
        updates[r] = r_t
        updates[v] = v_t
        updates[s] = s_t
        updates[p] = p_t
    return updates
Example 7
    def __init__(self, n_i, n_h, activation=tanh):
        self.activation = activation
        self.c0 = build_shared_zeros(n_h)
        self.h0 = self.activation(self.c0)

        self.W = theano.shared(sample_weights(n_i, n_h))

        """input gate parameters"""
        self.W_xi = theano.shared(sample_weights(n_h, n_h))
        self.W_hi = theano.shared(sample_weights(n_h, n_h))
        self.W_ci = theano.shared(sample_weights(n_h))

        """forget gate parameters"""
        self.W_xf = theano.shared(sample_weights(n_h, n_h))
        self.W_hf = theano.shared(sample_weights(n_h, n_h))
        self.W_cf = theano.shared(sample_weights(n_h))

        """cell parameters"""
        self.W_xc = theano.shared(sample_weights(n_h, n_h))
        self.W_hc = theano.shared(sample_weights(n_h, n_h))

        """output gate parameters"""
        self.W_xo = theano.shared(sample_weights(n_h, n_h))
        self.W_ho = theano.shared(sample_weights(n_h, n_h))
        self.W_co = theano.shared(sample_weights(n_h))

        self.params = [self.W, self.W_xi, self.W_hi, self.W_ci, self.W_xf, self.W_hf, self.W_cf,
                       self.W_xc, self.W_hc, self.W_xo, self.W_ho, self.W_co]
Example 8
    def __init__(self, n_i, n_h, activation=tanh):
        self.activation = activation
        self.c0 = build_shared_zeros(n_h)
        self.h0 = self.activation(self.c0)

        self.W = theano.shared(sample_weights(n_i, n_h))

        # input gate parameters
        self.W_xi = theano.shared(sample_weights(n_h, n_h))
        self.W_hi = theano.shared(sample_weights(n_h, n_h))
        self.W_ci = theano.shared(sample_weights(n_h))

        # forget gate parameters
        self.W_xf = theano.shared(sample_weights(n_h, n_h))
        self.W_hf = theano.shared(sample_weights(n_h, n_h))
        self.W_cf = theano.shared(sample_weights(n_h))

        # cell parameters
        self.W_xc = theano.shared(sample_weights(n_h, n_h))
        self.W_hc = theano.shared(sample_weights(n_h, n_h))

        # output gate parameters
        self.W_xo = theano.shared(sample_weights(n_h, n_h))
        self.W_ho = theano.shared(sample_weights(n_h, n_h))
        self.W_co = theano.shared(sample_weights(n_h))

        self.params = [
            self.W, self.W_xi, self.W_hi, self.W_ci, self.W_xf, self.W_hf,
            self.W_cf, self.W_xc, self.W_hc, self.W_xo, self.W_ho, self.W_co
        ]
Example 9
def ada_delta(cost, params, b=0.999, eps=1e-8):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    """update parameters"""
    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)
        v = build_shared_zeros(p.get_value(True).shape)
        s = build_shared_zeros(p.get_value(True).shape)
        r_t = b * r + (1 - b) * T.sqr(g)
        v_t = (T.sqrt(s) + eps) / (T.sqrt(r) + eps) * g
        s_t = b * s + (1 - b) * T.sqr(v_t)
        p_t = p - v_t
        updates[r] = r_t
        updates[v] = v_t
        updates[s] = s_t
        updates[p] = p_t
    return updates
Example 10
def ada_grad(grads, params, lr=0.1, eps=1.):
    updates = OrderedDict()

    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t
    return updates
Example 11
    def set_layer(self, init_emb, n_vocab, dim_emb, n_posit, dim_posit, fix):
        self.word_emb = self.create_word_emb(init_emb, n_vocab, dim_emb)
        self.posit_emb = self.create_posit_emb(n_posit, dim_posit)

        if fix:
            self.params.extend([self.posit_emb])
        else:
            self.params.extend([self.word_emb, self.posit_emb])

        pad = build_shared_zeros((1, dim_emb))
        self.E = T.concatenate([pad, self.word_emb], 0)
Example 12
def ada_grad(cost, params, emb, x, w, lr=0.1, eps=1.):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    """update sub-tensor of embeddings"""
    p = emb
    g = T.grad(cost, x)
    r = build_shared_zeros(p.get_value(True).shape)
    r_sub = r[w]
    r_sub_t = r_sub + T.sqr(g)
    r_t = T.set_subtensor(r_sub, r_sub_t)
    p_t = T.inc_subtensor(x, -(lr / (T.sqrt(r_sub_t) + eps)) * g)
    updates[r] = r_t
    updates[p] = p_t
    """update parameters"""
    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t
    return updates
Example 13
def ada_delta(cost, params, emb, x, w, b=0.999, eps=1e-8):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    """update sub-tensor of embeddings"""
    p = emb
    g = T.grad(cost, x)

    r = build_shared_zeros(p.get_value(True).shape)
    v = build_shared_zeros(p.get_value(True).shape)
    s = build_shared_zeros(p.get_value(True).shape)
    r_sub = r[w]
    v_sub = v[w]
    s_sub = s[w]

    r_sub_t = b * r_sub + (1 - b) * T.sqr(g)
    v_sub_t = (T.sqrt(s_sub) + eps) / (T.sqrt(r_sub) + eps) * g
    s_sub_t = b * s_sub + (1 - b) * T.sqr(v_sub_t)
    updates[r] = T.set_subtensor(r_sub, r_sub_t)
    updates[v] = T.set_subtensor(v_sub, v_sub_t)
    updates[s] = T.set_subtensor(s_sub, s_sub_t)
    updates[p] = T.inc_subtensor(x, -v_sub_t)
    """update parameters"""
    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)
        v = build_shared_zeros(p.get_value(True).shape)
        s = build_shared_zeros(p.get_value(True).shape)
        r_t = b * r + (1 - b) * T.sqr(g)
        v_t = (T.sqrt(s) + eps) / (T.sqrt(r) + eps) * g
        s_t = b * s + (1 - b) * T.sqr(v_t)
        p_t = p - v_t
        updates[r] = r_t
        updates[v] = v_t
        updates[s] = s_t
        updates[p] = p_t
    return updates
Example 14
def ada_grad(cost, params, lr=0.1, eps=1.):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    """update parameters"""
    for p, g in zip(params, grads):
        g = grad_clipping(g, 10.)
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t
    return updates
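
The grad_clipping helper used above is likewise not defined in these examples. One plausible element-wise version, given purely as an assumption about its behaviour, is:

def grad_clipping(g, bound):
    # Clip each gradient element into [-bound, bound] before the AdaGrad update.
    return T.clip(g, -bound, bound)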
Example 15
def adam(params, grads, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    updates = OrderedDict()
    i = theano.shared(np.float32(0))
    i_t = i + 1.

    for p, g in zip(params, grads):
        v = build_shared_zeros(p.get_value(True).shape)
        r = build_shared_zeros(p.get_value(True).shape)

        v_t = (b1 * v) + (1. - b1) * g
        r_t = (b2 * r) + (1. - b2) * T.sqr(g)

        r_hat = lr / (T.sqrt(r_t / (1 - b2 ** i_t)) + e)
        v_hat = v_t / (1 - b1 ** i_t)  # bias-corrected first moment

        p_t = p - r_hat * v_hat
        updates[v] = v_t
        updates[r] = r_t
        updates[p] = p_t

    updates[i] = i_t
    return updates
Example 16
def adam(params, grads, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    updates = OrderedDict()
    i = theano.shared(np.float32(0))
    i_t = i + 1.

    for p, g in zip(params, grads):
        v = build_shared_zeros(p.get_value(True).shape)
        r = build_shared_zeros(p.get_value(True).shape)

        v_t = (b1 * v) + (1. - b1) * g
        r_t = (b2 * r) + (1. - b2) * T.sqr(g)

        r_hat = lr / (T.sqrt(r_t / (1 - b2**i_t)) + e)
        v_hat = v_t / (1 - b1**i_t)  # bias-corrected first moment

        p_t = p - r_hat * v_hat
        updates[v] = v_t
        updates[r] = r_t
        updates[p] = p_t

    updates[i] = i_t
    return updates
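
Unlike the earlier variants, this adam takes precomputed gradients instead of the cost. A minimal, self-contained sketch of driving it, assuming the build_shared_zeros helper sketched earlier and using a tiny softmax classifier purely as a placeholder model:

x_sym = T.matrix('x')
y_sym = T.ivector('y')
W = theano.shared(np.zeros((3, 2), dtype=theano.config.floatX))
b = theano.shared(np.zeros(2, dtype=theano.config.floatX))
params = [W, b]
p_y = T.nnet.softmax(T.dot(x_sym, W) + b)
cost = -T.mean(T.log(p_y)[T.arange(y_sym.shape[0]), y_sym])
grads = T.grad(cost, params)  # one gradient expression per parameter
train = theano.function([x_sym, y_sym], cost, updates=adam(params, grads))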
Example 17
def ada_grad(cost, params, emb, x, w, lr=0.1, eps=1.):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    """update sub-tensor of embeddings"""
    p = emb
    g = T.grad(cost, x)
    r = build_shared_zeros(p.get_value(True).shape)
    r_sub = r[w]
    r_sub_t = r_sub + T.sqr(g)
    r_t = T.set_subtensor(r_sub, r_sub_t)
    p_t = T.inc_subtensor(x, - (lr / (T.sqrt(r_sub_t) + eps)) * g)
    updates[r] = r_t
    updates[p] = p_t

    """update parameters"""
    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t
    return updates
Example 18
    def __init__(self, n_i=32, n_h=32, activation=tanh):
        self.activation = activation
        self.h0 = build_shared_zeros(n_h)

        self.W = theano.shared(sample_weights(n_i, n_h))

        self.W_xr = theano.shared(sample_weights(n_h, n_h))
        self.W_hr = theano.shared(sample_weights(n_h, n_h))

        self.W_xz = theano.shared(sample_weights(n_h, n_h))
        self.W_hz = theano.shared(sample_weights(n_h, n_h))

        self.W_xh = theano.shared(sample_weights(n_h, n_h))
        self.W_hh = theano.shared(sample_weights(n_h, n_h))

        self.params = [self.W, self.W_xr, self.W_hr, self.W_xz, self.W_hz, self.W_xh, self.W_hh]
Example 19
    def __init__(self, input_dim, hidden_dim, activation=T.tanh):
        self.c_0 = build_shared_zeros(hidden_dim)
        self.h_0 = activation(self.c_0)
        self.activation = activation
        self.W = theano.shared(get_uniform_weight(input_dim, hidden_dim))
        self.W_i = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
        self.U_i = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
        self.V_i = theano.shared(get_uniform_weight(hidden_dim))
        self.W_f = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
        self.U_f = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
        self.V_f = theano.shared(get_uniform_weight(hidden_dim))
        self.W_c = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
        self.U_c = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
        self.W_o = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
        self.U_o = theano.shared(get_uniform_weight(hidden_dim, hidden_dim))
        self.V_o = theano.shared(get_uniform_weight(hidden_dim))

        self.parameters = [
            self.W, self.W_f, self.U_f, self.V_f, self.W_i, self.U_i, self.V_i,
            self.W_c, self.U_c, self.W_o, self.U_o, self.V_o
        ]
Example 20
def ada_delta(cost, params, emb, x, w, b=0.999, eps=1e-8):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    """update sub-tensor of embeddings"""
    p = emb
    g = T.grad(cost, x)

    r = build_shared_zeros(p.get_value(True).shape)
    v = build_shared_zeros(p.get_value(True).shape)
    s = build_shared_zeros(p.get_value(True).shape)
    r_sub = r[w]
    v_sub = v[w]
    s_sub = s[w]

    r_sub_t = b * r_sub + (1 - b) * T.sqr(g)
    v_sub_t = (T.sqrt(s_sub) + eps) / (T.sqrt(r_sub) + eps) * g
    s_sub_t = b * s_sub + (1 - b) * T.sqr(v_sub_t)
    updates[r] = T.set_subtensor(r_sub, r_sub_t)
    updates[v] = T.set_subtensor(v_sub, v_sub_t)
    updates[s] = T.set_subtensor(s_sub, s_sub_t)
    updates[p] = T.inc_subtensor(x, -v_sub_t)

    """update parameters"""
    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)
        v = build_shared_zeros(p.get_value(True).shape)
        s = build_shared_zeros(p.get_value(True).shape)
        r_t = b * r + (1 - b) * T.sqr(g)
        v_t = (T.sqrt(s) + eps) / (T.sqrt(r) + eps) * g
        s_t = b * s + (1 - b) * T.sqr(v_t)
        p_t = p - v_t
        updates[r] = r_t
        updates[v] = v_t
        updates[s] = s_t
        updates[p] = p_t
    return updates
Example 21
    def __init__(self, x, c, y, n_words, batch_size, lr, init_emb, vocab_w_size, w_emb_dim, w_hidden_dim,
                 c_emb_dim, c_hidden_dim, output_dim, vocab_c_size, window, opt):
        assert window % 2 == 1, 'Window size must be odd'

        """ input """
        self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
        self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
        self.c = c  # 1D: n_words * batch_size, 2D: window, 3D: max_len_char, 4D: window; elem=char id
        self.y = y
        self.batch_size = batch_size
        self.n_words = n_words
        self.lr = lr

        n_phi = (w_emb_dim + c_hidden_dim) * window
        max_len_char = T.cast(self.c.shape[2], 'int32')

        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_w_size, w_emb_dim))

        self.pad = build_shared_zeros((1, c_emb_dim))
        self.e_c = theano.shared(sample_norm_dist(vocab_c_size - 1, c_emb_dim))
        self.emb_c = T.concatenate([self.pad, self.e_c], 0)

        self.W_in = theano.shared(sample_weights(n_phi, w_hidden_dim))
        self.W_c = theano.shared(sample_weights(c_emb_dim * window, c_hidden_dim))
        self.W_out = theano.shared(sample_weights(w_hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(w_hidden_dim))
        self.b_c = theano.shared(sample_weights(c_hidden_dim))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [self.e_c, self.W_in, self.W_c, self.W_out, self.b_in, self.b_c, self.b_y]

        """ look up embedding """
        self.x_emb = self.emb[self.x_v]  # 1D: batch_size*n_words * window, 2D: emb_dim
        self.c_emb = self.emb_c[self.c]  # 1D: batch_size*n_words, 2D: window, 3D: max_len_char, 4D: window, 5D: n_c_emb
        self.x_emb_r = self.x_emb.reshape((x.shape[0], x.shape[1], -1))

        """ convolution """
        self.c_phi = T.max(T.dot(self.c_emb.reshape((batch_size * n_words, window, max_len_char, -1)), self.W_c) + self.b_c, 2)  # 1D: n_words, 2D: window, 3D: n_h_c
        self.x_phi = T.concatenate([self.x_emb_r, self.c_phi], axis=2)

        """ forward """
        self.h = relu(T.dot(self.x_phi.reshape((batch_size * n_words, n_phi)), self.W_in) + self.b_in)
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)

        """ predict """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)

        """ loss """
        self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words), self.y]
        self.nll = -T.sum(self.log_p)
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb, self.x, self.lr)
Example 22
    def __init__(self, x, c, y, n_words, batch_size, lr, init_emb,
                 vocab_w_size, w_emb_dim, w_hidden_dim, c_emb_dim,
                 c_hidden_dim, output_dim, vocab_c_size, window, opt):
        assert window % 2 == 1, 'Window size must be odd'
        """ input """
        self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
        self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
        self.c = c  # 1D: n_words * batch_size, 2D: window, 3D: max_len_char, 4D: window; elem=char id
        self.y = y
        self.batch_size = batch_size
        self.n_words = n_words
        self.lr = lr

        n_phi = (w_emb_dim + c_hidden_dim) * window
        max_len_char = T.cast(self.c.shape[2], 'int32')
        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_w_size, w_emb_dim))

        self.pad = build_shared_zeros((1, c_emb_dim))
        self.e_c = theano.shared(sample_norm_dist(vocab_c_size - 1, c_emb_dim))
        self.emb_c = T.concatenate([self.pad, self.e_c], 0)

        self.W_in = theano.shared(sample_weights(n_phi, w_hidden_dim))
        self.W_c = theano.shared(
            sample_weights(c_emb_dim * window, c_hidden_dim))
        self.W_out = theano.shared(sample_weights(w_hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(w_hidden_dim))
        self.b_c = theano.shared(sample_weights(c_hidden_dim))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [
            self.e_c, self.W_in, self.W_c, self.W_out, self.b_in, self.b_c,
            self.b_y
        ]
        """ look up embedding """
        self.x_emb = self.emb[self.x_v]  # 1D: batch_size*n_words * window, 2D: emb_dim
        self.c_emb = self.emb_c[self.c]  # 1D: batch_size*n_words, 2D: window, 3D: max_len_char, 4D: window, 5D: n_c_emb
        self.x_emb_r = self.x_emb.reshape((x.shape[0], x.shape[1], -1))
        """ convolution """
        self.c_phi = T.max(
            T.dot(
                self.c_emb.reshape(
                    (batch_size * n_words, window, max_len_char, -1)),
                self.W_c) + self.b_c, 2)  # 1D: n_words, 2D: window, 3D: n_h_c
        self.x_phi = T.concatenate([self.x_emb_r, self.c_phi], axis=2)
        """ forward """
        self.h = relu(
            T.dot(self.x_phi.reshape((batch_size * n_words,
                                      n_phi)), self.W_in) + self.b_in)
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)
        """ predict """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)
        """ loss """
        self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words),
                                             self.y]
        self.nll = -T.sum(self.log_p)
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb,
                               self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb,
                                    self.x_emb, self.x, self.lr)