Example #1
    def __init__(self, rho=0.95, eps=1e-6, params=None):
        super(AdaDelta, self).__init__(params=params)

        self.rho = rho
        self.eps = eps
        self.accugrads = [
            utils.build_shared_zeros(t.shape.eval(), 'accugrad')
            for t in self.params
        ]
        self.accudeltas = [
            utils.build_shared_zeros(t.shape.eval(), 'accudelta')
            for t in self.params
        ]
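These accumulators are the AdaDelta state E[g^2] and E[dx^2] from Zeiler (2012). A minimal sketch of how a class like this typically turns them into parameter updates, assuming the usual imports (OrderedDict, theano.tensor as T); the method name get_updates is an assumption, not the repo's exact code:

    def get_updates(self, cost):
        # Hypothetical method: consume the accumulators above to build
        # AdaDelta updates for each parameter.
        updates = OrderedDict()
        for p, g, accg, accd in zip(self.params, T.grad(cost, self.params),
                                    self.accugrads, self.accudeltas):
            accg_t = self.rho * accg + (1. - self.rho) * T.sqr(g)
            step = -T.sqrt((accd + self.eps) / (accg_t + self.eps)) * g
            updates[accg] = accg_t
            updates[accd] = self.rho * accd + (1. - self.rho) * T.sqr(step)
            updates[p] = p + step
        return updates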
    def __init__(self, x, y, l, window, opt, lr, init_emb, dim_emb, dim_hidden, n_vocab, L2_reg, unit,
                 sim='cos', n_layers=1, activation=tanh):
        self.tr_inputs = [x, y, l]
        self.pr_inputs = [x, y, l]

        self.x = x  # 1D: batch_size * l * 2, 2D: window; elem=word_id
        self.y = y  # 1D: batch_size; elem=label
        self.l = l  # scalar: elem=sentence length

        batch_size = y.shape[0]
        n_cands = x.shape[0] // batch_size // l  # integer division; '/' yields a float under Python 3

        self.pad = build_shared_zeros((1, dim_emb))
        if init_emb is None:
            self.emb = theano.shared(sample_weights(n_vocab - 1, dim_emb))
        else:
            self.emb = theano.shared(init_emb)
        self.E = T.concatenate([self.pad, self.emb], 0)
        self.W_out = theano.shared(sample_weights(dim_hidden, dim_hidden))
        self.params = [self.emb, self.W_out]

        """ Input Layer """
        e = self.E[x]  # e: 1D: batch_size * l * 2, 2D: window, 3D: dim_emb
        x_in = e.reshape((batch_size * n_cands, l, -1))

        """ Intermediate Layer """
        # h: 1D: n_batch * n_cands, 2D: dim_emb
        h, params = cnn.layers(x_in, window, dim_emb, dim_hidden, n_layers, activation)
        self.params.extend(params)

        """ Output Layer """
        h = h.reshape((batch_size, n_cands, -1))
        h_1 = h[T.arange(batch_size), 0]
        h_2 = h[T.arange(batch_size), 1:]
        if sim == 'cos':
            y_score = cosign_similarity(h_1, h_2)
        else:
            y_score = T.batched_dot(T.dot(h_1, self.W_out), h_2.dimshuffle(0, 2, 1))
        y_score_hat = T.max(y_score, 1)

        """ Objective Function """
        self.nll = max_margin_loss(y_score_hat, y_score[T.arange(batch_size), y])
        self.L2_sqr = regularization(self.params)
        self.cost = self.nll + L2_reg * self.L2_sqr / 2.

        """ Optimization """
        if opt == 'adagrad':
            self.update = ada_grad(cost=self.cost, params=self.params, lr=lr)
        elif opt == 'ada_delta':
            self.update = ada_delta(cost=self.cost, params=self.params)
        elif opt == 'adam':
            self.update = adam(cost=self.cost, params=self.params, lr=lr)
        else:
            self.update = sgd(cost=self.cost, params=self.params, lr=lr)

        """ Predicts """
        y_hat = T.argmax(y_score, 1)

        """ Check Accuracies """
        self.correct = T.eq(y_hat, y)
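A hedged usage sketch for the class above: compiling the graph built in __init__ into training and evaluation functions. Here model stands for an instance of the class; tr_inputs, cost, update, pr_inputs, and correct are the attributes defined above, everything else is an assumption:

# Sketch only: wiring the attributes built in __init__ into Theano functions.
import theano

train = theano.function(inputs=model.tr_inputs,     # [x, y, l]
                        outputs=model.cost,
                        updates=model.update)
evaluate = theano.function(inputs=model.pr_inputs,  # [x, y, l]
                           outputs=model.correct)   # 1 where y_hat == y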
from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T

# build_shared_zeros is a project helper returning a zero-initialized
# theano.shared of the given shape.


def ada_delta(cost, params, b=0.999, eps=1e-8):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    # update parameters
    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)  # running E[g^2]
        v = build_shared_zeros(p.get_value(True).shape)  # last step taken
        s = build_shared_zeros(p.get_value(True).shape)  # running E[dx^2]
        r_t = b * r + (1 - b) * T.sqr(g)
        # scale the step by the updated gradient accumulator r_t
        v_t = (T.sqrt(s) + eps) / (T.sqrt(r_t) + eps) * g
        s_t = b * s + (1 - b) * T.sqr(v_t)
        p_t = p - v_t
        updates[r] = r_t
        updates[v] = v_t
        updates[s] = s_t
        updates[p] = p_t
    return updates
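A small smoke test, as a sketch: minimizing a scalar quadratic with ada_delta. It assumes the imports above and that build_shared_zeros accepts a 0-d shape:

# Sketch: drive a single shared scalar toward the minimum of (w - 2)^2.
w = theano.shared(np.asarray(5.0))
cost = T.sqr(w - 2.0)
step = theano.function([], cost, updates=ada_delta(cost, [w]))
for _ in range(1000):
    step()
print(w.get_value())  # should have moved toward 2.0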
def ada_grad_emb(cost, params, emb, x, w, lr=0.1, eps=1.):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    # update the sub-tensor of the embedding matrix used in this batch
    p = emb
    g = T.grad(cost, x)
    r = build_shared_zeros(p.get_value(True).shape)
    r_sub = r[w]
    r_sub_t = r_sub + T.sqr(g)
    r_t = T.set_subtensor(r_sub, r_sub_t)
    p_t = T.inc_subtensor(x, -(lr / (T.sqrt(r_sub_t) + eps)) * g)
    updates[r] = r_t
    updates[p] = p_t

    # update the remaining parameters
    for p, g in zip(params, grads):
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t
    return updates
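The first block updates only the embedding rows touched by the batch: x is expected to be the sub-tensor emb[w], so T.inc_subtensor writes the sparse AdaGrad step back into the full matrix. A sketch of the assumed calling convention; some_cost and other_params are placeholders:

# Hypothetical call pattern for ada_grad_emb.
w = T.ivector('w')          # word ids appearing in the batch
x = emb[w]                  # sub-embedding entering the cost graph
cost = some_cost(x)         # assumed downstream cost expression
updates = ada_grad_emb(cost, other_params, emb, x, w)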
def ada_grad(cost, params, lr=0.1, eps=1.):
    updates = OrderedDict()
    grads = T.grad(cost, params)

    # update parameters
    for p, g in zip(params, grads):
        g = grad_clipping(g, 10.)  # clip the gradient before accumulating
        r = build_shared_zeros(p.get_value(True).shape)
        r_t = r + T.sqr(g)
        p_t = p - (lr / (T.sqrt(r_t) + eps)) * g
        updates[r] = r_t
        updates[p] = p_t
    return updates
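grad_clipping is not defined in these snippets. A minimal norm-clipping helper consistent with the call grad_clipping(g, 10.) might look like this; it is an assumption, not the repo's definition:

def grad_clipping(g, threshold):
    # Rescale g so that its L2 norm does not exceed threshold.
    norm = T.sqrt(T.sum(T.sqr(g)))
    return T.switch(norm > threshold, g * threshold / norm, g)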
def adam(cost, params, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    updates = OrderedDict()
    grads = T.grad(cost, params)
    i = theano.shared(np.float32(0))  # time step t
    i_t = i + 1.

    for p, g in zip(params, grads):
        v = build_shared_zeros(p.get_value(True).shape)  # first moment
        r = build_shared_zeros(p.get_value(True).shape)  # second moment

        v_t = (b1 * v) + (1. - b1) * g
        r_t = (b2 * r) + (1. - b2) * T.sqr(g)

        # bias-corrected step size and first moment (both use the updated
        # moments v_t and r_t)
        r_hat = lr / (T.sqrt(r_t / (1 - b2 ** i_t)) + e)
        v_hat = v_t / (1 - b1 ** i_t)

        p_t = p - r_hat * v_hat
        updates[v] = v_t
        updates[r] = r_t
        updates[p] = p_t

    updates[i] = i_t
    return updates
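The (1 - b ** i_t) denominators are Adam's bias correction (Kingma & Ba, 2015): both moment estimates start at zero and are biased toward zero early in training. A plain-numpy illustration of the effect at step 1:

# Step 1 with gradient g = 1.0: the raw first moment underestimates the
# gradient by a factor of (1 - b1); the correction restores its scale.
b1 = 0.9
v_t = (1. - b1) * 1.0         # 0.1
v_hat = v_t / (1. - b1 ** 1)  # 1.0
print(v_t, v_hat)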