Example #1
    def logp(self, x):
        n = self.n
        eta = self.eta

        diag_idxs = self.diag_idxs
        cumsum = tt.cumsum(x ** 2)
        variance = tt.zeros(n)
        variance = tt.inc_subtensor(variance[0], x[0] ** 2)
        variance = tt.inc_subtensor(
            variance[1:],
            cumsum[diag_idxs[1:]] - cumsum[diag_idxs[:-1]])
        sd_vals = tt.sqrt(variance)

        logp_sd = self.sd_dist.logp(sd_vals).sum()
        corr_diag = x[diag_idxs] / sd_vals

        logp_lkj = (2 * eta - 3 + n - tt.arange(n)) * tt.log(corr_diag)
        logp_lkj = tt.sum(logp_lkj)

        # Compute the log det jacobian of the second transformation
        # described in the docstring.
        idx = tt.arange(n)
        det_invjac = tt.log(corr_diag) - idx * tt.log(sd_vals)
        det_invjac = det_invjac.sum()

        norm = _lkj_normalizing_constant(eta, n)

        return norm + logp_lkj + logp_sd + det_invjac
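For orientation, x in the snippet above is a packed lower-triangular Cholesky factor and diag_idxs marks its diagonal entries, so the cumulative-sum trick recovers each row's sum of squares. A small NumPy sketch of that indexing, assuming the row-major packed layout PyMC3 uses (the values are illustrative):

import numpy as np

n = 3
# packed row-major lower triangle: [L00, L10, L11, L20, L21, L22]
x = np.array([1.0, 0.5, 2.0, 0.2, 0.3, 3.0])
diag_idxs = np.arange(1, n + 1).cumsum() - 1       # -> [0, 2, 5]

cumsum = np.cumsum(x ** 2)
variance = np.empty(n)
variance[0] = x[0] ** 2
variance[1:] = cumsum[diag_idxs[1:]] - cumsum[diag_idxs[:-1]]
# each entry is one row's sum of squares, i.e. diag(L L^T)
print(np.sqrt(variance))                            # the sd_vals of the example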
Example #2
 def __init__(self, vocab_size, dim, lr=0.5):
     W = np.asarray(np.random.rand(vocab_size, dim),
                    dtype=theano.config.floatX) / float(dim)
     W1 = np.asarray((np.random.rand(vocab_size, dim)),
                     dtype=theano.config.floatX) / float(dim)
     self.W = theano.shared(W, name='W', borrow=True)
     self.W1 = theano.shared(W1, name='W1', borrow=True)
     gW = np.asarray(np.ones((vocab_size, dim)), dtype=theano.config.floatX)
     gW1 = np.asarray(
         np.ones((vocab_size, dim)), dtype=theano.config.floatX)
     self.gW = theano.shared(gW, name='gW', borrow=True)
     self.gW1 = theano.shared(gW1, name='gW1', borrow=True)
     X = T.vector()
     fX = T.vector()
     ind_W = T.ivector()
     ind_W1 = T.ivector()
     w = self.W[ind_W, :]
     w1 = self.W1[ind_W1, :]
     cost = T.sum(fX * ((T.sum(w * w1, axis=1) - X) ** 2))
     grad = T.clip(T.grad(cost, [w, w1]), -5.0, 5.0)
     updates1 = [(self.gW, T.inc_subtensor(self.gW[ind_W, :],
                                           grad[0] ** 2))]
     updates2 = [(self.gW1, T.inc_subtensor(self.gW1[ind_W1, :],
                                            grad[1] ** 2))]
     updates3 = [(self.W, T.inc_subtensor(self.W[ind_W, :],
                                          - (lr / T.sqrt(self.gW[ind_W, :])) *
                                          grad[0]))]
     updates4 = [(self.W1, T.inc_subtensor(self.W1[ind_W1, :],
                                           - (lr / T.sqrt(self.gW1[ind_W1, :])) *
                                           grad[1]))]
     updates = updates1 + updates2 + updates3 + updates4
     self.cost_fn = theano.function(
         inputs=[ind_W, ind_W1, X, fX], outputs=cost, updates=updates)
Example #3
def recurrence(log_p_curr, log_p_prev, skip_mask=None):
    if skip_mask is None:
        skip_mask = T.ones_like(log_p_curr[:, 1:-2:2])


    # normalise and bring back to p space
    k = T.max(log_p_prev, axis=1, keepdims=True)
    norm_p_prev = T.switch(
        T.isinf(log_p_prev), 0, T.exp(log_p_prev - k))  # set -inf to 0

    # previous
    _result = norm_p_prev
    # add shift of previous
    _result = T.inc_subtensor(_result[:, 1:],   norm_p_prev[:, :-1])
    # add skips of previous
    _result = T.inc_subtensor(_result[:, 3::2],
            T.switch(skip_mask,norm_p_prev[:, 1:-2:2],0))
    # current
    # log(p) should be 0 for first 2 terms
    result = T.switch(
        T.eq(_result, 0),
        -np.inf,
        log_p_curr + T.log(_result) + k
    )
    return result
Example #4
    def log_likelihood(self):
        Users = self.L[:, :-1]
        Items = self.R[:, :-1]
        UserBiases = self.L[:, -1].reshape((-1, 1))
        ItemBiases = self.R[:, -1].reshape((-1, 1))

        A = T.dot(self.L[:, :-1], (self.R[:, :-1]).T)
        A = T.inc_subtensor(A[:, :], UserBiases)
        A = T.inc_subtensor(A[:, :], ItemBiases.T)
        B = A * self.counts
        loglik = T.sum(B)

        A = T.exp(A)
        A += 1
        A = T.log(A)

        A = (self.counts + 1) * A
        loglik -= T.sum(A)

        # L2 regularization
        loglik -= 0.5 * self.reg_param * T.sum(T.square(self.L[:, :-1]))
        loglik -= 0.5 * self.reg_param * T.sum(T.square(self.R[:, :-1]))

        # Return negation of LogLikelihood cause we will minimize cost
        return -loglik
Example #5
def power_pool_2d(x, ds, p=3, b=0):
    n_batch, n_ch, s0, s1 = x.shape
    d0, d1 = ds
    c = tt.ones((s0, s1))

    # sum elements in regions
    y = tt.abs_(x[:, :, 0::d0, 0::d1])**p
    d = c[0::d0, 0::d1].copy()
    for i in range(0, d0):
        for j in range(0, d1):
            if i != 0 or j != 0:
                ni = (s0 - i - 1) // d0 + 1  # integer division keeps the slice bounds ints
                nj = (s1 - j - 1) // d1 + 1
                xij = tt.abs_(x[:, :, i::d0, j::d1])**p
                y = tt.inc_subtensor(y[:, :, :ni, :nj], xij)
                d = tt.inc_subtensor(d[:ni, :nj], c[i::d0, j::d1])

    # divide by number of elements
    y /= d
    y += b**p

    # take root
    y = y**(1. / p)

    return y
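A minimal usage sketch for the pooling helper above; the shapes, values and the theano/tt imports are illustrative assumptions, not part of the original snippet:

import numpy as np
import theano
import theano.tensor as tt

x = tt.tensor4('x')
f = theano.function([x], power_pool_2d(x, ds=(2, 2), p=3))

img = np.arange(2 * 1 * 4 * 4, dtype=theano.config.floatX).reshape(2, 1, 4, 4)
print(f(img).shape)  # (2, 1, 2, 2): each 2x2 region collapses to one Lp-pooled value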
Example #6
def update_log_p(skip_idxs,zeros,active,log_p_curr,log_p_prev):
    active_skip_idxs = skip_idxs[(skip_idxs < active).nonzero()]
    active_next = T.cast(T.minimum(
        T.maximum(
            active + 1,
            T.max(T.concatenate([active_skip_idxs, [-1]])) + 2 + 1
        ),
        log_p_curr.shape[0]
    ), 'int32')

    common_factor = T.max(log_p_prev[:active])
    p_prev = T.exp(log_p_prev[:active] - common_factor)
    _p_prev = zeros[:active_next]
    # copy over
    _p_prev = T.set_subtensor(_p_prev[:active], p_prev)
    # previous transitions
    _p_prev = T.inc_subtensor(_p_prev[1:], _p_prev[:-1])
    # skip transitions
    _p_prev = T.inc_subtensor(
        _p_prev[active_skip_idxs + 2], p_prev[active_skip_idxs])
    updated_log_p_prev = T.log(_p_prev) + common_factor

    log_p_next = T.set_subtensor(
        zeros[:active_next],
        log_p_curr[:active_next] + updated_log_p_prev
    )
    return active_next, log_p_next
Example #7
    def log_likelihood(self):
        Users = self.U[:, :-1]
        Middle = self.S
        Items = self.V[:-1, :]
        UserBiases = self.U[:, -1].reshape((-1, 1))
        ItemBiases = self.V[-1, :].reshape((-1, 1))

        A = T.dot(T.dot(self.U[:, :-1], self.S[:-1, :-1]), self.V[:-1, :])
        A = T.inc_subtensor(A[:, :], UserBiases * T.sqrt(self.S[-1, -1]))
        A = T.inc_subtensor(A[:, :], ItemBiases.T * T.sqrt(self.S[-1, -1]))
        B = A * self.counts
        loglik = T.sum(B)

        A = T.exp(A)
        A += 1
        A = T.log(A)

        A = (self.counts + 1) * A
        loglik -= T.sum(A)

        # L2 regularization
        loglik -= 0.5 * self.reg_param * T.sum(T.square(T.diag(self.S)[:-1]))

        # Return negation of LogLikelihood cause we will minimize cost
        return -loglik
Example #8
    def adadelta(self, param, grad, updates, sample_idx = None, epsilon = 1e-6):
        v1 = np.float32(self.adapt_params[0])
        v2 = np.float32(1.0 - self.adapt_params[0])
        acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
        upd = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
        if sample_idx is None:
            acc_new = v1 * acc + v2 * (grad**2)
            updates[acc] = acc_new
            grad_scaling = (upd + epsilon) / (acc_new + epsilon)
            upd_new = v1 * upd + v2 * grad_scaling * (grad**2)
            updates[upd] = upd_new
        else:
            acc_s = acc[sample_idx]
#            acc_new = v1 * acc_s + v2 * (grad**2) #Faster, but inaccurate when an index occurs multiple times
#            updates[acc] = T.set_subtensor(acc_s, acc_new) #Faster, but inaccurate when an index occurs multiple times
            updates[acc] = T.inc_subtensor(T.set_subtensor(acc_s, acc_s * v1)[sample_idx], v2 * (grad**2)) #Slower, but accurate when an index occurs multiple times
            acc_new = updates[acc][sample_idx] #Slower, but accurate when an index occurs multiple times
            upd_s = upd[sample_idx]
            grad_scaling = (upd_s + epsilon) / (acc_new + epsilon)
#            updates[upd] = T.set_subtensor(upd_s, v1 * upd_s + v2 * grad_scaling * (grad**2)) #Faster, but inaccurate when an index occurs multiple times
            updates[upd] = T.inc_subtensor(T.set_subtensor(upd_s, upd_s * v1)[sample_idx], v2 * grad_scaling * (grad**2)) #Slower, but accurate when an index occurs multiple times
        gradient_scaling = T.cast(T.sqrt(grad_scaling), theano.config.floatX)
        if self.learning_rate != 1.0:
            print('Warn: learning_rate is not 1.0 while using adadelta. Setting learning_rate to 1.0')
            self.learning_rate = 1.0
        return grad * gradient_scaling #Ok, checked
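The commented trade-off above (set_subtensor being faster but wrong when an index occurs multiple times) comes from how advanced-index writes behave; a standalone sketch of the difference, with names and values that are illustrative rather than taken from the method above:

import numpy as np
import theano
import theano.tensor as T

acc = theano.shared(np.zeros(4, dtype=theano.config.floatX))
idx = T.ivector()
g = T.vector()

# set_subtensor keeps only one write per duplicated index
f_set = theano.function([idx, g], T.set_subtensor(acc[idx], acc[idx] + g))
# inc_subtensor accumulates every contribution (np.add.at semantics)
f_inc = theano.function([idx, g], T.inc_subtensor(acc[idx], g))

i = np.array([1, 1], dtype='int32')
v = np.array([1., 1.], dtype=theano.config.floatX)
print(f_set(i, v))  # [0. 1. 0. 0.] -> the second increment is lost
print(f_inc(i, v))  # [0. 2. 0. 0.] -> both increments are applied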
Example #9
    def adam(self, param, grad, updates, sample_idx = None, epsilon = 1e-6):
        v1 = np.float32(self.adapt_params[0])
        v2 = np.float32(1.0 - self.adapt_params[0])
        v3 = np.float32(self.adapt_params[1])
        v4 = np.float32(1.0 - self.adapt_params[1])
        acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
        meang = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
        countt = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
        if sample_idx is None:
            acc_new = v3 * acc + v4 * (grad**2)
            meang_new = v1 * meang + v2 * grad
            countt_new = countt + 1
            updates[acc] = acc_new
            updates[meang] = meang_new
            updates[countt] = countt_new
        else:
            acc_s = acc[sample_idx]
            meang_s = meang[sample_idx]
            countt_s = countt[sample_idx]
#            acc_new = v3 * acc_s + v4 * (grad**2) #Faster, but inaccurate when an index occurs multiple times
#            updates[acc] = T.set_subtensor(acc_s, acc_new) #Faster, but inaccurate when an index occurs multiple times
            updates[acc] = T.inc_subtensor(T.set_subtensor(acc_s, acc_s * v3)[sample_idx], v4 * (grad**2)) #Slower, but accurate when an index occurs multiple times
            acc_new = updates[acc][sample_idx] #Slower, but accurate when an index occurs multiple times
#            meang_new = v1 * meang_s + v2 * grad
#            updates[meang] = T.set_subtensor(meang_s, meang_new) #Faster, but inaccurate when an index occurs multiple times
            updates[meang] = T.inc_subtensor(T.set_subtensor(meang_s, meang_s * v1)[sample_idx], v2 * grad) #Slower, but accurate when an index occurs multiple times
            meang_new = updates[meang][sample_idx] #Slower, but accurate when an index occurs multiple times
            countt_new = countt_s + 1.0
            updates[countt] = T.set_subtensor(countt_s, countt_new)
        return (meang_new / (1 - v1**countt_new)) / (T.sqrt(acc_new / (1 - v1**countt_new)) + epsilon)
Example #10
    def fac_vis(self, x_phid, x_shid):
        # calculate probability of visible units
        # fac_vis[view][node, sample, statistic]

        facv_vis = [T.zeros((self.n_vis_nodes[view],
                             self.n_samples,
                             self.vis[view].n_statistics),
                            dtype=theano.config.floatX) 
                    for view in range(self.n_views)]
        fv_shid = self.shid.f(x_shid)
        for view in range(self.n_views):      
            fv_phid = self.phid[view].f(x_phid[view])
            for statistic in range(self.vis[view].n_statistics):
                facv_vis[view] = T.set_subtensor(facv_vis[view][:, :, statistic],
                                                 self.bias_vis[view][:, statistic].dimshuffle(0, 'x'))
                if self.vis[view].fixed_bias[statistic]:
                    facv_vis[view] = T.set_subtensor(facv_vis[view][:, :, statistic],
                                                     self.vis[view].fixed_bias_value[statistic])

                for from_statistic in range(self.phid[view].n_statistics):
                    facv_vis[view] = T.inc_subtensor(facv_vis[view][:, :, statistic], 
                        T.dot(self.weights_priv[view][:, statistic, :, from_statistic].T,
                              fv_phid[:, :, from_statistic]))
                for from_statistic in range(self.shid.n_statistics):
                    facv_vis[view] = T.inc_subtensor(facv_vis[view][:, :, statistic],
                        T.dot(self.weights_shrd[view][:, statistic, :, from_statistic].T,
                              fv_shid[:, :, from_statistic]))
        return facv_vis
Example #11
 def scan(self, x, z, non_sequences, i, outputs_info, W_re, W_in, b, go_backwards = False, truncate_gradient = -1):
   W_re_b = self.parent.add_param(
     self.parent.create_recurrent_weights(self.n_units, self.n_re, name="W_re_b_%s" % self.parent.name))
   z_f = z[:,:,:z.shape[2]/2]
   z_b = z[::-1,:,z.shape[2]/2:]
   z_f = T.inc_subtensor(z_f[0], T.dot(outputs_info[0], W_re))
   z_b = T.inc_subtensor(z_b[0], T.dot(outputs_info[0], W_re_b))
   result = BLSTMOpInstance(z_f,z_b, W_re, W_re_b, outputs_info[1], T.zeros_like(outputs_info[1]), i, i[::-1])
   return [ T.concatenate([result[0],result[1][::-1]],axis=2), T.concatenate([result[4],result[5][::-1]],axis=1).dimshuffle('x',0,1) ]
Example #12
def gs_recurrence(p_curr, p_prev):
    # add previous
    _result = p_prev
    # add shift of previous
    _result = T.inc_subtensor(_result[1:],   p_prev[:-1])
    # add skips of previous
    _result = T.inc_subtensor(_result[3::2], p_prev[1:-2:2])
    # current
    _result = _result * p_curr
    return _result
Example #13
 def add_synap_post_inp(i,po,p,s,q):
     # i:: sequence
     # po:: post
     # p:: pre
     # s:: dA
     # q:: W
     index = T.nonzero(q[:self.Ne,i])
     npo = T.inc_subtensor(po[index,i],s)
     nw = T.inc_subtensor(q[:,i],p[:,i])
     nw = T.clip(nw,0,self.wmax)
     return {po:npo,q:nw}
Example #14
def past_weight_grad_step(xs, es, kp_x, kd_x, kp_e, kd_e, shape, dws=None):
    """
    Do an efficient update of the weights given the two spike-update.

    (This still runs FING SLOWLY!)

    :param xs: An (n_in) vector
    :param es: An (n_out) vector
    :param kp_x:
    :param kd_x:
    :param kp_e:
    :param kd_e:
    :param shape: (n_in, n_out)
    :return:
    """
    kp_x, kd_x, kp_e, kd_e = [as_floatx(k) for k in (kp_x, kd_x, kp_e, kd_e)]
    n_in, n_out = shape
    rx = kd_x/(kp_x+kd_x)
    re = kd_e/(kp_e+kd_e)

    tx_last = create_shared_variable(np.zeros(n_in)+1)
    te_last = create_shared_variable(np.zeros(n_out)+1)
    x_last = create_shared_variable(np.zeros(n_in))
    e_last = create_shared_variable(np.zeros(n_out))
    x_spikes = tt.neq(xs, 0)
    e_spikes = tt.neq(es, 0)
    x_spike_ixs, = tt.nonzero(x_spikes)
    e_spike_ixs, = tt.nonzero(e_spikes)

    if dws is None:
        dws = tt.zeros(shape)

    t_last = tt.minimum(tx_last[x_spike_ixs, None], te_last)  # (n_x_spikes, n_out)
    dws = tt.inc_subtensor(dws[x_spike_ixs, :], x_last[x_spike_ixs, None]*e_last
        * rx**(tx_last[x_spike_ixs, None]-t_last)
        * re**(te_last[None, :]-t_last)
        * geoseries_sum(re*rx, t_end=t_last, t_start=1)
        )

    new_x_last = tt.set_subtensor(x_last[x_spike_ixs], x_last[x_spike_ixs]*rx**tx_last[x_spike_ixs]+ xs[x_spike_ixs]/as_floatx(kd_x))
    new_tx_last = tt.switch(x_spikes, 0, tx_last)

    t_last = tt.minimum(new_tx_last[:, None], te_last[e_spike_ixs])  # (n_in, n_e_spikes)
    dws = tt.inc_subtensor(dws[:, e_spike_ixs], new_x_last[:, None]*e_last[e_spike_ixs]
        * rx**(new_tx_last[:, None]-t_last)
        * re**(te_last[None, e_spike_ixs]-t_last)
        * geoseries_sum(re*rx, t_end=t_last, t_start=1)
        )

    add_update(x_last, new_x_last)
    add_update(e_last, tt.set_subtensor(e_last[e_spike_ixs], e_last[e_spike_ixs]*re**te_last[e_spike_ixs]+ es[e_spike_ixs]/as_floatx(kd_e)))
    add_update(tx_last, new_tx_last+1)
    add_update(te_last, tt.switch(e_spikes, 1, te_last+1))
    return dws
Example #15
 def RMSprop(self, cost, params, full_params, sampled_params, sidxs, epsilon=1e-6):
     grads =  [T.grad(cost = cost, wrt = param) for param in params]
     sgrads = [T.grad(cost = cost, wrt = sparam) for sparam in sampled_params]
     updates = OrderedDict()
     if self.grad_cap>0:
         norm=T.cast(T.sqrt(T.sum([T.sum([T.sum(g**2) for g in g_list]) for g_list in grads]) + T.sum([T.sum(g**2) for g in sgrads])), theano.config.floatX)
         grads = [[T.switch(T.ge(norm, self.grad_cap), g*self.grad_cap/norm, g) for g in g_list] for g_list in grads]
         sgrads = [T.switch(T.ge(norm, self.grad_cap), g*self.grad_cap/norm, g) for g in sgrads]
     for p_list, g_list in zip(params, grads):
         for p, g in zip(p_list, g_list):
             if self.adapt:
                 if self.adapt == 'adagrad':
                     g = self.adagrad(p, g, updates)
                 if self.adapt == 'rmsprop':
                     g = self.rmsprop(p, g, updates)
                 if self.adapt == 'adadelta':
                     g = self.adadelta(p, g, updates)
                 if self.adapt == 'adam':
                     g = self.adam(p, g, updates)
             if self.momentum > 0:
                 velocity = theano.shared(p.get_value(borrow=False) * 0., borrow=True)
                 velocity2 = self.momentum * velocity - np.float32(self.learning_rate) * (g + self.lmbd * p)
                 updates[velocity] = velocity2
                 updates[p] = p + velocity2
             else:
                 updates[p] = p * np.float32(1.0 - self.learning_rate * self.lmbd) - np.float32(self.learning_rate) * g
     for i in range(len(sgrads)):
         g = sgrads[i]
         fullP = full_params[i]
         sample_idx = sidxs[i]
         sparam = sampled_params[i]
         if self.adapt:
             if self.adapt == 'adagrad':
                 g = self.adagrad(fullP, g, updates, sample_idx)
             if self.adapt == 'rmsprop':
                 g = self.rmsprop(fullP, g, updates, sample_idx)
             if self.adapt == 'adadelta':
                 g = self.adadelta(fullP, g, updates, sample_idx)
             if self.adapt == 'adam':
                 g = self.adam(fullP, g, updates, sample_idx)
         if self.lmbd > 0:
             delta = np.float32(self.learning_rate) * (g + self.lmbd * sparam)
         else:
             delta = np.float32(self.learning_rate) * g
         if self.momentum > 0:
             velocity = theano.shared(fullP.get_value(borrow=False) * 0., borrow=True)
             vs = velocity[sample_idx]
             velocity2 = self.momentum * vs - delta
             updates[velocity] = T.set_subtensor(vs, velocity2)
             updates[fullP] = T.inc_subtensor(sparam, velocity2)
         else:
             updates[fullP] = T.inc_subtensor(sparam, - delta)
     return updates
Example #16
    def _pyramid_step(self, x_h, x_zr, x_m, t, h_tm1):
        '''
        x_h/z/r: input at time t     shape=[batch, hid] or [hid]
        x_m: mask of x_t         shape=[batch] or [1]
        h_tm1: previous state    shape=[batch, t+1 or n_steps, hid] or [t+1 or n_steps, hid]
        '''
        if self.with_begin_tag:
            if x_h.ndim == 1 and h_tm1.ndim == 2:
                h_tm1 = T.set_subtensor(h_tm1[t,:], self.struct_begin_tag)
            elif x_h.ndim == 2 and h_tm1.ndim == 3:
                h_tm1 = T.set_subtensor(h_tm1[:,t,:], self.struct_begin_tag[None,:])
            else:
                raise NotImplementedError

        zr_t = T.dot(h_tm1, self.W_hzr)
        can_h_t = T.dot(h_tm1, self.W_hh)

        if x_h.ndim == 1 and h_tm1.ndim == 2:
            xzr = x_zr[None,:]
            xm = x_m[:,None]

            zr_t = T.inc_subtensor(zr_t[:t+1], xzr)
        elif x_h.ndim == 2 and h_tm1.ndim == 3:
            xzr = x_zr[:,None,:]
            xm = x_m[:,None,None]

            zr_t = T.inc_subtensor(zr_t[:,:t+1], xzr)
        else:
            raise NotImplementedError

        zr_t = T.nnet.sigmoid(zr_t)

        z_t = _slice(zr_t, 0, self.n_hids)
        r_t = _slice(zr_t, 1, self.n_hids)

        can_h_t *= r_t

        if x_h.ndim == 1 and h_tm1.ndim == 2:
            xh = x_h[None,:]
            can_h_t = T.inc_subtensor(can_h_t[:t+1], xh)
        elif x_h.ndim == 2 and h_tm1.ndim == 3:
            xh = x_h[:,None,:]
            can_h_t = T.inc_subtensor(can_h_t[:,:t+1], xh)
        else:
            raise NotImplementedError


        can_h_t = T.tanh(can_h_t)

        h_t = z_t * h_tm1 + (1 - z_t) * can_h_t
        h_t = xm * h_t + (1. - xm) * h_tm1
        return h_t
Example #17
    def __init__(self, n_from, n_to, de, seed=1692, init_params=None):
        """
        n_from :: number of from embeddings in the vocabulary
        n_to :: number of to embeddings in the vocabulary
        de :: dimension of the word embeddings
        """
        np.random.seed(seed)
        # parameters of the model
        if init_params is not None:
            with open('data/case_embeddings.pkl', 'rb') as f:
                temp = cPickle.load(f)
            self.Win = theano.shared(temp.Win.get_value().astype(theano.config.floatX))
            self.Wout = theano.shared(temp.Wout.get_value().astype(theano.config.floatX))
        else:
            self.Win = theano.shared(0.2 * np.random.uniform(-1.0, 1.0, (n_from, de)).astype(theano.config.floatX))
            self.Wout = theano.shared(0.2 * np.random.uniform(-1.0, 1.0, (n_to, de)).astype(theano.config.floatX))

        # adagrad
        self.cumulative_gradients_in = theano.shared(0.1 * np.ones((n_from, de)).astype(theano.config.floatX))
        self.cumulative_gradients_out = theano.shared(0.1 * np.ones((n_to, de)).astype(theano.config.floatX))

        idxs = TT.imatrix()
        x_in = self.Win[idxs[:, 0], :]
        x_out = self.Wout[idxs[:, 1], :]

        norms_in= TT.sqrt(TT.sum(x_in ** 2, axis=1))
        norms_out = TT.sqrt(TT.sum(x_out ** 2, axis=1))
        norms = norms_in * norms_out

        y = TT.vector('y')  # label
        y_predictions = TT.sum(x_in * x_out, axis=1) / norms

        # cost and gradients and learning rate
        loss = TT.mean(TT.sqr(y_predictions - y))
        gradients = TT.grad(loss, [x_in, x_out])

        updates = [
            (self.cumulative_gradients_in, TT.inc_subtensor(self.cumulative_gradients_in[idxs[:, 0]], gradients[0] ** 2)),
            (self.cumulative_gradients_out, TT.inc_subtensor(self.cumulative_gradients_out[idxs[:, 1]], gradients[1] ** 2)),
            (self.Win, TT.inc_subtensor(self.Win[idxs[:, 0]], - (0.5 / TT.sqrt(self.cumulative_gradients_in[idxs[:, 0]])) * gradients[0])),
            (self.Wout, TT.inc_subtensor(self.Wout[idxs[:, 1]], - (0.5 / TT.sqrt(self.cumulative_gradients_out[idxs[:, 1]])) * gradients[1])),
        ]

        # theano functions
        self.calculate_loss = theano.function(inputs=[idxs, y], outputs=loss)
        self.classify = theano.function(inputs=[idxs], outputs=y_predictions)
        self.train = theano.function(
            inputs=[idxs, y],
            outputs=loss,
            updates=updates,
            name='training_fn'
        )
Example #18
    def indexed_train_func(self, arc, learning_rate, prealloc_x, batch_size, apply_x=identity):
        ''' Train function with indexed restriction '''
        nnlayer = self.layers[arc]
        applied_cost = theano.clone(self.cost, replace={ self._x: apply_x(self._x) })

        updates = [ (nnlayer.W, T.inc_subtensor(nnlayer.W[:,nnlayer.idx], - learning_rate * T.grad(applied_cost, nnlayer.W)[:,nnlayer.idx].T))
                  , (nnlayer.b, T.inc_subtensor(nnlayer.b[nnlayer.idx],   - learning_rate * T.grad(applied_cost, nnlayer.b)[nnlayer.idx]))
                  , (nnlayer.b_prime, - learning_rate * T.grad(applied_cost, nnlayer.b_prime))
                  ]

        idx = T.iscalar('idx')
        givens = { self._x: prealloc_x[idx * batch_size:(idx+1) * batch_size] }
        return theano.function([idx, nnlayer.idx], None, updates=updates, givens=givens)
Example #19
        def add_synap_pre_inp(i,p,po,s,q):
            # i :: sequence
            # p :: pre | post
            # s :: dApre | dApost
            # q :: W
            index = T.nonzero(q[i,:self.Ne])
            np = T.inc_subtensor(p[i,index],s)
##            tmp = p[i,:]
##            tmp=T.inc_subtensor(tmp[index],s)
##            np=T.set_subtensor(p[i,:],tmp)
            #np = T.inc_subtensor(p[i,:],s)
            nw = T.inc_subtensor(q[i,:],po[i,:])
            nw=T.clip(nw,0,self.wmax)
            return {p:np,q:nw}
Example #20
def test_context_manager():
    x = tensor.vector()
    y = tensor.vector()
    z = tensor.inc_subtensor(x[1:3], y)

    xp = tensor.vector()
    yp = tensor.vector()
    zp = tensor.inc_subtensor(xp[1:1234], yp)

    vars = (1234, xp, yp)

    with variables(*vars):
        match, = run(0, vars, (eq, z, zp))

    assert match == (3, x, y)
Example #21
		def compute_numerical_gradient(v,i,X,Y):
			
			# perturb the input
			v_plus = T.inc_subtensor(v[i],self.eps)
			v_minus = T.inc_subtensor(v[i],-1.0*self.eps)

			# roll it back into the weight matrices and bias vectors
			wts_plus, bs_plus = nu.t_reroll(v_plus,nnet.num_nodes)
			wts_minus, bs_minus = nu.t_reroll(v_minus,nnet.num_nodes)
			
			# compute the loss for both sides, and then compute the numerical gradient
			loss_plus = nnet.compute_loss(X,Y,wts_plus,bs_plus)
			loss_minus = nnet.compute_loss(X,Y,wts_minus,bs_minus)
			
			return 1.0*(loss_plus-loss_minus)/(2*self.eps) # ( E(weights[i]+eps) - E(weights[i]-eps) )/(2*eps)
Example #22
 def step(i_,j_,Rij_,_U,_V):
     cftools.test_value(i_, np.array([0.5]))
     cftools.test_value(j_, np.array([0.5]))
     cftools.test_value(Rij_, np.array([0.5]))
     i = i_[0]
     j = j_[0]
     Rij = Rij_[0]
     eij = Rij - T.dot(_U[:,i].T, _V[:,j])
     new_U = T.inc_subtensor(_U[:,i], config.lr * eij * _V[:,j])
     eij = Rij - T.dot(new_U[:,i].T, _V[:,j])
     new_V = T.inc_subtensor(_V[:,j], config.lr * eij * new_U[:,i])
     return {
         _U:new_U,
         _V:new_V
     }
Example #23
    def call(self, inputs, mask=None):
        if not isinstance(inputs, list) or len(inputs) <= 1:
            raise TypeError('SpkLifeLongMemory must be called on a list of tensors '
                            '(at least 2). Got: ' + str(inputs))
        # (None(batch), 1), index of speaker
        target_spk_l = inputs[0]
        target_spk_l = K.reshape(target_spk_l, (target_spk_l.shape[0], ))
        if K.dtype(target_spk_l) != 'int32':
            target_spk_l = K.cast(target_spk_l, 'int32')
        # (None(batch), embed_dim)
        spk_vector_l = inputs[1]
        # Start to update life-long memory based on the learned speech vector
        # First do normalization
        spk_vector_eps = K.switch(K.equal(spk_vector_l, 0.), np.spacing(1), spk_vector_l)  # avoid zero
        spk_vector_eps = K.sqrt(K.sum(spk_vector_eps**2, axis=1))
        spk_vector_eps = spk_vector_eps.dimshuffle((0, 'x'))
        spk_vector = T.true_div(spk_vector_l, K.repeat_elements(spk_vector_eps, self.vec_dim, axis=1))
        # Store speech vector into life-long memory according to the speaker identity.
        life_long_mem = T.inc_subtensor(self.life_long_mem[target_spk_l, :], spk_vector)
        # Normalization for memory
        life_long_mem_eps = K.switch(K.equal(life_long_mem, 0.), np.spacing(1), life_long_mem)  # avoid 0
        life_long_mem_eps = K.sqrt(K.sum(life_long_mem_eps**2, axis=1))
        life_long_mem_eps = life_long_mem_eps.dimshuffle((0, 'x'))
        life_long_mem = T.true_div(life_long_mem, K.repeat_elements(life_long_mem_eps, self.vec_dim, axis=1))

        # (None(batch), spk_size, embed_dim)
        return life_long_mem
Example #24
 def __init__(self, n_out, x_out=None, delay=0, sparse=False, name="", network=None, eval_flag=False,
              data_key=None,  # if we don't want to use "data" but something else. via y_in
              # These will be given if we initialize via JSON.
              sources=None, dropout=0, train_flag=None, mask=None, index=None, y_in=None, dtype=None):
   super(SourceLayer, self).__init__(layer_class=self.layer_class, name=name)
   if data_key is not None:
     assert x_out is None
     assert network
     assert dtype
     network.use_target(target=data_key, dtype=dtype)
     x_out = network.y[data_key]
   if x_out is None:
     assert network is not None
     x_out = network.x
   assert not sources, 'specify `"from": "null"` in json'  # or just ignore?
   assert dropout == 0
   if getattr(x_out.tag, "test_value", None) is None:
     if not sparse:
       x_out.tag.test_value = numpy.random.rand(3,2,n_out).astype('float32')
   if index and getattr(index.tag, "test_value", None) is None:
     index.tag.test_value = numpy.ones((3,2), dtype='int8')
   if not delay:
     self.output = x_out
   else:
     self.output = T.inc_subtensor(T.zeros_like(x_out)[delay:], x_out[:-delay])
   self.set_attr('n_out', n_out)
   self.set_attr('sparse', sparse)
   self.set_attr('delay', delay)
   self.index = index
   self.device = 'cpu'
   self.eval_flag = eval_flag
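The delay branch above realises a zero-padded shift along the time axis; a small standalone sketch of the same idiom (the values are illustrative):

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
delay = 2
shifted = T.inc_subtensor(T.zeros_like(x)[delay:], x[:-delay])
f = theano.function([x], shifted)

seq = np.arange(8, dtype=theano.config.floatX).reshape(4, 2)
print(f(seq))  # the first `delay` rows are zeros, the rest is the input shifted down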
Example #25
    def step_fn(current_input_to_state, prev_c, prev_h):
        # all args have shape (batch size, output_dim, height)

        # TODO consider learning this padding
        prev_h_padded = T.zeros((batch_size, output_dim, 1+height), dtype=theano.config.floatX)
        prev_h_padded = T.inc_subtensor(prev_h_padded[:,:,1:], prev_h)

        state_to_state = lib.ops.conv1d.Conv1D(
            name+'.StateToState', 
            output_dim, 
            4*output_dim, 
            2, 
            prev_h_padded, 
            biases=False
        )

        gates = current_input_to_state + state_to_state

        o_f_i = T.nnet.sigmoid(gates[:,:3*output_dim,:])
        o = o_f_i[:,0*output_dim:1*output_dim,:]
        f = o_f_i[:,1*output_dim:2*output_dim,:]
        i = o_f_i[:,2*output_dim:3*output_dim,:]
        g = T.tanh(gates[:,3*output_dim:4*output_dim,:])

        new_c = (f * prev_c) + (i * g)
        new_h = o * T.tanh(new_c)

        return (new_c, new_h)
Example #26
    def fprop_step_mask(self, state_below, mask, state_before, U):
        """
        Scan function for case using masks

        Parameters
        ----------
        : todo
        state_below : TheanoTensor
        """

        g_on = tensor.inc_subtensor(
            state_below[:, self.dim:],
            tensor.dot(state_before, U[:, self.dim:])
        )
        r_on = tensor.nnet.sigmoid(g_on[:, self.dim:2*self.dim])
        u_on = tensor.nnet.sigmoid(g_on[:, 2*self.dim:])

        z_t = tensor.tanh(
            g_on[:, :self.dim] +
            tensor.dot(r_on * state_before, U[:, :self.dim])
        )
        z_t = u_on * state_before + (1. - u_on) * z_t
        z_t = mask[:, None] * z_t + (1 - mask[:, None]) * state_before

        return z_t
Example #27
    def test_simple_3d(self):
        """Increments or sets part of a tensor by a scalar using full slice and
        a partial slice depending on a scalar.
        """
        a = tt.dtensor3()
        increment = tt.dscalar()
        sl1 = slice(None)
        sl2_end = tt.lscalar()
        sl2 = slice(sl2_end)
        sl3 = 2

        for do_set in [True, False]:
            print "Set", do_set

            if do_set:
                resut = tt.set_subtensor(a[sl1, sl3, sl2], increment)
            else:
                resut = tt.inc_subtensor(a[sl1, sl3, sl2], increment)

            f = theano.function([a, increment, sl2_end], resut)

            val_a = numpy.ones((5, 3, 4))
            val_inc = 2.3
            val_sl2_end = 2

            expected_result = numpy.copy(val_a)
            result = f(val_a, val_inc, val_sl2_end)

            if do_set:
                expected_result[:, sl3, :val_sl2_end] = val_inc
            else:
                expected_result[:, sl3, :val_sl2_end] += val_inc

            self.assertTrue(numpy.array_equal(result, expected_result))
Example #28
 def fprop(self, XH):
     # XH is a list of inputs: [state_belows, state_befores]
     # each state vector is: [state_before; cell_before]
     # Hence, you use h[:, :self.nout] to compute recurrent term
     X, H = XH
     if len(X) != len(self.parent):
         raise AttributeError("The number of inputs doesn't match "
                              "with the number of parents.")
     if len(H) != len(self.recurrent):
         raise AttributeError("The number of inputs doesn't match "
                              "with the number of recurrents.")
     # The index of self recurrence is 0
     z_tm1 = H[0]
     z = T.zeros((X[0].shape[0], 3 * self.nout))
     for x, (parname, parout) in izip(X, self.parent.items()):
         W = self.params['W_' + parname + '__' + self.name]
         z += T.dot(x[:, :parout], W)
     for h, (recname, recout) in izip(H, self.recurrent.items()):
         U = self.params['U_' + recname + '__' + self.name]
         z = T.inc_subtensor(z[:, self.nout:],
                             T.dot(h[:, :recout], U[:, self.nout:]))
     z += self.params['b_' + self.name]
     # Compute activations of gating units
     r_on = T.nnet.sigmoid(z[:, self.nout:2 * self.nout])
     u_on = T.nnet.sigmoid(z[:, 2 * self.nout:])
     # Update hidden & cell states
     c_t = T.zeros_like(z_tm1)
     for h, (recname, recout) in izip(H, self.recurrent.items()):
         U = self.params['U_' + recname + '__' + self.name]
         c_t += T.dot(h[:, :recout], U[:, :self.nout])
     z_t = T.tanh(z[:, :self.nout] + r_on * c_t)
     z_t = u_on * z_tm1 + (1. - u_on) * z_t
     z_t.name = self.name
     return z_t
Example #29
def test_incsub_f16():
    shp = (3, 3)
    shared = gpuarray_shared_constructor
    xval = np.arange(np.prod(shp), dtype='float16').reshape(shp) + 1
    yval = np.empty((2,) + shp[1:], dtype='float16')
    yval[:] = 2
    x = shared(xval, name='x')
    y = tensor.tensor(dtype='float16',
                      broadcastable=(False,) * len(shp),
                      name='y')
    expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
    f = theano.function([y], expr, mode=mode_with_gpu)
    assert sum([isinstance(node.op, GpuAdvancedIncSubtensor1)
                for node in f.maker.fgraph.toposort()]) == 1
    rval = f(yval)
    rep = xval.copy()
    np.add.at(rep, [[0, 2]], yval)
    assert np.allclose(rval, rep)

    expr = tensor.inc_subtensor(x[1:], y)
    f = theano.function([y], expr, mode=mode_with_gpu)
    assert sum([isinstance(node.op, GpuIncSubtensor)
                for node in f.maker.fgraph.toposort()]) == 1
    rval = f(yval)
    rep = xval.copy()
    rep[1:] += yval
    assert np.allclose(rval, rep)
Example #30
def create_adam_updates(updates, params, gparams, gsums, xsums, lr, eps, beta1, beta2):
    i = theano.shared(np.float64(0.0).astype(theano.config.floatX))
    i_t = i + 1.0
    omb1_t = 1.0 - beta1**i_t
    omb2_t = 1.0 - beta2**i_t
    lr_t = lr * (T.sqrt(omb2_t) / omb1_t)
    for p, g, m, v in zip(params, gparams, gsums, xsums):
        if is_subtensor_op(p):
            origin, indexes = get_subtensor_op_inputs(p)
            m_sub = m[indexes]
            v_sub = v[indexes]
            m_t = beta1*m_sub + (1.0-beta1)*g
            v_t = beta2*v_sub + (1.0-beta2)*T.sqr(g)
            g_t = m_t / (T.sqrt(v_t) + eps)
            updates[m] = T.set_subtensor(m_sub, m_t)
            updates[v] = T.set_subtensor(v_sub, v_t)
            updates[origin] = T.inc_subtensor(p, -lr_t*g_t)
        else:
            m_t = beta1*m + (1.0-beta1)*g
            v_t = beta2*v + (1.0-beta2)*T.sqr(g)
            g_t = m_t / (T.sqrt(v_t) + eps)
            updates[m] = m_t
            updates[v] = v_t
            updates[p] = p - lr_t*g_t
    updates[i] = i_t
Example #31
 def forward(self, x):
     y = tt.zeros(x.shape)
     y = tt.inc_subtensor(y[..., 0], x[..., 0])
     y = tt.inc_subtensor(y[..., 1:], tt.log(x[..., 1:] - x[..., :-1]))
     return y
Example #32
def test_jax_basic():
    x = tt.matrix("x")
    y = tt.matrix("y")
    b = tt.vector("b")

    # `ScalarOp`
    z = tt.cosh(x**2 + y / 3.0)

    # `[Inc]Subtensor`
    out = tt.set_subtensor(z[0], -10.0)
    out = tt.inc_subtensor(out[0, 1], 2.0)
    out = out[:5, :3]

    out_fg = theano.gof.FunctionGraph([x, y], [out])

    test_input_vals = [
        np.tile(np.arange(10), (10, 1)).astype(tt.config.floatX),
        np.tile(np.arange(10, 20), (10, 1)).astype(tt.config.floatX),
    ]
    (jax_res, ) = compare_jax_and_py(out_fg, test_input_vals)

    # Confirm that the `Subtensor` slice operations are correct
    assert jax_res.shape == (5, 3)

    # Confirm that the `IncSubtensor` operations are correct
    assert jax_res[0, 0] == -10.0
    assert jax_res[0, 1] == -8.0

    out = tt.clip(x, y, 5)
    out_fg = theano.gof.FunctionGraph([x, y], [out])
    compare_jax_and_py(out_fg, test_input_vals)

    out = tt.diagonal(x, 0)
    out_fg = theano.gof.FunctionGraph([x], [out])
    compare_jax_and_py(
        out_fg,
        [np.arange(10 * 10).reshape((10, 10)).astype(tt.config.floatX)])

    out = tt.slinalg.cholesky(x)
    out_fg = theano.gof.FunctionGraph([x], [out])
    compare_jax_and_py(out_fg, [
        (np.eye(10) + np.random.randn(10, 10) * 0.01).astype(tt.config.floatX)
    ])

    # not sure why this isn't working yet with lower=False
    out = tt.slinalg.Cholesky(lower=False)(x)
    out_fg = theano.gof.FunctionGraph([x], [out])
    compare_jax_and_py(out_fg, [
        (np.eye(10) + np.random.randn(10, 10) * 0.01).astype(tt.config.floatX)
    ])

    out = tt.slinalg.solve(x, b)
    out_fg = theano.gof.FunctionGraph([x, b], [out])
    compare_jax_and_py(
        out_fg,
        [
            np.eye(10).astype(tt.config.floatX),
            np.arange(10).astype(tt.config.floatX)
        ],
    )

    out = tt.nlinalg.alloc_diag(b)
    out_fg = theano.gof.FunctionGraph([b], [out])
    compare_jax_and_py(out_fg, [np.arange(10).astype(tt.config.floatX)])

    out = tt.nlinalg.det(x)
    out_fg = theano.gof.FunctionGraph([x], [out])
    compare_jax_and_py(
        out_fg,
        [np.arange(10 * 10).reshape((10, 10)).astype(tt.config.floatX)])

    out = tt.nlinalg.matrix_inverse(x)
    out_fg = theano.gof.FunctionGraph([x], [out])
    compare_jax_and_py(out_fg, [
        (np.eye(10) + np.random.randn(10, 10) * 0.01).astype(tt.config.floatX)
    ])
Example #33
def get_pseudograd(loss, params, srng=None, eps_sigma=1.0, grad_prior=1.0, r = 1.0e-1):
  srng = get_srng(srng)
  eps = 1.0 / eps_sigma

  def step(i, param, eta, lam_diag, dx):
    upd = OrderedDict()

    upd[dx] = dx



    n = param.get_value(borrow=True).shape[0]

  one = T.constant(1.0)
  zero = T.constant(0.0)


  dx = T.fvector()

  pgrads = []

  for param in params:
    value = param.get_value(borrow=True)
    shape=value.shape
    n = np.prod(shape)

    i = T.iscalar()

    zeros = T.zeros(shape=n, dtype=param.dtype)
    delta = (2 * srng.binomial() - 1) * r

    inc = T.set_subtensor(zeros[i], delta).reshape(shape)

    new_loss = theano.clone(
      loss, replace={param: param + inc}
    )

    dloss = new_loss - loss

    eta = theano.shared(np.zeros(shape=n, dtype='float32'))
    lam_diag = theano.shared(np.ones(n, dtype='float32') * grad_prior)

    def u(i):
      upd = OrderedDict()
      upd[eta] = T.inc_subtensor(eta[i], dloss * eps * delta)
      upd[lam_diag] = T.inc_subtensor(lam_diag[i], eps * (r ** 2))

    _, upd = theano.scan(
      u,
      sequences=T.arange(n)
    )


  dloss = new_loss - loss

  upd[eta] = rho * T.inc_subtensor(eta[i], dloss * eps * T.sum(dx)) + (one - rho) * T.zeros(n)
  upd[lam_diag] = rho * T.inc_subtensor(lam_diag[i], eps * T.sum(dx) ** 2) + (one - rho) * T.ones(n)

  pgrad = eta / lam_diag

  upd[param] = param - learning_rate * pgrad

  t = theano.function([dx, i] + input, output, updates=upd)

  dx_ = np.zeros(n, dtype='float32')
Example #34
 def test_incsubtensor1(self):
     tv = numpy.asarray(self.rng.uniform(size=(3, )), theano.config.floatX)
     t = theano.shared(tv)
     out = tensor.inc_subtensor(self.x[:3], t)
     self.check_rop_lop(out, self.in_shape)
Example #35
import numpy as np
import theano
import theano.tensor as T
fX = theano.config.floatX
s = theano.shared(np.ones((10, 1), dtype=fX))
idxs = [0, 1, 1]
fn = theano.function([], updates=[(s, T.inc_subtensor(s[idxs], s[idxs]**2))])
fn()
print s.get_value()
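For reference, the duplicated index 1 above receives both increments, so the printed rows start with 2. and 3.; a NumPy sketch of the same semantics via np.add.at (written for this page, not part of the original snippet):

import numpy as np

s = np.ones((10, 1), dtype='float32')
idxs = [0, 1, 1]
np.add.at(s, idxs, s[idxs] ** 2)  # accumulates at repeated indices, like inc_subtensor
print(s[:3])                      # [[2.], [3.], [1.]]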
Example #36
print(z.eval({a: np.diag((3, 3)).astype(theano.config.floatX), b : 3}))

cond = T.vector('cond')

x, y = T.vectors('x', 'y')

z = T.switch(cond, x, y)

print(z.eval({ cond: [1, 0], x: [10, 10], y: [3, 2]}))

a = T.matrix('a')

print(T.max(a).eval({a: [[1, 2], [3, 4]]}))

print(T.max(a, axis=0).eval({a: [[1, 2], [3, 4]]}))

print(T.max(a, axis=1).eval({a: [[1, 2], [3, 4]]}))

a = T.arange(10).reshape((5, 2))
b = a[::-1]

print(b.eval())
print(T.concatenate([a, b]).eval())
print(T.concatenate([a, b], axis=1).eval())
print(T.stack([a, b]).eval())

a = T.arange(10).reshape((5, 2))
print(T.set_subtensor(a[3:], [-1, -1]).eval())
print(T.inc_subtensor(a[3:], [-1, -1]).eval())

Example #37
 def RMSprop(self,
             cost,
             params,
             full_params,
             sampled_params,
             sidxs,
             epsilon=1e-6):
     grads = [T.grad(cost=cost, wrt=param) for param in params]
     sgrads = [T.grad(cost=cost, wrt=sparam) for sparam in sampled_params]
     updates = OrderedDict()
     if self.grad_cap > 0:
         norm = T.cast(
             T.sqrt(
                 T.sum([
                     T.sum([T.sum(g**2) for g in g_list])
                     for g_list in grads
                 ]) + T.sum([T.sum(g**2) for g in sgrads])),
             theano.config.floatX)
         grads = [[
             T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm,
                      g) for g in g_list
         ] for g_list in grads]
         sgrads = [
             T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm,
                      g) for g in sgrads
         ]
     for p_list, g_list in zip(params, grads):
         for p, g in zip(p_list, g_list):
             if self.adapt == 'adagrad':
                 g = self.adagrad(p, g, updates)
             elif self.adapt == 'rmsprop':
                 g = self.rmsprop(p, g, updates)
             elif self.adapt == 'adadelta':
                 g = self.adadelta(p, g, updates)
             elif self.adapt == 'adam':
                 g = self.adam(p, g, updates)
             if self.momentum > 0:
                 velocity = theano.shared(p.get_value(borrow=False) * 0.,
                                          borrow=True)
                 velocity2 = self.momentum * velocity - np.float32(
                     self.learning_rate) * (g + self.lmbd * p)
                 updates[velocity] = velocity2
                 updates[p] = p + velocity2
             else:
                 updates[p] = p * np.float32(1.0 - self.learning_rate *
                                             self.lmbd) - np.float32(
                                                 self.learning_rate) * g
     for i in range(len(sgrads)):
         g = sgrads[i]
         fullP = full_params[i]
         sample_idx = sidxs[i]
         sparam = sampled_params[i]
         if self.adapt == 'adagrad':
             g = self.adagrad(fullP, g, updates, sample_idx)
         elif self.adapt == 'rmsprop':
             g = self.rmsprop(fullP, g, updates, sample_idx)
         elif self.adapt == 'adadelta':
             g = self.adadelta(fullP, g, updates, sample_idx)
         elif self.adapt == 'adam':
             g = self.adam(fullP, g, updates, sample_idx)
         if self.lmbd > 0:
             delta = np.float32(
                 self.learning_rate) * (g + self.lmbd * sparam)
         else:
             delta = np.float32(self.learning_rate) * g
         if self.momentum > 0:
             velocity = theano.shared(fullP.get_value(borrow=False) * 0.,
                                      borrow=True)
             vs = velocity[sample_idx]
             velocity2 = self.momentum * vs - delta
             updates[velocity] = T.set_subtensor(vs, velocity2)
             updates[fullP] = T.inc_subtensor(sparam, velocity2)
         else:
             updates[fullP] = T.inc_subtensor(sparam, -delta)
     return updates
Example #38
 def scan_step(index, prev_res, y_labeling, y_):
     res_t = T.inc_subtensor(prev_res[y_[index, T.arange(batch_size)],
                             T.arange(batch_size)],
                             y_labeling[index, T.arange(batch_size)])
     return res_t
Example #39
 def backward(self, y):
     out = tt.zeros(y.shape)
     out = tt.inc_subtensor(out[0], y[0])
     out = tt.inc_subtensor(out[1:], tt.exp(y[1:]))
     return tt.cumsum(out)
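Examples #31 and #39 are the forward and backward halves of the same log-difference (ordered) transform; a quick round-trip sketch combining them, written for this page rather than taken from the original project:

import numpy as np
import theano
import theano.tensor as tt

x = tt.vector('x')

# forward: keep the first element, log-difference the rest (as in Example #31)
y = tt.zeros(x.shape)
y = tt.inc_subtensor(y[0], x[0])
y = tt.inc_subtensor(y[1:], tt.log(x[1:] - x[:-1]))

# backward: exponentiate the differences and cumulatively sum (as in Example #39)
out = tt.zeros(y.shape)
out = tt.inc_subtensor(out[0], y[0])
out = tt.inc_subtensor(out[1:], tt.exp(y[1:]))
x_back = tt.cumsum(out)

f = theano.function([x], x_back)
print(f(np.array([0.1, 0.5, 2.0], dtype=theano.config.floatX)))  # ~ [0.1, 0.5, 2.0]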
Example #40
def _update_std(nnet, layer, dW, db, loss, idx=None):
    """
    update with standard feature vectors (i.e. non-compressed)
    """

    assert layer.isconv or layer.issvm

    if Cfg.store_on_gpu:
        assert idx is not None

    C = Cfg.C
    D = Cfg.D
    eps = Cfg.eps

    k = layer.k

    K = (C * D) / (C + D)

    W_s = dW * K * T.cast(1. / nnet.data.n_train, 'floatX')
    b_s = db * K * T.cast(1. / nnet.data.n_train, 'floatX')
    l_s = loss * T.cast(1. / nnet.data.n_train, 'floatX')

    if Cfg.store_on_gpu:
        DeltaW = W_s - layer.W_i[idx]
        Deltab = b_s - layer.b_i[idx]
        Deltal = l_s - layer.l_i[idx]
    else:
        DeltaW = W_s - layer.W_i_buffer
        Deltab = b_s - layer.b_i_buffer
        Deltal = l_s - layer.l_i_buffer

    gamma = (K * Deltal +
             T.sum(DeltaW * layer.W) +
             T.sum(Deltab * layer.b)) / \
        (eps + T.sum(DeltaW ** 2) + T.sum(Deltab ** 2))

    gamma = gamma.clip(0, 1)

    W = layer.W - gamma * DeltaW
    b = layer.b - gamma * Deltab
    l = layer.l + gamma * Deltal

    if Cfg.store_on_gpu:
        # new value
        W_i = T.inc_subtensor(layer.W_i[idx], gamma * DeltaW)
        b_i = T.inc_subtensor(layer.b_i[idx], gamma * Deltab)
        l_i = T.inc_subtensor(layer.l_i[idx], gamma * Deltal)

        # shared variable to update
        layer_W_i = layer.W_i
        layer_b_i = layer.b_i
        layer_l_i = layer.l_i
    else:
        # new value
        W_i = layer.W_i_buffer + gamma * DeltaW
        b_i = layer.b_i_buffer + gamma * Deltab
        l_i = layer.l_i_buffer + gamma * Deltal

        # shared variable to update
        layer_W_i = layer.W_i_buffer
        layer_b_i = layer.b_i_buffer
        layer_l_i = layer.l_i_buffer

    # average
    W_avg = T.cast((k * 1. / (k + 2)), 'floatX') * layer.W_avg + \
        T.cast((2. / (k + 2)), 'floatX') * W
    b_avg = T.cast((k * 1. / (k + 2)), 'floatX') * layer.b_avg + \
        T.cast((2. / (k + 2)), 'floatX') * b
    k = k + 1

    updates = ((layer.W, W), (layer.b, b), (layer.W_avg, W_avg),
               (layer.b_avg, b_avg), (layer.k, k), (layer.l, l),
               (layer_W_i, W_i), (layer_b_i, b_i), (layer_l_i,
                                                    l_i), (layer.gamma, gamma))

    return updates
Example #41
    def build_network(self, K, vocab_size, doc_var, query_var, docmask_var,
                      qmask_var, candmask_var, feat_var, W_init):

        l_docin = L.InputLayer(shape=(None, None, 1), input_var=doc_var)
        l_qin = L.InputLayer(shape=(None, None, 1), input_var=query_var)
        l_docmask = L.InputLayer(shape=(None, None), input_var=docmask_var)
        l_qmask = L.InputLayer(shape=(None, None), input_var=qmask_var)
        l_featin = L.InputLayer(shape=(None, None), input_var=feat_var)
        l_docembed = L.EmbeddingLayer(l_docin,
                                      input_size=vocab_size,
                                      output_size=EMBED_DIM,
                                      W=W_init)  # B x N x 1 x DE
        l_doce = L.ReshapeLayer(
            l_docembed,
            (doc_var.shape[0], doc_var.shape[1], EMBED_DIM))  # B x N x DE
        l_qembed = L.EmbeddingLayer(l_qin,
                                    input_size=vocab_size,
                                    output_size=EMBED_DIM,
                                    W=l_docembed.W)
        l_fembed = L.EmbeddingLayer(l_featin, input_size=2,
                                    output_size=2)  # B x N x 2

        if not EMB_TRAIN: l_docembed.params[l_docembed.W].remove('trainable')

        l_fwd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True)
        l_bkd_q = L.GRULayer(l_qembed,
                             NUM_HIDDEN,
                             grad_clipping=GRAD_CLIP,
                             mask_input=l_qmask,
                             gradient_steps=GRAD_STEPS,
                             precompute_input=True,
                             backwards=True)

        l_fwd_q_slice = L.SliceLayer(l_fwd_q, -1, 1)
        l_bkd_q_slice = L.SliceLayer(l_bkd_q, 0, 1)
        l_q = L.ConcatLayer([l_fwd_q_slice, l_bkd_q_slice])  # B x 2D
        q = L.get_output(l_q)  # B x 2D

        l_qs = [l_q]
        for i in range(K - 1):
            l_fwd_doc_1 = L.GRULayer(l_doce,
                                     NUM_HIDDEN,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True)
            l_bkd_doc_1 = L.GRULayer(l_doce,
                                     NUM_HIDDEN,
                                     grad_clipping=GRAD_CLIP,
                                     mask_input=l_docmask,
                                     gradient_steps=GRAD_STEPS,
                                     precompute_input=True,
                                     backwards=True)

            l_doc_1 = L.concat([l_fwd_doc_1, l_bkd_doc_1],
                               axis=2)  # B x N x DE

            l_fwd_q_1 = L.GRULayer(l_qembed,
                                   NUM_HIDDEN,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True)
            l_bkd_q_1 = L.GRULayer(l_qembed,
                                   NUM_HIDDEN,
                                   grad_clipping=GRAD_CLIP,
                                   mask_input=l_qmask,
                                   gradient_steps=GRAD_STEPS,
                                   precompute_input=True,
                                   backwards=True)

            l_q_c_1 = L.ConcatLayer([l_fwd_q_1, l_bkd_q_1],
                                    axis=2)  # B x Q x DE
            l_qs.append(l_q_c_1)

            qd = L.get_output(l_q_c_1)  # B x Q x DE
            dd = L.get_output(l_doc_1)  # B x N x DE
            M = T.batched_dot(dd, qd.dimshuffle((0, 2, 1)))  # B x N x Q
            alphas = T.nnet.softmax(
                T.reshape(M, (M.shape[0] * M.shape[1], M.shape[2])))
            alphas_r = T.reshape(alphas, (M.shape[0],M.shape[1],M.shape[2]))* \
                    qmask_var[:,np.newaxis,:] # B x N x Q
            alphas_r = alphas_r / alphas_r.sum(axis=2)[:, :,
                                                       np.newaxis]  # B x N x Q
            q_rep = T.batched_dot(alphas_r, qd)  # B x N x DE

            l_q_rep_in = L.InputLayer(shape=(None, None, 2 * NUM_HIDDEN),
                                      input_var=q_rep)
            l_doc_2_in = L.ElemwiseMergeLayer([l_doc_1, l_q_rep_in], T.mul)
            l_doce = L.dropout(l_doc_2_in, p=DROPOUT_RATE)  # B x N x DE

        l_doce = L.ConcatLayer([l_doce, l_fembed], axis=2)  # B x N x DE+2
        l_fwd_doc = L.GRULayer(l_doce,
                               NUM_HIDDEN,
                               grad_clipping=GRAD_CLIP,
                               mask_input=l_docmask,
                               gradient_steps=GRAD_STEPS,
                               precompute_input=True)
        l_bkd_doc = L.GRULayer(l_doce, NUM_HIDDEN, grad_clipping=GRAD_CLIP,
                mask_input=l_docmask, gradient_steps=GRAD_STEPS, precompute_input=True, \
                        backwards=True)

        l_doc = L.concat([l_fwd_doc, l_bkd_doc], axis=2)

        d = L.get_output(l_doc)  # B x N x 2D
        p = T.batched_dot(d, q)  # B x N
        pm = T.nnet.softmax(p) * candmask_var
        pm = pm / pm.sum(axis=1)[:, np.newaxis]

        index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
        final = T.inc_subtensor(T.alloc(0.,p.shape[0],vocab_size)[index,T.flatten(doc_var,outdim=2)],\
                pm)

        dv = L.get_output(l_doc, deterministic=True)  # B x N x 2D
        p = T.batched_dot(dv, q)  # B x N
        pm = T.nnet.softmax(p) * candmask_var
        pm = pm / pm.sum(axis=1)[:, np.newaxis]

        index = T.reshape(T.repeat(T.arange(p.shape[0]), p.shape[1]), p.shape)
        final_v = T.inc_subtensor(T.alloc(0.,p.shape[0],vocab_size)[index,\
                T.flatten(doc_var,outdim=2)],pm)

        return final, final_v, l_doc, l_qs
Example #42
 def backward(self, y):
     x = tt.zeros(y.shape)
     x = tt.inc_subtensor(x[..., 0], y[..., 0])
     x = tt.inc_subtensor(x[..., 1:], tt.exp(y[..., 1:]))
     return tt.cumsum(x, axis=-1)
Example #43
    def __init__(self, hidden_size, num_labels, num_features, embedding_size, \
                 fixed_embeddings, activation='logistic'):
        '''
        hidden_size :: dimension of the hidden layer
        num_labels :: number of labels
        num_features :: number of word embeddings in the vocabulary
        embedding_size :: dimension of the word embeddings
        activation :: logistic or tanh
        '''
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.num_features = num_features
        self.embedding_size = embedding_size
        self.original_embedding_size = fixed_embeddings.shape[0]
        self.bidirectional = True

        if activation == 'logistic':
            self.activation_function = T.nnet.sigmoid
        elif activation == 'tanh':
            self.activation_function = T.tanh
        else:
            raise NotImplementedError

        self.create_parameters()
        self.initialize_parameters()

        # Copy the fixed embeddings to self.emb.
        num_fixed_embeddings = fixed_embeddings.shape[1]
        self.num_fixed_embeddings = num_fixed_embeddings
        E = self.emb.get_value()
        E[:, :num_fixed_embeddings] = fixed_embeddings.astype(
            theano.config.floatX)
        self.emb.set_value(E)
        #T.set_subtensor(self.emb[:, :num_fixed_embeddings], \
        #                fixed_embeddings.astype(theano.config.floatX))

        # As many elements as words in the sentence.
        self.idxs = T.ivector()
        idxs = self.idxs

        #positions_nonupd = (idxs < num_fixed_embeddings).nonzero()[0]
        #positions_upd = (idxs >= num_fixed_embeddings).nonzero()[0]
        self.positions_nonupd = T.ivector()
        self.positions_upd = T.ivector()
        positions_nonupd = self.positions_nonupd
        positions_upd = self.positions_upd
        idxs_nonupd = idxs[positions_nonupd]
        idxs_upd = idxs[positions_upd]

        emb_nonupd = self.emb[:, idxs_nonupd]
        emb_upd = self.emb[:, idxs_upd]
        positions = T.concatenate([positions_nonupd, positions_upd])
        emb = T.concatenate([emb_nonupd, emb_upd], axis=1)
        emb = T.set_subtensor(emb[:, positions], emb)

        #emb = self.emb[:, idxs]
        #x = emb.T
        x = T.dot(self.Wx, emb).T

        #self.positions_to_update = T.ivector()
        #positions_to_update = self.positions_to_update
        #emb_to_update = emb[:, positions_to_update]

        self.y = T.iscalar('y')  # label.
        y = self.y

        #[h, s], _ = theano.scan(fn=self.recurrence_old,
        #                        sequences=x,
        #                        outputs_info=[self.h0, None],
        #                        n_steps=x.shape[0])

        h, _ = theano.scan(fn=self.recurrence,
                           sequences=x,
                           outputs_info=self.h0,
                           n_steps=x.shape[0])

        if self.bidirectional:
            l, _ = theano.scan(fn=self.recurrence_right_to_left,
                               sequences=x[::-1, :],
                               outputs_info=self.l0,
                               n_steps=x.shape[0])
            l = l[::-1, :]
            #s = T.nnet.softmax(T.dot(self.Why, h.T).T +
            #                   T.dot(self.Wly, l.T).T + self.by)
            s = T.nnet.softmax(
                T.dot(self.Why, h[-1, :]) + T.dot(self.Wly, l[0, :]) + self.by)
        else:
            #s = T.nnet.softmax(T.dot(self.Why, h.T).T + self.by)
            s = T.nnet.softmax(T.dot(self.Why, h[-1, :]) + self.by)

        p_y_given_x_sentence = s[0]  # check.
        self.y_pred = T.argmax(p_y_given_x_sentence)
        y_pred = self.y_pred

        self.num_mistakes = 1 - T.eq(y, y_pred)

        # cost and gradients and learning rate
        self.lr = T.scalar('lr')
        lr = self.lr
        #self.sentence_nll = -T.mean(T.log(p_y_given_x_sentence)
        #                       [T.arange(x.shape[0]), y])
        self.sentence_nll = -T.log(p_y_given_x_sentence[y])

        params_to_update = self.params[1:]
        sentence_gradients = T.grad(self.sentence_nll, params_to_update)
        sentence_gradient_emb = T.grad(self.sentence_nll, emb_upd)
        sentence_update_emb = [(self.emb,
                                T.inc_subtensor(emb_upd,
                                                -lr * sentence_gradient_emb))]
        self.sentence_updates = OrderedDict(
            (p, p - lr * g)
            for p, g in zip(params_to_update, sentence_gradients))
        self.sentence_updates.update(sentence_update_emb)

        self.classify = theano.function(
            inputs=[idxs, positions_upd, positions_nonupd],
            outputs=[y_pred, p_y_given_x_sentence])
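The gather-and-scatter trick above (split the word positions into fixed and updatable ones, look both groups up, then write them back into sentence order with set_subtensor) can be illustrated with a small numpy sketch; all names and sizes here are illustrative.

import numpy as np

full_emb = np.random.randn(3, 10)              # (embedding_size, vocabulary)
idxs = np.array([7, 1, 9, 2, 4])               # word ids of the sentence
positions_nonupd = np.array([1, 3])            # positions with fixed embeddings
positions_upd = np.array([0, 2, 4])            # positions with trainable embeddings

cols = np.concatenate([full_emb[:, idxs[positions_nonupd]],
                       full_emb[:, idxs[positions_upd]]], axis=1)
positions = np.concatenate([positions_nonupd, positions_upd])

sentence_emb = np.empty_like(cols)
sentence_emb[:, positions] = cols              # scatter back to sentence order
assert np.allclose(sentence_emb, full_emb[:, idxs])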
Example #44
    def fprop(self, XH, tparams):

        # XH is a list of inputs: [state_belows, state_befores]
        # each state vector is: [state_before; cell_before]
        # Hence, you use h[:, :self.nout] to compute recurrent term
        X, H = XH

        if len(X) != len(self.parent):
            raise AttributeError("The number of inputs doesn't match "
                                 "with the number of parents.")

        if len(H) != len(self.recurrent):
            raise AttributeError("The number of inputs doesn't match "
                                 "with the number of recurrents.")

        # The index of self recurrence is 0
        z_t = H[0]
        Nm = len(self.recurrent)
        z = T.zeros((X[0].shape[0], 4 * self.nout + Nm),
                    dtype=theano.config.floatX)

        for x, (parname, parout) in izip(X, self.parent.items()):
            W = tparams['W_' + parname + '__' + self.name]

            if x.ndim == 1:
                if 'int' not in x.dtype:
                    x = T.cast(x, 'int64')
                z += W[x]
            else:
                z += T.dot(x[:, :parout], W)

        for h, (recname, recout) in izip(H, self.recurrent.items()):
            U = tparams['U_' + recname + '__' + self.name]
            z = T.inc_subtensor(z[:, self.nout:],
                                T.dot(h[:, :recout], U[:, self.nout:]))

        z += tparams['b_' + self.name]

        # Compute activations of gating units
        i_t = T.nnet.sigmoid(z[:, self.nout:2 * self.nout])
        f_t = T.nnet.sigmoid(z[:, 2 * self.nout:3 * self.nout])
        o_t = T.nnet.sigmoid(z[:, 3 * self.nout:4 * self.nout])
        gron = T.nnet.sigmoid(z[:, 4 * self.nout:])
        c_t = z[:, :self.nout]

        for i, (h, (recname, recout)) in\
            enumerate(izip(H, self.recurrent.items())):
            gated_h = h[:, :recout] * gron[:, i].dimshuffle(0, 'x')
            U = tparams['U_' + recname + '__' + self.name]
            c_t += T.dot(gated_h, U[:, :self.nout])

        # Update hidden & cell states
        z_t = T.set_subtensor(
            z_t[:, self.nout:],
            f_t * z_t[:, self.nout:] + i_t * self.nonlin(c_t))

        z_t = T.set_subtensor(z_t[:, :self.nout],
                              o_t * self.nonlin(z_t[:, self.nout:]))

        z_t.name = self.name

        return z_t
Example #45
def set_subtensor(subtensor, amnt):
    return T.inc_subtensor(subtensor, amnt, set_instead_of_inc=True)
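A hedged usage sketch of this helper: with set_instead_of_inc=True the selected entries are overwritten instead of accumulated (the variables below are illustrative).

import numpy as np
import theano
import theano.tensor as T

v = T.vector('v')
inc = T.inc_subtensor(v[:2], 1.0)                            # adds 1 to v[0:2]
set_ = T.inc_subtensor(v[:2], 1.0, set_instead_of_inc=True)  # overwrites v[0:2] with 1
f = theano.function([v], [inc, set_])
print(f(np.ones(4, dtype=theano.config.floatX)))
# [array([2., 2., 1., 1.]), array([1., 1., 1., 1.])]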
Example #46
 def get_output_for(self, input, deterministic=False, **kwargs):
     out, r = T.zeros(self.get_output_shape_for(input.shape)), self.upscale
     for y, x in itertools.product(range(r), repeat=2):
         out = T.inc_subtensor(out[:, :, y::r, x::r],
                               input[:, r * y + x::r * r, :, :])
     return out
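A numpy sketch of the channel-to-space interleaving this layer performs, assuming the output shape is (batch, channels // r**2, height * r, width * r); shapes and names are illustrative.

import numpy as np

def upscale_np(inp, r):
    b, c, h, w = inp.shape
    out = np.zeros((b, c // (r * r), h * r, w * r), dtype=inp.dtype)
    for y in range(r):
        for x in range(r):
            out[:, :, y::r, x::r] += inp[:, r * y + x::r * r, :, :]
    return out

print(upscale_np(np.arange(16.).reshape(1, 4, 2, 2), 2).shape)  # (1, 1, 4, 4)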
Example #47
 def test_incsubtensor2(self):
     tv = numpy.asarray(self.rng.uniform(size=(10, )), theano.config.floatX)
     t = theano.shared(tv)
     out = tensor.inc_subtensor(t[:4], self.x[:4])
     self.check_rop_lop(out, (10, ))
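For context, a small hedged sketch of the property this R-op/L-op test relies on: inc_subtensor is differentiable, and the gradient with respect to the increment is simply the matching slice of the upstream gradient (illustrative variables).

import numpy as np
import theano
from theano import tensor

t = theano.shared(np.zeros(10, dtype=theano.config.floatX))
x = tensor.vector('x')
out = tensor.inc_subtensor(t[:4], x[:4])
g = tensor.grad(out.sum(), x)
print(theano.function([x], g)(np.ones(10, dtype=theano.config.floatX)))
# [1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]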
Example #48
    def costs(self, application_call, prediction, prediction_mask, groundtruth,
              groundtruth_mask, **inputs):
        def _prediction_subtensor(data):
            if data.ndim != 3:
                raise ValueError
            flat_data = data.reshape(
                (data.shape[0] * data.shape[1], data.shape[2]))
            flat_data = flat_data[tensor.arange(flat_data.shape[0]),
                                  prediction.flatten()]
            return flat_data.reshape(
                (prediction.shape[0], prediction.shape[1]))

        attended = disconnected_grad(inputs.pop('attended'))
        attended_mask = disconnected_grad(inputs.pop('attended_mask'))

        # Compute the rewards
        rewards = self.reward_brick.apply(prediction, prediction_mask,
                                          groundtruth, groundtruth_mask)[:, :,
                                                                         0]
        future_rewards = rewards[::-1].cumsum(axis=0)[::-1]

        # Compute the critic outputs
        if self.critic:
            padding = tensor.repeat(tensor.fill(prediction[0:1],
                                                self.bos_token),
                                    1,
                                    axis=0)
            mask_padding = tensor.repeat(tensor.fill(prediction_mask[0:1], 1.),
                                         1,
                                         axis=0)
            padded_prediction = tensor.concatenate([padding, prediction])
            padded_prediction_mask = tensor.concatenate(
                [mask_padding, prediction_mask])
            if self.critic_uses_groundtruth:
                critic_context = groundtruth
                critic_context_mask = groundtruth_mask
            else:
                critic_context = tensor.zeros_like(groundtruth[0:1])
                critic_context_mask = tensor.zeros_like(groundtruth_mask[0:1])
            critic_kwargs = dict(prediction=padded_prediction,
                                 prediction_mask=padded_prediction_mask,
                                 groundtruth=critic_context,
                                 groundtruth_mask=critic_context_mask,
                                 inputs=critic_context,
                                 inputs_mask=critic_context_mask)

            if self.critic_uses_actor_states:
                extra_inputs = disconnected_grad(inputs['states'])
                # We don't need the very last hidden state of the actor
                # in extra_inputs. We have to add something instead for the shapes
                # to match; it doesn't matter what exactly we add.
                critic_kwargs['extra_inputs'] = tensor.concatenate(
                    [extra_inputs,
                     tensor.zeros_like(extra_inputs[0:1])])
            critic_cg = ComputationGraph(self.critic.costs(**critic_kwargs))
            outputs, = VariableFilter(
                applications=[self.critic.generator.readout.all_outputs],
                roles=[OUTPUT])(critic_cg)
            # The first subtensor should be discarded, because it was outputted
            # for the padding. In addition to that Q-values from the first
            # 'critic_burnin_steps' will be ignored, see later in the code.
            outputs = outputs[1:]
        else:
            outputs = self.merge(**dict_subset(inputs, self.merge_names))
        prediction_outputs = _prediction_subtensor(outputs)

        # Compute Q adjustments
        adjustments = outputs
        prediction_adjustments = prediction_outputs
        if self.accumulate_outputs:
            prediction_adjustments = prediction_outputs.cumsum(axis=0)
            adjustments = tensor.inc_subtensor(
                adjustments[1:], prediction_adjustments[:-1][:, :, None])

        # Compute shared additive biases for all Q values
        if self.use_value_biases:
            value_biases = (self.value_summand.apply(attended)[:, :, 0] *
                            attended_mask).sum(axis=0)
        else:
            value_biases = tensor.zeros_like(adjustments[0, :, 0])
        values = adjustments + value_biases[None, :, None]
        prediction_values = prediction_adjustments + value_biases[None, :]

        rolled_prediction_mask = tensor.roll(prediction_mask, -1, axis=0)
        rolled_prediction_mask = tensor.set_subtensor(
            rolled_prediction_mask[-1], 0)

        # Compute probabilities
        logs = self.scores(use_epsilon=False, **inputs)
        probs = tensor.exp(logs)
        if self.trpo_coef:
            logger.debug("Using TRPO coefficient of {}".format(self.trpo_coef))
            old_probs = tensor.tensor3('probs')
        else:
            old_probs = tensor.zeros_like(probs)
        prediction_logs = _prediction_subtensor(logs)

        # Compute value targets
        value_targets = (disconnected_grad(probs) * values).sum(axis=-1)
        value_targets = tensor.roll(value_targets, -1, axis=0)
        value_targets = (
            self.discount * value_targets * rolled_prediction_mask + rewards)
        value_targets = value_targets.astype(theano.config.floatX)

        total_costs = 0

        # Compute critic cost
        if not self.compute_targets:
            logger.debug("Using given targets")
            value_targets = tensor.matrix('value_targets')
        if self.solve_bellman == 'no':
            logger.debug("Not solving Bellman, just predicting the rewards")
            value_targets = rewards.copy(name='value_targets')
        elif self.solve_bellman == 'without_dp':
            future_rewards = rewards[::-1].cumsum(axis=0)[::-1]
            logger.debug("Solving Bellman, but without DP")
            value_targets = future_rewards
        elif self.solve_bellman is not True:
            raise ValueError()
        critic_errors = prediction_values - value_targets
        if self.critic_loss == 'L2':
            logger.debug("L2 loss for the critic")
            critic_costs_per_char = critic_errors**2 * prediction_mask
        elif self.critic_loss == 'huber':
            logger.debug("Huber loss for the critic")
            use_L2 = tensor.lt(abs(critic_errors), 0.5)
            critic_costs_per_char = (
                use_L2 * critic_errors**2 +
                (1 - use_L2) * abs(critic_errors)) * prediction_mask
        else:
            raise ValueError()
        critic_costs = critic_costs_per_char[self.critic_burnin_steps:].sum(
            axis=0)
        if not self.freeze_critic:
            total_costs += critic_costs

        # Compute critic Monte-Carlo cost
        critic_monte_carlo_costs = (
            (((prediction_values - future_rewards)**2) *
             prediction_mask)[self.critic_burnin_steps:].sum(axis=0))

        # Value penalty
        if self.value_penalty:
            logger.debug("Use value penalty")
            if self.value_penalty_type == 'L2':
                value_deviations = (values -
                                    values.mean(axis=-1, keepdims=True))**2
            elif self.value_penalty_type == 'L1':
                value_deviations = abs(values -
                                       values.mean(axis=-1, keepdims=True))
            else:
                raise ValueError("unknown value penalty type {}".format(
                    self.value_penalty_type))
            if not self.freeze_critic:
                total_costs += (
                    self.value_penalty *
                    (value_deviations.sum(axis=-1) *
                     prediction_mask)[self.critic_burnin_steps:].sum(axis=0))

        # Compute actor cost
        if self.critic:
            # The actor cost will be minimized, that's why values
            # must be negated.
            est_name = self.actor_grad_estimate
            if est_name == 'all_actions':
                disadvantages = disconnected_grad(
                    values.max(axis=-1)[:, :, None] - values)
                actor_costs = ((probs * disadvantages).sum(axis=-1) *
                               prediction_mask)
                actor_costs = actor_costs[self.critic_burnin_steps:]
            elif est_name.startswith('1_action'):
                # Here we do not provide a target for the first step,
                # because we lack an estimate of the value of the initial
                # state (this is how our critic works). Hopefully the
                # network won't unlearn to produce a BOS first.
                future_reward_estimate = (future_rewards
                                          if est_name.endswith('unbiased') else
                                          prediction_values)
                weights = -disconnected_grad(future_reward_estimate[1:] +
                                             rewards[:-1] -
                                             prediction_values[:-1])
                actor_costs = ((prediction_logs[1:] * weights) *
                               prediction_mask[1:])
                actor_costs = actor_costs[self.critic_burnin_steps + 1:]
            else:
                raise ValueError
            actor_costs = actor_costs.sum(axis=0)

            actor_entropies = (probs * -logs).sum(axis=-1) * prediction_mask
            actor_entropies = actor_entropies[self.critic_burnin_steps:].sum(
                axis=0)
            old_actor_cross_entropies = (old_probs *
                                         -logs).sum(axis=-1) * prediction_mask
            old_actor_cross_entropies = old_actor_cross_entropies[
                self.critic_burnin_steps:].sum(axis=0)
            critic_policy = disconnected_grad(
                self.softmax.apply(self.critic_policy_t * values,
                                   extra_ndim=1))
            critic_cross_entropies = ((critic_policy * -logs).sum(axis=-1) *
                                      prediction_mask)
            critic_cross_entropies = critic_cross_entropies[
                self.critic_burnin_steps:].sum(axis=0)
            actor_costs_with_penalties = (
                actor_costs - self.entropy_reward_coof * actor_entropies
                # But really, should it be minus here, below?
                - self.cross_entropy_reward_coof * critic_cross_entropies +
                self.trpo_coef * old_actor_cross_entropies)
            if not self.freeze_actor:
                total_costs += actor_costs_with_penalties
            else:
                total_costs += disconnected_grad(actor_costs_with_penalties)

        # Add auxiliary variables for intermediate steps of the computation
        application_call.add_auxiliary_variable(rewards, name='rewards')
        application_call.add_auxiliary_variable(value_biases,
                                                name='value_biases')
        application_call.add_auxiliary_variable(values.copy(), name='values')
        application_call.add_auxiliary_variable(outputs.copy(), name='outputs')
        application_call.add_auxiliary_variable(prediction_values,
                                                name='prediction_values')
        application_call.add_auxiliary_variable(prediction_outputs,
                                                name='prediction_outputs')
        application_call.add_auxiliary_variable(value_targets.copy(),
                                                name='value_targets')
        application_call.add_auxiliary_variable(probs.copy(), name='probs')
        application_call.add_auxiliary_variable(prediction_logs,
                                                name='prediction_log_probs')

        # Compute some statistics for debugging
        last_character_mask = prediction_mask - rolled_prediction_mask
        last_character_costs = (critic_costs_per_char *
                                last_character_mask).sum(axis=0)
        mean2_output = (((prediction_outputs**2) * prediction_mask).sum() /
                        prediction_mask.sum())**0.5
        max_output = abs(prediction_outputs * prediction_mask).max()
        expected_reward = (probs[0] * values[0]).sum(axis=-1)
        application_call.add_auxiliary_variable(last_character_costs,
                                                name='last_character_costs')
        application_call.add_auxiliary_variable(critic_costs.mean(),
                                                name='mean_critic_cost')
        application_call.add_auxiliary_variable(
            critic_monte_carlo_costs.mean(),
            name='mean_critic_monte_carlo_cost')
        if self.critic:
            application_call.add_auxiliary_variable(actor_costs.mean(),
                                                    name='mean_actor_cost')
            application_call.add_auxiliary_variable(actor_entropies.mean(),
                                                    name='mean_actor_entropy')
        application_call.add_auxiliary_variable(expected_reward.mean(),
                                                name='mean_expected_reward')
        application_call.add_auxiliary_variable(mean2_output,
                                                name='mean2_output')
        application_call.add_auxiliary_variable(max_output, name='max_output')

        return total_costs
Example #49
    def _create_update_fun(self):
        """
            Given examples of the form:
                
                (
                    [1100, 1200, 12],
                    [1, 2, 0, 0, 1, 0]
                )
                
            Corresponding to a sequence of words 1100, 1200 and an object 12,
            with labels for class 1, 1, class 2, 2, and for the sigmoid classes
            the third class active, we can do regression.
            
        """

        input = T.ivector('input')
        input_object = T.iscalar('input_object_index')
        labels = T.ivector('labels')
        sigmoid_labels = T.ivector('sigmoid_labels')

        embeddings = self.model_matrix[input]
        object_embedding = self.object_matrix[input_object]

        if self.concatenate:
            # or we concatenate all the words and add the object to it
            merged_embeddings = T.concatenate(
                [embeddings.ravel(), object_embedding])
        else:
            # or we sum all the words and add the object to it:
            merged_embeddings = embeddings.sum(axis=1) + object_embedding

        preds, prediction, error = self.projection_fun(merged_embeddings,
                                                       labels, sigmoid_labels)

        updates = OrderedDict()

        gparams = T.grad(error, self.params)

        for gparam, param in zip(gparams, self.params):
            if param == self.model_matrix:
                updates[param] = T.inc_subtensor(param[input],
                                                 -self.alpha * gparam[input])
            elif param == self.object_matrix:
                updates[param] = T.inc_subtensor(
                    param[input_object], -self.alpha * gparam[input_object])
            else:
                updates[param] = param - self.alpha * gparam

        self.predict_proba = theano.function([input, input_object],
                                             preds + [prediction],
                                             mode=self.theano_mode)
        self.predict = theano.function([input, input_object],
                                       [pred.argmax() for pred in preds] +
                                       [prediction.round()],
                                       mode=self.theano_mode)

        input_vector = T.vector()
        alt_preds, alt_prediction, alt_error = self.projection_fun(
            input_vector, labels, sigmoid_labels)
        self.predict_vector = theano.function(
            [input_vector],
            [pred.argmax() for pred in alt_preds] + [alt_prediction.round()],
            mode=self.theano_mode)

        self.predict_vector_proba = theano.function([input_vector],
                                                    alt_preds +
                                                    [alt_prediction],
                                                    mode=self.theano_mode)

        training_inputs = []
        if len(self.output_classes) > 0:
            training_inputs.append(labels)
        if self.output_sigmoid_classes > 0:
            training_inputs.append(sigmoid_labels)

        self.gradient_fun = theano.function([input, input_object] +
                                            training_inputs,
                                            gparams,
                                            mode=self.theano_mode)
        self.update_fun = theano.function([input, input_object] +
                                          training_inputs,
                                          error,
                                          updates=updates,
                                          mode=self.theano_mode)
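The update rule above touches only the embedding rows indexed by the current example. Assuming the indices are unique, the row-wise inc_subtensor step matches a full dense SGD step, because the dense gradient is zero everywhere else; a hedged numpy check with illustrative sizes:

import numpy as np

alpha = 0.1
model_matrix = np.random.randn(6, 4)
input_idx = np.array([1, 3, 5])                  # rows touched by this example
grad_rows = np.random.randn(3, 4)                # gradient of the touched rows

dense_grad = np.zeros_like(model_matrix)
dense_grad[input_idx] = grad_rows                # the rest of the gradient is zero

dense_step = model_matrix - alpha * dense_grad   # full dense update
sparse_step = model_matrix.copy()
sparse_step[input_idx] -= alpha * grad_rows      # row-wise update, like inc_subtensor
assert np.allclose(dense_step, sparse_step)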
Example #50
    def step(x_t, M_tm1, c_tm1, h_tm1, r_tm1, wr_tm1, wu_tm1):
        # Feed Forward controller
        # h_t = lasagne.nonlinearities.tanh(T.dot(x_t, W_h) + b_h)
        # LSTM controller
        # p.3: "This memory is used by the controller as the input to a classifier,
        #       such as a softmax output layer, and as an additional
        #       input for the next controller state." -> T.dot(r_tm1, W_rh)
        preactivations = T.dot(x_t, W_xh) + T.dot(r_tm1, W_rh) + T.dot(
            h_tm1, W_hh) + b_h
        gf_, gi_, go_, u_ = slice_equally(preactivations, controller_size, 4)
        gf = lasagne.nonlinearities.sigmoid(gf_)
        gi = lasagne.nonlinearities.sigmoid(gi_)
        go = lasagne.nonlinearities.sigmoid(go_)
        u = lasagne.nonlinearities.tanh(u_)

        c_t = gf * c_tm1 + gi * u
        h_t = go * lasagne.nonlinearities.tanh(c_t)  # (batch_size, num_units)

        k_t = lasagne.nonlinearities.tanh(
            T.dot(h_t, W_key) +
            b_key)  # (batch_size, nb_reads, memory_size[1])
        a_t = lasagne.nonlinearities.tanh(
            T.dot(h_t, W_add) +
            b_add)  # (batch_size, nb_reads, memory_size[1])
        sigma_t = lasagne.nonlinearities.sigmoid(
            T.dot(h_t, W_sigma) + b_sigma)  # (batch_size, nb_reads, 1)
        sigma_t = T.addbroadcast(sigma_t, 2)

        wlu_tm1 = T.argsort(wu_tm1,
                            axis=1)[:, :nb_reads]  # (batch_size, nb_reads)
        # ww_t = sigma_t * wr_tm1 + (1. - sigma_t) * wlu_tm1
        ww_t = (sigma_t * wr_tm1).reshape(
            (batch_size * nb_reads, memory_shape[0]))
        ww_t = T.inc_subtensor(
            ww_t[T.arange(batch_size * nb_reads),
                 wlu_tm1.flatten()],
            1. - sigma_t.flatten())  # (batch_size * nb_reads, memory_size[0])
        ww_t = ww_t.reshape(
            (batch_size, nb_reads,
             memory_shape[0]))  # (batch_size, nb_reads, memory_size[0])

        # p.4: "Prior to writing to memory, the least used memory location is
        #       computed from wu_tm1 and is set to zero"
        M_t = T.set_subtensor(M_tm1[T.arange(batch_size), wlu_tm1[:, 0]], 0.)
        M_t = M_t + T.batched_dot(ww_t.dimshuffle(
            0, 2, 1), a_t)  # (batch_size, memory_size[0], memory_size[1])
        K_t = cosine_similarity(k_t,
                                M_t)  # (batch_size, nb_reads, memory_size[0])

        wr_t = lasagne.nonlinearities.softmax(
            K_t.reshape((batch_size * nb_reads, memory_shape[0])))
        wr_t = wr_t.reshape(
            (batch_size, nb_reads,
             memory_shape[0]))  # (batch_size, nb_reads, memory_size[0])
        if batch_size == 1:
            wr_t = T.unbroadcast(wr_t, 0)
        wu_t = gamma * wu_tm1 + T.sum(wr_t, axis=1) + T.sum(
            ww_t, axis=1)  # (batch_size, memory_size[0])

        r_t = T.batched_dot(wr_t, M_t).flatten(
            ndim=2)  # (batch_size, nb_reads * memory_size[1])

        return M_t, c_t, h_t, r_t, wr_t, wu_t
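The write-weight computation above flattens the batch and read-head axes so a single advanced-index inc_subtensor can add (1 - sigma) at each head's least-used memory slot; a hedged numpy sketch with illustrative sizes:

import numpy as np

batch_size, nb_reads, mem_slots = 2, 3, 5
sigma = np.random.rand(batch_size, nb_reads, 1)
wr_prev = np.random.rand(batch_size, nb_reads, mem_slots)            # previous read weights
wlu_prev = np.random.randint(0, mem_slots, (batch_size, nb_reads))   # least-used slots

ww = (sigma * wr_prev).reshape(batch_size * nb_reads, mem_slots)
ww[np.arange(batch_size * nb_reads), wlu_prev.flatten()] += (1. - sigma).flatten()
ww = ww.reshape(batch_size, nb_reads, mem_slots)
print(ww.shape)  # (2, 3, 5)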
Example #51
def optimization_sgd(trainvec,
                     testvec,
                     n_epochs,
                     batch_size,
                     alpha=0.01,
                     beta=0.05):
    i = T.lvector('i')
    j = T.lvector('j')
    x = T.dvector('x')
    num_user = 6040
    num_item = 3952
    factors = 20
    init_mean = 0
    init_stdev = 0.02
    mfobj = MF_Batch(i, j, num_user, num_item, factors, init_mean, init_stdev)
    regcost, error = mfobj.errors(x, beta)
    gp, gq = T.grad(cost=regcost, wrt=[mfobj.P, mfobj.Q])
    updates = [(mfobj.P, T.inc_subtensor(mfobj.P[i, :], -gp[i, :] * alpha)),
               (mfobj.Q, T.inc_subtensor(mfobj.Q[j, :], -gq[j, :] * alpha))]
    train_model = theano.function(
        inputs=[i, j, x],
        #givens=[(mfobj.P[i, :]), mfobj.Q[:, j]],
        outputs=regcost,
        updates=updates)

    test_model = theano.function(
        inputs=[i, j, x],
        #givens=[(mfobj.P[i, :]), mfobj.Q[:, j]],
        outputs=error)

    mean_rating = np.mean(trainvec[:, 2])
    done_looping = False
    epoch = 0
    N = len(trainvec)

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        totalErrors = 0
        testErrors = 0
        for k in range(int(math.floor(N / batch_size))):
            batch = np.arange(k * batch_size, min(N - 1, (k + 1) * batch_size))
            idi = trainvec[batch, 0] - 1
            idj = trainvec[batch, 1] - 1
            ratings = trainvec[batch, 2] - mean_rating
            minibatch_cost = train_model(idi, idj, ratings)
            totalErrors += minibatch_cost

        NN = len(testvec)
        batch_size = 1000
        for k in range(int(math.floor(NN / batch_size))):
            batch = np.arange(k * batch_size, min(NN - 1,
                                                  (k + 1) * batch_size))
            p_idx = testvec[batch, 0] - 1
            q_idx = testvec[batch, 1] - 1
            ratings = testvec[batch, 2] - mean_rating
            testErrors += test_model(p_idx, q_idx, ratings)
        print(
            "the training cost at epoch {} is {}, and the testing error is {}".
            format(epoch, np.sqrt(totalErrors / N), np.sqrt(testErrors / NN)))

        # test it on the test dataset
    NN = len(testvec)
    batch_size = 1000
    diff = 0
    for k in range(int(math.floor(NN / batch_size))):
        batch = np.arange(k * batch_size, min(NN - 1, (k + 1) * batch_size))
        p_idx = testvec[batch, 0] - 1
        q_idx = testvec[batch, 1] - 1
        ratings = testvec[batch, 2] - mean_rating
        diff += test_model(p_idx, q_idx, ratings)

    print("Total average test error for {} instances is {}".format(
        NN, np.sqrt(diff / NN)))
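The pattern used above for the updates list, pairing a shared matrix with an inc_subtensor expression over the rows in the mini-batch, can be shown in isolation; a hedged minimal sketch (names are illustrative):

import numpy as np
import theano
import theano.tensor as T

P = theano.shared(np.zeros((4, 2), dtype=theano.config.floatX), name='P')
i = T.lvector('i')
g = T.matrix('g')
step = theano.function([i, g], [],
                       updates=[(P, T.inc_subtensor(P[i, :], -0.5 * g))])
step(np.array([0, 2], dtype='int64'),
     np.ones((2, 2), dtype=theano.config.floatX))
print(P.get_value())  # rows 0 and 2 decreased by 0.5, the others untouched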
Example #52
    def gradient_recurrence(x_t_plus_1, y_t_plus_1, y_t, isend_t, dh_t_plus_1,
                            h_t_plus_1, dV_re_t_plus_1, dV_im_t_plus_1,
                            dhidden_bias_t_plus_1, dtheta_t_plus_1,
                            dreflection_t_plus_1, dscale_t_plus_1, dU_t_plus_1,
                            dout_bias_t_plus_1, V_re, V_im, hidden_bias, theta,
                            reflection, scale, U, out_bias):

        dV_re_t = dV_re_t_plus_1
        dV_im_t = dV_im_t_plus_1
        dhidden_bias_t = dhidden_bias_t_plus_1
        dtheta_t = dtheta_t_plus_1
        dreflection_t = dreflection_t_plus_1
        dscale_t = dscale_t_plus_1
        dU_t = dU_t_plus_1
        dout_bias_t = dout_bias_t_plus_1

        # Compute h_t --------------------------------------------------------------------------
        data_linoutput_re = T.dot(x_t_plus_1, V_re)
        data_linoutput_im = T.dot(x_t_plus_1, V_im)
        data_linoutput = T.concatenate([data_linoutput_re, data_linoutput_im],
                                       axis=1)

        total_linoutput = apply_nonlinearity_inverse(h_t_plus_1, hidden_bias)
        hidden_linoutput = total_linoutput - data_linoutput

        step8 = scale_diag(hidden_linoutput, n_hidden, 1. / scale)
        step7 = times_diag(step8, n_hidden, -theta[2, :])
        step6 = times_reflection(step7, n_hidden, reflection[1, :])
        #        step5 = step6
        step5 = do_fft(step6, n_hidden)
        step4 = times_diag(step5, n_hidden, -theta[1, :])
        step3 = vec_permutation(step4, n_hidden, reverse_index_permute)
        step2 = times_reflection(step3, n_hidden, reflection[0, :])
        #        step1 = step2
        step1 = do_ifft(step2, n_hidden)
        step0 = times_diag(step1, n_hidden, -theta[0, :])

        h_t = step0

        # Compute deriv contributions to hidden to output params------------------------------------------------
        dU_contribution, dout_bias_contribution = \
            hidden_output_derivs(h_t_plus_1, U, out_bias, y_t_plus_1)

        dU_t = dU_t + dU_contribution
        dout_bias_t = dout_bias_t + dout_bias_contribution

        # Compute derivative of linoutputs -------------------------------------------------------------------
        deriv, rescale, modTL = compute_nonlinearity_deriv(
            total_linoutput, hidden_bias)

        dh_t_plus_1_TL = dh_t_plus_1 * total_linoutput
        matrix = dh_t_plus_1_TL[:, :n_hidden] + dh_t_plus_1_TL[:, n_hidden:]
        matrix = matrix * (deriv - rescale) / (modTL**2)

        dtotal_linoutput = dh_t_plus_1 * T.tile(rescale, [1, 2]) \
            + T.tile(matrix, [1, 2]) * total_linoutput

        dhidden_linoutput = dtotal_linoutput
        ddata_linoutput = dtotal_linoutput

        # Compute deriv contributions to hidden bias-------------------------------------------------------
        dhidden_bias_contribution = dh_t_plus_1_TL * T.tile(
            deriv / modTL, [1, 2])

        dhidden_bias_t = dhidden_bias_t + dhidden_bias_contribution[:, :n_hidden] \
            + dhidden_bias_contribution[:, n_hidden:]

        # Compute derivative of h_t -------------------------------------------------------------------

        # use transpose conjugate operations
        dstep8 = scale_diag(dhidden_linoutput, n_hidden, scale)
        dstep7 = times_diag(dstep8, n_hidden, -theta[2, :])
        dstep6 = times_reflection(dstep7, n_hidden, reflection[1, :])
        #        dstep5 = dstep6
        dstep5 = do_fft(dstep6, n_hidden)
        dstep4 = times_diag(dstep5, n_hidden, -theta[1, :])
        dstep3 = vec_permutation(dstep4, n_hidden, reverse_index_permute)
        dstep2 = times_reflection(dstep3, n_hidden, reflection[0, :])
        #        dstep1 = dstep2
        dstep1 = do_ifft(dstep2, n_hidden)
        dstep0 = times_diag(dstep1, n_hidden, -theta[0, :])

        dh_t = dstep0
        dh_t_contribution = compute_dctdht(h_t, U, out_bias, y_t)
        dh_t = theano.ifelse.ifelse(T.eq(isend_t, 0), dh_t + dh_t_contribution,
                                    dh_t)

        # Compute deriv contributions to Unitary parameters ----------------------------------------------------

        # scale------------------------------------------------
        dscale_contribution = dhidden_linoutput * step8
        dscale_t = dscale_t + dscale_contribution[:, :n_hidden] \
            + dscale_contribution[:, n_hidden:]

        # theta2-----------------------------------------------
        dtheta2_contribution = dstep8 * times_diag(step7, n_hidden,
                                                   theta[2, :] + 0.5 * np.pi)
        dtheta_t = T.inc_subtensor(
            dtheta_t[:, 2, :], dtheta2_contribution[:, :n_hidden] +
            dtheta2_contribution[:, n_hidden:])

        # reflection1-----------------------------------------
        v_re = reflection[1, :n_hidden]
        v_im = reflection[1, n_hidden:]
        vstarv = (v_re**2 + v_im**2).sum()

        dstep7_re = dstep7[:, :n_hidden]
        dstep7_im = dstep7[:, n_hidden:]
        step6_re = step6[:, :n_hidden]
        step6_im = step6[:, n_hidden:]

        v_re_dot_v_re = T.dot(v_re, v_re.T)
        v_im_dot_v_im = T.dot(v_im, v_im.T)
        v_im_dot_v_re = T.dot(v_im, v_re.T)

        dstep7_re_dot_v_re = T.dot(dstep7_re, v_re.T).dimshuffle(0,
                                                                 'x')  #n_b x 1
        dstep7_re_dot_v_im = T.dot(dstep7_re, v_im.T).dimshuffle(0, 'x')
        step6_re_dot_v_re = T.dot(step6_re, v_re.T).dimshuffle(0, 'x')
        step6_re_dot_v_im = T.dot(step6_re, v_im.T).dimshuffle(0, 'x')
        dstep7_im_dot_v_re = T.dot(dstep7_im, v_re.T).dimshuffle(0, 'x')
        dstep7_im_dot_v_im = T.dot(dstep7_im, v_im.T).dimshuffle(0, 'x')
        step6_im_dot_v_re = T.dot(step6_im, v_re.T).dimshuffle(0, 'x')
        step6_im_dot_v_im = T.dot(step6_im, v_im.T).dimshuffle(0, 'x')

        dstep7_re_timesum_step6_re = (dstep7_re * step6_re).sum(axis=1)
        dstep7_re_timesum_step6_im = (dstep7_re * step6_im).sum(axis=1)
        dstep7_im_timesum_step6_re = (dstep7_im * step6_re).sum(axis=1)
        dstep7_im_timesum_step6_im = (dstep7_im * step6_im).sum(axis=1)

        #--------

        dstep7_re_RedOpdv_re_term1 = -2. / vstarv * (
            dstep7_re * step6_re_dot_v_re + dstep7_re_dot_v_re * step6_re -
            dstep7_re * step6_im_dot_v_im + dstep7_re_dot_v_im * step6_im)

        outer_sum = (T.outer(step6_re_dot_v_re, v_re) +
                     T.outer(step6_re_dot_v_im, v_im) -
                     T.outer(step6_im_dot_v_im, v_re) +
                     T.outer(step6_im_dot_v_re, v_im))
        dstep7_re_RedOpdv_re_term2 = 4. / (vstarv**2) * T.outer(
            (dstep7_re * outer_sum).sum(axis=1), v_re)

        dstep7_im_ImdOpdv_re_term1 = -2. / vstarv * (
            dstep7_im * step6_im_dot_v_re + dstep7_im_dot_v_re * step6_im +
            dstep7_im * step6_re_dot_v_im - dstep7_im_dot_v_im * step6_re)

        outer_sum = (T.outer(step6_im_dot_v_re, v_re) +
                     T.outer(step6_im_dot_v_im, v_im) +
                     T.outer(step6_re_dot_v_im, v_re) -
                     T.outer(step6_re_dot_v_re, v_im))
        dstep7_im_ImdOpdv_re_term2 = 4. / (vstarv**2) * T.outer(
            (dstep7_im * outer_sum).sum(axis=1), v_re)

        dv_re_contribution = (dstep7_re_RedOpdv_re_term1 +
                              dstep7_re_RedOpdv_re_term2 +
                              dstep7_im_ImdOpdv_re_term1 +
                              dstep7_im_ImdOpdv_re_term2)

        #---------

        dstep7_re_RedOpdv_im_term1 = -2. / vstarv * (
            dstep7_re * step6_re_dot_v_im + dstep7_re_dot_v_im * step6_re -
            dstep7_re_dot_v_re * step6_im + dstep7_re * step6_im_dot_v_re)

        outer_sum = (T.outer(step6_re_dot_v_re, v_re) +
                     T.outer(step6_re_dot_v_im, v_im) -
                     T.outer(step6_im_dot_v_im, v_re) +
                     T.outer(step6_im_dot_v_re, v_im))
        dstep7_re_RedOpdv_im_term2 = 4. / (vstarv**2) * T.outer(
            (dstep7_re * outer_sum).sum(axis=1), v_im)

        dstep7_im_ImdOpdv_im_term1 = -2. / vstarv * (
            dstep7_im * step6_im_dot_v_im + dstep7_im_dot_v_im * step6_im +
            dstep7_im_dot_v_re * step6_re - dstep7_im * step6_re_dot_v_re)

        outer_sum = (T.outer(step6_im_dot_v_re, v_re) +
                     T.outer(step6_im_dot_v_im, v_im) +
                     T.outer(step6_re_dot_v_im, v_re) -
                     T.outer(step6_re_dot_v_re, v_im))
        dstep7_im_ImdOpdv_im_term2 = 4. / (vstarv**2) * T.outer(
            (dstep7_im * outer_sum).sum(axis=1), v_im)

        dv_im_contribution = (dstep7_re_RedOpdv_im_term1 +
                              dstep7_re_RedOpdv_im_term2 +
                              dstep7_im_ImdOpdv_im_term1 +
                              dstep7_im_ImdOpdv_im_term2)

        dreflection_t = T.inc_subtensor(dreflection_t[:, 1, :n_hidden],
                                        dv_re_contribution)
        dreflection_t = T.inc_subtensor(dreflection_t[:, 1, n_hidden:],
                                        dv_im_contribution)

        # theta1-----------------------------------------------------
        dtheta1_contribution = dstep5 * times_diag(step4, n_hidden,
                                                   theta[1, :] + 0.5 * np.pi)
        dtheta_t = T.inc_subtensor(
            dtheta_t[:, 1, :], dtheta1_contribution[:, :n_hidden] +
            dtheta1_contribution[:, n_hidden:])

        # reflection0------------------------------------------------
        v_re = reflection[0, :n_hidden]
        v_im = reflection[0, n_hidden:]
        vstarv = (v_re**2 + v_im**2).sum()

        dstep3_re = dstep3[:, :n_hidden]
        dstep3_im = dstep3[:, n_hidden:]
        step2_re = step2[:, :n_hidden]
        step2_im = step2[:, n_hidden:]

        v_re_dot_v_re = T.dot(v_re, v_re.T)
        v_im_dot_v_im = T.dot(v_im, v_im.T)
        v_im_dot_v_re = T.dot(v_im, v_re.T)

        dstep3_re_dot_v_re = T.dot(dstep3_re, v_re.T).dimshuffle(0,
                                                                 'x')  #n_b x 1
        dstep3_re_dot_v_im = T.dot(dstep3_re, v_im.T).dimshuffle(0, 'x')
        step2_re_dot_v_re = T.dot(step2_re, v_re.T).dimshuffle(0, 'x')
        step2_re_dot_v_im = T.dot(step2_re, v_im.T).dimshuffle(0, 'x')
        dstep3_im_dot_v_re = T.dot(dstep3_im, v_re.T).dimshuffle(0, 'x')
        dstep3_im_dot_v_im = T.dot(dstep3_im, v_im.T).dimshuffle(0, 'x')
        step2_im_dot_v_re = T.dot(step2_im, v_re.T).dimshuffle(0, 'x')
        step2_im_dot_v_im = T.dot(step2_im, v_im.T).dimshuffle(0, 'x')

        dstep3_re_timesum_step2_re = (dstep3_re * step2_re).sum(axis=1)
        dstep3_re_timesum_step2_im = (dstep3_re * step2_im).sum(axis=1)
        dstep3_im_timesum_step2_re = (dstep3_im * step2_re).sum(axis=1)
        dstep3_im_timesum_step2_im = (dstep3_im * step2_im).sum(axis=1)

        #--------

        dstep3_re_RedOpdv_re_term1 = -2. / vstarv * (
            dstep3_re * step2_re_dot_v_re + dstep3_re_dot_v_re * step2_re -
            dstep3_re * step2_im_dot_v_im + dstep3_re_dot_v_im * step2_im)

        outer_sum = (T.outer(step2_re_dot_v_re, v_re) +
                     T.outer(step2_re_dot_v_im, v_im) -
                     T.outer(step2_im_dot_v_im, v_re) +
                     T.outer(step2_im_dot_v_re, v_im))
        dstep3_re_RedOpdv_re_term2 = 4. / (vstarv**2) * T.outer(
            (dstep3_re * outer_sum).sum(axis=1), v_re)

        dstep3_im_ImdOpdv_re_term1 = -2. / vstarv * (
            dstep3_im * step2_im_dot_v_re + dstep3_im_dot_v_re * step2_im +
            dstep3_im * step2_re_dot_v_im - dstep3_im_dot_v_im * step2_re)

        outer_sum = (T.outer(step2_im_dot_v_re, v_re) +
                     T.outer(step2_im_dot_v_im, v_im) +
                     T.outer(step2_re_dot_v_im, v_re) -
                     T.outer(step2_re_dot_v_re, v_im))
        dstep3_im_ImdOpdv_re_term2 = 4. / (vstarv**2) * T.outer(
            (dstep3_im * outer_sum).sum(axis=1), v_re)

        dv_re_contribution = (dstep3_re_RedOpdv_re_term1 +
                              dstep3_re_RedOpdv_re_term2 +
                              dstep3_im_ImdOpdv_re_term1 +
                              dstep3_im_ImdOpdv_re_term2)

        #---------

        dstep3_re_RedOpdv_im_term1 = -2. / vstarv * (
            dstep3_re * step2_re_dot_v_im + dstep3_re_dot_v_im * step2_re -
            dstep3_re_dot_v_re * step2_im + dstep3_re * step2_im_dot_v_re)

        outer_sum = (T.outer(step2_re_dot_v_re, v_re) +
                     T.outer(step2_re_dot_v_im, v_im) -
                     T.outer(step2_im_dot_v_im, v_re) +
                     T.outer(step2_im_dot_v_re, v_im))
        dstep3_re_RedOpdv_im_term2 = 4. / (vstarv**2) * T.outer(
            (dstep3_re * outer_sum).sum(axis=1), v_im)

        dstep3_im_ImdOpdv_im_term1 = -2. / vstarv * (
            dstep3_im * step2_im_dot_v_im + dstep3_im_dot_v_im * step2_im +
            dstep3_im_dot_v_re * step2_re - dstep3_im * step2_re_dot_v_re)

        outer_sum = (T.outer(step2_im_dot_v_re, v_re) +
                     T.outer(step2_im_dot_v_im, v_im) +
                     T.outer(step2_re_dot_v_im, v_re) -
                     T.outer(step2_re_dot_v_re, v_im))
        dstep3_im_ImdOpdv_im_term2 = 4. / (vstarv**2) * T.outer(
            (dstep3_im * outer_sum).sum(axis=1), v_im)

        dv_im_contribution = (dstep3_re_RedOpdv_im_term1 +
                              dstep3_re_RedOpdv_im_term2 +
                              dstep3_im_ImdOpdv_im_term1 +
                              dstep3_im_ImdOpdv_im_term2)

        dreflection_t = T.inc_subtensor(dreflection_t[:, 0, :n_hidden],
                                        dv_re_contribution)
        dreflection_t = T.inc_subtensor(dreflection_t[:, 0, n_hidden:],
                                        dv_im_contribution)

        # theta0------------------------------------------------------------------------------
        dtheta0_contribution = dstep1 * times_diag(step0, n_hidden,
                                                   theta[0, :] + 0.5 * np.pi)
        dtheta_t = T.inc_subtensor(
            dtheta_t[:, 0, :], dtheta0_contribution[:, :n_hidden] +
            dtheta0_contribution[:, n_hidden:])

        # Compute deriv contributions to V --------------------------------------------------
        ddata_linoutput_re = ddata_linoutput[:, :n_hidden]
        ddata_linoutput_im = ddata_linoutput[:, n_hidden:]
        dV_re_contribution = T.batched_dot(
            x_t_plus_1.dimshuffle(0, 1, 'x'),
            ddata_linoutput_re.dimshuffle(0, 'x', 1))
        dV_im_contribution = T.batched_dot(
            x_t_plus_1.dimshuffle(0, 1, 'x'),
            ddata_linoutput_im.dimshuffle(0, 'x', 1))

        dV_re_t = dV_re_t + dV_re_contribution
        dV_im_t = dV_im_t + dV_im_contribution

        return [
            dh_t, h_t, dV_re_t, dV_im_t, dhidden_bias_t, dtheta_t,
            dreflection_t, dscale_t, dU_t, dout_bias_t
        ]
Example #53
    def fitt(self,
             X,
             num_neg_samples=10,
             learning_rate=10e-4,
             mu=0.99,
             reg=0.1,
             epochs=10):
        N = len(X)
        V = self.V
        D = self.D
        self._get_pnw(X)

        W1 = init_weights((V, D))
        W2 = init_weights((D, V))

        W1 = theano.shared(W1)
        W2 = theano.shared(W2)

        thInput = T.iscalar('input_word')
        thContext = T.ivector('context')
        thNegSamples = T.ivector('negative')

        W1_subset = W1[thInput]
        W2_psubset = W2[:, thContext]
        W2_nsubset = W2[:, thNegSamples]
        p_activation = W1_subset.dot(W2_psubset)
        pos_pY = T.nnet.sigmoid(p_activation)
        n_activation = W1_subset.dot(W2_nsubset)
        neg_pY = T.nnet.sigmoid(-n_activation)
        cost = -T.log(pos_pY).sum() - T.log(neg_pY).sum()

        W1_grad = T.grad(cost, W1_subset)
        W2_pgrad = T.grad(cost, W2_psubset)
        W2_ngrad = T.grad(cost, W2_nsubset)

        W1_update = T.inc_subtensor(W1_subset, -learning_rate * W1_grad)
        W2_update = T.inc_subtensor(
            T.inc_subtensor(W2_psubset,
                            -learning_rate * W2_pgrad)[:, thNegSamples],
            -learning_rate * W2_ngrad)

        updates = [(W1, W1_update), (W2, W2_update)]

        train_op = theano.function(
            inputs=[thInput, thContext, thNegSamples],
            outputs=cost,
            updates=updates,
            allow_input_downcast=True,
        )

        costs = []
        cost_per_epoch = []
        sample_indices = range(N)
        for i in xrange(epochs):
            t0 = datetime.now()
            sample_indices = shuffle(sample_indices)
            cost_per_epoch_i = []
            for it in xrange(N):
                j = sample_indices[it]
                x = X[j]

                if len(x) < 2 * self.context_sz + 1:
                    continue

                cj = []
                n = len(x)
                for jj in xrange(n):
                    start = max(0, jj - self.context_sz)
                    end = min(n, jj + 1 + self.context_sz)
                    context = np.concatenate([x[start:jj], x[(jj + 1):end]])
                    context = np.array(list(set(context)), dtype=np.int32)
                    neg_samples = self._get_negative_samples(
                        context, num_neg_samples)

                    c = train_op(x[jj], context, neg_samples)
                    cj.append(c / (num_neg_samples + len(context)))

                ########## try one random window per sentence ###########
                # jj = np.random.choice(n)
                # start = max(0, jj - self.context_sz)
                # end = min(n, jj + 1 + self.context_sz)
                # context = np.concatenate([x[start:jj], x[(jj+1):end]])
                # # NOTE: context can contain DUPLICATES!
                # # e.g. "<UNKNOWN> <UNKNOWN> cats and dogs"
                # context = np.array(list(set(context)), dtype=np.int32)
                # neg_samples = self._get_negative_samples(context, num_neg_samples)

                # c = train_op(x[jj], context, neg_samples)
                # cj.append(c / (num_neg_samples + len(context)))
                #########################################################

                cj = np.mean(cj)
                cost_per_epoch_i.append(cj)
                costs.append(cj)
                if it % 100 == 0:
                    sys.stdout.write('epoch:%d\tj:%d/%d\tcost:%f\r' %
                                     (i, it, N, cj))
                    sys.stdout.flush()

            epoch_cost = np.mean(cost_per_epoch_i)
            cost_per_epoch.append(epoch_cost)
            print "time to complete epoch %d:" % i, datetime.now(
            ) - t0, 'cost:', epoch_cost

        self.W1 = W1.get_value()
        self.W2 = W2.get_value()

        plt.plot(costs)
        plt.title('Theano costs')
        plt.show()

        plt.plot(cost_per_epoch)
        plt.title('Theano cost at each epoch')
        plt.show()
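Note how W2_update nests one inc_subtensor inside another so that the positive-context columns and the negative-sample columns of the same matrix are both updated in a single expression; a hedged minimal sketch of that nesting (illustrative names):

import numpy as np
import theano
import theano.tensor as T

W2 = theano.shared(np.zeros((2, 6), dtype=theano.config.floatX))
pos = T.ivector('pos')
neg = T.ivector('neg')
new_W2 = T.inc_subtensor(T.inc_subtensor(W2[:, pos], 1.0)[:, neg], -1.0)
f = theano.function([pos, neg], [], updates=[(W2, new_W2)])
f(np.array([0, 1], dtype='int32'), np.array([4, 5], dtype='int32'))
print(W2.get_value())  # +1 in columns 0-1, -1 in columns 4-5, 0 elsewhere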
Example #54
def test_jax_IncSubtensor():
    x_np = np.random.uniform(-1, 1, size=(3, 4, 5)).astype(tt.config.floatX)
    x_tt = tt.arange(3 * 4 * 5).reshape((3, 4, 5)).astype(tt.config.floatX)

    # "Set" basic indices
    st_tt = tt.as_tensor_variable(np.array(-10.0, dtype=tt.config.floatX))
    out_tt = tt.set_subtensor(x_tt[1, 2, 3], st_tt)
    out_fg = theano.gof.FunctionGraph([], [out_tt])
    compare_jax_and_py(out_fg, [])

    st_tt = tt.as_tensor_variable(np.r_[-1.0, 0.0].astype(tt.config.floatX))
    out_tt = tt.set_subtensor(x_tt[:2, 0, 0], st_tt)
    out_fg = theano.gof.FunctionGraph([], [out_tt])
    compare_jax_and_py(out_fg, [])

    out_tt = tt.set_subtensor(x_tt[0, 1:3, 0], st_tt)
    out_fg = theano.gof.FunctionGraph([], [out_tt])
    compare_jax_and_py(out_fg, [])

    # "Set" advanced indices
    st_tt = tt.as_tensor_variable(np.r_[-1.0, 0.0].astype(tt.config.floatX))
    out_tt = tt.set_subtensor(x_tt[[0, 2], 0, 0], st_tt)
    out_fg = theano.gof.FunctionGraph([], [out_tt])
    compare_jax_and_py(out_fg, [])

    st_tt = tt.as_tensor_variable(x_np[[0, 2], 0, :3])
    out_tt = tt.set_subtensor(x_tt[[0, 2], 0, :3], st_tt)
    out_fg = theano.gof.FunctionGraph([], [out_tt])
    compare_jax_and_py(out_fg, [])

    # "Set" boolean indices
    mask_tt = tt.as_tensor_variable(x_np) > 0
    out_tt = tt.set_subtensor(x_tt[mask_tt], 0.0)
    out_fg = theano.gof.FunctionGraph([], [out_tt])
    compare_jax_and_py(out_fg, [])

    # "Increment" basic indices
    st_tt = tt.as_tensor_variable(np.array(-10.0, dtype=tt.config.floatX))
    out_tt = tt.inc_subtensor(x_tt[1, 2, 3], st_tt)
    out_fg = theano.gof.FunctionGraph([], [out_tt])
    compare_jax_and_py(out_fg, [])

    st_tt = tt.as_tensor_variable(np.r_[-1.0, 0.0].astype(tt.config.floatX))
    out_tt = tt.inc_subtensor(x_tt[:2, 0, 0], st_tt)
    out_fg = theano.gof.FunctionGraph([], [out_tt])
    compare_jax_and_py(out_fg, [])

    out_tt = tt.inc_subtensor(x_tt[0, 1:3, 0], st_tt)
    out_fg = theano.gof.FunctionGraph([], [out_tt])
    compare_jax_and_py(out_fg, [])

    # "Increment" advanced indices
    st_tt = tt.as_tensor_variable(np.r_[-1.0, 0.0].astype(tt.config.floatX))
    out_tt = tt.inc_subtensor(x_tt[[0, 2], 0, 0], st_tt)
    out_fg = theano.gof.FunctionGraph([], [out_tt])
    compare_jax_and_py(out_fg, [])

    st_tt = tt.as_tensor_variable(x_np[[0, 2], 0, :3])
    out_tt = tt.inc_subtensor(x_tt[[0, 2], 0, :3], st_tt)
    out_fg = theano.gof.FunctionGraph([], [out_tt])
    compare_jax_and_py(out_fg, [])

    # "Increment" boolean indices
    mask_tt = tt.as_tensor_variable(x_np) > 0
    out_tt = tt.inc_subtensor(x_tt[mask_tt], 1.0)
    out_fg = theano.gof.FunctionGraph([], [out_tt])
    compare_jax_and_py(out_fg, [])
Example #55
def create_optimization_updates(cost,
                                params,
                                method="sgd",
                                max_norm=10,
                                updates=None,
                                gradients=None,
                                lr=0.01,
                                eps=1e-8,
                                rho=0.95,
                                beta1=0.9,
                                beta2=0.999,
                                gsums=None,
                                xsums=None):

    lr = theano.shared(np.float64(lr).astype(theano.config.floatX))
    eps = np.float64(eps).astype(theano.config.floatX)
    rho = theano.shared(np.float64(rho).astype(theano.config.floatX))
    beta1 = theano.shared(np.float64(beta1).astype(theano.config.floatX))
    beta2 = theano.shared(np.float64(beta2).astype(theano.config.floatX))

    gparams = T.grad(cost, params) if gradients is None else gradients

    g_norm = 0
    for g in gparams:
        g_norm = g_norm + g.norm(2)**2
    g_norm = T.sqrt(g_norm)
    g_norm_list = g_norm

    # max_norm is useful for sgd
    if method != "sgd": max_norm = None

    if max_norm is not None and max_norm is not False:
        max_norm = theano.shared(
            np.float64(max_norm).astype(theano.config.floatX))
        shrink_factor = T.minimum(max_norm, g_norm + eps) / (g_norm + eps)
        gparams_clipped = []
        for g in gparams:
            g = shrink_factor * g
            gparams_clipped.append(g)
        gparams = gparams_clipped

    if updates is None:
        updates = OrderedDict()

    if gsums is None:
        gsums = create_accumulators(params) if method != "sgd" else None
    if xsums is None:
        xsums = create_accumulators(
            params) if method != "sgd" and method != "adagrad" else None

    if method == "sgd":
        for p, g in zip(params, gparams):
            if is_subtensor_op(p):
                origin, _ = get_subtensor_op_inputs(p)
                updates[origin] = T.inc_subtensor(p, -lr * g)
            else:
                updates[p] = p - lr * g

    elif method == "adagrad":
        create_adagrad_updates(updates, params, gparams, gsums, lr, eps)

    elif method == "adadelta":
        create_adadelta_updates(updates, params, gparams, gsums, xsums, lr,
                                eps, rho)

    elif method == "adam":
        create_adam_updates(updates, params, gparams, gsums, xsums, lr, eps,
                            beta1, beta2)

    else:
        raise Exception("Unknown optim method: {}\n".format(method))

    if method == "adadelta":
        lr = rho

    return updates, lr, g_norm_list, gsums, xsums, max_norm
Example #56
def makeResidualConnectionBetweenLayersAndReturnOutput(
        myLogger, deeperLayerOutputImagesTrValTest,
        deeperLayerOutputImageShapesTrValTest,
        earlierLayerOutputImagesTrValTest,
        earlierLayerOutputImageShapesTrValTest):
    # Add the outputs of the two layers and return the output, as well as its dimensions.
    # Result: the result should have exactly the same shape as the output of the deeper
    # layer, in both the number of FMs and the dimensions of the FMs.

    (deeperLayerOutputImageTrain, deeperLayerOutputImageVal,
     deeperLayerOutputImageTest) = deeperLayerOutputImagesTrValTest
    (deeperLayerOutputImageShapeTrain, deeperLayerOutputImageShapeVal,
     deeperLayerOutputImageShapeTest) = deeperLayerOutputImageShapesTrValTest
    (earlierLayerOutputImageTrain, earlierLayerOutputImageVal,
     earlierLayerOutputImageTest) = earlierLayerOutputImagesTrValTest
    (earlierLayerOutputImageShapeTrain, earlierLayerOutputImageShapeVal,
     earlierLayerOutputImageShapeTest) = earlierLayerOutputImageShapesTrValTest
    # Note: deeperLayerOutputImageShapeTrain has dimensions: [batchSize, FMs, r, c, z]
    # The deeper FMs can be greater only when there is upsampling. But then, to do residuals, I would need to upsample the earlier FMs. Not implemented.
    if np.any(np.asarray(deeperLayerOutputImageShapeTrain[2:]) > np.asarray(earlierLayerOutputImageShapeTrain[2:])) or \
            np.any(np.asarray(deeperLayerOutputImageShapeVal[2:]) > np.asarray(earlierLayerOutputImageShapeVal[2:])) or \
                np.any(np.asarray(deeperLayerOutputImageShapeTest[2:]) > np.asarray(earlierLayerOutputImageShapeTest[2:])) :
        myLogger.print3(
            "ERROR: In function [makeResidualConnectionBetweenLayersAndReturnOutput] the RCZ-dimensions of a deeper layer FMs were found greater than the earlier layers. Not implemented functionality. Exiting!"
        )
        myLogger.print3("\t (train) Dimensions of Deeper Layer=" +
                        str(deeperLayerOutputImageShapeTrain) +
                        ". Dimensions of Earlier Layer=" +
                        str(earlierLayerOutputImageShapeTrain))
        myLogger.print3("\t (val) Dimensions of Deeper Layer=" +
                        str(deeperLayerOutputImageShapeVal) +
                        ". Dimensions of Earlier Layer=" +
                        str(earlierLayerOutputImageShapeVal))
        myLogger.print3("\t (test) Dimensions of Deeper Layer=" +
                        str(deeperLayerOutputImageShapeTest) +
                        ". Dimensions of Earlier Layer=" +
                        str(earlierLayerOutputImageShapeTest))
        exit(1)

    # get the part of the earlier layer that is of the same dimensions as the FMs of the deeper:
    partOfEarlierFmsToAddTrain = getMiddlePartOfFms(
        earlierLayerOutputImageTrain, deeperLayerOutputImageShapeTrain[2:])
    partOfEarlierFmsToAddVal = getMiddlePartOfFms(
        earlierLayerOutputImageVal, deeperLayerOutputImageShapeVal[2:])
    partOfEarlierFmsToAddTest = getMiddlePartOfFms(
        earlierLayerOutputImageTest, deeperLayerOutputImageShapeTest[2:])

    # Add the FMs, after taking care of zero padding if the deeper layer has more FMs.
    numFMsDeeper = deeperLayerOutputImageShapeTrain[1]
    numFMsEarlier = earlierLayerOutputImageShapeTrain[1]
    if numFMsDeeper >= numFMsEarlier:
        outputOfResConnTrain = T.inc_subtensor(
            deeperLayerOutputImageTrain[:, :numFMsEarlier, :, :, :],
            partOfEarlierFmsToAddTrain,
            inplace=False)
        outputOfResConnVal = T.inc_subtensor(
            deeperLayerOutputImageVal[:, :numFMsEarlier, :, :, :],
            partOfEarlierFmsToAddVal,
            inplace=False)
        outputOfResConnTest = T.inc_subtensor(
            deeperLayerOutputImageTest[:, :numFMsEarlier, :, :, :],
            partOfEarlierFmsToAddTest,
            inplace=False)
    else:  # Deeper FMs are fewer than earlier. This should not happen in most architectures. But oh well...
        outputOfResConnTrain = deeperLayerOutputImageTrain + partOfEarlierFmsToAddTrain[:, :
                                                                                        numFMsDeeper, :, :, :]
        outputOfResConnVal = deeperLayerOutputImageVal + partOfEarlierFmsToAddVal[:, :
                                                                                  numFMsDeeper, :, :, :]
        outputOfResConnTest = deeperLayerOutputImageTest + partOfEarlierFmsToAddTest[:, :
                                                                                     numFMsDeeper, :, :, :]

    # Dimensions of output are the same as those of the deeperLayer
    return (outputOfResConnTrain, outputOfResConnVal, outputOfResConnTest)
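When the deeper layer has at least as many feature maps as the earlier one, the inc_subtensor above simply adds the earlier FMs into the first channels of the deeper output and passes the remaining channels through unchanged; a hedged numpy sketch with illustrative shapes in [batch, FMs, r, c, z] format:

import numpy as np

deeper = np.random.randn(1, 6, 3, 3, 3)    # deeper layer output, 6 FMs
earlier = np.random.randn(1, 4, 3, 3, 3)   # earlier layer crop, 4 FMs, same spatial size
out = deeper.copy()
out[:, :4] += earlier                       # the last 2 FMs stay as they were
print(out.shape)                            # (1, 6, 3, 3, 3)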
Example #57
 def just_numeric_args(a, b):
     return tt.inc_subtensor(a[s], b)
Example #58
 def u(i):
   upd = OrderedDict()
   upd[eta] = T.inc_subtensor(eta[i], dloss * eps * delta)
   upd[lam_diag] = T.inc_subtensor(lam_diag[i], eps * (r ** 2))
Example #59
 def forward(self, x):
     out = tt.zeros(x.shape)
     out = tt.inc_subtensor(out[0], x[0])
     out = tt.inc_subtensor(out[1:], tt.log(x[1:] - x[:-1]))
     return out
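This forward map is the inverse of the backward map shown at the start of this section: the increments of a strictly increasing vector are mapped to log space and recovered with exp and cumsum; a hedged numpy round-trip check (1-D, illustrative values):

import numpy as np

def forward_np(x):
    out = np.zeros_like(x)
    out[0] = x[0]
    out[1:] = np.log(x[1:] - x[:-1])
    return out

x = np.array([-0.5, 0.2, 1.7])  # strictly increasing
y = forward_np(x)
x_rec = np.cumsum(np.concatenate([[y[0]], np.exp(y[1:])]))
assert np.allclose(x_rec, x)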
Example #60
def _update_cps(nnet, layer, X, dW, db, loss, idx=None):
    """
    update with compressed feature vectors
    """

    assert layer.isdense or layer.issvm

    if Cfg.store_on_gpu:
        assert idx is not None

    C = Cfg.C
    D = Cfg.D
    eps = Cfg.eps

    k = layer.k

    K = (C * D) / (C + D)

    W_s = dW * K * T.cast(1. / nnet.data.n_train, 'floatX')
    b_s = db * K * T.cast(1. / nnet.data.n_train, 'floatX')
    l_s = loss * T.cast(1. / nnet.data.n_train, 'floatX')

    if Cfg.store_on_gpu:
        Deltaw = W_s - layer.W_i[idx]
        Deltab = b_s - layer.b_i[idx]
        Deltal = l_s - layer.l_i[idx]
    else:
        Deltaw = W_s - layer.W_i_buffer
        Deltab = b_s - layer.b_i_buffer
        Deltal = l_s - layer.l_i_buffer

    # uncompress feature vectors and sum over mini-batch

    # Method 1: memory inefficient (full allocation before sum)
    # DeltaW = T.sum(T.shape_padaxis(X, 2) *
    #                T.shape_padaxis(Deltaw, 1), axis=0)

    # Method 2: same result but accumulates
    # results inplace on first dimension
    dummy = T.dot(X, layer.W)
    DeltaW = T.grad(cost=None, wrt=layer.W, known_grads={dummy: Deltaw})

    gamma = (K * Deltal +
             T.sum(DeltaW * layer.W) +
             T.sum(Deltab * layer.b)) / \
        (eps + T.sum(DeltaW ** 2) + T.sum(Deltab ** 2))

    gamma = gamma.clip(0, 1)

    W = layer.W - gamma * DeltaW
    b = layer.b - gamma * Deltab
    l = layer.l + gamma * Deltal

    if Cfg.store_on_gpu:
        # new value to assign
        W_i = T.inc_subtensor(layer.W_i[idx], gamma * Deltaw)
        b_i = T.inc_subtensor(layer.b_i[idx], gamma * Deltab)
        l_i = T.inc_subtensor(layer.l_i[idx], gamma * Deltal)

        # shared variable to update
        layer_W_i = layer.W_i
        layer_b_i = layer.b_i
        layer_l_i = layer.l_i
    else:
        # new value to assign
        W_i = layer.W_i_buffer + gamma * Deltaw
        b_i = layer.b_i_buffer + gamma * Deltab
        l_i = layer.l_i_buffer + gamma * Deltal

        # shared variable to update
        layer_W_i = layer.W_i_buffer
        layer_b_i = layer.b_i_buffer
        layer_l_i = layer.l_i_buffer

    # average
    W_avg = T.cast((k * 1. / (k + 2)), 'floatX') * layer.W_avg + \
        T.cast((2. / (k + 2)), 'floatX') * W
    b_avg = T.cast((k * 1. / (k + 2)), 'floatX') * layer.b_avg + \
        T.cast((2. / (k + 2)), 'floatX') * b
    k = k + 1

    updates = ((layer.W, W), (layer.b, b), (layer.W_avg, W_avg),
               (layer.b_avg, b_avg), (layer.k, k), (layer.l, l),
               (layer_W_i, W_i), (layer_b_i, b_i), (layer_l_i,
                                                    l_i), (layer.gamma, gamma))

    return updates
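The "Method 2" trick above avoids materialising the (batch, n_in, n_out) tensor of outer products: the gradient of dot(X, W) with respect to W under a known upstream gradient Deltaw is X.T @ Deltaw, which equals the mini-batch sum of outer products that Method 1 would allocate explicitly. A hedged numpy check with illustrative sizes:

import numpy as np

n, d_in, d_out = 5, 3, 2
X = np.random.randn(n, d_in)
Deltaw = np.random.randn(n, d_out)

method1 = (X[:, :, None] * Deltaw[:, None, :]).sum(axis=0)  # full allocation, then sum
method2 = X.T.dot(Deltaw)                                    # what known_grads computes
assert np.allclose(method1, method2)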