def custom_loss(y_true,y_pred):
    '''
    Detection-style loss over 49 consecutive cells of 25 values each.

    Args:
      y_true: Ground Truth output
      y_pred: Predicted output
      Within every cell both vectors are laid out as:
      ######################################
      ## x,y,h,w,p1,p2,...,p20,objectness ##
      ######################################
    Returns:
      The loss caused by y_pred, summed over all cells and all rows
    '''
    n_cells = 49
    cell_width = 25

    # Weight 2 for the four box values, weight 1 for the twenty class values.
    weights = np.reshape(np.asarray([2]*4 + [1]*20), (1, 24))

    loss = 0.0
    for cell in range(n_cells):
        start = cell * cell_width
        obj = start + 24  # column holding this cell's objectness value

        pred_piece = y_pred[:, start:obj] * weights
        true_piece = y_true[:, start:obj] * weights

        # Box/class squared error, weighted by the ground-truth objectness.
        loss_piece = T.sum(T.square(pred_piece - true_piece), axis=1)
        loss = loss + loss_piece * y_true[:, obj]
        # Squared error on the objectness value itself, always applied.
        loss = loss + T.square(y_true[:, obj] - y_pred[:, obj])

    return T.sum(loss)
    def log_likelihood(self):
        """Return the negated, L2-regularised log-likelihood as a symbolic cost.

        The last column of ``self.L`` and ``self.R`` is treated as a bias
        column; all preceding columns are the factors.
        """
        user_factors = self.L[:, :-1]
        item_factors = self.R[:, :-1]
        user_bias = self.L[:, -1].reshape((-1, 1))
        item_bias = self.R[:, -1].reshape((-1, 1))

        # Scores: factor inner products plus a column bias and a row bias.
        scores = T.dot(user_factors, item_factors.T)
        scores = T.inc_subtensor(scores[:, :], user_bias)
        scores = T.inc_subtensor(scores[:, :], item_bias.T)

        # sum(counts * s) - sum((counts + 1) * log(1 + exp(s)))
        loglik = T.sum(scores * self.counts)
        log_one_plus_exp = T.log(T.exp(scores) + 1)
        loglik = loglik - T.sum((self.counts + 1) * log_one_plus_exp)

        # L2 regularization (factors only, biases are not penalised)
        loglik = loglik - 0.5 * self.reg_param * T.sum(T.square(user_factors))
        loglik = loglik - 0.5 * self.reg_param * T.sum(T.square(item_factors))

        # The caller minimises a cost, so hand back the negative log-likelihood.
        return -loglik
Exemple #3
0
 def kl_div_p_q(self, p_mean, p_std, q_mean, q_std):
     """Summed KL divergence D_{KL}[p(x)||q(x)] for fully factorized Gaussians.

     The per-element terms are added up over every element of the inputs.
     """
     q_var = T.square(q_std)
     # (mu_p - mu_q)^2 + sigma_p^2 - sigma_q^2
     top = T.square(p_mean - q_mean) + T.square(p_std) - q_var
     # 2 * sigma_q^2, with a small constant to avoid dividing by zero
     bottom = 2 * q_var + 1e-8
     per_element = top / bottom + T.log(q_std) - T.log(p_std)
     return T.sum(per_element)
Exemple #4
0
def custom_loss(y_true, y_pred):
    """Squared log-error loss plus a penalty on the log of the totals.

    Both inputs are clipped from below at ``epsilon`` before any logarithm.
    The first term is the mean over the last axis of the squared difference
    of ``log(x + 1)``; the second is ``CMC_PENALTY`` times the squared
    difference of ``log(sum(x) + 1)`` taken over all elements.
    """
    epsilon = 0.001
    pred_clipped = T.clip(y_pred, epsilon, np.inf)
    true_clipped = T.clip(y_true, epsilon, np.inf)

    elementwise_diff = T.log(pred_clipped + 1.) - T.log(true_clipped + 1.)
    total_diff = T.log(T.sum(pred_clipped)+1) - T.log(T.sum(true_clipped)+1)

    return T.mean(T.square(elementwise_diff), axis=-1) + CMC_PENALTY*T.square(total_diff)
Exemple #5
0
    def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.),
                 W=lasagne.init.Normal(0.05), train_g=False, init_stdv=1., nonlinearity=relu, **kwargs):
        """Wrap ``incoming`` so that its weights are normalised.

        Args:
            incoming: layer whose ``W`` is re-expressed in normalised form.
            b: initialiser for the bias parameter, or None for no bias.
            g: initialiser for the scale parameter, or None for no scale.
            W: accepted but not used by this constructor.
            train_g: whether the scale parameter is trainable.
            init_stdv: stored on the instance as ``init_stdv``.
            nonlinearity: stored on the instance as ``nonlinearity``.
            **kwargs: forwarded to the base-class constructor.

        Side effects:
            ``incoming.W_param`` is set to the original ``incoming.W`` and
            ``incoming.W`` is replaced by the normalised expression.
        """
        super(WeightNormLayer, self).__init__(incoming, **kwargs)
        self.nonlinearity = nonlinearity
        self.init_stdv = init_stdv

        n_channels = self.input_shape[1]
        if b is not None:
            self.b = self.add_param(b, (n_channels,), name="b", regularizable=False)
        if g is not None:
            self.g = self.add_param(g, (n_channels,), name="g", regularizable=False, trainable=train_g)

        # Reduction axes / broadcast pattern for this layer's own input.
        four_d_input = len(self.input_shape) == 4
        self.axes_to_sum = (0, 2, 3) if four_d_input else 0
        self.dimshuffle_args = ['x', 0, 'x', 'x'] if four_d_input else ['x', 0]

        # scale weights in layer below
        raw_W = incoming.W
        incoming.W_param = raw_W
        if raw_W.ndim != 4:
            norm_axes, norm_pattern = 0, ['x', 0]
        elif isinstance(incoming, Deconv2DLayer):
            norm_axes, norm_pattern = (0, 2, 3), ['x', 0, 'x', 'x']
        else:
            norm_axes, norm_pattern = (1, 2, 3), [0, 'x', 'x', 'x']

        if g is None:
            # No scale parameter: just divide by the norm (kept broadcastable).
            incoming.W = raw_W / T.sqrt(1e-6 + T.sum(T.square(raw_W), axis=norm_axes, keepdims=True))
        else:
            scale = self.g / T.sqrt(1e-6 + T.sum(T.square(raw_W), axis=norm_axes))
            incoming.W = raw_W * scale.dimshuffle(*norm_pattern)
Exemple #6
0
 def _build_conditional(self, Xnew, pred_noise, diag, X, Xu, y, sigma, cov_total, mean_total):
     """Build the predictive mean and (co)variance at ``Xnew``.

     Args:
         Xnew: points at which to predict.
         pred_noise: if true, add ``sigma**2`` to the returned (co)variance.
         diag: if true, return only the variances instead of the full matrix.
         X, Xu, y: training inputs, inducing inputs and training targets.
         sigma: noise standard deviation (squared internally).
         cov_total, mean_total: callables used for the training-side
             covariance and mean; the ``Xnew`` side uses ``self.cov_func``
             and ``self.mean_func``.

     Returns:
         ``(mu, var)`` when ``diag`` is true, otherwise
         ``(mu, stabilize(cov))``.
     """
     noise_var = tt.square(sigma)
     chol_uu = cholesky(stabilize(cov_total(Xu)))
     A = solve_lower(chol_uu, cov_total(Xu, X))
     q_diag = tt.sum(A * A, 0)
     if self.approx == "FITC":
         k_diag = cov_total(X, diag=True)
         lam = tt.clip(k_diag - q_diag, 0.0, np.inf) + noise_var
     else:  # VFE or DTC
         lam = tt.ones_like(q_diag) * noise_var
     chol_B = cholesky(tt.eye(Xu.shape[0]) + tt.dot(A / lam, tt.transpose(A)))
     scaled_resid = (y - mean_total(X)) / lam
     c = solve_lower(chol_B, tt.dot(A, scaled_resid))

     A_new = solve_lower(chol_uu, self.cov_func(Xu, Xnew))
     mu = self.mean_func(Xnew) + tt.dot(tt.transpose(A_new), solve_upper(tt.transpose(chol_B), c))
     C = solve_lower(chol_B, A_new)

     if not diag:
         cov = (self.cov_func(Xnew) - tt.dot(tt.transpose(A_new), A_new) +
                tt.dot(tt.transpose(C), C))
         if pred_noise:
             cov = cov + noise_var * tt.identity_like(cov)
         return mu, stabilize(cov)

     var = self.cov_func(Xnew, diag=True) - tt.sum(tt.square(A_new), 0) + tt.sum(tt.square(C), 0)
     if pred_noise:
         var = var + noise_var
     return mu, var
    def __init__(self, input, centerbias = None, alpha=1.0):
        """Add a distance-dependent bias to ``input``.

        Args:
            input: symbolic 2-D input; its first axis is used as height and
                its second as width.
            centerbias: initial bias values; defaults to twelve ones.
            alpha: initial weight of the vertical coordinate in the distance.

        The bias is only added when ``centerbias`` has more than two entries;
        otherwise ``self.output`` is the unmodified input.
        """
        if centerbias is None:
            centerbias = np.ones(12)
        n_values = len(centerbias)
        float_type = theano.config.floatX

        self.input = input
        self.alpha = theano.shared(value = np.array(alpha).astype(float_type), name='alpha')
        self.centerbias_ys = theano.shared(value=np.array(centerbias, dtype=float_type), name='centerbias_ys')
        self.centerbias_xs = theano.shared(value=np.linspace(0, 1, n_values, dtype=float_type), name='centerbias_xs')

        height = T.cast(input.shape[0], float_type)
        width = T.cast(input.shape[1], float_type)

        # Coordinates rescaled around the centre; x along axis 1, y along axis 0.
        col_coords = (T.arange(width) - 0.5*width) / (0.5*width)
        # We cannot have zeros in there because of grad
        row_coords = (T.arange(height) - 0.5*height) / (0.5*height) + 0.0001
        col_coords = col_coords.dimshuffle('x', 0)
        row_coords = row_coords.dimshuffle(0, 'x')

        raw_dists = T.sqrt(T.square(col_coords) + self.alpha*T.square(row_coords))
        self.max_dist = T.sqrt(1 + self.alpha)
        self.dists = raw_dists/self.max_dist

        self.factors = nonlinearity(self.dists, self.centerbias_xs, self.centerbias_ys, n_values)

        has_enough_values = T.gt(self.centerbias_ys.shape[0], 2)
        self.output = ifelse(has_enough_values, self.input+self.factors, self.input)
        self.params = [self.centerbias_ys, self.alpha]
Exemple #8
0
    def __init__(self, xdim, args, dec_nonlin=None):
        """Build the encoder/decoder graph, its Adagrad updates and compiled functions.

        Args:
            xdim: dimensionality of the observed input.
            args: options object; this constructor reads ``hdim``, ``zdim``,
                ``lmbda``, ``decM``, ``COV``, ``nlayers``, ``batsize`` and ``lr``.
            dec_nonlin: activation handed to the Gaussian decoder (not used by
                the Bernoulli decoder).

        Raises:
            RuntimeError: if ``args.decM`` is neither ``'bernoulli'`` nor
                ``'gaussian'``.
        """
        self.xdim = xdim
        self.hdim = args.hdim
        self.zdim = args.zdim
        self.lmbda = args.lmbda  # also used below as the learning-rate decay rate
        self.x = T.matrix('x', dtype=floatX)
        self.eps = T.matrix('eps', dtype=floatX)
        self.train_i = T.scalar('train_i', dtype=floatX)
        self.dec = args.decM
        self.COV = args.COV

        self.enc_mlp = GaussianMLP(self.x, self.xdim, self.hdim, self.zdim, nlayers=args.nlayers, eps=self.eps, COV=self.COV)
        if self.dec == 'bernoulli':
            # log p(x | z) defined as -CE(x, y) = dec_mlp.cost(y)
            self.dec_mlp = BernoulliMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
        elif self.dec == 'gaussian':
            self.dec_mlp = GaussianMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x, activation=dec_nonlin, COV=self.COV)
        else:
            # Report the configured decoder name; the previous message used an
            # undefined name and an incomplete format specifier.
            raise RuntimeError('unrecognized decoder %s' % self.dec)
        # encoder part + decoder part
        # The explicit comparison is kept on purpose: a value such as None
        # takes the else branch, exactly as before.
        if self.COV == False:
            self.enc_cost = -T.sum(kld_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var))
        else:
            self.enc_cost = -T.sum(kldu_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var, self.enc_mlp.u))
        self.cost = (self.enc_cost + self.dec_mlp.cost) / args.batsize
        self.params = self.enc_mlp.params + self.dec_mlp.params
        # Plain gradients; no weight-decay term is added here.
        self.gparams = [T.grad(self.cost, p) for p in self.params]
        self.gaccums = [shared(value=np.zeros(p.get_value().shape, dtype=floatX)) for p in self.params]
        # Learning rate decays geometrically with the training-step input.
        self.lr = args.lr * (1-args.lmbda)**self.train_i

        # Adagrad: step each parameter, then accumulate the squared gradients.
        self.updates = [
                (param, param - self.lr*gparam/T.sqrt(gaccum+T.square(gparam)+ADAG_EPS))
                for param, gparam, gaccum in zip(self.params, self.gparams, self.gaccums) ]
        self.updates += [ (gaccum, gaccum + T.square(gparam))
                    for gaccum, gparam in zip(self.gaccums, self.gparams)  ]

        self.train = function(
            inputs=[self.x, self.eps, self.train_i],
            outputs=self.cost,
            updates=self.updates
        )
        self.test = function(
            inputs=[self.x, self.eps],
            outputs=self.cost,
            updates=None
        )
        # can be used for semi-supervised learning for example
        self.encode = function(
            inputs=[self.x, self.eps],
            outputs=self.enc_mlp.out
        )
        # use this to sample: the encoder output is fed in directly as an input
        self.decode = function(
            inputs=[self.enc_mlp.out],
            outputs=self.dec_mlp.out
        )
Exemple #9
0
def mmd_full(x_t, y_t, alpha=0.5):
    """ Full kernel MMD statistic (gaussian kernel) between x_t and y_t.

    Returns the mean x-x kernel value minus twice the mean x-y kernel value
    plus the mean y-y kernel value; ``alpha`` sets the kernel width.
    """
    n_x = x_t.shape[1]
    n_y = y_t.shape[1]
    exponent_scale = -0.5 * (1 / alpha)

    def mean_kernel(repeated, tiled):
        # Mean gaussian-kernel value over all paired entries.
        return T.mean(T.exp(exponent_scale * T.square(repeated - tiled)))

    k_xx = mean_kernel(T.repeat(x_t, n_x), T.tile(x_t, n_x))
    k_xy = mean_kernel(T.repeat(x_t, n_y), T.tile(y_t, n_x))
    k_yy = mean_kernel(T.repeat(y_t, n_y), T.tile(y_t, n_y))
    return k_xx - 2 * k_xy + k_yy
Exemple #10
0
 def _do_calc(self, v0, v1):
     """Return ``1 - cosine similarity`` between rows of v0 and columns of v1.

     The Theano function is compiled on first use and cached in
     ``self._func`` for later calls.
     """
     if self._func is not None:
         return self._func(v0, v1)

     lhs = T.matrix('v0')
     rhs = T.matrix('v1')
     # Rows of lhs and columns of rhs are scaled to unit length.
     lhs_unit = lhs / T.sqrt(T.square(lhs).sum(axis=1, keepdims=True))
     rhs_unit = rhs / T.sqrt(T.square(rhs).sum(axis=0, keepdims=True))
     self._func = theano.function([lhs, rhs], 1 - T.dot(lhs_unit, rhs_unit))
     return self._func(v0, v1)
Exemple #11
0
    def in_transit(self, t, r=0.0, texp=None):
        """Get a list of timestamps that are in transit

        Args:
            t (vector): A vector of timestamps to be evaluated.
            r (Optional): The radii of the planets.
            texp (Optional[float]): The exposure time; when given, the
                transit window is widened by half of it on each side.

        Returns:
            The indices of the timestamps that are in transit. If any
            contact-point flag is non-zero, every index is returned instead.

        """
        zeros = tt.zeros_like(self.a)
        r = tt.as_tensor_variable(r) + zeros
        r_star = self.r_star + zeros

        # Wrap the times into time since transit
        half_period = 0.5 * self.period
        dt = tt.mod(self._warp_times(t) - self.t0 + half_period, self.period) - half_period

        if self.ecc is not None:
            contacts = self.contact_points_op(
                self.a, self.ecc, self.cos_omega, self.sin_omega,
                self.cos_incl + zeros, self.sin_incl + zeros, r_star + r)
            flag = contacts[2]

            def wrapped_time(mean_anomaly):
                # Convert a contact mean anomaly to a time wrapped around zero.
                elapsed = (mean_anomaly - self.M0) / self.n
                return tt.mod(elapsed + half_period, self.period) - half_period

            t_start = wrapped_time(contacts[0])
            t_end = wrapped_time(contacts[1])

            # Force the start to be non-positive and the end non-negative.
            t_start = tt.switch(tt.gt(t_start, 0.0),
                                t_start - self.period, t_start)
            t_end = tt.switch(tt.lt(t_end, 0.0),
                              t_end + self.period, t_end)
        else:
            # Equation 14 from Winn (2010)
            ratio = r / r_star
            arg = tt.square(1 + ratio) - tt.square(self.b)
            factor = r_star / (self.a * self.sin_incl)
            half_duration = half_period * tt.arcsin(factor * tt.sqrt(arg)) / np.pi
            t_start, t_end = -half_duration, half_duration
            flag = zeros

        if texp is not None:
            t_start = t_start - 0.5*texp
            t_end = t_end + 0.5*texp

        inside_window = tt.and_(dt >= t_start, dt <= t_end)
        mask = tt.any(inside_window, axis=-1)
        all_indices = tt.arange(t.size)
        return ifelse(tt.all(tt.eq(flag, 0)),
                      all_indices[mask],
                      all_indices)
Exemple #12
0
 def square_dist(self, X, Xs):
     """Return the matrix of squared Euclidean distances between rows.

     Args:
         X: matrix whose rows are the first set of points.
         Xs: matrix whose rows are the second set of points, or None to
             compare ``X`` with itself.

     Returns:
         Matrix with entry ``[i, j]`` equal to ``|X[i] - Xs[j]|**2``,
         clipped below at zero to absorb rounding error.
     """
     X2 = tt.sum(tt.square(X), 1)
     if Xs is None:
         sqd = (-2.0 * tt.dot(X, tt.transpose(X))
                + (tt.reshape(X2, (-1, 1)) + tt.reshape(X2, (1, -1))))
     else:
         Xs2 = tt.sum(tt.square(Xs), 1)
         # |x|^2 varies down the rows and |xs|^2 across the columns; using
         # Xs2 for both (as before) gave wrong distances whenever X != Xs.
         sqd = (-2.0 * tt.dot(X, tt.transpose(Xs))
                + (tt.reshape(X2, (-1, 1)) + tt.reshape(Xs2, (1, -1))))
     return tt.clip(sqd, 0.0, np.inf)
Exemple #13
0
    def dynamics_costs_obs(self,x,u):
        """Build symbolic next state, cost terms and observation for state x and control u.

        Returns a three-element list:
          * the next state: ``x`` is kept for rows where the trial is done,
            otherwise the simulated state ``y`` is used;
          * a list of five cost terms (velocity, control, impact, joint
            position, done); the first four are multiplied by ``1 - done``;
          * the observation, a concatenation of ``y[:,1:]``, the force output
            ``f``, ``dist`` and, if ``self.use_kinematic_features`` is set,
            ``kin``.

        Raises ValueError when ``self.vel_cost_type`` is neither "linear" nor
        "quadratic".
        """
        # Theano types used to declare the wrapped op's inputs and outputs.
        fmatrix = TT.matrix(dtype=floatX).type
        uvector = TT.vector(dtype='int8').type
        ctrl_lo, ctrl_hi = self.ctrl_bounds()


        # Wrap the numpy-level simulator step so it can be used as a Theano op.
        @theano.as_op(itypes=[fmatrix,fmatrix,uvector],otypes=[fmatrix,fmatrix,fmatrix,fmatrix,fmatrix])
        def stepmulti2op(x_nd,u_ne,done_n):
            """Step the world ``self.frame_skip`` times; return state, mean f, summed dcom, dist, kin."""
            # Work on a copy so the caller's array is not modified.
            x_nd = x_nd.copy()
            u_ne = np.clip(u_ne, ctrl_lo, ctrl_hi)
            # Subtract the selected columns' values before stepping; they are
            # added back to the result further down.
            move_to_origin = self.md["move_to_origin"]
            offset_n2 = x_nd[:,move_to_origin].copy()
            x_nd[:,move_to_origin] -= offset_n2
            x_nd,f,dcom,dist,kin = self.world.StepMulti2(x_nd.astype("float64"),u_ne.astype("float64"),done_n)
            # Remaining sub-steps: dcom and f accumulate, while dist and kin
            # keep only the values from the last sub-step.
            for _ in xrange(self.frame_skip-1):
                x_nd,f1,dcom1,dist,kin = self.world.StepMulti2(x_nd.astype("float64"),u_ne.astype("float64"),done_n)
                dcom += dcom1
                f += f1
            # f becomes the average over all sub-steps.
            f /= self.frame_skip

            dist = np.clip(dist, 0, .1) # XXX clip level ad hoc
            # Consider using nan_to_num here
            x_nd[:,move_to_origin] += offset_n2
            return (x_nd.astype(floatX),f.astype(floatX),dcom.astype(floatX),dist.astype(floatX),kin.astype(floatX))

        done = self.trial_done(x)
        notdone = 1 - done

        y,f,dcom,dist,kin = stepmulti2op(x,u,done)


        # Velocity cost uses column 0 of dcom divided by the world timestep.
        if self.vel_cost_type == "linear":
            cost_vel = (-self.vel_cost_coeff/self.world_info["timestep"]) * dcom[:,0]
        elif self.vel_cost_type == "quadratic":
            cost_vel = TT.square(dcom[:,0]/self.world_info["timestep"] - self.vel_cost_target) #pylint: disable=E1111
        else:
            raise ValueError
        cost_ctrl = .5*self.ctrl_cost_coeff*TT.square(u).sum(axis=1)
        cost_impact = .5*self.impact_cost_coeff * TT.square(f).sum(axis=1)
        # A falsy clip_impact_cost disables the cap.
        if self.clip_impact_cost:
            cost_impact = TT.minimum(cost_impact, self.clip_impact_cost) #pylint: disable=E1111


        # Joint-position cost over the dofs that belong to limited joints.
        jntpos_mask = self.world_info["jnt_islimited"]
        # NOTE(review): if jntpos_mask is a numpy array, `&=` also modifies
        # self.world_info["jnt_islimited"] in place - confirm this is intended.
        if self.jntpos_root_only: jntpos_mask &= (self.world_info["jnt_body_id"]==1)
        jntpos_inds = np.flatnonzero(jntpos_mask)
        jntpos_dofs = np.array([dofidx for (dofidx,jntidx) in enumerate(self.world_info["dof_jnt_id"]) if jntidx in jntpos_inds])        
        # L1 or squared penalty, selected by self.jntpos_use_l1.
        cost_jntpos = (.5*self.jntpos_cost_coeff) * (TT.abs_ if self.jntpos_use_l1 else TT.square)(y[:,jntpos_dofs]).sum(axis=1)

        # Zero when done == 1 and -done_cost_coeff when done == 0.
        cost_done = (done-1)*self.done_cost_coeff
        feats = [y[:,1:],f,dist]
        if self.use_kinematic_features: feats.append(kin)
        obs = TT.concatenate(feats,axis=1)
        return [TT.switch(done[:,None], x, y),  [notdone*cost_vel, notdone*cost_ctrl, notdone*cost_impact, notdone*cost_jntpos, cost_done] , obs ]
Exemple #14
0
    def __init__(self, xdim, args, dec='bernoulli'):
        """Build the encoder/decoder graph, its Adagrad updates and compiled functions.

        Args:
            xdim: dimensionality of the observed input.
            args: options object; this constructor reads ``hdim``, ``zdim``,
                ``lmbda``, ``nlayers``, ``batch_size`` and ``lr``.
            dec: decoder type, ``'bernoulli'`` or ``'gaussian'``.

        Raises:
            RuntimeError: if ``dec`` is not one of the two supported values.
        """
        self.xdim = xdim
        self.hdim = args.hdim
        self.zdim = args.zdim
        self.lmbda = args.lmbda  # weight decay coefficient * 2
        self.x = T.matrix('x', dtype=floatX)
        self.eps = T.matrix('eps', dtype=floatX)

        # XXX make this more general
        self.enc_mlp = GaussianMLP(self.x, self.xdim, self.hdim, self.zdim, nlayers=args.nlayers, eps=self.eps)
        if dec == 'bernoulli':
            # log p(x | z) defined as -CE(x, y) = dec_mlp.cost(y)
            self.dec_mlp = BernoulliMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
        elif dec == 'gaussian':
            self.dec_mlp = GaussianMLP(self.enc_mlp.out, self.zdim, self.hdim, self.xdim, nlayers=args.nlayers, y=self.x)
        else:
            # '%s' is required: a bare '%' raised ValueError while formatting,
            # so the intended RuntimeError never reached the caller.
            raise RuntimeError('unrecognized decoder %s' % dec)

        self.cost = (-T.sum(kld_unit_mvn(self.enc_mlp.mu, self.enc_mlp.var)) + self.dec_mlp.cost) / args.batch_size
        self.params = self.enc_mlp.params + self.dec_mlp.params
        print(self.params)
        # Gradient of the cost plus the weight-decay term lmbda * p.
        self.gparams = [T.grad(self.cost, p) + self.lmbda * p for p in self.params]
        self.gaccums = [theano.shared(value=np.zeros(p.get_value().shape, dtype=floatX)) for p in self.params]

        # XXX using adagrad update as described in paper, could try other optimizers
        # Step each parameter first, then accumulate the squared gradients.
        self.updates = [
                (param, param - args.lr * gparam / T.sqrt(gaccum + T.square(gparam) + ADAGRAD_EPS))
                for param, gparam, gaccum in zip(self.params, self.gparams, self.gaccums)
        ]
        self.updates += [
            (gaccum, gaccum + T.square(gparam))
            for gaccum, gparam in zip(self.gaccums, self.gparams)
        ]

        self.train = theano.function(
            inputs=[self.x, self.eps],
            outputs=self.cost,
            updates=self.updates
        )
        self.test = theano.function(
            inputs=[self.x, self.eps],
            outputs=self.cost,
            updates=None
        )
        # can be used for semi-supervised learning for example
        self.encode = theano.function(
            inputs=[self.x, self.eps],
            outputs=self.enc_mlp.out
        )
        # use this to sample: the encoder output is fed in directly as an input
        self.decode = theano.function(
            inputs=[self.enc_mlp.out],
            outputs=self.dec_mlp.out
        )
Exemple #15
0
 def calc(self, y, output):
     """Weighted squared-error loss between ``y`` and ``output``.

     For inputs with more than one dimension the squared error is first
     summed over every axis except the first. The per-sample values are then
     averaged when ``self.mode`` is truthy and summed otherwise, and the
     result is multiplied by ``self.weight``.
     """
     per_sample = T.square(y - output)
     if y.ndim != 1:
         per_sample = T.sum(per_sample, axis=tuple(range(1, y.ndim)))
     reduce_fn = T.mean if self.mode else T.sum
     return self.weight * reduce_fn(per_sample)
Exemple #16
0
def rmsprop(cost, params, lr, alpha=0.95, eps=1e-8, max_norm=None, max_norm_elemwise=None):
    """Build RMSProp updates for ``params`` with respect to ``cost``.

    Args:
        cost: symbolic scalar to minimise.
        params: shared variables to update.
        lr: learning rate.
        alpha: decay of the running average of squared gradients.
        eps: constant added to the root of the running average.
        max_norm, max_norm_elemwise: forwarded to ``clip_grads``.

    Returns:
        ``(updates, total_norm(grads), total_norm(params))``; the accumulator
        updates come first in the list, followed by the parameter updates.
    """
    raw_grads = [T.grad(cost, p) for p in params]
    grads = clip_grads(raw_grads, max_norm=max_norm, max_norm_elemwise=max_norm_elemwise)
    accums = [theano.shared(value=np.zeros(p.get_value().shape, dtype=floatX))
              for p in params]

    accum_updates = []
    param_updates = []
    # XXX worth fix to assign square(grad) to accum during first iter?
    for p, g, a in zip(params, grads, accums):
        new_a = alpha * a + (1 - alpha) * T.square(g)
        accum_updates.append((a, new_a))
        # The step is scaled by the freshly updated running average.
        param_updates.append((p, p - lr * g / (T.sqrt(new_a) + eps)))

    return accum_updates + param_updates, total_norm(grads), total_norm(params)
Exemple #17
0
    def get_stencil(self, t, r=None, texp=None):
        """Return the sorted contact times that fall around the span of ``t``.

        Args:
            t: the timestamps of interest.
            r: planet radius; when None the stencil is skipped.
            texp: exposure time; only checked against None here, its value is
                not otherwise used in this method.

        Returns:
            ``tt.shape_padright(t)`` when ``r`` or ``texp`` is None; otherwise
            a tuple ``(ts, flag)`` with the sorted contact times and the flag.
            NOTE(review): the two return shapes differ (single tensor versus
            tuple) - confirm that callers handle both.
        """
        if r is None or texp is None:
            return tt.shape_padright(t)

        # z carries the shape of self.a so scalars can be expanded by adding it.
        z = tt.zeros_like(self.a)
        r = tt.as_tensor_variable(r)
        R = self.r_star + z
        hp = 0.5 * self.period

        if self.ecc is None:
            # Equation 14 from Winn (2010)
            k = r / self.r_star
            # arg1 uses (1 + k) and arg2 uses (1 - k): two half-durations.
            arg1 = tt.square(1 + k) - tt.square(self.b)
            arg2 = tt.square(1 - k) - tt.square(self.b)
            factor = R / (self.a * self.sin_incl)
            hdur1 = hp * tt.arcsin(factor * tt.sqrt(arg1)) / np.pi
            hdur2 = hp * tt.arcsin(factor * tt.sqrt(arg2)) / np.pi
            # Four contact times, ordered from earliest to latest.
            ts = [-hdur1, -hdur2, hdur2, hdur1]
            flag = z

        else:
            # Contact points for separations R + r and R - r.
            M_contact1 = self.contact_points_op(
                self.a, self.ecc, self.cos_omega, self.sin_omega,
                self.cos_incl + z, self.sin_incl + z, R + r)
            M_contact2 = self.contact_points_op(
                self.a, self.ecc, self.cos_omega, self.sin_omega,
                self.cos_incl + z, self.sin_incl + z, R - r)

            # Sum of the flags (third output) of the two contact-point calls.
            flag = M_contact1[2] + M_contact2[2]

            # Each contact value is converted to a time and wrapped into
            # [-hp, hp).
            ts = [
                tt.mod((M_contact1[0]-self.M0)/self.n+hp, self.period)-hp,
                tt.mod((M_contact2[0]-self.M0)/self.n+hp, self.period)-hp,
                tt.mod((M_contact2[1]-self.M0)/self.n+hp, self.period)-hp,
                tt.mod((M_contact1[1]-self.M0)/self.n+hp, self.period)-hp
            ]

        # Whole-period bounds around the span of t, measured from t0; the
        # upper bound gets one extra period.
        start = self.period * tt.floor((tt.min(t) - self.t0) / self.period)
        end = self.period * (tt.ceil((tt.max(t) - self.t0) / self.period) + 1)
        start += self.t0
        end += self.t0
        tout = []
        for i in range(4):
            if z.ndim < 1:
                # Scalar case: one arange covers every period in range.
                tout.append(ts[i] + tt.arange(start, end, self.period))
            else:
                # Vector case: scan over the per-element values and flatten.
                tout.append(theano.scan(
                    fn=lambda t0, s0, e0, p0: t0 + tt.arange(s0, e0, p0),
                    sequences=[ts[i], start, end, self.period],
                )[0].flatten())

        ts = tt.sort(tt.concatenate(tout))
        return ts, flag
Exemple #18
0
 def square_dist(self, X, Z):
     """Squared Euclidean distances after dividing inputs by the lengthscales.

     When ``Z`` is None the distances are taken between the rows of ``X``
     itself. The result is clipped below at zero.
     """
     left = tt.mul(X, 1.0 / self.lengthscales)
     left_sq = tt.sum(tt.square(left), 1)
     if Z is None:
         right, right_sq = left, left_sq
     else:
         right = tt.mul(Z, 1.0 / self.lengthscales)
         right_sq = tt.sum(tt.square(right), 1)

     # |a - b|^2 = -2 a.b + (|a|^2 + |b|^2)
     cross = -2.0 * tt.dot(left, tt.transpose(right))
     norms = tt.reshape(left_sq, (-1, 1)) + tt.reshape(right_sq, (1, -1))
     return tt.clip(cross + norms, 0.0, np.inf)
Exemple #19
0
 def square_dist(self, X, Z):
     """Squared Euclidean distances between rows of ``X`` and rows of ``Z``.

     Inputs are converted to tensor variables first. With ``Z`` set to None
     the distances are taken between the rows of ``X`` itself. The result is
     clipped below at zero.
     """
     X = tt.as_tensor_variable(X)
     row_norms = tt.sum(tt.square(X), 1)

     other = X if Z is None else tt.as_tensor_variable(Z)
     col_norms = row_norms if Z is None else tt.sum(tt.square(other), 1)

     sqd = -2.0 * tt.dot(X, tt.transpose(other)) +\
           (tt.reshape(row_norms, (-1, 1)) + tt.reshape(col_norms, (1, -1)))
     return tt.clip(sqd, 0.0, np.inf)
    def log_likelihood(self):
        """Return the negated, penalised log-likelihood using a low-rank approximation.

        The score matrix is never formed densely: it is kept as the pair
        ``(self.L, self.R)`` and only evaluated at the non-zero entries of
        ``self.counts``. The ``log(1 + exp(x))`` term is replaced by its
        second-order Taylor expansion around zero.

        NOTE(review): ``evaluate_lowrank``, ``hadamard`` and ``sum_lowrank``
        are defined elsewhere; the comments below describe how they appear to
        be used here - confirm against their definitions.
        """
        # Column layout: everything but the last two columns are factors; the
        # last two columns hold a bias and an "outer" column in opposite order
        # for L and R.
        Users = self.L[:, :-2]
        Items = self.R[:, :-2]
        UserBiases = self.L[:, -1]
        ItemBiases = self.R[:, -2]
        UserOuter = self.L[:, -2]
        ItemOuter = self.R[:, -1]

        # Dense formulation kept for reference:
        ## A = T.dot(Users, Items.T)
        ## A += UserBiases
        ## A += ItemBiases.T
        ## B = A * self.counts
        ## loglik = T.sum(B)

        # A implicitly stored as self.L @ self.R.T
        # loglik = T.sum(A * self.counts) => sum over nonzeros only
        print('nnz size: {}'.format(self.counts.nonzero()[0].size))
        loglik = T.dot(self.evaluate_lowrank(self.L, self.R, self.counts.nonzero(), fast=False),
                  np.array(self.counts[self.counts.nonzero()]).ravel())

        # Dense formulation kept for reference:
        ## A = T.exp(A)
        ## A += 1
        ## A = T.log(A)
        # There we use Taylor series ln(exp(x) + 1) = ln(2) + x/2 + x^2/8 + O(x^4) at x=0
        # Each term is a (left, right) pair of factor matrices.
        # ln(2)
        const_term = (T.ones((self.num_users, 1)) * np.log(2), T.ones((self.num_items, 1)))
        # x/2
        first_order_term = (0.5 * self.L, 0.5 * self.R)
        # x^2/8
        second_order_term = hadamard((self.L, self.R), (self.L, self.R), self.num_factors)
        second_order_term = tuple(factor / 8.0 for factor in second_order_term)

        # Regroup by side: grouped_factors[0] collects the left factors of all
        # three terms, grouped_factors[1] the right factors; each group is
        # concatenated column-wise into a single factor matrix.
        grouped_factors = list(zip(const_term, first_order_term, second_order_term))
        A = (T.concatenate(grouped_factors[0], axis=1), T.concatenate(grouped_factors[1], axis=1))

        # Dense formulation kept for reference; (counts + 1) * A is split into
        # the full sum of A plus the counts-weighted sum over non-zeros:
        ## A = (self.counts + 1) * A
        ## loglik -= T.sum(A)
        loglik -= sum_lowrank(A)
        loglik -= T.dot(self.evaluate_lowrank(A[0], A[1], self.counts.nonzero(), fast=False),
                  np.array(self.counts[self.counts.nonzero()]).ravel())


        # L2 regularization
        loglik -= 0.5 * self.reg_param * T.sum(T.square(Users))
        loglik -= 0.5 * self.reg_param * T.sum(T.square(Items))

        # we need strictly maintain UserOuter and ItemOuter be ones, just to ensure they properly
        # outer products with biases
        loglik -= self.num_users * T.sum(T.square(UserOuter - 1))
        loglik -= self.num_items * T.sum(T.square(ItemOuter - 1))

        # Return negation of LogLikelihood cause we will minimize cost
        return -loglik
Exemple #21
0
 def _do_calc(self, v0, v1):
     """Return pairwise Euclidean distances between rows of v0 and columns of v1.

     The squared distance is computed with the expansion
     ``|a|^2 + |b|^2 - 2 a.b``; the square root is taken only when
     ``self._do_sqrt`` is set. The Theano function is compiled on first use
     and cached in ``self._func``.
     """
     if self._func is None:
         xv0 = T.matrix('v0')
         xv1 = T.matrix('v1')
         sqrsum0 = T.square(xv0).sum(axis=1, keepdims=True)
         sqrsum1 = T.square(xv1).sum(axis=0, keepdims=True)
         dot = T.dot(xv0, xv1)
         dist = sqrsum0 + sqrsum1 - dot * 2
         if self._do_sqrt:
             # Rounding in the expansion can leave tiny negative values for
             # (near-)identical vectors; clamp them so sqrt yields 0, not NaN.
             dist = T.sqrt(T.maximum(dist, 0.))
         self._func = theano.function([xv0, xv1], dist)
     return self._func(v0, v1)
Exemple #22
0
 def full(self, X, Xs=None):
     """Full covariance matrix with an input-dependent lengthscale.

     ``self.lfunc`` supplies the lengthscale at each input. With ``Xs`` set
     to None the covariance of ``X`` with itself is returned.
     """
     X, Xs = self._slice(X, Xs)
     other = X if Xs is None else Xs

     rx = self.lfunc(tt.as_tensor_variable(X), self.args)
     rz = self.lfunc(tt.as_tensor_variable(other), self.args)
     r2 = self.square_dist(X, other)

     # Sum of squared lengthscales for every (row, column) pair.
     scale_sum = tt.reshape(tt.square(rx), (-1, 1)) + tt.reshape(tt.square(rz), (1, -1))
     prefactor = tt.sqrt((2.0 * tt.outer(rx, rz)) / scale_sum)
     return prefactor * tt.exp(-1.0 * r2 / scale_sum)
def get_norms(model, gradients):
    """Compute norm of weights and their gradients divided by the number of elements

    Args:
        model: object whose ``params`` mapping goes from parameter name to
            parameter variable.
        gradients: mapping from parameter variable to its gradient.

    Returns:
        ``(norms, grad_norms)``: two lists of symbolic scalars in the
        iteration order of ``model.params``, named ``norm_<name>`` and
        ``grad_norm_<name>``.
    """
    float_type = theano.config.floatX

    def scaled_norm(tensor, name):
        # L2 norm divided by the element count (product of the shape).
        value = T.sqrt(T.sum(T.square(tensor))) / T.prod(tensor.shape.astype(float_type))
        value.name = name
        return value

    norms = []
    grad_norms = []
    # .items() works on both Python 2 and 3; .iteritems() is Python 2 only.
    for param_name, param in model.params.items():
        norms.append(scaled_norm(param, 'norm_' + param_name))
        grad_norms.append(scaled_norm(gradients[param], 'grad_norm_' + param_name))
    return norms, grad_norms
Exemple #24
0
 def full(self, X, Xs=None):
     """Full covariance matrix with an input-dependent lengthscale.

     ``self.lfunc`` supplies the lengthscale at each input. With ``Xs`` set
     to None the covariance of ``X`` with itself is returned.
     """
     X, Xs = self._slice(X, Xs)
     if Xs is None:
         # No second input: compare X with itself.
         Xs = X

     rx = self.lfunc(X, self.args)
     r2 = self.square_dist(X, Xs)
     rz = self.lfunc(Xs, self.args)

     # Sum of squared lengthscales for every (row, column) pair.
     scale_sum = tt.reshape(tt.square(rx), (-1, 1)) + tt.reshape(tt.square(rz), (1, -1))
     return (tt.sqrt((2.0 * tt.dot(rx, tt.transpose(rz))) / scale_sum)
             * tt.exp(-1.0 * r2 / scale_sum))
Exemple #25
0
def adagrad(cost, params, lr, eps=1e-8, max_norm=None, max_norm_elemwise=None):
    """Build Adagrad updates for ``params`` with respect to ``cost``.

    Args:
        cost: symbolic scalar to minimise.
        params: shared variables to update.
        lr: learning rate.
        eps: constant added under the square root.
        max_norm, max_norm_elemwise: forwarded to ``clip_grads``.

    Returns:
        ``(updates, total_norm(grads), total_norm(params))``; the parameter
        updates come first in the list, followed by the accumulator updates.
    """
    raw_grads = [T.grad(cost, p) for p in params]
    grads = clip_grads(raw_grads, max_norm=max_norm, max_norm_elemwise=max_norm_elemwise)
    accums = [theano.shared(value=np.zeros(p.get_value().shape, dtype=floatX))
              for p in params]

    param_updates = []
    accum_updates = []
    for p, g, a in zip(params, grads, accums):
        # The step already includes the current squared gradient.
        param_updates.append((p, p - lr * g / T.sqrt(a + T.square(g) + eps)))
        accum_updates.append((a, a + T.square(g)))

    return param_updates + accum_updates, total_norm(grads), total_norm(params)
Exemple #26
0
 def optimize(self, params, cost):
     """Build Adadelta-style updates for ``params`` with respect to ``cost``.

     The cost is passed through ``grad_clip`` with bounds -10 and 10 before
     differentiation. For each parameter the returned list holds, in order,
     the update of the squared-gradient average, of the parameter itself and
     of the squared-step average.
     """
     clipped_cost = theano.gradient.grad_clip(cost, -10, 10)
     grads = tensor.grad(cost=clipped_cost, wrt=params)
     grad_avgs = [_shared_zeros_like(p.get_value()) for p in params]
     step_avgs = [_shared_zeros_like(p.get_value()) for p in params]

     updates = []
     for param, grad, grad_avg, step_avg in zip(params, grads, grad_avgs, step_avgs):
         grad_avg_new = self.rho * grad_avg + (1.0 - self.rho) * tensor.square(grad)
         # Ratio of the step RMS to the gradient RMS scales the gradient.
         step = grad * tensor.sqrt(step_avg + EPS) / tensor.sqrt(grad_avg_new + EPS)
         step_avg_new = self.rho * step_avg + (1.0 - self.rho) * tensor.square(step)
         updates.extend([
             (grad_avg, grad_avg_new),
             (param, param - self.lrate * step),
             (step_avg, step_avg_new),
         ])
     return updates
def adadelta(params, cost, lr=1.0, rho=0.95):
    """Build Adadelta updates for ``params`` with respect to ``cost``.

    Args:
        params: shared variables to update.
        cost: symbolic scalar to minimise.
        lr: multiplier applied to every step.
        rho: decay of both running averages.

    Returns:
        A list of ``(shared, new_value)`` pairs; per parameter, in order:
        squared-gradient average, parameter, squared-step average.
    """
    grads = T.grad(cost, params)
    sq_grad_avgs = [shared_zeros_like(p.get_value()) for p in params]
    sq_step_avgs = [shared_zeros_like(p.get_value()) for p in params]

    updates = []
    for param, grad, sq_grad_avg, sq_step_avg in zip(params, grads, sq_grad_avgs, sq_step_avgs):
        sq_grad_avg_new = rho * sq_grad_avg + (1.0 - rho) * T.square(grad)
        # Ratio of the step RMS to the gradient RMS scales the gradient.
        step = grad * T.sqrt(sq_step_avg + epsilon) / T.sqrt(sq_grad_avg_new + epsilon)
        sq_step_avg_new = rho * sq_step_avg + (1.0 - rho) * T.square(step)
        updates += [
            (sq_grad_avg, sq_grad_avg_new),
            (param, param - lr * step),
            (sq_step_avg, sq_step_avg_new),
        ]
    return updates
Exemple #28
0
def mmd_approx(x_t, y_t, alpha=0.5):
    """ Implementation of the linear time approximation to the gaussian kernel MMD statistic

    Columns are paired up as (0, 1), (2, 3), ...; only the first ``2 * M``
    columns are used, where ``M`` is half the column count of ``x_t`` rounded
    down, so a trailing unpaired column is ignored instead of producing
    halves of different widths.

    Args:
        x_t: first set of samples, indexed along axis 1.
        y_t: second set of samples, indexed along axis 1.
        alpha: kernel width parameter.

    Returns:
        The symbolic scalar ``k(x,x') + k(y,y') - k(x,y') - k(x',y)`` built
        from the paired columns.
    """
    M = x_t.shape[1] // 2
    n_used = 2 * M
    odd_x = x_t[:, 0:n_used:2]
    even_x = x_t[:, 1:n_used:2]

    odd_y = y_t[:, 0:n_used:2]
    even_y = y_t[:, 1:n_used:2]

    def kernel_term(a, b):
        # Twice the mean gaussian-kernel value over the paired entries.
        return 2 * T.mean(T.exp(-0.5 * (1 / alpha) * T.square(a - b)))

    term1 = kernel_term(odd_x, even_x)  # k(x_{2i-1}, x_{2i})
    term2 = kernel_term(odd_y, even_y)  # k(y_{2i-1}, y_{2i})
    term3 = kernel_term(odd_x, even_y)  # k(x_{2i-1}, y_{2i})
    term4 = kernel_term(even_x, odd_y)  # k(x_{2i}, y_{2i-1})
    return term1 + term2 - term3 - term4
Exemple #29
0
def adadelta(params, cost, lr=1.0, rho=0.95):
    """Return the Adadelta update pairs for ``params`` given ``cost``.

    Each parameter gets two zero-initialised shared accumulators (squared
    gradients and squared steps), both decayed with rate ``rho``.  The
    parameter moves by ``lr`` times the gradient scaled by the ratio of the
    two root-mean-square estimates.  ``epsilon`` and ``shared_zeros_like``
    are read from module scope.

    Returns:
        List of ``(shared_variable, new_value)`` pairs.
    """
    # from https://github.com/fchollet/keras/blob/master/keras/optimizers.py
    def decayed(previous, fresh):
        # Exponential moving average with decay ``rho``.
        return rho * previous + (1.0 - rho) * fresh

    grads = T.grad(cost, params)
    sq_grad_accs = [shared_zeros_like(p.get_value()) for p in params]
    sq_step_accs = [shared_zeros_like(p.get_value()) for p in params]

    updates = []
    for p, g, sq_grad, sq_step in zip(params, grads, sq_grad_accs, sq_step_accs):
        sq_grad_next = decayed(sq_grad, T.square(g))
        updates.append((sq_grad, sq_grad_next))

        step = g * T.sqrt(sq_step + epsilon) / T.sqrt(sq_grad_next + epsilon)
        updates.append((p, p - lr * step))

        updates.append((sq_step, decayed(sq_step, T.square(step))))
    return updates
Exemple #30
0
 def get_updates(self, grads):
   """Build clipped (and optionally momentum-smoothed) SGD updates.

   ``grads`` is iterated as ``(d, dp, g)`` triples: the variable to update,
   its momentum buffer, and its gradient.  The sum of squared gradient
   entries over all triples is compared with ``self.threshold``; when it is
   not below the threshold every gradient is multiplied by
   ``self.threshold / norms``.

   Returns:
     Tuple ``(updates, T.sum(norms))`` where ``updates`` is a list of
     ``(variable, new_value)`` pairs.
   """
   squared_sums = [T.sum(T.square(g)) for (_, _, g) in grads]
   norms = squared_sums[0] if squared_sums else None
   for extra in squared_sums[1:]:
     norms = norms + extra

   updates = []
   for (target, velocity, g) in grads:
     clip_factor = ifelse(T.lt(norms, self.threshold), 1.,
                          self.threshold / norms)
     g *= clip_factor
     if self.momentum > 0:
       # Blend with the stored buffer and persist the blended value.
       g = self.momentum * velocity + (1 - self.momentum) * g
       updates.append((velocity, g))
     updates.append((target, target - self.lr * g))
   return updates, T.sum(norms)
Exemple #31
0
 def __init__(self, incoming, num_units,
              theta=lasagne.init.Normal(0.1),
              b=lasagne.init.Constant(0.),
              weight_scale=lasagne.init.Constant(1.),
              train_scale=False, nonlinearity=relu, **kwargs):
     """Create a dense layer whose weight columns are rescaled by their norm.

     The effective weight ``W`` is ``theta`` with every column divided by its
     Euclidean norm and multiplied by the matching entry of ``weight_scale``.

     Args:
         incoming: forwarded to the parent layer constructor.
         num_units: number of output units.
         theta: initializer for the raw ``(num_inputs, num_units)`` weights.
         b: initializer for the ``(num_units,)`` bias.
         weight_scale: initializer for the ``(num_units,)`` column scales.
         train_scale: whether ``weight_scale`` is registered as trainable.
         nonlinearity: activation; ``None`` selects the identity.
         **kwargs: forwarded to the parent layer constructor.
     """
     super(DenseLayer, self).__init__(incoming, **kwargs)
     if nonlinearity is None:
         nonlinearity = lasagne.nonlinearities.identity
     self.nonlinearity = nonlinearity
     self.num_units = num_units

     # All non-batch input dimensions are flattened into one fan-in size.
     fan_in = int(np.prod(self.input_shape[1:]))
     self.theta = self.add_param(theta, (fan_in, num_units), name="theta")
     self.weight_scale = self.add_param(weight_scale, (num_units, ),
                                        name="weight_scale",
                                        trainable=train_scale)

     column_norms = T.sqrt(T.sum(T.square(self.theta), axis=0))
     per_column_gain = (self.weight_scale / column_norms).dimshuffle('x', 0)
     self.W = self.theta * per_column_gain
     self.b = self.add_param(b, (num_units, ), name="b")
Exemple #32
0
 def _build_conditional(self, Xnew, pred_noise, diag, X, y, noise,
                        cov_total, mean_total):
     """Build the conditional mean and (co)variance at ``Xnew``.

     The training covariance ``cov_total(X)`` is stabilized, summed with
     ``noise(X)`` and Cholesky-factorised; the cross covariance comes from
     ``self.cov_func(X, Xnew)``.

     Args:
         Xnew: inputs at which to predict.
         pred_noise: if true, add ``noise(Xnew)`` to the returned variance.
         diag: if true, return only the variance vector instead of the full
             covariance matrix.
         X, y: training inputs and targets.
         noise: callable producing the noise covariance.
         cov_total, mean_total: callables for the training covariance and mean.

     Returns:
         ``(mu, var)`` when ``diag`` is true, otherwise
         ``(mu, stabilize(cov))``.
     """
     train_cov = cov_total(X)
     cross_cov = self.cov_func(X, Xnew)
     train_noise = noise(X)
     residual = y - mean_total(X)

     chol = cholesky(stabilize(train_cov) + train_noise)
     A = solve_lower(chol, cross_cov)
     v = solve_lower(chol, residual)
     mu = self.mean_func(Xnew) + tt.dot(tt.transpose(A), v)

     if not diag:
         cov = self.cov_func(Xnew) - tt.dot(tt.transpose(A), A)
         if pred_noise:
             cov += noise(Xnew)
         return mu, stabilize(cov)

     var = self.cov_func(Xnew, diag=True) - tt.sum(tt.square(A), 0)
     if pred_noise:
         var += noise(Xnew, diag=True)
     return mu, var
Exemple #33
0
    def _compute_losses(self, model_output):
        """Per-sequence loss: Gaussian-mixture NLL plus weighted stopping loss.

        ``model_output[0]`` supplies the stopping outputs (channel 0 is used)
        and ``model_output[1]`` the regression outputs that are turned into
        mixture parameters by ``self.model.get_mixture_parameters``.

        Side effects:
            Sets ``self.loss_per_time_step`` and ``self.loss_per_seq``.

        Returns:
            ``self.loss_per_seq``: masked sum over time steps, divided by the
            mask sum per sequence unless ``self.sum_over_timestep`` is set.
        """
        mask = self.dataset.symb_mask

        # stop_outputs.shape : (batch_size, seq_len)
        stop_outputs = model_output[0][:, :, 0]
        # regression_outputs.shape : (batch_size, seq_len, regression_layer_size)
        regression_outputs = model_output[1]

        # mixture_weights.shape : (batch_size, seq_len, n_gaussians)
        # means.shape, stds.shape : (batch_size, seq_len, n_gaussians, 3)
        mixture_weights, means, stds = self.model.get_mixture_parameters(
            regression_outputs, ndim=4)

        # targets.shape : (batch_size, seq_len, 1, 3)
        targets = self.dataset.symb_targets[:, :, None, :3]
        # stop_targets.shape : (batch_size, seq_len)
        stop_targets = self.dataset.symb_targets[:, :, 3]

        # Terms of -2 * log of each weighted component's normaliser.
        weight_term = -2 * T.log(mixture_weights)
        const_term = self.d * np.float32(np.log(2*np.pi))
        scale_term = 2 * T.sum(T.log(stds), axis=-1)
        log_prefix = weight_term + const_term + scale_term

        standardized = (targets - means) / stds
        square_mahalanobis_dist = T.sum(T.square(standardized), axis=-1)
        mixture_nll = -logsumexp(
            -0.5 * (log_prefix + square_mahalanobis_dist), axis=2)

        stopping_xent = T.nnet.binary_crossentropy(stop_outputs, stop_targets)

        # self.gamma balances the two loss terms; consider tweaking it if
        # training goes wrong.  Shape: (batch_size, seq_len)
        self.loss_per_time_step = mixture_nll + self.gamma * stopping_xent

        # Shape: (batch_size,)
        per_sequence = T.sum(self.loss_per_time_step * mask, axis=1)
        if not self.sum_over_timestep:
            # Average over the unmasked time steps of each sequence.
            per_sequence = per_sequence / T.sum(mask, axis=1)
        self.loss_per_seq = per_sequence

        return self.loss_per_seq
Exemple #34
0
    def setup_orbit_model(self, period=None):
        """Register the orbit parameters and likelihood on ``self.model``.

        An initial period is taken from the peak of a Lomb-Scargle periodogram
        of ``self.tds[0]`` and used as the mean of the period prior.  The
        ``period`` argument is accepted but not used by this method.

        Side effects:
            Sets ``self.period``, ``self.tref``, ``self.varpi``,
            ``self.eccen`` and ``self.lighttime`` and adds the ``taumodel``
            deterministic and the ``obs`` likelihood to the model.
        """
        # Initial period estimate from the time delays.
        periodogram = LombScargle(self.times, self.tds[0])
        nyquist_like = 0.5 / np.median(np.diff(self.times))
        freq_grid = np.linspace(1e-3, nyquist_like, 10000)
        power = periodogram.power(freq_grid, method="fast",
                                  normalization="psd")
        period_t = 1 / freq_grid[np.argmax(power)]

        with self.model:
            # Priors
            self.period = pm.Normal("period", mu=period_t, sd=100)
            self.tref = pm.Uniform("tref", lower=-5000, upper=5000)
            self.varpi = pm.Uniform("varpi", lower=0, upper=50)
            self.eccen = pm.Uniform("eccen", lower=1e-3, upper=0.999)
            self.lighttime = pm.Uniform('lighttime', lower=-2000, upper=2000,
                                        shape=len(self.freqs))

            # Mean anomaly, then true anomaly.
            mean_anom = 2.0 * np.pi * (self.times - self.tref) / self.period
            true_anom = get_true_anomaly(
                mean_anom, self.eccen + tt.zeros_like(mean_anom))

            factor = (1.0 - tt.square(self.eccen)) / (
                1.0 + self.eccen * tt.cos(true_anom))
            psi = -factor * tt.sin(true_anom + self.varpi)

            # One row per entry of lighttime, one column per time sample.
            tau = self.lighttime[:, None] * psi[None, :]
            taumodel = pm.Deterministic('taumodel', tau - tt.mean(tau))

            # Condition on the observations
            pm.Normal("obs", mu=taumodel, sd=None, observed=self.tds)
Exemple #35
0
    def _build_conditional(self, Xnew, pred_noise, diag):
        """Conditional mean and covariance at ``Xnew`` using Kronecker structure.

        Each per-dimension covariance is eigendecomposed separately; the
        combined eigenvalues (plus ``sigma**2`` when ``sigma`` is set) replace
        an explicit solve against the full training covariance.

        Args:
            Xnew: inputs at which to predict.
            pred_noise: if true, ``sigma`` is added to the returned
                (co)variance (scaled by an identity matrix when not ``diag``).
            diag: if true, return only the variance vector.

        Returns:
            Tuple ``(mu, cov)``.
        """
        Xs, y, sigma = self.Xs, self.y, self.sigma

        # Training points
        X = cartesian(*Xs)
        delta = y - self.mean_func(X)
        factor_covs = [cov(x) for cov, x in zip(self.cov_funcs, Xs)]
        eigs_sep, Qs = zip(*[eigh(K) for K in factor_covs])
        QTs = [tt.transpose(Q) for Q in Qs]
        eigs = kron_diag(*eigs_sep)  # combine the separate eigenvalues
        if sigma is not None:
            eigs += sigma**2

        # Prediction points
        Km = self.cov_func(Xnew, diag=diag)
        Knm = self.cov_func(X, Xnew)
        Kmn = Knm.T

        # Conditional mean
        alpha = kron_dot(Qs, kron_dot(QTs, delta) / eigs[:, None])
        mu = tt.dot(Kmn, alpha).ravel() + self.mean_func(Xnew)

        # Conditional covariance
        A = kron_dot(QTs, Knm) / tt.sqrt(eigs[:, None])
        if diag:
            cov = Km - tt.sum(tt.square(A), 0)
            if pred_noise:
                cov += sigma
            return mu, cov

        cov = Km - tt.dot(A.T, A)
        if pred_noise:
            cov += sigma * tt.identity_like(cov)
        return mu, cov
Exemple #36
0
    def get_output_for(self, input, deterministic=False, set_bn_updates=True,
                       **kwargs):
        """Normalise ``input`` with batch or running statistics and activate.

        Args:
            input: tensor to normalise.
            deterministic: if true use the stored running mean/variance,
                otherwise use statistics of the current batch.
            set_bn_updates: in non-deterministic mode, also store the
                running-average updates in ``self.bn_updates``.
            **kwargs: ignored.

        Returns:
            ``self.nonlinearity`` applied to the normalised input, scaled by
            ``self.g`` and shifted by ``self.b`` when those attributes exist.
        """
        pattern = self.dimshuffle_args

        if deterministic:
            running_mean = self.avg_batch_mean.dimshuffle(*pattern)
            running_std = T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*pattern)
            norm_features = (input - running_mean) / running_std
        else:
            batch_mean = T.mean(input, axis=self.axes_to_sum).flatten()
            centered_input = input - batch_mean.dimshuffle(*pattern)
            batch_var = T.mean(T.square(centered_input),
                               axis=self.axes_to_sum).flatten()
            batch_stdv = T.sqrt(1e-6 + batch_var)
            norm_features = centered_input / batch_stdv.dimshuffle(*pattern)

            # BN updates
            if set_bn_updates:
                # 0.1 * n / (n - 1): averaging weight times the factor that
                # turns the batch variance into its n-1 normalised form.
                var_weight = T.cast(
                    (0.1 * input.shape[0]) / (input.shape[0] - 1),
                    th.config.floatX)
                new_m = 0.9 * self.avg_batch_mean + 0.1 * batch_mean
                new_v = 0.9 * self.avg_batch_var + var_weight * batch_var
                self.bn_updates = [(self.avg_batch_mean, new_m),
                                   (self.avg_batch_var, new_v)]

        activation = norm_features
        if hasattr(self, 'g'):
            activation = norm_features * self.g.dimshuffle(*pattern)
        if hasattr(self, 'b'):
            activation = activation + self.b.dimshuffle(*pattern)

        return self.nonlinearity(activation)
Exemple #37
0
    def __call__(self, X):
        """Evaluate the RBF kernel matrix of ``X`` and its repulsive gradient.

        The bandwidth is derived from the median of all pairwise squared
        distances between rows of ``X``.

        Returns:
            Tuple ``(Kxy, dxkxy)``: the kernel matrix and the summed kernel
            gradient term, both built from symbolic expressions.
        """
        n_rows = X.shape[0]

        # Pairwise squared distances: |x|^2 + |y|^2 - 2 x.y
        gram = X.dot(X.T)
        row_sq_norms = tt.reshape(tt.sum(tt.square(X), axis=1), (n_rows, 1))
        tiled_norms = tt.repeat(row_sq_norms, n_rows, axis=1)
        H = tt.sub(tt.add(tiled_norms, tiled_norms.T), 2 * gram)

        # Median of the flattened distances.
        V = tt.sort(H.flatten())
        length = V.shape[0]
        middle = length // 2
        median = tt.switch(
            tt.eq((length % 2), 0),
            tt.mean(V[(middle - 1):(middle + 1)]),  # even count
            V[middle])                              # odd count

        h = tt.sqrt(0.5 * median / tt.log(n_rows.astype('float32') + 1.0))
        h_sq = h ** 2

        Kxy = tt.exp(-H / h_sq / 2.0)
        row_sums = tt.sum(Kxy, axis=1).dimshuffle(0, 'x')
        dxkxy = tt.add(-tt.dot(Kxy, X), tt.mul(X, row_sums)) / h_sq

        return Kxy, dxkxy
Exemple #38
0
 def get_update_func(self):
     """Build the RMSprop update function through the wrapped model.

     Creates one zero-valued shared "mean square" accumulator per model
     parameter (stored in ``self.meansquare``), then asks
     ``self.model.get_update_func`` to build a function from the accumulator
     and parameter updates, with the learning rate and decay rate as its
     symbolic inputs.
     """
     print('*** Update Function of Rmsprop ......')
     # opt_log.info("*** Update Function of Rmsprop ......")
     lr = TT.scalar(self._s("learning_rate"), dtype=theano.config.floatX)
     rho = TT.scalar(self._s("decay_rate"), dtype=theano.config.floatX)
     eps = numpy_floatX(1E-6)

     accumulators = []
     for p in self.model.param:
         zeros = p.get_value() * numpy_floatX(0.)
         accumulators.append(
             theano.shared(zeros, name="%s.meansquare" % p.name))
     self.meansquare = accumulators

     # Decayed running average of squared gradients.
     new_meansquares = []
     for g, g_ms in zip(self.grad, self.meansquare):
         new_meansquares.append(rho * g_ms + (1 - rho) * (TT.square(g)))

     updates = []
     for g_ms, g_msnew in zip(self.meansquare, new_meansquares):
         updates.append((g_ms, g_msnew))
     for p, g, g_msnew in zip(self.model.param, self.grad, new_meansquares):
         updates.append((p, p - lr * g / TT.sqrt(g_msnew + eps)))

     return self.model.get_update_func(updates, [lr, rho])
Exemple #39
0
 def _log_like(self, X, Y, n_examples):
     """Batch-scaled Gaussian log likelihood of ``Y`` plus prior terms.

     Column 0 of the network output is read as the mean and column 1 as the
     log variance.

     Side effects:
         Stores ``n_examples`` as a shared float32 in ``self.tn_examples``.

     Returns:
         Tuple ``(log_like, T.sum(MSE))`` where ``MSE`` holds the squared
         errors between ``Y`` and the predicted mean.

     Raises:
         RuntimeError: if ``self.out_type`` is not ``'Gaussian'``.
     """
     net_out = lasagne.layers.get_output(self.f_net, X)
     pred_mean = net_out[:, 0].reshape((-1, 1))
     pred_log_var = net_out[:, 1].reshape((-1, 1))
     inv_var = 1. / (T.exp(pred_log_var) + 1e-8)
     MSE = T.square(Y - pred_mean)

     if self.out_type != 'Gaussian':
         raise RuntimeError('{} not implemented'.format(self.out_type))
     per_point = -MSE * (0.5 * inv_var) - 0.5 * pred_log_var
     log_like = T.sum(T.sum(per_point, axis=1))

     # Divide by the batch size so the scale matches the updaters.
     log_like = log_like / T.cast(X.shape[0], theano.config.floatX)

     # Both priors are divided by the dataset size for the same reason.
     self.tn_examples = sharedX(np.float32(n_examples))
     variance_term = self.variance_prior.log_like(pred_log_var, n_examples)
     log_like = log_like + variance_term / self.tn_examples

     regularizable = lasagne.layers.get_all_params(self.f_net,
                                                   regularizable=True)
     weight_term = self.weight_prior.log_like(regularizable)
     log_like = log_like + weight_term / self.tn_examples
     return log_like, T.sum(MSE)
Exemple #40
0
def adam(cost, params, lr, beta1=0.9, beta2=0.999, eps=1e-8, param_grads=None):
    """Build Adam updates for ``params``.

    The parameter update subtracts the scaled first-moment estimate, so the
    returned updates move ``params`` in the direction that decreases ``cost``.

    Args:
        cost: scalar expression; differentiated when ``param_grads`` is None.
        params: shared variables to optimise.
        lr: learning rate.
        beta1: decay rate of the first-moment estimate.
        beta2: decay rate of the second-moment estimate.
        eps: constant added to the denominator.
        param_grads: optional precomputed gradients used instead of
            differentiating ``cost``.

    Returns:
        List of ``(shared_variable, new_value)`` pairs: first moment, second
        moment and parameter for every entry, followed by the step counter.
    """
    updates = []
    # Identity test: ``== None`` would be evaluated elementwise for array-like
    # gradients and make the ``if`` itself fail.
    if param_grads is None:
        grads = tensor.grad(cost, params)
        assert len(params) == len(grads)
    else:
        # NOTE(review): this wraps all of ``param_grads`` in a single shared
        # variable that is then iterated by ``zip`` below -- confirm that this
        # is the intended handling of precomputed gradients.
        grads = theano.shared(param_grads)

    # Shared step counter; ``t`` is the value used for this update.
    t0 = theano.shared(np.array(0., dtype=theano.config.floatX))
    t = t0 + 1
    corr1 = (1 - beta1**t)
    corr2 = (1 - beta2**t)
    # Learning rate with both bias corrections folded in.
    alpha = lr * tensor.sqrt(corr2) / corr1

    for p, g in zip(params, grads):
        m = theano.shared(value=np.zeros(p.get_value().shape, dtype=theano.config.floatX), broadcastable=p.broadcastable)
        v = theano.shared(value=np.zeros(p.get_value().shape, dtype=theano.config.floatX), broadcastable=p.broadcastable)
        m_t = beta1 * m + (1 - beta1) * g
        v_t = beta2 * v + (1 - beta2) * tensor.square(g)
        p_t = p - alpha * m_t/(tensor.sqrt(v_t) + eps)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((t0, t))
    return updates
Exemple #41
0
    def negativ_log_likelihood(self, f_net, X, y, n_examples, weight_prior,
                               variance_prior):
        """Negative log likelihood of ``y`` under the network's Gaussian output.

        Column 0 of the network output is read as the mean and column 1 as
        the log variance.  The data term is divided by the batch size, the
        variance prior is added as returned by ``variance_prior.log_like``
        and the weight prior is divided by ``n_examples``.

        Returns:
            Tuple ``(-log_like, T.mean(mse))``.
        """
        net_out = lasagne.layers.get_output(f_net, X)
        pred_mean = net_out[:, 0].reshape((-1, 1))
        pred_log_var = net_out[:, 1].reshape((-1, 1))

        inv_var = 1. / (T.exp(pred_log_var) + 1e-16)
        mse = T.square(y - pred_mean)
        per_point = -mse * (0.5 * inv_var) - 0.5 * pred_log_var
        log_like = T.sum(T.sum(per_point, axis=1))

        # Divide by the batch size so the scale matches the updaters.
        log_like = log_like / T.cast(X.shape[0], theano.config.floatX)

        tn_examples = T.cast(n_examples, theano.config.floatX)
        # Prior on the predicted log variance.
        log_like = log_like + variance_prior.log_like(pred_log_var, n_examples)
        # Prior on the trainable weights, scaled by the dataset size.
        trainable = lasagne.layers.get_all_params(f_net, trainable=True)
        log_like = log_like + weight_prior.log_like(trainable) / tn_examples

        return -log_like, T.mean(mse)
Exemple #42
0
    def _compute_losses(self, model_output):
        """Per-sequence loss: Gaussian NLL plus weighted stopping loss.

        ``model_output[0]`` supplies the stopping outputs (channel 0 is used)
        and ``model_output[1]`` the regression outputs that
        ``self.model.get_distribution_parameters`` turns into ``mu``/``sigma``.

        Side effects:
            Sets ``self.loss_per_time_step`` and ``self.loss_per_seq``.

        Returns:
            ``self.loss_per_seq``: masked sum over time steps, divided by the
            mask sum per sequence unless ``self.sum_over_timestep`` is set.
        """
        mask = self.dataset.symb_mask

        stop_outputs = model_output[0][:, :, 0]
        # regression_outputs.shape : (batch_size, seq_length, regression_layer_size)
        regression_outputs = model_output[1]

        # mu.shape, sigma.shape : (batch_size, seq_len, 3)
        mu, sigma = self.model.get_distribution_parameters(regression_outputs)

        # targets.shape : (batch_size, seq_len, 3)
        targets = self.dataset.symb_targets[:, :, :3]
        stop_targets = self.dataset.symb_targets[:, :, 3]

        standardized = (targets - mu) / sigma
        square_mahalanobis_dist = T.sum(T.square(standardized), axis=-1)
        log_two_pi = np.float32(np.log(2 * np.pi))
        nll_per_time_step = 0.5 * (self.d * log_two_pi +
                                   2 * T.sum(T.log(sigma), axis=-1) +
                                   square_mahalanobis_dist)

        stopping_xent = T.nnet.binary_crossentropy(stop_outputs, stop_targets)

        # self.gamma balances the two loss terms; consider tweaking it if
        # training goes wrong.  Shape: (batch_size, seq_len)
        self.loss_per_time_step = nll_per_time_step + self.gamma * stopping_xent

        # Shape: (batch_size,)
        per_sequence = T.sum(self.loss_per_time_step * mask, axis=1)
        if not self.sum_over_timestep:
            # Average over the unmasked time steps of each sequence.
            per_sequence = per_sequence / T.sum(mask, axis=1)
        self.loss_per_seq = per_sequence

        return self.loss_per_seq
Exemple #43
0
    def lmul_sq_T(self, x):
        """ Kind of a stupid hacky method used to support convolutional score matching.
        Ought to find a way to make _filters symbolic rather than shared.

        This variant has not been adapted yet, so every call fails.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError(
            "This method is not yet modified since copy-pasting from pylearn2.linear.conv2d"
        )
        # NOTE: everything below is unreachable.  It is the body copied from
        # pylearn2.linear.conv2d, kept only as the starting point for a port
        # (the conv2d call is still commented out, so it could not run as is).
        assert x.dtype == self._filters.dtype

        op_axes = ('b', 'c', 0, 1)
        axes = self.output_axes
        if tuple(axes) != op_axes:
            x = x.dimshuffle(axes.index('b'), axes.index('c'), axes.index(0),
                             axes.index(1))

        # dot(x, sq(A).T)
        dummy_v = T.tensor4()
        sqfilt = T.square(self._filters)
        z_hs = 0.  #conv2d(dummy_v, sqfilt,
        #image_shape=self._img_shape,
        #filter_shape=self._filters_shape,
        #kernel_stride=self._kernel_stride,
        #pad = self.pad
        #)
        rval, xdummy = z_hs.owner.op.grad((dummy_v, sqfilt), (x, ))

        # Format the output based on the input space
        axes = self.input_space.axes
        assert len(axes) == 4

        if tuple(axes) != op_axes:
            rval = rval.dimshuffle(op_axes.index(axes[0]),
                                   op_axes.index(axes[1]),
                                   op_axes.index(axes[2]),
                                   op_axes.index(axes[3]))

        return rval
Exemple #44
0
def rbf_kernel(X):
    """RBF kernel matrix of the rows of ``X`` and its summed gradient term.

    The bandwidth is derived from the median of all pairwise squared
    distances; the median branch is selected lazily with ``ifelse``.

    Returns:
        Tuple ``(Kxy, dxkxy)`` of symbolic expressions.
    """
    # TODO. rbf may not be a good choice for high dimension data
    n_rows = X.shape[0]

    # Pairwise squared distances: |x|^2 + |y|^2 - 2 x.y
    gram = tt.dot(X, X.transpose())
    row_sq_norms = tt.reshape(tt.sum(tt.square(X), axis=1), (n_rows, 1))
    tiled_norms = tt.repeat(row_sq_norms, n_rows, axis=1)
    H = tt.sub(tt.add(tiled_norms, tiled_norms.transpose()), 2 * gram)

    # Median of the flattened distances.
    flat = H.flatten()
    count = flat.shape[0]
    middle = count // 2
    ordered = tt.sort(flat)
    median = ifelse(tt.eq((count % 2), 0),
                    tt.mean(ordered[(middle - 1):(middle + 1)]),  # even count
                    ordered[middle])                              # odd count

    h = tt.sqrt(0.5 * median / tt.log(n_rows.astype('float32') + 1.0))
    h_sq = h ** 2

    Kxy = tt.exp(-H / h_sq / 2.0)
    row_sums = tt.sum(Kxy, axis=1).dimshuffle(0, 'x')
    dxkxy = tt.add(-tt.dot(Kxy, X), tt.mul(X, row_sums)) / h_sq

    return Kxy, dxkxy
def vgd_kernel(X0):
    """RBF kernel of the rows of ``X0``, its summed gradient and bandwidth.

    The bandwidth comes from the median of all pairwise squared distances
    and is halved before use.

    Returns:
        Tuple ``(Kxy, dxkxy, h)`` of symbolic expressions.
    """
    n_rows = X0.shape[0]

    # Pairwise squared distances: |x|^2 + |y|^2 - 2 x.y
    gram = T.dot(X0, X0.transpose())
    row_sq_norms = T.reshape(T.sum(T.square(X0), axis=1), (n_rows, 1))
    tiled_norms = T.repeat(row_sq_norms, n_rows, axis=1)
    H = T.sub(T.add(tiled_norms, tiled_norms.transpose()), 2 * gram)

    # Median of the flattened distances.
    flat = H.flatten()
    count = flat.shape[0]
    middle = count // 2
    ordered = T.sort(flat)
    median = T.switch(
        T.eq((count % 2), 0),
        T.mean(ordered[(middle - 1):(middle + 1)]),  # even count
        ordered[middle])                             # odd count

    h = T.sqrt(0.5 * median / T.log(n_rows.astype('float32') + 1.0)) / 2.
    h_sq = h**2

    Kxy = T.exp(-H / h_sq / 2.0)
    row_sums = T.sum(Kxy, axis=1).dimshuffle(0, 'x')
    dxkxy = T.add(-T.dot(Kxy, X0), T.mul(X0, row_sums)) / h_sq

    return (Kxy, dxkxy, h)
Exemple #46
0
            def second_order_update(loss_or_grads, params, oldparams,
                                    step_size):
                """Second-order update method for optimizing loss_last_sample, so basically,
                KL term (new params || old params) + NLL of latest sample. The Hessian is
                evaluated at the origin and provides curvature information to make a more
                informed step in the correct descent direction.

                Args:
                    loss_or_grads: expression differentiated w.r.t. ``params``.
                    params: variables to update; those named ``'mu'`` or
                        ``'b_mu'`` take their curvature from the entry that
                        follows them in ``oldparams``.
                    oldparams: previous parameter values, indexed in parallel
                        with ``params``.
                    step_size: multiplier of the preconditioned gradient.

                Returns:
                    OrderedDict mapping each parameter to its new expression.
                """
                grads = T.grad(loss_or_grads, params)
                updates = OrderedDict()
                for i, (param, grad) in enumerate(zip(params, grads)):
                    if param.name in ('mu', 'b_mu'):
                        # The matching rho is stored right after the mean.
                        oldparam_rho = oldparams[i + 1]
                        invH = T.square(T.log(1 + T.exp(oldparam_rho)))
                    else:
                        # NOTE(review): curvature here is computed from the
                        # current ``param``; the old value that used to be
                        # looked up in this branch was never used -- confirm
                        # which one is intended.
                        p = param

                        H = 2. * (T.exp(2 * p)) / \
                            (1 + T.exp(p))**2 / (T.log(1 + T.exp(p))**2)
                        invH = 1. / H
                    updates[param] = param - step_size * invH * grad

                return updates
Exemple #47
0
def adam_conditional_updates(params, cost, mincost, lr=0.001, mom1=0.9,
                             mom2=0.999):
    """Adam updates that are skipped whenever ``cost < mincost``.

    Every update (moments, parameters and step counter) is wrapped in an
    ``ifelse`` that keeps the current value when the cost is already below
    ``mincost``.

    Args:
        params: shared variables to optimise.
        cost: scalar expression to differentiate and to compare with
            ``mincost``.
        mincost: threshold below which nothing is updated.
        lr: learning rate.
        mom1: decay rate of the first-moment estimate.
        mom2: decay rate of the second-moment estimate.

    Returns:
        List of ``(shared_variable, new_value)`` pairs.
    """
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))

    def gated(current, proposed):
        # Keep ``current`` when the cost is already below the threshold.
        return ifelse(cost < mincost, current, proposed)

    for param, grad in zip(params, grads):
        first = th.shared(np.cast[th.config.floatX](param.get_value() * 0.))
        second = th.shared(np.cast[th.config.floatX](param.get_value() * 0.))

        first_next = mom1 * first + (1. - mom1) * grad
        second_next = mom2 * second + (1. - mom2) * T.square(grad)

        # Bias-corrected moment estimates.
        first_hat = first_next / (1. - mom1**t)
        second_hat = second_next / (1. - mom2**t)
        step = first_hat / T.sqrt(second_hat + 1e-8)

        updates.extend([
            (first, gated(first, first_next)),
            (second, gated(second, second_next)),
            (param, gated(param, param - lr * step)),
        ])
    updates.append((t, gated(t, t + 1)))
    return updates
Exemple #48
0
def ADAM(lr, params, grads, loss, iteration, beta_1=0.9, beta_2=0.999,
         epsilon=1e-8):
    """
    ADAM update

    Builds the update list for ``params`` from precomputed ``grads``.  The
    weight-decay coefficient is read from ``cfg.TRAIN.WEIGHT_DECAY`` and is
    added to the gradient of every parameter that is not flagged
    ``is_bias``.  ``loss`` is accepted but not used here.

    Returns:
        List of ``(shared_variable, new_value)`` pairs: first moment, second
        moment and parameter value for every parameter.
    """
    t = iteration
    # Learning rate with both bias corrections folded in.
    lr_t = lr * T.sqrt(1 - T.pow(beta_2, t)) / (1 - T.pow(beta_1, t))
    w_decay = cfg.TRAIN.WEIGHT_DECAY

    updates = []
    for param, grad in zip(params, grads):
        # Both moment estimates start at zero.
        moment = theano.shared(param.val.get_value() * 0.)
        velocity = theano.shared(param.val.get_value() * 0.)

        skip_decay = param.is_bias or w_decay == 0
        regularized_g = grad if skip_decay else grad + w_decay * param.val

        moment_next = (beta_1 * moment) + (1 - beta_1) * regularized_g
        velocity_next = (beta_2 * velocity) + (1 - beta_2) * T.square(regularized_g)
        value_next = param.val - lr_t * moment_next / (T.sqrt(velocity_next) + epsilon)

        updates += [
            (moment, moment_next),
            (velocity, velocity_next),
            (param.val, value_next),
        ]

    return updates
Exemple #49
0
    def _apply_gradients(self, grads_and_vars):
        """Build the Adam update operation for ``(gradient, variable)`` pairs.

        Hyper-parameters are read from ``self.args``.  Running powers of
        ``beta1`` and ``beta2`` are kept in slots and multiplied by the
        respective beta on every application.

        Returns:
            ``Operation`` wrapping an OrderedDict of all updates.
        """
        hyper = self.args
        b1, b2 = hyper['beta1'], hyper['beta2']
        ep, lr = hyper['epsilon'], hyper['learning_rate']

        b1_pow = self._create_slot(b1, 'beta1_power')
        b2_pow = self._create_slot(b2, 'beta2_power')
        # Learning rate with both bias corrections folded in.
        alpha = lr * T.sqrt(1.0 - b2_pow) / (1.0 - b1_pow)

        updates = OrderedDict()
        for grad, var in grads_and_vars:
            first = self._create_slot_var(var, 'm')
            second = self._create_slot_var(var, 'v')

            # Moving averages written as "old + rate * (target - old)".
            first_next = first + (1.0 - b1) * (grad - first)
            second_next = second + (1.0 - b2) * (T.square(grad) - second)

            updates[first] = first_next
            updates[second] = second_next
            updates[var] = var - (first_next * alpha) / (T.sqrt(second_next) + ep)

        updates[b1_pow] = b1_pow * b1
        updates[b2_pow] = b2_pow * b2
        return Operation(op=updates)
Exemple #50
0
    def lmul_sq_T(self, x):
        """ Kind of a stupid hacky method used to support convolutional score matching.
        Ought to find a way to make _filters symbolic rather than shared.

        Builds a convolution of a dummy input with the squared filters and
        returns the gradient of that op with respect to its input, evaluated
        for the output gradient ``x``.  ``x`` is first permuted to
        ``('b', 'c', 0, 1)`` order and the result is permuted to the axis
        order of ``self.input_space``.
        """
        assert x.dtype == self._filters.dtype

        op_axes = ('b', 'c', 0, 1)
        out_axes = self.output_axes
        if tuple(out_axes) != op_axes:
            x = x.dimshuffle(*[out_axes.index(axis) for axis in op_axes])

        # dot(x, sq(A).T)
        dummy_v = T.tensor4()
        sqfilt = T.square(self._filters)
        z_hs = conv2d(dummy_v, sqfilt,
                      image_shape=self._img_shape,
                      filter_shape=self._filters_shape,
                      subsample=self._subsample,
                      border_mode=self._border_mode)
        rval, xdummy = z_hs.owner.op.grad((dummy_v, sqfilt), (x, ))

        # Format the output based on the input space
        in_axes = self.input_space.axes
        assert len(in_axes) == 4
        if tuple(in_axes) != op_axes:
            rval = rval.dimshuffle(*[op_axes.index(axis) for axis in in_axes])

        return rval
Exemple #51
0
def log_diff_normal_cdf(mu, sigma, x, y):
    r"""
    Compute :math:`\log(\Phi(\frac{x - \mu}{\sigma}) - \Phi(\frac{y - \mu}{\sigma}))` safely in log space.

    Parameters
    ----------
    mu: float
        mean
    sigma: float
        std

    x: float

    y: float
        must be strictly less than x.

    Returns
    -------
    log (\Phi(x) - \Phi(y))

    """
    # Standardise and fold in the 1/sqrt(2) that relates Phi to erf.
    x = (x - mu) / sigma / tt.sqrt(2.0)
    y = (y - mu) / sigma / tt.sqrt(2.0)

    # To stabilize the computation, consider these three regions:
    # 1) x > y > 0 => Use erf(x) = 1 - e^{-x^2} erfcx(x) and erf(y) =1 - e^{-y^2} erfcx(y)
    # 2) 0 > x > y => Use erf(x) = e^{-x^2} erfcx(-x) and erf(y) = e^{-y^2} erfcx(-y)
    # 3) x > 0 > y => Naive formula log( (erf(x) - erf(y)) / 2 ) works fine.
    return tt.log(0.5) + tt.switch(
        tt.gt(y, 0),
        -tt.square(y) + tt.log(tt.erfcx(y) - tt.exp(tt.square(y) - tt.square(x)) * tt.erfcx(x)),
        tt.switch(
            tt.lt(x, 0),  # 0 > x > y
            -tt.square(x)
            + tt.log(tt.erfcx(-x) - tt.exp(tt.square(x) - tt.square(y)) * tt.erfcx(-y)),
            tt.log(tt.erf(x) - tt.erf(y)),  # x >0 > y
        ),
    )
        def f(x, u, i, terminal):
            """Cost of state ``x`` and control ``u``.

            ``x`` is indexed on its last axis as the concatenation of qpos and
            qvel (6 entries); ``i`` is accepted but not used.  The control
            penalty is zero when ``terminal`` is true and is weighted by
            ``ctrl_coef`` from the enclosing scope otherwise.
            """
            # Original Gym does not impose a control cost, but does clip it
            # to [-1, 1]. This non-linear dynamics is hard for iLQG to handle,
            # so add a quadratic control penalty instead.
            ctrl_cost = (T.zeros_like(x[..., 0]) if terminal
                         else T.square(u).sum(axis=-1))

            # Distance cost: needs the Cartesian coordinates of the pole tip.
            slider_x = x[..., 0]              # qpos[0]: x axis of the slider
            first_angle = x[..., 1]           # qpos[1]: angle of the first hinge
            total_angle = first_angle + x[..., 2]  # plus qpos[2], second hinge
            # 0 degrees is y=1, x=0; rotates clockwise.
            tip_x = slider_x + T.sin(first_angle) + T.sin(total_angle)
            tip_y = T.cos(first_angle) + T.cos(total_angle)
            dist_cost = 0.01 * T.square(tip_x) + T.square(tip_y - 2)

            # Velocity cost on qvel[1] and qvel[2].
            vel_cost = 1e-3 * T.square(x[..., 4]) + 5e-3 * T.square(x[..., 5])

            # TODO: termination penalty? (shouldn't change optimal policy?)
            # Positive only while the tip height is below 1.1.
            dist_below = T.max([T.zeros_like(tip_y), 1.1 - tip_y], axis=0)
            termination_cost = T.square(dist_below)

            cost = (5 * termination_cost + dist_cost + vel_cost +
                    ctrl_coef * ctrl_cost)
            return cost
Exemple #53
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import theano.tensor as T
import theano
from theano import function
from theano import shared

# Symbolic float64 vector and the square of its L2 norm.
w = T.dvector("w")
w_L2_norm_2 = T.square(w.norm(L=2))
# Gradient of the squared norm with respect to w.
g_w_L2_norm_2 = T.grad(w_L2_norm_2, w)

# print(...) with a single argument behaves the same under Python 2 and 3.
print("To see the graph, difficult to see the result in math form...")
# theano.pp only returns the pretty-printed graph; it must be printed to be seen.
print(theano.pp(g_w_L2_norm_2))


Exemple #54
0
def l2_normalize(x, axis):
    """Divide ``x`` by its Euclidean norm taken along ``axis``.

    The norm keeps the reduced dimension so it broadcasts against ``x``.
    """
    squared_length = T.sum(T.square(x), axis=axis, keepdims=True)
    return x / T.sqrt(squared_length)
    weights.append(m.parameters[layer.weights])

# Sum of squared entries of the first three collected weight tensors.
weight_decay = ((weights[0]**2).sum() + (weights[1]**2).sum() +
                (weights[2]**2).sum())

# Divide the penalty by the number of rows of the input expression.
weight_decay /= m.exprs['inpt'].shape[0]
# Keep the unregularised loss available under its own key.
m.exprs['true_loss'] = m.exprs['loss']
c_wd = 0.1
m.exprs['loss'] = m.exprs['loss'] + c_wd * weight_decay

# Mean absolute error; the network output is rescaled with the std and mean
# of the training labels before it is compared with the target.
mae = T.abs_((m.exprs['output'] * np.std(train_labels, axis=0) +
              np.mean(train_labels, axis=0)) - m.exprs['target']).mean(axis=0)
f_mae = m.function(['inpt', 'target'], mae)

# Root mean squared error on the same rescaled output.
rmse = T.sqrt(
    T.square((m.exprs['output'] * np.std(train_labels, axis=0) +
              np.mean(train_labels, axis=0)) - m.exprs['target']).mean(axis=0))
f_rmse = m.function(['inpt', 'target'], rmse)

start = time.time()
# Set up a nice printout.
keys = '#', 'seconds', 'loss', 'val loss', 'mae_train', 'rmse_train', 'mae_test', 'rmse_test'
max_len = max(len(i) for i in keys)
header = '\t'.join(i for i in keys)
# print(...) with a single argument behaves the same under Python 2 and 3.
print(header)
print('-' * len(header))
# The context manager closes the file even if a write fails.
with open('result.txt', 'a') as results:
    results.write(header + '\n')
    results.write('-' * len(header) + '\n')

EXP_DIR = os.getcwd()
Exemple #56
0
# Show the data
#plt.scatter(x_data,y_data)
#plt.show()

# Define the inputs; the "d" prefix means float64
x = T.dmatrix('x')
y = T.dmatrix('y')

# Add the hidden layer: in_size=1 because x has a single feature; out_size=10 is our own choice
l1 = Layer(x, 1, 10, T.nnet.relu)
# Output layer: out_size=1 because y also has only one dimension
l2 = Layer(l1.outputs, 10, 1, None)

# Compute the cost (mean of the squared differences between output and y)
cost = T.mean(T.square(l2.outputs - y))

# Compute the gradients (the per-step change of each weight/bias); the parameters go in the second argument
gW1, gb1, gW2, gb2 = T.grad(cost, [l1.W, l1.b, l2.W, l2.b])

# Apply gradient descent
learning_rate = 0.05
# x is an input because evaluating cost needs x and y to compute l2.outputs
train = theano.function(
    inputs=[x, y],
    outputs=cost,
    # each item to update is written as a parenthesised (variable, new_value) pair
    updates=[(l1.W, l1.W - learning_rate * gW1),
             (l1.b, l1.b - learning_rate * gb1),
             (l2.W, l2.W - learning_rate * gW2),
             (l2.b, l2.b - learning_rate * gb2)])
Exemple #57
0
    for p in disc_params
]
# Move every averaged parameter `a` a fraction 0.0001 of the way towards its
# current parameter `p`.
disc_avg_updates = [(a, a + 0.0001 * (p - a))
                    for p, a in zip(disc_params, disc_param_avg)]
# (parameter, averaged parameter) pairs -- presumably meant to be passed as
# `givens` so the averaged values replace the parameters; confirm at call site.
disc_avg_givens = [(p, a) for p, a in zip(disc_params, disc_param_avg)
                   ]  # data based initialization
# Discriminator training step: returns both losses and applies
# disc_param_updates (disc_avg_updates is not applied by this function).
train_batch_disc = th.function(inputs=[x_lab, x_unl, lr],
                               outputs=[loss_lab, loss_unl],
                               updates=disc_param_updates)

# Theano functions for training the gen net
# Outputs of the third-from-last discriminator layer for unlabeled and
# generated data.
output_unl = ll.get_output(disc_layers[-3], x_unl, deterministic=False)
output_gen = ll.get_output(disc_layers[-3], gen_dat, deterministic=False)
# Means over axis 0 of those outputs.
m1 = T.mean(output_unl, axis=0)
m2 = T.mean(output_gen, axis=0)
loss_gen = T.mean(T.square(m1 - m2))  # feature matching loss
gen_params = ll.get_all_params(gen_layers, trainable=True)
gen_param_updates = nn.adam_updates(gen_params, loss_gen, lr=lr, mom1=0.5)
# Generator training step: returns the feature matching loss.
train_batch_gen = th.function(inputs=[x_unl, lr],
                              outputs=loss_gen,
                              updates=gen_param_updates)

# Compiled function returning the deterministic output of the last
# discriminator layer for a 4-D input.
x_temp = T.tensor4()
features = ll.get_output(disc_layers[-1], x_temp, deterministic=True)
generate_features = th.function(inputs=[x_temp], outputs=features)
# //////////// perform training //////////////
for epoch in range(1):
    begin = time.time()
    lr = np.cast[th.config.floatX](args.learning_rate *
                                   np.minimum(3. - epoch / 400., 1.))
    nr_batches_lab = int(txs.shape[0] / args.batch_size)
Exemple #58
0
def compute_y(P, no_dims, max_iter):
    """Iteratively optimise a low-dimensional embedding with Theano.

    The embedding ``Y`` is initialised from a standard normal distribution
    (NumPy's global RNG is re-seeded with 2, so results are reproducible but
    the global random state is modified) and then refined for ``max_iter``
    steps using momentum gradient descent with per-element adaptive gains.
    Pairwise similarities of the embedding use the kernel
    ``1 / (1 + squared_distance)`` with the diagonal zeroed.

    Args:
        P: 2-D numpy array of target affinities. It is subtracted from the
            ``n x n`` matrix ``Q``, so it is presumably ``n x n`` -- TODO
            confirm against the caller.
        no_dims: number of columns of the embedding.
        max_iter: number of update steps to run.

    Returns:
        numpy array of shape ``(n, no_dims)`` holding the embedding after the
        last step.

    Raises:
        ValueError: if ``P`` is not 2-D (tuple unpacking of ``P.shape``).
    """
    # Unpacking two values also enforces that P is 2-D.
    n, _ = P.shape

    initial_momentum = 0.5
    final_momentum = 0.8
    eta = 500
    min_gain = 0.01
    momentum_switch_step = 20  # step at which momentum becomes final_momentum
    p_rescale_step = 100  # step after which P is divided by 4

    min_gain_f = tensor.cast(min_gain, FLOATX)

    # Sample the initial embedding from a normal distribution
    # (mean = 0, standard deviation = 1).
    numpy.random.seed(2)
    Y = numpy.random.randn(n, no_dims).astype(FLOATX)
    iY = numpy.zeros((n, no_dims), dtype=FLOATX)
    gains = numpy.ones((n, no_dims), dtype=FLOATX)

    # State lives in shared variables so one compiled function can update it
    # in place on every call.
    y_arg = theano.shared(Y)
    iy_arg = theano.shared(iY)
    gains_arg = theano.shared(gains)
    p_arg = theano.shared(P.astype(FLOATX))
    momentum = theano.shared(numpy.float32(initial_momentum))

    # Compute pairwise affinities of the current embedding:
    # num[i, j] = 1 / (1 + |y_i|^2 + |y_j|^2 - 2 * y_i . y_j), zero diagonal.
    sum_y = tensor.sum(tensor.square(y_arg), 1)
    num = 1 / (1 + tensor.add(
        tensor.add(-2 * tensor.dot(y_arg, y_arg.T), sum_y).T, sum_y))
    num = tensor.set_subtensor(num[range(n), range(n)], 0)

    # Normalise and clamp away from zero.
    Q = num / tensor.sum(num)
    Q = tensor.maximum(Q, 1e-12)

    PQ = p_arg - Q

    # Gradient, vectorised:
    # dy[i] = (sum_j A[j, i]) * y[i] - sum_j A[j, i] * y[j]
    A = PQ * num
    dy_arg = (tensor.tile(tensor.sum(A, 0),
                          (no_dims, 1)).T * y_arg) - tensor.dot(A.T, y_arg)
    dy_arg = tensor.cast(dy_arg, FLOATX)

    # Adaptive gains: +0.2 where the gradient sign differs from the sign of
    # the previous step, *0.8 where it agrees, then clamp at min_gain.
    indexsa = tensor.neq((dy_arg > 0), (iy_arg > 0)).nonzero()
    indexsb = tensor.eq((dy_arg > 0), (iy_arg > 0)).nonzero()
    resulta = tensor.set_subtensor(gains_arg[indexsa],
                                   gains_arg[indexsa] + 0.2)
    resultb = tensor.set_subtensor(resulta[indexsb], resulta[indexsb] * 0.8)

    indexs_min = (resultb < min_gain_f).nonzero()
    new_gains_arg = tensor.set_subtensor(resultb[indexs_min], min_gain_f)

    # Momentum step, then re-centre the embedding on its column means.
    new_iy_arg = momentum * iy_arg - eta * (new_gains_arg * dy_arg)
    new_y_arg = y_arg + new_iy_arg
    new_y_arg = new_y_arg - tensor.tile(tensor.mean(new_y_arg, 0), (n, 1))

    # One call performs one full update of embedding, velocity and gains.
    compute_y_fun = theano.function(inputs=[],
                                    updates=[(y_arg, new_y_arg),
                                             (iy_arg, new_iy_arg),
                                             (gains_arg, new_gains_arg)])

    for cur_step in range(max_iter):
        if cur_step == momentum_switch_step:
            momentum.set_value(numpy.float32(final_momentum))
        compute_y_fun()
        if cur_step == p_rescale_step:
            # NOTE(review): presumably undoes a x4 scaling of P applied by
            # the caller -- confirm; P is not scaled inside this function.
            p_arg.set_value((p_arg.get_value() / 4).astype(FLOATX))

    return y_arg.get_value()
def osl_w_brier_loss(o, f, class_weights):
    """Class-weighted squared-difference loss.

    Args:
        o: the original outcome.
        f: the forecast.
        class_weights: weights combined with the squared differences through
            a dot product.

    Returns:
        Mean over the last axis of ``dot((f - d) ** 2, class_weights)``,
        where ``d`` is the arg-max index of ``o * f`` along the last axis
        (kept as a size-1 axis so it broadcasts against ``f``).
    """
    outcome_times_forecast = T.mul(o, f)
    d = T.argmax(outcome_times_forecast, axis=-1, keepdims=True)
    squared_diff = T.square(T.sub(f, d))
    weighted = T.dot(squared_diff, class_weights)
    return T.mean(weighted, axis=-1)
Exemple #60
0
def gan_unlabelled_classif(trainx, trainy, testx, testy, lab_cnt, inp_size,
                           train_ex_cnt):
    """Train a semi-supervised GAN classifier and print per-epoch metrics.

    A generator (MLP fed with 100-d uniform noise) and a discriminator /
    classifier (MLP with Gaussian-noise layers and ``lab_cnt`` output
    logits) are built. The discriminator is trained on a small labeled
    subset plus all training inputs used as unlabeled data; the generator is
    trained with a feature-matching loss on ``layers[-3]``. After every epoch
    the averaged losses, the training error and the test error (evaluated
    with averaged discriminator parameters) are printed.

    Hyper-parameters (seeds, batch size, labeled examples per class,
    iteration limit, unlabeled weight) are read from the process command
    line through ``argparse``.

    Args:
        trainx: training inputs; reshaped to ``(-1, inp_size)`` and cast to
            ``floatX``.
        trainy: training labels; cast to ``int32``.
        testx: test inputs; reshaped and cast like ``trainx``.
        testy: test labels; cast to ``int32``.
        lab_cnt: number of classes (units of the final discriminator layer).
        inp_size: flattened size of one input example.
        train_ex_cnt: expected number of training examples; checked with an
            ``assert`` against ``trainx.shape[0]``.
    """
    trainy = trainy.astype(np.int32)
    testy = testy.astype(np.int32)
    trainx = trainx.reshape((-1, inp_size)).astype(th.config.floatX)
    testx = testx.reshape((-1, inp_size)).astype(th.config.floatX)
    assert train_ex_cnt == trainx.shape[0]

    # settings (parsed from sys.argv)
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--seed_data', type=int, default=1)
    parser.add_argument('--unlabeled_weight', type=float, default=1.)
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--count', type=int, default=10)
    parser.add_argument('--iter_limit', type=int, default=300)
    args = parser.parse_args()
    print(args)

    # fixed random seeds
    rng = np.random.RandomState(args.seed)
    theano_rng = MRG_RandomStreams(rng.randint(2**15))
    lasagne.random.set_rng(np.random.RandomState(rng.randint(2**15)))
    data_rng = np.random.RandomState(args.seed_data)

    # npshow(trainx.reshape((-1, 27, 32))[0])

    # Two independent copies of the full training set serve as unlabeled
    # data: one for the discriminator step, one for the generator step.
    trainx_unl = trainx.copy()
    trainx_unl2 = trainx.copy()
    nr_batches_train = int(trainx.shape[0] / args.batch_size)
    nr_batches_test = int(testx.shape[0] / args.batch_size)

    # select labeled data: up to args.count examples for class (_j % lab_cnt)
    # on each of 10 passes
    inds = data_rng.permutation(trainx.shape[0])
    trainx = trainx[inds]
    trainy = trainy[inds]
    txs = []
    tys = []
    for _j in range(10):
        j = _j % lab_cnt
        txs.append(trainx[trainy == j][:args.count])
        tys.append(trainy[trainy == j][:args.count])
    txs = np.concatenate(txs, axis=0)
    tys = np.concatenate(tys, axis=0)

    # specify generative model
    noise = theano_rng.uniform(size=(args.batch_size, 100))
    gen_layers = [LL.InputLayer(shape=(args.batch_size, 100), input_var=noise)]
    gen_layers.append(
        nn.batch_norm(LL.DenseLayer(gen_layers[-1],
                                    num_units=500,
                                    nonlinearity=T.nnet.softplus),
                      g=None))
    gen_layers.append(
        nn.batch_norm(LL.DenseLayer(gen_layers[-1],
                                    num_units=500,
                                    nonlinearity=T.nnet.softplus),
                      g=None))
    gen_layers.append(
        nn.l2normalize(
            LL.DenseLayer(gen_layers[-1],
                          num_units=inp_size,
                          nonlinearity=T.nnet.sigmoid)))
    gen_dat = LL.get_output(gen_layers[-1], deterministic=False)

    # specify supervised model
    layers = [LL.InputLayer(shape=(None, inp_size))]
    layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.3))
    layers.append(nn.DenseLayer(layers[-1], num_units=1000))
    layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
    layers.append(nn.DenseLayer(layers[-1], num_units=500))
    layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
    layers.append(nn.DenseLayer(layers[-1], num_units=250))
    layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
    layers.append(nn.DenseLayer(layers[-1], num_units=250))
    layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
    layers.append(nn.DenseLayer(layers[-1], num_units=250))
    layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
    layers.append(
        nn.DenseLayer(layers[-1],
                      num_units=lab_cnt,
                      nonlinearity=None,
                      train_scale=True))

    # costs
    labels = T.ivector()
    x_lab = T.matrix()
    x_unl = T.matrix()

    # The results of these two calls are not used directly; the calls are
    # made with init=True before the layers' `init_updates` are collected.
    temp = LL.get_output(gen_layers[-1], init=True)
    temp = LL.get_output(layers[-1], x_lab, deterministic=False, init=True)
    init_updates = [
        u for l in gen_layers + layers for u in getattr(l, 'init_updates', [])
    ]

    output_before_softmax_lab = LL.get_output(layers[-1],
                                              x_lab,
                                              deterministic=False)
    output_before_softmax_unl = LL.get_output(layers[-1],
                                              x_unl,
                                              deterministic=False)
    output_before_softmax_fake = LL.get_output(layers[-1],
                                               gen_dat,
                                               deterministic=False)

    z_exp_lab = T.mean(nn.log_sum_exp(output_before_softmax_lab))
    z_exp_unl = T.mean(nn.log_sum_exp(output_before_softmax_unl))
    z_exp_fake = T.mean(nn.log_sum_exp(output_before_softmax_fake))
    # Logit of the true class for each labeled example; this indexing
    # requires the labeled batch to contain exactly args.batch_size rows.
    l_lab = output_before_softmax_lab[T.arange(args.batch_size), labels]
    l_unl = nn.log_sum_exp(output_before_softmax_unl)
    loss_lab = -T.mean(l_lab) + T.mean(z_exp_lab)
    loss_unl = -0.5 * T.mean(l_unl) + 0.5 * T.mean(
        T.nnet.softplus(
            nn.log_sum_exp(output_before_softmax_unl))) + 0.5 * T.mean(
                T.nnet.softplus(nn.log_sum_exp(output_before_softmax_fake)))

    train_err = T.mean(
        T.neq(T.argmax(output_before_softmax_lab, axis=1), labels))

    # Feature matching: squared difference between the batch means of the
    # layers[-3] outputs on generated and on unlabeled data.
    mom_gen = T.mean(LL.get_output(layers[-3], gen_dat), axis=0)
    mom_real = T.mean(LL.get_output(layers[-3], x_unl), axis=0)
    loss_gen = T.mean(T.square(mom_gen - mom_real))

    # test error
    output_before_softmax = LL.get_output(layers[-1],
                                          x_lab,
                                          deterministic=True)
    test_err = T.mean(T.neq(T.argmax(output_before_softmax, axis=1), labels))

    # Theano functions for training and testing
    lr = T.scalar()
    disc_params = LL.get_all_params(layers, trainable=True)
    disc_param_updates = nn.adam_updates(disc_params,
                                         loss_lab +
                                         args.unlabeled_weight * loss_unl,
                                         lr=lr,
                                         mom1=0.5)
    # Averaged copies of the discriminator parameters, starting at zero and
    # moved a factor 0.0001 toward the live parameters on each disc step.
    disc_param_avg = [
        th.shared(np.cast[th.config.floatX](0. * p.get_value()))
        for p in disc_params
    ]
    disc_avg_updates = [(a, a + 0.0001 * (p - a))
                        for p, a in zip(disc_params, disc_param_avg)]
    disc_avg_givens = [(p, a) for p, a in zip(disc_params, disc_param_avg)]
    gen_params = LL.get_all_params(gen_layers[-1], trainable=True)
    gen_param_updates = nn.adam_updates(gen_params, loss_gen, lr=lr, mom1=0.5)
    init_param = th.function(inputs=[x_lab],
                             outputs=None,
                             updates=init_updates)
    train_batch_disc = th.function(inputs=[x_lab, labels, x_unl, lr],
                                   outputs=[loss_lab, loss_unl, train_err],
                                   updates=disc_param_updates +
                                   disc_avg_updates)
    train_batch_gen = th.function(inputs=[x_unl, lr],
                                  outputs=[loss_gen],
                                  updates=gen_param_updates)
    # Testing substitutes the averaged parameters for the live ones.
    test_batch = th.function(inputs=[x_lab, labels],
                             outputs=test_err,
                             givens=disc_avg_givens)

    init_param(trainx[:500])  # data dependent initialization

    # //////////// perform training //////////////
    # From here on lr, loss_lab, loss_unl, train_err and test_err are rebound
    # to plain numbers; the compiled functions keep their symbolic graphs.
    lr = 0.003
    for epoch in range(args.iter_limit):
        begin = time.time()

        # construct randomly permuted minibatches: repeat the labeled subset
        # (freshly shuffled each time) as often as it fits into the unlabeled
        # set. Floor division keeps the count an int on Python 3 as well.
        trainx = []
        trainy = []
        for t in range(trainx_unl.shape[0] // txs.shape[0]):
            inds = rng.permutation(txs.shape[0])
            trainx.append(txs[inds])
            trainy.append(tys[inds])
        trainx = np.concatenate(trainx, axis=0)
        trainy = np.concatenate(trainy, axis=0)
        trainx_unl = trainx_unl[rng.permutation(trainx_unl.shape[0])]
        trainx_unl2 = trainx_unl2[rng.permutation(trainx_unl2.shape[0])]

        # train: one discriminator step followed by one generator step per
        # batch
        loss_lab = 0.
        loss_unl = 0.
        train_err = 0.
        for t in range(nr_batches_train):
            ll, lu, te = train_batch_disc(
                trainx[t * args.batch_size:(t + 1) * args.batch_size],
                trainy[t * args.batch_size:(t + 1) * args.batch_size],
                trainx_unl[t * args.batch_size:(t + 1) * args.batch_size], lr)
            loss_lab += ll
            loss_unl += lu
            train_err += te
            train_batch_gen(
                trainx_unl2[t * args.batch_size:(t + 1) * args.batch_size], lr)
        loss_lab /= nr_batches_train
        loss_unl /= nr_batches_train
        train_err /= nr_batches_train

        # test
        test_err = 0.
        for t in range(nr_batches_test):
            test_err += test_batch(
                testx[t * args.batch_size:(t + 1) * args.batch_size],
                testy[t * args.batch_size:(t + 1) * args.batch_size])
        test_err /= nr_batches_test

        # report
        print(
            "Iteration %d, time = %ds, loss_lab = %.4f, loss_unl = %.4f, train err = %.4f, test err = %.4f"
            % (epoch, time.time() - begin, loss_lab, loss_unl, train_err,
               test_err))
        sys.stdout.flush()