Example #1
    def get_monitoring_channels(self):
        warnings.warn("Layer.get_monitoring_channels is " +
                      "deprecated. Use get_layer_monitoring_channels " +
                      "instead. Layer.get_monitoring_channels " +
                      "will be removed on or after september 24th 2014",
                      stacklevel=2)

        W, = self.transformer.get_params()

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        row_norms_min = row_norms.min()
        row_norms_min.__doc__ = ("The smallest norm of any row of the "
                                 "weight matrix W. This is a measure of the "
                                 "least influence any visible unit has.")

        return OrderedDict([('row_norms_min',  row_norms_min),
                            ('row_norms_mean', row_norms.mean()),
                            ('row_norms_max',  row_norms.max()),
                            ('col_norms_min',  col_norms.min()),
                            ('col_norms_mean', col_norms.mean()),
                            ('col_norms_max',  col_norms.max()), ])
Example #2
def adam(lr, tparams, grads, inp, cost):
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, updates=gsup)

    lr0 = 0.0002
    b1 = 0.1
    b2 = 0.001
    e = 1e-8

    updates = []

    i = theano.shared(numpy.float32(0.))
    i_t = i + 1.
    fix1 = 1. - b1**(i_t)
    fix2 = 1. - b2**(i_t)
    lr_t = lr0 * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore')

    return f_grad_shared, f_update
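The adam function above follows the two-function pattern used by several of these examples: f_grad_shared evaluates the cost and stores the gradients in shared variables, and f_update applies one optimizer step. A minimal sketch of driving it on a toy least-squares model (Python 2, matching the snippet; the model, data, and names below are illustrative assumptions, not from the original code; note that the lr argument of f_update is accepted but unused, since this variant hard-codes lr0):

import numpy
import theano
import theano.tensor as tensor
from collections import OrderedDict

# toy linear model: minimize the mean squared error of x.dot(W) against y
tparams = OrderedDict()
tparams['W'] = theano.shared(numpy.zeros((5, 1), dtype=theano.config.floatX), name='W')
x = tensor.matrix('x')
y = tensor.matrix('y')
cost = tensor.sqr(tensor.dot(x, tparams['W']) - y).mean()
grads = tensor.grad(cost, wrt=list(tparams.values()))
lr = tensor.scalar(name='lr')

f_grad_shared, f_update = adam(lr, tparams, grads, [x, y], cost)
for step in range(100):
    x_val = numpy.random.randn(16, 5).astype(theano.config.floatX)
    y_val = x_val.sum(axis=1, keepdims=True).astype(theano.config.floatX)
    cost_val = f_grad_shared(x_val, y_val)   # forward/backward pass, store grads
    f_update(0.0)                            # apply the Adam step (lr ignored here)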
Example #3
def sgd_updates_adadelta(params,cost,rho=0.95,epsilon=1e-6,norm_lim=9,word_vec_name='Words'):
    """
    adadelta update rule, mostly from
    https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta)
    """
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_ups = OrderedDict({})
    gparams = []
    for param in params:
        empty = numpy.zeros_like(param.get_value())
        exp_sqr_grads[param] = theano.shared(value=as_floatX(empty),name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = theano.shared(value=as_floatX(empty), name="exp_grad_%s" % param.name)
        gparams.append(gp)
    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step =  -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim))
            scale = desired_norms / (1e-7 + col_norms)
            tmp=stepped_param * scale
            tmp=T.cast(tmp,'float32')
            #print param.type,tmp.type
            updates[param] = tmp
        else:
            updates[param] = stepped_param
            #print param.type,stepped_param.type
    return updates 
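The function above returns an OrderedDict of updates that can be passed straight to theano.function. A minimal, self-contained sketch of wiring it up (the toy model, data, and the as_floatX helper below are illustrative assumptions, not part of the original code; it also assumes theano.config.floatX == 'float32', since the function casts the rescaled parameters to float32):

import numpy
import theano
import theano.tensor as T
from collections import OrderedDict

def as_floatX(value):
    # assumed helper used by sgd_updates_adadelta above
    return numpy.asarray(value, dtype=theano.config.floatX)

# toy softmax-regression cost over a single 2-D weight matrix
W = theano.shared(as_floatX(numpy.zeros((10, 2))), name='W')
x = T.matrix('x')
y = T.ivector('y')
p_y = T.nnet.softmax(T.dot(x, W))
cost = -T.mean(T.log(p_y)[T.arange(y.shape[0]), y])

updates = sgd_updates_adadelta([W], cost, rho=0.95, epsilon=1e-6)
train = theano.function([x, y], cost, updates=updates)

x_val = as_floatX(numpy.random.randn(4, 10))
y_val = numpy.array([0, 1, 0, 1], dtype='int32')
print(train(x_val, y_val))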
Example #4
    def get_updates_adadelta(grads,params,decay=0.95):
        decay = constantX(decay)
        print 'build updates with adadelta'
        updates = OrderedDict()
        for param, grad in zip(params, grads):
            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(numpy.zeros(param.get_value().shape, dtype=floatX))
            # mean_square_dx := E[(\Delta x)^2]_{t-1}
            mean_square_dx = sharedX(numpy.zeros(param.get_value().shape, dtype=floatX))
            if param.name is not None:
                mean_square_grad.name = 'mean_square_grad_' + param.name
                mean_square_dx.name = 'mean_square_dx_' + param.name

            # Accumulate gradient
            new_mean_squared_grad = \
                    decay * mean_square_grad +\
                    (1. - decay) * T.sqr(grad)
            # Compute update
            epsilon = constantX(1e-7)
            rms_dx_tm1 = T.sqrt(mean_square_dx + epsilon)
            rms_grad_t = T.sqrt(new_mean_squared_grad + epsilon)
            delta_x_t = - rms_dx_tm1 / rms_grad_t * grad

            # Accumulate updates
            new_mean_square_dx = \
                    decay * mean_square_dx + \
                    (1. - decay) * T.sqr(delta_x_t)

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[mean_square_dx] = new_mean_square_dx
            updates[param] = param + delta_x_t
        return updates
Example #5
  def buildUpdatesSimpleMomentum(self, batchTrainer, momentum,
                  batchLearningRate, error):

    deltaParams = T.grad(error, batchTrainer.params)
    updates = []
    parametersTuples = zip(batchTrainer.params,
                           deltaParams,
                           batchTrainer.oldUpdates,
                           batchTrainer.oldMeanSquare,
                           batchTrainer.hasNormConstraint)

    for param, delta, oldUpdate, oldMeanSquare, hasNormConstraint in parametersTuples:
      paramUpdate = momentum * oldUpdate
      if self.rmsprop:
        meanSquare = 0.9 * oldMeanSquare + 0.1 * delta ** 2
        paramUpdate += - batchLearningRate * delta / T.sqrt(meanSquare + 1e-8)
        updates.append((oldMeanSquare, meanSquare))
      else:
        paramUpdate += - batchLearningRate * delta

      newParam = param + paramUpdate

      if self.normConstraint is not None and hasNormConstraint:
        norms = SquaredElementWiseNorm(newParam)
        rescaled = norms > self.normConstraint
        factors = T.ones(norms.shape, dtype=theanoFloat) / T.sqrt(norms) * np.sqrt(self.normConstraint, dtype='float32') - 1.0
        replaceNewParam = (factors * rescaled) * newParam
        replaceNewParam += newParam
        newParam = replaceNewParam
        # paramUpdate = newParam - param

      updates.append((param, newParam))
      updates.append((oldUpdate, paramUpdate))

    return updates
Example #6
    def generate_forward_diffusion_sample(self, X_noiseless):
        """
        Corrupt a training image with t steps worth of Gaussian noise, and
        return the corrupted image, as well as the mean and covariance of the
        posterior q(x^{t-1}|x^t, x^0).
        """

        X_noiseless = X_noiseless.reshape(
            (-1, self.n_colors, self.spatial_width, self.spatial_width))

        n_images = X_noiseless.shape[0].astype('int16')
        rng = Random().theano_rng
        # choose a timestep in [1, self.trajectory_length-1].
        # note the reverse process is fixed for the very
        # first timestep, so we skip it.
        # TODO for some reason random_integer is missing from the Blocks
        # theano random number generator.
        t = T.floor(rng.uniform(size=(1,1), low=1, high=self.trajectory_length,
            dtype=theano.config.floatX))
        t_weights = self.get_t_weights(t)
        N = rng.normal(size=(n_images, self.n_colors, self.spatial_width, self.spatial_width),
            dtype=theano.config.floatX)

        # noise added this time step
        beta_forward = self.get_beta_forward(t)
        # decay in noise variance due to original signal this step
        alpha_forward = 1. - beta_forward
        # compute total decay in the fraction of the variance due to X_noiseless
        alpha_arr = 1. - self.beta_arr
        alpha_cum_forward_arr = T.extra_ops.cumprod(alpha_arr).reshape((self.trajectory_length,1))
        alpha_cum_forward = T.dot(t_weights.T, alpha_cum_forward_arr)
        # total fraction of the variance due to noise being mixed in
        beta_cumulative = 1. - alpha_cum_forward
        # total fraction of the variance due to noise being mixed in one step ago
        beta_cumulative_prior_step = 1. - alpha_cum_forward/alpha_forward

        # generate the corrupted training data
        X_uniformnoise = X_noiseless + (rng.uniform(size=(n_images, self.n_colors, self.spatial_width, self.spatial_width),
            dtype=theano.config.floatX)-T.constant(0.5,dtype=theano.config.floatX))*T.constant(self.uniform_noise,dtype=theano.config.floatX)
        X_noisy = X_uniformnoise*T.sqrt(alpha_cum_forward) + N*T.sqrt(1. - alpha_cum_forward)

        # compute the mean and covariance of the posterior distribution
        mu1_scl = T.sqrt(alpha_cum_forward / alpha_forward)
        mu2_scl = 1. / T.sqrt(alpha_forward)
        cov1 = 1. - alpha_cum_forward/alpha_forward
        cov2 = beta_forward / alpha_forward
        lam = 1./cov1 + 1./cov2
        mu = (
                X_uniformnoise * mu1_scl / cov1 +
                X_noisy * mu2_scl / cov2
            ) / lam
        sigma = T.sqrt(1./lam)
        sigma = sigma.reshape((1,1,1,1))

        mu.name = 'mu q posterior'
        sigma.name = 'sigma q posterior'
        X_noisy.name = 'X_noisy'
        t.name = 't'

        return X_noisy, t, mu, sigma
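For reference, writing alpha_forward as \alpha_t = 1 - \beta_t and alpha_cum_forward as the cumulative product \bar\alpha_t (selected through t_weights), the quantities computed above are a transcription of the code into math, not an addition to it:

x_t = \sqrt{\bar\alpha_t}\,x_0 + \sqrt{1-\bar\alpha_t}\,N, \qquad N \sim \mathcal{N}(0, I),

\mathrm{cov}_1 = 1 - \bar\alpha_t/\alpha_t, \qquad \mathrm{cov}_2 = \beta_t/\alpha_t, \qquad \lambda = \mathrm{cov}_1^{-1} + \mathrm{cov}_2^{-1},

\mu = \frac{x_0\,\sqrt{\bar\alpha_t/\alpha_t}\,/\,\mathrm{cov}_1 \;+\; x_t/(\sqrt{\alpha_t}\,\mathrm{cov}_2)}{\lambda}, \qquad \sigma = \sqrt{1/\lambda},

where x_0 is the dequantized X_uniformnoise and x_t is X_noisy, i.e. the precision-weighted combination giving the mean and standard deviation of the Gaussian posterior q(x^{t-1} | x^t, x^0).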
Example #7
def adadelta(lr,tparams,grads,x,mask,y,cost):
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    # parameter update pairs (apply the computed Adadelta steps)
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
Example #8
def adadelta(lr, tparams, grads, inp, cost, extra_ups=[], extra_outs=[],
             exclude_params=set([])):
    '''Adadelta'''
    zipped_grads = [theano.shared(p.get_value() * np.float32(0.), name='%s_grad'%k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * np.float32(0.), name='%s_rup2'%k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * np.float32(0.), name='%s_rgrad2'%k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
        for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(
        inp, [cost]+extra_outs, updates=zgup+rg2up+extra_ups, profile=profile)

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
        for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tools.itemlist(tparams), updir)
        if p.name not in exclude_params]

    if not isinstance(lr, list): lr = [lr]
    f_update = theano.function(lr, [], updates=ru2up+param_up,
        on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
Example #9
    def get_mu_sigma(self, X_noisy, t):
        """
        Generate mu and sigma for one step in the reverse trajectory,
        starting from a minibatch of images X_noisy, and at timestep t.
        """
        Z = self.mlp.apply(X_noisy)
        mu_coeff, beta_coeff = self.temporal_readout(Z, t)
        # reverse variance is perturbation around forward variance
        beta_forward = self.get_beta_forward(t)
        # make impact of beta_coeff scaled appropriately with mu_coeff
        beta_coeff_scaled = beta_coeff / np.sqrt(self.trajectory_length).astype(theano.config.floatX)
        beta_reverse = T.nnet.sigmoid(beta_coeff_scaled + util.logit(beta_forward))
        # # reverse mean is decay towards mu_coeff
        # mu = (X_noisy - mu_coeff)*T.sqrt(1. - beta_reverse) + mu_coeff
        # reverse mean is a perturbation around the mean under forward
        # process


        # # DEBUG -- use these lines to test objective is 0 for isotropic Gaussian model
        # beta_reverse = beta_forward
        # mu_coeff = mu_coeff*0


        mu = X_noisy*T.sqrt(1. - beta_forward) + mu_coeff*T.sqrt(beta_forward)
        sigma = T.sqrt(beta_reverse)
        mu.name = 'mu p'
        sigma.name = 'sigma p'
        return mu, sigma
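In the same notation as the forward process above (with T = self.trajectory_length), the reverse-step parameters returned here are, reading directly off the code:

\beta_{\mathrm{reverse}} = \operatorname{sigmoid}\!\big(\beta_{\mathrm{coeff}}/\sqrt{T} + \operatorname{logit}(\beta_t)\big), \qquad
\mu_p = x_t\sqrt{1-\beta_t} + \mu_{\mathrm{coeff}}\sqrt{\beta_t}, \qquad
\sigma_p = \sqrt{\beta_{\mathrm{reverse}}},

so the network outputs mu_coeff and beta_coeff only perturb the fixed forward-process mean and variance.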
Example #10
 def dev_loss(self, dev_types, dev_lams, ss_ratio, y):
     su_mask = ss_ratio * T.neq(y, 0).reshape((y.shape[0], 1))
     un_mask = T.eq(y, 0).reshape((y.shape[0], 1))
     ss_mask = su_mask + un_mask
     var_fun = lambda x1, x2: T.sum(((x1 - x2) * ss_mask)**2.0) / T.sum(ss_mask)
     tanh_fun = lambda x1, x2: var_fun(T.tanh(x1), T.tanh(x2))
     norm_fun = lambda x1, x2: var_fun( \
             (x1 / T.sqrt(T.sum(x1**2.0,axis=1,keepdims=1) + 1e-6)), \
             (x2 / T.sqrt(T.sum(x2**2.0,axis=1,keepdims=1) + 1e-6)))
     sigm_fun = lambda x1, x2: var_fun(T.nnet.sigmoid(x1), T.nnet.sigmoid(x2))
     cent_fun = lambda xt, xo: T.sum(T.nnet.binary_crossentropy( \
             T.nnet.sigmoid(xo), T.nnet.sigmoid(xt))) / xt.shape[0]
     L = 0.0
     for i in xrange(self.layer_count):
         if (i < (self.layer_count - 1)):
             x1 = self.layers[i].output
             x2 = self.drop_nets[0][i].output
         else:
             x1 = self.layers[i].linear_output
             x2 = self.drop_nets[0][i].linear_output
         if (dev_types[i] == 1):
             L = L + (dev_lams[i] * norm_fun(x1, x2))
         elif (dev_types[i] == 2):
             L = L + (dev_lams[i] * tanh_fun(x1, x2))
         elif (dev_types[i] == 3):
             L = L + (dev_lams[i] * sigm_fun(x1, x2))
         elif (dev_types[i] == 4):
             L = L + (dev_lams[i] * cent_fun(x1, x2))
         else:
             L = L + (dev_lams[i] * var_fun(x1, x2))
     return L
Example #11
def batchnorm(X, rescale=None, reshift=None, u=None, s=None, e=1e-8):
    """
    batchnorm with support for not using scale and shift parameters,
    as well as inference-time statistics (u and s);
    will detect and use the convolutional or fully connected version
    """
    g = rescale
    b = reshift
    if X.ndim == 4:
        if u is not None and s is not None:
            # use normalization params given a priori
            b_u = u.dimshuffle('x', 0, 'x', 'x')
            b_s = s.dimshuffle('x', 0, 'x', 'x')
        else:
            # compute normalization params from input
            b_u = T.mean(X, axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
            b_s = T.mean(T.sqr(X - b_u), axis=[0, 2, 3]).dimshuffle('x', 0, 'x', 'x')
        # batch normalize
        X = (X - b_u) / T.sqrt(b_s + e)
        if g is not None and b is not None:
            # apply rescale and reshift
            X = X*T.exp(0.2*g.dimshuffle('x', 0, 'x', 'x')) + b.dimshuffle('x', 0, 'x', 'x')
    elif X.ndim == 2:
        if u is None and s is None:
            # compute normalization params from input
            u = T.mean(X, axis=0)
            s = T.mean(T.sqr(X - u), axis=0)
        # batch normalize
        X = (X - u) / T.sqrt(s + e)
        if g is not None and b is not None:
            # apply rescale and reshift
            X = X*T.exp(0.2*g) + b
    else:
        raise NotImplementedError
    return X
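A minimal sketch of calling the function above on a 4-D activation tensor (the shapes, parameter initialization, and names below are illustrative assumptions):

import numpy
import theano
import theano.tensor as T

n_channels = 16
X = T.tensor4('X')  # (batch, channels, rows, cols)
# per-channel rescale/reshift parameters; rescale acts through exp(0.2*g),
# so zeros give an initial scale of 1 and shift of 0
g = theano.shared(numpy.zeros(n_channels, dtype=theano.config.floatX), name='g')
b = theano.shared(numpy.zeros(n_channels, dtype=theano.config.floatX), name='b')

X_bn = batchnorm(X, rescale=g, reshift=b)   # batch statistics computed from X itself
f = theano.function([X], X_bn)
out = f(numpy.random.randn(8, n_channels, 5, 5).astype(theano.config.floatX))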
Example #12
    def sample_v_given_hs(self, h_sample, s_sample, rng=None, size=None):
        """
        Generates sample from p(v | h, s)
        """
        v_mean = self.v_given_hs(h_sample, s_sample)

        rng = self.theano_rng if rng is None else rng
        size = size if size else self.batch_size
        if self.flags['truncate_v']:
            v_sample = truncated.truncated_normal(
                    size=(size, self.n_v),
                    avg = v_mean, 
                    std = T.sqrt(1./self.lambd_prec),
                    lbound = -self.truncation_bound['v'],
                    ubound = self.truncation_bound['v'],
                    theano_rng = rng,
                    dtype=floatX)
        else:
            v_sample = rng.normal(
                    size=(size, self.n_v),
                    avg = v_mean, 
                    std = T.sqrt(1./self.lambd_prec),
                    dtype=floatX)

        return v_sample
Example #13
def adadelta(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up,
                                    profile=profile)

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr], [], updates=ru2up+param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
Example #14
 def updates(self, cost, params, learning_rate = 0.1, momentum= 0.95, rescale=5.):
     grads = T.grad(cost, params)
     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
     grad_norm = T.sqrt(grad_norm)
     scaling_num = rescale
     scaling_den = T.maximum(rescale, grad_norm)
     # Magic constants
     combination_coeff = 0.9
     minimum_grad = 1e-4
     updates = []
     for n, (param, grad) in enumerate(zip(params, grads)):
         grad = T.switch(not_finite, 0.1 * param,
                         grad * (scaling_num / scaling_den))
         old_square = self.running_square_[n]
         new_square = combination_coeff * old_square + (
             1. - combination_coeff) * T.sqr(grad)
         old_avg = self.running_avg_[n]
         new_avg = combination_coeff * old_avg + (
             1. - combination_coeff) * grad
         rms_grad = T.sqrt(new_square - new_avg ** 2)
         rms_grad = T.maximum(rms_grad, minimum_grad)
         memory = self.memory_[n]
         update = momentum * memory - learning_rate * grad / rms_grad
         update2 = momentum * momentum * memory - (
             1 + momentum) * learning_rate * grad / rms_grad
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((memory, update))
         updates.append((param, param + update2))
     return updates
Example #15
    def sample_s_given_ghv(self, g_sample, h_sample, v_sample, rng=None, size=None):
        """
        Generates sample from p(s | g, h, v)
        """
        s_mean = self.s_given_ghv(g_sample, h_sample, v_sample)
        
        rng = self.theano_rng if rng is None else rng
        size = size if size else self.batch_size

        if self.flags['truncate_s']:
            s_sample = truncated.truncated_normal(
                    size=(size, self.n_s),
                    avg = s_mean, 
                    std = T.sqrt(1./self.alpha_prec),
                    lbound = self.truncation_bound['s'],
                    ubound = self.truncation_bound['s'],
                    theano_rng = rng,
                    dtype=floatX)
        else: 
            s_sample = rng.normal(
                    size=(size, self.n_s),
                    avg = s_mean, 
                    std = T.sqrt(1./self.alpha_prec),
                    dtype=floatX)
        return s_sample
Example #16
def sgd_updates_adadelta(params, cost, rho=0.95, epsilon=1e-6,
        norm_lim=9, word_vec_name='embedding'):
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_ups = OrderedDict({})
    gparams = [] 
    for param in params:
        empty = np.zeros_like(param.get_value())
        exp_sqr_grads[param] = theano.shared(value=as_floatX(empty),name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = theano.shared(value=as_floatX(empty), name="exp_grad_%s" % param.name)
        gparams.append(gp)

    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param] 
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step =  -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        
        if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0)) 
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim)) 
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param
    return updates
Example #17
 def __init__(self, vocab_size, dim, lr=0.5):
     W = np.asarray(np.random.rand(vocab_size, dim),
                    dtype=theano.config.floatX) / float(dim)
     W1 = np.asarray((np.random.rand(vocab_size, dim)),
                     dtype=theano.config.floatX) / float(dim)
     self.W = theano.shared(W, name='W', borrow=True)
     self.W1 = theano.shared(W1, name='W1', borrow=True)
     gW = np.asarray(np.ones((vocab_size, dim)), dtype=theano.config.floatX)
     gW1 = np.asarray(
         np.ones((vocab_size, dim)), dtype=theano.config.floatX)
     self.gW = theano.shared(gW, name='gW', borrow=True)
     self.gW1 = theano.shared(gW1, name='gW1', borrow=True)
     X = T.vector()
     fX = T.vector()
     ind_W = T.ivector()
     ind_W1 = T.ivector()
     w = self.W[ind_W, :]
     w1 = self.W1[ind_W1, :]
     cost = T.sum(fX * ((T.sum(w * w1, axis=1) - X) ** 2))
     grad = T.clip(T.grad(cost, [w, w1]), -5.0, 5.0)
     updates1 = [(self.gW, T.inc_subtensor(self.gW[ind_W, :],
                                           grad[0] ** 2))]
     updates2 = [(self.gW1, T.inc_subtensor(self.gW1[ind_W1, :],
                                            grad[1] ** 2))]
     updates3 = [(self.W, T.inc_subtensor(self.W[ind_W, :],
                                          - (lr / T.sqrt(self.gW[ind_W, :])) *
                                          grad[0]))]
     updates4 = [(self.W1, T.inc_subtensor(self.W1[ind_W1, :],
                                           - (lr / T.sqrt(self.gW1[ind_W1, :])) *
                                           grad[1]))]
     updates = updates1 + updates2 + updates3 + updates4
     self.cost_fn = theano.function(
         inputs=[ind_W, ind_W1, X, fX], outputs=cost, updates=updates)
Example #18
def get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=9, word_vec_name='W_emb'):
    """
    adadelta update rule, mostly from
    https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta)
    """
    print "Generating adadelta updates"
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_ups = OrderedDict({})
    gparams = []
    for param in params:
        exp_sqr_grads[param] = build_shared_zeros(param.shape.eval(), name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = build_shared_zeros(param.shape.eval(), name="exp_grad_%s" % param.name)
        gparams.append(gp)
    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step =  -(T.sqrt(exp_su + eps) / T.sqrt(up_exp_sg + eps)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        # if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
        if max_norm and param.name != word_vec_name:
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(max_norm))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param
    return updates
Example #19
def _get_adadelta_updates(cost, params, rho=0.95, eps=1e-6, max_norm=9, word_vec_name='W_emb'):
  print "Generating adadelta updates (implementation from dnn)"
  # compute list of weights updates
  gparams = T.grad(cost, params)

  accugrads, accudeltas = [], []
  for param in params:
    accugrads.append(build_shared_zeros(param.shape.eval(), 'accugrad'))
    accudeltas.append(build_shared_zeros(param.shape.eval(), 'accudelta'))

  # compute list of weights updates
  updates = OrderedDict()

  for accugrad, accudelta, param, gparam in zip(accugrads, accudeltas, params, gparams):
      # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
      agrad = rho * accugrad + (1 - rho) * gparam * gparam
      dx = - T.sqrt((accudelta + eps) / (agrad + eps)) * gparam
      updates[accudelta] = (rho * accudelta + (1 - rho) * dx * dx)
      if (max_norm > 0) and param.ndim == 2 and param.name != word_vec_name:
          W = param + dx
          col_norms = W.norm(2, axis=0)
          desired_norms = T.clip(col_norms, 0, T.sqrt(max_norm))
          updates[param] = W * (desired_norms / (1e-7 + col_norms))
      else:
          updates[param] = param + dx
      updates[accugrad] = agrad
  return updates
Example #20
def adadelta(parameters, gradients, rho=0.95, eps=1e-6):
    """
        adadelta : training algorithm
    """
    # create variables to store intermediate updates
    gradients_sq = [theano.shared(numpy.zeros(p.get_value().shape,
                                              dtype=theano.config.floatX))
                    for p in parameters]
    deltas_sq = [theano.shared(numpy.zeros(p.get_value().shape,
                                           dtype=theano.config.floatX))
                 for p in parameters]

    # calculates the new "average" squared gradient for the next iteration
    gradients_sq_new = [rho*g_sq + (1-rho)*(g**2)
                        for g_sq, g in izip(gradients_sq, gradients)]

    # calculates the step in each direction. The square root is an
    # approximation to getting the RMS of the average value
    deltas = [(T.sqrt(d_sq+eps)/T.sqrt(g_sq+eps))*grad
              for d_sq, g_sq, grad in izip(deltas_sq, gradients_sq_new, gradients)]

    # calculates the new "average" squared deltas for the next step
    deltas_sq_new = [rho*d_sq + (1-rho)*(d**2) for d_sq, d in izip(deltas_sq, deltas)]

    # prepare everything as a list of (variable, new value) update pairs
    gradient_sq_updates = zip(gradients_sq, gradients_sq_new)
    deltas_sq_updates = zip(deltas_sq, deltas_sq_new)
    parameters_updates = [(p, T.clip(p - d, -15, 15)) for p, d in izip(parameters, deltas)]

    return gradient_sq_updates + deltas_sq_updates + parameters_updates
Example #21
    def __init__(self, incoming, b=lasagne.init.Constant(0.), g=lasagne.init.Constant(1.),
                 W=lasagne.init.Normal(0.05), train_g=False, init_stdv=1., nonlinearity=relu, **kwargs):
        super(WeightNormLayer, self).__init__(incoming, **kwargs)
        self.nonlinearity = nonlinearity
        self.init_stdv = init_stdv
        k = self.input_shape[1]
        if b is not None:
            self.b = self.add_param(b, (k,), name="b", regularizable=False)
        if g is not None:
            self.g = self.add_param(g, (k,), name="g", regularizable=False, trainable=train_g)
        if len(self.input_shape)==4:
            self.axes_to_sum = (0,2,3)
            self.dimshuffle_args = ['x',0,'x','x']
        else:
            self.axes_to_sum = 0
            self.dimshuffle_args = ['x',0]

        # scale weights in layer below
        incoming.W_param = incoming.W
        #incoming.W_param.set_value(W.sample(incoming.W_param.get_value().shape))
        if incoming.W_param.ndim==4:
            if isinstance(incoming, Deconv2DLayer):
                W_axes_to_sum = (0,2,3)
                W_dimshuffle_args = ['x',0,'x','x']
            else:
                W_axes_to_sum = (1,2,3)
                W_dimshuffle_args = [0,'x','x','x']
        else:
            W_axes_to_sum = 0
            W_dimshuffle_args = ['x',0]
        if g is not None:
            incoming.W = incoming.W_param * (self.g/T.sqrt(1e-6 + T.sum(T.square(incoming.W_param),axis=W_axes_to_sum))).dimshuffle(*W_dimshuffle_args)
        else:
            incoming.W = incoming.W_param / T.sqrt(1e-6 + T.sum(T.square(incoming.W_param),axis=W_axes_to_sum,keepdims=True))
Example #22
    def _get_model_updates(self):
        alpha = self.params['optimizer/learning_rate']

        updates = dict()
        for name, param in self.network.params.items():
            gradient = self.params[name + '_gradient']
            ms_gradient = self.params[name + '_mean_sqr_gradient']
            ms_velocity = self.params[name + '_mean_sqr_velocity']
            # rms_velocity quantity lags behind rms_gradient by 1 time step,
            # due to the recurrence relationship for velocity.
            rms_gradient = tensor.sqrt(ms_gradient + self._epsilon)
            rms_velocity = tensor.sqrt(ms_velocity + self._epsilon)
            velocity = -gradient * rms_velocity / rms_gradient
            updates[name] = velocity
        self._normalize(updates)

        result = []
        for name, param in self.network.params.items():
            update = updates[name]
            ms_velocity = self.params[name + '_mean_sqr_velocity']
            ms_velocity_new = self._gamma * ms_velocity + \
                              (1.0 - self._gamma) * tensor.sqr(update)
            param_new = param + alpha * update
            result.append((ms_velocity, ms_velocity_new))
            result.append((param, param_new))
        return result
Example #23
    def ADAMopt(self, tVars, loss, lr, momentum=0):

        i = T.iscalar('i'); lr = T.fscalar('lr');
        grads = T.grad(loss, tVars)
        '''ADAM Code from
            https://github.com/danfischetti/deep-recurrent-attentive-writer/blob/master/DRAW/adam.py
        '''
        # beta1, beta2, epsilon and l (the beta1 decay) are assumed to be
        # defined in the enclosing module, as in the code linked above.
        self.m = [theano.shared(name='m',
                value=np.zeros(param.get_value().shape, dtype=theano.config.floatX)) for param in tVars]
        self.v = [theano.shared(name='v',
                value=np.zeros(param.get_value().shape, dtype=theano.config.floatX)) for param in tVars]
        self.t = theano.shared(name='t', value=np.asarray(1).astype(theano.config.floatX))
        updates = [(self.t, self.t+1)]

        for param, gparam, m, v in zip(tVars, grads, self.m, self.v):

            b1_t = 1-(1-beta1)*(l**(self.t-1))
            m_t = b1_t*gparam + (1-b1_t)*m
            updates.append((m, m_t))
            v_t = beta2*(gparam**2)+(1-beta2)*v
            updates.append((v, v_t))
            m_t_bias = m_t/(1-(1-beta1)**self.t)
            v_t_bias = v_t/(1-(1-beta2)**self.t)
            if param.get_value().ndim == 1:
                updates.append((param, param - 5*lr*m_t_bias/(T.sqrt(v_t_bias)+epsilon)))
            else:
                updates.append((param, param - lr*m_t_bias/(T.sqrt(v_t_bias)+epsilon)))

        # lr is a fresh symbolic scalar, so it must be an input of the
        # compiled function for the updates to be computable
        return theano.function([lr], loss, updates=updates)
Example #24
    def get_updates(self, params, loss):
        grads = self.get_gradients(loss, params)
        accumulators = [shared_zeros(p.get_value().shape) for p in params]
        delta_accumulators = [shared_zeros(p.get_value().shape) for p in params]
        self.updates = []
        n_step = theano.shared(1.0)
        self.updates.append((n_step, n_step + 1))

        for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
            g_noise = self.rng.normal(p.shape, 0, T.sqrt(n_step ** - 0.55), dtype='float32')
            g_deviated = g + g_noise

            new_a = self.rho * a + (1 - self.rho) * g_deviated ** 2  # update accumulator
            self.updates.append((a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g_deviated * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a +
                                                             self.epsilon)

            new_p = p - self.lr * update
            self.updates.append((p, new_p))

            # update delta_accumulator
            new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2
            self.updates.append((d_a, new_d_a))
        return self.updates
Example #25
    def get_updates(self, params, loss, **kwargs):
        grads = self.get_gradients(loss, params, **kwargs)
        self.updates = [(self.iterations, self.iterations+1.)]

        t = self.iterations + 1
        lr_t = self.lr * T.sqrt(1-self.beta_2**t)/(1-self.beta_1**t)

        # n_step = theano.shared(1.0)
        # self.updates.append((n_step, n_step + 1))

        gradients = []

        for p, g in zip(params, grads):
            m = theano.shared(p.get_value() * 0.)  # zero init of moment
            v = theano.shared(p.get_value() * 0.)  # zero init of velocity

            # g_noise = self.rng.normal(g.shape, 0, T.sqrt(0.5 * n_step ** - 0.55), dtype='float32')
            # g_deviated = g + g_noise
            g_deviated = g

            # for debug purposes
            gradients.append(g)

            m_t = (self.beta_1 * m) + (1 - self.beta_1) * g_deviated
            v_t = (self.beta_2 * v) + (1 - self.beta_2) * (g_deviated**2)
            p_t = p - lr_t * m_t / (T.sqrt(v_t) + self.epsilon)

            self.updates.append((m, m_t))
            self.updates.append((v, v_t))
            self.updates.append((p, p_t))  # apply constraints
        return self.updates, gradients
Example #26
def AdadeltaUpdate(params,cost,stepSize=1.0,rho=0.95,epsilon=1e-6,norm_lim=9):
    updates=OrderedDict({})
    exp_sqr_grads=OrderedDict({})
    exp_sqr_update=OrderedDict({})
    g_params=[]
    for param in params:
        empty=np.zeros_like(param.get_value())
        exp_sqr_grads[param]=theano.shared(value=as_floatX(empty),name='exp_grad_%s'%param.name)
        exp_sqr_update[param]=theano.shared(value=as_floatX(empty),name='exp_grad_%s'%param.name)
        gp=T.grad(cost,param)
        g_params.append(gp)
    for param,gp in zip(params,g_params):
        exp_sg=exp_sqr_grads[param]
        exp_su=exp_sqr_update[param]
        update_exp_sg=rho*exp_sg+(1-rho)*T.sqr(gp)  # accumulate E[g^2]
        updates[exp_sg]=update_exp_sg
        
        step=-(T.sqrt(exp_su+epsilon)/T.sqrt(update_exp_sg+epsilon))*gp
        stepped_param=param+step*stepSize
        
        update_exp_su=rho*exp_su+(1-rho)*T.sqr(step)
        updates[exp_su]=update_exp_su

        if param.get_value(borrow=True).ndim==2 and param.name!='wordVec':
            col_norms=T.sqrt(T.sum(T.sqr(stepped_param),axis=0))
            desired_norms=T.clip(col_norms,0,T.sqrt(norm_lim))  # cap column norms at sqrt(norm_lim)
            scale=desired_norms/(1e-7+col_norms)
            updates[param]=stepped_param*scale
        else:
            updates[param]=stepped_param
    return updates
Example #27
    def forward(self,input_org,train=True,update_batch_stat=True,finetune=False):
        print "Layer/BatchNormalization"
        ldim,cdim,rdim = self._internal_shape(input_org)
        input = input_org.reshape((ldim,cdim,rdim))
        if (train):
            mean = T.mean(input, axis=(0, 2), keepdims=True )
            var = T.mean((input-mean)**2, axis=(0, 2), keepdims=True)

            if(update_batch_stat):
                finetune_N = theano.clone(self.finetune_N, share_inputs=False)
                if(finetune):
                    finetune_N.default_update = finetune_N+1
                    ratio = T.cast(1-1.0/(finetune_N+1),theano.config.floatX)
                else:
                    finetune_N.default_update = 0
                    ratio = self.moving_avg_ratio
                m = ldim*rdim
                scale = T.cast(m/(m-1.0),theano.config.floatX)
                est_mean = theano.clone(self.est_mean, share_inputs=False)
                est_var = theano.clone(self.est_var, share_inputs=False)
                est_mean.default_update = T.cast(ratio*self.est_mean + (1-ratio)*mean,theano.config.floatX)
                est_var.default_update = T.cast(ratio*self.est_var + (1-ratio)*scale*var,theano.config.floatX)
                mean += 0 * est_mean
                var += 0 * est_var
            output = self._pbc(self.gamma) * (input - self._pbc(mean)) \
                     / T.sqrt(1e-6+self._pbc(var)) + self._pbc(self.beta)

        else:
            output = self._pbc(self.gamma) * (input - self._pbc(self.est_mean)) \
                     / T.sqrt(1e-6+self._pbc(self.est_var)) + self._pbc(self.beta)

        return output.reshape(input_org.shape)
Example #28
 def applyConstraint(self, param):
     if param.ndim != 4 and param.ndim != 2:
         warnings.warn("Norm constraints are normally applied to matrices"
                       +" or 4-dimensional tensors, but currently got "
                       +"%d dimensions, please make sure this is the desired"
                       +" parameter to apply norm constraints" % param.ndim)
         
     needFlip = False
     if param.ndim == 4: # a hack for conv layer filters
         prevShape = param.shape
         # conv layer filter shape is (nChannelOut, nChannelIn, r, c)
         param = param.flatten(2)
         # now it is (nout, nin), which is different from (nin, nout) 
         # from fully connected networks, so need to flip here
         needFlip = True
     
     if needFlip:
         col_norm = T.sqrt(T.sum(T.sqr(param), axis=1, keepdims=True))
     else:
         col_norm = T.sqrt(T.sum(T.sqr(param), axis=0, keepdims=True))
         
     param /= (col_norm+1e-7)
     param *= self.norm
     
     if needFlip:
         param = param.reshape(prevShape)
                     
     return param
Example #29
def make_functions(inputs,outputs,params,grads,lr):
	shapes = [ p.get_value().shape for p in params ]
	acc_grads = [ theano.shared(np.zeros(s,dtype=np.float32)) for s in shapes ]
	count = theano.shared(np.float32(0))
	acc_update = [ (a,a+g) for a,g in zip(acc_grads,grads) ] + [ (count,count + 1.) ]

#	deltas = acc_grads
	deltas	  = [ ag / count for ag in acc_grads ]
	grads_norms = [ T.sqrt(T.sum(g**2)) for g in deltas ]
	deltas	  = [ T.switch(T.gt(n,1.),1.*g/n,g) for n,g in zip(grads_norms,deltas) ]
	
#	param_update = [ (p, p - lr * g) for p,g in zip(params,deltas) ]
	param_update = updates.adadelta(params,deltas,learning_rate=lr) # ,learning_rate=lr,rho=np.float32(0.95)

	clear_update = [ 
			(a,np.zeros(s,dtype=np.float32)) 
			for a,s in zip(acc_grads,shapes) 
			] + [ (count,0) ]
	acc = theano.function(
			inputs  = inputs,
			outputs = [outputs,output_ans[ans_lbl]],
			updates = acc_update,
			on_unused_input='warn',
#			mode=theano.compile.MonitorMode(post_func=detect_nan)
		)
	update = theano.function(
			inputs=[lr],
			updates = param_update + clear_update,
			outputs = [ T.sqrt(T.sum(T.sqr(w))) for w in deltas ],
			on_unused_input='warn',
#			mode=theano.compile.MonitorMode(post_func=detect_nan)
		)
	return acc,update
Example #30
    def get_updates(self, cost, params):
        grads = self.get_gradients(cost, params)
        updates = []
        if self.i is None:
            self.i = sharedasarray(0)
        updates.append((self.i, self.i+1))

        t = self.i+1
        lr_t = self.lr * T.sqrt(1-self.beta2**t) / (1-self.beta1**t)
        eps_hat = self.eps * T.sqrt(1-self.beta2**t)
        if self.ms is None:
            self.ms = [sharedzeros(p.get_value().shape) for p in params]
        if self.vs is None:
            self.vs = [sharedzeros(p.get_value().shape) for p in params]

        for p, g, m, v in zip(params, grads, self.ms, self.vs):
            m_t = (self.beta1*m) + (1.-self.beta1)*g
            v_t = (self.beta2*v) + (1.-self.beta2)*(g**2)
            p_t = p - lr_t*m_t/(T.sqrt(v_t)+eps_hat)

            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))

        return updates
Example #31
def quantization(W,Wacc,method, Wb):
	
	if method == "FPN":
		Wb = W
	
	elif method == "LAB":
		L = (T.sqrt(Wacc) + 1e-8) 
		Wb = hard_sigmoid(W)
		Wb = round3(Wb)
		Wb = T.cast(T.switch(Wb,1.,-1.), theano.config.floatX) 

		alpha  = (T.abs_(L*W).sum()/L.sum()).astype('float32') 
		Wb = alpha*Wb	

	elif method=="LATa":
		D = (T.sqrt(Wacc) + 1e-8) 
		b = T.sgn(Wb)
		# compute the threshold, converge within 10 iterations 
		alpha  = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') 
		b = T.switch(T.gt(W/alpha, 0.5), 1., T.switch(T.lt(W/alpha, -0.5), -1., 0.) )
		def OneStep(alpha, b):
			# minimize alpha
			alpha_new  = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') 
			# minimize b
			b_new = T.switch(T.gt(W/alpha_new, 0.5), 1., T.switch(T.lt(W/alpha_new, -0.5), -1., 0.))
			delta = T.abs_(alpha_new-alpha)
			condition = T.lt(delta, 1e-6)
			return [alpha_new, b_new], theano.scan_module.until(condition)

		[out1, out2], updates = theano.scan(fn=OneStep ,outputs_info=[alpha, b],n_steps=10) 
		Wb  = out1[-1]*out2[-1]

	elif method=="LATe":
		D = (T.sqrt(Wacc) + 1e-8) 
		thres = findalpha(D, W)
		alpha = thres*2
		Wt = T.switch(T.gt(W, thres), 1., T.switch(T.lt(W, -thres), -1., 0.) )
		Wb = alpha*Wt

	elif method=="LAT2e":
		D = (T.sqrt(Wacc) + 1e-8) 
		thres1, thres2 = findalpha2(D, W)
		alpha1 = thres1*2
		Wt1 = T.switch(T.gt(W, thres1), 1., 0.) 
		alpha2 = thres2*2
		Wt2 = T.switch(T.lt(W, -thres2), -1., 0.) 

		Wb = alpha1*Wt1 + alpha2*Wt2	

	elif method=="LAT2a":
		D = (T.sqrt(Wacc) + 1e-8) 
		b1 = T.ge(Wb,0)
		alpha1 = (T.abs_(b1*D*W).sum()/T.abs_(b1*D).sum()).astype('float32') 
		b1 = T.switch(T.gt(W/alpha1, 0.5), 1., 0.)
		# Wb1 = alpha1*mask1*Wb
		b2 =  T.lt(Wb,0)
		alpha2 = (T.abs_(b2*D*W).sum()/T.abs_(b2*D).sum()).astype('float32') 
		b2 = T.switch(T.lt(W/alpha2, -0.5), -1., 0.)
		def OneStep(alpha1, b1, alpha2, b2):
			alpha1_new  = (T.abs_(b1*D*W).sum()/T.abs_(b1*D).sum()).astype('float32') 
			b1_new = T.switch(T.gt(W/alpha1_new, 0.5), 1., 0.)
			alpha2_new = (T.abs_(b2*D*W).sum()/T.abs_(b2*D).sum()).astype('float32') 
			b2_new = T.switch(T.lt(W/alpha2_new, -0.5), -1., 0.)

			delta1 = T.abs_(alpha1_new-alpha1)
			delta2 = T.abs_(alpha2_new-alpha2)
			condition = T.and_(T.lt(delta1, 1e-6), T.lt(delta2, 1e-6))
			return [alpha1_new, b1_new, alpha2_new, b2_new], theano.scan_module.until(condition)

		[out1, out2, out3, out4], updates = theano.scan(fn=OneStep ,outputs_info=[alpha1, b1, alpha2, b2],n_steps=10)
		Wb  = out1[-1]*out2[-1] + out3[-1]*out4[-1]	
								  

	elif method=="LAQ_linear":
		D = (T.sqrt(Wacc) + 1e-8) 
		b = T.sgn(Wb)
		alpha  = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') 
		# b = T.switch(T.gt(W/alpha, 0.5), 1., T.switch(T.lt(W/alpha, -0.5), -1., 0.) )
		m = 3 # number of bits
		n = 2**(m-1)-1

		b = round3(T.clip(W/alpha, -1., 1.)*n)/(n)
		def OneStep(alpha, b):
			# minimize alpha
			alpha_new  = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') 
			# minimize b
			# b_new = T.switch(T.gt(W/alpha, 0.5), 1., T.switch(T.lt(W/alpha, -0.5), -1., 0.))
			b_new = round3(T.clip(W/alpha_new, -1., 1.)*n)/(n)
			delta = T.abs_(alpha_new-alpha)
			condition = T.lt(delta, 1e-6)
			return [alpha_new, b_new], theano.scan_module.until(condition)

		[out1, out2], updates = theano.scan(fn=OneStep ,outputs_info=[alpha, b],n_steps=10)
		Wb  = out1[-1]*out2[-1]		

	elif method=="LAQ_log":
		D = (T.sqrt(Wacc) + 1e-8) 
		b = T.sgn(Wb)
		alpha  = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') 
		m = 3  # number of bits
		n = 2**(m-1)-1
		tmp = T.clip(W/alpha, -1., 1.)
		# log2(1/2*(2^(-n)+2^(-(n+1)))) - (-n-(n+1))/2 = 0.0849625
		b =  T.switch( T.ge(tmp, pow(2, -n)), T.pow(2, round3(T.log2(tmp)-0.0849625)), 
			T.switch( T.le(tmp, -pow(2,-n)), -T.pow(2, round3(T.log2(-tmp)-0.0849625)), 0.))
		b = T.switch(T.ge(b, pow(2, - (n-1))), b, T.switch(T.le(b, -pow(2, -(n-1))), b, T.sgn(b)*pow(2,-(n-1))))

		def OneStep(alpha, b):
			# minimize alpha
			alpha_new  = (T.abs_(b*D*W).sum()/T.abs_(b*D).sum()).astype('float32') 
			# minimize b
			tmp_new = T.clip(W/alpha_new, -1., 1.)
			b_new =  T.switch( T.ge(tmp_new, pow(2, -n)), T.pow(2, round3(T.log2(tmp_new)-0.0849625)), 
				T.switch( T.le(tmp_new, -pow(2, -n)), -T.pow(2, round3(T.log2(-tmp_new)-0.0849625)), 0.))		
			b_new = T.switch(T.ge(b_new, pow(2, - (n-1))), b_new, 
				T.switch(T.le(b_new, -pow(2, -(n-1))), b_new, T.sgn(b_new)*pow(2, -(n-1))))
		
			delta = T.abs_(alpha_new-alpha)
			condition = T.lt(delta, 1e-6)
			return [alpha_new, b_new], theano.scan_module.until(condition)

		[out1, out2], updates = theano.scan(fn=OneStep ,outputs_info=[alpha, b],n_steps=10)
		Wb  = out1[-1]*out2[-1]	

	return Wb
Example #32
def std_cdf(x):
    """
    Calculates the standard normal cumulative distribution function.
    """
    return 0.5 + 0.5 * erf(x / sqrt(2.))
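For reference, this is the standard identity expressing the normal CDF through the error function:

\Phi(x) \;=\; \frac{1}{\sqrt{2\pi}} \int_{-\infty}^{x} e^{-t^2/2}\, dt \;=\; \frac{1}{2}\left[1 + \operatorname{erf}\!\left(\frac{x}{\sqrt{2}}\right)\right].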
Example #33
def create_optimization_updates(cost,
                                params,
                                updates=None,
                                max_norm=5.0,
                                lr=0.01,
                                eps=1e-6,
                                rho=0.95,
                                method="adadelta",
                                gradients=None):
    """
    Get the updates for a gradient descent optimizer using
    SGD, AdaDelta, or AdaGrad.
    Returns the shared variables for the gradient caches,
    and the updates dictionary for compilation by a
    theano function.
    Inputs
    ------
    cost     theano variable : what to minimize
    params   list            : list of theano variables
                               with respect to which
                               the gradient is taken.
    max_norm float           : cap on excess gradients
    lr       float           : base learning rate for
                               adagrad and SGD
    eps      float           : numerical stability value
                               to not divide by zero
                               sometimes
    rho      float           : adadelta hyperparameter.
    method   str             : 'adagrad', 'adadelta', or 'sgd'.
    Outputs:
    --------
    updates  OrderedDict   : the updates to pass to a
                             theano function
    gsums    list          : gradient caches for Adagrad
                             and Adadelta
    xsums    list          : gradient caches for AdaDelta only
    lr       theano shared : learning rate
    max_norm theano_shared : normalizing clipping value for
                             excessive gradients (exploding).
    """
    lr = theano.shared(np.float64(lr).astype(theano.config.floatX))
    eps = np.float64(eps).astype(theano.config.floatX)
    rho = theano.shared(np.float64(rho).astype(theano.config.floatX))
    if max_norm is not None and max_norm is not False:
        max_norm = theano.shared(
            np.float64(max_norm).astype(theano.config.floatX))

    gsums = [
        theano.shared(np.zeros_like(param.get_value(borrow=True))) if
        (method == 'adadelta' or method == 'adagrad') else None
        for param in params
    ]
    xsums = [
        theano.shared(np.zeros_like(param.get_value(
            borrow=True))) if method == 'adadelta' else None
        for param in params
    ]

    gparams = T.grad(cost, params) if gradients is None else gradients

    if updates is None:
        updates = OrderedDict()

    for gparam, param, gsum, xsum in zip(gparams, params, gsums, xsums):
        # clip gradients if they get too big
        if max_norm is not None and max_norm is not False:
            grad_norm = gparam.norm(L=2)
            gparam = (T.minimum(max_norm, grad_norm) /
                      (grad_norm + eps)) * gparam

        if method == 'adadelta':
            updates[gsum] = T.cast(rho * gsum + (1. - rho) * (gparam**2),
                                   theano.config.floatX)
            dparam = -T.sqrt((xsum + eps) / (updates[gsum] + eps)) * gparam
            updates[xsum] = T.cast(rho * xsum + (1. - rho) * (dparam**2),
                                   theano.config.floatX)
            updates[param] = T.cast(param + dparam, theano.config.floatX)
        elif method == 'adagrad':
            updates[gsum] = T.cast(gsum + (gparam**2), theano.config.floatX)
            updates[param] = T.cast(
                param - lr * (gparam / (T.sqrt(updates[gsum] + eps))),
                theano.config.floatX)
        else:
            updates[param] = param - gparam * lr

    if method == 'adadelta':
        lr = rho

    return updates, gsums, xsums, lr, max_norm
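A minimal sketch of compiling a training step from the returned updates (the toy softmax model and data below are illustrative assumptions, not part of the original code):

import numpy as np
import theano
import theano.tensor as T
from collections import OrderedDict

W = theano.shared(np.zeros((20, 4), dtype=theano.config.floatX), name='W')
x = T.matrix('x')
y = T.ivector('y')
cost = -T.mean(T.log(T.nnet.softmax(T.dot(x, W)))[T.arange(y.shape[0]), y])

updates, gsums, xsums, lr, max_norm = create_optimization_updates(
    cost, [W], method="adadelta", max_norm=5.0)
train_step = theano.function([x, y], cost, updates=updates)

x_val = np.random.randn(8, 20).astype(theano.config.floatX)
y_val = np.random.randint(0, 4, size=8).astype('int32')
print(train_step(x_val, y_val))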
Example #34
def test_net(
        classifier,
        num_classes,
        learning_rate,
        learning_rate_decay,
        squared_filter_length_limit,
        n_epochs,
        timeout,
        batch_size,
        x,
        mom_params,
        dropout,
        results_file_name,
        dataset,
        use_bias,
        random_seed,
        decay=True,
        momentum=True,
        L2=True,
        plot = False,
        return_classifier = False,
        augment_schedule = []):

    #[(train_set_x, train_set_y), (valid_set_x, valid_set_y), (test_set_x, test_set_y)] = dataset
    [(train_set_x, train_set_y),(valid_set_x, valid_set_y),(test_set_x, test_set_y)] = dataset
    
    # extract the params for momentum
    mom_start = mom_params["start"]
    mom_end = mom_params["end"]
    mom_epoch_interval = mom_params["interval"]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    #n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################

    print '... building the model'
    index = T.lscalar()    # index to a [mini]batch
    epoch = T.scalar()
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels
    rng = np.random.RandomState(random_seed)

    # Build the expression for the cost function.
    if L2:
        lamb = 0.00000001
        cost = classifier.negative_log_likelihood(y)
        dropout_cost = classifier.dropout_negative_log_likelihood(y)
        if use_bias:
            cost += lamb * sum([(classifier.params[i]**2).sum() for i in range(0,len(classifier.params),2)]) / (2 * batch_size)
            dropout_cost += lamb * sum([(classifier.params[i]**2).sum() for i in range(0,len(classifier.params),2)]) / (2 * batch_size)
        else:
            cost += lamb * sum([(param**2).sum() for param in classifier.params]) / (2 * batch_size)
            dropout_cost += lamb * sum([(param**2).sum() for param in classifier.params]) / (2 * batch_size)
    else:
        cost = classifier.negative_log_likelihood(y)
        dropout_cost = classifier.dropout_negative_log_likelihood(y)

    # Compile theano function for testing.
    test_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]},
            on_unused_input='ignore')

    softmax_predictions = theano.function(inputs=[index],
            outputs=classifier.p_y_given_x_(),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]},
            on_unused_input='ignore')

    test_labels = theano.function(inputs=[index],
            outputs=test_set_y[index * batch_size:(index + 1) * batch_size])

    #theano.printing.pydotprint(test_model, outfile="test_file.png",
    #        var_with_name_simple=True)

    # Compile theano function for validation.
    #validate_model = theano.function(inputs=[index],
    #        outputs=classifier.errors(y),
    #        givens={
    #            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
    #            y: valid_set_y[index * batch_size:(index + 1) * batch_size]},
    #        on_unused_input='ignore')
    #theano.printing.pydotprint(validate_model, outfile="validate_file.png",
    #        var_with_name_simple=True)

    # Compute gradients of the model wrt parameters
    gparams = []
    for param in classifier.params:
        # Use the right cost function here to train with or without dropout.
        gparam = T.grad(dropout_cost if dropout else cost, param)
        gparams.append(gparam)

    if momentum:
        print >> sys.stderr, ("Using momentum")
        # ... and allocate memory for momentum'd versions of the gradient
        gparams_mom = []
        for param in classifier.params:
            gparam_mom = theano.shared(np.zeros(param.get_value(borrow=True).shape,
                dtype=theano.config.floatX))
            gparams_mom.append(gparam_mom)
    
        # Compute momentum for the current epoch
        mom = ifelse(epoch < mom_epoch_interval,
                mom_start*(1.0 - epoch/mom_epoch_interval) + mom_end*(epoch/mom_epoch_interval),
                mom_end)
    
        # Update the step direction using momentum
        updates = OrderedDict()
        for gparam_mom, gparam in zip(gparams_mom, gparams):
            # Misha Denil's original version
            #updates[gparam_mom] = mom * gparam_mom + (1. - mom) * gparam
          
            # change the update rule to match Hinton's dropout paper
            updates[gparam_mom] = mom * gparam_mom - (1. - mom) * learning_rate * gparam
    
        # ... and take a step along that direction
        for param, gparam_mom in zip(classifier.params, gparams_mom):
            # Misha Denil's original version
            #stepped_param = param - learning_rate * updates[gparam_mom]
            
            # since we have included learning_rate in gparam_mom, we don't need it
            # here
            stepped_param = param + updates[gparam_mom]
    
            # This is a silly hack to constrain the norms of the rows of the weight
            # matrices.  This just checks if there are two dimensions to the
            # parameter and constrains it if so... maybe this is a bit silly but it
            # should work for now.
            if param.get_value(borrow=True).ndim == 2:
                #squared_norms = T.sum(stepped_param**2, axis=1).reshape((stepped_param.shape[0],1))
                #scale = T.clip(T.sqrt(squared_filter_length_limit / squared_norms), 0., 1.)
                #updates[param] = stepped_param * scale
                
                # constrain the norms of the COLUMNs of the weight, according to
                # https://github.com/BVLC/caffe/issues/109
                col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
                desired_norms = T.clip(col_norms, 0, T.sqrt(squared_filter_length_limit))
                scale = desired_norms / (1e-7 + col_norms)
                updates[param] = stepped_param * scale
            else:
                updates[param] = stepped_param
    else:
        if L2:
            print >> sys.stderr, ("Using gradient descent with L2 regularization")
            updates = [
                (param_i, param_i - learning_rate * (grad_i + lamb * param_i / batch_size))
                for param_i, grad_i in zip(classifier.params, gparams)
            ]
        else:
            print >> sys.stderr, ("Using gradient descent")
            updates = [
                (param_i, param_i - learning_rate * grad_i)
                for param_i, grad_i in zip(classifier.params, gparams)
            ]
    # Compile theano function for training.  This returns the training cost and
    # updates the model parameters.
    output = dropout_cost if dropout else cost
    train_model = theano.function(inputs=[epoch,index], outputs=output,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]},
            on_unused_input='ignore')
    #theano.printing.pydotprint(train_model, outfile="train_file.png",
    #        var_with_name_simple=True)

    # Theano function to decay the learning rate, this is separate from the
    # training function because we only want to do this once each epoch instead
    # of after each minibatch.
    decay_learning_rate = theano.function(inputs=[], outputs=learning_rate,
            updates={learning_rate: learning_rate * learning_rate_decay})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    best_test_error = np.inf
    best_test_score = np.inf
    best_iter_test = 0
    test_score = 0.
    epoch_counter = 0
    start_time = time.clock()

    plot_training = []
    plot_test = []
    best_model = None # saves the best model to be returned by function.
    #layer0_weights = None

    train_set_x_backup = train_set_x.get_value()
    while epoch_counter < n_epochs:
        # Train this epoch
        # augment the images
        if len(augment_schedule) > 0:
            opp = augment_schedule[epoch_counter % len(augment_schedule)]
            train_set_x_augment = copy.deepcopy(train_set_x_backup)
            print 'augmenting epoch with operation:', opp
            for i in range(len(train_set_x_augment)):
                # augment even images on even epochs and odd images on odd epochs.
                if (epoch_counter % 2 == 0 and i % 2 == 0) or (epoch_counter % 2 == 1 and i % 2 == 1): 
                    img = train_set_x_augment[i]
                    img = (img*256.0).astype(dtype=np.uint8)
                    if img.shape[2] == 1: # if it's a one channel image
                        img = np.reshape(img, (img.shape[0], img.shape[1]))
                    img = augment(img, opp)
                    img = img.astype(dtype=np.float32)/256.0
                    if len(img.shape) == 2:
                        img = np.reshape(img, (img.shape[0], img.shape[1], 1))
                    train_set_x_augment[i] = img
            train_set_x.set_value(train_set_x_augment)
        epoch_counter = epoch_counter + 1
        minibatch_avg_cost = 0
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost += train_model(epoch_counter, minibatch_index)
        plot_training.append(minibatch_avg_cost/n_train_batches)
        test_losses = [test_model(i) for i in xrange(n_test_batches)]
        this_test_error = np.mean(test_losses)
        plot_test.append(this_test_error)
        print "epoch {}, test error {}%, train error {}, learning_rate={}{}".format(
               epoch_counter, this_test_error*100.0, plot_training[-1],
               learning_rate.get_value(borrow=True),
               " **" if this_test_error < best_test_error else ""
               )
        #print 'predictions', test_softmax_predictions
        if this_test_error < best_test_error:
            best_test_error = this_test_error
            best_iter_test = epoch_counter
            test_softmax_predictions = [softmax_predictions(i) for i in xrange(n_test_batches)]
            test_labels_ = [test_labels(i) for i in xrange(n_test_batches)]
            #best_model_ = [param.get_value() for param in classifier.params]
            #best_model = cPickle.dumps(best_model_, protocol=cPickle.HIGHEST_PROTOCOL) # doesn't work TODO TODO
            if return_classifier:
                best_model = cPickle.dumps(classifier.params, protocol=cPickle.HIGHEST_PROTOCOL)
            # TODO extract filter images.
            #layer0_weights = classifier.layer0.W.get_value()
        if (timeout is not None) and (epoch_counter - best_iter_test >= timeout):
            break
        if decay:
            new_learning_rate = decay_learning_rate()
    end_time = time.clock()
    print >> sys.stderr, (('Optimization complete. Best test score of %f %% '
           'obtained at epoch %i') %
           (best_test_error * 100., best_iter_test))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    preds, lbls, cmc, roc = get_cmc_roc_data(test_softmax_predictions, test_labels_)
    test_set = test_set_x.get_value()
    misses = []
    hits = []
    for sample in range(len(preds)):
        pred = preds[sample]
        pred_ = zip(pred, range(len(pred)))
        pred_.sort(reverse=True)
        prediction = pred_[0][1]
        lbl = lbls[sample]
        if prediction != lbl:
            misses.append(test_set[sample])
        else:
            hits.append(test_set[sample])
    #for idx, img in enumerate(d):
    #    img_ = (img*256.0).astype(dtype=np.uint8)
    #    cv2.imwrite('missed_img'+str(idx)+'.jpg', img_)
    if plot:
        plot_cmc(cmc)
        plot_roc(roc)
        plot_training_error(plot_training, epoch_counter)
        plot_testing_error(plot_test, epoch_counter)
    return (best_model, hits, misses, roc, cmc, preds, lbls, plot_training, plot_test)
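The momentum branch above renormalizes the columns of every 2-D weight matrix after each step, following the linked Caffe discussion. A minimal NumPy sketch of that rescaling, with a hypothetical squared_limit standing in for squared_filter_length_limit:

import numpy as np

def clip_column_norms(W, squared_limit=15.0, eps=1e-7):
    col_norms = np.sqrt((W ** 2).sum(axis=0))              # one L2 norm per column
    desired = np.clip(col_norms, 0.0, np.sqrt(squared_limit))
    return W * (desired / (eps + col_norms))               # per-column scale factor <= 1

W = np.random.randn(64, 10) * 3.0
W = clip_column_norms(W)
assert np.all(np.sqrt((W ** 2).sum(axis=0)) <= np.sqrt(15.0) + 1e-6)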
Example #35
0
 def batch_norm(self,
                h,
                dim,
                use_shift=True,
                use_std=True,
                use_sample=0.0,
                force_sample=False,
                index=None,
                sample_mean=None,
                gamma=None,
                beta=None):
     x = h
     if h.ndim == 3:
         if index is None: index = self.index
         x = h.reshape((h.shape[0] * h.shape[1],
                        h.shape[2]))[(index.flatten() > 0).nonzero()]
     elif h.ndim == 4:  # index is sizes here
         assert index is not None
         x = h.reshape((h.shape[0] * h.shape[1] * h.shape[2], h.shape[3]))
         #x = x[(T.gt(x,numpy.float32(0))>0).nonzero()]
     mean = T.mean(x, axis=0)
     std = T.sqrt(T.mean((x - mean)**2, axis=0))
     if sample_mean is None:
         sample_mean = self.add_param(theano.shared(
             numpy.zeros((dim, ), 'float32'),
             '%s_%s_mean' % (self.name, h.name)),
                                      custom_update=mean,
                                      custom_update_normalized=True)
     self.sample_mean = sample_mean
     sample_std = T.sqrt(T.mean((x - sample_mean)**2, axis=0))
     if not self.train_flag and not force_sample:
         use_sample = 1.0
     mean = T.constant(1. - use_sample, 'float32') * mean + T.constant(
         use_sample, 'float32') * sample_mean
     std = T.constant(1. - use_sample, 'float32') * std + T.constant(
         use_sample, 'float32') * sample_std
     if h.ndim == 3:
         mean = mean.dimshuffle('x', 'x',
                                0).repeat(h.shape[0],
                                          axis=0).repeat(h.shape[1], axis=1)
         std = std.dimshuffle('x', 'x', 0).repeat(h.shape[0],
                                                  axis=0).repeat(h.shape[1],
                                                                 axis=1)
     elif h.ndim == 4:
         mean = mean.dimshuffle('x', 'x', 'x', 0).repeat(
             h.shape[0], axis=0).repeat(h.shape[1],
                                        axis=1).repeat(h.shape[2], axis=2)
         std = std.dimshuffle('x', 'x', 'x', 0).repeat(
             h.shape[0], axis=0).repeat(h.shape[1],
                                        axis=1).repeat(h.shape[2], axis=2)
     else:
         mean = mean.dimshuffle('x', 0).repeat(h.shape[0], axis=0)
         std = std.dimshuffle('x', 0).repeat(h.shape[0], axis=0)
     bn = (h - mean) / (std + numpy.float32(1e-10))
     if use_std:
         if gamma is None:
             gamma = self.add_param(
                 self.shared(
                     numpy.zeros((dim, ), 'float32') + numpy.float32(0.1),
                     "%s_%s_gamma" % (self.name, h.name)))
         self.gamma = gamma
         if h.ndim == 3:
             bn *= gamma.dimshuffle('x', 'x',
                                    0).repeat(h.shape[0],
                                              axis=0).repeat(h.shape[1],
                                                             axis=1)
         elif h.ndim == 4:
             bn *= gamma.dimshuffle('x', 'x', 'x', 0).repeat(
                 h.shape[0], axis=0).repeat(h.shape[1],
                                            axis=1).repeat(h.shape[2],
                                                           axis=2)
         else:
             bn *= gamma.dimshuffle('x', 0).repeat(h.shape[0], axis=0)
     if use_shift:
         if beta is None:
             beta = self.add_param(
                 self.shared(numpy.zeros((dim, ), 'float32'),
                             "%s_%s_beta" % (self.name, h.name)))
         self.beta = beta
         bn += beta
     return bn
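For reference, a small NumPy sketch of the core computation batch_norm performs once a mean and std have been chosen: normalize by the statistics, scale by gamma and shift by beta. The helper name and shapes are mine, not part of the class above.

import numpy as np

def batch_norm_2d(x, gamma, beta, eps=1e-10):
    # x: (batch, dim); gamma, beta: (dim,)
    mean = x.mean(axis=0)
    std = np.sqrt(((x - mean) ** 2).mean(axis=0))
    bn = (x - mean) / (std + eps)
    return bn * gamma + beta

x = np.random.randn(32, 8).astype('float32')
out = batch_norm_2d(x, gamma=np.full(8, 0.1, 'float32'), beta=np.zeros(8, 'float32'))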
Example #36
0
def train(
        dim_word=100,  # word vector dimensionality
        dim=1000,  # the number of LSTM units
        encoder='gru',
        decoder='gru_cond',
        patience=10,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        dispFreq=100,
        decay_c=0.,  # L2 regularization penalty
        alpha_c=0.,  # alignment regularization
        clip_c=-1.,  # gradient clipping threshold
        lrate=0.01,  # learning rate
        n_words_src=100000,  # source vocabulary size
        n_words=100000,  # target vocabulary size
        maxlen=100,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq
        datasets=[
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'
        ],
        valid_datasets=[
            '../data/dev/newstest2011.en.tok',
            '../data/dev/newstest2011.fr.tok'
        ],
        dictionaries=[
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'
        ],
        use_dropout=False,
        reload_=False):

    # Model options
    model_options = locals().copy()

    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(datasets[0],
                         datasets[1],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_datasets[0],
                         valid_datasets[1],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x,
                                                y,
                                                maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    stochastic = True
                    sample, score = gen_sample(tparams,
                                               f_init,
                                               f_next,
                                               x[:, jj][:, None],
                                               model_options,
                                               trng=trng,
                                               k=1,
                                               maxlen=30,
                                               stochastic=stochastic,
                                               argmax=False)
                    print 'Source ', jj, ': ',
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            print worddicts_r[0][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Truth ', jj, ' : ',
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data, model_options,
                           valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err
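The clip_c block in the function above rescales all gradients jointly when their global L2 norm exceeds the threshold. A standalone NumPy sketch of that global-norm clipping (function name and shapes are illustrative):

import numpy as np

def clip_global_norm(grads, clip_c):
    g2 = sum((g ** 2).sum() for g in grads)      # squared global norm over all gradients
    if g2 > clip_c ** 2:
        scale = clip_c / np.sqrt(g2)
        grads = [g * scale for g in grads]
    return grads

grads = [np.random.randn(5, 5), np.random.randn(5)]
grads = clip_global_norm(grads, clip_c=1.0)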
Example #37
0
def rmsprop(lr, tparams, grads, x, y, cost):
    """
    A variant of SGD that scales the step size by a running estimate of the
    magnitude of recent gradients.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of cost w.r.t. parameters
    x: Theano variable
        Model inputs
    y: Theano variable
        Targets
    cost: Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """

    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, y], cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop_f_grad_shared')

    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                           name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / T.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(tparams.values(), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update
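In plain NumPy, one rmsprop step for a single parameter looks roughly like the sketch below, using the same constants as the updates above (names are mine):

import numpy as np

def rmsprop_step(p, g, rg, rg2, ud, lr=1e-4):
    rg = 0.95 * rg + 0.05 * g                     # running mean of gradients
    rg2 = 0.95 * rg2 + 0.05 * g ** 2              # running mean of squared gradients
    ud = 0.9 * ud - lr * g / np.sqrt(rg2 - rg ** 2 + 1e-4)
    return p + ud, rg, rg2, ud

p = np.zeros(3)
rg, rg2, ud = np.zeros(3), np.zeros(3), np.zeros(3)
g = np.array([0.5, -1.0, 2.0])
p, rg, rg2, ud = rmsprop_step(p, g, rg, rg2, ud)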
def adadelta(lr, tparams, grads, x, mask, y, cost):
    """
    An adaptive learning rate optimizer

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of cost w.r.t. parameters
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """

    zipped_grads = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k)
        for k, p in tparams.iteritems()
    ]
    running_up2 = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rup2' % k)
        for k, p in tparams.iteritems()
    ]
    running_grads2 = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k)
        for k, p in tparams.iteritems()
    ]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y],
                                    cost,
                                    updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [
        -tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
        for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)
    ]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud**2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [],
                               updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
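Similarly, a NumPy sketch of one ADADELTA step per parameter, mirroring the running averages above (illustrative only):

import numpy as np

def adadelta_step(p, g, rg2, ru2, eps=1e-6):
    rg2 = 0.95 * rg2 + 0.05 * g ** 2                       # E[g^2]
    step = -np.sqrt(ru2 + eps) / np.sqrt(rg2 + eps) * g    # RMS(dx) / RMS(g) * g
    ru2 = 0.95 * ru2 + 0.05 * step ** 2                    # E[dx^2]
    return p + step, rg2, ru2

p = np.ones(4)
rg2, ru2 = np.zeros(4), np.zeros(4)
g = np.array([0.1, -0.2, 0.3, -0.4])
p, rg2, ru2 = adadelta_step(p, g, rg2, ru2)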
Example #39
0
    def __init__(self,
                 input,
                 n_in,
                 n_hidden,
                 n_out,
                 activation=T.tanh,
                 output_type='real'):

        self.input = input
        self.activation = activation
        self.output_type = output_type

        self.batch_size = T.iscalar()

        # theta is a vector of all trainable parameters
        # it represents the value of W, W_in, W_out, h0, bh, by
        theta_shape = n_hidden ** 2 + n_in * n_hidden + n_hidden * n_out + \
                      n_hidden + n_hidden + n_out
        self.theta = theano.shared(
            value=np.zeros(theta_shape, dtype=theano.config.floatX))

        # Parameters are reshaped views of theta
        param_idx = 0  # pointer to somewhere along parameter vector

        # recurrent weights as a shared variable
        self.W = self.theta[param_idx:(param_idx + n_hidden**2)].reshape(
            (n_hidden, n_hidden))
        self.W.name = 'W'
        W_init = np.asarray(np.random.uniform(size=(n_hidden, n_hidden),
                                              low=-0.01,
                                              high=0.01),
                            dtype=theano.config.floatX)
        param_idx += n_hidden**2

        # input to hidden layer weights
        self.W_in = self.theta[param_idx:(param_idx + n_in * \
                                          n_hidden)].reshape((n_in, n_hidden))
        self.W_in.name = 'W_in'
        W_in_init = np.asarray(np.random.uniform(size=(n_in, n_hidden),
                                                 low=-0.01,
                                                 high=0.01),
                               dtype=theano.config.floatX)
        param_idx += n_in * n_hidden

        # hidden to output layer weights
        self.W_out = self.theta[param_idx:(param_idx + n_hidden * \
                                           n_out)].reshape((n_hidden, n_out))
        self.W_out.name = 'W_out'

        W_out_init = np.asarray(np.random.uniform(size=(n_hidden, n_out),
                                                  low=-0.01,
                                                  high=0.01),
                                dtype=theano.config.floatX)
        param_idx += n_hidden * n_out

        self.h0 = self.theta[param_idx:(param_idx + n_hidden)]
        self.h0.name = 'h0'
        h0_init = np.zeros((n_hidden, ), dtype=theano.config.floatX)
        param_idx += n_hidden

        self.bh = self.theta[param_idx:(param_idx + n_hidden)]
        self.bh.name = 'bh'
        bh_init = np.zeros((n_hidden, ), dtype=theano.config.floatX)
        param_idx += n_hidden

        self.by = self.theta[param_idx:(param_idx + n_out)]
        self.by.name = 'by'
        by_init = np.zeros((n_out, ), dtype=theano.config.floatX)
        param_idx += n_out

        assert (param_idx == theta_shape)

        # for convenience
        self.params = [
            self.W, self.W_in, self.W_out, self.h0, self.bh, self.by
        ]

        # shortcut to norms (for monitoring)
        self.l2_norms = {}
        for param in self.params:
            self.l2_norms[param] = T.sqrt(T.sum(param**2))

        # initialize parameters
        # DEBUG_MODE gives division by zero error when we leave parameters
        # as zeros
        self.theta.set_value(
            np.concatenate([
                x.ravel() for x in (W_init, W_in_init, W_out_init, h0_init,
                                    bh_init, by_init)
            ]))

        self.theta_update = theano.shared(
            value=np.zeros(theta_shape, dtype=theano.config.floatX))

        # recurrent function (using tanh activation function) and arbitrary output
        # activation function
        def step(x_t, h_tm1):
            h_t = self.activation(T.dot(x_t, self.W_in) + \
                                  T.dot(h_tm1, self.W) + self.bh)
            y_t = T.dot(h_t, self.W_out) + self.by
            return h_t, y_t

        # the hidden state `h` for the entire sequence, and the output for the
        # entire sequence `y` (first dimension is always time)
        # Note the implementation of weight-sharing h0 across variable-size
        # batches using T.ones multiplying h0
        # Alternatively, T.alloc approach is more robust
        [self.h,
         self.y_pred], _ = theano.scan(step,
                                       sequences=self.input,
                                       outputs_info=[
                                           T.alloc(self.h0,
                                                   self.input.shape[1],
                                                   n_hidden), None
                                       ])
        # outputs_info=[T.ones(shape=(self.input.shape[1],
        # self.h0.shape[0])) * self.h0, None])

        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = 0
        self.L1 += abs(self.W.sum())
        self.L1 += abs(self.W_in.sum())
        self.L1 += abs(self.W_out.sum())

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = 0
        self.L2_sqr += (self.W**2).sum()
        self.L2_sqr += (self.W_in**2).sum()
        self.L2_sqr += (self.W_out**2).sum()

        if self.output_type == 'real':
            self.loss = lambda y: self.mse(y)
        elif self.output_type == 'binary':
            # push through sigmoid
            self.p_y_given_x = T.nnet.sigmoid(self.y_pred[-1])  # apply sigmoid
            self.y_out = T.round(self.p_y_given_x)  # round to {0,1}
            self.loss = lambda y: self.nll_binary(y)
        elif self.output_type == 'softmax':
            # push through softmax, computing vector of class-membership
            # probabilities in symbolic form
            #
            # T.nnet.softmax will not operate on T.tensor3 types, only matrices
            # We take our n_steps x n_seq x n_classes output from the net
            # and reshape it into a (n_steps * n_seq) x n_classes matrix
            # apply softmax, then reshape back
            y_p = self.y_pred
            y_p_m = T.reshape(y_p, (y_p.shape[0] * y_p.shape[1], -1))
            y_p_s = T.nnet.softmax(y_p_m)
            self.p_y_given_x = T.reshape(y_p_s, y_p.shape)

            # compute prediction as class whose probability is maximal
            self.y_out = T.argmax(self.p_y_given_x, axis=-1)
            self.loss = lambda y: self.nll_multiclass(y)

        else:
            raise NotImplementedError
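The constructor above stores every weight as a reshaped slice of one flat theta vector, so the whole model can be updated through a single shared variable. A NumPy analogue of that packing idea (the Theano version uses symbolic subtensors rather than array views; the toy sizes are mine):

import numpy as np

n_in, n_hidden, n_out = 4, 6, 3
theta = np.zeros(n_hidden ** 2 + n_in * n_hidden + n_hidden * n_out)

idx = 0
W = theta[idx:idx + n_hidden ** 2].reshape((n_hidden, n_hidden))
idx += n_hidden ** 2
W_in = theta[idx:idx + n_in * n_hidden].reshape((n_in, n_hidden))
idx += n_in * n_hidden
W_out = theta[idx:idx + n_hidden * n_out].reshape((n_hidden, n_out))
idx += n_hidden * n_out
assert idx == theta.size

# writing into theta in place is reflected by every reshaped view
theta[:] = np.random.uniform(-0.01, 0.01, size=theta.size)
assert W[0, 0] == theta[0]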
Example #40
0
def irl(structure,
        feature_matrix,
        n_actions,
        discount,
        transition_probability,
        trajectories,
        epochs,
        learning_rate,
        initialisation="normal",
        l1=0.1,
        l2=0.1):
    """
    Find the reward function for the given trajectories.

    structure: Neural network structure tuple, e.g. (10, 3, 3) would be a
        3-layer neural network with 10 inputs.
    feature_matrix: Matrix with the nth row representing the nth state. NumPy
        array with shape (N, D) where N is the number of states and D is the
        dimensionality of the state.
    n_actions: Number of actions A. int.
    discount: Discount factor of the MDP. float.
    transition_probability: NumPy array mapping (state_i, action, state_k) to
        the probability of transitioning from state_i to state_k under action.
        Shape (N, A, N).
    trajectories: 3D array of state/action pairs. States are ints, actions
        are ints. NumPy array with shape (T, L, 2) where T is the number of
        trajectories and L is the trajectory length.
    epochs: Number of gradient descent steps. int.
    learning_rate: Gradient descent learning rate. float.
    initialisation: What distribution to use. str in {normal, uniform}. Default
        normal.
    l1: L1 regularisation. Default 0.1. float.
    l2: L2 regularisation. Default 0.1. float.
    -> Reward vector with shape (N,).
    """

    n_states, d_states = feature_matrix.shape
    transition_probability = th.shared(transition_probability, borrow=True)
    trajectories = th.shared(trajectories, borrow=True)

    # Initialise W matrices; b biases.
    n_layers = len(structure) - 1
    weights = []
    hist_w_grads = []  # For AdaGrad.
    biases = []
    hist_b_grads = []  # For AdaGrad.
    for i in range(n_layers):
        # W
        shape = (structure[i + 1], structure[i])
        if initialisation == "normal":
            matrix = th.shared(rn.normal(size=shape), name="W", borrow=True)
        else:
            matrix = th.shared(rn.uniform(size=shape), name="W", borrow=True)
        weights.append(matrix)
        hist_w_grads.append(th.shared(np.zeros(shape), name="hdW",
                                      borrow=True))

        # b
        shape = (structure[i + 1], 1)
        if initialisation == "normal":
            matrix = th.shared(rn.normal(size=shape), name="b", borrow=True)
        else:
            matrix = th.shared(rn.uniform(size=shape), name="b", borrow=True)
        biases.append(matrix)
        hist_b_grads.append(th.shared(np.zeros(shape), name="hdb",
                                      borrow=True))

    # Initialise α weight, β bias.
    if initialisation == "normal":
        α = th.shared(rn.normal(size=(1, structure[-1])),
                      name="alpha",
                      borrow=True)
    else:
        α = th.shared(rn.uniform(size=(1, structure[-1])),
                      name="alpha",
                      borrow=True)
    hist_α_grad = T.zeros(α.shape)  # For AdaGrad.

    adagrad_epsilon = 1e-6  # AdaGrad numerical stability.

    #### Theano symbolic setup. ####

    # Symbolic input.
    s_feature_matrix = T.matrix("x")
    # Feature matrices.
    # All dimensions of the form (d_layer, n_states).
    φs = [s_feature_matrix.T]
    # Forward propagation.
    for W, b in zip(weights, biases):
        φ = T.nnet.sigmoid(
            th.compile.ops.Rebroadcast((0, False), (1, True))(b) +
            W.dot(φs[-1]))
        φs.append(φ)
        # φs[1] = φ1 etc.
    # Reward.
    r = α.dot(φs[-1]).reshape((n_states, ))
    # Engineering hack: z-score the reward.
    r = (r - r.mean()) / r.std()
    # Associated feature expectations.
    expected_svf = find_expected_svf(n_states, r, n_actions, discount,
                                     transition_probability, trajectories)
    svf = maxent.find_svf(n_states, trajectories.get_value())
    # Derivatives (backward propagation).
    updates = []
    α_grad = φs[-1].dot(svf - expected_svf).T
    hist_α_grad += α_grad**2
    adj_α_grad = α_grad / (adagrad_epsilon + T.sqrt(hist_α_grad))
    updates.append((α, α + adj_α_grad * learning_rate))

    def grad_for_state(s, theta, svf_diff, r):
        """
        Calculate the gradient with respect to theta for one state.
        """

        regularisation = abs(theta).sum() * l1 + (theta**2).sum() * l2
        return svf_diff[s] * T.grad(r[s], theta) - regularisation, {}

    for i, W in enumerate(weights):
        w_grads, _ = th.scan(fn=grad_for_state,
                             sequences=[T.arange(n_states)],
                             non_sequences=[W, svf - expected_svf, r])
        w_grad = w_grads.sum(axis=0)
        hist_w_grads[i] += w_grad**2
        adj_w_grad = w_grad / (adagrad_epsilon + T.sqrt(hist_w_grads[i]))
        updates.append((W, W + adj_w_grad * learning_rate))
    for i, b in enumerate(biases):
        b_grads, _ = th.scan(fn=grad_for_state,
                             sequences=[T.arange(n_states)],
                             non_sequences=[b, svf - expected_svf, r])
        b_grad = b_grads.sum(axis=0)
        hist_b_grads[i] += b_grad**2
        adj_b_grad = b_grad / (adagrad_epsilon + T.sqrt(hist_b_grads[i]))
        updates.append((b, b + adj_b_grad * learning_rate))

    train = th.function([s_feature_matrix], updates=updates, outputs=r)
    run = th.function([s_feature_matrix], outputs=r)

    for e in range(epochs):
        reward = train(feature_matrix)

    return reward.reshape((n_states, ))
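The updates built above are AdaGrad ascent steps: squared gradients are accumulated per element and each step is divided by the root of that history. A minimal NumPy sketch of the adjustment (names are mine):

import numpy as np

def adagrad_step(w, grad, hist, learning_rate=0.01, eps=1e-6):
    hist = hist + grad ** 2                        # accumulate squared gradients
    adj = grad / (eps + np.sqrt(hist))             # per-element rescaled gradient
    return w + adj * learning_rate, hist           # ascent, as in the updates above

w, hist = np.zeros(5), np.zeros(5)
g = np.array([1.0, -2.0, 0.5, 0.0, 3.0])
w, hist = adagrad_step(w, g, hist)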
Example #41
0
        def step(inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t, inp_h3_t, gat_h3_t,
                 h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, context_oh):

            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            inp_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            h1_t = self.rnn1.apply(inp_h1_t, gat_h1_t, h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            b_t = tensor.exp(b_t) + self.epsilon
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t *
                                                        (k_t_ - u)**2),
                    axis=1)
            else:
                phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2),
                                   axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
            inp_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            inp_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]
            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(inp_h2_t + h1inp_h2,
                                   gat_h2_t + h1gat_h2,
                                   h2_tm1,
                                   iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [h2inp_h3, h2gat_h3]
            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(inp_h3_t + h1inp_h3 + h2inp_h3,
                                   gat_h3_t + h1gat_h3 + h2gat_h3,
                                   h3_tm1,
                                   iterate=False)

            return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_
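The non-softmax branch computes a Graves-style Gaussian-mixture attention window: component k puts weight a_k * exp(-b_k * (k_t - u)^2) on context position u, and the window vector is the phi-weighted sum of context rows. A NumPy sketch for a single batch element (shapes and names are illustrative):

import numpy as np

def attention_window(a, b, k, context):
    # a, b, k: (n_components,); context: (len_context, n_letters)
    u = np.arange(context.shape[0], dtype=float)            # context positions
    phi = (a[:, None] * np.exp(-b[:, None] * (k[:, None] - u) ** 2)).sum(axis=0)
    return np.dot(phi, context), phi                        # window vector, weights

context = np.eye(7)                                         # one-hot "letters"
w, phi = attention_window(np.ones(3), 2.0 * np.ones(3), np.array([1., 3., 5.]), context)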
Example #42
0
    def __init__(self, model, algo="sgd"):
        self.model = model
        self.algo = algo

        self.x = self.model.x
        self.y = T.ivector('y')
        self.outc = T.matrix('outc')

        # due to theano bugs
        self.x_d = shared_empty(2)
        self.y_d = shared_empty(1, dtype="int32")
        self.outc_d = shared_empty(2)
        # ---

        # -- target def --
        self.loss = get_loss(self.model.a, self.outc)
        self.err = get_error(get_pred(self.model.a), self.y)

        self.grad_vec = T.concatenate([T.grad(self.loss, p).flatten() for p in self.model.params])

        srng = RandomStreams(seed=234)
        self.grad = {p: T.grad(self.loss, p) for p in self.model.params}

        for p in self.grad:
            self.grad[p] += srng.normal(p.shape)*1e-4

        self.updates = OrderedDict()

        if self.algo == 'gd':
            self.rate = 10.
            for p in self.model.params:
                self.updates[p] = p - self.rate * self.grad[p]
        elif self.algo == 'adagrad':
            self.rate = 5e-2
            eps = 1e-6
            for p in self.model.params:
                value = p.get_value(borrow=True)
                hist = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=p.broadcastable)
                hist_n = hist + self.grad[p]**2
                self.updates[hist] = hist_n
                self.updates[p] = p - self.rate * self.grad[p] / T.sqrt(hist + eps)
        elif self.algo == 'rmsprop':
            self.rate = 1e-2
            eps = 1e-6
            rho = 0.7
            for p in self.model.params:
                value = p.get_value(borrow=True)
                hist = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=p.broadcastable)
                hist_n = rho * hist + (1 - rho) * self.grad[p]**2
                self.updates[hist] = hist_n
                self.updates[p] = p - self.rate * self.grad[p] / T.sqrt(hist + eps)
        elif self.algo == 'nag':
            self.rate = 10
            mu = 0.2
            for p in self.model.params:
                value = p.get_value(borrow=True)
                vel = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=p.broadcastable)
                x = mu * vel + self.rate * self.grad[p]
                self.updates[vel] = x
                self.updates[p] = p - self.rate * self.grad[p] - mu * x
        elif self.algo == 'adam':
            self.rate = 4e-2
            beta1 = 0.9
            beta2 = 0.999
            eps = 1e-8
            one = T.constant(1)
            t_prev = theano.shared(np.asarray(0, dtype=theano.config.floatX))
            t = t_prev + 1
            a_t = self.rate*T.sqrt(one-beta2**t)/(one-beta1**t)

            for p in self.model.params:
                value = p.get_value(borrow=True)

                m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=p.broadcastable)
                v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=p.broadcastable)

                m_t = beta1*m_prev + (one-beta1)*self.grad[p]
                v_t = beta2*v_prev + (one-beta2)*self.grad[p]**2
                step = a_t*m_t/(T.sqrt(v_t) + eps)

                self.updates[m_prev] = m_t
                self.updates[v_prev] = v_t
                self.updates[p] = p - step
            self.updates[t_prev] = t

        self.print_pls = []
        self.print_pls += [T.mean(self.grad_vec**2)**0.5]

        self.train = theano.function(
            inputs=[],
            outputs=[self.loss, self.err] + self.print_pls,
            updates=self.updates,
            givens={
                self.x: self.x_d,
                self.y: self.y_d,
                self.outc: self.outc_d,
            },
            on_unused_input='warn',
            allow_input_downcast=True
        )

        self.eva = theano.function(
            inputs=[],
            outputs=[self.loss],
            givens={
                self.x: self.x_d,
                self.outc: self.outc_d
            },
            on_unused_input='warn',
            allow_input_downcast=True
        )
Example #43
0
 def RMSprop(self,
             cost,
             params,
             full_params,
             sampled_params,
             sidxs,
             epsilon=1e-6):
     grads = [T.grad(cost=cost, wrt=param) for param in params]
     sgrads = [T.grad(cost=cost, wrt=sparam) for sparam in sampled_params]
     updates = OrderedDict()
     if self.grad_cap > 0:
         norm = T.cast(
             T.sqrt(
                 T.sum([
                     T.sum([T.sum(g**2) for g in g_list])
                     for g_list in grads
                 ]) + T.sum([T.sum(g**2) for g in sgrads])),
             theano.config.floatX)
         grads = [[
             T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm,
                      g) for g in g_list
         ] for g_list in grads]
         sgrads = [
             T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm,
                      g) for g in sgrads
         ]
     for p_list, g_list in zip(params, grads):
         for p, g in zip(p_list, g_list):
             if self.adapt:
                 if self.adapt == 'adagrad':
                     g = self.adagrad(p, g, updates)
                 if self.adapt == 'rmsprop':
                     g = self.rmsprop(p, g, updates)
                 if self.adapt == 'adadelta':
                     g = self.adadelta(p, g, updates)
                 if self.adapt == 'adam':
                     g = self.adam(p, g, updates)
             if self.momentum > 0:
                 velocity = theano.shared(p.get_value(borrow=False) * 0.,
                                          borrow=True)
                 velocity2 = self.momentum * velocity - np.float32(
                     self.learning_rate) * (g + self.lmbd * p)
                 updates[velocity] = velocity2
                 updates[p] = p + velocity2
             else:
                 updates[p] = p * np.float32(1.0 - self.learning_rate *
                                             self.lmbd) - np.float32(
                                                 self.learning_rate) * g
     for i in range(len(sgrads)):
         g = sgrads[i]
         fullP = full_params[i]
         sample_idx = sidxs[i]
         sparam = sampled_params[i]
         if self.adapt:
             if self.adapt == 'adagrad':
                 g = self.adagrad(fullP, g, updates, sample_idx)
             if self.adapt == 'rmsprop':
                 g = self.rmsprop(fullP, g, updates, sample_idx)
             if self.adapt == 'adadelta':
                 g = self.adadelta(fullP, g, updates, sample_idx)
             if self.adapt == 'adam':
                 g = self.adam(fullP, g, updates, sample_idx)
         if self.lmbd > 0:
             delta = np.float32(
                 self.learning_rate) * (g + self.lmbd * sparam)
         else:
             delta = np.float32(self.learning_rate) * g
         if self.momentum > 0:
             velocity = theano.shared(fullP.get_value(borrow=False) * 0.,
                                      borrow=True)
             vs = velocity[sample_idx]
             velocity2 = self.momentum * vs - delta
             updates[velocity] = T.set_subtensor(vs, velocity2)
             updates[fullP] = T.inc_subtensor(sparam, velocity2)
         else:
             updates[fullP] = T.inc_subtensor(sparam, -delta)
     return updates
Example #44
0
        def sample_step(inp_cell_h1_t, inp_gat_h1_t, inp_cell_h2_t,
                        inp_gat_h2_t, inp_cell_h3_t, inp_gat_h3_t, x_tm1,
                        h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1):

            cell_h1_t = inp_cell_h1_t
            cell_h2_t = inp_cell_h2_t
            cell_h3_t = inp_cell_h3_t

            gat_h1_t = inp_gat_h1_t
            gat_h2_t = inp_gat_h2_t
            gat_h3_t = inp_gat_h3_t

            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            cell_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            if self.weak_feedback:
                out_cell_h1_t, out_gat_h1_t = self.out_to_h1.apply(x_tm1)

                to_normalize = [out_cell_h1_t, out_gat_h1_t]
                out_cell_h1_t, out_gat_h1_t = \
                    [_apply_norm(x, self.layer_norm) for x in to_normalize]

                cell_h1_t += out_cell_h1_t
                gat_h1_t += out_gat_h1_t

            if self.full_feedback:
                out_cell_h2_t, out_gat_h2_t = self.out_to_h2.apply(x_tm1)
                out_cell_h3_t, out_gat_h3_t = self.out_to_h3.apply(x_tm1)

                to_normalize = [
                    out_cell_h2_t, out_gat_h2_t, out_cell_h3_t, out_gat_h3_t
                ]
                out_cell_h2_t, out_gat_h2_t, \
                    out_cell_h3_t, out_gat_h3_t = \
                    [_apply_norm(x, self.layer_norm) for x in to_normalize]

                cell_h2_t += out_cell_h2_t
                cell_h3_t += out_cell_h3_t
                gat_h2_t += out_gat_h2_t
                gat_h3_t += out_gat_h3_t

            h1_t = self.rnn1.apply(cell_h1_t, gat_h1_t, h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            b_t = tensor.exp(b_t) * self.sharpening_coeff + self.epsilon
            k_t = k_tm1 + self.attention_alignment * \
                tensor.exp(k_t) / self.timing_coeff

            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t *
                                                        (k_t_ - u)**2),
                    axis=1)
            else:
                phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2),
                                   axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
            cell_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            cell_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]
            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(cell_h2_t + h1inp_h2,
                                   gat_h2_t + h1gat_h2,
                                   h2_tm1,
                                   iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [h2inp_h3, h2gat_h3]
            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(cell_h3_t + h1inp_h3 + h2inp_h3,
                                   gat_h3_t + h1gat_h3 + h2gat_h3,
                                   h3_tm1,
                                   iterate=False)

            h1_out_t = self.h1_to_readout.apply(h1_t)
            h2_out_t = self.h2_to_readout.apply(h2_t)
            h3_out_t = self.h3_to_readout.apply(h3_t)

            to_normalize = [h1_out_t, h2_out_t, h3_out_t]
            h1_out_t, h2_out_t, h3_out_t = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            readout_t = h1_out_t + h2_out_t + h3_out_t

            readout_t += self.att_to_readout.apply(w_t)

            if self.use_speaker:
                readout_t += spk_readout

            output_t = self.readout_to_output.apply(readout_t)

            if self.which_cost == 'MSE':
                predicted_x_t = output_t
                if self.use_speaker:
                    predicted_x_t += spk_output

                # Dummy value for coeff_t
                coeff_t = predicted_x_t
            elif self.which_cost == "GMM":
                mu_t, sigma_t, coeff_t = output_t
                if self.use_speaker:
                    mu_t += spk_output[0]
                    sigma_t += spk_output[1]
                    coeff_t += spk_output[2]

                sigma_t = tensor.exp(sigma_t - self.sampling_bias) + \
                    self.epsilon

                coeff_t = tensor.nnet.softmax(
                    coeff_t.reshape(
                        (-1, self.k_gmm)) * (1. + self.sampling_bias)).reshape(
                            coeff_t.shape) + self.epsilon

                predicted_x_t = sample_gmm(mu_t, sigma_t, coeff_t,
                                           self.theano_rng)

            return predicted_x_t, h1_t, h2_t, h3_t, \
                k_t, w_t, coeff_t, phi_t, a_t_
Example #45
0
def l2_normalize(x, axis):
    norm = T.sqrt(T.sum(T.square(x), axis=axis, keepdims=True))
    return x / norm
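For example, with axis=1 every row is rescaled to unit L2 norm; a quick NumPy check of the same expression:

import numpy as np
x = np.random.randn(4, 3)
x_n = x / np.sqrt((x ** 2).sum(axis=1, keepdims=True))
# np.sqrt((x_n ** 2).sum(axis=1)) is all ones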
Example #46
0
 def stdize(layer, input):
     m = T.mean(input, layer.axes_to_sum)
     input -= m.dimshuffle(*layer.dimshuffle_args)
     stdv = T.sqrt(T.mean(T.square(input), axis=layer.axes_to_sum))
     input /= stdv.dimshuffle(*layer.dimshuffle_args)
     return -m / stdv, 1. / stdv, input
Example #47
0
def L1sim(left, right):
    return -T.sum(T.sqrt(T.sqr(left - right)), axis=1)
Example #48
0
 def get_expr_rff_feature_map_component(x, omega, u):
     phi = T.cos(T.dot(x, omega) + u) * T.sqrt(2.)
     return phi
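That cos(x.omega + u) * sqrt(2) map is a random Fourier feature: if omega is drawn from a Gaussian and u uniformly from [0, 2*pi), averaging products of such features approximates an RBF kernel. A NumPy sketch under those assumptions (sigma, D and the kernel choice are mine, and the 1/D averaging is made explicit here):

import numpy as np

rng = np.random.RandomState(0)
d, D, sigma = 3, 2000, 1.0
omega = rng.normal(scale=1.0 / sigma, size=(d, D))          # spectral samples
u = rng.uniform(0.0, 2.0 * np.pi, size=D)

def rff(x):
    return np.sqrt(2.0 / D) * np.cos(np.dot(x, omega) + u)

x, y = rng.randn(d), rng.randn(d)
approx = np.dot(rff(x), rff(y))
exact = np.exp(-np.sum((x - y) ** 2) / (2.0 * sigma ** 2))
# approx is close to exact for large D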
    def __init__(self, We_initial, params):

        initial_We = theano.shared(np.asarray(We_initial, dtype=config.floatX))
        We = theano.shared(np.asarray(We_initial, dtype=config.floatX))

        g1 = T.imatrix()
        g2 = T.imatrix()
        p1 = T.imatrix()
        p2 = T.imatrix()
        g1mask = T.matrix()
        g2mask = T.matrix()
        p1mask = T.matrix()
        p2mask = T.matrix()

        l_in = lasagne.layers.InputLayer((None, None))
        l_mask = lasagne.layers.InputLayer(shape=(None, None))
        l_emb = lasagne.layers.EmbeddingLayer(l_in, input_size=We.get_value().shape[0],
                                              output_size=We.get_value().shape[1], W=We)

        if params.model == "wordaverage":
            l_out = lasagne_average_layer([l_emb, l_mask], tosum=False)

        elif params.model == "maxpool":
            l_out = lasagne_max_layer([l_emb, l_mask], params)

        elif params.model == "lstmavg":
            l_lstm = lasagne.layers.LSTMLayer(l_emb, params.dim, peepholes=True, learn_init=False,
                                              mask_input=l_mask)
            l_out = lasagne_average_layer([l_lstm, l_mask], tosum = False)

        elif params.model == "lstmmax":
            l_lstm = lasagne.layers.LSTMLayer(l_emb, params.dim, peepholes=True, learn_init=False,
                                              mask_input=l_mask)
            l_out = lasagne_max_layer([l_lstm, l_mask], params)

        elif params.model == "bilstmavg":
            l_lstm = lasagne.layers.LSTMLayer(l_emb, params.dim, peepholes=True, learn_init=False,
                                              mask_input=l_mask)
            l_lstmb = lasagne.layers.LSTMLayer(l_emb, params.dim, learn_init=False,
                                               mask_input=l_mask, backwards=True)

            l_cleanse = lasagne_cleanse_layer([l_lstm, l_mask], to_pool = False)
            l_cleanse_b = lasagne_cleanse_layer([l_lstmb, l_mask], to_pool = False)
            l_concat = lasagne.layers.ConcatLayer([l_cleanse, l_cleanse_b], axis = params.axis)
            l_out = lasagne_average_layer2([l_concat, l_mask])

        elif params.model == "bilstmmax":
            l_lstm = lasagne.layers.LSTMLayer(l_emb, params.dim, peepholes=True, learn_init=False,
                                              mask_input=l_mask)
            l_lstmb = lasagne.layers.LSTMLayer(l_emb, params.dim, learn_init=False,
                                               mask_input=l_mask, backwards=True)
            l_cleanse = lasagne_cleanse_layer([l_lstm, l_mask], to_pool=True)
            l_cleanse_b = lasagne_cleanse_layer([l_lstmb, l_mask], to_pool=True)
            l_concat = lasagne.layers.ConcatLayer([l_cleanse, l_cleanse_b], axis=params.axis)
            l_out = lasagne_max_layer2([l_concat])

        embg1 = lasagne.layers.get_output(l_out, {l_in: g1, l_mask: g1mask}, deterministic=False)
        embg2 = lasagne.layers.get_output(l_out, {l_in: g2, l_mask: g2mask}, deterministic=False)
        embp1 = lasagne.layers.get_output(l_out, {l_in: p1, l_mask: p1mask}, deterministic=False)
        embp2 = lasagne.layers.get_output(l_out, {l_in: p2, l_mask: p2mask}, deterministic=False)

        def fix(x):
            return x*(x > 0) + 1E-10*(x <= 0)

        #objective function
        g1g2 = (embg1 * embg2).sum(axis=1)
        g1g2norm = T.sqrt(fix(T.sum(embg1 ** 2, axis=1))) * T.sqrt(fix(T.sum(embg2 ** 2, axis=1)))
        g1g2 = g1g2 / g1g2norm

        p1g1 = (embp1 * embg1).sum(axis=1)
        p1g1norm = T.sqrt(fix(T.sum(embp1 ** 2, axis=1))) * T.sqrt(fix(T.sum(embg1 ** 2, axis=1)))
        p1g1 = p1g1 / p1g1norm

        p2g2 = (embp2 * embg2).sum(axis=1)
        p2g2norm = T.sqrt(fix(T.sum(embp2 ** 2, axis=1))) * T.sqrt(fix(T.sum(embg2 ** 2, axis=1)))
        p2g2 = p2g2 / p2g2norm

        costp1g1 = params.margin - g1g2 + p1g1
        costp1g1 = costp1g1 * (costp1g1 > 0)

        costp2g2 = params.margin - g1g2 + p2g2
        costp2g2 = costp2g2 * (costp2g2 > 0)

        cost = costp1g1 + costp2g2
        network_params = lasagne.layers.get_all_params(l_out, trainable=True)
        network_params.pop(0)

        self.final_layer = l_out
        self.all_params = lasagne.layers.get_all_params(l_out, trainable=True)

        if params.LC:
            l2 = 0.5 * params.LC * sum(lasagne.regularization.l2(x) for x in network_params)
        else:
            l2 = 0
        word_reg = 0.5 * params.LW * lasagne.regularization.l2(We - initial_We)
        cost = T.mean(cost) + l2 + word_reg

        self.feedforward_function = theano.function([g1, g1mask], embg1)

        prediction = g1g2
        self.scoring_function = theano.function([g1, g2,
            g1mask, g2mask],prediction)

        grads = theano.gradient.grad(cost, self.all_params)
        updates = params.learner(grads, self.all_params, params.eta)
        self.train_function = theano.function([g1, g2, p1, p2,
                                                  g1mask, g2mask, p1mask, p2mask], cost, updates=updates)

        cost = costp1g1 + costp2g2
        cost = T.mean(cost)
        self.cost_function = theano.function([g1, g2, p1, p2,
                                                  g1mask, g2mask, p1mask, p2mask], cost)

        print("Num Params:", lasagne.layers.count_params(self.final_layer))
Example #50
0
def sqrt(x):
    x = T.clip(x, 0., np.inf)
    return T.sqrt(x)
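
The clip guards against small negative inputs (e.g. from floating-point round-off) that would otherwise produce NaNs; a quick hedged check, assuming Theano and NumPy are available:

import numpy as np
import theano
import theano.tensor as T

v = T.vector('v')
plain_sqrt = theano.function([v], T.sqrt(v))
safe_sqrt = theano.function([v], sqrt(v))   # `sqrt` as defined in the example above

x = np.asarray([-1e-7, 0.0, 4.0], dtype=theano.config.floatX)
print(plain_sqrt(x))  # nan in the first slot
print(safe_sqrt(x))   # 0.0 in the first slot
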
Example #51
0
        size = np.asarray(x_shp[1] * inker_shape[0] * inker_shape[1],
                          dtype=x.dtype)
        ssq, ssqshp = boxconv((x**2, x_shp), inker_shape, channels=True)
        xs = inker_shape[0] // 2
        ys = inker_shape[1] // 2
        # --local contrast normalization in regions that are not symmetric
        #   about the pixel being normalized feels weird, but we're
        #   allowing it here.
        xs_inc = (inker_shape[0] + 1) % 2
        ys_inc = (inker_shape[1] + 1) % 2
        if div_method == 'euclidean':
            if remove_mean:
                arr_sum, _shp = boxconv((x, x_shp), inker_shape, channels=True)
                arr_num = (x[:, :, xs - xs_inc:-xs, ys - ys_inc:-ys] -
                           arr_sum / size)
                arr_div = EPSILON + tensor.sqrt(
                    tensor.maximum(0, ssq - (arr_sum**2) / size))
            else:
                arr_num = x[:, :, xs - xs_inc:-xs, ys - ys_inc:-ys]
                arr_div = EPSILON + tensor.sqrt(ssq)
        else:
            raise NotImplementedError('div_method', div_method)
    else:
        raise NotImplementedError('outker_shape != inker_shape', outker_shape,
                                  inker_shape)

    if (hasattr(stretch, '__iter__') and (stretch != 1).any()) \
            or (not hasattr(stretch, '__iter__') and stretch != 1):
        arr_num = arr_num * stretch
        arr_div = arr_div * stretch
    # XXX: IS THIS 1.0 supposed to be (threshold + EPSILON) ??
    arr_div = tensor.switch(arr_div < (threshold + EPSILON), 1.0, arr_div)
Example #52
0
def compute_norms(array, norm_axes=None):
    """ Compute incoming weight vector norms.

    Parameters
    ----------
    array : numpy array or Theano expression
        Weight or bias.
    norm_axes : sequence (list or tuple)
        The axes over which to compute the norm.  This overrides the
        default norm axes defined for the number of dimensions
        in `array`. When this is not specified and `array` is a 2D array,
        this is set to `(0,)`. If `array` is a 3D, 4D or 5D array, it is
        set to a tuple listing all axes but axis 0. The former default is
        useful for working with dense layers, the latter is useful for 1D,
        2D and 3D convolutional layers.
        Finally, in case `array` is a vector, `norm_axes` is set to an empty
        tuple, and this function will simply return the absolute value for
        each element. This is useful when the function is applied to all
        parameters of the network, including the bias, without distinction.
        (Optional)

    Returns
    -------
    norms : 1D array or Theano vector (1D)
        1D array or Theano vector of incoming weight/bias vector norms.

    Examples
    --------
    >>> array = np.random.randn(100, 200)
    >>> norms = compute_norms(array)
    >>> norms.shape
    (200,)

    >>> norms = compute_norms(array, norm_axes=(1,))
    >>> norms.shape
    (100,)
    """

    # Check if supported type
    if not isinstance(array, theano.Variable) and \
       not isinstance(array, np.ndarray):
        raise RuntimeError("Unsupported type {}. "
                           "Only theano variables and numpy arrays "
                           "are supported".format(type(array)))

    # Compute default axes to sum over
    ndim = array.ndim
    if norm_axes is not None:
        sum_over = tuple(norm_axes)
    elif ndim == 1:  # For Biases that are in 1d (e.g. b of DenseLayer)
        sum_over = ()
    elif ndim == 2:  # DenseLayer
        sum_over = (0, )
    elif ndim in [3, 4, 5]:  # Conv{1,2,3}DLayer
        sum_over = tuple(range(1, ndim))
    else:
        raise ValueError("Unsupported tensor dimensionality {}. "
                         "Must specify `norm_axes`".format(array.ndim))

    # Run numpy or Theano norm computation
    if isinstance(array, theano.Variable):
        # Apply theano version if it is a theano variable
        if len(sum_over) == 0:
            norms = T.abs_(array)  # abs if we have nothing to sum over
        else:
            norms = T.sqrt(T.sum(array**2, axis=sum_over))
    elif isinstance(array, np.ndarray):
        # Apply the numpy version if ndarray
        if len(sum_over) == 0:
            norms = abs(array)  # abs if we have nothing to sum over
        else:
            norms = np.sqrt(np.sum(array**2, axis=sum_over))

    return norms
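
The docstring only demonstrates the NumPy path; a hedged companion sketch for the Theano branch, assuming a shared weight matrix:

import numpy as np
import theano

W = theano.shared(np.random.randn(100, 200).astype(theano.config.floatX))
norms = compute_norms(W)     # symbolic expression: one norm per column (axis 0 summed out)
print(norms.eval().shape)    # (200,)
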
Example #53
0
def cdf(x, miu=0.0, variance=1.0):
    return 1.0 / 2 * (1.0 + T.erf((x - miu) / T.sqrt(2 * variance)))
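
This is the Gaussian CDF written via the error function, Phi(x) = 0.5 * (1 + erf((x - mu) / sqrt(2 * variance))). A hedged numerical cross-check against SciPy, assuming SciPy is installed:

import numpy as np
import theano
import theano.tensor as T
from scipy.stats import norm

xv = T.vector('xv')
f = theano.function([xv], cdf(xv, miu=1.0, variance=4.0))

pts = np.linspace(-3.0, 5.0, 9).astype(theano.config.floatX)
print(np.allclose(f(pts), norm.cdf(pts, loc=1.0, scale=2.0), atol=1e-5))  # True
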
Example #54
0
    def _setup_backprop_with(self, dec_init_state, annotations, y, d,
                             y_in_x_inds, eta, l2_reg):
        def decoder_recurrence(y_t, d_t, cur_y_in_x_inds, h_prev, annotations,
                               *params):
            h_for_write = self.specs[self.domain].decoder.get_h_for_write(
                h_prev)
            scores = self.specs[self.domain].get_attention_scores(
                h_for_write, annotations)
            alpha = self.specs[self.domain].get_alpha(scores)
            c_t = self.specs[self.domain].get_context(alpha, annotations)
            write_dist = self.specs[self.domain].f_write(
                h_for_write, c_t, scores)
            base_p_y_t = write_dist[y_t]
            if self.spec.attention_copying:
                copying_p_y_t = T.dot(
                    write_dist[self.specs[self.domain].out_vocabulary.size(
                    ):self.specs[self.domain].out_vocabulary.size() +
                               cur_y_in_x_inds.shape[0]], cur_y_in_x_inds)
                p_y_t = base_p_y_t + copying_p_y_t
            else:
                p_y_t = base_p_y_t
            h_t = self.specs[self.domain].f_dec(y_t, d_t, c_t, h_prev)
            return (h_t, p_y_t)

        dec_results, _ = theano.scan(fn=decoder_recurrence,
                                     sequences=[y, d, y_in_x_inds],
                                     outputs_info=[dec_init_state, None],
                                     non_sequences=[annotations] +
                                     self.all_shared)
        p_y_seq = dec_results[1]
        log_p_y = T.sum(T.log(p_y_seq))
        nll = -log_p_y
        # Add L2 regularization
        regularization = l2_reg / 2 * sum(T.sum(p**2) for p in self.params)
        objective = nll + regularization
        gradients = T.grad(objective, self.params)

        # Do the updates here
        updates = []
        if self.specs[self.domain].step_rule in ('adagrad', 'rmsprop'):
            # Adagrad updates
            for p, g, c in zip(self.params, gradients, self.grad_cache):
                grad_norm = g.norm(2)
                clipped_grad = ifelse(grad_norm >= CLIP_THRESH,
                                      g * CLIP_THRESH / grad_norm, g)
                if self.spec.step_rule == 'adagrad':
                    new_c = c + clipped_grad**2
                else:  # rmsprop
                    decay_rate = 0.9  # Use fixed decay rate of 0.9
                    new_c = decay_rate * c + (1.0 -
                                              decay_rate) * clipped_grad**2
                new_p = p - eta * clipped_grad / T.sqrt(new_c + 1e-4)
                has_non_finite = T.any(T.isnan(new_p) + T.isinf(new_p))
                updates.append((p, ifelse(has_non_finite, p, new_p)))
                updates.append((c, ifelse(has_non_finite, c, new_c)))
        elif self.specs[self.domain].step_rule == 'nesterov':
            # Nesterov momentum
            for p, g, v in zip(self.params, gradients, self.grad_cache):
                grad_norm = g.norm(2)
                clipped_grad = ifelse(grad_norm >= CLIP_THRESH,
                                      g * CLIP_THRESH / grad_norm, g)
                new_v = NESTEROV_MU * v - eta * clipped_grad
                new_p = p - NESTEROV_MU * v + (1 + NESTEROV_MU) * new_v
                has_non_finite = (T.any(T.isnan(new_p) + T.isinf(new_p)) +
                                  T.any(T.isnan(new_v) + T.isinf(new_v)))
                updates.append((p, ifelse(has_non_finite, p, new_p)))
                updates.append((v, ifelse(has_non_finite, v, new_v)))
        else:
            # Simple SGD updates
            for p, g in zip(self.params, gradients):
                grad_norm = g.norm(2)
                clipped_grad = ifelse(grad_norm >= CLIP_THRESH,
                                      g * CLIP_THRESH / grad_norm, g)
                new_p = p - eta * clipped_grad
                has_non_finite = T.any(T.isnan(new_p) + T.isinf(new_p))
                updates.append((p, ifelse(has_non_finite, p, new_p)))
        return nll, p_y_seq, objective, updates
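
For reference, the adagrad branch above amounts to c <- c + g**2 and p <- p - eta * g / sqrt(c + 1e-4); a tiny NumPy sketch of a single step with hypothetical values:

import numpy as np

eta, eps = 0.1, 1e-4                  # hypothetical learning rate; eps matches the 1e-4 above
p = np.array([0.5, -0.3])             # parameter
c = np.zeros_like(p)                  # adagrad cache
g = np.array([0.2, -0.1])             # stands in for the already-clipped gradient

c = c + g ** 2                        # accumulate squared gradients
p = p - eta * g / np.sqrt(c + eps)    # per-parameter scaled step
print(p, c)
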
Example #55
0
def cdf(sample, mu=0, sigma=1, eps=1e-6):
    div = T.sqrt(2) * sigma
    erf_arg = (sample - mu) / div
    return .5 * (1 + T.erf(erf_arg))
Example #56
0
 def nonDeterminstic(self, x):
     x += self.theanoGenerator.normal(avg=0.0,
                                      std=(T.sqrt(T.nnet.sigmoid(x)) +
                                           1e-8))
     return x * (x > 0.0)
Example #57
0
def _get_compiled_theano_functions():
    # Planet masses: m1,m2
    m1, m2 = T.dscalars(2)
    mstar = 1
    mu1 = m1 * mstar / (mstar + m1)
    mu2 = m2 * mstar / (mstar + m2)
    eta1 = mstar + m1
    eta2 = mstar + m2
    beta1 = mu1 * T.sqrt(eta1 / mstar) / (mu1 + mu2)
    beta2 = mu2 * T.sqrt(eta2 / mstar) / (mu1 + mu2)
    j, k = T.lscalars('j', 'k')
    s = (j - k) / k

    # Dynamical variables:
    dyvars = T.vector()
    s1, s2, psi, phi, Omega, I1, I2, Psi, Phi, Rtilde = [
        dyvars[i] for i in range(10)
    ]
    l1 = phi - 0.5 * k * psi
    l2 = phi + 0.5 * k * psi
    gamma1 = s1 - (1 + s) * l2 + s * l1
    gamma2 = s2 - (1 + s) * l2 + s * l1
    Gamma1 = I1
    Gamma2 = I2
    L1 = Phi / 2 - Psi / k - s * (I1 + I2)
    L2 = Phi / 2 + Psi / k + (s + 1) * (I1 + I2)
    Cz = -1 * Rtilde

    R = L1 + L2 - Gamma1 - Gamma2 - Cz
    G1 = L1 - Gamma1
    G2 = L2 - Gamma2

    r2_by_r1 = (L2 - L1 - Gamma2 + Gamma1) / (L1 + L2 - Gamma1 - Gamma2 - R)
    rho1 = 0.5 * R * (1 + r2_by_r1)
    rho2 = 0.5 * R * (1 - r2_by_r1)

    a1 = (L1 / beta1)**2
    e1 = T.sqrt(1 - (1 - (Gamma1 / L1))**2)

    a2 = (L2 / beta2)**2
    e2 = T.sqrt(1 - (1 - (Gamma2 / L2))**2)

    cos_inc1 = 1 - rho1 / G1
    cos_inc2 = 1 - rho2 / G2
    inc1 = T.arccos(cos_inc1)
    inc2 = T.arccos(cos_inc2)

    l1_r = l1 - Omega
    l2_r = l2 - Omega

    Omega1_r = T.constant(np.pi / 2) - Omega
    Omega2_r = Omega1_r - T.constant(np.pi)

    pomega1 = -1 * gamma1
    pomega2 = -1 * gamma2

    pomega1_r = pomega1 - Omega
    pomega2_r = pomega2 - Omega

    omega1 = pomega1_r - Omega1_r
    omega2 = pomega2_r - Omega2_r

    Hkep = -0.5 * T.sqrt(eta1) * beta1 / a1 - 0.5 * T.sqrt(eta2) * beta2 / a2

    ko = KeplerOp()
    M1 = l1_r - pomega1_r
    M2 = l2_r - pomega2_r
    sinf1, cosf1 = ko(M1, e1 + T.zeros_like(M1))
    sinf2, cosf2 = ko(M2, e2 + T.zeros_like(M2))
    #
    n1 = T.sqrt(eta1 / mstar) * a1**(-3 / 2)
    n2 = T.sqrt(eta2 / mstar) * a2**(-3 / 2)
    Hint_dir, Hint_ind, r1, r2, v1, v2 = calc_Hint_components_sinf_cosf(
        a1, a2, e1, e2, inc1, inc2, omega1, omega2, Omega1_r, Omega2_r, n1, n2,
        sinf1, cosf1, sinf2, cosf2)
    eps = m1 * m2 / (mu1 + mu2) / T.sqrt(mstar)
    Hpert = (Hint_dir + Hint_ind / mstar)
    Htot = Hkep + eps * Hpert

    #####################################################
    # Set parameters for compiling functions with Theano
    #####################################################

    # 'ins' will set the inputs of Theano functions compiled below
    #   Note: 'extra_ins' will be passed as values of object attributes
    #   of the 'ResonanceEquations' class 'defined below
    extra_ins = [m1, m2, j, k]
    givens = []
    ins = [dyvars] + extra_ins
    orbels = [
        a1, e1, inc1, l1_r, pomega1_r, Omega1_r, a2, e2, inc2, l2_r, pomega2_r,
        Omega2_r
    ]
    #  Conservative flow
    gradHtot = T.grad(Htot, wrt=dyvars)
    hessHtot = theano.gradient.hessian(Htot, wrt=dyvars)
    Jtens = T.as_tensor(_get_Omega_matrix(5))
    H_flow_vec = Jtens.dot(gradHtot)
    H_flow_jac = Jtens.dot(hessHtot)

    ##########################
    # Compile Theano functions
    ##########################
    orbels_fn = theano.function(inputs=ins,
                                outputs=orbels,
                                givens=givens,
                                on_unused_input='ignore')

    rv1_fn = theano.function(inputs=ins,
                             outputs=r1 + v1,
                             givens=givens,
                             on_unused_input='ignore')
    rv2_fn = theano.function(inputs=ins,
                             outputs=r2 + v2,
                             givens=givens,
                             on_unused_input='ignore')

    Htot_fn = theano.function(inputs=ins,
                              outputs=Htot,
                              givens=givens,
                              on_unused_input='ignore')

    Hpert_fn = theano.function(inputs=ins,
                               outputs=Hpert,
                               givens=givens,
                               on_unused_input='ignore')

    Hpert_components_fn = theano.function(inputs=ins,
                                          outputs=[Hint_dir, Hint_ind],
                                          givens=givens,
                                          on_unused_input='ignore')

    H_flow_vec_fn = theano.function(inputs=ins,
                                    outputs=H_flow_vec,
                                    givens=givens,
                                    on_unused_input='ignore')

    H_flow_jac_fn = theano.function(inputs=ins,
                                    outputs=H_flow_jac,
                                    givens=givens,
                                    on_unused_input='ignore')
    return dict({
        'orbital_elements': orbels_fn,
        'Hamiltonian': Htot_fn,
        'Hpert': Hpert_fn,
        'Hpert_components': Hpert_components_fn,
        'Hamiltonian_flow': H_flow_vec_fn,
        'Hamiltonian_flow_jacobian': H_flow_jac_fn,
        'positions_and_velocities1': rv1_fn,
        'positions_and_velocities2': rv2_fn
    })
Example #58
0
def expectedValueRectified(mean, variance):
    std = T.sqrt(variance)
    return std / T.sqrt(2.0 * np.pi) * T.exp(
        -mean**2 / (2.0 * variance)) + mean * cdf(mean / std)
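
The closed form above is E[max(0, X)] = sigma * phi(mu/sigma) + mu * Phi(mu/sigma) for X ~ N(mu, sigma^2); a hedged Monte-Carlo sanity check in NumPy/SciPy with hypothetical mu and variance:

import numpy as np
from scipy.stats import norm

mu, var = 0.3, 2.0
std = np.sqrt(var)
closed_form = std * norm.pdf(mu / std) + mu * norm.cdf(mu / std)
samples = np.maximum(0.0, np.random.normal(mu, std, size=10**6))
print(closed_form, samples.mean())   # the two should agree to roughly two decimals
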
Example #59
0
def calc_Hint_components_sinf_cosf(a1, a2, e1, e2, inc1, inc2, omega1, omega2,
                                   Omega1, Omega2, n1, n2, sinf1, cosf1, sinf2,
                                   cosf2):
    """
    Compute the value of the disturbing function
    .. math::
        \frac{1}{|r-r'|} - ??? v.v'
    from a set of input orbital elements for coplanar planets.

    Arguments
    ---------
    a1 : float
        inner semi-major axis 
    a2 : float
        outer semi-major axis 
    e1 : float
        inner eccentricity
    e2 : float
        outer eccentricity
    I1 : float
        inner inclination
    I2 : float
        outer inclination
    omega1 : float
        inner argument of periapse
    omega2 : float
        outer argument of periapse
    dOmega : float
        difference in long. of nodes, Omega2-Omega1
    n1 : float
        inner mean motion
    n2 : float
        outer mean motion
    sinf1 : float
        sine of inner planet true anomaly
    cosf1 : float
        cosine of inner planet true anomaly
    sinf2 : float
        sine of outer planet true anomaly
    cosf2 : float
        cosine of outer planet true anomaly

    Returns
    -------
    (direct,indirect) : tuple
        Returns a tuple containing the direct and indirect parts
        of the interaction Hamiltonian
    """
    r1 = a1 * (1 - e1 * e1) / (1 + e1 * cosf1)
    _x1 = r1 * cosf1
    _y1 = r1 * sinf1
    _z1 = 0.
    x1, y1, z1 = EulerAnglesTransform(_x1, _y1, _z1, Omega1, inc1, omega1)

    vel1 = n1 * a1 / T.sqrt(1 - e1 * e1)
    _u1 = -1 * vel1 * sinf1
    _v1 = vel1 * (e1 + cosf1)
    _w1 = 0.
    u1, v1, w1 = EulerAnglesTransform(_u1, _v1, _w1, Omega1, inc1, omega1)

    r2 = a2 * (1 - e2 * e2) / (1 + e2 * cosf2)
    _x2 = r2 * cosf2
    _y2 = r2 * sinf2
    _z2 = 0.
    x2, y2, z2 = EulerAnglesTransform(_x2, _y2, _z2, Omega2, inc2, omega2)
    vel2 = n2 * a2 / T.sqrt(1 - e2 * e2)
    _u2 = -1 * vel2 * sinf2
    _v2 = vel2 * (e2 + cosf2)
    _w2 = 0.
    u2, v2, w2 = EulerAnglesTransform(_u2, _v2, _w2, Omega2, inc2, omega2)

    # direct term
    dx = x2 - x1
    dy = y2 - y1
    dz = z2 - z1
    dr2 = dx * dx + dy * dy + dz * dz
    direct = -1 / T.sqrt(dr2)
    # indirect terms
    indirect = u1 * u2 + v1 * v2 + w1 * w2
    return direct, indirect, [x1, y1, z1], [x2, y2, z2], [u1, v1,
                                                          w1], [u2, v2, w2]
Example #60
0
        def step(input_n, gamma, time_step, cell_previous, hid_previous, *args):
            hidden = T.dot(hid_previous, W_hid_stacked)

            # batch normalization of hidden states
            if deterministic:
                mean = self.running_mean[time_step]
                inv_std = self.running_inv_std[time_step]
            else:
                mean = hidden.mean(0)
                inv_std = T.inv(T.sqrt(hidden.var(0) + self.epsilon))

                self.running_mean_clone.default_update = \
                    T.set_subtensor(self.running_mean_clone.default_update[time_step],
                        (1-self.alpha) * self.running_mean_clone.default_update[time_step] + self.alpha * mean)
                self.running_inv_std_clone.default_update = \
                    T.set_subtensor(self.running_inv_std_clone.default_update[time_step],
                        (1-self.alpha) * self.running_inv_std_clone.default_update[time_step] + self.alpha * inv_std)
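                # The dummy `+ 0 *` terms below pull the running-statistic clones
                # into the computation graph so their default updates are applied.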
                mean += 0 * self.running_mean_clone[time_step]
                inv_std += 0 * self.running_inv_std_clone[time_step]

            gamma = gamma.dimshuffle('x', 0)
            mean = mean.dimshuffle('x', 0)
            inv_std = inv_std.dimshuffle('x', 0)

            # normalize
            normalized = (hidden - mean) * (gamma * inv_std)

            # Calculate gates pre-activations and slice
            gates = input_n + normalized

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*self.W_cell_to_ingate
                forgetgate += cell_previous*self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input

            if self.peepholes:
                outgate += cell*self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity(cell)
            return [cell, hid]