import theano.tensor as T
from theano.tensor.nnet import neighbours as neigh


def DSSIM(p, y, eps=1e-7):
    # Taken/modified from https://github.com/fchollet/keras/issues/4292
    # NaN issue avoided with T.maximum(x, eps)

    y_patch = neigh.images2neibs(y, [4, 4], mode='ignore_borders')
    p_patch = neigh.images2neibs(p, [4, 4], mode='ignore_borders')

    y_mean = T.mean(y_patch, axis=-1)
    p_mean = T.mean(p_patch, axis=-1)

    y_var = T.var(y_patch, axis=-1, corrected=True)
    p_var = T.var(p_patch, axis=-1, corrected=True)

    y_std = T.sqrt(T.maximum(y_var, eps))
    p_std = T.sqrt(T.maximum(p_var, eps))

    c1 = 0.01 ** 2
    c2 = 0.02 ** 2

    num = (2 * y_mean * p_mean + c1) * (2 * y_std * p_std + c2)
    denom = (T.pow(y_mean, 2) + T.pow(p_mean, 2) + c1) * (y_var + p_var + c2)

    ssim = num / T.maximum(denom, eps)

    return T.mean(1.0 - ssim)
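A minimal usage sketch for the loss above (hypothetical tensors and random data; assumes numpy and theano are importable alongside the imports shown):

import numpy as np
import theano

x_pred = T.tensor4('x_pred')    # (batch, channels, rows, cols)
x_true = T.tensor4('x_true')
dssim_fn = theano.function([x_pred, x_true], DSSIM(x_pred, x_true))

a = np.random.rand(2, 1, 8, 8).astype(theano.config.floatX)
b = np.random.rand(2, 1, 8, 8).astype(theano.config.floatX)
print(dssim_fn(a, b))           # scalar DSSIM value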
    def computeA(self, symmetric_double_encoder, params):

        regularization = 0
        if self._layer == -1:
            for layer in symmetric_double_encoder:
                hidden_x = layer.output_forward_x
                hidden_y = layer.output_forward_y

                cov_x = Tensor.dot(hidden_x, hidden_x.T)
                cov_y = Tensor.dot(hidden_y, hidden_y.T)

                regularization += Tensor.mean(Tensor.sum(abs(cov_x), axis=1, dtype=Tensor.config.floatX)) + Tensor.mean(
                    Tensor.sum(abs(cov_y), axis=1, dtype=Tensor.config.floatX))

        elif self._layer < len(symmetric_double_encoder):
            hidden_x = symmetric_double_encoder[self._layer].output_forward_x
            hidden_y = symmetric_double_encoder[self._layer].output_forward_y

            var_x = Tensor.var(hidden_x, axis=1)
            var_y = Tensor.var(hidden_y, axis=1)

            norm_x = Tensor.mean(Tensor.sum(hidden_x ** 2, axis=1, dtype=Tensor.config.floatX))
            norm_y = Tensor.mean(Tensor.sum(hidden_y ** 2, axis=1, dtype=Tensor.config.floatX))

            regularization -= norm_x
            regularization -= norm_y

            #
            # cov_x = Tensor.dot(hidden_x.T, hidden_x)
            # cov_y = Tensor.dot(hidden_y.T, hidden_y)
            #
            # regularization -= ((Tensor.sum(abs(cov_x))) + (Tensor.sum(abs(cov_y))))

        return self.weight * regularization
Example No. 3
 def build(self, output, tparams=None, BNparams=None):
     if self.BN_mode:
         self.BN_eps = npt(self.BN_eps)
         if not hasattr(self, 'BN_mean'):
             self.BN_mean = T.mean(output)
         if not hasattr(self, 'BN_std'):
             m2 = (1 + 1 / (T.prod(output.shape) - 1)).astype(floatX)
             self.BN_std = T.sqrt(m2 * T.var(output) + self.BN_eps)
         if self.BN_mode == 2:
             t_mean = T.mean(output, axis=[0, 2, 3], keepdims=True)
             t_var = T.var(output, axis=[0, 2, 3], keepdims=True)
             BN_mean = BNparams[p_(self.prefix, 'mean')].dimshuffle(
                 'x', 0, 'x', 'x')
             BN_std = BNparams[p_(self.prefix, 'std')].dimshuffle(
                 'x', 0, 'x', 'x')
             output = ifelse(
                 self.training,
                 (output - t_mean) / T.sqrt(t_var + self.BN_eps),
                 (output - BN_mean) / BN_std)
             output *= tparams[p_(self.prefix, 'BN_scale')].dimshuffle(
                 'x', 0, 'x', 'x')
             output += tparams[p_(self.prefix, 'BN_shift')].dimshuffle(
                 'x', 0, 'x', 'x')
         elif self.BN_mode == 1:
             t_mean = T.mean(output)
             t_var = T.var(output)
             output = ifelse(
                 self.training,
                 (output - t_mean) / T.sqrt(t_var + self.BN_eps),
                 ((output - BNparams[p_(self.prefix, 'mean')])
                  / BNparams[p_(self.prefix, 'std')]))
             output *= tparams[p_(self.prefix, 'BN_scale')]
             output += tparams[p_(self.prefix, 'BN_shift')]
     self.output = self.activation(output)
Example No. 4
    def __init__(self, network):
        self.network = network
        self.parameters = network.parameters

        num_trials = self.parameters.num_trials
        n_layers = network.n_layers
        self.channels = {}

        for channel in self.training_values:
            self.channels[channel] = np.zeros((n_layers, num_trials))
        for channel in self.training_mean_std:
            self.channels[channel] = np.zeros((n_layers, num_trials, 2))

        outputs = []

        for layer in range(n_layers):
            if layer == 0:
                X = self.network.X
            else:
                X = self.network.Y[layer - 1]
            Y = self.network.Y[layer]
            Q = self.network.Q[layer]
            W = self.network.W[layer]
            theta = self.network.theta[layer]
            y_bar = Y.mean()
            Cyy_bar = (Y.T.dot(Y) / network.parameters.batch_size).mean()
            outputs.extend([y_bar, Cyy_bar])

            X_rec = Y.dot(Q.T)
            X_rec_norm = T.sqrt(T.sum(T.sqr(X_rec), axis=1, keepdims=True))
            X_norm = T.sqrt(T.sum(T.sqr(X), axis=1, keepdims=True))
            X_rec_bar = X_rec_norm.mean()
            X_rec_std = X_rec_norm.std()
            outputs.extend([X_rec_bar, X_rec_std])

            X_bar = X_norm.mean()
            X_std = X_norm.std()
            outputs.extend([X_bar, X_std])

            SNR_Norm = T.mean(T.var(X, axis=0)) / T.mean(
                T.var(X - X_rec * X_norm / X_rec_norm, axis=0))
            SNR = T.mean(T.var(X, axis=0)) / T.mean(
                T.var(X - X_rec_norm, axis=0))
            outputs.extend([SNR, SNR_Norm])

            Q_norm = T.sqrt(T.sum(T.sqr(Q), axis=0))
            Q_bar = Q_norm.mean()
            Q_std = Q_norm.std()
            outputs.extend([Q_bar, Q_std])

            W_bar = W.mean()
            W_std = W.std()
            outputs.extend([W_bar, W_std])

            theta_bar = theta.mean()
            theta_std = theta.std()
            outputs.extend([theta_bar, theta_std])

        self.f = theano.function([], outputs)
Example No. 5
    def decorate(self, layer):
        if self.onTrain:
            std = tt.sqrt(tt.var(layer.outputs) + self.espilon)
            layer.output = (layer.output - tt.mean(layer.output)) / std

        if self.onTest:
            std = tt.sqrt(tt.var(layer.testOutputs) + self.espilon)
            layer.testOutput = (layer.testOutput -
                                tt.mean(layer.testOutput)) / std
Example No. 6
    def __init__(self, network):
        self.network = network
        self.parameters = network.parameters

        num_trials = self.parameters.num_trials
        n_layers = network.n_layers
        self.channels = {}

        for channel in self.training_values:
            self.channels[channel] = np.zeros((n_layers, num_trials))
        for channel in self.training_mean_std:
            self.channels[channel] = np.zeros((n_layers, num_trials, 2))

        outputs = []

        for layer in range(n_layers):
            if layer == 0:
                X = self.network.X
            else:
                X = self.network.Y[layer-1]
            Y = self.network.Y[layer]
            Q = self.network.Q[layer]
            W = self.network.W[layer]
            theta = self.network.theta[layer]
            y_bar = Y.mean()
            Cyy_bar = (Y.T.dot(Y)/network.parameters.batch_size).mean()
            outputs.extend([y_bar, Cyy_bar])

            X_rec = Y.dot(Q.T)
            X_rec_norm = T.sqrt(T.sum(T.sqr(X_rec), axis=1, keepdims=True))
            X_norm = T.sqrt(T.sum(T.sqr(X), axis=1, keepdims=True))
            X_rec_bar = X_rec_norm.mean()
            X_rec_std = X_rec_norm.std()
            outputs.extend([X_rec_bar, X_rec_std])

            X_bar = X_norm.mean()
            X_std = X_norm.std()
            outputs.extend([X_bar, X_std])

            SNR_Norm = T.mean(T.var(X, axis=0)) / T.mean(
                T.var(X - X_rec * X_norm / X_rec_norm, axis=0))
            SNR = T.mean(T.var(X, axis=0)) / T.mean(
                T.var(X - X_rec_norm, axis=0))
            outputs.extend([SNR, SNR_Norm])
            
            Q_norm = T.sqrt(T.sum(T.sqr(Q), axis=0))
            Q_bar = Q_norm.mean()
            Q_std = Q_norm.std()
            outputs.extend([Q_bar, Q_std])

            W_bar = W.mean()
            W_std = W.std()
            outputs.extend([W_bar, W_std])

            theta_bar = theta.mean()
            theta_std = theta.std()
            outputs.extend([theta_bar, theta_std])

        self.f = theano.function([], outputs)
Example No. 7
    def instance(self, train_x, infer_x, dropout=None, epsilon=1e-8, **kwargs):
        """Returns (train_output, inference_output, statistics_updates, train_reconstruction, infer_reconstruction)"""

        # dropout
        dropout = dropout or 0.
        mask = self.srng.binomial(n=1, p=1 - dropout, size=train_x.shape)
        # cast because int * float32 = float64 which does not run on GPU
        train_x = train_x * T.cast(mask, theano.config.floatX)

        # outputs with batch-specific normalization
        train_lin_output = T.dot(train_x, self.t_W) + self.t_b
        train_lin_output.name = self.subname("trainLinOutput")
        batch_mean = T.mean(train_lin_output, axis=0)
        offset_output = train_lin_output - batch_mean
        batch_var = T.var(offset_output, axis=0)
        batch_sd = T.sqrt(batch_var + epsilon)
        normalized_lin_output = offset_output / batch_sd
        train_output = self.activation_fn(self.gamma * normalized_lin_output + self.beta)
        train_output.name = self.subname("trainOutput")

        # reconstruct batch-specific output
        W_T = self.t_W.T
        W_T.name = self.subname("W_T")
        recon_lin_output = T.dot(train_output, W_T) + self.t_decode_b
        recon_lin_output.name = self.subname("reconLinOutput")
        decode_batch_mean = T.mean(recon_lin_output, axis=0)
        recon_offset_output = recon_lin_output - decode_batch_mean
        decode_batch_var = T.var(recon_offset_output, axis=0)
        decode_batch_sd = T.sqrt(decode_batch_var + epsilon)
        normalized_recon_lin_output = recon_offset_output / decode_batch_sd
        reconstructed_output = self.activation_fn(self.decode_gamma * normalized_recon_lin_output + self.decode_beta)

        # outputs with rolling-average normalization
        infer_lin_output = T.dot(infer_x, self.t_W) + self.t_b
        infer_lin_output.name = self.subname("inferLinOutput")
        sd = T.sqrt(self.variance + epsilon)
        normalized_infer_lin_output = infer_lin_output - self.mean
        inference_output = self.activation_fn(self.gamma / sd * normalized_infer_lin_output + self.beta)
        infer_lin_output.name = self.subname("inferenceOutput")

        # reconstruct batch-specific output
        recon_infer_lin_output = T.dot(inference_output, W_T) + self.t_decode_b
        recon_infer_lin_output.name = self.subname("reconInferLinOutput")
        decode_sd = T.sqrt(self.decode_variance + epsilon)
        normalized_recon_infer_lin_output = recon_infer_lin_output - self.decode_mean
        recon_infer_output = self.activation_fn(self.decode_gamma / decode_sd * normalized_recon_infer_lin_output + self.decode_beta)

        # save exponential moving average for batch mean/variance
        statistics_updates = [
            (self.mean, self.alpha * self.mean + (1.0 - self.alpha) * batch_mean),
            (self.variance, self.alpha * self.variance + (1.0 - self.alpha) * batch_var),
            (self.decode_mean, self.alpha * self.decode_mean + (1.0 - self.alpha) * decode_batch_mean),
            (self.decode_variance, self.alpha * self.decode_variance + (1.0 - self.alpha) * decode_batch_var),
        ]

        return train_output, inference_output, statistics_updates, reconstructed_output, recon_infer_output
Example No. 8
 def f_prop(self, x):
     if x.ndim == 2:
         mean = T.mean(x, axis=0, keepdims=True)
         std = T.sqrt(T.var(x, axis=0, keepdims=True)+self.epsilon)
     elif x.ndim == 4:
         mean = T.mean(x, axis=(0,2,3), keepdims=True)
         std = T.sqrt(T.var(x, axis=(0,2,3), keepdims=True)+self.epsilon)
     
     normalized_x = (x-mean)/std
     self.z = self.gamma*normalized_x+self.beta
     return self.z
Example No. 9
def LayerNormalization(x, gamma, mask, estimated_mean=0.0, estimated_var=1.0):
    assert x.ndim == 3 or x.ndim == 2
    if x.ndim == 3:
        x_mean = T.mean(x, axis=2).dimshuffle(0, 1, 'x')
        x_var = T.var(x, axis=2).dimshuffle(0, 1, 'x')
        return gamma * (
            (x - x_mean) / T.sqrt(x_var + 1e-7)), x_mean[0, 0], x_var[0, 0]

    elif x.ndim == 2:
        x_mean = T.mean(x, axis=1).dimshuffle(0, 'x')
        x_var = T.var(x, axis=1).dimshuffle(0, 'x')
        return gamma * (
            (x - x_mean) / T.sqrt(x_var + 1e-7)), x_mean[0], x_var[0]
Example No. 10
    def get_result(self, input):
        # returns BN result for given input.
        epsilon = 1e-06

        if self.mode == 0:
            if self.run_mode == 0:
                now_mean = T.mean(input, axis=0)
                now_var = T.var(input, axis=0)
                now_normalize = (input - now_mean) / T.sqrt(
                    now_var + epsilon)  # should be broadcastable..
                output = self.gamma * now_normalize + self.beta
                #print ('norm.shape =')
                #print (now_normalize.shape.eval({x: np.random.rand(2,2).astype(dtype=theano.config.floatX)}))

                # mean, var update
                self.mean = self.momentum * self.mean + (
                    1.0 - self.momentum) * now_mean
                self.var = self.momentum * self.var + (1.0 - self.momentum) * (
                    self.input_shape[0] / (self.input_shape[0] - 1) * now_var)
            else:
                output = self.gamma * (
                    input - self.mean) / T.sqrt(self.var + epsilon) + self.beta

        else:
            # in CNN mode, gamma and beta exists for every single channel separately.
            # for each channel, calculate mean and std for (mini_batch_size * row * column) elements.
            # then, each channel has own scalar gamma/beta parameters.
            if self.run_mode == 0:
                now_mean = T.mean(input, axis=(0, 2, 3))
                now_var = T.var(input, axis=(0, 2, 3))
                # mean, var update
                self.mean = self.momentum * self.mean + (
                    1.0 - self.momentum) * now_mean
                self.var = self.momentum * self.var + (1.0 - self.momentum) * (
                    self.input_shape[0] / (self.input_shape[0] - 1) * now_var)
            else:
                now_mean = self.mean
                now_var = self.var
            # change shape to fit input shape
            now_mean = self.change_shape(now_mean)
            now_var = self.change_shape(now_var)
            now_gamma = self.change_shape(self.gamma)
            now_beta = self.change_shape(self.beta)

            output = now_gamma * (
                input - now_mean) / T.sqrt(now_var + epsilon) + now_beta

        return output
Example No. 11
def build_trainer(input_data,
                  input_mask,
                  target_data,
                  target_mask,
                  network_params,
                  output_layer,
                  cond_layer_list,
                  feat_reg,
                  updater,
                  learning_rate,
                  load_updater_params=None):
    output_score = get_output(output_layer, deterministic=False)
    frame_prd_idx = T.argmax(output_score, axis=-1)

    one_hot_target = T.extra_ops.to_one_hot(y=T.flatten(target_data, 1),
                                            nb_class=output_dim,
                                            dtype=floatX)

    output_score = T.reshape(x=output_score,
                             newshape=(-1, output_dim),
                             ndim=2)
    output_score = output_score - T.max(output_score, axis=-1, keepdims=True)
    output_score = output_score - T.log(T.sum(T.exp(output_score), axis=-1, keepdims=True))

    train_ce = -T.sum(T.mul(one_hot_target, output_score), axis=-1)*T.flatten(target_mask, 1)

    train_loss = T.sum(train_ce)/target_mask.shape[0]
    frame_loss = T.sum(train_ce)/T.sum(target_mask)

    frame_accr = T.sum(T.eq(frame_prd_idx, target_data)*target_mask)/T.sum(target_mask)

    train_feat_loss = 0
    for cond_layer in cond_layer_list:
        sample_feat = cond_layer.get_sample_feat()
        sample_feat_cost = T.var(sample_feat, axis=0)
        sample_feat_cost = -T.mean(sample_feat_cost)
        train_feat_loss += sample_feat_cost
    train_feat_loss /= len(cond_layer_list)

    train_total_loss = train_loss + train_feat_loss*feat_reg

    network_grads = theano.grad(cost=train_total_loss, wrt=network_params)
    network_grads_norm = T.sqrt(sum(T.sum(grad**2) for grad in network_grads))

    train_lr = theano.shared(lasagne.utils.floatX(learning_rate))
    train_updates, updater_params = updater(loss_or_grads=network_grads,
                                            params=network_params,
                                            learning_rate=train_lr,
                                            load_params_dict=load_updater_params)

    training_fn = theano.function(inputs=[input_data,
                                          input_mask,
                                          target_data,
                                          target_mask],
                                  outputs=[frame_loss,
                                           frame_accr,
                                           train_feat_loss,
                                           network_grads_norm],
                                  updates=train_updates)
    return training_fn, train_lr, updater_params
Example No. 12
def get_stats(input, stat=None):
    """
    Returns a dictionary mapping the name of the statistic to the result on the input.
    Currently gets mean, var, std, min, max, l1, l2.

    Parameters
    ----------
    input : tensor
        Theano tensor to grab stats for.

    Returns
    -------
    dict
        Dictionary of all the statistics expressions {string_name: theano expression}
    """
    stats = {
        'mean': T.mean(input),
        'var': T.var(input),
        'std': T.std(input),
        'min': T.min(input),
        'max': T.max(input),
        'l1': input.norm(L=1),
        'l2': input.norm(L=2),
        #'num_nonzero': T.sum(T.nonzero(input)),
    }
    stat_list = raise_to_list(stat)
    compiled_stats = {}
    if stat_list is None:
        return stats

    for stat in stat_list:
        if isinstance(stat, string_types) and stat in stats:
            compiled_stats.update({stat: stats[stat]})
    return compiled_stats
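For example, a caller could build the monitoring expressions like this (a hypothetical sketch, assuming theano.tensor is imported as T):

x = T.matrix('x')
all_stats = get_stats(x)                          # every statistic expression
some_stats = get_stats(x, stat=['mean', 'std'])   # only the requested names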
Example No. 13
def ZCA(data, n_component=2):
    '''
    m is the number of data points
    n is the dimension of the data

    :param data: <numpy matrix, (m,n)> imput data
    :param n_component: <int> number of dimension to be extracted
    :return:
    '''

    # data standardization
    x = T.matrix('x')
    eps = T.scalar('eps')
    y = (x - T.mean(x, axis=0)) / T.sqrt(T.var(x) + eps)
    standardize = th.function([x, eps], y)

    # zca whitening
    x_n = T.matrix('x_n')  # normalized input
    eps2 = T.scalar('eps2')  # small esp to prevent div by zero
    x_cov = T.dot(x_n.T, x_n) / x_n.shape[0]  # variance of input
    u, s, v = T.nlinalg.svd(x_cov)

    z = T.dot(T.dot(u, T.nlinalg.diag(1. / T.sqrt(s + eps2))), u.T)
    x_zca = T.dot(x_n, z.T[:, :n_component])
    zca_whiten = th.function([x_n, eps2], x_zca)
    return zca_whiten(standardize(data, 0.1), 0.01)
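A hypothetical call on random data (the snippet itself assumes theano is imported as th and theano.tensor as T):

import numpy as np
import theano

data = np.random.randn(200, 16).astype(theano.config.floatX)  # 200 samples, 16 features
data_zca = ZCA(data, n_component=2)                            # whitened projection, shape (200, 2)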
Example No. 14
	def decorate(self, layer) :
		if not hasattr(layer, "batchnorm_W") or not hasattr(layer, "batchnorm_b") :
			self.paramShape = layer.getOutputShape()#(layer.nbOutputs, )
			self.WInitialization.initialize(self)
			self.bInitialization.initialize(self)

			layer.batchnorm_W = self.W
			layer.batchnorm_b = self.b

			mu = tt.mean(layer.outputs)
			sigma = tt.sqrt( tt.var(layer.outputs) + self.epsilon )
			layer.outputs = layer.batchnorm_W * ( (layer.outputs - mu) / sigma ) + layer.batchnorm_b

			mu = tt.mean(layer.testOutputs)
			sigma = tt.sqrt( tt.var(layer.testOutputs) + self.epsilon )
			layer.testOutputs = layer.batchnorm_W * ( (layer.testOutputs - mu) / sigma ) + layer.batchnorm_b
Example No. 15
    def get_output_for(self,
                       input,
                       moving_avg_hooks=None,
                       deterministic=False,
                       *args,
                       **kwargs):
        if deterministic is False:
            m = T.mean(input, axis=self.axis, keepdims=True)
            v = T.sqrt(
                T.var(input, axis=self.axis, keepdims=True) + self.epsilon)
            m.name = "tensor:mean"
            v.name = "tensor:variance"

            key = "BatchNormLayer:movingavg"
            if key not in moving_avg_hooks:
                moving_avg_hooks[key] = []
            moving_avg_hooks[key].append(
                [[m, v], [self.mean_inference, self.variance_inference]])
        else:
            m = self.mean_inference
            v = self.variance_inference

        input_norm = (input - m) / v  # normalize
        y = self.gamma * input_norm + self.beta  # scale and shift
        return self.nonlinearity(y)
Example No. 16
 def _compute_training_statistics(self, input_):
     if self.n_iter:
         axes = (0, ) + tuple(
             (i + 1)
             for i, b in enumerate(self.population_mean[0].broadcastable)
             if b)
     else:
         axes = (0, ) + tuple(
             (i + 1)
             for i, b in enumerate(self.population_mean.broadcastable) if b)
     mean = input_.mean(axis=axes, keepdims=True)
     if self.n_iter:
         assert mean.broadcastable[1:] == self.population_mean[
             0].broadcastable
     else:
         assert mean.broadcastable[1:] == self.population_mean.broadcastable
     stdev = tensor.sqrt(
         tensor.var(input_, axis=axes, keepdims=True) +
         numpy.cast[theano.config.floatX](self.epsilon))
     if self.n_iter:
         assert stdev.broadcastable[1:] == self.population_stdev[
             0].broadcastable
     else:
         assert stdev.broadcastable[
             1:] == self.population_stdev.broadcastable
     add_role(mean, BATCH_NORM_MINIBATCH_ESTIMATE)
     add_role(stdev, BATCH_NORM_MINIBATCH_ESTIMATE)
     return mean, stdev
Example No. 17
    def process(self, input, tparams, BNparams):
        mode = 'full' if self.border_mode == 'same' else self.border_mode
        output = conv.conv2d(
            input=input,
            filters=tparams[p_(self.prefix, 'W')],
            image_shape=[self.batch_size, self.n_in[0]] + self.image_shape,
            filter_shape=[self.n_out] + self.n_in,
            border_mode=mode,
            subsample=self.stride)

        if self.border_mode == 'same':
            a1 = (self.filter_size[0] - 1) // 2
            b1 = (self.filter_size[1] - 1) // 2
            a2 = self.filter_size[0] - a1
            b2 = self.filter_size[1] - b1
            if a2 == 1:
                if b2 == 1:
                    output = output[:, :, a1:, b1:]
                else:
                    output = output[:, :, a1:, b1:-b2+1]
            else:
                if b2 == 1:
                    output = output[:, :, a1:-a2+1, b1:]
                else:
                    output = output[:, :, a1:-a2+1, b1:-b2+1]

        if self.with_bias:
            output += tparams[p_(self.prefix, 'b')].dimshuffle('x', 0, 'x', 'x')

        self.BN_mean = T.mean(output, axis=[0, 2, 3])
        m2 = (1 + 1 / (T.prod(output.shape) / self.n_out - 1)).astype(floatX)
        self.BN_std = T.sqrt(m2 * T.var(output, axis=[0, 2, 3])
                             + npt(self.BN_eps))
        return output
Example No. 18
    def __init__(self, rng, layers, mc_samples=None):
        
        self.layers = layers
        self.params = [param for layer in self.layers
                       for param in layer.params]
        self.cost = self.layers[-1].cost # function pointer
        
        if mc_samples is None:
            # Standard dropout network.
            try:
                self.preds = self.layers[-1].preds
                self.error = self.layers[-1].error # function pointer
            except:
                print('Could not access network outputs'
                      ' - did you pass a (non-dropout) input?'
                      )
        else:
            # mc_dropout network.
            self.mc_samples = mc_samples 
            mc_outputs, _ = theano.scan(lambda: self.layers[-1].output_dropout,
                                        outputs_info=None,
                                        n_steps = self.mc_samples)
            
            self.predictive_distribution_mean = T.mean(mc_outputs, axis=0)
            self.predictive_distribution_var = T.var(mc_outputs, axis=0)
            self.preds = T.argmax(self.predictive_distribution_mean, axis=1)
            self.error = self.__error_mc

        self.L1 = (
            T.sum([abs(layer.W).sum() for layer in self.layers]) 
        )
        self.L2_sqr = (
            T.sum([(layer.W ** 2).sum() for layer in self.layers])
        )
Example No. 19
        def batch_norm(X, gamma, beta, m_shared, v_shared, test, add_updates):
            if X.ndim > 2:
                output_shape = X.shape
                X = X.flatten(2)

            if test is False:
                m = T.mean(X, axis=0, keepdims=True)
                v = T.sqrt(T.var(X, axis=0, keepdims=True) + self.epsilon)

                mulfac = 1.0 / 1000
                if m_shared in add_updates:
                    add_updates[m_shared] = (
                        1.0 - mulfac) * add_updates[m_shared] + mulfac * m
                    add_updates[v_shared] = (
                        1.0 - mulfac) * add_updates[v_shared] + mulfac * v
                else:
                    add_updates[m_shared] = (1.0 -
                                             mulfac) * m_shared + mulfac * m
                    add_updates[v_shared] = (1.0 -
                                             mulfac) * v_shared + mulfac * v
            else:
                m = m_shared
                v = v_shared

            X_hat = (X - m) / v
            y = gamma * X_hat + beta

            if X.ndim > 2:
                y = T.reshape(y, output_shape)
            return y
Example No. 20
        def batch_norm(X, gamma, beta, m_shared, v_shared, test, add_updates):
            if X.ndim > 2:
                output_shape = X.shape
                X = X.flatten(2)
 
            if test is False:
                m = T.mean(X, axis=0, keepdims=True)
                v = T.sqrt(T.var(X, axis=0, keepdims=True) + self.epsilon)
                
                mulfac = 1.0/1000
                if m_shared in add_updates:
                    add_updates[m_shared] = (1.0-mulfac)*add_updates[m_shared] + mulfac*m
                    add_updates[v_shared] = (1.0-mulfac)*add_updates[v_shared] + mulfac*v
                else:
                    add_updates[m_shared] = (1.0-mulfac)*m_shared + mulfac*m
                    add_updates[v_shared] = (1.0-mulfac)*v_shared + mulfac*v
            else:
                m = m_shared
                v = v_shared
            
            X_hat = (X - m) / v
            y = gamma*X_hat + beta
 
            if X.ndim > 2:
                y = T.reshape(y, output_shape)
            return y
Example No. 21
 def layer_var(self):
     # square of L2 norm ; one regularization option is to enforce
     # square of L2 norm to be small
     var = []
     for layer in self.layers:
         var.append(T.var(layer.W))
     return var
Example No. 22
 def layer_var(self):
     # square of L2 norm ; one regularization option is to enforce
     # square of L2 norm to be small
     var = []
     for layer in self.layers:
         var.append(T.var(layer.W))
     return var
Example No. 23
def layer_normalization(x, bias=None, scale=None, eps=1e-5):
  """
  Layer Normalization, https://arxiv.org/abs/1607.06450
  x is mean and variance normalized along its feature dimension.
  After that, we allow a bias and a rescale. This is supposed to be trainable.
  :param x: 3d tensor (time,batch,dim) (or any ndim, last dim is expected to be dim)
  :param bias: 1d tensor (dim) or None
  :param scale: 1d tensor (dim) or None
  """
  mean = T.mean(x, axis=x.ndim - 1, keepdims=True)
  std = T.sqrt(T.var(x, axis=x.ndim - 1, keepdims=True) + numpy.float32(eps))
  assert mean.ndim == std.ndim == x.ndim
  output = (x - mean) / std
  assert output.ndim == x.ndim
  if scale is not None:
    assert scale.ndim == 1
    scale = scale.dimshuffle(*(('x',) * (x.ndim - 1) + (0,)))
    assert scale.ndim == x.ndim
    output = output * scale
  if bias is not None:
    assert bias.ndim == 1
    bias = bias.dimshuffle(*(('x',) * (x.ndim - 1) + (0,)))
    assert bias.ndim == x.ndim
    output = output + bias
  return output
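A minimal usage sketch with hypothetical shapes (assumes numpy, theano and theano.tensor as T are imported, as the function itself requires):

import numpy
import theano

dim = 8
x = T.tensor3('x')       # (time, batch, dim)
scale = theano.shared(numpy.ones(dim, dtype=theano.config.floatX))
bias = theano.shared(numpy.zeros(dim, dtype=theano.config.floatX))
normalized = layer_normalization(x, bias=bias, scale=scale)
f = theano.function([x], normalized)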
Example No. 24
    def normalize_samples(self, x, gamma, beta):
        OutputLog().write('Normalizing Samples')
        mean = Tensor.mean(x, axis=1, keepdims=True)
        var = Tensor.var(x, axis=1, keepdims=True)

        normalized_output = (x - mean) / Tensor.sqrt(var + self.epsilon)
        return normalized_output / gamma + beta
Example No. 25
def ln(input, alpha, beta=None):
    output = (input - T.mean(input, axis=1, keepdims=True)
              ) / T.sqrt(T.var(input, axis=1, keepdims=True) + eps)
    output *= alpha[None, :]
    if beta is not None:
        output += beta[None, :]
    return output
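A hypothetical usage, assuming everything lives in one module where eps is defined and theano.tensor is imported as T:

import numpy
import theano

eps = 1e-5
h = T.matrix('h')         # (batch, features)
alpha = theano.shared(numpy.ones(32, dtype=theano.config.floatX))
beta = theano.shared(numpy.zeros(32, dtype=theano.config.floatX))
h_norm = ln(h, alpha, beta)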
Example No. 26
 def _normalize_input(self):
     X = T.matrix('X')
     results, updates = theano.scan(
         lambda x_i: (x_i - T.mean(x_i)) / T.sqrt(T.var(x_i) + 10),
         sequences=[X]
     )
     return theano.function(inputs=[X], outputs=results)
Example No. 27
    def instance(self, train_x, infer_x, dropout=None, epsilon=1e-8, **kwargs):
        """Returns (train_output, inference_output, statistics_updates)"""

        # dropout
        dropout = dropout or 0.
        mask = self.srng.binomial(n=1, p=1 - dropout, size=train_x.shape)
        # cast because int * float32 = float64 which does not run on GPU
        train_x = train_x * T.cast(mask, theano.config.floatX)

        # outputs with batch-specific normalization
        train_lin_output = T.dot(train_x, self.t_W) + self.t_b
        batch_mean = T.mean(train_lin_output, axis=0)
        offset_output = train_lin_output - batch_mean
        batch_var = T.var(offset_output, axis=0)
        normalized_lin_output = offset_output / T.sqrt(batch_var + epsilon)
        train_output = self.activation_fn(self.gamma * normalized_lin_output + self.beta)

        # outputs with rolling-average normalization
        infer_lin_output = T.dot(infer_x, self.t_W) + self.t_b
        sd = T.sqrt(self.variance + epsilon)
        inference_output = self.activation_fn(self.gamma / sd * infer_lin_output + (self.beta - (self.gamma * self.mean) / sd))

        # save exponential moving average for batch mean/variance
        statistics_updates = [
            (self.mean, self.alpha * self.mean + (1.0 - self.alpha) * batch_mean),
            (self.variance, self.alpha * self.variance + (1.0 - self.alpha) * batch_var)
        ]

        return train_output, inference_output, statistics_updates
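How the returned pieces are typically wired up (a sketch with a hypothetical layer object; assumes theano and theano.tensor as T are imported):

train_x = T.matrix('train_x')
infer_x = T.matrix('infer_x')
train_out, infer_out, stats_updates = layer.instance(train_x, infer_x, dropout=0.2)

# the exponential-moving-average updates ride along with the training function
train_fn = theano.function([train_x], train_out, updates=stats_updates)
infer_fn = theano.function([infer_x], infer_out)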
Example No. 28
    def get_output_for(self, input, deterministic=False, **kwargs):
        beta   = self.beta
        gamma  = self.gamma
        means  = self.means
        stdevs = self.stdevs

        output_shape = input.shape

        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        if deterministic == False:
            m = T.mean(input, axis=0, keepdims=False)
            s = T.sqrt(T.var(input, axis=0, keepdims=False) + self.eta)

            means.default_update = self.alpha * means + (1-self.alpha) * m
            Es = self.alpha * stdevs + (1-self.alpha) * s
            u  = self.batch_size / (self.batch_size - 1)
            stdevs.default_update = u * Es

        else:
            m = means
            s = stdevs

        output = input - m
        output /= s

        # transform normalized outputs based on learned shift and scale
        if self.learn_transform is True:
            output = gamma * output + beta
        output = output.reshape(output_shape)
        return self.nonlinearity(output)
Example No. 29
 def add_param(self, param, name="", constraints=True,
               custom_update=None, custom_update_normalized=False, custom_update_exp_average=0,
               custom_update_condition=None, custom_update_accumulate_batches=None):
   """
   :type param: theano.SharedVariable
   :type name: str
   :rtype: theano.SharedVariable
   """
   param = super(Layer, self).add_param(param, name)
   if custom_update:
     # Handled in Device and Updater.
     param.custom_update = custom_update
     param.custom_update_normalized = custom_update_normalized
     param.custom_update_exp_average = custom_update_exp_average
     param.custom_update_condition = custom_update_condition
     param.custom_update_accumulate_batches = custom_update_accumulate_batches
   if constraints:
     if 'L1' in self.attrs and self.attrs['L1'] > 0:
       self.constraints += T.constant(self.attrs['L1'], name="L1", dtype='floatX') * abs(param).sum()
     if 'L2' in self.attrs and self.attrs['L2'] > 0:
       self.constraints += T.constant(self.attrs['L2'], name="L2", dtype='floatX') * (param**2).sum()
     if self.attrs.get('L2_eye', 0) > 0:
       L2_eye = T.constant(self.attrs['L2_eye'], name="L2_eye", dtype='floatX')
       if param.ndim == 2:
         eye = tiled_eye(param.shape[0], param.shape[1], dtype=param.dtype)
         self.constraints += L2_eye * ((param - eye)**2).sum()
       else:  # standard L2
         self.constraints += L2_eye * (param**2).sum()
     if 'varreg' in self.attrs and self.attrs['varreg'] > 0:
       self.constraints += self.attrs['varreg'] * (1.0 * T.sqrt(T.var(param)) - 1.0 / numpy.sum(param.get_value().shape))**2
   return param
Example No. 30
def layer_normalization(x, bias=None, scale=None, eps=1e-5):
    """
  Layer Normalization, https://arxiv.org/abs/1607.06450
  x is mean and variance normalized along its feature dimension.
  After that, we allow a bias and a rescale. This is supposed to be trainable.
  :param x: 3d tensor (time,batch,dim) (or any ndim, last dim is expected to be dim)
  :param bias: 1d tensor (dim) or None
  :param scale: 1d tensor (dim) or None
  """
    mean = T.mean(x, axis=x.ndim - 1, keepdims=True)
    std = T.sqrt(T.var(x, axis=x.ndim - 1, keepdims=True) + numpy.float32(eps))
    assert mean.ndim == std.ndim == x.ndim
    output = (x - mean) / std
    assert output.ndim == x.ndim
    if scale is not None:
        assert scale.ndim == 1
        scale = scale.dimshuffle(*(('x', ) * (x.ndim - 1) + (0, )))
        assert scale.ndim == x.ndim
        output = output * scale
    if bias is not None:
        assert bias.ndim == 1
        bias = bias.dimshuffle(*(('x', ) * (x.ndim - 1) + (0, )))
        assert bias.ndim == x.ndim
        output = output + bias
    return output
Example No. 31
    def get_output_for(self, input, moving_avg_hooks=None,
                       deterministic=False, *args, **kwargs):

        if deterministic is False:

            m = T.mean(input, axis=0, keepdims=True)
            m.name = "tensor:mean"
            v = T.sqrt(T.var(input, axis=0, keepdims=True) + self.epsilon)
            v.name = "tensor:variance"

            R = T.dot(((input - m)).T, ((input - m)))

            key = "WhiteningLayer:movingavg"
            if key not in moving_avg_hooks:
                moving_avg_hooks[key] = []
            moving_avg_hooks[key].append(
                [[self.R_inference], [self.W]])

            key = "BatchNormalizationLayer:movingavg"
            if key not in moving_avg_hooks:
                moving_avg_hooks[key] = []
            moving_avg_hooks[key].append(
                [[m, v, R], [self.mean_inference, self.variance_inference, self.R_inference]])
        else:
            m = self.mean_inference
            v = self.variance_inference

        input_hat = T.dot((input - m), self.W.T) # normalize
        y = input_hat / self.gamma + self.beta  # scale and shift

        return y
Example No. 32
def get_stats(input, stat=None):
    """
    Returns a dictionary mapping the name of the statistic to the result on the input.
    Currently gets mean, var, std, min, max, l1, l2.

    Parameters
    ----------
    input : tensor
        Theano tensor to grab stats for.

    Returns
    -------
    dict
        Dictionary of all the statistics expressions {string_name: theano expression}
    """
    stats = {
        'mean': T.mean(input),
        'var': T.var(input),
        'std': T.std(input),
        'min': T.min(input),
        'max': T.max(input),
        'l1': input.norm(L=1),
        'l2': input.norm(L=2),
        #'num_nonzero': T.sum(T.nonzero(input)),
    }
    stat_list = raise_to_list(stat)
    compiled_stats = {}
    if stat_list is None:
        return stats

    for stat in stat_list:
        if isinstance(stat, six.string_types) and stat in stats:
            compiled_stats.update({stat: stats[stat]})
    return compiled_stats
Example No. 33
    def __init__(self, inputData, image_shape):
        self.input = inputData
        num_out = image_shape[-3]
        epsilon = 0.01
        self.image_shape = image_shape

        gamma_values = numpy.ones((num_out, ), dtype=theano.config.floatX)
        self.gamma_vals = theano.shared(value=gamma_values, borrow=True)

        beta_values = numpy.zeros((num_out, ), dtype=theano.config.floatX)
        self.beta_vals = theano.shared(value=beta_values, borrow=True)

        batch_mean = T.mean(self.input, keepdims=True, axis=(0, -2, -1))
        batch_var = T.var(self.input, keepdims=True,
                          axis=(0, -2, -1)) + epsilon

        self.batch_mean = self.adjustVals(batch_mean)
        batch_var = self.adjustVals(batch_var)
        self.batch_var = T.pow(batch_var, 0.5)

        batch_normalize = (inputData - self.batch_mean) / (T.pow(
            self.batch_var, 0.5))

        if self.input.ndim == 5:
            self.beta = self.beta_vals.dimshuffle('x', 'x', 0, 'x', 'x')
            self.gamma = self.gamma_vals.dimshuffle('x', 'x', 0, 'x', 'x')
        else:
            self.beta = self.beta_vals.dimshuffle('x', 0, 'x', 'x')
            self.gamma = self.gamma_vals.dimshuffle('x', 0, 'x', 'x')

        self.output = batch_normalize * self.gamma + self.beta
        #self.output=inputData-self.batch_mean

        self.params = [self.gamma_vals, self.beta_vals]
Example No. 34
 def make_consensus(self, networks, axis=2):
   cns = self.attrs['consensus']
   if cns == 'max':
     return T.max(networks, axis=axis)
   elif cns == 'min':
     return T.min(networks, axis=axis)
   elif cns == 'mean':
     return T.mean(networks, axis=axis)
   elif cns == 'flat':
     if self.depth == 1:
       return networks
     if axis == 2:
       return networks.flatten(ndim=3)
       #return T.reshape(networks, (networks.shape[0], networks.shape[1], T.prod(networks.shape[2:]) ))
     else:
       return networks.flatten(ndim=2) # T.reshape(networks, (networks.shape[0], T.prod(networks.shape[1:]) ))
   elif cns == 'sum':
     return T.sum(networks, axis=axis, acc_dtype=theano.config.floatX)
   elif cns == 'prod':
     return T.prod(networks, axis=axis)
   elif cns == 'var':
     return T.var(networks, axis=axis)
   elif cns == 'project':
     p = self.add_param(self.create_random_uniform_weights(self.attrs['n_out'], 1, self.attrs['n_out'] + self.depth + 1))
     return T.tensordot(p, networks, [[1], [axis]])
   elif cns == 'random':
     idx = self.rng.random_integers(size=(1,), low=0, high=self.depth)
     if axis == 0: return networks[idx]
     if axis == 1: return networks[:,idx]
     if axis == 2: return networks[:,:,idx]
     if axis == 3: return networks[:,:,:,idx]
     assert False, "axis too large"
   else:
     assert False, "consensus method unknown: " + cns
Example No. 35
    def get_output_for(self, input, moving_avg_hooks=None,
                       deterministic=False, *args, **kwargs):
            
        reshape = False
        if input.ndim > 2:
            output_shape = input.shape
            reshape = True
            input = input.flatten(2)

        if deterministic is False:
            m  = T.mean(input, axis=0, keepdims=True)
            v = T.sqrt(T.var(input, axis=0, keepdims=True)+self.epsilon)
            m.name = "tensor:mean-" + self.name
            v.name = "tensor:variance-" + self.name

            key = "BatchNormalizationLayer:movingavg"
            if key not in moving_avg_hooks:
#                moving_avg_hooks[key] = {}
                moving_avg_hooks[key] = []
#            moving_avg_hooks[key][self.name] = [[m,v], [self.mean_inference, self.variance_inference]]
            moving_avg_hooks[key].append([[m,v], [self.mean_inference, self.variance_inference]])
        else:
            m = self.mean_inference
            v = self.variance_inference

        input_hat = (input - m) / v            # normalize
        y = self.gamma*input_hat + self.beta        # scale and shift

        if reshape:#input.ndim > 2:
            y = T.reshape(y, output_shape)
        return self.nonlinearity(y)
Example No. 36
	def decorate(self, layer) :
		if not hasattr(layer, "batchnorm_W") or not hasattr(layer, "batchnorm_b") :
			self.paramShape = layer.getOutputShape()#(layer.nbOutputs, )
			self.WInitialization.initialize(self)
			self.bInitialization.initialize(self)

			layer.batchnorm_W = self.W
			layer.batchnorm_b = self.b

			mu = tt.mean(layer.outputs)
			sigma = tt.sqrt( tt.var(layer.outputs) + self.epsilon )
			layer.outputs = layer.batchnorm_W * ( (layer.outputs - mu) / sigma ) + layer.batchnorm_b

			mu = tt.mean(layer.testOutputs)
			sigma = tt.sqrt( tt.var(layer.testOutputs) + self.epsilon )
			layer.testOutputs = layer.batchnorm_W * ( (layer.testOutputs - mu) / sigma ) + layer.batchnorm_b
Example No. 37
 def add_param(self, param, name="", constraints=True,
               custom_update=None, custom_update_normalized=False, custom_update_exp_average=0,
               custom_update_condition=None, custom_update_accumulate_batches=None, live_update=None):
   """
   :type param: theano.SharedVariable
   :type name: str
   :rtype: theano.SharedVariable
   """
   param = super(Layer, self).add_param(param, name)
   param.live_update = live_update
   if custom_update:
     # Handled in Device and Updater.
     param.custom_update = custom_update
     param.custom_update_normalized = custom_update_normalized
     param.custom_update_exp_average = custom_update_exp_average
     param.custom_update_condition = custom_update_condition
     param.custom_update_accumulate_batches = custom_update_accumulate_batches
   if constraints:
     if 'L1' in self.attrs and self.attrs['L1'] > 0:
       self.constraints += T.constant(self.attrs['L1'], name="L1", dtype='floatX') * abs(param).sum()
     if 'L2' in self.attrs and self.attrs['L2'] > 0:
       self.constraints += T.constant(self.attrs['L2'], name="L2", dtype='floatX') * (param**2).sum()
     if self.attrs.get('L2_eye', 0) > 0:
       L2_eye = T.constant(self.attrs['L2_eye'], name="L2_eye", dtype='floatX')
       if param.ndim == 2:
         eye = tiled_eye(param.shape[0], param.shape[1], dtype=param.dtype)
         self.constraints += L2_eye * ((param - eye)**2).sum()
       else:  # standard L2
         self.constraints += L2_eye * (param**2).sum()
     if 'varreg' in self.attrs and self.attrs['varreg'] > 0:
       self.constraints += self.attrs['varreg'] * (1.0 * T.sqrt(T.var(param)) - 1.0 / numpy.sum(param.get_value().shape))**2
   return param
Example No. 38
def normalise(X):

    eps = 1e-4
    X_m = T.mean(X, keepdims=True, axis=0)
    X_var = T.var(X, keepdims=True, axis=0)
    X = (X - X_m) / (T.sqrt(X_var + eps))
    return X
Example No. 39
    def __init__(self,inputData,image_shape):
        self.input=inputData
        num_out=image_shape[1]
        epsilon=0.01
        self.image_shape=image_shape

        gamma_values = numpy.ones((num_out,), dtype=theano.config.floatX)
        self.gamma_vals = theano.shared(value=gamma_values, borrow=True)

        beta_values = numpy.zeros((num_out,), dtype=theano.config.floatX)
        self.beta_vals = theano.shared(value=beta_values, borrow=True)

        batch_mean=T.mean(self.input,keepdims=True,axis=(0,2,3))
        batch_var=T.var(self.input,keepdims=True,axis=(0,2,3))+epsilon

        self.batch_mean=self.adjustVals(batch_mean)
        batch_var=self.adjustVals(batch_var)
        self.batch_var=T.pow(batch_var,0.5)

        batch_normalize=(inputData-self.batch_mean)/(T.pow(self.batch_var,0.5))

        self.beta = self.beta_vals.dimshuffle('x', 0, 'x', 'x')
        self.gamma = self.gamma_vals.dimshuffle('x', 0, 'x', 'x')

        self.output=batch_normalize*self.gamma+self.beta
        #self.output=inputData-self.batch_mean

        self.params=[self.gamma_vals,self.beta_vals]
Example No. 40
    def get_output_for(self, input, deterministic=False, **kwargs):
        beta = self.beta
        gamma = self.gamma
        means = self.means
        stdevs = self.stdevs

        output_shape = input.shape

        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        if deterministic == False:
            m = T.mean(input, axis=0, keepdims=False)
            s = T.sqrt(T.var(input, axis=0, keepdims=False) + self.eta)

            means.default_update = self.alpha * means + (1 - self.alpha) * m
            Es = self.alpha * stdevs + (1 - self.alpha) * s
            u = self.batch_size / (self.batch_size - 1)
            stdevs.default_update = u * Es

        else:
            m = means
            s = stdevs

        output = input - m
        output /= s

        # transform normalized outputs based on learned shift and scale
        if self.learn_transform is True:
            output = gamma * output + beta
        output = output.reshape(output_shape)
        return self.nonlinearity(output)
Example No. 41
    def activations(self, dataset):
        prev_activations = self._prev_layer.activations(dataset)

        if prev_activations.ndim == 2:
            # flat dataset: (example, vector)
            mean = T.mean(prev_activations, axis=0)
            variance = T.var(prev_activations, axis=0)
        elif prev_activations.ndim == 3:
            # sequence dataset: (seq num, example, vector)
            mean = T.mean(prev_activations, axis=1).dimshuffle(0,'x',1)
            variance = T.var(prev_activations, axis=1).dimshuffle(0,'x',1)

        normalized = (prev_activations - mean) / T.sqrt(variance + self.EPSILON)
        scaled_and_shifted = (normalized * self._scale) + self._shift

        return scaled_and_shifted
Example No. 42
 def make_consensus(self, networks, axis=2):
   cns = self.attrs['consensus']
   if cns == 'max':
     return T.max(networks, axis=axis)
   elif cns == 'min':
     return T.min(networks, axis=axis)
   elif cns == 'mean':
     return T.mean(networks, axis=axis)
   elif cns == 'flat':
     if self.depth == 1:
       return networks
     if axis == 2:
       return networks.flatten(ndim=3)
       #return T.reshape(networks, (networks.shape[0], networks.shape[1], T.prod(networks.shape[2:]) ))
     else:
       return networks.flatten(ndim=2) # T.reshape(networks, (networks.shape[0], T.prod(networks.shape[1:]) ))
   elif cns == 'sum':
     return T.sum(networks, axis=axis, acc_dtype=theano.config.floatX)
   elif cns == 'prod':
     return T.prod(networks, axis=axis)
   elif cns == 'var':
     return T.var(networks, axis=axis)
   elif cns == 'project':
     p = self.add_param(self.create_random_uniform_weights(self.attrs['n_out'], 1, self.attrs['n_out'] + self.depth + 1))
     return T.tensordot(p, networks, [[1], [axis]])
   elif cns == 'random':
     idx = self.rng.random_integers(size=(1,), low=0, high=self.depth)
     if axis == 0: return networks[idx]
     if axis == 1: return networks[:,idx]
     if axis == 2: return networks[:,:,idx]
     if axis == 3: return networks[:,:,:,idx]
     assert False, "axis too large"
   else:
     assert False, "consensus method unknown: " + cns
Example No. 43
def kmeans(train_set_x):

    if train_set_x is None:
        train_set_x = T.matrix('train_set_x')

    ########################
    # Normalize the inputs #
    ########################

    epsilon_norm = 10
    epsilon_zca = 0.015
    K = 500

    train_set_x = (train_set_x - T.mean(train_set_x, axis=0)) / T.sqrt(
        T.var(train_set_x, axis=0) + epsilon_norm)

    #####################
    # Whiten the inputs #
    #####################

    # a simple choice of whitening transform is the ZCA whitening transform
    # epsilon_zca is small constant
    # for contrast-normalizaed data, setting epsilon_zca to 0.01 for 16-by-16 pixel patches,
    #                                                 or to  0.1 for 8-by-8   pixel patches
    # is good starting point
    cov = T.dot(train_set_x, T.transpose(train_set_x)) / train_set_x.shape[1]
    U, S, V = linalg.svd(cov)
    tmp = T.dot(U, T.diag(1/T.sqrt(S + epsilon_zca)))
    tmp = T.dot(tmp, T.transpose(U))
    whitened_x = T.dot(tmp, train_set_x)

    ######################
    # Training the Model #
    ######################

    # Initialization
    dimension_size = whitened_x.shape[0]
    num_samples = whitened_x.shape[1]
    srng = RandomStreams(seed=234)

    D = srng.normal(size=(dimension_size, K))
    D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    # typically 10 iterations is enough
    num_iteration = 15

    # compute new centroids, D_new
    for i in xrange(num_iteration):

        dx = T.dot(D.T, whitened_x)
        arg_max_dx = T.argmax(dx, axis=0)
        s = dx[arg_max_dx, T.arange(num_samples)]

        S = T.zeros((K, num_samples))
        S = T.set_subtensor(S[arg_max_dx, T.arange(num_samples)], s)
        D = T.dot(whitened_x, T.transpose(S)) + D

        D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    return D
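A hypothetical way to evaluate the symbolic centroid graph, assuming the snippet's own imports are in place (numpy as np, theano, theano.tensor as T, theano.tensor.nlinalg as linalg, RandomStreams) and a Python 2 interpreter for xrange:

X = T.matrix('X')                  # laid out as (features, samples), as the code expects
centroid_expr = kmeans(X)
get_centroids = theano.function([X], centroid_expr)

patches = np.random.randn(16, 1000).astype(theano.config.floatX)
D = get_centroids(patches)         # (16, 500) matrix of unit-norm centroids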
Example No. 44
    def activations(self, dataset):
        prev_activations = self._prev_layer.activations(dataset)

        if prev_activations.ndim == 2:
            # flat dataset: (example, vector)
            mean = T.mean(prev_activations, axis=0)
            variance = T.var(prev_activations, axis=0)
        elif prev_activations.ndim == 3:
            # sequence dataset: (seq num, example, vector)
            mean = T.mean(prev_activations, axis=1).dimshuffle(0, 'x', 1)
            variance = T.var(prev_activations, axis=1).dimshuffle(0, 'x', 1)

        normalized = (prev_activations - mean) / T.sqrt(variance +
                                                        self.EPSILON)
        scaled_and_shifted = (normalized * self._scale) + self._shift

        return scaled_and_shifted
Example No. 45
    def fprop(self, input):
        """Propogate the input through the layer."""
        output = input - T.mean(input, axis=1, keepdims=True)
        output = output / T.sqrt(T.var(input, axis=1, keepdims=True) + 1e-5)
        output = self.alpha.dimshuffle('x', 0) * output + \
            self.beta.dimshuffle('x', 0)  # scale and shift

        return output
Example No. 46
def Kmeans(X_train=None, K=300, epsilon_whitening=0.015):

    if X_train is None:
        X_train = T.matrix('X_train')

    ########################
    # Normalize the inputs #
    ########################

    # A constant added to the variance to avoid division by zero
    epsilon_norm = 10

    # We subtract from each training sample (each column in X_train) its mean
    # and divide by its standard deviation
    X_train = (X_train - T.mean(X_train, axis=0)) / T.sqrt(
        T.var(X_train, axis=0) + epsilon_norm)

    #####################
    # Whiten the inputs #
    #####################

    sigma = T.dot(X_train, T.transpose(X_train)) / X_train.shape[1]
    U, s, V = linalg.svd(sigma, full_matrices=False)
    tmp = T.dot(U, T.diag(1 / T.sqrt(s + epsilon_whitening)))
    tmp = T.dot(tmp, T.transpose(U))
    X_Whitened = T.dot(tmp, X_train)

    ######################
    # Training the Model #
    ######################

    # Initialization
    dimensions = X_Whitened.shape[0]
    samples = X_Whitened.shape[1]
    srng = RandomStreams(seed=234)

    # We initialize the centroids by sampling them from a normal
    # distribution, and then normalizing them to unit length
    # D \in R^{n \times k}
    D = srng.normal(size=(dimensions, K))
    D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    iterations = 30

    for i in xrange(iterations):

        # Initialize new point representations
        # for every pass of the algorithm
        S = T.zeros((K, samples))

        tmp = T.dot(D.T, X_Whitened)
        res = T.argmax(tmp, axis=0)
        max_values = tmp[res, T.arange(samples)]
        S = T.set_subtensor(S[res, T.arange(samples)], max_values)

        D = T.dot(X_Whitened, T.transpose(S))
        D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    return D
Example No. 47
def sample_elbo(model, population=None, samples=1, pi=1, vp=None):
    """ pi*KL[q(w|mu,rho)||p(w)] + E_q[log p(D|w)]
    approximated by Monte Carlo sampling

    Parameters
    ----------
    model : pymc3.Model
    population : dict - maps observed_RV to its population size
        if not provided defaults to full population
    samples : number of Monte Carlo samples used for approximation,
        defaults to 1
    pi : additional coefficient for KL[q(w|mu,rho)||p(w)] as proposed in [1]_
    vp : gelato.variational.utils.VariatioanalParams
        tuple, holding nodes mappings with shared params, if None - new
        will be created

    Returns
    -------
    (E_q[elbo], V_q[elbo], updates, VariationalParams)
        mean, variance of elbo, updates for random streams, shared dicts

    Notes
    -----
    You can pass tensors for `pi`  and `samples` to control them while
        training

    References
    ----------
    .. [1] Charles Blundell et al: "Weight Uncertainty in Neural Networks"
        arXiv preprint arXiv:1505.05424
    """
    if population is None:
        population = dict()
    if vp is None:
        vp = variational_replacements(model.root)
    x = flatten(vp.mapping.values())
    mu = flatten(vp.shared.means.values())
    rho = flatten(vp.shared.rhos.values())

    def likelihood(var):
        tot = population.get(var, population.get(var.name))
        logpt = tt.sum(var.logpt)
        if tot is not None:
            tot = tt.as_tensor(tot)
            logpt *= tot / var.size
        return logpt

    log_p_D = tt.add(*map(likelihood, model.root.observed_RVs))
    log_p_W = model.root.varlogpt + tt.sum(model.root.potentials)
    log_q_W = tt.sum(log_normal3(x, mu, rho))
    _elbo_ = log_p_D + pi * (log_p_W - log_q_W)
    _elbo_ = apply_replacements(_elbo_, vp)

    samples = tt.as_tensor(samples)
    elbos, updates = theano.scan(fn=lambda: _elbo_,
                                 outputs_info=None,
                                 n_steps=samples)
    return tt.mean(elbos), tt.var(elbos), updates, vp
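A hypothetical downstream use of the returned tuple (assumes a pymc3 model built through gelato and theano imported; not part of the original snippet):

elbo_mean, elbo_var, updates, vp = sample_elbo(model, samples=10)
elbo_fn = theano.function([], [elbo_mean, elbo_var], updates=updates)
mean_estimate, var_estimate = elbo_fn()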
Example No. 48
    def fprop(self, x, can_fit, eval):
        """
        x : input to the layer
        can_fit : 
        eval : 
        """
        # shape the input as a matrix (batch_size, n_inputs)
        self.x = x.flatten(2)

        # apply dropout mask
        if self.dropout < 1.:

            if eval == False:
                # The cast is important because
                # int * float32 = float64 which pulls things off the gpu

                # very slow ??
                # srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999))

                srng = theano.sandbox.rng_mrg.MRG_RandomStreams(self.rng.randint(999999))
                mask = T.cast(srng.binomial(n=1, p=self.dropout, size=T.shape(self.x)), theano.config.floatX)

                # apply the mask
                self.x = self.x * mask
            else:
                self.x = self.x * self.dropout

        # binarize the weights
        self.Wb = self.binarize_weights(self.W, eval)

        z = T.dot(self.x, self.Wb)

        # for BN updates
        self.z = z

        # batch normalization
        if self.BN == True:

            self.batch_mean = T.mean(z, axis=0)
            self.batch_var = T.var(z, axis=0)

            if can_fit == True:
                mean = self.batch_mean
                var = self.batch_var

            else:
                mean = self.mean
                var = self.var

            z = (z - mean) / (T.sqrt(var + self.BN_epsilon))
            z = self.a * z

        self.z = z + self.b

        # activation function
        y = self.activation(self.z)

        return y
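# --- Illustration (not from the original layer) ------------------------------
# Why evaluation multiplies by self.dropout: with keep probability p, the
# expected value of the masked activations is p * x, so scaling by p at test
# time matches the training-time expectation. p below is an arbitrary value.
import numpy as np

rng = np.random.RandomState(0)
p = 0.8
x = np.ones(100000)

mask = rng.binomial(n=1, p=p, size=x.shape)
print((mask * x).mean())   # ~0.8, the training-time expectation
print(p * x.mean())        # 0.8, the evaluation-time scaling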
Example No. 49
def mom(cost, params, learning_rate, runningGradientStats, activations,
        runningActGrad):
    updates = []

    resNumber = 0
    for res in params:
        insideNumber = 0
        for current_params in res:
            p_no = 0
            for p in current_params:  # weight, bias, gamma, beta
                p_no += 1
                if p_no == 4:
                    # the fourth parameter (beta) is left untouched here
                    break
                g = T.grad(cost, p)
                # gradient step, with the updated parameter clipped to [-1, 1]
                updates.append((p, T.clip(p - learning_rate * g, -1.0, 1.0)))
                # update the running mean/variance of the weight gradient
                if p_no == 1:
                    mu = T.mean(g)
                    sigma2 = T.var(g)
                    updates.append(
                        (runningGradientStats[resNumber][insideNumber][0],
                         0.9 * runningGradientStats[resNumber][insideNumber][0]
                         + 0.1 * mu))
                    updates.append(
                        (runningGradientStats[resNumber][insideNumber][1],
                         0.9 * runningGradientStats[resNumber][insideNumber][1]
                         + 0.1 * sigma2))
            insideNumber += 1
        resNumber += 1
    resNumber = 0
    for res in activations:
        insideNumber = 0
        for a in res:
            g = T.grad(cost, a)
            mu = T.mean(g)
            sigma2 = T.var(g)
            updates.append(
                (runningActGrad[resNumber][insideNumber][0],
                 0.9 * runningActGrad[resNumber][insideNumber][0] + 0.1 * mu))
            updates.append((runningActGrad[resNumber][insideNumber][1],
                            0.9 * runningActGrad[resNumber][insideNumber][1] +
                            0.1 * sigma2))
            insideNumber += 1
        resNumber += 1
    return updates
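# --- Illustration (not from the original optimizer) --------------------------
# The running gradient statistics above are exponential moving averages: each
# step keeps 90% of the stored value and mixes in 10% of the current batch
# statistic. The toy gradients below are illustrative.
import numpy as np

running_mu, running_var = 0.0, 0.0
for batch_grad in np.random.RandomState(0).randn(200, 256):
    running_mu = 0.9 * running_mu + 0.1 * batch_grad.mean()
    running_var = 0.9 * running_var + 0.1 * batch_grad.var()

print(running_mu, running_var)   # settle near the true mean (0) and variance (1)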
Example No. 50
    def _ln(self, x, lnb, lns):

        _eps = np.float32(1e-5)

        out = (x - T.mean(x, axis=-1, keepdims=True)
               ) / T.sqrt(T.var(x, axis=-1, keepdims=True) + _eps)
        out = lns * out + lnb

        return out
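# --- Quick check (illustrative, not from the source) -------------------------
# The normalisation in _ln gives each feature vector zero mean and unit
# variance along the last axis before the learned scale (lns) and shift (lnb).
import numpy as np

eps = np.float32(1e-5)
x = np.random.RandomState(0).randn(4, 16).astype('float32')
out = (x - x.mean(axis=-1, keepdims=True)) / np.sqrt(x.var(axis=-1, keepdims=True) + eps)

print(out.mean(axis=-1))   # ~0 for every row
print(out.var(axis=-1))    # ~1 for every row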
Example No. 51
    def compute_output(self, network, in_vw):
        super(MonitorVarianceNode, self).compute_output(network, in_vw)
        if network.find_hyperparameter(["monitor"]):
            network.create_vw(
                "var",
                variable=T.var(in_vw.variable),
                shape=(),
                tags={"monitor"},
            )
Example No. 52
    def fprop(self, x, can_fit, eval):

        # shape the input as a matrix (batch_size, n_inputs)
        self.x = x.flatten(2)

        # apply dropout mask
        if self.dropout < 1.:

            if eval == False:
                # The cast is important because
                # int * float32 = float64 which pulls things off the gpu

                # very slow ??
                # srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999))

                srng = theano.sandbox.rng_mrg.MRG_RandomStreams(
                    self.rng.randint(999999))
                mask = T.cast(
                    srng.binomial(n=1, p=self.dropout, size=T.shape(self.x)),
                    theano.config.floatX)

                # apply the mask
                self.x = self.x * mask
            else:
                self.x = self.x * self.dropout

        # binarize the weights
        self.Wb = self.binarize_weights(self.W, eval)

        z = T.dot(self.x, self.Wb)

        # for BN updates
        self.z = z

        # batch normalization
        if self.BN == True:

            self.batch_mean = T.mean(z, axis=0)
            self.batch_var = T.var(z, axis=0)

            if can_fit == True:
                mean = self.batch_mean
                var = self.batch_var

            else:
                mean = self.mean
                var = self.var

            z = (z - mean) / (T.sqrt(var + self.BN_epsilon))
            z = self.a * z

        self.z = z + self.b

        # activation function
        y = self.activation(self.z)

        return y
def Kmeans(X_train=None, K=300, epsilon_whitening=0.015):

    if X_train is None:
        X_train = T.matrix("X_train")

    ########################
    # Normalize the inputs #
    ########################

    # A constant added to the variance to avoid division by zero
    epsilon_norm = 10

    # We subtract from each training sample (each column in X_train) its mean
    # and divide by its standard deviation (the parentheses are needed so the
    # whole difference is scaled, not just the mean)
    X_train = (X_train - T.mean(X_train, axis=0)) / T.sqrt(T.var(X_train, axis=0) + epsilon_norm)

    #####################
    # Whiten the inputs #
    #####################

    sigma = T.dot(X_train, T.transpose(X_train)) / X_train.shape[1]
    U, s, V = linalg.svd(sigma, full_matrices=False)
    tmp = T.dot(U, T.diag(1 / T.sqrt(s + epsilon_whitening)))
    tmp = T.dot(tmp, T.transpose(U))
    X_Whitened = T.dot(tmp, X_train)

    ######################
    # Training the Model #
    ######################

    # Initialization
    dimensions = X_Whitened.shape[0]
    samples = X_Whitened.shape[1]
    srng = RandomStreams(seed=234)

    # We initialize the centroids by sampling them from a normal
    # distribution, and then normalizing them to unit length
    # D \in R^{n \times k}
    D = srng.normal(size=(dimensions, K))
    D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    iterations = 30

    for i in xrange(iterations):

        # Initialize new point representations
        # for every pass of the algorithm
        S = T.zeros((K, samples))

        tmp = T.dot(D.T, X_Whitened)
        res = T.argmax(tmp, axis=0)
        max_values = tmp[res, T.arange(samples)]
        S = T.set_subtensor(S[res, T.arange(samples)], max_values)

        D = T.dot(X_Whitened, T.transpose(S))
        D = D / T.sqrt(T.sum(T.sqr(D), axis=0))

    return D
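# --- Usage sketch (not part of the original snippet) -------------------------
# Kmeans only builds a symbolic graph, so it still has to be compiled with
# theano.function before anything runs. Assumes the imports used inside Kmeans
# (T, linalg, RandomStreams) are in scope; shapes and K below are illustrative,
# with samples stored as columns of the input matrix.
import numpy as np
import theano
import theano.tensor as T

X = T.matrix('X')                         # (n_features, n_samples)
D = Kmeans(X_train=X, K=50)
learn_dictionary = theano.function([X], D)

data = np.random.randn(64, 1000).astype(theano.config.floatX)
centroids = learn_dictionary(data)        # (64, 50), unit-norm columns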
Example No. 54
    def fprop(self, input):
        """"Propogate input through the layer."""
        if self.layer == 'fc':
            # Training time
            if self.run_mode == 0:
                mean_t = T.mean(input, axis=0)  # Compute mean
                var_t = T.var(input, axis=0)  # Compute variance
                # Subtract mean and divide by std
                norm_t = (input - mean_t) / T.sqrt(var_t + self.epsilon)
                # Add parameters
                output = self.gamma * norm_t + self.beta
                # Update mean and variance
                self.mean = self.momentum * self.mean + \
                    (1.0 - self.momentum) * mean_t
                self.var = self.momentum * self.var + (1.0 - self.momentum) \
                    * (self.input_shape[0] / (self.input_shape[0] - 1) * var_t)
            # Test time - use statistics from the training data
            else:
                output = self.gamma * (input - self.mean) / \
                    T.sqrt(self.var + self.epsilon) + self.beta

        elif self.layer == 'conv':
            if self.run_mode == 0:
                # Mean across every channel
                mean_t = T.mean(input, axis=(0, 2, 3))
                var_t = T.var(input, axis=(0, 2, 3))
                # mean, var update
                self.mean = self.momentum * self.mean + \
                    (1.0 - self.momentum) * mean_t
                self.var = self.momentum * self.var + (1.0 - self.momentum) * \
                    (self.input_shape[0] / (self.input_shape[0] - 1) * var_t)
            else:
                mean_t = self.mean
                var_t = self.var
            # change shape to fit input shape
            mean_t = self.change_shape(mean_t)
            var_t = self.change_shape(var_t)
            gamma_t = self.change_shape(self.gamma)
            beta_t = self.change_shape(self.beta)

            output = gamma_t * (input - mean_t) / \
                T.sqrt(var_t + self.epsilon) + beta_t
        return output
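# --- Illustration (not from the source) --------------------------------------
# The N/(N-1) factor applied to var_t when updating self.var converts the
# biased batch variance (what T.var computes) into an unbiased estimate for
# the stored population statistics.
import numpy as np

x = np.random.RandomState(0).randn(32, 10)
n = x.shape[0]

biased = x.var(axis=0)              # what T.var(input, axis=0) returns
unbiased = x.var(axis=0, ddof=1)    # unbiased (sample) variance
print(np.allclose(biased * n / (n - 1), unbiased))   # True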
Example No. 55
    def forward(self, x, train=True):
        if train or (not self.moving):
            if x.ndim == 2:
                mean = T.mean(x, axis=0)
                var = T.var(x, axis=0)
            elif x.ndim == 4:
                mean = T.mean(x, axis=(0, 2, 3))
                var = T.var(x, axis=(0, 2, 3))
            else:
                raise ValueError('input.shape must be (batch_size, dim) '
                                 'or (batch_size, filter_num, h, w).')
            if self.moving:
                bs = x.shape[0].astype(theano.config.floatX)
                mean_inf_next = (self.momentum*self.mean_inf +
                                 (1-self.momentum)*mean)
                var_inf_next = (self.momentum*self.var_inf
                                + (1-self.momentum)*var*bs/(bs-1.))
                self.updates = [(self.mean_inf, mean_inf_next),
                                (self.var_inf, var_inf_next)]
            else:
                self.updates = []
        else:
            mean = self.mean_inf
            var = self.var_inf

        if x.ndim == 4:
            mean = mean.dimshuffle('x', 0, 'x', 'x')
            var = var.dimshuffle('x', 0, 'x', 'x')

        output = (x-mean) / T.sqrt(var+self.eps)

        if self.gamma is not None:
            if x.ndim == 4:
                output *= self.gamma.dimshuffle('x', 0, 'x', 'x')
            else:
                output *= self.gamma
        if self.beta is not None:
            if x.ndim == 4:
                output += self.beta.dimshuffle('x', 0, 'x', 'x')
            else:
                output += self.beta

        return output
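# --- Wiring sketch (illustrative; the `layer` instance and the loss below are
# assumed, not taken from the source) -----------------------------------------
# The moving statistics above only advance when self.updates is handed to
# theano.function (typically together with the optimizer updates).
import theano
import theano.tensor as T

x = T.tensor4('x')
y_train = layer.forward(x, train=True)    # fills layer.updates
loss = T.mean(y_train ** 2)               # placeholder objective
train_fn = theano.function([x], loss, updates=layer.updates)

y_test = layer.forward(x, train=False)    # uses mean_inf / var_inf
test_fn = theano.function([x], y_test)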
Example No. 56
    def perform(self, x):
        EPSI = 1e-5

        S = self.params[0]
        b = self.params[1]

        x_ln = (x - T.mean(x, axis=-1, keepdims=True)) / T.sqrt(
            T.var(x, axis=-1, keepdims=True) + EPSI)
        if x.ndim == 3:
            return x_ln * S.dimshuffle('x', 'x', 0) + b.dimshuffle('x', 'x', 0)
        else:
            return x_ln * S.dimshuffle('x', 0) + b.dimshuffle('x', 0)
Example No. 57
    def _compute_training_statistics(self, input_):
        axes = (0,) + tuple((i + 1) for i, b in
                            enumerate(self.population_mean.broadcastable)
                            if b)
        mean = input_.mean(axis=axes, keepdims=True)
        assert mean.broadcastable[1:] == self.population_mean.broadcastable
        stdev = tensor.sqrt(tensor.var(input_, axis=axes, keepdims=True) +
                            numpy.cast[theano.config.floatX](self.epsilon))
        assert stdev.broadcastable[1:] == self.population_stdev.broadcastable
        add_role(mean, BATCH_NORM_MINIBATCH_ESTIMATE)
        add_role(stdev, BATCH_NORM_MINIBATCH_ESTIMATE)
        return mean, stdev
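# --- Illustration (not from Blocks itself) ------------------------------------
# How the reduction axes above are derived: the broadcastable pattern of the
# population mean marks the per-example dimensions to average over, and the
# batch axis 0 is always included. Example pattern for per-channel conv stats:
broadcastable = (False, True, True)   # (channels, height, width)
axes = (0,) + tuple(i + 1 for i, b in enumerate(broadcastable) if b)
print(axes)   # (0, 2, 3)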
Example No. 58
    def get_output(self, train):
        X = self.get_input(train)

        if self.mode == 0:
            X_normed = (X - self.running_mean) / self.running_std

        elif self.mode == 1:
            m = T.mean(X, self.axis, keepdims=True)
            std = T.sqrt(T.var(X, self.axis, keepdims=True) + self.epsilon)
            X_normed = (X - m) / std

        out = self.gamma * X_normed + self.beta
        return out