Example #1
0
 def call_on_params(self, x, params):
     """Build the network output for ``x`` from an explicit parameter list.

     ``params`` is sliced positionally: first ``hidden_depth + 2`` weights,
     then the same number of biases and, only when ``self.use_bn`` is set,
     the batch-norm gammas followed by the betas.

     Returns the symbolic output after the optional output activation.
     """
     n_affine = self.hidden_depth + 2
     weights = params[:n_affine]
     biases = params[n_affine:2 * n_affine]
     if self.use_bn:
         bn_params = params[2 * n_affine:]
         gammas = bn_params[:n_affine - 1]
         betas = bn_params[n_affine - 1:]

     def apply_layer(inp, idx):
         # Affine map, then optional activation, then optional batch norm.
         act = T.dot(inp, weights[idx]) + biases[idx]
         if self.hidden_activation:
             act = self.hidden_activation(act)
         if self.use_bn:
             act, _mean, _invstd = batch_normalization_train(
                 act, gamma=gammas[idx], beta=betas[idx])
         return act

     # Input layer first, then the ``hidden_depth`` hidden layers.
     h = apply_layer(x, 0)
     for j in range(self.hidden_depth):
         h = apply_layer(h, j + 1)

     # The output layer uses the last weight/bias pair and is never normalized.
     y = T.dot(h, weights[-1]) + biases[-1]
     if self.output_activation:
         y = self.output_activation(y)
     return y
Example #2
0
    def forward(self, X, is_training):
        """Apply ``X.dot(self.W)``, batch normalization and ``self.f``.

        In training mode the new running statistics returned by
        ``batch_normalization_train`` are recorded on ``self.running_update``
        as ``(variable, new_value)`` pairs. In test mode the stored running
        statistics are used and ``self.running_update`` is left untouched.
        """
        linear = X.dot(self.W)

        if not is_training:
            normalized = batch_normalization_test(linear, self.gamma,
                                                  self.beta,
                                                  self.running_mean,
                                                  self.running_var)
            return self.f(normalized)

        # The five results are, in order: normalized output, batch mean,
        # batch inverse standard deviation, new running mean and new running
        # variance (the last two serve as population estimates at test time).
        (normalized, _batch_mean, _batch_invstd,
         next_running_mean, next_running_var) = batch_normalization_train(
            linear,
            self.gamma,
            self.beta,
            running_mean=self.running_mean,
            running_var=self.running_var,
        )

        self.running_update = [
            (self.running_mean, next_running_mean),
            (self.running_var, next_running_var),
        ]
        return self.f(normalized)
Example #3
0
def batch_norm(input_,
               gamma,
               beta,
               running_mean,
               running_var,
               is_training,
               axes='per-activation',
               running_average_factor=0.9):
    """Batch-normalize ``input_`` in either training or inference mode.

    Parameters
    ----------
    input_, gamma, beta
        Forwarded unchanged to the Theano batch-normalization function.
    running_mean, running_var
        Current running statistics. They are used for normalization in
        inference mode and as the basis for the updated statistics in
        training mode.
    is_training
        Truthy selects ``batch_normalization_train``; otherwise
        ``batch_normalization_test`` is used.
    axes
        Axes specification forwarded to both Theano functions.
    running_average_factor
        Weight given to the *current batch* when updating the running
        statistics (only used in training mode). The default of 0.9 keeps
        the previous hard-coded behaviour; note that Theano's own default
        is 0.1, so 0.9 makes the running averages follow each batch closely.

    Returns
    -------
    tuple
        ``(out, running_mean, running_var)``. In training mode the last two
        are the updated running statistics; in inference mode they are the
        arguments passed in, unchanged.
    """
    if is_training:
        # batch_normalization_train returns, in order: normalized output,
        # batch mean, batch inverse standard deviation, new running mean
        # and new running variance. The batch statistics are not needed.
        out, _, _, running_mean, running_var = batch_normalization_train(
            input_,
            gamma,
            beta,
            running_mean=running_mean,
            running_var=running_var,
            axes=axes,
            running_average_factor=running_average_factor,
        )
    else:
        out = batch_normalization_test(
            input_,
            gamma,
            beta,
            running_mean,
            running_var,
            axes=axes,
        )
    return out, running_mean, running_var
Example #4
0
def test_batch_normalization_train_without_running_averages():
    """Compile and run ``batch_normalization_train`` without running averages."""
    utt.seed_rng()

    x = T.tensor4('x')
    scale = T.tensor4('scale')
    bias = T.tensor4('bias')
    dy = T.tensor4('dy')
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # Forward pass followed by the backward pass driven by a known gradient.
    out, x_mean, x_invstd = bn.batch_normalization_train(
        x, scale, bias, 'per-activation')
    grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})

    f = theano.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads)

    # None of the abstract Ops may remain in the compiled graph.
    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    for node in f.maker.fgraph.toposort():
        assert not isinstance(node.op, abstract_ops)

    def draw(shape):
        # Standard-normal sample cast to Theano's configured float type.
        return np.random.randn(*shape).astype(theano.config.floatX)

    # Draw order (X, Dy, Scale, Bias) is kept so seeded runs are unchanged.
    X = 4 + 3 * draw(data_shape)
    Dy = -1 + 2 * draw(data_shape)
    Scale = draw(param_shape)
    Bias = draw(param_shape)
    f(X, Scale, Bias, Dy)
Example #5
0
def test_batch_normalization_broadcastable():
    """Compile train/test BN graphs on fully broadcastable inputs.

    The test passes when compilation succeeds and no abstract batch-norm
    Op is left in the optimized graph.
    """
    names = ("x", "dy", "scale", "bias", "mean", "var")
    # Each input is a scalar lifted to five broadcastable dimensions.
    x, dy, scale, bias, mean, var = [
        T.scalar(name).dimshuffle(["x"] * 5) for name in names
    ]

    # Forward passes.
    out_train, x_mean, x_invstd = bn.batch_normalization_train(
        x, scale, bias, "spatial")
    out_test = bn.batch_normalization_test(
        x, scale, bias, mean, var, "spatial")

    # Backward passes.
    grads_train = T.grad(
        None, wrt=[x, scale, bias], known_grads={out_train: dy})
    grads_test = T.grad(
        None, wrt=[x, scale, bias], known_grads={out_test: dy})

    inputs = [x, scale, bias, mean, var, dy]
    outputs = [out_train, x_mean, x_invstd, out_test]
    f = theano.function(inputs, outputs + grads_train + grads_test)

    abstract_ops = (
        bn.AbstractBatchNormTrain,
        bn.AbstractBatchNormInference,
        bn.AbstractBatchNormTrainGrad,
    )
    leftovers = [
        node for node in f.maker.fgraph.toposort()
        if isinstance(node.op, abstract_ops)
    ]
    assert not leftovers
Example #6
0
def batchNorm(x, train, gamma, beta, RM, RV, ax):
    """Batch-normalize ``x``, choosing train or test mode symbolically.

    Both graphs are built. ``ifelse`` selects the inference output when
    ``train`` is not equal to 1 and the training output otherwise. The new
    running mean and variance from the training graph are always returned.
    """
    train_out, _, _, newRM, newRV = batch_normalization_train(
        x, gamma, beta, axes=ax, running_mean=RM, running_var=RV)
    use_inference = T.neq(train, 1)
    inference_out = batch_normalization_test(x, gamma, beta, RM, RV, axes=ax)
    values = ifelse(use_inference, inference_out, train_out)
    return values, newRM, newRV
Example #7
0
    def input_layer(self, input_data):
        """Reshape ``input_data`` for the convolution, optionally normalizing.

        Normalization runs when ``self.perform_normalization`` is ``"all"``
        or ``"only input"``. In that case the running-statistic updates are
        appended to ``self.updates``.
        """
        mode = self.perform_normalization
        normalize = mode == "all" or mode == "only input"

        if not normalize:
            return input_data.reshape(self.convolution_input_shape)

        # Scalar scale/shift and running statistics, created fresh per call.
        gamma = theano.shared(1.)
        bias = theano.shared(0.)
        running_mean = theano.shared(0.)
        running_var = theano.shared(0.)

        bn_outputs = batch_normalization_train(input_data,
                                               gamma,
                                               bias,
                                               axes=(0, 1),
                                               running_mean=running_mean,
                                               running_var=running_var)
        (normalized_input_data, _, _,
         new_running_mean, new_running_var) = bn_outputs

        output = normalized_input_data.reshape(self.convolution_input_shape)

        self.updates.append((running_mean, new_running_mean))
        self.updates.append((running_var, new_running_var))

        return output
    def forward(self, X, is_traning):
        """Linear map, batch normalization, then ``self.f``.

        ``is_traning`` (spelling kept for caller compatibility) selects the
        training graph. In that mode the pairs needed to update the running
        mean and variance are stored on ``self.running_update``.
        """
        pre_bn = X.dot(self.W)

        if is_traning:
            # Outputs: normalized value, batch mean, batch inverse std,
            # new running mean, new running variance.
            out, _mean, _invstd, next_mean, next_var = batch_normalization_train(
                pre_bn,
                self.gamma,
                self.beta,
                running_mean=self.running_mean,
                running_var=self.running_var)

            updates = []
            updates.append((self.running_mean, next_mean))
            updates.append((self.running_var, next_var))
            self.running_update = updates
        else:
            out = batch_normalization_test(pre_bn, self.gamma, self.beta,
                                           self.running_mean,
                                           self.running_var)

        return self.f(out)
Example #9
0
def test_batch_normalization_train_without_running_averages():
    """Build, compile and execute ``batch_normalization_train`` with no running averages."""
    utt.seed_rng()

    x, scale, bias, dy = [T.tensor4(name)
                          for name in ('x', 'scale', 'bias', 'dy')]
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)
    dtype = theano.config.floatX

    # Forward pass.
    out, x_mean, x_invstd = bn.batch_normalization_train(
        x, scale, bias, 'per-activation')
    # Backward pass driven by the externally supplied gradient ``dy``.
    grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
    # Compile everything into one function.
    f = theano.function([x, scale, bias, dy], [out, x_mean, x_invstd] + grads)

    # The optimizer must have replaced every abstract Op.
    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    assert not any(isinstance(node.op, abstract_ops)
                   for node in f.maker.fgraph.toposort())

    # Run on random data. The draw order is kept so seeded runs are unchanged.
    X = 4 + 3 * numpy.random.randn(*data_shape).astype(dtype)
    Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(dtype)
    Scale = numpy.random.randn(*param_shape).astype(dtype)
    Bias = numpy.random.randn(*param_shape).astype(dtype)
    f(X, Scale, Bias, Dy)
Example #10
0
 def forward(self, X, is_training):
     """Linear map followed by batch normalization and optional activation.

     In training mode the update rules for the running mean and variance
     are stored on ``self.running_update`` so they can be read from outside
     the layer. In test mode the stored running statistics are used.

     Returns the normalized output, passed through ``self.af`` unless
     ``self.af`` is ``None``.
     """
     activation = X.dot(self.W)
     if is_training:
         # Outputs: normalized value, batch mean, batch inverse std,
         # new running mean, new running variance.
         out, _b_mean, _b_invstd, new_mean, new_var = batch_normalization_train(
             activation,
             self.gamma,
             self.beta,
             running_mean=self.running_mean,
             running_var=self.running_var,
         )
         self.running_update = [
             (self.running_mean, new_mean),
             (self.running_var, new_var),
         ]
     else:
         out = batch_normalization_test(
             activation,
             self.gamma,
             self.beta,
             self.running_mean,
             self.running_var,
         )

     # Identity check: ``== None`` would go through the activation object's
     # own ``__eq__``, which is not what is meant here.
     if self.af is None:
         return out
     return self.af(out)
Example #11
0
    def convolution_layer(self, input_data):
        """Causal 1-D convolution layer with optional batch normalization.

        Creates the filter weights and bias, convolves ``input_data``,
        normalizes the result when ``self.perform_normalization == "all"``
        and applies ``self.activation``. The new weights and bias are
        appended to ``self.parameters``; running-statistic updates go to
        ``self.updates``.
        """
        # Weight and bias for the layer.
        initial_filters = np.asarray(
            np.random.normal(0, 1, size=self.filter_shape),
            dtype=theano.config.floatX)
        filter_weights = theano.shared(initial_filters,
                                       name="Filter weights",
                                       borrow=True)

        initial_bias = np.zeros((self.number_of_filters,),
                                dtype=theano.config.floatX)
        bias_convolution = theano.shared(initial_bias, borrow=True)

        # Convolution plus the bias on the second axis (others broadcast).
        convolution = causal_conv1d(input=input_data,
                                    filters=filter_weights,
                                    filter_shape=self.filter_shape,
                                    input_shape=self.convolution_input_shape)
        convolution_output = \
            convolution + bias_convolution.dimshuffle("x", 0, "x")

        pre_activation = convolution_output
        if self.perform_normalization == "all":
            gamma = theano.shared(1.)
            bias = theano.shared(0.)
            running_mean = theano.shared(0.)
            running_var = theano.shared(0.)

            bn_outputs = batch_normalization_train(convolution_output,
                                                   gamma,
                                                   bias,
                                                   axes=(0, 1, 2),
                                                   running_mean=running_mean,
                                                   running_var=running_var)
            (pre_activation, _, _,
             new_running_mean, new_running_var) = bn_outputs

            self.updates.append((running_mean, new_running_mean))
            self.updates.append((running_var, new_running_var))

        activation_output = self.activation(pre_activation)

        # Register the parameters to be updated.
        self.parameters.append(filter_weights)
        self.parameters.append(bias_convolution)

        return activation_output
Example #12
0
 def call(self, x):
     """Batch-normalize ``x`` per activation in training mode.

     Returns the normalized output together with the update pairs for
     ``self.mean`` and ``self.var``, each new value cast to ``float32``.
     """
     bn_outputs = BN.batch_normalization_train(
         inputs=x,
         gamma=self.gamma,
         beta=self.beta,
         axes='per-activation',
         running_mean=self.mean,
         running_var=self.var)
     # Only the output and the two new running statistics are needed.
     out, _mean, _invstd, newmean, newvar = bn_outputs

     updates = [
         (self.mean, T.cast(newmean, 'float32')),
         (self.var, T.cast(newvar, 'float32')),
     ]
     return out, updates
Example #13
0
    def fully_connected_layer(self, input_data):
        """Dense layer with optional batch normalization before activation.

        Creates the weight matrix and bias, computes the affine map,
        normalizes it when ``self.perform_normalization == "all"`` and
        applies ``self.activation``. Weights and bias are appended to
        ``self.parameters``; running-statistic updates go to
        ``self.updates``.
        """
        float_x = theano.config.floatX

        # Weight and bias for the layer.
        weight_init = np.asarray(
            np.random.normal(0, 1, size=self.fully_connected_layer_shape),
            dtype=float_x)
        W_fully_connected = theano.shared(weight_init,
                                          name="W fully connected",
                                          borrow=True)

        bias_init = np.zeros((self.fully_connected_layer_shape[1],),
                             dtype=float_x)
        bias_fully_connected = theano.shared(bias_init,
                                             name="bias fully connected",
                                             borrow=True)

        dot_output = \
            T.dot(input_data, W_fully_connected) + bias_fully_connected

        if self.perform_normalization != "all":
            output_fully_connected = self.activation(dot_output)
        else:
            gamma = theano.shared(1.)
            bias = theano.shared(0.)
            running_mean = theano.shared(0.)
            running_var = theano.shared(0.)

            (normalized_output, _, _,
             new_running_mean, new_running_var) = batch_normalization_train(
                dot_output,
                gamma,
                bias,
                axes=(0, 1),
                running_mean=running_mean,
                running_var=running_var)

            self.updates.append((running_mean, new_running_mean))
            self.updates.append((running_var, new_running_var))

            output_fully_connected = self.activation(normalized_output)

        # Register the parameters to be updated.
        self.parameters.append(W_fully_connected)
        self.parameters.append(bias_fully_connected)

        return output_fully_connected
Example #14
0
    def output_layer(self, input_data):
        """Output layer: affine map, optional batch norm, classification.

        Creates the output weight matrix and bias, computes the affine map
        and feeds it to ``self.classification_method``. Batch normalization
        is inserted only when ``self.perform_normalization == "all"``. The
        previous bare truthiness test also fired for any other non-empty
        string such as ``"only input"``.

        Weights and bias are appended to ``self.parameters``;
        running-statistic updates are appended to ``self.updates``.
        """
        # Weight and bias for the layer.
        W_output_layer = \
            theano.shared(
                np.asarray(
                    np.random.normal(0,
                                     1,
                                     size=self.output_layer_shape),
                    dtype=theano.config.floatX),
                name="W output",
                borrow=True)

        bias_output_layer = \
            theano.shared(np.zeros((self.output_layer_shape[1],),
                                   dtype=theano.config.floatX),
                          name="bias output",
                          borrow=True)

        dot_output = T.dot(input_data, W_output_layer) + bias_output_layer

        if self.perform_normalization == "all":
            gamma = theano.shared(1.)
            bias = theano.shared(0.)
            running_mean = theano.shared(0.)
            running_var = theano.shared(0.)

            # Outputs: normalized value, batch mean, batch inverse std,
            # new running mean, new running variance.
            normalized_output, _, _,\
                new_running_mean, new_running_var = \
                batch_normalization_train(dot_output,
                                          gamma,
                                          bias,
                                          axes=(0, 1),
                                          running_mean=running_mean,
                                          running_var=running_var)

            self.updates.append((running_mean, new_running_mean))
            self.updates.append((running_var, new_running_var))

            output = self.classification_method(normalized_output)

        else:
            output = self.classification_method(dot_output)

        # Register the parameters to be updated.
        self.parameters.append(W_output_layer)
        self.parameters.append(bias_output_layer)

        return output
 def forward(self, Z, is_training):
     """Compute ``self.f(BN(Z.dot(self.W)))`` for train or test mode.

     Training mode also stores the running mean/variance update pairs on
     ``self.running_update``.
     """
     pre_bn = Z.dot(self.W)

     if not is_training:
         normalized = batch_normalization_test(pre_bn, self.gamma, self.beta,
                                               self.rn_mean, self.rn_var)
         return self.f(normalized)

     # Outputs: normalized value, batch mean, batch inverse std,
     # new running mean, new running variance.
     normalized, _mean, _invstd, next_mean, next_var = batch_normalization_train(
         pre_bn,
         self.gamma,
         self.beta,
         running_mean=self.rn_mean,
         running_var=self.rn_var)
     self.running_update = [
         (self.rn_mean, next_mean),
         (self.rn_var, next_var),
     ]
     return self.f(normalized)
Example #16
0
def test_batch_normalization_broadcastable():
    """Compile train and test BN graphs whose inputs are all broadcastable.

    Succeeds when the function compiles and the optimized graph contains
    no abstract batch-norm Op.
    """
    def bc_scalar(name):
        # A scalar expanded to five broadcastable dimensions.
        return T.scalar(name).dimshuffle(['x'] * 5)

    x = bc_scalar('x')
    dy = bc_scalar('dy')
    scale = bc_scalar('scale')
    bias = bc_scalar('bias')
    mean = bc_scalar('mean')
    var = bc_scalar('var')

    # Forward pass.
    out_train, x_mean, x_invstd = bn.batch_normalization_train(
        x, scale, bias, 'spatial')
    out_test = bn.batch_normalization_test(
        x, scale, bias, mean, var, 'spatial')
    # Backward pass.
    grads_train = T.grad(None, wrt=[x, scale, bias],
                         known_grads={out_train: dy})
    grads_test = T.grad(None, wrt=[x, scale, bias],
                        known_grads={out_test: dy})
    # Compile.
    all_outputs = ([out_train, x_mean, x_invstd, out_test] +
                   grads_train + grads_test)
    f = theano.function([x, scale, bias, mean, var, dy], all_outputs)

    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    for node in f.maker.fgraph.toposort():
        assert not isinstance(node.op, abstract_ops)
    def forward(self, X, is_training, decay=0.9):
        """Linear map, batch normalization, then ``self.f``.

        Parameters
        ----------
        X
            Layer input; ``X.dot(self.W)`` is normalized.
        is_training
            Truthy builds the training graph and records the running
            statistic updates on ``self.rn_update``. Otherwise the stored
            running statistics are used.
        decay
            Fraction of the old running statistic kept on each update. It
            is passed to Theano as ``running_average_factor = 1 - decay``,
            so the default of 0.9 corresponds to Theano's own default
            factor of 0.1. The argument used to be ignored.

        Returns
        -------
        The result of ``self.f`` applied to the normalized activations.
        """
        Z = X.dot(self.W)

        if is_training:
            # Outputs: normalized value, batch mean, batch inverse std,
            # new running mean, new running variance.
            Z, _batch_mean, _batch_invstd, new_rn_mean, new_rn_var = batch_normalization_train(
                Z,
                self.gamma,
                self.betta,
                running_mean=self.rn_mean,
                running_var=self.rn_var,
                running_average_factor=1 - decay)

            self.rn_update = [(self.rn_mean, new_rn_mean),
                              (self.rn_var, new_rn_var)]

        else:
            Z = batch_normalization_test(Z, self.gamma, self.betta,
                                         self.rn_mean, self.rn_var)

        return self.f(Z)
  def forward(self, X, is_training):
    """Return ``self.f`` of the batch-normalized ``X.dot(self.W)``.

    Training mode stores ``(variable, new_value)`` pairs for the running
    mean and variance on ``self.running_update``. Test mode normalizes
    with the stored running statistics.
    """
    linear = X.dot(self.W)

    if is_training:
      # batch_normalization_train returns, in order:
      #   batch-normalized output
      #   batch mean
      #   batch inverse standard deviation
      #   new running mean (population mean estimate)
      #   new running var (population variance estimate)
      bn_outputs = batch_normalization_train(
        linear,
        self.gamma,
        self.beta,
        running_mean=self.running_mean,
        running_var=self.running_var,
      )
      out, _batch_mean, _batch_invstd, next_mean, next_var = bn_outputs

      self.running_update = [
        (self.running_mean, next_mean),
        (self.running_var, next_var),
      ]
    else:
      out = batch_normalization_test(
        linear, self.gamma, self.beta, self.running_mean, self.running_var)

    return self.f(out)
def batch_norm(
  input_,
  gamma,
  beta,
  running_mean,
  running_var,
  is_training,
  axes='per-activation',
  running_average_factor=0.9):
  """Batch-normalize ``input_`` in either training or inference mode.

  Parameters
  ----------
  input_, gamma, beta
      Forwarded unchanged to the Theano batch-normalization function.
  running_mean, running_var
      Current running statistics. They are used for normalization in
      inference mode and as the basis for the updated statistics in
      training mode.
  is_training
      Truthy selects ``batch_normalization_train``; otherwise
      ``batch_normalization_test`` is used.
  axes
      Axes specification forwarded to both Theano functions.
  running_average_factor
      Weight given to the *current batch* when updating the running
      statistics (training mode only). The default of 0.9 keeps the
      previous hard-coded behaviour; Theano's own default is 0.1.

  Returns
  -------
  tuple
      ``(out, new_running_mean, new_running_var)``. The last two are
      ``None`` in inference mode so they cannot be used by accident.
  """
  if is_training:
    # batch_normalization_train returns, in order: normalized output,
    # batch mean, batch inverse standard deviation, new running mean and
    # new running variance. The batch statistics are not needed here.
    out, _, _, new_running_mean, new_running_var = batch_normalization_train(
      input_,
      gamma,
      beta,
      running_mean=running_mean,
      running_var=running_var,
      axes=axes,
      running_average_factor=running_average_factor,
    )
  else:
    # No statistics are updated at inference time.
    new_running_mean = None
    new_running_var = None
    out = batch_normalization_test(
      input_,
      gamma,
      beta,
      running_mean,
      running_var,
      axes=axes,
    )
  return out, new_running_mean, new_running_var
    def get_output(self, input, **kwargs):
        """Batch-normalize ``input`` and apply ``self.activation``.

        The stored ``beta``, ``gamma``, ``mean`` and ``var`` are dimshuffled
        so that a broadcastable ``'x'`` axis sits at every position listed
        in ``self.axes``. When ``self.deterministic`` is false the training
        function is used and default updates for the running statistics are
        set up; otherwise the test function is used with the stored
        statistics. ``kwargs`` is accepted but not used in this method.
        """
        # prepare dimshuffle pattern inserting broadcastable axes as needed
        param_axes = iter(list(range(input.ndim - len(self.axes))))
        pattern = [
            'x' if input_axis in self.axes else next(param_axes)
            for input_axis in range(input.ndim)
        ]

        # apply dimshuffle pattern to all parameters
        beta = self.beta.dimshuffle(pattern)
        gamma = self.gamma.dimshuffle(pattern)
        mean = self.mean.dimshuffle(pattern)
        var = self.var.dimshuffle(pattern)

        if not self.deterministic:
            # Positional arguments after beta: axes, epsilon,
            # running_average_factor (self.alpha), running_mean, running_var.
            normalized, _, _, mean_, var_ = bn.batch_normalization_train(
                input, gamma, beta, self.axes_org, self.epsilon, self.alpha,
                mean, var)

            # Update running mean and variance
            # Tricks adopted from Lasagne implementation
            # http://lasagne.readthedocs.io/en/latest/modules/layers/normalization.html
            running_mean = theano.clone(self.mean, share_inputs=False)
            running_var = theano.clone(self.var, share_inputs=False)
            # The new statistics are dimshuffled with self.non_bc_axes,
            # presumably to drop the broadcast axes again (TODO confirm).
            running_mean.default_update = mean_.dimshuffle(self.non_bc_axes)
            running_var.default_update = var_.dimshuffle(self.non_bc_axes)
            # NOTE(review): these two statements rebind self.mean / self.var
            # after ``normalized`` has already been built, so the clones do
            # not appear to be part of the returned graph. Lasagne adds the
            # zero term to the local values used for normalization instead.
            # Confirm that the running statistics are actually updated.
            self.mean += 0 * running_mean
            self.var += 0 * running_var
        else:
            normalized = bn.batch_normalization_test(input, gamma, beta, mean,
                                                     var, self.axes_org,
                                                     self.epsilon)
            # normalized, _, _, _, _ = bn.batch_normalization_train(
            #     input, gamma, beta, self.axes_org, self.epsilon, 0, mean, var)
            # normalized = (input - mean) * (gamma / T.sqrt(var)) + beta

        return self.activation(normalized)
Example #21
0
 def forward(self, prev_layer, train):
     """Dropout, linear map, batch norm or bias, then the activation.

     Parameters
     ----------
     prev_layer
         Symbolic output of the previous layer.
     train
         When equal to ``True`` and batch norm is enabled, the training
         batch-norm graph is built and the new running statistics are
         stored on ``self.n_running_mean``, ``self.n_running_variance``
         and ``self.n_norm_params``.

     Returns
     -------
     The activation ``self.A``. The dropout mask and pre-activation are
     also kept on ``self.drop`` and ``self.Z``.

     Raises
     ------
     ValueError
         If ``self.activation`` is not one of the supported names.
     """
     # NOTE(review): the dropout mask is applied regardless of ``train``
     # and without rescaling. Confirm that callers compensate (for example
     # by setting dropout_rate to 0 at test time).
     self.drop = self.rng.binomial(size=prev_layer.shape,
                                   p=1 - self.dropout_rate)
     prev_layer = prev_layer * self.drop
     self.Z = T.dot(prev_layer, self.weights)
     if self.batch_norm == True:
         if train == True:
             # Outputs: normalized value, batch mean, batch inverse std,
             # new running mean, new running variance.
             self.Z, _, _, self.n_running_mean, self.n_running_variance = batch_normalization_train(
                 self.Z,
                 self.gamma,
                 self.beta,
                 running_mean=self.running_mean,
                 running_var=self.running_variance)
             self.n_norm_params = [
                 self.n_running_mean, self.n_running_variance
             ]
         else:
             # ``self.n_norm_params`` is deliberately left untouched here.
             self.Z = batch_normalization_test(self.Z, self.gamma,
                                               self.beta, self.running_mean,
                                               self.running_variance)
     else:
         # Without batch norm the layer uses an ordinary bias term.
         self.Z += self.biases
         self.n_norm_params = []
     if self.activation == 'relu':
         self.A = T.nnet.nnet.relu(self.Z)
     elif self.activation == 'sigmoid':
         self.A = T.nnet.nnet.sigmoid(self.Z)
     elif self.activation == 'tanh':
         # tanh(z) equals 2*sigmoid(2z) - 1. The previous expression
         # 2*sigmoid(z) - 1 evaluated tanh(z/2) instead.
         self.A = T.tanh(self.Z)
     elif self.activation == 'leaky_relu':
         self.A = T.nnet.nnet.relu(self.Z, alpha=0.1)
     elif self.activation == 'softmax':
         self.A = T.nnet.nnet.softmax(self.Z)
     else:
         raise ValueError('Activation Error')
     return self.A
Example #22
0
def test_batch_normalization_train():
    """Check ``bn.batch_normalization_train`` against a hand-built reference.

    For every axes specification and tensor rank the forward outputs,
    running statistics and gradients of the library graph are compared
    with an explicit reference graph on random data.
    """
    utt.seed_rng()

    var_names = ('x', 'scale', 'bias', 'running_mean', 'running_var')
    float_x = theano.config.floatX

    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, running_mean, running_var = [
                vartype(name) for name in var_names]
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # Drop axes that do not exist for this rank. Ranks only
            # decrease, so narrowing ``axes`` in place is safe.
            if isinstance(axes, tuple):
                axes = tuple(ax for ax in axes if ax < ndim)
            if len(axes) == 0:
                continue

            # Library forward pass.
            (out, x_mean, x_invstd,
             out_running_mean, out_running_var) = bn.batch_normalization_train(
                x, scale, bias, axes, eps,
                running_average_factor, running_mean, running_var)

            # Reference forward pass over an explicit tuple of axes.
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            # m / (m - 1) below is the unbiased-variance correction.
            m = T.cast(T.prod(x.shape) / T.prod(scale.shape), float_x)
            out_running_mean2 = (
                running_mean * (1 - running_average_factor) +
                x_mean2 * running_average_factor)
            out_running_var2 = (
                running_var * (1 - running_average_factor) +
                (m / (m - 1)) * x_var2 * running_average_factor)

            # Backward passes for the library and the reference graph.
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})

            # Compile: 5 library outputs, 5 reference outputs, 3 + 3 grads.
            library_outputs = [out, x_mean, x_invstd,
                               out_running_mean, out_running_var]
            reference_outputs = [out2, x_mean2, x_invstd2,
                                 out_running_mean2, out_running_var2]
            f = theano.function(
                [x, scale, bias, running_mean, running_var, dy],
                library_outputs + reference_outputs + grads + grads2)

            # Check that the abstract Ops have been replaced.
            abstract_ops = (bn.AbstractBatchNormTrain,
                            bn.AbstractBatchNormInference,
                            bn.AbstractBatchNormTrainGrad)
            assert not any([isinstance(node.op, abstract_ops)
                            for node in f.maker.fgraph.toposort()])

            # Run on several shapes, truncated to the current rank.
            for full_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (2, 3, 5, 5, 5)):
                data_shape = full_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                # Draw order is kept so seeded runs are unchanged.
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(float_x)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(float_x)
                Scale = numpy.random.randn(*param_shape).astype(float_x)
                Bias = numpy.random.randn(*param_shape).astype(float_x)
                Running_mean = numpy.random.randn(*param_shape).astype(float_x)
                Running_var = numpy.random.randn(*param_shape).astype(float_x)
                outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy)

                # out, mean, invstd, running_mean
                for k in range(4):
                    utt.assert_allclose(outputs[k], outputs[k + 5])
                # running_var may hold NaN/inf when m == 1.
                utt.assert_allclose(numpy.nan_to_num(outputs[4]),
                                    numpy.nan_to_num(outputs[9]))
                # Gradients: dx, dscale, dbias.
                utt.assert_allclose(outputs[10], outputs[13], atol=1e-4)
                utt.assert_allclose(outputs[11], outputs[14],
                                    rtol=2e-4, atol=1e-4)
                utt.assert_allclose(outputs[12], outputs[15])
Example #23
0
def test_batch_normalization_train():
    """Compare ``bn.batch_normalization_train`` with a reference graph.

    Iterates over axes specifications and tensor ranks, then checks the
    forward outputs, running statistics and gradients on random data.
    One data shape has a leading size of 1.
    """
    utt.seed_rng()

    def reference_axes(spec, ndim):
        # Translate an axes specification into an explicit tuple of axes.
        if spec == 'per-activation':
            return (0,)
        if spec == 'spatial':
            return (0,) + tuple(range(2, ndim))
        return spec

    def random_array(shape):
        # Standard-normal sample cast to Theano's configured float type.
        return numpy.random.randn(*shape).astype(theano.config.floatX)

    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x = vartype('x')
            scale = vartype('scale')
            bias = vartype('bias')
            running_mean = vartype('running_mean')
            running_var = vartype('running_var')
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # Remove non-existing axes.
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # Forward pass.
            train_outputs = bn.batch_normalization_train(
                x, scale, bias, axes, eps,
                running_average_factor, running_mean, running_var)
            (out, x_mean, x_invstd,
             out_running_mean, out_running_var) = train_outputs

            # Reference forward pass.
            axes2 = reference_axes(axes, ndim)
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
            out_running_mean2 = running_mean * (1 - running_average_factor) + \
                x_mean2 * running_average_factor
            out_running_var2 = running_var * (1 - running_average_factor) + \
                (m / (m - 1)) * x_var2 * running_average_factor

            # Backward pass and reference backward pass.
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})

            # Compile.
            fn_inputs = [x, scale, bias, running_mean, running_var, dy]
            fn_outputs = [out, x_mean, x_invstd,
                          out_running_mean, out_running_var,
                          out2, x_mean2, x_invstd2,
                          out_running_mean2, out_running_var2]
            f = theano.function(fn_inputs, fn_outputs + grads + grads2)

            # Check that the abstract Ops have been replaced.
            abstract_ops = (bn.AbstractBatchNormTrain,
                            bn.AbstractBatchNormInference,
                            bn.AbstractBatchNormTrainGrad)
            assert not any([isinstance(n.op, abstract_ops)
                            for n in f.maker.fgraph.toposort()])

            # Run.
            for shape5 in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = shape5[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                # Draw order is kept so seeded runs are unchanged.
                X = 4 + 3 * random_array(data_shape)
                Dy = -1 + 2 * random_array(data_shape)
                Scale = random_array(param_shape)
                Bias = random_array(param_shape)
                Running_mean = random_array(param_shape)
                Running_var = random_array(param_shape)
                outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy)

                # Compare outputs: the reference values sit 5 slots later.
                utt.assert_allclose(outputs[0], outputs[5])  # out
                utt.assert_allclose(outputs[1], outputs[6])  # mean
                utt.assert_allclose(outputs[2], outputs[7])  # invstd
                utt.assert_allclose(outputs[3], outputs[8])  # running_mean
                # running_var can be NaN/inf when m == 1.
                utt.assert_allclose(numpy.nan_to_num(outputs[4]),
                                    numpy.nan_to_num(outputs[9]))
                # Compare gradients: the reference values sit 3 slots later.
                utt.assert_allclose(outputs[10], outputs[13], atol=1e-4)  # dx
                utt.assert_allclose(outputs[11], outputs[14],
                                    rtol=2e-4, atol=1e-4)  # dscale
                utt.assert_allclose(outputs[12], outputs[15])  # dbias
Example #24
0
def test_batch_normalization_train_broadcast():
    """Batch norm must not care whether its parameters are pre-broadcast.

    For each axes specification and input rank, the train and test batch
    normalization graphs are built twice: once with parameters that only
    carry the non-normalized dimensions, and once with the same parameters
    dimshuffled to the full rank of ``x``.  The absolute differences of all
    paired outputs are summed into one scalar, which is expected to be
    optimized down to a constant zero.
    """
    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x = vartype("x")
            ndim = x.ndim
            # deliberately unusual epsilon, so a silently ignored value shows
            eps = 5e-3
            running_average_factor = 0.3

            # Drop axes this rank does not have.  This rebinds the outer
            # loop variable, exactly as the test has always done.
            if isinstance(axes, tuple):
                axes = tuple(a for a in axes if a < ndim)
            if len(axes) == 0:
                continue

            # Spell the axes specification out as a tuple of reduced axes.
            if axes == "per-activation":
                reduced_axes = (0, )
            elif axes == "spatial":
                reduced_axes = (0, ) + tuple(range(2, ndim))
            else:
                reduced_axes = axes

            # Axes the parameters keep, and the dimshuffle pattern that
            # lifts a compact parameter to the rank of x ("x" marks an
            # inserted broadcastable dimension).
            kept_axes = tuple(d for d in range(ndim) if d not in reduced_axes)
            bc_pattern = [
                kept_axes.index(d) if d in kept_axes else "x"
                for d in range(ndim)
            ]

            # Compact (non-broadcasted) parameter variables.
            param_type = T.TensorType(x.dtype, (False, ) * len(kept_axes))
            scale = param_type("scale")
            bias = param_type("bias")
            running_mean = param_type("running_mean")
            running_var = param_type("running_var")

            # The same parameters, broadcast to the rank of x.
            scale_bc, bias_bc, running_mean_bc, running_var_bc = [
                p.dimshuffle(bc_pattern)
                for p in (scale, bias, running_mean, running_var)
            ]

            # Training graph on the compact parameters ...
            train_non_bc = bn.batch_normalization_train(
                x, scale, bias, axes, eps, running_average_factor,
                running_mean, running_var)
            # ... and on the broadcast ones.
            train_bc = bn.batch_normalization_train(
                x, scale_bc, bias_bc, axes, eps, running_average_factor,
                running_mean_bc, running_var_bc)
            # Leave the normalized output alone; bring every statistic back
            # to the compact layout so it lines up with train_non_bc.
            train_bc = (train_bc[0], ) + tuple(
                stat.dimshuffle(kept_axes) for stat in train_bc[1:])

            # Inference graph, again for both parameter layouts.
            test_non_bc = bn.batch_normalization_test(
                x, scale, bias, running_mean, running_var, axes, eps)
            test_bc = bn.batch_normalization_test(
                x, scale_bc, bias_bc, running_mean_bc, running_var_bc, axes,
                eps)

            # Element-wise absolute difference of each output pair.
            diffs = [
                abs(plain - broadcasted) for plain, broadcasted in zip(
                    train_non_bc + (test_non_bc, ), train_bc + (test_bc, ))
            ]

            # One function computing the grand total of all differences.
            f = theano.function([x, scale, bias, running_mean, running_var],
                                T.sum(sum(diffs)))

            # Paired ops are identical, so outside FAST_COMPILE the whole
            # graph should have collapsed to a single constant copy.
            nodes = f.maker.fgraph.toposort()
            if theano.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, theano.compile.DeepCopyOp)

            # Random inputs with extent 4 along every dimension.
            inputs = [
                np.asarray(np.random.rand(*((4, ) * var.ndim)), x.dtype)
                for var in (x, scale, bias, running_mean, running_var)
            ]
            assert 0.0 == f(*inputs)
Example #25
0
def test_batch_normalization_train():
    """Compare ``bn.batch_normalization_train`` with an explicit reference.

    For every combination of axes specification and input rank, the op's
    forward outputs, first-order gradients and second-order gradients are
    compiled next to a hand-written mean/variance formulation.  The compiled
    function is then evaluated on random data of several shapes and each
    pair of results is checked with ``utt.assert_allclose``.
    """
    utt.seed_rng()

    for axes_spec in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor3, T.vector):
            x, scale, bias, running_mean, running_var = (vartype(n) for n in (
                "x", "scale", "bias", "running_mean", "running_var"))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # Remove non-existing axes.  Filter into a per-iteration local
            # so the outer loop variable is never overwritten and the
            # result does not depend on the order the ranks are visited in.
            axes = axes_spec
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # forward pass
            (
                out,
                x_mean,
                x_invstd,
                out_running_mean,
                out_running_var,
            ) = bn.batch_normalization_train(
                x,
                scale,
                bias,
                axes,
                eps,
                running_average_factor,
                running_mean,
                running_var,
            )

            # reference forward pass: explicit tuple of reduced axes first
            if axes == "per-activation":
                axes2 = (0, )
            elif axes == "spatial":
                axes2 = (0, ) + tuple(range(2, ndim))
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            # m: number of input elements per parameter element; the factor
            # m / (m - 1) rescales the batch variance for the running value
            m = T.cast(
                T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
            out_running_mean2 = (running_mean * (1 - running_average_factor) +
                                 x_mean2 * running_average_factor)
            out_running_var2 = (running_var * (1 - running_average_factor) +
                                (m /
                                 (m - 1)) * x_var2 * running_average_factor)

            # backward pass
            dy = vartype("dy")
            grads = T.grad(None, wrt=[x, scale, bias], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias], known_grads={out2: dy})

            # second-order backward pass
            dx = vartype("dinputs")
            dscale = vartype("dscale")
            dbias = vartype("dbias")
            upstream = (dx, dscale, dbias)
            # Build the OrderedDict from ordered pairs: a dict literal may
            # already have lost its order on interpreters without ordered
            # dicts, which would make the wrapping pointless.
            grad_grads = T.grad(
                None,
                wrt=[x, dy, scale],
                known_grads=OrderedDict(zip(grads, upstream)),
                consider_constant=[
                    x,
                    dy,
                    scale,
                    bias,
                    x_mean,
                    x_invstd,
                    running_mean,
                    running_var,
                ],
                return_disconnected="zero",
            )
            # reference second-order backward pass
            grad_grads2 = T.grad(
                None,
                wrt=[x, dy, scale],
                known_grads=OrderedDict(zip(grads2, upstream)),
                consider_constant=[
                    x,
                    dy,
                    scale,
                    bias,
                    x_mean2,
                    x_var2,
                    running_mean,
                    running_var,
                ],
                return_disconnected="zero",
            )

            # compile; output layout is
            #   [0:5]   op forward          [5:10]  reference forward
            #   [10:13] op grads            [13:16] reference grads
            #   [16:19] op second-order     [19:22] reference second-order
            f = theano.function(
                [
                    x, scale, bias, running_mean, running_var, dy, dx, dscale,
                    dbias
                ],
                [
                    out,
                    x_mean,
                    x_invstd,
                    out_running_mean,
                    out_running_var,
                    out2,
                    x_mean2,
                    x_invstd2,
                    out_running_mean2,
                    out_running_var2,
                ] + grads + grads2 + grad_grads + grad_grads2,
            )

            # check if the abstract Ops have been replaced
            assert not any([
                isinstance(
                    n.op,
                    (
                        bn.AbstractBatchNormTrain,
                        bn.AbstractBatchNormInference,
                        bn.AbstractBatchNormTrainGrad,
                    ),
                ) for n in f.maker.fgraph.toposort()
            ])

            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1),
                               (2, 3, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                # parameters have extent 1 along every reduced axis
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * np.random.randn(*data_shape).astype(
                    theano.config.floatX)
                Dy = -1 + 2 * np.random.randn(*data_shape).astype(
                    theano.config.floatX)
                Scale = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Bias = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Running_mean = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Running_var = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Dx = 4 + 3 * np.random.randn(*data_shape).astype(
                    theano.config.floatX)
                Dscale = -1 + 2 * np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Dbias = np.random.randn(*param_shape).astype(
                    theano.config.floatX)

                outputs = f(X, Scale, Bias, Running_mean, Running_var, Dy, Dx,
                            Dscale, Dbias)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
                utt.assert_allclose(outputs[2], outputs[2 + 5])  # invstd
                utt.assert_allclose(outputs[3], outputs[3 + 5])  # running_mean
                # non-finite entries are mapped to finite numbers on both
                # sides before the running variance is compared
                utt.assert_allclose(np.nan_to_num(outputs[4]),
                                    np.nan_to_num(outputs[4 + 5]))  # running_var
                # compare gradients
                utt.assert_allclose(outputs[10], outputs[10 + 3],
                                    atol=1e-4)  # dx
                utt.assert_allclose(outputs[11],
                                    outputs[11 + 3],
                                    rtol=2e-4,
                                    atol=1e-4)  # dscale
                utt.assert_allclose(outputs[12], outputs[12 + 3])  # dbias
                # compare second-order gradients
                utt.assert_allclose(outputs[16], outputs[16 + 3],
                                    atol=1e-4)  # ddx
                utt.assert_allclose(outputs[17], outputs[17 + 3])  # ddy
                utt.assert_allclose(outputs[18],
                                    outputs[18 + 3],
                                    rtol=3e-4,
                                    atol=1e-4)  # ddscale
Example #26
0
    def __init__(
        self,
        input,
        nkerns,
        input_shape,
        id,
        output_shape,
        filter_shape=(3, 3),
        poolsize=(1, 1),
        pooltype='max',
        batch_norm=False,
        border_mode='valid',
        stride=(1, 1),
        rng=None,
        borrow=True,
        activation='relu',
        input_params=None,
        verbose=2,
    ):
        """Build a 2D deconvolution layer, optionally batch-normalized.

        Args:
            input: symbolic input handed to ``deconvolver_2d``.
            nkerns: first dimension of the filterbank shape; also stored as
                ``num_neurons``.
            input_shape: shape of ``input``; passed on as ``image_shape`` and
                its first entry is reused as the first entry of the
                deconvolved output shape.
            id: layer identifier forwarded to the base class.
            output_shape: three-entry sequence.  ``output_shape[2]`` sizes
                the bias / batch-norm vectors and the second axis of the
                output; ``output_shape[0]`` and ``output_shape[1]`` become
                the two trailing axes (presumably height and width --
                confirm against callers).
            filter_shape: the two trailing dimensions of the filterbank.
            poolsize: must be ``(1, 1)``; anything else raises.
            pooltype: accepted for signature compatibility, not used here.
            batch_norm: batch normalization is applied only when this is
                exactly ``True``.
            border_mode: forwarded to ``deconvolver_2d``.
            stride: forwarded to ``deconvolver_2d`` as ``subsample``.
            rng: source of ``standard_normal``; defaults to ``numpy.random``.
            borrow: ``borrow`` flag for every shared variable created here.
            activation: activation spec handed to ``_activate``; a tuple
                starting with ``'maxout'`` is rejected.
            input_params: optional sequence ``[w, b, gamma, beta,
                running_mean, running_var]`` of existing parameters.  A
                ``None`` in slot 0 or 1 means "create that parameter"; with
                ``batch_norm`` on, a ``None`` in slot 2 means "create all
                four batch-norm parameters".
            verbose: verbosity level; ``>= 3`` prints a creation message.

        Raises:
            Exception: if ``poolsize`` is not ``(1, 1)`` or a maxout
                activation is requested.
        """
        super(deconv_layer_2d, self).__init__(id=id,
                                              type='deconv',
                                              verbose=verbose)
        if verbose >= 3:
            # parenthesized single-argument form is valid on Python 2 and 3
            print("... Creating deconv layer")

        if rng is None:
            rng = numpy.random

        # Decide per parameter group whether to create fresh shared
        # variables or reuse what the caller supplied.
        if input_params is None:
            create_w = create_b = create_bn = True
        else:
            create_w = input_params[0] is None
            create_b = input_params[1] is None
            # slot 2 is only inspected when batch norm is requested
            create_bn = batch_norm is True and input_params[2] is None

        out_channels = output_shape[2]
        w_shp = (nkerns, out_channels, filter_shape[0], filter_shape[1])
        o_shp = (input_shape[0], out_channels, output_shape[0],
                 output_shape[1])

        def _per_channel(fill, name):
            # one value per output channel, initialized by numpy.zeros/ones
            return theano.shared(value=fill((out_channels, ),
                                            dtype=theano.config.floatX),
                                 name=name,
                                 borrow=borrow)

        if create_w:
            self.w = theano.shared(value=numpy.asarray(
                0.01 * rng.standard_normal(size=w_shp),
                dtype=theano.config.floatX),
                                   borrow=borrow,
                                   name='filterbank')
        else:
            self.w = input_params[0]

        if create_b:
            self.b = _per_channel(numpy.zeros, 'bias')
        else:
            self.b = input_params[1]

        if batch_norm is True:
            if create_bn:
                self.gamma = _per_channel(numpy.ones, 'gamma')
                self.beta = _per_channel(numpy.zeros, 'beta')
                self.running_mean = _per_channel(numpy.zeros,
                                                 'population_mean')
                self.running_var = _per_channel(numpy.ones, 'population_var')
            else:
                self.gamma = input_params[2]
                self.beta = input_params[3]
                self.running_mean = input_params[4]
                self.running_var = input_params[5]

        # Perform the deconvolution part
        convolver = deconvolver_2d(input=input,
                                   filters=self.w,
                                   output_shape=o_shp,
                                   subsample=stride,
                                   filter_shape=w_shp,
                                   image_shape=input_shape,
                                   border_mode=border_mode,
                                   verbose=verbose)

        conv_out = convolver.out
        conv_out_shp = o_shp
        self.conv_out = conv_out

        if not poolsize == (1, 1):
            raise Exception(
                " Unpool operation not yet supported be deconv layer")
        unpool_out = conv_out
        unpool_out_shp = conv_out_shp

        # Bias is added per channel (axis 1) before normalization/activation.
        biased = unpool_out + self.b.dimshuffle('x', 0, 'x', 'x')

        # Ioffe, Sergey, and Christian Szegedy. "Batch normalization:
        # Accelerating deep network training by reducing internal covariate
        # shift." arXiv preprint arXiv:1502.03167 (2015).
        if batch_norm is True:
            batch_norm_out, _, _, mean, var = batch_normalization_train(
                inputs=biased,
                gamma=self.gamma,
                beta=self.beta,
                axes='spatial',
                running_mean=self.running_mean,
                running_var=self.running_var)

            mean = theano.tensor.unbroadcast(mean, 0)
            var = theano.tensor.unbroadcast(var, 0)
            # small constant added to the running variance before storing it
            var = var + 0.000001
            # self.updates is assumed to be a dict-like attribute provided
            # by the base-class constructor -- TODO confirm
            self.updates[self.running_mean] = mean
            self.updates[self.running_var] = var

            batch_norm_inference = batch_normalization_test(
                inputs=biased,
                gamma=self.gamma,
                beta=self.beta,
                axes='spatial',
                mean=self.running_mean,
                var=self.running_var)
        else:
            batch_norm_out = biased
            batch_norm_inference = batch_norm_out

        batch_norm_out_shp = unpool_out_shp
        if isinstance(activation, tuple) and activation[0] == 'maxout':
            raise Exception(
                'Deconvolution layer does not support maxout activation')

        # Training-time and inference-time outputs share the activation.
        self.output, self.output_shape = _activate(
            x=batch_norm_out,
            activation=activation,
            input_size=batch_norm_out_shp,
            verbose=verbose,
            dimension=2)

        self.inference, _ = _activate(x=batch_norm_inference,
                                      activation=activation,
                                      input_size=batch_norm_out_shp,
                                      verbose=verbose,
                                      dimension=2)

        # store parameters of this layer and do some book keeping.
        self.params = [self.w, self.b]
        self.active_params = [self.w, self.b]
        if batch_norm is True:
            self.params.append(self.gamma)
            self.params.append(self.beta)
            self.active_params.append(self.gamma)
            self.active_params.append(self.beta)
            # running statistics are stored but not listed as active
            self.params.append(self.running_mean)
            self.params.append(self.running_var)

        # Regularization terms cover the filterbank only (gamma is left out).
        self.L1 = abs(self.w).sum()
        self.L2 = (self.w**2).sum()

        # Just doing this for print_layer method to use.
        self.nkerns = nkerns
        self.filter_shape = filter_shape
        self.poolsize = poolsize
        self.stride = stride
        self.input_shape = input_shape
        self.num_neurons = nkerns
        self.activation = activation
        self.batch_norm = batch_norm
Example #27
0
    def __init__(
        self,
        input,
        nkerns,
        input_shape,
        id,
        filter_shape=(3, 3),
        poolsize=(2, 2),
        pooltype='max',
        batch_norm=False,
        border_mode='valid',
        stride=(1, 1),
        rng=None,
        borrow=True,
        activation='relu',
        input_params=None,
        verbose=2,
    ):
        """Build a 2D convolution + pooling layer, optionally batch-normalized.

        Args:
            input: symbolic input handed to ``convolver_2d``.
            nkerns: number of kernels; first dimension of the filterbank and
                length of the bias / batch-norm vectors.
            input_shape: shape of ``input``; entry 0 is used as the
                mini-batch size and entry 1 as the channel count.
            id: layer identifier forwarded to the base class.
            filter_shape: the two trailing dimensions of the filterbank.
            poolsize: pooling size; ``(1, 1)`` skips pooling altogether.
            pooltype: forwarded to ``pooler_2d`` as ``mode``.
            batch_norm: batch normalization is applied only when this is
                exactly ``True``.
            border_mode: forwarded to ``convolver_2d``.
            stride: forwarded to ``convolver_2d`` as ``subsample``.
            rng: source of ``standard_normal``; defaults to ``numpy.random``.
            borrow: ``borrow`` flag for every shared variable created here.
            activation: activation spec handed to ``_activate``.
            input_params: optional sequence ``[w, b, gamma, beta,
                running_mean, running_var]`` of existing parameters.  A
                ``None`` in slot 0 or 1 means "create that parameter"; with
                ``batch_norm`` on, a ``None`` in slot 2 means "create all
                four batch-norm parameters".  Fully populated sequences are
                used as given.
            verbose: verbosity level; ``>= 3`` prints a creation message.
        """
        super(conv_pool_layer_2d, self).__init__(id=id,
                                                 type='conv_pool',
                                                 verbose=verbose)
        if verbose >= 3:
            # parenthesized single-argument form is valid on Python 2 and 3
            print("... Creating conv pool layer")

        if rng is None:
            rng = numpy.random

        # Decide per parameter group whether to create fresh shared
        # variables or reuse what the caller supplied.  A None entry used
        # to be stored as the parameter itself and fail later on.
        if input_params is None:
            create_w = create_b = create_bn = True
        else:
            create_w = input_params[0] is None
            create_b = input_params[1] is None
            # slot 2 is only inspected when batch norm is requested
            create_bn = batch_norm is True and input_params[2] is None

        mini_batch_size = input_shape[0]
        channels = input_shape[1]
        w_shp = (nkerns, channels, filter_shape[0], filter_shape[1])

        def _per_kernel(fill, name):
            # one value per kernel, initialized by numpy.zeros/ones
            return theano.shared(value=fill((nkerns, ),
                                            dtype=theano.config.floatX),
                                 name=name,
                                 borrow=borrow)

        if create_w:
            self.w = theano.shared(value=numpy.asarray(
                0.01 * rng.standard_normal(size=w_shp),
                dtype=theano.config.floatX),
                                   borrow=borrow,
                                   name='filterbank')
        else:
            self.w = input_params[0]

        if create_b:
            self.b = _per_kernel(numpy.zeros, 'bias')
        else:
            self.b = input_params[1]

        if batch_norm is True:
            if create_bn:
                self.gamma = _per_kernel(numpy.ones, 'gamma')
                self.beta = _per_kernel(numpy.zeros, 'beta')
                self.running_mean = _per_kernel(numpy.zeros,
                                                'population_mean')
                self.running_var = _per_kernel(numpy.ones, 'population_var')
            else:
                self.gamma = input_params[2]
                self.beta = input_params[3]
                self.running_mean = input_params[4]
                self.running_var = input_params[5]

        # Perform the convolution part
        convolver = convolver_2d(input=input,
                                 filters=self.w,
                                 subsample=stride,
                                 filter_shape=w_shp,
                                 image_shape=input_shape,
                                 border_mode=border_mode,
                                 verbose=verbose)

        conv_out = convolver.out
        conv_out_shp = (mini_batch_size, nkerns, convolver.out_shp[0],
                        convolver.out_shp[1])
        self.conv_out = conv_out

        # Pool unless the pool size is the identity (1, 1).
        if not poolsize == (1, 1):
            pooler = pooler_2d(input=conv_out,
                               img_shp=conv_out_shp,
                               mode=pooltype,
                               ds=poolsize,
                               verbose=verbose)
            pool_out = pooler.out
            pool_out_shp = pooler.out_shp
        else:
            pool_out = conv_out
            pool_out_shp = conv_out_shp

        # Bias is added per kernel (axis 1) before normalization/activation.
        biased = pool_out + self.b.dimshuffle('x', 0, 'x', 'x')

        # Ioffe, Sergey, and Christian Szegedy. "Batch normalization:
        # Accelerating deep network training by reducing internal covariate
        # shift." arXiv preprint arXiv:1502.03167 (2015).
        if batch_norm is True:
            batch_norm_out, _, _, mean, var = batch_normalization_train(
                inputs=biased,
                gamma=self.gamma,
                beta=self.beta,
                axes='spatial',
                running_mean=self.running_mean,
                running_var=self.running_var)

            mean = theano.tensor.unbroadcast(mean, 0)
            var = theano.tensor.unbroadcast(var, 0)
            # self.updates is assumed to be a dict-like attribute provided
            # by the base-class constructor -- TODO confirm
            self.updates[self.running_mean] = mean
            # small constant added to the running variance before storing it
            self.updates[self.running_var] = var + 0.001

            batch_norm_inference = batch_normalization_test(
                inputs=biased,
                gamma=self.gamma,
                beta=self.beta,
                axes='spatial',
                mean=self.running_mean,
                var=self.running_var)
        else:
            batch_norm_out = biased
            batch_norm_inference = batch_norm_out

        batch_norm_out_shp = pool_out_shp

        # Training-time and inference-time outputs share the activation.
        self.output, self.output_shape = _activate(
            x=batch_norm_out,
            activation=activation,
            input_size=batch_norm_out_shp,
            verbose=verbose,
            dimension=2)

        self.inference, _ = _activate(x=batch_norm_inference,
                                      activation=activation,
                                      input_size=batch_norm_out_shp,
                                      verbose=verbose,
                                      dimension=2)

        # store parameters of this layer and do some book keeping.
        self.params = [self.w, self.b]
        self.active_params = [self.w, self.b]
        if batch_norm is True:
            self.params.append(self.gamma)
            self.params.append(self.beta)
            self.active_params.append(self.gamma)
            self.active_params.append(self.beta)
            # running statistics are stored but not listed as active
            self.params.append(self.running_mean)
            self.params.append(self.running_var)

        # Regularization terms cover the filterbank only (gamma is left out).
        self.L1 = abs(self.w).sum()
        self.L2 = (self.w**2).sum()

        # Just doing this for print_layer method to use.
        self.nkerns = nkerns
        self.filter_shape = filter_shape
        self.poolsize = poolsize
        self.stride = stride
        self.input_shape = input_shape
        self.num_neurons = nkerns
        self.activation = activation
        self.batch_norm = batch_norm
Example #28
0
def test_batch_normalization_train_broadcast():
    """Batch norm must build the same graph for plain and broadcasted params.

    For each axes specification and each input rank, the train and test
    batch-normalization graphs are built twice: once with parameter tensors
    that only carry the non-normalized dimensions, and once with the same
    parameters dimshuffled up to the rank of the input.  The absolute
    differences of all paired outputs are summed; outside of FAST_COMPILE
    the compiled function is expected to consist of a single DeepCopyOp,
    and the evaluated sum must be exactly zero.
    """
    param_names = ('scale', 'bias', 'running_mean', 'running_var')

    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x = vartype('x')
            ndim = x.ndim
            eps = 5e-3  # deliberately non-default so we notice if it is ignored
            running_average_factor = 0.3

            # Explicit axis tuples may name axes this input does not have;
            # drop those, and skip the combination if nothing is left.
            active_axes = axes
            if isinstance(active_axes, tuple):
                active_axes = tuple(a for a in active_axes if a < ndim)
            if not active_axes:
                continue

            # Translate the symbolic axes names into an explicit axis tuple.
            if active_axes == 'per-activation':
                normalized = (0,)
            elif active_axes == 'spatial':
                normalized = (0,) + tuple(range(2, ndim))
            else:
                normalized = active_axes

            # Axes that survive normalization, and the dimshuffle pattern
            # that lifts a parameter tensor to the full rank of the input.
            non_bc_axes = tuple(d for d in range(ndim) if d not in normalized)
            slot_of = dict((axis, slot) for slot, axis in enumerate(non_bc_axes))
            params_dimshuffle = [slot_of.get(d, 'x') for d in range(ndim)]

            # Parameters without any broadcastable dimensions ...
            param_type = T.TensorType(x.dtype, (False,) * len(non_bc_axes))
            scale, bias, running_mean, running_var = [
                param_type(name) for name in param_names]

            # ... and the same parameters expanded to the rank of the input.
            scale_bc, bias_bc, running_mean_bc, running_var_bc = [
                p.dimshuffle(params_dimshuffle)
                for p in (scale, bias, running_mean, running_var)]

            # Training graph: plain parameters first, then broadcasted ones.
            train_non_bc = bn.batch_normalization_train(
                x, scale, bias, active_axes, eps,
                running_average_factor, running_mean, running_var)
            train_bc = bn.batch_normalization_train(
                x, scale_bc, bias_bc, active_axes, eps,
                running_average_factor, running_mean_bc, running_var_bc)
            # All outputs but the first are brought back to the plain
            # parameter layout so they can be compared element-wise.
            train_bc = tuple(
                [train_bc[0]] +
                [r.dimshuffle(non_bc_axes) for r in train_bc[1:]])

            # Inference graph, again for both parameter layouts.
            test_non_bc = bn.batch_normalization_test(
                x, scale, bias, running_mean, running_var, active_axes, eps)
            test_bc = bn.batch_normalization_test(
                x, scale_bc, bias_bc, running_mean_bc, running_var_bc,
                active_axes, eps)

            # Pair up every output and take the absolute difference.
            results_non_bc = train_non_bc + (test_non_bc,)
            results_bc = train_bc + (test_bc,)
            results = [abs(plain - lifted)
                       for plain, lifted in zip(results_non_bc, results_bc)]

            # One function that returns the total of all differences.
            f = theano.function([x, scale, bias, running_mean, running_var],
                                T.sum(sum(results)))

            # Identical paired ops should let the optimizer fold the whole
            # expression into a constant zero.
            nodes = f.maker.fgraph.toposort()
            if theano.config.mode != "FAST_COMPILE":
                assert len(nodes) == 1
                assert isinstance(nodes[0].op, theano.compile.DeepCopyOp)

            # Random inputs with extent 4 along every dimension.
            inputs = [numpy.asarray(numpy.random.rand(*((4,) * v.ndim)),
                                    x.dtype)
                      for v in (x, scale, bias, running_mean, running_var)]
            assert 0.0 == f(*inputs)
Example #29
0
    def __init__(
        self,
        input,
        input_shape,
        id,
        rng=None,
        borrow=True,
        input_params=None,
        verbose=2,
    ):
        """Build a spatial batch-normalization layer.

        Args:
            input: symbolic tensor to normalize; it is handed to Theano's
                batch normalization with ``axes='spatial'``.
            input_shape: shape of ``input``.  Entry 1 is taken as the number
                of channels, and the whole tuple is stored as both
                ``input_shape`` and ``output_shape``.
            id: layer identifier, forwarded to the base-class constructor.
            rng: not used by this constructor.
            borrow: ``borrow`` flag for the shared variables created here.
            input_params: optional sequence
                ``(gamma, beta, running_mean, running_var)`` to reuse instead
                of creating fresh shared variables.
            verbose: verbosity level; a message is printed when it is >= 3.

        Reference:
            Ioffe, Sergey, and Christian Szegedy. "Batch normalization:
            Accelerating deep network training by reducing internal
            covariate shift." arXiv preprint arXiv:1502.03167 (2015).
        """
        super(batch_norm_layer_2d, self).__init__(id=id,
                                                  type='batch_norm',
                                                  verbose=verbose)
        if verbose >= 3:
            print("... Creating batch norm layer")

        channels = input_shape[1]

        if input_params is None:

            def _per_channel(factory, name):
                # One value per channel, initialised by `factory`.
                return theano.shared(
                    value=factory((channels, ), dtype=theano.config.floatX),
                    name=name,
                    borrow=borrow)

            self.gamma = _per_channel(numpy.ones, 'gamma')
            self.beta = _per_channel(numpy.zeros, 'beta')
            self.running_mean = _per_channel(numpy.zeros, 'population_mean')
            self.running_var = _per_channel(numpy.ones, 'population_var')
        else:
            # Reuse parameters that were created previously (or any other
            # custom initialization supplied by the caller).
            self.gamma = input_params[0]
            self.beta = input_params[1]
            self.running_mean = input_params[2]
            self.running_var = input_params[3]

        # Training path: normalize with the batch statistics.  The last two
        # outputs are the updated running mean / variance.
        self.output, _, _, mean, var = batch_normalization_train(
            inputs=input,
            gamma=self.gamma,
            beta=self.beta,
            axes='spatial',
            running_mean=self.running_mean,
            running_var=self.running_var)

        # `self.updates` is not created here; presumably the base class
        # provides it -- verify against the parent constructor.
        mean = theano.tensor.unbroadcast(mean, 0)
        var = theano.tensor.unbroadcast(var, 0)
        self.updates[self.running_mean] = mean
        # NOTE(review): the 0.001 offset is applied on every update on top of
        # whatever epsilon the batch-norm op already uses -- confirm intent.
        self.updates[self.running_var] = var + 0.001

        # Inference path: normalize with the stored population statistics.
        self.inference = batch_normalization_test(inputs=input,
                                                  gamma=self.gamma,
                                                  beta=self.beta,
                                                  axes='spatial',
                                                  mean=self.running_mean,
                                                  var=self.running_var)

        # Store parameters of this layer and do some book keeping.  The
        # running statistics are parameters but are not trained directly.
        self.params = [
            self.gamma, self.beta, self.running_mean, self.running_var
        ]
        # The list used to be stored under the misspelled name `parmas`;
        # keep that name as an alias for any code that relied on it.
        self.parmas = self.params
        self.active_params = [self.gamma, self.beta]
        self.input_shape = input_shape
        self.output_shape = input_shape
Example #30
0
    def __init__(self,
                 input,
                 num_neurons,
                 input_shape,
                 id,
                 rng=None,
                 input_params=None,
                 borrow=True,
                 activation='relu',
                 batch_norm=True,
                 verbose=2):
        """Build a fully-connected (dot product) layer.

        The layer computes ``dot(input, w) + b``, optionally batch-normalizes
        it, and applies ``activation``.  Two symbolic outputs are produced:
        ``self.output`` (batch norm uses the statistics of the current
        batch) and ``self.inference`` (batch norm uses the stored running
        statistics).  Without batch norm both are built from the same
        expression.

        Args:
            input: symbolic input; it is multiplied with a weight matrix of
                shape ``(input_shape[1], num_neurons)``.
            num_neurons: number of output units.
            input_shape: shape of ``input``; entry 0 is reused as the first
                entry of the output shape handed to ``_activate``.
            id: layer identifier, forwarded to the base-class constructor.
            rng: source of ``standard_normal`` samples for the weights;
                defaults to ``numpy.random``.
            input_params: optional sequence ``[w, b, gamma, beta,
                running_mean, running_var]``.  A ``None`` at position 0 or 1
                creates that parameter fresh; a ``None`` at position 2
                creates all four batch-norm parameters fresh.
            borrow: ``borrow`` flag for the running-statistics shared
                variables.
            activation: activation specification passed to ``_activate``;
                the value ``'sigmoid'`` also scales the initial weights by 4.
            batch_norm: batch normalization is enabled only when this is
                exactly ``True``.
            verbose: verbosity level; messages are printed when it is >= 3.

        Reference:
            Ioffe, Sergey, and Christian Szegedy. "Batch normalization:
            Accelerating deep network training by reducing internal
            covariate shift." arXiv preprint arXiv:1502.03167 (2015).
        """
        super(dot_product_layer, self).__init__(id=id,
                                                type='dot_product',
                                                verbose=verbose)
        if verbose >= 3:
            print("... Creating dot product layer")

        if rng is None:
            rng = numpy.random

        def _must_create(index):
            # True when the caller supplied nothing usable at `index`.
            return input_params is None or input_params[index] is None

        # Weights: small gaussian initialization unless supplied.
        if _must_create(0):
            w_values = numpy.asarray(
                0.01 * rng.standard_normal(size=(input_shape[1], num_neurons)),
                dtype=theano.config.floatX)
            if activation == 'sigmoid':
                w_values *= 4
            self.w = theano.shared(value=w_values, name='weights')
        else:
            self.w = input_params[0]

        # Bias: zeros unless supplied.
        if _must_create(1):
            b_values = numpy.zeros((num_neurons, ), dtype=theano.config.floatX)
            self.b = theano.shared(value=b_values, name='bias')
        else:
            self.b = input_params[1]

        # Batch-norm parameters: created or reused as a group of four,
        # decided by the entry at position 2 alone.
        if batch_norm is True:
            if _must_create(2):
                bn_shape = (1, num_neurons)
                gamma_values = numpy.ones(bn_shape,
                                          dtype=theano.config.floatX)
                self.gamma = theano.shared(value=gamma_values, name='gamma')
                beta_values = numpy.zeros(bn_shape,
                                          dtype=theano.config.floatX)
                self.beta = theano.shared(value=beta_values, name='beta')
                self.running_mean = theano.shared(
                    value=numpy.zeros(bn_shape, dtype=theano.config.floatX),
                    name='population_mean',
                    borrow=borrow)
                self.running_var = theano.shared(
                    value=numpy.ones(bn_shape, dtype=theano.config.floatX),
                    name='population_var',
                    borrow=borrow)
            else:
                self.gamma = input_params[2]
                self.beta = input_params[3]
                self.running_mean = input_params[4]
                self.running_var = input_params[5]

        linear_fit = T.dot(input, self.w) + self.b

        if batch_norm is True:
            # Training path: batch statistics; the last two outputs are the
            # updated running mean / variance.
            batch_norm_out, _, _, mean, var = batch_normalization_train(
                inputs=linear_fit,
                gamma=self.gamma,
                beta=self.beta,
                running_mean=self.running_mean,
                running_var=self.running_var)

            # `self.updates` is not created here; presumably the base class
            # provides it -- verify against the parent constructor.
            mean = theano.tensor.unbroadcast(mean, 0)
            var = theano.tensor.unbroadcast(var, 0)
            self.updates[self.running_mean] = mean
            # NOTE(review): the 0.001 offset is applied on every update --
            # confirm intent.
            self.updates[self.running_var] = var + 0.001

            # Inference path: stored population statistics.
            batch_norm_inference = batch_normalization_test(
                inputs=linear_fit,
                gamma=self.gamma,
                beta=self.beta,
                mean=self.running_mean,
                var=self.running_var)
        else:
            batch_norm_out = linear_fit
            batch_norm_inference = batch_norm_out

        batch_norm_shp = (input_shape[0], num_neurons)
        self.output, self.output_shape = _activate(x=batch_norm_out,
                                                   activation=activation,
                                                   input_size=batch_norm_shp,
                                                   verbose=verbose,
                                                   dimension=1)

        # The inference output must be built from the inference-mode batch
        # norm expression; feeding `batch_norm_out` here would normalize
        # with the statistics of the current batch at test time as well.
        self.inference, _ = _activate(x=batch_norm_inference,
                                      activation=activation,
                                      input_size=batch_norm_shp,
                                      verbose=verbose,
                                      dimension=1)

        # Parameters of the model.  The running statistics are stored as
        # parameters but are not part of the actively trained set.
        self.params = [self.w, self.b]
        self.active_params = [self.w, self.b]
        if batch_norm is True:
            self.params = self.params + [
                self.gamma, self.beta, self.running_mean, self.running_var
            ]
            self.active_params = self.active_params + [self.gamma, self.beta]

        # Regularization terms cover the weights only.
        self.L1 = abs(self.w).sum()
        self.L2 = (self.w**2).sum()

        if verbose >= 3:
            print("... Dot Product layer is created with output shape " + str(
                self.output_shape))

        self.num_neurons = num_neurons
        self.activation = activation
        self.batch_norm = batch_norm