Example #1
def test_batch_normalization():
    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    np.random.seed(1234)
    X = 1 + np.random.random([10, 20]).astype("float32")
    B = 1 + np.random.random([20]).astype("float32")
    G = 1 + np.random.random([20]).astype("float32")
    M = 1 + np.random.random([20]).astype("float32")
    V = 1 + np.random.random([20]).astype("float32")

    x = tt.matrix("x")
    b = tt.vector("b")
    g = tt.vector("g")
    m = tt.vector("m")
    v = tt.vector("v")

    bn_ref_op = bn_ref(x, g, b, m, v)
    f_ref = theano.function([x, g, b, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)
    for mode in ["low_mem", "high_mem"]:
        bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
        f = theano.function([x, g, b, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)

        utt.verify_grad(bn_f, [X, G, B, M, V])

    bn_ref_op = bn_ref(
        x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True)
    )
    f_ref = theano.function([x, b, g], [bn_ref_op])
    res_ref = f_ref(X, G, B)
    for mode in ["low_mem", "high_mem"]:
        bn_op = bn.batch_normalization(
            x,
            g,
            b,
            x.mean(axis=0, keepdims=True),
            x.std(axis=0, keepdims=True),
            mode=mode,
        )
        f = theano.function([x, b, g], [bn_op])
        res = f(X, G, B)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)

        utt.verify_grad(
            bn_f, [X, G, B, X.mean(axis=0)[np.newaxis], X.std(axis=0)[np.newaxis]]
        )
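For orientation, the call pattern this test exercises reduces to a short standalone sketch (assuming only that Theano and NumPy are importable; the variable names are illustrative and not taken from any of the projects listed here):

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import bn

floatX = theano.config.floatX

# Symbolic minibatch and per-feature scale/shift parameters.
x = T.matrix("x")
gamma = T.vector("gamma")
beta = T.vector("beta")

# Normalize with the minibatch's own statistics; keepdims=True keeps the
# mean/std broadcastable against the (batch, features) input.
out = bn.batch_normalization(
    x, gamma, beta,
    mean=x.mean(axis=0, keepdims=True),
    std=x.std(axis=0, keepdims=True),
    mode="high_mem",
)

f = theano.function([x, gamma, beta], out)
X = np.random.rand(10, 20).astype(floatX)
print(f(X, np.ones(20, dtype=floatX), np.zeros(20, dtype=floatX)).shape)  # (10, 20)

As the test above suggests, both "low_mem" and "high_mem" are checked against the same reference result; they differ in how the generated graph trades memory for recomputation, not in the value computed.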
Example #2
 def conv_bn(inputs, gamma, beta, mean, std):
     return bn.batch_normalization(inputs,
                                   gamma.dimshuffle('x', 0, 'x', 'x'),
                                   beta.dimshuffle('x', 0, 'x', 'x'),
                                   mean.dimshuffle('x', 0, 'x', 'x'),
                                   std.dimshuffle('x', 0, 'x', 'x'),
                                   mode=mode)
Example #3
 def apply(self, input_, application_call, i=None):
     if self._training_mode:
         mean, stdev = self._compute_training_statistics(input_)
     else:
         mean, stdev = self._prepare_population_statistics(i)
     # Useful for filtration of calls that were already made in
     # training mode when doing graph transformations.
     # Very important to cast to bool, as self._training_mode is
     # normally a list (to support nested context managers), which would
     # otherwise get passed by reference and be remotely mutated.
     application_call.metadata['training_mode'] = bool(self._training_mode)
     # Useful for retrieving a list of updates for population
     # statistics. Ditch the broadcastable first axis, though, to
     # make it the same dimensions as the population mean and stdev
     # shared variables.
     application_call.metadata['offset'] = mean[0]
     application_call.metadata['divisor'] = stdev[0]
     # Give these quantities roles in the graph.
     _add_role_and_annotate(mean, BATCH_NORM_OFFSET,
                            [self, application_call])
     _add_role_and_annotate(stdev, BATCH_NORM_DIVISOR,
                            [self, application_call])
     scale = _add_batch_axis(self.scale)
     shift = _add_batch_axis(self.shift)
     # Heavy lifting is done by the Theano utility function.
     normalized = bn.batch_normalization(
         input_,
         scale,
         shift,
         mean,
         stdev,
         mode=('low_mem' if self.conserve_memory else 'high_mem'))
     return normalized
Example #4
 def conv_bn(inputs, gamma, beta, mean, std):
     return bn.batch_normalization(inputs,
                                   gamma.dimshuffle('x', 0, 'x', 'x'),
                                   beta.dimshuffle('x', 0, 'x', 'x'),
                                   mean.dimshuffle('x', 0, 'x', 'x'),
                                   std.dimshuffle('x', 0, 'x', 'x'),
                                   mode=mode)
Example #5
 def bn_f(inputs, gamma, beta, mean, std):
     return bn.batch_normalization(inputs,
                                   gamma,
                                   beta,
                                   mean,
                                   std,
                                   mode=mode)
Example #6
 def apply(self, input_, application_call):
     if self._training_mode:
         mean, stdev = self._compute_training_statistics(input_)
     else:
         mean, stdev = self._prepare_population_statistics()
     # Useful for filtration of calls that were already made in
     # training mode when doing graph transformations.
     # Very important to cast to bool, as self._training_mode is
     # normally a list (to support nested context managers), which would
     # otherwise get passed by reference and be remotely mutated.
     application_call.metadata['training_mode'] = bool(self._training_mode)
     # Useful for retrieving a list of updates for population
     # statistics. Ditch the broadcastable first axis, though, to
     # make it the same dimensions as the population mean and stdev
     # shared variables.
     application_call.metadata['offset'] = mean[0]
     application_call.metadata['divisor'] = stdev[0]
     # Give these quantities roles in the graph.
     _add_role_and_annotate(mean, BATCH_NORM_OFFSET,
                            [self, application_call])
     _add_role_and_annotate(stdev, BATCH_NORM_DIVISOR,
                            [self, application_call])
     scale = _add_batch_axis(self.scale)
     shift = _add_batch_axis(self.shift)
     # Heavy lifting is done by the Theano utility function.
     normalized = bn.batch_normalization(input_, scale, shift, mean, stdev,
                                         mode=('low_mem'
                                               if self.conserve_memory
                                               else 'high_mem'))
     return normalized
Example #7
def test_bn():
    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    numpy.random.seed(1234)
    X = 1 + numpy.random.random([10, 20]).astype("float32")
    B = 1 + numpy.random.random([20]).astype("float32")
    G = 1 + numpy.random.random([20]).astype("float32")
    M = 1 + numpy.random.random([20]).astype("float32")
    V = 1 + numpy.random.random([20]).astype("float32")

    x = theano.tensor.matrix("x")
    b = theano.tensor.vector("b")
    g = theano.tensor.vector("g")
    m = theano.tensor.vector("m")
    v = theano.tensor.vector("v")

    bn_ref_op = bn_ref(x, g, b, m, v)
    f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)
    for mode in ["low_mem", "high_mem"]:
        bn_op = batch_normalization(x, g, b, m, v, mode=mode)
        f = theano.function([x, b, g, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def bn(inputs, gamma, beta, mean, std):
            return batch_normalization(inputs, gamma, beta, mean, std, mode=mode)

        utt.verify_grad(bn, [X, G, B, M, V])

    bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True))
    f_ref = theano.function([x, b, g], [bn_ref_op])
    res_ref = f_ref(X, G, B)
    for mode in ["low_mem", "high_mem"]:
        bn_op = batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True), mode=mode)
        f = theano.function([x, b, g], [bn_op])
        res = f(X, G, B)
        utt.assert_allclose(res_ref, res)

        def bn(inputs, gamma, beta, mean, std):
            return batch_normalization(inputs, gamma, beta, mean, std, mode=mode)

        utt.verify_grad(bn, [X, G, B, X.mean(axis=0)[numpy.newaxis], X.std(axis=0)[numpy.newaxis]])
Example #8
def test_batch_normalization():

    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    numpy.random.seed(1234)
    X = 1 + numpy.random.random([10, 20]).astype('float32')
    B = 1 + numpy.random.random([20]).astype('float32')
    G = 1 + numpy.random.random([20]).astype('float32')
    M = 1 + numpy.random.random([20]).astype('float32')
    V = 1 + numpy.random.random([20]).astype('float32')

    x = theano.tensor.matrix('x')
    b = theano.tensor.vector('b')
    g = theano.tensor.vector('g')
    m = theano.tensor.vector('m')
    v = theano.tensor.vector('v')

    bn_ref_op = bn_ref(x, g, b, m, v)
    f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)
    for mode in ['low_mem', 'high_mem']:
        bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
        f = theano.function([x, b, g, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
        utt.verify_grad(bn_f, [X, G, B, M, V])

    bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True))
    f_ref = theano.function([x, b, g], [bn_ref_op])
    res_ref = f_ref(X, G, B)
    for mode in ['low_mem', 'high_mem']:
        bn_op = bn.batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True), mode=mode)
        f = theano.function([x, b, g], [bn_op])
        res = f(X, G, B)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
        utt.verify_grad(bn_f, [X, G, B,
                               X.mean(axis=0)[numpy.newaxis], X.std(axis=0)[numpy.newaxis]])
Example #9
def test_batch_normalization():

    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    numpy.random.seed(1234)
    X = 1 + numpy.random.random([10, 20]).astype('float32')
    B = 1 + numpy.random.random([20]).astype('float32')
    G = 1 + numpy.random.random([20]).astype('float32')
    M = 1 + numpy.random.random([20]).astype('float32')
    V = 1 + numpy.random.random([20]).astype('float32')

    x = theano.tensor.matrix('x')
    b = theano.tensor.vector('b')
    g = theano.tensor.vector('g')
    m = theano.tensor.vector('m')
    v = theano.tensor.vector('v')

    bn_ref_op = bn_ref(x, g, b, m, v)
    f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)
    for mode in ['low_mem', 'high_mem']:
        bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
        f = theano.function([x, b, g, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
        utt.verify_grad(bn_f, [X, G, B, M, V])

    bn_ref_op = bn_ref(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True))
    f_ref = theano.function([x, b, g], [bn_ref_op])
    res_ref = f_ref(X, G, B)
    for mode in ['low_mem', 'high_mem']:
        bn_op = bn.batch_normalization(x, g, b, x.mean(axis=0, keepdims=True), x.std(axis=0, keepdims=True), mode=mode)
        f = theano.function([x, b, g], [bn_op])
        res = f(X, G, B)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
        utt.verify_grad(bn_f, [X, G, B,
                               X.mean(axis=0)[numpy.newaxis], X.std(axis=0)[numpy.newaxis]])
Example #10
 def conv_bn(inputs, gamma, beta, mean, std):
     return batch_normalization(
         inputs,
         gamma.dimshuffle("x", 0, "x", "x"),
         beta.dimshuffle("x", 0, "x", "x"),
         mean.dimshuffle("x", 0, "x", "x"),
         std.dimshuffle("x", 0, "x", "x"),
         mode=mode,
     )
Example #11
 def _inference(self, input_):
     output = bn.batch_normalization(input_,
         self.gamma.dimshuffle(*self.pattern),
         self.beta.dimshuffle(*self.pattern),
         self.pop_means.dimshuffle(*self.pattern),
         tensor.sqrt(self.pop_vars.dimshuffle(*self.pattern) +
                     self.epsilon),
         mode='low_mem')
     return output
Example #12
 def _inference(self, input_):
     output = bn.batch_normalization(
         input_,
         self.gamma.dimshuffle(*self.pattern),
         self.beta.dimshuffle(*self.pattern),
         self.pop_means.dimshuffle(*self.pattern),
         tensor.sqrt(
             self.pop_vars.dimshuffle(*self.pattern) + self.epsilon),
         mode='low_mem')
     return output
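The two _inference snippets above highlight a detail that is easy to miss: batch_normalization expects a standard deviation, not a variance, so a stored population variance is converted with sqrt(var + epsilon) first. Below is a minimal sketch of that inference-time call; the shared variables, their values, and the epsilon are hypothetical stand-ins for statistics accumulated during training, not code from the projects above.

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import bn

n_features = 20
epsilon = 1e-4
floatX = theano.config.floatX

# Hypothetical population statistics and affine parameters (placeholder values).
pop_mean = theano.shared(np.zeros(n_features, dtype=floatX), name="pop_mean")
pop_var = theano.shared(np.ones(n_features, dtype=floatX), name="pop_var")
gamma = theano.shared(np.ones(n_features, dtype=floatX), name="gamma")
beta = theano.shared(np.zeros(n_features, dtype=floatX), name="beta")

x = T.matrix("x")
out = bn.batch_normalization(
    x,
    gamma.dimshuffle("x", 0),
    beta.dimshuffle("x", 0),
    mean=pop_mean.dimshuffle("x", 0),
    # variance -> standard deviation before handing it to batch_normalization
    std=T.sqrt(pop_var.dimshuffle("x", 0) + epsilon),
    mode="low_mem",
)
predict = theano.function([x], out)

With zero means and unit variances as above, the output is approximately gamma * x + beta, which makes for a convenient sanity check.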
Example #13
File: ae.py Project: mufan-li/sg
 def get_reconstructed_input(self, hidden):
     lin_output = T.dot(hidden, self.W_prime) + self.b_prime
     bn_output = batch_normalization(
         inputs=lin_output,
         gamma=self.gamma_o,
         beta=self.beta_o,
         mean=lin_output.mean((0,), keepdims=True),
         std=lin_output.std((0,), keepdims=True),
         mode="low_mem",
     )
     return self.actv_fcn(bn_output)
Example #14
File: ae.py Project: mufan-li/sg
 def get_hidden_values(self, input):
     lin_output = T.dot(input, self.W) + self.b
     bn_output = batch_normalization(
         inputs=lin_output,
         gamma=self.gamma_h,
         beta=self.beta_h,
         mean=lin_output.mean((0,), keepdims=True),
         std=lin_output.std((0,), keepdims=True),
         mode="low_mem",
     )
     return self.actv_fcn(bn_output)
Example #15
 def _training(self, input_):
     self.batch_means = input_.mean(axis=self.axes, keepdims=False,
                                    dtype=floatX)
     self.batch_vars = input_.var(axis=self.axes, keepdims=False)
     output = bn.batch_normalization(input_,
         self.gamma.dimshuffle(*self.pattern),
         self.beta.dimshuffle(*self.pattern),
         self.batch_means.dimshuffle(*self.pattern),
         tensor.sqrt(self.batch_vars.dimshuffle(*self.pattern) +
                     self.epsilon),
         mode='low_mem')
     return output
Example #16
def test_bn_feature_maps():
    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    np.random.seed(1234)
    X = 1 + np.random.random([2, 3, 4, 4]).astype("float32")
    B = 1 + np.random.random([3]).astype("float32")
    G = 1 + np.random.random([3]).astype("float32")
    M = 1 + np.random.random([3]).astype("float32")
    V = 1 + np.random.random([3]).astype("float32")

    x = theano.tensor.tensor4("x")
    b = theano.tensor.vector("b")
    g = theano.tensor.vector("g")
    m = theano.tensor.vector("m")
    v = theano.tensor.vector("v")

    bn_ref_op = bn_ref(
        x,
        g.dimshuffle("x", 0, "x", "x"),
        b.dimshuffle("x", 0, "x", "x"),
        m.dimshuffle("x", 0, "x", "x"),
        v.dimshuffle("x", 0, "x", "x"),
    )
    f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)

    for mode in ["low_mem", "high_mem"]:
        bn_op = bn.batch_normalization(
            x,
            g.dimshuffle("x", 0, "x", "x"),
            b.dimshuffle("x", 0, "x", "x"),
            m.dimshuffle("x", 0, "x", "x"),
            v.dimshuffle("x", 0, "x", "x"),
            mode=mode,
        )
        f = theano.function([x, b, g, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def conv_bn(inputs, gamma, beta, mean, std):
            return bn.batch_normalization(
                inputs,
                gamma.dimshuffle("x", 0, "x", "x"),
                beta.dimshuffle("x", 0, "x", "x"),
                mean.dimshuffle("x", 0, "x", "x"),
                std.dimshuffle("x", 0, "x", "x"),
                mode=mode,
            )

        utt.verify_grad(conv_bn, [X, G, B, M, V])
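The feature-map tests rely on one recurring idiom: per-channel vectors are lifted to 4D with dimshuffle('x', 0, 'x', 'x') so that they broadcast over the batch and spatial axes. Here is a self-contained sketch of that pattern for an NCHW input; the shapes, the 1e-4 epsilon, and all names are illustrative rather than taken from the tests.

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import bn

floatX = theano.config.floatX

x = T.tensor4("x")         # (batch, channels, rows, cols)
gamma = T.vector("gamma")  # one scale per channel
beta = T.vector("beta")    # one shift per channel

# Per-channel batch statistics over the batch and spatial axes.
mean = x.mean(axis=(0, 2, 3), keepdims=True)
std = T.sqrt(x.var(axis=(0, 2, 3), keepdims=True) + 1e-4)

out = bn.batch_normalization(
    x,
    gamma.dimshuffle("x", 0, "x", "x"),
    beta.dimshuffle("x", 0, "x", "x"),
    mean,
    std,
    mode="high_mem",
)

f = theano.function([x, gamma, beta], out)
X = np.random.rand(2, 3, 4, 4).astype(floatX)
print(f(X, np.ones(3, dtype=floatX), np.zeros(3, dtype=floatX)).shape)  # (2, 3, 4, 4)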
Example #17
def test_bn_feature_maps():
    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    numpy.random.seed(1234)
    X = 1 + numpy.random.random([10, 20, 4, 4]).astype("float32")
    B = 1 + numpy.random.random([20]).astype("float32")
    G = 1 + numpy.random.random([20]).astype("float32")
    M = 1 + numpy.random.random([20]).astype("float32")
    V = 1 + numpy.random.random([20]).astype("float32")

    x = theano.tensor.tensor4("x")
    b = theano.tensor.vector("b")
    g = theano.tensor.vector("g")
    m = theano.tensor.vector("m")
    v = theano.tensor.vector("v")

    bn_ref_op = bn_ref(
        x,
        g.dimshuffle("x", 0, "x", "x"),
        b.dimshuffle("x", 0, "x", "x"),
        m.dimshuffle("x", 0, "x", "x"),
        v.dimshuffle("x", 0, "x", "x"),
    )
    f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)

    for mode in ["low_mem", "high_mem"]:
        bn_op = batch_normalization(
            x,
            g.dimshuffle("x", 0, "x", "x"),
            b.dimshuffle("x", 0, "x", "x"),
            m.dimshuffle("x", 0, "x", "x"),
            v.dimshuffle("x", 0, "x", "x"),
            mode=mode,
        )
        f = theano.function([x, b, g, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def conv_bn(inputs, gamma, beta, mean, std):
            return batch_normalization(
                inputs,
                gamma.dimshuffle("x", 0, "x", "x"),
                beta.dimshuffle("x", 0, "x", "x"),
                mean.dimshuffle("x", 0, "x", "x"),
                std.dimshuffle("x", 0, "x", "x"),
                mode=mode,
            )

        utt.verify_grad(conv_bn, [X, G, B, M, V])
Example #18
    def __init__(self,
                 input1,
                 n_in,
                 n_out,
                 W_values=None,
                 b_values=None,
                 activation=T.tanh,
                 batch_norm=True):
        self.input1 = input1
        self.W = theano.shared(value=W_values, name='W', borrow=True)
        self.b = theano.shared(value=b_values, name='b', borrow=True)
        lin_output = T.dot(input1, self.W) + self.b

        # self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma')
        # self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta')
        # bn_output = batch_normalization(inputs = lin_output,
        #                                 gamma = self.gamma, beta = self.beta, mean = lin_output.mean((0,), keepdims=True),
        #                                 std = lin_output.std((0,), keepdims = True),
        #                                 mode='high_mem')
        # self.output1 = (
        #     bn_output if activation is None
        #     else activation(bn_output)
        # )
        if batch_norm:
            self.gamma = theano.shared(value=numpy.ones(
                (n_out, ), dtype=theano.config.floatX),
                                       name='gamma',
                                       borrow=True)
            self.beta = theano.shared(value=numpy.zeros(
                (n_out, ), dtype=theano.config.floatX),
                                      name='beta',
                                      borrow=True)
            # bn_output = batch_normalization(inputs = self.linear,
            #     	                    gamma = self.gamma, beta = self.beta, mean = self.linear.mean((0,), keepdims=True),
            #     	                    std = T.ones_like(self.linear.var((0,), keepdims = True)), mode='high_mem')
            #            xmean = lin_output.mean(0, keepdims=True)
            #            xstd = T.sqrt(lin_output.std(0, keepdims=True)**2+1e-6)

            bn_output = batch_normalization(
                inputs=lin_output,
                gamma=self.gamma,
                beta=self.beta,
                mean=lin_output.mean(0, keepdims=True),
                std=T.sqrt(lin_output.std(0, keepdims=True)**2 + 1e-6),
                mode='high_mem')
            self.output1 = T.clip(bn_output, 0, 40)
            self.params = [self.W, self.b, self.gamma, self.beta]

        else:
            self.output1 = (lin_output
                            if activation is None else activation(lin_output))
            self.params = [self.W, self.b]
Example #19
def bn_layer(x, a, b, normParam, params, phase):
    ''' Apply BN.

    # phase = 0 : BN eval with m1v1, BN updates weighted average
    # phase = 1 : BN eval with m2v2, no BN updates

    '''

    minAlpha = params.movingAvMin
    iterStep = params.movingAvStep
    # compute mean & variance
    if params.model == 'convnet':
        mean1 = T.mean(x, axis=(0, 2, 3))
        var1 = T.var(x, axis=(0, 2, 3))
    else:
        mean1 = T.mean(x, axis=0)
        var1 = T.var(x, axis=0)

    # moving average as a proxy for the validation model
    alpha = (1. - phase) * T.maximum(minAlpha, 1. / normParam['iter'])
    mean2 = (1. - alpha) * normParam['mean'] + alpha * mean1
    var2 = (1. - alpha) * normParam['var'] + alpha * var1

    mean = (1. - phase) * mean2 + phase * mean1
    var = (1. - phase) * var2 + phase * var1
    std = T.sqrt(var + eps)

    # apply transformation:
    if params.model == 'convnet':
        x = bn.batch_normalization(x,
                                   a.dimshuffle('x', 0, 'x', 'x'),
                                   b.dimshuffle('x', 0, 'x', 'x'),
                                   mean.dimshuffle('x', 0, 'x', 'x'),
                                   std.dimshuffle('x', 0, 'x', 'x'),
                                   mode='high_mem')
    else:
        x = bn.batch_normalization(x, a, b, mean, std)
    updateBN = [mean2, var2, mean1, var1, normParam['iter'] + iterStep]
    return x, updateBN
Example #20
 def _training(self, input_):
     self.batch_means = input_.mean(axis=self.axes,
                                    keepdims=False,
                                    dtype=floatX)
     self.batch_vars = input_.var(axis=self.axes, keepdims=False)
     output = bn.batch_normalization(
         input_,
         self.gamma.dimshuffle(*self.pattern),
         self.beta.dimshuffle(*self.pattern),
         self.batch_means.dimshuffle(*self.pattern),
         tensor.sqrt(
             self.batch_vars.dimshuffle(*self.pattern) + self.epsilon),
         mode='low_mem')
     return output
Example #21
    def get_cnn2_log_prob(self, X, Z, w1, w2, w3, b3, w4, b4, gamma1, beta1,
                          gamma2, beta2, gamma3, beta3, pool_horiz, n_conv,
                          dropout, deterministic):

        l1 = T.nnet.relu(
            T.nnet.conv2d(X, w1, border_mode='valid', subsample=(1, 1)))
        bn1 = batch_normalization(inputs=l1, gamma=gamma1, beta=beta1,
                                  mean=l1.mean((0,), keepdims=True),
                                  std=T.ones_like(l1.var((0,), keepdims=True)),
                                  mode='high_mem')
        l2 = max_pool_2d(bn1,
                         ds=(1, pool_horiz),
                         st=(1, 1),
                         ignore_border=True)

        l3 = T.nnet.relu(
            T.nnet.conv2d(l2, w2, border_mode='valid', subsample=(1, 1)))
        bn2 = batch_normalization(inputs=l3, gamma=gamma2, beta=beta2,
                                  mean=l3.mean((0,), keepdims=True),
                                  std=T.ones_like(l3.var((0,), keepdims=True)),
                                  mode='high_mem')
        l4 = max_pool_2d(bn2,
                         ds=(1, pool_horiz),
                         st=(1, 1),
                         ignore_border=True)

        l5 = l4.reshape((X.shape[0], n_conv))

        l5 = self.add_dropout(l5, dropout, deterministic)
        l6 = T.nnet.relu(T.dot(l5, w3) + b3)
        l6 = self.add_dropout(l6, dropout, deterministic)

        l7 = T.concatenate([l6, Z], axis=1)

        l8 = T.dot(l7, w4) + b4
        bn3 = batch_normalization(inputs=l8, gamma=gamma3, beta=beta3,
                                  mean=l8.mean((0,), keepdims=True),
                                  std=T.ones_like(l8.var((0,), keepdims=True)),
                                  mode='high_mem')
        #self.helper_fn = theano.function(inputs=[X, Z], outputs=[bn3], allow_input_downcast=True)
        log_prob = T.nnet.logsoftmax(bn3)

        return log_prob
Example #22
    def get_output(self, input, **kwargs):
        input_mean = input.mean(self.axes)
        # input_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon))
        input_std = T.sqrt(input.var(self.axes) + self.epsilon)

        # Decide whether to use the stored averages or mini-batch statistics
        use_averages = self.deterministic
        if use_averages:
            mean = self.mean
            std = self.std
        else:
            mean = input_mean
            std = input_std

        # Decide whether to update the stored averages
        update_averages = self.update_averages and not use_averages
        if update_averages:
            # Trick: To update the stored statistics, we create memory-aliased
            # clones of the stored statistics:
            running_mean = theano.clone(self.mean, share_inputs=False)
            running_std = theano.clone(self.std, share_inputs=False)
            # set a default update for them:
            running_mean.default_update = ((1 - self.alpha) * running_mean +
                                           self.alpha * input_mean)
            running_std.default_update = ((1 - self.alpha) * running_std +
                                          self.alpha * input_std)
            # and make sure they end up in the graph without participating in
            # the computation (this way their default_update will be collected
            # and applied, but the computation will be optimized away):
            mean += 0 * running_mean
            std += 0 * running_std

        # prepare dimshuffle pattern inserting broadcastable axes as needed
        param_axes = iter(list(range(input.ndim - len(self.axes))))
        pattern = ['x' if input_axis in self.axes
                   else next(param_axes)
                   for input_axis in range(input.ndim)]

        # apply dimshuffle pattern to all parameters
        beta = 0 if self.beta is None else self.beta.dimshuffle(pattern)
        gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern)
        mean = mean.dimshuffle(pattern)
        std = std.dimshuffle(pattern)

        # normalize
        # normalized = (input - mean) * (gamma * std) + beta
        normalized = batch_normalization(
            input, gamma, beta, mean, std, mode='low_mem')
        return self.activation(normalized)
Example #23
def bn_layer(x, a, b, normParam, params, phase):

    ''' Apply BN.

    # phase = 0 : BN eval with m1v1, BN updates weighted average
    # phase = 1 : BN eval with m2v2, no BN updates

    '''
        
    minAlpha = params.movingAvMin
    iterStep = params.movingAvStep                  
    # compute mean & variance    
    if params.model == 'convnet':
        mean1 = T.mean(x, axis = (0, 2, 3))
        var1 = T.var(x, axis = (0, 2, 3))
    else:
        mean1 = T.mean(x, axis = 0)
        var1 = T.var(x, axis = 0)

    # moving average as a proxy for the validation model
    alpha = (1.-phase)*T.maximum(minAlpha, 1./normParam['iter'])                     
    mean2 = (1.-alpha)*normParam['mean'] + alpha*mean1 
    var2 = (1.-alpha)*normParam['var'] + alpha*var1   

    mean = (1.-phase)*mean2 + phase*mean1 
    var = (1.-phase)*var2 + phase*var1
    std = T.sqrt(var+eps)

    # apply transformation: 
    if params.model == 'convnet':
        x = bn.batch_normalization(x, a.dimshuffle('x', 0, 'x', 'x'), b.dimshuffle('x', 0, 'x', 'x'), 
                                mean.dimshuffle('x', 0, 'x', 'x'), std.dimshuffle('x', 0, 'x', 'x'), mode='high_mem')
    else:    
        x = bn.batch_normalization(x, a, b, mean, std) 
    updateBN = [mean2, var2, mean1, var1, normParam['iter']+iterStep]  
    return x, updateBN
Example #24
    def __init__(self,
                 rng,
                 input,
                 n_in,
                 n_out,
                 W=None,
                 b=None,
                 activation=T.tanh,
                 bn=False):

        self.input = input
        if W is None:
            W_values = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)),
                                     dtype=theano.config.floatX)
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4
            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b_values = numpy.zeros((n_out, ), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)

        self.W, self.b = W, b
        lin_output = T.dot(input, self.W) + self.b
        if bn:
            self.gamma = theano.shared(value=numpy.ones(
                (n_out, ), dtype=theano.config.floatX),
                                       name='gamma')
            self.beta = theano.shared(value=numpy.zeros(
                (n_out, ), dtype=theano.config.floatX),
                                      name='beta')
            mean = lin_output.mean(0, keepdims=True)
            std = T.sqrt(lin_output.std(0, keepdims=True)**2 + 0.01)
            output = batch_normalization(inputs=lin_output,
                                         gamma=self.gamma,
                                         beta=self.beta,
                                         mean=mean,
                                         std=std)
        else:
            output = lin_output

        self.output = (output if activation is None else activation(output))
        # parameters of the model
        self.params = [self.W, self.b, self.gamma, self.beta
                       ] if bn else [self.W, self.b]
Example #25
def test_bn_feature_maps():

    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    numpy.random.seed(1234)
    X = 1 + numpy.random.random([2, 3, 4, 4]).astype('float32')
    B = 1 + numpy.random.random([3]).astype('float32')
    G = 1 + numpy.random.random([3]).astype('float32')
    M = 1 + numpy.random.random([3]).astype('float32')
    V = 1 + numpy.random.random([3]).astype('float32')

    x = theano.tensor.tensor4('x')
    b = theano.tensor.vector('b')
    g = theano.tensor.vector('g')
    m = theano.tensor.vector('m')
    v = theano.tensor.vector('v')

    bn_ref_op = bn_ref(x,
                       g.dimshuffle('x', 0, 'x', 'x'),
                       b.dimshuffle('x', 0, 'x', 'x'),
                       m.dimshuffle('x', 0, 'x', 'x'),
                       v.dimshuffle('x', 0, 'x', 'x'))
    f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)

    for mode in ['low_mem', 'high_mem']:
        bn_op = bn.batch_normalization(x,
                                       g.dimshuffle('x', 0, 'x', 'x'),
                                       b.dimshuffle('x', 0, 'x', 'x'),
                                       m.dimshuffle('x', 0, 'x', 'x'),
                                       v.dimshuffle('x', 0, 'x', 'x'),
                                       mode=mode)
        f = theano.function([x, b, g, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def conv_bn(inputs, gamma, beta, mean, std):
            return bn.batch_normalization(inputs,
                                          gamma.dimshuffle('x', 0, 'x', 'x'),
                                          beta.dimshuffle('x', 0, 'x', 'x'),
                                          mean.dimshuffle('x', 0, 'x', 'x'),
                                          std.dimshuffle('x', 0, 'x', 'x'),
                                          mode=mode)
        utt.verify_grad(conv_bn, [X, G, B, M, V])
Example #26
def test_bn_feature_maps():

    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    numpy.random.seed(1234)
    X = 1 + numpy.random.random([2, 3, 4, 4]).astype('float32')
    B = 1 + numpy.random.random([3]).astype('float32')
    G = 1 + numpy.random.random([3]).astype('float32')
    M = 1 + numpy.random.random([3]).astype('float32')
    V = 1 + numpy.random.random([3]).astype('float32')

    x = theano.tensor.tensor4('x')
    b = theano.tensor.vector('b')
    g = theano.tensor.vector('g')
    m = theano.tensor.vector('m')
    v = theano.tensor.vector('v')

    bn_ref_op = bn_ref(x,
                       g.dimshuffle('x', 0, 'x', 'x'),
                       b.dimshuffle('x', 0, 'x', 'x'),
                       m.dimshuffle('x', 0, 'x', 'x'),
                       v.dimshuffle('x', 0, 'x', 'x'))
    f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)

    for mode in ['low_mem', 'high_mem']:
        bn_op = bn.batch_normalization(x,
                                       g.dimshuffle('x', 0, 'x', 'x'),
                                       b.dimshuffle('x', 0, 'x', 'x'),
                                       m.dimshuffle('x', 0, 'x', 'x'),
                                       v.dimshuffle('x', 0, 'x', 'x'),
                                       mode=mode)
        f = theano.function([x, b, g, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def conv_bn(inputs, gamma, beta, mean, std):
            return bn.batch_normalization(inputs,
                                          gamma.dimshuffle('x', 0, 'x', 'x'),
                                          beta.dimshuffle('x', 0, 'x', 'x'),
                                          mean.dimshuffle('x', 0, 'x', 'x'),
                                          std.dimshuffle('x', 0, 'x', 'x'),
                                          mode=mode)
        utt.verify_grad(conv_bn, [X, G, B, M, V])
Example #27
	def __init__(self, x, n_in, n_out, dropout_on,
				layer=0, act=T.nnet.sigmoid,
				w = None, b = None, dropout_rate=0.3):
		if w is None:
			w = theano.shared(
				value=w_init(n_in, n_out),
				name='w'+str(layer),
				borrow=True
			)

		if b is None:
			b = theano.shared(
				value=b_init(n_out),
				name='b'+str(layer),
				borrow=True
			)

		self.w = w
		self.b = b
		self.gamma = theano.shared(value = numpy.ones((n_out,), 
						dtype=theano.config.floatX), name='gamma')
		self.beta = theano.shared(value = numpy.zeros((n_out,), 
						dtype=theano.config.floatX), name='beta')

		rng = np.random.RandomState(42)
		srng = RandomStreams(rng.randint(10**9))
		mask = srng.binomial(n=1, p=1-dropout_rate, size=x.shape)
		cast_mark = T.cast(mask, theano.config.floatX)

		drop_input = T.switch(dropout_on, x*cast_mark,x*(1-dropout_rate))
		lin_output = T.dot(drop_input, self.w) + self.b

		bn_output = batch_normalization(inputs = lin_output,
			gamma = self.gamma, beta = self.beta, 
			mean = lin_output.mean((0,), keepdims=True),
			std = lin_output.std((0,), keepdims = True),
						mode='low_mem')

		self.output = (
			bn_output if act is None
			else act(bn_output)
		)

		self.params = [self.w, self.b]
Example #28
    def __init__(self, rng, input, filter_shape, 
                 image_shape, use_bn = 1):
        assert image_shape[1] == filter_shape[1]
        self.input = input
        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = numpy.prod(filter_shape[1:])
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" /
        #   pooling size
        fan_out = filter_shape[0] * numpy.prod(filter_shape[2:])
        W_bound = numpy.sqrt(2. /(fan_in + fan_out))
        W_value = rng.normal(loc = 0., scale = W_bound, size = filter_shape)
        self.W = theano.shared(W_value, name = 'W', borrow = True)
        conv_out = conv2d(input = self.input, 
                   filters = self.W)        
        
#        pooled_out = pool.pool_2d(input = conv_out, 
#                                  ds=poolsize, ignore_border=True)                            
        
        b_bound = numpy.sqrt(2. /fan_out)
        b_value = rng.normal(loc = 0, scale = b_bound, size=(filter_shape[0],))
        self.b = theano.shared(b_value, name = 'b', borrow  = True)
        
        self.linear = conv_out + self.b.dimshuffle('x', 0, 'x','x')
        if use_bn == 1:
            self.gamma = theano.shared(value = numpy.ones((filter_shape[0],), dtype=theano.config.floatX), name='gamma')
            self.beta = theano.shared(value = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX), name='beta')
            self.linear_shuffle = self.linear.dimshuffle(0, 2, 3, 1)        
            self.linear_res = self.linear_shuffle.reshape(
                (self.linear.shape[0] * self.linear.shape[2] * self.linear.shape[3],
                 self.linear.shape[1]))
            bn_output = batch_normalization(inputs=self.linear_shuffle,
                                            gamma=self.gamma, beta=self.beta,
                                            mean=self.linear_res.mean((0,), keepdims=True),
                                            std=T.std(self.linear_res, axis=0),
                                            mode='high_mem')

            self.output = T.nnet.relu(bn_output.dimshuffle(0, 3, 1, 2))
            self.params = [self.W, self.b, self.gamma, self.beta]
        else:
            self.output = T.nnet.relu(self.linear)
            self.params = [self.W, self.b]
Example #29
def test_BNComposite():
    try:
        orig = theano.config.compute_test_value

        theano.config.compute_test_value = "raise"

        def bn_ref(x, G, B, M, V):
            n = (x - M) / V
            return n * G + B

        np.random.seed(1234)
        X = 1 + np.random.random([10, 20]).astype("float32")
        B = 1 + np.random.random([20]).astype("float32")
        G = 1 + np.random.random([20]).astype("float32")
        M = 1 + np.random.random([20]).astype("float32")
        V = 1 + np.random.random([20]).astype("float32")

        x = theano.tensor.matrix("x")
        b = theano.tensor.vector("b")
        g = theano.tensor.vector("g")
        m = theano.tensor.vector("m")
        v = theano.tensor.vector("v")

        x.tag.test_value = np.random.rand(2, 2).astype(theano.config.floatX)
        b.tag.test_value = np.random.rand(2).astype(theano.config.floatX)
        g.tag.test_value = np.random.rand(2).astype(theano.config.floatX)
        m.tag.test_value = np.random.rand(2).astype(theano.config.floatX)
        v.tag.test_value = np.random.rand(2).astype(theano.config.floatX)

        bn_ref_op = bn_ref(x, g, b, m, v)
        f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
        res_ref = f_ref(X, G, B, M, V)
        for mode in ["low_mem", "high_mem"]:
            bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
            f = theano.function([x, b, g, m, v], [bn_op])
            res = f(X, G, B, M, V)
            utt.assert_allclose(res_ref, res)
    finally:
        theano.config.compute_test_value = orig
Example #30
def test_BNComposite():
    try:
        orig = theano.config.compute_test_value

        theano.config.compute_test_value = 'raise'

        def bn_ref(x, G, B, M, V):
            n = (x - M) / V
            return n * G + B

        np.random.seed(1234)
        X = 1 + np.random.random([10, 20]).astype('float32')
        B = 1 + np.random.random([20]).astype('float32')
        G = 1 + np.random.random([20]).astype('float32')
        M = 1 + np.random.random([20]).astype('float32')
        V = 1 + np.random.random([20]).astype('float32')

        x = theano.tensor.matrix('x')
        b = theano.tensor.vector('b')
        g = theano.tensor.vector('g')
        m = theano.tensor.vector('m')
        v = theano.tensor.vector('v')

        x.tag.test_value = np.random.rand(2, 2).astype(theano.config.floatX)
        b.tag.test_value = np.random.rand(2).astype(theano.config.floatX)
        g.tag.test_value = np.random.rand(2).astype(theano.config.floatX)
        m.tag.test_value = np.random.rand(2).astype(theano.config.floatX)
        v.tag.test_value = np.random.rand(2).astype(theano.config.floatX)

        bn_ref_op = bn_ref(x, g, b, m, v)
        f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
        res_ref = f_ref(X, G, B, M, V)
        for mode in ['low_mem', 'high_mem']:
            bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
            f = theano.function([x, b, g, m, v], [bn_op])
            res = f(X, G, B, M, V)
            utt.assert_allclose(res_ref, res)
    finally:
        theano.config.compute_test_value = orig
Example #31
def test_BNComposite():
    try:
        orig = theano.config.compute_test_value

        theano.config.compute_test_value = 'raise'

        def bn_ref(x, G, B, M, V):
            n = (x - M) / V
            return n * G + B

        numpy.random.seed(1234)
        X = 1 + numpy.random.random([10, 20]).astype('float32')
        B = 1 + numpy.random.random([20]).astype('float32')
        G = 1 + numpy.random.random([20]).astype('float32')
        M = 1 + numpy.random.random([20]).astype('float32')
        V = 1 + numpy.random.random([20]).astype('float32')

        x = theano.tensor.matrix('x')
        b = theano.tensor.vector('b')
        g = theano.tensor.vector('g')
        m = theano.tensor.vector('m')
        v = theano.tensor.vector('v')

        x.tag.test_value = numpy.random.rand(2, 2).astype(theano.config.floatX)
        b.tag.test_value = numpy.random.rand(2).astype(theano.config.floatX)
        g.tag.test_value = numpy.random.rand(2).astype(theano.config.floatX)
        m.tag.test_value = numpy.random.rand(2).astype(theano.config.floatX)
        v.tag.test_value = numpy.random.rand(2).astype(theano.config.floatX)

        bn_ref_op = bn_ref(x, g, b, m, v)
        f_ref = theano.function([x, b, g, m, v], [bn_ref_op])
        res_ref = f_ref(X, G, B, M, V)
        for mode in ['low_mem', 'high_mem']:
            bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
            f = theano.function([x, b, g, m, v], [bn_op])
            res = f(X, G, B, M, V)
            utt.assert_allclose(res_ref, res)
    finally:
        theano.config.compute_test_value = orig
Example #32
    def __init__(self,
                 rng,
                 is_train,
                 input,
                 n_in,
                 n_out,
                 dropout_rate=0.5,
                 W=None,
                 b=None,
                 activation=ReLu):
        self.input = input
        p = dropout_rate

        W = numpy.asarray(numpy.random.normal(loc=0.0,
                                              scale=0.05,
                                              size=(n_in, n_out)),
                          dtype=theano.config.floatX)
        self.W = theano.shared(W, borrow=True)

        b = numpy.zeros((n_out, ), dtype=theano.config.floatX)
        self.b = theano.shared(value=b, borrow=True)

        linearOutput = T.dot(self.input, self.W) + self.b
        train_output = drop(input=np.cast[theano.config.floatX](1. / p) *
                            linearOutput,
                            p=dropout_rate,
                            rng=rng)
        tempOutPut = T.switch(T.neq(is_train, 0), train_output, linearOutput)

        bnOutput = bn.batch_normalization(inputs=tempOutPut,
                                          gamma=1.,
                                          beta=0,
                                          mean=T.mean(tempOutPut),
                                          std=T.std(tempOutPut))

        self.output = activation(bnOutput)

        self.params = [self.W, self.b]
Example #33
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=70,
                    filter_size=[3, 1],
                    maxSentLen=70,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    "first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec embeddinds to initialize words, uncovered"
    "words keep random initialization"
    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    'Use word ids in sentences to retrieve word embeddings from matrix "init_embeddings", each sentence will be in'
    'tensor2 (emb_size, sen_length), then the minibatch will be in tensor3 (batch_size, emb_size, sen_length) '
    embed_input_l = init_embeddings[sents_ids_l.flatten(
    )].reshape((batch_size, maxSentLen, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)
    '''create parameters for attentive convolution function '''
    gate_filter_shape = (emb_size, 1, emb_size, 1)
    conv_W_pre, conv_b_pre = create_conv_para(rng,
                                              filter_shape=gate_filter_shape)
    conv_W_gate, conv_b_gate = create_conv_para(rng,
                                                filter_shape=gate_filter_shape)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    conv_W2_context, conv_b2_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [
        conv_W, conv_b, conv_W_context, conv_W_pre, conv_b_pre, conv_W_gate,
        conv_b_gate, conv_W2, conv_b2, conv_W2_context
    ]

    "A gated convolution layer to form more expressive word representations in each sentence"
    "input tensor3 (batch_size, emb_size, sen_length), output tensor3 (batch_size, emb_size, sen_length)"
    conv_layer_gate_l = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=embed_input_l,
        mask_matrix=sents_mask_l,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_pre,
        b=conv_b_pre,
        W_gate=conv_W_gate,
        b_gate=conv_b_gate)
    conv_layer_gate_r = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=embed_input_r,
        mask_matrix=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_pre,
        b=conv_b_pre,
        W_gate=conv_W_gate,
        b_gate=conv_b_gate)
    '''
    attentive convolution function, two sizes of filter_width 3&1 are used. Multi-channel
    '''

    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W2,
        b=conv_b2,
        W_context=conv_W2_context,
        b_context=conv_b2_context)
    attentive_sent_embeddings_l2 = attentive_conv_layer2.attentive_maxpool_vec_l
    attentive_sent_embeddings_r2 = attentive_conv_layer2.attentive_maxpool_vec_r

    "Batch normalization for the four output sentence representation vectors"
    gamma = theano.shared(np.asarray(rng.uniform(
        low=-1.0 / math.sqrt(hidden_size[0]),
        high=1.0 / math.sqrt(hidden_size[0]),
        size=(hidden_size[0])),
                                     dtype=theano.config.floatX),
                          borrow=True)
    beta = theano.shared(np.zeros((hidden_size[0]),
                                  dtype=theano.config.floatX),
                         borrow=True)
    bn_params = [gamma, beta]
    bn_attentive_sent_embeddings_l = batch_normalization(
        inputs=attentive_sent_embeddings_l,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_l.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_l.std((0, ), keepdims=True),
        mode='low_mem')
    bn_attentive_sent_embeddings_r = batch_normalization(
        inputs=attentive_sent_embeddings_r,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_r.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_r.std((0, ), keepdims=True),
        mode='low_mem')

    bn_attentive_sent_embeddings_l2 = batch_normalization(
        inputs=attentive_sent_embeddings_l2,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_l2.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_l2.std((0, ), keepdims=True),
        mode='low_mem')
    bn_attentive_sent_embeddings_r2 = batch_normalization(
        inputs=attentive_sent_embeddings_r2,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_r2.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_r2.std((0, ), keepdims=True),
        mode='low_mem')

    "Before logistic regression layer, we insert a hidden layer. Now form input to HL classifier"
    HL_layer_1_input = T.concatenate([
        bn_attentive_sent_embeddings_l, bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l + bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l * bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l2, bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 + bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 * bn_attentive_sent_embeddings_r2
    ],
                                     axis=1)
    HL_layer_1_input_size = 8 * hidden_size[0]
    "Create hidden layer parameters"
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, HL_layer_1_input_size, hidden_size[1])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    "Hidden Layer and batch norm to its output again"
    HL_layer_1 = HiddenLayer(rng,
                             input=HL_layer_1_input,
                             n_in=HL_layer_1_input_size,
                             n_out=hidden_size[1],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    gamma_HL = theano.shared(np.asarray(rng.uniform(
        low=-1.0 / math.sqrt(hidden_size[1]),
        high=1.0 / math.sqrt(hidden_size[1]),
        size=(hidden_size[1])),
                                        dtype=theano.config.floatX),
                             borrow=True)
    beta_HL = theano.shared(np.zeros((hidden_size[1]),
                                     dtype=theano.config.floatX),
                            borrow=True)
    bn_params_HL = [gamma_HL, beta_HL]
    bn_HL_output = batch_normalization(inputs=HL_layer_1.output,
                                       gamma=gamma_HL,
                                       beta=beta_HL,
                                       mean=HL_layer_1.output.mean(
                                           (0, ), keepdims=True),
                                       std=HL_layer_1.output.std(
                                           (0, ), keepdims=True),
                                       mode='low_mem')
    "Form input to LR classifier"
    LR_input = T.concatenate([HL_layer_1_input, bn_HL_output], axis=1)
    LR_input_size = HL_layer_1_input_size + hidden_size[1]
    U_a = create_ensemble_para(rng, 3, LR_input_size)  # (input_size, 3)
    LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    "Logistic Regression layer"
    layer_LR = LogisticRegression(
        rng,
        input=normalize_matrix_col_wise(LR_input),
        n_in=LR_input_size,
        n_out=3,
        W=U_a,
        b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = [
        init_embeddings
    ] + NN_para + LR_para + bn_params + HL_layer_1_params + bn_params_HL
    cost = loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  # shuffle the training set at each new epoch; expected to improve performance, though not guaranteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            if (epoch == 1 and iter % 1000 == 0) or (epoch >= 2
                                                     and iter % 5 == 0):
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                dev_error_sum = 0.0
                for dev_batch_id in dev_batch_start:  # for each dev batch
                    dev_error_i = dev_model(
                        dev_sents_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_sents_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_labels_store[dev_batch_id:dev_batch_id +
                                         batch_size])

                    dev_error_sum += dev_error_i
                dev_acc = 1.0 - dev_error_sum / (len(dev_batch_start))

                if dev_acc > max_acc_dev:
                    max_acc_dev = dev_acc
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev
                    # new best dev model: evaluate on the test set
                    error_sum = 0.0
                    for test_batch_id in test_batch_start:  # for each test batch
                        error_i = test_model(
                            test_sents_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_sents_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_labels_store[test_batch_id:test_batch_id +
                                              batch_size])

                        error_sum += error_i
                    test_acc = 1.0 - error_sum / (len(test_batch_start))

                    if test_acc > max_acc_test:
                        max_acc_test = test_acc
                    print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test
                else:
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
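The update rule above comes from the helper Gradient_Cost_Para, whose implementation is not shown here. As a rough sketch of what an AdaGrad-style update looks like in Theano (the function name, argument list, and epsilon constant below are illustrative assumptions, not the helper's actual code):

import numpy as np
import theano
import theano.tensor as T

def adagrad_updates(cost, params, learning_rate, eps=1e-8):
    """Minimal AdaGrad sketch: scale each gradient by the root of its
    accumulated squared history."""
    updates = []
    for p in params:
        grad = T.grad(cost, p)
        # one accumulator of squared gradients per parameter
        acc = theano.shared(np.zeros(p.get_value().shape,
                                     dtype=theano.config.floatX))
        acc_new = acc + grad ** 2
        updates.append((acc, acc_new))
        updates.append((p, p - learning_rate * grad / (T.sqrt(acc_new) + eps)))
    return updates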
    def __init__(self, input, rng, n_in, n_out, stochastic=False, binary=True):
        # initialize the weights W as a matrix of shape (n_in, n_out), sampled uniformly in [-1, 1]
        self.high = 2.#numpy.float32(numpy.sqrt(6. / (float(n_in) + float(n_out) )))
        self.W0 = numpy.float32(self.high/2.)        
        #self.W = theano.shared(value=numpy.zeros((n_in, n_out),
        #                         dtype=theano.config.floatX),
        #                         name='W', borrow=True)
        #self.high = numpy.float32(2.)
        #self.W0 = numpy.float32(self.high/2.)
        W_values = numpy.asarray(
            rng.uniform(
                low= -1.,#numpy.sqrt(6. / (n_in + n_out)),
                high= 1.,#numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype=theano.config.floatX
        )
        #srng = RandomStreams(seed=420) 
        srng = theano.sandbox.rng_mrg.MRG_RandomStreams(420)
        self.W = theano.shared(value=W_values, name='W', borrow=True)

        # initialize the biases b as a vector of n_out 0s
        self.b = theano.shared(value=numpy.zeros((n_out,),
                                dtype=theano.config.floatX),
                                name='b', borrow=True)

        self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma')
        self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta')
        
        def hard_sigma(w):
            p=T.clip((w+1)/2,0,1)
            return p

        if binary:
            Wb = hard_sigma(self.W/self.W0)
            if stochastic:
                #Wb = T.cast(numpy.random.binomial(n=1, p=T.ge(Wb), size=(n_in, n_out)),  theano.config.floatX)
                Wb = srng.binomial(n=1, p=Wb, size=(n_in, n_out) )

            else:
                Wb = T.round(Wb)

            # Leave below alone
            Wb = T.switch(Wb,self.W0, -self.W0)
            #Wb = T.cast(T.switch(Wb,self.W0, -self.W0), dtype=theano.config.floatX)
            #Wb = theano.shared(Wb.eval(), name='Wb', borrow=True)
            self.Wb = Wb

        else:
            self.Wb = self.W

        # linear pre-activation using the (possibly binarized) weights
        self.linear = T.dot(input, self.Wb) + self.b

        bn_output = batch_normalization(inputs = self.linear,
                    gamma = self.gamma, beta = self.beta, mean = self.linear.mean((0,), keepdims=True),
                    std = T.ones_like(self.linear.var((0,), keepdims = True)), mode='high_mem')

        self.linear_output = bn_output
        self.y_pred = T.argmax(bn_output, axis=1)

        if binary:
            self.params = [self.W, self.Wb, self.gamma, self.beta, self.b]

        elif binary==False:
            self.params = [self.Wb, self.gamma, self.beta, self.b]

        self.len_params = len(self.params)
        self.n_in=n_in
        # keep track of model input
        self.input = input
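A small NumPy illustration of the two binarization paths in the layer above (a standalone sketch of the arithmetic, not part of the Theano graph): the hard sigmoid maps each real-valued weight to a probability, which is either rounded (deterministic) or sampled (stochastic) before being mapped to +W0 or -W0.

import numpy as np

rng = np.random.RandomState(420)
W = rng.uniform(-1.0, 1.0, size=(4, 3)).astype("float32")
W0 = 1.0                                            # high / 2 with high = 2.

p = np.clip((W / W0 + 1.0) / 2.0, 0.0, 1.0)         # hard_sigma
wb_det = np.where(np.round(p), W0, -W0)             # deterministic rounding
wb_sto = np.where(rng.binomial(1, p), W0, -W0)      # stochastic sampling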
    def __init__(self, input, rng, n_in, n_out, stochastic=False, binary=True):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """
        # initialize the weights W as a matrix of shape (n_in, n_out), sampled uniformly in [-1, 1]
        # self.W = theano.shared(
        #     value=numpy.zeros(
        #         (n_in, n_out),
        #         dtype=theano.config.floatX
        #     ),
        #     name='W',
        #     borrow=True
        # )
        W_values = numpy.asarray(
            rng.uniform(
                low= -1.,#numpy.sqrt(6. / (n_in + n_out)),
                high= 1.,#numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ),
            dtype=theano.config.floatX
        )
        self.W = theano.shared(value=W_values, name='W', borrow=True)

        # initialize the biases b as a vector of n_out 0s
        self.b = theano.shared(
            value=numpy.zeros(
                (n_out,),
                dtype=theano.config.floatX
            ),
            name='b',
            borrow=True
        )

        self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma')
        self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta')
        
        self.high = 2. #numpy.float32(numpy.sqrt(6. / (n_in + n_out)))
        self.W0 = numpy.float32(self.high/2)
        #srng = RandomStreams(seed=420) 
        srng = theano.sandbox.rng_mrg.MRG_RandomStreams(420)
        
        def hard_sigma(w):
            return T.clip((w+1.)/2,0,1)

        if binary:
            if stochastic:
                Wb = hard_sigma(self.W/self.W0)
                # using numpy was insanely slow and it caused issues with having to evaluate the function
                #Wb = T.cast(numpy.random.binomial(n=1, p=Wb, size=(n_in, n_out)),  theano.config.floatX)
                Wb = srng.binomial(n=1, p=Wb, size=(n_in, n_out) )         # This works much better

            else:
                # T.ge is greater than or equal to
                #Wb = T.ge(Wb, 0)
                Wb = T.ge(self.W, 0)
                #Wb = T.round(Wb)

            Wb = T.switch(Wb, self.W0, -self.W0)
            self.Wb = Wb

            # The code below was way slower
            #Wb = T.cast(T.switch(Wb,self.W0, -self.W0), dtype=theano.config.floatX)
            #Wb = theano.shared(Wb.eval(), name='Wb', borrow=True)

        else:
            self.Wb = self.W
        
        # symbolic expression for computing the matrix of class-membership
        # probabilities
        # Where:
        # W is a matrix where column-k represent the separation hyperplane for
        # class-k
        # x is a matrix where row-j  represents input training sample-j
        # b is a vector where element-k represent the free parameter of
        # hyperplane-k
        
        self.linear=T.dot(input, self.Wb) + self.b

        bn_output = batch_normalization(inputs = self.linear,
                    gamma = self.gamma, beta = self.beta, mean = self.linear.mean((0,), keepdims=True),
                    std = T.ones_like(self.linear.var((0,), keepdims = True)), mode='high_mem')
        
        self.p_y_given_x = T.nnet.softmax(bn_output)
                          
                          
        # symbolic description of how to compute prediction as class whose
        # probability is maximal
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # parameters of the model
        if binary:
            self.params = [self.W, self.Wb, self.gamma, self.beta, self.b]

        elif not binary:
            self.params = [self.Wb, self.gamma, self.beta, self.b]

        self.len_params = len(self.params)
        
        self.n_in=n_in
        
        # keep track of model input
        self.input = input
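Several of the layers in these examples pass T.ones_like(...var(...)) as the std argument. Assuming batch_normalization computes (x - mean) / std * gamma + beta, that choice centers each output unit but does not rescale it, as this small NumPy check (an illustration, not code from the examples) shows:

import numpy as np

x = np.random.RandomState(0).randn(5, 3).astype("float32")
gamma = np.ones(3, dtype="float32")
beta = np.zeros(3, dtype="float32")

mean = x.mean(axis=0, keepdims=True)
std = np.ones_like(x.var(axis=0, keepdims=True))   # mirrors T.ones_like(...)
out = (x - mean) / std * gamma + beta

assert np.allclose(out, x - mean)                  # centering only, no rescaling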
Beispiel #36
0
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        """
        Typical hidden layer of a MLP: units are fully-connected and have
        sigmoidal activation function. Weight matrix W is of shape (n_in,n_out)
        and the bias vector b is of shape (n_out,).
        NOTE : The nonlinearity used here is tanh

        Hidden unit activation is given by: tanh(dot(input,W) + b)

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units

        :type activation: theano.Op or function
        :param activation: Non linearity to be applied in the hidden
                           layer
        """
        self.input = input

        # `W` is initialized with `W_values`, which is uniformly sampled
        # from -sqrt(6./(n_in+n_hidden)) to sqrt(6./(n_in+n_hidden))
        # for the tanh activation function.
        # The output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU.
        # Note : optimal initialization of weights is dependent on the
        #        activation function used (among other things).
        #        For example, results presented in [Xavier10] suggest that you
        #        should use 4 times larger initial weights for sigmoid
        #        compared to tanh.
        #        We have no info for other functions, so we use the same as
        #        tanh.

        if W is None:
            W_values = numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)

        self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma')
        self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta')

        self.W = W
        self.b = b

        lin_output = T.dot(input, self.W) + self.b
        
        bn_output = batch_normalization(inputs = lin_output,
            gamma = self.gamma, beta = self.beta, mean = lin_output.mean((0,), keepdims=True),
            std = lin_output.std((0,), keepdims = True),
                        mode='low_mem')

        self.output = (T.clip(bn_output, 0, 20) if activation == 'relu' else activation(bn_output))
        # self.output = (
        #     lin_output if activation is None
        #     else activation(lin_output)
        # )
        # parameters of the model
        self.params = [self.W, self.b, self.gamma, self.beta]
Beispiel #37
0
    def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """
        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        self.W = theano.shared(
            value=numpy.zeros(
                (n_in, n_out),
                dtype=theano.config.floatX
            ),
            name='W',
            borrow=True
        )
        # initialize the biases b as a vector of n_out 0s
        self.b = theano.shared(
            value=numpy.zeros(
                (n_out,),
                dtype=theano.config.floatX
            ),
            name='b',
            borrow=True
        )

        self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma')
        self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta')

        self.high = numpy.float32(numpy.sqrt(6. / (n_in + n_out)))
        self.W0 = numpy.float32(self.high/2)
        
        Wb = T.switch(T.ge(self.W.get_value(),0),self.W0,-self.W0).eval()
        Wb = theano.shared(Wb, name='Wb', borrow=True)
        
        self.Wb = Wb

        # symbolic expression for computing the matrix of class-membership
        # probabilities
        # Where:
        # W is a matrix where column-k represent the separation hyperplane for
        # class-k
        # x is a matrix where row-j  represents input training sample-j
        # b is a vector where element-k represent the free parameter of
        # hyperplane-k

        self.linear = T.dot(input, self.W) + self.b

        bn_output = batch_normalization(inputs = self.linear,
            gamma = self.gamma, beta = self.beta, mean = self.linear.mean((0,), keepdims=True),
            std = T.ones_like(self.linear.var((0,), keepdims = True)), mode='high_mem')

        self.linear.std((0,))  # no-op: the result of this expression is not used

        self.p_y_given_x = T.nnet.softmax(bn_output)

        # bn_output = lin_output
        # note: this second normalization of p_y_given_x is computed but never
        # used below; y_pred is taken directly from p_y_given_x
        bn_output = batch_normalization(inputs = self.p_y_given_x,
            gamma = self.gamma, beta = self.beta, mean = self.p_y_given_x.mean((0,), keepdims=True),
            std = T.ones_like(self.p_y_given_x.var((0,), keepdims = True)), mode='high_mem')

        self.p_y_given_x.std((0,))  # no-op: the result of this expression is not used
        # symbolic description of how to compute prediction as class whose
        # probability is maximal
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # parameters of the model
        self.params = [self.Wb, self.b, self.gamma, self.beta]

        # keep track of model input
        self.n_in = n_in
        self.input = input
Beispiel #38
0
def bn_f(inputs, gamma, beta, mean, std):
    return bn.batch_normalization(inputs, gamma, beta, mean, std, mode=mode)
    def __init__(self, input, n_in, n_out, stochastic=False, binary=True):
        """ Initialize the parameters of the Support Vector Machine layer

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie
        
        :type binary: boolean
        :param binary: indicates whether to implement Binary Connect binarization for weights
        
        :type stochastic: boolean
        :param stochastic: indicates whether to implement a stochastic or deterministic Binary Connect layer
        """
        
        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        self.high = numpy.float32(numpy.sqrt(6. / (n_in + n_out)))
        

        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                dtype=theano.config.floatX),
                                name='W', borrow=True)

        # initialize the biases b as a vector of n_out 0s
        self.b = theano.shared(value=numpy.zeros((n_out,),
                                dtype=theano.config.floatX),
                                name='b', borrow=True)

        self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma')
        self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta')
        
        self.W0 = numpy.float32(self.high/2)
        
        # binarize weights either deterministically or stochastically, if indicated
        def hard_sigma(w):
            p=T.clip((w+1)/2,0,1)
            return p

        if stochastic:
            p = hard_sigma(self.W/self.W0)
            p_mask = T.cast(numpy.random.binomial(n=1, p=p.eval(), size=(n_in, n_out)), theano.config.floatX)
            Wb = T.switch(p_mask,self.W0,-self.W0).eval()
        else:        
            Wb = T.switch(T.ge(self.W.get_value(),0),self.W0,-self.W0).eval()
       
        if binary:
            Wb = theano.shared(Wb, name='Wb', borrow=True)
            self.Wb = Wb
        else:
            self.Wb=self.W

        # linear pre-activation using the (possibly binarized) weights
        self.linear = T.dot(input, self.Wb) + self.b
        bn_output = batch_normalization(inputs = self.linear,
                    gamma = self.gamma, beta = self.beta, mean = self.linear.mean((0,), keepdims=True),
                    std = T.ones_like(self.linear.var((0,), keepdims = True)), mode='high_mem')

        self.linear_output = bn_output
        self.y_pred = T.argmax(bn_output, axis=1)

        if binary:
            self.params = [self.W, self.Wb, self.gamma, self.beta, self.b]

        elif binary==False:
            self.params = [self.Wb, self.gamma, self.beta, self.b]

        self.len_params = len(self.params)
        self.n_in=n_in
        # keep track of model input
        self.input = input
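The stochastic branches in these examples differ in when the random mask is drawn: some evaluate the probabilities once with .eval() and sample with NumPy at construction time, while others use MRG_RandomStreams.binomial, which keeps the sampling symbolic so a fresh mask is drawn on every call of the compiled function. A sketch of that difference (variable names here are illustrative):

import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

W = theano.shared(np.random.RandomState(0)
                  .uniform(-1.0, 1.0, (4, 3)).astype(theano.config.floatX))
p = T.clip((W + 1.0) / 2.0, 0.0, 1.0)       # hard sigmoid of the real weights

# Frozen at build time: the mask is a plain NumPy array and never changes.
fixed_mask = np.random.binomial(n=1, p=p.eval()).astype(theano.config.floatX)

# Symbolic: each call of the compiled function draws a new mask.
srng = MRG_RandomStreams(420)
sym_mask = srng.binomial(n=1, p=p, size=(4, 3), dtype=theano.config.floatX)
sample_mask = theano.function([], sym_mask)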
    def __init__(self, rng, input, n_in, n_out, stochastic=False, binary=True, W=None, b=None,
                 activation=T.nnet.relu):
        """
        Typical hidden layer of a MLP: units are fully-connected and have
        sigmoidal activation function. Weight matrix W is of shape (n_in,n_out)
        and the bias vector b is of shape (n_out,).
        NOTE : The nonlinearity used here is ReLU

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units
        
        :type binary: boolean
        :param binary: indicates whether to implement Binary Connect binarization for weights
        
        :type stochastic: boolean
        :param stochastic: indicates whether to implement a stochastic or deterministic Binary Connect layer
        
        :type activation: theano.Op or function
        :param activation: Non linearity to be applied in the hidden
                           layer
        """
        self.input = input

        # `W` is initialized with `W_values`, which is uniformly sampled
        # from -sqrt(6./(n_in+n_hidden)) to sqrt(6./(n_in+n_hidden))
        # for the tanh activation function.
        # The output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU.
        # Note : optimal initialization of weights is dependent on the
        #        activation function used (among other things).
        #        For example, results presented in [Xavier10] suggest that you
        #        should use 4 times larger initial weights for sigmoid
        #        compared to tanh.
        #        We have no info for other functions, so we use the same as
        #        tanh.
        if W is None:
            W_values = numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
           
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)

       
        self.high = numpy.float32(numpy.sqrt(6. / (n_in + n_out)))
        self.W0 = numpy.float32(self.high/2)

        
        # binarize weights either deterministically or stochastically, if indicated
        def hard_sigma(w):
            p=T.clip((w+1)/2,0,1)
            return p
        
        if stochastic:
            p = hard_sigma(W/self.W0)
            p_mask = T.cast(numpy.random.binomial(n=1, p=p.eval(), size=(n_in, n_out)), theano.config.floatX)
            Wb = T.switch(p_mask,self.W0,-self.W0).eval()
        else:        
            Wb = T.switch(T.ge(W.get_value(),0),self.W0,-self.W0).eval()
       
        if binary:
            Wb = theano.shared(Wb, name='Wb', borrow=True)
            self.Wb = Wb
        else:
            self.Wb=W
            
        self.W = W
        self.b = b
        self.n_in=n_in
         
        self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma')
        self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta')

        lin_output = T.dot(input, self.Wb) + self.b
        
        # batch normalization at output
        bn_output = batch_normalization(inputs = lin_output,
            gamma = self.gamma, beta = self.beta, mean = lin_output.mean((0,), keepdims=True),
            std = lin_output.std((0,), keepdims = True),
                        mode='low_mem')
        
        self.output = (
            bn_output if activation is None
            else activation(bn_output)
        )
            
        # parameters of the model
        if binary:
            self.params = [self.W, self.Wb, self.gamma, self.beta, self.b]

        elif binary==False:
            self.params = [self.Wb, self.gamma, self.beta, self.b]
    def __init__(self, input, n_in, n_out, stochastic=False, binary=True):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie
        
        :type binary: boolean
        :param binary: indicates whether to implement Binary Connect binarization for weights
        
        :type stochastic: boolean
        :param stochastic: indicates whether to implement a stochastic or deterministic Binary Connect layer
        """
        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        self.W = theano.shared(
            value=numpy.zeros(
                (n_in, n_out),
                dtype=theano.config.floatX
            ),
            name='W',
            borrow=True
        )
        # initialize the biases b as a vector of n_out 0s
        self.b = theano.shared(
            value=numpy.zeros(
                (n_out,),
                dtype=theano.config.floatX
            ),
            name='b',
            borrow=True
        )
        
        self.gamma = theano.shared(value = numpy.ones((n_out,), dtype=theano.config.floatX), name='gamma')
        self.beta = theano.shared(value = numpy.zeros((n_out,), dtype=theano.config.floatX), name='beta')
        
        self.high = numpy.float32(numpy.sqrt(6. / (n_in + n_out)))
        self.W0 = numpy.float32(self.high/2)
        
        # binarize weights either deterministically or stochastically, if indicated
        def hard_sigma(w):
            p=T.clip((w+1)/2,0,1)
            return p
        
        if stochastic:
            p = hard_sigma(self.W/self.W0)
            p_mask = T.cast(numpy.random.binomial(n=1, p=p.eval(), size=(n_in, n_out)), theano.config.floatX)
            Wb = T.switch(p_mask,self.W0,-self.W0).eval()
        else:        
            Wb = T.switch(T.ge(self.W.get_value(),0),self.W0,-self.W0).eval()
       
        if binary:
            Wb = theano.shared(Wb, name='Wb', borrow=True)
            self.Wb = Wb
        else:
            self.Wb = self.W
        
        # symbolic expression for computing the matrix of class-membership
        # probabilities
        # Where:
        # W is a matrix where column-k represent the separation hyperplane for
        # class-k
        # x is a matrix where row-j  represents input training sample-j
        # b is a vector where element-k represent the free parameter of
        # hyperplane-k
        
        self.linear=T.dot(input, self.Wb) + self.b
        
        # batch normalize at the output
        bn_output = batch_normalization(inputs = self.linear,
                    gamma = self.gamma, beta = self.beta, mean = self.linear.mean((0,), keepdims=True),
                    std = T.ones_like(self.linear.var((0,), keepdims = True)), mode='high_mem')
        
        self.p_y_given_x = T.nnet.softmax(bn_output)
                          
                          
        # symbolic description of how to compute prediction as class whose
        # probability is maximal
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # parameters of the model
        if binary:
            self.params = [self.W, self.Wb, self.gamma, self.beta, self.b]

        elif not binary:
            self.params = [self.Wb, self.gamma, self.beta, self.b]

        self.len_params = len(self.params)
        
        self.n_in=n_in
        
        # keep track of model input
        self.input = input
    def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2),
        pool_ignore_border=True, stochastic=False, binary=True):
        """
        Allocate a LeNetConvPoolLayer with shared variable internal parameters.

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dtensor4
        :param input: symbolic image tensor, of shape image_shape

        :type filter_shape: tuple or list of length 4
        :param filter_shape: (number of filters, num input feature maps,
                              filter height, filter width)

        :type image_shape: tuple or list of length 4
        :param image_shape: (batch size, num input feature maps,
                             image height, image width)

        :type poolsize: tuple or list of length 2
        :param poolsize: the downsampling (pooling) factor (#rows, #cols)
        
        :type binary: boolean
        :param binary: indicates whether to implement Binary Connect binarization for weights
        
        :type stochastic: boolean
        :param stochastic: indicate whether to implement a stochastic or deterministic Binary Connect layer
        """

        assert image_shape[1] == filter_shape[1]
        self.input = input

        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = numpy.prod(filter_shape[1:])
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" /
        #   pooling size
        fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) //
                   numpy.prod(poolsize))
        # initialize weights with random weights
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        self.W = theano.shared(
            numpy.asarray(
                rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
                dtype=theano.config.floatX
            ),
            borrow=True
        )

        # the bias is a 1D tensor -- one bias per output feature map
        b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, borrow=True)
 
        self.high = numpy.float32(numpy.sqrt(6. / (fan_in + fan_out)))
        self.W0 = numpy.float32(self.high/2)
        
        
        # binarize weights either deterministically or stochastically, if indicated
        def hard_sigma(w):
                p=T.clip((w+1)/2,0,1)
                return p
            
        if stochastic:
            p = hard_sigma(self.W/self.W0)
            p_mask = T.cast(numpy.random.binomial(n=1, p=p.eval(), size=filter_shape), theano.config.floatX)
            Wb = T.switch(p_mask,self.W0,-self.W0).eval()
        else:        
            Wb = T.switch(T.ge(self.W.get_value(),0),self.W0,-self.W0).eval()
       
        if binary:
            Wb = theano.shared(Wb, name='Wb', borrow=True)
            self.Wb = Wb
        else:
            self.Wb=self.W

        self.gamma = theano.shared(value = numpy.ones((image_shape[0], filter_shape[0], (image_shape[3]-2)/poolsize[0], 
                                                       (image_shape[3]-2)/poolsize[0]), dtype=theano.config.floatX), name='gamma')
        self.beta = theano.shared(value = numpy.zeros((image_shape[0], filter_shape[0], (image_shape[3]-2)/poolsize[0], 
                                                       (image_shape[3]-2)/poolsize[0]), dtype=theano.config.floatX), name='beta')
       
        # convolve input feature maps with filters
        conv_out = conv2d(
            input=input,
            filters=self.Wb,
            filter_shape=filter_shape,
            image_shape=image_shape
        )
        
        # downsample each feature map individually, using maxpooling
        pooled_out = downsample.max_pool_2d(
            input=conv_out,
            ds=poolsize,
            ignore_border=pool_ignore_border
        )
        
        # implement batch normalization at the output; the std argument expects
        # a standard deviation, so use .std rather than .var here
        bn_output = batch_normalization(inputs = pooled_out,
                   gamma = self.gamma, beta = self.beta, mean = pooled_out.mean((0,2,3), keepdims=True),
                   std = pooled_out.std((0,2,3), keepdims = True), mode='high_mem')

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.output = T.nnet.relu(bn_output + self.b.dimshuffle('x', 0, 'x', 'x'))

        # store parameters of this layer
        if binary:
            self.params = [self.W, self.Wb, self.gamma, self.beta, self.b]
        elif not binary:
            self.params = [self.Wb, self.gamma, self.beta, self.b]

        self.len_params = len(self.params)
        # keep track of model input
        self.input = input
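For the convolutional layer above, the batch statistics are reduced over the batch and spatial axes (0, 2, 3) with keepdims so they broadcast back over a (batch, channels, height, width) tensor. A NumPy sketch of that reduction (shapes are illustrative):

import numpy as np

x = np.random.RandomState(0).randn(2, 4, 6, 6).astype("float32")  # (b, c, h, w)

mean = x.mean(axis=(0, 2, 3), keepdims=True)   # shape (1, 4, 1, 1): one value per channel
std = x.std(axis=(0, 2, 3), keepdims=True)
normalized = (x - mean) / std                  # broadcasts over batch, height, width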
    def __init__(
            self,
            input,
            image_shape,
            filter_shape,
            convstride,
            padsize,
            group,
            poolsize,
            poolstride,
            bias_init,
            lrn=False,
            lib_conv='cudnn',
            poolpadsize=(0, 0),
            caffe_style=False,
            Bn=False,
    ):
        '''
        lib_conv can be cudnn (recommended) or cudaconvnet
        '''

        self.filter_size = filter_shape
        self.convstride = convstride
        self.padsize = padsize
        self.poolsize = poolsize
        self.poolstride = poolstride
        self.channel = image_shape[0]
        self.lrn = lrn
        self.lib_conv = lib_conv
        # assert input.shape==image_shape
        assert group in [1, 2]

        self.filter_shape = np.asarray(filter_shape)
        self.image_shape = np.asarray(image_shape)

        if self.lrn:
            self.lrn_func = CrossChannelNormalization(alpha=0.0005, k=1)
            # self.lrn_func = CrossChannelNormalization(alpha=0.0005)

        if group == 1:
            self.W = Weight(self.filter_shape)
            self.b = Weight(self.filter_shape[3], bias_init, std=0)
        else:
            self.filter_shape[0] = self.filter_shape[0] / 2
            self.filter_shape[3] = self.filter_shape[3] / 2
            self.image_shape[0] = self.image_shape[0] / 2
            self.image_shape[3] = self.image_shape[3] / 2
            self.W0 = Weight(self.filter_shape)
            self.W1 = Weight(self.filter_shape)
            self.b0 = Weight(self.filter_shape[3], bias_init, std=0)
            self.b1 = Weight(self.filter_shape[3], bias_init, std=0)

        if lib_conv == 'cudaconvnet':
            self.conv_op = FilterActs(pad=self.padsize,
                                      stride=self.convstride,
                                      partial_sum=1)
            # Conv
            if group == 1:
                contiguous_input = gpu_contiguous(input)
                contiguous_filters = gpu_contiguous(self.W.val)
                conv_out = self.conv_op(contiguous_input, contiguous_filters)
                conv_out = conv_out + self.b.val.dimshuffle(0, 'x', 'x', 'x')
            else:
                contiguous_input0 = gpu_contiguous(input[:self.channel /
                                                         2, :, :, :])
                contiguous_filters0 = gpu_contiguous(self.W0.val)
                conv_out0 = self.conv_op(contiguous_input0,
                                         contiguous_filters0)
                conv_out0 = conv_out0 + \
                    self.b0.val.dimshuffle(0, 'x', 'x', 'x')

                contiguous_input1 = gpu_contiguous(input[self.channel /
                                                         2:, :, :, :])
                contiguous_filters1 = gpu_contiguous(self.W1.val)
                conv_out1 = self.conv_op(contiguous_input1,
                                         contiguous_filters1)
                conv_out1 = conv_out1 + \
                    self.b1.val.dimshuffle(0, 'x', 'x', 'x')
                conv_out = T.concatenate([conv_out0, conv_out1], axis=0)
            # ReLu
            self.output = T.maximum(conv_out, 0)
            # Pooling
            if self.poolsize != 1:
                self.pool_op = MaxPool(ds=poolsize, stride=poolstride)
                self.output = self.pool_op(self.output)

        elif lib_conv == 'cudnn':

            input_shuffled = input.dimshuffle(3, 0, 1, 2)  # c01b to bc01
            # in01out to outin01
            if group == 1:
                W_shuffled = self.W.val.dimshuffle(3, 0, 1, 2)  # c01b to bc01
                conv_out = dnn.dnn_conv(
                    img=input_shuffled,
                    kerns=W_shuffled,
                    subsample=(convstride, convstride),
                    border_mode=padsize,
                )
                conv_out = conv_out + self.b.val.dimshuffle('x', 0, 'x', 'x')
            else:
                W0_shuffled = \
                    self.W0.val.dimshuffle(3, 0, 1, 2)  # c01b to bc01
                conv_out0 = \
                    dnn.dnn_conv(img=input_shuffled[:, :self.channel / 2,
                                     :, :],
                                 kerns=W0_shuffled,
                                 subsample=(convstride, convstride),
                                 border_mode=padsize,
                                 )
                conv_out0 = conv_out0 + \
                            self.b0.val.dimshuffle('x', 0, 'x', 'x')
                W1_shuffled = \
                    self.W1.val.dimshuffle(3, 0, 1, 2)  # c01b to bc01
                conv_out1 = \
                    dnn.dnn_conv(img=input_shuffled[:, self.channel / 2:,
                                     :, :],
                                 kerns=W1_shuffled,
                                 subsample=(convstride, convstride),
                                 border_mode=padsize,
                                 )
                conv_out1 = conv_out1 + \
                            self.b1.val.dimshuffle('x', 0, 'x', 'x')
                conv_out = T.concatenate([conv_out0, conv_out1], axis=1)

            self.conv_out = conv_out
            if Bn:
                # Warning: this is only used for the testing phase
                self.mean = theano.shared(
                    value=np.zeros((1, filter_shape[3], 1, 1),
                                   dtype=theano.config.floatX),
                    broadcastable=[True, False, True, True],
                    name='mean',
                    borrow=True)
                self.var = theano.shared(
                    value=np.ones((1, filter_shape[3], 1, 1),
                                  dtype=theano.config.floatX),
                    broadcastable=[True, False, True, True],
                    name='var',
                    borrow=True)

                self.gamma = theano.shared(value=np.ones(
                    (filter_shape[3], ), dtype=theano.config.floatX),
                                           name='gamma',
                                           borrow=True)
                self.beta = theano.shared(value=np.zeros(
                    (filter_shape[3], ), dtype=theano.config.floatX),
                                          name='beta',
                                          borrow=True)
                conv_out = batch_normalization(inputs=conv_out,
                                               gamma=self.gamma,
                                               beta=self.beta,
                                               mean=self.mean,
                                               std=T.sqrt(self.var),
                                               mode='high_mem')
                # ReLu
                self.Bn = conv_out
            self.output = T.maximum(conv_out, 0)
            # # Pooling
            if caffe_style:
                self.output = self.output[:, :, ::-1, ::-1]
            if self.poolsize != 1:
                self.output = dnn.dnn_pool(self.output,
                                           ws=(poolsize, poolsize),
                                           stride=(poolstride, poolstride),
                                           pad=poolpadsize)
            if caffe_style:
                self.output = self.output[:, :, ::-1, ::-1]

            self.output = self.output.dimshuffle(1, 2, 3, 0)  # bc01 to c01b
        else:
            raise NotImplementedError("lib_conv can only be cudaconvnet or cudnn")

        if group == 1:
            if Bn:
                #self.params = [self.W.val, self.b.val,self.beta,self.gamma,self.mean,self.var]
                self.params = [self.W.val, self.b.val]
                self.weight_type = ['W', 'b']
                #self.weight_type = ['W', 'b','b','b','b','b']
                pass
            else:
                self.params = [self.W.val, self.b.val]
                self.weight_type = ['W', 'b']
        else:
            self.params = [self.W0.val, self.b0.val, self.W1.val, self.b1.val]
            self.weight_type = ['W', 'b', 'W', 'b']

        print "conv ({}) layer with shape_in: {}".format(
            lib_conv, str(image_shape))
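The cudnn branch above shuffles between cuda-convnet's c01b layout (channels, height, width, batch) and cuDNN's bc01 layout (batch, channels, height, width). The same axis permutations in NumPy, for reference (shapes are illustrative):

import numpy as np

x_c01b = np.zeros((3, 8, 8, 16), dtype="float32")   # (c, 0, 1, b)

x_bc01 = x_c01b.transpose(3, 0, 1, 2)               # like dimshuffle(3, 0, 1, 2)
assert x_bc01.shape == (16, 3, 8, 8)                # (b, c, 0, 1)

x_back = x_bc01.transpose(1, 2, 3, 0)               # like dimshuffle(1, 2, 3, 0)
assert x_back.shape == x_c01b.shape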
Beispiel #44
0
    def __init__(self, rng, input, filter_shape, image_shape):
        """
        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dtensor4
        :param input: symbolic image tensor, of shape image_shape

        :type filter_shape: tuple or list of length 4
        :param filter_shape: (number of filters, num input feature maps,
                              filter height, filter width)

        :type image_shape: tuple or list of length 4
        :param image_shape: (batch size, num input feature maps,
                             image height, image width)
        """

        assert image_shape[1] == filter_shape[1]
        self.input = input

        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = numpy.prod(filter_shape[1:])
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" /
        #   pooling size
        fan_out = filter_shape[0] * numpy.prod(filter_shape[2:])
        # initialize weights with random weights
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        self.W = theano.shared(numpy.asarray(rng.uniform(low=-W_bound,
                                                         high=W_bound,
                                                         size=filter_shape),
                                             dtype=theano.config.floatX),
                               borrow=True)

        # the bias is a 1D tensor -- one bias per output feature map
        b_values = numpy.zeros((filter_shape[0], ), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, borrow=True)

        # convolve input feature maps with filters
        conv_out = conv.conv2d(input=input,
                               filters=self.W,
                               filter_shape=filter_shape,
                               image_shape=image_shape)

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        #alpha_value = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) + 0.25
        #self.alpha = theano.shared(value=alpha_value, borrow=True)
        linearOutput = conv_out + self.b.dimshuffle('x', 0, 'x', 'x')
        bnOutput = bn.batch_normalization(inputs=linearOutput,
                                          gamma=1.,
                                          beta=0,
                                          mean=T.mean(linearOutput),
                                          std=T.std(linearOutput))
        self.output = ReLu(bnOutput)

        # store parameters of this layer
        self.params = [self.W, self.b]
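Unlike the per-feature statistics used elsewhere, the layer above normalizes with a single scalar mean and standard deviation computed over the whole pre-activation tensor, with constant gamma=1 and beta=0. In NumPy terms (an illustration with arbitrary shapes):

import numpy as np

pre = np.random.RandomState(0).randn(2, 4, 6, 6).astype("float32")
out = (pre - pre.mean()) / pre.std()   # scalar statistics over the entire tensor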
Beispiel #45
0
    def __init__(self,
                 rng,
                 input,
                 n_in,
                 n_out,
                 W=None,
                 b=None,
                 activation=T.tanh,
                 reluSlope=0.0):
        """
        Typical hidden layer of a MLP: units are fully-connected and have
        sigmoidal activation function. Weight matrix W is of shape (n_in,n_out)
        and the bias vector b is of shape (n_out,).

        NOTE : The nonlinearity used here is tanh

        Hidden unit activation is given by: tanh(dot(input,W) + b)

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dmatrix
        :param input: a symbolic tensor of shape (n_examples, n_in)

        :type n_in: int
        :param n_in: dimensionality of input

        :type n_out: int
        :param n_out: number of hidden units

        :type activation: theano.Op or function
        :param activation: Non linearity to be applied in the hidden
                           layer
        """
        self.input = input
        # end-snippet-1

        # `W` is initialized with `W_values`, which is uniformly sampled
        # from -sqrt(6./(n_in+n_hidden)) to sqrt(6./(n_in+n_hidden))
        # for the tanh activation function.
        # The output of uniform is converted using asarray to dtype
        # theano.config.floatX so that the code is runnable on GPU.
        # Note : optimal initialization of weights is dependent on the
        #        activation function used (among other things).
        #        For example, results presented in [Xavier10] suggest that you
        #        should use 4 times larger initial weights for sigmoid
        #        compared to tanh.
        #        We have no info for other functions, so we use the same as
        #        tanh.
        if W is None:
            W_values = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)),
                                     dtype=theano.config.floatX)
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)

        if b is None:
            b_values = numpy.zeros((n_out, ), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)

        self.W = W
        self.b = b
        self.gamma = theano.shared(value=numpy.ones(
            (n_out, ), dtype=theano.config.floatX),
                                   name='gamma')
        self.beta = theano.shared(value=numpy.zeros(
            (n_out, ), dtype=theano.config.floatX),
                                  name='beta')
        lin_output = T.dot(input, self.W) + self.b
        '''
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )
        '''
        # parameters of the model
        self.params = [self.W, self.b]
        self.lin_output = lin_output
        bn_output = batch_normalization(
            inputs=self.lin_output,
            gamma=self.gamma,
            beta=self.beta,
            mean=self.lin_output.mean((0, ), keepdims=True),
            std=T.ones_like(self.lin_output.var((0, ), keepdims=True)),
            mode='high_mem')

        if activation is None:
            self.output = lin_output
            #self.output = bn_output
        elif activation is T.nnet.relu:
            self.output = T.nnet.relu(lin_output, reluSlope)
            #self.output = T.nnet.relu(bn_output, reluSlope)
        else:
            self.output = activation(lin_output)
            #self.output = activation(bn_output)
        self.bn_output = bn_output
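T.nnet.relu(lin_output, reluSlope) above is a leaky ReLU: the second argument is the slope applied to negative inputs (0.0 gives the ordinary ReLU). A one-line NumPy equivalent for illustration:

import numpy as np

def leaky_relu(x, slope=0.0):
    # slope = 0.0 reproduces the plain ReLU; slope > 0 lets negatives leak through
    return np.where(x > 0, x, slope * x)

x = np.array([-2.0, -0.5, 0.0, 1.5], dtype="float32")
print(leaky_relu(x, slope=0.1))    # [-0.2  -0.05  0.    1.5 ]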