Example #1
    def test_seed_fn(self):
        random = RandomStreams(234)
        fn = function([], random.uniform((2, 2)), updates=random.updates())

        random.seed(utt.fetch_seed())

        fn_val0 = fn()
        fn_val1 = fn()

        rng_seed = np.random.RandomState(utt.fetch_seed()).randint(2**30)
        rng = np.random.RandomState(int(rng_seed))  # int() is for 32bit

        numpy_val0 = rng.uniform(size=(2, 2))
        numpy_val1 = rng.uniform(size=(2, 2))

        assert np.allclose(fn_val0, numpy_val0)
        assert np.allclose(fn_val1, numpy_val1)
    def test_setitem(self):

        random = RandomStreams(234)
        out = random.uniform((2, 2))
        fn = function([], out, updates=random.updates())

        random.seed(888)

        rng = np.random.RandomState(utt.fetch_seed())
        random[out.rng] = np.random.RandomState(utt.fetch_seed())

        fn_val0 = fn()
        fn_val1 = fn()
        numpy_val0 = rng.uniform(size=(2, 2))
        numpy_val1 = rng.uniform(size=(2, 2))
        assert np.allclose(fn_val0, numpy_val0)
        assert np.allclose(fn_val1, numpy_val1)
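
The tests above are excerpted from a test class, so their imports and the utt helper are
not shown (utt is presumably theano.tests.unittest_tools, whose fetch_seed() returns the
suite-wide seed). A minimal standalone sketch of the same seeding pattern, with an
arbitrary seed standing in for utt.fetch_seed(), might look like this:

import numpy as np
from theano import function
from theano.tensor.shared_randomstreams import RandomStreams

random = RandomStreams(234)
fn = function([], random.uniform((2, 2)), updates=random.updates())

seed = 888  # arbitrary master seed, stands in for utt.fetch_seed()
random.seed(seed)

# RandomStreams derives each stream's seed from the master seed via
# RandomState(seed).randint(2**30), so plain NumPy can reproduce the draws
rng = np.random.RandomState(int(np.random.RandomState(seed).randint(2**30)))
assert np.allclose(fn(), rng.uniform(size=(2, 2)))
assert np.allclose(fn(), rng.uniform(size=(2, 2)))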
Example #3
    def test_setitem(self):

        random = RandomStreams(234)
        out = random.uniform((2, 2))
        fn = function([], out, updates=random.updates())

        random.seed(888)

        rng = np.random.RandomState(utt.fetch_seed())
        random[out.rng] = np.random.RandomState(utt.fetch_seed())

        fn_val0 = fn()
        fn_val1 = fn()
        numpy_val0 = rng.uniform(size=(2, 2))
        numpy_val1 = rng.uniform(size=(2, 2))
        assert np.allclose(fn_val0, numpy_val0)
        assert np.allclose(fn_val1, numpy_val1)
    def test_seed_fn(self):
        random = RandomStreams(234)
        fn = function([], random.uniform((2, 2)), updates=random.updates())

        random.seed(utt.fetch_seed())

        fn_val0 = fn()
        fn_val1 = fn()

        rng_seed = np.random.RandomState(utt.fetch_seed()).randint(2**30)
        rng = np.random.RandomState(int(rng_seed))  # int() is for 32bit

        # print fn_val0
        numpy_val0 = rng.uniform(size=(2, 2))
        numpy_val1 = rng.uniform(size=(2, 2))
        # print numpy_val0

        assert np.allclose(fn_val0, numpy_val0)
        assert np.allclose(fn_val1, numpy_val1)
Example #5
    def test_examples_9(self):

        from theano.tensor.shared_randomstreams import RandomStreams
        srng = RandomStreams(seed=234)
        rv_u = srng.uniform((2,2))
        rv_n = srng.normal((2,2))
        f = function([], rv_u)
        g = function([], rv_n, no_default_updates=True)    #Not updating rv_n.rng
        nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)


        f_val0 = f()
        f_val1 = f()  #different numbers from f_val0
        assert numpy.all(f_val0 != f_val1)

        g_val0 = g()  # different numbers from f_val0 and f_val1
        g_val1 = g()  # same numbers as g_val0 !!!

        assert numpy.all(g_val0 == g_val1)
        assert numpy.all(g_val0 != f_val0)
        assert numpy.all(g_val0 != f_val1)

        nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)
        assert numpy.allclose(nearly_zeros(), [[0.,0.],[0.,0.]])

        rng_val = rv_u.rng.get_value(borrow=True)   # Get the rng for rv_u
        rng_val.seed(89234)                         # seeds the generator
        rv_u.rng.set_value(rng_val, borrow=True)    # Assign back seeded rng

        srng.seed(902340)  # seeds rv_u and rv_n with different seeds each
        state_after_v0 = rv_u.rng.get_value().get_state()
        nearly_zeros()       # this affects rv_u's generator
        v1 = f()
        rng = rv_u.rng.get_value(borrow=True)
        rng.set_state(state_after_v0)
        rv_u.rng.set_value(rng, borrow=True)
        v2 = f()             # v2 != v1
        v3 = f()             # v3 == v1
        assert numpy.all(v1 != v2)
        assert numpy.all(v1 == v3)
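
Condensed into a standalone script, the state save/restore idiom from the test above
comes down to three calls on the shared generator. A sketch (same imports and seed as
the test; only the uniform stream is kept):

from theano import function
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams(seed=234)
rv_u = srng.uniform((2, 2))
f = function([], rv_u)

state_before = rv_u.rng.get_value().get_state()  # snapshot the generator state
v1 = f()                                         # advances rv_u's generator

rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_before)                      # rewind to the snapshot
rv_u.rng.set_value(rng, borrow=True)
v2 = f()                                         # replays the same draw, so v2 == v1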
Example #6
from theano.tensor.shared_randomstreams import RandomStreams
from theano import function

srng = RandomStreams(seed=234)
rv_u = srng.uniform((2, 2))
rv_n = srng.normal((2, 2))
f = function([], rv_u)
g = function([], rv_n, no_default_updates=True)    # Not updating rv_n.rng
nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)

f_val0 = f()
f_val1 = f()  # different numbers from f_val0
g_val0 = g()
g_val1 = g()

print f_val0
print f_val1
print g_val0
print g_val1
print nearly_zeros()

rng_val = rv_u.rng.get_value(borrow=True)
rng_val.seed(89234)
rv_u.rng.set_value(rng_val, borrow=True)
print f()
print

srng.seed(900890)
print f()
print g()
print

state_after_v0 = rv_u.rng.get_value().get_state()
print nearly_zeros()
print f()

rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_u.rng.set_value(rng, borrow=True)
print f()
print f()
print f()
print
Example #8
class LocalNoiseEBM(object):
    def reset_rng(self):

        self.rng = N.random.RandomState([12., 9., 2.])
        self.theano_rng = RandomStreams(self.rng.randint(2**30))
        if self.initialized:
            self.redo_theano()

    #

    def __getstate__(self):
        d = copy.copy(self.__dict__)

        #remove everything set up by redo_theano

        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        #self.redo_theano()      # todo: make some way of not running this, so it's possible to just open something up and look at its weights fast without recompiling it

    def weights_format(self):
        return ['v', 'h']

    def get_dimensionality(self):
        return 0

    def important_error(self):
        return 2

    def __init__(
            self,
            nvis,
            nhid,
            learning_rate,
            irange,
            init_bias_hid,
            init_noise_var,
            min_misclass,
            max_misclass,
            time_constant,
            noise_var_scale_up,
            noise_var_scale_down,
            max_noise_var,
            different_examples,
            energy_function,
            init_vis_prec,
            learn_vis_prec,
            vis_prec_lr_scale=1e-2,  # 0 won't make it not learn, it will just make the transfer function invalid
            init_delta=0.0,
            clean_contrastive_coeff=0.0,
            use_two_noise_vars=False,
            denoise=False):
        self.denoise = denoise
        self.initialized = False
        self.reset_rng()
        self.nhid = nhid
        self.nvis = nvis
        self.learning_rate = learning_rate
        self.ERROR_RECORD_MODE_MONITORING = 0
        self.error_record_mode = self.ERROR_RECORD_MODE_MONITORING
        self.init_weight_mag = irange
        self.force_batch_size = 0
        self.init_bias_hid = init_bias_hid
        self.noise_var = shared(N.cast[floatX](init_noise_var))
        self.min_misclass = min_misclass
        self.max_misclass = max_misclass
        self.time_constant = time_constant
        self.noise_var_scale_up = noise_var_scale_up
        self.noise_var_scale_down = noise_var_scale_down
        self.max_noise_var = max_noise_var
        self.misclass = -1
        self.different_examples = different_examples
        self.init_vis_prec = init_vis_prec
        self.learn_vis_prec = learn_vis_prec
        self.vis_prec_lr_scale = vis_prec_lr_scale
        self.energy_function = energy_function
        self.init_delta = init_delta
        self.use_two_noise_vars = use_two_noise_vars
        self.clean_contrastive_coeff = clean_contrastive_coeff

        self.names_to_del = []

        self.redo_everything()

    def set_error_record_mode(self, mode):
        self.error_record_mode = mode

    def set_size_from_dataset(self, dataset):
        self.nvis = dataset.get_output_dim()
        self.redo_everything()
        self.vis_mean.set_value(dataset.get_marginals(), borrow=False)

    #

    def get_input_dim(self):
        return self.nvis

    def get_output_dim(self):
        return self.nhid

    def redo_everything(self):
        self.initialized = True

        self.error_record = []
        self.examples_seen = 0
        self.batches_seen = 0

        self.W = shared(N.cast[floatX](self.rng.uniform(
            -self.init_weight_mag, self.init_weight_mag,
            (self.nvis, self.nhid))))
        self.W.name = 'W'

        self.b = shared(N.cast[floatX](N.zeros(self.nhid) +
                                       self.init_bias_hid))
        self.b.name = 'b'

        self.c = shared(N.cast[floatX](N.zeros(self.nvis)))
        self.c.name = 'c'

        self.params = [self.W, self.c, self.b]

        self.vis_prec_driver = shared(
            N.zeros(self.nvis) +
            N.log(N.exp(self.init_vis_prec) - 1.) / self.vis_prec_lr_scale)
        self.vis_prec_driver.name = 'vis_prec_driver'

        assert not N.any(N.isnan(self.vis_prec_driver.get_value()))
        assert not N.any(N.isinf(self.vis_prec_driver.get_value()))

        if self.learn_vis_prec:
            self.params.append(self.vis_prec_driver)
        #

        if self.energy_function == 'mse autoencoder':
            self.delta = shared(self.init_delta + N.zeros(self.nhid))
            self.delta.name = 'delta'
            self.s = shared(N.ones(self.nhid))
            self.s.name = 's'
            self.params.append(self.s)
            if not self.denoise:
                self.params.append(self.delta)
        #

        self.redo_theano()

    #

    def batch_energy(self, V, H):

        if self.energy_function != 'gaussian-binary rbm':
            assert False

        output_scan, updates = scan(
            lambda v, h, beta: 0.5 * T.dot(v, beta * v) - T.dot(
                self.b, h) - T.dot(self.c, v) - T.dot(v, T.dot(self.W, h)),
            sequences=(V, H),
            non_sequences=self.vis_prec)

        return output_scan

    def p_h_given_v(self, V):
        if self.energy_function != 'gaussian-binary rbm':
            assert False

        return T.nnet.sigmoid(self.b + T.dot(V, self.W))

    def free_energy(self, V):
        return self.batch_free_energy(V)

    def batch_free_energy(self, V):

        if self.energy_function == 'gaussian-binary rbm':
            output_scan, updates = scan(
                lambda v, beta: 0.5 * T.dot(v, beta * v) - T.dot(self.c, v)
                - T.sum(T.nnet.softplus(T.dot(v, self.W) + self.b)),
                sequences=V,
                non_sequences=self.vis_prec)
        elif self.energy_function == 'mse autoencoder':

            def fn(v, beta, w):
                h = T.nnet.sigmoid((self.s / w) * T.dot(v, self.W) - self.s +
                                   self.b)
                h.name = 'h'
                r = T.dot(self.W, h) + self.c
                r.name = 'r'

                assert len(h.type().broadcastable) == 1
                assert len(self.delta.type().broadcastable) == 1

                penalty = -T.dot(self.delta, h)

                d = v - r

                scaled_mse = T.dot(d, beta * d)

                rval = scaled_mse + penalty

                assert len(rval.type().broadcastable) == 0

                return rval

            output_scan, updates = scan(
                fn, sequences=V, non_sequences=[self.vis_prec, self.wnorms])

        assert len(output_scan.type().broadcastable) == 1

        return output_scan

    def redo_theano(self):

        if 'denoise' not in dir(self):
            self.denoise = False

        if 'energy_function' not in dir(self):
            self.energy_function = 'gaussian-binary rbm'

        if 'noise_var' not in dir(self):
            self.noise_var = self.beta
            del self.beta

        if 'different_examples' not in dir(self):
            self.different_examples = False

        if 'vis_prec_driver' not in dir(self):
            self.vis_prec_lr_scale = 1.
            self.vis_prec_driver = shared(
                N.zeros(self.nvis) +
                N.log(N.exp(1.0) - 1.) / self.vis_prec_lr_scale)

        pre_existing_names = dir(self)

        self.wnorms = T.sum(T.sqr(self.W), axis=0)

        self.vis_prec = T.nnet.softplus(self.vis_prec_driver *
                                        self.vis_prec_lr_scale)

        self.vis_prec.name = 'vis_prec'

        self.W_T = self.W.T
        self.W_T.name = 'W.T'

        alpha = T.scalar()

        X = T.matrix()
        X.name = 'X'

        if self.use_two_noise_vars:
            switch = self.theano_rng.normal(
                size=[
                    1,
                ], avg=0, std=1, dtype='float32') > 0.0
        else:
            switch = 1.0

        final_noise_var = switch * self.noise_var + (1.0 - switch) * 2.0

        corrupted = self.theano_rng.normal(size=X.shape,
                                           avg=X,
                                           std=T.sqrt(final_noise_var),
                                           dtype=X.dtype)

        corrupted.name = 'prenorm_corrupted'

        old_norm = T.sqr(X).sum(axis=1)
        old_norm.name = 'old_norm'

        new_norm = T.sqr(corrupted).sum(axis=1)
        new_norm.name = 'new_norm'

        norm_ratio = old_norm / (1e-8 + new_norm)
        norm_ratio.name = 'norm_ratio'

        norm_ratio_shuffled = norm_ratio.dimshuffle(0, 'x')
        norm_ratio_shuffled.name = 'norm_ratio_shuffled'

        #corrupted = corrupted * norm_ratio_shuffled
        #corrupted.name = 'postnorm_corrupted'

        print "NOT USING NORM RESCALING"

        self.corruption_func = function([X], corrupted)

        E_c = self.batch_free_energy(corrupted)

        E_c.name = 'E_c'

        if self.different_examples:
            X2 = T.matrix()
            inputs = [X, X2]
        else:
            X2 = X
            inputs = [X]
        #

        E_d = self.batch_free_energy(X2)
        assert len(E_d.type().broadcastable) == 1

        E_d.name = 'E_d'

        noise_contrastive = T.mean(-T.log(T.nnet.sigmoid(E_c - E_d)))

        if self.denoise:
            H = h = T.nnet.sigmoid((self.s / self.wnorms) *
                                   T.dot(corrupted, self.W) - self.s + self.b)
            H.name = 'H'
            R = (T.dot(H, self.W.T) + self.c) / self.vis_prec

            recons_diff = R - X

            #obj = T.mean(T.sqr(recons_diff))

            model_score_diffs = corrupted - R
            noise_dir = corrupted - X

            model_score = self.vis_prec * model_score_diffs
            model_score.name = 'model_score'

            data_score = noise_dir / self.noise_var

            score_diffs = data_score - model_score

            obj = T.mean(T.sqr(score_diffs))

            HX = T.nnet.sigmoid((self.s / self.wnorms) * T.dot(X, self.W) -
                                self.s + self.b)
            RX = T.dot(HX, self.W.T) + self.c

            recons_diff_X = RX - X

            recons_norms = T.sum(T.sqr(recons_diff_X), axis=1)

            recons_dir = recons_diff_X / (
                1e-14 + T.sqrt(recons_norms.dimshuffle((0, 'x'))))

            self.recons_dir_func = function([X], recons_dir)

        elif self.clean_contrastive_coeff > 0:
            assert not self.different_examples

            E_d_0 = self.batch_free_energy(X)

            clean_contrastive = T.mean(-T.log(T.nnet.sigmoid(E_d - E_d_0)))

            obj = noise_contrastive + self.clean_contrastive_coeff * clean_contrastive
        else:
            obj = noise_contrastive

        self.error_func = function(inputs, obj)

        misclass_batch = (E_c < E_d)
        misclass_batch.name = 'misclass_batch'

        misclass = misclass_batch.mean()
        misclass.name = 'misclass'

        #print 'maker'
        #print theano.printing.debugprint(self.error_func.maker.env.outputs[0])
        #print 'obj'
        #print theano.printing.debugprint(obj)

        self.E_d_func = function(inputs, E_d.mean())
        self.E_d_batch_func = function(inputs, E_d)
        self.E_X_batch_func = function([X2], E_d)
        self.E_c_func = function(inputs, E_c.mean())
        self.sqnorm_grad_E_c_func = function(
            inputs, T.sum(T.sqr(T.grad(T.mean(E_c), corrupted))))
        self.sqnorm_grad_E_d_func = function(
            inputs, T.sum(T.sqr(T.grad(T.mean(E_d), X2))))

        self.misclass_func = function(inputs, misclass)

        #self.norm_misclass_func = function([X], ( T.sum(T.sqr(corrupted),axis=1) < T.sum(T.sqr(X),axis=1) ).mean())
        #self.norm_c_func = function([X], T.sum(T.sqr(corrupted),axis=1).mean())
        #self.norm_d_func = function([X], T.sum(T.sqr(X),axis=1).mean())

        grads = [T.grad(obj, param) for param in self.params]

        learn_inputs = [ipt for ipt in inputs]
        learn_inputs.append(alpha)

        self.learn_func = function(learn_inputs,
                                   updates=[
                                       (param, param - alpha * grad)
                                       for (param,
                                            grad) in zip(self.params, grads)
                                   ],
                                   name='learn_func')

        if self.energy_function != 'mse autoencoder':
            self.recons_func = function([X],
                                        self.gibbs_step_exp(X),
                                        name='recons_func')
        #

        post_existing_names = dir(self)

        self.names_to_del = [
            name for name in post_existing_names
            if name not in pre_existing_names
        ]

    def learn(self, dataset, batch_size):
        self.learn_mini_batch([
            dataset.get_batch_design(batch_size)
            for x in xrange(1 + self.different_examples)
        ])

    def recons_func(self, x):
        rval = N.zeros(x.shape)
        for i in xrange(x.shape[0]):
            rval[i, :] = self.gibbs_step_exp(x[i, :])

        return rval

    def print_suite(self, dataset, batch_size, batches, things_to_print):
        self.theano_rng.seed(5)

        tracker = {}

        for thing in things_to_print:
            tracker[thing[0]] = []

        for i in xrange(batches):
            x = dataset.get_batch_design(batch_size)
            assert x.shape == (batch_size, self.nvis)

            if self.different_examples:
                inputs = [x, dataset.get_batch_design(batch_size)]
            else:
                inputs = [x]

            for thing in things_to_print:
                tracker[thing[0]].append(thing[1](*inputs))

        for thing in things_to_print:
            print thing[0] + ': ' + str(N.asarray(tracker[thing[0]]).mean())
        #

    #

    def record_monitoring_error(self, dataset, batch_size, batches):
        assert self.error_record_mode == self.ERROR_RECORD_MODE_MONITORING

        print 'noise variance (before norm rescaling): ' + str(
            self.noise_var.get_value())

        #always use the same seed for monitoring error
        self.theano_rng.seed(5)

        errors = []

        misclasses = []

        for i in xrange(batches):
            x = dataset.get_batch_design(batch_size)
            assert x.shape == (batch_size, self.nvis)

            if self.different_examples:
                inputs = [x, dataset.get_batch_design(batch_size)]
            else:
                inputs = [x]

            error = self.error_func(*inputs)
            errors.append(error)
            misclass = self.misclass_func(*inputs)
            misclasses.append(misclass)
        #

        misclass = N.asarray(misclasses).mean()

        print 'misclassification rate: ' + str(misclass)

        error = N.asarray(errors).mean()

        assert not N.isnan(misclass)
        assert not N.isnan(error)

        self.error_record.append((self.examples_seen, self.batches_seen, error,
                                  self.noise_var.get_value(), misclass))

        print "TODO: restore old theano_rng state instead of jumping to new one"
        self.theano_rng.seed(self.rng.randint(2**30))

    #

    def reconstruct(self, x, use_noise):
        assert x.shape[0] == 1

        print 'x summary: ' + str((x.min(), x.mean(), x.max()))

        #this method is mostly a hack to make the formatting work the same as denoising autoencoder
        self.truth_shared = shared(x.copy())

        if use_noise:
            self.vis_shared = shared(self.corruption_func(x))
        else:
            self.vis_shared = shared(x.copy())

        self.reconstruction = self.recons_func(self.vis_shared.get_value())

        print 'recons summary: ' + str(
            (self.reconstruction.min(), self.reconstruction.mean(),
             self.reconstruction.max()))

    def gibbs_step_exp(self, V):
        base_name = V.name

        if base_name is None:
            base_name = 'anon'

        Q = self.p_h_given_v(V)
        H = self.sample_hid(Q)

        H.name = base_name + '->hid_sample'

        sample = self.c + T.dot(H, self.W_T)

        sample.name = base_name + '->sample_expectation'

        return sample

    def sample_hid(self, Q):
        return self.theano_rng.binomial(size=Q.shape, n=1, p=Q, dtype=Q.dtype)

    def learn_mini_batch(self, inputs):

        for x in inputs:
            assert x.shape[1] == self.nvis

        cur_misclass = self.misclass_func(*inputs)

        if self.misclass == -1:
            self.misclass = cur_misclass
        else:
            self.misclass = self.time_constant * cur_misclass + (
                1. - self.time_constant) * self.misclass

        #print 'current misclassification rate: '+str(self.misclass)

        if self.misclass > self.max_misclass:
            self.noise_var.set_value(
                min(self.max_noise_var,
                    self.noise_var.get_value() * self.noise_var_scale_up))
        elif self.misclass < self.min_misclass:
            self.noise_var.set_value(
                max(1e-8,
                    self.noise_var.get_value() * self.noise_var_scale_down))
        #

        learn_inputs = [ipt for ipt in inputs]
        learn_inputs.append(self.learning_rate)
        self.learn_func(*learn_inputs)

        self.examples_seen += x.shape[0]
        self.batches_seen += 1
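
The core of LocalNoiseEBM.redo_theano above is a noise-contrastive objective: corrupt the
data with Gaussian noise drawn from the Theano RNG, evaluate the free energy of both the
clean and the corrupted batch, and penalize the model when the corrupted batch does not
receive higher energy. A stripped-down sketch of just that piece, with a toy quadratic
free energy standing in for batch_free_energy:

import theano.tensor as T
from theano import function
from theano.tensor.shared_randomstreams import RandomStreams

theano_rng = RandomStreams(234)
X = T.matrix('X')

# Gaussian corruption centered on the data, as in redo_theano
corrupted = theano_rng.normal(size=X.shape, avg=X, std=1.0, dtype=X.dtype)

def toy_free_energy(V):
    # placeholder energy; the model uses batch_free_energy instead
    return 0.5 * T.sum(T.sqr(V), axis=1)

E_c = toy_free_energy(corrupted)
E_d = toy_free_energy(X)

# noise-contrastive loss and misclassification rate, mirroring redo_theano
obj = T.mean(-T.log(T.nnet.sigmoid(E_c - E_d)))
misclass = (E_c < E_d).mean()

error_func = function([X], obj)
misclass_func = function([X], misclass)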
Example #9
from theano import function, scan
from theano.tensor.shared_randomstreams import RandomStreams
import time, numpy as np

srng = RandomStreams(seed=234)
rv_u = srng.uniform((2, 2))
rv_n = srng.normal((2, 2))
f = function([], rv_u)
g = function([], rv_n, no_default_updates=True)
nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)

# seed only the generator used by rv_u
rng_val = rv_u.rng.get_value(borrow=True)
rng_val.seed(89234)
rv_u.rng.set_value(rng_val)

srng.seed(100)

state_after_v0 = rv_u.rng.get_value().get_state()
f()
f()

nearly_zeros()
v1 = f()
rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_u.rng.set_value(rng, borrow=True)

v2 = f()
v3 = f()

# Derivatives
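
The "# Derivatives" comment above marks where the source tutorial moves on to gradients;
the excerpt stops there. For context, the standard Theano gradient pattern (not part of
the excerpt) is:

import theano.tensor as T
from theano import function

x = T.dscalar('x')
y = x ** 2
gy = T.grad(y, x)      # symbolic derivative dy/dx = 2*x
fgrad = function([x], gy)
assert fgrad(4.0) == 8.0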
Example #11
class Network(object):

    ''' Core neural network class that forms the basis for all further implementations (e.g.
        MultilayerNet, Autoencoder, etc). Contains basic functions for propagating data forward
        and backward through the network, as well as for fitting the weights to data '''

    def __init__(self, d=None, k=None, num_hids=None, activs=None, loss_terms=[None], **loss_params):

        # The number of units in the output layer is determined by k, so it is not
        # explicitly specified in num_hids. Still need to check that there is one
        # fewer hidden layer than activation functions
        assert(len(num_hids) + 1 == len(activs))

        self.num_nodes = [d] + num_hids + [k]

        # needed mainly for gradient checking...
        self.num_params = 0
        for i, (n1, n2) in enumerate(zip(self.num_nodes[:-1], self.num_nodes[1:])):
            self.num_params += (n1 + 1) * n2

        self.activs = [None] * len(activs)
        for idx, activ in enumerate(activs):
            if activ == 'sigmoid':
                self.activs[idx] = na.sigmoid
            elif activ == 'tanh':
                self.activs[idx] = na.tanh
            elif activ == 'reLU':
                self.activs[idx] = na.reLU
            elif activ == 'softmax':
                self.activs[idx] = na.softmax
            else:
                sys.exit(ne.activ_err())

        self.loss_terms = loss_terms
        self.loss_params = loss_params
        self.srng = RandomStreams()
        self.srng.seed(np.random.randint(99999))

    def set_weights(self, wts=None, bs=None, init_method=None, scale_factor=None, seed=None):
        ''' Initializes the weights and biases of the neural network

        Parameters:
        -----------
        param: wts - weights
        type: np.ndarray, optional

        param: bs - biases
        type: np.ndarray, optional

        param: init_method - calls some pre-specified weight initialization routines
        type: string

        param: scale_factor - additional hyperparameter for weight initialization
        type: float, optional

        param: seed - seeds the random number generator
        type: int, optional
        '''
        if seed is not None:
            np.random.seed(seed=seed)
            self.srng.seed(seed)

        if wts is None and bs is None:
            wts = (len(self.num_nodes) - 1) * [None]
            bs = (len(self.num_nodes) - 1) * [None]

            if init_method == 'gauss':
                for i, (n1, n2) in enumerate(zip(self.num_nodes[:-1], self.num_nodes[1:])):
                    wts[i] = scale_factor * 1. / \
                        np.sqrt(n2) * np.random.randn(n1, n2)
                    bs[i] = np.zeros(n2)

            elif init_method == 'fan-io':
                for i, (n1, n2) in enumerate(zip(self.num_nodes[:-1], self.num_nodes[1:])):
                    v = scale_factor * np.sqrt(6. / (n1 + n2 + 1))
                    wts[i] = 2.0 * v * np.random.rand(n1, n2) - v
                    bs[i] = np.zeros(n2)
            else:
                sys.exit(ne.weight_error())

        else:
            # this scenario occurs most when doing unsupervised pre-training to initialize
            # the weights
            assert isinstance(wts, list)
            assert isinstance(bs, list)

        self.wts_ = [theano.shared(nu.floatX(wt), borrow=True) for wt in wts]
        self.bs_ = [theano.shared(nu.floatX(b), borrow=True) for b in bs]

    def fit(self, X_tr, y_tr, X_val=None, y_val=None, wts=None, bs=None, plotting=False, **optim_params):
        ''' The primary function which ingests data and fits to the neural network.

        Parameters:
        -----------
        param: X_tr - training data
        type: theano matrix

        param: y_tr - training labels
        type: theano matrix

        param: X_val - validation data
        type: theano matrix

        param: y_val - validation labels
        type: theano matrix

        param: plotting - specifies whether any curves should be generated
        type: boolean

        param: **optim_params
        type: dictionary of optimization parameters

        '''
        # initialize weights...
        if all(node for node in self.num_nodes):
            init_method = optim_params.pop('init_method')
            scale_factor = optim_params.pop('scale_factor')
            try:
                seed = optim_params.pop('seed')
            except KeyError:
                seed = None
            self.set_weights(
                wts=wts, bs=bs, init_method=init_method, scale_factor=scale_factor, seed=seed)

        #...and train
        try:
            optim_type = optim_params.pop('optim_type')
        except KeyError:
            sys.exit(ne.opt_type_err())

        num_epochs = optim_params.pop('num_epochs', None)
        batch_size = optim_params.pop('batch_size', None)

        if optim_type == 'minibatch':
            self.minibatch_optimize(X_tr, y_tr, X_val=X_val, y_val=y_val, batch_size=batch_size, num_epochs=num_epochs,
                                    plotting=plotting, **optim_params)
        elif optim_type == 'fullbatch':
            self.fullbatch_optimize(
                X_tr, y_tr, X_val=X_val, y_val=y_val, num_epochs=num_epochs, **optim_params)
        else:
            sys.exit(ne.opt_type_err())

        return self
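
    # A hypothetical call illustrating how fit consumes **optim_params. The keyword
    # names below are the ones popped in fit/minibatch_optimize; the architecture,
    # loss terms and data variables are made-up placeholders, and any remaining
    # kwargs would be forwarded to the chosen optimizer (here nopt.sgd):
    #
    #   net = Network(d=784, k=10, num_hids=[256], activs=['sigmoid', 'softmax'],
    #                 loss_terms=['cross_entropy'])
    #   net.fit(X_tr, y_tr, init_method='gauss', scale_factor=0.01,
    #           optim_type='minibatch', optim_method='SGD',
    #           num_epochs=100, batch_size=128)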

    def shared_dataset(self, X, y):
        ''' As per the deep learning tutorial, loading the data all at once (if possible)
        into the GPU will significantly speed things up '''

        return theano.shared(nu.floatX(X)), theano.shared(nu.floatX(y))

    def fullbatch_optimize(self, X_tr, y_tr, X_val=None, y_val=None, num_epochs=None, **optim_params):
        ''' Full-batch optimization using scipy's L-BFGS-B and CG

        Parameters:
        -----------
        param: X_tr - training data
        type: theano matrix

        param: y_tr - training labels
        type: theano matrix

        param: num_epochs - the number of full runs through the dataset
        type: int

        param: **optim_params
        type: dictionary of optimization parameters
        '''

        X = T.matrix('X')  # input variable
        y = T.matrix('y')  # output variable
        w = T.vector('w')  # weight vector

        # reshape the vector w into weight and bias matrices, and set up the 
        # theano graph to compute the loss and gradient
        wts, bs = nu.t_reroll(w, self.num_nodes)
        optim_loss = self.compute_optim_loss(X, y, wts=wts, bs=bs)
        params = [p for param in [wts, bs] for p in param]  
        grad_params = [T.grad(optim_loss, param) for param in params]
        grad_w = nu.t_unroll(grad_params[:len(wts)], grad_params[len(wts):])

        compute_loss_grad_from_vector = theano.function(
            inputs=[w, X, y],
            outputs=[optim_loss, grad_w],
            allow_input_downcast=True)

        compute_loss_from_vector = theano.function(
            inputs=[w, X, y],
            outputs=[optim_loss],
            allow_input_downcast=True)

        # initialize the weight vector and perform full-batch optimization
        wts0 = [wt.get_value() for wt in self.wts_]
        bs0 = [b.get_value() for b in self.bs_]
        w0 = nu.unroll(wts0, bs0)

        # print 'Checking gradients...'
        # self.check_gradients(X_tr,y_tr,wts0,bs0)
        # print 'Pre-training loss:',compute_loss_from_vector(w0,X_tr,y_tr)

        try:
            optim_method = optim_params.pop('optim_method')
        except KeyError:
            sys.exit(ne.method_err())

        # very annoying.
        if optim_method == 'L-BFGS-B' and theano.config.floatX == 'float32':
            sys.exit('Sorry, L-BFGS-B only works with float64')

        wf = sp.optimize.minimize(compute_loss_grad_from_vector, w0, args=(X_tr, y_tr), method=optim_method, jac=True,
                                  options={'maxiter': num_epochs})


        # re-roll back into weights and biases
        wts, bs = nu.reroll(wf.x, self.num_nodes)

        self.wts_ = [theano.shared(nu.floatX(wt)) for wt in wts]
        self.bs_ = [theano.shared(nu.floatX(b)) for b in bs]

    def minibatch_optimize(self, X_tr, y_tr, X_val=None, y_val=None, batch_size=None, num_epochs=None, plotting=False, **optim_params):
        ''' Mini-batch optimization using update functions; however, if the batch size = m, then this is basically
        full-batch learning with gradient descent

        Parameters:
        -----------
        param: X_tr - training data
        type: theano matrix

        param: y_tr - training labels
        type: theano matrix

        param: updates - update rule applied to each parameter

        param: batch_size - number of examples per mini-batch
        type: int

        param: num_epochs - the number of full runs through the dataset
        type: int

        param: **optim_params
        type: dictionary of optimization parameters

        '''
        X = T.matrix('X')  # input variable
        y = T.matrix('y')  # output variable
        idx = T.ivector('idx')  # integer index

        optim_loss = self.compute_optim_loss(X, y)
        eval_loss = self.compute_eval_loss(X, y)
        params = [p for param in [self.wts_, self.bs_] for p in param]
        grad_params = [T.grad(optim_loss, param) for param in params]

        try:
            optim_method = optim_params.pop('optim_method')
        except KeyError:
            sys.exit(ne.method_err())

        # define the update rule
        updates = []
        if optim_method == 'SGD':
            updates = nopt.sgd(
                params, grad_params, **optim_params)  # update rule

        elif optim_method == 'ADAGRAD':
            updates = nopt.adagrad(
                params, grad_params, **optim_params)  # update rule

        elif optim_method == 'RMSPROP':
            updates = nopt.rmsprop(params, grad_params, **optim_params)

        else:
            sys.exit(ne.method_err())

        # define the mini-batches
        m = X_tr.shape[0]  # total number of training instances
        # number of batches, based on batch size
        n_batches = int(m / batch_size)
        # batch_size won't divide the data evenly, so get leftover
        leftover = m - n_batches * batch_size

        # load the full dataset into a shared variable - this is especially useful
        # for test
        X_tr, y_tr = self.shared_dataset(X_tr, y_tr)

        # training function for mini-batches
        train = theano.function(
            inputs=[idx],
            updates=updates,
            allow_input_downcast=True,
            mode='FAST_RUN',
            givens={
                X: X_tr[idx],
                y: y_tr[idx]
            })

        compute_train_loss = theano.function(
            inputs=[],
            outputs=eval_loss,
            allow_input_downcast=True,
            mode='FAST_RUN',
            givens={
                X: X_tr,
                y: y_tr
            })

        # if validation data is provided, also compile a function for the validation loss
        compute_val_loss = None

        if X_val is not None and y_val is not None:
            X_val, y_val = self.shared_dataset(X_val, y_val)
            compute_val_loss = theano.function(
                inputs=[],
                outputs=eval_loss,
                allow_input_downcast=True,
                mode='FAST_RUN',
                givens={
                    X: X_val,
                    y: y_val
                })

        # iterate through the training examples
        tr_loss = []
        val_loss = []
        epoch = 0

        while epoch < num_epochs:
            # randomly shuffle the data indices
            tr_idx = np.random.permutation(m)
            # define the start-stop indices
            ss_idx = range(0, m + 1, batch_size)
            ss_idx[-1] += leftover  # add the leftovers to the last batch

            # run through a full epoch
            for idx, (start_idx, stop_idx) in enumerate(zip(ss_idx[:-1], ss_idx[1:])):

                # total number of batches processed up until now
                n_batch_iter = (epoch - 1) * n_batches + idx
                batch_idx = tr_idx[start_idx:stop_idx]  # get the next batch

                train(batch_idx)

            epoch += 1  # update the epoch count
            if epoch % 10 == 0:
                tr_loss.append(compute_train_loss())
                if compute_val_loss is not None:
                    val_loss.append(compute_val_loss())
                    print 'Epoch: %s, Training error: %.15f, Validation error: %.15f' % (epoch, tr_loss[-1], val_loss[-1])
                else:
                    print 'Epoch: %s, Training error: %.15f' % (epoch, tr_loss[-1])

            # training and validation curves - very useful to see how training
            # error evolves
            if plotting:
                num_pts = len(tr_loss)
                pts = [idx * 10 for idx in range(num_pts)]
                plt.plot(pts, tr_loss, label='Training loss')
                # sort of a weak way to check if validation losses have been
                # computed
                if len(val_loss) > 0:
                    plt.plot(pts, val_loss, label='Validation loss')

                plt.xlabel('Epochs')
                plt.ylabel('Loss')
                plt.legend(loc='upper right')
                plt.show()

    def dropout(self, act, p=0.5):
        ''' Randomly drops an activation with probability p 

        Parameters
        ----------
        param: act - activation values, in a matrix
        type: theano matrix

        param: p - probability of dropping out a node
        type: float, optional

        Returns:
        --------
        param: [expr] - activation values randomly zeroed out
        type: theano matrix

        '''
        if p > 0:
            # randomly drop activations with probability p, rescaling the survivors
            retain_prob = 1. - p
            return (1. / retain_prob) * act * self.srng.binomial(act.shape, p=retain_prob, dtype=theano.config.floatX)
        # p <= 0: no dropout applied
        return act
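
    # Worked example of the inverted-dropout scaling above (illustrative numbers):
    # with p = 0.5, retain_prob = 0.5, an activation of 0.8 either becomes 0
    # (dropped) or (1 / 0.5) * 0.8 = 1.6 (kept), so its expected value is
    # 0.5 * 0 + 0.5 * 1.6 = 0.8, the same as without dropout.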

    def train_fprop(self, X, wts=None, bs=None):
        ''' Performs forward propagation for training, which can differ from the vanilla
        fprop used for testing due to extra bells and whistles such as dropout,
        corruption, etc.'''

        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_

        if 'dropout' in self.loss_terms:
            input_p = self.loss_params['input_p']
            hidden_p = self.loss_params['hidden_p']

            # compute the first activation separately in case we have no hidden
            # layer;
            act = self.activs[0](
                T.dot(self.dropout(X, input_p), wts[0]) + bs[0])
            if len(wts) > 1:  # len(wts) = 1 corresponds to softmax regression
                for i, (w, b, activ) in enumerate(zip(wts[1:], bs[1:], self.activs[1:])):
                    act = activ(T.dot(self.dropout(act, hidden_p), w) + b)

            eps = 1e-6
            act = T.switch(act < eps, eps, act)
            act = T.switch(act > (1. - eps), (1. - eps), act)

            return act
        else:
            return self.fprop(X, wts, bs)

    def fprop(self, X, wts=None, bs=None):
        ''' Performs vanilla forward propagation through the network

        Parameters
        ----------
        param: X - training data
        type: theano matrix

        param: wts - weights
        type: theano matrix

        param: bs - biases
        type: theano matrix

        Returns:
        --------
        param: act - final activation values
        type: theano matrix
        '''
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_

        # use the first data matrix to compute the first activation
        act = self.activs[0](T.dot(X, wts[0]) + bs[0])
        
        # len(wts) = 1 corresponds to softmax regression
        if len(wts) > 1:
            for i, (w, b, activ) in enumerate(zip(wts[1:], bs[1:], self.activs[1:])):
                act = activ(T.dot(act, w) + b)

        # for numerical stability
        eps = 1e-6
        act = T.switch(act < eps, eps, act)
        act = T.switch(act > (1. - eps), (1. - eps), act)

        return act

    def check_gradients(self, X_in, Y_in, wts=None, bs=None):
        ''' this seems like overkill, but I suppose it doesn't hurt to have it in here...'''

        # assume that if it's not provided, they will be shared variables - this is
        # probably dangerous, but this is a debugging tool anyway,
        # so...whatever
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_
        else:
            wts = [theano.shared(nu.floatX(w), borrow=True) for w in wts]
            bs = [theano.shared(nu.floatX(b), borrow=True) for b in bs]

        X = T.matrix()  # inputs
        Y = T.matrix()  # labels
        v = T.vector()  # vector of biases and weights
        i = T.lscalar()  # index

        # 1. compile the numerical gradient function
        def compute_numerical_gradient(v, i, X, Y, eps=1e-4):

            # perturb the input
            v_plus = T.inc_subtensor(v[i], eps)
            v_minus = T.inc_subtensor(v[i], -1.0 * eps)

            # roll it back into the weight matrices and bias vectors
            wts_plus, bs_plus = nu.t_reroll(v_plus, self.num_nodes)
            wts_minus, bs_minus = nu.t_reroll(v_minus, self.num_nodes)

            # compute the loss for both sides, and then compute the numerical
            # gradient
            loss_plus = self.compute_optim_loss(X, Y, wts=wts_plus, bs=bs_plus)
            loss_minus = self.compute_optim_loss(X, Y, wts_minus, bs_minus)

            # ( E(weights[i]+eps) - E(weights[i]-eps) )/(2*eps)
            return 1.0 * (loss_plus - loss_minus) / (2 * eps)

        compute_ngrad = theano.function(
            inputs=[v, i, X, Y], outputs=compute_numerical_gradient(v, i, X, Y))

        # 2. compile backprop (theano's autodiff)
        optim_loss = self.compute_optim_loss(X, Y, wts=wts, bs=bs)
        params = [p for param in [wts, bs]
                  for p in param]  # all model parameters in a list
        # gradient of each model param w.r.t training loss
        grad_params = [T.grad(optim_loss, param) for param in params]
        # gradient of the full weight vector
        grad_w = nu.t_unroll(grad_params[:len(wts)], grad_params[len(wts):])

        compute_bgrad = theano.function(inputs=[X, Y], outputs=grad_w)

        # compute the mean difference between the numerical and exact gradients
        v0 = nu.unroll([wt.get_value()
                        for wt in wts], [b.get_value() for b in bs])
        # get the indices of the weights/biases we want to check
        idxs = np.random.permutation(self.num_params)[:(self.num_params // 5)]

        ngrad = [None] * len(idxs)
        for j, idx in enumerate(idxs):
            ngrad[j] = compute_ngrad(v0, idx, X_in, Y_in)
        bgrad = compute_bgrad(X_in, Y_in)[idxs]

        cerr = np.mean(np.abs(ngrad - bgrad))
        assert cerr < 1e-10

    def compute_eval_loss(self, X, y, wts=None, bs=None):
        ''' Given inputs, returns the evaluation loss at the current state of the model

        Parameters:
        -----------
        param: X - training data
        type: theano matrix

        param: y - training labels
        type: theano matrix

        param: wts - weights
        type: theano matrix, optional

        param: bs - biases
        type: theano matrix, optional

        Returns:
        --------
        param: eval_loss - evaluation loss, which doesn't include regularization
        type: theano scalar

        '''
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_

        eval_loss = None  # the loss function we can evaluate during validation
        y_pred = self.fprop(X, wts, bs)

        if 'cross_entropy' in self.loss_terms:
            eval_loss = nl.cross_entropy(y, y_pred)

        elif 'binary_cross_entropy' in self.loss_terms:
            eval_loss = nl.binary_cross_entropy(y, y_pred)

        elif 'squared_error' in self.loss_terms:
            eval_loss = nl.squared_error(y, y_pred)
        else:
            sys.exit('Loss must be one of cross_entropy, binary_cross_entropy, or squared_error')

        return eval_loss

    def compute_optim_loss(self, X, y, wts=None, bs=None):
        ''' Given inputs, returns the training loss at the current state of the model

        Parameters:
        -----------
        param: X - training data
        type: theano matrix

        param: y - training labels
        type: theano matrix

        param: wts - weights
        type: theano matrix, optional

        param: bs - biases
        type: theano matrix, optional

        Returns:
        --------
        param: optim_loss - the optimization loss which must be optimized over
        type: theano scalar
        '''
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_

        y_optim = self.train_fprop(X, wts, bs)
        # the loss function which will specifically be optimized over
        optim_loss = None

        if 'cross_entropy' in self.loss_terms:
            optim_loss = nl.cross_entropy(y, y_optim)

        elif 'binary_cross_entropy' in self.loss_terms:
            optim_loss = nl.binary_cross_entropy(y, y_optim)

        elif 'squared_error' in self.loss_terms:
            optim_loss = nl.squared_error(y, y_optim)

        else:
            sys.exit('Loss must be one of cross_entropy, binary_cross_entropy, or squared_error')

        if 'l1_reg' in self.loss_terms:
            l1_decay = self.loss_params.get('l1_decay')
            optim_loss += nl.l1_reg(wts, l1_decay=l1_decay)

        if 'l2_reg' in self.loss_terms:
            l2_decay = self.loss_params.get('l2_decay')
            optim_loss += nl.l2_reg(wts, l2_decay=l2_decay)

        return optim_loss

    def get_weights_and_biases(self):
        ''' simple function which returns the weights and biases as numpy arrays'''

        wts = [wt.get_value() for wt in self.wts_]
        bs = [b.get_value() for b in self.bs_]

        return wts, bs

    # debugging
    def check_nans(self):
        ''' simple function which returns True if any value is NaN in wts or biases '''

        # poke into the shared variables and get their values
        wts, bs = self.get_weights_and_biases()
        nans = 0
        for wt, b in zip(wts, bs):
            nans += np.sum(wt) + np.sum(b)

        return np.isnan(nans)
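# The dropout method above uses "inverted" dropout: activations are zeroed with
# probability p and the survivors are scaled by 1/(1-p), so the expected activation
# is unchanged and no rescaling is needed at test time. Below is a minimal standalone
# sketch of that trick, assuming only the standard Theano RandomStreams API
# (the names here are illustrative, not part of the class above):
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams(seed=42)
X = T.matrix('X')
p_drop = 0.5
retain_prob = 1. - p_drop

# 0/1 mask with P(1) = retain_prob; dividing by retain_prob keeps E[output] = input
mask = srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
dropped = (1. / retain_prob) * X * mask

apply_dropout = theano.function([X], dropped)
print(apply_dropout(np.ones((3, 4), dtype=theano.config.floatX)))  # ~half zeros, rest 2.0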
Example #12
0
print(f()) # different uniform numbers
print(g()) # different normal numbers
print(g()) # same normal numbers as the prev. call

# NOTE: a single RV is sampled only once in one function call, regardless of how many
# times it appears in the formula (which makes sense, in math it is the same)
nearly_zeros = function([], rv_unif + rv_unif - 2*rv_unif)
print(nearly_zeros()) # returns 0

# Using seeds: you can seed each RV separately or all at once (pretty much to the same effect)
rng_val = rv_unif.rng.get_value(borrow=True)
rng_val.seed(81232)
rv_unif.rng.set_value(rng_val, borrow=True)

# or all at once
srng.seed(123321)

# and to explicitly show that RandomStreams have a shared state:
state_after_v0 = rv_unif.rng.get_value().get_state()
nearly_zeros()
v1 = f()
# Go one step back
rng = rv_unif.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_unif.rng.set_value(rng, borrow=True)
print(v1 == f()) # False
print(v1 == f()) # True

"""
Copying random states from one function to another
"""
Example #13
0
class MaskGenerator(object):

    def __init__(self, input_size, hidden_sizes, l, random_seed=1234):
        self._random_seed = random_seed
        self._mrng = MRG_RandomStreams(seed=random_seed)
        self._rng = RandomStreams(seed=random_seed)

        self._hidden_sizes = hidden_sizes
        self._input_size = input_size
        self._l = l

        self.ordering = theano.shared(value=np.arange(input_size, dtype=theano.config.floatX), name='ordering', borrow=False)

        # Initial layer connectivity
        self.layers_connectivity = [theano.shared(value=(self.ordering + 1).eval(), name='layer_connectivity_input', borrow=False)]
        for i in range(len(self._hidden_sizes)):
            self.layers_connectivity += [theano.shared(value=np.zeros((self._hidden_sizes[i]), dtype=theano.config.floatX), name='layer_connectivity_hidden{0}'.format(i), borrow=False)]
        self.layers_connectivity += [self.ordering]

        ## Theano functions
        new_ordering = self._rng.shuffle_row_elements(self.ordering)
        self.shuffle_ordering = theano.function(name='shuffle_ordering',
                                                inputs=[],
                                                updates=[(self.ordering, new_ordering), (self.layers_connectivity[0], new_ordering + 1)])

        self.layers_connectivity_updates = []
        for i in range(len(self._hidden_sizes)):
            self.layers_connectivity_updates += [self._get_hidden_layer_connectivity(i)]
        # self.layers_connectivity_updates = [self._get_hidden_layer_connectivity(i) for i in range(len(self._hidden_sizes))]  # NOTE: the list-comprehension form did not work here
        self.sample_connectivity = theano.function(name='sample_connectivity',
                                                   inputs=[],
                                                   updates=[(self.layers_connectivity[i+1], self.layers_connectivity_updates[i]) for i in range(len(self._hidden_sizes))])

        # Save random initial state
        self._initial_mrng_rstate = copy.deepcopy(self._mrng.rstate)
        self._initial_mrng_state_updates = [state_update[0].get_value() for state_update in self._mrng.state_updates]

        # Ensuring valid initial connectivity
        self.sample_connectivity()

    def reset(self):
        # Set Original ordering
        self.ordering.set_value(np.arange(self._input_size, dtype=theano.config.floatX))

        # Reset RandomStreams
        self._rng.seed(self._random_seed)

        # Initial layer connectivity
        self.layers_connectivity[0].set_value((self.ordering + 1).eval())
        for i in range(1, len(self.layers_connectivity)-1):
            self.layers_connectivity[i].set_value(np.zeros((self._hidden_sizes[i-1]), dtype=theano.config.floatX))
        self.layers_connectivity[-1].set_value(self.ordering.get_value())

        # Reset MRG_RandomStreams (GPU)
        self._mrng.rstate = self._initial_mrng_rstate
        for state, value in zip(self._mrng.state_updates, self._initial_mrng_state_updates):
            state[0].set_value(value)

        self.sample_connectivity()

    def _get_p(self, start_choice):
        start_choice_idx = (start_choice-1).astype('int32')
        p_vals = T.concatenate([T.zeros((start_choice_idx,)), T.nnet.nnet.softmax(self._l * T.arange(start_choice, self._input_size, dtype=theano.config.floatX))[0]])
        p_vals = T.inc_subtensor(p_vals[start_choice_idx], 1.)  # hack: the multinomial op has no safety margin for numerical imprecision, so bump the first admissible entry
        return p_vals

    def _get_hidden_layer_connectivity(self, layerIdx):
        layer_size = self._hidden_sizes[layerIdx]
        if layerIdx == 0:
            p_vals = self._get_p(T.min(self.layers_connectivity[layerIdx]))
        else:
            p_vals = self._get_p(T.min(self.layers_connectivity_updates[layerIdx-1]))

        # #Implementations of np.choose in theano GPU
        # return T.nonzero(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX))[1].astype(dtype=theano.config.floatX)
        # return T.argmax(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX), axis=1)
        return T.sum(T.cumsum(self._mrng.multinomial(pvals=T.tile(p_vals[::-1][None, :], (layer_size, 1)), dtype=theano.config.floatX), axis=1), axis=1)

    def _get_mask(self, layerIdxIn, layerIdxOut):
        return (self.layers_connectivity[layerIdxIn][:, None] <= self.layers_connectivity[layerIdxOut][None, :]).astype(theano.config.floatX)

    def get_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(layerIdx, layerIdx + 1)

    def get_direct_input_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(0, layerIdx)

    def get_direct_output_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(layerIdx, -1)
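# A small numpy-only sketch (illustrative, not part of the class above) of what the
# comparison in _get_mask computes: unit j of the outgoing layer may see unit i of
# the incoming layer only when connectivity_in[i] <= connectivity_out[j], which is
# the autoregressive masking rule.
import numpy as np

m_in = np.array([3, 1, 2])      # connectivity labels of the incoming layer
m_out = np.array([1, 2, 3, 2])  # connectivity labels of the outgoing layer

mask = (m_in[:, None] <= m_out[None, :]).astype('float32')
print(mask)
# [[0. 0. 1. 0.]
#  [1. 1. 1. 1.]
#  [0. 1. 1. 1.]]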
Example #14
0
# 8. Seed Streams

# Random variables can be seeded individually or collectively.

# You can seed just one random variable by fetching its .rng, seeding it, and
# assigning it back with .rng.set_value().
rng_val = rv_u.rng.get_value(borrow=True)  # Get the rng for rv_u
rng_val.seed(89234)  # seeds the generator
rv_u.rng.set_value(rng_val, borrow=True)  # Assign back seeded rng

# You can also seed all the random variables allocated by a RandomStreams
# object by that object's seed method. This seed will be used to seed a
# temporary random number generator, that will in turn generate seeds for
# each of the random variables.
print('seed')
srng.seed(902340)  # seeds rv_u and rv_n with different seeds each
print(f())
print(f())
print(g())
srng.seed(156456)
print(g())

# 9. Sharing Streams Between Functions

# As usual for shared variables, the random number generators used for
# random variables are common between functions. So our nearly_zeros
# function will update the state of the generators used in function f
# above.

# For example:
state_after_v0 = rv_u.rng.get_value().get_state()
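# A sketch of how that demonstration usually continues (it mirrors Example #15 below;
# nothing here is new API, just the same calls in sequence):
nearly_zeros()                          # advances rv_u's generator even though f wasn't called
v1 = f()
rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)           # roll the generator back to the saved state
rv_u.rng.set_value(rng, borrow=True)
v2 = f()                                # v2 != v1: this is the draw nearly_zeros consumed
v3 = f()                                # v3 == v1: the stream replays from the restored state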
Example #15
0
srng = RandomStreams(seed=234)
rv_u = srng.uniform((2, 2))
rv_n = srng.normal((2, 2))
f = function([], rv_u)
g = function([], rv_n, no_default_updates=True)
nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)

print(f())
print(f())
print(g())
print(g())
print(".....")
print(nearly_zeros())

srng.seed(902340)
print(f())
print(f())
print(g())
print(g())

state_after_v0 = rv_u.rng.get_value().get_state()
print(nearly_zeros())
v1 = f()
rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_u.rng.set_value(rng, borrow=True)
print(rng)
v2 = f()  # v2 != v1
v3 = f()  # v3 == v1
print(v2)
Example #16
0
g = function([], rv_n, no_default_updates=True)
g = function([], rv_n, no_default_updates=True)
nearly_zeros = function([], rv_u + rv_u  - 2 * rv_u)
f_val0 = f()
f_val1 = f()
f_val0
f_val1
g_val0 = g()
g_val1 = g()
g_val0
g_val1
nearly_zeros()
rng_val = rv_u.rng.get_value(borrow=True)
rng_val.seed(89234)
rv_u.rng.set_value(rng_val, borrow=True)
srng.seed(902340)
rv_u
rv_u.get_value()
rv_u[0]
rv_u[0,0]
help(rv_u)
rv_u.all()
help(rv_u)
rv_u.argmax()
state_after_v0 = rv_u.rng.get_value().get_state()
nearly_zeros()
v1 = f()
rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_u.rng.set_value(rng, borrow=True)
v2 = f()
rv_n = srng.normal((2,2))
f = function([], rv_u)
g = function([], rv_n, no_default_updates=True)    #Not updating rv_n.rng
nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)

# Call the random number function - uniformly distributed (f returns rv_u)
print(f())
print(f())

# same values every call - no_default_updates=True, so g's rng state is never updated
print(g())  # different numbers from f's draws, but identical between calls to g
print(g())

#Seeding streams

rng_val = rv_u.rng.get_value(borrow=True)   # Get the rng for rv_u
rng_val.seed(89234)                         # seeds the generator
rv_u.rng.set_value(rng_val, borrow=True)    # Assign back seeded rng

srng.seed(902340)  # seeds rv_u and rv_n with different seeds each

state_after_v0 = rv_u.rng.get_value().get_state()
nearly_zeros()       # this affects rv_u's generator
v1 = f()
rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_u.rng.set_value(rng, borrow=True)
v2 = f()             # v2 != v1
v3 = f()             # v3 == v1

print(v1, v2, v3)
Example #18
0
class Network(object):
    ''' Core neural network class that forms the basis for all further implementations (e.g.
        MultilayerNet, Autoencoder, etc). Contains basic functions for propagating data forward
        and backwards through the network, as well as fitting the weights to data'''
    def __init__(self,
                 d=None,
                 k=None,
                 num_hids=None,
                 activs=None,
                 loss_terms=[None],
                 **loss_params):

        # Number of units in the output layer determined by k, so not explicitly specified in
        # num_hids. still need to check that there's one less hidden layer than number of activation
        # functions
        assert (len(num_hids) + 1 == len(activs))

        self.num_nodes = [d] + num_hids + [k]

        # needed mainly for gradient checking...
        self.num_params = 0
        for i, (n1,
                n2) in enumerate(zip(self.num_nodes[:-1], self.num_nodes[1:])):
            self.num_params += (n1 + 1) * n2

        self.activs = [None] * len(activs)
        for idx, activ in enumerate(activs):
            if activ == 'sigmoid':
                self.activs[idx] = na.sigmoid
            elif activ == 'tanh':
                self.activs[idx] = na.tanh
            elif activ == 'reLU':
                self.activs[idx] = na.reLU
            elif activ == 'softmax':
                self.activs[idx] = na.softmax
            else:
                sys.exit(ne.activ_err())

        self.loss_terms = loss_terms
        self.loss_params = loss_params
        self.srng = RandomStreams()
        self.srng.seed(np.random.randint(99999))

    def set_weights(self,
                    wts=None,
                    bs=None,
                    init_method=None,
                    scale_factor=None,
                    seed=None):
        ''' Initializes the weights and biases of the neural network
        Parameters:
        -----------
        param: wts - weights
        type: np.ndarray, optional
        param: bs - biases
        type: np.ndarray, optional
        param: init_method - calls some pre-specified weight initialization routines
        type: string
        param: scale_factor - additional hyperparameter for weight initialization
        type: float, optional
        param: seed - seeds the random number generator
        type: int, optional
        '''
        if seed is not None:
            np.random.seed(seed=seed)
            self.srng.seed(seed)

        if wts is None and bs is None:
            wts = (len(self.num_nodes) - 1) * [None]
            bs = (len(self.num_nodes) - 1) * [None]

            if init_method == 'gauss':
                for i, (n1, n2) in enumerate(
                        zip(self.num_nodes[:-1], self.num_nodes[1:])):
                    wts[i] = scale_factor * 1. / \
                        np.sqrt(n2) * np.random.randn(n1, n2)
                    bs[i] = np.zeros(n2)

            elif init_method == 'fan-io':
                for i, (n1, n2) in enumerate(
                        zip(self.num_nodes[:-1], self.num_nodes[1:])):
                    v = scale_factor * np.sqrt(6. / (n1 + n2 + 1))
                    wts[i] = 2.0 * v * np.random.rand(n1, n2) - v
                    bs[i] = np.zeros(n2)
            else:
                sys.exit(ne.weight_error())

        else:
            # this scenario occurs most when doing unsupervised pre-training to initialize
            # the weights
            assert isinstance(wts, list)
            assert isinstance(bs, list)

        self.wts_ = [theano.shared(nu.floatX(wt), borrow=True) for wt in wts]
        self.bs_ = [theano.shared(nu.floatX(b), borrow=True) for b in bs]

    def fit(self,
            X_tr,
            y_tr,
            X_val=None,
            y_val=None,
            wts=None,
            bs=None,
            plotting=False,
            **optim_params):
        ''' The primary function which ingests data and fits to the neural network.
        Parameters:
        -----------
        param: X_tr - training data
        type: theano matrix
        param: y_tr - training labels
        type: theano matrix
        param: X_val - validation data
        type: theano matrix
        param: y_val - validation labels
        type: theano matrix
        param: plotting - specifies whether any curves should be generated
        type: boolean
        param: **optim_params
        type: dictionary of optimization parameters
        '''
        # initialize weights...
        if all(node for node in self.num_nodes):
            init_method = optim_params.pop('init_method')
            scale_factor = optim_params.pop('scale_factor')
            try:
                seed = optim_params.pop('seed')
            except KeyError:
                seed = None
            self.set_weights(wts=wts,
                             bs=bs,
                             init_method=init_method,
                             scale_factor=scale_factor,
                             seed=seed)

        #...and train
        try:
            optim_type = optim_params.pop('optim_type')
        except KeyError:
            sys.exit(ne.opt_type_err())

        num_epochs = optim_params.pop('num_epochs', None)
        batch_size = optim_params.pop('batch_size', None)

        if optim_type == 'minibatch':
            self.minibatch_optimize(X_tr,
                                    y_tr,
                                    X_val=X_val,
                                    y_val=y_val,
                                    batch_size=batch_size,
                                    num_epochs=num_epochs,
                                    plotting=plotting,
                                    **optim_params)
        elif optim_type == 'fullbatch':
            self.fullbatch_optimize(X_tr,
                                    y_tr,
                                    X_val=X_val,
                                    y_val=y_val,
                                    num_epochs=num_epochs,
                                    **optim_params)
        else:
            sys.exit(ne.opt_type_err())

        return self

    def shared_dataset(self, X, y):
        ''' As per the deep learning tutorial, loading the data all at once (if possible)
        into the GPU will significantly speed things up '''

        return theano.shared(nu.floatX(X)), theano.shared(nu.floatX(y))

    def fullbatch_optimize(self,
                           X_tr,
                           y_tr,
                           X_val=None,
                           y_val=None,
                           num_epochs=None,
                           **optim_params):
        ''' Full-batch optimization using scipy's L-BFGS-B and CG
        Parameters:
        -----------
        param: X_tr - training data
        type: theano matrix
        param: y_tr - training labels
        type: theano matrix
        param: num_epochs - the number of full runs through the dataset
        type: int
        param: **optim_params
        type: dictionary of optimization parameters
        '''

        X = T.matrix('X')  # input variable
        y = T.matrix('y')  # output variable
        w = T.vector('w')  # weight vector

        # reshape the vector w into weight and bias matrices, and set up the
        # theano graph to compute the loss and gradient
        wts, bs = nu.t_reroll(w, self.num_nodes)
        optim_loss = self.compute_optim_loss(X, y, wts=wts, bs=bs)
        params = [p for param in [wts, bs] for p in param]
        grad_params = [T.grad(optim_loss, param) for param in params]
        grad_w = nu.t_unroll(grad_params[:len(wts)], grad_params[len(wts):])

        compute_loss_grad_from_vector = theano.function(
            inputs=[w, X, y],
            outputs=[optim_loss, grad_w],
            allow_input_downcast=True)

        compute_loss_from_vector = theano.function(inputs=[w, X, y],
                                                   outputs=[optim_loss],
                                                   allow_input_downcast=True)

        # initialize the weight vector and perform full-batch optimization
        wts0 = [wt.get_value() for wt in self.wts_]
        bs0 = [b.get_value() for b in self.bs_]
        w0 = nu.unroll(wts0, bs0)

        # print 'Checking gradients...'
        # self.check_gradients(X_tr,y_tr,wts0,bs0)
        # print 'Pre-training loss:',compute_loss_from_vector(w0,X_tr,y_tr)

        try:
            optim_method = optim_params.pop('optim_method')
        except KeyError:
            sys.exit(ne.method_err())

        # very annoying.
        if optim_method == 'L-BFGS-B' and theano.config.floatX == 'float32':
            sys.exit('Sorry, L-BFGS-B only works with float64')

        wf = sp.optimize.minimize(compute_loss_grad_from_vector,
                                  w0,
                                  args=(X_tr, y_tr),
                                  method=optim_method,
                                  jac=True,
                                  options={'maxiter': num_epochs})

        # re-roll back into weights and biases
        wts, bs = nu.reroll(wf.x, self.num_nodes)

        self.wts_ = [theano.shared(nu.floatX(wt)) for wt in wts]
        self.bs_ = [theano.shared(nu.floatX(b)) for b in bs]

    def minibatch_optimize(self,
                           X_tr,
                           y_tr,
                           X_val=None,
                           y_val=None,
                           batch_size=None,
                           num_epochs=None,
                           plotting=False,
                           **optim_params):
        ''' Mini-batch optimization using update functions; however, if the batch size = m, then this is basically
        full-batch learning with gradient descent
        Parameters:
        -----------
        param: X_tr - training data
        type: theano matrix
        param: y_tr - training labels
        type: theano matrix
        param: batch_size - number of examples per mini-batch
        type: int
        param: num_epochs - the number of full runs through the dataset
        type: int
        param: **optim_params
        type: dictionary of optimization parameters
        '''
        X = T.matrix('X')  # input variable
        y = T.matrix('y')  # output variable
        idx = T.ivector('idx')  # integer index

        optim_loss = self.compute_optim_loss(X, y)
        eval_loss = self.compute_eval_loss(X, y)
        params = [p for param in [self.wts_, self.bs_] for p in param]
        grad_params = [T.grad(optim_loss, param) for param in params]

        try:
            optim_method = optim_params.pop('optim_method')
        except KeyError:
            sys.exit(ne.method_err())

        # define the update rule
        updates = []
        if optim_method == 'SGD':
            updates = nopt.sgd(params, grad_params,
                               **optim_params)  # update rule

        elif optim_method == 'ADAGRAD':
            updates = nopt.adagrad(params, grad_params,
                                   **optim_params)  # update rule

        elif optim_method == 'RMSPROP':
            updates = nopt.rmsprop(params, grad_params, **optim_params)

        else:
            print(ne.method_err())

        # define the mini-batches
        m = X_tr.shape[0]  # total number of training instances
        # number of batches, based on batch size
        n_batches = int(m / batch_size)
        # batch_size won't divide the data evenly, so get leftover
        leftover = m - n_batches * batch_size

        # load the full dataset into a shared variable - as noted in shared_dataset,
        # this speeds things up considerably when the data fits on the GPU
        X_tr, y_tr = self.shared_dataset(X_tr, y_tr)

        # training function for mini-batches
        train = theano.function(inputs=[idx],
                                updates=updates,
                                allow_input_downcast=True,
                                mode='FAST_RUN',
                                givens={
                                    X: X_tr[idx],
                                    y: y_tr[idx]
                                })

        compute_train_loss = theano.function(inputs=[],
                                             outputs=eval_loss,
                                             allow_input_downcast=True,
                                             mode='FAST_RUN',
                                             givens={
                                                 X: X_tr,
                                                 y: y_tr
                                             })

        # if validation data is provided, validation loss
        compute_val_loss = None

        if X_val is not None and y_val is not None:
            X_val, y_val = self.shared_dataset(X_val, y_val)
            compute_val_loss = theano.function(inputs=[],
                                               outputs=eval_loss,
                                               allow_input_downcast=True,
                                               mode='FAST_RUN',
                                               givens={
                                                   X: X_val,
                                                   y: y_val
                                               })

        # iterate through the training examples
        tr_loss = []
        val_loss = []
        epoch = 0

        while epoch < num_epochs:
            # randomly shuffle the data indices
            tr_idx = np.random.permutation(m)
            # define the start-stop indices
            ss_idx = list(range(0, m + 1, batch_size))  # list so the last entry can be adjusted
            ss_idx[-1] += leftover  # add the leftovers to the last batch

            # run through a full epoch
            for idx, (start_idx,
                      stop_idx) in enumerate(zip(ss_idx[:-1], ss_idx[1:])):

                # total number of batches processed up until now
                n_batch_iter = (epoch - 1) * n_batches + idx
                batch_idx = tr_idx[start_idx:stop_idx]  # get the next batch

                train(batch_idx)

            epoch += 1  # update the epoch count
            if epoch % 10 == 0:
                tr_loss.append(compute_train_loss())
                if compute_val_loss is not None:
                    val_loss.append(compute_val_loss())
                    print('Epoch: %s, Training error: %.15f, Validation error: %.15f' %
                          (epoch, tr_loss[-1], val_loss[-1]))
                else:
                    print('Epoch: %s, Training error: %.15f' % (epoch, tr_loss[-1]))

            # training and validation curves - very useful to see how training
            # error evolves
            if plotting:
                num_pts = len(tr_loss)
                pts = [idx * 10 for idx in range(num_pts)]
                plt.plot(pts, tr_loss, label='Training loss')
                # sort of a weak way to check if validation losses have been
                # computed
                if len(val_loss) > 0:
                    plt.plot(pts, val_loss, label='Validation loss')

                plt.xlabel('Epochs')
                plt.ylabel('Loss')
                plt.legend(loc='upper right')
                plt.show()

    def dropout(self, act, p=0.5):
        ''' Randomly drops an activation with probability p
        Parameters
        ----------
        param: act - activation values, in a matrix
        type: theano matrix
        param: p - probability of dropping out a node
        type: float, optional
        Returns:
        --------
        param: [expr] - activation values randomly zeroed out
        type: theano matrix
        '''
        if p > 0:
            # randomly drop activations with probability p ("inverted" dropout:
            # survivors are scaled by 1/retain_prob so the expected value is unchanged)
            retain_prob = 1. - p
            return (1. / retain_prob) * act * self.srng.binomial(
                act.shape, p=retain_prob, dtype=theano.config.floatX)
        # p == 0: no dropout; return the activations unchanged
        return act

    def train_fprop(self, X, wts=None, bs=None):
        ''' Performs forward propagation for training, which can differ from the
        vanilla fprop used for testing due to extra bells and whistles such as
        dropout, corruption, etc.'''

        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_

        if 'dropout' in self.loss_terms:
            input_p = self.loss_params['input_p']
            hidden_p = self.loss_params['hidden_p']

            # compute the first activation separately in case we have no hidden
            # layer;
            act = self.activs[0](T.dot(self.dropout(X, input_p), wts[0]) +
                                 bs[0])
            if len(wts) > 1:  # len(wts) = 1 corresponds to softmax regression
                for i, (w, b, activ) in enumerate(
                        zip(wts[1:], bs[1:], self.activs[1:])):
                    act = activ(T.dot(self.dropout(act, hidden_p), w) + b)

            eps = 1e-6
            act = T.switch(act < eps, eps, act)
            act = T.switch(act > (1. - eps), (1. - eps), act)

            return act
        else:
            return self.fprop(X, wts, bs)

    def fprop(self, X, wts=None, bs=None):
        ''' Performs vanilla forward propagation through the network
        Parameters
        ----------
        param: X - training data
        type: theano matrix
        param: wts - weights
        type: theano matrix
        param: bs - biases
        type: theano matrix
        Returns:
        --------
        param: act - final activation values
        type: theano matrix
        '''
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_

        # use the first data matrix to compute the first activation
        act = self.activs[0](T.dot(X, wts[0]) + bs[0])

        # len(wts) = 1 corresponds to softmax regression
        if len(wts) > 1:
            for i, (w, b,
                    activ) in enumerate(zip(wts[1:], bs[1:], self.activs[1:])):
                act = activ(T.dot(act, w) + b)

        # for numerical stability
        eps = 1e-6
        act = T.switch(act < eps, eps, act)
        act = T.switch(act > (1. - eps), (1. - eps), act)

        return act

    def check_gradients(self, X_in, Y_in, wts=None, bs=None):
        ''' this seems like overkill, but I suppose it doesn't hurt to have it in here...'''

        # assume that if it's not provided, they will be shared variables - this is
        # probably dangerous, but this is a debugging tool anyway,
        # so...whatever
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_
        else:
            wts = [theano.shared(nu.floatX(w), borrow=True) for w in wts]
            bs = [theano.shared(nu.floatX(b), borrow=True) for b in bs]

        X = T.matrix()  # inputs
        Y = T.matrix()  # labels
        v = T.vector()  # vector of biases and weights
        i = T.lscalar()  # index

        # 1. compile the numerical gradient function
        def compute_numerical_gradient(v, i, X, Y, eps=1e-4):

            # perturb the input
            v_plus = T.inc_subtensor(v[i], eps)
            v_minus = T.inc_subtensor(v[i], -1.0 * eps)

            # roll it back into the weight matrices and bias vectors
            wts_plus, bs_plus = nu.t_reroll(v_plus, self.num_nodes)
            wts_minus, bs_minus = nu.t_reroll(v_minus, self.num_nodes)

            # compute the loss for both sides, and then compute the numerical
            # gradient
            loss_plus = self.compute_optim_loss(X, Y, wts=wts_plus, bs=bs_plus)
            loss_minus = self.compute_optim_loss(X, Y, wts_minus, bs_minus)

            # ( E(weights[i]+eps) - E(weights[i]-eps) )/(2*eps)
            return 1.0 * (loss_plus - loss_minus) / (2 * eps)

        compute_ngrad = theano.function(inputs=[v, i, X, Y],
                                        outputs=compute_numerical_gradient(
                                            v, i, X, Y))

        # 2. compile backprop (theano's autodiff)
        optim_loss = self.compute_optim_loss(X, Y, wts=wts, bs=bs)
        params = [p for param in [wts, bs]
                  for p in param]  # all model parameters in a list
        # gradient of each model param w.r.t training loss
        grad_params = [T.grad(optim_loss, param) for param in params]
        # gradient of the full weight vector
        grad_w = nu.t_unroll(grad_params[:len(wts)], grad_params[len(wts):])

        compute_bgrad = theano.function(inputs=[X, Y], outputs=grad_w)

        # compute the mean difference between the numerical and exact gradients
        v0 = nu.unroll([wt.get_value() for wt in wts],
                       [b.get_value() for b in bs])
        # get the indices of the weights/biases we want to check
        idxs = np.random.permutation(self.num_params)[:(self.num_params // 5)]

        ngrad = [None] * len(idxs)
        for j, idx in enumerate(idxs):
            ngrad[j] = compute_ngrad(v0, idx, X_in, Y_in)
        bgrad = compute_bgrad(X_in, Y_in)[idxs]

        cerr = np.mean(np.abs(ngrad - bgrad))
        assert cerr < 1e-10

    def compute_eval_loss(self, X, y, wts=None, bs=None):
        ''' Given inputs, returns the evaluation loss at the current state of the model
        Parameters:
        -----------
        param: X - training data
        type: theano matrix
        param: y - training labels
        type: theano matrix
        param: wts - weights
        type: theano matrix, optional
        param: bs - biases
        type: theano matrix, optional
        Returns:
        --------
        param: eval_loss - evaluation loss, which doesn't include regularization
        type: theano scalar
        '''
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_

        eval_loss = None  # the loss function we can evaluate during validation
        y_pred = self.fprop(X, wts, bs)

        if 'cross_entropy' in self.loss_terms:
            eval_loss = nl.cross_entropy(y, y_pred)

        elif 'binary_cross_entropy' in self.loss_terms:
            eval_loss = nl.binary_cross_entropy(y, y_pred)

        elif 'squared_error' in self.loss_terms:
            eval_loss = nl.squared_error(y, y_pred)
        else:
            sys.exit('Loss must be one of cross_entropy, binary_cross_entropy, or squared_error')

        return eval_loss

    def compute_optim_loss(self, X, y, wts=None, bs=None):
        ''' Given inputs, returns the training loss at the current state of the model
        Parameters:
        -----------
        param: X - training data
        type: theano matrix
        param: y - training labels
        type: theano matrix
        param: wts - weights
        type: theano matrix, optional
        param: bs - biases
        type: theano matrix, optional
        Returns:
        --------
        param: optim_loss - the optimization loss which must be optimized over
        type: theano scalar
        '''
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_

        y_optim = self.train_fprop(X, wts, bs)
        # the loss function which will specifically be optimized over
        optim_loss = None

        if 'cross_entropy' in self.loss_terms:
            optim_loss = nl.cross_entropy(y, y_optim)

        elif 'binary_cross_entropy' in self.loss_terms:
            optim_loss = nl.binary_cross_entropy(y, y_optim)

        elif 'squared_error' in self.loss_terms:
            optim_loss = nl.squared_error(y, y_optim)

        else:
            sys.exit('Loss must be one of cross_entropy, binary_cross_entropy, or squared_error')

        if 'l1_reg' in self.loss_terms:
            l1_decay = self.loss_params.get('l1_decay')
            optim_loss += nl.l1_reg(wts, l1_decay=l1_decay)

        if 'l2_reg' in self.loss_terms:
            l2_decay = self.loss_params.get('l2_decay')
            optim_loss += nl.l2_reg(wts, l2_decay=l2_decay)

        return optim_loss

    def get_weights_and_biases(self):
        ''' simple function which returns the weights and biases as numpy arrays'''

        wts = [wt.get_value() for wt in self.wts_]
        bs = [b.get_value() for b in self.bs_]

        return wts, bs

    # debugging
    def check_nans(self):
        ''' simple function which returns True if any value is NaN in wts or biases '''

        # poke into the shared variables and get their values
        wts, bs = self.get_weights_and_biases()
        nans = 0
        for wt, b in zip(wts, bs):
            nans += np.sum(wt) + np.sum(b)

        return np.isnan(nans)
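# check_gradients above compares Theano's automatic gradient against the central
# finite difference ( E(w[i]+eps) - E(w[i]-eps) ) / (2*eps). A tiny numpy-only
# sketch of the same check on a quadratic loss (all names are illustrative):
import numpy as np

A = np.array([[3.0, 1.0], [1.0, 2.0]])
b = np.array([1.0, -1.0])

def loss(w):
    return 0.5 * w.dot(A).dot(w) - b.dot(w)

def grad(w):
    return A.dot(w) - b          # analytic gradient of the quadratic

w = np.random.randn(2)
eps = 1e-4
num_grad = np.zeros_like(w)
for i in range(w.size):
    e = np.zeros_like(w)
    e[i] = eps
    num_grad[i] = (loss(w + e) - loss(w - e)) / (2 * eps)

print(np.mean(np.abs(num_grad - grad(w))))  # tiny, dominated by floating-point error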
Example #19
0
class MaskGenerator(object):
    def __init__(self, input_size, hidden_sizes, l, random_seed=1234):
        self._random_seed = random_seed
        self._mrng = MRG_RandomStreams(seed=random_seed)
        self._rng = RandomStreams(seed=random_seed)

        self._hidden_sizes = hidden_sizes
        self._input_size = input_size
        self._l = l

        self.ordering = theano.shared(value=np.arange(
            input_size, dtype=theano.config.floatX),
                                      name='ordering',
                                      borrow=False)

        # Initial layer connectivity
        self.layers_connectivity = [
            theano.shared(value=(self.ordering + 1).eval(),
                          name='layer_connectivity_input',
                          borrow=False)
        ]
        for i in range(len(self._hidden_sizes)):
            self.layers_connectivity += [
                theano.shared(value=np.zeros((self._hidden_sizes[i]),
                                             dtype=theano.config.floatX),
                              name='layer_connectivity_hidden{0}'.format(i),
                              borrow=False)
            ]
        self.layers_connectivity += [self.ordering]

        ## Theano functions
        new_ordering = self._rng.shuffle_row_elements(self.ordering)
        self.shuffle_ordering = theano.function(
            name='shuffle_ordering',
            inputs=[],
            updates=[(self.ordering, new_ordering),
                     (self.layers_connectivity[0], new_ordering + 1)])

        self.layers_connectivity_updates = []
        for i in range(len(self._hidden_sizes)):
            self.layers_connectivity_updates += [
                self._get_hidden_layer_connectivity(i)
            ]
        # self.layers_connectivity_updates = [self._get_hidden_layer_connectivity(i) for i in range(len(self._hidden_sizes))]  # NOTE: the list-comprehension form did not work here
        self.sample_connectivity = theano.function(
            name='sample_connectivity',
            inputs=[],
            updates=[(self.layers_connectivity[i + 1],
                      self.layers_connectivity_updates[i])
                     for i in range(len(self._hidden_sizes))])

        # Save random initial state
        self._initial_mrng_rstate = copy.deepcopy(self._mrng.rstate)
        self._initial_mrng_state_updates = [
            state_update[0].get_value()
            for state_update in self._mrng.state_updates
        ]

        # Ensuring valid initial connectivity
        self.sample_connectivity()

    def reset(self):
        # Set Original ordering
        self.ordering.set_value(
            np.arange(self._input_size, dtype=theano.config.floatX))

        # Reset RandomStreams
        self._rng.seed(self._random_seed)

        # Initial layer connectivity
        self.layers_connectivity[0].set_value((self.ordering + 1).eval())
        for i in range(1, len(self.layers_connectivity) - 1):
            self.layers_connectivity[i].set_value(
                np.zeros((self._hidden_sizes[i - 1]),
                         dtype=theano.config.floatX))
        self.layers_connectivity[-1].set_value(self.ordering.get_value())

        # Reset MRG_RandomStreams (GPU)
        self._mrng.rstate = self._initial_mrng_rstate
        for state, value in zip(self._mrng.state_updates,
                                self._initial_mrng_state_updates):
            state[0].set_value(value)

        self.sample_connectivity()

    def _get_p(self, start_choice):
        start_choice_idx = (start_choice - 1).astype('int32')
        p_vals = T.concatenate([
            T.zeros((start_choice_idx, )),
            T.nnet.nnet.softmax(self._l * T.arange(
                start_choice, self._input_size, dtype=theano.config.floatX))[0]
        ])
        p_vals = T.inc_subtensor(
            p_vals[start_choice_idx], 1.
        )  # hack: the multinomial op has no safety margin for numerical imprecision, so bump the first admissible entry
        return p_vals

    def _get_hidden_layer_connectivity(self, layerIdx):
        layer_size = self._hidden_sizes[layerIdx]
        if layerIdx == 0:
            p_vals = self._get_p(T.min(self.layers_connectivity[layerIdx]))
        else:
            p_vals = self._get_p(
                T.min(self.layers_connectivity_updates[layerIdx - 1]))

        # #Implementations of np.choose in theano GPU
        # return T.nonzero(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX))[1].astype(dtype=theano.config.floatX)
        # return T.argmax(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX), axis=1)
        return T.sum(T.cumsum(self._mrng.multinomial(
            pvals=T.tile(p_vals[::-1][None, :], (layer_size, 1)),
            dtype=theano.config.floatX),
                              axis=1),
                     axis=1)

    def _get_mask(self, layerIdxIn, layerIdxOut):
        return (self.layers_connectivity[layerIdxIn][:, None] <=
                self.layers_connectivity[layerIdxOut][None, :]).astype(
                    theano.config.floatX)

    def get_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(layerIdx, layerIdx + 1)

    def get_direct_input_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(0, layerIdx)

    def get_direct_output_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(layerIdx, -1)
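# The reset() method above rolls MRG_RandomStreams back by restoring both .rstate and
# the shared values in .state_updates. A hedged standalone sketch of that idiom,
# assuming those attributes behave as the class above relies on (names illustrative):
import copy
import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

mrng = MRG_RandomStreams(seed=1234)
draw = theano.function([], mrng.uniform((2,)))

# save the initial state, as MaskGenerator.__init__ does
initial_rstate = copy.deepcopy(mrng.rstate)
initial_state_updates = [su[0].get_value() for su in mrng.state_updates]

a = draw()

# restore, as MaskGenerator.reset() does
mrng.rstate = initial_rstate
for state, value in zip(mrng.state_updates, initial_state_updates):
    state[0].set_value(value)

b = draw()
print((a == b).all())  # True: the stream was rolled back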
Example #20
0
# Fix random seed for reproducible experiments
if K.backend() == "tensorflow":
    import tensorflow as tf
    session_conf = tf.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1)  #, device_count={"GPU": 0}
    tf.set_random_seed(1234)
    session_conf.gpu_options.allow_growth = True
    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    K.set_session(sess)
else:
    from theano.tensor.shared_randomstreams import RandomStreams
    # from theano import function
    srng = RandomStreams(seed=123456789)
    srng.seed(123456789)  # seeds rv_u and rv_n with different seeds each

from utils import invert_dict, unsplit_query, merge_two_dicts, sample_aaai_val_set
from data_preprocess import gen_data, load_data, save_data, construct_vocab_emb
from attention_model import create_attention_model


def evaluate(predictions_file, qrels_file):
    print(predictions_file, qrels_file)
    pargs = shlex.split("/bin/sh run_eval.sh '{}' '{}'".format(
        qrels_file, predictions_file))
    p = subprocess.Popen(pargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    pout, perr = p.communicate()

    print(perr)
    if sys.version_info[0] < 3:
Example #21
0
g = function([], rv_n, no_default_updates=True)  # keep reusing the same draws without updating the rng state, so every call to g() returns the same result
g_val0 = g()
g_val1 = g()
print(g_val0)
print(g_val1)

nearly_zeros = function([], rv_u + rv_u - 2*rv_u)  # a random variable is sampled only once per function call, so this is (nearly) zero
print(nearly_zeros())

# seed each random variable separately with .rng.set_value()
rng_val = rv_u.rng.get_value(borrow=True)  # Get the rng for rv_u; borrow=True shares the underlying memory
rng_val.seed(89234)  # re-seed the generator
rv_u.rng.set_value(rng_val, borrow=True)

srng.seed(902340)  # or seed all random variables at once with .seed()

state_after_v0 = rv_u.rng.get_value().get_state()   # save the rng state before calling; get_value fetches rv_u's rng
print(nearly_zeros())
v1 = f()                                            # first call; the rng state advances afterwards
rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)                       # restore the saved state
rv_u.rng.set_value(rng, borrow=True)
v2 = f()    # v2 != v1: back at the saved state, this is the draw that nearly_zeros consumed
v3 = f()    # v3 == v1: advancing again replays the original sequence

print(v1)
print(v2)
print(v3)

Example #22
0
class MaskGenerator(object):

    def __init__(self, input_size, hidden_sizes, l, random_seed=1234):
        self._random_seed = random_seed
        self._mrng = MRG_RandomStreams(seed=random_seed)
        self._rng = RandomStreams(seed=random_seed)

        self._hidden_sizes = hidden_sizes
        self._input_size = input_size
        self._l = l

        self.ordering = theano.shared(np.arange(input_size, 
                                                dtype=theano.config.floatX), 
                                      'ordering', 
                                      borrow=False)

        # Initial layer connectivity
        self.layers_connectivity = [theano.shared((self.ordering + 1).eval(), 
                                                  'layer_connectivity_input', 
                                                  borrow=False)]
        for i in range(len(self._hidden_sizes)):
            lc = theano.shared(np.zeros((self._hidden_sizes[i]),dtype=floatX), 
                               'layer_connectivity_hidden{0}'.format(i),
                               borrow=False)
            self.layers_connectivity += [lc]
        self.layers_connectivity += [self.ordering]

        ## Theano functions
        new_ordering = self._rng.shuffle_row_elements(self.ordering)
        updates = [(self.ordering, new_ordering), 
                   (self.layers_connectivity[0], new_ordering + 1)]
        self.shuffle_ordering = theano.function(name='shuffle_ordering',
                                                inputs=[],
                                                updates=updates)

        self.layers_connectivity_updates = []
        for i in range(len(self._hidden_sizes)):
            lcu = self._get_hidden_layer_connectivity(i)
            self.layers_connectivity_updates += [lcu]
        
        hsizes = range(len(self._hidden_sizes))
        updates = [(self.layers_connectivity[i+1], 
                    self.layers_connectivity_updates[i]) for i in hsizes]
        self.sample_connectivity = theano.function(name='sample_connectivity',
                                                   inputs=[],
                                                   updates=updates)

        # Save random initial state
        self._initial_mrng_rstate = copy.deepcopy(self._mrng.rstate)
        self._initial_mrng_state_updates = [sup[0].get_value() for sup in 
                                            self._mrng.state_updates]

        # Ensuring valid initial connectivity
        self.sample_connectivity()

    def reset(self):
        # Set Original ordering
        self.ordering.set_value(np.arange(self._input_size, 
                                          dtype=theano.config.floatX))

        # Reset RandomStreams
        self._rng.seed(self._random_seed)

        # Initial layer connectivity
        self.layers_connectivity[0].set_value((self.ordering + 1).eval())
        for i in range(1, len(self.layers_connectivity)-1):
            value = np.zeros((self._hidden_sizes[i-1]), 
                             dtype=theano.config.floatX)
            self.layers_connectivity[i].set_value(value)
        self.layers_connectivity[-1].set_value(self.ordering.get_value())

        # Reset MRG_RandomStreams (GPU)
        self._mrng.rstate = self._initial_mrng_rstate
        states_values = zip(self._mrng.state_updates, 
                            self._initial_mrng_state_updates)
        for state, value in states_values:
            state[0].set_value(value)

        self.sample_connectivity()

    def _get_p(self, start_choice):
        start_choice_idx = (start_choice-1).astype('int32')
        prob = T.nnet.nnet.softmax(self._l * T.arange(start_choice, 
                                                      self._input_size, 
                                                      dtype=floatX))[0]
        p_vals = T.concatenate([T.zeros((start_choice_idx,)),prob])
        p_vals = T.inc_subtensor(p_vals[start_choice_idx], 1.)  
        return p_vals

    def _get_hidden_layer_connectivity(self, layerIdx):
        layer_size = self._hidden_sizes[layerIdx]
        if layerIdx == 0:
            lc = self.layers_connectivity[layerIdx]
            p_vals = self._get_p(T.min(lc))
        else:
            lc = self.layers_connectivity_updates[layerIdx-1]
            p_vals = self._get_p(T.min(lc))

        return T.sum(
            T.cumsum(self._mrng.multinomial(
            pvals=T.tile(p_vals[::-1][None, :],(layer_size, 1)), 
            dtype=floatX), axis=1), axis=1
        )

    def _get_mask(self, layerIdxIn, layerIdxOut):
        return (self.layers_connectivity[layerIdxIn][:, None] <= 
                self.layers_connectivity[layerIdxOut][None, :]).astype(floatX)

    def get_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(layerIdx, layerIdx + 1)

    def get_direct_input_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(0, layerIdx)

    def get_direct_output_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(layerIdx, -1)