def test_deterministic():
    """Check that MRG_RandomStreams replays the same sequence for one seed.

    Two consecutive draws from one stream must differ, while a second stream
    built from the same seed must reproduce both draws. The check runs with
    ``use_cuda=False`` and, when ``cuda_available`` is true, with
    ``use_cuda=True`` as well.
    """
    seed = utt.fetch_seed()
    shape = (10, 20)

    cuda_flags = [False, True] if cuda_available else [False]

    for use_cuda in cuda_flags:
        # First stream: successive calls must not repeat.
        first_stream = MRG_RandomStreams(seed=seed, use_cuda=use_cuda)
        draw_first = theano.function([], first_stream.uniform(size=shape))
        fsample1 = draw_first()
        fsample2 = draw_first()
        assert not numpy.allclose(fsample1, fsample2)

        # Second stream with the same seed: must replay the first stream.
        second_stream = MRG_RandomStreams(seed=seed, use_cuda=use_cuda)
        draw_second = theano.function([], second_stream.uniform(size=shape))
        gsample1 = draw_second()
        gsample2 = draw_second()
        assert numpy.allclose(fsample1, gsample1)
        assert numpy.allclose(fsample2, gsample2)
def test_uniform_broadcastable():
    """Check the broadcastable pattern of ``uniform`` for sizes ending in 1.

    Both a fully constant size and a size whose first entry is a symbolic
    shape must give a ``(False, True)`` broadcastable pattern.
    """
    x = tensor.matrix()
    stream = MRG_RandomStreams(234, use_cuda=False)

    candidate_sizes = (
        (10, 1),           # all dimensions are constant
        (x.shape[0], 1),   # first dimension is a theano variable
    )
    for size in candidate_sizes:
        uu = stream.uniform(size=size)
        assert uu.broadcastable == (False, True)
def test_seed_fn():
    """Check that ``MRG_RandomStreams.seed`` resets already-compiled functions.

    Three functions (fixed 2x2 size, fixed 3x3 size with ``nstreams=2``, and a
    symbolic 1-d size with ``nstreams=3``) are sampled twice, the stream is
    re-seeded, and they are sampled twice again. Re-seeding with 234 or
    ``None`` must reproduce the first draws; re-seeding with 23 must not.
    """
    idx = tensor.ivector()
    cuda_flags = [False, True] if cuda_available else [False]

    for use_cuda in cuda_flags:
        # With CUDA under FAST_COMPILE the functions are compiled in FAST_RUN.
        if config.mode == 'FAST_COMPILE' and use_cuda:
            mode = 'FAST_RUN'
        else:
            mode = config.mode

        for new_seed, same in [(234, True), (None, True), (23, False)]:
            random = MRG_RandomStreams(234, use_cuda=use_cuda)
            fn1 = theano.function(
                [], random.uniform((2, 2), dtype='float32'), mode=mode)
            fn2 = theano.function(
                [], random.uniform((3, 3), nstreams=2, dtype='float32'),
                mode=mode)
            fn3 = theano.function(
                [idx],
                random.uniform(idx, nstreams=3, ndim=1, dtype='float32'),
                mode=mode)

            # Draws before re-seeding: consecutive calls must differ.
            fn1_before = (fn1(), fn1())
            assert not numpy.allclose(fn1_before[0], fn1_before[1])
            fn2_before = (fn2(), fn2())
            assert not numpy.allclose(fn2_before[0], fn2_before[1])
            fn3_before = (fn3([4]), fn3([4]))
            assert not numpy.allclose(fn3_before[0], fn3_before[1])

            assert fn1_before[0].size == 4
            assert fn2_before[0].size == 9

            random.seed(new_seed)

            # Draws after re-seeding, taken in the same order as before.
            fn1_after = (fn1(), fn1())
            fn2_after = (fn2(), fn2())
            fn3_after = (fn3([4]), fn3([4]))

            for before, after in ((fn1_before, fn1_after),
                                  (fn2_before, fn2_after),
                                  (fn3_before, fn3_after)):
                for old, new in zip(before, after):
                    assert numpy.allclose(old, new) == same
def test_consistency_randomstreams():
    """Compare MRG_RandomStreams output with the reference Java samples.

    Verify that the random numbers generated by MRG_RandomStreams are the
    same as the reference (Java) implementation by L'Ecuyer et al., as stored
    in ``java_samples``. The check runs with ``use_cuda=False`` and, when
    ``cuda_available`` is true, with ``use_cuda=True`` as well.
    """
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7

    test_use_cuda = [False]
    if cuda_available:
        test_use_cuda.append(True)

    for use_cuda in test_use_cuda:
        samples = []
        # Pass the loop variable through: with a hard-coded ``False`` the
        # CUDA iteration silently re-tested the CPU implementation.
        rng = MRG_RandomStreams(seed=seed, use_cuda=use_cuda)
        for i in range(n_streams):
            stream_samples = []
            u = rng.uniform(size=(n_substreams,), nstreams=n_substreams)
            f = theano.function([], u)
            for j in range(n_samples):
                stream_samples.append(f())
            # Rows are calls, columns are substreams; transpose so that each
            # substream's samples are contiguous before flattening.
            stream_samples = numpy.array(stream_samples)
            stream_samples = stream_samples.T.flatten()
            samples.append(stream_samples)

        samples = numpy.array(samples).flatten()
        assert numpy.allclose(samples, java_samples)
def compare_speed():
    """Profile MRG against CURAND for uniform and normal sampling.

    To run this speed comparison:
        cd <directory of this file>
        THEANO_FLAGS=device=gpu \
          python -c 'import test_rng_curand; test_rng_curand.compare_speed()'

    Four profiled functions are compiled, each writing ``N`` samples into one
    shared buffer. Their graphs are printed, then each is called 100 times.
    """
    mrg = MRG_RandomStreams()
    crn = CURAND_RandomStreams(234)

    N = 1000 * 100
    dest = theano.shared(numpy.zeros(N, dtype=theano.config.floatX))

    mrg_u = theano.function([], [], updates={dest: mrg.uniform((N,))},
                            profile='mrg uniform')
    crn_u = theano.function([], [], updates={dest: crn.uniform((N,))},
                            profile='crn uniform')
    mrg_n = theano.function([], [], updates={dest: mrg.normal((N,))},
                            profile='mrg normal')
    crn_n = theano.function([], [], updates={dest: crn.normal((N,))},
                            profile='crn normal')

    profiled = (mrg_u, crn_u, mrg_n, crn_n)

    # Print the compiled graph of every function before timing anything.
    for fn in profiled:
        print('DEBUGPRINT')
        print('----------')
        theano.printing.debugprint(fn)

    for round_no in range(100):
        for fn in profiled:
            # don't time the first call, it has some startup cost
            fn.fn.time_thunks = (round_no > 0)
            fn()
def compute_output(self, network, in_vw):
    """Create the "default" output: a rectifier with a negative-side slope.

    Hyperparameters read from ``network``:
      * ``deterministic`` - selects the fixed or the sampled coefficient
      * ``alpha_lower`` (default 3) and ``alpha_upper`` (default 8)

    When deterministic, the negative coefficient is ``2 / (lower + upper)``;
    otherwise it is ``1 / alpha`` with ``alpha`` drawn uniformly from
    ``[lower, upper]`` with the symbolic shape of the input.
    """
    # gather hyperparameters
    deterministic = network.find_hyperparameter(["deterministic"])
    lower = network.find_hyperparameter(["alpha_lower"], 3)
    upper = network.find_hyperparameter(["alpha_upper"], 8)

    if not deterministic:
        # TODO save this state so that we can seed the rng
        srng = MRG_RandomStreams()
        alphas = srng.uniform(size=in_vw.symbolic_shape(),
                              low=lower,
                              high=upper)
        negative_coefficient = 1.0 / alphas
    else:
        negative_coefficient = 2.0 / (lower + upper)

    # register the output variable
    rectified = treeano.utils.rectify(
        in_vw.variable,
        negative_coefficient=negative_coefficient)
    network.create_vw(
        "default",
        variable=rectified,
        shape=in_vw.shape,
        tags={"output"},
    )
def test_broadcastable():
    """Check broadcastable patterns of every MRG_RandomStreams distribution.

    Each distribution is built once from constant dimensions and once with a
    symbolic first dimension; in both cases the trailing dimension of length
    one must be reported as broadcastable.
    """
    R = MRG_RandomStreams(234)
    x = tensor.matrix()
    size1 = (10, 1)
    size2 = (x.shape[0], 1)

    pvals_1 = np.random.uniform(0, 1, size=size1)
    pvals_1 = pvals_1 / sum(pvals_1)
    pvals_2 = R.uniform(size=size2)
    pvals_2 = pvals_2 / tensor.sum(pvals_2)

    # multinomial and multinomial_wo_replacement do not support the "size"
    # argument; their sizes are implicitly defined by the "pvals" argument.
    pvals_driven = [R.multinomial, R.multinomial_wo_replacement]

    for distribution in [R.uniform, R.binomial, R.multinomial,
                         R.multinomial_wo_replacement, R.normal]:
        if distribution in pvals_driven:
            cases = ({'pvals': pvals_1}, {'pvals': pvals_2})
        else:
            cases = ({'size': size1}, {'size': size2})

        # First case: all dimensions constant.
        # Second case: some dimensions are theano variables.
        for kwargs in cases:
            uu = distribution(**kwargs)
            assert uu.broadcastable == (False, True)
def prediction(self, h, bias):
    """Sample one ``(x, y, pin)`` row per row of ``h``.

    The parameters come from ``self.compute_parameters(h, bias)``. For every
    row a component index is picked as the argmax of a multinomial draw over
    ``prop``; that component's means, standard deviations and correlation
    turn two standard normal draws into ``x`` and ``y``. ``pin`` is 1 where
    ``bernoulli`` exceeds a uniform draw, cast to ``floatX``.
    """
    srng = RandomStreams(seed=42)

    (prop, mean_x, mean_y,
     std_x, std_y, rho, bernoulli) = self.compute_parameters(h, bias)

    # Pick one component per row.
    component = T.argmax(srng.multinomial(pvals=prop, dtype=prop.dtype),
                         axis=1)
    rows = T.arange(0, mean_x.shape[0])

    mu_x = mean_x[rows, component]
    mu_y = mean_y[rows, component]
    sigma_x = std_x[rows, component]
    sigma_y = std_y[rows, component]
    corr = rho[rows, component]

    # Two independent standard normal draws per row.
    gauss = srng.normal((h.shape[0], 2))
    eps_x = gauss[:, 0]
    eps_y = gauss[:, 1]

    # y mixes both draws so that it is correlated with x through ``corr``.
    x_n = T.shape_padright(mu_x + sigma_x * eps_x)
    y_n = T.shape_padright(
        mu_y + sigma_y * (eps_x * corr + eps_y * T.sqrt(1. - corr ** 2)))

    threshold = srng.uniform((h.shape[0],))
    pin = T.shape_padright(T.cast(bernoulli > threshold, floatX))

    return T.concatenate([x_n, y_n, pin], axis=1)
def test_f16_nonzero(mode=None, op_to_check=rng_mrg.mrg_uniform):
    """Check float16 uniform samples lie strictly between 0 and 1.

    Also asserts that the compiled graph contains a node whose op is an
    instance of ``op_to_check``.
    """
    stream = MRG_RandomStreams(seed=utt.fetch_seed())
    m = stream.uniform(size=(1000, 1000), dtype='float16')
    assert m.dtype == 'float16', m.type

    f = theano.function([], m, mode=mode)
    graph_nodes = f.maker.fgraph.apply_nodes
    assert any(isinstance(node.op, op_to_check) for node in graph_nodes)

    m_val = f()
    strictly_inside = (0 < m_val) & (m_val < 1)
    assert np.all(strictly_inside)
class UniformRandom(object):
    """Uniform random sampling backed by a ``RandomStreams`` generator.

    Reads ``self.seed``, ``self.low`` and ``self.high``; none of them is
    defined in this class, so they are presumably supplied by a subclass or
    mixin (TODO confirm).
    """

    def __init__(self):
        # Fall back to a fixed seed when ``self.seed`` is falsy.
        chosen_seed = self.seed or 123456
        self._rng = RandomStreams(seed=chosen_seed)

    def _sample(self, shape, dtype):
        """Return a uniform draw of ``shape``/``dtype`` within [low, high]."""
        draw = self._rng.uniform(
            size=shape,
            low=self.low,
            high=self.high,
            dtype=dtype,
        )
        return draw
def compile_iter_fns(self, *args, **kwargs):
    """Compile the training, clipping, sampling and validation functions.

    The critic loss is ``real_out.mean() - fake_out.mean()`` (its negation is
    handed to rmsprop) and the generator loss is ``-fake_out.mean()``. Both
    optimisers share one learning-rate variable, exposed as ``self.eta`` and
    ``self.shared_lr``. Sets ``generator_train_fn``, ``critic_train_fn``,
    ``critic_clip_fn``, ``gen_fn`` and ``val_fn`` on ``self``.
    """
    import time
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    eta = theano.shared(lasagne.utils.floatX(initial_eta))
    self.eta = eta
    self.shared_lr = eta

    loss_critic = self.real_out.mean() - self.fake_out.mean()
    critic_updates = rmsprop(
        -1*loss_critic, self.critic_params, learning_rate=eta)

    loss_gen = -1*self.fake_out.mean()
    generator_updates = rmsprop(
        loss_gen, self.generator_params, learning_rate=eta)

    # Clip critic parameters in a limited range around zero (except biases)
    clipped_params = lasagne.layers.get_all_params(
        self.critic, trainable=True, regularizable=True)
    critic_clip_updates = [[param, T.clip(param, -clip, clip)]
                           for param in clipped_params]

    # Symbolic noise generator used for training
    srng = RandomStreams(seed=np.random.randint(2147462579, size=6))
    noise = srng.uniform((batchsize, 100))
    noise_givens = {self.noise_var: noise}

    # Compile functions performing a training step on a mini-batch (according
    # to the updates dictionary) and returning the corresponding score:
    print('Compiling...')
    start = time.time()

    self.generator_train_fn = theano.function(
        [], loss_gen,
        givens={self.noise_var: noise},
        updates=generator_updates)
    self.critic_train_fn = theano.function(
        [self.input_var], loss_critic,
        givens={self.noise_var: noise},
        updates=critic_updates)
    self.critic_clip_fn = theano.function([], updates=critic_clip_updates)

    # Function generating data from explicitly supplied noise
    self.gen_fn = theano.function(
        [self.noise_var],
        lasagne.layers.get_output(self.generator, deterministic=True))

    self.val_fn = theano.function(
        [self.input_var],
        outputs=[loss_critic, loss_gen],
        givens=noise_givens)

    if self.verbose:
        print ('Compile time: %.3f s' % (time.time()-start))
def test_deterministic():
    """Check that MRG_RandomStreams replays the same sequence for one seed.

    Two consecutive draws from one stream must differ, while a second stream
    built from the same seed must reproduce both draws.
    """
    seed = utt.fetch_seed()
    shape = (10, 20)

    # First stream: successive calls must not repeat.
    first_stream = MRG_RandomStreams(seed=seed)
    draw_first = theano.function([], first_stream.uniform(size=shape))
    fsample1 = draw_first()
    fsample2 = draw_first()
    assert not np.allclose(fsample1, fsample2)

    # Second stream with the same seed: must replay the first stream.
    second_stream = MRG_RandomStreams(seed=seed)
    draw_second = theano.function([], second_stream.uniform(size=shape))
    gsample1 = draw_second()
    gsample2 = draw_second()
    assert np.allclose(fsample1, gsample1)
    assert np.allclose(fsample2, gsample2)
def test_uniform():
    """Run ``basictest`` on MRG uniform samples for several size specs.

    Sizes covered: a constant tuple, a symbolic shape, a mixed
    symbolic/constant tuple, and the empty (scalar) size. The same sizes are
    also run through the numpy-backed ``shared_randomstreams`` generator.
    """
    # TODO: test param low, high
    # TODO: test size=None
    # TODO: test ndim!=size.ndim
    # TODO: test bad seed
    # TODO: test size=Var, with shape that change from call to call
    slow_mode = config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE']
    python_linker = (config.mode == 'Mode' and config.linker in ['py'])
    if slow_mode or python_linker:
        sample_size = (10, 100)
        steps = 50
    else:
        sample_size = (500, 50)
        steps = int(1e3)

    x = tensor.matrix()
    cases = [
        (sample_size, sample_size, [], []),
        (x.shape, sample_size, [x],
         [np.zeros(sample_size, dtype=config.floatX)]),
        ((x.shape[0], sample_size[1]), sample_size, [x],
         [np.zeros(sample_size, dtype=config.floatX)]),
        # test empty size (scalar)
        ((), (), [], []),
    ]

    for size, const_size, var_input, call_args in cases:
        # TEST CPU IMPLEMENTATION
        # The python and C implementation are tested with DebugMode
        x = tensor.matrix()  # rebinding kept from the original; unused below
        R = MRG_RandomStreams(234)
        # Note: we specify `nstreams` to avoid a warning.
        # TODO Look for all occurrences of `guess_n_streams` and `30 * 256`
        # for such situations: it would be better to instead filter the
        # warning using the warning module.
        n_streams = rng_mrg.guess_n_streams(size, warn=False)
        u = R.uniform(size=size, nstreams=n_streams)
        f = theano.function(var_input, u)
        assert any([isinstance(node.op, theano.sandbox.rng_mrg.mrg_uniform)
                    for node in f.maker.fgraph.toposort()])
        f(*call_args)

        # Increase the number of steps if sizes implies only a few samples
        steps_ = steps * 100 if np.prod(const_size) < 10 else steps

        basictest(f, steps_, const_size, prefix='mrg cpu', inputs=call_args)

        RR = theano.tensor.shared_randomstreams.RandomStreams(234)
        uu = RR.uniform(size=size)
        ff = theano.function(var_input, uu)
        # It's not our problem if numpy generates 0 or 1
        basictest(ff, steps_, const_size, prefix='numpy',
                  allow_01=True, inputs=call_args)
def new_update_deltas(self, network):
    """Return update deltas that perturb "alpha" by a uniform random step.

    The step has the shape of the "alpha" variable wrapper and is drawn from
    ``[-step_size, step_size]``, where ``step_size`` is a hyperparameter.
    """
    step_size = network.find_hyperparameter(["step_size"])
    alpha_vw = network.get_variable("alpha")

    # NOTE: each MRG_RandomStreams has the same seed, so
    # all nodes with the same shape end up with the same alphas
    srng = MRG_RandomStreams()
    steps = srng.uniform(
        size=alpha_vw.shape,
        low=-step_size,
        high=step_size,
    )
    # TODO clip value of alpha (to prevent it becoming linear)
    deltas = {alpha_vw.variable: steps}
    return treeano.UpdateDeltas(deltas)
class Dropout(object):
    """Dropout mask generator with its own randomly seeded stream."""

    def __init__(self, shape = None, prob=0.5):
        """Store the mask shape and the retain probability ``1 - prob``."""
        self.retain_prob = 1.0 - prob
        self.shape = shape
        # The stream seed is itself drawn from the module-level RNG.
        self.seed = RNG.randint(1e6)
        self.rng = RandomStreams(self.seed)

    def drop(self, cur_in):
        """Mask ``cur_in`` and rescale it by ``1 / retain_prob``.

        The mask (also stored as ``self.mask``) is 1 where a uniform draw is
        below ``retain_prob`` and 0 elsewhere.
        """
        noise = self.rng.uniform(self.shape, dtype=theano.config.floatX)
        self.mask = T.switch(noise < self.retain_prob, 1., 0.)
        masked = cur_in * self.mask
        return masked / self.retain_prob
def test_target_parameter():
    """Check that ``target='cpu'`` samplers return numpy arrays."""
    srng = MRG_RandomStreams()
    pvals = np.array([[.98, .01, .01], [.01, .49, .50]])

    def basic_target_parameter_test(x):
        # Compile the variable alone and check the type of its value.
        compiled = theano.function([], x)
        assert isinstance(compiled(), np.ndarray)

    basic_target_parameter_test(
        srng.uniform((3, 2), target='cpu'))
    basic_target_parameter_test(
        srng.binomial((3, 2), target='cpu'))
    basic_target_parameter_test(
        srng.multinomial(pvals=pvals.astype('float32'), target='cpu'))
    basic_target_parameter_test(
        srng.choice(p=pvals.astype('float32'), replace=False, target='cpu'))
    basic_target_parameter_test(
        srng.multinomial_wo_replacement(pvals=pvals.astype('float32'),
                                        target='cpu'))
def test_seed_fn():
    """Check that ``MRG_RandomStreams.seed`` resets already-compiled functions.

    Three functions (fixed 2x2 size, fixed 3x3 size with ``nstreams=2``, and a
    symbolic 1-d size with ``nstreams=3``) are sampled twice, the stream is
    re-seeded, and they are sampled twice again. Re-seeding with 234 or
    ``None`` must reproduce the first draws; re-seeding with 23 must not.
    """
    idx = tensor.ivector()

    for new_seed, same in [(234, True), (None, True), (23, False)]:
        random = MRG_RandomStreams(234)
        fn1 = theano.function(
            [], random.uniform((2, 2), dtype='float32'))
        fn2 = theano.function(
            [], random.uniform((3, 3), nstreams=2, dtype='float32'))
        fn3 = theano.function(
            [idx],
            random.uniform(idx, nstreams=3, ndim=1, dtype='float32'))

        # Draws before re-seeding: consecutive calls must differ.
        fn1_before = (fn1(), fn1())
        assert not np.allclose(fn1_before[0], fn1_before[1])
        fn2_before = (fn2(), fn2())
        assert not np.allclose(fn2_before[0], fn2_before[1])
        fn3_before = (fn3([4]), fn3([4]))
        assert not np.allclose(fn3_before[0], fn3_before[1])

        assert fn1_before[0].size == 4
        assert fn2_before[0].size == 9

        random.seed(new_seed)

        # Draws after re-seeding, taken in the same order as before.
        fn1_after = (fn1(), fn1())
        fn2_after = (fn2(), fn2())
        fn3_after = (fn3([4]), fn3([4]))

        for before, after in ((fn1_before, fn1_after),
                              (fn2_before, fn2_after),
                              (fn3_before, fn3_after)):
            for old, new in zip(before, after):
                assert np.allclose(old, new) == same
def fprop(self, state_below):
    """Forward pass: noisy linear pre-activation gated by a threshold.

    Uniform samples ``un`` in ``[0, 1)`` are mapped to ``log(un / (1 - un))``
    and stored as ``self.noise``. The noise, scaled by ``self.noise_factor``,
    is added to ``self._linear_part(state_below)``. ``self.active_rate`` is
    set to the per-column fraction of rows above ``self.threshold``.

    Returns the pre-activation where it exceeds the threshold, 0 elsewhere.
    """
    # Call form with one parenthesised string prints the same text on
    # Python 2 and Python 3 (the bare statement is a SyntaxError on 3).
    print("======fprop=====")

    # NOTE(review): the stream is rebuilt with the fixed seed 234 on every
    # call - confirm that this is intended.
    rng = RandomStreams(seed=234)
    un = rng.uniform(size=(state_below.shape[0], self.dim),
                     low=0., high=1., dtype=config.floatX)
    self.noise = T.log(un / (1 - un))

    p = self._linear_part(state_below) + self.noise * self.noise_factor

    batch_size = (p.shape[0]).astype(config.floatX)
    active = T.gt(p, self.threshold)
    self.active_rate = active.sum(axis=0, dtype=config.floatX) / batch_size

    return T.gt(p, self.threshold) * p
class IfElseDropLayer(Layer):
    """Layer that passes its whole input through or replaces it with zeros.

    In stochastic mode one uniform number is drawn; the input is returned
    when it is below ``p`` and zeros of the same shape otherwise. In
    deterministic mode the input is scaled by ``p``.
    """

    def __init__(self, incoming, p=0.5, **kwargs):
        super(IfElseDropLayer, self).__init__(incoming, **kwargs)
        stream_seed = get_rng().randint(1, 2147462579)
        self._srng = RandomStreams(stream_seed)
        self.p = p

    def get_output_for(self, input, deterministic=False, **kwargs):
        """Return ``p * input`` if deterministic, else input-or-zeros."""
        if not deterministic:
            coin = self._srng.uniform((1,), 0, 1)[0]
            return ifelse(T.lt(coin, self.p),
                          input,
                          T.zeros(input.shape))
        return self.p*input
class SaltAndPepperNoiseLayer(lasagne.layers.Layer):
    """Noise layer that overwrites random entries of its input with 0 or 1."""

    def __init__(self, incoming, rate=0.1, **kwargs):
        super(SaltAndPepperNoiseLayer, self).__init__(incoming, **kwargs)
        stream_seed = np.random.randint(1, 2147462579)
        self._srng = RandomStreams(stream_seed)
        self.rate = rate

    def get_output_for(self, input, deterministic=False, **kwargs):
        """Return ``input`` unchanged, or with entries forced to 0 and 1.

        A uniform value is drawn per entry. Entries whose value is below
        ``0.5 * rate`` are set to 0; entries whose value lies within
        ``0.25 * rate`` of ``0.75 * rate`` are set to 1. Nothing is changed
        when ``deterministic`` is true or ``rate`` is 0.
        """
        if deterministic or self.rate == 0:
            return input

        drop = self._srng.uniform(input.shape)
        to_zero = T.lt(drop, 0.5 * self.rate)
        to_one = T.lt(T.abs_(drop - 0.75 * self.rate), 0.25 * self.rate)

        noisy = T.set_subtensor(input[to_zero.nonzero()], 0.)
        noisy = T.set_subtensor(noisy[to_one.nonzero()], 1.)
        return noisy
def compile_iter_fns(self, *args, **kwargs):
    """Compile the training, sampling and validation functions.

    Both losses are mean squared errors against fixed targets: the generator
    pushes ``fake_out`` towards ``c``; the critic pushes ``real_out`` towards
    ``b`` and ``fake_out`` towards ``a``. Both optimisers share the
    ``self.shared_lr`` learning rate. Sets ``generator_train_fn``,
    ``critic_train_fn``, ``gen_fn`` and ``val_fn`` on ``self``.
    """
    import time
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    # Targets of the squared-error losses.
    # a, b, c = -1, 1, 0  # Equation (8) in the paper
    a, b, c = 0, 1, 1  # Equation (9) in the paper

    squared_error = lasagne.objectives.squared_error
    loss_gen = squared_error(self.fake_out, c).mean()
    loss_critic = (squared_error(self.real_out, b).mean() +
                   squared_error(self.fake_out, a).mean())

    self.shared_lr = theano.shared(lasagne.utils.floatX(initial_eta))

    generator_updates = lasagne.updates.rmsprop(
        loss_gen, self.generator_params, learning_rate=self.shared_lr)
    critic_updates = lasagne.updates.rmsprop(
        loss_critic, self.critic_params, learning_rate=self.shared_lr)

    # Symbolic noise generator used for training
    srng = RandomStreams(seed=np.random.randint(2147462579, size=6))
    noise = srng.uniform((batchsize, 100))

    # Compile functions performing a training step on a mini-batch (according
    # to the updates dictionary) and returning the corresponding score:
    if self.verbose:
        print('Compiling...')
    start = time.time()

    self.generator_train_fn = theano.function(
        [], loss_gen,
        givens={self.noise_var: noise},
        updates=generator_updates)
    self.critic_train_fn = theano.function(
        [self.input_var], loss_critic,
        givens={self.noise_var: noise},
        updates=critic_updates)

    # Function generating data from explicitly supplied noise
    self.gen_fn = theano.function(
        [self.noise_var],
        lasagne.layers.get_output(self.generator, deterministic=True))

    self.val_fn = theano.function(
        [self.input_var],
        outputs=[loss_critic, loss_gen],
        givens={self.noise_var: noise})

    if self.verbose:
        print ('Compile time: %.3f s' % (time.time()-start))
class IfElseDropLayer(lasagne.layers.Layer):
    """Layer that passes its whole input through or replaces it with zeros.

    Parameters
    ----------
    incoming : the layer feeding into this layer
    nonlinearity : stored as ``self.nonlinearity`` (``identity`` when
        ``None``); it is not applied by ``get_output_for``
    survival_p : probability that the input is kept in stochastic mode, and
        the scale applied to the input in deterministic mode

    Attributes
    ----------
    survival_p : the keep probability given at construction
    p : ``1 - survival_p``, kept for backward compatibility
    """

    def __init__(self, incoming, nonlinearity=elu, survival_p=0.5,
                 **kwargs):
        super(IfElseDropLayer, self).__init__(incoming, **kwargs)
        self.nonlinearity = (identity if nonlinearity is None
                             else nonlinearity)
        self._srng = RandomStreams(
            lasagne.random.get_rng().randint(1, 2147462579))
        self.survival_p = survival_p
        # Previously ``p`` (= 1 - survival_p) was used as the keep
        # probability, so the input survived with probability
        # 1 - survival_p. The attribute is kept for code that reads it.
        self.p = 1-survival_p

    def get_output_for(self, input, deterministic=False, **kwargs):
        """Return the input, zeros, or the input scaled by ``survival_p``.

        Deterministic mode scales the input by ``survival_p``. Otherwise one
        uniform number is drawn and the input is returned when it is below
        ``survival_p``; zeros of the same shape are returned otherwise.
        """
        if deterministic:
            return self.survival_p*input
        coin = self._srng.uniform((1,), 0, 1)[0]
        return theano.ifelse.ifelse(
            T.lt(coin, self.survival_p),
            input,
            T.zeros(input.shape)
        )
def test_GPUA_full_fill():
    """Check GPU uniform samples against the CPU ones for a large size.

    Make sure the whole sample buffer is filled. Also make sure large
    samples are consistent with CPU results.
    """
    # This needs to be large to trigger the problem on GPU
    size = (10, 1000)

    # CPU reference.
    R = MRG_RandomStreams(234)
    cpu_sample = R.uniform(size, nstreams=60 * 256)
    f_cpu = theano.function([], cpu_sample)

    # GPU sampler starting from a copy of the CPU stream state.
    cpu_state = R.state_updates[-1][0].get_value()
    rstate_gpu = gpuarray_shared_constructor(cpu_state)
    new_rstate, gpu_sample = GPUA_mrg_uniform.new(
        rstate_gpu, ndim=None, dtype='float32', size=size)
    rstate_gpu.default_update = new_rstate
    f_gpu = theano.function([], gpu_sample, mode=mode)

    utt.assert_allclose(f_cpu(), f_gpu())
def test_cpu_target_with_shared_variable():
    """Check ``target='cpu'`` is honoured when a GPU shared variable is used.

    The compiled graph must not contain any ``GPUA_mrg_uniform`` node even
    though the sample shape comes from a gpuarray shared variable.
    """
    srng = MRG_RandomStreams()
    host_values = np.random.rand(2, 3).astype('float32')
    x = gpuarray_shared_constructor(host_values, name='x')

    try:
        # To have theano.shared(x) try to move on the GPU
        theano.compile.shared_constructor(gpuarray_shared_constructor)

        y = srng.uniform(x.shape, target='cpu')
        y.name = 'y'
        z = (x * y).sum()
        z.name = 'z'

        fz = theano.function([], z, mode=mode)
        graph_nodes = fz.maker.fgraph.toposort()
        assert not any([isinstance(node.op, GPUA_mrg_uniform)
                        for node in graph_nodes])
    finally:
        # Always unregister the constructor again.
        theano.compile.shared_constructor(gpuarray_shared_constructor,
                                          remove=True)
def make_samples(self):
    """Generate sample points uniformly distributed within the sphere.

    Normal draws are normalised to unit length and then scaled by
    ``u ** (1 / dimensions)`` with ``u`` uniform. The result is evaluated
    immediately; points are stored column-wise because the direction matrix
    is transposed before scaling.

    Returns float array of sample points.
    """
    n_points = self.num_samples
    n_dims = self.ensemble.dimensions
    srng = RandomStreams(seed=self.ensemble.seed)

    # Random directions: normal draws normalised to unit length.
    directions = srng.normal((n_points, n_dims))
    squared_norm = TT.sum(directions * directions, axis=[1], keepdims=True)
    directions = directions / TT.sqrt(squared_norm)

    # Magnitudes drawn from a uniform distribution.
    scale = (srng.uniform((n_points,)) ** (1.0 / n_dims))

    # Scale the sample points.
    points = directions.T * scale

    evaluate = theano.function([], points)
    return evaluate()
def test_consistency_randomstreams():
    """Compare MRG_RandomStreams output with the reference Java samples.

    Verify that the random numbers generated by MRG_RandomStreams are the
    same as the reference (Java) implementation by L'Ecuyer et al., as stored
    in ``java_samples``.
    """
    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7

    rng = MRG_RandomStreams(seed=seed)
    per_stream = []
    for _ in range(n_streams):
        u = rng.uniform(size=(n_substreams,), nstreams=n_substreams)
        f = theano.function([], u)
        draws = np.array([f() for _ in range(n_samples)])
        # Rows are calls, columns are substreams; transpose so that each
        # substream's samples are contiguous before flattening.
        per_stream.append(draws.T.flatten())

    samples = np.array(per_stream).flatten()
    assert(np.allclose(samples, java_samples))
class RangeDropoutLayer(Layer):
    """Dropout whose drop probability is sampled from ``[low_p, high_p]``."""

    def __init__(self, low_p, high_p):
        super(RangeDropoutLayer, self).__init__()

        # Both bounds must lie strictly inside (0, 1) and be ordered.
        assert 0.0 < low_p < 1.0
        assert 0.0 < high_p < 1.0
        assert low_p < high_p

        self.low_p = low_p
        self.high_p = high_p
        self.srng = RandomStreams(seed=np.random.randint(10e6))

    def build(self, input, is_train):
        """Build ``self.output`` from ``self.input``.

        Where ``is_train`` is positive the input is multiplied by a binomial
        mask with retain probability ``1 - p`` and divided by that
        probability; elsewhere the input is passed through unchanged.
        """
        super(RangeDropoutLayer, self).build(input)

        # One drop probability, drawn as a 1x1 sample.
        self.p = self.srng.uniform((1, 1), low=self.low_p, high=self.high_p)
        retain_prob = 1 - self.p[0][0]

        mask = self.srng.binomial(self.input.shape, p=retain_prob,
                                  dtype=theano.config.floatX)
        self.output = T.switch(T.gt(is_train, 0),
                               self.input * mask / retain_prob,
                               self.input)

    def get_output(self):
        """Return the expression created by ``build``."""
        return self.output
def test_nanguardmode():
    """Check MRG sampling under theano's and treeano's NanGuardMode.

    The stock theano mode is expected to raise ``AssertionError`` on this
    graph, while the treeano mode must run it without error.
    """
    # this is the case which requires a custom nanguardmode
    srng = MRG_RandomStreams()
    x = srng.uniform((3, 4, 5))
    guard_options = dict(nan_is_error=True,
                         inf_is_error=True,
                         big_is_error=True)

    def random_number(mode):
        compiled = theano.function([], [x], mode=mode)
        return compiled()

    @nt.raises(AssertionError)
    def fails():
        random_number(
            theano.compile.nanguardmode.NanGuardMode(**guard_options))

    fails()

    random_number(
        treeano.theano_extensions.nanguardmode.NanGuardMode(**guard_options))
class GanModel(object):
    """Builds the symbolic graph and compiled functions of a GAN.

    The generator/discriminator layers come from a builder function looked
    up by name in ``gan_models``. ``get_g_outputs`` and ``get_d_outputs``
    create the symbolic outputs and costs; ``build_funcs`` compiles the
    training, sampling and monitoring functions; ``g_sample`` draws samples.
    """

    def __init__(self, gan_model_name, *args, **kwargs):
        """Create the layers and the random number generators.

        ``kwargs`` must contain ``logging`` (a logger-like object) and
        ``opt_kwargs`` (stored as ``self.kwargs``); both are removed before
        the remaining keyword arguments are handed to the model builder.
        The ``d_bn``/``g_bn``/``d_dp``/``g_dp`` entries are replaced by
        callables (or ``None`` for zero dropout) before that call.
        """
        self.model_func = getattr(gan_models, gan_model_name)
        self.logging = kwargs.pop('logging')
        self.kwargs = kwargs.pop('opt_kwargs')
        self.batch_size = kwargs['batch_size']
        self.gan_mode = kwargs['gan_mode']
        d_bn_mode = kwargs['d_bn']
        g_bn_mode = kwargs['g_bn']
        d_dp_p = kwargs['d_dp']
        g_dp_p = kwargs['g_dp']

        if kwargs['d_bn'] in ('bn', 'ln'):
            def d_bn(x):
                return gan_models.batch_norm(x, steal_nl=0, axes=d_bn_mode)
        else:
            self.logging.info('no D normalization')

            def d_bn(x):
                return x

        if kwargs['g_bn'] in ('bn', 'ln'):
            def g_bn(x):
                return gan_models.batch_norm(x, steal_nl=0, axes=g_bn_mode)
        else:
            self.logging.info('no G normalization')

            def g_bn(x):
                return x

        if floatX(kwargs['d_dp']) != 0.:
            def d_dp(x):
                return ll.DropoutLayer(x, floatX(d_dp_p))
        else:
            d_dp = None

        if floatX(kwargs['g_dp']) != 0.:
            def g_dp(x):
                return ll.DropoutLayer(x, floatX(g_dp_p))
        else:
            g_dp = None

        kwargs['d_bn'] = d_bn
        kwargs['g_bn'] = g_bn
        kwargs['d_dp'] = d_dp
        kwargs['g_dp'] = g_dp

        self.l_out_g, self.l_out_d, self.l_data, self.g_layers, self.d_layers = \
            self.model_func(*args, **kwargs)

        # An unseeded RandomState supplies the seeds of both the theano
        # stream and lasagne's global RNG.
        rng = np.random.RandomState()
        self.theano_rng = MRG_RandomStreams(rng.randint(2**15))
        lasagne.random.set_rng(np.random.RandomState(rng.randint(2**15)))

    def get_g_outputs(self):
        """Create the generator outputs and return them as a pair.

        ``self.fake_dat`` is the generator output for its own input;
        ``self.fake_dat_cz`` is the output for ``self.constant_z``, a
        constant normal sample with the shape of the ``l_z`` input variable.
        """
        self.fake_dat = ll.get_output(self.l_out_g, deterministic=False)
        self.constant_z = T.constant(
            np.random.randn(
                *self.g_layers['l_z'].input_var.eval().shape).astype(floatX))
        self.fake_dat_cz = ll.get_output(self.l_out_g,
                                         self.constant_z,
                                         deterministic=False)
        return self.fake_dat, self.fake_dat_cz

    def get_d_outputs(self, real_dat=None, fake_dat=None):
        """Create the discriminator outputs, the costs and monitor variables.

        Sets ``self.real_dat``, ``self.alpha``, ``self.interp_dat``,
        ``self.d_cost_adv``, ``self.g_cost_adv``, ``self.bre_w``,
        ``self.d_cost`` and ``self.g_cost``; when the ``monitor`` option is
        non-zero it also sets ``self.monitor_stats``.

        NOTE: the keys of ``self.monitor_stats`` are the names of the local
        variables of this method (looked up through ``locals()``), so
        renaming a local here changes the monitor output.
        """
        kwargs = self.kwargs

        def df_dx(last_ll, dat):
            gradients = T.grad(T.sum(last_ll, axis=0).squeeze(), dat)
            slopes = T.sqrt(T.sum(gradients**2, axis=(1, 2, 3)))
            return gradients, slopes

        if real_dat is None:
            real_dat = T.tensor4()
        # Always record the variable: the code below and ``build_funcs``
        # read ``self.real_dat``, which used to be left unset whenever the
        # caller supplied ``real_dat``.
        self.real_dat = real_dat
        if fake_dat is None:
            fake_dat = self.fake_dat

        self.alpha = self.theano_rng.uniform((self.batch_size, 1, 1, 1),
                                             low=0.0,
                                             high=1.0)
        # NOTE(review): with differences = real - fake this moves away from
        # the fake data rather than between real and fake - confirm intent.
        differences = real_dat - fake_dat
        self.interp_dat = interp_dat = real_dat + (self.alpha * differences)

        # Four independent lists. A chained assignment would bind all four
        # names to one list object, so that every append/extend below would
        # be applied four times to the same list.
        interp_evaled_layers = [self.l_out_d]
        fake_evaled_layers = [self.l_out_d]
        fake_cz_evaled_layers = [self.l_out_d]
        real_evaled_layers = [self.l_out_d]

        preact_layers = [
            self.d_layers[d] for d in self.d_layers.keys()
            if d.startswith('preact')
        ]

        if 'last_linear' in self.d_layers:
            interp_evaled_layers.append(self.d_layers['last_linear'])
            fake_evaled_layers.append(self.d_layers['last_linear'])
            real_evaled_layers.append(self.d_layers['last_linear'])
            fake_cz_evaled_layers.append(self.d_layers['last_linear'])

        interp_evaled_layers.extend(preact_layers)
        fake_evaled_layers.extend(preact_layers)
        fake_cz_evaled_layers.extend(preact_layers)
        real_evaled_layers.extend(preact_layers)

        interp_evaled = ll.get_output(interp_evaled_layers,
                                      self.interp_dat,
                                      deterministic=False)
        fake_evaled = ll.get_output(fake_evaled_layers,
                                    self.fake_dat,
                                    deterministic=False)
        fake_cz_evaled = ll.get_output(fake_cz_evaled_layers,
                                       self.fake_dat_cz,
                                       deterministic=False)
        real_evaled = ll.get_output(real_evaled_layers,
                                    self.real_dat,
                                    deterministic=False)

        # Layer -> output expression, one mapping per kind of input.
        real_l2v = OrderedDict(safe_zip(real_evaled_layers, real_evaled))
        fake_l2v = OrderedDict(safe_zip(fake_evaled_layers, fake_evaled))
        fake_cz_l2v = OrderedDict(
            safe_zip(fake_cz_evaled_layers, fake_cz_evaled))
        interp_l2v = OrderedDict(safe_zip(interp_evaled_layers, interp_evaled))

        output_gen = fake_l2v[self.l_out_d]
        output_data = real_l2v[self.l_out_d]

        gradients_fake, slopes_fake = df_dx(
            fake_l2v[self.d_layers['last_linear']], self.fake_dat)
        gradients_real, slopes_real = df_dx(
            real_l2v[self.d_layers['last_linear']], self.real_dat)
        gradients_interp, slopes_interp = df_dx(
            interp_l2v[self.d_layers['last_linear']], self.interp_dat)

        gradient_penalty_real = grad_penalty(slopes_real,
                                             floatX(kwargs['gp_slope']))
        gradient_penalty_fake = grad_penalty(slopes_fake,
                                             floatX(kwargs['gp_slope']))
        gradient_penalty_interp = grad_penalty(slopes_interp,
                                               floatX(kwargs['gp_slope']))

        # loss terms
        self.d_cost_adv, self.g_cost_adv = build_costs(output_gen, output_data,
                                                       None, None,
                                                       self.gan_mode)

        d_cost = self.d_cost_adv
        g_cost = self.g_cost_adv

        if floatX(kwargs['smoothness']):
            smoothness_cost = 0.
            for pl in preact_layers:
                mm = self.theano_rng.binomial(
                    (self.batch_size, ) + pl.input_shape[1:],
                    p=.5,
                    dtype='float32')
                zz = self.theano_rng.uniform((1, ) + pl.input_shape[1:],
                                             dtype='float32')
                zz = zz * mm
                pl_zh = pl.get_output_for(zz)
                # Formatted so that the printed text is the same on
                # Python 2 and Python 3.
                print('%s %s' % (zz.eval().shape, pl_zh.eval().shape))
                smoothness_cost += w_smoothness(pl_zh, self.batch_size)
            d_cost += floatX(kwargs['smoothness']) * smoothness_cost

        if floatX(kwargs['gp_weight']):
            # str() keeps the message unchanged for string options and
            # avoids a TypeError for numeric ones.
            self.logging.info('gp with weight:' + str(kwargs['gp_weight']))
            d_cost += floatX(kwargs['gp_weight']) * gradient_penalty_interp

        h_layers_fake = [fake_l2v[l] for l in preact_layers]
        h_layers_interp = [interp_l2v[l] for l in preact_layers]
        h_layers_real = [real_l2v[l] for l in preact_layers]

        bre_real, me_real, ac_real, _, ac_stats_real = bre(
            h_layers_real, binarizer=kwargs['binarizer'])
        bre_fake, me_fake, ac_fake, _, ac_stats_fake = bre(
            h_layers_fake, binarizer=kwargs['binarizer'])
        bre_interp, me_interp, ac_interp, _, ac_stats_interp = bre(
            h_layers_interp, binarizer=kwargs['binarizer'])

        self.bre_w = bre_w = theano.shared(floatX(kwargs['bre_w']))
        if floatX(kwargs['bre_w']):
            bre_loss = 0.
            if floatX(kwargs['bre_on_real']):
                self.logging.info('BRE regularization on real')
                bre_loss += bre_w * bre_real
            if floatX(kwargs['bre_on_fake']):
                self.logging.info('BRE regularization on fake')
                bre_loss += bre_w * bre_fake
            if floatX(kwargs['bre_on_interp']):
                self.logging.info('BRE regularization on interp')
                bre_loss += bre_w * bre_interp
            d_cost += bre_loss

        if floatX(kwargs['monitor']):
            # NOTE(review): the last two targets of each unpacking share one
            # name, so the fifth statistic is overwritten by the sixth. This
            # is left as is because the monitor keys depend on these names -
            # confirm the intended names.
            ac_min_fake, ac_mean_fake, ac_max_fake, \
                ac_abs_mean_fake, ac_sat_ratio9_fake, ac_sat_ratio9_fake = ac_stats_fake
            ac_min_interp, ac_mean_interp, ac_max_interp, \
                ac_abs_mean_interp, ac_sat_ratio9_interp, ac_sat_ratio9_interp = ac_stats_interp
            ac_min_real, ac_mean_real, ac_max_real, \
                ac_abs_mean_real, ac_sat_ratio9_real, ac_sat_ratio9_real = ac_stats_real

        # L2 weight decay over all layers of each network.
        all_g_layers = ll.get_all_layers(self.g_layers.values())
        all_d_layers = ll.get_all_layers(self.d_layers.values())
        gen_wdecay = regularize_layer_params(all_g_layers,
                                             lasagne.regularization.l2)
        disc_wdecay = regularize_layer_params(all_d_layers,
                                              lasagne.regularization.l2)

        d_cost += floatX(kwargs['d_wdecay']) * disc_wdecay
        g_cost += floatX(kwargs['g_wdecay']) * gen_wdecay

        self.d_cost = d_cost
        self.g_cost = g_cost

        err_data = T.cast(output_data < .5, 'float32').mean()
        err_gen = T.cast(output_gen > .5, 'float32').mean()

        # monitor
        if floatX(kwargs['monitor']):
            monitor_stats = []
            monitor_stats += [err_data, err_gen]
            monitor_stats += [me_fake, me_interp, me_real]
            monitor_stats += [
                gradients_fake, slopes_fake, gradient_penalty_fake
            ]
            monitor_stats += [
                gradients_real, slopes_real, gradient_penalty_real
            ]
            monitor_stats += [
                gradients_interp, slopes_interp, gradient_penalty_interp
            ]
            monitor_stats += [ac_fake, ac_real, ac_interp]

            if floatX(kwargs['smoothness']):
                monitor_stats += [smoothness_cost]

            if ac_stats_real:
                monitor_stats += [
                    ac_min_real, ac_mean_real, ac_max_real, ac_abs_mean_real,
                    ac_sat_ratio9_real, ac_sat_ratio9_real
                ]
            if ac_stats_fake:
                monitor_stats += [
                    ac_min_fake, ac_mean_fake, ac_max_fake, ac_abs_mean_fake,
                    ac_sat_ratio9_fake, ac_sat_ratio9_fake
                ]
            if ac_stats_interp:
                monitor_stats += [
                    ac_min_interp, ac_mean_interp, ac_max_interp,
                    ac_abs_mean_interp, ac_sat_ratio9_interp,
                    ac_sat_ratio9_interp
                ]

            # Name every monitored variable after the local that holds it.
            _vars = locals()

            def get_name(v):
                for k in _vars:
                    if _vars[k] is v and k != 'v' and k != 'k':
                        return k

            self.monitor_stats = OrderedDict([(get_name(v), v)
                                              for v in monitor_stats])

    def build_funcs(self):
        """Build the graph and compile the train/sample/monitor functions.

        Sets, among others, ``g_train_func``, ``d_train_func``,
        ``g_sample_func``, ``g_sample_cz_func`` and, when the ``monitor``
        option is at least 1, ``monitor_func``.
        """
        self.get_g_outputs()
        self.get_d_outputs()
        kwargs = self.kwargs

        d_trainable_params = ll.get_all_params(ll.get_all_layers(
            self.d_layers.values()),
                                               trainable=True)
        g_trainable_params = ll.get_all_params(ll.get_all_layers(
            self.g_layers.values()),
                                               trainable=True)
        self.d_trainable_params = d_trainable_params
        self.g_trainable_params = g_trainable_params

        all_d_params = ll.get_all_params(
            ll.get_all_layers(self.d_layers.values()))
        all_g_params = ll.get_all_params(
            ll.get_all_layers(self.g_layers.values()))
        self.all_d_params = all_d_params
        self.all_g_params = all_g_params

        # Learning rates are shared variables.
        self.g_sh_lr = theano.shared(
            lasagne.utils.floatX(floatX(kwargs['g_lr'])))
        self.d_sh_lr = theano.shared(
            lasagne.utils.floatX(floatX(kwargs['d_lr'])))

        d_beta1 = floatX(kwargs['d_beta1'])
        g_beta1 = floatX(kwargs['g_beta1'])
        beta2 = floatX(kwargs['beta2'])

        d_updater = lu.adam(self.d_cost,
                            d_trainable_params,
                            self.d_sh_lr,
                            beta1=d_beta1,
                            beta2=beta2)
        g_updater = lu.adam(self.g_cost,
                            g_trainable_params,
                            self.g_sh_lr,
                            beta1=g_beta1,
                            beta2=beta2)

        self.out_var2name = out_var2name = OrderedDict([])
        out_var2name[self.g_cost] = 'g_cost_tot'
        out_var2name[self.g_cost_adv] = 'g_cost_adv'
        out_var2name[self.d_cost] = 'd_cost_tot'
        out_var2name[self.d_cost_adv] = 'd_cost_adv'

        # 0: no, 1: every eval interval, 2: every step
        if int(self.kwargs.get('monitor', '0')) >= 2:
            for k in self.monitor_stats:
                out_var2name[self.monitor_stats[k]] = k
            self.g_inputs = g_inputs = [self.real_dat]
        else:
            self.g_inputs = g_inputs = []

        all_g_updates = g_updater
        all_d_updates = d_updater

        self.g_outs = [self.g_cost, self.g_cost_adv]
        self.g_train_func = theano.function(inputs=g_inputs,
                                            outputs=self.g_outs,
                                            updates=all_g_updates)

        self.d_outs = [self.d_cost, self.d_cost_adv]
        self.d_train_func = theano.function(inputs=[self.real_dat],
                                            outputs=self.d_outs,
                                            updates=all_d_updates)

        # Sampling functions return (output + 1) / 2.
        self.g_sample_func = theano.function(inputs=[],
                                             outputs=(self.fake_dat + 1.) / 2.)
        self.g_sample_cz_func = theano.function(
            inputs=[], outputs=(self.fake_dat_cz + 1.) / 2.)

        if int(kwargs.get('monitor', '0')) >= 1:
            # list(): on Python 3 ``values()`` is a view, not a list.
            self.monitor_func = theano.function(
                inputs=[self.real_dat],
                outputs=list(self.monitor_stats.values()))

    def g_sample(self, N_samples):
        """Return the first ``N_samples`` rows of stacked generator batches.

        Batches are drawn until more than ``N_samples`` rows are available.
        A failing call to ``g_sample_func`` is logged and retried without
        limit.
        """
        g_samples = []
        while sum(x.shape[0] for x in g_samples) <= N_samples:
            success = False
            while not success:
                try:
                    g_data_v = self.g_sample_func()
                    success = True
                except Exception as err:
                    self.logging.info(err)
            g_samples.append(g_data_v)
        g_samples = np.vstack(g_samples)[:N_samples]
        return g_samples
def __init__(self,
             rng,
             input,
             n_in,
             n_out,
             num_MC,
             num_FF,
             n_tot,
             free_param,
             Domain_number=None,
             number="1",
             Domain_consideration=True):
    """Build the symbolic graph and shared parameters of one layer.

    The constructor creates the shared variables ``lhyp``, ``ls``,
    ``sample_Omega_epsilon_0``, ``mean_mu`` and ``log_var_W``, draws
    ``num_MC`` samples of ``Omega`` and ``W``, pushes ``input`` through
    ``self.passage`` with ``theano.scan`` and stores the result in
    ``self.output``.  It also builds ``self.KL_W`` and the parameter lists
    ``all_params`` / ``hyp_params`` / ``variational_params``.

    Parameters
    ----------
    rng :
        Not used anywhere in this constructor.
    input :
        Symbolic tensor; it is scanned along its first axis in lock-step with
        the ``num_MC`` sampled ``Omega`` and ``W`` tensors, so its leading
        axis presumably has length ``num_MC`` -- TODO confirm with caller.
    n_in : int
        Number of length-scale entries; ``lhyp`` holds ``n_in + 1`` values.
    n_out : int
        Size of the last axis of ``mean_mu`` / ``log_var_W``.
    num_MC : int
        Number of Monte Carlo samples (leading axis of the sampled tensors).
    num_FF : int
        Stored as ``self.num_rff``; ``W`` has ``2 * num_FF`` rows.
    n_tot, free_param :
        Combined as ``free_param + n_tot`` for the number of uniform draws
        and the scaling of the noise, and passed to
        ``self.shanon_Entropy_studentt``.
    Domain_number : int or None
        Length of ``ls`` when ``Domain_consideration`` is true.  It must be
        supplied in that case (``np.zeros(None)`` is not valid).
    number : str
        Suffix appended to the names of the shared variables.
    Domain_consideration : bool
        When false, ``ls`` has a single entry.
    """
    # The input should also arrive shaped 100*N*D.
    self.DATA = input
    #N=DATA.shape[1]
    #n_in_D=DATA.shape[2]

    # Symbolic random stream with a fixed seed.
    srng = RandomStreams(seed=234)

    self.num_rff = num_FF

    #Define hyperparameters
    # Every entry of lhyp starts at log(0.1).
    lhyp_values = np.zeros(n_in + 1, dtype=theano.config.floatX) + np.log(
        0.1, dtype=theano.config.floatX)
    #lhyp_values = np.zeros(n_in+1,dtype=theano.config.floatX)+np.log(1.,dtype=theano.config.floatX)
    self.lhyp = theano.shared(value=lhyp_values,
                              name='lhyp' + number,
                              borrow=True)
    # lhyp[0] -> sf2, lhyp[1:1 + n_in] -> l (both exponentiated).
    self.sf2, self.l = T.exp(self.lhyp[0]), T.exp(self.lhyp[1:1 + n_in])

    if Domain_consideration:
        # Prior work got good results with 0.1.
        ls_value = np.zeros(Domain_number,
                            dtype=theano.config.floatX) + np.log(
                                0.1, dtype=theano.config.floatX)
    else:
        ls_value = np.zeros(1, dtype=theano.config.floatX) + np.log(
            0.1, dtype=theano.config.floatX)

    self.ls = theano.shared(value=ls_value, name='ls' + number, borrow=True)

    #Define prior omega
    #prior_mean_Omega.append(tf.zeros([self.d_in[i],1]))
    # l**-0.5 tiled num_FF times and transposed, i.e. shape (n_in, num_FF).
    self.log_prior_var_Omega = T.tile(1 / (self.l)**0.5, (num_FF, 1)).T

    #Define posterior omega
    #get samples from omega
    # The noise is drawn once with NumPy and frozen in a shared variable, so
    # the same Omega noise is reused on every evaluation.
    # NOTE(review): this array keeps NumPy's default float64 dtype rather
    # than floatX -- confirm that is intended.
    sample_value = np.random.randn(1, n_in, num_FF)

    self.sample_Omega_epsilon_0 = theano.shared(value=sample_value,
                                                name='sample_Omega' + number)
    #self.sample_Omega_epsilon_0 = srng.normal((1,n_in,num_FF))
    Omega_sample = self.sample_Omega_epsilon_0 * self.log_prior_var_Omega[
        None, :, :]
    # The single Omega sample is repeated for every Monte Carlo sample.
    Omega_samples = T.tile(Omega_sample, (num_MC, 1, 1))

    self.samples = Omega_samples

    #Define prior W
    #prior_mean_W = T.zeros(2*num_FF)
    #log_prior_var_W = T.ones(2*num_FF)

    #Define posterior W
    mean_mu_value = np.random.randn(2 * num_FF, n_out)  #* 1e-2
    self.mean_mu = theano.shared(value=mean_mu_value,
                                 name='mean_mu' + number,
                                 borrow=True)

    log_var_value = np.zeros((2 * num_FF, n_out))
    self.log_var_W = theano.shared(value=log_var_value,
                                   name='q_W' + number,
                                   borrow=True)

    #get samples from W
    sample_Omega_epsilon = srng.normal((num_MC, 2 * num_FF, n_out))

    f2 = T.cast(free_param, 'int64')
    # For each MC sample: minus the sum of the logs of (f2 + n_tot) uniform
    # draws.  Presumably a Gamma(f2 + n_tot, 1) variate, as the commented-out
    # gamma_dist call below suggests -- TODO confirm.
    N = srng.uniform(size=(f2 + n_tot, num_MC), low=1e-10, high=1.0)
    gamma_factor = T.sum(T.log(N), 0) * (-1)

    #gamma_factor=self.gamma_dist(free_param+n_tot,1,num_MC)

    # Scale the Gaussian noise of each MC sample by
    # (free_param + n_tot) / gamma_factor.
    sample_Omega_epsilon_gamma = (
        (free_param + n_tot) /
        gamma_factor)[:, None, None] * sample_Omega_epsilon
    #MC*Nrff*dout
    W_samples = sample_Omega_epsilon_gamma * (T.exp(
        self.log_var_W)**0.5)[None, :, :] + self.mean_mu[None, :, :]

    # calculate layer N_MC*N*D_out
    # The `updates` returned by this scan is overwritten by the scan further
    # down and is never used.
    F_next, updates = theano.scan(
        fn=lambda a, b, c: self.passage(a, b, c, num_FF),
        sequences=[input, Omega_samples, W_samples])

    #output
    self.output = F_next

    #KL-divergence
    #Omega

    #W
    #cross-entropy-term
    #self.KL_W=self.DKL_gaussian(self.mean_mu, self.log_var_W, prior_mean_W, log_prior_var_W)
    # NOTE(review): `n_out / 2` is integer division under Python 2 when n_out
    # is an int -- confirm which interpreter this targets.
    CH_const = T.gammaln((n_out + free_param) / 2) - T.log((
        (free_param - 2) * np.pi)**(n_out / 2)) - T.gammaln(free_param / 2)

    # One log(1 + |w|^2 / (free_param - 2)) term per row of every W sample.
    ch_mc, updates = theano.scan(fn=lambda a: (T.log(1 + T.sum(a * a, -1) /
                                                     (free_param - 2))),
                                 sequences=[W_samples])

    CH_MC = T.mean(T.sum(ch_mc, -1))

    CH = CH_const * num_FF - CH_MC * (free_param + n_out) / 2

    #entropy-term
    HF = self.shanon_Entropy_studentt(self.log_var_W, free_param + n_tot)

    self.KL_W = -HF - CH

    #parameter_setting
    self.all_params = [self.lhyp, self.ls, self.mean_mu, self.log_var_W]
    self.hyp_params = [self.lhyp, self.ls]
    self.variational_params = [self.mean_mu, self.log_var_W]
class RBM(object):
    """Bernoulli Restricted Boltzmann Machine trained by contrastive divergence.

    ``W`` and ``Wt`` are two separate shared variables holding the weight
    matrix and its transpose.  Every update rule built by
    :meth:`get_cost_updates` writes both, which keeps them in sync.
    """

    def __init__(self, input, n_visible, n_hidden):
        """Create the shared parameters and the symbolic random stream.

        Parameters
        ----------
        input :
            Symbolic matrix of visible units, one sample per row.
        n_visible : int
            Number of visible units.
        n_hidden : int
            Number of hidden units.
        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden
        self.input = input

        # Bias terms for visible units (added to dot(h, Wt) in v_sample).
        self.a = theano.shared(value=np.zeros(n_visible,
                                              dtype=theano.config.floatX),
                               borrow=True,
                               name='a')
        # Bias terms for hidden units
        self.b = theano.shared(np.zeros(n_hidden, dtype=theano.config.floatX),
                               borrow=True,
                               name='b')

        # Weights: uniform in +/- 4 * sqrt(6 / (n_hidden + n_visible)), drawn
        # from a fixed-seed generator so initialisation is reproducible.
        rng = np.random.RandomState(2468)
        bound = 4 * np.sqrt(6. / (n_hidden + n_visible))
        Warray = np.asarray(rng.uniform(-bound, bound,
                                        (n_visible, n_hidden)),
                            dtype=theano.config.floatX)
        self.W = theano.shared(Warray, borrow=True, name='W')
        self.Wt = theano.shared(np.transpose(Warray), borrow=True, name='Wt')
        self.srng = RandomStreams(rng.randint(2**30))

    def v_sample(self, h):
        """Return ``[prob, sample]`` of the visible units given hidden ``h``."""
        act = self.a + tensor.dot(h, self.Wt)
        prob = tensor.nnet.sigmoid(act)
        return [
            prob,
            self.srng.binomial(size=act.shape,
                               n=1,
                               p=prob,
                               dtype=theano.config.floatX)
        ]

    def h_sample(self, v):
        """Return ``[prob, sample]`` of the hidden units given visible ``v``."""
        act = self.b + tensor.dot(v, self.W)
        prob = tensor.nnet.sigmoid(act)
        return [
            prob,
            self.srng.binomial(size=act.shape,
                               n=1,
                               p=prob,
                               dtype=theano.config.floatX)
        ]

    def output(self):
        """Return the hidden activation probabilities for ``self.input``."""
        return self.h_sample(self.input)[0]

    def gibbs_step_hvh(self, h):
        """One Gibbs step h -> v -> h; the hidden units see the visible *sample*."""
        nv_prob, nv_sample = self.v_sample(h)
        nh_prob, nh_sample = self.h_sample(nv_sample)
        return [nv_prob, nv_sample, nh_prob, nh_sample]

    def gibbs_step_hvhp(self, hp):
        """One Gibbs step h -> v -> h; the hidden units see the visible *probabilities*."""
        nv_prob, nv_sample = self.v_sample(hp)
        nh_prob, nh_sample = self.h_sample(nv_prob)
        return [nv_prob, nv_sample, nh_prob, nh_sample]

    def gibbs_step_vhv(self, v):
        """One Gibbs step v -> h -> v starting from visible units ``v``."""
        nh_prob, nh_sample = self.h_sample(v)
        nv_prob, nv_sample = self.v_sample(nh_sample)
        return [nv_prob, nv_sample, nh_prob, nh_sample]

    def free_energy(self, v_sample):
        """Return the free energy of every row of ``v_sample``."""
        wx_b = tensor.dot(v_sample, self.W) + self.b
        vbias_term = tensor.dot(v_sample, self.a)
        hidden_term = tensor.sum(tensor.log(1 + tensor.exp(wx_b)), axis=1)
        return -hidden_term - vbias_term

    def get_cost_updates(self,
                         k=1,
                         lr=0.01,
                         lam1=0.0,
                         lam2=0.0,
                         batch_size=None,
                         persistent=None,
                         stocastic_steps=True):
        """Build the CD-k reconstruction error and the parameter updates.

        Parameters
        ----------
        k : int
            Number of Gibbs steps in the negative phase.
        lr : float
            Learning rate.
        lam1, lam2 : float
            Regularisation weights; they are only used when ``batch_size`` is
            given and ``lam1 + lam2 != 0``.
        batch_size : int or None
            ``None`` derives the updates from the gradient of the free-energy
            difference.  Otherwise the explicit CD statistics are used, with
            the weight step divided by ``batch_size``.
        persistent : shared variable or None
            Hidden state of a persistent chain.  When given it starts the
            stochastic chain and is updated with the final hidden sample.
        stocastic_steps : bool
            ``True`` chains on samples (:meth:`gibbs_step_hvh`); ``False``
            chains on probabilities (:meth:`gibbs_step_hvhp`) starting from
            the positive-phase hidden probabilities.

        Returns
        -------
        (sme, updates)
            Mean over rows of the summed squared reconstruction error, and
            the update dictionary to pass to ``theano.function``.
        """
        # Positive phase
        h0_prob, h0_sample = self.h_sample(self.input)
        h_sample = h0_sample if persistent is None else persistent

        # Negative phase
        if stocastic_steps:
            step, chain_start = self.gibbs_step_hvh, [None, None, None,
                                                      h_sample]
        else:
            step, chain_start = self.gibbs_step_hvhp, [None, None, h0_prob,
                                                       None]
        ([nv_probs, nv_samples, nh_probs, nh_samples],
         updates) = theano.scan(step,
                                outputs_info=chain_start,
                                n_steps=k,
                                name="gibbs_update")

        vK_prob = nv_probs[-1]
        vK_sample = nv_samples[-1]
        hK_prob = nh_probs[-1]
        hK_sample = nh_samples[-1]

        # Explicit None test, consistent with the check above; it does not
        # depend on how a Theano variable evaluates in a boolean context.
        if persistent is not None:
            updates[persistent] = hK_sample

        # See https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf
        # I keep sigma unit as reported in https://www.cs.toronto.edu/~hinton/absps/guideTR.pdf 13.2
        eps = tensor.cast(lr, dtype=theano.config.floatX)
        params = [self.a, self.b]

        if batch_size is None:
            cost = tensor.mean(self.free_energy(self.input)) -\
                tensor.mean(self.free_energy(vK_sample))
            # We must not compute the gradient through the gibbs sampling
            params += [self.W]
            gparams = tensor.grad(cost, params, consider_constant=[vK_sample])
            for param, gparam in zip(params, gparams):
                updates[param] = param - eps * gparam
            updates[self.Wt] = updates[self.W].T
        else:
            eps0 = eps / tensor.cast(batch_size, dtype=theano.config.floatX)
            W_grad = eps0 * (tensor.dot(self.input.T, h0_prob) -
                             tensor.dot(vK_prob.T, hK_prob))
            W_gradT = W_grad.T
            a_grad = eps * tensor.mean(self.input - vK_prob, axis=0)
            b_grad = eps * tensor.mean(h0_prob - hK_prob, axis=0)
            for param, gparam in zip(params, [a_grad, b_grad]):
                updates[param] = param + gparam
            if (lam1 + lam2) == 0:
                updates[self.W] = self.W + W_grad
                updates[self.Wt] = self.Wt + W_gradT
            else:
                # Used in M. Liang et al. 2015
                l1 = tensor.cast(2 * lam1 * lr, dtype=theano.config.floatX)
                l2 = tensor.cast(1 - 2 * lam2 * lr,
                                 dtype=theano.config.floatX)
                updates[self.W] = (l2 * self.W + W_grad) /\
                    (1 + l1 / tensor.abs_(self.W))
                updates[self.Wt] = (l2 * self.Wt + W_gradT) /\
                    (1 + l1 / tensor.abs_(self.Wt))

        reconstruction = vK_sample if stocastic_steps else vK_prob
        sme = tensor.mean(
            tensor.sum((self.input - reconstruction)**2, axis=1))

        return sme, updates

    def training(self,
                 dataset,
                 batch_size,
                 training_epochs,
                 k,
                 lr,
                 lam1=0,
                 lam2=0,
                 CD=True,
                 persistent=None,
                 stocastic_steps=True,
                 data_shuffle=False,
                 display_fn=None):
        """Compile the training function and run ``training_epochs`` epochs.

        Parameters
        ----------
        dataset : numpy array
            Training data, one sample per row; it is copied into a shared
            variable.
        batch_size : int
            Rows per mini-batch.
        training_epochs : int
            Number of epochs.
        k, lr, lam1, lam2, persistent, stocastic_steps :
            Forwarded to :meth:`get_cost_updates`.
        CD :
            Accepted for interface compatibility; not used.
        data_shuffle : bool
            ``False`` sweeps the data in consecutive batches.  ``True`` draws
            one batch of random rows per epoch from the symbolic random
            stream.
        display_fn : callable or None
            When given, ``display_fn(W, n_hidden)`` is rendered to
            ``filters_at_epoch_<epoch>.png`` after every epoch.  The original
            code repeated that dump unconditionally at the very end, which
            failed whenever ``display_fn`` was ``None``.  The repeat is gone
            because the per-epoch dump already writes the same file.

        NOTE(review): ``batch_size`` is not forwarded to
        :meth:`get_cost_updates`, so the gradient-based branch is always used
        and ``lam1`` / ``lam2`` have no effect from here -- confirm intent.
        """
        index = tensor.lscalar('index')
        train_set = theano.shared(dataset, borrow=True)
        sme, updates = self.get_cost_updates(k=k,
                                             lr=lr,
                                             persistent=persistent,
                                             lam1=lam1,
                                             lam2=lam2,
                                             stocastic_steps=stocastic_steps)
        n_data = dataset.shape[0]

        if not data_shuffle:
            train = theano.function(
                [index],
                sme,
                updates=updates,
                givens={
                    self.input:
                    train_set[index * batch_size:(index + 1) * batch_size]
                },
                name="train")
        else:
            # Random row indices in [0, n_data), redrawn on every call.
            indexes = tensor.floor(
                self.srng.uniform((1, batch_size), low=0, high=n_data))
            train = theano.function([],
                                    sme,
                                    updates=updates,
                                    givens={
                                        self.input:
                                        train_set[tensor.cast(
                                            indexes, dtype='int64')[0]]
                                    },
                                    name="train")

        for epoch in range(training_epochs):
            if not data_shuffle:
                sme_list = [
                    train(n_batch) for n_batch in range(n_data // batch_size)
                ]
            else:
                sme_list = [train()]
            print("Training epoch %d, reconstruction error %f" %
                  (epoch, np.mean(sme_list)))
            if display_fn is not None:
                # Construct image from the weight matrix
                Wimg = display_fn(self.W.get_value(borrow=True),
                                  self.n_hidden)
                scipy.misc.imsave('filters_at_epoch_%i.png' % epoch, Wimg)
def makemodel(
        name="ADGM" if ADGM else "SDGM",
        nls=["rectify"] * 2,
        seed=seed,
        descenter=G.Adam,
        K=K,
        L=L,
):
    #$ tensor_shapes
    """
    Creates the ADGM or SDGM model.

    Xl has dimension (Nl, 1, 1, 1, X)
    Xu has dimension (Nu, 1, 1, 1, X)
    Yl has dimension (Nl, 1, 1, 1, Y)
    Yu has dimension ( 1, 1, 1, Y, Y)

    EAl has dimension (Nl, K, 1, 1, A)
    EAu has dimension (Nu, K, 1, 1, A)
    EZl has dimension (Nl, K, L, 1, Z)
    EZu has dimension (Nu, K, L, Y, Z)

    Al will have dimension (Nl, K, 1, 1, A)
    Au will have dimension (Nu, K, 1, 1, A)
    Zl will have dimension (Nl, K, L, 1, Z)
    Zu will have dimension (Nu, K, L, Y, Z)

    Besides its arguments, the function reads its configuration from
    module-level names (ADGM, Nl, Nu, Y, Z, A, Kt, the aJ* weights,
    gaussianX, sampleX, Anormal, AtoZ, ...).  The argument defaults are
    themselves taken from module-level names when the ``def`` statement runs.

    Args:
        name: Model name; also the file name of the log under ``../log/``.
        nls: Hidden nonlinearities handed to every ``Stack`` as ``hidnls``.
        seed: Seed handed to ``Model``.
        descenter: Called with ``gradnorm``; the result becomes
            ``model.descenter``.
        K: Number of samples for A; wrapped in a shared variable below.
        L: Number of samples for Z; wrapped in a shared variable below.

    Returns:
        The ``Model`` with ``objective``, ``params``, ``networks``,
        ``inputs``, ``predict``, ``accuracyT``, ``accuracyL``, ``adds``,
        ``headings``, ``outputs`` and ``stats`` attached.

    Raises:
        ValueError: If ``Nl`` or ``Nu`` is ``None``.
    """
    #$

    Print = Log("../log/{}".format(name), "w", quiet=True)

    model = Model(name=name,
                  shuffledata=shuffledata,
                  thresholddata=thresholdX,
                  normalizedata=normalizeX,
                  seed=seed,
                  maxvar=highvaronly)
    model.Print = Print
    model.loadmomentum = loadmomentum
    model.descenter = descenter(gradnorm)

    networks = OrderedDict()
    rng = MRG_RandomStreams()

    X = model.XCols

    # Snapshot of the configuration, written to the log below.  K and L are
    # recorded here with their plain values, before they are rebound to
    # shared variables further down.
    model.constants = OrderedDict([
        (" ", model.name),
        ("shuffle data?", shuffledata),
        ("data seed", model.seed),
        ("Nu", Nu),
        ("Nl", Nl),
        ("X", X),
        ("Y", Y),
        ("Z", Z),
        ("A", A),
        ("L", L),
        ("K", K),
        ("Kt", Kt),
        ("aJL", aJL),
        ("aJU", aJU),
        ("aJA", aJA),
        ("aJW", aJW),
        ("gradient norm?", gradnorm),
        ("std. normal A?", Anormal),
        ("A to Z?", AtoZ),
        ("gaussian X?", gaussianX),
        ("sample X?", sampleX),
        ("threshold X?", thresholdX),
        ("normalize X?", normalizeX),
        ("high var only?", highvaronly),
        ("NSaves", NSaves),
        ("enable save?", enablesave),
        ("combolength", combolength),
        ("load momentum?", loadmomentum),
        ("juggle momentum?", jugglemomentum),
        ("random juggler?", randomjuggler),
        ("epsilon", epsilon),
    ])

    # From here on the loop variable `name` shadows the `name` argument; the
    # argument is not needed again after this point.
    for name, val in model.constants.items():
        model.Print("{:>20s}".format(name), val)

    #$ px_stack
    # Create the networks for px
    ins = [Y, Z] if ADGM else [A, Y, Z]
    last = "linear" if gaussianX else "sigmoid"
    # Two output heads (mean and sd) for gaussian X, a single one otherwise.
    O = [X, X] if gaussianX else [X]
    fx = Stack(insizes=ins, outsizes=O, hidnls=nls, lastnl=last)
    networks["fx"] = fx
    #$

    #$ pa_stack
    # Create the networks for pa
    ins = [X, Y, Z] if ADGM else [Y, Z]
    fa = Stack(insizes=ins, outsizes=[A, A], hidnls=nls)
    # fa is always built because it is evaluated further down, but it is only
    # registered (and so only contributes parameters) when A is not assumed
    # to be standard normal.
    if not Anormal:
        networks["fa"] = fa
    #$

    #$ qz_stack
    # Create the networks for qz
    ins = [A, X, Y] if AtoZ else [X, Y]
    fz = Stack(insizes=ins, outsizes=[Z, Z], hidnls=nls)
    networks["fz"] = fz
    #$

    #$ qax_stack
    # Create the networks for qax
    ins = [X]
    fax = Stack(insizes=ins, outsizes=[A, A], hidnls=nls)
    networks["fax"] = fax
    #$

    #$ qy_stack
    # Create the network for qy. Outputs are
    # probabilities, so last layer is always
    # softmax.
    ins = [A, X]
    last = "softmax"
    fy = Stack(insizes=ins, outsizes=[Y], hidnls=nls, lastnl=last)
    networks["fy"] = fy
    #$

    #$ model.networks
    # Collect all of the parameters together
    # so we can optimize the objectives with
    # respect to them.
    model.networks = networks
    model.params = []
    for name, net in model.networks.items():
        model.Print("{:>20s}".format(name), net)
        model.params += net.params
    #$

    # For now, throw an error if Nl or Nu are
    # not specified.
    # Eventually, we would like to be able to
    # handle only Nl, only Nu, or both Nl and Nu.
    if Nl is None or Nu is None:
        raise ValueError("Need to specify Nl and Nu")

    #$ shared_inputs
    # Xl, Ylh, and Xu are shared variables on the
    # GPU. For Xu, we take random batch slices.
    # We assume for now that all (Xl,Yl) are used
    # in each batch.
    Xl2 = model.Xl[:Nl]
    Yl2 = model.Ylh[:Nl]
    # Nu uniform draws scaled by the number of unlabeled rows and truncated
    # to int32, i.e. Nu row indices drawn with replacement.
    bidxs = rng.uniform((Nu, )) * model.Xu.shape[0]
    bidxs = T.cast(bidxs, "int32")
    Xu2 = model.Xu[bidxs]
    #$

    #$ sampleX
    # If X is binary, then sample it on each
    # minibatch. This idea borrowed from Maaloe's
    # code. Not sure if it helps.
    #
    # Keep track of Xl2s, Yl2, and Xu2s so we can
    # do theano variable substitution later.
    if not gaussianX and sampleX:
        Xl2s = rng.binomial(n=1,
                            p=Xl2,
                            size=Xl2.shape,
                            dtype=theano.config.floatX)
        Xu2s = rng.binomial(n=1,
                            p=Xu2,
                            size=Xu2.shape,
                            dtype=theano.config.floatX)
    else:
        Xl2s = Xl2
        Xu2s = Xu2
    #$

    #$ dimshuffled
    # Reshape the labeled set matrices
    # to 5th-order tensors.
    Xl = Xl2s.dimshuffle([0, "x", "x", "x", 1])
    Yl = Yl2.dimshuffle([0, "x", "x", "x", 1])

    # Xu is known, but Yu is not known.
    # Create one possible Y per class.
    Xu = Xu2s.dimshuffle([0, "x", "x", "x", 1])
    Yu = T.eye(Y, Y).dimshuffle(["x", "x", "x", 0, 1])
    #$

    #$ noises
    # EZ and EA will be used to approximate
    # the integrals using L samples for Z and
    # K samples for A.
    #
    # Create shared variables for K and L so we
    # can do variable substitutions later.
    K = theano.shared(K, name="samplesA")
    L = theano.shared(L, name="samplesZ")
    EAl = rng.normal((Xl.shape[0], K, 1, 1, A))
    EAu = rng.normal((Xu.shape[0], K, 1, 1, A))
    EZl = rng.normal((Xl.shape[0], K, L, 1, Z))
    EZu = rng.normal((Xu.shape[0], K, L, Y, Z))
    #$

    # Assign inputs to the model.
    # We assume that all data is already on the GPU.
    # Furthermore, we create functions that
    # evaluate the objectives on the test data
    # directly. Therefore, there are no inputs
    # needed for calling the training function.
    model.inputs = []

    #$ al_au
    # Find the latent variables.
    # Note that multiplying by E effectively tiles
    # all latent variables L or K times.
    #
    # Auxiliary A has to be found first
    # because latent Z is a function of it.
    muaxl, sdaxl = fax([Xl])
    muaxu, sdaxu = fax([Xu])
    # The second network output is exponentiated before it scales the noise.
    Al = muaxl + T.exp(sdaxl) * EAl
    Au = muaxu + T.exp(sdaxu) * EAu
    #$

    #$ zl_zu
    # Compute Z.
    inputl = [Al, Xl, Yl] if AtoZ else [Xl, Yl]
    inputu = [Au, Xu, Yu] if AtoZ else [Xu, Yu]
    muzl, sdzl = fz(inputl)
    muzu, sdzu = fz(inputu)
    Zl = muzl + T.exp(sdzl) * EZl
    Zu = muzu + T.exp(sdzu) * EZu
    #$

    #$ muxl_muxu
    # Find the reconstruction means and
    # standard deviations.
    # Note: sdxl and sdxu are used only if
    # gaussian is True. The binary case
    # ignores those.

    # If ADGM, then X is a function of YZ.
    # If SDGM, then X is a function of AYZ.
    inputl = [Yl, Zl] if ADGM else [Al, Yl, Zl]
    inputu = [Yu, Zu] if ADGM else [Au, Yu, Zu]
    if gaussianX:
        muxl, sdxl = fx(inputl)
        muxu, sdxu = fx(inputu)
    else:
        muxl = fx(inputl)
        muxu = fx(inputu)
    #$

    #$ mual_muau
    # Find mu and sd for A in the generative
    # (reconstruction) direction.

    # If ADGM, then A depends on XYZ.
    # If SDGM, then A depends on YZ.
    inputl = [Xl, Yl, Zl] if ADGM else [Yl, Zl]
    inputu = [Xu, Yu, Zu] if ADGM else [Yu, Zu]
    mual, sdal = fa(inputl)
    muau, sdau = fa(inputu)
    #$

    #$ JL_1
    # Find the component probabilities and the
    # labeled objective, JL.
    l_pz = loggauss(Zl)
    l_qz = loggauss(Zl, muzl, sdzl)
    l_py = T.log(1.0 / Y)
    if gaussianX:
        l_px = loggauss(Xl, muxl, sdxl)
    else:
        l_px = logbernoulli(Xl, muxl)
    #$

    #$ JL_2
    # In Maaloe's first revision, A is disconnected
    # in the generative model, so we assume it
    # to be standard normal.
    #
    # In the more updated version, A is fed into
    # by X, Y, and Z.

    # In SDGM, A is generated by Z and Y.
    # Multiplying by `zero` replaces both outputs of fa with zeros, which is
    # how the standard-normal case is obtained.
    normal = zero if Anormal else one
    l_pa = loggauss(Al, normal * mual, normal * sdal)
    l_qa = loggauss(Al, muaxl, sdaxl)
    #$

    #$ JL_3
    JL = l_qz + l_qa
    JL = JL - l_px - l_py - l_pz - l_pa
    JL = batchaverage(exA(exZ(JL)))
    JL = aJL * JL
    #$

    #$ JU_1
    # Find the component probabilities and the
    # unlabeled objective, JU.

    # The output of fy(Au, Xu) is pi.
    # (Nu, K, 1, 1, Y)
    # We need to relocate the last axis.
    # (Nu, K, 1, Y, 1)
    inputu = [Au, Xu]
    pi = fy(inputu).dimshuffle([0, 1, "x", 4, "x"])
    #$

    #$ JU_2
    u_pz = loggauss(Zu)
    u_qz = loggauss(Zu, muzu, sdzu)
    u_py = T.log(1.0 / Y)
    u_qy = T.log(pi)
    u_pa = loggauss(Au, normal * muau, normal * sdau)
    u_qa = loggauss(Au, muaxu, sdaxu)
    if gaussianX:
        u_px = loggauss(Xu, muxu, sdxu)
    else:
        u_px = logbernoulli(Xu, muxu)
    #$

    #$ JU_3
    JU = u_qz + u_qa + u_qy
    JU = JU - u_px - u_py - u_pz - u_pa
    JU = batchaverage(exA(classsum(exZ(JU), pi)))
    JU = aJU * JU
    #$

    #$ JA
    # Make sure that the known labels are correctly
    # assigned.

    # Yl has dimension (Nl, 1, 1, 1, Y)
    # Al,Xl has dimension (Nl, K, 1, 1, A+X)
    # fy(Al,Xl) is (Nl, K, 1, 1, Y)
    #
    # Yl is one-hot.
    # Multiply by Yl and perform a sum over
    # Y to get the one probability out, then neg
    # log it, average it over K, and
    # average it over N.
    inputl = [Al, Xl]
    JA = batchaverage(exA(-T.log(T.sum(fy(inputl) * Yl, axis=-1))))
    JA = aJA * JA
    #$

    # Regularize the weight matrices of the
    # networks so they do not stray far from zero.
    # Copied from Maaloe's github code.
    JW = zero
    for p in model.params:
        # Only parameters whose string form contains 'W' are penalised.
        if 'W' not in str(p):
            continue
        JW += T.mean(p**two)
    JW = aJW * JW

    JCombined = JL + JU + JA + JW

    # Stick the objectives into the model.
    model.objective = JCombined

    #$ prediction_comments
    # Create a function for predictions!
    # We need to evaluate a bunch of values for A,
    # so Xt is an N by X dimensional matrix and
    # Et is a K by A dimensional matrix.
    # Reshape Xt to (N, 1, X) and
    # Et to (1, K, A).
    #
    # Then, At = fmuax(Xt) + Et*fsdax(Xt)
    # and has a dimension of (N, K, A).
    #
    # Class probabilities pi are fy(AXt)
    # and have shape (N, K, Y). Take their
    # log, average over K, then argmax over Y
    # to find class predictions.
    #$

    #$ prediction_function
    Xt2 = T.matrix("Xt")
    Et2 = rng.normal((Kt, A))
    Xt = Xt2.dimshuffle([0, "x", 1])
    Et = Et2.dimshuffle(["x", 0, 1])
    muat, sdat = fax([Xt])
    At = muat + T.exp(sdat) * Et
    inputt = [At, Xt]
    prediction = T.argmax(T.mean(T.log(fy(inputt)), axis=1), axis=-1)
    predict = theano.function(inputs=[Xt2],
                              outputs=prediction,
                              allow_input_downcast=True)
    model.predict = predict
    #$

    #$ classification
    Yt = T.ivector("Yt")
    accuracyT = T.eq(Yt, prediction).mean(dtype=theano.config.floatX)
    # Accuracy on model.Xt / model.Yt; the data is substituted through
    # `givens`, so the compiled function takes no arguments.
    model.accuracyT = theano.function(inputs=[],
                                      outputs=accuracyT,
                                      givens={
                                          Xt2: model.Xt,
                                          Yt: model.Yt
                                      },
                                      allow_input_downcast=True)
    #$
    # Same expression evaluated on model.Xl / model.Yl.
    model.accuracyL = theano.function(inputs=[],
                                      outputs=accuracyT,
                                      givens={
                                          Xt2: model.Xl,
                                          Yt: model.Yl
                                      },
                                      allow_input_downcast=True)

    # Create a stats function that outputs
    # extra information.
    model.adds = [
        JL,
        JU,
        JA,
        JW,
        T.mean(l_qa),
        T.mean(u_qa),
        T.mean(u_qy),
        T.mean(l_qz),
        T.mean(u_qz),
        -T.mean(l_px.max(axis=AxisY)),
        -T.mean(u_px.max(axis=AxisY)),
        -T.mean(l_pa),
        -T.mean(u_pa),
    ]
    # One heading per entry of model.outputs: "J" for the objective followed
    # by the thirteen entries of model.adds, in the same order.
    model.headings = [
        "J",
        "JL",
        "JU",
        "JA",
        "JW",
        "l q(a)",
        "u q(a)",
        "u q(y)",
        "l q(z)",
        "u q(z)",
        "l -p(x)",
        "u -p(x)",
        "l -p(a)",
        "u -p(a)",
    ]
    model.outputs = [model.objective] + model.adds
    # The statistics are evaluated with the first 1000 rows of each set
    # substituted for the batch variables and with the shared K replaced by 1.
    model.stats = theano.function(inputs=[],
                                  outputs=model.outputs,
                                  givens={
                                      Xl2s: model.Xl[:1000],
                                      Yl2: model.Ylh[:1000],
                                      Xu2s: model.Xu[:1000],
                                      K: 1
                                  },
                                  allow_input_downcast=True)

    return model
# Symbolic WGAN costs with a gradient penalty on the discriminator, plus the
# gradients of both costs with respect to their parameters.

# Integer image batch; x * (2 / 255) - 1 maps 0 to -1 and 255 to +1, and every
# image is flattened to a row of 64 * 64 * 3 values.
real_data_int = T.itensor4('images')
real_data = (T.cast(real_data_int, 'float32') * (2. / 255) - 1.).reshape(
    (-1, 64 * 64 * 3))

fake_data = Generator(BATCH_SIZE)

# Real and fake rows go through the discriminator in one call and are split
# again afterwards.  The split assumes the real batch holds exactly BATCH_SIZE
# rows -- TODO confirm with the data feed.
disc_out = Discriminator(T.concatenate([real_data, fake_data], axis=0))
disc_real = disc_out[:BATCH_SIZE]
disc_fake = disc_out[BATCH_SIZE:]

# The generator cost evaluates the discriminator on the fake batch a second
# time instead of reusing disc_fake.
gen_cost = -T.mean(Discriminator(fake_data))
disc_cost = T.mean(disc_fake) - T.mean(disc_real)

# Gradient penalty: one uniform mixing coefficient per row places a point on
# the segment between each real row and its fake counterpart; the penalty is
# the mean squared deviation of the discriminator's gradient norm from 1 at
# those points, weighted by 10.
alpha = srng.uniform(size=(BATCH_SIZE, 1), low=0., high=1.)
differences = fake_data - real_data
interpolates = real_data + (alpha * differences)
gradients = T.grad(T.sum(Discriminator(interpolates)), interpolates)
slopes = T.sqrt(T.sum(T.sqr(gradients), axis=1))
lipschitz_penalty = T.mean((slopes - 1.)**2)
disc_cost += 10 * lipschitz_penalty

# Parameters are found by searching each cost graph for variables that carry a
# `param` attribute and whose name contains the network's name.
gen_params = lib.search(
    gen_cost, lambda x: hasattr(x, 'param') and 'Generator' in x.name)
discrim_params = lib.search(
    disc_cost, lambda x: hasattr(x, 'param') and 'Discriminator' in x.name)

gen_grads = T.grad(gen_cost, gen_params)
discrim_grads = T.grad(disc_cost, discrim_params)
# Only the generator gradients are clipped (element-wise, to [-1, 1]) here.
gen_grads = [T.clip(g, lib.floatX(-1.0), lib.floatX(1.0)) for g in gen_grads]
def train(num_epochs,
          filename,
          gen_lr=5e-5,
          beta_1_gen=0.5,
          beta_1_disc=0.5,
          print_freq=50,
          disc_lr=5e-5,
          num_iter_gen=1,
          n_samples=20,
          image_dir=None,
          binary_dir=None,
          gt_image_dir=None,
          data_path='/home/devon/Data/basic/celeba_64.hdf5'):
    """Train the reweighted GAN on palettised CelebA and log the progress.

    Parameters
    ----------
    num_epochs : int
        Number of passes over the training stream.
    filename : str
        Path of the text log; it is overwritten.
    gen_lr, disc_lr : float
        Adam learning rates of the generator / discriminator.
    beta_1_gen, beta_1_disc : float
        Adam ``beta1`` of the generator / discriminator.
    print_freq : int
        Report, log and dump generated images every ``print_freq`` batches.
    num_iter_gen : int
        Generator updates per batch.
    n_samples : int
        Number of binary samples drawn per generator output for the
        importance-weighted generator loss.
    image_dir, binary_dir, gt_image_dir : str or None
        Prefixes (concatenated as-is, so include the trailing separator) for
        generated images, saved generator parameters and ground-truth images.
        ``None`` is treated as an empty prefix, i.e. files land in the current
        working directory.  Previously ``None`` raised a ``TypeError`` at the
        first image dump.
    data_path : str
        HDF5 file whose ``features`` dataset provides the images that define
        the 16-colour palette.
    """
    image_dir = "" if image_dir is None else image_dir
    binary_dir = "" if binary_dir is None else binary_dir
    gt_image_dir = "" if gt_image_dir is None else gt_image_dir

    # Build a 16-colour adaptive palette from the first 1000 images.  Slicing
    # copies the data into memory, so the file can be closed immediately.
    with h5py.File(data_path, 'r') as f:
        arr = f['features'][:1000]
    arr = arr.transpose(0, 2, 3, 1)
    arr = arr.reshape((arr.shape[0] * arr.shape[1], arr.shape[2],
                       arr.shape[3]))
    img = Image.fromarray(arr).convert('P', palette=Image.ADAPTIVE, colors=16)

    # The context manager closes the log even when training raises.
    with open(filename, 'w') as log_file:
        # Load the dataset
        print("Loading data...")
        print("Testing RW_DCGAN ...")
        log_file.write("Testing RW_DCGAN...\n")
        log_file.write("Loading data...\n")
        log_file.write("Num_epochs: {}, disc_lr: {}, gen_lr: {}\n".format(
            num_epochs, disc_lr, gen_lr))
        log_file.flush()

        train_stream, _ = load_stream(img=img)

        # Prepare Theano variables for inputs and targets
        noise_var = T.matrix('noise')
        input_var = T.tensor4('inputs')

        # Create neural network model
        print("Building model and compiling GAN functions...")
        log_file.write("Building model and compiling GAN functions...\n")
        parameter = initial_parameters()
        generator = build_generator(parameter, noise_var)
        discriminator = build_discriminator(input_var)

        trng = RandomStreams(random.randint(1, 1000000))

        # Sample: threshold uniform noise against the generator output to get
        # n_samples binary draws per generated image.
        batch_size = noise_var.shape[0]
        dim_c = input_var.shape[1]
        dim_x = input_var.shape[2]
        dim_y = input_var.shape[3]

        R = trng.uniform(size=(n_samples, batch_size, dim_c, dim_x, dim_y),
                         dtype=floatX)
        g_output = lasagne.layers.get_output(generator)
        samples = (R <= T.shape_padleft(g_output)).astype(floatX)

        # Create expression for passing real data through the discriminator
        real_out = lasagne.layers.get_output(discriminator)
        fake_out = lasagne.layers.get_output(
            discriminator,
            samples.reshape((n_samples * batch_size, dim_c, dim_x, dim_y)))
        fake_out_ = fake_out.reshape((n_samples, batch_size))

        log_d1 = -T.nnet.softplus(-fake_out_)
        log_d0 = -(fake_out_ + T.nnet.softplus(-fake_out_))
        log_w = log_d1 - log_d0
        g_output_ = T.shape_padleft(T.clip(g_output, 1e-7, 1. - 1e-7))
        log_g = (samples * T.log(g_output_) +
                 (1. - samples) * T.log(1. - g_output_)).sum(axis=(2, 3, 4))

        # Find normalized weights.
        log_N = T.log(log_w.shape[0]).astype(floatX)
        log_Z_est = log_sum_exp(log_w - log_N, axis=0)
        log_w_tilde = log_w - T.shape_padleft(log_Z_est) - log_N
        w_tilde = T.exp(log_w_tilde)
        # The weights are treated as constants when differentiating.
        w_tilde_ = theano.gradient.disconnected_grad(w_tilde)

        # Create gen_loss
        generator_loss = -(w_tilde_ * log_g).sum(0).mean()

        # Create disc_loss
        discriminator_loss = (T.nnet.softplus(-real_out)).mean() + (
            T.nnet.softplus(-fake_out)).mean() + fake_out.mean()

        # Create update expressions for training
        generator_params = lasagne.layers.get_all_params(generator,
                                                         trainable=True)
        discriminator_params = lasagne.layers.get_all_params(discriminator,
                                                             trainable=True)

        # Losses / updates
        generator_updates = lasagne.updates.adam(generator_loss,
                                                 generator_params,
                                                 learning_rate=gen_lr,
                                                 beta1=beta_1_gen)
        discriminator_updates = lasagne.updates.adam(discriminator_loss,
                                                     discriminator_params,
                                                     learning_rate=disc_lr,
                                                     beta1=beta_1_disc)

        train_discriminator = theano.function(
            [noise_var, input_var],
            [(real_out > 0.).mean(), discriminator_loss],
            allow_input_downcast=True,
            updates=discriminator_updates)

        train_generator = theano.function(
            [noise_var, input_var],
            [(fake_out < 0.).mean(), generator_loss,
             log_Z_est.mean()],
            allow_input_downcast=True,
            updates=generator_updates)

        # Compile another function generating some data
        gen_fn = theano.function([noise_var],
                                 lasagne.layers.get_output(
                                     generator, deterministic=True))

        # Finally, launch the training loop.
        print("Starting training of GAN...")
        log_file.write("Starting training of GAN...\n")
        log_file.flush()

        # We iterate over epochs:
        for epoch in range(num_epochs):
            # In each epoch, we do a full pass over the training data:
            print("Epoch: ", epoch)
            # NOTE(review): train_err is never accumulated, so the
            # "training loss" reported below is always 0.
            train_err = 0
            train_batches = 0
            start_time = time.time()
            prefix = "ep_{}".format(epoch)

            for batch in train_stream.get_epoch_iterator():
                inputs = np.array(batch[0], dtype=np.float32)
                noise = lasagne.utils.floatX(
                    np.random.rand(len(inputs), 100))

                samples_print_gt = convert_to_rgb(inputs, img)
                print_images(samples_print_gt[:64],
                             8,
                             8,
                             file=gt_image_dir + prefix + '_gt.png')

                # Two discriminator updates per batch; only the second
                # result is reported.
                train_discriminator(noise, inputs)
                p_real, disc_loss = train_discriminator(noise, inputs)

                gen_loss_array = []
                p_fake_array = []
                z_est_array = []

                for _ in range(num_iter_gen):
                    p_fake, gen_loss, z_est = train_generator(noise, inputs)
                    gen_loss_array.append(gen_loss)
                    p_fake_array.append(p_fake)
                    z_est_array.append(z_est)

                gen_loss = np.mean(gen_loss_array)
                p_fake = np.mean(p_fake_array)
                z_est = np.mean(z_est_array)

                train_batches += 1

                if train_batches % print_freq == 0:
                    print('-' * 80)
                    print("Batch Number: {}, Epoch Number: {}".format(
                        train_batches + 1, epoch + 1))
                    print("Generator: p_fake: {}, gen_loss: {}, z_est: {}".
                          format(p_fake, gen_loss, z_est))
                    print("Discriminator: p_real: {}, disc_loss: {}".format(
                        p_real, disc_loss))

                    log_file.write('-' * 80 + '\n')
                    # The log line now carries the epoch it was already being
                    # handed, and the generator line reports gen_loss instead
                    # of repeating disc_loss.
                    log_file.write(
                        "Batch Number: {}, Epoch Number: {}".format(
                            train_batches + 1, epoch + 1) + '\n')
                    log_file.write(
                        "Generator: p_fake: {}, gen_loss: {} \n".format(
                            p_fake, gen_loss))
                    log_file.write(
                        "Discriminator: p_real: {}, disc_loss: {} \n".format(
                            p_real, disc_loss))
                    log_file.write('-' * 80 + '\n')

                    samples = gen_fn(
                        lasagne.utils.floatX(np.random.rand(5000, 100)))
                    samples = (samples >= 0.5).astype('int')
                    samples = samples[0:49]
                    samples_print = convert_to_rgb(samples, img)
                    print_images(samples_print,
                                 7,
                                 7,
                                 file=image_dir + prefix +
                                 "_{}".format(train_batches) + '_gen.png')
                    samples_print_gt = convert_to_rgb(inputs, img)
                    print_images(samples_print_gt[:64],
                                 8,
                                 8,
                                 file=gt_image_dir + prefix + '_gt.png')

            # Then we print the results for this epoch:
            print("Total Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs,
                time.time() - start_time))
            log_file.write("Total Epoch {} of {} took {:.3f}s\n".format(
                epoch + 1, num_epochs,
                time.time() - start_time))
            # max(..., 1) keeps an empty training stream from raising
            # ZeroDivisionError.
            mean_train_err = train_err / max(train_batches, 1)
            print(" training loss:\t\t{}".format(mean_train_err))
            log_file.write(" training loss:\t{}\n".format(mean_train_err))
            log_file.flush()

            # And finally, we plot some generated data
            samples = gen_fn(lasagne.utils.floatX(np.random.rand(5000, 100)))
            samples = (samples >= 0.5).astype('int')
            samples = samples[0:49]
            samples_print = convert_to_rgb(samples, img)
            print_images(samples_print,
                         7,
                         7,
                         file=image_dir + prefix + '_gen.png')

            # Save the generator parameters after every epoch.
            np.savez(binary_dir + prefix + '_celeba_gen_params.npz',
                     *lasagne.layers.get_all_param_values(generator))

        log_file.flush()
class ConcreteSampleLayer(lasagne.layers.Layer):
    """
    Sampling layer that draws relaxed one-hot samples from categorical
    logits using the Gumbel-softmax / Concrete reparameterisation, with
    support for multiple Monte Carlo and importance samples as used in
    [BURDA]_.

    In stochastic mode the output is
    ``softmax((logits + g) / temperature)`` with Gumbel noise ``g``.  In
    deterministic mode it is ``softmax(logits)`` repeated once per sample.
    Either way the result is a matrix with
    ``batch_size * eq_samples * iw_samples`` rows and ``num_latent`` columns.

    Parameters
    ----------
    logits : class:`Layer` instance
        Layer producing a ``(batch_size, num_latent)`` matrix of
        unnormalised log-probabilities.
    eq_samples : int or T.scalar
        Number of Monte Carlo samples used to estimate the expectation over
        q(z|x) in eq. (8) in [BURDA]_.
    iw_samples : int or T.scalar
        Number of importance samples in the sum over k in eq. (8) in
        [BURDA]_.
    seed : int or None
        Seed of the random stream.  ``None`` (the default) draws a fresh seed
        from ``lasagne.random.get_rng()`` when the layer is constructed.
        The previous default was evaluated once, when the class was defined,
        so every layer built without an explicit seed shared one stream.
    temperature : number
        Temperature dividing the perturbed logits before the softmax.  The
        default ``1`` reproduces the previous fixed behaviour.

    Methods
    ----------
    seed : Helper function to change the random seed after init is called

    References
    ----------
        ..  [BURDA] Burda, Yuri, Roger Grosse, and Ruslan Salakhutdinov.
            "Importance Weighted Autoencoders."
            arXiv preprint arXiv:1509.00519 (2015).
        ..  [MADDISON] Maddison, Chris J., Andriy Mnih, and Yee Whye Teh.
            "The Concrete Distribution: A Continuous Relaxation of Discrete
            Random Variables." arXiv preprint arXiv:1611.00712 (2016).
    """

    def __init__(self,
                 logits,
                 eq_samples=1,
                 iw_samples=1,
                 seed=None,
                 temperature=1,
                 **kwargs):
        super(ConcreteSampleLayer, self).__init__(logits, **kwargs)

        self.eq_samples = eq_samples
        self.iw_samples = iw_samples
        self.temperature = temperature

        if seed is None:
            seed = lasagne.random.get_rng().randint(1, 2147462579)
        self._srng = RandomStreams(seed)

    def seed(self, seed=None):
        """Re-seed the random stream; ``None`` draws a fresh seed."""
        if seed is None:
            seed = lasagne.random.get_rng().randint(1, 2147462579)
        self._srng.seed(seed)

    def get_output_shape_for(self, input_shapes):
        """Return ``(batch * eq_samples * iw_samples, num_latent)``.

        The row count is ``None`` unless the batch size and both sample
        counts are plain integers.
        """
        batch_size, num_latent = input_shapes
        if isinstance(batch_size, int) and \
           isinstance(self.iw_samples, int) and \
           isinstance(self.eq_samples, int):
            out_dim = (batch_size * self.eq_samples * self.iw_samples,
                       num_latent)
        else:
            out_dim = (None, num_latent)
        return out_dim

    def get_output_for(self, input, deterministic=False, **kwargs):
        """Return the sampled (or, if ``deterministic``, the mean) relaxed
        one-hot vectors, one per row."""
        logits = input
        batch_size, num_latent = logits.shape
        shape = (batch_size, self.eq_samples, self.iw_samples, num_latent)

        if deterministic:
            # Broadcast the class probabilities over both sample axes.
            p = T.nnet.softmax(logits)
            z = p.dimshuffle(0, 'x', 'x', 1) * T.ones(shape)
            return z.reshape((-1, num_latent))

        U = self._srng.uniform(shape, dtype=theano.config.floatX)
        # Gumbel(0, 1) noise; the small constants keep both logs finite.
        gumbel_sample = -T.log(-T.log(U + 1e-4) + 1e-4)

        y = logits.dimshuffle(0, 'x', 'x', 1) + gumbel_sample
        y_reshaped = y.reshape((-1, num_latent))
        return T.nnet.softmax(y_reshaped / self.temperature)
def sampling(self, num_samples, epoch):
    """Sample images from the mixture of generators and save them to disk.

    Builds the generator and discriminator graphs, loads the weights stored
    for `epoch`, then draws `num_samples // 10` batches of 10 images. For
    each batch one generator is picked at random according to the generator
    mixture weights. All samples are written to
    ``sampled_imgs/MixGAN_samples_for_inception_epoch<epoch>.npy`` and the
    image at index 20 is written to ``cifar_samples_mixgan_<epoch>.png``.

    Side effects: overwrites ``self.args.batch_size``, re-seeds the global
    Lasagne RNG, sets ``self.G_weights_layer``, ``self.D_weights_layer``,
    ``self.G_weights`` and ``self.D_weights``, and appends to
    ``self.G_layers``, ``self.D_layers``, ``self.D_layer_adv`` and
    ``self.D_layer_z_recon``.

    Parameters
    ----------
    num_samples : int
        Total number of images to generate; rounded down to a multiple of
        10. Must exceed 20 because ``samples[20]`` is saved as a preview.
    epoch : int
        Epoch whose checkpoint is loaded and which is used in the output
        file names.
    """
    sample_batch_size = 10
    self.args.batch_size = sample_batch_size

    # fixed random seeds
    rng = np.random.RandomState(self.args.seed)
    theano_rng = MRG_RandomStreams(rng.randint(2**15))
    lasagne.random.set_rng(np.random.RandomState(rng.randint(2**15)))

    # Mixture weights over the generators and the discriminators.
    self.G_weights_layer = nn.softmax_weights(
        self.args.ng, LL.InputLayer(shape=(), input_var=self.dummy_input))
    self.D_weights_layer = nn.softmax_weights(
        self.args.nd, LL.InputLayer(shape=(), input_var=self.dummy_input))
    self.G_weights = LL.get_output(self.G_weights_layer, None,
                                   deterministic=True)
    self.D_weights = LL.get_output(self.D_weights_layer, None,
                                   deterministic=True)

    Gen_x_list = []
    z = theano_rng.uniform(size=(sample_batch_size, 50))
    y_1hot = T.fmatrix()

    for i in range(self.args.ng):
        gen_layers_i, gen_x_i = self.get_generator(self.meanx, z, y_1hot)
        self.G_layers.append(gen_layers_i)
        Gen_x_list.append(gen_x_i)

    for i in range(self.args.nd):
        (disc_layers_i, disc_layer_adv_i,
         disc_layer_z_recon_i) = self.get_discriminator()
        self.D_layers.append(disc_layers_i)
        self.D_layer_adv.append(disc_layer_adv_i)
        self.D_layer_z_recon.append(disc_layer_z_recon_i)

    self.load_model(epoch)

    # One compiled sampling function per generator.
    samplefun_list = [
        th.function(inputs=[self.meanx, y_1hot], outputs=gen_x)
        for gen_x in Gen_x_list
    ]
    mix_weights = th.function(inputs=[self.dummy_input],
                              outputs=[self.G_weights])
    g_mix_weights = mix_weights(0.0)

    # load mean img
    meanimg = np.load('data/meanimg.npy')

    samples = []
    prob_list = g_mix_weights[0].tolist()[0]
    # Force the probabilities to sum to exactly 1, as np.random.choice
    # requires.
    prob_list[-1] = 1 - sum(prob_list[:-1])

    # Class labels cycle through 0..9 within a batch.
    refy = np.arange(sample_batch_size) % 10
    refy_1hot = np.zeros((sample_batch_size, 10), dtype=np.float32)
    refy_1hot[np.arange(sample_batch_size), refy] = 1

    for k in range(num_samples // sample_batch_size):
        # Pick the generator for this batch. The candidate count follows the
        # weight vector rather than a hard-coded 5, so any number of
        # generators works. NOTE: this uses the global NumPy RNG, which is
        # not seeded above.
        Gen_indx = np.random.choice(len(prob_list), 1, p=prob_list)
        imgs = samplefun_list[Gen_indx[0]](meanimg, refy_1hot)
        imgs = imgs + meanimg
        # (N, 3, 32, 32) -> (N, 32, 32, 3)
        imgs = np.transpose(
            np.reshape(imgs, (sample_batch_size, 3, 32, 32)), (0, 2, 3, 1))
        samples.append(imgs)

    samples = np.concatenate(samples, 0)
    np.save(
        'sampled_imgs/MixGAN_samples_for_inception_epoch%d.npy' % epoch,
        samples)
    # NOTE(review): scipy.misc.imsave no longer exists in recent SciPy
    # releases; kept as-is because replacing it needs a new dependency.
    scipy.misc.imsave("cifar_samples_mixgan_%d.png" % epoch, samples[20])
def random_uniform(shape, low=0.0, high=1.0, dtype=_FLOATX, seed=None):
    """Return a symbolic tensor of uniform random values in [low, high).

    A new ``RandomStreams`` generator is created on every call. When `seed`
    is None, the generator's seed is drawn from NumPy's global RNG.
    """
    stream_seed = np.random.randint(10e6) if seed is None else seed
    stream = RandomStreams(seed=stream_seed)
    return stream.uniform(shape, low=low, high=high, dtype=dtype)
# Command-line options. `parser` is created before this fragment.
parser.add_argument('--seed', type=int, default=1)
parser.add_argument('--seed_data', type=int, default=1)
parser.add_argument('--unlabeled_weight', type=float, default=10)
parser.add_argument('--batch_size', type=int, default=5)
parser.add_argument('--count', type=int, default=10)
args = parser.parse_args()
print(args)

# fixed random seeds: one master RNG seeds both the Theano stream and the
# Lasagne RNG; the data RNG has its own seed
rng = np.random.RandomState(args.seed)
theano_rng = MRG_RandomStreams(rng.randint(2**15))
lasagne.random.set_rng(np.random.RandomState(rng.randint(2**15)))
data_rng = np.random.RandomState(args.seed_data)

# specify generative model: uniform noise of width 3000 fed through two
# batch-normalized softplus dense layers of 500 units each
noise = theano_rng.uniform(size=(args.batch_size, 3000))
gen_layers = [LL.InputLayer(shape=(args.batch_size, 3000), input_var=noise)]
gen_layers.append(
    nn.batch_norm(LL.DenseLayer(gen_layers[-1], num_units=500, nonlinearity=T.nnet.softplus), g=None))
gen_layers.append(
    nn.batch_norm(LL.DenseLayer(gen_layers[-1], num_units=500, nonlinearity=T.nnet.softplus), g=None))
# output layer with 28**2 units (the remainder of this call lies outside this
# fragment)
gen_layers.append(
    nn.l2normalize(
        LL.DenseLayer(gen_layers[-1], num_units=28**2,
def gpu_evaluate(gru, test_data, items=None, session_key='SessionId', item_key='ItemId', time_key='Time', cut_off=20, batch_size=100, mode='conservative'):
    """Measure Recall@cut_off and MRR@cut_off of `gru` on `test_data`.

    The ranking is computed inside a compiled Theano function; sessions are
    processed in parallel mini-batches of up to `batch_size` sessions.

    Parameters
    ----------
    gru : model object
        Must provide ``error_during_train``, ``symbolic_predict(X, Y, M,
        items, batch_size)`` (returning scores, hidden-state shared variables
        and their updates) and ``itemidmap`` (a pandas Series mapping item
        ids to indices).
    test_data : pandas.DataFrame
        Test events containing the session, item and time columns.
    items : sequence of item ids or None
        If given, the target is ranked only against these items; otherwise
        against every item scored by the model.
    session_key, item_key, time_key : str
        Column names in `test_data`.
    cut_off : int
        Length of the recommendation list used for both metrics.
    batch_size : int
        Number of sessions evaluated in parallel.
    mode : {'standard', 'conservative', 'median', 'tiebreaking'}
        How ties between the target score and other scores are ranked.

    Returns
    -------
    (recall, mrr) : tuple
        Both metrics averaged over all scored events.

    Raises
    ------
    Exception
        If the model reports an error during training.
    NotImplementedError
        If `mode` is not one of the supported values.
    """
    if gru.error_during_train:
        raise Exception('Cannot evaluate: an error occurred while training the model')
    print('Measuring Recall@{} and MRR@{}'.format(cut_off, cut_off))
    srng = RandomStreams()
    X = T.ivector()
    Y = T.ivector()
    M = T.iscalar()
    yhat, H, updatesH = gru.symbolic_predict(X, Y, M, items, batch_size)
    if mode == 'tiebreaking':
        # Tiny random noise breaks exact ties between scores.
        yhat += srng.uniform(size=yhat.shape) * 1e-10
    if items is None:
        targets = T.diag(yhat.T[Y])
        others = yhat.T
    else:
        # The first M rows score the targets, the remaining rows the
        # candidate items.
        targets = T.diag(yhat.T[:M])
        others = yhat.T[M:]
    if mode == 'standard':
        ranks = (others > targets).sum(axis=0) + 1
    elif mode == 'conservative':
        ranks = (others >= targets).sum(axis=0)
    elif mode == 'median':
        ranks = (others > targets).sum(axis=0) + 0.5 * (
            (others == targets).sum(axis=0) - 1) + 1
    elif mode == 'tiebreaking':
        ranks = (others > targets).sum(axis=0) + 1
    else:
        raise NotImplementedError('Unknown evaluation mode: {!r}'.format(mode))
    REC = (ranks <= cut_off).sum()
    MRR = ((ranks <= cut_off) / ranks).sum()
    evaluate = theano.function(inputs=[X, Y, M], outputs=[REC, MRR], updates=updatesH, allow_input_downcast=True, on_unused_input='ignore')

    # Attach the model's item index to every event; the inner join drops
    # events whose item is not in gru.itemidmap.
    test_data = pd.merge(test_data, pd.DataFrame({
        'ItemIdx': gru.itemidmap.values,
        item_key: gru.itemidmap.index
    }), on=item_key, how='inner')
    test_data.sort_values([session_key, time_key, item_key], inplace=True)
    test_data_items = test_data.ItemIdx.values
    if items is not None:
        item_idxs = gru.itemidmap[items]

    recall, mrr, n = 0, 0, 0
    # iters[j] is the index of the session currently occupying batch slot j.
    iters = np.arange(batch_size)
    maxiter = iters.max()
    # offset_sessions[s] .. offset_sessions[s + 1] delimit the events of
    # session s in the sorted data.
    offset_sessions = np.zeros(test_data[session_key].nunique() + 1, dtype=np.int32)
    offset_sessions[1:] = test_data.groupby(session_key).size().cumsum()
    start = offset_sessions[iters]
    end = offset_sessions[iters + 1]

    while True:
        # Step all active sessions in lockstep until the shortest one ends.
        minlen = (end - start).min()
        out_idx = test_data_items[start]
        for i in range(minlen - 1):
            in_idx = out_idx
            out_idx = test_data_items[start + i + 1]
            if items is not None:
                y = np.hstack([out_idx, item_idxs])
            else:
                y = out_idx
            rec, m = evaluate(in_idx, y, len(iters))
            recall += rec
            mrr += m
            n += len(iters)
        start = start + minlen - 1

        # Give every finished slot the next unseen session.
        finished_mask = (end - start <= 1)
        n_finished = finished_mask.sum()
        iters[finished_mask] = maxiter + np.arange(1, n_finished + 1)
        maxiter += n_finished

        # Slots whose new session index is past the last session are dropped.
        valid_mask = (iters < len(offset_sessions) - 1)
        n_valid = valid_mask.sum()
        if n_valid == 0:
            break

        mask = finished_mask & valid_mask
        sessions = iters[mask]
        start[mask] = offset_sessions[sessions]
        end[mask] = offset_sessions[sessions + 1]
        iters = iters[valid_mask]
        start = start[valid_mask]
        end = end[valid_mask]

        # Reset the hidden state of restarted slots and shrink it to the
        # slots that are still active.
        for i in range(len(H)):
            tmp = H[i].get_value(borrow=True)
            tmp[mask] = 0
            tmp = tmp[valid_mask]
            H[i].set_value(tmp, borrow=True)

    return recall / n, mrr / n
def build_model(d_params, g_params, s_params, options):
    """Build the symbolic Theano graph of the discriminator and generator costs.

    Sentences are decoded from the latent input `z`, turned into a soft
    one-hot distribution over words, embedded with ``d_params['Wemb']`` and
    passed through the CNN encoder; real sentences `x` go through the same
    encoder. Feature statistics of both are then combined into the costs.

    Parameters
    ----------
    d_params : mapping
        Parameters indexed here by 'Wemb' and passed to the encoder / MLP
        helpers (presumably the discriminator parameters).
    g_params : mapping
        Parameters indexed here by 'Vhid', 'Wemb' and 'bhid' and passed to
        the decoder helpers (presumably the generator parameters).
    s_params : mapping
        Running statistics: 'seen_size', 'acc_fake_xx', 'acc_real_xx',
        'acc_fake_mean', 'acc_real_mean'.
    options : mapping
        Configuration; see the keys read in the body.

    Returns
    -------
    tuple
        ``(use_noise, use_noise2, x, z, d_cost, g_cost, r_cost, fake_recon,
        acc_fake_xx, acc_real_xx, acc_fake_mean, acc_real_mean, result1,
        result2, result3, result4, result5, result6, KDE, KDE_input)``.
    """
    trng = RandomStreams(SEED)
    x = tensor.matrix('x', dtype='int32')  # n_sample * n_emb where is n_word
    if options['debug']:
        x.tag.test_value = np.random.randint(2, size=(64, 40)).astype(
            'int32')  # batchsize * sent_len(n_word) item: 0-voc_size
    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # generative model part
    z = tensor.matrix('z', dtype='float32')  # n_batch * n_feature
    n_z = z.shape[0]
    n_samples = options['batch_size']
    n_words = options['n_words']
    n_x = d_params['Wemb'].shape[1]  # embedding dim (not used below)
    if options['shareLSTM']:
        h_decoder = decoder_g(g_params, z, options, max_step=options['max_step'], prefix='decoder_0')
    else:
        # The first column of z selects which decoder handles each sample;
        # the remaining columns are the decoder input.
        z_code = tensor.cast(z[:, 0], dtype='int32')
        h_decoder = tensor.zeros(
            [options['max_step'], n_samples, options['n_h']])
        h_temp = []
        for idx in range(options['n_codes']):
            temp_idx = tensor.eq(z_code, idx).nonzero()[0]
            if options['sharedEmb']:
                h_decoder_temp = decoder_emb_from_d(
                    g_params, d_params, z[:, 1:], options, max_step=options['max_step'], prefix=_p('decoder', idx))
            else:
                h_decoder_temp = decoder_g(g_params, z[:, 1:], options, max_step=options['max_step'], prefix=_p('decoder', idx))
            h_temp.append(h_decoder_temp)
            # Keep only the rows belonging to samples with code == idx.
            h_decoder = tensor.inc_subtensor(h_decoder[:, temp_idx, :], h_temp[idx][:, temp_idx, :])
    #h_decoder = dropout(h_decoder, trng, use_noise)

    # reconstruct the original sentence
    shape_w = h_decoder.shape  # n_step, n_sample , n_h
    h_decoder = h_decoder.reshape((shape_w[0] * shape_w[1], shape_w[2]))
    # pred_w: (n_steps * n_samples) * n_words
    if options['sharedEmb']:
        Vhid = tensor.dot(g_params['Vhid'], d_params['Wemb'].T)
    else:
        Vhid = tensor.dot(g_params['Vhid'], g_params['Wemb'].T)
    pred_w = tensor.dot(h_decoder, Vhid) + g_params['bhid']
    n_steps = shape_w[0]
    # nondifferentiable
    if options['delta'] > 1e-10:
        pred_w = tensor.switch(tensor.ge(pred_w, options['delta']), pred_w, 0)
    #pred_w = tensor.nnet.softmax(pred_w*options['L'])
    # Softmax over words with scale options['L']; the row maximum is
    # subtracted before exponentiating.
    max_w = tensor.max(pred_w, axis=1, keepdims=True)
    e0 = tensor.exp((pred_w - max_w) * options['L'])
    pred_w = e0 / tensor.sum(e0, axis=1, keepdims=True)
    max_print = tensor.max(pred_w, axis=1)
    max_print = max_print.reshape((n_steps, n_samples)).dimshuffle(1, 0)
    pred_w = pred_w.reshape(
        (n_steps, n_samples, n_words)).dimshuffle(1, 0, 2)  # reshape need parenthesis

    # NOTE(review): xrange exists only in Python 2.
    if options['force_cut'] == 'cut':
        # Random sentence length per sample, capped at max_step - 5.
        rng_temp = tensor.minimum(
            -tensor.sum(tensor.log(trng.uniform(
                (n_samples, 6))), axis=1) * 3.3, options['max_step'] - 5)
        rng_length = tensor.floor(rng_temp).astype('int32')  #gamma(6,3.3)
        # pred_mask = tensor.zeros(pred_w.shape)
        period = options['period']
        # should use set values
        for i in xrange(n_samples):
            pred_w = tensor.set_subtensor(pred_w[i, rng_length[i]:, :], 0)
            pred_w = tensor.set_subtensor(pred_w[i, rng_length[i], period], 1)
            pred_w = tensor.set_subtensor(pred_w[i, (rng_length[i] + 1):, 0], 1)
    elif options['force_cut'] == 'strip':
        for i in xrange(n_samples):
            pred_w = tensor.set_subtensor(
                pred_w[i, options['max_step'] - 1, 0], 1)
            # First step whose most likely word has index 0; every later
            # step is forced to word 0.
            idx_end = theano.tensor.eq(tensor.argmax(pred_w[i, :, :], axis=1), 0).nonzero()[0][0]
            pred_w = tensor.set_subtensor(pred_w[i, (idx_end + 1):, 0], 1)
            pred_w = tensor.set_subtensor(pred_w[i, (idx_end + 1):, 1:], 0)

    # Pad both ends of every sentence with `pad` one-hot vectors of word 0.
    pad = max(options['filter_hs']) - 1
    end_mat = tensor.concatenate([
        tensor.ones([n_samples, pad, 1]),
        tensor.zeros([n_samples, pad, n_words - 1])
    ], axis=2)
    pred_w = tensor.concatenate([end_mat, pred_w, end_mat], axis=1)
    n_steps = n_steps + 2 * pad
    pred_w = pred_w.reshape((n_steps * n_samples, n_words))
    # should be d's embedding
    fake_input = tensor.dot(pred_w, d_params['Wemb'])  # real[ 64 1 68 300] fake[ 64 1 41 300]
    fake_input = fake_input.reshape(
        (n_samples, 1, n_steps, d_params['Wemb'].shape[1]))  #(64,1, )
    use_noise2 = theano.shared(numpy_floatX(0.))
    fake_input = dropout(fake_input, trng, use_noise2)

    # fake feature output
    fake_outputs1 = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(d_params, fake_input, filter_shape, pool_size, options, prefix=_p('cnn_d', i))
        fake_output1 = conv_layer
        fake_outputs1.append(fake_output1)
    fake_output1 = tensor.concatenate(fake_outputs1, 1)  # should be 64*900
    if options['batch_norm']:
        fake_output1 = batch_norm(d_params, fake_output1, options, prefix='fake')
    # NOTE(review): fake_pred stays undefined when options['cnn_activation']
    # is neither 'tanh' nor 'linear'.
    if options['cnn_activation'] == 'tanh':
        fake_pred = mlp_layer_linear(d_params, fake_output1, prefix='dis_d')
    elif options['cnn_activation'] == 'linear':
        fake_pred = mlp_layer_linear(d_params, tensor.tanh(fake_output1), prefix='dis_d')
    #
    if not options['wgan']:
        fake_pred = tensor.nnet.sigmoid(fake_pred) * (
            1 - 2 * options['label_smoothing']) + options['label_smoothing']

    # for reverse model
    # if options['reverse']:
    fake_recon = mlp_layer_tanh(d_params, fake_output1, prefix='recon')
    # Rescale both from [-1, 1] to [0, 1] and take the cross-entropy
    # between them.
    r_t = fake_recon / 2.0 + .5
    z_t = z / 2.0 + .5
    r_cost = (-z_t * tensor.log(r_t + 0.0001) - (1. - z_t) * tensor.log(1.0001 - r_t)).sum() / n_samples / n_z

    # Proposal nets (for infogan)
    fake_outputs2 = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(g_params, fake_input, filter_shape, pool_size, options, prefix=_p('cnn_d', i))
        fake_output2 = conv_layer
        fake_outputs2.append(fake_output2)
    fake_output2 = tensor.concatenate(
        fake_outputs2, 1)  # should be 64*900 # why it is 64*0???
    # check whether to use softmax or tanh
    fake_propose = mlp_layer_tanh(g_params, fake_output2, prefix='dis_q')
    fake_propose = (fake_propose + 1) / 2
    fake_propose = tensor.log(fake_propose)
    z_code = tensor.cast(z[:, 0], dtype='int32')
    z_index = tensor.arange(n_z)
    # Log-value proposed for the code each sample was generated with.
    fake_logent = fake_propose[z_index, z_code]
    l_I = tensor.sum(fake_logent)

    # Wemb: voc_size(n_words) * n_emb 64* 1* 40 *48
    real_input = d_params['Wemb'][tensor.cast(
        x.flatten(), dtype='int32')].reshape(
            (x.shape[0], 1, x.shape[1], d_params['Wemb'].shape[1]))  # n_sample,1,n_length,n_emb
    real_input = dropout(real_input, trng, use_noise2)
    real_outputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer2 = encoder(d_params, real_input, filter_shape, pool_size, options, prefix=_p('cnn_d', i))
        real_output = conv_layer2
        real_outputs.append(real_output)
    real_output = tensor.concatenate(real_outputs, 1)
    if options['batch_norm']:
        real_output = batch_norm(d_params, real_output, options, prefix='real')
    if options['cnn_activation'] == 'tanh':
        real_pred = mlp_layer_linear(d_params, real_output, prefix='dis_d')
    elif options['cnn_activation'] == 'linear':
        real_pred = mlp_layer_linear(d_params, tensor.tanh(real_output), prefix='dis_d')
    if not options['wgan']:
        real_pred = tensor.nnet.sigmoid(real_pred) * (
            1 - 2 * options['label_smoothing']) + options['label_smoothing']

    #Compute for KDE
    mu = real_output
    X = fake_output1
    KDE = cal_nkde(X, mu, options['kde_sigma'])
    #calculate KDE on real_input and fake_input
    X_i = fake_input.reshape((n_samples, n_steps * d_params['Wemb'].shape[1]))
    mu_i = real_input.reshape((n_samples, n_steps * d_params['Wemb'].shape[1]))
    KDE_input = cal_nkde(X_i, mu_i, options['kde_sigma'])

    # sufficient statistics: running second moments and means of the fake
    # and real features, weighted by the number of samples seen so far
    cur_size = s_params['seen_size'] * 1.0
    identity = tensor.eye(options['n_z']) * options['diag']
    fake_mean = tensor.mean(fake_output1, axis=0)
    real_mean = tensor.mean(real_output, axis=0)
    fake_xx = tensor.dot(fake_output1.T, fake_output1)
    real_xx = tensor.dot(real_output.T, real_output)
    acc_fake_xx = (s_params['acc_fake_xx'] * cur_size + fake_xx) / (cur_size + n_samples)
    acc_real_xx = (s_params['acc_real_xx'] * cur_size + real_xx) / (cur_size + n_samples)
    acc_fake_mean = (s_params['acc_fake_mean'] * cur_size + fake_mean * n_samples) / (cur_size + n_samples)
    acc_real_mean = (s_params['acc_real_mean'] * cur_size + real_mean * n_samples) / (cur_size + n_samples)
    # Covariance estimates; `identity` adds options['diag'] to the diagonal
    # before inversion.
    cov_fake = acc_fake_xx - tensor.dot(acc_fake_mean.dimshuffle(0, 'x'), acc_fake_mean.dimshuffle(
        0, 'x').T) + identity
    cov_real = acc_real_xx - tensor.dot(acc_real_mean.dimshuffle(0, 'x'), acc_real_mean.dimshuffle(
        0, 'x').T) + identity
    cov_fake_inv = tensor.nlinalg.matrix_inverse(cov_fake)
    cov_real_inv = tensor.nlinalg.matrix_inverse(cov_real)

    # Feature-matching objective between fake and real features.
    if options['feature_match'] == 'moment':
        temp1 = ((fake_mean - real_mean)**2).sum()
        fake_obj = temp1
    elif options['feature_match'] == 'JSD_acc':
        temp1 = tensor.nlinalg.trace(
            tensor.dot(cov_fake_inv, cov_real) + tensor.dot(cov_real_inv, cov_fake))
        temp2 = tensor.dot(
            tensor.dot((acc_fake_mean - acc_real_mean), (cov_fake_inv + cov_real_inv)), (acc_fake_mean - acc_real_mean).T)
        fake_obj = temp1 + temp2
    elif options['feature_match'] == 'mmd':
        #### too many nodes, use scan ####
        kxx, kxy, kyy = 0, 0, 0
        dividend = 1
        dist_x, dist_y = fake_output1 / dividend, real_output / dividend
        x_sq = tensor.sum(dist_x**2, axis=1).dimshuffle(0, 'x')  # 64*1
        y_sq = tensor.sum(dist_y**2, axis=1).dimshuffle(0, 'x')  # 64*1
        tempxx = -2 * tensor.dot(dist_x, dist_x.T) + x_sq + x_sq.T  # (xi -xj)**2
        tempxy = -2 * tensor.dot(dist_x, dist_y.T) + x_sq + y_sq.T  # (xi -yj)**2
        tempyy = -2 * tensor.dot(dist_y, dist_y.T) + y_sq + y_sq.T  # (yi -yj)**2
        # Sum of Gaussian kernels over all bandwidths in sigma_range.
        for sigma in options['sigma_range']:
            kxx += tensor.mean(tensor.exp(-tempxx / 2 / (sigma**2)))
            kxy += tensor.mean(tensor.exp(-tempxy / 2 / (sigma**2)))
            kyy += tensor.mean(tensor.exp(-tempyy / 2 / (sigma**2)))
        fake_obj = tensor.sqrt(kxx + kyy - 2 * kxy)
    elif options['feature_match'] == 'mmd_cov':
        kxx, kxy, kyy = 0, 0, 0
        cov_sum = (cov_fake + cov_real) / 2
        cov_sum_inv = tensor.nlinalg.matrix_inverse(cov_sum)
        dividend = 1
        dist_x, dist_y = fake_output1 / dividend, real_output / dividend
        cov_inv_mat = cov_sum_inv
        # Same as 'mmd' but distances are weighted by cov_inv_mat.
        x_sq = tensor.sum(tensor.dot(dist_x, cov_inv_mat) * dist_x, axis=1).dimshuffle(0, 'x')
        y_sq = tensor.sum(tensor.dot(dist_y, cov_inv_mat) * dist_y, axis=1).dimshuffle(0, 'x')
        tempxx = -2 * tensor.dot(tensor.dot(dist_x, cov_inv_mat), dist_x.T) + x_sq + x_sq.T  # (xi -xj)**2
        tempxy = -2 * tensor.dot(tensor.dot(dist_x, cov_inv_mat), dist_y.T) + x_sq + y_sq.T  # (xi -yj)**2
        tempyy = -2 * tensor.dot(tensor.dot(dist_y, cov_inv_mat), dist_y.T) + y_sq + y_sq.T  # (yi -yj)**2
        for sigma in options['sigma_range']:
            kxx += tensor.mean(tensor.exp(-tempxx / 2 / (sigma**2)))
            kxy += tensor.mean(tensor.exp(-tempxy / 2 / (sigma**2)))
            kyy += tensor.mean(tensor.exp(-tempyy / 2 / (sigma**2)))
        fake_obj = tensor.sqrt(kxx + kyy - 2 * kxy)
    elif options['feature_match'] == 'mmd_ld':
        kxx, kxy, kyy = 0, 0, 0
        real_mmd = mlp_layer_tanh(d_params, real_output, prefix='dis_mmd')
        fake_mmd = mlp_layer_tanh(d_params, fake_output1, prefix='dis_mmd')
        dividend = options['dim_mmd']  # for numerical stability & scale with
        dist_x, dist_y = fake_mmd / dividend, real_mmd / dividend
        x_sq = tensor.sum(dist_x**2, axis=1).dimshuffle(0, 'x')  # 64*1
        y_sq = tensor.sum(dist_y**2, axis=1).dimshuffle(0, 'x')  # 64*1
        tempxx = -2 * tensor.dot(dist_x, dist_x.T) + x_sq + x_sq.T  # (xi -xj)**2
        tempxy = -2 * tensor.dot(dist_x, dist_y.T) + x_sq + y_sq.T  # (xi -yj)**2
        tempyy = -2 * tensor.dot(dist_y, dist_y.T) + y_sq + y_sq.T  # (yi -yj)**2
        # Unlike the other mmd branches this one divides by sigma (not
        # sigma**2) and sums instead of averaging.
        for sigma in options['sigma_range']:
            kxx += tensor.exp(-tempxx / 2 / sigma).sum()
            kxy += tensor.exp(-tempxy / 2 / sigma).sum()
            kyy += tensor.exp(-tempyy / 2 / sigma).sum()
        fake_obj = tensor.sqrt(kxx + kyy - 2 * kxy)
    elif options['feature_match'] == 'mmd_h':
        #### too many nodes, use scan ####
        kxx, kxy, kyy = 0, 0, 0
        if options['cnn_activation'] == 'tanh':
            fake_mmd = middle_layer(d_params, fake_output1, prefix='dis_d')
        elif options['cnn_activation'] == 'linear':
            fake_mmd = middle_layer(d_params, tensor.tanh(fake_output1), prefix='dis_d')
        #
        if options['cnn_activation'] == 'tanh':
            real_mmd = middle_layer(d_params, real_output, prefix='dis_d')
        elif options['cnn_activation'] == 'linear':
            real_mmd = middle_layer(d_params, tensor.tanh(real_output), prefix='dis_d')
        #
        dividend = 1
        dist_x, dist_y = fake_mmd / dividend, real_mmd / dividend
        x_sq = tensor.sum(dist_x**2, axis=1).dimshuffle(0, 'x')  # 64*1
        y_sq = tensor.sum(dist_y**2, axis=1).dimshuffle(0, 'x')  # 64*1
        tempxx = -2 * tensor.dot(dist_x, dist_x.T) + x_sq + x_sq.T  # (xi -xj)**2
        tempxy = -2 * tensor.dot(dist_x, dist_y.T) + x_sq + y_sq.T  # (xi -yj)**2
        tempyy = -2 * tensor.dot(dist_y, dist_y.T) + y_sq + y_sq.T  # (yi -yj)**2
        for sigma in options['sigma_range']:
            kxx += tensor.mean(tensor.exp(-tempxx / 2 / (sigma**2)))
            kxy += tensor.mean(tensor.exp(-tempxy / 2 / (sigma**2)))
            kyy += tensor.mean(tensor.exp(-tempyy / 2 / (sigma**2)))
        fake_obj = tensor.sqrt(kxx + kyy - 2 * kxy)
    else:
        # Any other value falls back to -log(fake_pred) averaged over n_z.
        fake_obj = -tensor.log(fake_pred + 1e-6).sum() / n_z

    if options['wgan']:
        gan_cost_d = fake_pred.sum() / n_z - real_pred.sum() / n_samples
        # The second term is multiplied by 0 and does not change the value.
        gan_cost_g = -fake_pred.sum() / n_z + 0 * (
            (fake_mean - acc_real_mean)**2).sum()
    else:
        gan_cost_d = -tensor.log(1 - fake_pred + 1e-6).sum(
        ) / n_z - tensor.log(real_pred + 1e-6).sum() / n_samples
        gan_cost_g = fake_obj
    #result4 = fake_obj
    d_cost = gan_cost_d - options['lambda_fm'] * fake_obj + options[
        'lambda_recon'] * r_cost + options['lambda_q'] * l_I / n_z
    g_cost = gan_cost_g - options['lambda_q'] * l_I / n_z

    # Diagnostic outputs.
    #result1, result2, result4, result5, result6 = x_sq, y_sq, tempxx, tempxy, tempyy
    result1 = tensor.mean(real_pred)  # goes to nan
    result2 = tensor.mean(fake_pred)  # goes to nan
    result3 = tensor.argmax(pred_w, axis=1).reshape([n_samples, n_steps])
    result4 = tensor.nlinalg.trace(
        tensor.dot(cov_fake_inv, cov_real) + tensor.dot(cov_real_inv, cov_fake))
    result5 = max_print[
        0]  #mu #tensor.dot( tensor.dot((acc_fake_mean - acc_real_mean) , (cov_fake_inv + cov_real_inv)), (acc_fake_mean - acc_real_mean).T)
    result6 = ((fake_mean - real_mean)**2).sum()
    return use_noise, use_noise2, x, z, d_cost, g_cost, r_cost, fake_recon, acc_fake_xx, acc_real_xx, acc_fake_mean, acc_real_mean, result1, result2, result3, result4, result5, result6, KDE, KDE_input
class RandomizedRectifierLayer(Layer):
    """
    Layer applying a randomized leaky rectifier to its input.

    While training, the slope used for negative inputs is drawn uniformly
    from ``[lower, upper)``; at evaluation time it is fixed to the mean of
    the two bounds. The nonlinearity was first used in the Kaggle NDSB
    Competition and later evaluated in [1]_.

    Training-time equation:

    :math:`\\varphi(x) = \\max((\\sim U(lower, upper)) \\cdot x, x)`

    Parameters
    ----------
    incoming : a :class:`Layer` instance or a tuple
        The layer feeding into this layer, or the expected input shape
    lower : Theano shared variable, expression, or constant
        Lower bound of the sampled slopes.
    upper : Theano shared variable, expression, or constant
        Upper bound of the sampled slopes.
    shared_axes : 'auto', 'all', int or tuple of int
        Axes over which one random slope is shared. ``'auto'`` (the
        default) shares over every axis except the second; ``'all'`` shares
        over all axes, i.e. a single random slope is used.
    **kwargs
        Any additional keyword arguments are passed to the `Layer`
        superclass.

    References
    ----------
    .. [1] Bing Xu, Naiyan Wang et al. (2015):
       Empirical Evaluation of Rectified Activations in Convolutional
       Network, http://arxiv.org/abs/1505.00853
    """

    def __init__(self, incoming, lower=0.3, upper=0.8, shared_axes='auto',
                 **kwargs):
        super(RandomizedRectifierLayer, self).__init__(incoming, **kwargs)
        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
        self.lower = lower
        self.upper = upper

        # Symbolic bounds cannot be compared at construction time, so the
        # check only applies to concrete values.
        bounds_inverted = lower > upper
        if not isinstance(bounds_inverted, theano.Variable) and bounds_inverted:
            raise ValueError("Upper bound for RandomizedRectifierLayer needs "
                             "to be higher than lower bound.")

        if shared_axes == 'auto':
            axes = (0, ) + tuple(range(2, len(self.input_shape)))
        elif shared_axes == 'all':
            axes = tuple(range(len(self.input_shape)))
        elif isinstance(shared_axes, int):
            axes = (shared_axes, )
        else:
            axes = shared_axes
        self.shared_axes = axes

    def get_output_for(self, input, deterministic=False, **kwargs):
        """
        Parameters
        ----------
        input : tensor
            output from the previous layer
        deterministic : bool
            If true, the leaky slope is the arithmetic mean of lower and
            upper instead of a random sample.
        """
        if deterministic or self.upper == self.lower:
            mean_slope = (self.upper + self.lower) / 2.0
            return theano.tensor.nnet.relu(input, mean_slope)

        # Use the static input shape when it is fully known, otherwise the
        # symbolic shape of the input.
        noise_shape = list(self.input_shape)
        if any(dim is None for dim in noise_shape):
            noise_shape = list(input.shape)
        # Shared axes get size 1 and are made broadcastable below.
        for axis in self.shared_axes:
            noise_shape[axis] = 1

        slopes = self._srng.uniform(tuple(noise_shape),
                                    low=self.lower,
                                    high=self.upper,
                                    dtype=theano.config.floatX)
        slopes = theano.tensor.addbroadcast(slopes, *self.shared_axes)
        return theano.tensor.nnet.relu(input, slopes)
assert np.abs(np.mean(l.avg_batch_mean.get_value()) - 0)>1e-7 # input variables y = T.ivector() y_1hot = T.matrix() x = T.tensor4() meanx = T.tensor3() # real_fc3 = LL.get_output(enc_layer_fc3, x, deterministic=True) #y_pred, real_pool3 = LL.get_output([fc8, poo5], x, deterministic=False) # real_pool3 = LL.get_output(poo5, x, deterministic=False) #enc_error = T.mean(T.neq(T.argmax(y_pred,axis=1),y)) # classification error of the encoder, to make sure the encoder is working properly # specify generator, gen_x = G(z, real_pool3) z = theano_rng.uniform(size=(args.batch_size, 50)) # uniform noise # y_1hot = T.matrix() gen_x_layer_z = LL.InputLayer(shape=(args.batch_size, 50), input_var=z) # z, 20 # gen_x_layer_z_embed = nn.batch_norm(LL.DenseLayer(gen_x_layer_z, num_units=128), g=None) # 20 -> 64 gen_x_layer_y = LL.InputLayer(shape=(args.batch_size, 10), input_var=y_1hot) # conditioned on real fc3 activations gen_x_layer_y_z = LL.ConcatLayer([gen_x_layer_y,gen_x_layer_z],axis=1) #512+256 = 768 gen_x_layer_pool2 = LL.ReshapeLayer(nn.batch_norm(LL.DenseLayer(gen_x_layer_y_z, num_units=256*5*5)), (args.batch_size,256,5,5)) gen_x_layer_dconv2_1 = nn.batch_norm(nn.Deconv2DLayer(gen_x_layer_pool2, (args.batch_size,256,10,10), (5,5), stride=(2, 2), padding = 'half', W=Normal(0.02), nonlinearity=nn.relu)) gen_x_layer_dconv2_2 = nn.batch_norm(nn.Deconv2DLayer(gen_x_layer_dconv2_1, (args.batch_size,128,14,14), (5,5), stride=(1, 1), padding = 'valid', W=Normal(0.02), nonlinearity=nn.relu)) gen_x_layer_dconv1_1 = nn.batch_norm(nn.Deconv2DLayer(gen_x_layer_dconv2_2, (args.batch_size,128,28,28), (5,5), stride=(2, 2), padding = 'half', W=Normal(0.02), nonlinearity=nn.relu)) gen_x_layer_x = nn.Deconv2DLayer(gen_x_layer_dconv1_1, (args.batch_size,3,32,32), (5,5), stride=(1, 1), padding = 'valid',
class Ensemble:
    """An ensemble is a collection of neurons representing a vector space.

    In ``'spiking'`` mode the ensemble owns a neuron population, encoders and
    decoded origins; in ``'direct'`` mode it applies origin functions straight
    to the decoded input without simulating neurons.
    """

    def __init__(self, neurons, dimensions, dt, tau_ref=0.002, tau_rc=0.02,
                 max_rate=(200, 300), intercept=(-1.0, 1.0), radius=1.0,
                 encoders=None, seed=None, neuron_type='lif',
                 array_size=1, eval_points=None, decoder_noise=0.1,
                 noise_type='uniform', noise=None, mode='spiking'):
        """Construct an ensemble composed of the specific neuron model,
        with the specified neural parameters.

        :param int neurons: number of neurons in this population
        :param int dimensions:
            number of dimensions in the vector space
            that these neurons represent
        :param float dt: simulation timestep, forwarded to the default origin
        :param float tau_ref: length of refractory period
        :param float tau_rc:
            RC constant; approximately how long until 2/3
            of the threshold voltage is accumulated
        :param tuple max_rate:
            lower and upper bounds on randomly generated
            firing rates for each neuron
        :param tuple intercept:
            lower and upper bounds on randomly generated
            x offsets for each neuron
        :param float radius:
            the range of input values (-radius:radius)
            per dimension this population is sensitive to
        :param list encoders: set of possible preferred directions
        :param int seed: seed value for random number generator
        :param string neuron_type:
            type of neuron model to use, options = {'lif'}
        :param int array_size: number of sub-populations for network arrays
        :param list eval_points:
            specific set of points to optimize decoders over by default
        :param float decoder_noise: amount of noise to assume when computing
            decoder
        :param string noise_type:
            the type of noise added to the input current.
            Possible options = {'uniform', 'gaussian'}.
            Default is 'uniform' to match the Nengo implementation.
        :param float noise:
            noise parameter for noise added to input current,
            sampled at every timestep.
            If noise_type = uniform, this is the lower and upper
            bound on the distribution.
            If noise_type = gaussian, this is the variance.
        :param string mode: either 'spiking' or 'direct'
        """
        if seed is None:
            seed = np.random.randint(1000)
        self.seed = seed
        self.neurons_num = neurons
        self.dimensions = dimensions
        self.array_size = array_size
        self.radius = radius
        self.noise = noise
        self.noise_type = noise_type
        self.decoder_noise = decoder_noise
        self.mode = mode

        # make sure that eval_points is the right shape
        if eval_points is not None:
            eval_points = np.array(eval_points)
            if len(eval_points.shape) == 1:
                eval_points.shape = [1, eval_points.shape[0]]
        self.eval_points = eval_points

        # make sure intercept is the right shape
        if isinstance(intercept, (int, float)):
            intercept = [intercept, 1]
        elif len(intercept) == 1:
            # Build a fresh list instead of appending in place: appending
            # fails for tuples and would mutate the caller's sequence.
            intercept = [intercept[0], 1]

        self.cache_key = cache.generate_ensemble_key(
            neurons=neurons, dimensions=dimensions, tau_rc=tau_rc,
            tau_ref=tau_ref, max_rate=max_rate, intercept=intercept,
            radius=radius, encoders=encoders, decoder_noise=decoder_noise,
            eval_points=eval_points, noise=noise, seed=seed, dt=dt,
            array_size=array_size)

        # make dictionary for origins
        self.origin = {}
        # set up a dictionary for decoded_input
        self.decoded_input = {}

        # if we're creating a spiking ensemble
        if self.mode == 'spiking':

            # TODO: handle different neuron types,
            self.neurons = neuron.types[neuron_type](
                size=(array_size, self.neurons_num),
                tau_rc=tau_rc, tau_ref=tau_ref)

            # compute alpha and bias
            self.srng = RandomStreams(seed=seed)
            self.max_rate = max_rate
            max_rates = np.random.uniform(
                size=(self.array_size, self.neurons_num),
                low=max_rate[0], high=max_rate[1])
            threshold = np.random.uniform(
                size=(self.array_size, self.neurons_num),
                low=intercept[0], high=intercept[1])
            self.alpha, self.bias = self.neurons.make_alpha_bias(
                max_rates, threshold)

            # compute encoders
            self.encoders = self.make_encoders(encoders=encoders)
            # combine encoders and gain for simplification
            self.encoders = (self.encoders.T * self.alpha.T).T
            self.shared_encoders = theano.shared(
                self.encoders, name='ensemble.shared_encoders')

            # set up a dictionary for encoded_input connections
            self.encoded_input = {}
            # list of learned terminations on ensemble
            self.learned_terminations = []

            # make default origin
            self.add_origin('X', func=None, dt=dt,
                            eval_points=self.eval_points)

        elif self.mode == 'direct':
            # make default origin
            self.add_origin('X', func=None,
                            dimensions=self.dimensions * self.array_size)
            # reset neurons_num to 0
            self.neurons_num = 0

    def add_termination(self, name, pstc,
                        decoded_input=None, encoded_input=None):
        """Accounts for a new termination that takes the given input
        (a theano object) and filters it with the given pstc.

        Adds its contributions to the set of decoded or encoded input
        with the same pstc. Decoded inputs are represented signals,
        encoded inputs are decoded_output * weight matrix.
        Exactly one of decoded_input and encoded_input must be given.

        :param string name: requested name; made unique among the
            existing terminations of the same kind
        :param float pstc: post-synaptic time constant
        :param decoded_input:
            theano object representing the decoded output of
            the pre population multiplied by this termination's
            transform matrix
        :param encoded_input:
            theano object representing the encoded output of
            the pre population multiplied by a connection weight matrix
        """
        # make sure one and only one of
        # (decoded_input, encoded_input) is specified
        if decoded_input is not None:
            assert encoded_input is None, \
                "specify only one of decoded_input / encoded_input"
        else:
            assert encoded_input is not None, \
                "one of decoded_input / encoded_input is required"

        # Test against None explicitly: the inputs are symbolic objects and
        # their truth value is not a reliable "was it supplied" signal.
        if decoded_input is not None:
            # String comparison must use equality, not identity.
            if self.mode != 'direct':
                # rescale decoded_input by this neuron's radius
                source = TT.true_div(decoded_input, self.radius)
            else:
                # ignore radius in direct mode
                source = decoded_input
            name = helpers.get_unique_name(name, self.decoded_input)
            self.decoded_input[name] = filter.Filter(
                name=name, pstc=pstc, source=source,
                shape=(self.array_size, self.dimensions))
        else:
            name = helpers.get_unique_name(name, self.encoded_input)
            self.encoded_input[name] = filter.Filter(
                name=name, pstc=pstc, source=encoded_input,
                shape=(self.array_size, self.neurons_num))

    def add_learned_termination(self, name, pre, error, pstc, dt,
                                learned_termination_class=hPESTermination,
                                **kwargs):
        """Adds a learned termination to the ensemble.

        Input added to encoded_input, and a learned_termination object
        is created to keep track of the pre and post
        (self) spike times, and adjust the weight matrix according
        to the specified learning rule.

        :param string name: name of the termination
        :param Ensemble pre: the pre-synaptic population
        :param Ensemble error: the Origin that provides the error signal
        :param float pstc: post-synaptic time constant
        :param float dt: timestep; the projected activity is divided by it
        :param learned_termination_class: class implementing the rule
        :returns: the created learned termination object
        """
        # TODO: is there ever a case we wouldn't want this?
        assert error.dimensions == self.dimensions * self.array_size

        # generate an initial weight matrix if none provided,
        # random numbers between -.001 and .001
        if 'weight_matrix' not in kwargs:
            weight_matrix = np.random.uniform(
                size=(self.array_size * pre.array_size,
                      self.neurons_num, pre.neurons_num),
                low=-1e-3, high=1e-3)
            kwargs['weight_matrix'] = weight_matrix.astype('float32')
        else:
            # make sure it's an np.array
            # TODO: error checking to make sure it's the right size
            kwargs['weight_matrix'] = np.array(kwargs['weight_matrix'],
                                               dtype='float32')

        learned_term = learned_termination_class(
            pre=pre, post=self, error=error, **kwargs)

        learn_projections = [
            TT.dot(pre.neurons.output[learned_term.pre_index(i)],
                   learned_term.weight_matrix[i % self.array_size].T) / dt
            for i in range(self.array_size * pre.array_size)
        ]

        # now want to sum all the output to each of the post ensembles
        # going to reshape and sum along the 0 axis
        learn_output = TT.sum(
            TT.reshape(learn_projections,
                       (pre.array_size, self.array_size, self.neurons_num)),
            axis=0)
        # reshape to make it (array_size x neurons_num)
        learn_output = TT.reshape(learn_output,
                                  (self.array_size, self.neurons_num))

        # the input_current from this connection during simulation
        self.add_termination(name=name, pstc=pstc, encoded_input=learn_output)
        self.learned_terminations.append(learned_term)
        return learned_term

    def add_origin(self, name, func, **kwargs):
        """Create a new origin to perform a given function
        on the represented signal.

        :param string name: name of origin
        :param function func:
            desired transformation to perform over represented signal
        :param list eval_points:
            specific set of points to optimize decoders over for this origin
        """
        # if we're in spiking mode create an ensemble_origin with decoders
        # and the whole shebang for interpreting the neural activity
        if self.mode == 'spiking':
            if 'eval_points' not in kwargs:
                kwargs['eval_points'] = self.eval_points
            self.origin[name] = ensemble_origin.EnsembleOrigin(
                ensemble=self, func=func, **kwargs)

        # if we're in direct mode then this population is just directly
        # performing the specified function, use a basic origin
        elif self.mode == 'direct':
            if func is not None and 'initial_value' not in kwargs:
                # evaluate func at the zero vector once per sub-population
                init = func(np.zeros(self.dimensions))
                init = np.array([init for _ in range(self.array_size)])
                kwargs['initial_value'] = init.flatten()

            # the basic origin is built without dt; pop() works on both
            # Python 2 and 3 (dict.has_key no longer exists on Python 3)
            kwargs.pop('dt', None)
            self.origin[name] = origin.Origin(func=func, **kwargs)

    def make_encoders(self, encoders=None):
        """Generates a set of encoders.

        :param list encoders:
            set of possible preferred directions of neurons; when None,
            directions are drawn from a normal distribution
        :returns: float32 array of encoders, normalized to unit length
            along the last (dimensions) axis
        """
        if encoders is None:
            # if no encoders specified, generate randomly
            encoders = np.random.normal(
                size=(self.array_size, self.neurons_num, self.dimensions))
        else:
            # if encoders were specified, cast list as array
            encoders = np.array(encoders).T
            # repeat array until 'encoders' is the same length
            # as number of neurons in population
            # (floor division keeps the repetition count an int on Python 3)
            reps = self.neurons_num // len(encoders) + 1
            encoders = np.tile(encoders, reps).T[:self.neurons_num,
                                                 :self.dimensions]
            encoders = np.tile(encoders, (self.array_size, 1, 1))

        # normalize encoders across represented dimensions
        norm = np.sum(encoders * encoders, axis=2)[:, :, None]
        encoders = encoders / np.sqrt(norm)

        return encoders.astype('float32')

    def theano_tick(self):
        """In direct mode, evaluate each origin's function on the summed
        decoded input and store the result in its decoded_output.

        Does nothing in any other mode.
        """
        if self.mode == 'direct':
            # set up matrix to store accumulated decoded input
            X = np.zeros((self.array_size, self.dimensions), dtype='float32')
            for di in self.decoded_input.values():
                # add its values to the total decoded input
                X += di.value.get_value()

            # if we're calculating a function on the decoded input
            for o in self.origin.values():
                if o.func is not None:
                    val = np.float32([o.func(row) for row in X])
                    o.decoded_output.set_value(val.flatten())

    def update(self, dt):
        """Compute the set of theano updates needed for this ensemble.

        Returns a dictionary with new neuron state,
        termination, and origin values.

        :param float dt: the timestep of the update
        """
        # find the total input current to this population of neurons

        # accumulated decoded input; stays None when there is none
        X = None
        # updates is an ordered dictionary of theano variables to update
        updates = OrderedDict()

        for ii, di in enumerate(self.decoded_input.values()):
            # add its values to the total decoded input
            if ii == 0:
                X = di.value
            else:
                X += di.value

            updates.update(di.update(dt))

        # if we're in spiking mode, then look at the input current and
        # calculate new neuron activities for output
        if self.mode == 'spiking':

            # apply respective biases to neurons in the population
            J = TT.as_tensor_variable(np.array(self.bias))

            for ei in self.encoded_input.values():
                # add its values directly to the input current
                J += (ei.value.T * self.alpha.T).T
                updates.update(ei.update(dt))

            # only do this if there is decoded_input
            if X is not None:
                # add to input current for each neuron as
                # represented input signal x preferred direction
                for i in range(self.array_size):
                    J = TT.inc_subtensor(
                        J[i], TT.dot(X[i], self.shared_encoders[i].T))

            # if noise has been specified for this neuron,
            if self.noise:
                # generate random noise values, one for each input_current
                # element, with standard deviation = sqrt(self.noise=std**2)
                # When simulating white noise, the noise process must be
                # scaled by sqrt(dt) instead of dt. Hence, we divide the std
                # by sqrt(dt).
                noise_type = self.noise_type.lower()
                if noise_type == 'gaussian':
                    J += self.srng.normal(
                        size=self.bias.shape, std=np.sqrt(self.noise / dt))
                elif noise_type == 'uniform':
                    J += self.srng.uniform(
                        size=self.bias.shape,
                        low=-self.noise / np.sqrt(dt),
                        high=self.noise / np.sqrt(dt))

            # pass that total into the neuron model to produce
            # the main theano computation
            updates.update(self.neurons.update(J, dt))

            for term in self.learned_terminations:
                # also update the weight matrices on learned terminations
                updates.update(term.update(dt))

            # and compute the decoded origin decoded_input from the neuron
            # output
            for o in self.origin.values():
                updates.update(o.update(dt, updates[self.neurons.output]))

        if self.mode == 'direct':
            # if we're in direct mode then just directly pass the
            # decoded_input to the origins for decoded_output
            for o in self.origin.values():
                if o.func is None:
                    if len(self.decoded_input) > 0:
                        updates.update(OrderedDict({
                            o.decoded_output:
                                TT.flatten(X).astype('float32')
                        }))
        return updates
class HardGatedRecurrent(BaseRecurrent, Initializable):
    """Recurrent transition whose state update is gated stochastically.

    At each step a candidate state ``activation(states . W + inputs)`` is
    computed; it replaces the previous state only where a uniform random
    draw is less than or equal to the gate value produced by ``mlp``.

    Parameters
    ----------
    dim : int
        Dimension of the hidden state.
    activation : brick, optional
        Applied to the candidate state; a ``Tanh()`` brick is used when
        nothing is given.
    mlp : brick
        Produces the gate value from the concatenated inputs and states.
        Per the original author's note, its activation should be Logistic.
    """

    @lazy(allocation=['dim'])
    def __init__(self, dim, activation=None, mlp=None, **kwargs):
        super(HardGatedRecurrent, self).__init__(**kwargs)
        self.dim = dim

        if not activation:
            activation = Tanh()
        self.activation = activation

        # The activation of the mlp should be a Logistic function
        self.mlp = mlp

        # The random stream
        self.randomstream = MRG_RandomStreams()

        self.children = [activation, mlp]

    @property
    def state_to_state(self):
        """The (dim, dim) state transition matrix."""
        return self.params[0]

    @property
    def matrix_gate(self):
        # NOTE(review): _allocate stores the parameter named "initial_state"
        # at index 1 and allocates no separate gate matrix, so this property
        # returns the initial state. Kept unchanged for compatibility.
        return self.params[1]

    def get_dim(self, name):
        """Return the dimension of the named input/state."""
        if name == 'mask':
            return 0
        if name in ['inputs', 'states']:
            return self.dim
        return super(HardGatedRecurrent, self).get_dim(name)

    def _allocate(self):
        """Allocate the transition matrix and the initial state vector."""
        self.params.append(
            shared_floatx_nans((self.dim, self.dim), name='state_to_state'))
        self.params.append(
            shared_floatx_zeros((self.dim, ), name="initial_state"))
        add_role(self.params[0], WEIGHT)
        add_role(self.params[1], INITIAL_STATE)

    def _initialize(self):
        """Initialize the transition matrix with ``weights_init``."""
        self.weights_init.initialize(self.state_to_state, self.rng)

    @recurrent(sequences=['mask', 'inputs'], states=['states'],
               outputs=['states'], contexts=[])
    def apply(self, inputs, states, mask=None):
        """Apply the gated recurrent transition.

        Parameters
        ----------
        states : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of current states in the shape
            (batch_size, dim). Required for `one_step` usage.
        inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs in the shape (batch_size,
            dim)
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array in the shape (batch,) which is 1 if there is
            data available, 0 if not. Assumed to be 1-s only if not given.

        Returns
        -------
        output : :class:`~tensor.TensorVariable`
            Next states of the network.

        """
        # Concatenate the inputs of the MLP
        mlp_input = tensor.concatenate((inputs, states), axis=1)

        # Compute the output of the MLP
        gate_value = self.mlp.apply(mlp_input)
        # a single draw, shared by every row of the batch
        random = self.randomstream.uniform((1, ))

        # TODO: Find a way to remove the following "hack".
        # Simply removing the two next lines won't work
        gate_value = gate_value[:, 0]
        gate_value = gate_value[:, None]

        # Compute the next_states value, before gating
        next_states = self.activation.apply(
            states.dot(self.state_to_state) + inputs)

        # Apply the gating
        next_states = tensor.switch(tensor.le(random[0], gate_value),
                                    next_states, states)

        # Explicit None test: the truth value of a symbolic variable is not
        # a reliable "was a mask supplied" check.
        if mask is not None:
            next_states = (mask[:, None] * next_states +
                           (1 - mask[:, None]) * states)

        return next_states

    @application(outputs=apply.states)
    def initial_states(self, batch_size, *args, **kwargs):
        """Return the initial state repeated ``batch_size`` times."""
        # _allocate appends exactly two parameters; the initial state is the
        # second one (index 1). Index 2 does not exist.
        return [tensor.repeat(self.params[1][None, :], batch_size, 0)]
def test_uniform():
    """Check MRG uniform sampling on CPU against basic statistics.

    Each case is compiled with both the MRG stream and the numpy-backed
    ``RandomStreams``, and the samples are passed to ``check_basics``.
    Sizes are given as a constant tuple, as a symbolic shape, as a mixed
    tuple and as the empty (scalar) shape.
    """
    # TODO: test param low, high
    # TODO: test size=None
    # TODO: test ndim!=size.ndim
    # TODO: test bad seed
    # TODO: test size=Var, with shape that change from call to call
    slow_mode = (
        config.mode in ["DEBUG_MODE", "DebugMode", "FAST_COMPILE"]
        or (config.mode == "Mode" and config.linker in ["py"])
    )
    if slow_mode:
        sample_size = (10, 100)
        steps = 50
    else:
        sample_size = (500, 50)
        steps = int(1e3)

    x = tensor.matrix()
    cases = [
        (sample_size, sample_size, [], []),
        (x.shape, sample_size, [x],
         [np.zeros(sample_size, dtype=config.floatX)]),
        ((x.shape[0], sample_size[1]), sample_size, [x],
         [np.zeros(sample_size, dtype=config.floatX)]),
        # test empty size (scalar)
        ((), (), [], []),
    ]
    # `x` is deliberately not rebound inside the loop: the cases above were
    # built from this variable, and a fresh matrix would only shadow it.
    for size, const_size, var_input, input_values in cases:

        # TEST CPU IMPLEMENTATION
        # The python and C implementation are tested with DebugMode
        R = MRG_RandomStreams(234)
        # Note: we specify `nstreams` to avoid a warning.
        # TODO Look for all occurrences of `guess_n_streams` and `30 * 256`
        # for such situations: it would be better to instead filter the
        # warning using the warning module.
        u = R.uniform(size=size,
                      nstreams=rng_mrg.guess_n_streams(size, warn=False))
        f = theano.function(var_input, u)
        assert any(
            isinstance(node.op, theano.sandbox.rng_mrg.mrg_uniform)
            for node in f.maker.fgraph.toposort()
        )
        f(*input_values)

        # Increase the number of steps if sizes implies only a few samples
        steps_ = steps * 100 if np.prod(const_size) < 10 else steps

        check_basics(f, steps_, const_size, prefix="mrg cpu",
                     inputs=input_values)

        RR = theano.tensor.shared_randomstreams.RandomStreams(234)

        uu = RR.uniform(size=size)
        ff = theano.function(var_input, uu)
        # It's not our problem if numpy generates 0 or 1
        check_basics(ff, steps_, const_size, prefix="numpy",
                     allow_01=True, inputs=input_values)
def _pokemon_wgan_gp():
    """Build and run a WGAN-GP training loop on the Pokemon dataset.

    Points FUEL_DATA_PATH at ``./data/``, builds the generator and
    discriminator graphs with a gradient penalty, applies dropout to the
    generator's linear inputs, and runs the Blocks main loop with
    monitoring, checkpointing and periodic sample generation.
    """
    import os
    os.environ["FUEL_DATA_PATH"] = os.getcwd() + "/data/"

    batch_size = 20
    features_size = 56 * 56 * 1

    data_train = PokemonGenYellowNormal(which_sets=['train'],
                                        sources=['features'])
    scheme = SequentialScheme(data_train.num_examples, batch_size)
    train_stream = Flatten(
        DataStream.default_stream(data_train, iteration_scheme=scheme))

    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.)
    }

    # rescale the raw features from [0, 255] to [-1, 1]
    inputs = T.matrix('features')
    inputs = ((inputs / 255.) * 2. - 1.)

    rng = MRG_RandomStreams(123)

    prior = Z_prior(dim=512)
    gen = Generator(input_dim=512,
                    dims=[512, 512, 512, 512, features_size],
                    alpha=0.1, **inits)
    dis = Discriminator(dims=[features_size, 512, 512, 512, 512],
                        alpha=0.1, **inits)

    gan = GAN(dis=dis, gen=gen, prior=prior)
    gan.initialize()

    # gradient penalty, evaluated on random mixes of fake and real samples
    n_rows = inputs.shape[0]
    fake_samples, _ = gan.sampling(n_rows)
    mix = rng.uniform(size=(inputs.shape[0], 1))
    mixed_input = (mix * fake_samples) + (1 - mix) * inputs

    output_d_mixed = gan._dis.apply(mixed_input)
    grad_mixed = T.grad(T.sum(output_d_mixed), mixed_input)
    norm_grad_mixed = T.sqrt(T.sum(T.square(grad_mixed), axis=1))
    grad_penalty = T.mean(T.square(norm_grad_mixed - 1))

    y_hat1, y_hat0, z = gan.apply(inputs)

    d_loss_real = y_hat1.mean()
    d_loss_fake = y_hat0.mean()

    dis_obj = - d_loss_real + d_loss_fake + 10 * grad_penalty
    gen_obj = - d_loss_fake

    model = Model([y_hat0, y_hat1])

    em_loss = -d_loss_real + d_loss_fake

    em_loss.name = "Earth Move loss"
    dis_obj.name = 'Discriminator loss'
    gen_obj.name = 'Generator loss'

    cg = ComputationGraph([gen_obj, dis_obj])

    gen_params = VariableFilter(
        roles=[PARAMETER], bricks=gen.linear_transformations)(cg.variables)
    dis_params = VariableFilter(
        roles=[PARAMETER], bricks=dis.linear_transformations)(cg.variables)

    # Prepare the dropout (generator inputs only)
    dropout_inputs = list(
        VariableFilter(roles=[INPUT],
                       bricks=gen.linear_transformations)(cg.variables))
    cg_dropout = apply_dropout(cg, dropout_inputs, 0.02)

    gen_obj, dis_obj = cg_dropout.outputs[0], cg_dropout.outputs[1]

    gan.dis_params = dis_params
    gan.gen_params = gen_params

    algo = AdverserialTraning(gen_obj=gen_obj, dis_obj=dis_obj,
                              model=gan, dis_iter=5, gradient_clip=None,
                              step_rule=RMSProp(learning_rate=1e-4),
                              gen_consider_constant=z)

    neg_sample = gan.sampling(size=25)

    from blocks.monitoring.aggregation import mean

    monitored = [mean(gen_obj), mean(dis_obj), mean(em_loss)]
    monitor = TrainingDataMonitoring(variables=monitored,
                                     prefix="train", after_batch=True)

    run_name = 'pokemon-wgan-gp'
    subdir = './exp/' + run_name + "-" + time.strftime("%Y%m%d-%H%M%S")

    check_point = Checkpoint("{}/{}".format(subdir, 'CIFAR10'),
                             every_n_epochs=100,
                             save_separately=['log', 'model'])

    neg_sampling = GenerateNegtiveSample(neg_sample,
                                         img_size=(25, 56, 56),
                                         every_n_epochs=10)

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    extensions = [Printing(), ProgressBar(), monitor,
                  check_point, neg_sampling]
    main_loop = MainLoop(algorithm=algo, model=model,
                         data_stream=train_stream,
                         extensions=extensions)

    main_loop.run()
class Graph:
    """Holds an MRG random stream and one symbolic uniform draw from it."""

    def __init__(self, seed=123):
        """Create the stream from ``seed`` and a length-1 uniform sample.

        :param seed: seed forwarded to ``MRG_RandomStreams``
        """
        stream = MRG_RandomStreams(seed)
        self.rng = stream
        # symbolic vector of one uniform sample drawn from the stream
        self.y = stream.uniform(size=(1, ))
class NNClassifier_theano:
    """
    NNClassifier_theano is a custom class for mass spectra classification.
    It is essentially a wrapper for nolearn and processes the hyperparameters
    given by the neuralNetInterface.

    :param architecture: lasagne.layers.Layer object
    :param hyperparameter: dictionary of hyperparameter-value-pairs; missing
        entries are filled in with defaults (the dict passed in is updated
        in place)
    """

    # keys under which the seven monitored losses are logged, in the order
    # the compiled loss function returns them
    _LOSS_KEYS = ('cL', 'l1', 'l2', 'loL', 'loSqL', 'prL', 'lsL')

    def __init__(self, architecture, hyperparameter=None):
        self.archi = architecture
        # A fresh dict per instance: a `{}` default would be shared by all
        # instances and silently filled by the setdefault calls below.
        self.hyperp = {} if hyperparameter is None else hyperparameter
        self._srng = RandomStreams(get_rng().randint(
            1, 2147462579))  # for adaptive noise
        self._srng2 = rStream(2147462579)

        # Create nolearn ModifiedNeuralNet object
        self.classifier = ModifiedNeuralNet(
            layers=self.archi,
            max_epochs=self.hyperp.setdefault('epochs', 100),
            update=self.hyperp.setdefault('optimizer', lasagne.updates.adam),
            update_learning_rate=self.hyperp.setdefault('learningRate',
                                                        0.001),
            objective=modifiedObjective,
            objective_logitSens=self.hyperp.setdefault('logitSens', 0.),
            objective_probSens=self.hyperp.setdefault('probSens', 0.),
            objective_lossSens=self.hyperp.setdefault('lossSens', 0.),
            objective_std=self.hyperp.setdefault('trainingDataStd', None),
            objective_loss_function=categorical_crossentropy,
            verbose=0,
            batch_iterator_train=DataAugmentationBatchIterator(
                self.hyperp.setdefault('batchSize', 64),
                disturbLabelRate=self.hyperp.setdefault('disturbLabelRate',
                                                        0),
                sdWidth=self.hyperp.setdefault('sdWidth', 0),
                sdNumber=self.hyperp.setdefault('sdNumber', 0),
                shuffle=True),
            batch_iterator_test=nolearn.lasagne.BatchIterator(
                self.hyperp.setdefault('batchSize', 64), shuffle=False),
            train_split=TrainSplit(eval_size=self.hyperp.setdefault(
                'validationSetRatio', .1)),
            objective_l1=self.hyperp.setdefault('l1', 0.),
            objective_l2=self.hyperp.setdefault('l2', 0.01),
            on_training_started=[nolearn.lasagne.PrintLayerInfo()],
            on_epoch_finished=[getIndividualLosses,
                               printError,
                               addEndTimeToHistory,
                               printAdaptiveNoise,
                               saveBestValidNet])
        self.classifier.initialize()

    def _averageLosses(self, lossOutputs, batches, numPoints):
        """Return the size-weighted mean of each loss over ``batches``.

        :param lossOutputs: callable ``(bX, by) -> sequence of 7 losses``
        :param batches: iterable of ``(bX, by)`` pairs
        :param numPoints: total number of samples used as the divisor
        :returns: list of the 7 averaged losses, ordered as ``_LOSS_KEYS``
        """
        totals = [0] * len(self._LOSS_KEYS)
        for bX, by in batches:
            stdout.flush()
            batchSize = bX.shape[0]
            batchLosses = lossOutputs(bX, by)
            totals = [total + loss * batchSize
                      for total, loss in zip(totals, batchLosses)]
        return [total / numPoints for total in totals]

    def trainModel(self, data, classes):
        """Train the network and return it wrapped in ``NNModel_theano``.

        A ``validationSetRatio`` fraction of the (shuffled) data is held out
        for validation. Per-epoch losses are printed and appended to
        ``self.classifier.train_history_``.

        :param data: training inputs, indexable by an integer array
        :param classes: labels aligned with ``data``
        """
        validationSetRatio = self.hyperp.setdefault('validationSetRatio', .1)
        if validationSetRatio != 0:
            initialShuffleIndices = np.arange(len(data), dtype='int32')
            np.random.shuffle(initialShuffleIndices)
            data = data[initialShuffleIndices]
            classes = classes[initialShuffleIndices]
            numOfValPoints = np.int32(
                np.floor(validationSetRatio * len(data)))
            validationData = data[:numOfValPoints]
            validationClasses = classes[:numOfValPoints]
            data = data[numOfValPoints:]
            classes = classes[numOfValPoints:]

        y = T.ivector()
        verbose = self.hyperp.setdefault('verbose', 1.)
        maxEpochs = self.hyperp.setdefault('epochs', 100)
        optimizer = self.hyperp.setdefault('optimizer', lasagne.updates.adam)
        learningRate = self.hyperp.setdefault('learningRate', 0.001)

        # Defaulted like every other hyperparameter instead of raising
        # KeyError when the caller did not set it.
        useSensRegControl = self.hyperp.setdefault('useSensRegControl',
                                                   False)
        # with the control enabled the regularizers are ramped in from 0
        regMultiplier = theano.shared(0. if useSensRegControl else 1.)
        # RENAME THIS GLOBALLY
        std = self.hyperp.setdefault('trainingDataStd', None)

        biTrain = nolearn.lasagne.BatchIterator(
            self.hyperp.setdefault('batchSize', 64), shuffle=True)
        biVal = nolearn.lasagne.BatchIterator(
            self.hyperp.setdefault('batchSize', 64), shuffle=False)

        layers = self.classifier.layers_
        outputLayer = layers[-1]
        inputLayer = layers[0]
        networkInput = inputLayer.input_var

        ###### FOR DEBUGGING, DELETE LATER ########
        print(self.hyperp)

        # computation of losses for training (not deterministic)
        (classificationLoss, totalLoss, l1Loss, l2Loss, logitSensLoss,
         logitDiffSensLoss, logitSqSensLoss, probSensLoss,
         lossSensLoss) = self.computeLosses(
            y, std, regMultiplier, deterministic=False)
        # computation of losses for output and cleaning (deterministic)
        (classificationLossVal, totalLossVal, l1LossVal, l2LossVal,
         logitSensLossVal, logitDiffSensLossVal, logitSqSensLossVal,
         probSensLossVal, lossSensLossVal) = self.computeLosses(
            y, std, regMultiplier, deterministic=True)

        params = lasagne.layers.get_all_params(outputLayer, trainable=True)
        updates = optimizer(totalLoss, params, learning_rate=learningRate)

        # adaptive noise regularization
        advNoise = self.hyperp.setdefault('adversarialNoise', 0.)
        if advNoise > 0:
            bXAdap = self.__adversarialNoise(std, advNoise)
            print('Compiling DeepFool graph...')
            deepFoolFunction = theano.function([networkInput], bXAdap)

        print('Compiling neural network graph...')
        trainLoopFunction = theano.function([networkInput, y], totalLoss,
                                            updates=updates)
        print('Compiling output graph...')
        lossOutputs = theano.function(
            [networkInput, y],
            [classificationLossVal, l1LossVal, l2LossVal, logitSensLossVal,
             logitSqSensLossVal, probSensLossVal, lossSensLossVal])

        print('Beginning training loop...')
        self.classifier.train_history_ = list()
        for epoch in range(maxEpochs):
            ########################
            #    TRAINING DATA     #
            ########################
            trainBatches = biTrain(data, classes)
            out = ""
            for bX, by in trainBatches:
                if advNoise > 0:
                    # adaptive noise on the network inputs
                    sizeBatch = bX.shape[0]
                    # DeepFool
                    bXFool = np.copy(bX)
                    # create dataset with normal and adversarial examples
                    # Parameter: p (probability a sample is perturbed)
                    p = 1
                    randFool = np.random.binomial(
                        1, p, sizeBatch).astype('bool')
                    bXFool = bXFool[randFool, :, :, :]
                    # call deepFool on subset of samples
                    bXadap = deepFoolFunction(bXFool)
                    # stack noise samples and unchanged samples
                    bXNormal = bX[~randFool, :, :, :]
                    bXCombined = np.vstack((bXadap, bXNormal))
                    # stack labels
                    byFool = by[randFool]
                    byNormal = by[~randFool]
                    byCombined = np.hstack((byFool, byNormal))
                    # call training function
                    totalError = trainLoopFunction(bXCombined, byCombined)
                else:
                    totalError = trainLoopFunction(bX, by)

            trainLosses = [0] * len(self._LOSS_KEYS)
            if verbose == 1:
                trainLosses = self._averageLosses(
                    lossOutputs, trainBatches, len(data))
                out = "Training set  : cL: %5.5f, l1: %5.5f, l2: %5.5f, logitSens: %5.5f, logitSqSens: %5.5f, probSens: %5.5f, lossSens: %5.5f" % tuple(trainLosses)
            if verbose == 0:
                out = "Total loss (last batch): %5.5f" % (totalError)
            stdout.flush()
            logDict = dict(zip(self._LOSS_KEYS, trainLosses))
            print("Epoch ", epoch + 1, ":")
            print(out)

            if useSensRegControl:
                # ramp the regularization weight linearly from 0 to 1
                # between 25% and 75% of the epochs, then hold it at 1
                if epoch > .25 * maxEpochs and epoch < .75 * maxEpochs:
                    t = float(epoch - maxEpochs / 4.) / (maxEpochs / 2.)
                    regMultiplier.set_value(t)
                if epoch >= .75 * maxEpochs:
                    regMultiplier.set_value(1)

            ########################
            #   VALIDATION DATA    #
            ########################
            if validationSetRatio != 0:
                valBatches = biVal(validationData, validationClasses)
                valLosses = self._averageLosses(
                    lossOutputs, valBatches, len(validationData))
                for key, value in zip(self._LOSS_KEYS, valLosses):
                    logDict['val_' + key] = value
                out = "Validation set: cL: %5.5f, l1: %5.5f, l2: %5.5f, logitSens: %5.5f, logitSqSens: %5.5f, probSens: %5.5f, lossSens: %5.5f" % tuple(valLosses)
                print(out)
            self.classifier.train_history_.append(logDict)
            # Check if the current training epoch is the best model
            # getBestValidNet(self.classifier)
        return NNModel_theano(self.classifier)

    def __adversarialNoise(self, std, advNoise):
        """Build the symbolic adversarially perturbed network input.

        TODO: currently only works for 2 classes !!!

        :param std: optional scaling applied to the input gradient, or None
        :param advNoise: upper bound of the random noise scale; the lower
            bound is ``-advNoise / 2``
        :returns: symbolic perturbed input, clipped to be non-negative
        """
        layers = self.classifier.layers_
        outputLayer = layers[-2]
        inputLayer = layers[0]
        networkInput = inputLayer.input_var
        networkOutput = get_output(outputLayer, deterministic=True)

        # difference of logits (only works for 2-classes !!!!)
        fX = networkOutput[:, 0] - networkOutput[:, 1]
        g_logit = T.grad(T.sum(fX), networkInput)
        # scale gradient by std
        if std is not None:
            g_logit = std * g_logit
        # l2-norm squared
        grad_l2 = T.sum(T.sqr(g_logit), axis=(1, 2, 3)) + 1e-12
        # scale again by std due to division by squared l2-norm of gradient;
        # without std the gradient is used as is (previously this name was
        # left undefined in that case and raised NameError)
        if std is not None:
            sqrG_logit = std * g_logit
        else:
            sqrG_logit = g_logit
        r = -(sqrG_logit.dimshuffle(1, 2, 3, 0) *
              (fX / grad_l2)).dimshuffle(3, 0, 1, 2)
        # random scaling of noise
        randTensor = self._srng.uniform(size=(networkInput.shape[0], ),
                                        low=-advNoise / 2., high=advNoise)
        addedTerm = (r.dimshuffle(2, 3, 1, 0) * randTensor).dimshuffle(
            3, 2, 0, 1)
        # projection onto non-negative values
        bXFool = T.maximum(networkInput + addedTerm, 0)
        return bXFool

    def computeLosses(self, y, std, regMultiplier, deterministic):
        """Build the symbolic classification and regularization losses.

        :param y: symbolic integer label vector
        :param std: optional scaling applied to input gradients, or None
        :param regMultiplier: shared scalar multiplying every regularizer
        :param deterministic: forwarded to ``get_output``
        :returns: tuple ``(classificationLoss, totalLoss, l1Loss, l2Loss,
            logitSensLoss, logitDiffSensLoss, logitSqSensLoss,
            probSensLoss, lossSensLoss)``
        """
        logitSens = self.hyperp.setdefault('logitSens', 0.)
        logitDiffSens = self.hyperp.setdefault('logitDiffSens', 0.)
        logitSqSens = self.hyperp.setdefault('logitSqSens', 0.)
        probSens = self.hyperp.setdefault('probSens', 0.)
        lossSens = self.hyperp.setdefault('lossSens', 0.)
        l1 = self.hyperp.setdefault('l1', 0.)
        l2 = self.hyperp.setdefault('l2', 0.)

        layers = self.classifier.layers_
        lossFunction = lasagne.objectives.categorical_crossentropy
        aggregate = T.mean  # otherwise lasagne.objectives.aggregate
        outputLayer = layers[-1]
        logitLayer = layers[-2]
        inputLayer = layers[0]
        networkInput = inputLayer.input_var
        networkOutput = get_output(outputLayer, deterministic=deterministic)
        logitOutput = get_output(logitLayer, deterministic=deterministic)

        ######################################################################
        # Very weird thing:
        # lossSensitivity gradients can only be computed if the one-hot
        # encoded version of the loss function is used. BUT that version
        # lacks a stability optimization in Theano that leads to NaNs during
        # training. This is why both versions need to be employed here.
        L = lossFunction(networkOutput, y)
        y_oneHot = lasagne.utils.one_hot(y, outputLayer.output_shape[1])
        L_oneHot = lossFunction(networkOutput, y_oneHot)
        ######################################################################
        classificationLoss = aggregate(L)
        l1Loss = regularization.regularize_layer_params(
            layers.values(), regularization.l1)
        l2Loss = regularization.regularize_layer_params(
            layers.values(), regularization.l2)

        # logit sensitivity
        logit = T.sum(logitOutput * y_oneHot, axis=1)
        G_logit = T.grad(T.sum(logit), networkInput)
        if std is not None:
            G_logit = std * G_logit

        # Sparse logit saliency regularization
        absG_logit = T.abs_(G_logit)
        sumAbsG_logit = T.sum(absG_logit, axis=(1, 2, 3))
        logitSensLoss = aggregate(sumAbsG_logit)

        # Squared logit saliency regularization
        sqG_logit = G_logit**2
        sumSqG_logit = T.sum(sqG_logit, axis=(1, 2, 3))
        logitSqSensLoss = aggregate(sumSqG_logit)

        # probability sensitivity
        prob = T.sum(networkOutput * y_oneHot, axis=1)
        G_prob = T.grad(T.sum(prob), networkInput)
        if std is not None:
            G_prob = std * G_prob

        # Sparse probability saliency regularization
        absG_prob = T.abs_(G_prob)
        sumAbsG_prob = T.sum(absG_prob, axis=(1, 2, 3))
        probSensLoss = aggregate(sumAbsG_prob)

        # Loss sensitivity
        G_loss = theano.grad(T.sum(L_oneHot), networkInput)
        if std is not None:
            G_loss = std * G_loss
        absG_loss = T.abs_(G_loss)
        sumAbsG_loss = T.sum(absG_loss, axis=(1, 2, 3))
        lossSensLoss = aggregate(sumAbsG_loss)

        ####### !!!!!!!!!!!!!!!!!!! EXPERIMENTAL !!!!!!!!!!!!!!!!!! ##########
        #### !!!! only makes sense for 2-class problems in this case !!!! ####
        # Clumsy way to regularize logit differences
        # It works by replacing the matrix of one-hot encoded labels by one
        # with a single column of ones and the rest minus ones. After
        # summing over each row, we are left with the difference of the
        # logit of that class and the (sum of the) other class(es).
        plusMinusOneMatrix = 2 * lasagne.utils.one_hot(
            1, outputLayer.output_shape[1]) - T.ones_like(y_oneHot)
        logitDiff = T.sum(logitOutput * plusMinusOneMatrix, axis=1)
        G_logitDiff = T.grad(T.sum(logitDiff), networkInput)
        if std is not None:
            G_logitDiff = std * G_logitDiff
        absG_logitDiff = T.abs_(G_logitDiff)
        sumAbsG_logitDiff = T.sum(absG_logitDiff, axis=(1, 2, 3))
        logitDiffSensLoss = aggregate(sumAbsG_logitDiff)

        # Sum up; terms with a zero weight are left out of the graph
        totalLoss = classificationLoss
        if l1:
            totalLoss += regMultiplier * l1 * l1Loss
        if l2:
            totalLoss += regMultiplier * l2 * l2Loss
        if logitSens:
            totalLoss += regMultiplier * logitSens * logitSensLoss
        if logitDiffSens:
            totalLoss += regMultiplier * logitDiffSens * logitDiffSensLoss
        if logitSqSens:
            totalLoss += regMultiplier * logitSqSens * logitSqSensLoss
        if probSens:
            totalLoss += regMultiplier * probSens * probSensLoss
        if lossSens:
            totalLoss += regMultiplier * lossSens * lossSensLoss

        return (classificationLoss, totalLoss, l1Loss, l2Loss, logitSensLoss,
                logitDiffSensLoss, logitSqSensLoss, probSensLoss,
                lossSensLoss)
def test_undefined_grad():
    """Gradients through MRG random samples must be reported as undefined.

    For each sampler exercised below (uniform, binomial, multinomial,
    choice, normal, truncated normal), requesting the gradient of a sample
    with respect to one or more distribution parameters is expected to
    raise ``theano.gradient.NullTypeGradError``.
    """
    null_grad_error = theano.gradient.NullTypeGradError

    def expect_null_grad(build_grad):
        # The whole gradient expression is built inside the ``raises``
        # context, exactly as if it were written inline.
        with pytest.raises(null_grad_error):
            build_grad()

    srng = MRG_RandomStreams(seed=1234)

    # --- uniform: lower bound, upper bound, then both together ---
    low = tensor.scalar()
    sample = srng.uniform((), low=low)
    expect_null_grad(lambda: theano.grad(sample, low))

    high = tensor.scalar()
    sample = srng.uniform((), low=0, high=high)
    expect_null_grad(lambda: theano.grad(sample, high))

    sample = srng.uniform((), low=low, high=high)
    expect_null_grad(lambda: theano.grad(sample, (low, high)))

    # --- binomial ---
    prob = tensor.scalar()
    sample = srng.binomial((), p=prob)
    expect_null_grad(lambda: theano.grad(sample, prob))

    # --- multinomial ---
    prob1 = tensor.scalar()
    prob2 = tensor.scalar()

    pvals = [theano.tensor.as_tensor_variable([prob1, 0.5, 0.25])]
    sample = srng.multinomial(size=None, pvals=pvals, n=4)[0]
    expect_null_grad(lambda: theano.grad(theano.tensor.sum(sample), prob1))

    pvals = [theano.tensor.as_tensor_variable([prob1, prob2])]
    sample = srng.multinomial(size=None, pvals=pvals, n=4)[0]
    expect_null_grad(
        lambda: theano.grad(theano.tensor.sum(sample), (prob1, prob2)))

    # --- choice (sampling without replacement) ---
    pvals = [theano.tensor.as_tensor_variable([prob1, prob2, 0.1, 0.2])]
    sample = srng.choice(a=None, size=1, p=pvals, replace=False)[0]
    expect_null_grad(lambda: theano.grad(sample[0], (prob1, prob2)))

    pvals = [theano.tensor.as_tensor_variable([prob1, prob2])]
    sample = srng.choice(a=None, size=1, p=pvals, replace=False)[0]
    expect_null_grad(lambda: theano.grad(sample[0], (prob1, prob2)))

    pvals = [theano.tensor.as_tensor_variable([prob1, 0.2, 0.3])]
    sample = srng.choice(a=None, size=1, p=pvals, replace=False)[0]
    expect_null_grad(lambda: theano.grad(sample[0], prob1))

    # --- normal, then truncated normal: same three checks for each ---
    for sampler in (srng.normal, srng.truncated_normal):
        avg = tensor.scalar()
        sample = sampler((), avg=avg)
        expect_null_grad(lambda: theano.grad(sample, avg))

        std = tensor.scalar()
        sample = sampler((), avg=0, std=std)
        expect_null_grad(lambda: theano.grad(sample, std))

        sample = sampler((), avg=avg, std=std)
        expect_null_grad(lambda: theano.grad(sample, (avg, std)))
def gamma_dist(self, alpha, beta, num_MC):
    """Build a symbolic Monte-Carlo sample of a gamma-like variable.

    Draws an ``(alpha, num_MC)`` block of uniforms in ``[1e-10, 1.0)``,
    takes their logs, sums over the first axis, negates the sum and divides
    by ``beta``.  Since ``-log(U)`` is a unit exponential, each of the
    ``num_MC`` results is a sum of ``alpha`` exponentials scaled by
    ``1 / beta``.

    NOTE(review): this presumably requires ``alpha`` to be an integer
    because it is used as a dimension of the sample -- confirm with callers.
    The stream is re-created with the fixed seed 234 on every call.
    """
    stream = RandomStreams(seed=234)
    uniform_draws = stream.uniform(size=(alpha, num_MC), low=1e-10, high=1.0)
    log_total = T.sum(T.log(uniform_draws), 0)
    negated = log_total * (-1)
    return negated / beta
colorImg=colorImg, scale=generation_scale, save_path=os.path.join(outfolder, 'x_l_' + str(ssl_para_seed) + '_AT-JD.png')) n_batches_train_u_c = int(x_unlabelled.shape[0] / batch_size_u_c) n_batches_train_l_c = int(x_labelled.shape[0] / batch_size_l_c) n_batches_train_u_d = int(x_unlabelled.shape[0] / batch_size_u_d) n_batches_train_l_d = int(x_labelled.shape[0] / batch_size_l_d) n_batches_train_g = int(x_unlabelled.shape[0] / batch_size_g) n_batches_eval = int(eval_x.shape[0] / batch_size_eval) ''' models ''' # symbols sym_z_image = T.tile(theano_rng.uniform((z_generated, n_z)), (num_classes, 1)) sym_z_rand = theano_rng.uniform(size=(batch_size_g, n_z)) sym_x_u = T.tensor4() sym_x_u_d = T.tensor4() sym_x_u_g = T.tensor4() sym_x_l = T.tensor4() sym_y = T.ivector() sym_y_g = T.ivector() sym_x_eval = T.tensor4() sym_lr = T.scalar() sym_alpha_cla_g = T.scalar() sym_alpha_unlabel_entropy = T.scalar() sym_alpha_unlabel_average = T.scalar() # te sym_lr_cla = T.scalar('separate_lr')
class OptionCritic_Network(): def __init__(self, model_network=None, gamma=0.99, learning_method="rmsprop", actor_lr=0.00025, batch_size=32, input_size=None, learning_params=None, dnn_type=True, clip_delta=0, scale=255., freeze_interval=100, grad_clip=0, termination_reg=0, num_options=8, double_q=False, temp=1, entropy_reg=0, BASELINE=False, **kwargs): x = T.ftensor4() next_x = T.ftensor4() a = T.ivector() o = T.ivector() r = T.fvector() terminal = T.ivector() self.freeze_interval = freeze_interval self.theano_rng = MRG_RandomStreams(1000) self.x_shared = theano.shared( np.zeros(tuple([batch_size] + input_size[1:]), dtype='float32')) self.next_x_shared = theano.shared( np.zeros(tuple([batch_size] + input_size[1:]), dtype='float32')) self.a_shared = theano.shared(np.zeros((batch_size), dtype='int32')) self.o_shared = theano.shared(np.zeros((batch_size), dtype='int32')) self.terminal_shared = theano.shared( np.zeros((batch_size), dtype='int32')) self.r_shared = theano.shared(np.zeros((batch_size), dtype='float32')) state_network = model_network[:-1] termination_network = copy.deepcopy([model_network[-1]]) termination_network[0]["activation"] = "sigmoid" print "NUM OPTIONS --->", num_options termination_network[0]["out_size"] = num_options option_network = copy.deepcopy([model_network[-1]]) option_network[0]["activation"] = "softmax" Q_network = copy.deepcopy([model_network[-1]]) Q_network[0]["out_size"] = num_options self.state_model = Model(state_network, input_size=input_size, dnn_type=dnn_type) self.state_model_prime = Model(state_network, input_size=input_size, dnn_type=dnn_type) output_size = [None, model_network[-2]["out_size"]] self.Q_model = Model(Q_network, input_size=output_size, dnn_type=dnn_type) self.Q_model_prime = Model(Q_network, input_size=output_size, dnn_type=dnn_type) self.termination_model = Model(termination_network, input_size=output_size, dnn_type=dnn_type) self.options_model = MLP3D(num_options, model_network, temp=temp) s = 
self.state_model.apply(x / scale) next_s = self.state_model.apply(next_x / scale) next_s_prime = self.state_model_prime.apply(next_x / scale) termination_probs = self.termination_model.apply( theano.gradient.disconnected_grad(s)) option_term_prob = termination_probs[T.arange(o.shape[0]), o] next_termination_probs = self.termination_model.apply( theano.gradient.disconnected_grad(next_s)) next_option_term_prob = next_termination_probs[T.arange(o.shape[0]), o] termination_sample = T.gt(option_term_prob, self.theano_rng.uniform(size=o.shape)) Q = self.Q_model.apply(s) next_Q = self.Q_model.apply(next_s) next_Q_prime = theano.gradient.disconnected_grad( self.Q_model_prime.apply(next_s_prime)) disc_option_term_prob = theano.gradient.disconnected_grad( next_option_term_prob) action_probs = self.options_model.apply(s, o) sampled_actions = T.argmax(self.theano_rng.multinomial( pvals=action_probs, n=1), axis=1).astype("int32") if double_q: print "TRAINING DOUBLE_Q" y = r + (1 - terminal) * gamma * ( (1 - disc_option_term_prob) * next_Q_prime[T.arange(o.shape[0]), o] + disc_option_term_prob * next_Q_prime[T.arange(next_Q.shape[0]), T.argmax(next_Q, axis=1)]) else: y = r + (1 - terminal) * gamma * ( (1 - disc_option_term_prob) * next_Q_prime[T.arange(o.shape[0]), o] + disc_option_term_prob * T.max(next_Q_prime, axis=1)) y = theano.gradient.disconnected_grad(y) option_Q = Q[T.arange(o.shape[0]), o] td_errors = y - option_Q if clip_delta > 0: quadratic_part = T.minimum(abs(td_errors), clip_delta) linear_part = abs(td_errors) - quadratic_part td_cost = 0.5 * quadratic_part**2 + clip_delta * linear_part else: td_cost = 0.5 * td_errors**2 # critic updates critic_cost = T.sum(td_cost) critic_params = self.Q_model.params + self.state_model.params learning_algo = self.Q_model.get_learning_method( learning_method, **learning_params) grads = T.grad(critic_cost, critic_params) critic_updates = learning_algo.apply(critic_params, grads, grad_clip=grad_clip) # actor updates actor_params = 
self.termination_model.params + self.options_model.params learning_algo = self.termination_model.get_learning_method("sgd", lr=actor_lr) disc_Q = theano.gradient.disconnected_grad(option_Q) disc_V = theano.gradient.disconnected_grad(T.max(Q, axis=1)) term_grad = T.sum(option_term_prob * (disc_Q - disc_V + termination_reg)) entropy = -T.sum(action_probs * T.log(action_probs)) if not BASELINE: policy_grad = - \ T.sum( T.log(action_probs[T.arange(a.shape[0]), a]) * y) - entropy_reg*entropy else: policy_grad = - \ T.sum(T.log(action_probs[T.arange(a.shape[0]), a]) * (y-disc_Q)) - entropy_reg*entropy grads = T.grad(term_grad + policy_grad, actor_params) actor_updates = learning_algo.apply(actor_params, grads, grad_clip=grad_clip) if self.freeze_interval > 1: target_updates = OrderedDict() for t, b in zip( self.Q_model_prime.params + self.state_model_prime.params, self.Q_model.params + self.state_model.params): target_updates[t] = b self._update_target_params = theano.function( [], [], updates=target_updates) self.update_target_params() print "freeze interval:", self.freeze_interval else: print "freeze interval: None" critic_givens = { x: self.x_shared, o: self.o_shared, r: self.r_shared, terminal: self.terminal_shared, next_x: self.next_x_shared } actor_givens = { a: self.a_shared, r: self.r_shared, terminal: self.terminal_shared, o: self.o_shared, next_x: self.next_x_shared } print "compiling...", self.train_critic = theano.function([], [critic_cost], updates=critic_updates, givens=critic_givens) self.train_actor = theano.function([s], [], updates=actor_updates, givens=actor_givens) self.pred_score = theano.function([], T.max(Q, axis=1), givens={x: self.x_shared}) self.sample_termination = theano.function( [s], [termination_sample, T.argmax(Q, axis=1)], givens={o: self.o_shared}) self.sample_options = theano.function([s], T.argmax(Q, axis=1)) self.sample_actions = theano.function([s], sampled_actions, givens={o: self.o_shared}) self.get_action_dist = 
theano.function([s, o], action_probs) self.get_s = theano.function([], s, givens={x: self.x_shared}) print "complete" def update_target_params(self): if self.freeze_interval > 1: self._update_target_params() return def predict_move(self, s): return self.sample_options(s) def predict_termination(self, s, a): self.a_shared.set_value(a) return tuple(self.sample_termination(s)) def get_q_vals(self, x): self.x_shared.set_value(x) return self.pred_score()[:, np.newaxis] def get_state(self, x): self.x_shared.set_value(x) return self.get_s() def get_action(self, s, o): self.o_shared.set_value(o) return self.sample_actions(s) def train_conv_net(self, train_set_x, next_x, options, r, terminal, actions=None, model=""): self.next_x_shared.set_value(next_x) self.o_shared.set_value(options) self.r_shared.set_value(r) self.terminal_shared.set_value(terminal) if model == "critic": self.x_shared.set_value(train_set_x) return self.train_critic() elif model == "actor": self.a_shared.set_value(actions) return self.train_actor(train_set_x) else: print "WRONG MODEL NAME" raise NotImplementedError def save_params(self): return [ self.state_model.save_params(), self.Q_model.save_params(), self.termination_model.save_params(), self.options_model.save_params() ] def load_params(self, values): self.state_model.load_params(values[0]) self.Q_model.load_params(values[1]) self.termination_model.load_params(values[2]) self.options_model.load_params(values[3])
def __init__(self, args):
    """Seed the random generators, build the encoder and declare inputs.

    Reads ``seed``, ``seed_data``, ``batch_size`` and ``z0dim`` from
    ``args``. Builds a conv/pool/conv/pool/dense/dense encoder stack in
    ``self.enc_layers``, loads its weights from ``pretrained/encoder.npz``
    and creates the symbolic variables and empty layer lists used by the
    rest of the class.
    """
    self.args = args

    # Fixed random seeds: one master generator feeds the Theano stream and
    # the Lasagne generator, in that order.
    rng = np.random.RandomState(self.args.seed)
    theano_rng = MRG_RandomStreams(rng.randint(2**15))
    lasagne.random.set_rng(np.random.RandomState(rng.randint(2**15)))
    data_rng = np.random.RandomState(self.args.seed_data)

    # specify pre-trained generator E
    self.enc_layers = [
        LL.InputLayer(shape=(None, 3, 32, 32), input_var=None)
    ]
    # Two identical stages that differ only in the number of filters:
    # 5x5 convolution followed by 2x2 max pooling.
    for num_filters in (64, 128):
        conv = dnn.Conv2DDNNLayer(self.enc_layers[-1],
                                  num_filters, (5, 5),
                                  pad=0,
                                  stride=1,
                                  W=Normal(0.01),
                                  nonlinearity=nn.relu)
        self.enc_layers.append(conv)
        pool = LL.MaxPool2DLayer(self.enc_layers[-1], pool_size=(2, 2))
        self.enc_layers.append(pool)

    self.enc_layer_fc3 = LL.DenseLayer(self.enc_layers[-1],
                                       num_units=256,
                                       nonlinearity=T.nnet.relu)
    self.enc_layers.append(self.enc_layer_fc3)
    self.enc_layer_fc4 = LL.DenseLayer(self.enc_layers[-1],
                                       num_units=10,
                                       nonlinearity=T.nnet.softmax)
    self.enc_layers.append(self.enc_layer_fc4)

    # load pretrained weights for encoder; the arrays are stored under the
    # keys arr_0, arr_1, ... and are restored in that order.
    archive = np.load('pretrained/encoder.npz')
    pretrained_values = []
    for k in range(len(archive.files)):
        pretrained_values.append(archive['arr_{}'.format(k)])
    LL.set_all_param_values(self.enc_layers[-1], pretrained_values)

    # input tensor variables
    #self.G_weights
    #self.D_weights
    self.dummy_input = T.scalar()
    self.G_layers = []
    latent_shape = (self.args.batch_size, self.args.z0dim)
    self.z = theano_rng.uniform(size=latent_shape)
    self.x = T.tensor4()
    self.meanx = T.tensor3()
    self.Gen_x = T.tensor4()
    self.D_layers = []
    self.D_layer_adv = []
    self.D_layer_z_recon = []
    self.gen_lr = T.scalar()  # learning rate
    self.disc_lr = T.scalar()  # learning rate
    self.y = T.ivector()
    self.y_1hot = T.matrix()
    self.Gen_x_list = []
    self.y_recon_list = []
    self.mincost = T.scalar()

    #self.enc_layer_fc3 = self.get_enc_layer_fc3()
    # Deterministic fc3 activations of the encoder for the input ``self.x``.
    self.real_fc3 = LL.get_output(self.enc_layer_fc3,
                                  self.x,
                                  deterministic=True)
class GumbelSoftmax(BaseLayer):
    """
    This class implements the Gumbel-softmax activation.
    See "Categorical Reparameterization with Gumbel-Softmax".
    (Eric Jang, Shixiang Gu, Ben Poole, 2016.)
    """

    def __init__(self, input_shape, temperature_init=0.1, seed=9683):
        """
        This function initializes the class.

        Parameters
        ----------
        input_shape: tuple
            a one-element tuple holding the size of the softmax axis.
            it is used as the second dimension of the Gumbel noise.
        temperature_init: float, default: 0.1
            a positive float value.
            T > 1 makes the output softer, T < 1 makes it sharper.
            T = 1 (ignoring the noise) is the ordinary softmax scaling.
        seed: int, default: 9683
            seed of the random stream that produces the noise.
        """
        super(GumbelSoftmax, self).__init__()

        # check asserts
        assert isinstance(input_shape, tuple) and len(
            input_shape) == 1, '"input_shape" should be a tuple of shape.'
        assert temperature_init > 0, '"temperature_init" should be a positive float.'

        # set members
        self.input_shape = input_shape
        self.temperature_init = temperature_init
        self.rng = MRG(seed)

    def set_shared(self):
        """
        This function overrides the parents' one.
        Set shared variables.

        Shared Variables
        ----------------
        temperature: scalar
            float32 scalar initialised from "temperature_init".
        """
        temperature = np.array(self.temperature_init).astype('float32')
        self.temperature = theano.shared(temperature,
                                         self.name + '_temperature')
        self.temperature.tags = ['temperature', self.name]

    def change_temperature(self, new_temperature):
        """
        This function changes the temperature for softmax.

        Parameters
        ----------
        new_temperature: float
            a positive float value which will be a new temperature.
        """
        # check asserts
        assert new_temperature > 0, '"new_temperature" should be a positive float.'

        # Cast exactly like set_shared does. A plain Python float is only
        # accepted by a float32 shared variable when floatX is float32 or
        # the value is exactly representable, so e.g. 0.1 could be rejected.
        self.temperature.set_value(
            np.array(new_temperature).astype('float32'))

    def get_output(self, input_):
        r"""
        This function overrides the parents' one.
        Adds Gumbel noise to the input energies and applies a
        temperature-scaled softmax.

        Math Expression
        -------------------
        g_k ~ Gumbel(0, 1)
        y_k = exp((x_k + g_k) / T) / \sum_i(exp((x_i + g_i) / T))

        Parameters
        ----------
        input_: TensorVariable
            NOTE(review): presumably of shape (batch_size, input_shape[0])
            so that it matches the noise -- confirm with callers.

        Returns
        -------
        TensorVariable
        """
        # generate random gumbel distribution:
        # g = -log(-log(u)), with 1e-7 added inside both logs to avoid log(0).
        uniform_random = self.rng.uniform(
            (self.batch_size, self.input_shape[0]), 0, 1)
        gumbel_random = -T.log(-T.log(uniform_random + 1e-7) + 1e-7)

        # divide by temperature
        return T.nnet.softmax((input_ + gumbel_random) / self.temperature)
class LSTMGenerator:
    """ A multimodal long short-term memory (LSTM) generator """

    # ========================================================================================
    def __init__(self, params):
        """Allocate the model parameters described by ``params``.

        Creates the numpy initial values (image/word encoders, LSTM
        weights for every layer, decoder, optional class-factored decoder,
        optional auxiliary-input and noise weights), wraps them as Theano
        shared variables in ``self.model_th`` and records which parameter
        names are trained (``self.update_list``) and regularized
        (``self.regularize``).
        """
        image_encoding_size = params.get('image_encoding_size', 128)
        word_encoding_size = params.get('word_encoding_size', 128)

        hidden_size = params.get('hidden_size', 128)
        hidden_depth = params.get('hidden_depth', 1)
        vocabulary_size = params.get('vocabulary_size', -1)
        output_size = params.get('output_size', -1)
        image_feat_size = params.get(
            'image_feat_size', -1)  # size of CNN vectors hardcoded here

        aux_inp_size = params.get('aux_inp_size', -1)

        model = OrderedDict()
        # Recurrent weights: take x_t, h_{t-1}, and bias unit
        # and produce the 3 gates and the input to cell signal
        use_feat_enc = params.get('use_encoder_for', 0)

        # Bit 0 of 'use_encoder_for' set means the image embedding is not
        # owned by this model, so no image encoder is allocated here.
        if not (use_feat_enc & 1):
            model['WIemb'] = initwTh(image_feat_size,
                                     word_encoding_size)  # image encoder
            model['b_Img'] = np.zeros(
                (word_encoding_size)).astype(config.floatX)

        model['Wemb'] = initwTh(vocabulary_size,
                                word_encoding_size)  # word encoder
        model['lstm_W_hid'] = initwTh(hidden_size, 4 * hidden_size)
        model['lstm_W_inp'] = initwTh(word_encoding_size, 4 * hidden_size)

        for i in range(1, hidden_depth):
            model['lstm_W_hid_' + str(i)] = initwTh(hidden_size,
                                                    4 * hidden_size)
            model['lstm_W_inp_' + str(i)] = initwTh(hidden_size,
                                                    4 * hidden_size)

        model['lstm_b'] = np.zeros((4 * hidden_size, )).astype(config.floatX)

        # Decoder weights (e.g. mapping to vocabulary)
        if params.get('class_out_factoring', 0) == 0:
            model['Wd'] = initwTh(hidden_size, output_size)  # decoder
            model['bd'] = np.zeros((output_size, )).astype(config.floatX)
        else:
            clsinfo = params['ixtoclsinfo']
            self.clsinfo = clsinfo
            clsSizes = clsinfo[:, 2] - clsinfo[:, 1]
            self.clsSize = np.zeros(params['nClasses'])
            self.clsOffset = np.zeros(params['nClasses'], dtype=np.int32)
            self.clsSize[clsinfo[:, 0]] = clsSizes
            self.clsOffset[clsinfo[:, 0]] = np.int32(clsinfo[:, 1])
            max_cls_size = np.max(clsSizes)
            self.max_cls_size = max_cls_size
            Wd = np.zeros(
                (params['hidden_size'], params['nClasses'], max_cls_size),
                dtype=config.floatX)
            model['bd'] = np.zeros((1, params['nClasses'], max_cls_size),
                                   dtype=config.floatX)
            for cix in clsinfo[:, 0]:
                Wd[:, cix, :clsSizes[cix]] = initwTh(params['hidden_size'],
                                                     clsSizes[cix])
                # Large negative bias on the padded slots of each class.
                model['bd'][0, cix, clsSizes[cix]:] = -100
            model['Wd'] = Wd

        update_list = [
            'lstm_W_hid', 'lstm_W_inp', 'lstm_b', 'Wd', 'bd', 'Wemb'
        ]
        self.regularize = ['lstm_W_hid', 'lstm_W_inp', 'Wd', 'Wemb']

        if not (use_feat_enc & 1):
            update_list.extend(['WIemb', 'b_Img'])
            self.regularize.extend(['WIemb'])

        if params.get('class_out_factoring', 0) == 1:
            model['WdCls'] = initwTh(hidden_size,
                                     params['nClasses'])  # decoder
            model['bdCls'] = np.zeros(
                (params['nClasses'], )).astype(config.floatX)
            update_list.extend(['WdCls', 'bdCls'])
            self.regularize.extend(['WdCls'])

        # Fix: the deeper layers used to register 'lstm_W_hid_i' twice in
        # update_list and 'lstm_W_inp_i' twice in regularize, so the input
        # weights were never trained and the hidden weights never
        # regularized. Register each weight once in both lists.
        for i in range(1, hidden_depth):
            update_list.append('lstm_W_hid_' + str(i))
            update_list.append('lstm_W_inp_' + str(i))
            self.regularize.append('lstm_W_hid_' + str(i))
            self.regularize.append('lstm_W_inp_' + str(i))

        if params.get('en_aux_inp', 0):
            if params.get('swap_aux', 1) == 1:
                if not (use_feat_enc & 2) or params.get(
                        'encode_gt_sentences', 0):
                    model['WIemb_aux'] = initwTh(
                        aux_inp_size, image_encoding_size)  # image encoder
                    model['b_Img_aux'] = np.zeros(
                        (image_encoding_size)).astype(config.floatX)
                    update_list.append('WIemb_aux')
                    self.regularize.append('WIemb_aux')
                    update_list.append('b_Img_aux')
                model['lstm_W_aux'] = initwTh(image_encoding_size,
                                              4 * hidden_size, 0.00005)
            else:
                model['lstm_W_aux'] = initwTh(aux_inp_size, 4 * hidden_size,
                                              0.001)
            update_list.append('lstm_W_aux')
            self.regularize.append('lstm_W_aux')

        if params.get('gen_input_noise', 0):
            self.noise_dim = params.get('gen_inp_noise_dim', 50)
            model['lstm_W_noise'] = initwTh(self.noise_dim, 4 * hidden_size,
                                            0.001)

        self.model_th = self.init_tparams(model)
        del model
        if params.get('use_gumbel_mse', 0):
            self.usegumbel = theano.shared(1)
            self.gumb_temp = theano.shared(
                numpy_floatX(params.get('gumbel_temp_init', 0.5)))
            #self.model_th['gumb_temp'] = self.gumb_temp
            self.softmax_smooth_factor = theano.shared(
                numpy_floatX(params.get('softmax_smooth_factor', 1.0)))
        else:
            self.usegumbel = theano.shared(0)
        self.update_list = update_list

    # ========================================================================================
    def init_tparams(self, params):
        """Wrap every value of ``params`` in a named Theano shared variable.

        Returns an ``OrderedDict`` with the same keys, in the same order.
        """
        tparams = OrderedDict()
        for kk, pp in params.items():
            tparams[kk] = theano.shared(pp, name=kk)
        return tparams

    # ========================================================================================
    # BUILD LSTM forward propogation model
    def build_model(self, tparams, options, xI=None, xAux=None, attn_nw=None):
        """Build the training graph (cost of a word sequence given an image).

        Parameters: ``tparams`` maps parameter names to shared variables,
        ``options`` is the configuration dict, ``xI`` / ``xAux`` are
        optional pre-built symbolic image / auxiliary inputs (fresh input
        variables are created when they are ``None``) and ``attn_nw`` is
        forwarded to ``basic_lstm_layer``.

        Returns ``(use_noise, inp_list, f_pred_prob, cost, out_list,
        updatesLSTM)`` where ``cost`` is ``[per-sample cost, summed log2
        cost]``.
        """
        self.trng = RandomStreams(int(time.time()))

        # Used for dropout.
        use_noise = theano.shared(numpy_floatX(0.))

        xW = tensor.matrix('xW', dtype='int64')
        mask = tensor.matrix('mask', dtype=config.floatX)

        n_timesteps = xW.shape[0]
        n_samples = xW.shape[1]

        embW = tparams['Wemb'][xW.flatten()].reshape(
            [n_timesteps, n_samples, options['word_encoding_size']])

        if xI is None:
            xI = tensor.matrix('xI', dtype=config.floatX)
            embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img'])
            xI_is_inp = True
        else:
            embImg = xI
            xI_is_inp = False

        if xAux is None:
            xAux = tensor.matrix(
                'xAux',
                dtype=config.floatX) if attn_nw is None else tensor.tensor3(
                    'xAux', dtype=config.floatX)
            if (options.get('swap_aux', 1)) and (attn_nw is None):
                xAuxEmb = tensor.dot(
                    xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
            else:
                xAuxEmb = xAux
            xA_is_inp = True
        else:
            xA_is_inp = False
            if options.get('encode_gt_sentences', 0):
                xAuxEmb = tensor.dot(
                    xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
            else:
                xAuxEmb = xAux

        # The image embedding is fed as the first time step.
        embImg = embImg.reshape([1, n_samples, options['image_encoding_size']])
        emb = tensor.concatenate([embImg, embW], axis=0)

        #This is implementation of input dropout !!
        if options['use_dropout']:
            emb = dropout_layer(emb,
                                use_noise,
                                self.trng,
                                options['drop_prob_encoder'],
                                shp=emb.shape)
            if (options.get('en_aux_inp', 0)) and (attn_nw is None):
                xAuxEmb = dropout_layer(xAuxEmb,
                                        use_noise,
                                        self.trng,
                                        options['drop_prob_aux'],
                                        shp=xAuxEmb.shape)

        # Implement scheduled sampling!
        if options.get('sched_sampling_mode', None) is not None:
            curr_epoch = tensor.scalar(name='curr_epoch', dtype=config.floatX)
            # Assign the probabilities according to the scheduling mode
            if options['sched_sampling_mode'] == 'linear':
                prob = tensor.maximum(
                    options['sslin_min'], options['sched_sampling_const'] -
                    options['sslin_slope'] * curr_epoch)
            elif options['sched_sampling_mode'] == 'exp':
                raise ValueError(
                    'ERROR: %s --> This solver type is not yet supported' %
                    (options['sched_sampling_mode']))
            elif options['sched_sampling_mode'] == 'invsig':
                raise ValueError(
                    'ERROR: %s --> This solver type is not yet supported' %
                    (options['sched_sampling_mode']))
            else:
                raise ValueError(
                    'ERROR: %s --> This scheduling type is unknown' %
                    (options['sched_sampling_mode']))

            # Now to build the mask. We don't want to do this coin toss when
            # feeding in image feature and the start symbol
            sched_mask = self.trng.binomial((n_timesteps - 2, n_samples),
                                            p=prob,
                                            n=1,
                                            dtype='int64')
            sched_mask = tensor.concatenate(
                [sched_mask, tensor.alloc(1, 2, n_samples)], axis=0)
        else:
            sched_mask = []

        #############################################################################################################################
        # This implements core lstm
        rval, updatesLSTM = basic_lstm_layer(tparams,
                                             emb[:n_timesteps, :, :],
                                             xAuxEmb,
                                             use_noise,
                                             options,
                                             prefix=options['generator'],
                                             sched_prob_mask=sched_mask,
                                             attn_nw=attn_nw)
        #############################################################################################################################

        # NOTE1: we are leaving out the first prediction, which was made for the image and is meaningless.
        if options['use_dropout']:
            # XXX : Size given to dropout is missing one dimension. This keeps the dropped units consistent across time!?.
            # ### Is this a good bug ?
            p = dropout_layer(
                sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size']), use_noise, self.trng,
                options['drop_prob_decoder'],
                (n_samples, options['hidden_size']))
        else:
            p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
                       options['hidden_size'])

        if options.get('class_out_factoring', 0) == 1:
            if options.get('cls_diff_layer', 0) == 1:
                pC_inp = dropout_layer(
                    sliceT(rval[0][1:, :, :],
                           options.get('hidden_depth', 1) - 2,
                           options['hidden_size']), use_noise, self.trng,
                    options['drop_prob_decoder'],
                    (n_samples, options['hidden_size']))
            else:
                pC_inp = p

        n_out_samps = (n_timesteps - 1) * n_samples
        if options.get('class_out_factoring', 0) == 0:
            pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
                [n_out_samps, options['output_size']])
            if options.get('use_gumbel_mse', 0) == 0:
                pWSft = tensor.nnet.softmax(pW)
            else:
                w_out = ifelse(
                    self.usegumbel,
                    gumbel_softmax_sample(self.trng,
                                          pW,
                                          self.gumb_temp,
                                          hard=options.get(
                                              'use_gumbel_hard', False)),
                    tensor.nnet.softmax(pW))
                # This is not exactly right, but just testing
                pWSft = w_out

            # Probability assigned to the ground-truth next word.
            totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]
            out_list = [pWSft, totProb, pW]
        else:
            ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
            xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
            if options.get('cls_zmean', 1):
                pW = ((tparams['Wd'][:, xC, :].T *
                       ((p.reshape([1, n_out_samps, options['hidden_size']]) -
                         tparams['WdCls'][:, xC].T))).sum(axis=-1).T +
                      tparams['bd'][:, xC, :])
            else:
                pW = ((tparams['Wd'][:, xC, :].T *
                       (p.reshape([1, n_out_samps, options['hidden_size']]))
                       ).sum(axis=-1).T + tparams['bd'][:, xC, :])
            pWSft = tensor.nnet.softmax(pW[0, :, :])

            pC = (tensor.dot(pC_inp, tparams['WdCls']) +
                  tparams['bdCls']).reshape([n_out_samps, options['nClasses']])
            pCSft = tensor.nnet.softmax(pC)

            # P(word) = P(word | class) * P(class)
            totProb = pWSft[tensor.arange(n_out_samps), ixtoclsinfo_t[xW[1:, :].flatten(), 3]] * \
                pCSft[tensor.arange(n_out_samps), xC]
            out_list = [pWSft, pCSft, totProb, p]

        tot_cost = -(tensor.log(totProb + 1e-10) *
                     mask[1:, :].flatten()).sum()
        tot_pplx = -(tensor.log2(totProb + 1e-10) *
                     mask[1:, :].flatten()).sum()
        cost = [
            tot_cost / tensor.cast(n_samples, dtype=config.floatX), tot_pplx
        ]

        inp_list = [xW, mask]
        if xI_is_inp:
            inp_list.append(xI)

        if options.get('en_aux_inp', 0) and xA_is_inp:
            inp_list.append(xAux)

        if options.get('sched_sampling_mode', None) is not None:
            inp_list.append(curr_epoch)

        f_pred_prob = theano.function([xW, xI, xAux],
                                      out_list,
                                      name='f_pred_prob',
                                      updates=updatesLSTM)

        return use_noise, inp_list, f_pred_prob, cost, out_list, updatesLSTM

    # ========================================================================================
    # Predictor Related Stuff!!
    def prepPredictor(self,
                      model_npy=None,
                      checkpoint_params=None,
                      beam_size=5,
                      xI=None,
                      xAux=None,
                      inp_list_prev=None,
                      per_word_logweight=None):
        """Compile the prediction function (and, if applicable, ``f_eval``).

        If ``model_npy`` is given, numpy weights are copied into the
        existing shared variables with ``zipp``; anything else replaces
        ``self.model_th``. ``checkpoint_params`` is required despite its
        ``None`` default. ``inp_list_prev`` holds symbolic inputs that are
        placed before the generated input list; ``None`` means no extra
        inputs. Sets ``self.beam_size``, ``self.f_pred_th`` and, unless
        ``advers_gen`` is 1, ``self.use_dropout`` and ``self.f_eval``.
        """
        # Copy so the caller's list is never shared or modified.
        prev_inputs = [] if inp_list_prev is None else list(inp_list_prev)

        if model_npy is not None:
            first_value = model_npy[next(iter(model_npy))]
            if isinstance(first_value, np.ndarray):
                zipp(model_npy, self.model_th)
            else:
                self.model_th = model_npy

        #theano.config.exception_verbosity = 'high'
        self.beam_size = beam_size

        # Now we build a predictor model
        if checkpoint_params.get('advers_gen', 0) == 1:
            checkpoint_params['n_gen_samples'] = beam_size
        (inp_list_gen, predLogProb, predIdx, predCand, wOut_emb, updates,
         seq_lengths) = self.build_prediction_model(
             self.model_th,
             checkpoint_params,
             xI,
             xAux,
             per_word_logweight=per_word_logweight)
        self.f_pred_th = theano.function(prev_inputs + inp_list_gen,
                                         [predLogProb, predIdx, predCand],
                                         name='f_pred')

        # Now we build a training model which evaluates cost. This is for the evaluation part in the end
        if checkpoint_params.get('advers_gen', 0) == 0:
            (self.use_dropout, inp_list_gen2, f_pred_prob, cost, predTh,
             updatesLSTM) = self.build_model(self.model_th, checkpoint_params,
                                             xI, xAux)
            self.f_eval = theano.function(prev_inputs + inp_list_gen2,
                                          cost,
                                          name='f_eval')

    # ========================================================================================
    def predict(self, batch, checkpoint_params, ext_inp=None):
        """Run the compiled predictor and decode the beam candidates.

        Parameters
        ----------
        batch: sequence; only ``batch[0]['image']`` is read (``'feat'``
            and, when enabled, ``'aux_inp'``).
        checkpoint_params: dict with the model configuration.
        ext_inp: optional sequence of values passed to the predictor
            before the image/auxiliary features. It is copied, never
            modified.

        Returns
        -------
        ``([Ys], Ax)`` where ``Ax`` is the raw predictor output and ``Ys``
        holds one ``[score, word_indices]`` pair per beam.
        """
        # Fix: the former mutable default ``[]`` was aliased and extended
        # in place, so inputs piled up across calls (and in callers' lists).
        inp_list = [] if ext_inp is None else list(ext_inp)
        use_encoder_for = checkpoint_params.get('use_encoder_for', 0)

        if not use_encoder_for & 1:
            inp_list.append(batch[0]['image']['feat'].reshape(
                1, checkpoint_params['image_feat_size']).astype(
                    config.floatX))
        if not use_encoder_for & 2:
            if checkpoint_params.get('en_aux_inp', 0):
                inp_list.append(batch[0]['image']['aux_inp'].reshape(
                    1,
                    checkpoint_params['aux_inp_size']).astype(config.floatX))

        Ax = self.f_pred_th(*inp_list)

        # Backtracking to decode the correct sequence of candidates:
        # Ax[2][j][c] is the beam that candidate c at step j+1 came from.
        Ys = []
        n_steps = Ax[1].shape[0] - 1
        for i in range(self.beam_size):
            candI = []
            curr_cand = Ax[2][-1][i]
            for j in reversed(range(n_steps)):
                candI.insert(0, Ax[1][j][curr_cand])
                curr_cand = Ax[2][j][curr_cand]

            Ys.append([Ax[0][i], candI])
        return [Ys], Ax

    def build_prediction_model(self,
                               tparams,
                               options,
                               xI=None,
                               xAux=None,
                               per_word_logweight=None):
        """Build the symbolic graph used for generation.

        Creates input variables for the image / auxiliary features when
        they are not supplied, then delegates to
        ``lstm_advers_gen_layer`` (``advers_gen == 1``) or
        ``lstm_predict_layer``.

        Returns ``(inp_list, accLogProb, Idx, Cand, wOut_emb, updates,
        seq_lengths)``; ``seq_lengths`` is an empty list in the beam
        search case.
        """
        #Initialize random streams for other to use.
        self.trng = RandomStreams(int(time.time()))

        if xI is None:
            xI = tensor.matrix('xI', dtype=config.floatX)
            embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img'])
            xI_is_inp = True
        else:
            xI_is_inp = False
            embImg = xI

        if xAux is None and options.get('en_aux_inp', 0):
            xAux = tensor.matrix('xAux', dtype=config.floatX)
            xA_is_inp = True
            if options.get('swap_aux', 1):
                xAuxEmb = tensor.dot(
                    xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
            else:
                xAuxEmb = xAux
        else:
            xA_is_inp = False
            if options.get('encode_gt_sentences', 0):
                xAuxEmb = tensor.dot(
                    xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
            else:
                xAuxEmb = xAux

        if options.get('advers_gen', 0) == 1:
            accLogProb, Idx, wOut_emb, updates, seq_lengths = self.lstm_advers_gen_layer(
                tparams, embImg, xAuxEmb, options, prefix=options['generator'])
            # Every sample keeps its own index as the "candidate" pointer.
            Cand = tensor.tile(tensor.arange(Idx.shape[1]), [Idx.shape[0], 1])
        else:
            accLogProb, Idx, Cand, wOut_emb, updates = self.lstm_predict_layer(
                tparams,
                embImg,
                xAuxEmb,
                options,
                self.beam_size,
                prefix=options['generator'],
                per_word_logweight=per_word_logweight)
            seq_lengths = []

        inp_list = []
        if xI_is_inp:
            inp_list.append(xI)
        if options.get('en_aux_inp', 0) and xA_is_inp:
            inp_list.append(xAux)

        return inp_list, accLogProb, Idx, Cand, wOut_emb, updates, seq_lengths

    # ========================================================================================
    # LSTM LAYER in Prediction mode. Here we don't provide the word sequences, just the image feature vector
    # The network starts first with forward propagating the image feature vector. Then we pass the start word feature
    # i.e zeroth word vector. From then the network output word (i.e ML word) is fed as the input to the next time step.
    # In beam_size > 1 we could repeat a time step multiple times, once for each beam!!.
def lstm_predict_layer(self, tparams, Xi, aux_input, options, beam_size,
                       prefix='lstm', per_word_logweight=None):
    """Build the symbolic beam-search decoding loop of the LSTM generator.

    NOTE(review): the indentation of this whole region was reconstructed
    from whitespace-collapsed source; spots where the nesting could not be
    derived from data flow are marked with NOTE(review) below.

    Parameters
    ----------
    tparams : mapping
        Model parameters, looked up with ``_p(prefix, ...)`` keys plus
        ``'Wd'``, ``'bd'``, ``'Wemb'`` (and ``'WdCls'``/``'bdCls'`` when
        ``options['class_out_factoring'] == 1``).
    Xi : symbolic variable
        Image embedding fed as the very first LSTM input.
    aux_input : symbolic variable
        Auxiliary input; replaced by ``[]`` when ``options['en_aux_inp']``
        is 0.
    options : dict
        Model options (``'maxlen'``, ``'hidden_depth'``, ``'hidden_size'``,
        ...).
    beam_size : int
        Number of beams kept per step.
    prefix : str
        Parameter-name prefix.
    per_word_logweight : optional
        When given, wrapped in ``theano.shared`` and added to the word
        log-probabilities.

    Returns
    -------
    tuple
        ``(final accumulated log-probs, word indices per step, candidate
        indices per step, word embeddings per step, scan updates)``; the
        three per-step tensors have the start-word step prepended.
    """
    nMaxsteps = options.get('maxlen', 30)
    if nMaxsteps is None:
        nMaxsteps = 30
    h_depth = options.get('hidden_depth', 1)
    h_sz = options['hidden_size']

    # ---------------------- STEP FUNCTION ---------------------- #
    def _stepP(x_, h_, c_, lP_, dV_, xAux):
        preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        preact += (tensor.dot(x_, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])
        if options.get('en_aux_inp', 0):
            preact += tensor.dot(xAux, tparams[_p(prefix, 'W_aux')])

        hL = [[]] * h_depth
        cL = [[]] * h_depth
        outp = [[]] * h_depth
        for di in xrange(h_depth):
            i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
            cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
            cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
            hL[di] = o * tensor.tanh(cL[di])
            outp[di] = hL[di]
            if options.get('en_residual_conn', 1):
                if (di > 0):
                    outp[di] += outp[di - 1]
                    # Parenthesised so the line parses on Python 3 as well;
                    # on Python 2 it prints exactly the same text.
                    print("Connecting residual at %d" % (di))
            if di < (h_depth - 1):
                # Pre-activation of the next layer in the stack.
                preact = (
                    tensor.dot(sliceT(h_, di + 1, h_sz),
                               tparams[_p(prefix, ('W_hid_' + str(di + 1)))]) +
                    tensor.dot(outp[di],
                               tparams[_p(prefix, ('W_inp_' + str(di + 1)))]))

        c = tensor.concatenate(cL, axis=1)
        h = tensor.concatenate(hL, axis=1)

        if options.get('class_out_factoring', 0) == 1:
            if options.get('cls_diff_layer', 0) == 1:
                pC = tensor.dot(hL[-2], tparams['WdCls']) + tparams['bdCls']
            else:
                pC = tensor.dot(outp[-1], tparams['WdCls']) + tparams['bdCls']

            pCSft = tensor.nnet.softmax(pC)
            xCIdx = tensor.argmax(pCSft, axis=-1)
            #pW = tensor.dot(outp[-1],tparams['Wd'][:,xCIdx,:]) + tparams['bd'][:,xCIdx,:]
            #smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f')
            #pWSft = tensor.nnet.softmax(pW*smooth_factor)
            #lProb = tensor.log(pWSft + 1e-20) + tensor.log(pCSft[0,xCIdx] + 1e-20)
            #########################################################
            # pW is now of size (beam_size, n_classes, class_size)
            if options.get('cls_zmean', 0):
                pW = tensor.dot(
                    (outp[-1] - tparams['WdCls'][:, xCIdx].T),
                    tparams['Wd'].swapaxes(0, 1)) + tparams['bd'][0, :, :]
            else:
                pW = tensor.dot((outp[-1]), tparams['Wd'].swapaxes(
                    0, 1)) + tparams['bd'][0, :, :]
            #smooth_factor = tensor.as_tensor_variable(numpy_floatX(options.get('softmax_smooth_factor',1.0)), name='sm_f')
            pWSft = tensor.nnet.softmax(
                pW.reshape([pW.shape[0] * pW.shape[1], pW.shape[2]])).reshape(
                    [pW.shape[0], pW.shape[1] * pW.shape[2]])
            ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
            lProb = (
                tensor.log(pWSft[:, ixtoclsinfo_t[:, 0] *
                                 tparams['Wd'].shape[2] +
                                 ixtoclsinfo_t[:, 3]] + 1e-20) +
                tensor.log(pCSft[0, ixtoclsinfo_t[:, 0]] + 1e-20))
        else:
            p = tensor.dot(outp[-1], tparams['Wd']) + tparams['bd']
            smooth_factor = tensor.as_tensor_variable(numpy_floatX(
                options.get('softmax_smooth_factor', 1.0)), name='sm_f')
            p = tensor.nnet.softmax(p * smooth_factor)
            lProb = tensor.log(p + 1e-20)
            # NOTE(review): nesting ambiguous in the collapsed source -- this
            # re-weighting may belong after the if/else so that it also
            # applies to the class-factored branch. Confirm against history.
            if per_word_logweight is not None:
                log_w = theano.shared(
                    per_word_logweight)  #, dtype= theano.config.floatX)
                lProb = log_w + lProb

        if beam_size > 1:

            def _FindB_best(lPLcl, lPprev, dVLcl):
                # Top `beam_size` extensions of one beam. A beam whose done
                # flag is set keeps only its previous score in slot 0 (the
                # other slots get -10000) and emits word index 0.
                srtLcl = tensor.argsort(-lPLcl)
                srtLcl = srtLcl[:beam_size]
                deltaVec = tensor.fill(lPLcl[srtLcl], numpy_floatX(-10000.))
                deltaVec = tensor.set_subtensor(deltaVec[0], lPprev)
                lProbBest = ifelse(
                    tensor.eq(dVLcl, tensor.zeros_like(dVLcl)),
                    lPLcl[srtLcl] + lPprev, deltaVec)
                xWIdxBest = ifelse(
                    tensor.eq(dVLcl, tensor.zeros_like(dVLcl)), srtLcl,
                    tensor.zeros_like(srtLcl))
                return lProbBest, xWIdxBest

            rvalLcl, updatesLcl = theano.scan(_FindB_best,
                                              sequences=[lProb, lP_, dV_],
                                              name=_p(prefix, 'FindBest'),
                                              n_steps=x_.shape[0])
            xWIdxBest = rvalLcl[1]
            lProbBest = rvalLcl[0]

            xWIdxBest = xWIdxBest.flatten()
            lProb = lProbBest.flatten()
            # Now sort and find the best among these best extensions for the
            # current beams
            srtIdx = tensor.argsort(-lProb)
            srtIdx = srtIdx[:beam_size]
            xCandIdx = srtIdx // beam_size  # Floor division
            h = h.take(xCandIdx.flatten(), axis=0)
            c = c.take(xCandIdx.flatten(), axis=0)
            xWlogProb = lProb[srtIdx]
            xWIdx = xWIdxBest[srtIdx]
            if options.get('class_out_factoring', 0) == 1:
                # NOTE(review): `clsoffset` is never read afterwards; kept
                # because it still touches self.clsOffset.
                clsoffset = tensor.as_tensor_variable(self.clsOffset)
        else:
            xCandIdx = tensor.as_tensor_variable([0])
            lProb = lProb.flatten()
            xWIdx = tensor.argmax(lProb, keepdims=True)
            xWlogProb = lProb[xWIdx] + lP_
            #if options.get('class_out_factoring',0) == 1:
            #    clsoffset = tensor.as_tensor_variable(self.clsOffset)
            #    xWIdx += clsoffset[xCIdx]
            h = h.take(xCandIdx.flatten(), axis=0)
            c = c.take(xCandIdx.flatten(), axis=0)

        if options.get('softmax_propogate', 0) == 0:
            xW = tparams['Wemb'][xWIdx.flatten()]
        else:
            # NOTE(review): `p` is only bound in the non-class-factored
            # branch above.
            xW = p.dot(tparams['Wemb'])
        # Word index 0 marks a finished beam.
        doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

        return [xW, h, c, xWlogProb, doneVec, xWIdx,
                xCandIdx], theano.scan_module.until(doneVec.all())

    # ------------------- END of STEP FUNCTION -------------------- #

    if options.get('en_aux_inp', 0) == 0:
        aux_input = []

    h = tensor.alloc(numpy_floatX(0.), beam_size, h_sz * h_depth)
    c = tensor.alloc(numpy_floatX(0.), beam_size, h_sz * h_depth)

    lP = tensor.alloc(numpy_floatX(0.), beam_size)
    dV = tensor.alloc(np.int8(0.), beam_size)

    # Propagate the image feature vector
    [xW, h, c, _, _, _, _], _ = _stepP(Xi, h[:1, :], c[:1, :], lP, dV,
                                       aux_input)

    # Then feed the start word (row 0 of the embedding matrix).
    xWStart = tparams['Wemb'][[0]]
    [xW, h, c, lP, dV, idx0, cand0], _ = _stepP(xWStart, h[:1, :], c[:1, :],
                                                lP, dV, aux_input)

    if options.get('en_aux_inp', 0) == 1:
        aux_input = tensor.extra_ops.repeat(aux_input, beam_size, axis=0)

    # Now lets do the loop.
    rval, updates = theano.scan(
        _stepP,
        outputs_info=[xW, h, c, lP, dV, None, None],
        non_sequences=[aux_input],
        name=_p(prefix, 'predict_layers'),
        n_steps=nMaxsteps)

    return rval[3][-1], tensor.concatenate(
        [idx0.reshape([1, beam_size]), rval[5]],
        axis=0), tensor.concatenate(
            [cand0.reshape([1, beam_size]), rval[6]],
            axis=0), tensor.concatenate(
                [tensor.shape_padleft(xW, n_ones=1), rval[0]],
                axis=0), updates


#================================================================================================================
def lstm_advers_gen_layer(self, tparams, xI, xAux, options, prefix='lstm'):
    """Build the symbolic sampling loop of the adversarial generator.

    Each input row is repeated ``options['n_gen_samples']`` times, the image
    embedding is pushed through the LSTM once, and the network is then
    unrolled with ``theano.scan`` starting from the start-word embedding.

    Parameters
    ----------
    tparams : mapping
        Model parameters (``_p(prefix, ...)`` keys, ``'Wd'``, ``'bd'``,
        ``'Wemb'``).
    xI : symbolic variable
        Image embeddings, one row per batch sample.
    xAux : symbolic variable
        Auxiliary input; projected with ``_p(prefix, 'W_aux')``.
    options : dict
        Model options (``'maxlen'``, ``'n_gen_samples'``,
        ``'use_gumbel_mse'``, ``'greedy'``, ...).
    prefix : str
        Parameter-name prefix.

    Returns
    -------
    tuple
        ``(final accumulated log-probs, word indices per step, per-step
        word distributions, scan updates, seq_lengths)``.
    """
    nBatchSamps = xI.shape[0]
    # NOTE(review): default is 15 when the key is absent but 30 when it is
    # explicitly None -- confirm which one is intended.
    nMaxsteps = options.get('maxlen', 15)
    if nMaxsteps is None:
        nMaxsteps = 30
    n_samp = options.get('n_gen_samples', 1)

    h_depth = options.get('hidden_depth', 1)
    h_sz = options['hidden_size']

    # ---------------------- STEP FUNCTION ---------------------- #
    def _stepP(U, xW_, h_, c_, lP_, dV_, xAux, xNoise):
        preact = tensor.dot(sliceT(h_, 0, h_sz), tparams[_p(prefix, 'W_hid')])
        preact += (tensor.dot(xW_, tparams[_p(prefix, 'W_inp')]) +
                   tparams[_p(prefix, 'b')])
        preact += xAux
        if options.get('gen_input_noise', 0):
            preact += xNoise

        hL = [[]] * h_depth
        cL = [[]] * h_depth
        outp = [[]] * h_depth
        for di in xrange(h_depth):
            i = tensor.nnet.sigmoid(sliceT(preact, 0, h_sz))
            f = tensor.nnet.sigmoid(sliceT(preact, 1, h_sz))
            o = tensor.nnet.sigmoid(sliceT(preact, 2, h_sz))
            cL[di] = tensor.tanh(sliceT(preact, 3, h_sz))
            cL[di] = f * sliceT(c_, di, h_sz) + i * cL[di]
            hL[di] = o * tensor.tanh(cL[di])
            outp[di] = hL[di]
            if options.get('en_residual_conn', 1):
                if (di > 0):
                    outp[di] += outp[di - 1]
                    # Parenthesised so the line parses on Python 3 as well;
                    # on Python 2 it prints exactly the same text.
                    print("Connecting residual at %d" % (di))
            if di < (h_depth - 1):
                # Pre-activation of the next layer in the stack.
                preact = (
                    tensor.dot(sliceT(h_, di + 1, h_sz),
                               tparams[_p(prefix, ('W_hid_' + str(di + 1)))]) +
                    tensor.dot(outp[di],
                               tparams[_p(prefix, ('W_inp_' + str(di + 1)))]))

        c = tensor.concatenate(cL, axis=1)
        h = tensor.concatenate(hL, axis=1)

        logits = tensor.dot(outp[-1], tparams['Wd']) + tparams['bd']
        #p = tensor.dot(outp[-1],l2norm(tparams['Wd'],axis=0))# + tparams['bd']
        if options.get('use_gumbel_mse', 0) == 0 or options.get('greedy', 0):
            p = tensor.nnet.softmax(logits)
        else:
            p = gumbel_softmax_sample(
                self.trng, logits * self.softmax_smooth_factor,
                self.gumb_temp, U, options.get('use_gumbel_hard', False))

        if options.get('computelogprob', 0):
            lProb = tensor.log(
                tensor.nnet.softmax(logits * self.softmax_smooth_factor) +
                1e-20)
        else:
            lProb = logits

        # Most probable word per row; multiplying by ~dV_ forces index 0 for
        # rows whose done flag is already set.
        xWIdx = ~dV_ * tensor.argmax(p, axis=-1)

        xWlogProb = ~dV_ * lProb[tensor.arange(nBatchSamps * n_samp),
                                 xWIdx] + lP_
        #xW = tparams['Wemb'][xWIdx.flatten()]
        if options.get('use_gumbel_hard', 0) and options.get(
                'use_gumbel_mse', 0) and not options.get('greedy', 0):
            xW = p.dot(tparams['Wemb'])
        else:
            xW = theano.gradient.disconnected_grad(
                tparams['Wemb'][xWIdx.flatten()].reshape(
                    [xWIdx.shape[0], -1]))

        # Word index 0 marks a finished sequence.
        doneVec = tensor.eq(xWIdx, tensor.zeros_like(xWIdx))

        return [xW, h, c, xWlogProb, doneVec, xWIdx,
                p], theano.scan_module.until(doneVec.all())

    # ------------------- END of STEP FUNCTION -------------------- #

    if options.get('use_gumbel_mse', 0) == 0:
        # NOTE(review): only nMaxsteps rows are drawn here, so U[1:] below
        # offers nMaxsteps - 1 rows to a scan asked for n_steps=nMaxsteps --
        # confirm this is safe for the configurations in use.
        U = self.trng.uniform((nMaxsteps, 1),
                              low=0.,
                              high=1.,
                              dtype=theano.config.floatX)
    else:
        U = self.trng.uniform((nMaxsteps + 1, nBatchSamps * n_samp,
                               options['vocabulary_size']),
                              low=0.,
                              high=1.,
                              dtype=theano.config.floatX)

    xI = tensor.extra_ops.repeat(xI, n_samp, axis=0)
    xAux = tensor.extra_ops.repeat(
        tensor.dot(xAux, tparams[_p(prefix, 'W_aux')]), n_samp, axis=0)

    if options.get('gen_input_noise', 0):
        xNoise = tensor.dot(
            self.trng.normal([nBatchSamps * n_samp, self.noise_dim]),
            tparams[_p(prefix, 'W_noise')])
    else:
        xNoise = []

    if options.get('gen_use_rand_init',
                   0) and not options.get('gen_input_noise', 0):
        h = tensor.unbroadcast(
            self.trng.uniform([nBatchSamps * n_samp, h_sz * h_depth],
                              low=-0.1,
                              high=0.1), 0, 1)
        c = tensor.unbroadcast(
            self.trng.uniform([nBatchSamps * n_samp, h_sz * h_depth],
                              low=-0.1,
                              high=0.1), 0, 1)
    else:
        h = tensor.zeros([nBatchSamps * n_samp, h_sz * h_depth])
        c = tensor.zeros([nBatchSamps * n_samp, h_sz * h_depth])

    lP = tensor.alloc(numpy_floatX(0.), nBatchSamps * n_samp)
    dV = tensor.alloc(np.bool_(0.), nBatchSamps * n_samp)

    # Propagate the image feature vector
    [_, h, c, _, _, _, _], _ = _stepP(U[0, :], xI, h, c, lP, dV, xAux, xNoise)

    xWStart = tensor.unbroadcast(
        tensor.tile(tparams['Wemb'][[0]], [nBatchSamps * n_samp, 1]), 0, 1)

    # Now lets do the loop.
    rval, updates = theano.scan(
        _stepP,
        sequences=[U[1:, :]],
        outputs_info=[xWStart, h, c, lP, dV, None, None],
        non_sequences=[xAux, xNoise],
        name=_p(prefix, 'adv_predict_layers'),
        n_steps=nMaxsteps)

    # Index of the first step whose done flag is set, plus one; the appended
    # row of ones covers sequences that never finish.
    seq_lengths = theano.gradient.disconnected_grad(
        tensor.argmax(tensor.concatenate(
            [rval[4][:-1, :],
             tensor.ones((1, nBatchSamps * n_samp))],
            axis=0),
                      axis=0) + 1)

    return rval[3][-1], rval[5], rval[6], updates, seq_lengths


# ========================================================================================
def build_eval_other_sent(self, tparams, options, model_npy):
    """Compile functions that score given word sequences under the model.

    ``model_npy`` is first copied into ``self.model_th`` via ``zipp``. Two
    compiled functions are stored on ``self``: ``f_pred_prob_other`` (word
    softmax for inputs ``[xW, xI, xAux]``) and ``f_eval_other`` (summed
    masked negative log-likelihood per sample for ``inp_list``).

    Returns
    -------
    tuple
        ``(use_noise, inp_list, self.f_pred_prob_other, cost, pW,
        updatesLSTM)``.
    """
    zipp(model_npy, self.model_th)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    xW = tensor.matrix('xW', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    n_timesteps = xW.shape[0]
    n_samples = xW.shape[1]
    n_out_samps = (n_timesteps - 1) * n_samples

    embW = tparams['Wemb'][xW.flatten()].reshape(
        [n_timesteps, n_samples, options['word_encoding_size']])
    xI = tensor.matrix('xI', dtype=config.floatX)
    xAux = tensor.matrix('xAux', dtype=config.floatX)

    if options.get('swap_aux', 0):
        xAuxEmb = tensor.dot(xAux, tparams['WIemb_aux']) + tparams['b_Img_aux']
    else:
        xAuxEmb = xAux

    embImg = (tensor.dot(xI, tparams['WIemb']) + tparams['b_Img']).reshape(
        [1, n_samples, options['image_encoding_size']])
    emb = tensor.concatenate([embImg, embW], axis=0)

    rval, updatesLSTM = basic_lstm_layer(tparams,
                                         emb[:n_timesteps, :, :],
                                         xAuxEmb,
                                         use_noise,
                                         options,
                                         prefix=options['generator'])
    # Drop the first time step (the image input) before decoding words.
    p = sliceT(rval[0][1:, :, :], options.get('hidden_depth', 1),
               options['hidden_size'])

    if options.get('class_out_factoring', 0) == 0:
        pW = (tensor.dot(p, tparams['Wd']) + tparams['bd']).reshape(
            [n_out_samps, options['output_size']])
        pWSft = tensor.nnet.softmax(pW)
        totProb = pWSft[tensor.arange(n_out_samps), xW[1:, :].flatten()]
    else:
        ixtoclsinfo_t = tensor.as_tensor_variable(self.clsinfo)
        xC = ixtoclsinfo_t[xW[1:, :].flatten(), 0]
        pW = ((tparams['Wd'][:, xC, :].T *
               ((p.reshape([1, n_out_samps, options['hidden_size']]) -
                 tparams['WdCls'][:, xC].T))).sum(axis=-1).T +
              tparams['bd'][:, xC, :])
        pWSft = tensor.nnet.softmax(pW[0, :, :])
        pC = (tensor.dot(p, tparams['WdCls']) + tparams['bdCls']).reshape(
            [n_out_samps, options['nClasses']])
        pCSft = tensor.nnet.softmax(pC)

        # Word probability = P(word | class) * P(class).
        totProb = (pWSft[tensor.arange(n_out_samps),
                         ixtoclsinfo_t[xW[1:, :].flatten(), 3]] *
                   pCSft[tensor.arange(n_out_samps), xC])

    tot_cost = -(tensor.log(totProb + 1e-10) * mask[1:, :].flatten()).reshape(
        [n_timesteps - 1, n_samples])
    cost = tot_cost.sum(axis=0)

    inp_list = [xW, mask, xI]

    if options.get('en_aux_inp', 0):
        inp_list.append(xAux)

    self.f_pred_prob_other = theano.function([xW, xI, xAux],
                                             pWSft,
                                             name='f_pred_prob',
                                             updates=updatesLSTM)
    #f_pred = theano.function([xW, mask], pred.argmax(axis=1), name='f_pred')

    #cost = -tensor.log(pred[tensor.arange(n_timesteps),tensor.arange(n_samples), xW] + 1e-8).mean()

    self.f_eval_other = theano.function(inp_list, cost, name='f_eval')

    return use_noise, inp_list, self.f_pred_prob_other, cost, pW, updatesLSTM
class InputLayer(lasagne.layers.InputLayer):
    """Lasagne input layer that can binarize or quantize its input variable.

    The expression handed to the parent ``InputLayer`` depends on the flags:

    * ``binary`` false, ``n_bits == -1``: ``input_var`` is passed through.
    * ``binary`` false, other ``n_bits``: the input is quantized to
      multiples of ``2**-n_bits``, either by rounding (``deterministic``)
      or by a random choice between floor and ceil.
    * ``binary`` true: the input is compared against ``threshold``
      (``deterministic``) or against uniform noise in ``[0, 1)``.

    ``batch_size`` replaces a leading ``None`` in ``shape`` when sizing the
    random noise.
    """

    def __init__(self,
                 shape,
                 input_var=None,
                 name=None,
                 binary=True,
                 deterministic=False,
                 threshold=0.5,
                 batch_size=100,
                 n_bits=-1,
                 **kwargs):
        seed = lasagne.random.get_rng().randint(1, 2394349593)
        self.rng_mrg = RandomStreams(seed)

        def noise_shape():
            # The random stream needs a concrete leading dimension.
            dims = list(shape)
            if dims[0] is None:
                dims[0] = batch_size
            return tuple(dims)

        if binary == False and n_bits == -1:
            # no quantization at all
            layer_input = input_var
        elif binary == False:
            # Normalize to [0 ~ 1 - 2^(-n_bits)]
            normed = input_var * (1 - 2**(-n_bits))
            levels = 2**n_bits
            if deterministic == False:
                rand_dims = noise_shape()
                upper = T.ceil(normed * levels) / levels
                lower = T.floor(normed * levels) / levels
                above_lower = input_var - lower
                noise = self.rng_mrg.uniform(rand_dims,
                                             low=0.0,
                                             high=2**(-n_bits),
                                             dtype=theano.config.floatX)
                picked = T.switch(T.ge(above_lower, noise), upper, lower)
                layer_input = T.cast(picked, theano.config.floatX)
            else:
                rounded = T.round(normed * levels) / levels
                layer_input = T.cast(rounded, theano.config.floatX)
        elif deterministic == False:
            rand_dims = noise_shape()
            # Bernoulli spikes
            noise = self.rng_mrg.uniform(rand_dims,
                                         low=0.0,
                                         high=1.0,
                                         dtype=theano.config.floatX)
            layer_input = T.cast(T.gt(input_var, noise),
                                 theano.config.floatX)
        else:
            thresholded = T.switch(T.ge(input_var, threshold), 1.0, 0.)
            layer_input = T.cast(thresholded, theano.config.floatX)

        super(InputLayer, self).__init__(shape=shape,
                                         input_var=layer_input,
                                         name=name,
                                         **kwargs)
tys = np.concatenate(tys, axis=0) # assert txs.shape[0] == 6 # assert tys.shape[0] == 6 # trainx = txs.copy() # trainy = tys.copy() trainx_unl = trainx.copy() trainx_unl2 = trainx.copy() nr_batches_train = int(trainx.shape[0] / args.batch_size) nr_batches_test = int(testx.shape[0] / args.batch_size) # specify generative model sf = args.net_scale_factor noise_dim = (args.batch_size, 100) noise = theano_rng.uniform(size=noise_dim) gen_layers = [ll.InputLayer(shape=noise_dim, input_var=noise)] gen_layers.append( nn.batch_norm(ll.DenseLayer(gen_layers[-1], num_units=13 * 1 * 512 / sf, W=Normal(0.05), nonlinearity=nn.relu), g=None)) gen_layers.append( ll.ReshapeLayer(gen_layers[-1], (args.batch_size, 512 / sf, 13, 1))) gen_layers.append( nn.batch_norm(nn.Deconv2DLayer(gen_layers[-1], (args.batch_size, 256 / sf, 25, 1), (5, 1), stride=(2, 1), W=Normal(0.05), nonlinearity=nn.relu),
class RandomizedRectifierLayer(Layer):
    """
    Layer applying a randomized leaky rectifier to its input.

    The randomized leaky rectifier was first proposed and used in the Kaggle
    NDSB Competition, and later evaluated in [1]_. Unlike the standard leaky
    rectifier :func:`leaky_rectify`, the slope applied to negative input is
    sampled at random during training and fixed during evaluation.

    Training-time equation:

    :math:`\\varphi(x) = \\max((\\sim U(lower, upper)) \\cdot x, x)`

    At evaluation time the slope is the arithmetic mean of `lower` and
    `upper`.

    Parameters
    ----------
    incoming : a :class:`Layer` instance or a tuple
        The layer feeding into this layer, or the expected input shape
    lower : Theano shared variable, expression, or constant
        Lower bound of the sampled slopes.
    upper : Theano shared variable, expression, or constant
        Upper bound of the sampled slopes.
    shared_axes : 'auto', 'all', int or tuple of int
        Axes along which one random slope is shared. ``'auto'`` (the
        default) shares over every axis except the second, i.e. over the
        minibatch dimension for dense layers and additionally over all
        spatial dimensions for convolutional layers. ``'all'`` shares over
        every axis, so a single random slope is used.
    **kwargs
        Any additional keyword arguments are passed to the `Layer`
        superclass.

    References
    ----------
    .. [1] Bing Xu, Naiyan Wang et al. (2015):
       Empirical Evaluation of Rectified Activations in Convolutional
       Network, http://arxiv.org/abs/1505.00853
    """

    def __init__(self, incoming, lower=0.3, upper=0.8, shared_axes='auto',
                 **kwargs):
        super(RandomizedRectifierLayer, self).__init__(incoming, **kwargs)
        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
        self.lower = lower
        self.upper = upper

        # Symbolic bounds cannot be compared eagerly, so only concrete
        # values are validated here.
        bounds_swapped = lower > upper
        if not isinstance(bounds_swapped, theano.Variable) and bounds_swapped:
            raise ValueError("Upper bound for RandomizedRectifierLayer needs "
                             "to be higher than lower bound.")

        if shared_axes == 'auto':
            shared_axes = (0,) + tuple(range(2, len(self.input_shape)))
        elif shared_axes == 'all':
            shared_axes = tuple(range(len(self.input_shape)))
        elif isinstance(shared_axes, int):
            shared_axes = (shared_axes,)
        self.shared_axes = shared_axes

    def get_output_for(self, input, deterministic=False, **kwargs):
        """
        Parameters
        ----------
        input : tensor
            output from the previous layer
        deterministic : bool
            If true, the arithmetic mean of lower and upper is used as the
            leaky slope.
        """
        if deterministic or self.upper == self.lower:
            mean_slope = (self.upper + self.lower) / 2.0
            return theano.tensor.nnet.relu(input, mean_slope)

        # Use the static shape when fully known, else the symbolic one.
        slope_shape = list(self.input_shape)
        if any(dim is None for dim in slope_shape):
            slope_shape = list(input.shape)
        for axis in self.shared_axes:
            slope_shape[axis] = 1

        slopes = self._srng.uniform(tuple(slope_shape),
                                    low=self.lower,
                                    high=self.upper,
                                    dtype=theano.config.floatX)
        # Size-1 axes must be flagged broadcastable to share the slope.
        slopes = theano.tensor.addbroadcast(slopes, *self.shared_axes)
        return theano.tensor.nnet.relu(input, slopes)
def test_uniform():
    """Check MRG uniform sampling on CPU (and GPU when available) and the
    numpy-backed RandomStreams, for constant, symbolic and empty sizes."""
    # TODO: test param low, high
    # TODO: test size=None
    # TODO: test ndim!=size.ndim
    # TODO: test bad seed
    # TODO: test size=Var, with shape that change from call to call
    slow_mode = (mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE']
                 or (mode == 'Mode' and config.linker in ['py']))
    if slow_mode:
        sample_size, steps = (10, 100), 50
    else:
        sample_size, steps = (500, 50), int(1e3)

    x = tensor.matrix()
    # (symbolic size, concrete size, function inputs, input values)
    cases = [
        (sample_size, sample_size, [], []),
        (x.shape, sample_size, [x],
         [numpy.zeros(sample_size, dtype=config.floatX)]),
        ((x.shape[0], sample_size[1]), sample_size, [x],
         [numpy.zeros(sample_size, dtype=config.floatX)]),
        # test empty size (scalar)
        ((), (), [], []),
    ]

    for size, const_size, var_input, values in cases:
        #### TEST CPU IMPLEMENTATION ####
        # The python and C implementation are tested with DebugMode
        # NOTE(review): this rebinding is never read -- `cases` was built
        # with the matrix created before the loop.
        x = tensor.matrix()
        cpu_streams = MRG_RandomStreams(234, use_cuda=False)
        # Note: we specify `nstreams` to avoid a warning.
        # TODO Look for all occurrences of `guess_n_streams` and `30 * 256`
        # for such situations: it would be better to instead filter the
        # warning using the warning module.
        n_streams = rng_mrg.guess_n_streams(size, warn=False)
        u = cpu_streams.uniform(size=size, nstreams=n_streams)
        f = theano.function(var_input, u, mode=mode)
        cpu_nodes = f.maker.fgraph.toposort()
        assert any(
            isinstance(node.op, theano.sandbox.rng_mrg.mrg_uniform)
            for node in cpu_nodes)
        cpu_out = f(*values)

        # Increase the number of steps if sizes implies only a few samples
        steps_ = steps * 100 if numpy.prod(const_size) < 10 else steps
        basictest(f, steps_, const_size, prefix='mrg cpu', inputs=values)

        if mode != 'FAST_COMPILE' and cuda_available:
            gpu_streams = MRG_RandomStreams(234, use_cuda=True)
            u = gpu_streams.uniform(
                size=size,
                dtype='float32',
                nstreams=rng_mrg.guess_n_streams(size, warn=False))
            # well, it's really that this test w GPU doesn't make sense otw
            assert u.dtype == 'float32'
            on_gpu = theano.sandbox.cuda.basic_ops.gpu_from_host(u)
            f = theano.function(var_input,
                                theano.Out(on_gpu, borrow=True),
                                mode=mode_with_gpu)
            gpu_nodes = f.maker.fgraph.toposort()
            assert any(
                isinstance(node.op, theano.sandbox.rng_mrg.GPU_mrg_uniform)
                for node in gpu_nodes)
            gpu_out = numpy.asarray(f(*values))

            basictest(f, steps_, const_size, prefix='mrg gpu', inputs=values)

            # Same seed, so CPU and GPU must agree on the first draw.
            numpy.testing.assert_array_almost_equal(cpu_out,
                                                    gpu_out,
                                                    decimal=6)

        numpy_streams = theano.tensor.shared_randomstreams.RandomStreams(234)
        uu = numpy_streams.uniform(size=size)
        ff = theano.function(var_input, uu, mode=mode)
        # It's not our problem if numpy generates 0 or 1
        basictest(ff,
                  steps_,
                  const_size,
                  prefix='numpy',
                  allow_01=True,
                  inputs=values)