def test_seed_fn(self):
    random = RandomStreams(234)
    fn = function([], random.uniform((2, 2)), updates=random.updates())

    random.seed(utt.fetch_seed())

    fn_val0 = fn()
    fn_val1 = fn()

    rng_seed = np.random.RandomState(utt.fetch_seed()).randint(2**30)
    rng = np.random.RandomState(int(rng_seed))  # int() is for 32bit
    numpy_val0 = rng.uniform(size=(2, 2))
    numpy_val1 = rng.uniform(size=(2, 2))

    assert np.allclose(fn_val0, numpy_val0)
    assert np.allclose(fn_val1, numpy_val1)
def test_setitem(self):
    random = RandomStreams(234)
    out = random.uniform((2, 2))
    fn = function([], out, updates=random.updates())

    random.seed(888)

    rng = np.random.RandomState(utt.fetch_seed())
    random[out.rng] = np.random.RandomState(utt.fetch_seed())

    fn_val0 = fn()
    fn_val1 = fn()

    numpy_val0 = rng.uniform(size=(2, 2))
    numpy_val1 = rng.uniform(size=(2, 2))

    assert np.allclose(fn_val0, numpy_val0)
    assert np.allclose(fn_val1, numpy_val1)
def test_examples_9(self):
    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=234)
    rv_u = srng.uniform((2, 2))
    rv_n = srng.normal((2, 2))
    f = function([], rv_u)
    g = function([], rv_n, no_default_updates=True)  # Not updating rv_n.rng
    nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)

    f_val0 = f()
    f_val1 = f()  # different numbers from f_val0
    assert numpy.all(f_val0 != f_val1)

    g_val0 = g()  # different numbers from f_val0 and f_val1
    g_val1 = g()  # same numbers as g_val0 !!!
    assert numpy.all(g_val0 == g_val1)
    assert numpy.all(g_val0 != f_val0)
    assert numpy.all(g_val0 != f_val1)

    nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)
    assert numpy.allclose(nearly_zeros(), [[0., 0.], [0., 0.]])

    rng_val = rv_u.rng.get_value(borrow=True)   # Get the rng for rv_u
    rng_val.seed(89234)                         # seeds the generator
    rv_u.rng.set_value(rng_val, borrow=True)    # Assign back seeded rng

    srng.seed(902340)  # seeds rv_u and rv_n with different seeds each

    state_after_v0 = rv_u.rng.get_value().get_state()
    nearly_zeros()  # this affects rv_u's generator
    v1 = f()
    rng = rv_u.rng.get_value(borrow=True)
    rng.set_state(state_after_v0)
    rv_u.rng.set_value(rng, borrow=True)
    v2 = f()  # v2 != v1
    v3 = f()  # v3 == v1
    assert numpy.all(v1 != v2)
    assert numpy.all(v1 == v3)
g_val0 = g()
g_val1 = g()
print f_val0
print f_val1
print g_val0
print g_val1
print nearly_zeros()

rng_val = rv_u.rng.get_value(borrow=True)
rng_val.seed(89234)
rv_u.rng.set_value(rng_val, borrow=True)
print f()
print

srng.seed(900890)
print f()
print g()
print

state_after_v0 = rv_u.rng.get_value().get_state()
print nearly_zeros()
print f()

rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_u.rng.set_value(rng, borrow=True)
print f()
print f()
print f()
print
class LocalNoiseEBM(object):
    def reset_rng(self):
        self.rng = N.random.RandomState([12., 9., 2.])
        self.theano_rng = RandomStreams(self.rng.randint(2**30))
        if self.initialized:
            self.redo_theano()
    #

    def __getstate__(self):
        d = copy.copy(self.__dict__)

        # remove everything set up by redo_theano
        for name in self.names_to_del:
            if name in d:
                del d[name]

        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        # self.redo_theano()
        # todo: make some way of not running this, so it's possible to just open
        # something up and look at its weights fast without recompiling it

    def weights_format(self):
        return ['v', 'h']

    def get_dimensionality(self):
        return 0

    def important_error(self):
        return 2

    def __init__(self, nvis, nhid, learning_rate, irange, init_bias_hid,
                 init_noise_var, min_misclass, max_misclass, time_constant,
                 noise_var_scale_up, noise_var_scale_down, max_noise_var,
                 different_examples, energy_function, init_vis_prec,
                 learn_vis_prec,
                 vis_prec_lr_scale=1e-2,  # 0 won't make it not learn, it will just make the transfer function invalid
                 init_delta=0.0,
                 clean_contrastive_coeff=0.0,
                 use_two_noise_vars=False,
                 denoise=False):
        self.denoise = denoise
        self.initialized = False
        self.reset_rng()
        self.nhid = nhid
        self.nvis = nvis
        self.learning_rate = learning_rate
        self.ERROR_RECORD_MODE_MONITORING = 0
        self.error_record_mode = self.ERROR_RECORD_MODE_MONITORING
        self.init_weight_mag = irange
        self.force_batch_size = 0
        self.init_bias_hid = init_bias_hid
        self.noise_var = shared(N.cast[floatX](init_noise_var))
        self.min_misclass = min_misclass
        self.max_misclass = max_misclass
        self.time_constant = time_constant
        self.noise_var_scale_up = noise_var_scale_up
        self.noise_var_scale_down = noise_var_scale_down
        self.max_noise_var = max_noise_var
        self.misclass = -1
        self.different_examples = different_examples
        self.init_vis_prec = init_vis_prec
        self.learn_vis_prec = learn_vis_prec
        self.vis_prec_lr_scale = vis_prec_lr_scale
        self.energy_function = energy_function
        self.init_delta = init_delta
        self.use_two_noise_vars = use_two_noise_vars
        self.clean_contrastive_coeff = clean_contrastive_coeff

        self.names_to_del = []

        self.redo_everything()

    def set_error_record_mode(self, mode):
        self.error_record_mode = mode

    def set_size_from_dataset(self, dataset):
        self.nvis = dataset.get_output_dim()
        self.redo_everything()
        self.vis_mean.set_value(dataset.get_marginals(), borrow=False)
    #

    def get_input_dim(self):
        return self.nvis

    def get_output_dim(self):
        return self.nhid

    def redo_everything(self):
        self.initialized = True

        self.error_record = []
        self.examples_seen = 0
        self.batches_seen = 0

        self.W = shared(N.cast[floatX](self.rng.uniform(
            -self.init_weight_mag, self.init_weight_mag,
            (self.nvis, self.nhid))))
        self.W.name = 'W'
        self.b = shared(N.cast[floatX](N.zeros(self.nhid) + self.init_bias_hid))
        self.b.name = 'b'
        self.c = shared(N.cast[floatX](N.zeros(self.nvis)))
        self.c.name = 'c'

        self.params = [self.W, self.c, self.b]

        self.vis_prec_driver = shared(
            N.zeros(self.nvis) +
            N.log(N.exp(self.init_vis_prec) - 1.) / self.vis_prec_lr_scale)
        self.vis_prec_driver.name = 'vis_prec_driver'

        assert not N.any(N.isnan(self.vis_prec_driver.get_value()))
        assert not N.any(N.isinf(self.vis_prec_driver.get_value()))

        if self.learn_vis_prec:
            self.params.append(self.vis_prec_driver)
        #

        if self.energy_function == 'mse autoencoder':
            self.delta = shared(self.init_delta + N.zeros(self.nhid))
            self.delta.name = 'delta'
            self.s = shared(N.ones(self.nhid))
            self.s.name = 's'
            self.params.append(self.s)
            if not self.denoise:
                self.params.append(self.delta)
        #

        self.redo_theano()
    #

    def batch_energy(self, V, H):
        if self.energy_function != 'gaussian-binary rbm':
            assert False

        output_scan, updates = scan(
            lambda v, h, beta: 0.5 * T.dot(v, beta * v) - T.dot(self.b, h)
            - T.dot(self.c, v) - T.dot(v, T.dot(self.W, h)),
            sequences=(V, H), non_sequences=self.vis_prec)

        return output_scan

    def p_h_given_v(self, V):
        if self.energy_function != 'gaussian-binary rbm':
            assert False

        return T.nnet.sigmoid(self.b + T.dot(V, self.W))

    def free_energy(self, V):
        return self.batch_free_energy(V)

    def batch_free_energy(self, V):
        if self.energy_function == 'gaussian-binary rbm':
            output_scan, updates = scan(
                lambda v, beta: 0.5 * T.dot(v, beta * v) - T.dot(self.c, v)
                - T.sum(T.nnet.softplus(T.dot(v, self.W) + self.b)),
                sequences=V, non_sequences=self.vis_prec)
        elif self.energy_function == 'mse autoencoder':
            def fn(v, beta, w):
                h = T.nnet.sigmoid((self.s / w) * T.dot(v, self.W) - self.s + self.b)
                h.name = 'h'
                r = T.dot(self.W, h) + self.c
                r.name = 'r'

                assert len(h.type().broadcastable) == 1
                assert len(self.delta.type().broadcastable) == 1

                penalty = -T.dot(self.delta, h)

                d = v - r
                scaled_mse = T.dot(d, beta * d)

                rval = scaled_mse + penalty
                assert len(rval.type().broadcastable) == 0
                return rval

            output_scan, updates = scan(
                fn, sequences=V, non_sequences=[self.vis_prec, self.wnorms])

        assert len(output_scan.type().broadcastable) == 1

        return output_scan

    def redo_theano(self):
        if 'denoise' not in dir(self):
            self.denoise = False

        if 'energy_function' not in dir(self):
            self.energy_function = 'gaussian-binary rbm'

        if 'noise_var' not in dir(self):
            self.noise_var = self.beta
            del self.beta

        if 'different_examples' not in dir(self):
            self.different_examples = False

        if 'vis_prec_driver' not in dir(self):
            self.vis_prec_lr_scale = 1.
            self.vis_prec_driver = shared(
                N.zeros(self.nvis) +
                N.log(N.exp(1.0) - 1.) / self.vis_prec_lr_scale)

        pre_existing_names = dir(self)

        self.wnorms = T.sum(T.sqr(self.W), axis=0)

        self.vis_prec = T.nnet.softplus(self.vis_prec_driver * self.vis_prec_lr_scale)
        self.vis_prec.name = 'vis_prec'

        self.W_T = self.W.T
        self.W_T.name = 'W.T'

        alpha = T.scalar()

        X = T.matrix()
        X.name = 'X'

        if self.use_two_noise_vars:
            switch = self.theano_rng.normal(size=[1, ], avg=0, std=1,
                                            dtype='float32') > 0.0
        else:
            switch = 1.0

        final_noise_var = switch * self.noise_var + (1.0 - switch) * 2.0

        corrupted = self.theano_rng.normal(size=X.shape, avg=X,
                                           std=T.sqrt(final_noise_var),
                                           dtype=X.dtype)
        corrupted.name = 'prenorm_corrupted'

        old_norm = T.sqr(X).sum(axis=1)
        old_norm.name = 'old_norm'

        new_norm = T.sqr(corrupted).sum(axis=1)
        new_norm.name = 'new_norm'

        norm_ratio = old_norm / (1e-8 + new_norm)
        norm_ratio.name = 'norm_ratio'

        norm_ratio_shuffled = norm_ratio.dimshuffle(0, 'x')
        norm_ratio_shuffled.name = 'norm_ratio_shuffled'

        #corrupted = corrupted * norm_ratio_shuffled
        #corrupted.name = 'postnorm_corrupted'
        print "NOT USING NORM RESCALING"

        self.corruption_func = function([X], corrupted)

        E_c = self.batch_free_energy(corrupted)
        E_c.name = 'E_c'

        if self.different_examples:
            X2 = T.matrix()
            inputs = [X, X2]
        else:
            X2 = X
            inputs = [X]
        #

        E_d = self.batch_free_energy(X2)
        assert len(E_d.type().broadcastable) == 1
        E_d.name = 'E_d'

        noise_contrastive = T.mean(-T.log(T.nnet.sigmoid(E_c - E_d)))

        if self.denoise:
            H = h = T.nnet.sigmoid((self.s / self.wnorms) * T.dot(corrupted, self.W)
                                   - self.s + self.b)
            H.name = 'H'
            R = (T.dot(H, self.W.T) + self.c) / self.vis_prec

            recons_diff = R - X

            #obj = T.mean(T.sqr(recons_diff))

            model_score_diffs = corrupted - R
            noise_dir = corrupted - X

            model_score = self.vis_prec * model_score_diffs
            model_score.name = 'model_score'
            data_score = noise_dir / self.noise_var

            score_diffs = data_score - model_score

            obj = T.mean(T.sqr(score_diffs))

            HX = T.nnet.sigmoid((self.s / self.wnorms) * T.dot(X, self.W)
                                - self.s + self.b)
            RX = T.dot(HX, self.W.T) + self.c
            recons_diff_X = RX - X
            recons_norms = T.sum(T.sqr(recons_diff_X), axis=1)
            recons_dir = recons_diff_X / (
                1e-14 + T.sqrt(recons_norms.dimshuffle((0, 'x'))))
            self.recons_dir_func = function([X], recons_dir)
        elif self.clean_contrastive_coeff > 0:
            assert not self.different_examples
            E_d_0 = self.batch_free_energy(X)
            clean_contrastive = T.mean(-T.log(T.nnet.sigmoid(E_d - E_d_0)))
            obj = noise_contrastive + self.clean_contrastive_coeff * clean_contrastive
        else:
            obj = noise_contrastive

        self.error_func = function(inputs, obj)

        misclass_batch = (E_c < E_d)
        misclass_batch.name = 'misclass_batch'
        misclass = misclass_batch.mean()
        misclass.name = 'misclass'

        #print 'maker'
        #print theano.printing.debugprint(self.error_func.maker.env.outputs[0])
        #print 'obj'
        #print theano.printing.debugprint(obj)

        self.E_d_func = function(inputs, E_d.mean())
        self.E_d_batch_func = function(inputs, E_d)
        self.E_X_batch_func = function([X2], E_d)
        self.E_c_func = function(inputs, E_c.mean())
        self.sqnorm_grad_E_c_func = function(
            inputs, T.sum(T.sqr(T.grad(T.mean(E_c), corrupted))))
        self.sqnorm_grad_E_d_func = function(
            inputs, T.sum(T.sqr(T.grad(T.mean(E_d), X2))))
        self.misclass_func = function(inputs, misclass)
        #self.norm_misclass_func = function([X], ( T.sum(T.sqr(corrupted),axis=1) < T.sum(T.sqr(X),axis=1) ).mean())
        #self.norm_c_func = function([X], T.sum(T.sqr(corrupted),axis=1).mean())
        #self.norm_d_func = function([X], T.sum(T.sqr(X),axis=1).mean())

        grads = [T.grad(obj, param) for param in self.params]

        learn_inputs = [ipt for ipt in inputs]
        learn_inputs.append(alpha)

        self.learn_func = function(learn_inputs,
                                   updates=[(param, param - alpha * grad)
                                            for (param, grad) in zip(self.params, grads)],
                                   name='learn_func')

        if self.energy_function != 'mse autoencoder':
            self.recons_func = function([X], self.gibbs_step_exp(X),
                                        name='recons_func')
        #

        post_existing_names = dir(self)

        self.names_to_del = [name for name in post_existing_names
                             if name not in pre_existing_names]

    def learn(self, dataset, batch_size):
        self.learn_mini_batch([dataset.get_batch_design(batch_size)
                               for x in xrange(1 + self.different_examples)])

    def recons_func(self, x):
        rval = N.zeros(x.shape)
        for i in xrange(x.shape[0]):
            rval[i, :] = self.gibbs_step_exp(x[i, :])

        return rval

    def print_suite(self, dataset, batch_size, batches, things_to_print):
        self.theano_rng.seed(5)

        tracker = {}
        for thing in things_to_print:
            tracker[thing[0]] = []

        for i in xrange(batches):
            x = dataset.get_batch_design(batch_size)
            assert x.shape == (batch_size, self.nvis)

            if self.different_examples:
                inputs = [x, dataset.get_batch_design(batch_size)]
            else:
                inputs = [x]

            for thing in things_to_print:
                tracker[thing[0]].append(thing[1](*inputs))

        for thing in things_to_print:
            print thing[0] + ': ' + str(N.asarray(tracker[thing[0]]).mean())
    #
    #

    def record_monitoring_error(self, dataset, batch_size, batches):
        assert self.error_record_mode == self.ERROR_RECORD_MODE_MONITORING

        print 'noise variance (before norm rescaling): ' + str(self.noise_var.get_value())

        # always use the same seed for monitoring error
        self.theano_rng.seed(5)

        errors = []
        misclasses = []

        for i in xrange(batches):
            x = dataset.get_batch_design(batch_size)
            assert x.shape == (batch_size, self.nvis)

            if self.different_examples:
                inputs = [x, dataset.get_batch_design(batch_size)]
            else:
                inputs = [x]

            error = self.error_func(*inputs)
            errors.append(error)

            misclass = self.misclass_func(*inputs)
            misclasses.append(misclass)
        #

        misclass = N.asarray(misclasses).mean()
        print 'misclassification rate: ' + str(misclass)

        error = N.asarray(errors).mean()

        assert not N.isnan(misclass)
        assert not N.isnan(error)

        self.error_record.append((self.examples_seen, self.batches_seen,
                                  error, self.noise_var.get_value(), misclass))

        print "TODO: restore old theano_rng state instead of jumping to new one"
        self.theano_rng.seed(self.rng.randint(2**30))
    #

    def reconstruct(self, x, use_noise):
        assert x.shape[0] == 1

        print 'x summary: ' + str((x.min(), x.mean(), x.max()))

        # this method is mostly a hack to make the formatting work the same as
        # denoising autoencoder
        self.truth_shared = shared(x.copy())

        if use_noise:
            self.vis_shared = shared(self.corruption_func(x))
        else:
            self.vis_shared = shared(x.copy())

        self.reconstruction = self.recons_func(self.vis_shared.get_value())

        print 'recons summary: ' + str((self.reconstruction.min(),
                                        self.reconstruction.mean(),
                                        self.reconstruction.max()))

    def gibbs_step_exp(self, V):
        base_name = V.name

        if base_name is None:
            base_name = 'anon'

        Q = self.p_h_given_v(V)
        H = self.sample_hid(Q)
        H.name = base_name + '->hid_sample'
        sample = self.c + T.dot(H, self.W_T)
        sample.name = base_name + '->sample_expectation'

        return sample

    def sample_hid(self, Q):
        return self.theano_rng.binomial(size=Q.shape, n=1, p=Q, dtype=Q.dtype)

    def learn_mini_batch(self, inputs):
        for x in inputs:
            assert x.shape[1] == self.nvis

        cur_misclass = self.misclass_func(*inputs)

        if self.misclass == -1:
            self.misclass = cur_misclass
        else:
            self.misclass = (self.time_constant * cur_misclass +
                             (1. - self.time_constant) * self.misclass)
        #print 'current misclassification rate: '+str(self.misclass)

        if self.misclass > self.max_misclass:
            self.noise_var.set_value(
                min(self.max_noise_var,
                    self.noise_var.get_value() * self.noise_var_scale_up))
        elif self.misclass < self.min_misclass:
            self.noise_var.set_value(
                max(1e-8,
                    self.noise_var.get_value() * self.noise_var_scale_down))
        #

        learn_inputs = [ipt for ipt in inputs]
        learn_inputs.append(self.learning_rate)

        self.learn_func(*learn_inputs)

        self.examples_seen += x.shape[0]
        self.batches_seen += 1
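# A sketch of one way to address the TODO in record_monitoring_error above:
# snapshot the per-variable generators of the model's RandomStreams before the
# fixed-seed monitoring pass, then restore them afterwards instead of jumping
# to a fresh seed. This assumes the RandomStreams object keeps its generators
# in `state_updates`, whose entries start with the shared RandomState variable
# (the same attribute the MaskGenerator code later in this document relies on
# for MRG_RandomStreams). The helper name is illustrative, not an existing API.
def run_with_monitoring_seed(theano_rng, monitoring_fn, seed=5):
    saved_states = [su[0].get_value(borrow=False)
                    for su in theano_rng.state_updates]
    theano_rng.seed(seed)  # deterministic noise for the monitoring pass
    try:
        return monitoring_fn()
    finally:
        # put the training-time generators back
        for su, state in zip(theano_rng.state_updates, saved_states):
            su[0].set_value(state, borrow=False)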
from theano import scan, function
from theano.tensor.shared_randomstreams import RandomStreams
import time
import numpy as np

srng = RandomStreams(seed=234)
rv_u = srng.uniform((2, 2))
rv_n = srng.normal((2, 2))
f = function([], rv_u)
g = function([], rv_n, no_default_updates=True)
nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)

# seeding
rng_val = rv_u.rng.get_value(borrow=True)
rng_val.seed(89234)
rv_u.rng.set_value(rng_val)
srng.seed(100)

state_after_v0 = rv_u.rng.get_value().get_state()
f()
f()
nearly_zeros()
v1 = f()
rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_u.rng.set_value(rng, borrow=True)
v2 = f()
v3 = f()

# Derivatives
class Network(object):
    ''' Core neural network class that forms the basis for all further
    implementations (e.g. MultilayerNet, Autoencoder, etc). Contains basic
    functions for propagating data forward and backwards through the network,
    as well as fitting the weights to data'''

    def __init__(self, d=None, k=None, num_hids=None, activs=None,
                 loss_terms=[None], **loss_params):

        # Number of units in the output layer is determined by k, so it is not
        # explicitly specified in num_hids. Still need to check that there's one
        # less hidden layer than the number of activation functions.
        assert len(num_hids) + 1 == len(activs)

        self.num_nodes = [d] + num_hids + [k]

        # needed mainly for gradient checking...
        self.num_params = 0
        for i, (n1, n2) in enumerate(zip(self.num_nodes[:-1], self.num_nodes[1:])):
            self.num_params += (n1 + 1) * n2

        self.activs = [None] * len(activs)
        for idx, activ in enumerate(activs):
            if activ == 'sigmoid':
                self.activs[idx] = na.sigmoid
            elif activ == 'tanh':
                self.activs[idx] = na.tanh
            elif activ == 'reLU':
                self.activs[idx] = na.reLU
            elif activ == 'softmax':
                self.activs[idx] = na.softmax
            else:
                sys.exit(ne.activ_err())

        self.loss_terms = loss_terms
        self.loss_params = loss_params

        self.srng = RandomStreams()
        self.srng.seed(np.random.randint(99999))

    def set_weights(self, wts=None, bs=None, init_method=None,
                    scale_factor=None, seed=None):
        ''' Initializes the weights and biases of the neural network

        Parameters:
        -----------
        param: wts - weights
        type: np.ndarray, optional

        param: bs - biases
        type: np.ndarray, optional

        param: init_method - calls some pre-specified weight initialization routines
        type: string

        param: scale_factor - additional hyperparameter for weight initialization
        type: float, optional

        param: seed - seeds the random number generator
        type: int, optional
        '''
        if seed is not None:
            np.random.seed(seed=seed)
            self.srng.seed(seed)

        if wts is None and bs is None:
            wts = (len(self.num_nodes) - 1) * [None]
            bs = (len(self.num_nodes) - 1) * [None]

            if init_method == 'gauss':
                for i, (n1, n2) in enumerate(zip(self.num_nodes[:-1], self.num_nodes[1:])):
                    wts[i] = scale_factor * 1. / \
                        np.sqrt(n2) * np.random.randn(n1, n2)
                    bs[i] = np.zeros(n2)

            elif init_method == 'fan-io':
                for i, (n1, n2) in enumerate(zip(self.num_nodes[:-1], self.num_nodes[1:])):
                    v = scale_factor * np.sqrt(6. / (n1 + n2 + 1))
                    wts[i] = 2.0 * v * np.random.rand(n1, n2) - v
                    bs[i] = np.zeros(n2)
            else:
                sys.exit(ne.weight_error())

        else:
            # this scenario occurs most when doing unsupervised pre-training to
            # initialize the weights
            assert isinstance(wts, list)
            assert isinstance(bs, list)

        self.wts_ = [theano.shared(nu.floatX(wt), borrow=True) for wt in wts]
        self.bs_ = [theano.shared(nu.floatX(b), borrow=True) for b in bs]

    def fit(self, X_tr, y_tr, X_val=None, y_val=None, wts=None, bs=None,
            plotting=False, **optim_params):
        ''' The primary function which ingests data and fits to the neural network.

        Parameters:
        -----------
        param: X_tr - training data
        type: theano matrix

        param: y_tr - training labels
        type: theano matrix

        param: X_val - validation data
        type: theano matrix

        param: y_val - validation labels
        type: theano matrix

        param: plotting - specifies whether any curves should be generated
        type: boolean

        param: **optim_params
        type: dictionary of optimization parameters
        '''
        # initialize weights...
        if all(node for node in self.num_nodes):
            init_method = optim_params.pop('init_method')
            scale_factor = optim_params.pop('scale_factor')
            try:
                seed = optim_params.pop('seed')
            except KeyError:
                seed = None
            self.set_weights(wts=wts, bs=bs, init_method=init_method,
                             scale_factor=scale_factor, seed=seed)

        # ...and train
        try:
            optim_type = optim_params.pop('optim_type')
        except KeyError:
            sys.exit(ne.opt_type_err())

        num_epochs = optim_params.pop('num_epochs', None)
        batch_size = optim_params.pop('batch_size', None)

        if optim_type == 'minibatch':
            self.minibatch_optimize(X_tr, y_tr, X_val=X_val, y_val=y_val,
                                    batch_size=batch_size, num_epochs=num_epochs,
                                    plotting=plotting, **optim_params)
        elif optim_type == 'fullbatch':
            self.fullbatch_optimize(X_tr, y_tr, X_val=X_val, y_val=y_val,
                                    num_epochs=num_epochs, **optim_params)
        else:
            sys.exit(ne.opt_type_err())

        return self

    def shared_dataset(self, X, y):
        ''' As per the deep learning tutorial, loading the data all at once
        (if possible) into the GPU will significantly speed things up '''
        return theano.shared(nu.floatX(X)), theano.shared(nu.floatX(y))

    def fullbatch_optimize(self, X_tr, y_tr, X_val=None, y_val=None,
                           num_epochs=None, **optim_params):
        ''' Full-batch optimization using scipy's L-BFGS-B and CG

        Parameters:
        -----------
        param: X_tr - training data
        type: theano matrix

        param: y_tr - training labels
        type: theano matrix

        param: num_epochs - the number of full runs through the dataset
        type: int

        param: **optim_params
        type: dictionary of optimization parameters
        '''
        X = T.matrix('X')  # input variable
        y = T.matrix('y')  # output variable
        w = T.vector('w')  # weight vector

        # reshape the vector w into weight and bias matrices, and set up the
        # theano graph to compute the loss and gradient
        wts, bs = nu.t_reroll(w, self.num_nodes)
        optim_loss = self.compute_optim_loss(X, y, wts=wts, bs=bs)
        params = [p for param in [wts, bs] for p in param]
        grad_params = [T.grad(optim_loss, param) for param in params]
        grad_w = nu.t_unroll(grad_params[:len(wts)], grad_params[len(wts):])

        compute_loss_grad_from_vector = theano.function(
            inputs=[w, X, y],
            outputs=[optim_loss, grad_w],
            allow_input_downcast=True)

        compute_loss_from_vector = theano.function(
            inputs=[w, X, y],
            outputs=[optim_loss],
            allow_input_downcast=True)

        # initialize the weight vector and perform full-batch optimization
        wts0 = [wt.get_value() for wt in self.wts_]
        bs0 = [b.get_value() for b in self.bs_]
        w0 = nu.unroll(wts0, bs0)

        # print 'Checking gradients...'
        # self.check_gradients(X_tr,y_tr,wts0,bs0)
        # print 'Pre-training loss:',compute_loss_from_vector(w0,X_tr,y_tr)

        try:
            optim_method = optim_params.pop('optim_method')
        except KeyError:
            sys.exit(ne.method_err())

        # very annoying.
        if optim_method == 'L-BFGS-B' and theano.config.floatX == 'float32':
            sys.exit('Sorry, L-BFGS-B only works with float64')

        wf = sp.optimize.minimize(compute_loss_grad_from_vector, w0,
                                  args=(X_tr, y_tr), method=optim_method,
                                  jac=True, options={'maxiter': num_epochs})

        # re-roll back into weights and biases
        wts, bs = nu.reroll(wf.x, self.num_nodes)

        self.wts_ = [theano.shared(nu.floatX(wt)) for wt in wts]
        self.bs_ = [theano.shared(nu.floatX(b)) for b in bs]

    def minibatch_optimize(self, X_tr, y_tr, X_val=None, y_val=None,
                           batch_size=None, num_epochs=None, plotting=False,
                           **optim_params):
        ''' Mini-batch optimization using update functions; however, if the
        batch size = m, then this is basically full-batch learning with
        gradient descent

        Parameters:
        -----------
        param: X_tr - training data
        type: theano matrix

        param: y_tr - training labels
        type: theano matrix

        param: updates - update rule for each param

        param: batch_size - number of examples per mini-batch
        type: int

        param: num_epochs - the number of full runs through the dataset
        type: int

        param: **optim_params
        type: dictionary of optimization parameters
        '''
        X = T.matrix('X')  # input variable
        y = T.matrix('y')  # output variable
        idx = T.ivector('idx')  # integer index

        optim_loss = self.compute_optim_loss(X, y)
        eval_loss = self.compute_eval_loss(X, y)

        params = [p for param in [self.wts_, self.bs_] for p in param]
        grad_params = [T.grad(optim_loss, param) for param in params]

        try:
            optim_method = optim_params.pop('optim_method')
        except KeyError:
            sys.exit(ne.method_err())

        # define the update rule
        updates = []
        if optim_method == 'SGD':
            updates = nopt.sgd(params, grad_params, **optim_params)  # update rule
        elif optim_method == 'ADAGRAD':
            updates = nopt.adagrad(params, grad_params, **optim_params)  # update rule
        elif optim_method == 'RMSPROP':
            updates = nopt.rmsprop(params, grad_params, **optim_params)
        else:
            print ne.method_err()

        # define the mini-batches
        m = X_tr.shape[0]  # total number of training instances
        # number of batches, based on batch size
        n_batches = int(m / batch_size)
        # batch_size won't divide the data evenly, so get leftover
        leftover = m - n_batches * batch_size

        # load the full dataset into a shared variable - this is especially
        # useful for test
        X_tr, y_tr = self.shared_dataset(X_tr, y_tr)

        # training function for mini-batches
        train = theano.function(
            inputs=[idx],
            updates=updates,
            allow_input_downcast=True,
            mode='FAST_RUN',
            givens={
                X: X_tr[idx],
                y: y_tr[idx]
            })

        compute_train_loss = theano.function(
            inputs=[],
            outputs=eval_loss,
            allow_input_downcast=True,
            mode='FAST_RUN',
            givens={
                X: X_tr,
                y: y_tr
            })

        # if validation data is provided, validation loss
        compute_val_loss = None
        if X_val is not None and y_val is not None:
            X_val, y_val = self.shared_dataset(X_val, y_val)
            compute_val_loss = theano.function(
                inputs=[],
                outputs=eval_loss,
                allow_input_downcast=True,
                mode='FAST_RUN',
                givens={
                    X: X_val,
                    y: y_val
                })

        # iterate through the training examples
        tr_loss = []
        val_loss = []
        epoch = 0
        while epoch < num_epochs:
            # randomly shuffle the data indices
            tr_idx = np.random.permutation(m)

            # define the start-stop indices
            ss_idx = range(0, m + 1, batch_size)
            ss_idx[-1] += leftover  # add the leftovers to the last batch

            # run through a full epoch
            for idx, (start_idx, stop_idx) in enumerate(zip(ss_idx[:-1], ss_idx[1:])):

                # total number of batches processed up until now
                n_batch_iter = (epoch - 1) * n_batches + idx
                batch_idx = tr_idx[start_idx:stop_idx]  # get the next batch
                train(batch_idx)

            epoch += 1  # update the epoch count

            if epoch % 10 == 0:
                tr_loss.append(compute_train_loss())
                if compute_val_loss is not None:
                    val_loss.append(compute_val_loss())
                    print 'Epoch: %s, Training error: %.15f, Validation error: %.15f' % (
                        epoch, tr_loss[-1], val_loss[-1])
                else:
                    print 'Epoch: %s, Training error: %.15f' % (epoch, tr_loss[-1])

        # training and validation curves - very useful to see how training
        # error evolves
        if plotting:
            num_pts = len(tr_loss)
            pts = [idx * 10 for idx in range(num_pts)]
            plt.plot(pts, tr_loss, label='Training loss')

            # sort of a weak way to check if validation losses have been
            # computed
            if len(val_loss) > 0:
                plt.plot(pts, val_loss, label='Validation loss')

            plt.xlabel('Epochs')
            plt.ylabel('Loss')
            plt.legend(loc='upper right')
            plt.show()

    def dropout(self, act, p=0.5):
        ''' Randomly drops an activation with probability p

        Parameters
        ----------
        param: act - activation values, in a matrix
        type: theano matrix

        param: p - probability of dropping out a node
        type: float, optional

        Returns:
        --------
        param: [expr] - activation values randomly zeroed out
        type: theano matrix
        '''
        if p > 0:
            # randomly drop out activations with probability p
            retain_prob = 1. - p
            return (1. / retain_prob) * act * self.srng.binomial(
                act.shape, p=retain_prob, dtype=theano.config.floatX)

    def train_fprop(self, X, wts=None, bs=None):
        ''' Performs forward propagation for training, which could be different
        from the vanilla fprop we would use for testing, due to extra bells and
        whistles such as dropout, corruption, etc'''
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_

        if 'dropout' in self.loss_terms:
            input_p = self.loss_params['input_p']
            hidden_p = self.loss_params['hidden_p']

            # compute the first activation separately in case we have no hidden
            # layer
            act = self.activs[0](T.dot(self.dropout(X, input_p), wts[0]) + bs[0])
            if len(wts) > 1:  # len(wts) = 1 corresponds to softmax regression
                for i, (w, b, activ) in enumerate(zip(wts[1:], bs[1:], self.activs[1:])):
                    act = activ(T.dot(self.dropout(act, hidden_p), w) + b)

            eps = 1e-6
            act = T.switch(act < eps, eps, act)
            act = T.switch(act > (1. - eps), (1. - eps), act)

            return act
        else:
            return self.fprop(X, wts, bs)

    def fprop(self, X, wts=None, bs=None):
        ''' Performs vanilla forward propagation through the network

        Parameters
        ----------
        param: X - training data
        type: theano matrix

        param: wts - weights
        type: theano matrix

        param: bs - biases
        type: theano matrix

        Returns:
        --------
        param: act - final activation values
        type: theano matrix
        '''
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_

        # use the first data matrix to compute the first activation
        act = self.activs[0](T.dot(X, wts[0]) + bs[0])
        # len(wts) = 1 corresponds to softmax regression
        if len(wts) > 1:
            for i, (w, b, activ) in enumerate(zip(wts[1:], bs[1:], self.activs[1:])):
                act = activ(T.dot(act, w) + b)

        # for numerical stability
        eps = 1e-6
        act = T.switch(act < eps, eps, act)
        act = T.switch(act > (1. - eps), (1. - eps), act)

        return act

    def check_gradients(self, X_in, Y_in, wts=None, bs=None):
        ''' this seems like overkill, but I suppose it doesn't hurt to have it
        in here...'''
        # assume that if it's not provided, they will be shared variables - this is
        # probably dangerous, but this is a debugging tool anyway,
        # so...whatever
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_
        else:
            wts = [theano.shared(nu.floatX(w), borrow=True) for w in wts]
            bs = [theano.shared(nu.floatX(b), borrow=True) for b in bs]

        X = T.matrix()   # inputs
        Y = T.matrix()   # labels
        v = T.vector()   # vector of biases and weights
        i = T.lscalar()  # index

        # 1. compile the numerical gradient function
        def compute_numerical_gradient(v, i, X, Y, eps=1e-4):
            # perturb the input
            v_plus = T.inc_subtensor(v[i], eps)
            v_minus = T.inc_subtensor(v[i], -1.0 * eps)

            # roll it back into the weight matrices and bias vectors
            wts_plus, bs_plus = nu.t_reroll(v_plus, self.num_nodes)
            wts_minus, bs_minus = nu.t_reroll(v_minus, self.num_nodes)

            # compute the loss for both sides, and then compute the numerical
            # gradient
            loss_plus = self.compute_optim_loss(X, Y, wts=wts_plus, bs=bs_plus)
            loss_minus = self.compute_optim_loss(X, Y, wts_minus, bs_minus)

            # ( E(weights[i]+eps) - E(weights[i]-eps) )/(2*eps)
            return 1.0 * (loss_plus - loss_minus) / (2 * eps)

        compute_ngrad = theano.function(
            inputs=[v, i, X, Y],
            outputs=compute_numerical_gradient(v, i, X, Y))

        # 2. compile backprop (theano's autodiff)
        optim_loss = self.compute_optim_loss(X, Y, wts=wts, bs=bs)
        params = [p for param in [wts, bs] for p in param]  # all model parameters in a list
        # gradient of each model param w.r.t training loss
        grad_params = [T.grad(optim_loss, param) for param in params]
        # gradient of the full weight vector
        grad_w = nu.t_unroll(grad_params[:len(wts)], grad_params[len(wts):])

        compute_bgrad = theano.function(inputs=[X, Y], outputs=grad_w)

        # compute the mean difference between the numerical and exact gradients
        v0 = nu.unroll([wt.get_value() for wt in wts],
                       [b.get_value() for b in bs])

        # get the indices of the weights/biases we want to check
        idxs = np.random.permutation(self.num_params)[:(self.num_params / 5)]

        ngrad = [None] * len(idxs)
        for j, idx in enumerate(idxs):
            ngrad[j] = compute_ngrad(v0, idx, X_in, Y_in)
        bgrad = compute_bgrad(X_in, Y_in)[idxs]

        cerr = np.mean(np.abs(ngrad - bgrad))
        assert cerr < 1e-10

    def compute_eval_loss(self, X, y, wts=None, bs=None):
        ''' Given inputs, returns the evaluation loss at the current state of the model

        Parameters:
        -----------
        param: X - training data
        type: theano matrix

        param: y - training labels
        type: theano matrix

        param: wts - weights
        type: theano matrix, optional

        param: bs - biases
        type: theano matrix, optional

        Returns:
        --------
        param: eval_loss - evaluation loss, which doesn't include regularization
        type: theano scalar
        '''
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_

        eval_loss = None  # the loss function we can evaluate during validation
        y_pred = self.fprop(X, wts, bs)

        if 'cross_entropy' in self.loss_terms:
            eval_loss = nl.cross_entropy(y, y_pred)
        elif 'binary_cross_entropy' in self.loss_terms:
            eval_loss = nl.binary_cross_entropy(y, y_pred)
        elif 'squared_error' in self.loss_terms:
            eval_loss = nl.squared_error(y, y_pred)
        else:
            sys.exit('Must be either cross_entropy or squared_error')

        return eval_loss

    def compute_optim_loss(self, X, y, wts=None, bs=None):
        ''' Given inputs, returns the training loss at the current state of the model

        Parameters:
        -----------
        param: X - training data
        type: theano matrix

        param: y - training labels
        type: theano matrix

        param: wts - weights
        type: theano matrix, optional

        param: bs - biases
        type: theano matrix, optional

        Returns:
        --------
        param: optim_loss - the optimization loss which must be optimized over
        type: theano scalar
        '''
        if wts is None and bs is None:
            wts = self.wts_
            bs = self.bs_

        y_optim = self.train_fprop(X, wts, bs)

        optim_loss = None  # the loss function which will specifically be optimized over

        if 'cross_entropy' in self.loss_terms:
            optim_loss = nl.cross_entropy(y, y_optim)
        elif 'binary_cross_entropy' in self.loss_terms:
            optim_loss = nl.binary_cross_entropy(y, y_optim)
        elif 'squared_error' in self.loss_terms:
            optim_loss = nl.squared_error(y, y_optim)
        else:
            sys.exit('Must be either cross_entropy or squared_error')

        if 'l1_reg' in self.loss_terms:
            l1_decay = self.loss_params.get('l1_decay')
            optim_loss += nl.l1_reg(wts, l1_decay=l1_decay)

        if 'l2_reg' in self.loss_terms:
            l2_decay = self.loss_params.get('l2_decay')
            optim_loss += nl.l2_reg(wts, l2_decay=l2_decay)

        return optim_loss

    def get_weights_and_biases(self):
        ''' simple function which returns the weights and biases as numpy arrays'''
        wts = [wt.get_value() for wt in self.wts_]
        bs = [b.get_value() for b in self.bs_]

        return wts, bs

    # debugging
    def check_nans(self):
        ''' simple function which returns True if any value is NaN in wts or biases '''
        # poke into the shared variables and get their values
        wts, bs = self.get_weights_and_biases()

        nans = 0
        for wt, b in zip(wts, bs):
            nans += np.sum(wt) + np.sum(b)

        return np.isnan(nans)
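# A quick numeric sanity check, separate from the class above, of why
# Network.dropout multiplies by 1 / retain_prob: with this "inverted dropout"
# scaling the expected value of the masked activation equals the original
# activation, so no rescaling is needed at test time. Plain numpy stands in
# for the Theano binomial sampling; the numbers are illustrative only.
import numpy as np

rng = np.random.RandomState(0)
act = np.ones((1000, 200))      # pretend activations
p = 0.5                         # drop probability, as in dropout(act, p=0.5)
retain_prob = 1. - p
mask = rng.binomial(n=1, p=retain_prob, size=act.shape)
dropped = (1. / retain_prob) * act * mask
print(dropped.mean())           # ~1.0, i.e. close to act.mean()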
print(f())  # different uniform numbers
print(g())  # different normal numbers
print(g())  # same normal numbers as the prev. call

# NOTE: a single RV is sampled only once in one function call, regardless of how many
# times it appears in the formula (which makes sense, in math it is the same)
nearly_zeros = function([], rv_unif + rv_unif - 2 * rv_unif)
print(nearly_zeros())  # returns 0

# Using seeds: you can seed each RV separately or all at once (pretty much to the same effect)
rng_val = rv_unif.rng.get_value(borrow=True)
rng_val.seed(81232)
rv_unif.rng.set_value(rng_val, borrow=True)
# or all at once
srng.seed(123321)

# and to explicitly show that RandomStreams have a shared state:
state_after_v0 = rv_unif.rng.get_value().get_state()
nearly_zeros()
v1 = f()
# Go one step back
rng = rv_unif.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_unif.rng.set_value(rng, borrow=True)
print(v1 == f())  # False
print(v1 == f())  # True

""" Copying random states from one function to another """
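# A minimal sketch of the pattern named just above, assuming both graphs draw
# their random variables from theano.tensor.shared_randomstreams.RandomStreams
# and that each stream keeps its per-variable generators in `state_updates`
# (the attribute other code in this document also uses). The Graph class and
# copy_random_state helper are illustrative names, not an existing API.
import theano
from theano.tensor.shared_randomstreams import RandomStreams


class Graph(object):
    def __init__(self, seed=123):
        self.rng = RandomStreams(seed)
        self.y = self.rng.uniform(size=(1,))


g1 = Graph(seed=123)
f1 = theano.function([], g1.y)

g2 = Graph(seed=987)
f2 = theano.function([], g2.y)

print(f1())  # the two streams start from different states,
print(f2())  # so these two values differ


def copy_random_state(src, dst):
    # copy every per-variable RandomState from src's stream into dst's stream
    for su_src, su_dst in zip(src.rng.state_updates, dst.rng.state_updates):
        su_dst[0].set_value(su_src[0].get_value())


copy_random_state(g1, g2)
print(f2())  # now equal to what f1() would return next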
class MaskGenerator(object):

    def __init__(self, input_size, hidden_sizes, l, random_seed=1234):
        self._random_seed = random_seed
        self._mrng = MRG_RandomStreams(seed=random_seed)
        self._rng = RandomStreams(seed=random_seed)

        self._hidden_sizes = hidden_sizes
        self._input_size = input_size
        self._l = l

        self.ordering = theano.shared(value=np.arange(input_size, dtype=theano.config.floatX),
                                      name='ordering',
                                      borrow=False)

        # Initial layer connectivity
        self.layers_connectivity = [theano.shared(value=(self.ordering + 1).eval(),
                                                  name='layer_connectivity_input',
                                                  borrow=False)]
        for i in range(len(self._hidden_sizes)):
            self.layers_connectivity += [theano.shared(value=np.zeros((self._hidden_sizes[i]),
                                                                      dtype=theano.config.floatX),
                                                       name='layer_connectivity_hidden{0}'.format(i),
                                                       borrow=False)]
        self.layers_connectivity += [self.ordering]

        ## Theano functions
        new_ordering = self._rng.shuffle_row_elements(self.ordering)
        self.shuffle_ordering = theano.function(name='shuffle_ordering',
                                                inputs=[],
                                                updates=[(self.ordering, new_ordering),
                                                         (self.layers_connectivity[0], new_ordering + 1)])

        self.layers_connectivity_updates = []
        for i in range(len(self._hidden_sizes)):
            self.layers_connectivity_updates += [self._get_hidden_layer_connectivity(i)]
        # self.layers_connectivity_updates = [self._get_hidden_layer_connectivity(i) for i in range(len(self._hidden_sizes))]  # WTF THIS DO NOT WORK
        self.sample_connectivity = theano.function(name='sample_connectivity',
                                                   inputs=[],
                                                   updates=[(self.layers_connectivity[i + 1], self.layers_connectivity_updates[i])
                                                            for i in range(len(self._hidden_sizes))])

        # Save random initial state
        self._initial_mrng_rstate = copy.deepcopy(self._mrng.rstate)
        self._initial_mrng_state_updates = [state_update[0].get_value()
                                            for state_update in self._mrng.state_updates]

        # Ensuring valid initial connectivity
        self.sample_connectivity()

    def reset(self):
        # Set original ordering
        self.ordering.set_value(np.arange(self._input_size, dtype=theano.config.floatX))

        # Reset RandomStreams
        self._rng.seed(self._random_seed)

        # Initial layer connectivity
        self.layers_connectivity[0].set_value((self.ordering + 1).eval())
        for i in range(1, len(self.layers_connectivity) - 1):
            self.layers_connectivity[i].set_value(np.zeros((self._hidden_sizes[i - 1]),
                                                           dtype=theano.config.floatX))
        self.layers_connectivity[-1].set_value(self.ordering.get_value())

        # Reset MRG_RandomStreams (GPU)
        self._mrng.rstate = self._initial_mrng_rstate
        for state, value in zip(self._mrng.state_updates, self._initial_mrng_state_updates):
            state[0].set_value(value)

        self.sample_connectivity()

    def _get_p(self, start_choice):
        start_choice_idx = (start_choice - 1).astype('int32')
        p_vals = T.concatenate([T.zeros((start_choice_idx,)),
                                T.nnet.nnet.softmax(self._l * T.arange(start_choice, self._input_size,
                                                                       dtype=theano.config.floatX))[0]])
        # Hack because the multinomial does not contain a safety for numerical imprecision.
        p_vals = T.inc_subtensor(p_vals[start_choice_idx], 1.)
        return p_vals

    def _get_hidden_layer_connectivity(self, layerIdx):
        layer_size = self._hidden_sizes[layerIdx]
        if layerIdx == 0:
            p_vals = self._get_p(T.min(self.layers_connectivity[layerIdx]))
        else:
            p_vals = self._get_p(T.min(self.layers_connectivity_updates[layerIdx - 1]))

        # Implementations of np.choose in theano GPU
        # return T.nonzero(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX))[1].astype(dtype=theano.config.floatX)
        # return T.argmax(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX), axis=1)
        return T.sum(T.cumsum(self._mrng.multinomial(pvals=T.tile(p_vals[::-1][None, :], (layer_size, 1)),
                                                     dtype=theano.config.floatX), axis=1), axis=1)

    def _get_mask(self, layerIdxIn, layerIdxOut):
        return (self.layers_connectivity[layerIdxIn][:, None] <=
                self.layers_connectivity[layerIdxOut][None, :]).astype(theano.config.floatX)

    def get_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(layerIdx, layerIdx + 1)

    def get_direct_input_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(0, layerIdx)

    def get_direct_output_mask_layer_UPDATE(self, layerIdx):
        return self._get_mask(layerIdx, -1)
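# A small numpy illustration, independent of the class above, of what
# _get_mask computes: unit j of the outgoing layer is allowed to see unit i of
# the incoming layer only when the incoming unit's connectivity label is <=
# the outgoing unit's label. The concrete label values here are made up.
import numpy as np

in_connectivity = np.array([1, 3, 2])        # e.g. input ordering + 1
out_connectivity = np.array([2, 1, 3, 2])    # e.g. sampled hidden connectivity
mask = (in_connectivity[:, None] <= out_connectivity[None, :]).astype('float32')
print(mask)
# [[1. 1. 1. 1.]
#  [0. 0. 1. 0.]
#  [1. 0. 1. 1.]]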
# 8. Seed Streams
# Random variables can be seeded individually or collectively.
# You can seed just one random variable by seeding or assigning the .rng
# attribute, using the .rng.set_value().
rng_val = rv_u.rng.get_value(borrow=True)   # Get the rng for rv_u
rng_val.seed(89234)                         # seeds the generator
rv_u.rng.set_value(rng_val, borrow=True)    # Assign back seeded rng

# You can also seed all the random variables allocated by a RandomStreams
# object by that object's seed method. This seed will be used to seed a
# temporary random number generator, that will in turn generate seeds for
# each of the random variables.
print 'seed'
srng.seed(902340)  # seeds rv_u and rv_n with different seeds each
print f()
print f()
print g()
srng.seed(156456)
print g()

# 9. Sharing Streams Between Functions
# As usual for shared variables, the random number generators used for
# random variables are common between functions. So our nearly_zeros
# function will update the state of the generators used in function f
# above.
# For example:
state_after_v0 = rv_u.rng.get_value().get_state()
srng = RandomStreams(seed=234)
rv_u = srng.uniform((2, 2))
rv_n = srng.normal((2, 2))
f = function([], rv_u)
g = function([], rv_n, no_default_updates=True)
nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)

print(f())
print(f())
print(g())
print(g())
print(".....")
print(nearly_zeros())

srng.seed(902340)
print(f())
print(f())
print(g())
print(g())

state_after_v0 = rv_u.rng.get_value().get_state()
print(nearly_zeros())
v1 = f()
rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_u.rng.set_value(rng, borrow=True)
print(rng)
v2 = f()  # v2 != v1
v3 = f()  # v3 == v1
print(v2)
g = function([], rv_n, no_default_updates=True)
nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)

f_val0 = f()
f_val1 = f()
f_val0
f_val1

g_val0 = g()
g_val1 = g()
g_val0
g_val1

nearly_zeros()

rng_val = rv_u.rng.get_value(borrow=True)
rng_val.seed(89234)
rv_u.rng.set_value(rng_val, borrow=True)
srng.seed(902340)

rv_u
rv_u[0]
rv_u[0, 0]
help(rv_u)
rv_u.all()
rv_u.argmax()

state_after_v0 = rv_u.rng.get_value().get_state()
nearly_zeros()
v1 = f()
rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_u.rng.set_value(rng, borrow=True)
v2 = f()
rv_n = srng.normal((2, 2))
f = function([], rv_u)
g = function([], rv_n, no_default_updates=True)  # Not updating rv_n.rng
nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)

# Call the random number function - uniformly distributed
print f()
print f()

# Same value every time - no_default_updates=True
print g()  # different numbers from f_val0 and f_val1
print g()

# Seeding streams
rng_val = rv_u.rng.get_value(borrow=True)   # Get the rng for rv_u
rng_val.seed(89234)                         # seeds the generator
rv_u.rng.set_value(rng_val, borrow=True)    # Assign back seeded rng
srng.seed(902340)  # seeds rv_u and rv_n with different seeds each

state_after_v0 = rv_u.rng.get_value().get_state()
nearly_zeros()  # this affects rv_u's generator
v1 = f()
rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)
rv_u.rng.set_value(rng, borrow=True)
v2 = f()  # v2 != v1
v3 = f()  # v3 == v1
print v1, v2, v3
if compute_val_loss is not None: val_loss.append(compute_val_loss()) print 'Epoch: %s, Training error: %.15f, Validation error: %.15f' % ( epoch, tr_loss[-1], val_loss[-1]) else: print 'Epoch: %s, Training error: %.15f' % (epoch, tr_loss[-1]) # training and validation curves - very useful to see how training # error evolves if plotting: num_pts = len(tr_loss) pts = [idx * 10 for idx in range(num_pts)] plt.plot(pts, tr_loss, label='Training loss') # sort of a weak way to check if validation losses have been # computed if len(val_loss) > 0: plt.plot(pts, val_loss, label='Validation loss') plt.xlabel('Epochs') plt.ylabel('Loss') plt.legend(loc='upper right') plt.show() def dropout(self, act, p=0.5): ''' Randomly drops an activation with probability p Parameters ---------- param: act - activation values, in a matrix type: theano matrix param: p - probability of dropping out a node type: float, optional Returns: -------- param: [expr] - activation values randomly zeroed out type: theano matrix ''' if p > 0: # randomly dropout p activations retain_prob = 1. - p return (1. / retain_prob) * act * self.srng.binomial( act.shape, p=retain_prob, dtype=theano.config.floatX) def train_fprop(self, X, wts=None, bs=None): ''' Performs forward propagation with for training, which could be different from the vanilla frprop we would use for testing, due to extra bells and whistles such as dropout, corruption, etc''' if wts is None and bs is None: wts = self.wts_ bs = self.bs_ if 'dropout' in self.loss_terms: input_p = self.loss_params['input_p'] hidden_p = self.loss_params['hidden_p'] # compute the first activation separately in case we have no hidden # layer; act = self.activs[0](T.dot(self.dropout(X, input_p), wts[0]) + bs[0]) if len(wts) > 1: # len(wts) = 1 corresponds to softmax regression for i, (w, b, activ) in enumerate( zip(wts[1:], bs[1:], self.activs[1:])): act = activ(T.dot(self.dropout(act, hidden_p), w) + b) eps = 1e-6 act = T.switch(act < eps, eps, act) act = T.switch(act > (1. - eps), (1. - eps), act) return act else: return self.fprop(X, wts, bs) def fprop(self, X, wts=None, bs=None): ''' Performs vanilla forward propagation through the network Parameters ---------- param: X - training data type: theano matrix param: wts - weights type: theano matrix param: bs - biases type: theano matrix Returns: -------- param: act - final activation values type: theano matrix ''' if wts is None and bs is None: wts = self.wts_ bs = self.bs_ # use the first data matrix to compute the first activation act = self.activs[0](T.dot(X, wts[0]) + bs[0]) # len(wts) = 1 corresponds to softmax regression if len(wts) > 1: for i, (w, b, activ) in enumerate(zip(wts[1:], bs[1:], self.activs[1:])): act = activ(T.dot(act, w) + b) # for numericaly stability eps = 1e-6 act = T.switch(act < eps, eps, act) act = T.switch(act > (1. - eps), (1. - eps), act) return act def check_gradients(self, X_in, Y_in, wts=None, bs=None): ''' this seems like overkill, but I suppose it doesn't hurt to have it in here...''' # assume that if it's not provided, they will be shared variables - this is # probably dangerous, but this is a debugging tool anyway, # so...whatever if wts is None and bs is None: wts = self.wts_ bs = self.bs_ else: wts = [theano.shared(nu.floatX(w), borrow=True) for w in wts] bs = [theano.shared(nu.floatX(b), borrow=True) for b in bs] X = T.matrix() # inputs Y = T.matrix() # labels v = T.vector() # vector of biases and weights i = T.lscalar() # index # 1. 
compile the numerical gradient function def compute_numerical_gradient(v, i, X, Y, eps=1e-4): # perturb the input v_plus = T.inc_subtensor(v[i], eps) v_minus = T.inc_subtensor(v[i], -1.0 * eps) # roll it back into the weight matrices and bias vectors wts_plus, bs_plus = nu.t_reroll(v_plus, self.num_nodes) wts_minus, bs_minus = nu.t_reroll(v_minus, self.num_nodes) # compute the loss for both sides, and then compute the numerical # gradient loss_plus = self.compute_optim_loss(X, Y, wts=wts_plus, bs=bs_plus) loss_minus = self.compute_optim_loss(X, Y, wts_minus, bs_minus) # ( E(weights[i]+eps) - E(weights[i]-eps) )/(2*eps) return 1.0 * (loss_plus - loss_minus) / (2 * eps) compute_ngrad = theano.function(inputs=[v, i, X, Y], outputs=compute_numerical_gradient( v, i, X, Y)) # 2. compile backprop (theano's autodiff) optim_loss = self.compute_optim_loss(X, Y, wts=wts, bs=bs) params = [p for param in [wts, bs] for p in param] # all model parameters in a list # gradient of each model param w.r.t training loss grad_params = [T.grad(optim_loss, param) for param in params] # gradient of the full weight vector grad_w = nu.t_unroll(grad_params[:len(wts)], grad_params[len(wts):]) compute_bgrad = theano.function(inputs=[X, Y], outputs=grad_w) # compute the mean difference between the numerical and exact gradients v0 = nu.unroll([wt.get_value() for wt in wts], [b.get_value() for b in bs]) # get the indices of the weights/biases we want to check idxs = np.random.permutation(self.num_params)[:(self.num_params / 5)] ngrad = [None] * len(idxs) for j, idx in enumerate(idxs): ngrad[j] = compute_ngrad(v0, idx, X_in, Y_in) bgrad = compute_bgrad(X_in, Y_in)[idxs] cerr = np.mean(np.abs(ngrad - bgrad)) assert cerr < 1e-10 def compute_eval_loss(self, X, y, wts=None, bs=None): ''' Given inputs, returns the evaluation loss at the current state of the model Parameters: ----------- param: X - training data type: theano matrix param: y - training labels type: theano matrix param: wts - weights type: theano matrix, optional param: bs - biases type: theano matrix, optional Returns: -------- param: eval_loss - evaluation loss, which doesn't include regularization type: theano scalar ''' if wts is None and bs is None: wts = self.wts_ bs = self.bs_ eval_loss = None # the loss function we can evaluate during validation y_pred = self.fprop(X, wts, bs) if 'cross_entropy' in self.loss_terms: eval_loss = nl.cross_entropy(y, y_pred) elif 'binary_cross_entropy' in self.loss_terms: eval_loss = nl.binary_cross_entropy(y, y_pred) elif 'squared_error' in self.loss_terms: eval_loss = nl.squared_error(y, y_pred) else: sys.exit('Must be either cross_entropy or squared_error') return eval_loss def compute_optim_loss(self, X, y, wts=None, bs=None): ''' Given inputs, returns the training loss at the current state of the model Parameters: ----------- param: X - training data type: theano matrix param: y - training labels type: theano matrix param: wts - weights type: theano matrix, optional param: bs - biases type: theano matrix, optional Returns: -------- param: optim_loss - the optimization loss which must be optimized over type: theano scalar ''' if wts is None and bs is None: wts = self.wts_ bs = self.bs_ y_optim = self.train_fprop(X, wts, bs) # the loss function which will specifically be optimized over optim_loss = None if 'cross_entropy' in self.loss_terms: optim_loss = nl.cross_entropy(y, y_optim) elif 'binary_cross_entropy' in self.loss_terms: optim_loss = nl.binary_cross_entropy(y, y_optim) elif 'squared_error' in self.loss_terms: 
optim_loss = nl.squared_error(y, y_optim) else: sys.exit('Must be either cross_entropy or squared_error') if 'l1_reg' in self.loss_terms: l1_decay = self.loss_params.get('l1_decay') optim_loss += nl.l1_reg(wts, l1_decay=l1_decay) if 'l2_reg' in self.loss_terms: l2_decay = self.loss_params.get('l2_decay') optim_loss += nl.l2_reg(wts, l2_decay=l2_decay) return optim_loss def get_weights_and_biases(self): ''' simple function which returns the weights and biases as numpy arrays''' wts = [wt.get_value() for wt in self.wts_] bs = [b.get_value() for b in self.bs_] return wts, bs # debugging def check_nans(self): ''' simple function which returns True if any value is NaN in wts or biases ''' # poke into the shared variables and get their values wts, bs = self.get_weights_and_biases() nans = 0 for wt, b in zip(wts, bs): nans += np.sum(wt) + np.sum(b) return np.isnan(nans)
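# Standalone sketch (not part of the Network class) of the inverted-dropout trick used in
# Network.dropout above: zero activations with probability p via srng.binomial and rescale
# the survivors by 1/(1-p) so the expected activation is unchanged.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams(seed=1234)
p = 0.5                     # drop probability
retain_prob = 1. - p

act = T.matrix('act')
mask = srng.binomial(act.shape, p=retain_prob, dtype=theano.config.floatX)
dropped = (1. / retain_prob) * act * mask

apply_dropout = theano.function([act], dropped)
a = np.ones((1000, 100), dtype=theano.config.floatX)
out = apply_dropout(a)
# roughly half the entries are zero, the rest are 2.0; the mean stays near 1
print(out.mean())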
class MaskGenerator(object): def __init__(self, input_size, hidden_sizes, l, random_seed=1234): self._random_seed = random_seed self._mrng = MRG_RandomStreams(seed=random_seed) self._rng = RandomStreams(seed=random_seed) self._hidden_sizes = hidden_sizes self._input_size = input_size self._l = l self.ordering = theano.shared(value=np.arange( input_size, dtype=theano.config.floatX), name='ordering', borrow=False) # Initial layer connectivity self.layers_connectivity = [ theano.shared(value=(self.ordering + 1).eval(), name='layer_connectivity_input', borrow=False) ] for i in range(len(self._hidden_sizes)): self.layers_connectivity += [ theano.shared(value=np.zeros((self._hidden_sizes[i]), dtype=theano.config.floatX), name='layer_connectivity_hidden{0}'.format(i), borrow=False) ] self.layers_connectivity += [self.ordering] ## Theano functions new_ordering = self._rng.shuffle_row_elements(self.ordering) self.shuffle_ordering = theano.function( name='shuffle_ordering', inputs=[], updates=[(self.ordering, new_ordering), (self.layers_connectivity[0], new_ordering + 1)]) self.layers_connectivity_updates = [] for i in range(len(self._hidden_sizes)): self.layers_connectivity_updates += [ self._get_hidden_layer_connectivity(i) ] # self.layers_connectivity_updates = [self._get_hidden_layer_connectivity(i) for i in range(len(self._hidden_sizes))] # WTF THIS DO NOT WORK self.sample_connectivity = theano.function( name='sample_connectivity', inputs=[], updates=[(self.layers_connectivity[i + 1], self.layers_connectivity_updates[i]) for i in range(len(self._hidden_sizes))]) # Save random initial state self._initial_mrng_rstate = copy.deepcopy(self._mrng.rstate) self._initial_mrng_state_updates = [ state_update[0].get_value() for state_update in self._mrng.state_updates ] # Ensuring valid initial connectivity self.sample_connectivity() def reset(self): # Set Original ordering self.ordering.set_value( np.arange(self._input_size, dtype=theano.config.floatX)) # Reset RandomStreams self._rng.seed(self._random_seed) # Initial layer connectivity self.layers_connectivity[0].set_value((self.ordering + 1).eval()) for i in range(1, len(self.layers_connectivity) - 1): self.layers_connectivity[i].set_value( np.zeros((self._hidden_sizes[i - 1]), dtype=theano.config.floatX)) self.layers_connectivity[-1].set_value(self.ordering.get_value()) # Reset MRG_RandomStreams (GPU) self._mrng.rstate = self._initial_mrng_rstate for state, value in zip(self._mrng.state_updates, self._initial_mrng_state_updates): state[0].set_value(value) self.sample_connectivity() def _get_p(self, start_choice): start_choice_idx = (start_choice - 1).astype('int32') p_vals = T.concatenate([ T.zeros((start_choice_idx, )), T.nnet.nnet.softmax(self._l * T.arange( start_choice, self._input_size, dtype=theano.config.floatX))[0] ]) p_vals = T.inc_subtensor( p_vals[start_choice_idx], 1. ) # Stupid hack because de multinomial does not contain a safety for numerical imprecision. 
return p_vals def _get_hidden_layer_connectivity(self, layerIdx): layer_size = self._hidden_sizes[layerIdx] if layerIdx == 0: p_vals = self._get_p(T.min(self.layers_connectivity[layerIdx])) else: p_vals = self._get_p( T.min(self.layers_connectivity_updates[layerIdx - 1])) # #Implementations of np.choose in theano GPU # return T.nonzero(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX))[1].astype(dtype=theano.config.floatX) # return T.argmax(self._mrng.multinomial(pvals=[self._p_vals] * layer_size, dtype=theano.config.floatX), axis=1) return T.sum(T.cumsum(self._mrng.multinomial( pvals=T.tile(p_vals[::-1][None, :], (layer_size, 1)), dtype=theano.config.floatX), axis=1), axis=1) def _get_mask(self, layerIdxIn, layerIdxOut): return (self.layers_connectivity[layerIdxIn][:, None] <= self.layers_connectivity[layerIdxOut][None, :]).astype( theano.config.floatX) def get_mask_layer_UPDATE(self, layerIdx): return self._get_mask(layerIdx, layerIdx + 1) def get_direct_input_mask_layer_UPDATE(self, layerIdx): return self._get_mask(0, layerIdx) def get_direct_output_mask_layer_UPDATE(self, layerIdx): return self._get_mask(layerIdx, -1)
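# Hypothetical usage sketch of the MaskGenerator defined above (assumes the class and its
# imports - theano, numpy as np, copy, MRG_RandomStreams, RandomStreams - are in scope);
# the sizes below are illustrative only.
mask_gen = MaskGenerator(input_size=4, hidden_sizes=[8, 8], l=1.0, random_seed=1234)

mask_gen.shuffle_ordering()     # resample the input ordering
mask_gen.sample_connectivity()  # resample hidden-layer connectivities for this ordering

# Masks compare connectivity values of consecutive layers, e.g. the
# (input_size x hidden_sizes[0]) mask between the input and the first hidden layer:
m0 = mask_gen.get_mask_layer_UPDATE(0).eval()
print(m0.shape)                 # (4, 8)

mask_gen.reset()                # restore the original ordering and initial rng states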
# Fix random seed for reproducible experiments
if K.backend() == "tensorflow":
    import tensorflow as tf
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
                                  inter_op_parallelism_threads=1)  # , device_count={"GPU": 0}
    tf.set_random_seed(1234)
    session_conf.gpu_options.allow_growth = True
    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
    K.set_session(sess)
else:
    from theano.tensor.shared_randomstreams import RandomStreams
    # from theano import function
    srng = RandomStreams(seed=123456789)
    srng.seed(123456789)  # re-seeds every stream owned by srng

from utils import invert_dict, unsplit_query, merge_two_dicts, sample_aaai_val_set
from data_preprocess import gen_data, load_data, save_data, construct_vocab_emb
from attention_model import create_attention_model


def evaluate(predictions_file, qrels_file):
    print(predictions_file, qrels_file)
    pargs = shlex.split("/bin/sh run_eval.sh '{}' '{}'".format(qrels_file, predictions_file))
    p = subprocess.Popen(pargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    pout, perr = p.communicate()
    print(perr)


if sys.version_info[0] < 3:
g = function([], rv_n, no_default_updates=True)  # with no_default_updates=True the rng is never advanced, so every call to g() returns the same result
g_val0 = g()
g_val1 = g()
print(g_val0)
print(g_val1)

nearly_zeros = function([], rv_u + rv_u - 2 * rv_u)  # the random variable is drawn only once per function call, so this is numerically close to zero
print(nearly_zeros())

# Seed a single stream via .rng.set_value()
rng_val = rv_u.rng.get_value(borrow=True)  # Get the rng for rv_u; borrow=True shares the underlying memory instead of copying
rng_val.seed(89234)                        # seed the generator with a new seed
rv_u.rng.set_value(rng_val, borrow=True)

srng.seed(902340)  # or seed all streams at once with .seed()

state_after_v0 = rv_u.rng.get_value().get_state()  # save the rng state before drawing; get_value() fetches rv_u's rng object
print(nearly_zeros())
v1 = f()                                   # first draw; the state advances afterwards
rng = rv_u.rng.get_value(borrow=True)
rng.set_state(state_after_v0)              # restore the saved state
rv_u.rng.set_value(rng, borrow=True)
v2 = f()  # back to the state right before nearly_zeros(); v2 != v1 (it reproduces the draw consumed by nearly_zeros())
v3 = f()  # v3 == v1 - the second draw after restoring reproduces v1
print(v1)
print(v2)
print(v3)
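# A small numpy-only sketch (not from the original source) of the get_state()/set_state()
# round-trip that the snippet above performs on rv_u's generator.
import numpy as np

rng = np.random.RandomState(89234)
saved = rng.get_state()        # snapshot the generator state
x1 = rng.uniform(size=(2, 2))  # advances the state
x2 = rng.uniform(size=(2, 2))

rng.set_state(saved)           # rewind to the snapshot
assert np.allclose(rng.uniform(size=(2, 2)), x1)
assert np.allclose(rng.uniform(size=(2, 2)), x2)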
class MaskGenerator(object): def __init__(self, input_size, hidden_sizes, l, random_seed=1234): self._random_seed = random_seed self._mrng = MRG_RandomStreams(seed=random_seed) self._rng = RandomStreams(seed=random_seed) self._hidden_sizes = hidden_sizes self._input_size = input_size self._l = l self.ordering = theano.shared(np.arange(input_size, dtype=theano.config.floatX), 'ordering', borrow=False) # Initial layer connectivity self.layers_connectivity = [theano.shared((self.ordering + 1).eval(), 'layer_connectivity_input', borrow=False)] for i in range(len(self._hidden_sizes)): lc = theano.shared(np.zeros((self._hidden_sizes[i]),dtype=floatX), 'layer_connectivity_hidden{0}'.format(i), borrow=False) self.layers_connectivity += [lc] self.layers_connectivity += [self.ordering] ## Theano functions new_ordering = self._rng.shuffle_row_elements(self.ordering) updates = [(self.ordering, new_ordering), (self.layers_connectivity[0], new_ordering + 1)] self.shuffle_ordering = theano.function(name='shuffle_ordering', inputs=[], updates=updates) self.layers_connectivity_updates = [] for i in range(len(self._hidden_sizes)): lcu = self._get_hidden_layer_connectivity(i) self.layers_connectivity_updates += [lcu] hsizes = range(len(self._hidden_sizes)) updates = [(self.layers_connectivity[i+1], self.layers_connectivity_updates[i]) for i in hsizes] self.sample_connectivity = theano.function(name='sample_connectivity', inputs=[], updates=updates) # Save random initial state self._initial_mrng_rstate = copy.deepcopy(self._mrng.rstate) self._initial_mrng_state_updates = [sup[0].get_value() for sup in self._mrng.state_updates] # Ensuring valid initial connectivity self.sample_connectivity() def reset(self): # Set Original ordering self.ordering.set_value(np.arange(self._input_size, dtype=theano.config.floatX)) # Reset RandomStreams self._rng.seed(self._random_seed) # Initial layer connectivity self.layers_connectivity[0].set_value((self.ordering + 1).eval()) for i in range(1, len(self.layers_connectivity)-1): value = np.zeros((self._hidden_sizes[i-1]), dtype=theano.config.floatX) self.layers_connectivity[i].set_value(value) self.layers_connectivity[-1].set_value(self.ordering.get_value()) # Reset MRG_RandomStreams (GPU) self._mrng.rstate = self._initial_mrng_rstate states_values = zip(self._mrng.state_updates, self._initial_mrng_state_updates) for state, value in states_values: state[0].set_value(value) self.sample_connectivity() def _get_p(self, start_choice): start_choice_idx = (start_choice-1).astype('int32') prob = T.nnet.nnet.softmax(self._l * T.arange(start_choice, self._input_size, dtype=floatX))[0] p_vals = T.concatenate([T.zeros((start_choice_idx,)),prob]) p_vals = T.inc_subtensor(p_vals[start_choice_idx], 1.) 
return p_vals def _get_hidden_layer_connectivity(self, layerIdx): layer_size = self._hidden_sizes[layerIdx] if layerIdx == 0: lc = self.layers_connectivity[layerIdx] p_vals = self._get_p(T.min(lc)) else: lc = self.layers_connectivity_updates[layerIdx-1] p_vals = self._get_p(T.min(lc)) return T.sum( T.cumsum(self._mrng.multinomial( pvals=T.tile(p_vals[::-1][None, :],(layer_size, 1)), dtype=floatX), axis=1), axis=1 ) def _get_mask(self, layerIdxIn, layerIdxOut): return (self.layers_connectivity[layerIdxIn][:, None] <= self.layers_connectivity[layerIdxOut][None, :]).astype(floatX) def get_mask_layer_UPDATE(self, layerIdx): return self._get_mask(layerIdx, layerIdx + 1) def get_direct_input_mask_layer_UPDATE(self, layerIdx): return self._get_mask(0, layerIdx) def get_direct_output_mask_layer_UPDATE(self, layerIdx): return self._get_mask(layerIdx, -1)
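# Numpy-only sketch (not from the original source) of the rule _get_mask implements: unit j
# in the next layer may receive input from unit i in the previous layer only if
# connectivity_in[i] <= connectivity_out[j], so each unit depends only on inputs with
# smaller or equal connectivity labels. (The bare floatX used in this second MaskGenerator
# is presumably an alias for theano.config.floatX, as spelled out in the first version.)
import numpy as np

conn_in = np.array([1, 3, 2, 4])   # connectivity labels of the previous layer
conn_out = np.array([2, 1, 3])     # connectivity labels of the next layer
mask = (conn_in[:, None] <= conn_out[None, :]).astype(np.float32)
print(mask)
# [[1. 1. 1.]
#  [0. 0. 1.]
#  [1. 0. 1.]
#  [0. 0. 0.]]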