def test_random_integers_vector(self):
    random = RandomStreams(utt.fetch_seed())
    low = tensor.lvector()
    high = tensor.lvector()
    out = random.random_integers(low=low, high=high)
    assert out.ndim == 1
    f = function([low, high], out)

    low_val = [100, 200, 300]
    high_val = [110, 220, 330]
    seed_gen = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

    # Arguments of size (3,)
    val0 = f(low_val, high_val)
    numpy_val0 = numpy.asarray([numpy_rng.random_integers(low=lv, high=hv)
                                for lv, hv in zip(low_val, high_val)])
    assert numpy.all(val0 == numpy_val0)

    # Arguments of size (2,)
    val1 = f(low_val[:-1], high_val[:-1])
    numpy_val1 = numpy.asarray([numpy_rng.random_integers(low=lv, high=hv)
                                for lv, hv in zip(low_val[:-1], high_val[:-1])])
    assert numpy.all(val1 == numpy_val1)

    # Specifying the size explicitly
    g = function([low, high],
                 random.random_integers(low=low, high=high, size=(3,)))
    val2 = g(low_val, high_val)
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
    numpy_val2 = numpy.asarray([numpy_rng.random_integers(low=lv, high=hv)
                                for lv, hv in zip(low_val, high_val)])
    assert numpy.all(val2 == numpy_val2)
    self.assertRaises(ValueError, g, low_val[:-1], high_val[:-1])
def test_random_integers_vector(self):
    random = RandomStreams(utt.fetch_seed())
    low = tensor.lvector()
    high = tensor.lvector()
    out = random.random_integers(low=low, high=high)
    assert out.ndim == 1
    f = function([low, high], out)

    low_val = [100, 200, 300]
    high_val = [110, 220, 330]
    seed_gen = numpy.random.RandomState(utt.fetch_seed())
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))

    # Arguments of size (3,)
    val0 = f(low_val, high_val)
    numpy_val0 = numpy.asarray([numpy_rng.randint(low=lv, high=hv + 1)
                                for lv, hv in zip(low_val, high_val)])
    assert numpy.all(val0 == numpy_val0)

    # Arguments of size (2,)
    val1 = f(low_val[:-1], high_val[:-1])
    numpy_val1 = numpy.asarray([numpy_rng.randint(low=lv, high=hv + 1)
                                for lv, hv in zip(low_val[:-1], high_val[:-1])])
    assert numpy.all(val1 == numpy_val1)

    # Specifying the size explicitly
    g = function([low, high],
                 random.random_integers(low=low, high=high, size=(3,)))
    val2 = g(low_val, high_val)
    numpy_rng = numpy.random.RandomState(int(seed_gen.randint(2**30)))
    numpy_val2 = numpy.asarray([numpy_rng.randint(low=lv, high=hv + 1)
                                for lv, hv in zip(low_val, high_val)])
    assert numpy.all(val2 == numpy_val2)
    self.assertRaises(ValueError, g, low_val[:-1], high_val[:-1])
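The two versions of this test differ only in how the reference values are drawn from NumPy: RandomState.random_integers(low, high) samples the closed interval [low, high] and was deprecated in favour of randint(low, high + 1), which uses a half-open interval. A minimal sketch of that equivalence (assuming a NumPy version that still ships the deprecated method):

import numpy

rng_a = numpy.random.RandomState(42)
rng_b = numpy.random.RandomState(42)

# random_integers draws from [low, high] inclusive; randint draws from
# [low, high). With the same seed, the deprecated call and its randint
# replacement produce identical values, which is why the two test variants
# above are interchangeable.
a = rng_a.random_integers(low=100, high=110, size=5)
b = rng_b.randint(low=100, high=110 + 1, size=5)
assert numpy.all(a == b)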
class Sample2DLayer(Layer):
    """
    Sample random patches from 2D input.

    Result is a batch of patches with shape
    (patches_per_example * input_batch_size, num_channels, image_h, image_w).
    """

    def __init__(self, incoming, patches_per_example, patch_size, pad=True,
                 **kwargs):
        self.rng = RandomStreams()
        super(Sample2DLayer, self).__init__(incoming, **kwargs)
        self.patch_size = patch_size
        self.patches_per_example = patches_per_example
        self.pad = pad

    def get_output_for(self, input, **kwargs):
        def sample_one_image(img, y, x):
            return theano.map(
                lambda x, y, image: image[:, y:(y + self.patch_size[0]),
                                          x:(x + self.patch_size[1])],
                sequences=[x, y],
                non_sequences=img)[0]

        if self.pad:
            shp = (input.shape[0],
                   input.shape[1],
                   input.shape[2] + self.patch_size[0] * 2 - 2,
                   input.shape[3] + self.patch_size[1] * 2 - 2)
            padded_input = T.zeros(shp)
            padded_input = T.set_subtensor(
                padded_input[:, :,
                             (self.patch_size[0] - 1):(-self.patch_size[0] + 1),
                             (self.patch_size[1] - 1):(-self.patch_size[1] + 1)],
                input)
            input = padded_input

        y = self.rng.random_integers(
            size=(input.shape[0], self.patches_per_example),
            low=0, high=input.shape[2] - self.patch_size[0])
        x = self.rng.random_integers(
            size=(input.shape[0], self.patches_per_example),
            low=0, high=input.shape[3] - self.patch_size[1])

        return theano.map(sample_one_image, sequences=[input, y, x])[0].reshape(
            (-1, input.shape[1], self.patch_size[0], self.patch_size[1]))

    def get_output_shape_for(self, input_shape):
        if input_shape[0] is None:
            return (None, input_shape[1],
                    self.patch_size[0], self.patch_size[1])
        else:
            return (input_shape[0] * self.patches_per_example, input_shape[1],
                    self.patch_size[0], self.patch_size[1])
def __init__(self, latent_dim, hidden_dim, exploration_probability, clip_value,
             value_decay, data, batch_size, exploration_decay_rate):
    self.latent_dim = latent_dim
    self.words = data["words"]
    self.depth = 1 + max(len(w) for w in self.words)
    depth = self.depth
    self.hidden_dim = hidden_dim
    self.characters = data["characters"]
    self.charset = data["charset"]
    self.charmap = data["charmap"]
    self.wordcount = len(self.words)
    self.charcount = len(self.charset)
    self.generator = Generator("generator", latent_dim, depth, self.charcount,
                               hidden_dim, exploration_probability,
                               exploration_decay_rate)
    self.discriminator = Discriminator("discriminator", depth, self.charcount,
                                       hidden_dim)
    self.clip_value = np.float32(clip_value)
    self.value_decay = theano.shared(np.float32(value_decay), "value_decay")
    self.batch_size = batch_size
    self.word_vectors = np.vstack(
        [self.word_to_vector(word).reshape((1, -1))
         for word in self.words]).astype(np.int32)

    xreal = Input((depth,), name="xreal", dtype="int32")
    batch_n = T.iscalar("batch_n")
    srng = RandomStreams(seed=234)
    z = srng.normal(size=(batch_n, latent_dim))
    e = srng.uniform(size=(batch_n, depth), low=0, high=1)
    ex = srng.random_integers(size=(batch_n, latent_dim), low=0,
                              high=self.charcount)
    # z = Input((latent_dim,), name="z", dtype="float32")
    # e = Input((depth,), name="e", dtype="float32")
    # ex = Input((depth,), name="ex", dtype="int32")
    # xreal = T.imatrix("xreal")
    # z = T.fmatrix("z")
    # e = T.fmatrix("e")
    # ex = T.imatrix("ex")
    _, xfake = self.generator.policy(z, e, ex)
    xfake = theano.gradient.zero_grad(xfake)
    # print("xfake: {}, {}".format(xfake, xfake.type))
    # print("xreal: {}, {}".format(xreal, xreal.type))
    _, yfake = self.discriminator.discriminator(xfake)
    _, yreal = self.discriminator.discriminator(xreal)
    dloss = T.mean(yfake, axis=None) - T.mean(yreal, axis=None)
    dconstraints = {p: ClipConstraint(self.clip_value)
                    for p in self.discriminator.clip_params}
    dopt = Adam(1e-4)
    dupdates = dopt.get_updates(self.discriminator.params, dconstraints, dloss)

    n = z.shape[0]
    outputs_info = [T.zeros((n,), dtype='float32')]
    yfaker = T.transpose(yfake[:, ::-1], (1, 0))
    vtarget, _ = theano.scan(reward_function, outputs_info=outputs_info,
                             sequences=yfaker, non_sequences=self.value_decay)
    vtarget = T.transpose(vtarget, (1, 0))[:, ::-1]
    # print("vtarget: {}, {}, {}".format(vtarget, vtarget.ndim, vtarget.type))
    _, vpred = self.generator.value(z, xfake)
    gloss = T.mean(T.abs_(vtarget - vpred), axis=None)
    gopt = Adam(1e-5)
    gupdates = gopt.get_updates(self.generator.params, {}, gloss)

    self.discriminator_train_function = theano.function([xreal, batch_n],
                                                        [dloss],
                                                        updates=dupdates)
    self.generator_train_function = theano.function([batch_n], [gloss],
                                                    updates=gupdates)
    self.generator_sample_function = theano.function([batch_n], [xfake])
    self.test_function = theano.function([xreal, batch_n], [dloss, gloss])
def get_conv_xy(layer, deterministic=True):
    w_np = layer.W.get_value()
    input_layer = layer.input_layer
    if layer.pad == 'same':
        input_layer = L.PadLayer(layer.input_layer,
                                 width=np.array(w_np.shape[2:]) / 2,
                                 batch_ndim=2)
    input_shape = L.get_output_shape(input_layer)
    max_x = input_shape[2] - w_np.shape[2]
    max_y = input_shape[3] - w_np.shape[3]

    srng = RandomStreams()
    patch_x = srng.random_integers(low=0, high=max_x)
    patch_y = srng.random_integers(low=0, high=max_y)

    # print("input_shape shape: ", input_shape)
    # print("pad: \"%s\"" % (layer.pad,))
    # print(" stride: ", layer.stride)
    # print("max_x %d max_y %d" % (max_x, max_y))

    x = L.get_output(input_layer, deterministic=deterministic)
    x = x[:, :,
          patch_x:patch_x + w_np.shape[2],
          patch_y:patch_y + w_np.shape[3]]
    x = T.flatten(x, 2)  # N,D

    w = layer.W
    if layer.flip_filters:
        w = w[:, :, ::-1, ::-1]
    w = T.flatten(w, outdim=2).T  # D,O

    y = T.dot(x, w)  # N,O
    if layer.b is not None:
        y += T.shape_padaxis(layer.b, axis=0)
    return x, y
def call(self, x, **kwargs):
    from theano import tensor as T
    from theano.tensor.shared_randomstreams import RandomStreams

    if K.backend() == "theano":
        import theano
        mask_rng = RandomStreams(self.seed)
        ints = mask_rng.random_integers(size=K.expand_dims(x.shape[0], 0),
                                        high=x.shape[1] - 1)

        def set_value_at_position(i, ns_x):
            zeros = T.zeros_like(ns_x[0, :])
            return T.set_subtensor(zeros[:i], 1)

        result, updates = theano.scan(fn=set_value_at_position,
                                      outputs_info=None,
                                      sequences=ints,
                                      non_sequences=x)
        mask = mask_rng.shuffle_row_elements(result)
    elif K.backend() == "tensorflow":
        import tensorflow as tf
        tf.set_random_seed(self.seed)
        ints = tf.random_uniform(shape=K.expand_dims(tf.shape(x)[0], 0),
                                 maxval=x.shape[1], dtype=tf.int32)
        result = tf.sequence_mask(ints, maxlen=x.shape[1])
        parallel_iterations = self._deterministic and 1 or 10
        mask = tf.cast(
            tf.map_fn(tf.random_shuffle, result,
                      parallel_iterations=parallel_iterations),
            K.floatx())
    else:
        raise NotImplementedError()
    return K.concatenate([x * mask, mask])
class RomainLayer(lasagne.layers.Layer):
    def __init__(self, incoming, **kwargs):
        super(RomainLayer, self).__init__(incoming, **kwargs)
        self.mask = ones(self.input_shape[2], dtype='float32')
        self.mask[0] = 0
        self.mask[1] = 0
        self.snrg = RandomStreams()

    def get_output_for(self, input, deterministic=False, **kwargs):
        input = input * self.mask.reshape((1, 1, -1, 1))
        if deterministic:
            return input
        shift_temps = self.snrg.random_integers(low=-44000, high=44000)
        shift_freq = self.snrg.random_integers(low=-1, high=1)
        return theano.tensor.roll(
            theano.tensor.roll(input, shift_temps, axis=3),
            shift_freq, axis=2)
def cost_from_X_wrong(self, data):
    X, Y = data
    theano_rng = RandomStreams(seed=self.rng.randint(2 ** 15))
    noise = theano_rng.random_integers(size=(X.shape[0] * self.k,), low=0,
                                       high=self.dict_size - 1)
    p_n = 1. / self.dict_size
    pos = T.nnet.sigmoid(self.delta(data) - T.log(self.k * p_n))
    neg = T.nnet.sigmoid(self.delta((T.tile(X, (self.k, 1)), noise)) -
                         T.log(self.k * p_n))
    neg = neg.reshape((X.shape[0], self.k))
    rval = -T.log(pos) - T.log(1 - neg).sum(axis=1)
    return rval.mean()
class OneHotDistribution(Distribution):
    """Randomly samples from a distribution of one-hot vectors."""

    def __init__(self, space, rng=None):
        super(OneHotDistribution, self).__init__(space)

        self.dim = space.get_total_dimension()
        self.formatter = OneHotFormatter(self.dim, dtype=space.dtype)

        self.rng = RandomStreams() if rng is None else rng

    def sample(self, n):
        idxs = self.rng.random_integers((n, 1), low=0, high=self.dim - 1)
        return self.formatter.theano_expr(idxs, mode='concatenate')
def score(self, Y, Y_hat):
    # TODO fix me later when using IndexSpace
    assert hasattr(Y_hat, 'owner')
    owner = Y_hat.owner
    assert owner is not None
    op = owner.op
    if isinstance(op, Print):
        assert len(owner.inputs) == 1
        Y_hat, = owner.inputs
        owner = Y_hat.owner
        op = owner.op
    assert isinstance(op, T.nnet.Softmax)
    state_below, = owner.inputs
    assert state_below.ndim == 2

    # TODO make this more generic like above
    state_below = state_below.owner.inputs[0].owner.inputs[0]

    Y = T.argmax(Y, axis=1)
    k = self.num_noise_samples

    if self.noise_prob is None:
        theano_rng = RandomStreams(seed=self.mlp.rng.randint(2 ** 15))
        noise = theano_rng.random_integers(
            size=(state_below.shape[0], self.num_noise_samples,),
            low=0, high=self.n_classes - 1)
        p_n = 1. / self.n_classes

        p_w = T.nnet.sigmoid((state_below * self.W[:, Y].T).sum(axis=1) +
                             self.b[Y])
        p_x = T.nnet.sigmoid(
            (T.concatenate([state_below] * k) *
             self.W[:, noise.flatten()].T).sum(axis=1) +
            self.b[noise.flatten()])
        # TODO is this reshape necessary?
        p_x = p_x.reshape((state_below.shape[0], k))

        # pos = k * p_n / (p_w + k * p_n) * T.log(p_w)
        # neg = (p_x / (p_x + k * p_n) * T.log(p_x)).sum(axis=1)
    else:
        # import ipdb
        # ipdb.set_trace()
        theano_rng = MRG_RandomStreams(max(self.mlp.rng.randint(2 ** 15), 1))
        assert self.mlp.batch_size is not None
        noise = theano_rng.multinomial(
            pvals=np.tile(self.noise_prob.get_value(),
                          (k * self.mlp.batch_size, 1)))
        noise = T.argmax(noise, axis=1)
        p_n = self.noise_prob

        p_w = T.nnet.sigmoid((state_below * self.W[:, Y].T).sum(axis=1) +
                             self.b[Y])
        p_x = T.nnet.sigmoid(
            (T.concatenate([state_below] * k) *
             self.W[:, noise.flatten()].T).sum(axis=1) +
            self.b[noise.flatten()])
        p_x = p_x.reshape((state_below.shape[0], k))

        pos = k * p_n[Y] / (p_w + k * p_n[Y]) * T.log(p_w)
        neg = (p_x / (p_x + k * p_n[noise].reshape(p_x.shape)) *
               T.log(p_x)).sum(axis=1)

    # return -(pos - neg).mean()
    return p_w, p_x
def test_dtype(self):
    random = RandomStreams(utt.fetch_seed())
    low = tensor.lscalar()
    high = tensor.lscalar()
    out = random.random_integers(low=low, high=high, size=(20,), dtype='int8')
    assert out.dtype == 'int8'
    f = function([low, high], out)

    val0 = f(0, 9)
    assert val0.dtype == 'int8'

    # 255..257 do not fit in an int8, so the sampled values wrap around and
    # end up in {-1, 0, 1}.
    val1 = f(255, 257)
    assert val1.dtype == 'int8'
    assert numpy.all(abs(val1) <= 1)
def test_random_integers(self):
    """Test that RandomStreams.random_integers generates the same results
    as numpy."""
    # Check over two calls to see if the random state is correctly updated.
    random = RandomStreams(utt.fetch_seed())
    fn = function([], random.random_integers((20, 20), -5, 5))
    fn_val0 = fn()
    fn_val1 = fn()

    rng_seed = numpy.random.RandomState(utt.fetch_seed()).randint(2**30)
    rng = numpy.random.RandomState(int(rng_seed))  # int() is for 32bit
    numpy_val0 = rng.random_integers(-5, 5, size=(20, 20))
    numpy_val1 = rng.random_integers(-5, 5, size=(20, 20))

    assert numpy.all(fn_val0 == numpy_val0)
    assert numpy.all(fn_val1 == numpy_val1)
def get_gradients(self, model, data, **kwargs):
    space, sources = self.get_data_specs(model)
    space.validate(data)
    X, Y = data
    theano_rng = RandomStreams(seed=model.rng.randint(2 ** 15))
    noise = theano_rng.random_integers(size=(X.shape[0] * model.k,), low=0,
                                       high=model.dict_size - 1)

    delta = model.delta(data)
    p = model.score(X, Y)
    params = model.get_params()

    pos_ = T.jacobian(model.score(X, Y), params, disconnected_inputs='ignore')
    pos_coeff = 1 - T.nnet.sigmoid(model.delta(data))
    pos = []
    for param in pos_:
        axes = [0]
        axes.extend(['x' for item in range(param.ndim - 1)])
        pos.append(pos_coeff.dimshuffle(axes) * param)
    del pos_, pos_coeff

    noise_x = T.tile(X, (model.k, 1))
    neg_ = T.jacobian(model.score(noise_x, noise), params,
                      disconnected_inputs='ignore')
    neg_coeff = T.nnet.sigmoid(model.delta((noise_x, noise)))
    neg = []
    for param in neg_:
        axes = [0]
        axes.extend(['x' for item in range(param.ndim - 1)])
        tmp = neg_coeff.dimshuffle(axes) * param
        new_shape = [X.shape[0], model.k]
        new_shape.extend([tmp.shape[i] for i in range(1, tmp.ndim)])
        neg.append(tmp.reshape(new_shape).sum(axis=1))
    del neg_, neg_coeff

    grads = [(pos_ - neg_).mean(axis=0) for pos_, neg_ in zip(pos, neg)]
    gradients = OrderedDict(izip(params, grads))
    updates = OrderedDict()

    return gradients, updates
def build_graph_logloss(self):
    # initialize for randomness
    if self.seed is None:
        self.seed = numpy.random.randint(2**30)
    theano_rng = RandomStreams(self.seed)
    self.randstate = numpy.random.RandomState(self.seed)

    # define parameters
    init_p_before_sigmoid = numpy.linspace(start=-self.init_p_width,
                                           stop=self.init_p_width,
                                           num=self.dim + 1)
    self.p_before_sigmoid = theano.shared(
        init_p_before_sigmoid.astype(theano.config.floatX),
        name='p_before_sigmoid')
    self.params = [self.p_before_sigmoid]

    # define inputs
    self.x1_idxs = T.ivector()
    self.x2_idxs = T.ivector()
    self.x1_idxs.tag.test_value = numpy.asarray([0, 1], dtype=numpy.int32)
    self.x2_idxs.tag.test_value = numpy.asarray([1, 2], dtype=numpy.int32)

    # define negative inputs
    choice = theano_rng.binomial(size=self.x1_idxs.shape)
    alternative = theano_rng.random_integers(size=self.x1_idxs.shape, low=0,
                                             high=self.n_entities - 1)
    self.x1_idxs_negative = T.switch(choice, self.x1_idxs, alternative)
    self.x2_idxs_negative = T.switch(choice, alternative, self.x2_idxs)

    # define graph from inputs to probabilities and to log loss
    def get_embed(index_tensor):
        return self.embeddings[index_tensor].reshape(
            (index_tensor.shape[0], self.dim))

    self.x1_emb = get_embed(self.x1_idxs)
    self.x2_emb = get_embed(self.x2_idxs)
    self.x1neg_emb = get_embed(self.x1_idxs_negative)
    self.x2neg_emb = get_embed(self.x2_idxs_negative)

    def get_prob(embed_tensor1, embed_tensor2):
        distances = T.sum(embed_tensor1 * embed_tensor2 +
                          (1 - embed_tensor1) * (1 - embed_tensor2), axis=1)
        return sigmoid(self.p_before_sigmoid[distances])

    self.pos_probs = get_prob(self.x1_emb, self.x2_emb)
    self.neg_probs = get_prob(self.x1neg_emb, self.x2neg_emb)

    self.loss = -T.mean(T.log(self.pos_probs) + T.log(1.0 - self.neg_probs))
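The negative-sampling pattern above (used again in the later graph-building code) corrupts exactly one side of each positive pair: choice is a per-pair Bernoulli draw, and T.switch keeps the original index on one side while substituting a uniformly drawn entity on the other. A small self-contained sketch of the same idea, with a hypothetical entity count and seed:

import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

n_entities = 50  # hypothetical vocabulary size
rng = RandomStreams(seed=123)

x1 = T.ivector('x1')
x2 = T.ivector('x2')
choice = rng.binomial(size=x1.shape)  # one 0/1 draw per pair
alt = rng.random_integers(size=x1.shape, low=0, high=n_entities - 1)
x1_neg = T.switch(choice, x1, alt)    # x1 is replaced when choice == 0
x2_neg = T.switch(choice, alt, x2)    # x2 is replaced when choice == 1
corrupt = theano.function([x1, x2], [x1_neg, x2_neg])

print corrupt(numpy.asarray([0, 1], dtype=numpy.int32),
              numpy.asarray([1, 2], dtype=numpy.int32))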
def cost_(self, Y, Y_hat):
    # TODO fix me later when using IndexSpace
    assert hasattr(Y_hat, 'owner')
    owner = Y_hat.owner
    assert owner is not None
    op = owner.op
    if isinstance(op, Print):
        assert len(owner.inputs) == 1
        Y_hat, = owner.inputs
        owner = Y_hat.owner
        op = owner.op
    assert isinstance(op, T.nnet.Softmax)
    state_below, = owner.inputs
    assert state_below.ndim == 2

    # TODO make this more generic like above
    state_below = state_below.owner.inputs[0].owner.inputs[0]

    # import ipdb
    # ipdb.set_trace()
    Y = T.argmax(Y, axis=1)
    # Y = Y.astype('uint32')
    theano_rng = RandomStreams(seed=self.mlp.rng.randint(2 ** 15))
    noise = theano_rng.random_integers(
        size=(state_below.shape[0], self.num_noise_samples,),
        low=0, high=self.n_classes - 1)

    k = self.num_noise_samples
    p_n = 1. / self.n_classes
    pos = T.nnet.sigmoid((state_below * self.W[:, Y].T).sum(axis=1) +
                         self.b[Y] - T.log(k * p_n))
    neg = T.nnet.sigmoid(
        (T.concatenate([state_below] * k) *
         self.W[:, noise.flatten()].T).sum(axis=1) +
        self.b[noise.flatten()] - T.log(k * p_n))

    # TODO is this reshape necessary?
    neg = neg.reshape((state_below.shape[0], k)).sum(axis=1)

    rval = -T.log(pos) - T.log(1 - neg)
    return rval.mean()
class LDmodel(): ''' Models discreet-time continuous data as a linear transformation of a linear dynamical system with sparse "noise". x: data s: latent variable u: sparse noise n: gaussian noise W: generative matrix M: dynamical matrix s_(t+1) = M*s_t + u x_t = W*s_t + n Approximate EM learning is performed via minibatched gradient ascent on the log-likelihood. Inference/sampling is achieved with particle filtering. The proposal distribution in the particle filter ignores (for now) the predictive part and samples directly from the posterior specified by the generative part (as if the top equation didn't exist.) ''' def __init__(self, nx, ns, npcl, xvar=1.0): #generative matrix init_W=np.asarray(np.random.randn(nx,ns)/0.1,dtype='float32') #normalize the columns of W to be unit length #(maybe unnecessary if sampling?) init_W=init_W/np.sqrt(np.sum(init_W**2,axis=0)) #dynamical matrix init_M=np.asarray(np.eye(ns),dtype='float32') #sparsity parameters #parametrized as the exponent of ln_b to ensure positivity init_ln_b=np.asarray(np.zeros(ns),dtype='float32') self.W=theano.shared(init_W) self.M=theano.shared(init_M) self.ln_b=theano.shared(init_ln_b) #for ease of use self.b=T.exp(self.ln_b) #square root of covariance matrix of proposal distribution #initialized to the true root covariance init_cov_inv=np.dot(init_W.T, init_W)/(xvar**2) + 0.5*np.eye(ns)*np.exp(-2.0*init_ln_b) init_cov=spla.inv(init_cov_inv) init_C=spla.sqrtm(init_cov) init_C=np.asarray(np.real(init_C),dtype='float32') init_s_now=np.asarray(np.zeros((npcl,ns)),dtype='float32') init_weights_now=np.asarray(np.ones(npcl)/float(npcl),dtype='float32') init_s_past=np.asarray(np.zeros((npcl,ns)),dtype='float32') init_weights_past=np.asarray(np.ones(npcl)/float(npcl),dtype='float32') self.C=theano.shared(init_C) self.s_now=theano.shared(init_s_now) self.weights_now=theano.shared(init_weights_now) self.s_past=theano.shared(init_s_past) self.weights_past=theano.shared(init_weights_past) self.xvar=np.asarray(xvar,dtype='float32') self.nx=nx #dimensionality of observed variables self.ns=ns #dimensionality of latent variables self.npcl=npcl #numer of particles in particle filter #this is used for the resampling nummat=np.repeat(np.reshape(np.arange(npcl),(npcl,1)),npcl,axis=1) self.idx_mat=theano.shared(nummat.T) #for ease of use and efficient computation (these are used a lot) self.CCT=T.dot(self.C, self.C.T) self.cov_inv=T.dot(self.W.T, self.W)/(self.xvar**2) + 0.5*T.eye(self.ns)/(self.b**2) self.theano_rng = RandomStreams() self.init_multi_samp=theano.shared(np.asarray(np.arange(npcl),dtype='int64')) self.params= [self.W, self.M, self.ln_b] self.rel_lrates=theano.shared(np.asarray([ 1.0, 1.0, 1.0] ,dtype='float32')) self.meta_params= [self.C] self.meta_rel_lrates=theano.shared(np.asarray([ 1.0 ], dtype='float32')) def sample_proposal_s(self, s, xp): #s is npcl-by-ns #xp is 1-by-nx n=self.theano_rng.normal(size=T.shape(self.s_now)) mean_term=T.dot(xp, self.W)/(self.xvar**2) + T.dot(s,self.M.T*0.5/(self.b**2)) prop_mean=T.dot(mean_term, self.CCT) s_prop=prop_mean + T.dot(n, self.C) #I compute the term inside the exponent for the pdf of the proposal distrib prop_term=-T.sum(n**2)/2.0 #return T.cast(s_prop,'float32'), T.cast(s_pred,'float32'), T.cast(prop_term,'float32'), prop_mean return s_prop, prop_term, prop_mean def forward_filter_step(self, xp): #need to sample from the proposal distribution first s_samps, prop_terms, prop_means = self.sample_proposal_s(self.s_now, xp) updates={} #now that we have samples from the proposal distribution, 
we need to reweight them recons=T.dot(self.W, s_samps.T) s_pred=self.get_prediction(self.s_now) x_terms=-T.sum((recons-T.reshape(xp,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2) s_terms=-T.sum(T.abs_((s_samps-s_pred)/self.b),axis=1) energies=x_terms+s_terms-prop_terms #to avoid exponentiating large or very small numbers, I #"re-center" the reweighting factors by adding a constant, #as this has no impact on the resulting new weights #energies_recentered=energies-T.max(energies) alpha=T.exp(energies) #these are the reweighting factors new_weights=self.weights_now*alpha #normalizer=T.sum(new_weights_unnorm) #new_weights=new_weights_unnorm/normalizer #need to normalize new weights updates[self.s_past]=T.cast(self.s_now,'float32') updates[self.s_now]=T.cast(s_samps,'float32') updates[self.weights_past]=T.cast(self.weights_now,'float32') updates[self.weights_now]=T.cast(new_weights,'float32') #return normalizer, energies_recentered, s_samps, s_pred, T.dot(self.W.T,(xp-self.c)), updates #return normalizer, energies_recentered, updates #return h_samps, updates return updates def proposal_loss(self,C): #calculates how far off self.CCT is from the true posterior covariance CCT=T.dot(C, C.T) prod=T.dot(CCT, self.cov_inv) diff=prod-T.eye(self.ns) tot=T.sum(T.sum(diff**2)) #frobenius norm return tot def prop_update_step(self, C_now, lr): loss=self.proposal_loss(C_now) gr=T.grad(loss, C_now) return [C_now-lr*gr], theano.scan_module.until(loss<1e-6) def update_proposal_distrib(self, n_steps, lr): #does some gradient descent on self.C, so that self.CCT becomes #closer to the true posterior covariance C0=self.C Cs, updates = theano.scan(fn=self.prop_update_step, outputs_info=[C0], non_sequences=[lr], n_steps=n_steps) updates[self.C]=Cs[-1] loss=self.proposal_loss(Cs[-1]) #updates={} #updates[self.C]=self.prop_update_step(self.C,lr) #loss=self.proposal_loss(self.C) return loss, updates def get_prediction(self, s): s_pred=T.dot(s, self.M) return s_pred def sample_joint(self, sp): #t2_samp=self.theano_rng.multinomial(pvals=T.reshape(self.weights_now,(1,self.npcl))).T #s2_samp=T.cast(T.sum(self.s_now*T.addbroadcast(t2_samp,1),axis=0),'float32') t2_samp=self.sample_multinomial(self.weights_now,3) s2_samp=self.s_now[t2_samp] diffs=(s2_samp-sp) abs_term=T.sum(T.abs_(diffs)/self.b,axis=1) alpha=T.exp(-abs_term) probs=self.weights_past*alpha #probs=probs_unnorm/T.sum(probs_unnorm) #t1_samp=self.theano_rng.multinomial(pvals=T.reshape(probs,(1,self.npcl))).T #s1_samp=T.cast(T.sum(self.s_past*T.addbroadcast(t1_samp,1),axis=0),'float32') t1_samp=self.sample_multinomial(probs,3) s1_samp=self.s_past[t1_samp] return [s1_samp, s2_samp] def update_params(self, x1, x2, n_samps, lrate): #this function samples from the joint posterior and performs # a step of gradient ascent on the log-likelihood sp=self.get_prediction(self.s_past) #sp should be np by ns [s1_samps, s2_samps], updates = theano.scan(fn=self.sample_joint, outputs_info=[None, None], non_sequences=[sp], n_steps=n_samps) x2_recons=T.dot(self.W, s2_samps.T) s_pred = self.get_prediction(s1_samps) sterm=-T.mean(T.sum(T.abs_((s2_samps-s_pred)/self.b),axis=1)) - T.sum(T.log(self.b)) #xterm1=-T.mean(T.sum((x1_recons-T.reshape(x1,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2)) xterm2=-T.mean(T.sum((x2_recons-T.reshape(x2,(self.nx,1)))**2,axis=0)/(2.0*self.xvar**2)) #energy = hterm1 + xterm1 + hterm2 + xterm2 + sterm -T.sum(T.sum(self.A**2)) #energy = hterm1 + xterm2 + sterm energy = xterm2 + sterm learning_params=[self.params[i] for i in range(len(self.params)) if 
self.rel_lrates[i]!=0.0] learning_rel_lrates=[self.rel_lrates[i] for i in range(len(self.params)) if self.rel_lrates[i]!=0.0] gparams=T.grad(energy, learning_params, consider_constant=[s1_samps, s2_samps]) # constructs the update dictionary for gparam, param, rel_lr in zip(gparams, learning_params, learning_rel_lrates): #gnat=T.dot(param, T.dot(param.T,param)) if param==self.M: #I do this so the derivative of M doesn't depend on the sparsity parameters updates[param] = T.cast(param + gparam*T.reshape(self.b,(1,self.ns))*lrate*rel_lr,'float32') elif param==self.b: updates[param] = T.cast(param + gparam*T.reshape(1.0/self.b,(1,self.ns))*lrate*rel_lr,'float32') else: updates[param] = T.cast(param + gparam*lrate*rel_lr,'float32') return energy, updates def get_ESS(self): return 1.0/T.sum(self.weights_now**2) def resample(self): updates={} #samp=self.theano_rng.multinomial(size=self.weights_now.shape,pvals=self.weights_now) idxs=self.sample_multinomial(self.weights_now,3) #idxs=T.cast(T.sum(samp*self.idx_mat,axis=1),'int32') s_samps=self.s_now[idxs] updates[self.s_now]=s_samps updates[self.weights_now]=T.cast(T.ones_like(self.weights_now)/T.cast(self.npcl,'float32'),'float32') #dtype paranoia return updates def simulate_step(self, s): s=T.reshape(s,(1,self.ns)) sp=self.get_prediction(s) xp=T.dot(self.W, sp.T) return T.cast(sp,'float32'), T.cast(xp,'float32') def simulate_forward(self, n_steps): s0=T.sum(self.s_now*T.reshape(self.weights_now,(self.npcl,1)),axis=0) s0=T.reshape(s0,(1,self.ns)) [sp, xp], updates = theano.scan(fn=self.simulate_step, outputs_info=[s0, None], n_steps=n_steps) return sp, xp, updates def multinomial_step(self,samp,weights): u=self.theano_rng.uniform(size=self.weights_now.shape) i=self.theano_rng.random_integers(size=self.weights_now.shape, low=0, high=self.npcl-1) Wnow=weights[samp] Wstep=weights[i] probs=Wstep/Wnow out=T.switch(u<probs, i, samp) return out def sample_multinomial(self,weights,nsteps): #this function samples from a multinomial distribution using #the Metropolis method as in [Murray, Lee, Jacob 2013] #weights are unnormalized #this is biased for small nsteps, but could be faster than the #native theano multinomial sampler and the use of unnormalized #weights improves numerical stability samp0=self.init_multi_samp samps, updates = theano.scan(fn=self.multinomial_step, outputs_info=[samp0], non_sequences=[weights], n_steps=nsteps) return samps[-1] def set_rel_lrates(self, new_rel_lrates): updates={} updates[self.rel_lrates]=new_rel_lrates return updates
class AverageSGM(object): ''' A toy example showing the usage of `extheano.NodeDescriptor` and `extheano.jit`. This class performs the stochastic gradient method (SGM) to find the average of given data. Usage: >> data = np.arange(1000) >> a = AverageSGM(data) >> for _ in xrange(10000): a.calc_loss_with_onestep_SGM() >> est = a.get_estimation() ''' # node descriptors for shared variables # whole data of which we will compute the average data = extheano.NodeDescriptor() # estimate of the average mu = extheano.NodeDescriptor() # learning rate (will be discounted as SGM goes on) lrate = extheano.NodeDescriptor() def __init__(self, data, batch_size=10, init_val=0., lrate=0.05, degree=0.75, seed=None): '''Set parameters for the SGM :param data: array-like with its dimension one :param batch_size: size of the mini batch in integer :param init_val: initial guess of the average in float :param lrate: initial learning rate in float :param degree: degree of learning rate decreasing in float :param seed: seed for RNG in integer ''' # pure-python variables (assumed to be invariant until recompilation) self.batch_size = batch_size self.n_batches = len(data) / batch_size self.degree = degree self.init_lrate = lrate # initialize the nodes self.data = theano.shared(data.astype(float), 'data', borrow=True) self.mu = theano.shared(float(init_val), 'mu') self.lrate = theano.shared(float(lrate), 'lrate') # shared random streams self.rng = RandomStreams(seed) def quadratic_loss(self, minibatch): '''Get the quadratic loss against the given input''' return ((minibatch - self.mu) ** 2).mean() def gradient_descent(self, loss, lrate): '''Perform one step of the gradient descent on the given loss Note that you can update `self.mu` with the normal assignment operation since it is a descriptor. ''' # calculate the gradient grad = -T.grad(loss, self.mu) # update the estimation self.mu = self.mu + lrate * grad def next_lrate(self, lr): '''Return the discounted learning rate The learning rate will be proportional to the number of iterations with minus `self.degree` on the exponent. ''' time = (self.init_lrate / lr) ** (1. / self.degree) ratio = (1. - 1. / (1. + time)) ** self.degree return lr * ratio # With the decorator `@extheano.jit`, you can compile your theano-function # 'just in time'. Use `@extheano.jit.parse` instead if it has arguments with # default values. @extheano.jit.parse def calc_loss_with_onestep_SGM(self, scale=1.): '''Calculate the quadratic loss and perform one step of the SGM ''' # assign a random batch to the input batch_start = self.batch_size * \ self.rng.random_integers(low=0, high=self.n_batches - 1) batch_stop = batch_start + self.batch_size minibatch = self.data[batch_start: batch_stop] # perform SGM and discount the learning rate loss = self.quadratic_loss(minibatch) self.gradient_descent(loss, self.lrate * scale) self.lrate = self.next_lrate(self.lrate) return loss @extheano.jit def set_estimation(self, val): '''Set the estimation of the average''' self.mu = T.cast(val, theano.config.floatX) @extheano.jit def get_estimation(self): '''Get the estimation of the average''' return self.mu
def __init__(self, dim, n_entities, batch_size=None, validation_samples=2):
    self.__dict__.update(locals())
    del self.self
    theano_rng = RandomStreams(numpy.random.randint(2**30))

    # Start by defining the graph

    ## Parameter setup
    self.emb = theano.shared(
        (numpy.random.uniform(-1.0, 1.0, (self.n_entities, self.dim))).astype(
            theano.config.floatX))
    self.emb.tag.test_value = (numpy.random.uniform(
        -1.0, 1.0, (self.n_entities, self.dim))).astype(theano.config.floatX)
    self.a = theano.shared(numpy.asarray(1.0).astype(theano.config.floatX))
    self.b = theano.shared(numpy.asarray(0.0).astype(theano.config.floatX))
    self.params = [self.emb, self.a, self.b]

    ### Input setup!
    self.x1_idxs = T.ivector()
    self.x2_idxs = T.ivector()
    self.x1_idxs.tag.test_value = numpy.asarray([0, 1], dtype=numpy.int32)
    self.x2_idxs.tag.test_value = numpy.asarray([1, 2], dtype=numpy.int32)

    # generate negative samples
    choice = theano_rng.binomial(size=self.x1_idxs.shape)
    alternative = theano_rng.random_integers(size=self.x1_idxs.shape,
                                             low=0, high=n_entities - 1)
    self.x1_idxs_negative = T.switch(choice, self.x1_idxs, alternative)
    self.x2_idxs_negative = T.switch(choice, alternative, self.x2_idxs)

    ### Define graph from input to predictive loss
    def get_embed(index_tensor):
        return sigmoid(self.emb[index_tensor].reshape(
            (index_tensor.shape[0], self.dim)))

    x1_emb = get_embed(self.x1_idxs)
    x2_emb = get_embed(self.x2_idxs)
    x1neg_emb = get_embed(self.x1_idxs_negative)
    x2neg_emb = get_embed(self.x2_idxs_negative)

    def get_prob1(embed_tensor1, embed_tensor2):
        # probability of a link, 0 to 1
        return sigmoid(
            self.a * T.mean(embed_tensor1 * embed_tensor2 +
                            (1 - embed_tensor1) * (1 - embed_tensor2),
                            axis=1) + self.b)

    self.loss = T.mean(-T.log(get_prob1(x1_emb, x2_emb)) -
                       T.log(1 - get_prob1(x1neg_emb, x2neg_emb)))

    ### Define graph from input to sampled/validated loss
    randomizationA = theano_rng.uniform(size=(self.validation_samples, self.dim))
    randomizationB = theano_rng.uniform(size=(self.validation_samples, self.dim))
class Neural_network_layer: '''Represents the units within a layer and the units activations and dropout functions. ''' def __init__(self, size, activation_function, dropout_type, dropout, dropout_decay, batch_size, frequency): self.drop_count = 0 self.size = size self.frequency = frequency self.dropout = dropout self.dropout_init = dropout self.dropout_decay = dropout_decay self.dropout_type = dropout_type self.rdm = RandomStreams(seed=1234) self.batch_size = batch_size self.sample_range = 100000 self.create_dropout_sample_functions() self.activation_crossvalidation = activation_function self.activation_function = self.set_dropout(dropout, activation_function) self.activation_derivative = lambda X: g(T.mul(X, (1.0 - X))) self.activation_tracker = self.set_activation_tracker( activation_function) pass def set_dropout(self, dropout, activation_function): action_with_drop = None if dropout > 0: action_with_drop = lambda X: T.mul(activation_function(X), self. dropout_function) self.activation_cv_dropout = lambda X: T.mul( activation_function(X), self.dropout_function_cv) else: action_with_drop = activation_function self.activation_cv_dropout = activation_function return action_with_drop def set_activation_tracker(self, activation_function): '''Sets a tracker function that logs the activations that exceed 0.75. ''' if activation_function == Activation_function.sigmoid: activation_tracker = lambda X: T.gt(activation_function(X), 0.75) else: activation_tracker = None return activation_tracker def create_dropout_sample_functions(self, reset=False): '''Creates functions of sample vectors which can be index with random integers to create a pseudo random sample for dropout. This greatly speeds up sampling as no new samples have to be created. ''' if reset: self.dropout = self.dropout_init print 'Reset dropout to ' + str(self.dropout) self.dropout_function = None sample_function = None if self.dropout > 0: if self.dropout_type == Dropout.drop_activation: if reset: self.bino_sample_vector.set_value(np.matrix( np.float32( np.random.binomial(1, 1 - self.dropout, (10000000, 1)))), borrow=True) else: self.bino_sample_vector = shared(np.matrix( np.float32( np.random.binomial(1, 1 - self.dropout, (10000000, 1)))), 'float32', borrow=True) sample_function = lambda rand: g( T.reshape( self.bino_sample_vector[rand:rand + (self.batch_size * self.size)], (self.batch_size, self.size))) sample_function_cv = lambda rand: g( T.reshape( self.bino_sample_vector[rand:rand + (4200 * self.size)], (4200, self.size))) self.dropout_function = sample_function( self.rdm.random_integers(low=0, high=self.sample_range)) self.dropout_function_cv = sample_function_cv( self.rdm.random_integers(low=0, high=self.sample_range)) def handle_dropout_decay(self, epoch): '''Handles automatically the dropout decay by decreasing the dropout by the given amount after the given number of epochs. ''' if self.dropout_function and self.frequency[ self.drop_count] > 0 and epoch % self.frequency[ self.drop_count] == 0 and epoch > 0: print 'Setting dropout from ' + str(self.dropout) + ' to ' + str( np.float32(self.dropout * (1 - self.dropout_decay[self.drop_count]))) self.dropout = np.float32( self.dropout * (1 - self.dropout_decay[self.drop_count])) if self.dropout_type == Dropout.drop_activation: self.bino_sample_vector.set_value(np.matrix( np.float32( np.random.binomial(1, 1 - self.dropout, (10000000, 1)))), borrow=True) self.drop_count += 1 if self.drop_count > len(self.dropout_decay) - 1: self.drop_count -= 1
def __init__(self, num_words, num_rels, vocab_embed_size, lr=0.01, tensor_activation=T.tanh, num_noise_samples=1, init_dense_vocab=None): numpy_rng = numpy.random.RandomState(89677) theano_rng = RandomStreams(12783) rng_box_limit = 4 * numpy.sqrt(6. / (vocab_embed_size + vocab_embed_size + num_rels)) rng_box_low = 0 rng_box_high = rng_box_limit init_box = numpy.asarray(numpy_rng.uniform(low=rng_box_low, high=rng_box_high, size=(vocab_embed_size, vocab_embed_size, num_rels))) rng_proj_low = -4 * numpy.sqrt(6. / (num_words + vocab_embed_size)) rng_proj_high = 4 * numpy.sqrt(6. / (num_words + vocab_embed_size)) if init_dense_vocab is None: init_dense_vocab = numpy.asarray(numpy_rng.uniform(low=rng_proj_low, high=rng_proj_high, size=(num_words, vocab_embed_size))) init_rev_dense_vocab = numpy.asarray(numpy_rng.uniform(low=rng_proj_low, high=rng_proj_high, size=(vocab_embed_size, num_words))) self.B = theano.shared(value=init_box, name='B') self.P = theano.shared(value=init_dense_vocab, name='P') self.P_hat = theano.shared(value=init_rev_dense_vocab, name='P_hat') self.vocab = T.eye(num_words) word_activation = T.nnet.softmax self.rel = T.eye(num_rels) rel_activation = T.nnet.softmax self.lr = lr self.x_ind, self.y_ind, self.r_ind = T.iscalars('x_ind', 'y_ind', 'r_ind') x = self.vocab[self.x_ind] self.x_rep = T.dot(x, self.P) y = self.vocab[self.y_ind] self.y_rep = T.dot(y, self.P) r = self.rel[self.r_ind] # Assumption: Corresponding dimensions: 0 -> x, 1 -> y, 2 -> r # TODO: Where do we apply activations? Do we have to, at all? pred_xy = tensor_activation(T.tensordot(r, self.B, axes=(0,2))) pred_y = T.dot(T.tensordot(self.x_rep, pred_xy, axes=(0,0)), self.P_hat) self.prob_y = word_activation(pred_y) pred_x = T.dot(T.tensordot(self.y_rep, pred_xy, axes=(0,1)), self.P_hat) self.prob_x = word_activation(pred_x) pred_yr = tensor_activation(T.tensordot(self.x_rep, self.B, axes=(0,0))) self.prob_r = rel_activation(T.tensordot(self.y_rep, pred_yr, axes=(0,0))) self.score = T.dot(y, T.dot(T.tensordot(self.x_rep, T.tensordot(r, self.B, axes=(0,2)), axes=(0,0)), self.P_hat).T) # y \times (((x \times P) \times (r \otimes B)) \times P_hat) rand_margin_score = T.constant(0) noise_log_likelihood = T.constant(0) # The noise distribution is one where words and the relation are independent of each other. The probability of the right tuple and the corrupted tuple are both equal in this distribution. 
noise_prob = num_noise_samples/float(num_words * num_words * num_rels) rand_x_ind = theano_rng.random_integers(low=0, high=num_words-1) rand_y_ind = theano_rng.random_integers(low=0, high=num_words-1) rand_r_ind = theano_rng.random_integers(low=0, high=num_rels-1) rand_x = self.vocab[rand_x_ind] rand_x_rep = T.dot(rand_x, self.P) rand_y = self.vocab[rand_y_ind] rand_y_rep = T.dot(rand_y, self.P) rand_r = self.rel[rand_r_ind] rand_score = T.dot(rand_y, T.dot(T.tensordot(rand_x_rep, T.tensordot(rand_r, self.B, axes=(0,2)), axes=(0,0)), self.P_hat).T) for _ in range(num_noise_samples): rand_margin_score += rand_score noise_log_likelihood += T.log(noise_prob/(T.abs_(rand_score) + noise_prob)) self.nce_margin_loss = T.maximum(0, 1 - self.score + rand_margin_score) # NCE negative log likelihood:-1 * {log(score/(score + num_noise_samples*noise_prob)) + \sum_{i=1}^k (log(noise_prob/(rand_score + noise_prob)))} self.nce_prob_loss = -(T.log(T.abs_(self.score)/(T.abs_(self.score) + noise_prob)) + noise_log_likelihood) self.cost_inputs = [self.x_ind, self.y_ind, self.r_ind] self.params = [self.B, self.P, self.P_hat] self.x_loss = self.ce(x, self.prob_x) self.y_loss = self.ce(y, self.prob_y) self.r_loss = self.ce(r, self.prob_r)
def __init__(self, numargs, embed_size, pred_vocab_size, arg_vocab_size,
             initial_pred_rep=None, initial_arg_rep=None, margin=5, lr=0.01,
             activation=T.nnet.sigmoid):
    numpy_rng = numpy.random.RandomState(12345)
    theano_rng = RandomStreams(54321)
    self.lr = lr
    # margin = 5

    # Initializing predicate representations
    if initial_pred_rep is not None:
        num_preds, pred_dim = initial_pred_rep.shape
        assert pred_vocab_size == num_preds, "Initial predicate representation is not the same size as pred_vocab_size"
        assert embed_size == pred_dim, "Initial predicate representation does not have the same dimensionality as embed_size"
    else:
        initial_pred_rep_range = 4 * numpy.sqrt(6. / (pred_vocab_size + embed_size))
        initial_pred_rep = numpy.asarray(
            numpy_rng.uniform(low=-initial_pred_rep_range,
                              high=initial_pred_rep_range,
                              size=(pred_vocab_size, embed_size)))
    self.pred_rep = theano.shared(value=initial_pred_rep, name='P')

    # Initializing argument representations
    if initial_arg_rep is not None:
        arg_rep_len, arg_dim = initial_arg_rep.shape
        assert arg_vocab_size == arg_rep_len, "Initial argument representation is not the same size as arg_vocab_size"
        assert embed_size == arg_dim, "Initial argument representation does not have the same dimensionality as embed_size"
    else:
        initial_arg_rep_range = 4 * numpy.sqrt(6. / (arg_vocab_size + embed_size))
        initial_arg_rep = numpy.asarray(
            numpy_rng.uniform(low=-initial_arg_rep_range,
                              high=initial_arg_rep_range,
                              size=(arg_vocab_size, embed_size)))
    self.arg_rep = theano.shared(value=initial_arg_rep, name='A')

    # Initialize scorer
    scorer_dim = embed_size * (numargs + 1)  # Predicate is +1
    initial_scorer_range = 4 * numpy.sqrt(6. / scorer_dim)
    initial_scorer = numpy.asarray(
        numpy_rng.uniform(low=-initial_scorer_range,
                          high=initial_scorer_range,
                          size=scorer_dim))
    self.scorer = theano.shared(value=initial_scorer, name='s')

    # Initialize indicator
    indicator_dim = embed_size * (numargs + 1)  # Predicate is +1
    initial_indicator_range = 4 * numpy.sqrt(6. / (indicator_dim + numargs))
    initial_indicator = numpy.asarray(
        numpy_rng.uniform(low=-initial_indicator_range,
                          high=initial_indicator_range,
                          size=(indicator_dim, numargs)))
    self.indicator = theano.shared(value=initial_indicator, name='I')

    # Define symbolic pred-arg
    self.pred_ind = T.iscalar('p')
    self.arg_inds = T.iscalars(numargs)
    pred = self.pred_rep[self.pred_ind].reshape((1, embed_size))
    args = self.arg_rep[self.arg_inds].reshape((1, embed_size * numargs))
    pred_arg = activation(T.concatenate([pred, args], axis=1))

    # Define symbolic rand pred-arg for training scorer
    rand_pred_ind = theano_rng.random_integers(low=0, high=pred_vocab_size - 1)
    rand_arg_inds = theano_rng.random_integers([1, numargs], low=0,
                                               high=arg_vocab_size - 1)
    rand_pred = self.pred_rep[rand_pred_ind].reshape((1, embed_size))
    rand_args = self.arg_rep[rand_arg_inds].reshape((1, embed_size * numargs))
    rand_pred_arg = activation(T.concatenate([rand_pred, rand_args], axis=1))

    # Define symbolic pred_rand-arg for training indicator
    pred_rand_arg = activation(T.concatenate([pred, rand_args], axis=1))

    # Define scores and loss
    self.corr_score = T.sum(T.dot(pred_arg, self.scorer))
    rand_score = T.sum(T.dot(rand_pred_arg, self.scorer))
    self.margin_loss = T.maximum(0, margin - self.corr_score + rand_score)

    # Define indicator values and loss
    orig_ind_labels = T.constant(numpy.zeros(numargs))
    self.indicator_pred = T.nnet.sigmoid(T.dot(pred_arg, self.indicator))
    rand_ind_labels = T.constant(numpy.ones(numargs))
    rand_indicator_pred = T.nnet.sigmoid(T.dot(pred_rand_arg, self.indicator))
    self.indicator_loss = T.mean((self.indicator_pred - orig_ind_labels) ** 2) + \
        T.mean((rand_indicator_pred - rand_ind_labels) ** 2)

    # Define params and inputs
    self.score_params = [self.pred_rep, self.arg_rep, self.scorer]
    self.indicator_params = [self.pred_rep, self.arg_rep, self.indicator]
    self.score_ind_inputs = [self.pred_ind] + list(self.arg_inds)
def __init__(self, cooccurrence, z_k, opt, pz_weight_regularizer=None, pz_regularizer=None, eps=1e-8, scale=1e-2, beta=0.01, batch_gibbs=True): srng = RandomStreams(123) cooccurrence = cooccurrence.astype(np.float32) self.cooccurrence = cooccurrence self.z_k = z_k self.opt = opt x_k = cooccurrence.shape[0] self.x_k = x_k self.pz_weight_regularizer = pz_weight_regularizer self.pz_regularizer = pz_regularizer self.batch_gibbs = batch_gibbs # cooccurrence matrix n = np.sum(cooccurrence, axis=None) _co = cooccurrence / n co = T.constant(_co, name="co") # (x_k, x_k) _co_m = np.sum(_co, axis=1, keepdims=True) co_m = T.constant(_co_m, name="co_m") # (x_k,1) _co_c = _co / (eps + _co_m) _co_h = np.sum(_co * -np.log(eps + _co_c), axis=1, keepdims=True) # (x_k, 1) print "H(Y|X): {}".format(np.sum(_co_h)) co_h = T.constant(_co_h, name="co_h") # parameters # P(z1=k,z2=k) tril = np.tril_indices(n=x_k, k=-1) initial_param = np.random.normal(loc=0, scale=scale, size=(tril[0].shape[0], )).astype( np.float32) param = K.variable(initial_param, name="param", dtype='float32') pz = T.zeros((x_k, x_k)) pz = T.set_subtensor(pz[tril], param) pz += T.transpose(pz, (1, 0)) # symmetric pz = T.nnet.sigmoid(pz) # (x_k, x_k) squash params = [param] # current sample initial_sample = np.random.random_integers(low=0, high=z_k - 1, size=(x_k, )).astype( np.int32) current_sample = K.variable(initial_sample, name="current_sample", dtype='int32') current_oh = tensor_one_hot(current_sample, k=z_k) # (x_k, z_k) # probability of sample matches = T.eq(current_sample.dimshuffle((0, 'x')), current_sample.dimshuffle(('x', 0))) # (x_k, x_k) p1 = T.nnet.sigmoid(param) p2 = matches[tril] lp = (p2 * T.log(eps + p1)) + ( (1. - p2) * T.log(eps + 1 - p1)) # (tril,) sample_logp = T.sum(lp) # gibbs sampling if batch_gibbs: idx = T.ivector() pzidx = pz[idx, :] # (n, x_k) current_masked = T.set_subtensor(current_oh[idx, current_sample[idx]], 0) # (x_k, z_k) # todo: test this p calculation e_add = T.dot( T.log(eps + pzidx) - T.log(eps + 1 - pzidx), current_masked) # (n, z_k) #e_add = T.dot(T.log(eps + pzidx), current_masked) # (n, z_k) p_add = softmax_nd(e_add) cs = T.cumsum(p_add, axis=1) rnd = srng.uniform(low=0., high=1., size=(idx.shape[0], )) bucket = T.sum(T.gt(rnd.dimshuffle((0, 'x')), cs), axis=1) # (n,) bucket = T.clip(bucket, 0, z_k - 1) # (n,) new_sample = T.set_subtensor(current_sample[idx], bucket) gibbs_updates = [(current_sample, new_sample)] self.gibbs_fun = theano.function([idx], [], updates=gibbs_updates) else: idx = srng.random_integers(low=0, high=x_k - 1) # scalar pzidx = pz[idx, :] # (x_k,) current_masked = T.set_subtensor(current_oh[idx, current_sample[idx]], 0) # (x_k, z_k) # todo: test this p calculation e_add = T.dot( T.log(eps + pzidx) - T.log(eps + 1 - pzidx), current_masked) # (Z_k,) p_add = softmax_nd(e_add) cs = T.cumsum(p_add) rnd = srng.uniform(low=0., high=1.) bucket = T.sum(T.gt(rnd, cs)) bucket = T.clip(bucket, 0, z_k - 1) new_sample = T.set_subtensor(current_sample[idx], bucket) gibbs_updates = [(current_sample, new_sample)] self.gibbs_fun = theano.function([], [], updates=gibbs_updates) # loss of sample p_b = T.dot(T.transpose(current_oh, (1, 0)), co) # (z_k, x_k) marg = T.sum(p_b, axis=1, keepdims=True) # (z_k, 1) cond = p_b / (marg + eps) # (z_k, x_k) current_nll = T.sum(p_b * -T.log(eps + cond), axis=None) # scalar current_nll = theano.gradient.zero_grad(current_nll) avg_nll = K.variable(0., name='avg_nll', dtype='float32') new_avg = ((1. 
- beta) * avg_nll) + (beta * current_nll) avg_updates = [(avg_nll, new_avg)] # REINFORCE glp = T.grad(sample_logp, param) # todo: check sign sampled_grad = -(current_nll - avg_nll) * glp self.regularize = False assert isinstance(opt, keras.optimizers.Optimizer) def get_gradients(loss, params): assert len(params) == 1 assert params[0] == param return [sampled_grad] opt.get_gradients = get_gradients updates = opt.get_updates(loss=current_nll, params=params) self.val_fun = theano.function([], current_nll) self.encodings_fun = theano.function([], current_sample) # (x_k,) self.train_fun = theano.function([], current_nll, updates=updates + avg_updates) self.weights = params + opt.weights + [current_sample, avg_nll] t = self.calc_utilization()
# This snippet assumes a shared random stream and theano.function are already
# available, e.g. (assumption, not part of the original snippet):
#   from theano import function
#   from theano.tensor.shared_randomstreams import RandomStreams
#   srng = RandomStreams(seed=234)

# Say we want to generate an array of 3 independent binomial rvs
rn_b = srng.binomial(size=(3,), n=100, p=.7)
binom = function([], rn_b, no_default_updates=True)
print "First Binomial vector ", binom()
print "Second Binomial without changing random number generator", binom()

############ Normal RV
rn_n = srng.normal(size=(), avg=0.0, std=2.3)
norm = function([], rn_n)
print "Single Normal ", norm()

############# Random integer list
rn_i = srng.random_integers(size=(4,), low=1, high=900)
inte = function([], rn_i)
print "Integer list ", inte()

############# Generating a permutation uniformly at random
rn_p = srng.permutation(size=(), n=10)
perm = function([], rn_p)
print "Random permutation of 0 to 9", perm()

############# Choosing from a list randomly
rn_list = srng.choice(size=(), a=[2, 3, 4.5, 6], replace=True,
                      p=[.5, 0, .5, 0], dtype='float64')
lis = function([], rn_list)
print "Choosing twice from the specified list ", lis()
print lis()
def selectRandomJ_(i, m):
    rstr = RandomStreams()
    # TODO: make somehow sure that the random integer is not i
    randint = rstr.random_integers(None, 0, m - 1, ndim=0)
    # randint = tPrint("Taking random j: ")(randint)
    return randint
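One way to satisfy the TODO above, sketched under the assumption that theano.tensor is imported as T (as elsewhere in this collection) and that m >= 2: draw from only m - 1 values and shift any draw at or above i up by one, so the result can never equal i.

# Hypothetical variant of selectRandomJ_ that guarantees j != i without
# rejection sampling: j is drawn from [0, m - 2] and draws >= i are shifted
# up by one, mapping them onto [i + 1, m - 1].
def selectRandomJNotI_(i, m):
    rstr = RandomStreams()
    j = rstr.random_integers(None, 0, m - 2, ndim=0)
    return T.switch(T.ge(j, i), j + 1, j)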
def build_graph(self): if self.seed == None: self.seed = numpy.random.randint(2**30) theano_rng = RandomStreams(self.seed) randstate = numpy.random.RandomState(self.seed) ################## ##Parameter setup ################## self.emb = theano.shared( (randstate.uniform(-1.0, 1.0, (self.n_entities, self.dim))).astype( theano.config.floatX)) self.emb.tag.test_value = (randstate.uniform( -1.0, 1.0, (self.n_entities, self.dim))).astype(theano.config.floatX) self.a = theano.shared( numpy.asarray(self.init_a).astype(theano.config.floatX)) self.b = theano.shared( numpy.asarray(self.init_b).astype(theano.config.floatX)) self.params = [self.emb, self.a, self.b] if self.embedding_type == 'REAL_TRAINED': self.coef_defaults = [2.0, -.5, -.5, -.5, -.5] self.coefs = [ theano.shared( numpy.asarray(coef_default).astype(theano.config.floatX)) for coef_default in self.coef_defaults ] self.params = self.params + self.coefs if self.embedding_type == 'REAL_TRAINED_L1': self.coef_defaults = [2.0, -.5, -.5, 0, 0] self.coefs = [ theano.shared( numpy.asarray(coef_default).astype(theano.config.floatX)) for coef_default in self.coef_defaults ] self.params = self.params + self.coefs[:-2] ################ ### Input setup! ################# self.x1_idxs = T.ivector() self.x2_idxs = T.ivector() self.x1_idxs.tag.test_value = numpy.asarray([0, 1], dtype=numpy.int32) self.x2_idxs.tag.test_value = numpy.asarray([1, 2], dtype=numpy.int32) #generate negative samples choice = theano_rng.binomial(size=self.x1_idxs.shape) alternative = theano_rng.random_integers(size=self.x1_idxs.shape, low=0, high=self.n_entities - 1) self.x1_idxs_negative = T.switch(choice, self.x1_idxs, alternative) self.x2_idxs_negative = T.switch(choice, alternative, self.x2_idxs) ### Define graph from input to predictive loss def get_embed(index_tensor): #index_tensor: (samples) if self.parameterization == 'SIGMOID': return sigmoid(self.emb[index_tensor].reshape( (index_tensor.shape[0], self.dim))) elif self.parameterization == 'DIRECT': return self.emb[index_tensor].reshape( (index_tensor.shape[0], self.dim)) self.x1_emb = get_embed(self.x1_idxs) self.x2_emb = get_embed(self.x2_idxs) self.x1neg_emb = get_embed(self.x1_idxs_negative) self.x2neg_emb = get_embed(self.x2_idxs_negative) def get_prob(embed_tensor1, embed_tensor2): #embed_tensorX: (n_batches,dim,*) if self.embedding_type == 'BIT': return sigmoid( self.a * T.mean(embed_tensor1 * embed_tensor2 + (1 - embed_tensor1) * (1 - embed_tensor2), axis=1) + self.b) #returns (n_batches,_,*) if self.embedding_type == 'BIT_INTERNALB': return sigmoid( self.a * (T.mean(2.0 * embed_tensor1 * embed_tensor2 - embed_tensor1 - embed_tensor2 + 1.0, axis=1) + self.b)) #returns (n_batches,_,*) if self.embedding_type == 'BIT_AND': return sigmoid( self.a * T.mean(2.0 * embed_tensor1 * embed_tensor2, axis=1) + self.b) #returns (n_batches,_,*) elif self.embedding_type == 'REAL': return sigmoid( self.a * T.mean(2.0 * embed_tensor1 * embed_tensor2 - embed_tensor1**2 - embed_tensor2**2, axis=1) + self.b) #returns (n_batches,_,*) elif self.embedding_type == 'REAL_INTERNALB': return sigmoid( self.a * (T.mean(2.0 * embed_tensor1 * embed_tensor2 - embed_tensor1**2 - embed_tensor2**2, axis=1) + self.b)) #returns (n_batches,_,*) elif self.embedding_type == 'REAL_SQRT': return sigmoid(self.a * (T.mean(1.0 - (embed_tensor1 - embed_tensor2) * (embed_tensor1 - embed_tensor2), axis=1))**.5 + self.b) #returns (n_batches,_,*) elif self.embedding_type == 'REAL_L1': return sigmoid(self.a * (T.mean( 1.0 - T.abs_(embed_tensor1 - 
embed_tensor2), axis=1)) + self.b) #returns (n_batches,_,*) elif self.embedding_type == 'REAL_TRAINED' or self.embedding_type == 'REAL_TRAINED_L1': terms = [ embed_tensor1 * embed_tensor2, embed_tensor1, embed_tensor2, embed_tensor1**2, embed_tensor2**2 ] expr = sum( [term * coef for term, coef in zip(terms, self.coefs)]) return sigmoid(self.a * T.mean(expr, axis=1) + self.b) def get_prob_sampled(embed_tensor1, embed_tensor2, n_samples): randomizationA = theano_rng.uniform( size=(embed_tensor1.shape[0], embed_tensor1.shape[1], n_samples)) #(n_batches,dim,val) randomizationB = theano_rng.uniform( size=(embed_tensor2.shape[0], embed_tensor2.shape[1], n_samples)) #(n_batches,dim,val) bithash_1 = T.switch( T.lt(randomizationA, embed_tensor1.dimshuffle(0, 1, 'x')), 1, 0) #(val,dim) bithash_2 = T.switch( T.lt(randomizationB, embed_tensor2.dimshuffle(0, 1, 'x')), 1, 0) #(val,dim) return ([bithash_1, bithash_2], get_prob(bithash_1, bithash_2)) def get_mean(embed_tensor1, embed_tensor2): return self.a * T.mean( 2.0 * embed_tensor1 * embed_tensor2 - embed_tensor1 - embed_tensor2 + 1.0 + self.b, axis=1) def get_var(embed_tensor1, embed_tensor2): p = 2.0 * embed_tensor1 * embed_tensor2 - embed_tensor1 - embed_tensor2 + 1.0 variances = p * (1 - p) total_var = T.sum(variances, axis=1) * (self.a / T.shape(variances)[1])**2 return total_var #build up list of sampling points and sampling weights, according to normal cdf approximation. #if objective_samples == None, stick to sub-optimal sampling scheme. def get_samples(embed_tensor1, embed_tensor2): if self.objective_samples == None: return [{ 'weight': 1.0, 'value': get_prob(embed_tensor1, embed_tensor2) }] else: PhiInv = lambda z: 2**.5 * erfinv(2 * z - 1) means = get_mean(embed_tensor1, embed_tensor2) variances = get_var(embed_tensor1, embed_tensor2) # print (variances**.5).tag.test_value spacing = 1.0 / (self.objective_samples + 1) xs = [] for i in range(1, self.objective_samples + 1): sample = variances**.5 * PhiInv(float(i) * spacing) + means xs.append({ 'weight': 1.0 * spacing, 'value': sigmoid(sample) }) xs.append({ 'weight': 0.5 * spacing, 'value': sigmoid(variances**.5 * PhiInv(0.5 * spacing) + means) }) xs.append({ 'weight': 0.5 * spacing, 'value': sigmoid(variances**.5 * PhiInv(1 - 0.5 * spacing) + means) }) return xs pos_losses = [ -sample['weight'] * T.mean(T.log(sample['value'])) for sample in get_samples(self.x1_emb, self.x2_emb) ] neg_losses = [ -sample['weight'] * T.mean(T.log(1 - sample['value'])) for sample in get_samples(self.x1neg_emb, self.x2neg_emb) ] self.loss = sum(pos_losses + neg_losses) # for sample in get_samples(self.x1_emb,self.x2_emb): # print "weight: ",sample['weight'], "value: ",sample['value'].tag.test_value # for x in pos_losses: # print "pos loss test_value:",x.tag.test_value # for x in neg_losses: # print "neg loss test_value:",x.tag.test_value #print "loss test value: ",self.loss.tag.test_value if self.n_samples != None: self.bithash_1s, self.bit_p1 = get_prob_sampled( self.x1_emb, self.x2_emb, self.n_samples) self.bithash_1s, self.bit_p2 = get_prob_sampled( self.x1neg_emb, self.x2neg_emb, self.n_samples) self.sampled_loss = T.mean(-T.log(self.bit_p1) - T.log(1 - self.bit_p2))
def __init__(self, cooccurrence, z_k, opt, initializer, initial_pz_weight=None, initial_b=None, pz_regularizer=None, eps=1e-9): cooccurrence = cooccurrence.astype(np.float32) self.cooccurrence = cooccurrence self.z_k = z_k self.opt = opt x_k = cooccurrence.shape[0] self.x_k = x_k # cooccurrence matrix n = np.sum(cooccurrence, axis=None) _co = cooccurrence / n co = T.constant(_co, name="co") # (x_k, x_k) _co_m = np.sum(_co, axis=1, keepdims=True) co_m = T.constant(_co_m, name="co_m") # (x_k,1) _co_c = _co / (eps + _co_m) _co_h = np.sum(_co * -np.log(eps + _co_c), axis=1, keepdims=True) # (x_k, 1) print "COh: {}".format(np.sum(_co_h)) co_h = T.constant(_co_h, name="co_h") if initial_pz_weight is None: initial_pz_weight = initializer((x_k, z_k)) pz_weight = K.variable(initial_pz_weight) pz = softmax_nd(pz_weight) initial_w = initializer((z_k, x_k)) w = K.variable(initial_w, name="w") # (z_k, x_k) if initial_b is None: initial_b = initializer((x_k, )) b = K.variable(initial_b, name="b") yw = softmax_nd(w + b) # (z_k, x_k) srng = RandomStreams(123) zsamp = srng.random_integers(size=(x_k, ), low=0, high=z_k - 1) yt = yw[zsamp, :] # (x_k, x_k) lt = -T.sum(co * T.log(eps + yt), axis=1) # (x_k,) pt = pz[T.arange(pz.shape[0]), zsamp] assert lt.ndim == 1 assert pt.ndim == 1 nll_loss = T.sum(pt * lt, axis=None) * z_k self.params = [pz_weight, w, b] reg_loss = T.constant(0.) if pz_regularizer: reg_loss = pz_regularizer(pz) total_loss = nll_loss + reg_loss encoding = T.argmax(pz_weight, axis=1) one_hot_encoding = tensor_one_hot(encoding, z_k) # (x_k, z_k) pb = T.dot(T.transpose(one_hot_encoding, (1, 0)), co) m = T.sum(pb, axis=1, keepdims=True) c = pb / (m + eps) validation_nll = -T.sum(pb * T.log(eps + c), axis=None) utilization = T.sum(T.gt(T.sum(one_hot_encoding, axis=0), 0), axis=0) updates = opt.get_updates(loss=total_loss, params=self.params) self.val_fun = theano.function([], [validation_nll, utilization]) self.encodings_fun = theano.function([], encoding) self.train_fun = theano.function([], [reg_loss, nll_loss, total_loss], updates=updates) self.weights = self.params + opt.weights
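The zsamp / pt construction above is an unbiased single-sample estimate of the full sum over z: each row draws one z uniformly and the summand is reweighted by z_k. A small NumPy-only illustration of that identity; the shapes and loss matrix below are assumptions, not values from the model:

import numpy as np

x_k, z_k = 5, 3
rng = np.random.RandomState(0)
pz = rng.dirichlet(np.ones(z_k), size=x_k)   # (x_k, z_k), rows sum to 1
lt = rng.rand(x_k, z_k)                      # per-(x, z) losses

exact = np.sum(pz * lt)

# Uniform one-sample estimator: draw one z per row, reweight by z_k.
estimates = []
for _ in range(10000):
    zsamp = rng.randint(0, z_k, size=x_k)
    estimates.append(z_k * np.sum(pz[np.arange(x_k), zsamp] *
                                  lt[np.arange(x_k), zsamp]))
print exact, np.mean(estimates)   # the two should agree closely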
class DropModality(Layer): ''' drop a modality alltogether ''' def __init__(self, input_shapes=[], **kwargs): self.trng = RandomStreams(seed=np.random.randint(10e6)) self.params = [] self.input_shapes = input_shapes def set_prev_shape(self, input_shapes): self.input_shapes = input_shapes def get_output(self, train=False): X = self.get_input(train) full = T.ones_like(X) masks = [full] for i in xrange(len(self.input_shapes)): mask = T.ones_like(X) idx = 0 for j in xrange(len(self.input_shapes)): if i == j: try: ishape = len(self.input_shapes[0]) except: ishape = [1] pass if len(ishape) == 3: mask = T.set_subtensor( mask[:, :, idx:idx + self.input_shapes[j]], 0) elif len(ishape) == 2: mask = T.set_subtensor( mask[:, idx:idx + self.input_shapes[j]], 0) elif len(ishape) == 1: mask = T.set_subtensor( mask[idx:idx + self.input_shapes[j]], 0) else: raise NotImplementedError() idx = idx + self.input_shapes[j] masks += [mask] masked = T.stack(masks) if train: index = self.trng.random_integers(size=(1, ), low=0, high=len(masks) - 1)[0] else: index = 0 masked_output = X * masked[index] return masked_output def get_masked(self, train=False): X = self.get_input(train) full = T.ones_like(X) masks = [full] for i in xrange(len(self.input_shapes)): mask = T.ones_like(X) idx = 0 for j in xrange(len(self.input_shapes)): if i == j: mask = T.set_subtensor( mask[:, :, idx:idx + self.input_shapes[j]], 0) idx = idx + self.input_shapes[j] masks += [mask] masked = T.stack(masks) index = self.trng.random_integers(size=(1, ), low=0, high=len(masks) - 1)[0] return masked, index def get_input_shapes(self): return self.input_shapes def get_config(self): config = { "name": self.__class__.__name__, "input_shapes": self.input_shapes } base_config = super(DropModality, self).get_config() return dict(list(base_config.items()) + list(config.items()))
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

floatX = theano.config.floatX

vocabularySize = 10
embeddingSize = 10
contextSize = 2
samples = 10

wordIndices = T.ivector('wordIndices')
defaultEmbeddings = np.arange(0, vocabularySize * embeddingSize).reshape(
    (vocabularySize, embeddingSize)).astype(floatX)
embeddings = theano.shared(defaultEmbeddings, name='embeddings', borrow=True)

random = RandomStreams(seed=234)
# Draw contextSize * samples negative-sample indices in [0, vocabularySize - 1].
negativeSampleIndices = random.random_integers((contextSize * samples,), 0,
                                               vocabularySize - 1)

indicies = T.concatenate([wordIndices, negativeSampleIndices])
indicies = indicies.reshape((samples + 1, contextSize))

output = embeddings[indicies]
output = output.mean(axis=1)

getEmbeddings = theano.function(
    inputs=[wordIndices],
    outputs=output
)

print getEmbeddings(range(0, contextSize))
uniform_sample = shared(np.matrix(np.float32(np.random.rand(10000000,1))),'float32', borrow=True) bino_input = shared(np.matrix(np.float32(np.random.binomial(1,1-dropout_input,(10000000,1)))),config.floatX, borrow=True) t1 = T.fmatrix("t1") a1 = T.fmatrix("a1") e1 = T.fmatrix("e1") idx = T.iscalar("idx") bsize = T.fscalar("bsize") alpha = T.fscalar("alpha") cv_size = T.fscalar("cv_size") drop_input = lambda rand: T.reshape(bino_input[rand:rand + (batch_size*dim_visible)],(batch_size,dim_visible)) input_drop = drop_input(rdm.random_integers(low=0, high=sample_range_dropout)) h = T.nnet.sigmoid(T.add(T.dot(v,w_vh),w_h)) u_w_plus = function([],updates=[(wu_vh, g(T.add(wu_vh,T.dot(v.T,h)))), (wu_v, g(T.add(T.sum(v[:],axis=0),wu_v))), (wu_h, g(T.add(T.sum(h[:],axis=0),wu_h))) ]) u_w_minus = function([],updates=[(wu_vh, g(T.sub(wu_vh,T.dot(v.T,h)))), (wu_v, g(T.sub(T.sum(v[:],axis=0),wu_v))), (wu_h, g(T.sub(T.sum(h[:],axis=0),wu_h))) ]) sample = lambda rdm: T.reshape(uniform_sample[rdm:rdm + (dim_hidden*batch_size)],(batch_size,dim_hidden))
class Neural_network_layer: '''Represents the units within a layer and the units activations and dropout functions. ''' def __init__(self, size, activation_function, dropout_type, dropout, dropout_decay, batch_size, frequency): self.drop_count = 0 self.size = size self.frequency = frequency self.dropout = dropout self.dropout_init = dropout self.dropout_decay = dropout_decay self.dropout_type = dropout_type self.rdm = RandomStreams(seed=1234) self.batch_size = batch_size self.sample_range = 100000 self.create_dropout_sample_functions() self.activation_crossvalidation = activation_function self.activation_function = self.set_dropout(dropout, activation_function) self.activation_derivative = lambda X: g(T.mul(X, (1.0 - X))) self.activation_tracker = self.set_activation_tracker(activation_function) pass def set_dropout(self, dropout, activation_function): action_with_drop = None if dropout > 0: action_with_drop = lambda X: T.mul(activation_function(X),self.dropout_function) self.activation_cv_dropout = lambda X: T.mul(activation_function(X),self.dropout_function_cv) else: action_with_drop = activation_function self.activation_cv_dropout = activation_function return action_with_drop def set_activation_tracker(self, activation_function): '''Sets a tracker function that logs the activations that exceed 0.75. ''' if activation_function == Activation_function.sigmoid: activation_tracker = lambda X: T.gt(activation_function(X),0.75) else: activation_tracker = None return activation_tracker def create_dropout_sample_functions(self, reset = False): '''Creates functions of sample vectors which can be index with random integers to create a pseudo random sample for dropout. This greatly speeds up sampling as no new samples have to be created. ''' if reset: self.dropout = self.dropout_init print 'Reset dropout to ' + str(self.dropout) self.dropout_function = None sample_function = None if self.dropout > 0: if self.dropout_type == Dropout.drop_activation: if reset: self.bino_sample_vector.set_value(np.matrix(np.float32( np.random.binomial(1,1-self.dropout,(10000000,1)))), borrow=True) else: self.bino_sample_vector = shared(np.matrix(np.float32( np.random.binomial(1,1-self.dropout,(10000000,1)))), 'float32', borrow=True) sample_function = lambda rand: g(T.reshape(self.bino_sample_vector[rand:rand + (self.batch_size*self.size)],(self.batch_size,self.size))) sample_function_cv = lambda rand: g(T.reshape(self.bino_sample_vector[rand:rand + (4200*self.size)],(4200,self.size))) self.dropout_function = sample_function(self.rdm.random_integers(low=0, high=self.sample_range)) self.dropout_function_cv = sample_function_cv(self.rdm.random_integers(low=0, high=self.sample_range)) def handle_dropout_decay(self, epoch): '''Handles automatically the dropout decay by decreasing the dropout by the given amount after the given number of epochs. ''' if self.dropout_function and self.frequency[self.drop_count] > 0 and epoch % self.frequency[self.drop_count] == 0 and epoch > 0: print 'Setting dropout from ' + str(self.dropout) + ' to ' + str(np.float32(self.dropout*(1-self.dropout_decay[self.drop_count]))) self.dropout = np.float32(self.dropout*(1-self.dropout_decay[self.drop_count])) if self.dropout_type == Dropout.drop_activation: self.bino_sample_vector.set_value(np.matrix(np.float32( np.random.binomial(1,1-self.dropout,(10000000,1)))), borrow=True) self.drop_count += 1 if self.drop_count > len(self.dropout_decay)-1: self.drop_count -= 1
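The pre-sampled dropout scheme described in create_dropout_sample_functions (one long Bernoulli vector sampled once, then sliced at a random offset to obtain a "fresh" mask) can be sketched in isolation; the layer size, batch size and dropout rate below are assumptions:

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

batch_size, layer_size, dropout = 128, 512, 0.5
bank_size = 10000000  # length of the pre-sampled 0/1 vector

# One big bank of Bernoulli(1 - dropout) samples, created once.
bank = theano.shared(np.float32(
    np.random.binomial(1, 1 - dropout, (bank_size, 1))), borrow=True)

rdm = RandomStreams(seed=1234)
# Random start offset; the upper bound keeps the slice inside the bank.
offset = rdm.random_integers(low=0, high=bank_size - batch_size * layer_size)

# A "fresh" dropout mask is just a reshaped slice starting at that offset.
mask = T.reshape(bank[offset:offset + batch_size * layer_size],
                 (batch_size, layer_size))

get_mask = theano.function([], mask)  # each call slices at a new random offset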
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

shared_random_generator = RandomStreams()

x_r = T.iscalar()
y_r = T.iscalar()
p_scalar = T.fscalar('p_scalar')
binomial_f = theano.function(
    [x_r, y_r, p_scalar],
    outputs=shared_random_generator.binomial(
        size=(x_r, y_r), n=1, p=p_scalar, dtype='float32'))

rows = T.iscalar()
columns = T.iscalar()
uniform_f = theano.function(
    [rows, columns],
    outputs=shared_random_generator.uniform(
        size=(rows, columns), low=-0.1, high=0.1, dtype='float32'))
# random_integers in [0, 10000] scaled by 1/10000 to get pseudo-uniform float32 values.
random_f = theano.function(
    [rows, columns],
    outputs=shared_random_generator.random_integers(
        size=(rows, columns), low=0, high=10000, dtype='float32') / 10000.)


def get_random_input(in_dim):
    return 2 * binomial_f(1, in_dim, 0.5) - np.ones((1, in_dim), dtype=np.float32)


def set_contains_pattern(patterns_set, pattern):
    for pat in patterns_set:
        if get_pattern_correlation(pat, pattern) == 1:
            return True
    return False


pat1 = T.fmatrix()
pat2 = T.fmatrix()
get_pattern_correlation = theano.function(
    [pat1, pat2],
    outputs=T.sum(pat1 * pat2) / (pat1.shape[0] * pat1.shape[1]))
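If only uniform float32 values are needed, the integer-scaling trick in random_f can also be written directly with uniform; a minimal alternative sketch reusing rows, columns and shared_random_generator from the block above:

# Alternative to random_f: draw float32 values in [0, 1) directly
# rather than scaling random integers by 1/10000.
uniform01_f = theano.function(
    [rows, columns],
    outputs=shared_random_generator.uniform(
        size=(rows, columns), low=0., high=1., dtype='float32'))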
import theano
import theano.tensor as T
from theano import function
from theano.tensor.shared_randomstreams import RandomStreams

# g is assumed to be defined elsewhere (a scalar or shared variable).
x = T.scalar()
z = g * x
gradVal = T.grad(z, x)
f = theano.function([x], gradVal)

# With Scan
def step(lastVal, xval):
    return (g * xval)

outputs, updates = theano.scan(step, sequences=[], non_sequences=[x],
                               outputs_info=[1.0], n_steps=5)
gradVal = T.grad(outputs[-1], x)
f = theano.function([x], outputs=gradVal)
print f(1), f(1), f(1), f(1)
exit(0)

if __name__ == '__main__':
    rng = RandomStreams(0)
    x = T.vector('x')
    xx = x ** 2
    # random_integers() with no arguments draws from {0, 1}, so y picks
    # either x[0] ** 2 or x[1] ** 2 at random on each evaluation.
    y = xx[rng.random_integers()]
    dy = T.grad(y, x)
    fdy = function([x], dy)
    for i in range(100):
        print fdy([1, 1])
def __init__(self, train_x, train_y, valid_x, valid_y, test_x, test_y, batchSize): rng = numpy.random.RandomState(42) self.train_x = theano.shared(train_x.astype('float32')) self.train_y = theano.shared(train_y.astype('int32')) self.valid_x = theano.shared(valid_x.astype('float32')).reshape((valid_x.shape[0],1,28,28)) self.valid_y = theano.shared(valid_y.astype('int32')) self.test_x = theano.shared(test_x.astype('float32')).reshape((test_x.shape[0],1,28,28)) self.test_y = theano.shared(test_y.astype('int32')) x = T.matrix() y = T.ivector() index = T.lscalar() learningRate = T.scalar() L1_reg = 0.0 L2_reg = 0.0 random_stream = RandomStreams(seed=420) indices = random_stream.random_integers((batchSize,), low=0, high=train_x.shape[0]-1) x = self.train_x.take(indices, axis=0) y = self.train_y.take(indices, axis=0) layer0Input = x.reshape((batchSize,1,28,28)) layer0 = ConvPoolLayer( rng=rng, input=layer0Input, filter_shape=(64,1,3,3), image_shape=(None,1,28,28), poolsize=(2,2) ) layer1 = ConvPoolLayer( rng=rng, input=layer0.output, filter_shape=(128,64,3,3), image_shape=(None,64,13,13), poolsize=(2,2) ) layer1Out = layer1.output.flatten(2) layer2 = HiddenLayer( rng=rng, input=layer1Out, n_in=128*5*5, n_out=512, activation=relu ) layer3 = LogisticRegression( rng=rng, input=layer2.output, n_in=layer2.n_out, n_out=10 ) L1 = abs(layer0.W).sum() + abs(layer1.W).sum() + abs(layer2.W).sum() + abs(layer3.W).sum() L2 = (layer0.W**2).sum() + (layer1.W**2).sum() + (layer2.W**2).sum() + (layer3.W**2).sum() cost = layer3.negative_log_likelihood(y) + L1_reg * L1 + L2_reg * L2 self.test_model = theano.function( [index], layer3.errors(y), givens={ layer0Input: self.test_x[index * 1000:(index+1)*1000,:,:,:], y: self.test_y[index * 1000:(index+1)*1000] } ) self.validate_model = theano.function( [index], [layer3.errors(y), cost], givens={ layer0Input: self.valid_x[index * 1000:(index+1)*1000,:,:,:], y: self.valid_y[index * 1000:(index+1)*1000] } ) self.forward = theano.function([layer0Input], [layer3.p_y_given_x]) self.params = layer3.params + layer2.params + layer1.params + layer0.params updates = self.rmsProp(cost, self.params, 0.7, 0.01, learningRate) self.train_model = theano.function( [learningRate], cost, updates=updates )
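The random-minibatch construction above (drawing batchSize row indices per update and gathering rows with take) can be reduced to a small self-contained sketch; the array sizes and seed below are assumptions:

import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

n_train, batch_size = 1000, 128
train_x = theano.shared(numpy.random.rand(n_train, 784).astype('float32'))
train_y = theano.shared(numpy.random.randint(0, 10, size=n_train).astype('int32'))

srng = RandomStreams(seed=420)
# Bounds of random_integers are inclusive, so high is n_train - 1.
indices = srng.random_integers((batch_size,), low=0, high=n_train - 1)
x_batch = train_x.take(indices, axis=0)
y_batch = train_y.take(indices, axis=0)

# Every call draws a new set of row indices, i.e. a new random minibatch.
sample_batch = theano.function([], [x_batch, y_batch])
xb, yb = sample_batch()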