def __init__(self, inputs, bs, max_time, class_num, feature_dim, hidden_size, method='max', seed=12345):
    """Baseline per-class binary model: pooled features -> hidden -> sigmoid unit."""
    self._inputs = inputs
    self.method = method
    self.name = 'baseline_' + str(class_num)
    self.batch_size = bs
    self.class_num = theano.shared(class_num)
    self.max_time = max_time
    self.feature_dim = feature_dim
    self.dropout = True
    # Feature transform: LeakyReLU with 50% dropout.
    self.hidden = HiddenLayer(input_size=feature_dim,
                              hidden_size=hidden_size,
                              batch_size=bs,
                              name='hidden',
                              dropout=0.5,
                              activation=act.LeakyRelu())
    # Single sigmoid output unit: binary decision for class `class_num`.
    self.classify = HiddenLayer(input_size=hidden_size,
                                hidden_size=1,
                                batch_size=bs,
                                name='classify',
                                dropout=0.0,
                                activation=act.sigmoid)
def __init__(self, inputs, bs, max_time, classes, feature_dim, hidden_size, method='max', seed=12345):
    """Multi-class model over temporally pooled features: hidden -> softmax."""
    self._inputs = inputs
    self.method = method
    self.batch_size = bs
    self.classes = classes
    self.max_time = max_time
    self.feature_dim = feature_dim
    self.dropout = True
    # Hidden feature transform (LeakyReLU, 50% dropout) feeding the classifier.
    self.hidden = HiddenLayer(input_size=feature_dim,
                              hidden_size=hidden_size,
                              batch_size=bs,
                              name='hidden',
                              dropout=0.5,
                              activation=act.LeakyRelu())
    # Softmax over `classes` categories.
    self.softmax = SoftmaxLayer(input_size=hidden_size,
                                classes=self.classes,
                                batch_size=bs,
                                name='softmax',
                                dropout=0.5)
def __init__(self, inputs, bs, max_time, classes, feature_dim, hidden_size, levels, N=1, pool=None, seed=12345):
    """Temporal-pyramid attention model.

    Builds a fixed (non-learned) pyramid of temporal attention filters —
    level ``l`` holds ``2**l`` filters evenly tiling the time axis — then a
    hidden layer and softmax classifier over the concatenated (or pooled)
    filter responses.

    Fix: ``pool == None`` comparisons replaced with the idiomatic
    ``pool is None`` (identity test for the None singleton).
    """
    self._inputs = inputs
    self.N = N
    self.batch_size = bs
    self.classes = classes
    self.max_time = max_time
    self.levels = levels
    self.feature_dim = feature_dim
    self.pool = pool
    self.dropout = True
    # create a pyramid of filters
    self.temporal_pyramid = []
    for l in range(self.levels):
        for f in range(2**l):
            tf = TemporalAttentionLayer(
                batch_size=bs, N=N, channels=feature_dim,
                name='temporal-attention-layer-' + str(l) + '-filter-' + str(f))
            # test=True -> the layer uses the fixed d/g/sigma set below
            # instead of learning them.
            tf.test = True
            # d: filter width = half the segment length at this level.
            tf.d = theano.shared(value=np.asarray([1. / 2**(l + 1)]).astype('float32'),
                                 name='d', borrow=True, broadcastable=[True])
            # g: filter center for the f-th segment at this level.
            tf.g = theano.shared(value=np.asarray([((1. / 2**l) + (2 * f / 2.**l))]).astype('float32'),
                                 name='g', borrow=True, broadcastable=[True])
            tf.sigma = theano.shared(value=np.asarray([5.0]).astype('float32'),
                                     name='sigma', borrow=True, broadcastable=[True])
            self.temporal_pyramid.append(tf)
    # Concatenate all filter responses unless a pooling mode collapses them.
    input_size = feature_dim * N * (len(self.temporal_pyramid) if pool is None else 1)
    self.hidden = HiddenLayer(input_size=input_size,
                              hidden_size=hidden_size,
                              activation=act.LeakyRelu(),
                              batch_size=bs,
                              name='hidden',
                              dropout=0.5)
    self.softmax = SoftmaxLayer(input_size=hidden_size,
                                classes=self.classes,
                                batch_size=bs,
                                name='softmax',
                                dropout=0.5)
def init_params(self):
    """Build the projection from the hidden state to the attention parameters."""
    # 5 parameters normally; one extra when dx/dy are predicted separately.
    n_out = 5 + self.use_dx_dy
    self.transform_hidden = HiddenLayer(input_size=self.input_hidden_size,
                                        hidden_size=n_out,
                                        batch_size=self.batch_size,
                                        activation=act.Identity,
                                        device=self.device,
                                        name='Read.Transform.' + self.name)
def __init__(self, batch_size, N, channels, name='', use_gpu=True, test=False,
             input_hidden_size=4096, initializers=None):
    """
    Temporal Read Layer

    Based on DRAW paper

    :param batch_size: sequences per batch
    :param N: number of attention taps; output is [batch, channels, N]
    :param channels: feature channels per timestep
    :param name: suffix for naming internal layers
    :param use_gpu: stored flag (GPU path selection)
    :param test: stored flag (fixed vs. learned attention parameters)
    :param input_hidden_size: size of the hidden state driving attention
    :param initializers: [g, d, sigma] initializers; defaults to three
        IsotropicGaussian(0.01).

    Fix: the default for ``initializers`` was a mutable list literal shared
    across all instances (mutable-default-argument pitfall); replaced with a
    ``None`` sentinel built per instance.
    """
    if initializers is None:
        initializers = [
            i.IsotropicGaussian(0.01),  # g
            i.IsotropicGaussian(0.01),  # d
            i.IsotropicGaussian(0.01),  # sigma
        ]
    self.batch_size = batch_size
    self.N = N
    self.channels = channels
    self.name = name
    self.output_shape = [batch_size, channels, N]
    self.use_gpu = use_gpu
    self.initializers = initializers
    self.test = test
    self.input_hidden_size = input_hidden_size
    # Projects the hidden state to the 3 attention parameters (g, d, sigma).
    self.hidden_layer = HiddenLayer(input_size=self.input_hidden_size,
                                    hidden_size=3,
                                    batch_size=self.batch_size,
                                    activation=act.Identity,
                                    name='Attention-Transform.' + self.name)
def __init__(self, rng, input, nkerns, recept_width, pool_width, stride,
             training_mode, dropout_prob, activation, weights_variance,
             n_channels, n_timesteps, n_fbins, global_pooling):
    """Two ConvPool layers over a (channels*fbins) x time input, then a hidden layer.

    :param rng: random generator handed to every layer
    :param nkerns: kernel/unit counts: [conv0, conv1, hidden]
    :param recept_width: temporal receptive-field width per conv layer
    :param pool_width: temporal pool width per conv layer
    :param stride: temporal subsample per conv layer
    :param global_pooling: if True, globally pool layer1 output before the
        hidden layer; otherwise flatten it directly
    """
    # Layer 0 convolves the full (n_channels * n_fbins) vertical extent in one
    # step, so its output height collapses to 1 and only time remains.
    self.layer0 = ConvPoolLayer(
        rng,
        input=input,
        image_shape=(None, 1, n_channels * n_fbins, n_timesteps),
        filter_shape=(nkerns[0], 1, n_fbins * n_channels, recept_width[0]),
        poolsize=(1, pool_width[0]),
        activation=activation[0],
        weights_variance=weights_variance,
        subsample=(1, stride[0]))
    # Valid-conv output width after stride, then pooling.
    # NOTE(review): relies on Python-2 integer `/`; under Python 3 this yields
    # a float — confirm the interpreter version this code targets.
    input_layer1_width = (
        (n_timesteps - recept_width[0]) / stride[0] + 1) / pool_width[0]
    self.layer1 = ConvPoolLayer(rng,
                                input=self.layer0.output,
                                image_shape=(None, nkerns[0], 1,
                                             input_layer1_width),
                                filter_shape=(nkerns[1], nkerns[0], 1,
                                              recept_width[1]),
                                poolsize=(1, pool_width[1]),
                                activation=activation[1],
                                weights_variance=weights_variance,
                                subsample=(1, stride[1]))
    if global_pooling:
        self.glob_pool = GlobalPoolLayer(self.layer1.output)
        layer2_input = self.glob_pool.output.flatten(2)
        # 6 — presumably the number of statistics GlobalPoolLayer emits per
        # feature map; TODO confirm against GlobalPoolLayer.
        input_layer2_shape = nkerns[1] * 6
        self.layer2 = HiddenLayer(rng=rng,
                                  input=layer2_input,
                                  n_in=input_layer2_shape,
                                  n_out=nkerns[2],
                                  training_mode=training_mode,
                                  dropout_prob=dropout_prob,
                                  activation=activation[2],
                                  weights_variance=weights_variance)
    else:
        layer2_input = self.layer1.output.flatten(2)
        # Same valid-conv/stride/pool width arithmetic as above (see NOTE).
        input_layer2_size = (
            (input_layer1_width - recept_width[1]) / stride[1] + 1) / pool_width[1]
        self.layer2 = HiddenLayer(rng=rng,
                                  input=layer2_input,
                                  n_in=nkerns[1] * input_layer2_size,
                                  n_out=nkerns[2],
                                  training_mode=training_mode,
                                  dropout_prob=dropout_prob,
                                  activation=activation[2],
                                  weights_variance=weights_variance)
    self.output = self.layer2.output
    # Concatenated trainable parameters of all three layers.
    self.weights = self.layer0.weights + self.layer1.weights + self.layer2.weights
def __init__(self): self.samples = [] # <Sample Object> self.iteration = 0 self.max_iteration = 1 self.tolerance_error = 0.001 self.learning_rate = 1.0 self.hidden_layer = HiddenLayer() self.output_layer = OutputLayer() self.sga = SGA() self.cost = Cost() self.picker = Picker()
def __init__(self, inputs, bs, max_time, classes, feature_dim, hidden_size, filters, N=1, pool=None, lstm_dim=4096, steps=8, seed=12345):
    """Learned temporal-attention model with an LSTM over attended features.

    Builds ``filters`` learned temporal attention filters, an LSTM that runs
    for ``steps`` steps over their responses, then a hidden layer and softmax
    classifier.

    Fix: ``pool == None`` replaced with the idiomatic ``pool is None``.
    """
    self._inputs = inputs
    self.N = N
    self.batch_size = bs
    self.classes = classes
    self.max_time = max_time
    self.filters = filters
    self.feature_dim = feature_dim
    self.pool = pool
    self.dropout = True
    self.steps = steps
    # One learned attention filter per requested filter.
    self.temporal_filters = []
    for f in range(filters):
        tf = TemporalAttentionLayer(
            batch_size=bs, N=N, channels=feature_dim,
            input_hidden_size=lstm_dim,
            name='temporal-attention-layer-filter-' + str(f))
        self.temporal_filters.append(tf)
    # Concatenate all N taps unless a pooling mode collapses them.
    input_size = feature_dim * len(
        self.temporal_filters) * (N if pool is None else 1)
    # Input projection produces all 4 LSTM gate pre-activations at once.
    self.lstm_in = HiddenLayer(input_size=input_size,
                               hidden_size=lstm_dim * 4,
                               batch_size=bs)
    self.lstm = LSTMLayer(input_size=lstm_dim, hidden_size=lstm_dim)
    self.hidden = HiddenLayer(input_size=lstm_dim,
                              hidden_size=hidden_size,
                              activation=act.relu,
                              batch_size=bs,
                              name='hidden',
                              dropout=0.5)
    self.softmax = SoftmaxLayer(input_size=hidden_size,
                                classes=self.classes,
                                batch_size=bs,
                                name='softmax',
                                dropout=0.5)
def init_params(self):
    """Create the hidden layer mapping the driving state to attention params."""
    self.transform_hidden = HiddenLayer(
        input_size=self.input_hidden_size,
        # one extra output when dx/dy are predicted independently
        hidden_size=5 + self.use_dx_dy,
        batch_size=self.batch_size,
        activation=act.Identity,
        device=self.device,
        name='Read.Transform.' + self.name,
    )
def __init__(self, inputs, bs, max_time, class_num, feature_dim, hidden_size, levels, N=1, pool=None, seed=12345):
    """Learned-attention binary model for a single class `class_num`."""
    self._inputs = inputs
    self.N = N
    self.batch_size = bs
    self.name = 'learned_' + str(class_num)
    self.class_num = theano.shared(class_num)
    self.max_time = max_time
    self.filters = levels
    self.feature_dim = feature_dim
    self.pool = pool
    self.dropout = True
    # One learned temporal attention filter per "level".
    self.temporal_pyramid = [
        TemporalAttentionLayer(batch_size=bs,
                               N=N,
                               channels=feature_dim,
                               name='af-' + str(f))
        for f in range(self.filters)
    ]
    input_size = feature_dim * len(self.temporal_pyramid)  # *N
    self.hidden = HiddenLayer(input_size=input_size,
                              hidden_size=hidden_size,
                              activation=act.LeakyRelu(),
                              batch_size=bs,
                              name='hidden',
                              dropout=0.5)
    # Single sigmoid output unit for the binary decision.
    self.classify = HiddenLayer(input_size=hidden_size,
                                hidden_size=1,
                                batch_size=bs,
                                name='classify',
                                dropout=0.0,
                                activation=act.sigmoid)
def __init__(self, bs, K, N, m):
    """Bidirectional LSTM encoder.

    Produces an m-dimensional hidden state for a length-N sequence over a
    vocabulary of size K; each direction carries m/2 dimensions.
    """
    self.K = K
    self.N = N
    self.m = m
    self.bs = bs
    # Each direction's input projection emits all 4 gate pre-activations
    # of an (m/2)-unit LSTM.
    gate_size = m * 4 // 2
    self.forward_in = HiddenLayer(input_size=K,
                                  hidden_size=gate_size,
                                  batch_size=bs,
                                  name='forward-lstm-in')
    self.forward_lstm = LSTMLayer(hidden_size=m // 2,
                                  activation=T.tanh,
                                  batch_size=bs,
                                  dropout=0.0,
                                  name='forward-lstm')
    self.backward_in = HiddenLayer(input_size=K,
                                   hidden_size=gate_size,
                                   batch_size=bs,
                                   name='backward-lstm-in')
    self.backward_lstm = LSTMLayer(hidden_size=m // 2,
                                   activation=T.tanh,
                                   batch_size=bs,
                                   dropout=0.0,
                                   name='backward-lstm')
class ImageModel(Model):
    """DRAW-style conditional image generator (alignDRAW).

    Draws an image over `steps` iterations with read/write attention,
    conditioned on a sentence encoding from a LanguageModel.  Training runs
    an inference LSTM (posterior) alongside the generative LSTM and
    accumulates a KL term between the two; sampling runs the generative side
    alone.
    """

    def __init__(self, bs, K, lang_N, steps, read_size, write_size, m,
                 gen_dim, infer_dim, z_dim, l, seed=12345, channels=1,
                 image_size=60*60, cinit=0):
        # K is the vocab size
        # lang_N is the (max) length of the sentence encoding
        # steps is the number of times to run the model
        # m is the size of the language representation
        # l is the dimensions in the align function
        # image_size is the w*h of image (assumed square)
        self.use_gpu = True
        self.cinit = cinit          # initial value of the canvas c0
        self.batch_size = bs
        self.channels = channels
        self.gen_dim = gen_dim      # generative LSTM hidden size
        self.z_dim = z_dim          # latent dimension
        self.m = m
        self.lang_N = lang_N
        self.steps = steps
        self.l = l
        self.read_size = read_size
        self.write_size = write_size
        self.infer_dim = infer_dim  # inference LSTM hidden size
        self.image_size = image_size
        self.language_model = LanguageModel(bs, K, lang_N, m)
        # Generative LSTM: input is [z ; s] (latent + aligned language).
        self.gen_in = HiddenLayer(input_size=m+z_dim,
                                  hidden_size=gen_dim*4,
                                  batch_size=bs,
                                  name='gen-lstm-in')
        self.gen_lstm = LSTMLayer(hidden_size=gen_dim,
                                  activation=T.tanh,
                                  batch_size=bs,
                                  dropout=0.0,
                                  name='gen-lstm')
        # Inference LSTM: input is [read(x) ; read(x_hat) ; h_gen].
        self.infer_in = HiddenLayer(input_size=2*channels*self.read_size**2+self.gen_dim,
                                    hidden_size=infer_dim*4,
                                    batch_size=bs,
                                    name='infer-lstm-in')
        self.infer_lstm = LSTMLayer(hidden_size=infer_dim,
                                    activation=T.tanh,
                                    batch_size=bs,
                                    dropout=0.0,
                                    name='infer-lstm')
        self.reader = ReadLayer(batch_size=self.batch_size,
                                N=self.read_size,
                                channels=channels,
                                image_width=int(np.sqrt(self.image_size)),
                                image_height=int(np.sqrt(self.image_size)),
                                input_hidden_size=gen_dim,
                                name='Read')
        self.writer = WriteLayer(batch_size=self.batch_size,
                                 N=self.write_size,
                                 channels=channels,
                                 image_width=int(np.sqrt(self.image_size)),
                                 image_height=int(np.sqrt(self.image_size)),
                                 input_hidden_size=gen_dim,
                                 name='Write')
        self.random = RandomStreams(seed)
        # create W_mu, W_sigma, v, U, W, b (alignment + latent projections)
        init = IsotropicGaussian(0.01)
        u = init.init(np_rng, (self.m, self.l))
        self.U = theano.shared(value=u, name='U', borrow=True)
        v = init.init(np_rng, (self.l,))
        self.v = theano.shared(value=v, name='v', borrow=True)
        w = init.init(np_rng, (self.gen_dim, self.l))
        self.W = theano.shared(value=w, name='W', borrow=True)
        b = init.init(np_rng, (self.l,))
        self.b = theano.shared(value=b, name='b', borrow=True)
        w_mu = init.init(np_rng, (self.infer_dim, self.z_dim))
        self.W_mu_infer = theano.shared(value=w_mu, name='W_mu_infer', borrow=True)
        w_sigma = init.init(np_rng, (self.infer_dim, self.z_dim))
        self.W_sigma_infer = theano.shared(value=w_sigma, name='W_sigma_infer', borrow=True)
        w_mu = init.init(np_rng, (self.gen_dim, self.z_dim))
        self.W_mu_gen = theano.shared(value=w_mu, name='W_mu_gen', borrow=True)
        w_sigma = init.init(np_rng, (self.gen_dim, self.z_dim))
        self.W_sigma_gen = theano.shared(value=w_sigma, name='W_sigma_gen', borrow=True)

    def batched_dot(self, A, B):
        """Batched matrix product; CUDA implementation when use_gpu is set."""
        if self.use_gpu:
            return theano.sandbox.cuda.blas.batched_dot(A, B)
        else:
            return T.batched_dot(A,B)

    @property
    def params(self):
        """All trainable parameters: shared matrices plus every sub-layer's."""
        return flatten_f([self.U, self.v, self.W, self.b,
                          self.W_mu_infer, self.W_sigma_infer,
                          self.W_mu_gen, self.W_sigma_gen] +
                         self.language_model.params +
                         self.gen_in.params + self.gen_lstm.params +
                         self.infer_in.params + self.infer_lstm.params +
                         self.reader.params + self.writer.params)

    def align(self, h_gen, h_lang, mask):
        """Soft attention over the sentence encoding (eq 2).

        Returns (s, alpha): s is the batch x m attended summary of h_lang,
        alpha the lang_N x batch x 1 attention weights.
        """
        # h_lang is N x batch x m
        # h_gen is batch x gen_dim
        # U is m x l
        # mask determines which elements of h_lang we care about
        # we want the result to be N x batch x l
        # using batched_dot this can be done
        # by making U to be 1 x m x l
        # and making h_lang to be N x batch x m
        # and repeating U N times on axis 0
        U = self.U.reshape((1, self.m, self.l)).repeat(self.lang_N, axis=0)
        # align_lang is now N x batch x l
        align_lang = self.batched_dot(h_lang, U)
        # W is gen_dim x l; h_gen is batch x gen_dim; result is batch x l
        align_img = T.dot(h_gen, self.W)
        # use broadcasting to get a to be N x batch x l
        alpha = T.tanh(align_lang + align_img.dimshuffle('x',0,1) + self.b.dimshuffle('x','x',0))
        # v is l, a is N x batch x l; result will be N x batch
        alpha = T.exp(T.dot(alpha, self.v))
        # need to mask a before normalizing
        # so that the parts that are masked do not affect the normalization
        mask = mask.transpose([1,0]) # make mask langN x batch_size
        # NOTE(review): bare `zeros` — presumably T.zeros or an imported
        # helper; confirm it is in scope at module level.
        alpha = T.switch(T.neq(mask,0), alpha, zeros((self.lang_N, self.batch_size)))
        # normalize a by the sum of a along the N (axis=0)
        alpha = alpha / T.sum(alpha, axis=0)
        # we now use alpha with h_lang to compute s_t:
        # broadcast alpha (N x batch x 1) against h_lang (N x batch x m)
        alpha = alpha.reshape((self.lang_N, self.batch_size, 1))
        s = h_lang * alpha
        # sum along the N axis to give batch x m
        s = T.sum(s, axis=0)
        return s, alpha

    # use with partial to pass in first args
    # scan will pass the remaining args
    def step_train(self, rnd_in, kl, h_infer, c_infer, h_gen, c_gen, c,
                   mu_gen, sigma_gen, h_lang, x, mask):
        """One training step: infer posterior, sample z, run gen LSTM, write.

        Accumulates the KL divergence between posterior and prior in `kl`.
        """
        # h_gen is a sequence; h_lang is a non-sequence (but it is used to
        # calculate the align function each step)
        # eqs 10-13: compute "error image"
        x_hat = x-T.nnet.sigmoid(c)
        # read from both input (x) and error image
        r, _ = self.reader.run(x, h_gen)
        r_hat, _ = self.reader.run(x_hat, h_gen)
        # concatenate the two read regions
        r = T.concatenate([r,r_hat], axis=1)
        # run the infer lstm on the read regions
        val = self.infer_in.run(T.concatenate([r, h_gen], axis=1))
        h_infer_t, c_infer_t = self.infer_lstm.run(val, h_infer, c_infer)
        # We don't need to sample from Q per se; we use its mean/logsigma for
        # the KL term and to draw z for the generator.
        mu_infer = T.dot(h_infer_t, self.W_mu_infer)
        sigma_infer = 0.5*T.dot(h_infer_t, self.W_sigma_infer)
        # reparameterized sample from the posterior
        z = mu_infer + T.exp(sigma_infer) * rnd_in
        # accumulate KL divergence between infer and gen normal distributions
        kl_t = kl + T.sum(-1 + ((mu_infer - mu_gen)**2 + T.exp(2*sigma_infer))/ (T.exp(2*sigma_gen)) - 2*sigma_infer + 2*sigma_gen)
        # do the alignment (eq 2): N x batch x m becomes batch x m
        s,_ = self.align(h_gen, h_lang, mask)
        # run the LSTM (eq 3); val is batch x (m + z_dim)
        val = self.gen_in.run(T.concatenate([z,s], axis=1))
        h_gen_t, c_gen_t = self.gen_lstm.run(val, h_gen, c_gen)
        # next-step prior parameters from the new generative state
        mu_gen = T.tanh(T.dot(h_gen_t, self.W_mu_gen))
        sigma_gen = T.tanh(T.dot(h_gen_t, self.W_sigma_gen))
        # do the write (eq 4)
        c_update, _ = self.writer.run(h_gen_t)
        c_t = c + c_update
        return kl_t, h_infer_t, c_infer_t, h_gen_t, c_gen_t, c_t, mu_gen, sigma_gen

    def train(self, x, y, mask):
        """Build the training graph; returns (kl, log_recons, log_likelihood, c)."""
        # do language model on y
        h_lang = self.language_model.run(y)
        # do train recurrence
        h_infer, c_infer = self.infer_lstm.get_initial_hidden
        h_gen, c_gen = self.gen_lstm.get_initial_hidden
        # canvas initialized to cinit, one row broadcast to the batch
        c0 = theano.shared(self.cinit*np.ones((1, self.channels*self.image_size)).astype(theano.config.floatX))
        c0 = c0.repeat(self.batch_size, axis=0)
        # NOTE(review): uses module-level `rng`, not self.random — confirm
        # this is intentional.
        rnd_in = rng.normal(size=(self.steps, self.batch_size, self.z_dim),
                            avg=0.0, std=1.0, dtype=theano.config.floatX)
        # setup output
        outputs_info = [dict(initial=T.zeros(()), taps=[-1]),  # kl
                        dict(initial=h_infer, taps=[-1]),      # h_infer
                        dict(initial=c_infer, taps=[-1]),      # c_infer
                        dict(initial=h_gen, taps=[-1]),        # h_gen
                        dict(initial=c_gen, taps=[-1]),        # c_gen
                        dict(initial=c0, taps=[-1]),           # c
                        dict(initial=T.zeros((self.batch_size,self.z_dim)), taps=[-1]),  # mu_gen
                        dict(initial=T.zeros((self.batch_size,self.z_dim)), taps=[-1])]  # sigma_gen
        # do scan
        [kl, h_infer, c_infer, h_gen, c_gen, c, mu_gen, sigma_gen], _ = theano.scan(
            fn=self.step_train,
            sequences=rnd_in,
            outputs_info=outputs_info,
            non_sequences=[h_lang,x,mask],
            n_steps=self.steps)
        # Get x-reconstruction-error (eq 5)
        x_recons = T.nnet.sigmoid(c[-1,:,:])
        log_recons = T.nnet.binary_crossentropy(x_recons, x).sum()
        # compute KL (final accumulated value, halved)
        kl = 0.5*kl[-1]
        log_likelihood = kl + log_recons
        log_likelihood = log_likelihood.mean()
        kl = kl.mean()
        log_recons = log_recons.mean()
        return kl, log_recons, log_likelihood, c

    def generate_image(self, y, mask):
        """Build the sampling graph: generate images from sentences alone."""
        # do language model on y
        h_lang = self.language_model.run(y)
        # generative recurrence only (no inference side at sample time)
        h_gen, c_gen = self.gen_lstm.get_initial_hidden
        c0 = theano.shared(self.cinit*np.ones((1, self.channels*self.image_size)).astype(theano.config.floatX))
        c0 = c0.repeat(self.batch_size, axis=0)
        # NOTE(review): module-level `rng` again — see train().
        rnd_in = rng.normal(size=(self.steps, self.batch_size, self.z_dim),
                            avg=0.0, std=1.0, dtype=theano.config.floatX)
        # setup output
        outputs_info = [dict(initial=h_gen, taps=[-1]),  # h_gen
                        dict(initial=c_gen, taps=[-1]),  # c_gen
                        dict(initial=c0, taps=[-1]),     # c
                        dict(initial=T.zeros((self.batch_size,self.z_dim)), taps=[-1]),  # mu_gen
                        dict(initial=T.zeros((self.batch_size,self.z_dim)), taps=[-1]),  # sigma_gen
                        dict(initial=T.zeros((self.lang_N,self.batch_size, 1)), taps=[-1])]  # alpha
        # do scan
        [h_gen, c_gen, c, mu_gen, sigma_gen, alpha], _ = theano.scan(
            fn=self.step_gen,
            sequences=rnd_in,
            outputs_info=outputs_info,
            non_sequences=[h_lang,mask],
            n_steps=self.steps)
        c = T.nnet.sigmoid(c)
        return c[-1].reshape((1,self.batch_size,self.channels, self.image_size)), alpha

    def step_gen(self, rnd_in, h_gen, c_gen, c, mu_gen, sigma_gen, alpha,
                 h_lang, mask):
        """One sampling step: draw z from the prior, run gen LSTM, write."""
        # generate a sample from the generative distribution
        z = mu_gen + T.exp(sigma_gen) * rnd_in
        # do the alignment (eq 2): N x batch x m becomes batch x m
        s, alpha = self.align(h_gen, h_lang, mask)
        # run the LSTM (eq 3); val is batch x (m + z_dim)
        val = self.gen_in.run(T.concatenate([z,s], axis=1))
        h_gen_t, c_gen_t = self.gen_lstm.run(val, h_gen, c_gen)
        mu_gen = T.tanh(T.dot(h_gen_t, self.W_mu_gen))
        sigma_gen = T.tanh(T.dot(h_gen_t, self.W_sigma_gen))
        # do the write (eq 4)
        c_update, _ = self.writer.run(h_gen_t)
        c_t = c + c_update
        return h_gen_t, c_gen_t, c_t, mu_gen, sigma_gen, alpha

    def build_sample_function(self, y, mask):
        """Compile the sentence->image sampling function into self.sample_sentences."""
        c, alpha = self.generate_image(y, mask)
        self.sample_sentences = theano.function([y, mask], [c])

    @property
    def inputs(self):
        return self._inputs

    @property
    def outputs(self):
        return self._outputs

    @property
    def updates(self):
        return self._updates
class ReadLayer(object):
    def __init__(self, batch_size, N, channels, image_width, image_height,
                 input_hidden_size, use_dx_dy=False, name='', test=False,
                 use_gpu=True, device='gpu', use_gamma=True):
        """
        Read Layer from DRAW paper

        Extracts an N x N attended glimpse from an image, with attention
        parameters predicted from a driving hidden state of size
        `input_hidden_size`.
        """
        self.batch_size = batch_size
        self.use_dx_dy = use_dx_dy      # predict separate x/y strides
        self.N = N                      # glimpse grid size (N x N)
        self.channels = channels
        self.width = image_width
        self.height = image_height
        self.name = name
        self.input_hidden_size = input_hidden_size
        self.test = test                # True -> attention params given directly
        self.output_shape = [batch_size, channels, N, N]
        self.use_gpu = use_gpu
        self.use_gamma = use_gamma      # learn the gamma intensity scale
        self.device = device
        self.init_params()

    def load_pretrained(self, v, i):
        # No pretrained weights for this layer; pass the index through.
        return i

    def init_params(self):
        """Hidden layer mapping h -> 5 (+1 with use_dx_dy) attention params."""
        self.transform_hidden = HiddenLayer(input_size=self.input_hidden_size, hidden_size=5+self.use_dx_dy, batch_size=self.batch_size, activation=act.Identity, device=self.device, name='Read.Transform.'+self.name)

    def batched_dot(self, A, B):
        """Batched matrix product; CUDA implementation when use_gpu is set."""
        if self.use_gpu:
            return theano.sandbox.cuda.blas.batched_dot(A, B)
        else:
            return T.batched_dot(A,B)
        # C = A.dimshuffle([0,1,2,'x']) * B.dimshuffle([0,'x',1,2])
        # return C.sum(axis=-2)

    def get_params(self, h):
        """Decode attention params (center, stride, variance, gamma) from h."""
        hidden = self.transform_hidden.run(h)
        # centers mapped from (-1, 1) into image coordinates
        gx = (hidden[:,0]+1)*0.5 * self.width
        gy = (hidden[:,1]+1)*0.5 * self.height
        s2 = T.exp(hidden[:,3]/2.0)
        if self.use_gamma:
            g = T.exp(hidden[:,4]).dimshuffle(0,'x')
        else:
            # gamma disabled: force it to exactly 1 (g/g)
            g = T.exp(hidden[:,4]).dimshuffle(0,'x')
            g = g/g
        if self.use_dx_dy:
            # independent x/y strides
            dx = (self.width-1.0) / (self.N-1.0) * T.exp(hidden[:,2])
            dy = (self.height-1.0) / (self.N-1.0) * T.exp(hidden[:,5])
        else:
            # shared stride scaled by the larger image dimension
            dx = dy = ((max(self.width,self.height)-1.0) / (self.N-1.0) * T.exp(hidden[:,2]))
        return gx,gy,dx,dy,s2,g

    def get_params_test(self, h):
        """Test mode: h already holds (gx, gy, dx, dy, s2, gamma) columns."""
        return h[:,0], h[:,1], h[:,2], h[:,5], h[:,3], h[:,4].dimshuffle(0,'x')

    def run(self, images, h):#, error_images, h):
        """Read an N x N glimpse from `images` using attention driven by h.

        Returns (gamma-scaled flattened glimpse, (gx, gy, dx, dy, fint)).
        """
        channels = self.channels#images.shape[1]
        if not self.test:
            gx,gy,dx,dy,s2,g = self.get_params(h)
        else:
            gx,gy,dx,dy,s2,g = self.get_params_test(h)
        # how to handle variable sized input images? (mask??)
        I = images.reshape((self.batch_size*self.channels, self.height, self.width))
        # grid of filter centers around (gx, gy) with stride (dx, dy)
        muX = gx.dimshuffle([0,'x']) + dx.dimshuffle([0,'x']) * (T.arange(self.N).astype(theano.config.floatX) - self.N/2 - 0.5)
        muY = gy.dimshuffle([0,'x']) + dy.dimshuffle([0,'x']) * (T.arange(self.N).astype(theano.config.floatX) - self.N/2 - 0.5)
        a = T.arange(self.width).astype(theano.config.floatX)
        b = T.arange(self.height).astype(theano.config.floatX)
        # Gaussian filterbanks over x and y
        Fx = T.exp(-(a-muX.dimshuffle([0,1,'x']))**2 / 2. / s2.dimshuffle([0,'x','x'])**2)
        Fy = T.exp(-(b-muY.dimshuffle([0,1,'x']))**2 / 2. / s2.dimshuffle([0,'x','x'])**2)
        # normalize rows (epsilon guards against all-zero rows)
        Fx = Fx / (Fx.sum(axis=-1).dimshuffle([0,1,'x']) + 1e-4)
        Fy = Fy / (Fy.sum(axis=-1).dimshuffle([0,1,'x']) + 1e-4)
        # repeat filterbanks per channel to match the flattened image batch
        self.Fx = T.repeat(Fx, channels, axis=0)
        self.Fy = T.repeat(Fy, channels, axis=0)
        # glimpse = Fy . I . Fx^T
        self.fint = self.batched_dot(self.Fy, I)
        # self.efint = T.dot(self.Fx, error_images)
        self.fim = self.batched_dot(self.fint, self.Fx.transpose([0,2,1])).reshape(
            (self.batch_size, self.channels*self.N*self.N))
        # self.feim = T.dot(self.efint, self.Fy.transpose([0,2,1])).reshape(
        #     (self.batch_size, channels,self.N,self.N))
        return g * self.fim, (gx, gy, dx, dy, self.fint)#$T.concatenate([self.fim, self.feim], axis=1)

    @property
    def params(self):
        return [param for param in self.transform_hidden.params]

    @params.setter
    def params(self, params):
        self.transform_hidden.params = params

    def print_layer(self):
        """Human-readable summary of the layer's input/output shapes."""
        v = '--------------------\n'
        v += 'Read Layer '+self.name+'\n'
        v += 'Input Shape: '+str((self.width, self.height))+'\n'
        return v + 'Output Shape: '+str((self.N, self.N))+'\n'
def __init__(self, bs, K, lang_N, steps, read_size, write_size, m, gen_dim,
             infer_dim, z_dim, l, seed=12345, channels=1, image_size=60*60,
             cinit=0):
    """Set up the alignDRAW model: language encoder, generative/inference
    LSTMs, read/write attention layers, and shared alignment/latent weights.
    """
    # K is the vocab size
    # lang_N is the (max) length of the sentence encoding
    # steps is the number of times to run the model
    # m is the size of the language representation
    # l is the dimensions in the align function
    # image_size is the w*h of image (assumed square)
    self.use_gpu = True
    self.cinit = cinit          # initial canvas value
    self.batch_size = bs
    self.channels = channels
    self.gen_dim = gen_dim      # generative LSTM hidden size
    self.z_dim = z_dim          # latent dimension
    self.m = m
    self.lang_N = lang_N
    self.steps = steps
    self.l = l
    self.read_size = read_size
    self.write_size = write_size
    self.infer_dim = infer_dim  # inference LSTM hidden size
    self.image_size = image_size
    self.language_model = LanguageModel(bs, K, lang_N, m)
    # Generative LSTM: input is [z ; s] (latent + aligned language).
    self.gen_in = HiddenLayer(input_size=m+z_dim,
                              hidden_size=gen_dim*4,
                              batch_size=bs,
                              name='gen-lstm-in')
    self.gen_lstm = LSTMLayer(hidden_size=gen_dim,
                              activation=T.tanh,
                              batch_size=bs,
                              dropout=0.0,
                              name='gen-lstm')
    # Inference LSTM: input is [read(x) ; read(x_hat) ; h_gen].
    self.infer_in = HiddenLayer(input_size=2*channels*self.read_size**2+self.gen_dim,
                                hidden_size=infer_dim*4,
                                batch_size=bs,
                                name='infer-lstm-in')
    self.infer_lstm = LSTMLayer(hidden_size=infer_dim,
                                activation=T.tanh,
                                batch_size=bs,
                                dropout=0.0,
                                name='infer-lstm')
    self.reader = ReadLayer(batch_size=self.batch_size,
                            N=self.read_size,
                            channels=channels,
                            image_width=int(np.sqrt(self.image_size)),
                            image_height=int(np.sqrt(self.image_size)),
                            input_hidden_size=gen_dim,
                            name='Read')
    self.writer = WriteLayer(batch_size=self.batch_size,
                             N=self.write_size,
                             channels=channels,
                             image_width=int(np.sqrt(self.image_size)),
                             image_height=int(np.sqrt(self.image_size)),
                             input_hidden_size=gen_dim,
                             name='Write')
    self.random = RandomStreams(seed)
    # create W_mu, W_sigma, v, U, W, b (alignment + latent projections)
    init = IsotropicGaussian(0.01)
    u = init.init(np_rng, (self.m, self.l))
    self.U = theano.shared(value=u, name='U', borrow=True)
    v = init.init(np_rng, (self.l,))
    self.v = theano.shared(value=v, name='v', borrow=True)
    w = init.init(np_rng, (self.gen_dim, self.l))
    self.W = theano.shared(value=w, name='W', borrow=True)
    b = init.init(np_rng, (self.l,))
    self.b = theano.shared(value=b, name='b', borrow=True)
    # posterior (inference) projections to mu / log-sigma
    w_mu = init.init(np_rng, (self.infer_dim, self.z_dim))
    self.W_mu_infer = theano.shared(value=w_mu, name='W_mu_infer', borrow=True)
    w_sigma = init.init(np_rng, (self.infer_dim, self.z_dim))
    self.W_sigma_infer = theano.shared(value=w_sigma, name='W_sigma_infer', borrow=True)
    # prior (generative) projections to mu / log-sigma
    w_mu = init.init(np_rng, (self.gen_dim, self.z_dim))
    self.W_mu_gen = theano.shared(value=w_mu, name='W_mu_gen', borrow=True)
    w_sigma = init.init(np_rng, (self.gen_dim, self.z_dim))
    self.W_sigma_gen = theano.shared(value=w_sigma, name='W_sigma_gen', borrow=True)
class TemporalModel(Model):
    """Classify a (batch, time, feature) sequence by pooling over time.

    Pools frame features with max/sum/mean, runs a hidden layer and a
    softmax classifier, and returns probability, prediction,
    L2-regularized loss, error and accuracy.

    Fix: the 'mean' pooling branch was an unreachable second ``elif``
    (the preceding branch already matched 'mean'), so mean pooling never
    divided by the number of valid frames; normalization now happens.
    """

    def __init__(self, inputs, bs, max_time, classes, feature_dim, hidden_size, method='max', seed=12345):
        self._inputs = inputs
        self.method = method  # temporal pooling: 'max', 'sum' or 'mean'
        self.batch_size = bs
        self.classes = classes
        self.max_time = max_time
        self.feature_dim = feature_dim
        self.dropout = True
        self.hidden = HiddenLayer(input_size=feature_dim,
                                  hidden_size=hidden_size,
                                  batch_size=bs,
                                  name='hidden',
                                  dropout=0.5,
                                  activation=act.LeakyRelu())
        self.softmax = SoftmaxLayer(input_size=hidden_size,
                                    classes=self.classes,
                                    batch_size=bs,
                                    name='softmax',
                                    dropout=0.5)

    @property
    def params(self):
        return self.softmax.params + self.hidden.params

    @property
    def inputs(self):
        return self._inputs

    @property
    def outputs(self):
        return self._outputs

    @property
    def updates(self):
        return self._updates

    @property
    def test_algorithm(self):
        # Lazily compile a theano function that runs the model with
        # dropout turned off (for evaluation).
        if not hasattr(self, '_talgorithm'):
            d = self.dropout
            self.dropout = False
            o = self.run(*self.inputs)
            for i, ot in enumerate(self.outputs):
                o[i].name = ot.name
            self._talgorithm = theano.function(inputs=self.inputs, outputs=o,
                                               on_unused_input='warn')
            self.dropout = d
        return self._talgorithm

    def run(self, x, mask, y):
        """Pool x over time, classify, and return (prob, pred, loss, error, acc)."""
        # get the max/mean/sum of x for each feature from all frames
        if self.method == 'max':
            # push masked (padding) frames far down so they never win the max
            m = (-100 * (1 - mask)).dimshuffle([0, 1, 'x'])
            x = T.max(x + m, axis=1)
        elif self.method == 'sum' or self.method == 'mean':
            x = T.sum(x, axis=1)
            # BUG FIX: 'mean' used to be a separate, unreachable elif branch,
            # so the sum was never normalized. Divide by the count of valid
            # (unmasked) frames here.
            if self.method == 'mean':
                x = x / T.sum(mask, axis=1).dimshuffle([0, 'x'])
        x = x.astype(theano.config.floatX)
        x = self.hidden.run(x, self.dropout)
        prob, pred = self.softmax.run(x, self.dropout)
        y = y.reshape((y.shape[0], ))
        # cross-entropy plus L2 penalties on both weight matrices
        loss = self.softmax.loss(prob, y) + T.sum(
            self.hidden.w**2) * 0.001 + T.sum(self.softmax.w**2) * 0.0001
        # NOTE(review): hard-coded 51 classes — presumably should match
        # self.classes; confirm before generalizing.
        y = T.extra_ops.to_one_hot(y, 51)
        error = self.softmax.error(pred, y)
        acc = 1 - error
        return prob, pred, loss, error, acc
class RBFNN:
    """Radial-basis-function network trained with SGA.

    Holds training samples, a hidden layer of RBF center nets, an output
    layer of weighted nets, and helper strategies (SGA updates, cost
    tracking, center picking).

    Fixes: four mutable default arguments (``samples=[]``, ``center=[]``,
    ``custom_sigmas=[]``, ``features=[]``) replaced with immutable/None
    defaults to avoid the shared-default pitfall.  Chinese comments
    translated to English.
    """

    def __init__(self):
        self.samples = []  # <Sample Object>
        self.iteration = 0
        self.max_iteration = 1
        self.tolerance_error = 0.001  # RMSE threshold that stops training
        self.learning_rate = 1.0
        self.hidden_layer = HiddenLayer()
        self.output_layer = OutputLayer()
        self.sga = SGA()
        self.cost = Cost()
        self.picker = Picker()

    def reset(self):
        """Clear samples, iteration counter, layer nets and cost history."""
        del self.samples[:]
        self.iteration = 0
        self.hidden_layer.clear_nets()
        self.output_layer.clear_nets()
        self.cost.clear()

    # sample: Sample Object
    def add_sample(self, sample):
        if not sample:
            return
        self.samples.append(sample)

    def add_samples(self, samples=()):
        # immutable () default instead of a shared mutable []
        for sample in samples:
            self.add_sample(sample)

    # Assign random weights to the output-layer neurons.
    def randomize_weights(self, min=-0.25, max=0.25):
        hidden_count = len(self.hidden_layer.nets)
        for output_net in self.output_layer.nets:
            output_net.randomize_weights(hidden_count, min, max)

    def zero_weights(self):
        self.randomize_weights(0.0, 0.0)

    def add_center(self, center=None):
        # None sentinel: CenterNet may keep a reference to the list, so a
        # shared mutable default would leak state between calls.
        center_net = CenterNet([] if center is None else center)
        self.hidden_layer.add_net(center_net)

    # k: how many centers (hidden-layer neurons) to use
    # pick_method: strategy for picking the centers
    def initialize_centers(self, k=1, pick_method=PickMethod.Random):
        self.picker.samples = self.samples
        picked_samples = {
            PickMethod.Random: self.picker.shuffing,
            PickMethod.Clustering: self.picker.clustering
        }.get(pick_method)(k)
        # Install every picked center into the hidden layer.
        for sample in picked_samples:
            center_net = CenterNet(sample.features)
            self.hidden_layer.add_net(center_net)

    def initialize_outputs(self):
        # One output net per target dimension.
        outputs_count = len(self.samples[0].targets)
        for i in range(outputs_count):
            output_net = OutputNet()
            self.output_layer.add_net(output_net)

    # custom_sigmas: list of floats, a custom sigma per center (1 center has 1 sigma)
    def training(self, iteration_callback=None, completion_callback=None, custom_sigmas=()):
        self.iteration = 0
        self.cost.clear()
        # Set a uniform learning rate for the 3 parameter groups:
        # 1. weights, 2. centers, 3. sigmas.
        self.sga.uniform_learning_rate(self.learning_rate)
        # Set the sigmas of the centers.
        if len(custom_sigmas) > 0:
            self.hidden_layer.refresh_centers_sigma(custom_sigmas)
        else:
            # No custom sigmas: run the algorithm that computes an initial
            # common sigma.
            self.hidden_layer.initialize_centers_sigma()
        # Training loop.
        while (self.iteration < self.max_iteration and self.cost.rmse > self.tolerance_error):
            self.iteration += 1
            for sample in self.samples:
                # Network outputs for this sample.
                center_nets = self.hidden_layer.nets
                output_nets = self.output_layer.nets
                hidden_outputs = self.hidden_layer.output(
                    sample)  # Output RBF Values
                network_outputs = self.output_layer.output(
                    sample, hidden_outputs)
                # Training failed: abort and report failure.
                if network_outputs == -1:
                    if completion_callback:
                        completion_callback(self, False)
                    return
                # Record the cost.
                self.cost.add(network_outputs, sample.targets)
                # Update centers, then weights.
                self.sga.update_centers(sample, center_nets, output_nets)
                self.sga.update_weights(center_nets, output_nets)
            # One full pass over all training samples = one iteration.
            if iteration_callback:
                iteration_callback(self)
        # Training finished.
        if completion_callback:
            completion_callback(self, True)

    # features: list of floats
    def predicate(self, features=()):
        # NOTE(review): "predicate" presumably means "predict"; renaming
        # would break callers, so the name is kept.
        return self.output_layer.predicate(features, self.hidden_layer.nets)
class ImageModel(Model):
    """alignDRAW-style conditional image generator.

    Couples a bidirectional language model with generator/inference LSTMs,
    attention-based read/write layers, and a per-step variational latent z.
    `train` builds the training graph (KL + reconstruction loss);
    `generate_image` builds the sampling graph conditioned only on text.
    """

    def __init__(self, bs, K, lang_N, steps, read_size, write_size, m,
                 gen_dim, infer_dim, z_dim, l, seed=12345, channels=1,
                 image_size=60 * 60, cinit=0):
        # K is the vocab size
        # lang_N is the (max) length of the sentence encoding
        # steps is the number of times to run the model
        # m is the size of the language representation
        # l is the dimensionality used inside the align function
        # image_size is the w*h of the image (assumed square)
        self.use_gpu = True
        self.cinit = cinit            # initial value of the canvas c0
        self.batch_size = bs
        self.channels = channels
        self.gen_dim = gen_dim
        self.z_dim = z_dim
        self.m = m
        self.lang_N = lang_N
        self.steps = steps
        self.l = l
        self.read_size = read_size
        self.write_size = write_size
        self.infer_dim = infer_dim
        self.image_size = image_size
        # Bidirectional LSTM encoder over the caption.
        self.language_model = LanguageModel(bs, K, lang_N, m)
        # Input projection + LSTM for the generator (input is [z, s]).
        self.gen_in = HiddenLayer(input_size=m + z_dim,
                                  hidden_size=gen_dim * 4,
                                  batch_size=bs,
                                  name='gen-lstm-in')
        self.gen_lstm = LSTMLayer(hidden_size=gen_dim,
                                  activation=T.tanh,
                                  batch_size=bs,
                                  dropout=0.0,
                                  name='gen-lstm')
        # Inference LSTM consumes both read patches (image + error image)
        # concatenated with the generator hidden state.
        self.infer_in = HiddenLayer(
            input_size=2 * channels * self.read_size**2 + self.gen_dim,
            hidden_size=infer_dim * 4,
            batch_size=bs,
            name='infer-lstm-in')
        self.infer_lstm = LSTMLayer(hidden_size=infer_dim,
                                    activation=T.tanh,
                                    batch_size=bs,
                                    dropout=0.0,
                                    name='infer-lstm')
        # DRAW-style attention read/write over a square image.
        self.reader = ReadLayer(batch_size=self.batch_size,
                                N=self.read_size,
                                channels=channels,
                                image_width=int(np.sqrt(self.image_size)),
                                image_height=int(np.sqrt(self.image_size)),
                                input_hidden_size=gen_dim,
                                name='Read')
        self.writer = WriteLayer(batch_size=self.batch_size,
                                 N=self.write_size,
                                 channels=channels,
                                 image_width=int(np.sqrt(self.image_size)),
                                 image_height=int(np.sqrt(self.image_size)),
                                 input_hidden_size=gen_dim,
                                 name='Write')
        self.random = RandomStreams(seed)
        # Create W_mu, W_sigma, v, U, W, b for the align function and the
        # latent-distribution projections.
        # NOTE(review): `np_rng` is presumably a module-level numpy RandomState
        # — not visible in this chunk; verify.
        init = IsotropicGaussian(0.01)
        u = init.init(np_rng, (self.m, self.l))
        self.U = theano.shared(value=u, name='U', borrow=True)
        v = init.init(np_rng, (self.l, ))
        self.v = theano.shared(value=v, name='v', borrow=True)
        w = init.init(np_rng, (self.gen_dim, self.l))
        self.W = theano.shared(value=w, name='W', borrow=True)
        b = init.init(np_rng, (self.l, ))
        self.b = theano.shared(value=b, name='b', borrow=True)
        w_mu = init.init(np_rng, (self.infer_dim, self.z_dim))
        self.W_mu_infer = theano.shared(value=w_mu, name='W_mu_infer', borrow=True)
        w_sigma = init.init(np_rng, (self.infer_dim, self.z_dim))
        self.W_sigma_infer = theano.shared(value=w_sigma, name='W_sigma_infer', borrow=True)
        w_mu = init.init(np_rng, (self.gen_dim, self.z_dim))
        self.W_mu_gen = theano.shared(value=w_mu, name='W_mu_gen', borrow=True)
        w_sigma = init.init(np_rng, (self.gen_dim, self.z_dim))
        self.W_sigma_gen = theano.shared(value=w_sigma, name='W_sigma_gen', borrow=True)

    def batched_dot(self, A, B):
        # GPU path uses the old theano.sandbox.cuda batched GEMM.
        if self.use_gpu:
            return theano.sandbox.cuda.blas.batched_dot(A, B)
        else:
            return T.batched_dot(A, B)

    @property
    def params(self):
        # All trainable parameters of the full model, flattened.
        return flatten_f([self.U, self.v, self.W, self.b, self.W_mu_infer, self.W_sigma_infer, self.W_mu_gen, self.W_sigma_gen] + self.language_model.params + \
            self.gen_in.params + self.gen_lstm.params + self.infer_in.params + self.infer_lstm.params + \
            self.reader.params + self.writer.params)

    def align(self, h_gen, h_lang, mask):
        """Soft attention over caption positions (eq 2 of alignDRAW).

        h_lang: lang_N x batch x m, h_gen: batch x gen_dim, U: m x l.
        mask selects the valid (non-padding) caption positions.
        Returns (s, alpha): s is the batch x m context vector, alpha the
        lang_N x batch x 1 attention weights.
        """
        # Make U broadcastable over the sequence: 1 x m x l repeated lang_N
        # times so batched_dot can contract h_lang with it.
        U = self.U.reshape((1, self.m, self.l)).repeat(self.lang_N, axis=0)
        # align_lang is now N x batch x l
        align_lang = self.batched_dot(h_lang, U)
        # W is gen_dim x l, h_gen is batch x gen_dim -> batch x l
        align_img = T.dot(h_gen, self.W)
        # Broadcast the image term over sequence positions: N x batch x l.
        alpha = T.tanh(align_lang + align_img.dimshuffle('x', 0, 1) +
                       self.b.dimshuffle('x', 'x', 0))
        # v is l; contraction gives N x batch unnormalized weights.
        alpha = T.exp(T.dot(alpha, self.v))
        # Mask padding positions BEFORE normalizing so they cannot affect
        # the softmax denominator.
        mask = mask.transpose([1, 0])  # make mask lang_N x batch_size
        alpha = T.switch(T.neq(mask, 0), alpha,
                         zeros((self.lang_N, self.batch_size)))
        # Normalize over the sequence axis.
        alpha = alpha / T.sum(alpha, axis=0)
        # Weighted sum of h_lang -> batch x m context vector.
        alpha = alpha.reshape((self.lang_N, self.batch_size, 1))
        s = h_lang * alpha
        s = T.sum(s, axis=0)
        return s, alpha

    # Used with functools.partial / theano.scan: scan supplies the
    # sequence (rnd_in), the recurrent outputs, then the non-sequences.
    def step_train(self, rnd_in, kl, h_infer, c_infer, h_gen, c_gen, c,
                   mu_gen, sigma_gen, h_lang, x, mask):
        """One training step: read, infer q(z|x), sample z, align, generate, write.

        Implements eqs 10-13 (inference) and 2-4 (generation); accumulates
        the per-step KL divergence between q(z|x) and p(z) into `kl`.
        """
        # "Error image": what the canvas has not yet explained.
        x_hat = x - T.nnet.sigmoid(c)
        # Attention-read from both the input and the error image.
        r, _ = self.reader.run(x, h_gen)
        r_hat, _ = self.reader.run(x_hat, h_gen)
        r = T.concatenate([r, r_hat], axis=1)
        # Inference LSTM over the reads plus the generator state.
        val = self.infer_in.run(T.concatenate([r, h_gen], axis=1))
        h_infer_t, c_infer_t = self.infer_lstm.run(val, h_infer, c_infer)
        # Posterior parameters; sigma_infer is a log-std (note the 0.5 factor).
        mu_infer = T.dot(h_infer_t, self.W_mu_infer)
        sigma_infer = 0.5 * T.dot(h_infer_t, self.W_sigma_infer)
        # Reparameterized sample from q(z|x).
        z = mu_infer + T.exp(sigma_infer) * rnd_in
        # KL(q || p) accumulated across steps (the constant 0.5 factor is
        # applied once, in train()).
        kl_t = kl + T.sum(-1 + ((mu_infer - mu_gen)**2 + T.exp(2 * sigma_infer)) /
                          (T.exp(2 * sigma_gen)) - 2 * sigma_infer + 2 * sigma_gen)
        # Align (eq 2): collapse lang_N x batch x m into a batch x m context.
        s, _ = self.align(h_gen, h_lang, mask)
        # Generator LSTM (eq 3) on [z, s].
        val = self.gen_in.run(T.concatenate([z, s], axis=1))
        h_gen_t, c_gen_t = self.gen_lstm.run(val, h_gen, c_gen)
        # Prior parameters for the NEXT step's KL term.
        mu_gen = T.tanh(T.dot(h_gen_t, self.W_mu_gen))
        sigma_gen = T.tanh(T.dot(h_gen_t, self.W_sigma_gen))
        # Attention-write onto the canvas (eq 4).
        c_update, _ = self.writer.run(h_gen_t)
        c_t = c + c_update
        return kl_t, h_infer_t, c_infer_t, h_gen_t, c_gen_t, c_t, mu_gen, sigma_gen

    def train(self, x, y, mask):
        """Build the training graph; returns (kl, log_recons, log_likelihood, canvases)."""
        # Encode the caption.
        h_lang = self.language_model.run(y)
        # Initial LSTM states (get_initial_hidden is a property here).
        h_infer, c_infer = self.infer_lstm.get_initial_hidden
        h_gen, c_gen = self.gen_lstm.get_initial_hidden
        # Canvas initialized to the constant cinit, broadcast over the batch.
        c0 = theano.shared(self.cinit * np.ones(
            (1, self.channels * self.image_size)).astype(theano.config.floatX))
        c0 = c0.repeat(self.batch_size, axis=0)
        # Per-step standard-normal noise for the reparameterization trick.
        # NOTE(review): uses module-level `rng`, not self.random — presumably a
        # shared RandomStreams defined elsewhere in the file; verify.
        rnd_in = rng.normal(size=(self.steps, self.batch_size, self.z_dim),
                            avg=0.0, std=1.0, dtype=theano.config.floatX)
        # Recurrent outputs for scan, in step_train's argument order.
        outputs_info = [
            dict(initial=T.zeros(()), taps=[-1]),   # kl accumulator
            dict(initial=h_infer, taps=[-1]),       # h_infer
            dict(initial=c_infer, taps=[-1]),       # c_infer
            dict(initial=h_gen, taps=[-1]),         # h_gen
            dict(initial=c_gen, taps=[-1]),         # c_gen
            dict(initial=c0, taps=[-1]),            # canvas c
            dict(initial=T.zeros((self.batch_size, self.z_dim)), taps=[-1]),  # mu_gen
            dict(initial=T.zeros((self.batch_size, self.z_dim)), taps=[-1])
        ]  # sigma_gen
        [kl, h_infer, c_infer, h_gen, c_gen, c, mu_gen,
         sigma_gen], _ = theano.scan(fn=self.step_train,
                                     sequences=rnd_in,
                                     outputs_info=outputs_info,
                                     non_sequences=[h_lang, x, mask],
                                     n_steps=self.steps)
        # Reconstruction loss from the final canvas (eq 5).
        x_recons = T.nnet.sigmoid(c[-1, :, :])
        log_recons = T.nnet.binary_crossentropy(x_recons, x).sum()
        # Apply the deferred 0.5 factor to the accumulated KL.
        kl = 0.5 * kl[-1]
        log_likelihood = kl + log_recons
        log_likelihood = log_likelihood.mean()
        kl = kl.mean()
        log_recons = log_recons.mean()
        return kl, log_recons, log_likelihood, c

    def generate_image(self, y, mask):
        """Build the sampling graph: generate images conditioned on caption y only."""
        h_lang = self.language_model.run(y)
        h_gen, c_gen = self.gen_lstm.get_initial_hidden
        c0 = theano.shared(self.cinit * np.ones(
            (1, self.channels * self.image_size)).astype(theano.config.floatX))
        c0 = c0.repeat(self.batch_size, axis=0)
        rnd_in = rng.normal(size=(self.steps, self.batch_size, self.z_dim),
                            avg=0.0, std=1.0, dtype=theano.config.floatX)
        # Recurrent outputs for scan, in step_gen's argument order.
        outputs_info = [
            dict(initial=h_gen, taps=[-1]),  # h_gen
            dict(initial=c_gen, taps=[-1]),  # c_gen
            dict(initial=c0, taps=[-1]),     # canvas c
            dict(initial=T.zeros((self.batch_size, self.z_dim)), taps=[-1]),  # mu_gen
            dict(initial=T.zeros((self.batch_size, self.z_dim)), taps=[-1]),  # sigma_gen
            dict(initial=T.zeros((self.lang_N, self.batch_size, 1)), taps=[-1])
        ]  # alpha
        [h_gen, c_gen, c, mu_gen, sigma_gen,
         alpha], _ = theano.scan(fn=self.step_gen,
                                 sequences=rnd_in,
                                 outputs_info=outputs_info,
                                 non_sequences=[h_lang, mask],
                                 n_steps=self.steps)
        c = T.nnet.sigmoid(c)
        return c[-1].reshape(
            (1, self.batch_size, self.channels, self.image_size)), alpha

    def step_gen(self, rnd_in, h_gen, c_gen, c, mu_gen, sigma_gen, alpha,
                 h_lang, mask):
        """One generation step: sample z from the prior, align, generate, write."""
        # Sample from the learned prior p(z) (no inference network here).
        z = mu_gen + T.exp(sigma_gen) * rnd_in
        # Align (eq 2): lang_N x batch x m collapses to batch x m.
        s, alpha = self.align(h_gen, h_lang, mask)
        # Generator LSTM (eq 3).
        val = self.gen_in.run(T.concatenate([z, s], axis=1))
        h_gen_t, c_gen_t = self.gen_lstm.run(val, h_gen, c_gen)
        mu_gen = T.tanh(T.dot(h_gen_t, self.W_mu_gen))
        sigma_gen = T.tanh(T.dot(h_gen_t, self.W_sigma_gen))
        # Attention-write onto the canvas (eq 4).
        c_update, _ = self.writer.run(h_gen_t)
        c_t = c + c_update
        return h_gen_t, c_gen_t, c_t, mu_gen, sigma_gen, alpha

    def build_sample_function(self, y, mask):
        # Compile the sampling graph once; only the canvas is returned
        # (alpha is computed but not exposed by this function).
        c, alpha = self.generate_image(y, mask)
        self.sample_sentences = theano.function([y, mask], [c])

    @property
    def inputs(self):
        # NOTE(review): __init__ never assigns self._inputs — accessing this
        # property on ImageModel would raise AttributeError; presumably set
        # externally or inherited usage. Verify.
        return self._inputs

    @property
    def outputs(self):
        # NOTE(review): self._outputs is likewise never set in this class.
        return self._outputs

    @property
    def updates(self):
        # NOTE(review): self._updates is likewise never set in this class.
        return self._updates
class TemporalAttentionLayer(object):
    def __init__(self, batch_size, N, channels, name='', use_gpu=True,
                 test=False, input_hidden_size=4096, initializers=None):
        """Temporal Read Layer — 1-D analogue of the DRAW read attention.

        Places N Gaussian sampling kernels along the time axis of a
        batch x channels x time feature volume. Kernel center (g),
        stride (d) and variance (sigma^2) are predicted from a hidden
        state h (or taken directly from h when `test` is True).

        initializers: optional 3-element list of initializers for
        [g, d, sigma]; defaults to IsotropicGaussian(0.01) for each.
        Built lazily here to avoid the shared-mutable-default pitfall
        (the original default list was shared across all instances).
        """
        if initializers is None:
            initializers = [
                i.IsotropicGaussian(0.01),  # g
                i.IsotropicGaussian(0.01),  # d
                i.IsotropicGaussian(0.01)   # sigma
            ]
        self.batch_size = batch_size
        self.N = N
        self.channels = channels
        self.name = name
        self.output_shape = [batch_size, channels, N]
        self.use_gpu = use_gpu
        self.initializers = initializers
        self.test = test
        self.input_hidden_size = input_hidden_size
        # Predicts the 3 attention parameters (g, sigma^2, d) from h.
        self.hidden_layer = HiddenLayer(input_size=self.input_hidden_size,
                                        hidden_size=3,
                                        batch_size=self.batch_size,
                                        activation=act.Identity,
                                        name='Attention-Transform.' + self.name)

    def load_pretrained(self, v, i):
        # Pass-through: pretrained loading is not supported for this layer.
        # (Parameter `i` here is the index, not the initializers module.)
        return i

    def batched_dot(self, A, B):
        # GPU path uses the old theano.sandbox.cuda batched GEMM.
        if self.use_gpu:
            return theano.sandbox.cuda.blas.batched_dot(A, B)
        else:
            return T.batched_dot(A, B)

    def get_params(self, time, h):
        """Predict (g, sigma^2, d) from hidden state h; `time` is the clip length."""
        hidden = self.hidden_layer.run(h)
        # Center g in [0, time] via a scaled tanh.
        g = time * ((T.tanh(hidden[:, 0]) + 1) * 0.5)
        g = g.astype(theano.config.floatX)
        # Kernel variance (always positive).
        s2 = T.exp(hidden[:, 1] / 2.0)
        s2 = s2.astype(theano.config.floatX)
        # Stride scaled so the N kernels roughly span the clip.
        d = time / (max(self.N - 1.0, 1.0)) * T.exp(hidden[:, 2])
        d = d.astype(theano.config.floatX)
        return g, s2, d

    def get_params_test(self, time, h):
        # In test mode h already carries the raw (g, sigma^2, d) columns.
        return h[:, 0], h[:, 1], h[:, 2]

    def run(self, features, h, time_mask):
        """Attention-read over time.

        features: batch x channels x time; time_mask: batch x time binary
        matrix (1 = valid frame) so variable-length clips share one
        minibatch shape. Returns (batch x channels x N reads, (g, s2, d)).

        NOTE(review): some call sites in this file invoke run(features, mask)
        with only two arguments — confirm which signature is intended.
        """
        channels = self.channels
        # Per-clip valid length from the mask.
        time = T.sum(time_mask, axis=1)
        if not self.test:
            g, s2, d = self.get_params(time, h)
        else:
            g, s2, d = self.get_params_test(time, h)
        g = g.astype(theano.config.floatX)
        s2 = s2.astype(theano.config.floatX)
        d = d.astype(theano.config.floatX)
        I = features.reshape(
            (features.shape[0] * self.channels, features.shape[2], 1))
        # Kernel centers: g plus stride-spaced offsets centered on g.
        mu = g.dimshuffle([0, 'x']) + d.dimshuffle([0, 'x']) * (
            T.arange(self.N).astype(theano.config.floatX) - self.N / 2 - 0.5)
        a = T.arange(features.shape[2]).astype(theano.config.floatX)
        # F is batch x N x time: one Gaussian row per kernel.
        F = T.exp(-(a - mu.dimshuffle([0, 1, 'x']))**2 / 2. /
                  s2.dimshuffle([0, 'x', 'x'])**2)
        # Zero out padded frames, then normalize each kernel row
        # (epsilon guards against all-masked rows).
        F = F * time_mask.dimshuffle([0, 'x', 1])
        F = F / (F.sum(axis=-1).dimshuffle([0, 1, 'x']) + 1e-4)
        F = T.repeat(F, channels, axis=0)
        res = self.batched_dot(F, I).reshape(
            (features.shape[0], self.channels, self.N))
        return res, (g, s2, d)

    @property
    def params(self):
        return self.hidden_layer.params

    @params.setter
    def params(self, params):
        # Parenthesized print: valid Python 2 and 3 (was a Py2-only statement).
        print('Temporal Set Params not implemented')

    def print_layer(self):
        v = '--------------------\n'
        v += 'Temporal Attention Layer ' + self.name + '\n'
        # Fixed: previously formatted (self.width, self.height), attributes
        # this layer never defines -> AttributeError on every call.
        v += 'Channels: ' + str(self.channels) + '\n'
        return v + 'Output Shape: ' + str((self.channels, self.N)) + '\n'
class TemporalModel(Model):
    """One-vs-rest binary classifier using learned temporal attention filters.

    Builds `levels` TemporalAttentionLayer filters whose reads are mean-pooled
    over N and concatenated, then classified by hidden + sigmoid layers.
    The positive class is `class_num`; all other labels are negatives.
    """

    def __init__(self, inputs, bs, max_time, class_num, feature_dim,
                 hidden_size, levels, N=1, pool=None, seed=12345):
        self._inputs = inputs
        self.N = N
        self.batch_size = bs
        self.name = 'learned_' + str(class_num)
        # Stored as a theano shared so it can live inside the graph.
        self.class_num = theano.shared(class_num)
        self.max_time = max_time
        self.filters = levels   # `levels` is used as a flat filter count here
        self.feature_dim = feature_dim
        self.pool = pool
        self.dropout = True
        # One learned attention filter per "level" (flat list, not a pyramid).
        self.temporal_pyramid = []
        for f in range(self.filters):
            tf = TemporalAttentionLayer(batch_size=bs,
                                        N=N,
                                        channels=feature_dim,
                                        name='af-' + str(f))
            self.temporal_pyramid.append(tf)
        # Each filter contributes a mean-pooled feature_dim vector (hence no *N).
        input_size = feature_dim * len(self.temporal_pyramid)  # *N
        self.hidden = HiddenLayer(input_size=input_size,
                                  hidden_size=hidden_size,
                                  activation=act.LeakyRelu(),
                                  batch_size=bs,
                                  name='hidden',
                                  dropout=0.5)
        self.classify = HiddenLayer(input_size=hidden_size,
                                    hidden_size=1,
                                    batch_size=bs,
                                    name='classify',
                                    dropout=0.0,
                                    activation=act.sigmoid)

    @property
    def params(self):
        # Unlike the frozen-pyramid variant, the attention filters ARE trained.
        return self.classify.params + self.hidden.params + [
            p for f in self.temporal_pyramid for p in f.params
        ]

    @property
    def inputs(self):
        return self._inputs

    @property
    def outputs(self):
        # NOTE(review): self._outputs is never assigned in this class.
        return self._outputs

    @property
    def updates(self):
        # NOTE(review): self._updates is never assigned in this class.
        return self._updates

    def run(self, x, mask, y):
        """Apply the temporal filters, classify, and return
        (prob, loss, (tp, tn, fp, fn))."""
        results = []
        # make x to be batch x features x time
        x = x.transpose([0, 2, 1])
        for tf in self.temporal_pyramid:
            # res is batch x features x N.
            # NOTE(review): TemporalAttentionLayer.run is declared as
            # (features, h, time_mask) but is called here with only
            # (x, mask) — confirm which signature is current.
            res, (g, s2, d) = tf.run(x, mask)
            res = res.reshape((x.shape[0], self.feature_dim, self.N))
            # Mean along N: one feature_dim vector per sub-event.
            results.append(T.mean(res, axis=2))
            # if self.pool == None:
            #     results.append(res.reshape((x.shape[0], self.feature_dim*self.N)))
            # else:
            #     results.append(res.reshape((x.shape[0], 1, self.feature_dim*self.N)))
        # Concatenate filter outputs: batch x features*filters.
        x = T.concatenate(results, axis=1)
        # Optional extra pooling across the concatenated axis.
        if self.pool == 'max':
            x = T.max(x, axis=1)
        elif self.pool == 'sum':
            x = T.sum(x, axis=1)
        elif self.pool == 'mean':
            x = T.mean(x, axis=1)
        x = self.hidden.run(x, self.dropout)
        prob = self.classify.run(x, False)
        # Binarize the labels: 1 where y equals this model's class.
        y = T.switch(
            T.eq(
                self.class_num.repeat(y.shape[0]).reshape((y.shape[0], 1)),
                y), 1, 0)
        preds = T.switch(T.gt(prob, 0.5), 1, 0)
        # Confusion-matrix counts for evaluation.
        true_pos = (T.eq(y, 1) * T.eq(preds, 1)).sum()
        true_neg = (T.neq(y, 1) * T.neq(preds, 1)).sum()
        false_pos = (T.neq(y, 1) * T.eq(preds, 1)).sum()
        false_neg = (T.eq(y, 1) * T.neq(preds, 1)).sum()
        # Cross-entropy with negatives down-weighted 50:1 (class imbalance).
        loss = T.nnet.binary_crossentropy(prob, y)
        loss = T.switch(T.eq(y, 1), loss, 0.02 * loss)
        loss = loss.mean()
        return prob, loss, (true_pos, true_neg, false_pos, false_neg)
class ReadLayer(object):
    def __init__(self, batch_size, N, channels, image_width, image_height,
                 input_hidden_size, use_dx_dy=False, name='', test=False,
                 use_gpu=True, device='gpu', use_gamma=True):
        """Read Layer from the DRAW paper.

        Extracts an N x N attention patch from a (batch, channels, H, W)
        image using a grid of 2-D Gaussian kernels whose parameters
        (gx, gy, dx, dy, sigma^2, gamma) are predicted from a hidden state.
        use_dx_dy: predict separate x/y strides (needs 6 outputs, not 5).
        """
        self.batch_size = batch_size
        self.use_dx_dy = use_dx_dy
        self.N = N
        self.channels = channels
        self.width = image_width
        self.height = image_height
        self.name = name
        self.input_hidden_size = input_hidden_size
        self.test = test
        self.output_shape = [batch_size, channels, N, N]
        self.use_gpu = use_gpu
        self.use_gamma = use_gamma
        self.device = device
        self.init_params()

    def load_pretrained(self, v, i):
        # Pass-through: pretrained loading is not supported for this layer.
        return i

    def init_params(self):
        # Linear layer predicting the attention parameters
        # (5 outputs, or 6 when use_dx_dy adds a separate dy column).
        self.transform_hidden = HiddenLayer(input_size=self.input_hidden_size,
                                            hidden_size=5 + self.use_dx_dy,
                                            batch_size=self.batch_size,
                                            activation=act.Identity,
                                            device=self.device,
                                            name='Read.Transform.' + self.name)

    def batched_dot(self, A, B):
        # GPU path uses the old theano.sandbox.cuda batched GEMM.
        if self.use_gpu:
            return theano.sandbox.cuda.blas.batched_dot(A, B)
        else:
            return T.batched_dot(A, B)

    def get_params(self, h):
        """Predict (gx, gy, dx, dy, sigma^2, gamma) from hidden state h (DRAW eqs)."""
        hidden = self.transform_hidden.run(h)
        # Grid center mapped into image coordinates.
        gx = (hidden[:, 0] + 1) * 0.5 * self.width
        gy = (hidden[:, 1] + 1) * 0.5 * self.height
        # Kernel variance (always positive).
        s2 = T.exp(hidden[:, 3] / 2.0)
        if self.use_gamma:
            g = T.exp(hidden[:, 4]).dimshuffle(0, 'x')
        else:
            # g / g forces gamma to exactly 1 while keeping the same
            # graph shape as the use_gamma branch.
            g = T.exp(hidden[:, 4]).dimshuffle(0, 'x')
            g = g / g
        if self.use_dx_dy:
            # Independent strides for x and y (6th column is dy).
            dx = (self.width - 1.0) / (self.N - 1.0) * T.exp(hidden[:, 2])
            dy = (self.height - 1.0) / (self.N - 1.0) * T.exp(hidden[:, 5])
        else:
            dx = dy = ((max(self.width, self.height) - 1.0) /
                       (self.N - 1.0) * T.exp(hidden[:, 2]))
        return gx, gy, dx, dy, s2, g

    def get_params_test(self, h):
        # Test mode: h directly carries the raw parameter columns in the
        # order (gx, gy, dx, dy, s2, g) = columns (0, 1, 2, 5, 3, 4).
        return h[:, 0], h[:, 1], h[:, 2], h[:, 5], h[:, 3], h[:, 4].dimshuffle(0, 'x')

    def run(self, images, h):  # , error_images, h):
        """Attention-read an N x N patch; returns (gamma-scaled flat patch,
        (gx, gy, dx, dy, intermediate)).

        NOTE(review): variable-sized input images are not handled (no mask).
        """
        channels = self.channels  # images.shape[1]
        if not self.test:
            gx, gy, dx, dy, s2, g = self.get_params(h)
        else:
            gx, gy, dx, dy, s2, g = self.get_params_test(h)
        I = images.reshape(
            (self.batch_size * self.channels, self.height, self.width))
        # Kernel-center grids along x and y (DRAW eqs 19-20).
        muX = gx.dimshuffle([0, 'x']) + dx.dimshuffle([0, 'x']) * (
            T.arange(self.N).astype(theano.config.floatX) - self.N / 2 - 0.5)
        muY = gy.dimshuffle([0, 'x']) + dy.dimshuffle([0, 'x']) * (
            T.arange(self.N).astype(theano.config.floatX) - self.N / 2 - 0.5)
        a = T.arange(self.width).astype(theano.config.floatX)
        b = T.arange(self.height).astype(theano.config.floatX)
        # Filterbank matrices (batch x N x width/height), then row-normalized
        # with an epsilon guard.
        Fx = T.exp(-(a - muX.dimshuffle([0, 1, 'x']))**2 / 2. /
                   s2.dimshuffle([0, 'x', 'x'])**2)
        Fy = T.exp(-(b - muY.dimshuffle([0, 1, 'x']))**2 / 2. /
                   s2.dimshuffle([0, 'x', 'x'])**2)
        Fx = Fx / (Fx.sum(axis=-1).dimshuffle([0, 1, 'x']) + 1e-4)
        Fy = Fy / (Fy.sum(axis=-1).dimshuffle([0, 1, 'x']) + 1e-4)
        # Repeat per channel so batched_dot applies each filter to every channel.
        self.Fx = T.repeat(Fx, channels, axis=0)
        self.Fy = T.repeat(Fy, channels, axis=0)
        # Patch extraction: Fy . I . Fx^T, flattened to batch x channels*N*N.
        self.fint = self.batched_dot(self.Fy, I)
        # self.efint = T.dot(self.Fx, error_images)
        self.fim = self.batched_dot(self.fint,
                                    self.Fx.transpose([0, 2, 1])).reshape(
                                        (self.batch_size,
                                         self.channels * self.N * self.N))
        # self.feim = T.dot(self.efint, self.Fy.transpose([0,2,1])).reshape(
        #     (self.batch_size, channels,self.N,self.N))
        return g * self.fim, (gx, gy, dx, dy, self.fint
                              )  # T.concatenate([self.fim, self.feim], axis=1)

    @property
    def params(self):
        return [param for param in self.transform_hidden.params]

    @params.setter
    def params(self, params):
        self.transform_hidden.params = params

    def print_layer(self):
        """Human-readable summary of this layer's shapes."""
        v = '--------------------\n'
        v += 'Read Layer ' + self.name + '\n'
        v += 'Input Shape: ' + str((self.width, self.height)) + '\n'
        return v + 'Output Shape: ' + str((self.N, self.N)) + '\n'
class TemporalModel(Model): def __init__(self, inputs, bs, max_time, class_num, feature_dim, hidden_size, method='max', seed=12345): self._inputs = inputs self.method = method self.name = 'baseline_' + str(class_num) self.batch_size = bs self.class_num = theano.shared(class_num) self.max_time = max_time self.feature_dim = feature_dim self.dropout = True self.hidden = HiddenLayer(input_size=feature_dim, hidden_size=hidden_size, batch_size=bs, name='hidden', dropout=0.5, activation=act.LeakyRelu()) self.classify = HiddenLayer(input_size=hidden_size, hidden_size=1, batch_size=bs, name='classify', dropout=0.0, activation=act.sigmoid) @property def params(self): return self.classify.params + self.hidden.params @property def inputs(self): return self._inputs @property def outputs(self): return self._outputs @property def updates(self): return self._updates def run(self, x, mask, y): # get the max/mean/sum of x for each feature # from all frame if self.method == 'max': # apply the mask so that if the feature is all negative # the 0s don't affect it m = (-100 * (1 - mask)).dimshuffle([0, 1, 'x']) x = T.max(x + m, axis=1) elif self.method == 'sum' or self.method == 'mean': x = T.sum(x, axis=1) if self.method == 'mean': # divide by the number of valid frames x = x / T.sum(mask, axis=1).dimshuffle([0, 'x']) x = x.astype(theano.config.floatX) x = self.hidden.run(x, self.dropout) prob = self.classify.run(x, False) y = T.switch( T.eq( self.class_num.repeat(y.shape[0]).reshape((y.shape[0], 1)), y), 1, 0) preds = T.switch(T.gt(prob, 0.5), 1, 0) true_pos = (T.eq(y, 1) * T.eq(preds, 1)).sum() true_neg = (T.neq(y, 1) * T.neq(preds, 1)).sum() false_pos = (T.neq(y, 1) * T.eq(preds, 1)).sum() false_neg = (T.eq(y, 1) * T.neq(preds, 1)).sum() loss = T.nnet.binary_crossentropy(prob, y) loss = T.switch(T.eq(y, 1), loss, 0.02 * loss) loss = loss.mean() return prob, loss, (true_pos, true_neg, false_pos, false_neg)
class TemporalModel(Model):
    """Multi-class classifier over a FIXED pyramid of temporal attention filters.

    Level l of the pyramid holds 2**l filters with hand-set, non-learned
    parameters (test mode): stride d = 1/2**(l+1), centers g evenly spaced,
    sigma = 5.0 — a temporal-pyramid pooling scheme. Filter reads are
    concatenated (or pooled) and classified with hidden + softmax layers.
    """

    def __init__(self, inputs, bs, max_time, classes, feature_dim,
                 hidden_size, levels, N=1, pool=None, seed=12345):
        self._inputs = inputs
        self.N = N
        self.batch_size = bs
        self.classes = classes
        self.max_time = max_time
        self.levels = levels
        self.feature_dim = feature_dim
        self.pool = pool
        self.dropout = True
        # create a pyramid of filters: 2**l filters at level l
        self.temporal_pyramid = []
        for l in range(self.levels):
            for f in range(2**l):
                tf = TemporalAttentionLayer(
                    batch_size=bs, N=N, channels=feature_dim,
                    name='temporal-attention-layer-' + str(l) + '-filter-' + str(f))
                # Test mode: the layer uses fixed parameters instead of
                # predicting them from a hidden state.
                tf.test = True
                # Stride covers 1/2**(l+1) of the (normalized) clip.
                tf.d = theano.shared(value=np.asarray([1. / 2**(l + 1)]).astype('float32'),
                                     name='d', borrow=True, broadcastable=[True])
                # Centers evenly tile the clip at this level.
                tf.g = theano.shared(value=np.asarray([((1. / 2**l) + (2 * f / 2.**l))]).astype('float32'),
                                     name='g', borrow=True, broadcastable=[True])
                tf.sigma = theano.shared(value=np.asarray([5.0]).astype('float32'),
                                         name='sigma', borrow=True, broadcastable=[True])
                self.temporal_pyramid.append(tf)
        # Full concatenation when not pooling; a single filter-width slice otherwise.
        input_size = feature_dim * N * (len(self.temporal_pyramid) if pool == None else 1)
        self.hidden = HiddenLayer(input_size=input_size,
                                  hidden_size=hidden_size,
                                  activation=act.LeakyRelu(),
                                  batch_size=bs,
                                  name='hidden',
                                  dropout=0.5)
        self.softmax = SoftmaxLayer(input_size=hidden_size,
                                    classes=self.classes,
                                    batch_size=bs,
                                    name='softmax',
                                    dropout=0.5)

    @property
    def params(self):
        # Pyramid filter params are intentionally excluded (filters are
        # frozen, hand-set shareds); only hidden + softmax are trained.
        return self.softmax.params + self.hidden.params  # +[p for f in self.temporal_filters for p in f.params]

    @property
    def inputs(self):
        return self._inputs

    @property
    def outputs(self):
        # NOTE(review): self._outputs is never assigned in this class.
        return self._outputs

    @property
    def updates(self):
        # NOTE(review): self._updates is never assigned in this class.
        return self._updates

    @property
    def test_algorithm(self):
        """Lazily-compiled theano function for evaluation (dropout disabled)."""
        if not hasattr(self, '_talgorithm'):
            # Temporarily switch dropout off while building the test graph.
            d = self.dropout
            self.dropout = False
            o = self.run(*self.inputs)
            for i, ot in enumerate(self.outputs):
                o[i].name = ot.name
            self._talgorithm = theano.function(inputs=self.inputs,
                                               outputs=o,
                                               on_unused_input='warn')
            self.dropout = d
        return self._talgorithm

    def run(self, x, mask, y):
        """Apply the pyramid, classify; return (prob, pred, loss, error, acc)."""
        results = []
        # make x be batch x features x time
        x = x.transpose([0, 2, 1])
        for tf in self.temporal_pyramid:
            # res is batch x features x N, flattened to batch x features*N.
            # NOTE(review): TemporalAttentionLayer.run is declared as
            # (features, h, time_mask) but called here with (x, mask) only
            # — confirm which signature is current.
            res, (g, s2, d) = tf.run(x, mask)
            if self.pool == None:
                results.append(res.reshape((x.shape[0], self.feature_dim * self.N)))
            else:
                # Keep a filter axis so pooling below collapses across filters.
                results.append(res.reshape((x.shape[0], 1, self.feature_dim * self.N)))
        # batch x features*N*filters (or batch x filters x features*N).
        x = T.concatenate(results, axis=1)
        # Optional pooling across the filter axis.
        if self.pool == 'max':
            x = T.max(x, axis=1)
        elif self.pool == 'sum':
            x = T.sum(x, axis=1)
        elif self.pool == 'mean':
            x = T.mean(x, axis=1)
        x = self.hidden.run(x, self.dropout)
        prob, pred = self.softmax.run(x, self.dropout)
        loss = self.softmax.loss(prob, y)
        error = self.softmax.error(pred, y)
        acc = 1 - error
        return prob, pred, loss, error, acc
class TemporalModel(Model):
    """Multi-class classifier with LSTM-driven temporal attention filters.

    Runs a fixed number of `steps`: at each step the LSTM hidden state
    drives every attention filter (so the model re-attends based on what
    it has seen), the filter reads are pooled/concatenated and fed back
    into the LSTM. The final hidden state is classified by hidden + softmax.
    """

    def __init__(self, inputs, bs, max_time, classes, feature_dim,
                 hidden_size, filters, N=1, pool=None, lstm_dim=4096,
                 steps=8, seed=12345):
        self._inputs = inputs
        self.N = N
        self.batch_size = bs
        self.classes = classes
        self.max_time = max_time
        self.filters = filters
        self.feature_dim = feature_dim
        self.pool = pool
        self.dropout = True
        self.steps = steps
        # Attention filters conditioned on the LSTM hidden state (lstm_dim).
        self.temporal_filters = []
        for f in range(filters):
            tf = TemporalAttentionLayer(
                batch_size=bs, N=N, channels=feature_dim,
                input_hidden_size=lstm_dim,
                name='temporal-attention-layer-filter-' + str(f))
            self.temporal_filters.append(tf)
        # Per-step input width: all filters concatenated, N columns each
        # unless pooling collapses N to 1.
        input_size = feature_dim * len(
            self.temporal_filters) * (N if pool == None else 1)
        # NOTE(review): lstm_in gets no `name` (unlike every other layer
        # here) and LSTMLayer receives an `input_size` kwarg other call
        # sites omit — verify both against the layer implementations.
        self.lstm_in = HiddenLayer(input_size=input_size,
                                   hidden_size=lstm_dim * 4,
                                   batch_size=bs)
        self.lstm = LSTMLayer(input_size=lstm_dim, hidden_size=lstm_dim)
        self.hidden = HiddenLayer(input_size=lstm_dim,
                                  hidden_size=hidden_size,
                                  activation=act.relu,
                                  batch_size=bs,
                                  name='hidden',
                                  dropout=0.5)
        self.softmax = SoftmaxLayer(input_size=hidden_size,
                                    classes=self.classes,
                                    batch_size=bs,
                                    name='softmax',
                                    dropout=0.5)

    @property
    def params(self):
        # Everything is trained here, including the attention filters.
        return self.softmax.params + self.hidden.params + self.lstm_in.params + self.lstm.params + [
            p for f in self.temporal_filters for p in f.params
        ]

    @property
    def inputs(self):
        return self._inputs

    @property
    def outputs(self):
        # NOTE(review): self._outputs is never assigned in this class.
        return self._outputs

    @property
    def updates(self):
        # NOTE(review): self._updates is never assigned in this class.
        return self._updates

    @property
    def test_algorithm(self):
        """Lazily-compiled theano function for evaluation (dropout disabled)."""
        if not hasattr(self, '_talgorithm'):
            d = self.dropout
            self.dropout = False
            o = self.run(*self.inputs)
            for i, ot in enumerate(self.outputs):
                o[i].name = ot.name
            self._talgorithm = theano.function(inputs=self.inputs,
                                               outputs=o,
                                               on_unused_input='warn')
            self.dropout = d
        return self._talgorithm

    def run(self, x, mask, y):
        """Iterate the attend-and-read LSTM; return (prob, pred, loss, error, acc)."""
        # make x be batch x features x time
        x = x.transpose([0, 2, 1])
        # NOTE(review): called as a method with x here, but used as a
        # property (no call) elsewhere in this file — verify LSTMLayer.
        h, c = self.lstm.get_initial_hidden(x)
        outputs_info = [
            dict(initial=h, taps=[-1]),  # h
            dict(initial=c, taps=[-1])
        ]  # c
        [h, c], _ = theano.scan(fn=self.step,
                                non_sequences=[x, mask],
                                outputs_info=outputs_info,
                                n_steps=self.steps)
        # Classify from the final hidden state.
        x = self.hidden.run(h[-1], self.dropout)
        prob, pred = self.softmax.run(x, self.dropout)
        loss = self.softmax.loss(prob, y)
        error = self.softmax.error(pred, y)
        acc = 1 - error
        return prob, pred, loss, error, acc

    def step(self, h, c, x, mask):
        """One scan step: attend with every filter (driven by h), pool, update LSTM."""
        results = []
        for tf in self.temporal_filters:
            # res is batch x features x N (3-arg run: features, hidden, mask).
            res, (g, s2, d) = tf.run(x, h, mask)
            if self.pool == None:
                results.append(
                    res.reshape((x.shape[0], self.feature_dim * self.N)))
            elif self.pool == 'max':
                results.append(
                    T.max(res, axis=2).reshape((x.shape[0], self.feature_dim)))
            elif self.pool == 'sum':
                results.append(
                    T.sum(res, axis=2).reshape((x.shape[0], self.feature_dim)))
            elif self.pool == 'mean':
                results.append(
                    T.mean(res, axis=2).reshape(
                        (x.shape[0], self.feature_dim)))
        # batch x features*N*filters (or batch x features*filters when pooled).
        x = T.concatenate(results, axis=1)
        x = self.lstm_in.run(x)
        h, c = self.lstm.run(x, h, c)
        return h, c
# NOTE(review): this top-level `def __init__` is a byte-for-byte duplicate of
# ImageModel.__init__ sitting outside any class — it appears to be a stray
# copy/paste leftover and is never callable as a constructor here. Kept
# as-is; confirm it can be deleted.
def __init__(self, bs, K, lang_N, steps, read_size, write_size, m, gen_dim,
             infer_dim, z_dim, l, seed=12345, channels=1, image_size=60 * 60,
             cinit=0):
    # K is the vocab size
    # lang_N is the (max) length of the sentence encoding
    # steps is the number of times to run the model
    # m is the size of the language representation
    # l is the dimensionality used inside the align function
    # image_size is the w*h of the image (assumed square)
    self.use_gpu = True
    self.cinit = cinit            # initial value of the canvas c0
    self.batch_size = bs
    self.channels = channels
    self.gen_dim = gen_dim
    self.z_dim = z_dim
    self.m = m
    self.lang_N = lang_N
    self.steps = steps
    self.l = l
    self.read_size = read_size
    self.write_size = write_size
    self.infer_dim = infer_dim
    self.image_size = image_size
    # Bidirectional LSTM encoder over the caption.
    self.language_model = LanguageModel(bs, K, lang_N, m)
    # Input projection + LSTM for the generator (input is [z, s]).
    self.gen_in = HiddenLayer(input_size=m + z_dim,
                              hidden_size=gen_dim * 4,
                              batch_size=bs,
                              name='gen-lstm-in')
    self.gen_lstm = LSTMLayer(hidden_size=gen_dim,
                              activation=T.tanh,
                              batch_size=bs,
                              dropout=0.0,
                              name='gen-lstm')
    # Inference LSTM consumes both read patches plus the generator state.
    self.infer_in = HiddenLayer(
        input_size=2 * channels * self.read_size**2 + self.gen_dim,
        hidden_size=infer_dim * 4,
        batch_size=bs,
        name='infer-lstm-in')
    self.infer_lstm = LSTMLayer(hidden_size=infer_dim,
                                activation=T.tanh,
                                batch_size=bs,
                                dropout=0.0,
                                name='infer-lstm')
    # DRAW-style attention read/write over a square image.
    self.reader = ReadLayer(batch_size=self.batch_size,
                            N=self.read_size,
                            channels=channels,
                            image_width=int(np.sqrt(self.image_size)),
                            image_height=int(np.sqrt(self.image_size)),
                            input_hidden_size=gen_dim,
                            name='Read')
    self.writer = WriteLayer(batch_size=self.batch_size,
                             N=self.write_size,
                             channels=channels,
                             image_width=int(np.sqrt(self.image_size)),
                             image_height=int(np.sqrt(self.image_size)),
                             input_hidden_size=gen_dim,
                             name='Write')
    self.random = RandomStreams(seed)
    # Create W_mu, W_sigma, v, U, W, b for the align function and the
    # latent-distribution projections.
    init = IsotropicGaussian(0.01)
    u = init.init(np_rng, (self.m, self.l))
    self.U = theano.shared(value=u, name='U', borrow=True)
    v = init.init(np_rng, (self.l, ))
    self.v = theano.shared(value=v, name='v', borrow=True)
    w = init.init(np_rng, (self.gen_dim, self.l))
    self.W = theano.shared(value=w, name='W', borrow=True)
    b = init.init(np_rng, (self.l, ))
    self.b = theano.shared(value=b, name='b', borrow=True)
    w_mu = init.init(np_rng, (self.infer_dim, self.z_dim))
    self.W_mu_infer = theano.shared(value=w_mu, name='W_mu_infer', borrow=True)
    w_sigma = init.init(np_rng, (self.infer_dim, self.z_dim))
    self.W_sigma_infer = theano.shared(value=w_sigma,
                                       name='W_sigma_infer',
                                       borrow=True)
    w_mu = init.init(np_rng, (self.gen_dim, self.z_dim))
    self.W_mu_gen = theano.shared(value=w_mu, name='W_mu_gen', borrow=True)
    w_sigma = init.init(np_rng, (self.gen_dim, self.z_dim))
    self.W_sigma_gen = theano.shared(value=w_sigma,
                                     name='W_sigma_gen',
                                     borrow=True)
class LanguageModel(Model):
    """Bidirectional LSTM sentence encoder.

    Encodes an integer-coded token sequence (vocab size K, token id 0
    treated as padding) into an m-dimensional state per position:
    m//2 dims from a forward LSTM concatenated with m//2 dims from a
    backward LSTM, after re-aligning the backward states to the
    forward time axis.

    Fix vs. original: removed a leftover Python-2 debug statement
    (`print y_m.ndim`) from step(), which was a syntax error under
    Python 3 and printed during graph construction.
    """

    def __init__(self, bs, K, N, m):
        # builds a bidirectional LSTM to create an m-dimensional hidden
        # state for the given sequence of length N with vocab size K
        self.K = K
        self.N = N
        self.m = m
        self.bs = bs
        # each direction gets m//2 hidden units; the input projections
        # emit all four LSTM gate pre-activations at once (hence *4)
        self.forward_in = HiddenLayer(input_size=K, hidden_size=m*4//2,
                                      batch_size=bs, name='forward-lstm-in')
        self.forward_lstm = LSTMLayer(hidden_size=m//2, activation=T.tanh,
                                      batch_size=bs, dropout=0.0,
                                      name='forward-lstm')
        self.backward_in = HiddenLayer(input_size=K, hidden_size=m*4//2,
                                       batch_size=bs, name='backward-lstm-in')
        self.backward_lstm = LSTMLayer(hidden_size=m//2, activation=T.tanh,
                                       batch_size=bs, dropout=0.0,
                                       name='backward-lstm')

    def run(self, y):
        """Return the aligned bidirectional encoding of integer batch y.

        y arrives as shape (batch, total_seq) of integer token ids.
        Returns h_lang of shape (N, batch, m).
        """
        # y becomes shape (seq, batch); it is 1-hot encoded later, in
        # the step function, where it is easier to do
        y = y.transpose([1, 0])
        # reverse each example along time (not across the batch) for
        # the backward pass
        y_rev = y[::-1, :]
        # initial hidden/cell states for both directions
        hf, cf = self.forward_lstm.get_initial_hidden
        hb, cb = self.backward_lstm.get_initial_hidden
        outputs_info = [dict(initial=hf, taps=[-1]),  # hf
                        dict(initial=cf, taps=[-1]),  # cf
                        dict(initial=hb, taps=[-1]),  # hb
                        dict(initial=cb, taps=[-1])]  # cb
        # run both LSTMs over the sequence in one scan
        [hf, cf, hb, cb], _ = theano.scan(fn=self.step,
                                          sequences=[y, y_rev],
                                          outputs_info=outputs_info,
                                          n_steps=self.N)
        # The forward and backward states must be aligned before they
        # can be concatenated: e.g. [4,13,45,3,X,X,X] (forward) and
        # [0,0,0,3,45,13,4] (backward) should pair up as
        # [4/3, 13/45?, ...] position-by-position, padding excluded.
        # b_indx stores, per batch element, the source index of each
        # backward state; `zeros` is presumably numpy's — verify import.
        b_indx = zeros((self.N, self.bs), int)
        # c stores the next write position per batch element
        c = zeros((self.bs,), int)
        # build an index array that maps hb onto hf with proper alignment
        for i in range(self.N):
            # if this position of y_rev is padding (0), ignore it;
            # otherwise record the current index
            indx = T.switch(T.neq(y_rev[i, :], 0), i, 0)
            b_indx = T.set_subtensor(b_indx[c, T.arange(self.bs)], indx)
            # advance the write position only where a token was consumed
            inc = T.switch(T.neq(y_rev[i, :], 0), 1, 0)
            c = c + inc
        # the magic that aligns hb with hf: fancy-index hb with the
        # alignment indices and keep the diagonal over the batch axis.
        # This effectively shifts each sample's first non-padding
        # backward state to the front of its list.
        h_b_aligned = hb[b_indx][:, T.arange(self.bs), T.arange(self.bs)]
        # concatenate: axis 0 -> N, axis 1 -> batch, axis 2 -> m
        h_lang = T.concatenate([hf, h_b_aligned], axis=2)
        return h_lang

    def step(self, y_m, yb_m, hf, cf, hb, cb):
        """One scan step: advance both LSTMs, masking padding tokens.

        y_m / yb_m are the current forward/backward token ids for the
        whole batch (shape (batch,) — confirm against scan sequences).
        """
        # one-hot encode the integer ids (previous hidden values are
        # retained below for masking)
        y = to_one_hot(y_m, self.bs, self.K)
        yb = to_one_hot(yb_m, self.bs, self.K)
        # project inputs into gate pre-activations for each direction
        y_f_in = self.forward_in.run(y)
        y_b_in = self.backward_in.run(yb)
        # candidate next states
        hf_t, cf_t = self.forward_lstm.run(y_f_in, hf, cf)
        hb_t, cb_t = self.backward_lstm.run(y_b_in, hb, cb)
        # only commit the update where the token is not padding (id 0);
        # otherwise carry the previous state forward unchanged
        mask_y = y_m.reshape((self.bs, 1))  # .repeat(self.m//2, axis=1) shouldn't be needed
        mask_yb = yb_m.reshape((self.bs, 1))
        hf = T.switch(T.neq(mask_y, 0), hf_t, hf)
        cf = T.switch(T.neq(mask_y, 0), cf_t, cf)
        hb = T.switch(T.neq(mask_yb, 0), hb_t, hb)
        cb = T.switch(T.neq(mask_yb, 0), cb_t, cb)
        return hf, cf, hb, cb

    @property
    def params(self):
        # all trainable parameters of both directions
        return self.forward_in.params + self.forward_lstm.params + \
            self.backward_in.params + self.backward_lstm.params