def __init__(self, inputs, bs, max_time, class_num, feature_dim,
             hidden_size, method='max', seed=12345):
        """Store the configuration and build the two layers of the baseline model.

        The model consists of a LeakyReLU hidden layer (built with
        ``dropout=0.5``) followed by a single-unit sigmoid layer (built with
        ``dropout=0.0``).

        Args:
            inputs: kept as ``self._inputs``; not otherwise used here.
            bs: batch size, stored and forwarded to both layers.
            max_time: stored as-is.
            class_num: used to build ``self.name`` and wrapped in a Theano
                shared variable.
            feature_dim: input width of the hidden layer.
            hidden_size: output width of the hidden layer.
            method: stored as-is (default ``'max'``).
            seed: accepted but not used by this constructor.
        """
        # Plain bookkeeping; assignment order matches the layer-free part of setup.
        self._inputs, self.method = inputs, method
        self.name = 'baseline_' + str(class_num)
        self.batch_size = bs
        self.class_num = theano.shared(class_num)
        self.max_time, self.feature_dim = max_time, feature_dim
        self.dropout = True

        # Feature -> hidden projection.
        hidden_cfg = dict(input_size=feature_dim,
                          hidden_size=hidden_size,
                          batch_size=bs,
                          name='hidden',
                          dropout=0.5,
                          activation=act.LeakyRelu())
        self.hidden = HiddenLayer(**hidden_cfg)

        # Hidden -> one sigmoid output.
        classify_cfg = dict(input_size=hidden_size,
                            hidden_size=1,
                            batch_size=bs,
                            name='classify',
                            dropout=0.0,
                            activation=act.sigmoid)
        self.classify = HiddenLayer(**classify_cfg)
 def __init__(self, inputs, bs, max_time, classes, feature_dim,
              hidden_size, method='max', seed=12345):
     """Store the configuration and build a hidden layer plus a softmax layer.

     Args:
         inputs: kept as ``self._inputs``.
         bs: batch size, stored and forwarded to both layers.
         max_time: stored as-is.
         classes: stored and passed to the softmax layer.
         feature_dim: input width of the hidden layer.
         hidden_size: width of the hidden layer / input width of the softmax.
         method: stored as-is (default ``'max'``).
         seed: accepted but not used by this constructor.
     """
     self._inputs, self.method = inputs, method
     self.batch_size, self.classes = bs, classes
     self.max_time, self.feature_dim = max_time, feature_dim
     self.dropout = True

     # Both layers are created with dropout=0.5.
     leaky = act.LeakyRelu()
     self.hidden = HiddenLayer(input_size=feature_dim, hidden_size=hidden_size,
                               batch_size=bs, name='hidden', dropout=0.5,
                               activation=leaky)
     self.softmax = SoftmaxLayer(input_size=hidden_size, classes=self.classes,
                                 batch_size=bs, name='softmax', dropout=0.5)
Exemple #3
0
    def __init__(self, inputs, bs, max_time, classes, feature_dim, hidden_size, levels, N=1, pool=None, seed=12345):
        """Build a fixed temporal pyramid of attention filters plus a classifier.

        Level ``k`` of the pyramid holds ``2**k`` TemporalAttentionLayer
        instances.  Each one is put in test mode and given constant ``d``,
        ``g`` and ``sigma`` shared variables.

        Args:
            inputs: kept as ``self._inputs``.
            bs: batch size, forwarded to every layer.
            max_time: stored as-is.
            classes: passed to the softmax layer.
            feature_dim: channel count of each attention layer.
            hidden_size: width of the hidden layer.
            levels: number of pyramid levels.
            N: forwarded to each attention layer; also scales the hidden
                layer's input size.
            pool: when ``None`` the hidden layer's input size is multiplied by
                the number of filters; otherwise by 1.
            seed: accepted but not used by this constructor.
        """
        self._inputs = inputs
        self.N = N
        self.batch_size = bs
        self.classes = classes
        self.max_time = max_time
        self.levels = levels
        self.feature_dim = feature_dim
        self.pool = pool
        self.dropout = True

        def fixed(value, name):
            # One-element float32 shared variable, broadcastable along its only axis.
            return theano.shared(value=np.asarray([value]).astype('float32'), name=name,
                                 borrow=True, broadcastable=[True])

        # create a pyramid of filters
        self.temporal_pyramid = []
        for level in range(self.levels):
            for f in range(2**level):
                tf = TemporalAttentionLayer(batch_size=bs, N=N, channels=feature_dim,
                                            name='temporal-attention-layer-'+str(level)+'-filter-'+str(f))
                tf.test = True
                tf.d = fixed(1./2**(level+1), 'd')
                tf.g = fixed((1./2**level)+(2*f/2.**level), 'g')
                tf.sigma = fixed(5.0, 'sigma')
                self.temporal_pyramid.append(tf)

        # `is None` avoids invoking a custom __eq__ on whatever `pool` is.
        input_size = feature_dim*N*(len(self.temporal_pyramid) if pool is None else 1)
        self.hidden = HiddenLayer(input_size=input_size, hidden_size=hidden_size, activation=act.LeakyRelu(),
                                  batch_size=bs, name='hidden', dropout=0.5)
        self.softmax = SoftmaxLayer(input_size=hidden_size, classes=self.classes,
                                    batch_size=bs, name='softmax', dropout=0.5)
 def init_params(self):
     """Create the identity-activated layer stored as ``self.transform_hidden``.

     Its output width is ``5 + self.use_dx_dy``, i.e. one extra unit when
     ``use_dx_dy`` is truthy.
     """
     n_outputs = 5 + self.use_dx_dy
     layer_name = 'Read.Transform.' + self.name
     self.transform_hidden = HiddenLayer(
         input_size=self.input_hidden_size,
         hidden_size=n_outputs,
         batch_size=self.batch_size,
         activation=act.Identity,
         device=self.device,
         name=layer_name,
     )
Exemple #5
0
    def __init__(
        self,
        batch_size,
        N,
        channels,
        name='',
        use_gpu=True,
        test=False,
        input_hidden_size=4096,
        initializers=None):
        """
        Temporal Read Layer Based on DRAW paper

        Stores the configuration and builds the identity-activated hidden
        layer kept as ``self.hidden_layer``.

        Args:
            batch_size: batch size, stored and forwarded to the hidden layer.
            N: stored; last entry of ``self.output_shape``.
            channels: stored; middle entry of ``self.output_shape``.
            name: suffix for the hidden layer's name.
            use_gpu: stored as-is.
            test: stored as-is.
            input_hidden_size: input width of the hidden layer.
            initializers: list of three initializers (g, d, sigma).  When
                omitted, a fresh list of ``IsotropicGaussian(0.01)`` is built
                per instance, so instances never share one mutable default.
        """
        if initializers is None:
            initializers = [
                i.IsotropicGaussian(0.01),  # g
                i.IsotropicGaussian(0.01),  # d
                i.IsotropicGaussian(0.01),  # sigma
            ]

        self.batch_size = batch_size
        self.N = N
        self.channels = channels
        self.name = name
        self.output_shape = [batch_size, channels, N]
        self.use_gpu = use_gpu
        self.initializers = initializers
        self.test = test
        self.input_hidden_size = input_hidden_size

        # Three outputs -- presumably one each for g, d and sigma (TODO confirm).
        self.hidden_layer = HiddenLayer(input_size=self.input_hidden_size,
                                        hidden_size=3,
                                        batch_size=self.batch_size,
                                        activation=act.Identity,
                                        name='Attention-Transform.' +
                                        self.name)
Exemple #6
0
    def __init__(self, rng, input, nkerns, recept_width, pool_width, stride,
                 training_mode, dropout_prob, activation, weights_variance,
                 n_channels, n_timesteps, n_fbins, global_pooling):
        """Build two conv/pool layers followed by one hidden layer.

        Args:
            rng: random generator forwarded to every layer.
            input: symbolic input of the first conv layer.
            nkerns: kernel counts; indices 0 and 1 size the conv layers,
                index 2 is the hidden layer's output width.
            recept_width: receptive-field widths of the two conv layers.
            pool_width: pooling widths of the two conv layers.
            stride: strides of the two conv layers.
            training_mode: forwarded to the hidden layer.
            dropout_prob: forwarded to the hidden layer.
            activation: per-layer activations (indices 0, 1, 2).
            weights_variance: forwarded to every layer.
            n_channels: number of channels; multiplied by ``n_fbins`` for the
                first layer's image height.
            n_timesteps: width of the first layer's image.
            n_fbins: number of frequency bins.
            global_pooling: if truthy, insert a GlobalPoolLayer before the
                hidden layer.
        """
        self.layer0 = ConvPoolLayer(
            rng,
            input=input,
            image_shape=(None, 1, n_channels * n_fbins, n_timesteps),
            filter_shape=(nkerns[0], 1, n_fbins * n_channels, recept_width[0]),
            poolsize=(1, pool_width[0]),
            activation=activation[0],
            weights_variance=weights_variance,
            subsample=(1, stride[0]))

        # Layer sizes must stay integers: use floor division so the result is
        # the same on Python 2 and Python 3 (where `/` would produce a float).
        input_layer1_width = (
            (n_timesteps - recept_width[0]) // stride[0] + 1) // pool_width[0]
        self.layer1 = ConvPoolLayer(rng,
                                    input=self.layer0.output,
                                    image_shape=(None, nkerns[0], 1,
                                                 input_layer1_width),
                                    filter_shape=(nkerns[1], nkerns[0], 1,
                                                  recept_width[1]),
                                    poolsize=(1, pool_width[1]),
                                    activation=activation[1],
                                    weights_variance=weights_variance,
                                    subsample=(1, stride[1]))

        if global_pooling:
            self.glob_pool = GlobalPoolLayer(self.layer1.output)
            layer2_input = self.glob_pool.output.flatten(2)
            # NOTE(review): the factor 6 presumably matches the number of
            # values GlobalPoolLayer emits per feature map -- confirm.
            layer2_n_in = nkerns[1] * 6
        else:
            layer2_input = self.layer1.output.flatten(2)
            input_layer2_size = (
                (input_layer1_width - recept_width[1]) // stride[1] +
                1) // pool_width[1]
            layer2_n_in = nkerns[1] * input_layer2_size

        # The two branches differ only in the hidden layer's input and n_in.
        self.layer2 = HiddenLayer(rng=rng,
                                  input=layer2_input,
                                  n_in=layer2_n_in,
                                  n_out=nkerns[2],
                                  training_mode=training_mode,
                                  dropout_prob=dropout_prob,
                                  activation=activation[2],
                                  weights_variance=weights_variance)

        self.output = self.layer2.output
        self.weights = self.layer0.weights + self.layer1.weights + self.layer2.weights
Exemple #7
0
 def __init__(self):
     """Initialise training state with defaults and create the collaborators.

     No arguments are taken: the sample list starts empty, the iteration
     counters and rates are set to fixed defaults, and one instance each of
     HiddenLayer, OutputLayer, SGA, Cost and Picker is created.
     """
     # Training data and loop state.
     self.samples = []  # <Sample Object>
     self.iteration, self.max_iteration = 0, 1
     self.tolerance_error, self.learning_rate = 0.001, 1.0

     # Collaborating components, created in the same order as before.
     self.hidden_layer = HiddenLayer()
     self.output_layer = OutputLayer()
     self.sga = SGA()
     self.cost = Cost()
     self.picker = Picker()
Exemple #8
0
    def __init__(self,
                 inputs,
                 bs,
                 max_time,
                 classes,
                 feature_dim,
                 hidden_size,
                 filters,
                 N=1,
                 pool=None,
                 lstm_dim=4096,
                 steps=8,
                 seed=12345):
        """Build attention filters, an LSTM stack and the classifier layers.

        Args:
            inputs: kept as ``self._inputs``.
            bs: batch size, forwarded to the layers that accept it.
            max_time: stored as-is.
            classes: passed to the softmax layer.
            feature_dim: channel count of each attention layer.
            hidden_size: width of the hidden layer.
            filters: number of TemporalAttentionLayer instances to create.
            N: forwarded to each attention layer.
            pool: when ``None`` the LSTM input projection's input size is
                multiplied by ``N``; otherwise by 1.
            lstm_dim: LSTM width; also the attention layers'
                ``input_hidden_size``.
            steps: stored as-is.
            seed: accepted but not used by this constructor.
        """
        self._inputs = inputs
        self.N = N
        self.batch_size = bs
        self.classes = classes
        self.max_time = max_time
        self.filters = filters
        self.feature_dim = feature_dim
        self.pool = pool
        self.dropout = True
        self.steps = steps

        self.temporal_filters = [
            TemporalAttentionLayer(
                batch_size=bs,
                N=N,
                channels=feature_dim,
                input_hidden_size=lstm_dim,
                name='temporal-attention-layer-filter-' + str(f))
            for f in range(filters)
        ]

        # `is None` avoids invoking a custom __eq__ on whatever `pool` is.
        input_size = feature_dim * len(
            self.temporal_filters) * (N if pool is None else 1)

        # The projection feeding the LSTM is four times the LSTM width.
        self.lstm_in = HiddenLayer(input_size=input_size,
                                   hidden_size=lstm_dim * 4,
                                   batch_size=bs)
        self.lstm = LSTMLayer(input_size=lstm_dim, hidden_size=lstm_dim)

        self.hidden = HiddenLayer(input_size=lstm_dim,
                                  hidden_size=hidden_size,
                                  activation=act.relu,
                                  batch_size=bs,
                                  name='hidden',
                                  dropout=0.5)
        self.softmax = SoftmaxLayer(input_size=hidden_size,
                                    classes=self.classes,
                                    batch_size=bs,
                                    name='softmax',
                                    dropout=0.5)
 def init_params(self):
     """Build ``self.transform_hidden``, an identity-activated HiddenLayer.

     The layer maps ``self.input_hidden_size`` inputs to
     ``5 + self.use_dx_dy`` outputs and is named after this layer.
     """
     layer_kwargs = dict(
         input_size=self.input_hidden_size,
         hidden_size=5+self.use_dx_dy,
         batch_size=self.batch_size,
         activation=act.Identity,
         device=self.device,
         name='Read.Transform.'+self.name,
     )
     self.transform_hidden = HiddenLayer(**layer_kwargs)
    def __init__(self, inputs, bs, max_time, class_num, feature_dim,
                 hidden_size, levels, N=1, pool=None, seed=12345):
        """Build ``levels`` attention filters feeding a one-unit sigmoid classifier.

        Args:
            inputs: kept as ``self._inputs``.
            bs: batch size, forwarded to every layer.
            max_time: stored as-is.
            class_num: used to build ``self.name`` and wrapped in a Theano
                shared variable.
            feature_dim: channel count of each attention layer.
            hidden_size: width of the hidden layer.
            levels: stored as ``self.filters``; number of attention layers.
            N: forwarded to each attention layer (not used for sizing the
                hidden layer).
            pool: stored as-is.
            seed: accepted but not used by this constructor.
        """
        self._inputs, self.N, self.batch_size = inputs, N, bs
        self.name = 'learned_' + str(class_num)
        self.class_num = theano.shared(class_num)
        self.max_time, self.filters = max_time, levels
        self.feature_dim, self.pool = feature_dim, pool
        self.dropout = True

        # One attention layer per requested filter, named 'af-<index>'.
        self.temporal_pyramid = []
        for idx in range(self.filters):
            self.temporal_pyramid.append(
                TemporalAttentionLayer(batch_size=bs, N=N,
                                       channels=feature_dim,
                                       name='af-' + str(idx)))

        n_filters = len(self.temporal_pyramid)
        input_size = feature_dim * n_filters  # N is deliberately not a factor here
        self.hidden = HiddenLayer(input_size=input_size, hidden_size=hidden_size,
                                  activation=act.LeakyRelu(), batch_size=bs,
                                  name='hidden', dropout=0.5)
        self.classify = HiddenLayer(input_size=hidden_size, hidden_size=1,
                                    batch_size=bs, name='classify',
                                    dropout=0.0, activation=act.sigmoid)
Exemple #11
0
 def __init__(self, bs, K, N, m):
     """Build the layers of a bidirectional LSTM sentence encoder.

     The encoder produces an m-dimensional hidden state for a sequence of
     length N over a vocabulary of size K: each direction has an input
     projection of width ``m*4//2`` and an LSTM of width ``m//2``.

     Args:
         bs: batch size, forwarded to every layer.
         K: vocabulary size (input width of both projections).
         N: sequence length; stored only.
         m: size of the combined hidden state.
     """
     self.K, self.N, self.m, self.bs = K, N, m, bs

     proj_width = m*4//2   # input projection width per direction
     state_width = m//2    # LSTM width per direction

     # Forward direction.
     self.forward_in = HiddenLayer(input_size=K, hidden_size=proj_width,
                                   batch_size=bs, name='forward-lstm-in')
     self.forward_lstm = LSTMLayer(hidden_size=state_width, activation=T.tanh,
                                   batch_size=bs, dropout=0.0,
                                   name='forward-lstm')

     # Backward direction.
     self.backward_in = HiddenLayer(input_size=K, hidden_size=proj_width,
                                    batch_size=bs, name='backward-lstm-in')
     self.backward_lstm = LSTMLayer(hidden_size=state_width, activation=T.tanh,
                                    batch_size=bs, dropout=0.0,
                                    name='backward-lstm')
class ImageModel(Model):
    """Sentence-conditioned recurrent image model with attentive read/write.

    A language model encodes the sentence.  At every step an alignment over
    the word states is concatenated with a latent sample and fed to a
    generative LSTM, whose output is written additively onto a canvas.  During
    training an inference LSTM reads the target image and the current error
    image to produce the latent distribution.
    """

    def __init__(self, bs, K, lang_N, steps, read_size, write_size, m, gen_dim, infer_dim, z_dim, l, seed=12345, channels=1, image_size=60*60, cinit=0):
        """Create all sub-layers and the alignment / latent projection weights.

        Args:
            bs: batch size.
            K: vocab size.
            lang_N: (max) length of the sentence encoding.
            steps: number of times to run the model.
            read_size: side length of the read window.
            write_size: side length of the write window.
            m: size of the language representation.
            gen_dim: width of the generative LSTM.
            infer_dim: width of the inference LSTM.
            z_dim: size of the latent sample.
            l: dimensions in the align function.
            seed: seed of the per-model random stream used for latent noise.
            channels: number of image channels.
            image_size: w*h of the image (assumed square).
            cinit: constant the canvas is initialised to.
        """
        self.use_gpu = True
        self.cinit = cinit
        self.batch_size = bs
        self.channels = channels
        self.gen_dim = gen_dim
        self.z_dim = z_dim
        self.m = m
        self.lang_N = lang_N
        self.steps = steps
        self.l = l
        self.read_size = read_size
        self.write_size = write_size
        self.infer_dim = infer_dim
        self.image_size = image_size

        self.language_model = LanguageModel(bs, K, lang_N, m)

        self.gen_in = HiddenLayer(input_size=m+z_dim, hidden_size=gen_dim*4,
                                  batch_size=bs, name='gen-lstm-in')
        self.gen_lstm = LSTMLayer(hidden_size=gen_dim,
                                  activation=T.tanh,
                                  batch_size=bs,
                                  dropout=0.0,
                                  name='gen-lstm')
        self.infer_in = HiddenLayer(input_size=2*channels*self.read_size**2+self.gen_dim,
                                    hidden_size=infer_dim*4,
                                    batch_size=bs, name='infer-lstm-in')
        self.infer_lstm = LSTMLayer(hidden_size=infer_dim,
                                    activation=T.tanh,
                                    batch_size=bs,
                                    dropout=0.0,
                                    name='infer-lstm')

        self.reader = ReadLayer(batch_size=self.batch_size,
                                N=self.read_size,
                                channels=channels,
                                image_width=int(np.sqrt(self.image_size)),
                                image_height=int(np.sqrt(self.image_size)),
                                input_hidden_size=gen_dim,
                                name='Read')
        self.writer = WriteLayer(batch_size=self.batch_size,
                                 N=self.write_size,
                                 channels=channels,
                                 image_width=int(np.sqrt(self.image_size)),
                                 image_height=int(np.sqrt(self.image_size)),
                                 input_hidden_size=gen_dim,
                                 name='Write')
        # Seeded stream that supplies the latent noise in train/generate_image.
        self.random = RandomStreams(seed)

        # create W_mu, W_sigma, v, U, W, b
        # NOTE(review): np_rng is not defined in this class; presumably a
        # module-level numpy RandomState -- confirm.
        init = IsotropicGaussian(0.01)
        u = init.init(np_rng, (self.m, self.l))
        self.U = theano.shared(value=u, name='U', borrow=True)
        v = init.init(np_rng, (self.l,))
        self.v = theano.shared(value=v, name='v', borrow=True)
        w = init.init(np_rng, (self.gen_dim, self.l))
        self.W = theano.shared(value=w, name='W', borrow=True)
        b = init.init(np_rng, (self.l,))
        self.b = theano.shared(value=b, name='b', borrow=True)

        w_mu = init.init(np_rng, (self.infer_dim, self.z_dim))
        self.W_mu_infer = theano.shared(value=w_mu, name='W_mu_infer', borrow=True)
        w_sigma = init.init(np_rng, (self.infer_dim, self.z_dim))
        self.W_sigma_infer = theano.shared(value=w_sigma, name='W_sigma_infer', borrow=True)

        w_mu = init.init(np_rng, (self.gen_dim, self.z_dim))
        self.W_mu_gen = theano.shared(value=w_mu, name='W_mu_gen', borrow=True)
        w_sigma = init.init(np_rng, (self.gen_dim, self.z_dim))
        self.W_sigma_gen = theano.shared(value=w_sigma, name='W_sigma_gen', borrow=True)

    def _initial_canvas(self):
        """Return the starting canvas: ``cinit`` repeated for every batch row."""
        c0 = theano.shared(self.cinit*np.ones((1, self.channels*self.image_size)).astype(theano.config.floatX))
        return c0.repeat(self.batch_size, axis=0)

    def _sample_noise(self):
        """Draw standard-normal noise of shape (steps, batch_size, z_dim)."""
        # Uses the stream seeded in __init__ so that `seed` actually controls
        # sampling, instead of depending on a free `rng` name.
        return self.random.normal(size=(self.steps, self.batch_size, self.z_dim),
                                  avg=0.0, std=1.0, dtype=theano.config.floatX)

    def batched_dot(self, A, B):
        """Batched matrix product, using the CUDA op when ``use_gpu`` is set."""
        if self.use_gpu:
            return theano.sandbox.cuda.blas.batched_dot(A, B)
        else:
            return T.batched_dot(A,B)

    @property
    def params(self):
        """All trainable parameters: own weights plus those of every sub-layer."""
        return flatten_f([self.U, self.v, self.W, self.b, self.W_mu_infer, self.W_sigma_infer,
                self.W_mu_gen, self.W_sigma_gen] + self.language_model.params + \
        self.gen_in.params + self.gen_lstm.params + self.infer_in.params + self.infer_lstm.params + \
        self.reader.params + self.writer.params)

    def align(self, h_gen, h_lang, mask):
        """Attend over the word states and return their weighted sum.

        Args:
            h_gen: generative hidden state, batch x gen_dim.
            h_lang: word states, N x batch x m.
            mask: selects which elements of h_lang count; transposed here to
                lang_N x batch_size.

        Returns:
            ``(s, alpha)``: ``s`` is batch x m, ``alpha`` holds the normalised
            weights reshaped to lang_N x batch_size x 1.
        """
        # U is m x l and we want the result to be N x batch x l.
        # Using batched_dot this can be done by making U 1 x m x l and
        # repeating it N times on axis 0.
        U = self.U.reshape((1, self.m, self.l)).repeat(self.lang_N, axis=0)
        # align_lang is now N x batch x l
        align_lang = self.batched_dot(h_lang, U)

        # W is gen_dim x l
        # h_gen is batch x gen_dim
        # result is batch x l
        align_img = T.dot(h_gen, self.W)
        # use broadcasting to get alpha to be N x batch x l
        alpha = T.tanh(align_lang + align_img.dimshuffle('x',0,1) + self.b.dimshuffle('x','x',0))

        # v is l, alpha is N x batch x l
        # result will be N x batch
        alpha = T.exp(T.dot(alpha, self.v))

        # mask alpha before normalizing so that the masked parts
        # do not affect the normalization
        mask = mask.transpose([1,0]) # make mask langN x batch_size
        alpha = T.switch(T.neq(mask,0), alpha, zeros((self.lang_N, self.batch_size)))

        # normalize alpha by its sum along the N axis (axis=0)
        alpha = alpha / T.sum(alpha, axis=0)

        # use alpha with h_lang to compute s_t:
        # alpha is N x batch and h_lang is N x batch x m
        alpha = alpha.reshape((self.lang_N, self.batch_size, 1))
        s = h_lang * alpha

        # sum along the N axis to give batch x m
        s = T.sum(s, axis=0)
        return s, alpha

    # use with partial to pass in first args
    # scan will pass the remaining args
    def step_train(self, rnd_in, kl, h_infer, c_infer, h_gen, c_gen, c, mu_gen, sigma_gen, h_lang, x, mask):
        """One training step of the scan: infer, sample, generate and write.

        ``rnd_in`` is the per-step noise sequence element, the next eight
        arguments are the recurrent outputs from the previous step, and
        ``h_lang``, ``x`` and ``mask`` are non-sequences.

        Returns:
            The updated ``(kl, h_infer, c_infer, h_gen, c_gen, c, mu_gen,
            sigma_gen)`` in the order scan expects.
        """
        # eqs 10-13
        # compute "error image"
        x_hat = x-T.nnet.sigmoid(c)
        # read from both input (x) and error image
        r, _ = self.reader.run(x, h_gen)
        r_hat, _ = self.reader.run(x_hat, h_gen)
        # concatenate the two read regions
        r = T.concatenate([r,r_hat], axis=1)

        # run the infer lstm on the read regions
        val = self.infer_in.run(T.concatenate([r, h_gen], axis=1))
        h_infer_t, c_infer_t = self.infer_lstm.run(val, h_infer, c_infer)

        # mean and log-sigma of Q; needed both for the sample and for the KL term
        mu_infer = T.dot(h_infer_t, self.W_mu_infer)
        sigma_infer = 0.5*T.dot(h_infer_t, self.W_sigma_infer)
        # generate a sample from these normal distributions
        z = mu_infer + T.exp(sigma_infer) * rnd_in

        # accumulate kl-divergence between infer and gen normal distributions
        # (the 0.5 factor is applied once, in train)
        kl_t = kl + T.sum(-1 + ((mu_infer - mu_gen)**2 + T.exp(2*sigma_infer))/
                          (T.exp(2*sigma_gen)) - 2*sigma_infer + 2*sigma_gen)

        # do the alignment (eq 2)
        # this is m-dimensions - each word is summed into 1 vector
        # to represent the whole sequence, so N x batch x m becomes batch x m
        s,_ = self.align(h_gen, h_lang, mask)

        # run the LSTM (eq 3)
        # val is batch x m+z_dims
        val = self.gen_in.run(T.concatenate([z,s], axis=1))
        h_gen_t, c_gen_t = self.gen_lstm.run(val, h_gen, c_gen)

        mu_gen = T.tanh(T.dot(h_gen_t, self.W_mu_gen))
        sigma_gen = T.tanh(T.dot(h_gen_t, self.W_sigma_gen))

        # do the write (eq 4)
        c_update, _ = self.writer.run(h_gen_t)
        c_t = c + c_update

        return kl_t, h_infer_t, c_infer_t, h_gen_t, c_gen_t, c_t, mu_gen, sigma_gen

    def train(self, x, y, mask):
        """Build the training graph.

        Args:
            x: target images.
            y: sentences, fed to the language model.
            mask: sentence mask, forwarded to ``align``.

        Returns:
            ``(kl, log_recons, log_likelihood, c)`` where ``c`` is the canvas
            at every step.
        """
        # do language model on y
        h_lang = self.language_model.run(y)

        # do train recurrence
        h_infer, c_infer = self.infer_lstm.get_initial_hidden
        h_gen, c_gen = self.gen_lstm.get_initial_hidden
        c0 = self._initial_canvas()
        rnd_in = self._sample_noise()

        # setup output
        outputs_info = [dict(initial=T.zeros(()), taps=[-1]), # kl
                        dict(initial=h_infer, taps=[-1]), # h_infer
                        dict(initial=c_infer, taps=[-1]), # c_infer
                        dict(initial=h_gen, taps=[-1]), # h_gen
                        dict(initial=c_gen, taps=[-1]), # c_gen
                        dict(initial=c0, taps=[-1]), # c
                        dict(initial=T.zeros((self.batch_size,self.z_dim)), taps=[-1]), # mu_gen
                        dict(initial=T.zeros((self.batch_size,self.z_dim)), taps=[-1])] # sigma_gen

        # do scan
        [kl, h_infer, c_infer, h_gen, c_gen, c, mu_gen, sigma_gen], _ = theano.scan(
                                                                         fn=self.step_train,
                                                                         sequences=rnd_in,
                                                                         outputs_info=outputs_info,
                                                                         non_sequences=[h_lang,x,mask],
                                                                         n_steps=self.steps)

        # Get x-reconstruction-error (eq 5)
        x_recons = T.nnet.sigmoid(c[-1,:,:])
        log_recons = T.nnet.binary_crossentropy(x_recons, x).sum()

        # compute KL
        kl = 0.5*kl[-1]

        log_likelihood = kl + log_recons
        log_likelihood = log_likelihood.mean()
        kl = kl.mean()
        log_recons = log_recons.mean()
        return kl, log_recons, log_likelihood, c

    def generate_image(self, y, mask):
        """Build the sampling graph for sentences ``y``.

        Returns:
            ``(image, alpha)``: the final canvas passed through a sigmoid and
            reshaped to (1, batch_size, channels, image_size), and the
            alignment weights of every step.
        """
        # do language model on y
        h_lang = self.language_model.run(y)

        # do generation recurrence
        h_gen, c_gen = self.gen_lstm.get_initial_hidden
        c0 = self._initial_canvas()
        rnd_in = self._sample_noise()

        # setup output
        outputs_info = [dict(initial=h_gen, taps=[-1]), # h_gen
                        dict(initial=c_gen, taps=[-1]), # c_gen
                        dict(initial=c0, taps=[-1]), # c
                        dict(initial=T.zeros((self.batch_size,self.z_dim)), taps=[-1]), # mu_gen
                        dict(initial=T.zeros((self.batch_size,self.z_dim)), taps=[-1]), # sigma_gen
                        dict(initial=T.zeros((self.lang_N,self.batch_size, 1)), taps=[-1])] # alpha
        # do scan
        [h_gen, c_gen, c, mu_gen, sigma_gen, alpha], _ = theano.scan(fn=self.step_gen,
                                                              sequences=rnd_in,
                                                              outputs_info=outputs_info,
                                                              non_sequences=[h_lang,mask],
                                                              n_steps=self.steps)
        c = T.nnet.sigmoid(c)

        return c[-1].reshape((1,self.batch_size,self.channels, self.image_size)), alpha

    def step_gen(self, rnd_in, h_gen, c_gen, c, mu_gen, sigma_gen, alpha, h_lang, mask):
        """One generation step of the scan: sample from the prior, align, write.

        The incoming ``alpha`` is only the recurrent placeholder; it is
        replaced by this step's alignment weights.
        """
        # generate a sample from the generative distribution
        z = mu_gen + T.exp(sigma_gen) * rnd_in

        # do the alignment (eq 2)
        # this is m-dimensions - each word is summed into 1 vector
        # to represent the whole sequence, so N x batch x m becomes batch x m
        s, alpha = self.align(h_gen, h_lang, mask)

        # run the LSTM (eq 3)
        # val is batch x m+z_dims
        val = self.gen_in.run(T.concatenate([z,s], axis=1))
        h_gen_t, c_gen_t = self.gen_lstm.run(val, h_gen, c_gen)

        mu_gen = T.tanh(T.dot(h_gen_t, self.W_mu_gen))
        sigma_gen = T.tanh(T.dot(h_gen_t, self.W_sigma_gen))

        # do the write (eq 4)
        c_update, _ = self.writer.run(h_gen_t)
        c_t = c + c_update

        return h_gen_t, c_gen_t, c_t, mu_gen, sigma_gen, alpha

    def build_sample_function(self, y, mask):
        """Compile ``self.sample_sentences``, mapping (y, mask) to the image."""
        # The alignment weights are not part of the compiled outputs.
        c, _ = self.generate_image(y, mask)
        self.sample_sentences = theano.function([y, mask], [c])

    @property
    def inputs(self):
        """Value of ``self._inputs`` (must be set elsewhere)."""
        return self._inputs

    @property
    def outputs(self):
        """Value of ``self._outputs`` (must be set elsewhere)."""
        return self._outputs

    @property
    def updates(self):
        """Value of ``self._updates`` (must be set elsewhere)."""
        return self._updates
Exemple #13
0
class ReadLayer(object):
    """Read Layer from DRAW paper.

    Extracts an N x N glimpse per channel from each image using a grid of
    Gaussian filters whose placement is predicted from a hidden state.
    """

    def __init__(self, batch_size, N, channels, image_width, image_height, input_hidden_size,
                 use_dx_dy=False, name='', test=False, use_gpu=True, device='gpu', use_gamma=True):
        """Store the configuration and create the parameter layer.

        Args:
            batch_size: number of images per batch.
            N: side length of the glimpse grid.
            channels: number of image channels.
            image_width: stored as ``self.width``.
            image_height: stored as ``self.height``.
            input_hidden_size: width of the hidden state fed to ``run``.
            use_dx_dy: predict separate x and y strides when truthy.
            name: suffix for the parameter layer's name.
            test: when truthy, ``run`` reads the parameters directly from ``h``.
            use_gpu: selects the CUDA batched dot product.
            device: forwarded to the parameter layer.
            use_gamma: when falsy the output scale is forced to one.
        """
        self.batch_size, self.use_dx_dy = batch_size, use_dx_dy
        self.N, self.channels = N, channels
        self.width, self.height = image_width, image_height
        self.name, self.input_hidden_size = name, input_hidden_size
        self.test = test
        self.output_shape = [batch_size, channels, N, N]
        self.use_gpu, self.use_gamma = use_gpu, use_gamma
        self.device = device

        self.init_params()

    def load_pretrained(self, v, i):
        """Return ``i`` unchanged; ``v`` is ignored."""
        return i

    def init_params(self):
        """Create the identity-activated layer that predicts the read parameters."""
        n_outputs = 5+self.use_dx_dy
        layer_name = 'Read.Transform.'+self.name
        self.transform_hidden = HiddenLayer(input_size=self.input_hidden_size,
                                            hidden_size=n_outputs,
                                            batch_size=self.batch_size,
                                            activation=act.Identity,
                                            device=self.device,
                                            name=layer_name)

    def batched_dot(self, A, B):
        """Batched matrix product, using the CUDA op when ``use_gpu`` is set."""
        if not self.use_gpu:
            return T.batched_dot(A,B)
        return theano.sandbox.cuda.blas.batched_dot(A, B)

    def get_params(self, h):
        """Map hidden state ``h`` to ``(gx, gy, dx, dy, s2, g)``."""
        hidden = self.transform_hidden.run(h)

        # Grid centre: column 0/1 rescaled from [-1, 1] to image coordinates.
        gx = (hidden[:,0]+1)*0.5 * self.width
        gy = (hidden[:,1]+1)*0.5 * self.height
        s2 = T.exp(hidden[:,3]/2.0)

        g = T.exp(hidden[:,4]).dimshuffle(0,'x')
        if not self.use_gamma:
            # Dividing by itself yields a scale of one with the same shape.
            g = g/g

        grid_span = self.N-1.0
        if not self.use_dx_dy:
            # A single stride shared by both axes, based on the larger side.
            dx = dy = ((max(self.width,self.height)-1.0) / grid_span * T.exp(hidden[:,2]))
        else:
            dx = (self.width-1.0) / grid_span *  T.exp(hidden[:,2])
            dy = (self.height-1.0) / grid_span *  T.exp(hidden[:,5])
        return gx,gy,dx,dy,s2,g

    def get_params_test(self, h):
        """Read ``(gx, gy, dx, dy, s2, g)`` straight from the columns of ``h``."""
        gx, gy = h[:,0], h[:,1]
        dx, dy = h[:,2], h[:,5]
        s2 = h[:,3]
        g = h[:,4].dimshuffle(0,'x')
        return gx, gy, dx, dy, s2, g

    def run(self, images, h):
        """Read a glimpse from ``images`` steered by ``h``.

        Returns:
            ``(glimpse, (gx, gy, dx, dy, fint))`` where ``glimpse`` is the
            scaled read of shape (batch_size, channels*N*N) and ``fint`` is
            the intermediate product of the y-filters with the images.
        """
        channels = self.channels
        param_fn = self.get_params_test if self.test else self.get_params
        gx,gy,dx,dy,s2,g = param_fn(h)

        # how to handle variable sized input images? (mask??)
        I = images.reshape((self.batch_size*self.channels, self.height, self.width))

        # Filter centres along each axis.
        x_offsets = T.arange(self.N).astype(theano.config.floatX) - self.N/2 - 0.5
        y_offsets = T.arange(self.N).astype(theano.config.floatX) - self.N/2 - 0.5
        muX = gx.dimshuffle([0,'x']) + dx.dimshuffle([0,'x']) * x_offsets
        muY = gy.dimshuffle([0,'x']) + dy.dimshuffle([0,'x']) * y_offsets

        a = T.arange(self.width).astype(theano.config.floatX)
        b = T.arange(self.height).astype(theano.config.floatX)

        # Unnormalised Gaussian filterbanks.
        x_spread = s2.dimshuffle([0,'x','x'])**2
        y_spread = s2.dimshuffle([0,'x','x'])**2
        Fx = T.exp(-(a-muX.dimshuffle([0,1,'x']))**2 / 2. / x_spread)
        Fy = T.exp(-(b-muY.dimshuffle([0,1,'x']))**2 / 2. / y_spread)

        # Normalise each filter; the epsilon guards against a zero sum.
        x_norm = Fx.sum(axis=-1).dimshuffle([0,1,'x']) + 1e-4
        Fx = Fx / x_norm
        y_norm = Fy.sum(axis=-1).dimshuffle([0,1,'x']) + 1e-4
        Fy = Fy / y_norm

        # One copy of the filters per channel.
        self.Fx = T.repeat(Fx, channels, axis=0)
        self.Fy = T.repeat(Fy, channels, axis=0)

        self.fint = self.batched_dot(self.Fy, I)
        glimpse = self.batched_dot(self.fint, self.Fx.transpose([0,2,1]))
        self.fim = glimpse.reshape(
            (self.batch_size, self.channels*self.N*self.N))
        return g * self.fim, (gx, gy, dx, dy, self.fint)

    @property
    def params(self):
        """Parameters of the underlying transform layer, as a new list."""
        return list(self.transform_hidden.params)

    @params.setter
    def params(self, params):
        self.transform_hidden.params = params

    def print_layer(self):
        """Return a short multi-line description of the layer."""
        pieces = [
            '--------------------\n',
            'Read Layer '+self.name+'\n',
            'Input Shape: '+str((self.width, self.height))+'\n',
            'Output Shape: '+str((self.N, self.N))+'\n',
        ]
        return ''.join(pieces)
    def __init__(self, bs, K, lang_N, steps, read_size, write_size, m, gen_dim, infer_dim, z_dim, l, seed=12345, channels=1, image_size=60*60, cinit=0):
        """Create all sub-layers and the alignment / latent projection weights.

        Args:
            bs: batch size.
            K: vocab size.
            lang_N: (max) length of the sentence encoding.
            steps: number of times to run the model.
            read_size: side length of the read window.
            write_size: side length of the write window.
            m: size of the language representation.
            gen_dim: width of the generative LSTM.
            infer_dim: width of the inference LSTM.
            z_dim: size of the latent sample.
            l: dimensions in the align function.
            seed: seed for the random stream stored as ``self.random``.
            channels: number of image channels.
            image_size: w*h of the image (assumed square).
            cinit: stored as-is.
        """
        self.use_gpu = True
        self.cinit, self.batch_size, self.channels = cinit, bs, channels
        self.gen_dim, self.z_dim, self.m = gen_dim, z_dim, m
        self.lang_N, self.steps, self.l = lang_N, steps, l
        self.read_size, self.write_size = read_size, write_size
        self.infer_dim, self.image_size = infer_dim, image_size

        self.language_model = LanguageModel(bs, K, lang_N, m)

        # Generative path: projection to 4*gen_dim followed by the LSTM.
        self.gen_in = HiddenLayer(input_size=m+z_dim, hidden_size=gen_dim*4,
                                  batch_size=bs, name='gen-lstm-in')
        self.gen_lstm = LSTMLayer(hidden_size=gen_dim, activation=T.tanh,
                                  batch_size=bs, dropout=0.0, name='gen-lstm')

        # Inference path: input is two read windows plus the generative state.
        infer_input_size = 2*channels*self.read_size**2+self.gen_dim
        self.infer_in = HiddenLayer(input_size=infer_input_size,
                                    hidden_size=infer_dim*4,
                                    batch_size=bs, name='infer-lstm-in')
        self.infer_lstm = LSTMLayer(hidden_size=infer_dim, activation=T.tanh,
                                    batch_size=bs, dropout=0.0, name='infer-lstm')

        # Attention layers operate on a square image of this side length.
        side = int(np.sqrt(self.image_size))
        self.reader = ReadLayer(batch_size=self.batch_size, N=self.read_size,
                                channels=channels, image_width=side,
                                image_height=side, input_hidden_size=gen_dim,
                                name='Read')
        self.writer = WriteLayer(batch_size=self.batch_size, N=self.write_size,
                                 channels=channels, image_width=side,
                                 image_height=side, input_hidden_size=gen_dim,
                                 name='Write')
        self.random = RandomStreams(seed)

        # create W_mu, W_sigma, v, U, W, b
        gaussian = IsotropicGaussian(0.01)

        def shared_weight(label, shape):
            # Draw from np_rng and wrap; call order fixes the draw order.
            return theano.shared(value=gaussian.init(np_rng, shape), name=label, borrow=True)

        self.U = shared_weight('U', (self.m, self.l))
        self.v = shared_weight('v', (self.l,))
        self.W = shared_weight('W', (self.gen_dim, self.l))
        self.b = shared_weight('b', (self.l,))

        self.W_mu_infer = shared_weight('W_mu_infer', (self.infer_dim, self.z_dim))
        self.W_sigma_infer = shared_weight('W_sigma_infer', (self.infer_dim, self.z_dim))

        self.W_mu_gen = shared_weight('W_mu_gen', (self.gen_dim, self.z_dim))
        self.W_sigma_gen = shared_weight('W_sigma_gen', (self.gen_dim, self.z_dim))
class TemporalModel(Model):
    """Baseline classifier that pools per-frame features over time.

    The input is reduced along axis 1 (max, sum or mean, selected by
    ``method``), passed through one hidden layer and then a softmax layer.

    ``_outputs`` and ``_updates`` are not assigned in this class; the
    ``outputs`` / ``updates`` properties assume they are set elsewhere.
    """

    def __init__(self,
                 inputs,
                 bs,
                 max_time,
                 classes,
                 feature_dim,
                 hidden_size,
                 method='max',
                 seed=12345):
        """Store the configuration and build the hidden and softmax layers.

        Args:
            inputs: symbolic inputs, later unpacked as ``run(*inputs)``.
            bs: batch size handed to both layers.
            max_time: stored on the instance; not used in this class.
            classes: forwarded to ``SoftmaxLayer(classes=...)``.
            feature_dim: input size of the hidden layer.
            hidden_size: output size of the hidden layer.
            method: temporal pooling, one of 'max', 'sum' or 'mean'.
            seed: accepted for interface compatibility; unused here.
        """
        self._inputs = inputs
        self.method = method
        self.batch_size = bs
        self.classes = classes
        self.max_time = max_time
        self.feature_dim = feature_dim
        self.dropout = True
        self.hidden = HiddenLayer(input_size=feature_dim,
                                  hidden_size=hidden_size,
                                  batch_size=bs,
                                  name='hidden',
                                  dropout=0.5,
                                  activation=act.LeakyRelu())
        self.softmax = SoftmaxLayer(input_size=hidden_size,
                                    classes=self.classes,
                                    batch_size=bs,
                                    name='softmax',
                                    dropout=0.5)

    @property
    def params(self):
        """Trainable parameters: softmax layer first, then hidden layer."""
        return self.softmax.params + self.hidden.params

    @property
    def inputs(self):
        """Symbolic inputs given to the constructor."""
        return self._inputs

    @property
    def outputs(self):
        """Symbolic outputs; ``_outputs`` must have been set elsewhere."""
        return self._outputs

    @property
    def updates(self):
        """Update rules; ``_updates`` must have been set elsewhere."""
        return self._updates

    @property
    def test_algorithm(self):
        """Compiled evaluation function, built once with dropout disabled.

        The graph is rebuilt through ``run`` with ``self.dropout = False``
        and its outputs are given the names of ``self.outputs``. The result
        is cached in ``_talgorithm``.
        """
        if not hasattr(self, '_talgorithm'):
            saved_dropout = self.dropout
            self.dropout = False
            try:
                o = self.run(*self.inputs)
                for i, ot in enumerate(self.outputs):
                    o[i].name = ot.name
                self._talgorithm = theano.function(inputs=self.inputs,
                                                   outputs=o,
                                                   on_unused_input='warn')
            finally:
                # Restore the training-time flag even if graph construction
                # or compilation fails, so the model is not left in test mode.
                self.dropout = saved_dropout
        return self._talgorithm

    def run(self, x, mask, y):
        """Build the symbolic forward pass, loss and metrics.

        Args:
            x: features, reduced over axis 1 (presumably
                batch x time x feature -- TODO confirm against caller).
            mask: 2-D mask broadcast over the last axis of ``x``; entries
                where ``1 - mask`` is non-zero are pushed down by 100
                before the max.
            y: labels, flattened to a vector of length ``y.shape[0]``.

        Returns:
            Tuple ``(prob, pred, loss, error, acc)``.
        """
        # get the max/mean/sum of x for each feature from all frames
        if self.method == 'max':
            # offset masked-out frames so they cannot win the max
            m = (-100 * (1 - mask)).dimshuffle([0, 1, 'x'])
            x = T.max(x + m, axis=1)
        elif self.method in ('sum', 'mean'):
            # NOTE(review): the sum is not masked; assumes padded frames
            # are zero -- confirm against the data pipeline.
            x = T.sum(x, axis=1)
        if self.method == 'mean':
            # Divide the sum by the per-sample mask total. This used to be
            # an unreachable ``elif``, so 'mean' silently returned the sum.
            x = x / T.sum(mask, axis=1).dimshuffle([0, 'x'])

        x = x.astype(theano.config.floatX)
        x = self.hidden.run(x, self.dropout)

        prob, pred = self.softmax.run(x, self.dropout)
        y = y.reshape((y.shape[0], ))
        # classification loss plus L2 penalties on both weight matrices
        loss = self.softmax.loss(prob, y) + T.sum(
            self.hidden.w**2) * 0.001 + T.sum(self.softmax.w**2) * 0.0001
        # NOTE(review): 51 is a hard-coded class count; it looks like it
        # should follow ``self.classes`` -- confirm before changing.
        y = T.extra_ops.to_one_hot(y, 51)
        error = self.softmax.error(pred, y)
        acc = 1 - error

        return prob, pred, loss, error, acc
Exemple #16
0
class RBFNN:
    """Radial-basis-function network trained one sample at a time.

    The network owns a hidden layer of center nets, an output layer, an
    ``SGA`` updater, a ``Cost`` tracker and a ``Picker`` used to choose the
    initial centers from the training samples.
    """

    def __init__(self):
        self.samples = []  # list of Sample objects
        self.iteration = 0
        self.max_iteration = 1
        self.tolerance_error = 0.001
        self.learning_rate = 1.0
        self.hidden_layer = HiddenLayer()
        self.output_layer = OutputLayer()
        self.sga = SGA()
        self.cost = Cost()
        self.picker = Picker()

    def reset(self):
        """Drop all samples, nets and recorded cost; reset the iteration."""
        del self.samples[:]
        self.iteration = 0
        self.hidden_layer.clear_nets()
        self.output_layer.clear_nets()
        self.cost.clear()

    def add_sample(self, sample):
        """Append one Sample object; falsy values are ignored."""
        if not sample:
            return
        self.samples.append(sample)

    def add_samples(self, samples=None):
        """Append every sample of an iterable (``None`` means no samples)."""
        if samples is None:
            return
        for sample in samples:
            self.add_sample(sample)

    def randomize_weights(self, min=-0.25, max=0.25):
        """Give every output neuron random weights drawn from [min, max].

        One weight per hidden-layer net is requested from each output net.
        """
        hidden_count = len(self.hidden_layer.nets)
        for output_net in self.output_layer.nets:
            output_net.randomize_weights(hidden_count, min, max)

    def zero_weights(self):
        """Set all output weights to zero via a degenerate random range."""
        self.randomize_weights(0.0, 0.0)

    def add_center(self, center=None):
        """Add one hidden-layer center net built from ``center``.

        A fresh empty list is used when ``center`` is omitted, so that
        separate calls never share one default list object.
        """
        center_net = CenterNet([] if center is None else center)
        self.hidden_layer.add_net(center_net)

    def initialize_centers(self, k=1, pick_method=PickMethod.Random):
        """Pick ``k`` samples as centers and add them to the hidden layer.

        Args:
            k: number of centers (hidden-layer neurons) to pick.
            pick_method: ``PickMethod.Random`` or ``PickMethod.Clustering``.

        Raises:
            ValueError: if ``pick_method`` is not a supported method.
        """
        self.picker.samples = self.samples
        pickers = {
            PickMethod.Random: self.picker.shuffing,
            PickMethod.Clustering: self.picker.clustering
        }
        pick = pickers.get(pick_method)
        if pick is None:
            # Previously this surfaced as "'NoneType' object is not callable".
            raise ValueError('Unknown pick_method: %r' % (pick_method, ))
        # register every picked sample as a center of the hidden layer
        for sample in pick(k):
            self.hidden_layer.add_net(CenterNet(sample.features))

    def initialize_outputs(self):
        """Create one output net per target value of the first sample."""
        outputs_count = len(self.samples[0].targets)
        for _ in range(outputs_count):
            self.output_layer.add_net(OutputNet())

    def training(self,
                 iteration_callback=None,
                 completion_callback=None,
                 custom_sigmas=None):
        """Train until ``max_iteration`` or ``tolerance_error`` is reached.

        Args:
            iteration_callback: called as ``callback(self)`` after each full
                pass over the samples.
            completion_callback: called as ``callback(self, success)`` when
                training ends; ``success`` is False if the output layer
                reported ``-1`` for a sample.
            custom_sigmas: optional per-center sigmas (one per center). When
                omitted or empty, the hidden layer computes initial sigmas.
        """
        self.iteration = 0
        self.cost.clear()

        # One shared learning rate for the three trained quantities:
        # weights, centers and sigmas.
        self.sga.uniform_learning_rate(self.learning_rate)

        # set the standard deviation of each center
        if custom_sigmas is not None and len(custom_sigmas) > 0:
            self.hidden_layer.refresh_centers_sigma(custom_sigmas)
        else:
            # no custom sigmas: let the hidden layer compute a common one
            self.hidden_layer.initialize_centers_sigma()

        # training loop
        while (self.iteration < self.max_iteration
               and self.cost.rmse > self.tolerance_error):
            self.iteration += 1
            for sample in self.samples:
                center_nets = self.hidden_layer.nets
                output_nets = self.output_layer.nets
                # RBF values of the hidden layer for this sample
                hidden_outputs = self.hidden_layer.output(sample)
                network_outputs = self.output_layer.output(
                    sample, hidden_outputs)
                # -1 is the output layer's failure marker: abort training
                if network_outputs == -1:
                    if completion_callback:
                        completion_callback(self, False)
                    return
                # record the cost of this sample
                self.cost.add(network_outputs, sample.targets)
                # update centers and weights
                self.sga.update_centers(sample, center_nets, output_nets)
                self.sga.update_weights(center_nets, output_nets)

            # one pass over all training samples counts as one iteration
            if iteration_callback:
                iteration_callback(self)

        # training finished
        if completion_callback:
            completion_callback(self, True)

    def predicate(self, features=None):
        """Return the output layer's result for a feature vector."""
        if features is None:
            features = []
        return self.output_layer.predicate(features, self.hidden_layer.nets)
class ImageModel(Model):
    """Recurrent image model conditioned on a sentence, built with Theano.

    A ``LanguageModel`` encodes the sentence ``y``. At every step an
    attention-weighted summary of that encoding (``align``) and a latent
    sample ``z`` drive a generator LSTM whose output is written onto a
    canvas ``c``. During training an inference LSTM additionally reads the
    target image and the current error image to parameterize ``z``.

    ``np_rng``, ``rng``, ``zeros`` and ``flatten_f`` are referenced but not
    defined in this class; they are presumably module-level names.
    ``_inputs``, ``_outputs`` and ``_updates`` are not assigned here either.
    """

    def __init__(self,
                 bs,
                 K,
                 lang_N,
                 steps,
                 read_size,
                 write_size,
                 m,
                 gen_dim,
                 infer_dim,
                 z_dim,
                 l,
                 seed=12345,
                 channels=1,
                 image_size=60 * 60,
                 cinit=0):
        """Create all sub-layers and the shared weight matrices.

        Args:
            bs: batch size.
            K: vocabulary size.
            lang_N: (max) length of the sentence encoding.
            steps: number of recurrent steps to run the model.
            read_size: ``N`` of the read layer.
            write_size: ``N`` of the write layer.
            m: size of the language representation.
            gen_dim: hidden size of the generator LSTM.
            infer_dim: hidden size of the inference LSTM.
            z_dim: size of the latent sample.
            l: dimensionality used inside the align function.
            seed: seed for ``RandomStreams``.
            channels: number of image channels.
            image_size: width * height of the image (assumed square).
            cinit: constant the initial canvas is filled with.
        """
        # K is the vocab size
        # lang_N is the (max) length of the sentence encoding
        # steps is the number of times to run the model
        # m is the size of the language representation
        # l is the dimensions in the align function
        # image_size is the w*h of image (assumed square)
        self.use_gpu = True
        self.cinit = cinit
        self.batch_size = bs
        self.channels = channels
        self.gen_dim = gen_dim
        self.z_dim = z_dim
        self.m = m
        self.lang_N = lang_N
        self.steps = steps
        self.l = l
        self.read_size = read_size
        self.write_size = write_size
        self.infer_dim = infer_dim
        self.image_size = image_size

        self.language_model = LanguageModel(bs, K, lang_N, m)

        # The "*-lstm-in" layers project to 4x the LSTM hidden size
        # (presumably one slice per LSTM gate -- confirm in LSTMLayer).
        self.gen_in = HiddenLayer(input_size=m + z_dim,
                                  hidden_size=gen_dim * 4,
                                  batch_size=bs,
                                  name='gen-lstm-in')
        self.gen_lstm = LSTMLayer(hidden_size=gen_dim,
                                  activation=T.tanh,
                                  batch_size=bs,
                                  dropout=0.0,
                                  name='gen-lstm')
        # input: two read patches (image and error image) plus h_gen
        self.infer_in = HiddenLayer(
            input_size=2 * channels * self.read_size**2 + self.gen_dim,
            hidden_size=infer_dim * 4,
            batch_size=bs,
            name='infer-lstm-in')
        self.infer_lstm = LSTMLayer(hidden_size=infer_dim,
                                    activation=T.tanh,
                                    batch_size=bs,
                                    dropout=0.0,
                                    name='infer-lstm')

        self.reader = ReadLayer(batch_size=self.batch_size,
                                N=self.read_size,
                                channels=channels,
                                image_width=int(np.sqrt(self.image_size)),
                                image_height=int(np.sqrt(self.image_size)),
                                input_hidden_size=gen_dim,
                                name='Read')
        self.writer = WriteLayer(batch_size=self.batch_size,
                                 N=self.write_size,
                                 channels=channels,
                                 image_width=int(np.sqrt(self.image_size)),
                                 image_height=int(np.sqrt(self.image_size)),
                                 input_hidden_size=gen_dim,
                                 name='Write')
        # NOTE(review): train() and generate_image() draw noise from the
        # name `rng`, not from self.random -- confirm which is intended.
        self.random = RandomStreams(seed)

        # create W_mu, W_sigma, v, U, W, b
        init = IsotropicGaussian(0.01)
        u = init.init(np_rng, (self.m, self.l))
        self.U = theano.shared(value=u, name='U', borrow=True)
        v = init.init(np_rng, (self.l, ))
        self.v = theano.shared(value=v, name='v', borrow=True)
        w = init.init(np_rng, (self.gen_dim, self.l))
        self.W = theano.shared(value=w, name='W', borrow=True)
        b = init.init(np_rng, (self.l, ))
        self.b = theano.shared(value=b, name='b', borrow=True)

        # projections from the inference LSTM state to mean / log-sigma
        w_mu = init.init(np_rng, (self.infer_dim, self.z_dim))
        self.W_mu_infer = theano.shared(value=w_mu,
                                        name='W_mu_infer',
                                        borrow=True)
        w_sigma = init.init(np_rng, (self.infer_dim, self.z_dim))
        self.W_sigma_infer = theano.shared(value=w_sigma,
                                           name='W_sigma_infer',
                                           borrow=True)

        # projections from the generator LSTM state to mean / log-sigma
        w_mu = init.init(np_rng, (self.gen_dim, self.z_dim))
        self.W_mu_gen = theano.shared(value=w_mu, name='W_mu_gen', borrow=True)
        w_sigma = init.init(np_rng, (self.gen_dim, self.z_dim))
        self.W_sigma_gen = theano.shared(value=w_sigma,
                                         name='W_sigma_gen',
                                         borrow=True)

    def batched_dot(self, A, B):
        """Batched matrix product, using the CUDA op when ``use_gpu`` is set."""
        if self.use_gpu:
            return theano.sandbox.cuda.blas.batched_dot(A, B)
        else:
            return T.batched_dot(A, B)

    @property
    def params(self):
        """All trainable parameters, passed through ``flatten_f``.

        Order: the shared matrices created in ``__init__``, then the
        language model, generator, inference, reader and writer parameters.
        """
        return flatten_f([self.U, self.v, self.W, self.b, self.W_mu_infer, self.W_sigma_infer,
                self.W_mu_gen, self.W_sigma_gen] + self.language_model.params + \
        self.gen_in.params + self.gen_lstm.params + self.infer_in.params + self.infer_lstm.params + \
        self.reader.params + self.writer.params)

    def align(self, h_gen, h_lang, mask):
        """Attend over the sentence encoding given the generator state.

        Args:
            h_gen: generator hidden state, batch x gen_dim.
            h_lang: sentence encoding, N x batch x m.
            mask: transposed here to lang_N x batch; zero entries are
                excluded from the attention weights.

        Returns:
            ``(s, alpha)``: the weighted sum of ``h_lang`` over axis 0 and
            the normalized weights reshaped to lang_N x batch x 1.
        """
        # h_lang is N x batch x m
        # h_gen is batch x gen_dim
        # U is m x l
        # mask determines which elements of h_lang we care about
        # we want the result to be N x batch x l

        # using batched_dot this can be done
        # by making U to be 1 x m x l
        # and making h_lang to be N x batch x m
        # and repeating U N times on axis 0
        U = self.U.reshape((1, self.m, self.l)).repeat(self.lang_N, axis=0)
        # align_lang is now N x batch x l
        align_lang = self.batched_dot(h_lang, U)

        # W is gen_dim x l
        # h_gen is batch x gen_dim
        # result is batch x l
        align_img = T.dot(h_gen, self.W)
        # use broadcasting to get a to be N x batch x l
        alpha = T.tanh(align_lang + align_img.dimshuffle('x', 0, 1) +
                       self.b.dimshuffle('x', 'x', 0))

        # v is l, a is N x batch x l
        # result will be N x batch
        alpha = T.exp(T.dot(alpha, self.v))

        # need to mask a before normalizing
        # so that the parts that are masked do
        # not affect the normalization
        mask = mask.transpose([1, 0])  # make mask langN x batch_size
        alpha = T.switch(T.neq(mask, 0), alpha,
                         zeros((self.lang_N, self.batch_size)))

        # normalize a by the sum of a along the N (axis=0)
        # creates a vector of length N
        # NOTE(review): a column whose mask is all zero divides by zero here.
        alpha = alpha / T.sum(alpha, axis=0)

        # we now use alpha with h_lang to compute s_t
        # s_t is of size m because it is a constant
        # (alpha) * h_lang (m-vector)
        # we have alpha as N x batch
        # and h_lang as N x batch x m
        alpha = alpha.reshape((self.lang_N, self.batch_size, 1))
        s = h_lang * alpha

        # sum along the N axis to give batch x m
        s = T.sum(s, axis=0)
        return s, alpha

    # use with partial to pass in first args
    # scan will pass the remaining args
    def step_train(self, rnd_in, kl, h_infer, c_infer, h_gen, c_gen, c, mu_gen,
                   sigma_gen, h_lang, x, mask):
        """One training step of the recurrence, used as the ``scan`` body.

        ``rnd_in`` is the per-step noise (the scan sequence); ``kl`` through
        ``sigma_gen`` are the recurrent outputs of the previous step;
        ``h_lang``, ``x`` and ``mask`` are non-sequences.

        Returns:
            ``(kl_t, h_infer_t, c_infer_t, h_gen_t, c_gen_t, c_t, mu_gen,
            sigma_gen)`` in the order expected by ``outputs_info`` in
            ``train``.
        """
        # h_gen is a sequence
        # h_lang is a non-sequence (but it is used to calculate
        #     the align function each step)

        # eqs 10-13
        # compute "error image"
        x_hat = x - T.nnet.sigmoid(c)
        # read from both input (x) and error image
        r, _ = self.reader.run(x, h_gen)
        r_hat, _ = self.reader.run(x_hat, h_gen)
        # concatenate the two read regions
        r = T.concatenate([r, r_hat], axis=1)

        # run the infer lstm on the read regions
        val = self.infer_in.run(T.concatenate([r, h_gen], axis=1))
        h_infer_t, c_infer_t = self.infer_lstm.run(val, h_infer, c_infer)

        # I don't believe we actually need to sample from Q
        # we just use it to minimize the loss so that it learns
        # good values for the infer-lstm
        # But we do need the mean and logsigma for KL
        mu_infer = T.dot(h_infer_t, self.W_mu_infer)
        # sigma_infer is used as a log-sigma: it is exponentiated below
        sigma_infer = 0.5 * T.dot(h_infer_t, self.W_sigma_infer)
        # generate a sample from these normal distributions
        z = mu_infer + T.exp(sigma_infer) * rnd_in

        # calculate kl-divergence between infer and gen normal distributions
        # (accumulated without the 0.5 factor, which train() applies once)
        kl_t = kl + T.sum(-1 +
                          ((mu_infer - mu_gen)**2 + T.exp(2 * sigma_infer)) /
                          (T.exp(2 * sigma_gen)) - 2 * sigma_infer +
                          2 * sigma_gen)

        # do the alignment (eq 2)
        # this is m-dimensions - each word is summed into 1 vector
        # to represent the whole sequence, so N x batch x m becomes batch x m
        s, _ = self.align(h_gen, h_lang, mask)

        # run the LSTM (eq 3)
        # val is batch x m+z_dims
        val = self.gen_in.run(T.concatenate([z, s], axis=1))
        h_gen_t, c_gen_t = self.gen_lstm.run(val, h_gen, c_gen)

        # generator mean / log-sigma for the next step, squashed by tanh
        mu_gen = T.tanh(T.dot(h_gen_t, self.W_mu_gen))
        sigma_gen = T.tanh(T.dot(h_gen_t, self.W_sigma_gen))

        # do the write (eq 4)
        c_update, _ = self.writer.run(h_gen_t)
        c_t = c + c_update

        return kl_t, h_infer_t, c_infer_t, h_gen_t, c_gen_t, c_t, mu_gen, sigma_gen

    def train(self, x, y, mask):
        """Build the training graph.

        Args:
            x: target images (read by ``self.reader`` and compared with the
                final canvas).
            y: sentence input for the language model.
            mask: sentence mask passed to ``align``.

        Returns:
            ``(kl, log_recons, log_likelihood, c)`` where ``c`` holds the
            canvas of every step.
        """
        # do language model on y
        h_lang = self.language_model.run(y)

        # do train recurrence
        h_infer, c_infer = self.infer_lstm.get_initial_hidden
        h_gen, c_gen = self.gen_lstm.get_initial_hidden
        # initial canvas: a constant row repeated for every batch element
        c0 = theano.shared(self.cinit * np.ones(
            (1, self.channels * self.image_size)).astype(theano.config.floatX))
        c0 = c0.repeat(self.batch_size, axis=0)

        # one standard-normal noise tensor per step
        rnd_in = rng.normal(size=(self.steps, self.batch_size, self.z_dim),
                            avg=0.0,
                            std=1.0,
                            dtype=theano.config.floatX)

        # setup output
        outputs_info = [
            dict(initial=T.zeros(()), taps=[-1]),  # kl
            dict(initial=h_infer, taps=[-1]),  # h_infer
            dict(initial=c_infer, taps=[-1]),  # c_infer
            dict(initial=h_gen, taps=[-1]),  # h_gen
            dict(initial=c_gen, taps=[-1]),  # c_gen
            dict(initial=c0, taps=[-1]),  # c
            dict(initial=T.zeros((self.batch_size, self.z_dim)),
                 taps=[-1]),  # mu_gen
            dict(initial=T.zeros((self.batch_size, self.z_dim)), taps=[-1])
        ]  # sigma_gen

        # do scan
        [kl, h_infer, c_infer, h_gen, c_gen, c, mu_gen,
         sigma_gen], _ = theano.scan(fn=self.step_train,
                                     sequences=rnd_in,
                                     outputs_info=outputs_info,
                                     non_sequences=[h_lang, x, mask],
                                     n_steps=self.steps)

        # Get x-reconstruction-error (eq 5)
        x_recons = T.nnet.sigmoid(c[-1, :, :])
        log_recons = T.nnet.binary_crossentropy(x_recons, x).sum()

        # compute KL
        kl = 0.5 * kl[-1]

        # kl and log_recons are already full sums (scalars) at this point,
        # so the mean() calls below are not averaging over the batch.
        log_likelihood = kl + log_recons
        log_likelihood = log_likelihood.mean()
        kl = kl.mean()
        log_recons = log_recons.mean()
        return kl, log_recons, log_likelihood, c

    def generate_image(self, y, mask):
        """Build the sampling graph (no inference network, no target image).

        Returns:
            ``(image, alpha)``: the final canvas after a sigmoid, reshaped
            to (1, batch_size, channels, image_size), and the attention
            weights collected over all steps.
        """
        # do language model on y
        h_lang = self.language_model.run(y)

        # do generation recurrence
        h_gen, c_gen = self.gen_lstm.get_initial_hidden
        c0 = theano.shared(self.cinit * np.ones(
            (1, self.channels * self.image_size)).astype(theano.config.floatX))
        c0 = c0.repeat(self.batch_size, axis=0)

        rnd_in = rng.normal(size=(self.steps, self.batch_size, self.z_dim),
                            avg=0.0,
                            std=1.0,
                            dtype=theano.config.floatX)

        # setup output
        outputs_info = [
            dict(initial=h_gen, taps=[-1]),  # h_gen
            dict(initial=c_gen, taps=[-1]),  # c_gen
            dict(initial=c0, taps=[-1]),  # c
            dict(initial=T.zeros((self.batch_size, self.z_dim)),
                 taps=[-1]),  # mu_gen
            dict(initial=T.zeros((self.batch_size, self.z_dim)),
                 taps=[-1]),  # sigma_gen
            dict(initial=T.zeros((self.lang_N, self.batch_size, 1)), taps=[-1])
        ]  # alpha
        # do scan
        [h_gen, c_gen, c, mu_gen, sigma_gen,
         alpha], _ = theano.scan(fn=self.step_gen,
                                 sequences=rnd_in,
                                 outputs_info=outputs_info,
                                 non_sequences=[h_lang, mask],
                                 n_steps=self.steps)
        c = T.nnet.sigmoid(c)

        return c[-1].reshape(
            (1, self.batch_size, self.channels, self.image_size)), alpha

    def step_gen(self, rnd_in, h_gen, c_gen, c, mu_gen, sigma_gen, alpha,
                 h_lang, mask):
        """One sampling step of the recurrence, used as the ``scan`` body.

        The incoming ``alpha`` (previous step) is not read; it is replaced
        by the weights computed in this step.
        """
        # generate a sample from the generative distribution
        z = mu_gen + T.exp(sigma_gen) * rnd_in

        # do the alignment (eq 2)
        # this is m-dimensions - each word is summed into 1 vector
        # to represent the whole sequence, so N x batch x m becomes batch x m
        s, alpha = self.align(h_gen, h_lang, mask)

        # run the LSTM (eq 3)
        # val is batch x m+z_dims
        val = self.gen_in.run(T.concatenate([z, s], axis=1))
        h_gen_t, c_gen_t = self.gen_lstm.run(val, h_gen, c_gen)

        mu_gen = T.tanh(T.dot(h_gen_t, self.W_mu_gen))
        sigma_gen = T.tanh(T.dot(h_gen_t, self.W_sigma_gen))

        # do the write (eq 4)
        c_update, _ = self.writer.run(h_gen_t)
        c_t = c + c_update

        return h_gen_t, c_gen_t, c_t, mu_gen, sigma_gen, alpha

    def build_sample_function(self, y, mask):
        """Compile ``self.sample_sentences``, mapping ``[y, mask]`` to ``[c]``.

        The attention weights returned by ``generate_image`` are not part
        of the compiled outputs.
        """
        c, alpha = self.generate_image(y, mask)
        self.sample_sentences = theano.function([y, mask], [c])

    @property
    def inputs(self):
        """Symbolic inputs; ``_inputs`` must have been set elsewhere."""
        return self._inputs

    @property
    def outputs(self):
        """Symbolic outputs; ``_outputs`` must have been set elsewhere."""
        return self._outputs

    @property
    def updates(self):
        """Update rules; ``_updates`` must have been set elsewhere."""
        return self._updates
Exemple #18
0
class TemporalAttentionLayer(object):
    """Temporal read layer based on the DRAW paper.

    A small hidden layer maps a conditioning vector to three attention
    parameters (center ``g``, width ``s2`` and stride ``d``). ``N`` Gaussian
    filters placed along the time axis then reduce a variable-length
    feature sequence to ``N`` values per channel.
    """

    def __init__(self,
                 batch_size,
                 N,
                 channels,
                 name='',
                 use_gpu=True,
                 test=False,
                 input_hidden_size=4096,
                 initializers=None):
        """Store the configuration and build the parameter-predicting layer.

        Args:
            batch_size: batch size handed to the hidden layer.
            N: number of Gaussian filters (outputs per channel).
            channels: number of feature channels.
            name: suffix used in the hidden layer's name.
            use_gpu: use the CUDA batched dot product when True.
            test: when True, ``run`` reads g, s2, d directly from the
                columns of ``h`` instead of predicting them.
            input_hidden_size: size of the conditioning vector ``h``.
            initializers: three initializers (g, d, sigma). Defaults to
                three ``IsotropicGaussian(0.01)`` built per instance.
        """
        if initializers is None:
            # Built here rather than in the signature so instances never
            # share one default list.
            initializers = [
                i.IsotropicGaussian(0.01),  # g
                i.IsotropicGaussian(0.01),  # d
                i.IsotropicGaussian(0.01),  # sigma
            ]

        self.batch_size = batch_size
        self.N = N
        self.channels = channels
        self.name = name
        self.output_shape = [batch_size, channels, N]
        self.use_gpu = use_gpu
        self.initializers = initializers
        self.test = test
        self.input_hidden_size = input_hidden_size

        # one output each for g, s2 and d
        self.hidden_layer = HiddenLayer(input_size=self.input_hidden_size,
                                        hidden_size=3,
                                        batch_size=self.batch_size,
                                        activation=act.Identity,
                                        name='Attention-Transform.' +
                                        self.name)

    def load_pretrained(self, v, i):
        """No pretrained weights are consumed; return the index unchanged."""
        return i

    def batched_dot(self, A, B):
        """Batched matrix product, using the CUDA op when ``use_gpu`` is set."""
        if self.use_gpu:
            return theano.sandbox.cuda.blas.batched_dot(A, B)
        else:
            return T.batched_dot(A, B)

    def get_params(self, time, h):
        """Predict the attention parameters from ``h``.

        Args:
            time: per-sample number of valid frames.
            h: conditioning vector fed to the hidden layer.

        Returns:
            ``(g, s2, d)``: center in [0, time], Gaussian width, and stride
            between filter centers, all cast to ``floatX``.
        """
        hidden = self.hidden_layer.run(h)
        # tanh output rescaled from [-1, 1] to [0, 1], then to [0, time]
        g = time * ((T.tanh(hidden[:, 0]) + 1) * 0.5)
        g = g.astype(theano.config.floatX)
        s2 = T.exp(hidden[:, 1] / 2.0)
        s2 = s2.astype(theano.config.floatX)
        # max(...) avoids a division by zero when N == 1
        d = time / (max(self.N - 1.0, 1.0)) * T.exp(hidden[:, 2])
        d = d.astype(theano.config.floatX)

        return g, s2, d

    def get_params_test(self, time, h):
        """Test mode: read g, s2 and d from columns 0, 1 and 2 of ``h``."""
        return h[:, 0], h[:, 1], h[:, 2]

    def run(self, features, h, time_mask):
        """Apply the temporal attention filters.

        Args:
            features: batch x channels x time.
            h: conditioning vector (or explicit parameters in test mode).
            time_mask: batch x time binary mask, 1 for valid frames. It lets
                a minibatch keep one shape although clips differ in length.

        Returns:
            ``(res, (g, s2, d))`` with ``res`` reshaped to
            batch x channels x N.
        """
        channels = self.channels

        # summing the mask along axis 1 gives the length of each clip
        time = T.sum(time_mask, axis=1)

        if not self.test:
            g, s2, d = self.get_params(time, h)
        else:
            g, s2, d = self.get_params_test(time, h)
            g = g.astype(theano.config.floatX)
            s2 = s2.astype(theano.config.floatX)
            d = d.astype(theano.config.floatX)

        # I is batch*channels x time x 1
        I = features.reshape(
            (features.shape[0] * self.channels, features.shape[2], 1))

        # centers of the N filters, spaced d apart around g
        mu = g.dimshuffle([0, 'x']) + d.dimshuffle([0, 'x']) * \
            (T.arange(self.N).astype(theano.config.floatX) - self.N / 2 - 0.5)

        a = T.arange(features.shape[2]).astype(theano.config.floatX)

        # F is batch x N x time
        F = T.exp(-(a - mu.dimshuffle([0, 1, 'x']))**2 / 2. /
                  s2.dimshuffle([0, 'x', 'x'])**2)
        # zero the filter response on padded frames
        F = F * time_mask.dimshuffle([0, 'x', 1])
        # normalize each filter; 1e-4 guards against an all-zero filter
        F = F / (F.sum(axis=-1).dimshuffle([0, 1, 'x']) + 1e-4)
        # one copy of the filter bank per channel: batch*channels x N x time
        F = T.repeat(F, channels, axis=0)

        res = self.batched_dot(F, I).reshape(
            (features.shape[0], self.channels, self.N))

        return res, (g, s2, d)

    @property
    def params(self):
        """Trainable parameters (those of the internal hidden layer)."""
        return self.hidden_layer.params

    @params.setter
    def params(self, params):
        # Setting parameters is not supported; report it and change nothing.
        # The call form prints the same text under Python 2 and Python 3.
        print('Temporal Set Params not implemented')

    def print_layer(self):
        """Return a short text description of the layer.

        The previous version read ``self.width`` / ``self.height``, which
        this class never sets, and therefore raised ``AttributeError``.
        """
        v = '--------------------\n'
        v += 'Read Layer ' + self.name + '\n'
        v += 'Input Shape: ' + str((self.channels, 'time')) + '\n'
        return v + 'Output Shape: ' + str((self.channels, self.N)) + '\n'
class TemporalModel(Model):
    """Binary (one-vs-rest) classifier on top of learned temporal filters.

    Each of ``levels`` attention layers summarizes the clip; the summaries
    are concatenated, sent through a hidden layer and scored by a single
    sigmoid unit that detects class ``class_num``.
    """

    def __init__(self,
                 inputs,
                 bs,
                 max_time,
                 class_num,
                 feature_dim,
                 hidden_size,
                 levels,
                 N=1,
                 pool=None,
                 seed=12345):
        """Store the configuration and build the attention and dense layers."""
        # plain configuration
        self._inputs = inputs
        self.batch_size = bs
        self.max_time = max_time
        self.feature_dim = feature_dim
        self.filters = levels
        self.N = N
        self.pool = pool
        self.dropout = True

        # the positive class, both as a readable name and a shared scalar
        self.name = 'learned_' + str(class_num)
        self.class_num = theano.shared(class_num)

        # one attention layer per pyramid level
        self.temporal_pyramid = [
            TemporalAttentionLayer(batch_size=bs,
                                   N=N,
                                   channels=feature_dim,
                                   name='af-' + str(level))
            for level in range(self.filters)
        ]

        # every level contributes feature_dim values (mean over its N taps)
        dense_input = feature_dim * len(self.temporal_pyramid)
        self.hidden = HiddenLayer(name='hidden',
                                  input_size=dense_input,
                                  hidden_size=hidden_size,
                                  batch_size=bs,
                                  activation=act.LeakyRelu(),
                                  dropout=0.5)
        self.classify = HiddenLayer(name='classify',
                                    input_size=hidden_size,
                                    hidden_size=1,
                                    batch_size=bs,
                                    activation=act.sigmoid,
                                    dropout=0.0)

    @property
    def params(self):
        """Classifier, hidden and attention parameters, in that order."""
        attention_params = []
        for layer in self.temporal_pyramid:
            attention_params.extend(layer.params)
        return self.classify.params + self.hidden.params + attention_params

    @property
    def inputs(self):
        """Symbolic inputs given to the constructor."""
        return self._inputs

    @property
    def outputs(self):
        """Symbolic outputs; ``_outputs`` must have been set elsewhere."""
        return self._outputs

    @property
    def updates(self):
        """Update rules; ``_updates`` must have been set elsewhere."""
        return self._updates

    def run(self, x, mask, y):
        """Build the forward pass, the weighted loss and confusion counts.

        Returns:
            ``(prob, loss, (true_pos, true_neg, false_pos, false_neg))``.
        """
        # attention layers expect batch x features x time
        frames = x.transpose([0, 2, 1])

        summaries = []
        for layer in self.temporal_pyramid:
            attended, (_g, _s2, _d) = layer.run(frames, mask)
            attended = attended.reshape(
                (frames.shape[0], self.feature_dim, self.N))
            # average the N taps: one value per feature for this sub-event
            summaries.append(T.mean(attended, axis=2))

        # batch x (features * number of filters)
        x = T.concatenate(summaries, axis=1)

        # optional extra reduction over axis 1
        if self.pool == 'max':
            x = T.max(x, axis=1)
        elif self.pool == 'sum':
            x = T.sum(x, axis=1)
        elif self.pool == 'mean':
            x = T.mean(x, axis=1)

        x = self.hidden.run(x, self.dropout)
        prob = self.classify.run(x, False)

        # binarize the labels: 1 where the label equals class_num, else 0
        batch = y.shape[0]
        wanted = self.class_num.repeat(batch).reshape((batch, 1))
        y = T.switch(T.eq(wanted, y), 1, 0)
        preds = T.switch(T.gt(prob, 0.5), 1, 0)

        # confusion-matrix counts
        true_pos = (T.eq(y, 1) * T.eq(preds, 1)).sum()
        true_neg = (T.neq(y, 1) * T.neq(preds, 1)).sum()
        false_pos = (T.neq(y, 1) * T.eq(preds, 1)).sum()
        false_neg = (T.eq(y, 1) * T.neq(preds, 1)).sum()

        # cross-entropy with negatives down-weighted to 0.02
        per_sample = T.nnet.binary_crossentropy(prob, y)
        per_sample = T.switch(T.eq(y, 1), per_sample, 0.02 * per_sample)
        loss = per_sample.mean()

        return prob, loss, (true_pos, true_neg, false_pos, false_neg)
class ReadLayer(object):
    """Read layer from the DRAW paper.

    A small hidden layer maps a conditioning vector to the parameters of an
    N x N grid of Gaussian filters, which extract a fixed-size patch from
    each image of the batch.
    """

    def __init__(self,
                 batch_size,
                 N,
                 channels,
                 image_width,
                 image_height,
                 input_hidden_size,
                 use_dx_dy=False,
                 name='',
                 test=False,
                 use_gpu=True,
                 device='gpu',
                 use_gamma=True):
        """Store the configuration and build the parameter-predicting layer.

        Args:
            batch_size: batch size; ``run`` reshapes with this fixed value.
            N: side length of the filter grid.
            channels: number of image channels.
            image_width: image width in pixels.
            image_height: image height in pixels.
            input_hidden_size: size of the conditioning vector ``h``.
            use_dx_dy: predict separate x and y strides when True.
            name: suffix used in the hidden layer's name.
            test: when True, ``run`` reads the parameters directly from the
                columns of ``h`` instead of predicting them.
            use_gpu: use the CUDA batched dot product when True.
            device: forwarded to the hidden layer.
            use_gamma: scale the patch by a learned intensity when True.
        """
        self.batch_size = batch_size
        self.use_dx_dy = use_dx_dy
        self.N = N
        self.channels = channels
        self.width = image_width
        self.height = image_height
        self.name = name
        self.input_hidden_size = input_hidden_size
        self.test = test
        self.output_shape = [batch_size, channels, N, N]
        self.use_gpu = use_gpu
        self.use_gamma = use_gamma
        self.device = device

        self.init_params()

    def load_pretrained(self, v, i):
        """No pretrained weights are consumed; return the index unchanged."""
        return i

    def init_params(self):
        """Create the hidden layer that predicts the attention parameters.

        It has 5 outputs (gx, gy, stride, sigma, gamma), plus a sixth for
        the separate y stride when ``use_dx_dy`` is set.
        """
        self.transform_hidden = HiddenLayer(input_size=self.input_hidden_size,
                                            hidden_size=5 + self.use_dx_dy,
                                            batch_size=self.batch_size,
                                            activation=act.Identity,
                                            device=self.device,
                                            name='Read.Transform.' + self.name)

    def batched_dot(self, A, B):
        """Batched matrix product, using the CUDA op when ``use_gpu`` is set."""
        if self.use_gpu:
            return theano.sandbox.cuda.blas.batched_dot(A, B)
        else:
            return T.batched_dot(A, B)

    def get_params(self, h):
        """Predict the attention parameters from ``h``.

        Returns:
            ``(gx, gy, dx, dy, s2, g)``: grid center, strides, Gaussian
            width and intensity (a column vector).
        """
        hidden = self.transform_hidden.run(h)

        gx = (hidden[:, 0] + 1) * 0.5 * self.width
        gy = (hidden[:, 1] + 1) * 0.5 * self.height
        s2 = T.exp(hidden[:, 3] / 2.0)

        g = T.exp(hidden[:, 4]).dimshuffle(0, 'x')
        if not self.use_gamma:
            # gamma disabled: x / x is a column of ones
            g = g / g

        # A 1 x 1 grid has no spacing; clamp the denominator so N == 1 does
        # not raise ZeroDivisionError (same guard as the temporal layer).
        grid_span = max(self.N - 1.0, 1.0)
        if self.use_dx_dy:
            dx = (self.width - 1.0) / grid_span * T.exp(hidden[:, 2])
            dy = (self.height - 1.0) / grid_span * T.exp(hidden[:, 5])
        else:
            dx = dy = ((max(self.width, self.height) - 1.0) / grid_span *
                       T.exp(hidden[:, 2]))
        return gx, gy, dx, dy, s2, g

    def get_params_test(self, h):
        """Test mode: read the parameters straight from the columns of ``h``.

        Column order: gx=0, gy=1, dx=2, dy=5, s2=3, g=4. Column 5 is read
        regardless of ``use_dx_dy``.
        """
        return (h[:, 0], h[:, 1], h[:, 2], h[:, 5], h[:, 3],
                h[:, 4].dimshuffle(0, 'x'))

    def run(self, images, h):
        """Extract an N x N patch per channel from every image.

        Args:
            images: reshaped here to (batch * channels, height, width).
            h: conditioning vector (or explicit parameters in test mode).

        Returns:
            ``(patch, (gx, gy, dx, dy, fint))`` where ``patch`` is
            batch x (channels * N * N), scaled by ``g``, and ``fint`` is the
            intermediate result after the vertical filters.
        """
        channels = self.channels
        if not self.test:
            gx, gy, dx, dy, s2, g = self.get_params(h)
        else:
            gx, gy, dx, dy, s2, g = self.get_params_test(h)

        # how to handle variable sized input images? (mask??)
        I = images.reshape(
            (self.batch_size * self.channels, self.height, self.width))

        # centers of the N filters along each axis
        muX = gx.dimshuffle([0, 'x']) + dx.dimshuffle([0, 'x']) * (
            T.arange(self.N).astype(theano.config.floatX) - self.N / 2 - 0.5)
        muY = gy.dimshuffle([0, 'x']) + dy.dimshuffle([0, 'x']) * (
            T.arange(self.N).astype(theano.config.floatX) - self.N / 2 - 0.5)

        a = T.arange(self.width).astype(theano.config.floatX)
        b = T.arange(self.height).astype(theano.config.floatX)

        # Gaussian filter banks: batch x N x width and batch x N x height
        Fx = T.exp(-(a - muX.dimshuffle([0, 1, 'x']))**2 / 2. /
                   s2.dimshuffle([0, 'x', 'x'])**2)
        Fy = T.exp(-(b - muY.dimshuffle([0, 1, 'x']))**2 / 2. /
                   s2.dimshuffle([0, 'x', 'x'])**2)

        # normalize each filter; 1e-4 guards against an all-zero filter
        Fx = Fx / (Fx.sum(axis=-1).dimshuffle([0, 1, 'x']) + 1e-4)
        Fy = Fy / (Fy.sum(axis=-1).dimshuffle([0, 1, 'x']) + 1e-4)

        # one copy of each bank per channel
        self.Fx = T.repeat(Fx, channels, axis=0)
        self.Fy = T.repeat(Fy, channels, axis=0)

        # Fy . I . Fx^T, kept on self for later inspection
        self.fint = self.batched_dot(self.Fy, I)
        self.fim = self.batched_dot(self.fint,
                                    self.Fx.transpose([0, 2, 1])).reshape(
                                        (self.batch_size,
                                         self.channels * self.N * self.N))
        return g * self.fim, (gx, gy, dx, dy, self.fint)

    @property
    def params(self):
        """Trainable parameters (a new list of the hidden layer's params)."""
        return [param for param in self.transform_hidden.params]

    @params.setter
    def params(self, params):
        self.transform_hidden.params = params

    def print_layer(self):
        """Return a short text description of the layer."""
        v = '--------------------\n'
        v += 'Read Layer ' + self.name + '\n'
        v += 'Input Shape: ' + str((self.width, self.height)) + '\n'
        return v + 'Output Shape: ' + str((self.N, self.N)) + '\n'
class TemporalModel(Model):
    """One-vs-rest baseline: pool the input over axis 1, then classify.

    The input is reduced along axis 1 (the frames, per the comments in
    ``run``) with max, sum or mean pooling, fed through one hidden layer and
    scored by a single sigmoid unit that answers "is the label equal to
    ``class_num``?".
    """

    def __init__(self,
                 inputs,
                 bs,
                 max_time,
                 class_num,
                 feature_dim,
                 hidden_size,
                 method='max',
                 seed=12345):
        """Store the configuration and build the hidden and classify layers.

        Args:
            inputs: object returned unchanged by the ``inputs`` property.
            bs: batch size handed to both layers.
            max_time: stored as ``self.max_time``; not read in this class.
            class_num: label treated as the positive class in ``run``.
            feature_dim: input size of the hidden layer.
            hidden_size: output size of the hidden layer.
            method: 'max', 'sum' or 'mean'; selects the pooling in ``run``.
            seed: accepted but not used in this class.
        """
        self._inputs = inputs
        self.name = 'baseline_' + str(class_num)
        self.method = method
        self.dropout = True
        self.batch_size = bs
        self.max_time = max_time
        self.feature_dim = feature_dim
        # kept as a shared variable so run() can tile it to the label shape
        self.class_num = theano.shared(class_num)

        # the hidden layer is built before the classifier, as in the original
        self.hidden = HiddenLayer(name='hidden',
                                  input_size=feature_dim,
                                  hidden_size=hidden_size,
                                  batch_size=bs,
                                  activation=act.LeakyRelu(),
                                  dropout=0.5)

        # a single sigmoid unit: probability of the positive class
        self.classify = HiddenLayer(name='classify',
                                    input_size=hidden_size,
                                    hidden_size=1,
                                    batch_size=bs,
                                    activation=act.sigmoid,
                                    dropout=0.0)

    @property
    def params(self):
        """Parameters of the classifier followed by those of the hidden layer."""
        return self.classify.params + self.hidden.params

    @property
    def inputs(self):
        """The ``inputs`` object given to the constructor."""
        return self._inputs

    @property
    def outputs(self):
        """``self._outputs``; not assigned in this class, so set elsewhere."""
        return self._outputs

    @property
    def updates(self):
        """``self._updates``; not assigned in this class, so set elsewhere."""
        return self._updates

    def run(self, x, mask, y):
        """Build the graph for probability, loss and confusion counts.

        Args:
            x: features, pooled over axis 1.
            mask: per-frame mask; frames with mask 0 are pushed down by 100
                before max pooling, and its row sums divide the mean.
            y: labels, compared against ``class_num`` after tiling it to
                ``(y.shape[0], 1)``.

        Returns:
            ``(prob, loss, (true_pos, true_neg, false_pos, false_neg))``.
        """
        pooling = self.method
        if pooling == 'max':
            # shift masked frames down so padded zeros cannot win the max
            # when every real feature value is negative
            offset = -100 * (1 - mask)
            x = T.max(x + offset.dimshuffle([0, 1, 'x']), axis=1)
        elif pooling in ('sum', 'mean'):
            x = T.sum(x, axis=1)
            if pooling == 'mean':
                # divide by the number of valid frames
                valid_frames = T.sum(mask, axis=1)
                x = x / valid_frames.dimshuffle([0, 'x'])

        features = self.hidden.run(x.astype(theano.config.floatX),
                                   self.dropout)
        # the classifier never applies dropout
        prob = self.classify.run(features, False)

        # binarise the labels: 1 where the label equals class_num, else 0
        rows = y.shape[0]
        target_class = self.class_num.repeat(rows).reshape((rows, 1))
        y = T.switch(T.eq(target_class, y), 1, 0)
        preds = T.switch(T.gt(prob, 0.5), 1, 0)

        is_pos, is_neg = T.eq(y, 1), T.neq(y, 1)
        said_pos, said_neg = T.eq(preds, 1), T.neq(preds, 1)
        true_pos = (is_pos * said_pos).sum()
        true_neg = (is_neg * said_neg).sum()
        false_pos = (is_neg * said_pos).sum()
        false_neg = (is_pos * said_neg).sum()

        # negatives contribute only 2% of their cross-entropy to the loss
        per_sample = T.nnet.binary_crossentropy(prob, y)
        loss = T.switch(is_pos, per_sample, 0.02 * per_sample).mean()

        return prob, loss, (true_pos, true_neg, false_pos, false_neg)
Exemple #22
0
class TemporalModel(Model):
    """Classifier on top of a fixed pyramid of temporal attention filters.

    Level ``l`` of the pyramid holds ``2**l`` ``TemporalAttentionLayer``
    filters whose ``d``, ``g`` and ``sigma`` attributes are overwritten with
    constant shared values. The filter outputs are either concatenated
    (``pool is None``) or pooled across filters, then passed through a hidden
    layer and a softmax layer.
    """

    def __init__(self, inputs, bs, max_time, classes, feature_dim, hidden_size, levels, N=1, pool=None, seed=12345):
        """Store the configuration and build the pyramid and the two layers.

        Args:
            inputs: object returned unchanged by the ``inputs`` property.
            bs: batch size handed to every layer.
            max_time: stored as ``self.max_time``; not read in this class.
            classes: number of classes passed to the softmax layer.
            feature_dim: channel count of each temporal filter.
            hidden_size: output size of the hidden layer.
            levels: number of pyramid levels.
            N: passed to each temporal filter as ``N``.
            pool: ``None`` to concatenate the filter outputs, or 'max', 'sum'
                or 'mean' to pool across filters.
            seed: accepted but not used in this class.
        """
        self._inputs = inputs
        self.N = N
        self.batch_size = bs
        self.classes = classes
        self.max_time = max_time
        self.levels = levels
        self.feature_dim = feature_dim
        self.pool = pool
        self.dropout = True

        # create a pyramid of filters
        self.temporal_pyramid = []
        for level in range(self.levels):
            for f in range(2**level):
                tf = TemporalAttentionLayer(batch_size=bs, N=N, channels=feature_dim,
                                            name='temporal-attention-layer-'+str(level)+'-filter-'+str(f))
                # NOTE(review): presumably switches the layer to the fixed
                # d/g/sigma values assigned below -- confirm in the layer.
                tf.test = True
                tf.d = theano.shared(value=np.asarray([1./2**(level+1)]).astype('float32'), name='d', borrow=True,
                                     broadcastable=[True])
                tf.g = theano.shared(value=np.asarray([((1./2**level)+(2*f/2.**level))]).astype('float32'), name='g', borrow=True,
                                     broadcastable=[True])
                tf.sigma = theano.shared(value=np.asarray([5.0]).astype('float32'), name='sigma', borrow=True,
                                         broadcastable=[True])
                self.temporal_pyramid.append(tf)

        # pooling collapses the filter axis, so only one filter's worth of
        # features reaches the hidden layer in that case
        input_size = feature_dim*N*(len(self.temporal_pyramid) if pool is None else 1)
        self.hidden = HiddenLayer(input_size=input_size, hidden_size=hidden_size, activation=act.LeakyRelu(),
                                  batch_size=bs, name='hidden', dropout=0.5)
        self.softmax = SoftmaxLayer(input_size=hidden_size, classes=self.classes,
                                    batch_size=bs, name='softmax', dropout=0.5)

    @property
    def params(self):
        """Softmax parameters followed by hidden-layer parameters.

        The pyramid filters are not included in this list.
        """
        return self.softmax.params+self.hidden.params

    @property
    def inputs(self):
        """The ``inputs`` object given to the constructor."""
        return self._inputs

    @property
    def outputs(self):
        """``self._outputs``; not assigned in this class, so set elsewhere."""
        return self._outputs

    @property
    def updates(self):
        """``self._updates``; not assigned in this class, so set elsewhere."""
        return self._updates

    @property
    def test_algorithm(self):
        """Compiled function that evaluates ``run`` with dropout disabled.

        The function is built on first access and cached in
        ``self._talgorithm``. ``self.dropout`` is restored to its previous
        value afterwards, even when building the graph or compiling raises.
        """
        if not hasattr(self, '_talgorithm'):
            saved_dropout = self.dropout
            self.dropout = False
            try:
                o = self.run(*self.inputs)
                # give the test outputs the same names as the model outputs
                for i, ot in enumerate(self.outputs):
                    o[i].name = ot.name
                self._talgorithm = theano.function(inputs=self.inputs,
                                                   outputs=o, on_unused_input='warn')
            finally:
                # never leave the model stuck in no-dropout mode
                self.dropout = saved_dropout
        return self._talgorithm

    def run(self, x, mask, y):
        """Build the graph from the input to predictions and metrics.

        Args:
            x: input transposed with ``[0, 2, 1]`` before filtering; the
                original comment describes the result as
                batch x features x time.
            mask: passed through to every temporal filter.
            y: targets handed to the softmax loss and error.

        Returns:
            ``(prob, pred, loss, error, acc)`` with ``acc = 1 - error``.
        """
        # use temporal filters
        results = []
        # make x to be batch x features x time
        x = x.transpose([0,2,1])
        for tf in self.temporal_pyramid:
            # results is batch x features x N
            # flatten to batch x features*N
            res, _ = tf.run(x, mask)
            if self.pool is None:
                results.append(res.reshape((x.shape[0], self.feature_dim*self.N)))
            else:
                # keep a separate filter axis so it can be pooled over below
                results.append(res.reshape((x.shape[0], 1, self.feature_dim*self.N)))
        # concatenate on axis 1 to get batch x features*N*filters
        # (or batch x filters x features*N when pooling)
        x = T.concatenate(results, axis=1)

        if self.pool == 'max':
            x = T.max(x, axis=1)
        elif self.pool == 'sum':
            x = T.sum(x, axis=1)
        elif self.pool == 'mean':
            x = T.mean(x, axis=1)

        x = self.hidden.run(x, self.dropout)
        prob, pred = self.softmax.run(x, self.dropout)
        loss = self.softmax.loss(prob, y)
        error = self.softmax.error(pred, y)
        acc = 1-error

        return prob, pred, loss, error, acc
Exemple #23
0
class TemporalModel(Model):
    """Recurrent classifier driven by learned temporal attention filters.

    At each of ``steps`` iterations every ``TemporalAttentionLayer`` reads
    the input conditioned on the current LSTM hidden state; the filter
    outputs are concatenated, projected by ``lstm_in`` and fed to the LSTM.
    The final hidden state goes through a hidden layer and a softmax layer.
    """

    def __init__(self,
                 inputs,
                 bs,
                 max_time,
                 classes,
                 feature_dim,
                 hidden_size,
                 filters,
                 N=1,
                 pool=None,
                 lstm_dim=4096,
                 steps=8,
                 seed=12345):
        """Store the configuration and build all layers.

        Args:
            inputs: object returned unchanged by the ``inputs`` property.
            bs: batch size handed to the layers.
            max_time: stored as ``self.max_time``; not read in this class.
            classes: number of classes passed to the softmax layer.
            feature_dim: channel count of each temporal filter.
            hidden_size: output size of the hidden layer.
            filters: number of temporal attention filters to create.
            N: passed to each temporal filter as ``N``.
            pool: ``None`` to keep all ``N`` outputs per filter, or 'max',
                'sum' or 'mean' to pool each filter's output over axis 2.
            lstm_dim: LSTM size; also the filters' ``input_hidden_size``.
            steps: number of scan iterations in ``run``.
            seed: accepted but not used in this class.
        """
        self._inputs = inputs
        self.N = N
        self.batch_size = bs
        self.classes = classes
        self.max_time = max_time
        self.filters = filters
        self.feature_dim = feature_dim
        self.pool = pool
        self.dropout = True
        self.steps = steps

        self.temporal_filters = []
        for f in range(filters):
            tf = TemporalAttentionLayer(
                batch_size=bs,
                N=N,
                channels=feature_dim,
                input_hidden_size=lstm_dim,
                name='temporal-attention-layer-filter-' + str(f))
            self.temporal_filters.append(tf)

        # pooling collapses the N outputs of each filter into one
        input_size = feature_dim * len(
            self.temporal_filters) * (N if pool is None else 1)

        self.lstm_in = HiddenLayer(input_size=input_size,
                                   hidden_size=lstm_dim * 4,
                                   batch_size=bs)
        self.lstm = LSTMLayer(input_size=lstm_dim, hidden_size=lstm_dim)

        self.hidden = HiddenLayer(input_size=lstm_dim,
                                  hidden_size=hidden_size,
                                  activation=act.relu,
                                  batch_size=bs,
                                  name='hidden',
                                  dropout=0.5)
        self.softmax = SoftmaxLayer(input_size=hidden_size,
                                    classes=self.classes,
                                    batch_size=bs,
                                    name='softmax',
                                    dropout=0.5)

    @property
    def params(self):
        """Parameters of softmax, hidden, lstm_in, lstm and every filter."""
        return self.softmax.params + self.hidden.params + self.lstm_in.params + self.lstm.params + [
            p for f in self.temporal_filters for p in f.params
        ]

    @property
    def inputs(self):
        """The ``inputs`` object given to the constructor."""
        return self._inputs

    @property
    def outputs(self):
        """``self._outputs``; not assigned in this class, so set elsewhere."""
        return self._outputs

    @property
    def updates(self):
        """``self._updates``; not assigned in this class, so set elsewhere."""
        return self._updates

    @property
    def test_algorithm(self):
        """Compiled function that evaluates ``run`` with dropout disabled.

        The function is built on first access and cached in
        ``self._talgorithm``. ``self.dropout`` is restored to its previous
        value afterwards, even when building the graph or compiling raises.
        """
        if not hasattr(self, '_talgorithm'):
            saved_dropout = self.dropout
            self.dropout = False
            try:
                o = self.run(*self.inputs)
                # give the test outputs the same names as the model outputs
                for i, ot in enumerate(self.outputs):
                    o[i].name = ot.name
                self._talgorithm = theano.function(inputs=self.inputs,
                                                   outputs=o,
                                                   on_unused_input='warn')
            finally:
                # never leave the model stuck in no-dropout mode
                self.dropout = saved_dropout
        return self._talgorithm

    def run(self, x, mask, y):
        """Build the graph from the input to predictions and metrics.

        Args:
            x: input transposed with ``[0, 2, 1]`` before the scan; the
                original comment describes the result as
                batch x features x time.
            mask: passed to every temporal filter at every step.
            y: targets handed to the softmax loss and error.

        Returns:
            ``(prob, pred, loss, error, acc)`` with ``acc = 1 - error``.
        """
        # use temporal filters

        # make x to be batch x features x time
        x = x.transpose([0, 2, 1])

        h, c = self.lstm.get_initial_hidden(x)

        outputs_info = [
            dict(initial=h, taps=[-1]),  # h
            dict(initial=c, taps=[-1])
        ]  # c

        # x and mask are constant across steps; only (h, c) recur
        [h, c], _ = theano.scan(fn=self.step,
                                non_sequences=[x, mask],
                                outputs_info=outputs_info,
                                n_steps=self.steps)

        # classify from the hidden state of the last step
        x = self.hidden.run(h[-1], self.dropout)
        prob, pred = self.softmax.run(x, self.dropout)
        loss = self.softmax.loss(prob, y)
        error = self.softmax.error(pred, y)
        acc = 1 - error

        return prob, pred, loss, error, acc

    def step(self, h, c, x, mask):
        """One scan iteration: attend to ``x`` given ``h``, update the LSTM.

        Args:
            h: previous LSTM hidden state.
            c: previous LSTM cell state.
            x: the transposed input (a scan non-sequence).
            mask: the input mask (a scan non-sequence).

        Returns:
            The new ``(h, c)`` pair.
        """
        results = []
        for tf in self.temporal_filters:
            # results is batch x features x N
            # flatten to batch x features*N
            res, _ = tf.run(x, h, mask)
            if self.pool is None:
                results.append(
                    res.reshape((x.shape[0], self.feature_dim * self.N)))
            elif self.pool == 'max':
                results.append(
                    T.max(res, axis=2).reshape((x.shape[0], self.feature_dim)))
            elif self.pool == 'sum':
                results.append(
                    T.sum(res, axis=2).reshape((x.shape[0], self.feature_dim)))
            elif self.pool == 'mean':
                results.append(
                    T.mean(res, axis=2).reshape(
                        (x.shape[0], self.feature_dim)))

        # concatenate on axis 1 to get batch x features*N*filters
        # (batch x features*filters when pooling)
        x = T.concatenate(results, axis=1)
        x = self.lstm_in.run(x)
        h, c = self.lstm.run(x, h, c)

        return h, c
    def __init__(self,
                 bs,
                 K,
                 lang_N,
                 steps,
                 read_size,
                 write_size,
                 m,
                 gen_dim,
                 infer_dim,
                 z_dim,
                 l,
                 seed=12345,
                 channels=1,
                 image_size=60 * 60,
                 cinit=0):
        """Store the configuration and build the sub-layers and weights.

        Args:
            bs: batch size handed to every layer.
            K: vocabulary size (passed to the language model).
            lang_N: (max) length of the sentence encoding.
            steps: stored as ``self.steps``; presumably the number of times
                the model is run (the original note called this N).
            read_size: ``N`` of the read layer.
            write_size: ``N`` of the write layer.
            m: size of the language representation.
            gen_dim: hidden size of the generation LSTM.
            infer_dim: hidden size of the inference LSTM.
            z_dim: second dimension of the mu/sigma weight matrices.
            l: number of dimensions in the align function.
            seed: seed for the ``RandomStreams`` instance.
            channels: number of image channels.
            image_size: width*height of the image (assumed square).
            cinit: stored as ``self.cinit``.
        """
        self.use_gpu = True
        self.batch_size = bs
        self.lang_N = lang_N
        self.steps = steps
        self.read_size = read_size
        self.write_size = write_size
        self.m = m
        self.gen_dim = gen_dim
        self.infer_dim = infer_dim
        self.z_dim = z_dim
        self.l = l
        self.channels = channels
        self.image_size = image_size
        self.cinit = cinit

        self.language_model = LanguageModel(bs, K, lang_N, m)

        # generation path: input projection followed by an LSTM
        self.gen_in = HiddenLayer(name='gen-lstm-in',
                                  input_size=m + z_dim,
                                  hidden_size=gen_dim * 4,
                                  batch_size=bs)
        self.gen_lstm = LSTMLayer(name='gen-lstm',
                                  hidden_size=gen_dim,
                                  activation=T.tanh,
                                  batch_size=bs,
                                  dropout=0.0)

        # inference path: input projection followed by an LSTM
        self.infer_in = HiddenLayer(
            name='infer-lstm-in',
            input_size=2 * channels * self.read_size**2 + self.gen_dim,
            hidden_size=infer_dim * 4,
            batch_size=bs)
        self.infer_lstm = LSTMLayer(name='infer-lstm',
                                    hidden_size=infer_dim,
                                    activation=T.tanh,
                                    batch_size=bs,
                                    dropout=0.0)

        # the image is assumed square, so one side length serves both axes
        side = int(np.sqrt(self.image_size))
        self.reader = ReadLayer(name='Read',
                                batch_size=self.batch_size,
                                N=self.read_size,
                                channels=channels,
                                image_width=side,
                                image_height=side,
                                input_hidden_size=gen_dim)
        self.writer = WriteLayer(name='Write',
                                 batch_size=self.batch_size,
                                 N=self.write_size,
                                 channels=channels,
                                 image_width=side,
                                 image_height=side,
                                 input_hidden_size=gen_dim)
        self.random = RandomStreams(seed)

        # create W_mu, W_sigma, v, U, W, b
        init = IsotropicGaussian(0.01)

        def gaussian_shared(name, shape):
            # draw from np_rng and wrap the array in a named shared variable
            return theano.shared(value=init.init(np_rng, shape),
                                 name=name,
                                 borrow=True)

        # the draws below happen in a fixed order because np_rng is shared
        self.U = gaussian_shared('U', (self.m, self.l))
        self.v = gaussian_shared('v', (self.l, ))
        self.W = gaussian_shared('W', (self.gen_dim, self.l))
        self.b = gaussian_shared('b', (self.l, ))

        self.W_mu_infer = gaussian_shared('W_mu_infer',
                                          (self.infer_dim, self.z_dim))
        self.W_sigma_infer = gaussian_shared('W_sigma_infer',
                                             (self.infer_dim, self.z_dim))

        self.W_mu_gen = gaussian_shared('W_mu_gen',
                                        (self.gen_dim, self.z_dim))
        self.W_sigma_gen = gaussian_shared('W_sigma_gen',
                                           (self.gen_dim, self.z_dim))
Exemple #25
0
class LanguageModel(Model):
    """Bidirectional LSTM encoder over a sequence of token ids.

    A forward and a backward LSTM, each of size ``m//2``, are run over the
    sequence; their hidden states are aligned and concatenated into an
    ``m``-dimensional state per position. Token id 0 is treated as padding.
    """

    def __init__(self, bs, K, N, m):
        """Build the forward and backward input projections and LSTMs.

        Args:
            bs: batch size.
            K: vocabulary size (input size of both projections).
            N: sequence length (number of scan steps).
            m: size of the concatenated hidden state; each direction
                gets ``m//2``.
        """
        # builds a bidirectional LSTM to create
        # a m-dimensional hidden state for the given
        # sequence of length N with vocab size K
        self.K = K
        self.N = N
        self.m = m
        self.bs = bs
        self.forward_in = HiddenLayer(input_size=K, hidden_size=m*4//2,
                                      batch_size=bs, name='forward-lstm-in')
        self.forward_lstm = LSTMLayer(hidden_size=m//2,
                                      activation=T.tanh,
                                      batch_size=bs,
                                      dropout=0.0,
                                      name='forward-lstm')

        self.backward_in = HiddenLayer(input_size=K, hidden_size=m*4//2,
                                       batch_size=bs, name='backward-lstm-in')
        self.backward_lstm = LSTMLayer(hidden_size=m//2,
                                       activation=T.tanh,
                                       batch_size=bs,
                                       dropout=0.0,
                                       name='backward-lstm')

    def run(self, y):
        """Encode ``y`` and return the aligned bidirectional hidden states.

        Args:
            y: integer token ids, batch x total_seq.

        Returns:
            ``h_lang`` with axis 0 -> N, axis 1 -> batch, axis 2 -> m.
        """
        # y comes in as shape batch X total_seq
        y = y.transpose([1,0])
        # y is of shape seq X batch and of type 'int'
        # y needs to be 1-hot encoded, but this is more
        # easily done in the step function

        # reverse each example of y (not the batches, just the variables)
        y_rev = y[::-1, :]

        # get initial values for LSTMs
        # NOTE(review): get_initial_hidden is read as an attribute here, not
        # called -- presumably a property on this LSTMLayer; confirm.
        hf, cf = self.forward_lstm.get_initial_hidden
        hb, cb = self.backward_lstm.get_initial_hidden

        # setup initial values for scan
        outputs_info = [dict(initial=hf, taps=[-1]), # hf
                        dict(initial=cf, taps=[-1]), # cf
                        dict(initial=hb, taps=[-1]), # hb
                        dict(initial=cb, taps=[-1])] # cb

        # run LSTM loop
        [hf,cf,hb,cb], _ = theano.scan(fn=self.step,
                                       sequences=[y,y_rev],
                                       outputs_info=outputs_info,
                                       n_steps=self.N)

        # return forward and backward concatenated
        # this needs to be aligned so that [4,13,45,3,X, X, X]
        # and                              [0,0, 0, 3,45,13,4]
        # concatenate correctly to         [4/3,13/45,45/13,3/4,X,X,X]

        # stores the indices of the string
        b_indx = zeros((self.N, self.bs), int)
        # stores the last-set index
        c = zeros((self.bs,), int)
        # This loop creates an array that can be used to
        # map hb to hf with the proper alignment
        for i in range(self.N):
            # if this part of y_rev is 0, ignore
            # else, get the current index
            indx = T.switch(T.neq(y_rev[i,:], 0), i, 0)
            # set b_indx to be the current indx if this is
            # a valid part of the string
            b_indx = T.set_subtensor(b_indx[c,T.arange(self.bs)], indx)

            # increment those that were used
            inc = T.switch(T.neq(y_rev[i,:], 0), 1, 0)
            c  = c + inc

        # the magic that gets hb to align with hf
        # it takes hb, uses the aligning indices and grabs those on the
        # diagonal as the elements we are interested in. This results in
        # essentially "shifting" the first non-zero element of hb
        # to the front of the list, for each sample in the batch
        h_b_aligned = hb[b_indx][:,T.arange(self.bs),T.arange(self.bs)]
        # concatenate them together. Now everything is aligned, as it should be!
        h_lang = T.concatenate([hf, h_b_aligned], axis=2)

        # axis 0 -> N
        # axis 1 -> batch
        # axis 2 -> m
        return h_lang

    def step(self, y_m, yb_m, hf, cf, hb, cb):
        """One scan step: advance both LSTMs, skipping padded positions.

        Args:
            y_m: token ids of the forward sequence at this step.
            yb_m: token ids of the reversed sequence at this step.
            hf, cf: previous forward hidden and cell state.
            hb, cb: previous backward hidden and cell state.

        Returns:
            ``(hf, cf, hb, cb)``; a state is only updated for batch entries
            whose token id at this step is non-zero.
        """
        # y_m/yb_m are expected to hold one id per batch entry; they are
        # reshaped to (bs, 1) below for masking
        # one-hot encode y,yb (NEED TO SAVE PREVIOUS VALUES FOR MASKING!!!)
        y = to_one_hot(y_m, self.bs, self.K)
        yb = to_one_hot(yb_m, self.bs, self.K)

        # get forward and backward inputs values
        y_f_in = self.forward_in.run(y)
        y_b_in = self.backward_in.run(yb)

        # run forward and backward LSTMs
        hf_t,cf_t = self.forward_lstm.run(y_f_in, hf, cf)
        hb_t,cb_t = self.backward_lstm.run(y_b_in, hb, cb)

        # but only if y/yb is not 0 (apply mask)
        # the (bs, 1) mask is left to broadcast over the state dimension
        mask_y = y_m.reshape((self.bs, 1))
        mask_yb = yb_m.reshape((self.bs, 1))
        hf = T.switch(T.neq(mask_y, 0), hf_t, hf)
        cf = T.switch(T.neq(mask_y, 0), cf_t, cf)
        # and backward
        hb = T.switch(T.neq(mask_yb, 0), hb_t, hb)
        cb = T.switch(T.neq(mask_yb, 0), cb_t, cb)

        # return the new values
        return hf,cf,hb,cb

    @property
    def params(self):
        """Parameters of forward_in, forward_lstm, backward_in, backward_lstm."""
        return self.forward_in.params+self.forward_lstm.params+self.backward_in.params+\
            self.backward_lstm.params