Example #1
0
 def __init__(self, size_vocab, size_embed, size, size_out, depth,
              depth_spec=1,
              visual_encoder=StackedGRUH0,
              gru_activation=clipped_rectify,
              visual_activation=linear,
              dropout_prob=0.0):
     """Create the embedding, shared encoder, visual head and text decoder.

     Every argument is exposed as an attribute of the same name by
     ``autoassign(locals())``; the sub-layers below read them from ``self``.

     size_vocab, size_embed -- sizes handed to ``Embedding``.
     size -- hidden size of every recurrent layer; also the input size of
         ``Visual``, ``LM`` and ``ToTxt``.
     size_out -- output size handed to ``Visual``.
     depth -- depth of the shared ``StackedGRUH0`` stack.
     depth_spec -- depth of the task-specific ``Visual`` and ``LM`` stacks.
     visual_encoder -- passed to ``Visual`` as ``encoder``.
     gru_activation, visual_activation, dropout_prob -- forwarded to the
         sub-layers unchanged.
     """
     # Keep this first: any local bound before it would also be copied.
     autoassign(locals())
     recurrent_opts = dict(activation=self.gru_activation,
                           dropout_prob=self.dropout_prob)
     self.Embed = Embedding(self.size_vocab, self.size_embed)
     # Shared stack consumes embeddings and feeds both task-specific parts.
     self.Shared = StackedGRUH0(self.size_embed, self.size, self.depth,
                                **recurrent_opts)
     self.Visual = Visual(self.size, self.size, self.size_out, self.depth_spec,
                          encoder=self.visual_encoder,
                          gru_activation=self.gru_activation,
                          visual_activation=self.visual_activation,
                          dropout_prob=self.dropout_prob)
     self.LM = StackedGRU(self.size, self.size, self.depth_spec,
                          **recurrent_opts)
     # try direct softmax
     self.ToTxt = Dense(self.size, self.size_embed)
Example #2
0
 def __init__(self, size_vocab, size_embed, size, size_out, depth, out_depth=1, # FIXME USE THIS PARAM
              gru_activation=tanh, visual_activation=linear,
              dropout_prob=0.0):
     """Build a shared encoder with a textual decoder and a visual decoder.

     Every argument is exposed as an attribute of the same name by
     ``autoassign(locals())``.

     size_vocab, size_embed -- sizes handed to ``Embedding``.
     size -- hidden size of the ``Encode`` and ``DecodeT`` stacks.
     size_out -- output size of the ``DecodeV`` dense layer.
     depth -- depth of both recurrent stacks.
     out_depth -- stored but not used by any sub-layer yet (see FIXME).
     gru_activation, dropout_prob -- forwarded to both recurrent stacks.
     visual_activation -- stored only; not used in this constructor.
     """
     autoassign(locals())
     self.Embed = Embedding(self.size_vocab, self.size_embed)
     self.Encode = StackedGRUH0(self.size_embed, self.size, self.depth,
                                activation=self.gru_activation, dropout_prob=self.dropout_prob)
     self.DecodeT = StackedGRU(self.size_embed, self.size, self.depth,
                               activation=self.gru_activation, dropout_prob=self.dropout_prob)
     self.PredictT = Dense(size_in=self.size, size_out=self.size_embed)
     self.DecodeV = Dense(self.size, self.size_out)
     # NOTE(review): Encode used to be missing from this list although it is
     # built above like the other layers. If `params` is what the trainer
     # updates (confirm), the encoder kept its initial weights. Adding it
     # changes the list's length and order, so weights saved against the
     # old list will not line up.
     self.params = params(self.Embed, self.Encode, self.DecodeT, self.PredictT, self.DecodeV)
Example #3
0
 def __init__(self, size_vocab, size_embed, size, size_out, depth, textual,
              out_depth=1, gru_activation=tanh, visual_activation=linear,
              dropout_prob=0.0):
     """Build the embedding, the visual pathway and the textual pathway.

     Every argument is exposed as an attribute of the same name by
     ``autoassign(locals())``.

     size_vocab, size_embed -- sizes handed to ``Embedding``.
     size, size_out, depth -- forwarded positionally to both pathways.
     textual -- callable that constructs the textual pathway; it is called
         with ``(size_embed, size, size_out, depth)`` plus the keywords
         ``gru_activation`` and ``dropout_prob``.
     out_depth -- forwarded to ``Visual`` only.
     visual_activation -- stored only; not used in this constructor.
     """
     # Keep this first: any local bound before it would also be copied.
     autoassign(locals())
     pathway_opts = dict(gru_activation=self.gru_activation,
                         dropout_prob=self.dropout_prob)
     self.Embed = Embedding(self.size_vocab, self.size_embed)
     self.Visual = Visual(self.size_embed, self.size, self.size_out,
                          self.depth, out_depth=self.out_depth,
                          **pathway_opts)
     self.Textual = textual(self.size_embed, self.size, self.size_out,
                            self.depth, **pathway_opts)
     self.params = params(self.Embed, self.Visual, self.Textual)
Example #4
0
 def __init__(self, size_vocab, size_embed, size, size_out, depth,
              gru_activation=clipped_rectify,
              visual_encoder=StackedGRUH0,
              visual_activation=linear,
              dropout_prob=0.0):
     """Build the embedding, visual pathway, recurrent LM and output layer.

     Every argument is exposed as an attribute of the same name by
     ``autoassign(locals())``.

     size_vocab, size_embed -- sizes handed to ``Embedding``.
     size -- hidden size of ``Visual`` and ``LM``; input size of ``ToTxt``.
     size_out -- output size handed to ``Visual``.
     depth -- depth handed to both ``Visual`` and ``LM``.
     visual_encoder -- passed to ``Visual`` as ``encoder``.
     gru_activation, visual_activation, dropout_prob -- forwarded to the
         sub-layers unchanged.
     """
     # Keep this first: any local bound before it would also be copied.
     autoassign(locals())
     self.Embed = Embedding(self.size_vocab, self.size_embed)
     visual_opts = dict(encoder=self.visual_encoder,
                        gru_activation=self.gru_activation,
                        visual_activation=self.visual_activation,
                        dropout_prob=self.dropout_prob)
     self.Visual = Visual(self.size_embed, self.size, self.size_out,
                          self.depth, **visual_opts)
     self.LM = StackedGRUH0(self.size_embed, self.size, self.depth,
                            activation=self.gru_activation,
                            dropout_prob=self.dropout_prob)
     # map to vocabulary
     self.ToTxt = Dense(self.size, self.size_vocab)
Example #5
0
 def __init__(self, size_vocab, size_embed, size, size_out, depth,
              depth_spec=1, visual_encoder=StackedGRUH0,
              gru_activation=clipped_rectify, visual_activation=linear,
              dropout_prob=0.0):
     """Set up a shared recurrent stack with visual and textual branches.

     Every argument is exposed as an attribute of the same name by
     ``autoassign(locals())``.

     size_vocab, size_embed -- sizes handed to ``Embedding``.
     size -- hidden size of all recurrent layers and input size of
         ``Visual``, ``LM`` and ``ToTxt``.
     size_out -- output size handed to ``Visual``.
     depth -- depth of the ``Shared`` stack.
     depth_spec -- depth of the ``Visual`` and ``LM`` stacks.
     visual_encoder -- passed to ``Visual`` as ``encoder``.
     gru_activation, visual_activation, dropout_prob -- forwarded to the
         sub-layers unchanged.
     """
     # Keep this first: any local bound before it would also be copied.
     autoassign(locals())
     gru_opts = dict(activation=self.gru_activation,
                     dropout_prob=self.dropout_prob)
     visual_opts = dict(encoder=self.visual_encoder,
                        gru_activation=self.gru_activation,
                        visual_activation=self.visual_activation,
                        dropout_prob=self.dropout_prob)
     self.Embed = Embedding(self.size_vocab, self.size_embed)
     self.Shared = StackedGRUH0(self.size_embed, self.size, self.depth, **gru_opts)
     self.Visual = Visual(self.size, self.size, self.size_out, self.depth_spec, **visual_opts)
     self.LM = StackedGRU(self.size, self.size, self.depth_spec, **gru_opts)
     # try direct softmax
     self.ToTxt = Dense(self.size, self.size_embed)
Example #6
0
class MultitaskBD(Layer):
    """Text to image and back.

    One path encodes the input text and maps it to an image vector; the
    other maps a given image vector to a decoder start state and decodes
    text from it.
    """

    def __init__(self, size_vocab, size_embed, size, size_out, depth,
                 out_depth=1,  # FIXME USE THIS PARAM
                 gru_activation=tanh,
                 visual_activation=linear,
                 dropout_prob=0.0):
        """Build encoder, decoder and the dense layers between text and image.

        Every argument is exposed as an attribute of the same name by
        ``autoassign(locals())``. ``out_depth`` is stored but not used by
        any sub-layer yet.
        """
        # Keep this first: any local bound before it would also be copied.
        autoassign(locals())
        gru_opts = dict(activation=self.gru_activation,
                        dropout_prob=self.dropout_prob)
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        self.Encode = StackedGRUH0(self.size_embed, self.size, self.depth, **gru_opts)
        self.ToVis = Dense(self.size, self.size_out)
        self.FromVis = Dense(self.size_out, self.size)
        self.Decode = StackedGRU(self.size_embed, self.size, self.depth, **gru_opts)
        self.PredictT = Dense(size_in=self.size, size_out=self.size_embed)
        self.params = params(self.Embed, self.Encode, self.ToVis,
                             self.FromVis, self.Decode, self.PredictT)

    def __call__(self, inp, out_prev, img):
        """Return the pair ``(img_out, txt_out)``.

        ``img_out`` is predicted from ``inp``; ``txt_out`` is decoded from
        ``img`` together with the embedded ``out_prev``.
        """
        # Text -> image vector.
        encoded = last(self.Encode(self.Embed(inp)))
        img_out = self.visual_activation(self.ToVis(encoded))
        # Image vector -> text, with the projected image as first Decode argument.
        start = self.visual_activation(self.FromVis(img))
        decoded = self.Decode(start, self.Embed(out_prev))
        txt_out = softmax3d(self.Embed.unembed(self.PredictT(decoded)))
        return (img_out, txt_out)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        encoded = last(self.Encode(self.Embed(tokens)))
        prediction = self.visual_activation(self.ToVis(encoded))
        return theano.function([tokens], prediction)
Example #7
0
class MultitaskMM(Layer):
    """Shared recurrent encoder with visual decoder + textual decoder."""

    def __init__(self, size_vocab, size_embed, size, size_out, depth, out_depth=1, # FIXME USE THIS PARAM
                 gru_activation=tanh, visual_activation=linear,
                 dropout_prob=0.0):
        """Build the shared encoder and the two decoders.

        Every argument is exposed as an attribute of the same name by
        ``autoassign(locals())``.

        size_vocab, size_embed -- sizes handed to ``Embedding``.
        size -- hidden size of the ``Encode`` and ``DecodeT`` stacks.
        size_out -- output size of the ``DecodeV`` dense layer.
        depth -- depth of both recurrent stacks.
        out_depth -- stored but not used by any sub-layer yet (see FIXME).
        gru_activation, dropout_prob -- forwarded to both recurrent stacks.
        visual_activation -- applied to the ``DecodeV`` output.
        """
        autoassign(locals())
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        self.Encode = StackedGRUH0(self.size_embed, self.size, self.depth,
                                   activation=self.gru_activation, dropout_prob=self.dropout_prob)
        self.DecodeT = StackedGRU(self.size_embed, self.size, self.depth,
                                  activation=self.gru_activation, dropout_prob=self.dropout_prob)
        self.PredictT = Dense(size_in=self.size, size_out=self.size_embed)
        self.DecodeV = Dense(self.size, self.size_out)
        # NOTE(review): Encode used to be missing from this list although
        # __call__ and predictor_v both run through it. If `params` is what
        # the trainer updates (confirm), the encoder kept its initial
        # weights. Adding it changes the list's length and order, so
        # weights saved against the old list will not line up.
        self.params = params(self.Embed, self.Encode, self.DecodeT, self.PredictT, self.DecodeV)

    def __call__(self, inp, out_prev, _img):
        """Return ``(img, txt)`` predicted from ``inp``; ``_img`` is ignored.

        Both outputs start from the same ``last(Encode(Embed(inp)))``
        representation; the text decoder also consumes the embedded
        ``out_prev``.
        """
        rep = last(self.Encode(self.Embed(inp)))
        img = self.visual_activation(self.DecodeV(rep))
        txt = softmax3d(self.Embed.unembed(self.PredictT(self.DecodeT(rep, self.Embed(out_prev)))))
        return (img, txt)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        rep = last(self.Encode(self.Embed(tokens)))
        return theano.function([tokens], self.visual_activation(self.DecodeV(rep)))
Example #8
0
class MultitaskY(Layer):
    """Joint Encode + separate pathways.

    The embedded input first goes through the shared ``Joint`` recurrent
    stack; its output feeds both the ``Visual`` and ``Textual`` pathways.
    """

    def __init__(self, size_vocab, size_embed, size, size_out, depth, textual,
                 out_depth=1,
                 gru_activation=tanh,
                 visual_activation=linear,
                 dropout_prob=0.0):
        """Build the embedding, the joint stack and the two pathways.

        Every argument is exposed as an attribute of the same name by
        ``autoassign(locals())``.

        size_vocab, size_embed -- sizes handed to ``Embedding``.
        size -- hidden size of ``Joint`` and input size of both pathways.
        size_out -- output size handed to ``Visual``.
        depth -- depth handed to ``Joint``, ``Visual`` and ``textual``.
        textual -- callable that constructs the textual pathway; it is
            called with ``(size, size, depth)`` plus the keywords
            ``gru_activation`` and ``dropout_prob``.
        out_depth -- forwarded to ``Visual`` only.
        visual_activation -- applied to the ``Visual`` output.
        """
        autoassign(locals())
        self.Embed   = Embedding(self.size_vocab, self.size_embed)
        self.Joint   = StackedGRUH0(self.size_embed, self.size, self.depth,
                                    activation=self.gru_activation, dropout_prob=self.dropout_prob)
        self.Visual  = Visual(self.size, self.size, self.size_out, self.depth, out_depth=self.out_depth,
                              gru_activation=self.gru_activation, dropout_prob=self.dropout_prob)
        self.Textual = textual(self.size, self.size, self.depth,
                               gru_activation=self.gru_activation, dropout_prob=self.dropout_prob)
        # NOTE(review): Joint used to be missing from this list although
        # __call__ runs through it. If `params` is what the trainer updates
        # (confirm), the joint stack kept its initial weights. Adding it
        # changes the list's length and order, so weights saved against the
        # old list will not line up.
        self.params  = params(self.Embed, self.Joint, self.Visual, self.Textual)

    def __call__(self, inp, output_prev, _img):
        """Return ``(img, txt)``; ``_img`` is passed through to ``Textual``."""
        inp_e = self.Joint(self.Embed(inp))
        output_prev_e  = self.Embed(output_prev)
        img   = self.visual_activation(self.Visual(inp_e))
        txt   = softmax3d(self.Embed.unembed(self.Textual(inp_e, output_prev_e, _img)))
        return (img, txt)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        # Go through Joint exactly as __call__ does: Visual is built with
        # input size `size`, not `size_embed`, so feeding it raw embeddings
        # (as this method used to) bypasses the shared stack.
        joint = self.Joint(self.Embed(tokens))
        return theano.function([tokens], self.visual_activation(self.Visual(joint)))
Example #9
0
class Multitask(Layer):
    """Visual encoder combined with a textual task.

    Both pathways read the same embedded input; the textual pathway is
    supplied by the caller as a constructor callable.
    """

    def __init__(self, size_vocab, size_embed, size, size_out, depth, textual,
                 out_depth=1, gru_activation=tanh, visual_activation=linear,
                 dropout_prob=0.0):
        """Build the embedding and the visual and textual pathways.

        Every argument is exposed as an attribute of the same name by
        ``autoassign(locals())``. ``textual`` is called with
        ``(size_embed, size, size_out, depth)`` plus the keywords
        ``gru_activation`` and ``dropout_prob``; ``out_depth`` goes to
        ``Visual`` only.
        """
        # Keep this first: any local bound before it would also be copied.
        autoassign(locals())
        pathway_opts = dict(gru_activation=self.gru_activation,
                            dropout_prob=self.dropout_prob)
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        self.Visual = Visual(self.size_embed, self.size, self.size_out,
                             self.depth, out_depth=self.out_depth,
                             **pathway_opts)
        self.Textual = textual(self.size_embed, self.size, self.size_out,
                               self.depth, **pathway_opts)
        self.params = params(self.Embed, self.Visual, self.Textual)

    def __call__(self, inp, output_prev, img):
        """Return ``(img_pred, txt_pred)`` for the given inputs.

        ``img`` is not used for the image prediction; it is handed to the
        textual pathway together with both embedded sequences.
        """
        embedded_inp = self.Embed(inp)
        embedded_prev = self.Embed(output_prev)
        img_pred = self.visual_activation(self.Visual(embedded_inp))
        textual_out = self.Textual(embedded_inp, embedded_prev, img)
        txt_pred = softmax3d(self.Embed.unembed(textual_out))
        return (img_pred, txt_pred)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        prediction = self.visual_activation(self.Visual(self.Embed(tokens)))
        return theano.function([tokens], prediction)
Example #10
0
    def __init__(self, size_vocab, size_embed, size, depth):
        """Build an embedding layer followed by a stacked recurrent layer.

        Every argument is exposed as an attribute of the same name by
        ``autoassign(locals())``. The recurrent stack always uses
        ``clipped_rectify`` as its activation.
        """
        # Keep this first: any local bound before it would also be copied.
        autoassign(locals())
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        self.GRU = StackedGRUH0(self.size_embed, self.size, self.depth, activation=clipped_rectify)
Example #11
0
class MultitaskLMY(Layer):
    """Alternative visual encoder combined with a textual decoder.

    The textual decoder is started from ``last(...)`` of the shared stack's
    output rather than from the image. A shared recurrent stack is followed
    by task-specific layers.
    """

    def __init__(self, size_vocab, size_embed, size, size_out, depth,
                 depth_spec=1,
                 visual_encoder=StackedGRUH0,
                 gru_activation=clipped_rectify,
                 visual_activation=linear,
                 dropout_prob=0.0):
        """Build the embedding, shared stack, visual pathway and text decoder.

        Every argument is exposed as an attribute of the same name by
        ``autoassign(locals())``. ``depth`` sizes the shared stack;
        ``depth_spec`` sizes the ``Visual`` and ``LM`` stacks.
        """
        # Keep this first: any local bound before it would also be copied.
        autoassign(locals())
        gru_opts = dict(activation=self.gru_activation,
                        dropout_prob=self.dropout_prob)
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        self.Shared = StackedGRUH0(self.size_embed, self.size, self.depth, **gru_opts)
        self.Visual = Visual(self.size, self.size, self.size_out, self.depth_spec,
                             encoder=self.visual_encoder,
                             gru_activation=self.gru_activation,
                             visual_activation=self.visual_activation,
                             dropout_prob=self.dropout_prob)
        self.LM = StackedGRU(self.size, self.size, self.depth_spec, **gru_opts)
        # try direct softmax
        self.ToTxt = Dense(self.size, self.size_embed)

    def params(self):
        """Return ``params(...)`` over all five sub-layers."""
        layers = (self.Embed, self.Shared, self.Visual, self.LM, self.ToTxt)
        return params(*layers)

    def __call__(self, inp, output_prev, _img):
        """Return ``(img_pred, txt_pred)`` from ``inp``; ``_img`` is ignored."""
        shared = self.Shared(self.Embed(inp))
        img_pred = self.Visual(shared)
        # last(shared) is the decoder's first argument.
        decoded = self.LM(last(shared), self.Embed(output_prev))
        txt_pred = softmax3d(self.Embed.unembed(self.ToTxt(decoded)))
        return (img_pred, txt_pred)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        shared = self.Shared(self.Embed(tokens))
        return theano.function([tokens], self.Visual(shared))

    def predictor_r(self):
        """Return function to predict representation from input."""
        tokens = T.imatrix()
        shared = self.Shared(self.Embed(tokens))
        return theano.function([tokens], last(shared))
Example #12
0
class MultitaskLMD(Layer):
    """Alternative visual encoder combined with a textual decoder.

    The textual decoder is started from the output of ``Visual.encode``
    instead of from the image.
    """

    def __init__(self, size_vocab, size_embed, size, size_out, depth,
                 gru_activation=clipped_rectify,
                 visual_activation=linear,
                 visual_encoder=StackedGRUH0,
                 dropout_prob=0.0):
        """Build the embedding, visual pathway, text decoder and output layer.

        Every argument is exposed as an attribute of the same name by
        ``autoassign(locals())``. ``visual_encoder`` is passed to ``Visual``
        as ``encoder``.
        """
        # Keep this first: any local bound before it would also be copied.
        autoassign(locals())
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        visual_opts = dict(encoder=self.visual_encoder,
                           gru_activation=self.gru_activation,
                           visual_activation=self.visual_activation,
                           dropout_prob=self.dropout_prob)
        self.Visual = Visual(self.size_embed, self.size, self.size_out,
                             self.depth, **visual_opts)
        self.LM = StackedGRU(self.size_embed, self.size, self.depth,
                             activation=self.gru_activation,
                             dropout_prob=self.dropout_prob)
        # try direct softmax
        self.ToTxt = Dense(self.size, self.size_embed)

    def params(self):
        """Return ``params(...)`` over all four sub-layers."""
        layers = (self.Embed, self.Visual, self.LM, self.ToTxt)
        return params(*layers)

    def __call__(self, inp, output_prev, _img):
        """Return ``(img_pred, txt_pred)`` from ``inp``; ``_img`` is ignored."""
        rep = self.Visual.encode(self.Embed(inp))
        # Image prediction uses Visual's own output layer and activation.
        img_pred = self.Visual.visual_activation(self.Visual.ToImg(rep))
        decoded = self.LM(rep, self.Embed(output_prev))
        txt_pred = softmax3d(self.Embed.unembed(self.ToTxt(decoded)))
        return (img_pred, txt_pred)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        prediction = self.Visual(self.Embed(tokens))
        return theano.function([tokens], prediction)

    def predictor_r(self):
        """Return function to predict representation from input."""
        tokens = T.imatrix()
        representation = self.Visual.encode(self.Embed(tokens))
        return theano.function([tokens], representation)
Example #13
0
 def __init__(self, size_vocab, size_embed, size, size_out, depth,
              gru_activation=clipped_rectify, visual_activation=linear,
              dropout_prob=0.0):
     """Build the embedding, visual pathway, recurrent LM and output layer.

     Every argument is exposed as an attribute of the same name by
     ``autoassign(locals())``.

     size_vocab, size_embed -- sizes handed to ``Embedding``.
     size -- hidden size of ``Visual`` and ``LM``; input size of ``ToTxt``.
     size_out -- output size handed to ``Visual``.
     depth -- depth handed to both ``Visual`` and ``LM``.
     gru_activation, visual_activation, dropout_prob -- forwarded to the
         sub-layers unchanged.
     """
     # Keep this first: any local bound before it would also be copied.
     autoassign(locals())
     self.Embed = Embedding(self.size_vocab, self.size_embed)
     self.Visual = Visual(self.size_embed, self.size, self.size_out, self.depth,
                          gru_activation=self.gru_activation,
                          visual_activation=self.visual_activation,
                          dropout_prob=self.dropout_prob)
     lm_opts = dict(activation=self.gru_activation,
                    dropout_prob=self.dropout_prob)
     self.LM = StackedGRUH0(self.size_embed, self.size, self.depth, **lm_opts)
     # map to embeddings
     self.ToTxt = Dense(self.size, self.size_embed)
Example #14
0
class MultitaskLMC(Layer):
    """Visual encoder combined with a textual decoder.

    The textual decoder's first argument is the given image vector
    projected by ``FromImg``.
    """

    def __init__(self, size_vocab, size_embed, size, size_out, depth,
                 gru_activation=clipped_rectify,
                 visual_activation=linear,
                 visual_encoder=StackedGRUH0,
                 dropout_prob=0.0):
        """Build the embedding, visual pathway, text decoder and dense layers.

        Every argument is exposed as an attribute of the same name by
        ``autoassign(locals())``. ``visual_encoder`` is passed to ``Visual``
        as ``encoder``.
        """
        # Keep this first: any local bound before it would also be copied.
        autoassign(locals())
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        visual_opts = dict(encoder=self.visual_encoder,
                           gru_activation=self.gru_activation,
                           visual_activation=self.visual_activation,
                           dropout_prob=self.dropout_prob)
        self.Visual = Visual(self.size_embed, self.size, self.size_out,
                             self.depth, **visual_opts)
        self.LM = StackedGRU(self.size_embed, self.size, self.depth,
                             activation=self.gru_activation,
                             dropout_prob=self.dropout_prob)
        self.FromImg = Dense(self.size_out, self.size)
        # try direct softmax
        self.ToTxt = Dense(self.size, self.size_embed)

    def params(self):
        """Return ``params(...)`` over all five sub-layers."""
        layers = (self.Embed, self.Visual, self.LM, self.FromImg, self.ToTxt)
        return params(*layers)

    def __call__(self, inp, output_prev, img):
        """Return ``(img_pred, txt_pred)``.

        ``img_pred`` comes from ``inp`` alone; ``txt_pred`` is decoded from
        ``img`` and the embedded ``output_prev``.
        """
        img_pred = self.Visual(self.Embed(inp))
        start = self.FromImg(img)
        decoded = self.LM(start, self.Embed(output_prev))
        txt_pred = softmax3d(self.Embed.unembed(self.ToTxt(decoded)))
        return (img_pred, txt_pred)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        prediction = self.Visual(self.Embed(tokens))
        return theano.function([tokens], prediction)

    def predictor_r(self):
        """Return function to predict representation from input."""
        tokens = T.imatrix()
        representation = self.Visual.encode(self.Embed(tokens))
        return theano.function([tokens], representation)