class MultitaskBD(Layer):
    """Text to image and back.

    Two pathways share one ``Embedding`` layer:

    * text -> image: the input tokens are embedded, encoded, reduced with
      ``last`` and projected to an image vector by ``ToVis``;
    * image -> text: a supplied image vector is projected by ``FromVis`` and
      handed to the recurrent decoder as its first argument (presumably the
      initial state), together with the embedded previous output tokens.
    """

    def __init__(self, size_vocab, size_embed, size, size_out, depth,
                 out_depth=1,  # FIXME USE THIS PARAM
                 gru_activation=tanh,
                 visual_activation=linear,
                 dropout_prob=0.0):
        """Build the sub-layers.

        Args:
            size_vocab: First argument of the ``Embedding`` layer.
            size_embed: Embedding size; also the output size of ``PredictT``.
            size: Hidden size of the recurrent encoder and decoder.
            size_out: Size of the image vector.
            depth: Depth passed to both stacked GRUs.
            out_depth: Stored on the instance but not otherwise used yet.
            gru_activation: Activation passed to both stacked GRUs.
            visual_activation: Activation applied after ``ToVis``/``FromVis``.
            dropout_prob: Dropout probability passed to both stacked GRUs.
        """
        # Must stay first: copies every constructor argument onto ``self``.
        autoassign(locals())
        recurrent_opts = dict(activation=self.gru_activation,
                              dropout_prob=self.dropout_prob)
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        self.Encode = StackedGRUH0(self.size_embed, self.size, self.depth,
                                   **recurrent_opts)
        self.ToVis = Dense(self.size, self.size_out)
        self.FromVis = Dense(self.size_out, self.size)
        self.Decode = StackedGRU(self.size_embed, self.size, self.depth,
                                 **recurrent_opts)
        self.PredictT = Dense(size_in=self.size, size_out=self.size_embed)
        self.params = params(self.Embed, self.Encode, self.ToVis,
                             self.FromVis, self.Decode, self.PredictT)

    def __call__(self, inp, out_prev, img):
        """Return ``(img_out, txt_out)`` for the given symbolic inputs.

        Args:
            inp: Input tokens for the text -> image pathway.
            out_prev: Previous output tokens fed to the text decoder.
            img: Image vector the text decoder is conditioned on.
        """
        # Text -> image.
        sentence = last(self.Encode(self.Embed(inp)))
        img_out = self.visual_activation(self.ToVis(sentence))
        # Image -> text.
        decoder_start = self.visual_activation(self.FromVis(img))
        prev_embedded = self.Embed(out_prev)
        decoded = self.Decode(decoder_start, prev_embedded)
        txt_out = softmax3d(self.Embed.unembed(self.PredictT(decoded)))
        return (img_out, txt_out)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        sentence = last(self.Encode(self.Embed(tokens)))
        prediction = self.visual_activation(self.ToVis(sentence))
        return theano.function([tokens], prediction)
class MultitaskMM(Layer):
    """Shared recurrent encoder with visual decoder + textual decoder.

    The input tokens are embedded and encoded once; the result of ``last`` on
    the encoder output feeds both a dense visual decoder (``DecodeV``) and a
    recurrent textual decoder (``DecodeT``).
    """

    def __init__(self, size_vocab, size_embed, size, size_out, depth,
                 out_depth=1,  # FIXME USE THIS PARAM
                 gru_activation=tanh,
                 visual_activation=linear,
                 dropout_prob=0.0):
        """Build the sub-layers.

        Args:
            size_vocab: First argument of the ``Embedding`` layer.
            size_embed: Embedding size; also the output size of ``PredictT``.
            size: Hidden size of the recurrent encoder and decoder.
            size_out: Size of the predicted image vector.
            depth: Depth passed to both stacked GRUs.
            out_depth: Stored on the instance but not otherwise used yet.
            gru_activation: Activation passed to both stacked GRUs.
            visual_activation: Activation applied to the ``DecodeV`` output.
            dropout_prob: Dropout probability passed to both stacked GRUs.
        """
        # Must stay first: copies every constructor argument onto ``self``.
        autoassign(locals())
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        self.Encode = StackedGRUH0(self.size_embed, self.size, self.depth,
                                   activation=self.gru_activation,
                                   dropout_prob=self.dropout_prob)
        self.DecodeT = StackedGRU(self.size_embed, self.size, self.depth,
                                  activation=self.gru_activation,
                                  dropout_prob=self.dropout_prob)
        self.PredictT = Dense(size_in=self.size, size_out=self.size_embed)
        self.DecodeV = Dense(self.size, self.size_out)
        # ``Encode`` has to be listed too: it is used by both ``__call__`` and
        # ``predictor_v``, and leaving it out hides the encoder weights from
        # whatever consumes ``self.params``.
        # NOTE(review): this changes the length/order of the list compared to
        # the previous version -- check any code that relies on its layout.
        self.params = params(self.Embed, self.Encode, self.DecodeT,
                             self.PredictT, self.DecodeV)

    def __call__(self, inp, out_prev, _img):
        """Return ``(img, txt)`` predictions for the given symbolic inputs.

        Args:
            inp: Input tokens to encode.
            out_prev: Previous output tokens fed to the text decoder.
            _img: Unused; accepted so the call signature matches the other
                multitask models in this file.
        """
        rep = last(self.Encode(self.Embed(inp)))
        img = self.visual_activation(self.DecodeV(rep))
        txt = softmax3d(self.Embed.unembed(
            self.PredictT(self.DecodeT(rep, self.Embed(out_prev)))))
        return (img, txt)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        rep = last(self.Encode(self.Embed(tokens)))
        return theano.function([tokens],
                               self.visual_activation(self.DecodeV(rep)))
class MultitaskY(Layer):
    """Joint Encode + separate pathways.

    Input tokens are embedded and passed through the shared ``Joint``
    recurrent layer; its output feeds both the ``Visual`` pathway and the
    ``Textual`` pathway.
    """

    def __init__(self, size_vocab, size_embed, size, size_out, depth, textual,
                 out_depth=1,
                 gru_activation=tanh,
                 visual_activation=linear,
                 dropout_prob=0.0):
        """Build the sub-layers.

        Args:
            size_vocab: First argument of the ``Embedding`` layer.
            size_embed: Embedding size (input size of ``Joint``).
            size: Hidden size of ``Joint`` and input size of both pathways.
            size_out: Size of the predicted image vector.
            depth: Depth passed to ``Joint``, ``Visual`` and ``textual``.
            textual: Factory for the textual pathway; the object it returns
                is called as ``Textual(inp_e, output_prev_e, _img)``.
            out_depth: Passed to ``Visual`` as ``out_depth``.
            gru_activation: Activation passed to the recurrent layers.
            visual_activation: Activation applied to the ``Visual`` output.
            dropout_prob: Dropout probability passed to the sub-layers.
        """
        # Must stay first: copies every constructor argument onto ``self``.
        autoassign(locals())
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        self.Joint = StackedGRUH0(self.size_embed, self.size, self.depth,
                                  activation=self.gru_activation,
                                  dropout_prob=self.dropout_prob)
        self.Visual = Visual(self.size, self.size, self.size_out, self.depth,
                             out_depth=self.out_depth,
                             gru_activation=self.gru_activation,
                             dropout_prob=self.dropout_prob)
        # NOTE(review): the sibling ``Multitask`` class calls ``textual`` with
        # four positional arguments (including ``size_out``); only three are
        # passed here. Left unchanged -- confirm against the ``textual``
        # factories actually used with this class.
        self.Textual = textual(self.size, self.size, self.depth,
                               gru_activation=self.gru_activation,
                               dropout_prob=self.dropout_prob)
        # ``Joint`` has to be listed too: every prediction goes through it,
        # and leaving it out hides its weights from whatever consumes
        # ``self.params``.
        # NOTE(review): this changes the length/order of the list compared to
        # the previous version -- check any code that relies on its layout.
        self.params = params(self.Embed, self.Joint, self.Visual,
                             self.Textual)

    def __call__(self, inp, output_prev, _img):
        """Return ``(img, txt)`` predictions for the given symbolic inputs.

        Args:
            inp: Input tokens.
            output_prev: Previous output tokens for the textual pathway.
            _img: Forwarded unchanged to the textual pathway.
        """
        inp_e = self.Joint(self.Embed(inp))
        output_prev_e = self.Embed(output_prev)
        img = self.visual_activation(self.Visual(inp_e))
        txt = softmax3d(self.Embed.unembed(
            self.Textual(inp_e, output_prev_e, _img)))
        return (img, txt)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        # Route through ``Joint`` exactly as ``__call__`` does: ``Visual`` is
        # built to take ``Joint`` output (input size ``self.size``), not raw
        # embeddings (``self.size_embed``).
        joint = self.Joint(self.Embed(tokens))
        return theano.function([tokens],
                               self.visual_activation(self.Visual(joint)))
class Multitask(Layer):
    """Visual encoder combined with a textual task.

    Both pathways consume the same embedded input; the textual pathway also
    receives the embedded previous output tokens and the image.
    """

    def __init__(self, size_vocab, size_embed, size, size_out, depth, textual,
                 out_depth=1,
                 gru_activation=tanh,
                 visual_activation=linear,
                 dropout_prob=0.0):
        """Build the sub-layers.

        Args:
            size_vocab: First argument of the ``Embedding`` layer.
            size_embed: Embedding size (input size of both pathways).
            size: Hidden size passed to both pathways.
            size_out: Output size passed to both pathways.
            depth: Depth passed to both pathways.
            textual: Factory for the textual pathway; the object it returns
                is called as ``Textual(inp_e, output_prev_e, img)``.
            out_depth: Passed to ``Visual`` as ``out_depth``.
            gru_activation: Activation passed to both pathways.
            visual_activation: Activation applied to the ``Visual`` output.
            dropout_prob: Dropout probability passed to both pathways.
        """
        # Must stay first: copies every constructor argument onto ``self``.
        autoassign(locals())
        pathway_opts = dict(gru_activation=self.gru_activation,
                            dropout_prob=self.dropout_prob)
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        self.Visual = Visual(self.size_embed, self.size, self.size_out,
                             self.depth, out_depth=self.out_depth,
                             **pathway_opts)
        self.Textual = textual(self.size_embed, self.size, self.size_out,
                               self.depth, **pathway_opts)
        self.params = params(self.Embed, self.Visual, self.Textual)

    def __call__(self, inp, output_prev, img):
        """Return ``(img_pred, txt_pred)`` for the given symbolic inputs.

        Args:
            inp: Input tokens.
            output_prev: Previous output tokens for the textual pathway.
            img: Image input forwarded to the textual pathway.
        """
        embedded_in = self.Embed(inp)
        embedded_prev = self.Embed(output_prev)
        img_pred = self.visual_activation(self.Visual(embedded_in))
        text_scores = self.Textual(embedded_in, embedded_prev, img)
        txt_pred = softmax3d(self.Embed.unembed(text_scores))
        return (img_pred, txt_pred)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        prediction = self.visual_activation(self.Visual(self.Embed(tokens)))
        return theano.function([tokens], prediction)
class MultitaskLMY(Layer):
    """Alternative visual encoder combined with a textual decoder.

    Textual decoder starts from final state of encoder instead of from image.
    Shared hidden layer plus specialized layers.
    """

    def __init__(self, size_vocab, size_embed, size, size_out, depth,
                 depth_spec=1,
                 visual_encoder=StackedGRUH0,
                 gru_activation=clipped_rectify,
                 visual_activation=linear,
                 dropout_prob=0.0):
        """Build the shared layer and the two specialized pathways.

        Args:
            size_vocab: First argument of the ``Embedding`` layer.
            size_embed: Embedding size; also the output size of ``ToTxt``.
            size: Hidden size of the shared and specialized layers.
            size_out: Size of the predicted image vector.
            depth: Depth of the shared recurrent layer.
            depth_spec: Depth of the specialized ``Visual`` and ``LM`` layers.
            visual_encoder: Passed to ``Visual`` as ``encoder``.
            gru_activation: Activation passed to the recurrent layers.
            visual_activation: Passed to ``Visual`` as ``visual_activation``.
            dropout_prob: Dropout probability passed to the sub-layers.
        """
        # Must stay first: copies every constructor argument onto ``self``.
        autoassign(locals())
        recurrent_opts = dict(activation=self.gru_activation,
                              dropout_prob=self.dropout_prob)
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        self.Shared = StackedGRUH0(self.size_embed, self.size, self.depth,
                                   **recurrent_opts)
        self.Visual = Visual(self.size, self.size, self.size_out,
                             self.depth_spec,
                             encoder=self.visual_encoder,
                             gru_activation=self.gru_activation,
                             visual_activation=self.visual_activation,
                             dropout_prob=self.dropout_prob)
        self.LM = StackedGRU(self.size, self.size, self.depth_spec,
                             **recurrent_opts)
        self.ToTxt = Dense(self.size, self.size_embed)  # try direct softmax

    def params(self):
        """Return the combined parameters of all sub-layers."""
        sublayers = (self.Embed, self.Shared, self.Visual, self.LM,
                     self.ToTxt)
        return params(*sublayers)

    def __call__(self, inp, output_prev, _img):
        """Return ``(img_pred, txt_pred)`` for the given symbolic inputs.

        Args:
            inp: Input tokens.
            output_prev: Previous output tokens fed to the language model.
            _img: Unused.
        """
        hidden = self.Shared(self.Embed(inp))
        img_pred = self.Visual(hidden)
        # The language model is started from ``last(hidden)`` rather than
        # from the image.
        lm_out = self.LM(last(hidden), self.Embed(output_prev))
        txt_pred = softmax3d(self.Embed.unembed(self.ToTxt(lm_out)))
        return (img_pred, txt_pred)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        hidden = self.Shared(self.Embed(tokens))
        return theano.function([tokens], self.Visual(hidden))

    def predictor_r(self):
        """Return function to predict representation from input."""
        tokens = T.imatrix()
        hidden = self.Shared(self.Embed(tokens))
        return theano.function([tokens], last(hidden))
class MultitaskLMD(Layer):
    """Alternative visual encoder combined with a textual decoder.

    Textual decoder starts from final state of encoder instead of from image.
    """

    def __init__(self, size_vocab, size_embed, size, size_out, depth,
                 gru_activation=clipped_rectify,
                 visual_activation=linear,
                 visual_encoder=StackedGRUH0,
                 dropout_prob=0.0):
        """Build the visual encoder and the language-model decoder.

        Args:
            size_vocab: First argument of the ``Embedding`` layer.
            size_embed: Embedding size; also the output size of ``ToTxt``.
            size: Hidden size of the visual encoder and language model.
            size_out: Size of the predicted image vector.
            depth: Depth passed to ``Visual`` and ``LM``.
            gru_activation: Activation passed to the recurrent layers.
            visual_activation: Passed to ``Visual`` as ``visual_activation``.
            visual_encoder: Passed to ``Visual`` as ``encoder``.
            dropout_prob: Dropout probability passed to the sub-layers.
        """
        # Must stay first: copies every constructor argument onto ``self``.
        autoassign(locals())
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        self.Visual = Visual(self.size_embed, self.size, self.size_out,
                             self.depth,
                             encoder=self.visual_encoder,
                             gru_activation=self.gru_activation,
                             visual_activation=self.visual_activation,
                             dropout_prob=self.dropout_prob)
        self.LM = StackedGRU(self.size_embed, self.size, self.depth,
                             activation=self.gru_activation,
                             dropout_prob=self.dropout_prob)
        self.ToTxt = Dense(self.size, self.size_embed)  # try direct softmax

    def params(self):
        """Return the combined parameters of all sub-layers."""
        sublayers = (self.Embed, self.Visual, self.LM, self.ToTxt)
        return params(*sublayers)

    def __call__(self, inp, output_prev, _img):
        """Return ``(img_pred, txt_pred)`` for the given symbolic inputs.

        Args:
            inp: Input tokens.
            output_prev: Previous output tokens fed to the language model.
            _img: Unused.
        """
        visual = self.Visual
        # Encode once; the same representation drives both predictions.
        rep = visual.encode(self.Embed(inp))
        img_pred = visual.visual_activation(visual.ToImg(rep))
        lm_out = self.LM(rep, self.Embed(output_prev))
        txt_pred = softmax3d(self.Embed.unembed(self.ToTxt(lm_out)))
        return (img_pred, txt_pred)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        prediction = self.Visual(self.Embed(tokens))
        return theano.function([tokens], prediction)

    def predictor_r(self):
        """Return function to predict representation from input."""
        tokens = T.imatrix()
        representation = self.Visual.encode(self.Embed(tokens))
        return theano.function([tokens], representation)
class MultitaskLMC(Layer):
    """Visual encoder combined with a textual decoder.

    The textual decoder is conditioned on the supplied image, projected
    through ``FromImg``, rather than on the encoder output.
    """

    def __init__(self, size_vocab, size_embed, size, size_out, depth,
                 gru_activation=clipped_rectify,
                 visual_activation=linear,
                 visual_encoder=StackedGRUH0,
                 dropout_prob=0.0):
        """Build the visual encoder and the image-conditioned decoder.

        Args:
            size_vocab: First argument of the ``Embedding`` layer.
            size_embed: Embedding size; also the output size of ``ToTxt``.
            size: Hidden size of the visual encoder and language model.
            size_out: Size of the image vector (output of ``Visual``, input
                of ``FromImg``).
            depth: Depth passed to ``Visual`` and ``LM``.
            gru_activation: Activation passed to the recurrent layers.
            visual_activation: Passed to ``Visual`` as ``visual_activation``.
            visual_encoder: Passed to ``Visual`` as ``encoder``.
            dropout_prob: Dropout probability passed to the sub-layers.
        """
        # Must stay first: copies every constructor argument onto ``self``.
        autoassign(locals())
        self.Embed = Embedding(self.size_vocab, self.size_embed)
        self.Visual = Visual(self.size_embed, self.size, self.size_out,
                             self.depth,
                             encoder=self.visual_encoder,
                             gru_activation=self.gru_activation,
                             visual_activation=self.visual_activation,
                             dropout_prob=self.dropout_prob)
        self.LM = StackedGRU(self.size_embed, self.size, self.depth,
                             activation=self.gru_activation,
                             dropout_prob=self.dropout_prob)
        self.FromImg = Dense(self.size_out, self.size)
        self.ToTxt = Dense(self.size, self.size_embed)  # try direct softmax

    def params(self):
        """Return the combined parameters of all sub-layers."""
        sublayers = (self.Embed, self.Visual, self.LM, self.FromImg,
                     self.ToTxt)
        return params(*sublayers)

    def __call__(self, inp, output_prev, img):
        """Return ``(img_pred, txt_pred)`` for the given symbolic inputs.

        Args:
            inp: Input tokens for the visual encoder.
            output_prev: Previous output tokens fed to the language model.
            img: Image vector the language model is conditioned on.
        """
        img_pred = self.Visual(self.Embed(inp))
        lm_start = self.FromImg(img)
        lm_out = self.LM(lm_start, self.Embed(output_prev))
        txt_pred = softmax3d(self.Embed.unembed(self.ToTxt(lm_out)))
        return (img_pred, txt_pred)

    def predictor_v(self):
        """Return function to predict image vector from input."""
        tokens = T.imatrix()
        prediction = self.Visual(self.Embed(tokens))
        return theano.function([tokens], prediction)

    def predictor_r(self):
        """Return function to predict representation from input."""
        tokens = T.imatrix()
        representation = self.Visual.encode(self.Embed(tokens))
        return theano.function([tokens], representation)