class MultimodalEmbeddingsLearner(PersistentObject, BaseEstimator, MultimodalEmbeddingDistances):
    """ Create multimodal embeddings by fitting positive
        sentence/image pairs from Flickr8k
        into a single shared (multimodal) vector space.
    """
    
    def setState(self, state):
        """ Set parameters from state sequence.
            Parameters must be in predefined order.
        """
        meta_params, sentence_proj_params, sentence_proj_update_mom, \
                image_proj_params, image_proj_update_mom = state
        
        # call to BaseEstimator class method
        self.set_params(**meta_params)
        self.ready()
        self.__setSentenceParams(sentence_proj_params, sentence_proj_update_mom)
        self.__setImageParams(image_proj_params, image_proj_update_mom)
    
    def getState(self):
        """ Return state sequence (meta parameters plus networks parameters)."""
        
        # call to BaseEstimator class method
        meta_params              = self.get_params()
        sentence_proj_params     = [p for p in self.sentence_proj.params]
        # `updates` maps each parameter to its momentum buffer (see fit()),
        # so collect the buffers themselves rather than the dict keys
        sentence_proj_update_mom = [self.sentence_proj.updates[p]
                                    for p in self.sentence_proj.params]
        image_proj_params        = [p for p in self.image_proj.params]
        image_proj_update_mom    = [self.image_proj.updates[p]
                                    for p in self.image_proj.params]
        
        state = (meta_params,
                 sentence_proj_params, sentence_proj_update_mom,
                 image_proj_params, image_proj_update_mom)
        
        return state
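
    # persistence round trip (sketch): the tuple returned by getState() can
    # be fed back into setState() to restore meta parameters, network weights
    # and momentum buffers:
    #
    #   state = learner.getState()
    #   learner.setState(state)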
    
    def __init__(self,
                 train_stream = None,
                 valid_stream = None,
                 test_stream  = None,
                 word_embeddings=None,
                 vocabulary_size=100000,
                 word_emb_dim=300,
                 image_fv_dim=4096,
                 multimodal_emb_dim=500,
                 n_layers=1,
                 learning_rate=0.01,
                 n_epochs=100,
                 L1_reg=0.00, L2_reg=0.00,
                 learning_rate_decay=1,
                 activation=T.nnet.sigmoid,
                 final_momentum=0.9,
                 initial_momentum=0.5,
                 momentum_switchover=5,
                 patience=5000,
                 patience_increase=2,
                 improvement_threshold=0.995,
                 finish_after = 100000000,
                 proj_folder='.',
                 proj_name=None,
                 load_model=False,
                 stop_on_load_error=False,
                 last_epoch=-1,
                 time_measures=[],
                 clock_measures=[],
                 all_train_losses=[],
                 all_valid_losses=[],
                 all_test_losses=[]):
        # mandatory parameters
        assert(train_stream is not None and valid_stream is not None)
        assert(n_layers >= 1)
        
        # these parameters are in this constructor
        # for persistence purposes only and ** must never be set **
        # to a value different from their default values
        assert(last_epoch == -1)
        assert(time_measures == clock_measures == all_train_losses
               == all_valid_losses == all_test_losses == [])
        
        # the three attributes below are stored under name-mangled names that
        # differ from their constructor parameters: they will not be pickled!
        self.__train_stream          = train_stream
        self.__valid_stream          = valid_stream
        self.__test_stream           = test_stream
        
        self.word_embeddings       = word_embeddings

        self.vocabulary_size       = int(vocabulary_size)
        self.word_emb_dim          = int(word_emb_dim)
        self.image_fv_dim          = int(image_fv_dim)
        self.multimodal_emb_dim    = int(multimodal_emb_dim)
        self.n_layers              = int(n_layers)
        
        self.learning_rate         = float(learning_rate)
        self.learning_rate_decay   = float(learning_rate_decay)
        self.n_epochs              = int(n_epochs)
        self.L1_reg                = float(L1_reg)
        self.L2_reg                = float(L2_reg)
        self.activation            = activation
        self.initial_momentum      = float(initial_momentum)
        self.final_momentum        = float(final_momentum)
        self.momentum_switchover   = int(momentum_switchover)
        self.patience              = int(patience)
        self.patience_increase     = float(patience_increase)
        self.improvement_threshold = float(improvement_threshold)
        self.finish_after          = int(finish_after)
        
        # last epoch trained, starts from -1
        self.last_epoch            = last_epoch
        # whether we attempt to load pretrained model from disk
        self.load_model            = bool(load_model)
        
        # vars used for plotting training/dev/test progress
        # (copied so the mutable default arguments are never shared
        # across instances)
        self.time_measures         = list(time_measures)
        self.clock_measures        = list(clock_measures)
        self.all_train_losses      = list(all_train_losses)
        self.all_valid_losses      = list(all_valid_losses)
        self.all_test_losses       = list(all_test_losses)
        
        self.proj_folder           = proj_folder
        self.proj_name             = proj_name
        self.stop_on_load_error    = stop_on_load_error
        
        # superclass that implements save() and load() to/from disk
        PersistentObject.__init__(self,
                                  proj_folder=proj_folder,
                                  proj_name=proj_name,
                                  stop_on_load_error=stop_on_load_error)
        
        self.populate_dataset_metadata()
        self.ready()
        
        if self.load_model:
            self.load()
    
        
    def ready(self):
        # minibatch of sentences, where each sentence is a vector of word ids
        self.x_sentence         = T.matrix(name='x_sentence',
                                            dtype='int64')
        # masks for padding
        self.x_sentence_mask    = T.matrix(name='x_sentence_mask',
                                            dtype=theano.config.floatX)
        # indices of sentences in the minibatch
        self.x_sentence_indices = T.vector(name='x_sentence_indices',
                                            dtype='int64')
        
        # image input feature vector
        self.x_image            = T.matrix(name='x_image',
                                            dtype=theano.config.floatX)
        # indices of images in the minibatch
        self.x_image_indices    = T.vector(name='x_image_indices',
                                            dtype='int64')
        
        # sentence embeddings RNN
        #self.sentence_proj = RNN()
        #self.sentence_proj = RNN_GRU()
        self.sentence_proj = RNN_Multilayer()
        
        self.sentence_proj.init_parameters(
            n_in                       = self.word_emb_dim,
            n_hidden                   = self.multimodal_emb_dim,
            word_embeddings            = self.word_embeddings,
            vocabulary_size            = self.vocabulary_size,
            n_layers                   = self.n_layers)
        
        self.sentence_proj.create(
            minibatch_sentences        = self.x_sentence,
            minibatch_mask             = self.x_sentence_mask)
        
        # image embeddings linear projection network
        self.image_proj = ImageProjector()
        
        self.image_proj.init_parameters(
            n_in             = self.image_fv_dim,
            n_out            = self.multimodal_emb_dim)
        
        self.image_proj.create(
            input            = self.x_image)
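
        # assumed shapes of the symbolic inputs above once fit()/test()
        # have swapped time to dimension zero:
        #   x_sentence:      (n_timesteps, minibatch_size)   int64 word ids
        #   x_sentence_mask: (n_timesteps, minibatch_size)   floatX padding mask
        #   x_image:         (minibatch_size, image_fv_dim)  floatX feature vectors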
    
    
    def shared_dataset(self, data_xy):
        """ Load the dataset into Theano shared variables.
            X values are word indices, not word vectors (at this point).
            Y values are image feature vectors.
        """

        data_x, data_y = data_xy
        shared_x = theano.shared(np.asarray(data_x,
                                            dtype='int32'))
        shared_y = theano.shared(np.asarray(data_y,
                                            dtype=theano.config.floatX))
        return shared_x, shared_y
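
        # note: word ids are cast to int32 here, while the symbolic x_sentence
        # input declared in ready() is int64; callers are assumed to reconcile
        # the dtypes before feeding these shared variables into the graph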

    def __setSentenceParams(self, sentence_proj_params, sentence_proj_update_mom):
        """ Set sentence RNN fittable parameters from weights sequence.
            Parameters must be in the order defined by getState().
        """
        assert(len(sentence_proj_params) == len(sentence_proj_update_mom))
        assert(len(self.sentence_proj.params) == len(self.sentence_proj.updates))
        assert(len(sentence_proj_params) == len(self.sentence_proj.params))
        
        for param, new_param, new_update in zip(self.sentence_proj.params,
                                                sentence_proj_params,
                                                sentence_proj_update_mom):
            param.set_value( new_param.get_value() )
            self.sentence_proj.updates[param].set_value( new_update.get_value() )
    
    def __setImageParams(self, image_proj_params, image_proj_update_mom):
        """ Set image network parameters.
            Parameters must be in the order defined by getState().
        """
        assert(len(image_proj_params) == len(image_proj_update_mom))
        assert(len(self.image_proj.params) == len(self.image_proj.updates))
        assert(len(image_proj_params) == len(self.image_proj.params))
        
        for param, new_param, new_update in zip(self.image_proj.params,
                                                image_proj_params,
                                                image_proj_update_mom):
            param.set_value( new_param.get_value() )
            self.image_proj.updates[param].set_value( new_update.get_value() )
    
    def __populate_split_metadata(self, stream):
        """ Build the image idx -> sentence indices (1xN) and
            sentence idx -> image idx (1x1) mappings for one data split.
        """
        image_idx_to_sentence_indices = OrderedDict() # 1xN mapping
        sentence_idx_to_image_idx     = OrderedDict() # 1x1 mapping
        for minibatch in stream.get_epoch_iterator():
            sentence_indices, image_indices, _, _, _ = minibatch
            
            sentence_indices = sentence_indices.flatten()
            image_indices    = image_indices.flatten()
            assert(len(sentence_indices) == len(image_indices))
            
            for image_idx, sentence_idx in zip(image_indices, sentence_indices):
                if image_idx not in image_idx_to_sentence_indices:
                    image_idx_to_sentence_indices[image_idx] = deque()
                
                image_idx_to_sentence_indices[image_idx].append(sentence_idx)
                sentence_idx_to_image_idx[sentence_idx] = image_idx
        
        return image_idx_to_sentence_indices, sentence_idx_to_image_idx
    
    def populate_dataset_metadata(self):
        # populate, for each data split, the mappings from
        # image idx -> sentence ids and vice-versa
        (self.training_set_image_idx_to_sentence_indices,
         self.training_set_sentence_idx_to_image_idx) = \
                self.__populate_split_metadata(self.__train_stream)
        self.training_set_n_unique_images    = len(self.training_set_image_idx_to_sentence_indices)
        self.training_set_n_unique_sentences = len(self.training_set_sentence_idx_to_image_idx)
        
        (self.valid_set_image_idx_to_sentence_indices,
         self.valid_set_sentence_idx_to_image_idx) = \
                self.__populate_split_metadata(self.__valid_stream)
        self.valid_set_n_unique_images    = len(self.valid_set_image_idx_to_sentence_indices)
        self.valid_set_n_unique_sentences = len(self.valid_set_sentence_idx_to_image_idx)
        
        (self.test_set_image_idx_to_sentence_indices,
         self.test_set_sentence_idx_to_image_idx) = \
                self.__populate_split_metadata(self.__test_stream)
        self.test_set_n_unique_images    = len(self.test_set_image_idx_to_sentence_indices)
        self.test_set_n_unique_sentences = len(self.test_set_sentence_idx_to_image_idx)
    
    def test(self):
        self.populate_dataset_metadata()
        
        logger.info("Generating image and sentence embeddings using trained network...")
        
        n_images_total = int(self.training_set_n_unique_images +
                             self.valid_set_n_unique_images +
                             self.test_set_n_unique_images)
        n_sentences_total = int(self.training_set_n_unique_sentences +
                                self.valid_set_n_unique_sentences +
                                self.test_set_n_unique_sentences)
        
        self.image_embeddings = np.zeros((n_images_total, self.multimodal_emb_dim),
                                         dtype=theano.config.floatX)
        self.sentence_embeddings = np.zeros((n_sentences_total, self.multimodal_emb_dim),
                                            dtype=theano.config.floatX)
        
        #logger.info("sentence_embeddings: %s" % str(np.shape(sentence_embeddings)))
        #logger.info("image_embeddings: %s" % str(np.shape(image_embeddings)))
        
        # dictionary with mappings between sentence id <-> sentence embedding id
        self.sentence_idx_to_sentence_embedding_idx = OrderedDict()
        self.sentence_embedding_idx_to_sentence_idx = OrderedDict()
        # dictionary with mappings between image id    <-> image embedding id
        self.image_idx_to_image_embedding_idx = OrderedDict()
        self.image_embedding_idx_to_image_idx = OrderedDict()
        
        # generate sentence and image embeddings for the training, validation
        # and test instances using the current network parameters
        for minibatch in chain(self.__train_stream.get_epoch_iterator(),
                               self.__valid_stream.get_epoch_iterator(),
                               self.__test_stream.get_epoch_iterator()):
            x_sent_idx, x_image_idx, x_sent, x_mask, x_image = minibatch
            
            x_sent_idx = x_sent_idx.flatten()
            x_image_idx = x_image_idx.flatten()
            
            # sentences ids <-> sentence embeddings ids mapping
            def map_sentences(idx):
                if idx in self.sentence_idx_to_sentence_embedding_idx:
                    return self.sentence_idx_to_sentence_embedding_idx[idx]
                else:
                    retval = len(self.sentence_idx_to_sentence_embedding_idx)
                    self.sentence_idx_to_sentence_embedding_idx[idx] = retval
                    self.sentence_embedding_idx_to_sentence_idx[retval] = idx
                    return retval
            
            minibatch_sentence_indices = x_sent_idx
            # (a list, not a lazy map, so it can be used for fancy indexing)
            sentence_embedding_indices = [map_sentences(idx)
                                          for idx in minibatch_sentence_indices]
            
            # images ids <-> image embeddings ids mapping
            def map_images(idx):
                if idx in self.image_idx_to_image_embedding_idx:
                    return self.image_idx_to_image_embedding_idx[idx]
                else:
                    retval = len(self.image_idx_to_image_embedding_idx)
                    self.image_idx_to_image_embedding_idx[idx] = retval
                    self.image_embedding_idx_to_image_idx[retval] = idx
                    return retval
            
            minibatch_image_indices = x_image_idx
            image_embedding_indices = [map_images(idx)
                                       for idx in minibatch_image_indices]
            
            #logger.info("")
            #logger.info("sentence_embedding_indices: %s" % str(sentence_embedding_indices))
            #logger.info("minibatch_sentence_indices: %s" % str(minibatch_sentence_indices))
            #logger.info("image_embedding_indices: %s" % str(image_embedding_indices))
            #logger.info("minibatch_image_indices: %s" % str(minibatch_image_indices))
            
            # generate this minibatch's sentence and image embeddings;
            # time as first dimension, minibatch as second dimension
            x_sent = np.swapaxes(x_sent, 0, 1)
            x_mask = np.swapaxes(x_mask, 0, 1)
            
            self.sentence_embeddings[sentence_embedding_indices] = self.sentence_proj.predict(x_sent)
            self.image_embeddings[image_embedding_indices]       = self.image_proj.predict(x_image)
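            # (the embedding matrices are indexed by the embedding ids assigned
            # above, so a repeated image id simply overwrites its own row)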
        
        logger.info("len(self.sentence_embedding_idx_to_sentence_idx): %i" %
                    len(self.sentence_embedding_idx_to_sentence_idx))
        logger.info("len(self.image_embedding_idx_to_image_idx): %i" %
                    len(self.image_embedding_idx_to_image_idx))
        
        assert(len(self.sentence_idx_to_sentence_embedding_idx) == n_sentences_total and
               len(self.image_idx_to_image_embedding_idx) == n_images_total)
        
        logger.info("Done!")
        
        sentences_given_image_mean_rank    = []
        sentences_given_image_recall_at_1  = []
        sentences_given_image_recall_at_5  = []
        sentences_given_image_recall_at_10 = []
        
        images_given_sentence_mean_rank    = []
        images_given_sentence_recall_at_1  = []
        images_given_sentence_recall_at_5  = []
        images_given_sentence_recall_at_10 = []
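        # evaluation protocol: R@K counts queries whose ground-truth item
        # appears in the top K of the ranking; the mean rank lists record,
        # per query, the 0-indexed position of the first correct item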
        
        # having the test sentences and images in the multimodal embedding, evaluate them
        for test_minibatch in self.__test_stream.get_epoch_iterator():
            x_sentt_idx, x_imaget_idx, x_sentt, x_maskt, x_imaget = test_minibatch
            
            x_sentt_idx = x_sentt_idx.flatten()
            x_imaget_idx = x_imaget_idx.flatten()
            
            # time as first dimension, minibatch as second dimension
            x_sentt = np.swapaxes(x_sentt, 0, 1)
            x_maskt = np.swapaxes(x_maskt, 0, 1)
            
            # process each sentence/image pair in the test set one by one
            for sent_idx, image_idx in zip(x_sentt_idx, x_imaget_idx):
                # obtain the indices of the other four sentences that
                # describe the same image as the current sentence
                current_image_idx = self.test_set_sentence_idx_to_image_idx[sent_idx]
                assert(current_image_idx == image_idx)
                
                similar_sentences_idx = self.test_set_image_idx_to_sentence_indices[image_idx]
                assert(sent_idx in similar_sentences_idx)
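                # (in Flickr8k each image is described by five sentences, so
                # `similar_sentences_idx` is expected to hold five ids,
                # including that of the current sentence)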
                
                #logger.info("sent_idx: %i, similar_sentences_ids: %s" % (sent_idx, similar_sentences_idx))
                
                # get distances/rankings for sentence `sent_idx`
                sentence_embedding_idx = self.sentence_idx_to_sentence_embedding_idx[sent_idx]
                this_sentence_embedding = self.sentence_embeddings[sentence_embedding_idx]
                distance_images_given_sentence, ranking_images_given_sentence = \
                        self.get_distances_from_image_embedding(this_sentence_embedding)
                
                # get distances/rankings for image `image_idx`
                image_embedding_idx = self.image_idx_to_image_embedding_idx[image_idx]
                this_image_embedding = self.image_embeddings[image_embedding_idx]
                distance_sentences_given_image, ranking_sentences_given_image = \
                        self.get_distances_from_sentence_embedding(this_image_embedding)
                
                # rankings are all computed on the embeddings IDs
                if image_embedding_idx in ranking_images_given_sentence[:1]:
                    images_given_sentence_recall_at_1.append( sentence_embedding_idx )
                if image_embedding_idx in ranking_images_given_sentence[:5]:
                    images_given_sentence_recall_at_5.append( sentence_embedding_idx )
                if image_embedding_idx in ranking_images_given_sentence[:10]:
                    images_given_sentence_recall_at_10.append( sentence_embedding_idx )
                
                if sentence_embedding_idx in ranking_sentences_given_image[:1]:
                    sentences_given_image_recall_at_1.append( image_embedding_idx )
                if sentence_embedding_idx in ranking_sentences_given_image[:5]:
                    sentences_given_image_recall_at_5.append( image_embedding_idx )
                if sentence_embedding_idx in ranking_sentences_given_image[:10]:
                    sentences_given_image_recall_at_10.append( image_embedding_idx )
                
                # mean rank: rankings are computed over embedding ids, while
                # the lookup dictionaries are built from the original
                # (minibatch) ids, hence the translation below.
                # images given sentence
                for counter, ranked_image_idx in enumerate(ranking_images_given_sentence):
                    if self.image_embedding_idx_to_image_idx[ranked_image_idx] == \
                            self.test_set_sentence_idx_to_image_idx[sent_idx]:
                        images_given_sentence_mean_rank.append(counter)
                        break
                # sentences given image: any sentence that describes this
                # image counts as a hit, not just the query sentence
                for counter, ranked_sentence_idx in enumerate(ranking_sentences_given_image):
                    if self.sentence_embedding_idx_to_sentence_idx[ranked_sentence_idx] in \
                            similar_sentences_idx:
                        sentences_given_image_mean_rank.append(counter)
                        break
            
            logger.info("minibatch %i-%i/%i " %
                        (x_sentt_idx[0], x_sentt_idx[-1], n_sentences_total))
            
            logger.info("[sentence-image %i-%i] images given sentence rank: %i/%i" %
                        (sent_idx, current_image_idx, 
                         np.asarray(images_given_sentence_medium_rank).mean(),
                         n_images_total))
            logger.info("[sentence-image %i-%i] sentences given image rank: %i/%i" %
                        (sent_idx, current_image_idx,
                         np.asarray(sentences_given_image_medium_rank).mean(),
                         n_sentences_total))
        
        logger.info("Final results:")
        logger.info("images given sentence medium rank: %i/%i -- %.2f%%" %
                    (np.asarray(images_given_sentence_medium_rank).mean(),
                     n_images_total,
                     (np.asarray(images_given_sentence_medium_rank).mean() / n_images_total)))
        logger.info("images given sentence R@1: %i" %
                    (len(images_given_sentence_recall_at_1)))
        logger.info("images given sentence R@5: %i" %
                    (len(images_given_sentence_recall_at_5)))
        logger.info("images given sentence R@10: %i" %
                    (len(images_given_sentence_recall_at_10)))
        
        logger.info("sentences given image medium rank: %i/%i -- %.2f%%" %
                    (np.asarray(sentences_given_image_medium_rank).mean(),
                     n_sentences_total,
                     (np.asarray(sentences_given_image_medium_rank).mean() / n_sentences_total)))
        logger.info("sentences given image R@1: %i" %
                    (len(sentences_given_image_recall_at_1)))
        logger.info("sentences given image R@5: %i" %
                    (len(sentences_given_image_recall_at_5)))
        logger.info("sentences given image R@10: %i" %
                    (len(sentences_given_image_recall_at_10)))
    
    
    def fit(self, train_set_size=None,
                  valid_set_size=None,
                  test_set_size=None,
                  minibatch_size=None,
                  validation_frequency=-1,
                  save_frequency=-1):
        assert(train_set_size is not None and minibatch_size is not None)
        
        if valid_set_size is None:
            valid_set_size = train_set_size
        
        # (adaptive) learning rate
        l_r = T.scalar('learning_rate', dtype=theano.config.floatX)
        # momentum
        mom = T.scalar('momentum', dtype=theano.config.floatX)
        
        # cost to be monitored (prior to regularisation):
        # mean squared error between predicted sentence and image vectors
        cost = T.mean((self.image_proj.y_pred - self.sentence_proj.last_h) ** 2)
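        # i.e. cost = mean, over both the minibatch and embedding dimensions,
        # of (image_proj.y_pred - sentence_proj.last_h)^2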
        
        # cost to be minimised
        # since we are dealing with ONLY positive instances, minimise the
        # mean squared difference between predicted sentence and image vectors
        reg_cost = cost + self.L1_reg * self.sentence_proj.L1 \
                        + self.L1_reg * self.image_proj.L1 \
                        + self.L2_reg * self.sentence_proj.L2_sqr \
                        + self.L2_reg * self.image_proj.L2_sqr
        reg_cost.name = 'cost_with_regularisation'
        
        # total loss is just the sum of image and sentence networks' losses
        total_loss = self.sentence_proj.loss(self.image_proj.y_pred) \
            + self.image_proj.loss(self.sentence_proj.last_h)
        
        # compute loss given minibatch
        compute_loss = theano.function(inputs=[self.x_sentence,
                                               self.x_sentence_mask,
                                               self.x_image],
                                       outputs=total_loss)
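
        # note: `total_loss` (monitored here) and `cost` (optimised below) are
        # related but distinct quantities: the former sums the two networks'
        # own losses, the latter is the single MSE tying the projections together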
        
        # update parameters
        # compute the gradient of cost with respect to model parameters
        # gradients on the weights using BPTT
        gparams_sentence = []
        gparams_image = []
        # text rnn
        for param in self.sentence_proj.params:
            gparam = T.grad(cost, param)
            gparams_sentence.append(gparam)
        # image nn
        for param in self.image_proj.params:
            gparam = T.grad(cost, param)
            gparams_image.append(gparam)
        
        updates = OrderedDict()
        # text rnn
        for param, gparam in zip(self.sentence_proj.params, gparams_sentence):
            weight_update = self.sentence_proj.updates[param]
            upd = mom * weight_update - l_r * gparam
            updates[weight_update] = upd
            updates[param] = param + upd
        # image nn
        for param, gparam in zip(self.image_proj.params, gparams_image):
            weight_update = self.image_proj.updates[param]
            upd = mom * weight_update - l_r * gparam
            updates[weight_update] = upd
            updates[param] = param + upd
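
        # i.e. classical momentum: for each parameter w with velocity
        # buffer v (its `updates` entry),
        #   v <- mom * v - l_r * dC/dw
        #   w <- w + v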
        
        # compute cost given minibatch and update model parameters
        train_model = theano.function(
            inputs=[self.x_sentence,
                    self.x_sentence_mask,
                    self.x_image,
                    l_r,
                    mom],
            outputs=reg_cost,
            updates=updates,
            on_unused_input='warn')
        
        n_train_batches = train_set_size // minibatch_size
        
        # go through this many minibatches before checking the network
        # on the validation set; in this case we check every epoch
        if validation_frequency == -1:
            validation_frequency = n_train_batches
            #validation_frequency = min(n_train_batches, patience / 2)
        
        # save the model every 5 epochs
        if save_frequency == -1:
            save_frequency = n_train_batches * 5
            #save_frequency = min(n_train_batches, patience / 2)
        
        logger.info("validation frequency: %i" % validation_frequency)
        logger.info("save frequency: %i" % save_frequency)
        logger.info("training set size: %i" % train_set_size)
        logger.info("number of training batches: %i" % n_train_batches)
        logger.info("minibatch size: %i" % minibatch_size)
        
        best_validation_loss = np.inf
        bad_counter = 0  # early-stopping strike counter
        early_stop = False
        
        logger.info("Training...")
        
        # some variables used for model analysis
        for epoch in range(self.last_epoch+1, self.n_epochs):
            # iterate train set
            for n_iterations, minibatch in enumerate(self.__train_stream.get_epoch_iterator()):
                x_sent_idx, x_image_idx, x_sent, x_mask, x_image = minibatch # unpack minibatch
                
                x_sent_idx = x_sent_idx.flatten()
                x_image_idx = x_image_idx.flatten()
                
                #logger.info("x_sent_idx: %s" % (str(x_sent_idx)))
                #logger.info("x_image_idx: %s" % (str(x_image_idx)))
                
                effective_momentum = self.final_momentum \
                                   if epoch > self.momentum_switchover \
                                   else self.initial_momentum
                
                # swap dimensions to put time as dimension zero,
                # minibatch as dimension one
                x_sent = np.swapaxes(x_sent, 0, 1)
                x_mask = np.swapaxes(x_mask, 0, 1)
                
                # train on minibatch and update model
                minibatch_regularised_cost = train_model(
                    x_sent, x_mask, x_image,
                    self.learning_rate, effective_momentum)
                
                # global iteration number (how many weight updates have we
                # made?); n_iterations becomes 1-indexed here
                n_iterations += 1
                iter = epoch * n_train_batches + n_iterations
                
                if iter % validation_frequency == 0:
                    valid_losses = []
                    # iterate validation minibatch
                    for valid_minibatch in self.__valid_stream.get_epoch_iterator():
                        _, _, x_sentv, x_maskv, x_imagev = valid_minibatch
                        
                        x_sentv = np.swapaxes(x_sentv, 0, 1)
                        x_maskv = np.swapaxes(x_maskv, 0, 1)
                        
                        this_valid_loss = compute_loss(x_sentv, x_maskv, x_imagev)
                        valid_losses.append(this_valid_loss)
                    this_valid_loss = np.mean(valid_losses)
                    
                    # update best validation loss for early stopping
                    if this_valid_loss < best_validation_loss:
                        # increase patience if the loss improvement is good enough
                        if this_valid_loss < best_validation_loss * self.improvement_threshold:
                            self.patience = max(self.patience,
                                                int(iter * self.patience_increase))
                            #logger.info("new patience: %i" % self.patience)
                        
                        best_validation_loss = this_valid_loss
                        bad_counter = 0
                    
                    # evaluate on test set
                    if test_set_size is not None:
                        test_losses = []
                        # iterate test minibatch
                        for test_minibatch in self.__test_stream.get_epoch_iterator():
                            _, _, x_sentt, x_maskt, x_imaget = test_minibatch
                            
                            x_sentt = np.swapaxes(x_sentt, 0, 1)
                            x_maskt = np.swapaxes(x_maskt, 0, 1)
                            
                            this_test_loss = compute_loss(x_sentt, x_maskt, x_imaget)
                            test_losses.append(this_test_loss)
                        this_test_loss = np.mean(test_losses)
                        self.all_test_losses.append(this_test_loss)
                    
                        logger.info('epoch %i, minibatch %i/%i, iter %i, train loss %f, '
                                    'valid loss %f, test loss %f, patience: %i, lr: %f' %
                                    (epoch, n_iterations*minibatch_size, train_set_size,
                                     iter, minibatch_regularised_cost, this_valid_loss,
                                     this_test_loss, self.patience, self.learning_rate))
                    else:
                        logger.info('epoch %i, minibatch %i/%i, iter %i, train loss %f, '
                                    'valid loss %f, patience: %i, lr: %f' %
                                    (epoch, n_iterations*minibatch_size, train_set_size,
                                     iter, minibatch_regularised_cost, this_valid_loss,
                                     self.patience, self.learning_rate))
                    
                    self.all_train_losses.append(minibatch_regularised_cost)
                    self.all_valid_losses.append(this_valid_loss)
                    self.time_measures.append(time.time())
                    self.clock_measures.append(time.clock())
                    
                    #logger.info("len(all_valid_losses): %i"%len(self.all_valid_losses))
                    #logger.info("patience: %i"%patience)
                    
                    # early stopping: if the current validation loss is no
                    # better than the best loss recorded more than `patience`
                    # validations ago, count a strike; give up once the strike
                    # count exceeds `patience`
                    if len(self.all_valid_losses) > self.patience and \
                            this_valid_loss >= \
                            np.array(self.all_valid_losses)[:-self.patience].min():
                        bad_counter += 1
                        logger.info('Bad counter increase (to %d)' % bad_counter)
                        if bad_counter > self.patience:
                            logger.info('Early Stop!')
                            early_stop = True
                            break
                    
                    self.__valid_stream.reset()
                    self.__test_stream.reset()
                
                if iter % save_frequency == 0:
                    self.last_epoch = epoch
                    self.save()
                    
            
            # finish after this many updates
            if iter >= self.finish_after:
                logger.info('Finishing after %d iterations!' % iter)
                early_stop = True
            
            if early_stop:
                break
            
            self.learning_rate *= self.learning_rate_decay
            self.__train_stream.reset()
        
        # plot results of model applied on training/valid/test sets
        # currently not using CPU time from time.clock()
        plot_losses_vs_time(train_losses=self.all_train_losses,
                            valid_losses=self.all_valid_losses,
                            test_losses=self.all_test_losses,
                            time_measures=self.time_measures,
                            time_label='Time (secs)')
        logger.info("last epoch: %i, n_epochs: %i" % (self.last_epoch, self.n_epochs))