def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']
        # Pretrained image model
        self.inception = models.inception_v3(pretrained=True)
        set_parameter_requires_grad(model=self.inception,
                                    feature_extracting=True)
        # Handle the auxiliary net
        self.num_ftrs = self.inception.AuxLogits.fc.in_features
        self.inception.AuxLogits.fc = nn.Linear(self.num_ftrs,
                                                self.num_classes)
        # Handle the primary net
        self.num_ftrs = self.inception.fc.in_features
        # dim: 2048
        print('self.num_ftrs: {}'.format(self.num_ftrs))
        self.inception.fc = nn.Linear(self.num_ftrs, self.num_classes)
        # Return features before fc layer.
        self.inception.fc = nn.Identity()
        # print('self.inception:\n{}'.format(self.inception))
        self.image_size = 299

        # Text model
        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']  # 50

        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        # print('self.infersent:\n{}'.format(self.infersent))

        self.encode_dim = 4096

        # Acc: 0.41, best over 20 epochs

        # Reduce the feature length of the image and text embeddings
        self.img_f_dim = 512
        self.text_emb_dim = 512

        self.fc_img = nn.Linear(self.num_ftrs, self.img_f_dim, bias=False)
        self.fc_text = nn.Linear(self.encode_dim,
                                 self.text_emb_dim,
                                 bias=False)

        self.fc1 = nn.Linear((self.img_f_dim + 1) * (self.text_emb_dim + 1),
                             128)
        # self.fc2 = nn.Linear(128, 128)
        self.out_f = nn.Linear(128, self.num_classes)
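A quick sanity check of the fusion-layer input size that fc1 above is sized for (my own sketch, not part of the original example): with img_f_dim = text_emb_dim = 512, the flattened outer product has (512 + 1) * (512 + 1) = 263169 features.

# Hedged sketch (not from the original repo): verify the fusion-layer input size.
# The +1 accounts for the constant 1 appended to each modality before the outer product.
import torch

img_f_dim, text_emb_dim, batch_size = 512, 512, 4
img_f = torch.cat((torch.ones(batch_size, 1), torch.randn(batch_size, img_f_dim)), dim=1)
text_emb = torch.cat((torch.ones(batch_size, 1), torch.randn(batch_size, text_emb_dim)), dim=1)
fusion = torch.bmm(img_f.unsqueeze(2), text_emb.unsqueeze(1))          # (4, 513, 513)
flat = fusion.view(batch_size, (img_f_dim + 1) * (text_emb_dim + 1))   # (4, 263169)
assert flat.shape[1] == 263169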
Example #2
class ShallownetGloveModel(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']  # 64
        self.image_model = ShallowNet()
        self.image_size = 224

        # Text model
        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']  # 50

        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        print('self.infersent:\n{}'.format(self.infersent))

        # Fully connected layers
        self.shallownet_output_dim = 120
        self.fc_size = 120
        self.encode_dim = 4096
        self.fc1 = nn.Linear(in_features=self.encode_dim,
                             out_features=self.fc_size)
        self.fc1_bn = nn.BatchNorm1d(num_features=self.fc_size)
        self.fc2 = nn.Linear(in_features=self.fc_size +
                             self.shallownet_output_dim,
                             out_features=168)
        self.fc2_bn = nn.BatchNorm1d(num_features=168)
        self.fc3 = nn.Linear(in_features=168, out_features=self.num_classes)
        # print('self.fc1:\n{}'.format(self.fc1))
        # print('self.fc2:\n{}'.format(self.fc2))

    def forward(self, image_batch, text_batch):
        image_features = self.image_model(image_batch)
        embeddings = self.infersent.encode(text_batch,
                                           bsize=self.batch_size,
                                           tokenize=False,
                                           verbose=False)
        embeddings = torch.FloatTensor(embeddings)
        text_features = F.relu(self.fc1_bn(self.fc1(embeddings)))
        # print('image_features.size(): {}, text_features.size(): {}'.format(
        #     image_features.size(), text_features.size()))
        concat_features = torch.cat((image_features, text_features), dim=1)
        # print('concat_features.size(): {}'.format(concat_features.size()))
        x = F.relu(self.fc2_bn(self.fc2(concat_features)))
        x = self.fc3(x)
        x = F.softmax(x, dim=1)
        return x
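A minimal usage sketch for ShallownetGloveModel; the kwarg values and the 120-dim ShallowNet output are assumptions on my part, not taken from the original project.

# Hedged usage sketch; kwarg values below are illustrative only.
import torch

model = ShallownetGloveModel(num_classes=3,
                             batch_size=4,
                             vocab_size=100000,
                             embedding_dim=300)
model.eval()  # BatchNorm layers need eval() (or a batch > 1) at inference time
images = torch.randn(4, 3, model.image_size, model.image_size)
texts = ['an example meme caption'] * 4
with torch.no_grad():
    probs = model(images, texts)  # (4, num_classes); rows sum to 1 after softmax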
Example #3
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']  # 64
        self.image_model = ShallowNet()
        self.image_size = 224

        # Text model
        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']  # 50

        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        print('self.infersent:\n{}'.format(self.infersent))

        # Fully connected layers
        self.shallownet_output_dim = 120
        self.fc_size = 120
        self.encode_dim = 4096
        self.fc1 = nn.Linear(in_features=self.encode_dim,
                             out_features=self.fc_size)
        self.fc1_bn = nn.BatchNorm1d(num_features=self.fc_size)
        self.fc2 = nn.Linear(in_features=self.fc_size +
                             self.shallownet_output_dim,
                             out_features=168)
        self.fc2_bn = nn.BatchNorm1d(num_features=168)
        self.fc3 = nn.Linear(in_features=168, out_features=self.num_classes)
Example #4
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']  # 64
        # Pretrained image model
        self.inception = models.googlenet(pretrained=True, aux_logits=True)
        # set_parameter_requires_grad(model=self.inception, feature_extracting=True)
        # Handle the auxiliary net
        self.num_ftrs = self.inception.aux1.fc2.in_features
        self.inception.aux1.fc2 = nn.Linear(self.num_ftrs, self.num_classes)
        self.num_ftrs = self.inception.aux2.fc2.in_features
        self.inception.aux2.fc2 = nn.Linear(self.num_ftrs, self.num_classes)
        # Handle the primary net
        self.num_ftrs = self.inception.fc.in_features
        print('self.num_ftrs: {}'.format(self.num_ftrs))
        self.inception.fc = nn.Linear(self.num_ftrs, self.num_classes)
        # Return features before fc layer.
        self.inception.fc = nn.Identity()
        # print('self.inception:\n{}'.format(self.inception))
        self.image_size = 224

        # Text model
        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']  # 50
        # self.glove_embedding = kwargs['glove_embedding']
        # # self.word_embedding = nn.Embedding(num_embeddings=self.vocab_size,
        # #   embedding_dim=self.embedding_dim)
        # # self.word_embedding.load_state_dict({'weight': self.glove_embedding})
        # self.word_embedding = nn.Embedding.from_pretrained(
        #   embeddings=self.glove_embedding)

        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        print('self.infersent:\n{}'.format(self.infersent))

        # LSTM
        self.hidden_size = 1024
        # self.lstm = nn.LSTM(input_size=4096,
        #   hidden_size=self.hidden_size, num_layers=1)
        # print('self.lstm:\n{}'.format(self.lstm))

        # Fully connected layers
        self.fc_size = 512
        self.encode_dim = 4096
        self.fc1 = nn.Linear(in_features=self.num_ftrs + self.encode_dim,
                             out_features=self.fc_size)
        self.fc1_bn = nn.BatchNorm1d(num_features=self.fc_size)
        self.fc2 = nn.Linear(in_features=self.fc_size,
                             out_features=self.num_classes)
        print('self.fc1:\n{}'.format(self.fc1))
        print('self.fc2:\n{}'.format(self.fc2))
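A small check of where fc1's input width comes from (my own note, not part of the original example): torchvision's GoogLeNet exposes a 1024-dim feature before its fc layer, so concatenating it with the 4096-dim InferSent embedding gives 1024 + 4096 = 5120 inputs.

# Hedged sketch: confirm the image-feature width assumed by fc1 above.
from torchvision import models

backbone = models.googlenet(pretrained=False, aux_logits=True)  # weights not needed just to read the size
assert backbone.fc.in_features == 1024
fc1_in_features = backbone.fc.in_features + 4096  # 5120, matching fc1 above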
Example #5
class DeepSentimentVanillaModel(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']  # 64
        # Pretrained image model
        self.inception = models.googlenet(pretrained=True, aux_logits=True)
        # set_parameter_requires_grad(model=self.inception, feature_extracting=True)
        # Handle the auxiliary net
        self.num_ftrs = self.inception.aux1.fc2.in_features
        self.inception.aux1.fc2 = nn.Linear(self.num_ftrs, self.num_classes)
        self.num_ftrs = self.inception.aux2.fc2.in_features
        self.inception.aux2.fc2 = nn.Linear(self.num_ftrs, self.num_classes)
        # Handle the primary net
        self.num_ftrs = self.inception.fc.in_features
        print('self.num_ftrs: {}'.format(self.num_ftrs))
        self.inception.fc = nn.Linear(self.num_ftrs, self.num_classes)
        # Return features before fc layer.
        self.inception.fc = nn.Identity()
        # print('self.inception:\n{}'.format(self.inception))
        self.image_size = 224

        # Text model
        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']  # 50
        # self.glove_embedding = kwargs['glove_embedding']
        # # self.word_embedding = nn.Embedding(num_embeddings=self.vocab_size,
        # #   embedding_dim=self.embedding_dim)
        # # self.word_embedding.load_state_dict({'weight': self.glove_embedding})
        # self.word_embedding = nn.Embedding.from_pretrained(
        #   embeddings=self.glove_embedding)

        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        print('self.infersent:\n{}'.format(self.infersent))

        # LSTM
        self.hidden_size = 1024
        # self.lstm = nn.LSTM(input_size=4096,
        #   hidden_size=self.hidden_size, num_layers=1)
        # print('self.lstm:\n{}'.format(self.lstm))

        # Fully connected layers
        self.fc_size = 512
        self.encode_dim = 4096
        self.fc1 = nn.Linear(in_features=self.num_ftrs + self.encode_dim,
                             out_features=self.fc_size)
        self.fc1_bn = nn.BatchNorm1d(num_features=self.fc_size)
        self.fc2 = nn.Linear(in_features=self.fc_size,
                             out_features=self.num_classes)
        print('self.fc1:\n{}'.format(self.fc1))
        print('self.fc2:\n{}'.format(self.fc2))

    def forward(self, image_batch, text_batch):
        # image_batch = sample_batch['image']
        if self.inception.training:
            image_features = self.inception(image_batch)[0]
        else:
            image_features = self.inception(image_batch)
        # image_features, _ = self.inception(image_batch)
        # print('image_features.size(): {}'.format(image_features.size()))

        # ocr_text_batch = sample_batch['ocr_extracted_text']
        # corrected_text_batch = sample_batch['corrected_text']
        # print('ocr_text_batch: {}\n'.format(ocr_text_batch))
        # print('corrected_text_batch: {}\n'.format(corrected_text_batch))
        # while 'nan' in corrected_text_batch:
        #   nan_idx = corrected_text_batch.index('nan')
        #   corrected_text_batch[nan_idx] = ocr_text_batch[nan_idx]
        # print('corrected_text_batch: {}\n'.format(corrected_text_batch))
        # numpy array with n vectors of dimension 4096
        embeddings = self.infersent.encode(text_batch,
                                           bsize=self.batch_size,
                                           tokenize=False,
                                           verbose=False)
        embeddings = torch.FloatTensor(embeddings)
        # print('embeddings.size(): {}'.format(embeddings.size()))
        # h_0 = c_0 = torch.zeros(1, self.batch_size, self.hidden_size)
        # print('h_0.size(): {}'.format(h_0.size()))
        # text_features, (h_n, c_n) = self.lstm(embeddings, (h_0, c_0))
        # print('text_features.size(): {}'.format(text_features.size()))

        # Concatenate image and text features
        concat_features = torch.cat((image_features, embeddings), dim=1)
        # print('concat_features.size(): {}'.format(concat_features.size()))
        x = F.relu(self.fc1_bn(self.fc1(concat_features)))
        # print('x.size(): {}'.format(x.size()))
        x = self.fc2(x)
        # print('x.size(): {}'.format(x.size()))
        x = F.softmax(x, dim=1)
        return x

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)
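Since DeepSentimentVanillaModel.forward already applies softmax, a matching training objective would take log-probabilities rather than raw logits; a minimal sketch of that pairing (my own, not from the original training code):

# Hedged sketch: pair the softmax output above with NLLLoss on log-probabilities.
import torch
import torch.nn as nn

criterion = nn.NLLLoss()
probs = torch.full((4, 3), 1.0 / 3)                   # stand-in for model output: 4 samples, 3 classes
targets = torch.tensor([0, 2, 1, 1])
loss = criterion(torch.log(probs + 1e-12), targets)   # small eps guards against log(0)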
Example #6
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']

        self.dropout = nn.Dropout(0.5)

        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.img_batch_norm1 = nn.BatchNorm2d(16)

        self.fc1 = nn.Linear(16 * 61 * 61, 1024)
        self.img_batch_norm2 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.img_batch_norm3 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, 3)

        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']

        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        print('self.infersent:\n{}'.format(self.infersent))

        self.fc_size = 512
        self.encode_dim = 4096

        self.fc_text1 = nn.Linear(in_features=self.encode_dim,
                                  out_features=self.fc_size)

        self.batch_norm1 = nn.BatchNorm1d(self.fc_size)

        self.fc_text2 = nn.Linear(in_features=self.fc_size, out_features=512)

        self.batch_norm2 = nn.BatchNorm1d(512)

        self.fc_text3 = nn.Linear(in_features=512,
                                  out_features=self.num_classes)

        # Output layer for the concatenated features
        self.conc_fc = nn.Linear(in_features=self.num_classes +
                                 self.num_classes,
                                 out_features=self.num_classes)

        self.num_att_head = kwargs['att_head_num']

        self.W_att = nn.Linear(in_features=self.num_classes,
                               out_features=self.num_att_head,
                               bias=False)
Example #7
class CNNGloveAttentionModel(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']

        self.dropout = nn.Dropout(0.5)

        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.img_batch_norm1 = nn.BatchNorm2d(16)

        self.fc1 = nn.Linear(16 * 61 * 61, 1024)
        self.img_batch_norm2 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.img_batch_norm3 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, 3)

        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']

        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        print('self.infersent:\n{}'.format(self.infersent))

        self.fc_size = 512
        self.encode_dim = 4096

        self.fc_text1 = nn.Linear(in_features=self.encode_dim,
                                  out_features=self.fc_size)

        self.batch_norm1 = nn.BatchNorm1d(self.fc_size)

        self.fc_text2 = nn.Linear(in_features=self.fc_size, out_features=512)

        self.batch_norm2 = nn.BatchNorm1d(512)

        self.fc_text3 = nn.Linear(in_features=512,
                                  out_features=self.num_classes)

        # Output layer for the concatenated features
        self.conc_fc = nn.Linear(in_features=self.num_classes +
                                 self.num_classes,
                                 out_features=self.num_classes)

        self.num_att_head = kwargs['att_head_num']

        self.W_att = nn.Linear(in_features=self.num_classes,
                               out_features=self.num_att_head,
                               bias=False)

    def forward(self, img_batch, text_batch):
        img_batch = F.relu(self.pool(self.conv1(img_batch)))
        img_batch = F.relu(
            self.pool(self.img_batch_norm1(self.conv2(img_batch))))

        img_batch = img_batch.view(-1, 16 * 61 * 61)

        img_batch = F.relu(self.img_batch_norm2(self.fc1(img_batch)))
        img_batch = F.relu(self.img_batch_norm3(self.fc2(img_batch)))

        # Predicted class probabilities for the images
        img_pred_output = F.softmax(F.relu(self.fc3(img_batch)), dim=1)

        embeddings = self.infersent.encode(text_batch,
                                           bsize=self.batch_size,
                                           tokenize=False,
                                           verbose=False)
        embeddings = torch.FloatTensor(embeddings)

        embeddings = F.relu(self.batch_norm1(self.fc_text1(embeddings)))
        embeddings = F.relu(self.batch_norm2(self.fc_text2(embeddings)))

        # Predicted class probabilities for the text
        text_pred_output = F.softmax(F.relu(self.fc_text3(embeddings)), dim=1)

        # Feature fusion, concatenate image and text features
        concat_features = torch.cat((img_pred_output, text_pred_output), dim=1)

        concat_pred_output = F.softmax(F.relu(self.conc_fc(concat_features)),
                                       dim=1)

        # Self-attention to fuse the output decisions from the image, text and concatenated modalities
        img_pred_output_view = img_pred_output.view(img_pred_output.size()[0],
                                                    1,
                                                    img_pred_output.size()[1])
        text_pred_output_view = text_pred_output.view(
            text_pred_output.size()[0], 1,
            text_pred_output.size()[1])
        concat_pred_output_view = concat_pred_output.view(
            concat_pred_output.size()[0], 1,
            concat_pred_output.size()[1])

        # combined_img_text_concat = torch.cat((img_pred_output_view, text_pred_output_view), dim=1)
        combined_img_text_concat = torch.cat(
            (img_pred_output_view, text_pred_output_view,
             concat_pred_output_view),
            dim=1)

        att_img_text_concat = F.softmax(F.relu(
            self.W_att(combined_img_text_concat)),
                                        dim=1)

        att_img_text_concat = att_img_text_concat.transpose(1, 2)

        weighted_img_text_concat_pred = att_img_text_concat @ combined_img_text_concat

        avg_weighted_img_text_concat_pred = torch.sum(
            weighted_img_text_concat_pred, 1) / self.num_att_head

        avg_weighted_img_text_concat_pred = avg_weighted_img_text_concat_pred.view(
            len(text_batch), self.num_classes)

        return avg_weighted_img_text_concat_pred
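A shape walk-through of the attention fusion above, written as a standalone sketch with illustrative sizes (B samples, C classes, H attention heads); not part of the original example.

# Hedged sketch: the three per-modality predictions are stacked, scored per head,
# softmax-normalised over the modalities, and averaged over the heads.
import torch
import torch.nn as nn
import torch.nn.functional as F

B, C, H = 4, 3, 2
combined = torch.rand(B, 3, C)                       # image / text / concat predictions
W_att = nn.Linear(C, H, bias=False)
att = F.softmax(F.relu(W_att(combined)), dim=1)      # (B, 3, H), weights over modalities
fused = (att.transpose(1, 2) @ combined).sum(1) / H  # (B, C)
assert fused.shape == (B, C)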
class DeepSentimentFusionModel(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']
        # Pretrained image model
        self.inception = models.inception_v3(pretrained=True)
        set_parameter_requires_grad(model=self.inception,
                                    feature_extracting=True)
        # Handle the auxiliary net
        self.num_ftrs = self.inception.AuxLogits.fc.in_features
        self.inception.AuxLogits.fc = nn.Linear(self.num_ftrs,
                                                self.num_classes)
        # Handle the primary net
        self.num_ftrs = self.inception.fc.in_features
        # dim: 2048
        print('self.num_ftrs: {}'.format(self.num_ftrs))
        self.inception.fc = nn.Linear(self.num_ftrs, self.num_classes)
        # Return features before fc layer.
        self.inception.fc = nn.Identity()
        # print('self.inception:\n{}'.format(self.inception))
        self.image_size = 299

        # Text model
        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']  # 50

        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        # print('self.infersent:\n{}'.format(self.infersent))

        self.encode_dim = 4096

        # Acc: 0.41, best over 20 epochs

        # Reduce the feature length of the image and text embeddings
        self.img_f_dim = 512
        self.text_emb_dim = 512

        self.fc_img = nn.Linear(self.num_ftrs, self.img_f_dim, bias=False)
        self.fc_text = nn.Linear(self.encode_dim,
                                 self.text_emb_dim,
                                 bias=False)

        self.fc1 = nn.Linear((self.img_f_dim + 1) * (self.text_emb_dim + 1),
                             128)
        # self.fc2 = nn.Linear(128, 128)
        self.out_f = nn.Linear(128, self.num_classes)

    def forward(self, image_batch, text_batch):
        # image_batch = sample_batch['image']
        if self.inception.training:
            image_features = self.inception(image_batch)[0]
        else:
            image_features = self.inception(image_batch)

        embeddings = self.infersent.encode(text_batch,
                                           bsize=self.batch_size,
                                           tokenize=False,
                                           verbose=True)
        embeddings = torch.FloatTensor(embeddings)

        image_features = self.fc_img(image_features)
        embeddings = self.fc_text(embeddings)

        # Tensor Fusion Layer: compute the outer product of img and text embs
        # https://github.com/Justin1904/TensorFusionNetworks/blob/master/model.py
        # Constant-1 columns for the Tensor Fusion outer product, sized from the
        # actual batch so the last (possibly smaller) batch also works; plain
        # tensors replace the deprecated Variable wrapper.
        img_f = torch.cat(
            (image_features.new_ones(image_features.size(0), 1), image_features),
            dim=1)
        text_emb = torch.cat(
            (embeddings.new_ones(embeddings.size(0), 1), embeddings), dim=1)
        # Dim: batch_size x (img_f_dim + 1) x (text_emb_dim + 1) = batch_size x 513 x 513
        fusion_tensor = torch.bmm(img_f.unsqueeze(2), text_emb.unsqueeze(1))

        # Flatten
        fusion_tensor = fusion_tensor.view(-1, (image_features.shape[1] + 1) *
                                           (embeddings.shape[1] + 1))

        x = F.relu(self.fc1(fusion_tensor))
        # x = F.relu(self.fc2(x))
        x = F.relu(self.out_f(x))
        x = F.softmax(x, dim=1)

        return x
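A minimal end-to-end usage sketch for DeepSentimentFusionModel; the kwarg values are illustrative assumptions, not taken from the original project.

# Hedged usage sketch; kwarg values below are illustrative only.
import torch

model = DeepSentimentFusionModel(num_classes=3,
                                 batch_size=4,
                                 vocab_size=100000,
                                 embedding_dim=300)
model.eval()
images = torch.randn(4, 3, model.image_size, model.image_size)  # inception_v3 expects 299x299 inputs
texts = ['an example meme caption'] * 4
with torch.no_grad():
    probs = model(images, texts)  # (4, num_classes)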