import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

# `ShallowNet`, `InferSent` and `set_parameter_requires_grad` are assumed to
# be defined or imported elsewhere in this repo (InferSent is the sentence
# encoder from https://github.com/facebookresearch/InferSent).


class ShallownetGloveModel(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']  # 64

        # Image model
        self.image_model = ShallowNet()
        self.image_size = 224

        # Text model
        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']  # 50
        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        print('self.infersent:\n{}'.format(self.infersent))

        # Fully connected layers
        self.shallownet_output_dim = 120
        self.fc_size = 120
        self.encode_dim = 4096  # InferSent sentence embedding dimension
        self.fc1 = nn.Linear(in_features=self.encode_dim,
                             out_features=self.fc_size)
        self.fc1_bn = nn.BatchNorm1d(num_features=self.fc_size)
        self.fc2 = nn.Linear(in_features=self.fc_size +
                             self.shallownet_output_dim,
                             out_features=168)
        self.fc2_bn = nn.BatchNorm1d(num_features=168)
        self.fc3 = nn.Linear(in_features=168, out_features=self.num_classes)

    def forward(self, image_batch, text_batch):
        image_features = self.image_model(image_batch)

        # InferSent returns a numpy array of 4096-d sentence vectors.
        embeddings = self.infersent.encode(text_batch,
                                           bsize=self.batch_size,
                                           tokenize=False,
                                           verbose=False)
        embeddings = torch.FloatTensor(embeddings)
        text_features = F.relu(self.fc1_bn(self.fc1(embeddings)))

        # Concatenate image and text features
        concat_features = torch.cat((image_features, text_features), dim=1)
        x = F.relu(self.fc2_bn(self.fc2(concat_features)))
        x = self.fc3(x)
        x = F.softmax(x, dim=1)
        return x
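
# A minimal, self-contained sketch of the fusion path in
# ShallownetGloveModel.forward(), using random tensors in place of real
# ShallowNet features and InferSent embeddings. The layer sizes mirror the
# constants above; the batch norms are omitted for brevity. This helper is
# illustrative only and not part of the original model.
def _demo_shallownet_glove_fusion(batch_size=8, num_classes=3):
    image_features = torch.randn(batch_size, 120)  # assumed ShallowNet output
    embeddings = torch.randn(batch_size, 4096)     # InferSent vectors are 4096-d

    fc1 = nn.Linear(4096, 120)
    fc2 = nn.Linear(120 + 120, 168)
    fc3 = nn.Linear(168, num_classes)

    text_features = F.relu(fc1(embeddings))
    concat_features = torch.cat((image_features, text_features), dim=1)  # (B, 240)
    probs = F.softmax(fc3(F.relu(fc2(concat_features))), dim=1)          # (B, num_classes)
    return probs.shape  # e.g. torch.Size([8, 3])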

class DeepSentimentVanillaModel(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']  # 64

        # Pretrained image model
        self.inception = models.googlenet(pretrained=True, aux_logits=True)
        # set_parameter_requires_grad(model=self.inception,
        #                             feature_extracting=True)

        # Handle the auxiliary nets
        self.num_ftrs = self.inception.aux1.fc2.in_features
        self.inception.aux1.fc2 = nn.Linear(self.num_ftrs, self.num_classes)
        self.num_ftrs = self.inception.aux2.fc2.in_features
        self.inception.aux2.fc2 = nn.Linear(self.num_ftrs, self.num_classes)

        # Handle the primary net: replace the classifier with an identity so
        # the network returns the pooled features before the fc layer.
        self.num_ftrs = self.inception.fc.in_features  # dim: 1024
        print('self.num_ftrs: {}'.format(self.num_ftrs))
        self.inception.fc = nn.Identity()
        self.image_size = 224

        # Text model
        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']  # 50
        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        print('self.infersent:\n{}'.format(self.infersent))

        # LSTM over the sentence embeddings (currently unused)
        self.hidden_size = 1024
        # self.lstm = nn.LSTM(input_size=4096,
        #                     hidden_size=self.hidden_size, num_layers=1)

        # Fully connected layers
        self.fc_size = 512
        self.encode_dim = 4096
        self.fc1 = nn.Linear(in_features=self.num_ftrs + self.encode_dim,
                             out_features=self.fc_size)
        self.fc1_bn = nn.BatchNorm1d(num_features=self.fc_size)
        self.fc2 = nn.Linear(in_features=self.fc_size,
                             out_features=self.num_classes)
        print('self.fc1:\n{}'.format(self.fc1))
        print('self.fc2:\n{}'.format(self.fc2))

    def forward(self, image_batch, text_batch):
        # In train mode GoogLeNet returns (logits, aux_logits2, aux_logits1);
        # in eval mode it returns a plain tensor.
        if self.inception.training:
            image_features = self.inception(image_batch)[0]
        else:
            image_features = self.inception(image_batch)

        # InferSent returns a numpy array of n vectors of dimension 4096.
        embeddings = self.infersent.encode(text_batch,
                                           bsize=self.batch_size,
                                           tokenize=False,
                                           verbose=False)
        embeddings = torch.FloatTensor(embeddings)
        # text_features, (h_n, c_n) = self.lstm(embeddings, (h_0, c_0))

        # Concatenate image and text features
        concat_features = torch.cat((image_features, embeddings), dim=1)
        x = F.relu(self.fc1_bn(self.fc1(concat_features)))
        x = self.fc2(x)
        x = F.softmax(x, dim=1)
        return x

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)
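
# Why DeepSentimentVanillaModel.forward() branches on
# `self.inception.training`: with aux_logits=True, torchvision's GoogLeNet
# returns a (logits, aux_logits2, aux_logits1) namedtuple in train mode but a
# plain tensor in eval mode. A minimal check (illustrative helper, not part
# of the original model; downloads the pretrained weights on first use):
def _demo_googlenet_outputs():
    net = models.googlenet(pretrained=True, aux_logits=True)
    net.fc = nn.Identity()  # expose the 1024-d pooled features
    x = torch.randn(2, 3, 224, 224)

    net.train()
    out = net(x)
    print(type(out))         # GoogLeNetOutputs namedtuple; out[0] is the main branch

    net.eval()
    with torch.no_grad():
        print(net(x).shape)  # torch.Size([2, 1024])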

class CNNGloveAttentionModel(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']

        # Image model: a small two-layer CNN
        self.dropout = nn.Dropout(0.5)
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.img_batch_norm1 = nn.BatchNorm2d(16)
        self.fc1 = nn.Linear(16 * 61 * 61, 1024)
        self.img_batch_norm2 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.img_batch_norm3 = nn.BatchNorm1d(512)
        # Was hard-coded to 3 outputs; use num_classes so the attention
        # fusion below (which stacks equal-size decision vectors) works for
        # any class count.
        self.fc3 = nn.Linear(512, self.num_classes)

        # Text model
        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']
        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        print('self.infersent:\n{}'.format(self.infersent))

        self.fc_size = 512
        self.encode_dim = 4096
        self.fc_text1 = nn.Linear(in_features=self.encode_dim,
                                  out_features=self.fc_size)
        self.batch_norm1 = nn.BatchNorm1d(self.fc_size)
        self.fc_text2 = nn.Linear(in_features=self.fc_size, out_features=512)
        self.batch_norm2 = nn.BatchNorm1d(512)
        self.fc_text3 = nn.Linear(in_features=512,
                                  out_features=self.num_classes)

        # Output layer for the concatenated features
        self.conc_fc = nn.Linear(in_features=self.num_classes +
                                 self.num_classes,
                                 out_features=self.num_classes)

        # Multi-head self-attention over the per-modality decisions
        self.num_att_head = kwargs['att_head_num']
        self.W_att = nn.Linear(in_features=self.num_classes,
                               out_features=self.num_att_head,
                               bias=False)

    def forward(self, img_batch, text_batch):
        img_batch = F.relu(self.pool(self.conv1(img_batch)))
        img_batch = F.relu(
            self.pool(self.img_batch_norm1(self.conv2(img_batch))))
        img_batch = img_batch.view(-1, 16 * 61 * 61)
        img_batch = F.relu(self.img_batch_norm2(self.fc1(img_batch)))
        img_batch = F.relu(self.img_batch_norm3(self.fc2(img_batch)))
        # Predicted class probabilities from the image branch
        img_pred_output = F.softmax(F.relu(self.fc3(img_batch)), dim=1)

        embeddings = self.infersent.encode(text_batch,
                                           bsize=self.batch_size,
                                           tokenize=False,
                                           verbose=False)
        embeddings = torch.FloatTensor(embeddings)
        embeddings = F.relu(self.batch_norm1(self.fc_text1(embeddings)))
        embeddings = F.relu(self.batch_norm2(self.fc_text2(embeddings)))
        # Predicted class probabilities from the text branch
        text_pred_output = F.softmax(F.relu(self.fc_text3(embeddings)), dim=1)

        # Feature fusion: concatenate the image and text predictions
        concat_features = torch.cat((img_pred_output, text_pred_output),
                                    dim=1)
        concat_pred_output = F.softmax(F.relu(self.conc_fc(concat_features)),
                                       dim=1)

        # Self-attention to fuse the decisions from the image, text and
        # concatenated branches: stack the three (B, C) predictions into a
        # (B, 3, C) tensor, one row per modality.
        img_pred_output_view = img_pred_output.unsqueeze(1)
        text_pred_output_view = text_pred_output.unsqueeze(1)
        concat_pred_output_view = concat_pred_output.unsqueeze(1)
        combined_img_text_concat = torch.cat(
            (img_pred_output_view, text_pred_output_view,
             concat_pred_output_view),
            dim=1)

        # (B, 3, num_att_head): per-head attention weights over the modalities
        att_img_text_concat = F.softmax(F.relu(
            self.W_att(combined_img_text_concat)),
                                        dim=1)
        att_img_text_concat = att_img_text_concat.transpose(1, 2)
        # (B, num_att_head, C): one weighted mixture of decisions per head
        weighted_img_text_concat_pred = (att_img_text_concat
                                         @ combined_img_text_concat)
        # Average over the attention heads
        avg_weighted_img_text_concat_pred = torch.sum(
            weighted_img_text_concat_pred, 1) / self.num_att_head
        avg_weighted_img_text_concat_pred = avg_weighted_img_text_concat_pred.view(
            len(text_batch), self.num_classes)
        return avg_weighted_img_text_concat_pred
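
# A self-contained sketch of the decision-level attention fusion in
# CNNGloveAttentionModel.forward(): stack the three per-modality class
# distributions, score each modality with a learned per-head weight, and
# average the head-weighted mixtures. Illustrative helper with random
# stand-in predictions, not part of the original model.
def _demo_decision_attention(batch_size=4, num_classes=3, num_heads=5):
    img_pred = F.softmax(torch.randn(batch_size, num_classes), dim=1)
    text_pred = F.softmax(torch.randn(batch_size, num_classes), dim=1)
    concat_pred = F.softmax(torch.randn(batch_size, num_classes), dim=1)

    stacked = torch.stack((img_pred, text_pred, concat_pred), dim=1)  # (B, 3, C)
    W_att = nn.Linear(num_classes, num_heads, bias=False)

    att = F.softmax(F.relu(W_att(stacked)), dim=1)  # (B, 3, H): weights over modalities
    weighted = att.transpose(1, 2) @ stacked        # (B, H, C): one mixture per head
    fused = weighted.sum(dim=1) / num_heads         # (B, C): average over the heads
    return fused.shape  # e.g. torch.Size([4, 3])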

class DeepSentimentFusionModel(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']

        # Pretrained image model
        self.inception = models.inception_v3(pretrained=True)
        set_parameter_requires_grad(model=self.inception,
                                    feature_extracting=True)
        # Handle the auxiliary net
        self.num_ftrs = self.inception.AuxLogits.fc.in_features
        self.inception.AuxLogits.fc = nn.Linear(self.num_ftrs,
                                                self.num_classes)
        # Handle the primary net: replace the classifier with an identity so
        # the network returns the pooled features before the fc layer.
        self.num_ftrs = self.inception.fc.in_features  # dim: 2048
        print('self.num_ftrs: {}'.format(self.num_ftrs))
        self.inception.fc = nn.Identity()
        self.image_size = 299

        # Text model
        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']  # 50
        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        self.encode_dim = 4096

        # Acc: 0.41, best over 20 epochs.
        # Reduce the feature length of the image and text embeddings.
        self.img_f_dim = 512
        self.text_emb_dim = 512
        self.fc_img = nn.Linear(self.num_ftrs, self.img_f_dim, bias=False)
        self.fc_text = nn.Linear(self.encode_dim,
                                 self.text_emb_dim,
                                 bias=False)
        self.fc1 = nn.Linear((self.img_f_dim + 1) * (self.text_emb_dim + 1),
                             128)
        # self.fc2 = nn.Linear(128, 128)
        self.out_f = nn.Linear(128, self.num_classes)

    def forward(self, image_batch, text_batch):
        # In train mode Inception v3 returns (logits, aux_logits); in eval
        # mode it returns a plain tensor.
        if self.inception.training:
            image_features = self.inception(image_batch)[0]
        else:
            image_features = self.inception(image_batch)

        embeddings = self.infersent.encode(text_batch,
                                           bsize=self.batch_size,
                                           tokenize=False,
                                           verbose=True)
        embeddings = torch.FloatTensor(embeddings)

        image_features = self.fc_img(image_features)
        embeddings = self.fc_text(embeddings)

        # Tensor Fusion Layer: compute the outer product of the
        # bias-augmented image and text embeddings.
        # https://github.com/Justin1904/TensorFusionNetworks/blob/master/model.py
        # Use the actual batch size so a partial final batch still works.
        ones = torch.ones(image_features.size(0), 1)
        img_f = torch.cat((ones, image_features), dim=1)
        text_emb = torch.cat((ones, embeddings), dim=1)
        # Dim: batch_size x 513 x 513
        fusion_tensor = torch.bmm(img_f.unsqueeze(2), text_emb.unsqueeze(1))
        # Flatten
        fusion_tensor = fusion_tensor.view(-1,
                                           (image_features.shape[1] + 1) *
                                           (embeddings.shape[1] + 1))

        x = F.relu(self.fc1(fusion_tensor))
        # x = F.relu(self.fc2(x))
        x = F.relu(self.out_f(x))
        x = F.softmax(x, dim=1)
        return x
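
# A self-contained sketch of the Tensor Fusion Layer used in
# DeepSentimentFusionModel.forward(): append a constant 1 to each modality
# vector, then take the batched outer product, so the flattened result keeps
# the unimodal features as well as all pairwise image-text interactions.
# Illustrative helper with random stand-in features, not part of the
# original model.
def _demo_tensor_fusion(batch_size=4, img_dim=512, text_dim=512):
    image_features = torch.randn(batch_size, img_dim)
    embeddings = torch.randn(batch_size, text_dim)

    ones = torch.ones(batch_size, 1)
    img_f = torch.cat((ones, image_features), dim=1)  # (B, 513)
    text_emb = torch.cat((ones, embeddings), dim=1)   # (B, 513)

    fusion = torch.bmm(img_f.unsqueeze(2), text_emb.unsqueeze(1))  # (B, 513, 513)
    fusion = fusion.view(batch_size, -1)                           # (B, 513 * 513)
    return fusion.shape  # torch.Size([4, 263169])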