# These snippets assume the surrounding project's module-level imports:
# os, torch, torch.nn as nn, torchvision.models as models, plus the
# project-local InferSent, ShallowNet and set_parameter_requires_grad helpers.
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']
        # Pretrained image model
        self.inception = models.inception_v3(pretrained=True)
        set_parameter_requires_grad(model=self.inception,
                                    feature_extracting=True)
        # Handle the auxiliary net
        self.num_ftrs = self.inception.AuxLogits.fc.in_features
        self.inception.AuxLogits.fc = nn.Linear(self.num_ftrs,
                                                self.num_classes)
        # Handle the primary net
        self.num_ftrs = self.inception.fc.in_features
        # dim: 2048
        print('self.num_ftrs: {}'.format(self.num_ftrs))
        # The Linear head below is immediately replaced, so the model returns
        # the 2048-d pooled features before the fc layer instead of class logits.
        # self.inception.fc = nn.Linear(self.num_ftrs, self.num_classes)
        self.inception.fc = nn.Identity()
        # print('self.inception:\n{}'.format(self.inception))
        self.image_size = 299

        # Text model
        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']  # 50

        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        # print('self.infersent:\n{}'.format(self.infersent))

        self.encode_dim = 4096
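        # 4096 = 2 * enc_lstm_dim: InferSent concatenates the forward and
        # backward 2048-d LSTM states and max-pools them over time. At run time
        # the encoder is typically called roughly like this (illustrative only;
        # the call site is not part of this __init__):
        #   embeddings = self.infersent.encode(sentences, tokenize=True)  # (N, 4096)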

        # Best accuracy: 0.41 after 20 epochs

        # Reduce the feature length of the image and text embeddings
        self.img_f_dim = 512
        self.text_emb_dim = 512

        self.fc_img = nn.Linear(self.num_ftrs, self.img_f_dim, bias=False)
        self.fc_text = nn.Linear(self.encode_dim,
                                 self.text_emb_dim,
                                 bias=False)

        self.fc1 = nn.Linear((self.img_f_dim + 1) * (self.text_emb_dim + 1),
                             128)
        # self.fc2 = nn.Linear(128, 128)
        self.out_f = nn.Linear(128, self.num_classes)
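
    # A minimal, hypothetical sketch (not the author's forward()) of how these
    # layers could be wired together: fc1's input size,
    # (img_f_dim + 1) * (text_emb_dim + 1), suggests appending a constant 1 to
    # each projected feature vector and flattening the batched outer product,
    # i.e. a bilinear fusion of the two modalities. Method and argument names
    # below are assumptions for illustration only.
    def _bilinear_fusion_sketch(self, img_feats, text_feats):
        img = self.fc_img(img_feats)    # (B, 2048) -> (B, 512)
        txt = self.fc_text(text_feats)  # (B, 4096) -> (B, 512)
        ones = img.new_ones(img.size(0), 1)
        img = torch.cat([img, ones], dim=1)  # (B, 513)
        txt = torch.cat([txt, ones], dim=1)  # (B, 513)
        # Outer product per sample, flattened to (B, 513 * 513) to match fc1.
        fused = torch.bmm(img.unsqueeze(2), txt.unsqueeze(1)).flatten(1)
        x = torch.relu(self.fc1(fused))
        return self.out_f(x)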
Example #2
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']  # 64
        self.image_model = ShallowNet()
        self.image_size = 224

        # Text model
        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']  # 50

        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        print('self.infersent:\n{}'.format(self.infersent))

        # Fully connected layers
        self.shallownet_output_dim = 120
        self.fc_size = 120
        self.encode_dim = 4096
        self.fc1 = nn.Linear(in_features=self.encode_dim,
                             out_features=self.fc_size)
        self.fc1_bn = nn.BatchNorm1d(num_features=self.fc_size)
        self.fc2 = nn.Linear(in_features=self.fc_size +
                             self.shallownet_output_dim,
                             out_features=168)
        self.fc2_bn = nn.BatchNorm1d(num_features=168)
        self.fc3 = nn.Linear(in_features=168, out_features=self.num_classes)
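
    # A minimal, hypothetical sketch (not the author's forward()) of how the
    # layer sizes above fit together: the 4096-d InferSent encoding is reduced
    # to 120-d by fc1, concatenated with the 120-d ShallowNet image features,
    # and classified through fc2/fc3. Names below are assumptions.
    def _concat_fusion_sketch(self, images, text_emb):
        img_f = self.image_model(images)                      # (B, 120)
        txt_f = torch.relu(self.fc1_bn(self.fc1(text_emb)))   # (B, 120)
        x = torch.cat([img_f, txt_f], dim=1)                  # (B, 240)
        x = torch.relu(self.fc2_bn(self.fc2(x)))              # (B, 168)
        return self.fc3(x)                                    # (B, num_classes)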
Example #3
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']  # 64
        # Pretrained image model
        self.inception = models.googlenet(pretrained=True, aux_logits=True)
        # set_parameter_requires_grad(model=self.inception, feature_extracting=True)
        # Handle the auxiliary net
        self.num_ftrs = self.inception.aux1.fc2.in_features
        self.inception.aux1.fc2 = nn.Linear(self.num_ftrs, self.num_classes)
        self.num_ftrs = self.inception.aux2.fc2.in_features
        self.inception.aux2.fc2 = nn.Linear(self.num_ftrs, self.num_classes)
        # Handle the primary net
        self.num_ftrs = self.inception.fc.in_features
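        # dim: 1024 for torchvision's GoogLeNet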
        print('self.num_ftrs: {}'.format(self.num_ftrs))
        # The Linear head below is immediately replaced, so the model returns
        # the 1024-d pooled features before the fc layer instead of class logits.
        # self.inception.fc = nn.Linear(self.num_ftrs, self.num_classes)
        self.inception.fc = nn.Identity()
        # print('self.inception:\n{}'.format(self.inception))
        self.image_size = 224

        # Text model
        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']  # 50
        # self.glove_embedding = kwargs['glove_embedding']
        # # self.word_embedding = nn.Embedding(num_embeddings=self.vocab_size,
        # #   embedding_dim=self.embedding_dim)
        # # self.word_embedding.load_state_dict({'weight': self.glove_embedding})
        # self.word_embedding = nn.Embedding.from_pretrained(
        #   embeddings=self.glove_embedding)

        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        print('self.infersent:\n{}'.format(self.infersent))

        # LSTM
        self.hidden_size = 1024
        # self.lstm = nn.LSTM(input_size=4096,
        #   hidden_size=self.hidden_size, num_layers=1)
        # print('self.lstm:\n{}'.format(self.lstm))

        # Fully connected layers
        self.fc_size = 512
        self.encode_dim = 4096
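        # fc1 presumably fuses the concatenated image (1024-d) and InferSent
        # text (4096-d) features, i.e. a 5120-d input vector; the forward()
        # that performs the concatenation is not shown in this snippet.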
        self.fc1 = nn.Linear(in_features=self.num_ftrs + self.encode_dim,
                             out_features=self.fc_size)
        self.fc1_bn = nn.BatchNorm1d(num_features=self.fc_size)
        self.fc2 = nn.Linear(in_features=self.fc_size,
                             out_features=self.num_classes)
        print('self.fc1:\n{}'.format(self.fc1))
        print('self.fc2:\n{}'.format(self.fc2))
Example #4
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.num_classes = kwargs['num_classes']
        self.batch_size = kwargs['batch_size']

        self.dropout = nn.Dropout(0.5)

        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.img_batch_norm1 = nn.BatchNorm2d(16)

        self.fc1 = nn.Linear(16 * 61 * 61, 1024)
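        # 16 * 61 * 61 above corresponds to 256x256 RGB inputs, assuming
        # forward() applies the 2x2 max pool after each of the two conv layers.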
        self.img_batch_norm2 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.img_batch_norm3 = nn.BatchNorm1d(512)
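        # NOTE: the image branch's output size is hardcoded to 3 rather than
        # self.num_classes; conc_fc below assumes both branches emit num_classes.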
        self.fc3 = nn.Linear(512, 3)

        self.vocab_size = kwargs['vocab_size']
        self.embedding_dim = kwargs['embedding_dim']

        params_model = {
            'bsize': self.batch_size,
            'word_emb_dim': self.embedding_dim,
            'enc_lstm_dim': 2048,
            'pool_type': 'max',
            'dpout_model': 0.0,
            'version': 1
        }
        self.infersent = InferSent(params_model)
        self.infersent.load_state_dict(
            torch.load(
                os.path.join(os.getcwd(), '../data/encoder/infersent1.pkl')))
        self.infersent.set_w2v_path(w2v_path=os.path.join(
            os.getcwd(), '../data/glove/glove.840B.300d.txt'))
        self.infersent.build_vocab_k_words(K=self.vocab_size)
        print('self.infersent:\n{}'.format(self.infersent))

        self.fc_size = 512
        self.encode_dim = 4096

        self.fc_text1 = nn.Linear(in_features=self.encode_dim,
                                  out_features=self.fc_size)

        self.batch_norm1 = nn.BatchNorm1d(self.fc_size)

        self.fc_text2 = nn.Linear(in_features=self.fc_size, out_features=512)

        self.batch_norm2 = nn.BatchNorm1d(512)

        self.fc_text3 = nn.Linear(in_features=512,
                                  out_features=self.num_classes)

        # Output layer for the concatenated features
        self.conc_fc = nn.Linear(in_features=self.num_classes +
                                 self.num_classes,
                                 out_features=self.num_classes)

        self.num_att_head = kwargs['att_head_num']

        self.W_att = nn.Linear(in_features=self.num_classes,
                               out_features=self.num_att_head,
                               bias=False)
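        # W_att presumably maps a num_classes-d vector to num_att_head
        # attention scores; how the heads are combined is not visible in this
        # snippet.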