Example #1
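    # Snippet context: this is the constructor of a multimodal QA model; the
    # enclosing class and imports are not shown. A minimal set of assumed
    # dependencies:
    #   import torch
    #   import torch.nn as nn
    #   from transformers import RobertaTokenizer, RobertaModel
    # RNNEncoder, CharMatching, UtilityLayer, and Norm are project-local
    # modules whose signatures are inferred from usage below.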
    def __init__(self,
                 args,
                 vocab,
                 n_dim,
                 image_dim,
                 layers,
                 dropout,
                 num_choice=5):
        super().__init__()
        self.vocab = vocab
        V = len(vocab)
        D = n_dim
        self.hidden_dim = n_dim

        #video_encoder_layer = nn.TransformerEncoderLayer(d_model=300, nhead=6, dim_feedforward=1024, dropout=0.1, activation='gelu')
        #self.video_encoder = nn.TransformerEncoder(video_encoder_layer, num_layers=1)
        self.video_encoder = nn.GRU(image_dim + 21,
                                    150,
                                    bidirectional=True,
                                    batch_first=True)
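
        # Bidirectional GRU over per-frame features: the input concatenates
        # the image feature with a 21-dim speaker/character one-hot (inferred
        # from the 21-entry speaker list and char_classifier below); 150
        # hidden units per direction yield a 300-dim output.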

        multimodal_encoder_layer = nn.TransformerEncoderLayer(
            d_model=n_dim,
            nhead=6,
            dim_feedforward=1024,
            dropout=0.5,
            activation='gelu')
        self.transformer = nn.TransformerEncoder(multimodal_encoder_layer,
                                                 num_layers=2)
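
        # Two-layer Transformer encoder over the fused sequence. PyTorch
        # requires d_model to be divisible by nhead (300 / 6 = 50 per head,
        # assuming n_dim == 300 as the 300-dim projections below suggest).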

        self.embedding = nn.Embedding(V, D)
        n_dim = args.n_dim
        image_dim = args.image_dim
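
        # NOTE: n_dim and image_dim are rebound to the args values here,
        # overriding the constructor arguments for everything defined below.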

        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.language_model = RobertaModel.from_pretrained('roberta-base',
                                                           return_dict=True)
        #for param in self.language_model.base_model.parameters():
        #    param.requires_grad = False

        # Update config to finetune token type embeddings
        #self.language_model.config.type_vocab_size = 3

        # Create a new Embeddings layer, with 2 possible segments IDs instead of 1
        #self.language_model.embeddings.token_type_embeddings = nn.Embedding(3, self.language_model.config.hidden_size)

        # Initialize it
        #self.language_model.embeddings.token_type_embeddings.weight.data.normal_(mean=0.0, std=self.language_model.config.initializer_range)
        # Freeze the first 10 encoder layers (disabled):
        # modules = [self.language_model.encoder.layer[:10]]
        # for module in modules:
        #     for param in module.parameters():
        #         param.requires_grad = False

        #self.cmat = ContextMatching(n_dim * 3)
        #self.lstm_raw = RNNEncoder(300, 150, bidirectional=True, dropout_p=0, n_layers=1, rnn_type="lstm")
        self.lstm_script = RNNEncoder(321,
                                      150,
                                      bidirectional=True,
                                      dropout_p=0,
                                      n_layers=1,
                                      rnn_type="lstm")
        self.script_on = "script" in args.stream_type
        self.vbb_on = "visual_bb" in args.stream_type
        self.vmeta_on = "visual_meta" in args.stream_type
        #self.conv_pool = Conv1d(n_dim*4+1, n_dim*2)

        self.character = nn.Parameter(torch.randn(22,
                                                  D,
                                                  device=args.device,
                                                  dtype=torch.float),
                                      requires_grad=True)
        self.norm1 = Norm(D)

        self.lang_proj = nn.Linear(768, 300)
        self.visual_proj = nn.Linear(2048, 300)

        #self.mh_video = nn.MultiheadAttention(300, 6)
        #self.context_gru = nn.GRU(300, 150, bidirectional=True, batch_first=True)
        self.cross1 = UtilityLayer(300)
        self.cross2 = UtilityLayer(300)
        self.cross3 = UtilityLayer(300)
        self.context_proj = nn.Linear(5 * 300, 300)

        self.char_classifier = nn.Linear(300, 21)
        self.mask_classifier = nn.Linear(300, self.tokenizer.vocab_size)
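
        # Auxiliary heads (purpose inferred from the shapes): a 21-way
        # character/speaker classifier and a masked-token head over the full
        # RoBERTa vocabulary.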

        self.output = nn.Linear(300, 1)

        self.answer_rnn = nn.LSTM(300, 300, 1, batch_first=True, dropout=0)

        speaker_name = [
            'None',  # index 0: unknown speaker 
            'Anna',
            'Chairman',
            'Deogi',
            'Dokyung',
            'Gitae',
            'Haeyoung1',
            'Haeyoung2',
            'Heeran',
            'Hun',
            'Jeongsuk',
            'Jinsang',
            'Jiya',
            'Kyungsu',
            'Sangseok',
            'Seohee',
            'Soontack',
            'Sukyung',
            'Sungjin',
            'Taejin',
            'Yijoon'
        ]
        self.speaker_to_index = {
            name: index
            for index, name in enumerate(speaker_name)
        }
        self.index_to_speaker = {
            v: k
            for k, v in self.speaker_to_index.items()
        }

        if self.script_on:
            self.lstm_script = RNNEncoder(321,
                                          150,
                                          bidirectional=True,
                                          dropout_p=0,
                                          n_layers=1,
                                          rnn_type="lstm")
            self.classifier_script = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                   nn.Softmax(dim=1))
            self.mhattn_script = CharMatching(4, D, D)

        if self.vmeta_on:
            self.lstm_vmeta = RNNEncoder(321,
                                         150,
                                         bidirectional=True,
                                         dropout_p=0,
                                         n_layers=1,
                                         rnn_type="lstm")
            self.classifier_vmeta = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                  nn.Softmax(dim=1))
            self.mhattn_vmeta = CharMatching(4, D, D)

        if self.vbb_on:
            self.lstm_vbb = RNNEncoder(image_dim + 21,
                                       150,
                                       bidirectional=True,
                                       dropout_p=0,
                                       n_layers=1,
                                       rnn_type="lstm")
            self.vbb_fc = nn.Sequential(
                nn.Dropout(0.5),
                nn.Linear(image_dim, n_dim),
                nn.Tanh(),
            )
            self.classifier_vbb = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                nn.Softmax(dim=1))

            self.mhattn_vbb = CharMatching(4, D, D)
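
    # A minimal construction sketch with hypothetical values inferred from
    # the layer shapes above (`Model` stands in for the enclosing class,
    # which the snippet does not show):
    #   args = argparse.Namespace(
    #       n_dim=300, image_dim=2048, device='cuda',
    #       stream_type=['script', 'visual_meta', 'visual_bb'])
    #   model = Model(args, vocab, n_dim=300, image_dim=2048,
    #                 layers=2, dropout=0.5)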
Example #2
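    # Variant using BERT in place of RoBERTa. Additional assumed imports:
    #   from transformers import BertModel
    # ContextMatching, Conv1d, and SummaryAttn are further project-local
    # modules inferred from usage.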
    def __init__(self,
                 args,
                 vocab,
                 n_dim,
                 image_dim,
                 layers,
                 dropout,
                 num_choice=5):
        super().__init__()
        self.vocab = vocab
        V = len(vocab)
        D = n_dim

        self.hidden_dim = n_dim

        self.embedding = nn.Embedding(V, D)
        n_dim = args.n_dim
        image_dim = args.image_dim

        bert_vocab_size = 30525
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.bert.resize_token_embeddings(bert_vocab_size)
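
        # Resize the token-embedding matrix to the enlarged vocabulary.
        # 30525 exceeds bert-base-cased's stock 28996 entries, suggesting
        # extra tokens were added to the tokenizer elsewhere (assumption);
        # the new rows are randomly initialized.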
        self.bert_dim = 768

        self.cmat = ContextMatching(n_dim * 3)
        self.lstm_raw = RNNEncoder(self.bert_dim,
                                   150,
                                   bidirectional=True,
                                   dropout_p=0,
                                   n_layers=1,
                                   rnn_type="lstm")
        self.lstm_script = RNNEncoder(321,
                                      150,
                                      bidirectional=True,
                                      dropout_p=0,
                                      n_layers=1,
                                      rnn_type="lstm")
        self.script_on = "script" in args.stream_type
        self.vbb_on = "visual_bb" in args.stream_type
        self.vmeta_on = "visual_meta" in args.stream_type
        self.conv_pool = Conv1d(n_dim * 4 + 1, n_dim * 2)

        self.util = UtilityLayer(hidden_dim=300,
                                 feedforward_dim=600,
                                 n_head=10,
                                 dropout=0.1)
        self.util2 = UtilityLayer(hidden_dim=300,
                                  feedforward_dim=600,
                                  n_head=10,
                                  dropout=0.1)
        self.summary_s = SummaryAttn(300, 8, 0.1)
        self.summary_m = SummaryAttn(300, 8, 0.1)
        self.summary_b = SummaryAttn(300, 8, 0.1)
        self.summary_q = SummaryAttn(300, 5, 0.1)
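
        # Per-stream summary attention; the (300, n, 0.1) arguments are read
        # here as (hidden dim, attention heads, dropout), inferred from usage.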

        self.character = nn.Parameter(torch.randn(22,
                                                  D,
                                                  device=args.device,
                                                  dtype=torch.float),
                                      requires_grad=True)
        self.norm1 = Norm(D)

        self.output = nn.Sequential(nn.Linear(4 * 300, 300), nn.PReLU())

        self.linear_addit = nn.Sequential(nn.Linear(1800, 300), nn.PReLU())

        if self.script_on:
            self.lstm_script = RNNEncoder(self.bert_dim + 21,
                                          150,
                                          bidirectional=True,
                                          dropout_p=0,
                                          n_layers=1,
                                          rnn_type="lstm")
            self.classifier_script = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                   nn.Softmax(dim=1))
            self.mhattn_script = CharMatching(4, D, D)

        if self.vmeta_on:
            self.lstm_vmeta = RNNEncoder(self.bert_dim + 21,
                                         150,
                                         bidirectional=True,
                                         dropout_p=0,
                                         n_layers=1,
                                         rnn_type="lstm")
            self.classifier_vmeta = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                  nn.Softmax(dim=1))
            self.mhattn_vmeta = CharMatching(4, D, D)

        if self.vbb_on:
            self.lstm_vbb = RNNEncoder(image_dim + 21,
                                       150,
                                       bidirectional=True,
                                       dropout_p=0,
                                       n_layers=1,
                                       rnn_type="lstm")
            self.vbb_fc = nn.Sequential(
                nn.Dropout(0.5),
                nn.Linear(image_dim, n_dim),
                nn.Tanh(),
            )
            self.classifier_vbb = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                nn.Softmax(dim=1))

            self.mhattn_vbb = CharMatching(4, D, D)
Example #3
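    # Variant with a frozen RoBERTa text encoder and 512-dim visual features
    # (see the 512 -> 300 projections below).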
    def __init__(self,
                 args,
                 vocab,
                 n_dim,
                 image_dim,
                 layers,
                 dropout,
                 num_choice=5):
        super().__init__()
        self.vocab = vocab
        V = len(vocab)
        D = n_dim

        # Special-token IDs for sequence boundaries. 101/102 are BERT's
        # [CLS]/[SEP]; since roberta-base is used below, RoBERTa's <s> (0)
        # and </s> (2) are the appropriate IDs.
        self.CLS = 0
        self.SEP = 2

        self.hidden_dim = n_dim

        self.embedding = nn.Embedding(V, D)
        n_dim = args.n_dim
        image_dim = args.image_dim

        #bert_vocab_size = 30543
        self.bert_dim = 768
        #self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert = RobertaModel.from_pretrained('roberta-base')
        #self.bert.resize_token_embeddings(bert_vocab_size)
        for param in self.bert.parameters():
            param.requires_grad = False
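
        # Freeze every RoBERTa parameter: the language model acts as a fixed
        # feature extractor and receives no gradient updates.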

        self.cmat = ContextMatching(n_dim * 3)
        self.lstm_raw = RNNEncoder(self.bert_dim,
                                   150,
                                   bidirectional=True,
                                   dropout_p=0,
                                   n_layers=1,
                                   rnn_type="lstm")
        #self.lstm_script = RNNEncoder(321, 150, bidirectional=True, dropout_p=0, n_layers=1, rnn_type="lstm")
        self.script_on = "script" in args.stream_type
        self.vbb_on = "visual_bb" in args.stream_type
        self.vmeta_on = "visual_meta" in args.stream_type
        self.conv_pool = Conv1d(n_dim * 4 + 1, n_dim * 2)

        self.character = nn.Parameter(torch.randn(22,
                                                  D,
                                                  device=args.device,
                                                  dtype=torch.float),
                                      requires_grad=True)
        self.norm1 = Norm(D)

        self.visual_projection = nn.Sequential(nn.Linear(512, 300), nn.ReLU())

        self.person_projection = nn.Sequential(nn.Linear(512, 300), nn.ReLU())

        self.output = nn.Sequential(nn.Linear(768, 1), nn.PReLU())

        if self.script_on:
            #self.lstm_script = RNNEncoder(321, 150, bidirectional=True, dropout_p=0, n_layers=1, rnn_type="lstm")
            self.lstm_script = RNNEncoder(self.bert_dim + 21,
                                          150,
                                          bidirectional=True,
                                          dropout_p=0,
                                          n_layers=1,
                                          rnn_type="lstm")
            self.classifier_script = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                   nn.Softmax(dim=1))
            self.mhattn_script = CharMatching(4, D, D)

        if self.vmeta_on:
            #self.lstm_vmeta = RNNEncoder(321, 150, bidirectional=True, dropout_p=0, n_layers=1, rnn_type="lstm")
            self.lstm_vmeta = RNNEncoder(self.bert_dim + 21,
                                         150,
                                         bidirectional=True,
                                         dropout_p=0,
                                         n_layers=1,
                                         rnn_type="lstm")
            self.classifier_vmeta = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                  nn.Softmax(dim=1))
            self.mhattn_vmeta = CharMatching(4, D, D)

        if self.vbb_on:
            self.lstm_vbb = RNNEncoder(image_dim + 21,
                                       150,
                                       bidirectional=True,
                                       dropout_p=0,
                                       n_layers=1,
                                       rnn_type="lstm")
            self.vbb_fc = nn.Sequential(
                nn.Dropout(0.5),
                nn.Linear(image_dim, n_dim),
                nn.Tanh(),
            )
            self.classifier_vbb = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                nn.Softmax(dim=1))

            self.mhattn_vbb = CharMatching(4, D, D)
Example #4
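    # Variant built around stacked UtilityLayer blocks, per-stream
    # SummaryAttn pooling, and nn.MultiheadAttention fusion; same assumed
    # imports as the examples above.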
    def __init__(self,
                 args,
                 vocab,
                 n_dim,
                 image_dim,
                 layers,
                 dropout,
                 num_choice=5):
        super().__init__()
        self.vocab = vocab
        V = len(vocab)
        D = n_dim

        self.hidden_dim = n_dim

        self.embedding = nn.Embedding(V, D)
        n_dim = args.n_dim
        image_dim = args.image_dim

        self.cmat = ContextMatching(n_dim * 3)
        self.lstm_raw = RNNEncoder(300,
                                   150,
                                   bidirectional=True,
                                   dropout_p=0,
                                   n_layers=1,
                                   rnn_type="lstm")
        self.lstm_script = RNNEncoder(321,
                                      150,
                                      bidirectional=True,
                                      dropout_p=0,
                                      n_layers=1,
                                      rnn_type="lstm")
        self.script_on = "script" in args.stream_type
        self.vbb_on = "visual_bb" in args.stream_type
        self.vmeta_on = "visual_meta" in args.stream_type
        self.conv_pool = Conv1d(n_dim * 4 + 1, n_dim * 2)

        self.util = UtilityLayer(hidden_dim=300,
                                 feedforward_dim=1024,
                                 n_head=10,
                                 dropout=0.1)
        self.util2 = UtilityLayer(hidden_dim=300,
                                  feedforward_dim=1024,
                                  n_head=10,
                                  dropout=0.1)
        self.util3 = UtilityLayer(hidden_dim=300,
                                  feedforward_dim=1024,
                                  n_head=10,
                                  dropout=0.1)
        self.summary_s = SummaryAttn(300, 3, 0.1)
        self.summary_m = SummaryAttn(300, 3, 0.1)
        self.summary_b = SummaryAttn(300, 3, 0.1)
        self.summary_f = SummaryAttn(300, 3, 0.1)
        self.summary_q = SummaryAttn(300, 3, 0.1)
        self.summary_addit = SummaryAttn(300, 3, 0.1)

        self.character = nn.Parameter(torch.randn(22,
                                                  D,
                                                  device=args.device,
                                                  dtype=torch.float),
                                      requires_grad=True)
        self.norm1 = Norm(D)

        self.image_projection = nn.Sequential(nn.Linear(512, 300), nn.PReLU())

        self.output = nn.Sequential(nn.Linear(4 * 300, 300), nn.PReLU())

        self.linear_addit = nn.Sequential(nn.Linear(1800 + 3, 300), nn.PReLU())

        self.mh_bb = nn.MultiheadAttention(embed_dim=300, num_heads=6)
        self.mh_script = nn.MultiheadAttention(embed_dim=300, num_heads=6)
        self.mh_meta = nn.MultiheadAttention(embed_dim=300, num_heads=6)
        self.mh_answers = nn.MultiheadAttention(embed_dim=300, num_heads=6)
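
        # NOTE: without batch_first=True, nn.MultiheadAttention expects
        # inputs shaped (seq_len, batch, embed_dim) in the forward pass.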

        if self.script_on:
            self.lstm_script = RNNEncoder(321,
                                          150,
                                          bidirectional=True,
                                          dropout_p=0,
                                          n_layers=1,
                                          rnn_type="lstm")
            self.classifier_script = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                   nn.Softmax(dim=1))
            self.mhattn_script = CharMatching(4, D, D)

        if self.vmeta_on:
            self.lstm_vmeta = RNNEncoder(321,
                                         150,
                                         bidirectional=True,
                                         dropout_p=0,
                                         n_layers=1,
                                         rnn_type="lstm")
            self.classifier_vmeta = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                  nn.Softmax(dim=1))
            self.mhattn_vmeta = CharMatching(4, D, D)

        if self.vbb_on:
            self.lstm_vbb = RNNEncoder(image_dim + 21,
                                       150,
                                       bidirectional=True,
                                       dropout_p=0,
                                       n_layers=1,
                                       rnn_type="lstm")
            self.vbb_fc = nn.Sequential(
                nn.Dropout(0.5),
                nn.Linear(image_dim, n_dim),
                nn.Tanh(),
            )
            self.classifier_vbb = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                nn.Softmax(dim=1))

            self.mhattn_vbb = CharMatching(4, D, D)
Example #5
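    # Variant combining a frozen RoBERTa encoder with a two-layer Transformer
    # over the fused streams and a softmax head over the five answer choices.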
    def __init__(self,
                 args,
                 vocab,
                 n_dim,
                 image_dim,
                 layers,
                 dropout,
                 num_choice=5):
        super().__init__()
        self.vocab = vocab
        V = len(vocab)
        D = n_dim

        self.hidden_dim = n_dim

        #self.bert = BertModel.from_pretrained('bert-base-uncased')

        encoder_layer = nn.TransformerEncoderLayer(d_model=n_dim,
                                                   nhead=6,
                                                   dim_feedforward=1024,
                                                   dropout=0.5,
                                                   activation='gelu')
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)
        #self.transformer = nn.Transformer(d_model=n_dim, nhead=6)

        self.embedding = nn.Embedding(V, D)
        n_dim = args.n_dim
        image_dim = args.image_dim

        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.language_model = RobertaModel.from_pretrained('roberta-base',
                                                           return_dict=True)
        for param in self.language_model.base_model.parameters():
            param.requires_grad = False
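
        # As in the frozen variant above, RoBERTa serves purely as a fixed
        # feature extractor here.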

        self.cmat = ContextMatching(n_dim * 3)
        self.lstm_raw = RNNEncoder(300,
                                   150,
                                   bidirectional=True,
                                   dropout_p=0,
                                   n_layers=1,
                                   rnn_type="lstm")
        self.lstm_script = RNNEncoder(321,
                                      150,
                                      bidirectional=True,
                                      dropout_p=0,
                                      n_layers=1,
                                      rnn_type="lstm")
        self.script_on = "script" in args.stream_type
        self.vbb_on = "visual_bb" in args.stream_type
        self.vmeta_on = "visual_meta" in args.stream_type
        self.conv_pool = Conv1d(n_dim * 4 + 1, n_dim * 2)

        self.character = nn.Parameter(torch.randn(22,
                                                  D,
                                                  device=args.device,
                                                  dtype=torch.float),
                                      requires_grad=True)
        self.norm1 = Norm(D)

        self.output = nn.Sequential(nn.Linear(5 * 300, 5), nn.Softmax(dim=1))
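
        # Softmax over the num_choice=5 answers. If trained with
        # nn.CrossEntropyLoss this would apply softmax twice; raw logits with
        # that loss (or NLLLoss on log-probabilities) may be intended.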

        if self.script_on:
            self.lstm_script = RNNEncoder(321,
                                          150,
                                          bidirectional=True,
                                          dropout_p=0,
                                          n_layers=1,
                                          rnn_type="lstm")
            self.classifier_script = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                   nn.Softmax(dim=1))
            self.mhattn_script = CharMatching(4, D, D)

        if self.vmeta_on:
            self.lstm_vmeta = RNNEncoder(321,
                                         150,
                                         bidirectional=True,
                                         dropout_p=0,
                                         n_layers=1,
                                         rnn_type="lstm")
            self.classifier_vmeta = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                  nn.Softmax(dim=1))
            self.mhattn_vmeta = CharMatching(4, D, D)

        if self.vbb_on:
            self.lstm_vbb = RNNEncoder(image_dim + 21,
                                       150,
                                       bidirectional=True,
                                       dropout_p=0,
                                       n_layers=1,
                                       rnn_type="lstm")
            self.vbb_fc = nn.Sequential(
                nn.Dropout(0.5),
                nn.Linear(image_dim, n_dim),
                nn.Tanh(),
            )
            self.classifier_vbb = nn.Sequential(nn.Linear(n_dim * 2, 1),
                                                nn.Softmax(dim=1))

            self.mhattn_vbb = CharMatching(4, D, D)