Example #1
    def __init__(self, train_dataset, cfg):
        super(Transformer, self).__init__()
        self.modality = cfg.modality

        if cfg.modality == 'video':
            self.d_feat = cfg.d_vid
            self.d_model = cfg.d_model_video
            self.d_ff = cfg.d_ff_video
        elif cfg.modality == 'audio':
            self.d_feat = cfg.d_aud
            self.d_model = cfg.d_model_audio
            self.d_ff = cfg.d_ff_audio

        if cfg.use_linear_embedder:
            self.src_emb = FeatureEmbedder(self.d_feat, self.d_model)
        else:
            assert self.d_feat == self.d_model
            self.src_emb = Identity()

        self.trg_emb = VocabularyEmbedder(train_dataset.trg_voc_size,
                                          self.d_model)
        self.pos_emb = PositionalEncoder(self.d_model, cfg.dout_p)
        self.encoder = Encoder(self.d_model, cfg.dout_p, cfg.H, self.d_ff,
                               cfg.N)
        self.decoder = Decoder(self.d_model, cfg.dout_p, cfg.H, self.d_ff,
                               cfg.N)
        self.generator = Generator(self.d_model, train_dataset.trg_voc_size)

        print('initialization: xavier')
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # initialize the word embeddings afterwards, so the pretrained vectors
        # replace the Xavier-initialized weights
        self.trg_emb.init_word_embeddings(train_dataset.train_vocab.vectors,
                                          cfg.unfreeze_word_emb)

        # load the pretrained encoder from the proposal generation module (used in ablation studies)
        if cfg.pretrained_prop_model_path is not None:
            print(f'Pretrained prop path: \n {cfg.pretrained_prop_model_path}')
            cap_model_cpt = torch.load(cfg.pretrained_prop_model_path,
                                       map_location='cpu')
            encoder_config = cap_model_cpt['config']
            if cfg.modality == 'video':
                self.d_model = encoder_config.d_model_video
                self.d_ff = encoder_config.d_ff_video
            elif cfg.modality == 'audio':
                self.d_model = encoder_config.d_model_audio
                self.d_ff = encoder_config.d_ff_audio
            self.encoder = Encoder(self.d_model, encoder_config.dout_p,
                                   encoder_config.H, self.d_ff,
                                   encoder_config.N)
            # keep only the encoder weights and strip the 'encoder.' prefix
            # so the keys match the state dict of a standalone Encoder
            encoder_weights = {
                k: v
                for k, v in cap_model_cpt['model_state_dict'].items()
                if 'encoder' in k
            }
            encoder_weights = {
                k.replace('encoder.', ''): v
                for k, v in encoder_weights.items()
            }
            self.encoder.load_state_dict(encoder_weights)
            self.encoder = self.encoder.to(cfg.device)
            for param in self.encoder.parameters():
                param.requires_grad = cfg.finetune_prop_encoder
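
The checkpoint surgery above (filter the saved state dict down to the encoder weights, strip the prefix, load the result into a freshly built Encoder) is a generic PyTorch pattern. A minimal self-contained sketch of the same idea, using toy modules rather than the classes from this example:

    import torch
    import torch.nn as nn

    # toy "full model" whose encoder we want to reuse
    full = nn.ModuleDict({'encoder': nn.Linear(4, 4), 'decoder': nn.Linear(4, 4)})
    cpt = {'model_state_dict': full.state_dict()}  # stands in for torch.load(path, map_location='cpu')

    # keep only the encoder weights and strip the 'encoder.' prefix so the
    # keys match the state dict of a standalone module
    encoder_weights = {
        k.replace('encoder.', '', 1): v
        for k, v in cpt['model_state_dict'].items()
        if k.startswith('encoder.')
    }

    encoder = nn.Linear(4, 4)
    encoder.load_state_dict(encoder_weights)  # raises if keys are missing or unexpected
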
Example #2
    def __init__(self, cfg, anchors):
        super(ProposalGenerator, self).__init__()
        self.cfg = cfg
        self.EPS = 1e-16
        self.num_logits = 3  # per anchor: center (c), width (w), objectness (obj)
        self.anchors = anchors
        self.anchors_list = anchors[cfg.modality]
        self.anchors_num = len(self.anchors_list)

        if cfg.modality == 'video':
            self.d_feat = cfg.d_vid
            self.d_model_modality = cfg.d_model_video
            self.d_ff = cfg.d_ff_video
            layer_dims = [
                self.d_model_modality, *cfg.conv_layers_video,
                self.num_logits * self.anchors_num
            ]
        elif cfg.modality == 'audio':
            self.d_feat = cfg.d_aud
            self.d_model_modality = cfg.d_model_audio
            self.d_ff = cfg.d_ff_audio
            layer_dims = [
                self.d_model_modality, *cfg.conv_layers_audio,
                self.num_logits * self.anchors_num
            ]
        else:
            raise NotImplementedError

        if cfg.use_linear_embedder:
            self.emb = FeatureEmbedder(self.d_feat, self.d_model_modality)
        else:
            self.emb = Identity()
        self.pos_enc = PositionalEncoder(self.d_model_modality, cfg.dout_p)

        # load the pre-trained encoder from the captioning module
        if cfg.pretrained_cap_model_path is not None:
            print(f'Caption path: \n {cfg.pretrained_cap_model_path}')
            cap_model_cpt = torch.load(cfg.pretrained_cap_model_path,
                                       map_location='cpu')
            encoder_config = cap_model_cpt['config']
            if cfg.modality == 'video':
                self.d_model_modality = encoder_config.d_model_video
                self.d_ff = encoder_config.d_ff_video
            elif cfg.modality == 'audio':
                self.d_model_modality = encoder_config.d_model_audio
                self.d_ff = encoder_config.d_ff_audio
            else:
                raise NotImplementedError
            self.encoder = Encoder(self.d_model_modality,
                                   encoder_config.dout_p, encoder_config.H,
                                   self.d_ff, encoder_config.N)
            # keep only the encoder weights; here the stripped prefix is
            # 'module.encoder.' because the captioning checkpoint was
            # presumably saved from a model wrapped in nn.DataParallel,
            # which prepends 'module.' to every key
            encoder_weights = {
                k: v
                for k, v in cap_model_cpt['model_state_dict'].items()
                if 'encoder' in k
            }
            encoder_weights = {
                k.replace('module.encoder.', ''): v
                for k, v in encoder_weights.items()
            }
            self.encoder.load_state_dict(encoder_weights)
            self.encoder = self.encoder.to(cfg.device)
            for param in self.encoder.parameters():
                param.requires_grad = cfg.finetune_cap_encoder
        else:
            self.encoder = Encoder(self.d_model_modality, cfg.dout_p, cfg.H,
                                   self.d_ff, cfg.N)
            # encoder initialization
            for p in self.encoder.parameters():
                if p.dim() > 1:
                    nn.init.xavier_uniform_(p)

        self.detection_layers = torch.nn.ModuleList([
            ProposalGenerationHead(layer_dims, k, cfg.dout_p, cfg.layer_norm)
            for k in cfg.kernel_sizes[cfg.modality]
        ])

        print(self.detection_layers)
        self.bce_loss = nn.BCELoss()
        self.mse_loss = nn.MSELoss()
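
For intuition on how layer_dims is assembled: each detection head maps from the model dimension, through the configured convolutional widths, down to num_logits * anchors_num outputs per temporal position. A short sketch with made-up numbers (the real values come from cfg):

    # hypothetical config values, for illustration only
    d_model_video = 1024
    conv_layers_video = [512, 512]   # hidden widths of the conv stack
    num_logits = 3                   # center, width, objectness per anchor
    anchors_num = 128                # number of anchors for this modality

    layer_dims = [d_model_video, *conv_layers_video, num_logits * anchors_num]
    print(layer_dims)  # [1024, 512, 512, 384]
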
Example #3
    def __init__(self, cfg, train_dataset):
        super(BiModalTransformer, self).__init__()

        if cfg.use_linear_embedder:
            self.emb_A = FeatureEmbedder(cfg.d_aud, cfg.d_model_audio)
            self.emb_V = FeatureEmbedder(cfg.d_vid, cfg.d_model_video)
        else:
            self.emb_A = Identity()
            self.emb_V = Identity()

        self.emb_C = VocabularyEmbedder(train_dataset.trg_voc_size,
                                        cfg.d_model_caps)

        self.pos_enc_A = PositionalEncoder(cfg.d_model_audio, cfg.dout_p)
        self.pos_enc_V = PositionalEncoder(cfg.d_model_video, cfg.dout_p)
        self.pos_enc_C = PositionalEncoder(cfg.d_model_caps, cfg.dout_p)

        self.encoder = BiModalEncoder(cfg.d_model_audio, cfg.d_model_video,
                                      cfg.d_model, cfg.dout_p, cfg.H,
                                      cfg.d_ff_audio, cfg.d_ff_video, cfg.N)

        self.decoder = BiModelDecoder(cfg.d_model_audio, cfg.d_model_video,
                                      cfg.d_model_caps, cfg.d_model,
                                      cfg.dout_p, cfg.H, cfg.d_ff_caps, cfg.N)

        self.generator = Generator(cfg.d_model_caps,
                                   train_dataset.trg_voc_size)

        print('initialization: xavier')
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # initialize the word embeddings afterwards, so the pretrained vectors
        # replace the weights from the previous initialization
        self.emb_C.init_word_embeddings(train_dataset.train_vocab.vectors,
                                        cfg.unfreeze_word_emb)

        # load the pretrained encoder from the proposal generation module (used in ablation studies)
        if cfg.pretrained_prop_model_path is not None:
            print(f'Pretrained prop path: \n {cfg.pretrained_prop_model_path}')
            cap_model_cpt = torch.load(cfg.pretrained_prop_model_path,
                                       map_location='cpu')
            encoder_config = cap_model_cpt['config']
            self.encoder = BiModalEncoder(
                encoder_config.d_model_audio, encoder_config.d_model_video,
                encoder_config.d_model, encoder_config.dout_p,
                encoder_config.H, encoder_config.d_ff_audio,
                encoder_config.d_ff_video, encoder_config.N)
            encoder_weights = {
                k: v
                for k, v in cap_model_cpt['model_state_dict'].items()
                if 'encoder' in k
            }
            encoder_weights = {
                k.replace('encoder.', ''): v
                for k, v in encoder_weights.items()
            }
            self.encoder.load_state_dict(encoder_weights)
            self.encoder = self.encoder.to(cfg.device)
            for param in self.encoder.parameters():
                param.requires_grad = cfg.finetune_prop_encoder
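
The order of initialization matters in these constructors: Xavier touches every weight matrix with more than one dimension, including the caption embedding, so the pretrained word vectors have to be copied in afterwards to overwrite it. A stand-alone sketch of that ordering, with a hypothetical vocabulary size and vector table:

    import torch
    import torch.nn as nn

    emb = nn.Embedding(1000, 300)       # hypothetical vocab of 1000, 300-d vectors
    for p in emb.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)  # generic initialization first

    pretrained = torch.randn(1000, 300) # stands in for train_vocab.vectors
    with torch.no_grad():
        emb.weight.copy_(pretrained)    # then overwrite with the pretrained vectors
    emb.weight.requires_grad = False    # keep frozen unless unfreezing is requested
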
Example #4
    def __init__(self, cfg, anchors, nocuda=False):
        super(MultimodalProposalGenerator, self).__init__()
        assert cfg.modality == 'audio_video'
        self.cfg = cfg
        self.anchors = anchors
        self.EPS = 1e-16
        self.num_logits = 3  # per anchor: center (c), width (w), objectness (obj)
        self.nocuda = nocuda

        if cfg.use_linear_embedder:
            self.emb_V = FeatureEmbedder(cfg.d_vid, cfg.d_model_video)
            self.emb_A = FeatureEmbedder(cfg.d_aud, cfg.d_model_audio)
        else:
            self.emb_V = Identity()
            self.emb_A = Identity()
        self.pos_enc_V = PositionalEncoder(cfg.d_model_video, cfg.dout_p)
        self.pos_enc_A = PositionalEncoder(cfg.d_model_audio, cfg.dout_p)

        # load the pre-trained encoder from the captioning module
        if cfg.pretrained_cap_model_path is not None:
            print(
                f'Pretrained caption path: \n {cfg.pretrained_cap_model_path}')
            cap_model_cpt = torch.load(cfg.pretrained_cap_model_path,
                                       map_location='cpu')
            encoder_config = cap_model_cpt['config']
            self.encoder = BiModalEncoder(
                encoder_config.d_model_audio, encoder_config.d_model_video,
                encoder_config.d_model, encoder_config.dout_p,
                encoder_config.H, encoder_config.d_ff_audio,
                encoder_config.d_ff_video, encoder_config.N)
            encoder_weights = {
                k: v
                for k, v in cap_model_cpt['model_state_dict'].items()
                if 'encoder' in k
            }
            encoder_weights = {
                k.replace('module.encoder.', ''): v
                for k, v in encoder_weights.items()
            }
            self.encoder.load_state_dict(encoder_weights)
            if not nocuda:
                self.encoder = self.encoder.to(cfg.device)
            for param in self.encoder.parameters():
                param.requires_grad = cfg.finetune_cap_encoder
        else:
            self.encoder = BiModalEncoder(cfg.d_model_audio, cfg.d_model_video,
                                          cfg.d_model, cfg.dout_p, cfg.H,
                                          cfg.d_ff_audio, cfg.d_ff_video,
                                          cfg.N)
            # encoder initialization
            for p in self.encoder.parameters():
                if p.dim() > 1:
                    nn.init.xavier_uniform_(p)

        dims_A = [
            cfg.d_model_audio, *cfg.conv_layers_audio,
            self.num_logits * cfg.anchors_num_audio
        ]
        dims_V = [
            cfg.d_model_video, *cfg.conv_layers_video,
            self.num_logits * cfg.anchors_num_video
        ]
        self.detection_layers_A = torch.nn.ModuleList([
            ProposalGenerationHead(dims_A, k, cfg.dout_p, cfg.layer_norm)
            for k in cfg.kernel_sizes['audio']
        ])
        self.detection_layers_V = torch.nn.ModuleList([
            ProposalGenerationHead(dims_V, k, cfg.dout_p, cfg.layer_norm)
            for k in cfg.kernel_sizes['video']
        ])

        self.bce_loss = nn.BCELoss()
        self.mse_loss = nn.MSELoss()
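
In both proposal generators, whether the loaded encoder is frozen or fine-tuned comes down to a single flag toggled across its parameters. The same switch in isolation, with a hypothetical flag standing in for cfg.finetune_cap_encoder:

    import torch.nn as nn

    encoder = nn.Linear(8, 8)          # stands in for the loaded Encoder
    finetune_encoder = False           # hypothetical; e.g. cfg.finetune_cap_encoder

    for param in encoder.parameters():
        param.requires_grad = finetune_encoder

    # only trainable parameters need to reach the optimizer
    trainable = [p for p in encoder.parameters() if p.requires_grad]
    print(len(trainable))  # 0 when the encoder is frozen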