Code Example #1
    def __init__(self, cfg, train_dataset):
        super(BiModalTransformer, self).__init__()

        if cfg.use_linear_embedder:
            self.emb_A = FeatureEmbedder(cfg.d_aud, cfg.d_model_audio)
            self.emb_V = FeatureEmbedder(cfg.d_vid, cfg.d_model_video)
        else:
            self.emb_A = Identity()
            self.emb_V = Identity()

        self.emb_C = VocabularyEmbedder(train_dataset.trg_voc_size,
                                        cfg.d_model_caps)

        self.pos_enc_A = PositionalEncoder(cfg.d_model_audio, cfg.dout_p)
        self.pos_enc_V = PositionalEncoder(cfg.d_model_video, cfg.dout_p)
        self.pos_enc_C = PositionalEncoder(cfg.d_model_caps, cfg.dout_p)

        self.encoder = BiModalEncoder(cfg.d_model_audio, cfg.d_model_video,
                                      cfg.d_model, cfg.dout_p, cfg.H,
                                      cfg.d_ff_audio, cfg.d_ff_video, cfg.N)

        self.decoder = BiModelDecoder(cfg.d_model_audio, cfg.d_model_video,
                                      cfg.d_model_caps, cfg.d_model,
                                      cfg.dout_p, cfg.H, cfg.d_ff_caps, cfg.N)

        self.generator = Generator(cfg.d_model_caps,
                                   train_dataset.trg_voc_size)

        print('initialization: xavier')
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # initialize embedding after, so it will replace the weights
        # of the prev. initialization
        self.emb_C.init_word_embeddings(train_dataset.train_vocab.vectors,
                                        cfg.unfreeze_word_emb)

        # load the pretrained encoder from the proposal (used in ablation studies)
        if cfg.pretrained_prop_model_path is not None:
            print(f'Pretrained prop path: \n {cfg.pretrained_prop_model_path}')
            cap_model_cpt = torch.load(cfg.pretrained_prop_model_path,
                                       map_location='cpu')
            encoder_config = cap_model_cpt['config']
            self.encoder = BiModalEncoder(
                encoder_config.d_model_audio, encoder_config.d_model_video,
                encoder_config.d_model, encoder_config.dout_p,
                encoder_config.H, encoder_config.d_ff_audio,
                encoder_config.d_ff_video, encoder_config.N)
            encoder_weights = {
                k: v
                for k, v in cap_model_cpt['model_state_dict'].items()
                if 'encoder' in k
            }
            encoder_weights = {
                k.replace('encoder.', ''): v
                for k, v in encoder_weights.items()
            }
            self.encoder.load_state_dict(encoder_weights)
            self.encoder = self.encoder.to(cfg.device)
            for param in self.encoder.parameters():
                param.requires_grad = cfg.finetune_prop_encoder
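The checkpoint surgery above (filter the encoder entries out of model_state_dict, strip their prefix, then call load_state_dict) is a reusable pattern. Below is a minimal sketch of the same idea as a standalone helper; the function name extract_submodule_state_dict and the commented usage are illustrative, not part of the repository.

import torch

def extract_submodule_state_dict(state_dict, prefix):
    """Keep only the entries belonging to a submodule and strip its prefix.

    E.g. prefix='encoder.' turns 'encoder.layers.0.weight' into 'layers.0.weight'
    so the result can be loaded directly into the submodule. Note that this
    filters strictly by prefix, which is slightly stricter than the
    "'encoder' in k" substring check used above.
    """
    return {k[len(prefix):]: v for k, v in state_dict.items() if k.startswith(prefix)}

# hypothetical usage, mirroring the loading code above:
# cpt = torch.load(cfg.pretrained_prop_model_path, map_location='cpu')
# encoder.load_state_dict(extract_submodule_state_dict(cpt['model_state_dict'], 'encoder.'))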
Code Example #2
import torch
import torch.nn as nn

# FeatureEmbedder, Identity, VocabularyEmbedder, PositionalEncoder, BiModalEncoder,
# BiModelDecoder and Generator are helper modules defined elsewhere in the project.


class BiModalTransformer(nn.Module):
    '''
    Forward:
        Inputs:
            src: {'rgb' & 'flow': (B, Sv, Dv), 'audio': (B, Sa, Da)}
            trg (C): (B, Sc)
            masks: {'V_mask': (B, 1, Sv), 'A_mask': (B, 1, Sa), 'C_mask': (B, Sc, Sc)}
        Output:
            C: (B, Sc, Vc)
    '''
    def __init__(self, cfg, train_dataset):
        super(BiModalTransformer, self).__init__()

        if cfg.use_linear_embedder:
            self.emb_A = FeatureEmbedder(cfg.d_aud, cfg.d_model_audio)
            self.emb_V = FeatureEmbedder(cfg.d_vid, cfg.d_model_video)
        else:
            self.emb_A = Identity()
            self.emb_V = Identity()

        self.emb_C = VocabularyEmbedder(train_dataset.trg_voc_size,
                                        cfg.d_model_caps)

        self.pos_enc_A = PositionalEncoder(cfg.d_model_audio, cfg.dout_p)
        self.pos_enc_V = PositionalEncoder(cfg.d_model_video, cfg.dout_p)
        self.pos_enc_C = PositionalEncoder(cfg.d_model_caps, cfg.dout_p)

        self.encoder = BiModalEncoder(cfg.d_model_audio, cfg.d_model_video,
                                      cfg.d_model, cfg.dout_p, cfg.H,
                                      cfg.d_ff_audio, cfg.d_ff_video, cfg.N)

        self.decoder = BiModelDecoder(cfg.d_model_audio, cfg.d_model_video,
                                      cfg.d_model_caps, cfg.d_model,
                                      cfg.dout_p, cfg.H, cfg.d_ff_caps, cfg.N)

        self.generator = Generator(cfg.d_model_caps,
                                   train_dataset.trg_voc_size)

        print('initialization: xavier')
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # initialize embedding after, so it will replace the weights
        # of the prev. initialization
        self.emb_C.init_word_embeddings(train_dataset.train_vocab.vectors,
                                        cfg.unfreeze_word_emb)

        # load the pretrained encoder from the proposal (used in ablation studies)
        if cfg.pretrained_prop_model_path is not None:
            print(f'Pretrained prop path: \n {cfg.pretrained_prop_model_path}')
            cap_model_cpt = torch.load(cfg.pretrained_prop_model_path,
                                       map_location='cpu')
            encoder_config = cap_model_cpt['config']
            self.encoder = BiModalEncoder(
                encoder_config.d_model_audio, encoder_config.d_model_video,
                encoder_config.d_model, encoder_config.dout_p,
                encoder_config.H, encoder_config.d_ff_audio,
                encoder_config.d_ff_video, encoder_config.N)
            encoder_weights = {
                k: v
                for k, v in cap_model_cpt['model_state_dict'].items()
                if 'encoder' in k
            }
            encoder_weights = {
                k.replace('encoder.', ''): v
                for k, v in encoder_weights.items()
            }
            self.encoder.load_state_dict(encoder_weights)
            self.encoder = self.encoder.to(cfg.device)
            for param in self.encoder.parameters():
                param.requires_grad = cfg.finetune_prop_encoder

    def forward(self, src: dict, trg, masks: dict):
        V, A = src['rgb'] + src['flow'], src['audio']
        C = trg

        # (B, Sm, Dm) <- (B, Sm, Dm), m in [a, v];
        A = self.emb_A(A)
        V = self.emb_V(V)
        # (B, Sc, Dc) <- (B, Sc)
        C = self.emb_C(C)

        A = self.pos_enc_A(A)
        V = self.pos_enc_V(V)
        C = self.pos_enc_C(C)

        # notation: M1m2 (B, Sm1, Dm1), M1 is the target modality, m2 is the source modality
        Av, Va = self.encoder((A, V), masks)

        # (B, Sc, Dc)
        C = self.decoder((C, (Av, Va)), masks)

        # (B, Sc, Vc) <- (B, Sc, Dc)
        C = self.generator(C)

        return C
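The docstring above fixes the mask shapes the model expects. The sketch below shows one way such masks could be built, assuming padded audio/video steps are all-zero feature vectors and captions use a padding index; pad_idx and the helper name are assumptions for illustration, not the repository's API.

import torch

def make_masks_sketch(V, A, C, pad_idx=1):
    """Illustrative mask construction matching the docstring shapes.

    V: (B, Sv, Dv), A: (B, Sa, Da) -- a time step counts as padding if its
    feature vector is all zeros. C: (B, Sc) caption token indices.
    """
    Sc = C.shape[1]
    # (B, 1, Sv) and (B, 1, Sa): True where the step carries real content
    V_mask = (V.abs().sum(dim=-1) != 0).unsqueeze(1)
    A_mask = (A.abs().sum(dim=-1) != 0).unsqueeze(1)
    # (B, 1, Sc) padding mask combined with a (1, Sc, Sc) lower-triangular mask
    # that stops each caption position from attending to future positions
    C_pad = (C != pad_idx).unsqueeze(1)
    subsequent = torch.tril(torch.ones(1, Sc, Sc)).bool()
    C_mask = C_pad & subsequent  # broadcasts to (B, Sc, Sc)
    return {'V_mask': V_mask, 'A_mask': A_mask, 'C_mask': C_mask}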
Code Example #3
    def __init__(self, cfg, anchors, nocuda=False):
        super(MultimodalProposalGenerator, self).__init__()
        assert cfg.modality == 'audio_video'
        self.cfg = cfg
        self.anchors = anchors
        self.EPS = 1e-16
        self.num_logits = 3  # 3: c, w, obj
        self.nocuda = nocuda

        if cfg.use_linear_embedder:
            self.emb_V = FeatureEmbedder(cfg.d_vid, cfg.d_model_video)
            self.emb_A = FeatureEmbedder(cfg.d_aud, cfg.d_model_audio)
        else:
            self.emb_V = Identity()
            self.emb_A = Identity()
        self.pos_enc_V = PositionalEncoder(cfg.d_model_video, cfg.dout_p)
        self.pos_enc_A = PositionalEncoder(cfg.d_model_audio, cfg.dout_p)

        # load the pre-trained encoder from captioning module
        if cfg.pretrained_cap_model_path is not None:
            print(
                f'Pretrained caption path: \n {cfg.pretrained_cap_model_path}')
            cap_model_cpt = torch.load(cfg.pretrained_cap_model_path,
                                       map_location='cpu')
            encoder_config = cap_model_cpt['config']
            self.encoder = BiModalEncoder(
                encoder_config.d_model_audio, encoder_config.d_model_video,
                encoder_config.d_model, encoder_config.dout_p,
                encoder_config.H, encoder_config.d_ff_audio,
                encoder_config.d_ff_video, encoder_config.N)
            encoder_weights = {
                k: v
                for k, v in cap_model_cpt['model_state_dict'].items()
                if 'encoder' in k
            }
            encoder_weights = {
                k.replace('module.encoder.', ''): v
                for k, v in encoder_weights.items()
            }
            self.encoder.load_state_dict(encoder_weights)
            self.encoder = self.encoder.to(
                cfg.device) if not nocuda else self.encoder
            for param in self.encoder.parameters():
                param.requires_grad = cfg.finetune_cap_encoder
        else:
            self.encoder = BiModalEncoder(cfg.d_model_audio, cfg.d_model_video,
                                          cfg.d_model, cfg.dout_p, cfg.H,
                                          cfg.d_ff_audio, cfg.d_ff_video,
                                          cfg.N)
            # encoder initialization
            for p in self.encoder.parameters():
                if p.dim() > 1:
                    nn.init.xavier_uniform_(p)

        dims_A = [
            cfg.d_model_audio, *cfg.conv_layers_audio,
            self.num_logits * cfg.anchors_num_audio
        ]
        dims_V = [
            cfg.d_model_video, *cfg.conv_layers_video,
            self.num_logits * cfg.anchors_num_video
        ]
        self.detection_layers_A = torch.nn.ModuleList([
            ProposalGenerationHead(dims_A, k, cfg.dout_p, cfg.layer_norm)
            for k in cfg.kernel_sizes['audio']
        ])
        self.detection_layers_V = torch.nn.ModuleList([
            ProposalGenerationHead(dims_V, k, cfg.dout_p, cfg.layer_norm)
            for k in cfg.kernel_sizes['video']
        ])

        self.bce_loss = nn.BCELoss()
        self.mse_loss = nn.MSELoss()
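ProposalGenerationHead is defined elsewhere in the project, but the dims_A / dims_V lists above follow a common convention: consecutive entries are the in/out channel counts of a stack of 1-D convolutions whose final layer emits num_logits * anchors_num values per time step. Below is a hedged sketch of that convention; it is an illustration under that assumption, not the repository's implementation.

import torch.nn as nn

class Conv1dHeadSketch(nn.Module):
    """Consecutive entries of dims become Conv1d in/out channels; odd kernel
    sizes with symmetric padding keep the temporal length unchanged."""
    def __init__(self, dims, kernel_size, dout_p):
        super().__init__()
        layers = []
        for d_in, d_out in zip(dims[:-1], dims[1:]):
            layers += [
                nn.Conv1d(d_in, d_out, kernel_size, padding=kernel_size // 2),
                nn.ReLU(),
                nn.Dropout(dout_p),
            ]
        # drop the activation/dropout after the final (logit-producing) conv
        self.net = nn.Sequential(*layers[:-2])

    def forward(self, x):             # x: (B, S, D)
        x = x.permute(0, 2, 1)        # Conv1d expects (B, D, S)
        x = self.net(x)
        return x.permute(0, 2, 1)     # (B, S, num_logits * anchors_num)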
Code Example #4
class BiModalTransformer(nn.Module):
    '''
    Forward:
        Inputs:
            src: {'rgb' & 'flow': (B, Sv, Dv), 'audio': (B, Sa, Da)}
            trg (C): (B, Sc)
            masks: {'V_mask': (B, 1, Sv), 'A_mask': (B, 1, Sa), 'C_mask': (B, Sc, Sc)}
        Output:
            C: (B, Sc, Vc)   word indices expand to scores over the vocabulary, hence the extra dimension Vc
    '''
    def __init__(self, cfg, train_dataset):
        super(BiModalTransformer, self).__init__()

        if cfg.use_linear_embedder:
            self.emb_A = FeatureEmbedder(cfg.d_aud, cfg.d_model_audio)
            self.emb_V = FeatureEmbedder(cfg.d_vid, cfg.d_model_video)
        else:
            self.emb_A = Identity()  # 128
            self.emb_V = Identity()  # 1024

        self.emb_C = VocabularyEmbedder(train_dataset.trg_voc_size,
                                        cfg.d_model_caps)  # (10172,300)

        # print('cfg.d_model_audio:\n',cfg.d_model_audio)          # 128
        # print('cfg.d_model_video:\n', cfg.d_model_video)         # 1024
        # print('cfg.d_model_caps:\n', cfg.d_model_caps)           # 300
        # the positional encoders return the feature matrices with positional information added
        self.pos_enc_A = PositionalEncoder(cfg.d_model_audio,
                                           cfg.dout_p)  # (32,*,128)
        self.pos_enc_V = PositionalEncoder(cfg.d_model_video,
                                           cfg.dout_p)  # (32,*,1024)
        self.pos_enc_C = PositionalEncoder(cfg.d_model_caps,
                                           cfg.dout_p)  # (32,*,300)

        # pdb.set_trace()

        self.encoder = BiModalEncoder(cfg.d_model_audio, cfg.d_model_video,
                                      cfg.d_model, cfg.dout_p, cfg.H,
                                      cfg.d_ff_audio, cfg.d_ff_video, cfg.N)

        self.decoder = BiModelDecoder(cfg.d_model_audio, cfg.d_model_video,
                                      cfg.d_model_caps, cfg.d_model,
                                      cfg.dout_p, cfg.H, cfg.d_ff_caps, cfg.N)

        self.generator = Generator(cfg.d_model_caps,
                                   train_dataset.trg_voc_size)

        print('initialization: xavier')
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # initialize embedding after, so it will replace the weights
        # of the prev. initialization
        # map the word indices to pretrained word vectors
        self.emb_C.init_word_embeddings(train_dataset.train_vocab.vectors,
                                        cfg.unfreeze_word_emb)

        # load the pre-trained encoder from the proposal (used in ablation studies)
        if cfg.pretrained_prop_model_path is not None:
            print(f'Pretrained prop path: \n {cfg.pretrained_prop_model_path}')
            cap_model_cpt = torch.load(cfg.pretrained_prop_model_path,
                                       map_location='cpu')
            encoder_config = cap_model_cpt['config']
            self.encoder = BiModalEncoder(
                encoder_config.d_model_audio, encoder_config.d_model_video,
                encoder_config.d_model, encoder_config.dout_p,
                encoder_config.H, encoder_config.d_ff_audio,
                encoder_config.d_ff_video, encoder_config.N)
            encoder_weights = {
                k: v
                for k, v in cap_model_cpt['model_state_dict'].items()
                if 'encoder' in k
            }
            encoder_weights = {
                k.replace('encoder.', ''): v
                for k, v in encoder_weights.items()
            }
            self.encoder.load_state_dict(encoder_weights)
            self.encoder = self.encoder.to(cfg.device)
            for param in self.encoder.parameters():
                param.requires_grad = cfg.finetune_prop_encoder

    def forward(self, src: dict, trg, masks: dict):
        # V.shape=(32,*,1024),A.shape=(32,*,128)
        V, A = src['rgb'] + src['flow'], src['audio']  # rgb and flow features are summed element-wise to form V
        C = trg
        print('V.shape,A.shape,C.shape:\n', V.shape, A.shape, C.shape)
        # print('V,A,C:\n', V[0], A[0], C[0])

        # (B, Sm, Dm) <- (B, Sm, Dm), m in [a, v];
        A = self.emb_A(A)
        V = self.emb_V(V)
        # (B, Sc, Dc) <- (B, Sc)
        C = self.emb_C(C)

        A = self.pos_enc_A(A)
        V = self.pos_enc_V(V)
        C = self.pos_enc_C(C)
        # print('WordEmbedding:\n', C[0][0])   # embedding vector of the first word of the first caption in the batch

        # notation: M1m2 (B, Sm1, Dm1), M1 is the target modality, m2 is the source modality
        # Av--M1m2 (B, Sm1, Dm1), Va--M2m1 (B, Sm2, Dm2)
        Av, Va = self.encoder((A, V), masks)

        # (B, Sc, Dc)
        C = self.decoder((C, (Av, Va)), masks)

        # (B, Sc, Vocabc) <- (B, Sc, Dc)
        C = self.generator(C)

        return C
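The init_word_embeddings call above overwrites the Xavier-initialized caption embedding with pretrained word vectors (the 300-dimensional vectors noted in the shape comments) and decides whether they stay trainable. A minimal sketch of what such an initializer typically does; the function below is an assumption for illustration, not the repository's implementation.

import torch
import torch.nn as nn

def init_word_embeddings_sketch(embedding: nn.Embedding, vectors: torch.Tensor, unfreeze: bool):
    """Copy pretrained vectors of shape (vocab_size, emb_dim), e.g. (10172, 300),
    into the embedding table and optionally freeze it."""
    assert embedding.weight.shape == vectors.shape
    with torch.no_grad():
        embedding.weight.copy_(vectors)
    # unfreeze=True keeps the embeddings trainable, mirroring cfg.unfreeze_word_emb
    embedding.weight.requires_grad = unfreeze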
Code Example #5
import torch
import torch.nn as nn

# FeatureEmbedder, Identity, PositionalEncoder, BiModalEncoder, ProposalGenerationHead,
# make_targets, add_dict_to_another_dict and select_topk_predictions are defined
# elsewhere in the project.


class MultimodalProposalGenerator(nn.Module):
    def __init__(self, cfg, anchors, nocuda=False):
        super(MultimodalProposalGenerator, self).__init__()
        assert cfg.modality == 'audio_video'
        self.cfg = cfg
        self.anchors = anchors
        self.EPS = 1e-16
        self.num_logits = 3  # 3: c, w, obj
        self.nocuda = nocuda

        if cfg.use_linear_embedder:
            self.emb_V = FeatureEmbedder(cfg.d_vid, cfg.d_model_video)
            self.emb_A = FeatureEmbedder(cfg.d_aud, cfg.d_model_audio)
        else:
            self.emb_V = Identity()
            self.emb_A = Identity()
        self.pos_enc_V = PositionalEncoder(cfg.d_model_video, cfg.dout_p)
        self.pos_enc_A = PositionalEncoder(cfg.d_model_audio, cfg.dout_p)

        # load the pre-trained encoder from captioning module
        if cfg.pretrained_cap_model_path is not None:
            print(
                f'Pretrained caption path: \n {cfg.pretrained_cap_model_path}')
            cap_model_cpt = torch.load(cfg.pretrained_cap_model_path,
                                       map_location='cpu')
            encoder_config = cap_model_cpt['config']
            self.encoder = BiModalEncoder(
                encoder_config.d_model_audio, encoder_config.d_model_video,
                encoder_config.d_model, encoder_config.dout_p,
                encoder_config.H, encoder_config.d_ff_audio,
                encoder_config.d_ff_video, encoder_config.N)
            encoder_weights = {
                k: v
                for k, v in cap_model_cpt['model_state_dict'].items()
                if 'encoder' in k
            }
            encoder_weights = {
                k.replace('module.encoder.', ''): v
                for k, v in encoder_weights.items()
            }
            self.encoder.load_state_dict(encoder_weights)
            self.encoder = self.encoder.to(
                cfg.device) if not nocuda else self.encoder
            for param in self.encoder.parameters():
                param.requires_grad = cfg.finetune_cap_encoder
        else:
            self.encoder = BiModalEncoder(cfg.d_model_audio, cfg.d_model_video,
                                          cfg.d_model, cfg.dout_p, cfg.H,
                                          cfg.d_ff_audio, cfg.d_ff_video,
                                          cfg.N)
            # encoder initialization
            for p in self.encoder.parameters():
                if p.dim() > 1:
                    nn.init.xavier_uniform_(p)

        dims_A = [
            cfg.d_model_audio, *cfg.conv_layers_audio,
            self.num_logits * cfg.anchors_num_audio
        ]
        dims_V = [
            cfg.d_model_video, *cfg.conv_layers_video,
            self.num_logits * cfg.anchors_num_video
        ]
        self.detection_layers_A = torch.nn.ModuleList([
            ProposalGenerationHead(dims_A, k, cfg.dout_p, cfg.layer_norm)
            for k in cfg.kernel_sizes['audio']
        ])
        self.detection_layers_V = torch.nn.ModuleList([
            ProposalGenerationHead(dims_V, k, cfg.dout_p, cfg.layer_norm)
            for k in cfg.kernel_sizes['video']
        ])

        self.bce_loss = nn.BCELoss()
        self.mse_loss = nn.MSELoss()

    def forward_modality(self, x, targets, detection, stride, anchors_list):
        anchors_num = len(anchors_list)
        # in case targets is None
        loss = 0
        losses = {}

        x = detection(x)

        B, S, D = x.shape
        x = x.view(B, S, anchors_num, self.num_logits)

        x = x.permute(0, 2, 1, 3).contiguous()
        grid_cell = torch.arange(S).view(1, 1, S).float()
        grid_cell = grid_cell.to(self.cfg.device) if not self.nocuda else grid_cell
        # After dividing the anchors by the stride, they express how many grid
        # cells they span: 1.2 means one full grid cell plus 20% of another.
        # Multiplying them by the stride later converts them back to pixel values.
        anchors_list = [[anchor / stride] for anchor in anchors_list]
        anchors_tensor = (torch.tensor(anchors_list, device=self.cfg.device)
                          if not self.nocuda else torch.tensor(anchors_list))
        # (A, 1) -> (1, A, 1) for broadcasting
        prior_length = anchors_tensor.view(1, anchors_num, 1)

        # prediction values for the *loss* calculation (training)
        sigma_c = torch.sigmoid(x[:, :, :, 0])  # center
        l = x[:, :, :, 1]  # length
        sigma_o = torch.sigmoid(x[:, :, :, 2])  # objectness

        # prediction values that are going to be used for the original scale;
        # we need to detach them from the graph as we don't need to backpropagate
        # through them
        predictions = x.clone().detach()
        # broadcasting (B, A, S) + (1, 1, S)
        # For now, we are not going to multiply them by stride since
        # we need them in make_targets
        predictions[:, :, :, 0] = sigma_c + grid_cell
        # broadcasting (1, A, 1) * (B, A, S)
        predictions[:, :, :, 1] = prior_length * torch.exp(l)
        predictions[:, :, :, 2] = sigma_o

        if targets is not None:
            obj_mask, noobj_mask, gt_x, gt_w, gt_obj = make_targets(
                predictions, targets, anchors_tensor, stride)
            ## Loss
            # Localization
            loss_x = self.mse_loss(sigma_c[obj_mask], gt_x[obj_mask])
            loss_w = self.mse_loss(l[obj_mask], gt_w[obj_mask])
            loss_loc = loss_x + loss_w
            # Confidence
            loss_obj = self.bce_loss(sigma_o[obj_mask], gt_obj[obj_mask])
            loss_noobj = self.bce_loss(sigma_o[noobj_mask], gt_obj[noobj_mask])
            loss_conf = self.cfg.obj_coeff * loss_obj + self.cfg.noobj_coeff * loss_noobj
            # Total loss
            loss = loss_loc + loss_conf

            losses = {
                'loss_x': loss_x,
                'loss_w': loss_w,
                'loss_conf_obj': loss_obj,
                'loss_conf_noobj': loss_noobj
            }

        # for NMS: (B, A, S, 3) -> (B, A*S, 3)
        predictions = predictions.view(B, S * anchors_num, self.num_logits)
        predictions[:, :, :2] *= stride

        return predictions, loss, losses

    def forward(self, x, targets, masks):
        V, A = x['rgb'] + x['flow'], x['audio']

        # (B, Sm, Dm) <- (B, Sm, Dm), m in [a, v]
        A = self.emb_A(A)
        V = self.emb_V(V)
        A = self.pos_enc_A(A)
        V = self.pos_enc_V(V)
        # notation: M1m2 (B, Sm1, Dm1), M1 is the target modality, m2 is the source modality
        Av, Va = self.encoder((A, V), masks)

        all_predictions_A = []
        all_predictions_V = []
        # total_loss must stay attached to the graph so .backward() can be called on it
        sum_losses_dict_A = {}
        sum_losses_dict_V = {}
        total_loss_A = 0
        total_loss_V = 0

        for layer in self.detection_layers_A:
            props_A, loss_A, losses_A = self.forward_modality(
                Av, targets, layer, self.cfg.strides['audio'],
                self.anchors['audio'])
            total_loss_A += loss_A
            all_predictions_A.append(props_A)
            sum_losses_dict_A = add_dict_to_another_dict(
                losses_A, sum_losses_dict_A)

        for layer in self.detection_layers_V:
            props_V, loss_V, losses_V = self.forward_modality(
                Va, targets, layer, self.cfg.strides['video'],
                self.anchors['video'])
            total_loss_V += loss_V
            all_predictions_V.append(props_V)
            sum_losses_dict_V = add_dict_to_another_dict(
                losses_V, sum_losses_dict_V)

        all_predictions_A = torch.cat(all_predictions_A, dim=1)
        all_predictions_V = torch.cat(all_predictions_V, dim=1)

        total_loss = total_loss_A + total_loss_V

        # combine predictions
        all_predictions = torch.cat([all_predictions_A, all_predictions_V],
                                    dim=1)
        # if you would like the predictions to be half from the audio and half from the video modality
        # all_predictions = torch.cat([
        #     select_topk_predictions(all_predictions_A, k=self.cfg.max_prop_per_vid // 2),
        #     select_topk_predictions(all_predictions_V, k=self.cfg.max_prop_per_vid // 2)
        # ], dim=1)

        return all_predictions, total_loss, sum_losses_dict_A, sum_losses_dict_V
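To make the decoding in forward_modality concrete, here is a tiny worked example with made-up numbers (one anchor, four grid cells, stride 8): the raw center logit is squashed with a sigmoid and offset by its grid-cell index, the raw length rescales the anchor prior through exp, and multiplying by the stride converts both back to the original temporal scale, exactly as done before returning the predictions.

import torch

stride = 8
anchor = torch.tensor([24.0])                      # anchor length on the original scale
prior_length = (anchor / stride).view(1, 1, 1)     # 3.0 grid cells
grid_cell = torch.arange(4).view(1, 1, 4).float()  # (1, 1, S) cell indices

raw_c = torch.tensor([[[0.0, 0.5, -1.0, 2.0]]])    # (B=1, A=1, S=4) raw center logits
raw_l = torch.tensor([[[0.0, 0.2, -0.3, 0.1]]])    # raw length logits

center = torch.sigmoid(raw_c) + grid_cell          # offset inside each grid cell
length = prior_length * torch.exp(raw_l)           # anchor prior rescaled by exp(l)

print(center * stride)   # centers on the original scale
print(length * stride)   # lengths on the original scale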