class BiModalTransformer(nn.Module):
    '''
    Forward:
        Inputs:
            src: {'rgb' & 'flow': (B, Sv, Dv), 'audio': (B, Sa, Da)}
            trg (C): (B, Sc)
            masks: {'V_mask': (B, 1, Sv), 'A_mask': (B, 1, Sa), 'C_mask': (B, Sc, Sc)}
        Output:
            C: (B, Sc, Vc)
    '''

    def __init__(self, cfg, train_dataset):
        super(BiModalTransformer, self).__init__()

        if cfg.use_linear_embedder:
            self.emb_A = FeatureEmbedder(cfg.d_aud, cfg.d_model_audio)
            self.emb_V = FeatureEmbedder(cfg.d_vid, cfg.d_model_video)
        else:
            self.emb_A = Identity()
            self.emb_V = Identity()

        # maps caption token indices to d_model_caps-dim word vectors
        self.emb_C = VocabularyEmbedder(train_dataset.trg_voc_size, cfg.d_model_caps)

        # add positional encoding to the feature matrices
        self.pos_enc_A = PositionalEncoder(cfg.d_model_audio, cfg.dout_p)
        self.pos_enc_V = PositionalEncoder(cfg.d_model_video, cfg.dout_p)
        self.pos_enc_C = PositionalEncoder(cfg.d_model_caps, cfg.dout_p)

        self.encoder = BiModalEncoder(
            cfg.d_model_audio, cfg.d_model_video, cfg.d_model, cfg.dout_p,
            cfg.H, cfg.d_ff_audio, cfg.d_ff_video, cfg.N)
        self.decoder = BiModelDecoder(
            cfg.d_model_audio, cfg.d_model_video, cfg.d_model_caps, cfg.d_model,
            cfg.dout_p, cfg.H, cfg.d_ff_caps, cfg.N)
        self.generator = Generator(cfg.d_model_caps, train_dataset.trg_voc_size)

        print('initialization: xavier')
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

        # initialize the word embeddings afterwards, so they replace the weights
        # set by the previous (Xavier) initialization
        self.emb_C.init_word_embeddings(train_dataset.train_vocab.vectors, cfg.unfreeze_word_emb)

        # load the pretrained encoder from the proposal generator (used in ablation studies)
        if cfg.pretrained_prop_model_path is not None:
            print(f'Pretrained prop path: \n {cfg.pretrained_prop_model_path}')
            cap_model_cpt = torch.load(cfg.pretrained_prop_model_path, map_location='cpu')
            encoder_config = cap_model_cpt['config']
            self.encoder = BiModalEncoder(
                encoder_config.d_model_audio, encoder_config.d_model_video,
                encoder_config.d_model, encoder_config.dout_p, encoder_config.H,
                encoder_config.d_ff_audio, encoder_config.d_ff_video, encoder_config.N)
            encoder_weights = {k: v for k, v in cap_model_cpt['model_state_dict'].items() if 'encoder' in k}
            encoder_weights = {k.replace('encoder.', ''): v for k, v in encoder_weights.items()}
            self.encoder.load_state_dict(encoder_weights)
            self.encoder = self.encoder.to(cfg.device)
            for param in self.encoder.parameters():
                param.requires_grad = cfg.finetune_prop_encoder

    def forward(self, src: dict, trg, masks: dict):
        # the rgb and flow features are summed element-wise to form the video stream
        V, A = src['rgb'] + src['flow'], src['audio']
        C = trg

        # (B, Sm, Dm) <- (B, Sm, Dm), m in [a, v]
        A = self.emb_A(A)
        V = self.emb_V(V)
        # (B, Sc, Dc) <- (B, Sc)
        C = self.emb_C(C)

        A = self.pos_enc_A(A)
        V = self.pos_enc_V(V)
        C = self.pos_enc_C(C)

        # notation: M1m2 (B, Sm1, Dm1), M1 is the target modality, m2 is the source modality,
        # i.e. Av is the audio stream attending to video and Va is the video stream attending to audio
        Av, Va = self.encoder((A, V), masks)

        # (B, Sc, Dc)
        C = self.decoder((C, (Av, Va)), masks)

        # (B, Sc, Vc) <- (B, Sc, Dc)
        C = self.generator(C)

        return C
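# ---------------------------------------------------------------------------
# NOTE (illustrative sketch, not part of the original module): the mask shapes
# in the docstring above follow the usual transformer convention, i.e. a
# broadcastable padding mask per feature modality and a combined padding plus
# subsequent (causal) mask for the caption. The hypothetical helper below only
# shows how masks with these shapes could be built; the repo constructs its
# masks elsewhere. It assumes `torch` is imported at the top of this file, as
# in the rest of the code.
# ---------------------------------------------------------------------------
def make_masks_sketch(V, A, C, pad_idx):
    """
    V: (B, Sv, Dv) video features, A: (B, Sa, Da) audio features,
    C: (B, Sc) caption token ids padded with `pad_idx`.
    Returns masks with the shapes expected by BiModalTransformer.forward.
    """
    # a feature position counts as padding if its whole vector is zero
    V_mask = (V.abs().sum(dim=-1) != 0).unsqueeze(1)                   # (B, 1, Sv)
    A_mask = (A.abs().sum(dim=-1) != 0).unsqueeze(1)                   # (B, 1, Sa)
    # caption: padding mask combined with a lower-triangular (subsequent) mask
    Sc = C.size(1)
    pad_mask = (C != pad_idx).unsqueeze(1)                             # (B, 1, Sc)
    subsequent = torch.tril(torch.ones(1, Sc, Sc, device=C.device)).bool()
    C_mask = pad_mask & subsequent                                     # (B, Sc, Sc)
    return {'V_mask': V_mask, 'A_mask': A_mask, 'C_mask': C_mask}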
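# ---------------------------------------------------------------------------
# NOTE (sketch): the checkpoint-loading blocks in BiModalTransformer above and
# in MultimodalProposalGenerator below differ only in the key prefix they strip
# from the state_dict: 'encoder.' vs 'module.encoder.'. The extra 'module.'
# prefix is what nn.DataParallel prepends to every key when a wrapped model is
# saved. The helper below is a hypothetical illustration of that remapping, not
# a function from this repo.
# ---------------------------------------------------------------------------
def strip_encoder_prefix_sketch(state_dict, prefix='module.encoder.'):
    """Keep only the encoder weights and drop the wrapping prefix from their keys."""
    return {k.replace(prefix, ''): v for k, v in state_dict.items() if prefix in k}

# e.g. {'module.encoder.layer.0.weight': w, 'decoder.bias': b}
#   -> {'layer.0.weight': w}       with prefix='module.encoder.'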
class MultimodalProposalGenerator(nn.Module):

    def __init__(self, cfg, anchors, nocuda=False):
        super(MultimodalProposalGenerator, self).__init__()
        assert cfg.modality == 'audio_video'
        self.cfg = cfg
        self.anchors = anchors
        self.EPS = 1e-16
        self.num_logits = 3  # 3: c, w, obj
        self.nocuda = nocuda

        if cfg.use_linear_embedder:
            self.emb_V = FeatureEmbedder(cfg.d_vid, cfg.d_model_video)
            self.emb_A = FeatureEmbedder(cfg.d_aud, cfg.d_model_audio)
        else:
            self.emb_V = Identity()
            self.emb_A = Identity()

        self.pos_enc_V = PositionalEncoder(cfg.d_model_video, cfg.dout_p)
        self.pos_enc_A = PositionalEncoder(cfg.d_model_audio, cfg.dout_p)

        # load the pretrained encoder from the captioning module
        if cfg.pretrained_cap_model_path is not None:
            print(f'Pretrained caption path: \n {cfg.pretrained_cap_model_path}')
            cap_model_cpt = torch.load(cfg.pretrained_cap_model_path, map_location='cpu')
            encoder_config = cap_model_cpt['config']
            self.encoder = BiModalEncoder(
                encoder_config.d_model_audio, encoder_config.d_model_video,
                encoder_config.d_model, encoder_config.dout_p, encoder_config.H,
                encoder_config.d_ff_audio, encoder_config.d_ff_video, encoder_config.N)
            encoder_weights = {k: v for k, v in cap_model_cpt['model_state_dict'].items() if 'encoder' in k}
            encoder_weights = {k.replace('module.encoder.', ''): v for k, v in encoder_weights.items()}
            self.encoder.load_state_dict(encoder_weights)
            self.encoder = self.encoder.to(cfg.device) if not nocuda else self.encoder
            for param in self.encoder.parameters():
                param.requires_grad = cfg.finetune_cap_encoder
        else:
            self.encoder = BiModalEncoder(
                cfg.d_model_audio, cfg.d_model_video, cfg.d_model, cfg.dout_p,
                cfg.H, cfg.d_ff_audio, cfg.d_ff_video, cfg.N)
            # encoder initialization
            for p in self.encoder.parameters():
                if p.dim() > 1:
                    nn.init.xavier_uniform_(p)

        dims_A = [cfg.d_model_audio, *cfg.conv_layers_audio, self.num_logits * cfg.anchors_num_audio]
        dims_V = [cfg.d_model_video, *cfg.conv_layers_video, self.num_logits * cfg.anchors_num_video]
        self.detection_layers_A = torch.nn.ModuleList([
            ProposalGenerationHead(dims_A, k, cfg.dout_p, cfg.layer_norm)
            for k in cfg.kernel_sizes['audio']
        ])
        self.detection_layers_V = torch.nn.ModuleList([
            ProposalGenerationHead(dims_V, k, cfg.dout_p, cfg.layer_norm)
            for k in cfg.kernel_sizes['video']
        ])

        self.bce_loss = nn.BCELoss()
        self.mse_loss = nn.MSELoss()

    def forward_modality(self, x, targets, detection, stride, anchors_list):
        anchors_num = len(anchors_list)
        # defaults, in case targets is None
        loss = 0
        losses = {}

        x = detection(x)
        B, S, D = x.shape
        x = x.view(B, S, anchors_num, self.num_logits)
        x = x.permute(0, 2, 1, 3).contiguous()

        grid_cell = torch.arange(S).view(1, 1, S).float()
        grid_cell = grid_cell if self.nocuda else grid_cell.to(self.cfg.device)

        # After dividing the anchors by the stride, they express how many grid
        # cells they overlap: 1.2 = one grid cell plus 20% of the next one.
        # Multiplying them by the stride recovers the original-scale values.
        anchors_list = [[anchor / stride] for anchor in anchors_list]
        anchors_tensor = torch.tensor(anchors_list) if self.nocuda else torch.tensor(anchors_list, device=self.cfg.device)
        # (A, 1) -> (1, A, 1) for broadcasting
        prior_length = anchors_tensor.view(1, anchors_num, 1)

        # prediction values for the *loss* calculation (training)
        sigma_c = torch.sigmoid(x[:, :, :, 0])  # center
        l = x[:, :, :, 1]                        # length
        sigma_o = torch.sigmoid(x[:, :, :, 2])  # objectness

        # prediction values that are going to be used for the original video;
        # we detach them from the graph as we don't need to backpropagate through them
        predictions = x.clone().detach()
        # broadcasting (B, A, S) + (1, 1, S)
        # For now, we are not going to multiply them by the stride since
        # we need them in make_targets
        predictions[:, :, :, 0] = sigma_c + grid_cell
        # broadcasting (1, A, 1) * (B, A, S)
        predictions[:, :, :, 1] = prior_length * torch.exp(l)
        predictions[:, :, :, 2] = sigma_o

        if targets is not None:
            obj_mask, noobj_mask, gt_x, gt_w, gt_obj = make_targets(predictions, targets, anchors_tensor, stride)

            ## Loss
            # Localization
            loss_x = self.mse_loss(sigma_c[obj_mask], gt_x[obj_mask])
            loss_w = self.mse_loss(l[obj_mask], gt_w[obj_mask])
            loss_loc = loss_x + loss_w
            # Confidence
            loss_obj = self.bce_loss(sigma_o[obj_mask], gt_obj[obj_mask])
            loss_noobj = self.bce_loss(sigma_o[noobj_mask], gt_obj[noobj_mask])
            loss_conf = self.cfg.obj_coeff * loss_obj + self.cfg.noobj_coeff * loss_noobj
            # Total loss
            loss = loss_loc + loss_conf

            losses = {
                'loss_x': loss_x,
                'loss_w': loss_w,
                'loss_conf_obj': loss_obj,
                'loss_conf_noobj': loss_noobj,
            }

        # for NMS: (B, A, S, 3) -> (B, A*S, 3)
        predictions = predictions.view(B, S * anchors_num, self.num_logits)
        predictions[:, :, :2] *= stride

        return predictions, loss, losses

    def forward(self, x, targets, masks):
        V, A = x['rgb'] + x['flow'], x['audio']

        # (B, Sm, Dm) <- (B, Sm, Dm), m in [a, v]
        A = self.emb_A(A)
        V = self.emb_V(V)
        A = self.pos_enc_A(A)
        V = self.pos_enc_V(V)

        # notation: M1m2 (B, Sm1, Dm1), M1 is the target modality, m2 is the source modality
        Av, Va = self.encoder((A, V), masks)

        all_predictions_A = []
        all_predictions_V = []
        # total_loss is the quantity backward() should be called on
        sum_losses_dict_A = {}
        sum_losses_dict_V = {}
        total_loss_A = 0
        total_loss_V = 0

        for layer in self.detection_layers_A:
            props_A, loss_A, losses_A = self.forward_modality(
                Av, targets, layer, self.cfg.strides['audio'], self.anchors['audio'])
            total_loss_A += loss_A
            all_predictions_A.append(props_A)
            sum_losses_dict_A = add_dict_to_another_dict(losses_A, sum_losses_dict_A)

        for layer in self.detection_layers_V:
            props_V, loss_V, losses_V = self.forward_modality(
                Va, targets, layer, self.cfg.strides['video'], self.anchors['video'])
            total_loss_V += loss_V
            all_predictions_V.append(props_V)
            sum_losses_dict_V = add_dict_to_another_dict(losses_V, sum_losses_dict_V)

        all_predictions_A = torch.cat(all_predictions_A, dim=1)
        all_predictions_V = torch.cat(all_predictions_V, dim=1)

        total_loss = total_loss_A + total_loss_V

        # combine predictions
        all_predictions = torch.cat([all_predictions_A, all_predictions_V], dim=1)
        # if you'd like half of the predictions to come from the audio and half from the video modality:
        # all_predictions = torch.cat([
        #     select_topk_predictions(all_predictions_A, k=self.cfg.max_prop_per_vid // 2),
        #     select_topk_predictions(all_predictions_V, k=self.cfg.max_prop_per_vid // 2)
        # ], dim=1)

        return all_predictions, total_loss, sum_losses_dict_A, sum_losses_dict_V
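# ---------------------------------------------------------------------------
# NOTE (sketch): a scalar walk-through of the proposal decoding performed in
# forward_modality above: center = sigmoid(raw_c) + grid_cell and
# length = (anchor / stride) * exp(raw_l), both later scaled back by the
# temporal stride. The function name and the example numbers are hypothetical;
# only the formulas mirror the code. It uses math.* so it runs on plain floats.
# ---------------------------------------------------------------------------
import math

def decode_proposal_sketch(raw_c, raw_l, grid_cell, anchor, stride):
    """Decode one raw (center, length) prediction into feature-frame units."""
    prior = anchor / stride                                          # anchor length in grid cells
    center = (1.0 / (1.0 + math.exp(-raw_c)) + grid_cell) * stride   # sigmoid(raw_c) + cell index
    length = prior * math.exp(raw_l) * stride
    return center, length

# e.g. raw_c = raw_l = 0.0 at grid cell 4, with a 48-frame anchor and stride 8:
# center = (0.5 + 4) * 8 = 36, length = (48 / 8) * exp(0) * 8 = 48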