def __init__(self, train_dataset, cfg):
    super(Transformer, self).__init__()
    self.modality = cfg.modality

    if cfg.modality == 'video':
        self.d_model = cfg.d_model_video
        self.d_feat = cfg.d_vid
        self.d_ff = cfg.d_ff_video
    elif cfg.modality == 'audio':
        self.d_feat = cfg.d_aud
        self.d_model = cfg.d_model_audio
        self.d_ff = cfg.d_ff_audio

    if cfg.use_linear_embedder:
        self.src_emb = FeatureEmbedder(self.d_feat, self.d_model)
    else:
        assert self.d_feat == self.d_model
        self.src_emb = Identity()

    self.trg_emb = VocabularyEmbedder(train_dataset.trg_voc_size, self.d_model)
    self.pos_emb = PositionalEncoder(self.d_model, cfg.dout_p)
    self.encoder = Encoder(self.d_model, cfg.dout_p, cfg.H, self.d_ff, cfg.N)
    self.decoder = Decoder(self.d_model, cfg.dout_p, cfg.H, self.d_ff, cfg.N)
    self.generator = Generator(self.d_model, train_dataset.trg_voc_size)

    print('initialization: xavier')
    for p in self.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    # initialize the embedding after the generic init, so the pretrained
    # word vectors replace the weights initialized above
    self.trg_emb.init_word_embeddings(train_dataset.train_vocab.vectors, cfg.unfreeze_word_emb)

    # load the pretrained encoder from the proposal module (used in ablation studies)
    if cfg.pretrained_prop_model_path is not None:
        print(f'Pretrained prop path: \n {cfg.pretrained_prop_model_path}')
        cap_model_cpt = torch.load(cfg.pretrained_prop_model_path, map_location='cpu')
        encoder_config = cap_model_cpt['config']
        if cfg.modality == 'video':
            self.d_model = encoder_config.d_model_video
            self.d_ff = encoder_config.d_ff_video
        elif cfg.modality == 'audio':
            self.d_model = encoder_config.d_model_audio
            self.d_ff = encoder_config.d_ff_audio
        self.encoder = Encoder(
            self.d_model, encoder_config.dout_p, encoder_config.H, self.d_ff, encoder_config.N
        )
        # keep only the encoder weights and strip their 'encoder.' prefix
        encoder_weights = {
            k: v for k, v in cap_model_cpt['model_state_dict'].items() if 'encoder' in k
        }
        encoder_weights = {k.replace('encoder.', ''): v for k, v in encoder_weights.items()}
        self.encoder.load_state_dict(encoder_weights)
        self.encoder = self.encoder.to(cfg.device)
        for param in self.encoder.parameters():
            param.requires_grad = cfg.finetune_prop_encoder
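
# --- illustrative sketch (not part of the model code) -----------------------
# The pretrained-encoder branch above performs checkpoint surgery: it filters
# a full-model state dict down to the encoder's entries by key substring,
# strips the 'encoder.' prefix, and loads the result into a freshly built
# encoder. The sketch below reproduces that idiom on a toy model; all names
# here are made up for illustration. Note that checkpoints saved from a
# torch.nn.DataParallel-wrapped model carry an extra 'module.' prefix, which
# is why the proposal generators below strip 'module.encoder.' instead.
def _sketch_encoder_checkpoint_surgery():
    import torch
    import torch.nn as nn

    # toy "full model" with an encoder and a decoder sub-module
    full_model = nn.ModuleDict({'encoder': nn.Linear(8, 8), 'decoder': nn.Linear(8, 8)})
    cpt = {'model_state_dict': full_model.state_dict()}

    # keep only encoder entries, e.g. 'encoder.weight', 'encoder.bias'
    enc_weights = {k: v for k, v in cpt['model_state_dict'].items() if 'encoder' in k}
    # strip the prefix so keys match a standalone module: 'weight', 'bias'
    enc_weights = {k.replace('encoder.', ''): v for k, v in enc_weights.items()}

    standalone_encoder = nn.Linear(8, 8)
    standalone_encoder.load_state_dict(enc_weights)
    return standalone_encoder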
def __init__(self, cfg, anchors):
    super(ProposalGenerator, self).__init__()
    self.cfg = cfg
    self.EPS = 1e-16
    self.num_logits = 3  # 3: c, w, obj
    self.anchors = anchors
    self.anchors_list = anchors[cfg.modality]
    self.anchors_num = len(self.anchors_list)

    if cfg.modality == 'video':
        self.d_feat = cfg.d_vid
        self.d_model_modality = cfg.d_model_video
        self.d_ff = cfg.d_ff_video
        layer_dims = [
            self.d_model_modality, *cfg.conv_layers_video, self.num_logits * self.anchors_num
        ]
    elif cfg.modality == 'audio':
        self.d_feat = cfg.d_aud
        self.d_model_modality = cfg.d_model_audio
        self.d_ff = cfg.d_ff_audio
        layer_dims = [
            self.d_model_modality, *cfg.conv_layers_audio, self.num_logits * self.anchors_num
        ]
    else:
        raise NotImplementedError

    if cfg.use_linear_embedder:
        self.emb = FeatureEmbedder(self.d_feat, self.d_model_modality)
    else:
        self.emb = Identity()

    self.pos_enc = PositionalEncoder(self.d_model_modality, cfg.dout_p)

    # load the pretrained encoder from the captioning module
    if cfg.pretrained_cap_model_path is not None:
        print(f'Caption path: \n {cfg.pretrained_cap_model_path}')
        cap_model_cpt = torch.load(cfg.pretrained_cap_model_path, map_location='cpu')
        encoder_config = cap_model_cpt['config']
        if cfg.modality == 'video':
            self.d_model_modality = encoder_config.d_model_video
            self.d_ff = encoder_config.d_ff_video
        elif cfg.modality == 'audio':
            self.d_model_modality = encoder_config.d_model_audio
            self.d_ff = encoder_config.d_ff_audio
        else:
            raise NotImplementedError
        self.encoder = Encoder(
            self.d_model_modality, encoder_config.dout_p, encoder_config.H, self.d_ff,
            encoder_config.N
        )
        # the captioning checkpoint comes from a DataParallel-wrapped model,
        # whose state-dict keys carry a 'module.' prefix
        encoder_weights = {
            k: v for k, v in cap_model_cpt['model_state_dict'].items() if 'encoder' in k
        }
        encoder_weights = {
            k.replace('module.encoder.', ''): v for k, v in encoder_weights.items()
        }
        self.encoder.load_state_dict(encoder_weights)
        self.encoder = self.encoder.to(cfg.device)
        for param in self.encoder.parameters():
            param.requires_grad = cfg.finetune_cap_encoder
    else:
        self.encoder = Encoder(self.d_model_modality, cfg.dout_p, cfg.H, self.d_ff, cfg.N)
        # encoder initialization
        for p in self.encoder.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    self.detection_layers = torch.nn.ModuleList([
        ProposalGenerationHead(layer_dims, k, cfg.dout_p, cfg.layer_norm)
        for k in cfg.kernel_sizes[cfg.modality]
    ])
    print(self.detection_layers)

    self.bce_loss = nn.BCELoss()
    self.mse_loss = nn.MSELoss()
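
# --- illustrative sketch (not part of the model code) -----------------------
# `layer_dims` above ends with num_logits * anchors_num because the proposal
# head must emit (center, width, objectness) for every anchor at every
# temporal position. A minimal sketch of that channel arithmetic with assumed
# toy dimensions (the real head, ProposalGenerationHead, is defined elsewhere
# in the repo and may differ):
def _sketch_proposal_head_channels():
    import torch
    import torch.nn as nn

    d_model, num_logits, anchors_num, B, S = 128, 3, 4, 2, 10
    head = nn.Sequential(
        nn.Conv1d(d_model, 256, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv1d(256, num_logits * anchors_num, kernel_size=1),
    )
    x = torch.randn(B, d_model, S)                      # (B, C, S), as Conv1d expects
    out = head(x).permute(0, 2, 1)                      # (B, S, num_logits * anchors_num)
    out = out.reshape(B, S, anchors_num, num_logits)    # per-anchor (c, w, obj) triplets
    assert out.shape == (B, S, anchors_num, num_logits)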
def __init__(self, cfg, train_dataset):
    super(BiModalTransformer, self).__init__()

    if cfg.use_linear_embedder:
        self.emb_A = FeatureEmbedder(cfg.d_aud, cfg.d_model_audio)
        self.emb_V = FeatureEmbedder(cfg.d_vid, cfg.d_model_video)
    else:
        self.emb_A = Identity()
        self.emb_V = Identity()

    self.emb_C = VocabularyEmbedder(train_dataset.trg_voc_size, cfg.d_model_caps)

    self.pos_enc_A = PositionalEncoder(cfg.d_model_audio, cfg.dout_p)
    self.pos_enc_V = PositionalEncoder(cfg.d_model_video, cfg.dout_p)
    self.pos_enc_C = PositionalEncoder(cfg.d_model_caps, cfg.dout_p)

    self.encoder = BiModalEncoder(
        cfg.d_model_audio, cfg.d_model_video, cfg.d_model, cfg.dout_p, cfg.H,
        cfg.d_ff_audio, cfg.d_ff_video, cfg.N
    )
    self.decoder = BiModelDecoder(
        cfg.d_model_audio, cfg.d_model_video, cfg.d_model_caps, cfg.d_model,
        cfg.dout_p, cfg.H, cfg.d_ff_caps, cfg.N
    )
    self.generator = Generator(cfg.d_model_caps, train_dataset.trg_voc_size)

    print('initialization: xavier')
    for p in self.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    # initialize the embedding after the generic init, so the pretrained
    # word vectors replace the previously initialized weights
    self.emb_C.init_word_embeddings(train_dataset.train_vocab.vectors, cfg.unfreeze_word_emb)

    # load the pretrained encoder from the proposal module (used in ablation studies)
    if cfg.pretrained_prop_model_path is not None:
        print(f'Pretrained prop path: \n {cfg.pretrained_prop_model_path}')
        cap_model_cpt = torch.load(cfg.pretrained_prop_model_path, map_location='cpu')
        encoder_config = cap_model_cpt['config']
        self.encoder = BiModalEncoder(
            encoder_config.d_model_audio, encoder_config.d_model_video, encoder_config.d_model,
            encoder_config.dout_p, encoder_config.H, encoder_config.d_ff_audio,
            encoder_config.d_ff_video, encoder_config.N
        )
        encoder_weights = {
            k: v for k, v in cap_model_cpt['model_state_dict'].items() if 'encoder' in k
        }
        encoder_weights = {k.replace('encoder.', ''): v for k, v in encoder_weights.items()}
        self.encoder.load_state_dict(encoder_weights)
        self.encoder = self.encoder.to(cfg.device)
        for param in self.encoder.parameters():
            param.requires_grad = cfg.finetune_prop_encoder
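
# --- illustrative sketch (not part of the model code) -----------------------
# Both captioning models rely on initialization order: Xavier-initialize every
# matrix first, then overwrite the caption embedding with pretrained word
# vectors (via init_word_embeddings) so the vectors are not clobbered by the
# generic init. A minimal sketch of that pattern with hypothetical stand-ins
# for train_dataset.train_vocab.vectors and cfg.unfreeze_word_emb:
def _sketch_word_embedding_init(unfreeze_word_emb=False):
    import torch
    import torch.nn as nn

    voc_size, d_model = 100, 16
    emb = nn.Embedding(voc_size, d_model)

    # generic pass: Xavier for every parameter with more than one dimension
    for p in emb.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    # pretrained vectors (stand-in for the GloVe vectors on the vocab object)
    pretrained = torch.randn(voc_size, d_model)
    with torch.no_grad():
        emb.weight.copy_(pretrained)              # replaces the Xavier weights
    emb.weight.requires_grad = unfreeze_word_emb  # freeze unless requested
    return emb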
def __init__(self, cfg, anchors, nocuda=False):
    super(MultimodalProposalGenerator, self).__init__()
    assert cfg.modality == 'audio_video'
    self.cfg = cfg
    self.anchors = anchors
    self.EPS = 1e-16
    self.num_logits = 3  # 3: c, w, obj
    self.nocuda = nocuda

    if cfg.use_linear_embedder:
        self.emb_V = FeatureEmbedder(cfg.d_vid, cfg.d_model_video)
        self.emb_A = FeatureEmbedder(cfg.d_aud, cfg.d_model_audio)
    else:
        self.emb_V = Identity()
        self.emb_A = Identity()

    self.pos_enc_V = PositionalEncoder(cfg.d_model_video, cfg.dout_p)
    self.pos_enc_A = PositionalEncoder(cfg.d_model_audio, cfg.dout_p)

    # load the pre-trained encoder from the captioning module
    if cfg.pretrained_cap_model_path is not None:
        print(f'Pretrained caption path: \n {cfg.pretrained_cap_model_path}')
        cap_model_cpt = torch.load(cfg.pretrained_cap_model_path, map_location='cpu')
        encoder_config = cap_model_cpt['config']
        self.encoder = BiModalEncoder(
            encoder_config.d_model_audio, encoder_config.d_model_video, encoder_config.d_model,
            encoder_config.dout_p, encoder_config.H, encoder_config.d_ff_audio,
            encoder_config.d_ff_video, encoder_config.N
        )
        encoder_weights = {
            k: v for k, v in cap_model_cpt['model_state_dict'].items() if 'encoder' in k
        }
        encoder_weights = {
            k.replace('module.encoder.', ''): v for k, v in encoder_weights.items()
        }
        self.encoder.load_state_dict(encoder_weights)
        self.encoder = self.encoder if nocuda else self.encoder.to(cfg.device)
        for param in self.encoder.parameters():
            param.requires_grad = cfg.finetune_cap_encoder
    else:
        self.encoder = BiModalEncoder(
            cfg.d_model_audio, cfg.d_model_video, cfg.d_model, cfg.dout_p, cfg.H,
            cfg.d_ff_audio, cfg.d_ff_video, cfg.N
        )
        # encoder initialization
        for p in self.encoder.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    dims_A = [
        cfg.d_model_audio, *cfg.conv_layers_audio, self.num_logits * cfg.anchors_num_audio
    ]
    dims_V = [
        cfg.d_model_video, *cfg.conv_layers_video, self.num_logits * cfg.anchors_num_video
    ]
    self.detection_layers_A = torch.nn.ModuleList([
        ProposalGenerationHead(dims_A, k, cfg.dout_p, cfg.layer_norm)
        for k in cfg.kernel_sizes['audio']
    ])
    self.detection_layers_V = torch.nn.ModuleList([
        ProposalGenerationHead(dims_V, k, cfg.dout_p, cfg.layer_norm)
        for k in cfg.kernel_sizes['video']
    ])

    self.bce_loss = nn.BCELoss()
    self.mse_loss = nn.MSELoss()
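
# --- illustrative sketch (not part of the model code) -----------------------
# Every pretrained-encoder branch above ends with the same freeze/finetune
# switch: setting requires_grad on the encoder parameters decides whether
# gradients flow into the pretrained weights. A hypothetical sketch of how
# such a switch is typically paired with the optimizer (the actual training
# loop lives elsewhere in the repo and may differ):
def _sketch_freeze_finetune(finetune_cap_encoder=False):
    import torch
    import torch.nn as nn

    encoder = nn.Linear(8, 8)   # stand-in for the pretrained bi-modal encoder
    heads = nn.Linear(8, 3)     # stand-in for the freshly initialized heads

    for param in encoder.parameters():
        param.requires_grad = finetune_cap_encoder

    # pass only trainable parameters; a frozen encoder is skipped entirely
    trainable = [p for m in (encoder, heads) for p in m.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=1e-4)
    return optimizer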