def __init__(self, args):
    super().__init__(None)
    self.w2v_encoder = Wav2VecEncoder(args)
    self.is_v0_arch = not args.adaptor_proj
    # The newer architecture projects wav2vec outputs to the decoder dimension
    # and layer-normalizes them before the adaptor; the v0 architecture does not.
    self.w2v_proj_ln = None
    if not self.is_v0_arch and self.w2v_encoder.proj is not None:
        self.w2v_proj_ln = LayerNorm(args.decoder_embed_dim)
    self.adaptor = self.build_adaptor(args)
    self.num_updates = 0
    self.freezing_updates = args.w2v_freezing_updates
    self.finetuning_params = args.finetune_w2v_params
    # Unfreeze only the wav2vec parameters selected by --finetune-w2v-params;
    # the rest of the pretrained encoder stays frozen.
    for k, p in self.w2v_encoder.w2v_model.named_parameters():
        p.requires_grad = need_finetuning(self.finetuning_params, k)
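# A minimal sketch of the need_finetuning helper referenced above (an assumption
# about its behavior, not necessarily the exact upstream implementation): treat
# finetune_w2v_params as either "all" or a comma-separated list of substrings,
# and mark a parameter as trainable when its name matches one of them.
def need_finetuning(ft_params, param_name):
    if ft_params == "all":
        return True
    return any(ft_param in param_name for ft_param in ft_params.split(","))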
def __init__(self, args):
    super().__init__(None)
    self.w2v_encoder = Wav2VecEncoder(args)
    encoder_out_dim = self.w2v_encoder.w2v_model.encoder.embedding_dim
    # Projection + 8x shrinking
    self.adaptor = Conv1dAdaptor(
        encoder_out_dim,
        args.decoder_embed_dim,
        n_layers=args.adaptor_n_layers,
        kernel_size=args.adaptor_kernel_size,
        stride=args.adaptor_stride,
        add_layernorm=args.adaptor_layernorm,
    )
    for k, p in self.w2v_encoder.w2v_model.named_parameters():
        # Freeze pretrained models by default; only unfreeze parameters
        # matched by --finetune-w2v-params.
        if safe_hasattr(args, "finetune_w2v_params") and XMTransformerModel.finetune_params(
            args.finetune_w2v_params, k
        ):
            p.requires_grad = True
        else:
            p.requires_grad = False
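# Hedged illustration of the "8x shrinking" comment above: each strided Conv1d in
# the adaptor roughly halves the sequence length, so stride=2 with n_layers=3
# shrinks the time axis by about 2**3 = 8. The shapes below are illustrative
# assumptions, not the exact adaptor configuration.
import torch
import torch.nn as nn

dim, T = 16, 800
x = torch.randn(1, dim, T)  # (batch, channels, time)
for _ in range(3):  # n_layers=3 strided convolutions
    x = nn.Conv1d(dim, dim, kernel_size=3, stride=2, padding=1)(x)
print(x.shape[-1])  # 100, i.e. T / 8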