def build_encoder_layer(self, args: Wav2Vec2Config):
    if args.layer_type == "transformer":
        layer = TransformerSentenceEncoderLayer(
            embedding_dim=self.embedding_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=self.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.activation_dropout,
            activation_fn=args.activation_fn,
            layer_norm_first=args.layer_norm_first,
        )
    elif args.layer_type == "conformer":
        layer = ConformerWav2Vec2EncoderLayer(
            embed_dim=self.embedding_dim,
            ffn_embed_dim=args.encoder_ffn_embed_dim,
            attention_heads=args.encoder_attention_heads,
            dropout=args.dropout,
            depthwise_conv_kernel_size=args.depthwise_conv_kernel_size,
            activation_fn="swish",
            attn_type=args.attn_type,
            use_fp16=args.fp16,
            pos_enc_type="abs",
        )
    layer = fsdp_wrap(layer)
    if args.checkpoint_activations:
        layer = checkpoint_wrapper(layer)
    return layer
def build_decoder_layer(self, cfg, no_encoder_attn=False):
    if self.cfg.shared_layer_qkv_conv == 1 and self.compress_layer is None:
        target_dim = cfg.compressed_dim
        compress_layer = nn.Linear(
            self.cfg.max_positions,
            target_dim,
        )
        nn.init.xavier_uniform_(compress_layer.weight, gain=1 / math.sqrt(2))
        if self.cfg.freeze_conv == 1:
            compress_layer.weight.requires_grad = False
        self.compress_layer = compress_layer

    # return PrimerTransformerEncoderLayer(cfg, self.compress_layer)
    layer = primer_layer.PrimerDecoderLayerBase(cfg, self.compress_layer, no_encoder_attn)
    checkpoint = cfg.checkpoint_activations
    if checkpoint:
        offload_to_cpu = cfg.offload_activations
        layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
    # if we are checkpointing, enforce that FSDP always wraps the
    # checkpointed layer, regardless of layer size
    min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0
    layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
    return layer
def __init__(self, cfg, task):
    try:
        from transformers.models.transfo_xl import (
            TransfoXLConfig,
            TransfoXLLMHeadModel,
        )
    except ImportError:
        from transformers.configuration_transfo_xl import TransfoXLConfig
        from transformers.modeling_transfo_xl import TransfoXLLMHeadModel

    super().__init__(task.target_dictionary)
    self.cfg = cfg

    # remove any cutoffs larger than the vocab size
    cutoffs = [
        cutoff for cutoff in cfg.cutoffs if cutoff < len(task.target_dictionary)
    ]

    config = TransfoXLConfig(
        vocab_size=len(task.target_dictionary),
        cutoffs=cutoffs,
        d_model=cfg.d_model,
        d_embed=cfg.d_model,
        n_head=cfg.n_head,
        d_head=cfg.d_head,
        d_inner=cfg.d_inner,
        div_val=cfg.div_val,
        n_layer=cfg.n_layer,
        mem_len=cfg.mem_len,
        clamp_len=cfg.clamp_len,
        same_length=cfg.same_length,
        dropout=cfg.dropout,
        dropatt=cfg.dropatt,
    )
    logger.info(config)
    self.model = TransfoXLLMHeadModel(config)

    # Workaround a bug in huggingface's ``ProjectedAdaptiveLogSoftmax``
    # which adds ``None`` values to an ``nn.ParameterList``, which is not
    # supported in PyTorch. Instead we can replace this with an
    # ``nn.ModuleList``, which does support ``None`` values.
    try:
        if all(p is None for p in self.model.crit.out_projs._parameters.values()):
            self.model.crit.out_projs = torch.nn.ModuleList(
                [None] * len(self.model.crit.out_projs._parameters)
            )
    except Exception:
        pass

    if cfg.checkpoint_activations or cfg.offload_activations:
        for i in range(len(self.model.transformer.layers)):
            self.model.transformer.layers[i] = checkpoint_wrapper(
                self.model.transformer.layers[i],
                offload_to_cpu=cfg.offload_activations,
            )
            # TODO: may save mem to wrap(layer.pos_ff.CoreNet[3])

    self._mems = None
def build_decoder_layer(self, args, no_encoder_attn=False):
    layer = DecoderLayer(
        args.encoder_embed_dim,
        args.encoder_ffn_embed_dim,
        args.encoder_attention_heads,
        args.kdim,
        args.vdim,
        args.dropout,
    )
    if getattr(args, "checkpoint_activations", False):
        offload_to_cpu = getattr(args, "offload_activations", False)
        layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
    return layer
def build_encoder_layer(self, cfg):
    layer = transformer_layer.TransformerEncoderLayerBase(cfg)
    checkpoint = cfg.checkpoint_activations
    if checkpoint:
        offload_to_cpu = cfg.offload_activations
        layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
    # if we are checkpointing, enforce that FSDP always wraps the
    # checkpointed layer, regardless of layer size
    min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0
    layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
    return layer
def build_decoder_layer(self, cfg, no_encoder_attn=False):
    layer = UniLMDecoderLayer(cfg, no_encoder_attn)
    checkpoint = cfg.checkpoint_activations
    if checkpoint:
        offload_to_cpu = cfg.offload_activations
        layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
    # if we are checkpointing, enforce that FSDP always wraps the
    # checkpointed layer, regardless of layer size
    min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0
    layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
    return layer
def build_encoder_layer(self, args):
    layer = TransformerEncoderLayer(args)
    checkpoint = getattr(args, "checkpoint_activations", False)
    if checkpoint:
        offload_to_cpu = getattr(args, "offload_activations", False)
        layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
    # if we are checkpointing, enforce that FSDP always wraps the
    # checkpointed layer, regardless of layer size
    min_params_to_wrap = (
        getattr(args, "min_params_to_wrap", DEFAULT_MIN_PARAMS_TO_WRAP)
        if not checkpoint
        else 0
    )
    layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
    return layer
def __init__(self, use_pytorch_checkpoint=False, use_fairseq_checkpoint=False):
    super().__init__()
    torch.manual_seed(0)
    self.use_pytorch_checkpoint = use_pytorch_checkpoint
    self.ffn = nn.Sequential(
        nn.Linear(32, 128),
        # add a Dropout layer to test RNG save/restore
        nn.Dropout(p=0.5),
        nn.Linear(128, 32),
    )
    if use_fairseq_checkpoint:
        self.ffn = checkpoint_wrapper(self.ffn)
    self.out = nn.Linear(32, 1)
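# Hedged usage sketch (not from this repository): the toy module below and its
# forward are assumptions that mirror the FFN/out structure above, used to show
# how one might check that fairseq's checkpoint_wrapper leaves gradients
# unchanged. Dropout is omitted here so the comparison is deterministic.
import torch
import torch.nn as nn
from fairseq.modules.checkpoint_activations import checkpoint_wrapper  # import path assumed


class TinyFFNModel(nn.Module):  # hypothetical name, for illustration only
    def __init__(self, use_fairseq_checkpoint=False):
        super().__init__()
        torch.manual_seed(0)  # identical init across both instances
        self.ffn = nn.Sequential(nn.Linear(32, 128), nn.Linear(128, 32))
        if use_fairseq_checkpoint:
            self.ffn = checkpoint_wrapper(self.ffn)
        self.out = nn.Linear(32, 1)

    def forward(self, x):
        return self.out(self.ffn(x))


x = torch.randn(2, 32, requires_grad=True)
baseline = TinyFFNModel(use_fairseq_checkpoint=False)
checkpointed = TinyFFNModel(use_fairseq_checkpoint=True)
baseline(x).sum().backward()
checkpointed(x).sum().backward()
for p, q in zip(baseline.parameters(), checkpointed.parameters()):
    assert torch.allclose(p.grad, q.grad)  # recomputed activations give the same grads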
def __init__(self, args):
    super().__init__()

    self.dropout = args.dropout
    self.embedding_dim = args.encoder_embed_dim

    self.pos_conv = nn.Conv1d(
        self.embedding_dim,
        self.embedding_dim,
        kernel_size=args.conv_pos,
        padding=args.conv_pos // 2,
        groups=args.conv_pos_groups,
    )
    dropout = 0
    std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
    nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
    nn.init.constant_(self.pos_conv.bias, 0)

    self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
    self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())

    layers = []
    for _ in range(args.encoder_layers):
        layer = TransformerSentenceEncoderLayer(
            embedding_dim=self.embedding_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=self.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.activation_dropout,
            activation_fn=args.activation_fn,
            layer_norm_first=args.layer_norm_first,
        )
        if args.checkpoint_activations:
            layer = fsdp_wrap(layer)
            layer = checkpoint_wrapper(layer)
        layers.append(layer)
    self.layers = nn.ModuleList(layers)

    self.layer_norm_first = args.layer_norm_first
    self.layer_norm = LayerNorm(self.embedding_dim)
    self.layerdrop = args.encoder_layerdrop

    self.apply(init_bert_params)
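# Hedged sketch (values and shapes assumed, not from this repository) of why
# SamePad follows the positional convolution above: with kernel_size=K and
# padding=K // 2, an even K produces one extra output frame, which SamePad
# trims so the output length matches the input length.
import torch
import torch.nn as nn
from fairseq.modules import SamePad  # import path assumed

K, C, T = 128, 16, 50  # even kernel size, channels, time steps
conv = nn.Conv1d(C, C, kernel_size=K, padding=K // 2, groups=C)
x = torch.randn(1, C, T)
print(conv(x).shape)              # torch.Size([1, 16, 51]) -- one frame too long
print(SamePad(K)(conv(x)).shape)  # torch.Size([1, 16, 50]) -- trimmed back to T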
def build_encoder_layer(self, args):
    layer = ConformerWav2Vec2EncoderLayer(
        embed_dim=self.embedding_dim,
        ffn_embed_dim=args.encoder_ffn_embed_dim,
        attention_heads=args.encoder_attention_heads,
        dropout=args.dropout,
        depthwise_conv_kernel_size=args.depthwise_conv_kernel_size,
        activation_fn="swish",
        attn_type=args.attn_type,
        pos_enc_type=args.pos_enc_type,
        use_fp16=args.fp16,  # only used for rope
    )
    layer = fsdp_wrap(layer)
    if args.checkpoint_activations:
        layer = checkpoint_wrapper(layer)
    return layer
def __init__(self, cfg, task):
    try:
        from transformers.models.transfo_xl import (
            TransfoXLConfig,
            TransfoXLLMHeadModel,
        )
    except ImportError:
        from transformers.configuration_transfo_xl import TransfoXLConfig
        from transformers.modeling_transfo_xl import TransfoXLLMHeadModel

    super().__init__(task.target_dictionary)
    self.cfg = cfg

    # remove any cutoffs larger than the vocab size
    cutoffs = [
        cutoff for cutoff in cfg.cutoffs if cutoff < len(task.target_dictionary)
    ]

    config = TransfoXLConfig(
        vocab_size=len(task.target_dictionary),
        cutoffs=cutoffs,
        d_model=cfg.d_model,
        d_embed=cfg.d_model,
        n_head=cfg.n_head,
        d_head=cfg.d_head,
        d_inner=cfg.d_inner,
        div_val=cfg.div_val,
        n_layer=cfg.n_layer,
        mem_len=cfg.mem_len,
        clamp_len=cfg.clamp_len,
        same_length=cfg.same_length,
        dropout=cfg.dropout,
        dropatt=cfg.dropatt,
    )
    logger.info(config)
    self.model = TransfoXLLMHeadModel(config)

    if cfg.checkpoint_activations or cfg.offload_activations:
        for i in range(len(self.model.transformer.layers)):
            self.model.transformer.layers[i] = checkpoint_wrapper(
                self.model.transformer.layers[i],
                offload_to_cpu=cfg.offload_activations,
            )
            # TODO: may save mem to wrap(layer.pos_ff.CoreNet[3])

    self._mems = None
def build_decoder_layer(
    self,
    cfg,
    no_encoder_attn=False,
    positional_embedding: Optional[RelativePositionalEmbedding] = None,
):
    layer = TransformerWithRelativePositionalEmbeddingDecoderLayerBase(
        cfg,
        no_encoder_attn=no_encoder_attn,
        positional_embedding=positional_embedding,
    )
    checkpoint = cfg.checkpoint_activations
    if checkpoint:
        offload_to_cpu = cfg.offload_activations
        layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
    # if we are checkpointing, enforce that FSDP always wraps the
    # checkpointed layer, regardless of layer size
    min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0
    layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
    return layer
def build_encoder_layer(
    self, cfg, positional_embedding: Optional[RelativePositionalEmbedding] = None
):
    if cfg.encoder.layer_type == "transformer":
        layer_cls = TransformerWithRelativePositionalEmbeddingEncoderLayerBase
    elif cfg.encoder.layer_type == "conformer":
        layer_cls = ConformerWithRelativePositionalEmbeddingEncoderLayerBase
    else:
        raise NotImplementedError

    layer = layer_cls(
        cfg, return_fc=self.return_fc, positional_embedding=positional_embedding
    )
    checkpoint = cfg.checkpoint_activations
    if checkpoint:
        offload_to_cpu = cfg.offload_activations
        layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
    # if we are checkpointing, enforce that FSDP always wraps the
    # checkpointed layer, regardless of layer size
    min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0
    layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
    return layer
def build_decoder_layer(self, args, no_encoder_attn=False):
    layer = TransformerDecoderLayer(args, no_encoder_attn)
    if getattr(args, "checkpoint_activations", False):
        layer = checkpoint_wrapper(layer)
    return layer
def build_encoder_layer(self, args):
    layer = TransformerEncoderLayer(args)
    if getattr(args, "checkpoint_activations", False):
        offload_to_cpu = getattr(args, "offload_activations", False)
        layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
    return layer
def build_encoder_layer(self, args):
    layer = ConvTransformerEncoderLayer(args)
    if getattr(args, "checkpoint_activations", False):
        layer = checkpoint_wrapper(layer)
    return layer
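# Hedged sketch (import paths assumed; "cfg" fields mirror those used above) of
# the wrapping pattern the build_*_layer helpers above share: checkpoint the
# layer first, then wrap it with FSDP, forcing min_num_params to 0 when
# checkpointing so the checkpointed layer always becomes its own FSDP unit.
from fairseq.distributed import fsdp_wrap
from fairseq.modules.checkpoint_activations import checkpoint_wrapper


def wrap_layer(layer, cfg):
    checkpoint = cfg.checkpoint_activations
    if checkpoint:
        layer = checkpoint_wrapper(layer, offload_to_cpu=cfg.offload_activations)
    min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0
    return fsdp_wrap(layer, min_num_params=min_params_to_wrap)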