def __init__(self, args, dictionary, embed_tokens):
    self.args = args
    super().__init__(
        TransformerConfig.from_namespace(args),
        dictionary,
        embed_tokens,
    )
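# Hedged sketch (not from this file) of the translation pattern used by all
# of these wrappers: a legacy argparse-style namespace is converted into the
# structured TransformerConfig before reaching the config-based base class.
# The attribute values are illustrative.

from argparse import Namespace

from fairseq.models.transformer import TransformerConfig

args = Namespace(encoder_embed_dim=512, encoder_attention_heads=8)
cfg = TransformerConfig.from_namespace(args)
assert cfg.encoder.embed_dim == 512  # flat "encoder_*" keys map onto cfg.encoder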
@classmethod
def add_args(cls, parser):
    """Add model-specific arguments to the parser."""
    # we want to build the args recursively in this case.
    gen_parser_from_dataclass(
        parser, TransformerConfig(), delete_default=False, with_prefix=""
    )
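# Hedged usage sketch: `gen_parser_from_dataclass` expands every field of
# TransformerConfig (including the nested encoder/decoder sub-configs) into
# argparse options. The enclosing model class is not shown above, so the
# sketch calls the helper directly; the flag and value are illustrative.

import argparse

from fairseq.dataclass.utils import gen_parser_from_dataclass
from fairseq.models.transformer import TransformerConfig

parser = argparse.ArgumentParser()
gen_parser_from_dataclass(
    parser, TransformerConfig(), delete_default=False, with_prefix=""
)
args = parser.parse_args(["--encoder-embed-dim", "512"])
assert args.encoder_embed_dim == 512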
def build_self_attention(
    self, embed_dim, args, add_bias_kv=False, add_zero_attn=False
):
    return super().build_self_attention(
        embed_dim,
        TransformerConfig.from_namespace(args),
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )
def __init__(
    self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
):
    super().__init__(
        TransformerConfig.from_namespace(args),
        no_encoder_attn=no_encoder_attn,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )
    self.args = args
def __init__(
    self, args, positional_embedding: Optional[RelativePositionalEmbedding] = None
):
    super().__init__(
        TransformerConfig.from_namespace(args),
        positional_embedding=positional_embedding,
    )
    self.args = args
def __init__(
    self,
    args,
    dictionary,
    embed_tokens,
    no_encoder_attn=False,
    output_projection=None,
):
    self.args = args
    super().__init__(
        TransformerConfig.from_namespace(args),
        dictionary,
        embed_tokens,
        no_encoder_attn=no_encoder_attn,
        output_projection=output_projection,
    )
def __init__(
    self,
    args,
    no_encoder_attn=False,
    add_bias_kv=False,
    add_zero_attn=False,
    positional_embedding: Optional[RelativePositionalEmbedding] = None,
):
    super().__init__(
        TransformerConfig.from_namespace(args),
        no_encoder_attn=no_encoder_attn,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
        positional_embedding=positional_embedding,
    )
    self.args = args
@dataclass
class Data2VecTextConfig(FairseqDataclass):
    max_positions: int = II("task.tokens_per_sample")

    head_layers: int = 1

    transformer: TransformerConfig = TransformerConfig()

    load_checkpoint_heads: bool = field(
        default=False,
        metadata={"help": "(re-)register and load heads when loading checkpoints"},
    )

    loss_beta: float = field(
        default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"}
    )
    loss_scale: Optional[float] = field(
        default=None,
        metadata={
            "help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)"
        },
    )

    average_top_k_layers: int = field(
        default=8, metadata={"help": "how many layers to average"}
    )
    layer_norm_target_layer: bool = False
    instance_norm_target_layer: bool = False
    batch_norm_target_layer: bool = False
    instance_norm_targets: bool = False
    layer_norm_targets: bool = False

    ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"})
    ema_end_decay: float = field(
        default=0.9999, metadata={"help": "final ema decay rate"}
    )
    # when to finish annealing ema decay rate
    ema_anneal_end_step: int = II("optimization.max_update")

    ema_transformer_layers_only: bool = field(
        default=True,
        metadata={"help": "whether to momentum update only the transformer layers"},
    )
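# Hedged usage sketch (not from this file): the `II(...)` defaults are
# OmegaConf interpolations, so `max_positions` and `ema_anneal_end_step`
# only resolve inside a full fairseq config tree that provides the `task`
# and `optimization` groups; standalone they remain "${...}" strings.
# The overridden values below are illustrative.

cfg = Data2VecTextConfig(head_layers=2, average_top_k_layers=10)
assert cfg.loss_beta == 0  # beta of 0 selects plain l2 reconstruction loss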
def build_encoder_attention(self, embed_dim, args):
    return super().build_encoder_attention(
        embed_dim,
        TransformerConfig.from_namespace(args),
    )
def __init__(self, args):
    super().__init__(TransformerConfig.from_namespace(args))
    self.args = args
def build_encoder_layer(self, args):
    return super().build_encoder_layer(
        TransformerConfig.from_namespace(args),
    )
def build_decoder_layer(self, args, no_encoder_attn=False):
    return super().build_decoder_layer(
        TransformerConfig.from_namespace(args), no_encoder_attn=no_encoder_attn
    )
def build_output_projection(self, args, dictionary, embed_tokens):
    # No return value: the base-class implementation assigns
    # self.output_projection as a side effect.
    super().build_output_projection(
        TransformerConfig.from_namespace(args), dictionary, embed_tokens
    )
def build_self_attention(self, embed_dim, args, positional_embedding=None):
    return super().build_self_attention(
        embed_dim,
        TransformerConfig.from_namespace(args),
        positional_embedding=positional_embedding,
    )
def __init__(
    self,
    embed_dim,
    attention_heads,
    ffn_embed_dim,
    num_layers,
    embedding_layer,  # torch.nn.Embedding. Must have a padding_idx field
    dropout=0,
    normalize_before=False,
    torch_encoder=None,  # torch encoder that you can map weights from
    activation="relu",
):
    super().__init__()

    cfg = FairseqTransformerConfig()
    cfg.encoder.embed_dim = embed_dim
    cfg.encoder.attention_heads = attention_heads
    cfg.encoder.ffn_embed_dim = ffn_embed_dim
    cfg.dropout = dropout
    cfg.encoder.normalize_before = normalize_before
    cfg.encoder.layers = num_layers
    # make embedding behavior same as other encoders
    cfg.no_token_positional_embeddings = True
    cfg.no_scale_embedding = True
    cfg.activation_fn = activation

    dictionary = {}  # TODO: verify what this is

    self.encoder = FairseqTransformerEncoder(
        cfg, dictionary, embedding_layer, return_fc=False
    )

    if torch_encoder is not None:
        for src_layer, dst_layer in zip(torch_encoder.layers, self.encoder.layers):
            # torch packs q/k/v into a single in_proj; fairseq keeps them separate
            w_q, w_k, w_v = src_layer.self_attn.in_proj_weight.chunk(3, dim=0)
            b_q, b_k, b_v = src_layer.self_attn.in_proj_bias.chunk(3, dim=0)

            dst_layer.self_attn.q_proj.weight = torch.nn.Parameter(w_q)
            dst_layer.self_attn.q_proj.bias = torch.nn.Parameter(b_q)

            dst_layer.self_attn.k_proj.weight = torch.nn.Parameter(w_k)
            dst_layer.self_attn.k_proj.bias = torch.nn.Parameter(b_k)

            dst_layer.self_attn.v_proj.weight = torch.nn.Parameter(w_v)
            dst_layer.self_attn.v_proj.bias = torch.nn.Parameter(b_v)

            dst_layer.self_attn.out_proj.weight = src_layer.self_attn.out_proj.weight
            dst_layer.self_attn.out_proj.bias = src_layer.self_attn.out_proj.bias

            dst_layer.fc1.weight = src_layer.linear1.weight
            dst_layer.fc1.bias = src_layer.linear1.bias

            # fairseq may use fusedlayernorm from nvidia apex - diff properties
            dst_layer.self_attn_layer_norm.load_state_dict(
                src_layer.norm1.state_dict()
            )

            dst_layer.fc2.weight = src_layer.linear2.weight
            dst_layer.fc2.bias = src_layer.linear2.bias

            dst_layer.final_layer_norm.load_state_dict(src_layer.norm2.state_dict())
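# Hedged usage sketch: `FairseqEncoderFromTorch` stands in for the enclosing
# class, whose real name is not shown above. It builds a fairseq encoder whose
# per-layer weights are copied from an existing torch.nn.TransformerEncoder
# with matching dimensions; all sizes below are illustrative.

import torch

torch_encoder = torch.nn.TransformerEncoder(
    torch.nn.TransformerEncoderLayer(d_model=512, nhead=8, dim_feedforward=2048),
    num_layers=6,
)
embedding = torch.nn.Embedding(num_embeddings=1000, embedding_dim=512, padding_idx=0)

model = FairseqEncoderFromTorch(
    embed_dim=512,
    attention_heads=8,
    ffn_embed_dim=2048,
    num_layers=6,
    embedding_layer=embedding,
    torch_encoder=torch_encoder,
)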