def __init__(
    self,
    n_heads,
    n_layers,
    hidden_dim,
    ffn_size,
    reduction=True,
    attention_dropout=0.0,
    relu_dropout=0.0,
    learn_positional_embeddings=False,
):
    super().__init__()
    self.ffn_size = ffn_size
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.out_dim = hidden_dim
    self.dim = hidden_dim
    self.reduction = reduction
    assert hidden_dim % n_heads == 0, "MM-Combiner dim must be multiple of n_heads"

    # positional embeddings: fixed sinusoidal codes unless learned embeddings are requested
    n_positions = 1024
    self.position_embeddings = nn.Embedding(n_positions, hidden_dim)
    if not learn_positional_embeddings:
        create_position_codes(
            n_positions, hidden_dim, out=self.position_embeddings.weight
        )
    else:
        nn.init.normal_(self.position_embeddings.weight, 0, hidden_dim ** -0.5)

    # stack of n_layers identical encoder layers
    self.layers = nn.ModuleList()
    for _ in range(self.n_layers):
        self.layers.append(
            TransformerEncoderLayer(
                n_heads, hidden_dim, ffn_size, attention_dropout, relu_dropout
            )
        )
def __init__(
    self,
    opt: Opt,
    vocabulary_size: int,
    embedding: Optional[nn.Embedding] = None,
    padding_idx: int = 0,
    reduction_type: str = 'mean',
    n_positions: Optional[int] = None,
    n_segments: Optional[int] = None,
    embeddings_scale: Optional[bool] = None,
    dropout: Optional[float] = None,
    activation: Optional[str] = None,
    variant: Optional[str] = None,
    output_scaling: Optional[float] = None,
):
    super(TransformerEncoder, self).__init__()

    def _default(val, default):
        return val if val is not None else default

    self.embedding_size = opt['embedding_size']
    self.ffn_size = opt['ffn_size']
    self.n_layers = (
        opt['n_encoder_layers']
        if opt.get('n_encoder_layers', -1) > 0
        else opt['n_layers']
    )
    self.n_heads = opt['n_heads']
    self.dim = self.embedding_size
    self.embeddings_scale = _default(
        embeddings_scale, opt.get('embeddings_scale', False)
    )
    self.reduction_type = reduction_type
    self.padding_idx = padding_idx
    # this is --dropout, not --relu-dropout or --attention-dropout
    self.dropout_frac = _default(dropout, opt.get('dropout', 0.0))
    self.dropout = nn.Dropout(p=self.dropout_frac)
    self.variant = _default(variant, opt.get('variant', 'aiayn'))
    self.n_segments = _default(n_segments, opt.get('n_segments', 0))

    self.n_positions = _default(n_positions, get_n_positions_from_options(opt))
    self.out_dim = self.embedding_size
    assert (
        self.embedding_size % self.n_heads == 0
    ), 'Transformer embedding size must be a multiple of n_heads'

    # check input formats:
    if embedding is not None:
        assert (
            self.embedding_size is None
            or self.embedding_size == embedding.weight.shape[1]
        ), "Embedding dim must match the embedding size."

    if embedding is not None:
        self.embeddings = embedding
    else:
        raise AssertionError(
            "This code should not execute. Left here in case we want to enable it."
        )
        assert self.padding_idx is not None
        self.embeddings = nn.Embedding(
            vocabulary_size, self.embedding_size, padding_idx=padding_idx
        )
        nn.init.normal_(self.embeddings.weight, 0, self.embedding_size ** -0.5)

    # create the positional embeddings
    self.position_embeddings = nn.Embedding(self.n_positions, self.embedding_size)
    if not opt.get('learn_positional_embeddings', False):
        create_position_codes(
            self.n_positions,
            self.embedding_size,
            out=self.position_embeddings.weight,
        )
    else:
        nn.init.normal_(
            self.position_embeddings.weight, 0, self.embedding_size ** -0.5
        )

    # embedding normalization
    if (
        self.variant == 'xlm'
        or self.variant == 'prelayernorm'
        or self.variant == 'bart'
    ):
        self.norm_embeddings = torch.nn.LayerNorm(self.dim, eps=LAYER_NORM_EPS)
    elif self.variant == 'aiayn':
        pass
    else:
        raise ValueError("Can't handle --variant {}".format(self.variant))

    if self.n_segments >= 1:
        self.segment_embeddings = nn.Embedding(self.n_segments, self.dim)

    # build the model
    self.layers = nn.ModuleList()
    for _ in range(self.n_layers):
        self.layers.append(
            TransformerEncoderLayer(
                self.n_heads,
                self.embedding_size,
                self.ffn_size,
                attention_dropout=opt.get('attention_dropout', 0.0),
                relu_dropout=opt.get('relu_dropout', 0.0),
                dropout=self.dropout_frac,
                variant=self.variant,
                activation=_default(activation, opt.get('activation', 'relu')),
            )
        )
    self.output_scaling = _default(output_scaling, opt.get('output_scaling', 1.0))
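# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal, hedged example of constructing TransformerEncoder above. A plain
# dict stands in for Opt (only item access and .get() are used); the sizes are
# illustrative, and it is assumed that get_n_positions_from_options() can
# resolve the sequence length from the 'n_positions' key supplied here.
#
# import torch.nn as nn
#
# opt = {
#     'embedding_size': 256,
#     'ffn_size': 1024,
#     'n_layers': 2,
#     'n_heads': 4,
#     'n_positions': 1024,
# }
# token_embedding = nn.Embedding(32000, 256, padding_idx=0)
# encoder = TransformerEncoder(
#     opt=opt,
#     vocabulary_size=32000,
#     embedding=token_embedding,  # required: the fallback branch above raises
#     padding_idx=0,
#     reduction_type='mean',
# )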
def __init__(
    self,
    opt: Opt,
    embedding: Optional[nn.Embedding] = None,
    n_positions: Optional[int] = None,
    **kwargs,
):
    super().__init__(**kwargs)
    self.opt = opt

    def _default(val, default):
        return val if val is not None else default

    self.embedding_size = opt['embedding_size']
    self.ffn_size = opt['ffn_size']
    self.n_layers = (
        opt['n_decoder_layers']
        if opt.get('n_decoder_layers', -1) > 0
        else opt['n_layers']
    )
    self.n_heads = opt['n_heads']
    self.dim = self.embedding_size
    self.activation = opt.get('activation', 'relu')
    self.variant = opt.get('variant', 'aiayn')

    self.embeddings_scale = opt.get('embeddings_scale', True)
    self.dropout = nn.Dropout(p=opt.get('dropout', 0.0))  # --dropout

    self.n_positions = _default(n_positions, get_n_positions_from_options(opt))
    self.out_dim = self.embedding_size
    assert (
        self.embedding_size % self.n_heads == 0
    ), 'Transformer embedding size must be a multiple of n_heads'

    self.embeddings = embedding

    if (
        self.variant == 'xlm'
        or self.variant == 'prelayernorm'
        or self.variant == 'bart'
    ):
        self.norm_embeddings = torch.nn.LayerNorm(self.dim, eps=LAYER_NORM_EPS)
        if self.variant == 'xlm':
            warn_once(
                'DEPRECATED: XLM should only be used for backwards compatibility, '
                'as it involves a less-stable layernorm operation.'
            )
    elif self.variant == 'aiayn':
        pass
    else:
        raise ValueError("Can't handle --variant {}".format(self.variant))

    # create the positional embeddings
    self.position_embeddings = nn.Embedding(self.n_positions, self.embedding_size)
    if not opt.get('learn_positional_embeddings', False):
        create_position_codes(
            self.n_positions,
            self.embedding_size,
            out=self.position_embeddings.weight,
        )
    else:
        nn.init.normal_(
            self.position_embeddings.weight, 0, self.embedding_size ** -0.5
        )

    # build the model
    self.layers = self.build_layers()
def __init__(
    self,
    n_heads,
    n_layers,
    embedding_size,
    ffn_size,
    embedding=None,
    dropout=0.0,
    attention_dropout=0.0,
    relu_dropout=0.0,
    padding_idx=0,
    learn_positional_embeddings=False,
    embeddings_scale=False,
    n_positions=1024,
    activation='relu',
    variant='aiayn',
    n_segments=0,
    output_scaling=1.0,
):
    super(TransformerAREncoder, self).__init__()

    self.embedding_size = embedding_size
    self.ffn_size = ffn_size
    self.n_layers = n_layers
    self.n_heads = n_heads
    self.dim = embedding_size
    self.embeddings_scale = embeddings_scale
    self.padding_idx = padding_idx
    # this is --dropout, not --relu-dropout or --attention-dropout
    self.dropout_frac = dropout
    self.dropout = torch.nn.Dropout(p=self.dropout_frac)
    self.variant = variant
    self.n_segments = n_segments

    self.n_positions = n_positions
    self.out_dim = embedding_size
    assert (
        embedding_size % n_heads == 0
    ), 'Transformer embedding size must be a multiple of n_heads'

    # check input formats:
    if embedding is not None:
        assert (
            embedding_size is None or embedding_size == embedding.weight.shape[1]
        ), "Embedding dim must match the embedding size."

    if embedding is not None:
        self.embeddings = embedding
    else:
        raise AssertionError(
            "This code should not execute. Left here in case we want to enable it."
        )

    # create the positional embeddings
    self.position_embeddings = torch.nn.Embedding(n_positions, embedding_size)
    if not learn_positional_embeddings:
        create_position_codes(
            n_positions, embedding_size, out=self.position_embeddings.weight
        )
    else:
        torch.nn.init.normal_(
            self.position_embeddings.weight, 0, embedding_size ** -0.5
        )

    # embedding normalization
    if self.variant == 'xlm' or self.variant == 'prelayernorm':
        self.norm_embeddings = LayerNorm(self.dim, eps=LAYER_NORM_EPS)
    elif self.variant == 'aiayn':
        pass
    else:
        raise ValueError("Can't handle --variant {}".format(self.variant))

    if self.n_segments >= 1:
        self.segment_embeddings = torch.nn.Embedding(self.n_segments, self.dim)

    # build the model
    self.layers = torch.nn.ModuleList()
    for _ in range(self.n_layers):
        self.layers.append(
            TransformerEncoderLayerPast(
                n_heads,
                embedding_size,
                ffn_size,
                attention_dropout=attention_dropout,
                relu_dropout=relu_dropout,
                dropout=dropout,
                variant=variant,
                activation=activation,
            )
        )
    self.output_scaling = output_scaling
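# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal, hedged example of constructing TransformerAREncoder above with
# explicit keyword arguments. The dimensions are illustrative; a pre-built
# token embedding is required because the fallback branch above raises.
#
# import torch
#
# token_embedding = torch.nn.Embedding(32000, 256, padding_idx=0)
# ar_encoder = TransformerAREncoder(
#     n_heads=4,
#     n_layers=2,
#     embedding_size=256,
#     ffn_size=1024,
#     embedding=token_embedding,
# )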