Example #1
    def __init__(
        self,
        n_heads,
        n_layers,
        hidden_dim,
        ffn_size,
        reduction=True,
        attention_dropout=0.0,
        relu_dropout=0.0,
        learn_positional_embeddings=False,
    ):
        super().__init__()
        self.ffn_size = ffn_size
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.out_dim = hidden_dim
        self.dim = hidden_dim
        self.reduction = reduction
        assert hidden_dim % n_heads == 0, "MM-Combiner dim must be a multiple of n_heads"
        n_positions = 1024
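        # create the positional embeddings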
        self.position_embeddings = nn.Embedding(n_positions, hidden_dim)
        if not learn_positional_embeddings:
            create_position_codes(n_positions,
                                  hidden_dim,
                                  out=self.position_embeddings.weight)
        else:
            nn.init.normal_(self.position_embeddings.weight, 0,
                            hidden_dim**-0.5)

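        # build the model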
        self.layers = nn.ModuleList()
        for _ in range(self.n_layers):
            self.layers.append(
                TransformerEncoderLayer(n_heads, hidden_dim, ffn_size,
                                        attention_dropout, relu_dropout))
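All four examples fill the positional embedding table with create_position_codes when learned positional embeddings are disabled. A minimal sketch of such a helper, assuming the standard sinusoidal scheme from "Attention Is All You Need" and an even embedding dimension; the actual helper in the codebase may differ in detail:

import numpy as np
import torch

def create_position_codes(n_pos, dim, out):
    # Geometric progression of wavelengths: even columns get sin, odd columns get cos.
    position_enc = np.array(
        [[pos / np.power(10000, 2 * j / dim) for j in range(dim // 2)]
         for pos in range(n_pos)]
    )
    # Overwrite the embedding weight in place and freeze it.
    out.detach_()
    out.requires_grad = False
    out[:, 0::2] = torch.as_tensor(np.sin(position_enc), dtype=out.dtype)
    out[:, 1::2] = torch.as_tensor(np.cos(position_enc), dtype=out.dtype)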
Example #2
    def __init__(
        self,
        opt: Opt,
        vocabulary_size: int,
        embedding: Optional[nn.Embedding] = None,
        padding_idx: int = 0,
        reduction_type: str = 'mean',
        n_positions: Optional[int] = None,
        n_segments: Optional[int] = None,
        embeddings_scale: Optional[bool] = None,
        dropout: Optional[float] = None,
        activation: Optional[str] = None,
        variant: Optional[str] = None,
        output_scaling: Optional[float] = None,
    ):
        super(TransformerEncoder, self).__init__()

        def _default(val, default):
            return val if val is not None else default

        self.embedding_size = opt['embedding_size']
        self.ffn_size = opt['ffn_size']
        self.n_layers = (
            opt['n_encoder_layers']
            if opt.get('n_encoder_layers', -1) > 0
            else opt['n_layers']
        )
        self.n_heads = opt['n_heads']
        self.dim = self.embedding_size
        self.embeddings_scale = _default(
            embeddings_scale, opt.get('embeddings_scale', False)
        )
        self.reduction_type = reduction_type
        self.padding_idx = padding_idx
        # this is --dropout, not --relu-dropout or --attention-dropout
        self.dropout_frac = _default(dropout, opt.get('dropout', 0.0))
        self.dropout = nn.Dropout(p=self.dropout_frac)
        self.variant = _default(variant, opt.get('variant', 'aiayn'))
        self.n_segments = _default(n_segments, opt.get('n_segments', 0))

        self.n_positions = _default(n_positions, get_n_positions_from_options(opt))
        self.out_dim = self.embedding_size
        assert (
            self.embedding_size % self.n_heads == 0
        ), 'Transformer embedding size must be a multiple of n_heads'

        # check input formats:
        if embedding is not None:
            assert (
                self.embedding_size is None
                or self.embedding_size == embedding.weight.shape[1]
            ), "Embedding dim must match the embedding size."

        if embedding is not None:
            self.embeddings = embedding
        else:
            raise AssertionError(
                "This code should not execute. Left here in case we want to enable it."
            )
            assert self.padding_idx is not None
            self.embeddings = nn.Embedding(
                vocabulary_size, self.embedding_size, padding_idx=padding_idx
            )
            nn.init.normal_(self.embeddings.weight, 0, self.embedding_size ** -0.5)

        # create the positional embeddings
        self.position_embeddings = nn.Embedding(self.n_positions, self.embedding_size)
        if not opt.get('learn_positional_embeddings', False):
            create_position_codes(
                self.n_positions,
                self.embedding_size,
                out=self.position_embeddings.weight,
            )
        else:
            nn.init.normal_(
                self.position_embeddings.weight, 0, self.embedding_size ** -0.5
            )

        # embedding normalization
        if (
            self.variant == 'xlm'
            or self.variant == 'prelayernorm'
            or self.variant == 'bart'
        ):
            self.norm_embeddings = torch.nn.LayerNorm(self.dim, eps=LAYER_NORM_EPS)
        elif self.variant == 'aiayn':
            pass
        else:
            raise ValueError("Can't handle --variant {}".format(self.variant))

        if self.n_segments >= 1:
            self.segment_embeddings = nn.Embedding(self.n_segments, self.dim)

        # build the model
        self.layers = nn.ModuleList()
        for _ in range(self.n_layers):
            self.layers.append(
                TransformerEncoderLayer(
                    self.n_heads,
                    self.embedding_size,
                    self.ffn_size,
                    attention_dropout=opt.get('attention_dropout', 0.0),
                    relu_dropout=opt.get('relu_dropout', 0.0),
                    dropout=self.dropout_frac,
                    variant=self.variant,
                    activation=_default(activation, opt.get('activation', 'relu')),
                )
            )
        self.output_scaling = _default(output_scaling, opt.get('output_scaling', 1.0))
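A hypothetical way to construct this encoder; the opt keys and values below are illustrative only, and 'n_positions' is assumed to be one of the keys get_n_positions_from_options reads:

import torch.nn as nn

vocab_size = 30000  # illustrative value, not from the source
opt = {
    'embedding_size': 512,
    'ffn_size': 2048,
    'n_layers': 6,
    'n_heads': 8,
    'dropout': 0.1,
    'attention_dropout': 0.0,
    'relu_dropout': 0.0,
    'activation': 'relu',
    'variant': 'aiayn',
    'n_positions': 1024,
    'learn_positional_embeddings': False,
}
# The constructor asserts that a pre-built embedding table is passed in.
embedding = nn.Embedding(vocab_size, opt['embedding_size'], padding_idx=0)
encoder = TransformerEncoder(opt, vocabulary_size=vocab_size, embedding=embedding)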
Example #3
    def __init__(
        self,
        opt: Opt,
        embedding: Optional[nn.Embedding] = None,
        n_positions: Optional[int] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.opt = opt

        def _default(val, default):
            return val if val is not None else default

        self.embedding_size = opt['embedding_size']
        self.ffn_size = opt['ffn_size']
        self.n_layers = (opt['n_decoder_layers']
                         if opt.get('n_decoder_layers', -1) > 0 else
                         opt['n_layers'])
        self.n_heads = opt['n_heads']
        self.dim = self.embedding_size
        self.activation = opt.get('activation', 'relu')
        self.variant = opt.get('variant', 'aiayn')

        self.embeddings_scale = opt.get('embeddings_scale', True)
        self.dropout = nn.Dropout(p=opt.get('dropout', 0.0))  # --dropout

        self.n_positions = _default(n_positions,
                                    get_n_positions_from_options(opt))
        self.out_dim = self.embedding_size
        assert (self.embedding_size % self.n_heads == 0
                ), 'Transformer embedding size must be a multiple of n_heads'

        self.embeddings = embedding

        if (self.variant == 'xlm' or self.variant == 'prelayernorm'
                or self.variant == 'bart'):
            self.norm_embeddings = torch.nn.LayerNorm(self.dim,
                                                      eps=LAYER_NORM_EPS)
            if self.variant == 'xlm':
                warn_once(
                    'DEPRECATED: XLM should only be used for backwards compatibility, '
                    'as it involves a less-stable layernorm operation.')
        elif self.variant == 'aiayn':
            pass
        else:
            raise ValueError("Can't handle --variant {}".format(self.variant))

        # create the positional embeddings
        self.position_embeddings = nn.Embedding(self.n_positions,
                                                self.embedding_size)
        if not opt.get('learn_positional_embeddings', False):
            create_position_codes(
                self.n_positions,
                self.embedding_size,
                out=self.position_embeddings.weight,
            )
        else:
            nn.init.normal_(self.position_embeddings.weight, 0,
                            self.embedding_size**-0.5)

        # build the model
        self.layers = self.build_layers()
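Example #3 defers layer construction to self.build_layers(). One plausible override, mirroring the encoder loop above; the TransformerDecoderLayer name and its keyword arguments are assumptions, not taken from the source:

    def build_layers(self) -> nn.ModuleList:
        # Stack self.n_layers decoder blocks using the options resolved in __init__.
        layers = nn.ModuleList()
        for _ in range(self.n_layers):
            layers.append(
                TransformerDecoderLayer(
                    self.n_heads,
                    self.embedding_size,
                    self.ffn_size,
                    attention_dropout=self.opt.get('attention_dropout', 0.0),
                    relu_dropout=self.opt.get('relu_dropout', 0.0),
                    dropout=self.opt.get('dropout', 0.0),
                    activation=self.activation,
                    variant=self.variant,
                )
            )
        return layers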
Example #4
    def __init__(
        self,
        n_heads,
        n_layers,
        embedding_size,
        ffn_size,
        embedding=None,
        dropout=0.0,
        attention_dropout=0.0,
        relu_dropout=0.0,
        padding_idx=0,
        learn_positional_embeddings=False,
        embeddings_scale=False,
        n_positions=1024,
        activation='relu',
        variant='aiayn',
        n_segments=0,
        output_scaling=1.0,
    ):
        super(TransformerAREncoder, self).__init__()
        self.embedding_size = embedding_size
        self.ffn_size = ffn_size
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.dim = embedding_size
        self.embeddings_scale = embeddings_scale
        self.padding_idx = padding_idx
        # this is --dropout, not --relu-dropout or --attention-dropout
        self.dropout_frac = dropout
        self.dropout = torch.nn.Dropout(p=self.dropout_frac)
        self.variant = variant
        self.n_segments = n_segments

        self.n_positions = n_positions
        self.out_dim = embedding_size
        assert (embedding_size % n_heads == 0
                ), 'Transformer embedding size must be a multiple of n_heads'

        # check input formats:
        if embedding is not None:
            assert (embedding_size is None
                    or embedding_size == embedding.weight.shape[1]
                    ), "Embedding dim must match the embedding size."

        if embedding is not None:
            self.embeddings = embedding
        else:
            raise AssertionError(
                "This code should not execute. Left here in case we want to enable it."
            )

        # create the positional embeddings
        self.position_embeddings = torch.nn.Embedding(n_positions,
                                                      embedding_size)
        if not learn_positional_embeddings:
            create_position_codes(n_positions,
                                  embedding_size,
                                  out=self.position_embeddings.weight)
        else:
            torch.nn.init.normal_(self.position_embeddings.weight, 0,
                                  embedding_size**-0.5)

        # embedding normalization
        if self.variant == 'xlm' or self.variant == 'prelayernorm':
            self.norm_embeddings = LayerNorm(self.dim, eps=LAYER_NORM_EPS)
        elif self.variant == 'aiayn':
            pass
        else:
            raise ValueError("Can't handle --variant {}".format(self.variant))

        if self.n_segments >= 1:
            self.segment_embeddings = torch.nn.Embedding(
                self.n_segments, self.dim)

        # build the model
        self.layers = torch.nn.ModuleList()
        for _ in range(self.n_layers):
            self.layers.append(
                TransformerEncoderLayerPast(
                    n_heads,
                    embedding_size,
                    ffn_size,
                    attention_dropout=attention_dropout,
                    relu_dropout=relu_dropout,
                    dropout=dropout,
                    variant=variant,
                    activation=activation,
                ))
        self.output_scaling = output_scaling
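A rough sketch of how the pieces initialized above typically combine in a forward pass (token embeddings, optional scaling, positions, optional segments, dropout, the layer stack, and output scaling). This is an assumed pattern, not the class's actual forward; variant-specific layernorm placement is omitted and the per-layer call signature is a guess:

    def forward(self, input, segments=None):
        # Mask out padding positions.
        mask = input != self.padding_idx
        tensor = self.embeddings(input)
        if self.embeddings_scale:
            tensor = tensor * (self.dim ** 0.5)
        # Position indices count non-padding tokens from the left.
        positions = (mask.cumsum(dim=1, dtype=torch.int64) - 1).clamp_(min=0)
        tensor = tensor + self.position_embeddings(positions)
        if self.n_segments >= 1 and segments is not None:
            tensor = tensor + self.segment_embeddings(segments)
        tensor = self.dropout(tensor)
        for layer in self.layers:
            tensor = layer(tensor, mask)  # assumed layer call signature
        return tensor * self.output_scaling, mask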