Example #1
    def __init__(self, num_classes, hidden_dim=256, nheads=8,
                 num_encoder_layers=6, num_decoder_layers=6):
        super().__init__()

        # create ResNet-50 backbone
        self.backbone = resnet50()
        del self.backbone.fc

        # create conversion layer
        self.conv = nn.Conv2d(2048, hidden_dim, 1)

        # create a default PyTorch transformer
        self.transformer = nn.Transformer(
            hidden_dim, nheads, num_encoder_layers, num_decoder_layers)

        # prediction heads, one extra class for predicting non-empty slots
        # note that in baseline DETR the linear_bbox layer is a 3-layer MLP
        self.linear_class = nn.Linear(hidden_dim, num_classes + 1)
        self.linear_bbox = nn.Linear(hidden_dim, 4)

        # output positional encodings (object queries)
        self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))

        # spatial positional encodings
        # note that in baseline DETR we use sine positional encodings
        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
Example #2
 def __init__(self, e=512, nhead=8, channels=64, layers=4):
     super().__init__()
     self.alphabet_embedding = nn.Embedding(ALPHABET_SIZE, channels)
     self.masks_embedding = nn.Linear(7, channels)
     self.button_embedding = nn.Embedding(NUM_TOKENS + 1, e)
     self.block_1 = nn.Sequential(
         nn.Conv1d(channels, channels, kernel_size=3, padding=1),
         nn.ReLU(),
         nn.Conv1d(channels, channels, kernel_size=3, padding=1),
         nn.ReLU(),
         nn.Conv1d(channels, channels, kernel_size=3, padding=1),
         nn.ReLU(),
     )
     self.block_2 = nn.Sequential(
         nn.Conv1d(channels, channels, kernel_size=3, padding=1),
         nn.ReLU(),
         nn.Conv1d(channels, channels, kernel_size=3, padding=1),
         nn.ReLU(),
         nn.Conv1d(channels, channels, kernel_size=3, padding=1),
         nn.ReLU(),
     )
     self.spec_proj = nn.Linear(36 * channels, e)
     self.positional_button_encoding = PositionalEncoding(e)
     self.transformer = nn.Transformer(
         e,
         nhead,
         num_encoder_layers=layers // 2,
         num_decoder_layers=layers - layers // 2,
     )
Example #3
    def __init__(self,
                 d_in,
                 d_out,
                 batch_size,
                 d_model=512,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout=0.5):

        super(OwnTransformerModel, self).__init__()

        self.d_in = d_in
        self.d_out = d_out
        self.batch_size = batch_size
        self.d_model = d_model

        # self.encoder = nn.Embedding(num_embeddings=ntoken, embedding_dim=ninp)
        self.encoder = nn.Linear(in_features=30, out_features=d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout)
Example #4
    def __init__(self, vocab_len, hidden_dim, nheads, num_encoder_layers,
                 num_decoder_layers):
        super().__init__()

        # create ResNet-101 backbone
        self.backbone = resnet101()
        del self.backbone.fc

        # create conversion layer
        self.conv = nn.Conv2d(2048, hidden_dim, 1)

        # create a default PyTorch transformer
        self.transformer = nn.Transformer(hidden_dim, nheads,
                                          num_encoder_layers,
                                          num_decoder_layers)

        # prediction heads with length of vocab
        self.vocab = nn.Linear(hidden_dim, vocab_len)

        # output positional encodings (object queries)
        self.decoder = nn.Embedding(vocab_len, hidden_dim)
        self.query_pos = PositionalEncoding(hidden_dim, .2)

        # spatial positional encodings; sine positional encodings could be used
        # instead (the DETR baseline uses sine positional encodings).
        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.trg_mask = None
Example #5
 def __init__(self,
              input_size,
              hidden_size,
              output_size,
              attention=False,
              dropout=False,
              bidirectional=False,
              num_layers=1,
              num_heads=2,
              max_length=50,
              batch_first=False):
     super(GRU, self).__init__()
     self.hidden_size = hidden_size
     self.num_layers = num_layers
     self.batch_first = batch_first
     self.bidirectional = bidirectional
     self.attention = attention
     self.multiplier = 1
     if self.bidirectional:
         self.multiplier = 2
     if self.attention:
          self.attention = nn.Transformer(nhead=num_heads,
                                         num_encoder_layers=num_layers)
     self.gru = nn.GRU(input_size,
                       hidden_size,
                       bidirectional=bidirectional,
                       num_layers=num_layers,
                       dropout=dropout,
                       batch_first=batch_first)
     self.out = nn.Linear(self.multiplier * hidden_size, output_size)
Example #6
    def __init__(self, args, embed, nlayers=3, dropout=0.1):
        super(GengrateDocument, self).__init__()

        self.cuda = args.cuda
        self.nhead = args.n_head  # number of attention heads
        self.hidden = args.word_emb_dim  # size of the encoder and decoder inputs
        self.doc_max_timesteps = args.doc_max_timesteps  # maximum number of sentences per input document
        self.outputSize = args.vocab_size  # vocabulary size (for predicting each output word's probability)
        self.sent_max_len = args.sent_max_len  # maximum length of an input sentence

        self.embed = embed  # embedding shared by the encoder and decoder inputs
        self.pos_encoder = PositionalEncoding(self.hidden, dropout,
                                              self.sent_max_len)  # positional encoding for the input

        self.pos_decoder = PositionalEncoding(
            self.hidden, dropout,
            self.sent_max_len * self.doc_max_timesteps)  # positional encoding for the output

        self.transformer = nn.Transformer(d_model=self.hidden,
                                          nhead=self.nhead,
                                          num_encoder_layers=nlayers,
                                          num_decoder_layers=nlayers,
                                          dim_feedforward=self.hidden,
                                          dropout=dropout,
                                          activation="gelu")

        self.src_mask = None  # mask for the input sequence
        self.trg_mask = None  # mask for the output sequence
        self.memory_mask = None  # mask for the encoder output (memory) sequence

        self.fc_out = nn.Linear(self.hidden, self.outputSize)
Example #7
    def __init__(self,
                 sound_dim,
                 text_dim,
                 d_model,
                 dim_feedforward,
                 dropout_rate=0.0,
                 device="cuda"):
        super(MyTransformer, self).__init__()

        self.device = device

        self.sound_embed = Conv2dSubsampling(sound_dim, d_model, dropout_rate)
        # self.sound_embed = nn.Linear(sound_dim, d_model)
        self.text_embed = nn.Embedding(
            text_dim,
            d_model,
        )
        self.pos_encoder = PositionalEncoding(d_model, dropout_rate)

        self.transformer = nn.Transformer(d_model, nhead=8, num_encoder_layers=12,
                                          num_decoder_layers=6, dim_feedforward=dim_feedforward,
                                          dropout=dropout_rate, activation='gelu')

        # self.lin_ctc = nn.Linear(d_model, text_dim + 1)

        self.out_lin = nn.Linear(d_model, text_dim)
Example #8
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx
Example #9
    def __init__(self, config, args):
        super().__init__()
        self.config = config
        self.args = args

        self.tok_embed = nn.Embedding(self.config.vocab_size,
                                      self.config.hidden_size)
        self.pos_encoding = nn.Embedding(300, self.config.hidden_size)
        self.tok_type_embed = nn.Embedding(2, self.config.hidden_size)

        self.dropout = nn.Dropout(0.1)
        self.scale = torch.sqrt(torch.FloatTensor([self.config.hidden_size
                                                   ])).to(self.args.device)
        assert len(self.scale.shape) == 1
        assert contains_nan(self.scale).item() is False

        self.fc_out = nn.Linear(self.config.hidden_size,
                                self.config.vocab_size)

        num_layers = self.config.num_hidden_layers // 2
        self.transformer = nn.Transformer(
            d_model=self.config.hidden_size,
            nhead=self.config.num_attention_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            activation=self.config.hidden_act,
            dropout=0.1)
Example #10
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        super().__init__()
        self.myenc = nn.GRU(src_vocab, d_model)

        self.pencoder1 = nn.GRU(d_model * 2,
                                d_model,
                                bidirectional=True,
                                batch_first=True)

        self.embed = nn.Embedding(trg_vocab + 1,
                                  d_model)  # plus 1 for the sos token
        self.trans = nn.Transformer(d_model=d_model,
                                    nhead=heads,
                                    num_encoder_layers=N,
                                    num_decoder_layers=N,
                                    dim_feedforward=DFF,
                                    dropout=dropout)
        self.out = nn.Linear(d_model, NUMLABELS)
        self.pos_enc1 = PositionalEncoding(d_model,
                                           dropout,
                                           max_len=ActualWINDOWWIDTH // 2)
        self.pos_enc2 = PositionalEncoding(d_model,
                                           dropout,
                                           max_len=WINDOWWIDTH //
                                           PREDICT_EVERY_NTH_FRAME)
        self.d_model = d_model
Example #11
    def __init__(
            self,
            dim_ins: tuple,  # dims of ti, tc, kn
            dim_out: int,  # just reserved
            ws: int,  # window size of the time series/sequence
            dim_emb: int,  # dim for embedding
            n_heads: int,  # the number of attention heads in the transformer
            n_layers: int,  # layers of multi-heads
            k: int,  # the number of curves
    ):
        super().__init__()
        self.set_params()

        # override args from child/sub class
        dim_ins = self.args.dim_ins
        dim_emb = self.args.dim_emb
        ws = self.args.ws
        n_heads = self.args.n_heads
        n_layers = self.args.n_layers

        self.n_quantiles = len(self.quantiles)  # the number of quantiles

        # embedder
        n_dim = sum(dim_ins[0:])
        self.emb_encode = nn.Linear(n_dim, dim_emb)  # .double()
        self.emb_decode = nn.Linear(n_dim, dim_emb)  # .double()
        max_len = max(16, ws)
        self.pos = PositionalEncoding(d_model=dim_emb, max_len=max_len)

        # Transformer
        prm = dict(
            d_model=dim_emb,
            nhead=n_heads,
            num_encoder_layers=n_layers,
            num_decoder_layers=n_layers,
        )
        self.tr = nn.Transformer(**prm)  # .double()

        # linears
        self.dc = nn.Linear(ws * dim_emb,
                            ws * n_dim * self.n_quantiles)  # .double()

        # constraint
        self.loss_constraint_pretrain = 0

        # to double
        self.emb_encode = self.emb_encode.double()
        self.emb_decode = self.emb_decode.double()
        self.tr = self.tr.double()
        self.dc = self.dc.double()

        # initialize weights/biases
        weight_interval = 0.01
        nn.init.uniform_(self.emb_encode.weight, -weight_interval,
                         weight_interval)
        nn.init.uniform_(self.emb_decode.weight, -weight_interval,
                         weight_interval)
        nn.init.xavier_normal_(self.dc.weight)
        for fc in [self.emb_encode, self.emb_decode, self.dc]:
            nn.init.zeros_(fc.bias)
Example #12
    def __init__(self, num_classes, hidden_dim, nheads, num_encoder_layers,
                 num_decoder_layers):
        super().__init__()
        # We take only convolutional layers from ResNet-50 model
        self.backbone = nn.Sequential(
            *list(resnet50(pretrained=True).children())[:-2])
        self.conv = nn.Conv2d(2048, hidden_dim, 1)
        self.transformer = nn.Transformer(hidden_dim, nheads,
                                          num_encoder_layers,
                                          num_decoder_layers)
        self.linear_class = nn.Linear(hidden_dim, num_classes + 1)
        self.linear_bbox = nn.Linear(hidden_dim, 4)
        self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))
        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
        self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))

    def forward(self, inputs):
        x = self.backbone(inputs)
        h = self.conv(x)
        H, W = h.shape[-2:]
        pos = torch.cat([
            self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
            self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
        ], dim=-1).flatten(0, 1).unsqueeze(1)
        h = self.transformer(pos + h.flatten(2).permute(2, 0, 1),
                             self.query_pos.unsqueeze(1))
        return self.linear_class(h), self.linear_bbox(h).sigmoid()
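For reference, a minimal smoke test of the two methods above. This is only a sketch: it assumes they belong to a hypothetical DETRDemo(nn.Module) class (as in the DETR demo notebook) and that torchvision's resnet50 weights can be downloaded. The batch size must be 1 here, because the object queries are expanded to a batch of one before being passed as the decoder input.

    # usage sketch (not from the original source); DETRDemo is a hypothetical
    # nn.Module wrapper around the __init__ and forward shown above
    import torch

    model = DETRDemo(num_classes=91, hidden_dim=256, nheads=8,
                     num_encoder_layers=6, num_decoder_layers=6)
    model.eval()
    with torch.no_grad():
        logits, boxes = model(torch.rand(1, 3, 800, 800))  # batch size 1 only
    print(logits.shape)  # torch.Size([100, 1, 92]): 100 queries, 91 classes + "no object"
    print(boxes.shape)   # torch.Size([100, 1, 4]): normalized (cx, cy, w, h)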
Example #13
    def __init__(self, config):
        super().__init__(config)

        self.embed = nn.ModuleDict()
        self.embed["qid"] = nn.Embedding(config.num_item + 1,
                                         config.dim_model,
                                         padding_idx=0)
        self.embed["skill"] = nn.Embedding(config.num_skill + 1,
                                           config.dim_model,
                                           padding_idx=0)
        self.embed["is_correct"] = nn.Embedding(3,
                                                config.dim_model,
                                                padding_idx=0)

        # transformer
        self.transformer = nn.Transformer(
            d_model=config.dim_model,
            nhead=config.head_count,
            num_encoder_layers=config.layer_count,
            num_decoder_layers=config.layer_count,
            dim_feedforward=config.dim_ff,
            dropout=config.dropout_rate,
        )
        # positional encoding
        self.embed["enc_pos"] = AbsoluteDiscretePositionalEncoding(
            dim_emb=config.dim_model,
            max_len=config.seq_len,
            device=config.device)
        self.embed["dec_pos"] = copy.deepcopy(self.embed["enc_pos"])
        self.generator = nn.Linear(config.dim_model, 1)

        # xavier initialization
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)
Example #14
    def __init__(self,
                 group,
                 *unused_args,
                 d_vocab=23,
                 d_model=16,
                 add_bn=True,
                 **unused_kwargs):
        super().__init__()
        self.rank = group.rank()
        self.world_size = group.size()
        torch.manual_seed(0)  # keep everything deterministic
        assert d_vocab >= 12  # we use torch.arange(12) as input
        self.embed_tokens = nn.Embedding(d_vocab, d_model)
        self.transformer = nn.Transformer(
            d_model=d_model,
            num_encoder_layers=2,
            num_decoder_layers=2,
            dim_feedforward=8,
            dropout=0.1,
        )
        self.output_proj = nn.Linear(d_model, d_vocab)

        # share the embedding and output projection weights
        self.output_proj.weight = self.embed_tokens.weight
        self.register_buffer("vocab_bias",
                             self.embed_tokens.weight.new_ones((d_model, )))
        self.register_buffer(
            "long_buffer", torch.zeros_like(self.vocab_bias, dtype=torch.long))

        self.bs = 2
        self.bn = torch.nn.BatchNorm1d(
            self.bs) if add_bn else torch.nn.Identity()
Example #15
    def forward(self, input, labels=None):
        # input_size T,N,D
        batch_size = input.size(1)

        if self.training:
            labels = labels.permute(1, 0)
            sos_emb = self.sos_emb.expand(1, batch_size)
            labels = torch.cat((sos_emb, labels), dim=0)[:self.max_len]
            tgt_emb = self.embeddings(labels)
            out = self.decoder(tgt=tgt_emb,
                               tgt_mask=self.tgt_mask,
                               memory=input)
            return self.predictor(out)
        else:
            labels = self.sos_emb.expand(1, batch_size)
            outputs = torch.zeros(self.max_len, batch_size,
                                  self.class_num).cuda()
            for i in range(0, self.max_len):
                tgt_emb = self.embeddings(labels)
                tgt_emb = self.pos_encoder(tgt_emb)
                tgt_mask = nn.Transformer().generate_square_subsequent_mask(
                    i + 1).cuda()
                out = self.decoder(tgt=tgt_emb,
                                   tgt_mask=tgt_mask,
                                   memory=input)
                out = self.predictor(out)
                pred = torch.argmax(out, dim=2)
                sos = self.sos_emb.expand(1, batch_size)
                labels = torch.cat((sos, pred), dim=0)
                outputs[:i + 1] = out
            return outputs
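As a side note to the decoding loop above: generate_square_subsequent_mask builds the causal mask that keeps position i from attending to later positions. A minimal illustration, mirroring the instance call used above (recent PyTorch releases also expose it as a static method on nn.Transformer):

    import torch.nn as nn

    mask = nn.Transformer().generate_square_subsequent_mask(3)
    print(mask)
    # tensor([[0., -inf, -inf],
    #         [0., 0., -inf],
    #         [0., 0., 0.]])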
Example #16
    def __init__(
        self, group, *args, d_vocab=23, d_model=16, add_bn=True,
        fsdp_init_mode=FSDPInitMode.CUDA_AFTER, **kwargs
    ):
        super().__init__()
        self.rank = group.rank()
        self.world_size = group.size()
        torch.manual_seed(0)  # keep everything deterministic
        assert (
            d_vocab >= 12
        ), "dim of vocab should be larger than 12, as we use torch.arange(12) as input"

        self.embed_tokens = nn.Embedding(d_vocab, d_model)
        self.transformer = nn.Transformer(
            d_model=d_model,
            num_encoder_layers=2,
            num_decoder_layers=2,
            dim_feedforward=8,
            dropout=0.1,
        )
        self.output_proj = nn.Linear(d_model, d_vocab)

        # share the embedding and output projection weights
        self.output_proj.weight = self.embed_tokens.weight
        self.register_buffer(
            "vocab_bias", self.embed_tokens.weight.new_ones((d_model,))
        )
        self.register_buffer("long_buffer", torch.zeros_like(self.vocab_bias, dtype=torch.long))  # type: ignore[arg-type]

        self.bs = 2
        self.bn = torch.nn.BatchNorm1d(self.bs) if add_bn else torch.nn.Identity()
        move_to_cuda = fsdp_init_mode == FSDPInitMode.CUDA_BEFORE
        self = _maybe_cuda(self, move_to_cuda)
Example #17
    def __init__(self,
                 vocab: List[str],
                 hidden_features: int,
                 enc_layers=1,
                 dec_layers=1,
                 nhead=1,
                 dropout=0.1):
        super().__init__()

        self.letter_to_token, _ = tokenize_vocab(vocab)

        self.pos_encoder = PositionalEncoding(hidden_features, dropout)
        self.decoder = nn.Embedding(len(vocab), hidden_features)
        self.pos_decoder = PositionalEncoding(hidden_features, dropout)
        self.transformer = nn.Transformer(d_model=hidden_features,
                                          nhead=nhead,
                                          num_encoder_layers=enc_layers,
                                          num_decoder_layers=dec_layers,
                                          dim_feedforward=hidden_features * 4,
                                          dropout=dropout,
                                          activation='relu')

        self.src_mask = None
        self.trg_mask = None
        self.memory_mask = None
        self.fc = nn.Linear(hidden_features, len(vocab), bias=False)
        self._initialize_weights()
Example #18
    def __init__(self,
                 input_size: int,
                 nhead: int = 8,
                 num_encoder_layers: int = 1,
                 num_decoder_layers: int = 1,
                 dim_feedforward: int = 256,
                 dropout: float = 0.1,
                 activation: str = "relu",
                 pose_indices: Optional[Tuple[int, int]] = None,
                 pretraining: bool = False):
        r"""A Transformer for encoding the state in RL and decoding features based on
        the observation and goal encodings.

        Supports masking the hidden state during various timesteps in the forward pass.

        Args:
            input_size: The input size of the SMT
            nhead: The number of encoding and decoding attention heads
            num_encoder_layers: The number of encoder layers
            num_decoder_layers: The number of decoder layers
            dim_feedforward: The hidden size of feedforward layers in the transformer
            dropout: The dropout value after each attention layer
            activation: The activation to use after each linear layer
        """

        super().__init__()
        self._input_size = input_size
        self._nhead = nhead
        self._num_encoder_layers = num_encoder_layers
        self._num_decoder_layers = num_decoder_layers
        self._dim_feedforward = dim_feedforward
        self._dropout = dropout
        self._activation = activation
        self._pose_indices = pose_indices
        self._pretraining = pretraining

        if pose_indices is not None:
            pose_dims = pose_indices[1] - pose_indices[0]
            self.pose_encoder = nn.Linear(5, 16)
            input_size += 16 - pose_dims
            self._use_pose_encoding = True
        else:
            self._use_pose_encoding = False

        self.fusion_encoder = nn.Sequential(
            nn.Linear(input_size, dim_feedforward),
            nn.ReLU(),
            nn.Linear(dim_feedforward, dim_feedforward),
        )

        self.transformer = nn.Transformer(
            d_model=dim_feedforward,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
        )
Example #19
 def __init__(self, args):
     super(SortingModel, self).__init__()
     self.embed_mod = nn.Embedding(args.n_vocab, args.hidden_dim)
     self.transformer = nn.Transformer(
         d_model=args.hidden_dim, nhead=args.width, num_encoder_layers=0, num_decoder_layers=args.depth
     )
     self.classifier = nn.Linear(args.hidden_dim, args.num_labels)
     self.args = args
Example #20
 def __init__(self, cnn, vocab, **config):
     super().__init__(cnn, vocab, **config)
     self.decoder = nn.Transformer(cnn.n_features, config['nhead'],
                                   config['encoder_nlayers'],
                                   config['decoder_nlayers'],
                                   config['dim_feedforward'],
                                   config['dropout'])
     self.character_distribution = nn.Linear(cnn.n_features, vocab.size)
Example #21
    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 encoder_embed_dim_emb=512,
                 decoder_embed_dim_emb=512,
                 encoder_embed_dim=256,
                 decoder_embed_dim=256,
                 encoder_layers=3,
                 decoder_layers=3,
                 encoder_attention_heads=8,
                 decoder_attention_heads=8,
                 encoder_ffn_embed_dim=512,
                 decoder_ffn_embed_dim=512,
                 dropout=0.1,
                 activation_fn="relu",
                 max_src_positions=1024,
                 max_trg_positions=1024,
                 padding_idx=None,
                 learned=False,
                 **kwargs):
        super().__init__(src_vocab_size, trg_vocab_size, padding_idx, **kwargs)
        self.max_src_positions = max_src_positions
        self.max_trg_positions = max_trg_positions

        # Model
        self.src_embeddings = nn.Embedding(src_vocab_size,
                                           encoder_embed_dim_emb)
        self.trg_embeddings = nn.Embedding(trg_vocab_size,
                                           decoder_embed_dim_emb)
        self.src_pos_embeddings = PositionalEmbedding(
            num_embeddings=max_src_positions,
            embedding_dim=encoder_embed_dim_emb,
            padding_idx=padding_idx,
            learned=learned)
        self.trg_pos_embeddings = PositionalEmbedding(
            num_embeddings=max_trg_positions,
            embedding_dim=decoder_embed_dim_emb,
            padding_idx=padding_idx,
            learned=learned)
        self.src_dense_emb = nn.Linear(encoder_embed_dim_emb,
                                       encoder_embed_dim)
        self.trg_dense_emb = nn.Linear(decoder_embed_dim_emb,
                                       decoder_embed_dim)
        self.transformer = nn.Transformer(
            d_model=encoder_embed_dim,
            nhead=encoder_attention_heads,
            num_encoder_layers=encoder_layers,
            num_decoder_layers=decoder_layers,
            dim_feedforward=encoder_ffn_embed_dim,
            dropout=dropout,
            activation=activation_fn)
        self.output_layer = nn.Linear(encoder_embed_dim, src_vocab_size)
        self.input_dropout = nn.Dropout(dropout)

        # Checks
        assert encoder_embed_dim == decoder_embed_dim
        assert encoder_attention_heads == decoder_attention_heads
        assert encoder_ffn_embed_dim == decoder_ffn_embed_dim
Example #22
    def __init__(self, src_vocab, trg_vocab, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6,
                 dim_feedforward=2048, dropout=0.1, max_length=5000):
        super().__init__()

        self.enc_emb = Embedding(vocab_size=src_vocab, d_model=d_model, dropout=dropout, max_length=max_length)
        self.dec_emb = Embedding(vocab_size=trg_vocab, d_model=d_model, dropout=dropout, max_length=max_length)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead,
                                          num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward, dropout=dropout)
Example #23
    def __init__(
        self,
        phoneme_size: int,
        phoneme_embedding_size: int,
        speaker_size: int,
        speaker_embedding_size: int,
        transformer_hidden_size: int,
        tranformer_head_num: int,
        transformer_encoder_layer_num: int,
        transformer_decoder_layer_num: int,
        tranformer_linear_size: int,
    ):
        super().__init__()

        self.with_speaker = speaker_size > 0
        self.phoneme_size = phoneme_size
        self.phoneme_padding_index = phoneme_size

        self.phoneme_embedder = nn.Embedding(
            num_embeddings=phoneme_size + 1,
            embedding_dim=phoneme_embedding_size,
            padding_idx=self.phoneme_padding_index,
        )
        self.speaker_embedder = (nn.Embedding(
            num_embeddings=speaker_size,
            embedding_dim=speaker_embedding_size,
        ) if self.with_speaker else None)

        self.source_positional_encoding = PositionalEncoding(
            hidden_size=phoneme_embedding_size)
        self.source_pre = nn.Linear(
            phoneme_embedding_size +
            (speaker_embedding_size if self.with_speaker else 0),
            transformer_hidden_size,
        )

        self.target_size = 1 + phoneme_embedding_size  # f0 + phoneme
        self.target_positional_encoding = PositionalEncoding(
            hidden_size=phoneme_embedding_size)
        self.target_pre = nn.Linear(
            self.target_size +
            (speaker_embedding_size if self.with_speaker else 0),
            transformer_hidden_size,
        )

        self.transformer = nn.Transformer(
            d_model=transformer_hidden_size,
            nhead=tranformer_head_num,
            num_encoder_layers=transformer_encoder_layer_num,
            num_decoder_layers=transformer_decoder_layer_num,
            dim_feedforward=tranformer_linear_size,
        )

        self.post = nn.Linear(
            in_features=transformer_hidden_size,
            out_features=1 + phoneme_size + 1 + 1,  # f0 + phoneme + vuv + stop
        )
Example #24
 def __init__(self, input_size, num_classes):
     super(SequenceTransformer, self).__init__()
     self.embedding = nn.Embedding(num_classes, input_size)
     self.transformer = nn.Transformer(d_model=input_size, nhead=8, num_encoder_layers=6, 
                                         num_decoder_layers=6, dim_feedforward=2048, 
                                         dropout=0.1, activation='gelu')
     self.generator = nn.Linear(input_size, num_classes, bias=False)
     self.pad_idx = 2
     self.num_classes = num_classes
Example #25
    def __init__(self, batch_size=32, chpkpt=None):
        super(OptimusPrime2, self).__init__()
        self.batch_size = batch_size

        self.fc1 = nn.Linear(15873, 4096)
        self.fc2 = nn.Linear(4096, 2048)
        self.fc3 = nn.Linear(2048, 512)

        self.out3 = nn.Linear(4096, 15873)
        self.out2 = nn.Linear(2048, 4096)
        self.out1 = nn.Linear(512, 2048)

        self.dropout = nn.Dropout(0.2)

        self.pool = nn.MaxPool2d(2, 2)
        self.pos_encode = PositionalEncoding(512, max_len=100)
        self.transformer = nn.Transformer(512, 4, 2, 2, 1024)

        self.deconv1 = nn.ConvTranspose2d(
            512,
            256,
            kernel_size=5,
            padding=(0, 1),
        )
        self.adaptive_pool = nn.AdaptiveAvgPool2d((135, 103))

        self.trg = torch.ones((1, self.batch_size, 512)).cuda()

        if chpkpt is not None:
            pretrained_dict = torch.load(chpkpt)
            pretrained_dict = pretrained_dict["model_state_dict"]
            model_dict = self.state_dict()

            new_model_state_dict = OrderedDict()
            model_state_dict = pretrained_dict
            if "swt-dgx" not in socket.gethostname():
                for k, v in model_state_dict.items():
                    if k.startswith("module"):
                        k = k[7:]  # remove `module.`
                    new_model_state_dict[k] = v

                model_state_dict = new_model_state_dict

            # 1. filter out unnecessary keys
            pretrained_dict = {
                k: v
                for k, v in model_state_dict.items()
                if k in model_dict and 'out' not in k
            }
            # 2. overwrite entries in the existing state dict
            model_dict.update(pretrained_dict)

            self.load_state_dict(model_dict)

            for name, param in self.named_parameters():
                if name.startswith('fc'):
                    param.requires_grad = False
Example #26
 def test_init_(self):
     model = nn.Transformer(
         d_model=16,
         nhead=4,
         num_encoder_layers=3,
         num_decoder_layers=3,
         dim_feedforward=32,
         dropout=0.0,
     )
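A quick usage sketch for the module built in this test (not part of the original test): with the default batch_first=False, nn.Transformer expects src shaped (S, N, E) and tgt shaped (T, N, E), and returns an output with the same shape as tgt.

    import torch

    src = torch.rand(10, 2, 16)  # (source length, batch, d_model)
    tgt = torch.rand(7, 2, 16)   # (target length, batch, d_model)
    out = model(src, tgt)
    print(out.shape)             # torch.Size([7, 2, 16])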
Example #27
    def __init__(self, grapheme_vocab_size, phoneme_vocab_size, grapheme_pad, phoneme_pad, embedding_dim=128):
        super().__init__()
        self.embedding_dim = embedding_dim

        self.grapheme_embedding = nn.Embedding(grapheme_vocab_size, embedding_dim, padding_idx=grapheme_pad)
        self.phoneme_embedding = nn.Embedding(phoneme_vocab_size, embedding_dim, padding_idx=phoneme_pad)
        self.pos_encoding = PositionalEncoding(embedding_dim)
        self.transformer = nn.Transformer(embedding_dim, nhead=4, num_encoder_layers=4, num_decoder_layers=4, dim_feedforward=512)
        self.fc = nn.Linear(embedding_dim, phoneme_vocab_size)
Example #28
 def __init__(self, in_size, hidden_size, out_size, n_layers, dropout=0.1):
     super(TrfmSeq2seq, self).__init__()
     self.in_size = in_size
     self.hidden_size = hidden_size
     self.embed = nn.Embedding(in_size, hidden_size)
     self.pe = PositionalEncoding(hidden_size, dropout)
      self.trfm = nn.Transformer(d_model=hidden_size, nhead=4,
                                 num_encoder_layers=n_layers, num_decoder_layers=n_layers,
                                 dim_feedforward=hidden_size)
     self.out = nn.Linear(hidden_size, out_size)
Example #29
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 hidden_size,
                 embeddings=None,
                 padding_idx=0,
                 dropout=0.5,
                 num_classes=3,
                 device="cpu"):
        """
        Args:
            vocab_size: The size of the vocabulary of embeddings in the model.
            embedding_dim: The dimension of the word embeddings.
            hidden_size: The size of all the hidden layers in the network.
            embeddings: A tensor of size (vocab_size, embedding_dim) containing
                pretrained word embeddings. If None, word embeddings are
                initialised randomly. Defaults to None.
            padding_idx: The index of the padding token in the premises and
                hypotheses passed as input to the model. Defaults to 0.
            dropout: The dropout rate to use between the layers of the network.
                A dropout rate of 0 corresponds to using no dropout at all.
                Defaults to 0.5.
            num_classes: The number of classes in the output of the network.
                Defaults to 3.
            device: The name of the device on which the model is being
                executed. Defaults to 'cpu'.
        """
        super(ESIM, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.dropout = dropout
        self.device = device

        if self.dropout:
            self._rnn_dropout = RNNDropout(p=self.dropout)

        self._word_embedding = nn.Embedding(self.vocab_size,
                                            self.embedding_dim,
                                            padding_idx=padding_idx,
                                            _weight=embeddings)

        self.transformer_model = nn.Transformer(d_model=self.embedding_dim,
                                                nhead=4,
                                                num_encoder_layers=3,
                                                num_decoder_layers=3)

        self._composition = nn.LSTM(self.embedding_dim,
                                    self.hidden_size,
                                    bidirectional=True,
                                    batch_first=True)

        self._classification = nn.Sequential(
            nn.Linear(self.hidden_size * 2, self.num_classes))
Example #30
    def __init__(self, d_model=256, nhead=4, num_encoder_layers=2, dim_feedforward=1024):
        super(TransformerSim, self).__init__()

        self.tf = nn.Transformer(d_model=d_model,
                                nhead=nhead,
                                num_encoder_layers=num_encoder_layers,
                                num_decoder_layers=num_encoder_layers,
                                dim_feedforward=dim_feedforward)
        self.out_embed = nn.Embedding(3, d_model)
        self.generator = nn.Linear(d_model, 3)