def __init__(self, num_classes, hidden_dim=256, nheads=8,
             num_encoder_layers=6, num_decoder_layers=6):
    super().__init__()

    # create ResNet-50 backbone
    self.backbone = resnet50()
    del self.backbone.fc

    # create conversion layer
    self.conv = nn.Conv2d(2048, hidden_dim, 1)

    # create a default PyTorch transformer
    self.transformer = nn.Transformer(
        hidden_dim, nheads, num_encoder_layers, num_decoder_layers)

    # prediction heads, one extra class for predicting non-empty slots
    # note that in baseline DETR linear_bbox layer is 3-layer MLP
    self.linear_class = nn.Linear(hidden_dim, num_classes + 1)
    self.linear_bbox = nn.Linear(hidden_dim, 4)

    # output positional encodings (object queries)
    self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))

    # spatial positional encodings
    # note that in baseline DETR we use sine positional encodings
    self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
    self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))

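# A minimal forward-pass sketch for the module above (not part of the original
# snippet): it assumes the usual minimal-DETR flow in which the backbone runs
# up to its last convolutional stage, the 1x1 conv reduces 2048 channels to
# hidden_dim, the learned row/col embeddings form the spatial positional
# encoding, and the 100 object queries drive the decoder. A batch size of 1 is
# assumed so the unsqueezed positional tensors line up with the query batch.
def forward(self, inputs):
    # backbone feature map: (N, 2048, H, W); fc was deleted, so call stages directly
    x = self.backbone.conv1(inputs)
    x = self.backbone.bn1(x)
    x = self.backbone.relu(x)
    x = self.backbone.maxpool(x)
    x = self.backbone.layer1(x)
    x = self.backbone.layer2(x)
    x = self.backbone.layer3(x)
    x = self.backbone.layer4(x)

    # reduce channels to hidden_dim for the transformer
    h = self.conv(x)

    # build the (H*W, 1, hidden_dim) spatial positional encoding
    H, W = h.shape[-2:]
    pos = torch.cat([
        self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
        self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
    ], dim=-1).flatten(0, 1).unsqueeze(1)

    # encoder input: flattened feature map plus positions;
    # decoder input: the 100 learned object queries
    h = self.transformer(pos + h.flatten(2).permute(2, 0, 1),
                         self.query_pos.unsqueeze(1))

    # per-query class logits and normalized box coordinates
    return self.linear_class(h), self.linear_bbox(h).sigmoid()
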
def __init__(self, e=512, nhead=8, channels=64, layers=4):
    super().__init__()
    self.alphabet_embedding = nn.Embedding(ALPHABET_SIZE, channels)
    self.masks_embedding = nn.Linear(7, channels)
    self.button_embedding = nn.Embedding(NUM_TOKENS + 1, e)
    self.block_1 = nn.Sequential(
        nn.Conv1d(channels, channels, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv1d(channels, channels, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv1d(channels, channels, kernel_size=3, padding=1),
        nn.ReLU(),
    )
    self.block_2 = nn.Sequential(
        nn.Conv1d(channels, channels, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv1d(channels, channels, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv1d(channels, channels, kernel_size=3, padding=1),
        nn.ReLU(),
    )
    self.spec_proj = nn.Linear(36 * channels, e)
    self.positional_button_encoding = PositionalEncoding(e)
    self.transformer = nn.Transformer(
        e,
        nhead,
        num_encoder_layers=layers // 2,
        num_decoder_layers=layers - layers // 2,
    )

def __init__(self, d_in, d_out, batch_size, d_model=512, nhead=8,
             num_encoder_layers=6, num_decoder_layers=6,
             dim_feedforward=2048, dropout=0.5):
    super(OwnTransformerModel, self).__init__()
    self.d_in = d_in
    self.d_out = d_out
    self.batch_size = batch_size
    self.d_model = d_model
    # self.encoder = nn.Embedding(num_embeddings=ntoken, embedding_dim=ninp)
    self.encoder = nn.Linear(in_features=30, out_features=d_model)
    self.transformer = nn.Transformer(
        d_model=d_model,
        nhead=nhead,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=dim_feedforward,
        dropout=dropout)

def __init__(self, vocab_len, hidden_dim, nheads, num_encoder_layers,
             num_decoder_layers):
    super().__init__()

    # create ResNet-101 backbone
    self.backbone = resnet101()
    del self.backbone.fc

    # create conversion layer
    self.conv = nn.Conv2d(2048, hidden_dim, 1)

    # create a default PyTorch transformer
    self.transformer = nn.Transformer(hidden_dim, nheads,
                                      num_encoder_layers, num_decoder_layers)

    # prediction heads with length of vocab
    self.vocab = nn.Linear(hidden_dim, vocab_len)

    # output positional encodings (object queries)
    self.decoder = nn.Embedding(vocab_len, hidden_dim)
    self.query_pos = PositionalEncoding(hidden_dim, .2)

    # spatial positional encodings; sine positional encoding can be used.
    # The DETR baseline uses sine positional encoding.
    self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
    self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
    self.trg_mask = None

def __init__(self, input_size, hidden_size, output_size, attention=False,
             dropout=False, bidirectional=False, num_layers=1, num_heads=2,
             max_length=50, batch_first=False):
    super(GRU, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.batch_first = batch_first
    self.bidirectional = bidirectional
    self.attention = attention
    self.multiplier = 1
    if self.bidirectional:
        self.multiplier = 2
    if self.attention:
        # nn.Transformer's keyword argument is `nhead`, not `nheads`
        self.attention = nn.Transformer(nhead=num_heads,
                                        num_encoder_layers=num_layers)
    self.gru = nn.GRU(input_size, hidden_size, bidirectional=bidirectional,
                      num_layers=num_layers, dropout=dropout,
                      batch_first=batch_first)
    self.out = nn.Linear(self.multiplier * hidden_size, output_size)

def __init__(self, args, embed, nlayers=3, dropout=0.1):
    super(GengrateDocument, self).__init__()
    self.cuda = args.cuda
    self.nhead = args.n_head  # number of attention heads
    self.hidden = args.word_emb_dim  # size of the encoder and decoder inputs
    self.doc_max_timesteps = args.doc_max_timesteps  # maximum number of sentences per input document
    self.outputSize = args.vocab_size  # vocabulary size (used to predict the output probability of each word)
    self.sent_max_len = args.sent_max_len  # maximum length of an input sentence
    self.embed = embed  # embedding for the encoder and decoder inputs
    self.pos_encoder = PositionalEncoding(
        self.hidden, dropout, self.sent_max_len)  # positional encoding for the input
    self.pos_decoder = PositionalEncoding(
        self.hidden, dropout,
        self.sent_max_len * self.doc_max_timesteps)  # positional encoding for the output
    self.transformer = nn.Transformer(d_model=self.hidden,
                                      nhead=self.nhead,
                                      num_encoder_layers=nlayers,
                                      num_decoder_layers=nlayers,
                                      dim_feedforward=self.hidden,
                                      dropout=dropout,
                                      activation="gelu")
    self.src_mask = None  # mask for the input sequence
    self.trg_mask = None  # mask for the output sequence
    self.memory_mask = None  # mask for the encoder output (memory)
    self.fc_out = nn.Linear(self.hidden, self.outputSize)

def __init__(self, sound_dim, text_dim, d_model, dim_feedforward,
             dropout_rate=0.0, device="cuda"):
    super(MyTransformer, self).__init__()
    self.device = device
    self.sound_embed = Conv2dSubsampling(sound_dim, d_model, dropout_rate)
    # self.sound_embed = nn.Linear(sound_dim, d_model)
    self.text_embed = nn.Embedding(text_dim, d_model)
    self.pos_encoder = PositionalEncoding(d_model, dropout_rate)
    self.transformer = nn.Transformer(d_model,
                                      nhead=8,
                                      num_encoder_layers=12,
                                      num_decoder_layers=6,
                                      dim_feedforward=dim_feedforward,
                                      dropout=dropout_rate,
                                      activation='gelu')
    # self.lin_ctc = nn.Linear(d_model, text_dim + 1)
    self.out_lin = nn.Linear(d_model, text_dim)

def __init__(
    self,
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
):
    super(Transformer, self).__init__()
    self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
    self.src_position_embedding = nn.Embedding(max_len, embedding_size)
    self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
    self.trg_position_embedding = nn.Embedding(max_len, embedding_size)
    self.device = device
    self.transformer = nn.Transformer(
        embedding_size,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
    )
    self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
    self.dropout = nn.Dropout(dropout)
    self.src_pad_idx = src_pad_idx

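# A hedged sketch of how the module above could be used in a forward pass (not
# from the original source). It assumes token tensors shaped (seq_len, batch),
# sequence-first as nn.Transformer expects by default; make_src_mask is a
# hypothetical helper that builds the key-padding mask from src_pad_idx.
def make_src_mask(self, src):
    # (seq_len, batch) -> (batch, seq_len) boolean padding mask
    return (src.transpose(0, 1) == self.src_pad_idx).to(self.device)

def forward(self, src, trg):
    src_seq_length, N = src.shape
    trg_seq_length, N = trg.shape

    # learned absolute position ids, one column per batch element
    src_positions = (torch.arange(0, src_seq_length, device=self.device)
                     .unsqueeze(1).expand(src_seq_length, N))
    trg_positions = (torch.arange(0, trg_seq_length, device=self.device)
                     .unsqueeze(1).expand(trg_seq_length, N))

    embed_src = self.dropout(self.src_word_embedding(src)
                             + self.src_position_embedding(src_positions))
    embed_trg = self.dropout(self.trg_word_embedding(trg)
                             + self.trg_position_embedding(trg_positions))

    src_padding_mask = self.make_src_mask(src)
    trg_mask = self.transformer.generate_square_subsequent_mask(
        trg_seq_length).to(self.device)

    out = self.transformer(embed_src, embed_trg,
                           src_key_padding_mask=src_padding_mask,
                           tgt_mask=trg_mask)
    return self.fc_out(out)
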
def __init__(self, config, args):
    super().__init__()
    self.config = config
    self.args = args
    self.tok_embed = nn.Embedding(self.config.vocab_size,
                                  self.config.hidden_size)
    self.pos_encoding = nn.Embedding(300, self.config.hidden_size)
    self.tok_type_embed = nn.Embedding(2, self.config.hidden_size)
    self.dropout = nn.Dropout(0.1)
    self.scale = torch.sqrt(torch.FloatTensor(
        [self.config.hidden_size])).to(self.args.device)
    assert len(self.scale.shape) == 1
    assert contains_nan(self.scale).item() is False
    self.fc_out = nn.Linear(self.config.hidden_size, self.config.vocab_size)
    num_layers = self.config.num_hidden_layers // 2
    self.transformer = nn.Transformer(
        d_model=self.config.hidden_size,
        nhead=self.config.num_attention_heads,
        num_encoder_layers=num_layers,
        num_decoder_layers=num_layers,
        activation=self.config.hidden_act,
        dropout=0.1)

def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
    super().__init__()
    self.myenc = nn.GRU(src_vocab, d_model)
    self.pencoder1 = nn.GRU(d_model * 2, d_model, bidirectional=True,
                            batch_first=True)
    self.embed = nn.Embedding(trg_vocab + 1, d_model)  # plus 1 for the sos token
    self.trans = nn.Transformer(d_model=d_model, nhead=heads,
                                num_encoder_layers=N, num_decoder_layers=N,
                                dim_feedforward=DFF, dropout=dropout)
    self.out = nn.Linear(d_model, NUMLABELS)
    self.pos_enc1 = PositionalEncoding(d_model, dropout,
                                       max_len=ActualWINDOWWIDTH // 2)
    self.pos_enc2 = PositionalEncoding(d_model, dropout,
                                       max_len=WINDOWWIDTH // PREDICT_EVERY_NTH_FRAME)
    self.d_model = d_model

def __init__(
    self,
    dim_ins: tuple,  # dims of ti, tc, kn
    dim_out: int,    # just reserved
    ws: int,         # window size of the time series/sequence
    dim_emb: int,    # dim for embedding
    n_heads: int,    # the number of attention heads in the transformer
    n_layers: int,   # layers of multi-heads
    k: int,          # the number of curves
):
    super().__init__()
    self.set_params()  # override args from child/sub class
    dim_ins = self.args.dim_ins
    dim_emb = self.args.dim_emb
    ws = self.args.ws
    n_heads = self.args.n_heads
    n_layers = self.args.n_layers
    self.n_quantiles = len(self.quantiles)  # the number of quantiles

    # embedder
    n_dim = sum(dim_ins[0:])
    self.emb_encode = nn.Linear(n_dim, dim_emb)  # .double()
    self.emb_decode = nn.Linear(n_dim, dim_emb)  # .double()
    max_len = max(16, ws)
    self.pos = PositionalEncoding(d_model=dim_emb, max_len=max_len)

    # Transformer
    prm = dict(
        d_model=dim_emb,
        nhead=n_heads,
        num_encoder_layers=n_layers,
        num_decoder_layers=n_layers,
    )
    self.tr = nn.Transformer(**prm)  # .double()

    # linears
    self.dc = nn.Linear(ws * dim_emb, ws * n_dim * self.n_quantiles)  # .double()

    # constraint
    self.loss_constraint_pretrain = 0

    # to double
    self.emb_encode = self.emb_encode.double()
    self.emb_decode = self.emb_decode.double()
    self.tr = self.tr.double()
    self.dc = self.dc.double()

    # initialize weights/biases
    weight_interval = 0.01
    nn.init.uniform_(self.emb_encode.weight, -weight_interval, weight_interval)
    nn.init.uniform_(self.emb_decode.weight, -weight_interval, weight_interval)
    nn.init.xavier_normal_(self.dc.weight)
    for fc in [self.emb_encode, self.emb_decode, self.dc]:
        nn.init.zeros_(fc.bias)

def __init__(self, num_classes, hidden_dim, nheads, num_encoder_layers,
             num_decoder_layers):
    super().__init__()
    # We take only convolutional layers from ResNet-50 model
    self.backbone = nn.Sequential(
        *list(resnet50(pretrained=True).children())[:-2])
    self.conv = nn.Conv2d(2048, hidden_dim, 1)
    # pass the `nheads` parameter (the original referenced an undefined `heads`)
    self.transformer = nn.Transformer(hidden_dim, nheads,
                                      num_encoder_layers, num_decoder_layers)
    self.linear_class = nn.Linear(hidden_dim, num_classes + 1)
    self.linear_bbox = nn.Linear(hidden_dim, 4)
    self.query_pos = nn.Parameter(torch.rand(100, hidden_dim))
    self.row_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))
    self.col_embed = nn.Parameter(torch.rand(50, hidden_dim // 2))

def forward(self, inputs):
    x = self.backbone(inputs)
    h = self.conv(x)
    H, W = h.shape[-2:]
    pos = torch.cat([
        self.col_embed[:W].unsqueeze(0).repeat(H, 1, 1),
        self.row_embed[:H].unsqueeze(1).repeat(1, W, 1),
    ], dim=-1).flatten(0, 1).unsqueeze(1)
    h = self.transformer(pos + h.flatten(2).permute(2, 0, 1),
                         self.query_pos.unsqueeze(1))
    return self.linear_class(h), self.linear_bbox(h).sigmoid()

def __init__(self, config):
    super().__init__(config)
    self.embed = nn.ModuleDict()
    self.embed["qid"] = nn.Embedding(config.num_item + 1, config.dim_model,
                                     padding_idx=0)
    self.embed["skill"] = nn.Embedding(config.num_skill + 1, config.dim_model,
                                       padding_idx=0)
    self.embed["is_correct"] = nn.Embedding(3, config.dim_model,
                                            padding_idx=0)

    # transformer
    self.transformer = nn.Transformer(
        d_model=config.dim_model,
        nhead=config.head_count,
        num_encoder_layers=config.layer_count,
        num_decoder_layers=config.layer_count,
        dim_feedforward=config.dim_ff,
        dropout=config.dropout_rate,
    )

    # positional encoding
    self.embed["enc_pos"] = AbsoluteDiscretePositionalEncoding(
        dim_emb=config.dim_model, max_len=config.seq_len,
        device=config.device)
    self.embed["dec_pos"] = copy.deepcopy(self.embed["enc_pos"])

    self.generator = nn.Linear(config.dim_model, 1)

    # xavier initialization
    for param in self.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)

def __init__(self, group, *unused_args, d_vocab=23, d_model=16, add_bn=True,
             **unused_kwargs):
    super().__init__()
    self.rank = group.rank()
    self.world_size = group.size()
    torch.manual_seed(0)  # keep everything deterministic
    assert d_vocab >= 12  # we use torch.arange(12) as input
    self.embed_tokens = nn.Embedding(d_vocab, d_model)
    self.transformer = nn.Transformer(
        d_model=d_model,
        num_encoder_layers=2,
        num_decoder_layers=2,
        dim_feedforward=8,
        dropout=0.1,
    )
    self.output_proj = nn.Linear(d_model, d_vocab)
    # share the embedding and output projection weights
    self.output_proj.weight = self.embed_tokens.weight
    self.register_buffer("vocab_bias",
                         self.embed_tokens.weight.new_ones((d_model,)))
    self.register_buffer(
        "long_buffer",
        torch.zeros_like(self.vocab_bias, dtype=torch.long))
    self.bs = 2
    self.bn = torch.nn.BatchNorm1d(
        self.bs) if add_bn else torch.nn.Identity()

def forward(self, input, labels=None):
    # input size: (T, N, D)
    batch_size = input.size(1)
    if self.training:
        labels = labels.permute(1, 0)
        sos_emb = self.sos_emb.expand(1, batch_size)
        labels = torch.cat((sos_emb, labels), dim=0)[:self.max_len]
        tgt_emb = self.embeddings(labels)
        out = self.decoder(tgt=tgt_emb, tgt_mask=self.tgt_mask, memory=input)
        return self.predictor(out)
    else:
        # greedy autoregressive decoding at inference time
        labels = self.sos_emb.expand(1, batch_size)
        outputs = torch.zeros(self.max_len, batch_size, self.class_num).cuda()
        for i in range(0, self.max_len):
            tgt_emb = self.embeddings(labels)
            tgt_emb = self.pos_encoder(tgt_emb)
            tgt_mask = nn.Transformer().generate_square_subsequent_mask(
                i + 1).cuda()
            out = self.decoder(tgt=tgt_emb, tgt_mask=tgt_mask, memory=input)
            out = self.predictor(out)
            pred = torch.argmax(out, dim=2)
            sos = self.sos_emb.expand(1, batch_size)
            labels = torch.cat((sos, pred), dim=0)
            # `out` covers the first i + 1 steps, so write that many positions
            outputs[:i + 1] = out
        return outputs

def __init__(
    self,
    group,
    *args,
    d_vocab=23,
    d_model=16,
    add_bn=True,
    fsdp_init_mode=FSDPInitMode.CUDA_AFTER,
    **kwargs
):
    super().__init__()
    self.rank = group.rank()
    self.world_size = group.size()
    torch.manual_seed(0)  # keep everything deterministic
    assert (
        d_vocab >= 12
    ), "dim of vocab should be larger than 12, as we use torch.arange(12) as input"
    self.embed_tokens = nn.Embedding(d_vocab, d_model)
    self.transformer = nn.Transformer(
        d_model=d_model,
        num_encoder_layers=2,
        num_decoder_layers=2,
        dim_feedforward=8,
        dropout=0.1,
    )
    self.output_proj = nn.Linear(d_model, d_vocab)
    # share the embedding and output projection weights
    self.output_proj.weight = self.embed_tokens.weight
    self.register_buffer(
        "vocab_bias", self.embed_tokens.weight.new_ones((d_model,))
    )
    self.register_buffer(
        "long_buffer",
        torch.zeros_like(self.vocab_bias, dtype=torch.long))  # type: ignore[arg-type]
    self.bs = 2
    self.bn = torch.nn.BatchNorm1d(self.bs) if add_bn else torch.nn.Identity()
    move_to_cuda = fsdp_init_mode == FSDPInitMode.CUDA_BEFORE
    self = _maybe_cuda(self, move_to_cuda)

def __init__(self, vocab: List[str], hidden_features: int, enc_layers=1,
             dec_layers=1, nhead=1, dropout=0.1):
    super().__init__()
    self.letter_to_token, _ = tokenize_vocab(vocab)
    self.pos_encoder = PositionalEncoding(hidden_features, dropout)
    self.decoder = nn.Embedding(len(vocab), hidden_features)
    self.pos_decoder = PositionalEncoding(hidden_features, dropout)
    self.transformer = nn.Transformer(d_model=hidden_features,
                                      nhead=nhead,
                                      num_encoder_layers=enc_layers,
                                      num_decoder_layers=dec_layers,
                                      dim_feedforward=hidden_features * 4,
                                      dropout=dropout,
                                      activation='relu')
    self.src_mask = None
    self.trg_mask = None
    self.memory_mask = None
    self.fc = nn.Linear(hidden_features, len(vocab), bias=False)
    self._initialize_weights()

def __init__(self,
             input_size: int,
             nhead: int = 8,
             num_encoder_layers: int = 1,
             num_decoder_layers: int = 1,
             dim_feedforward: int = 256,
             dropout: float = 0.1,
             activation: str = "relu",
             pose_indices: Optional[Tuple[int, int]] = None,
             pretraining: bool = False):
    r"""A Transformer for encoding the state in RL and decoding features
    based on the observation and goal encodings. Supports masking the
    hidden state during various timesteps in the forward pass.

    Args:
        input_size: The input size of the SMT
        nhead: The number of encoding and decoding attention heads
        num_encoder_layers: The number of encoder layers
        num_decoder_layers: The number of decoder layers
        dim_feedforward: The hidden size of feedforward layers in the
            transformer
        dropout: The dropout value after each attention layer
        activation: The activation to use after each linear layer
        pose_indices: The (start, end) indices of the pose dimensions in
            the input features, or None to disable pose encoding
        pretraining: Whether the module is being used for pretraining
    """
    super().__init__()
    self._input_size = input_size
    self._nhead = nhead
    self._num_encoder_layers = num_encoder_layers
    self._num_decoder_layers = num_decoder_layers
    self._dim_feedforward = dim_feedforward
    self._dropout = dropout
    self._activation = activation
    self._pose_indices = pose_indices
    self._pretraining = pretraining

    if pose_indices is not None:
        pose_dims = pose_indices[1] - pose_indices[0]
        self.pose_encoder = nn.Linear(5, 16)
        input_size += 16 - pose_dims
        self._use_pose_encoding = True
    else:
        self._use_pose_encoding = False

    self.fusion_encoder = nn.Sequential(
        nn.Linear(input_size, dim_feedforward),
        nn.ReLU(),
        nn.Linear(dim_feedforward, dim_feedforward),
    )
    self.transformer = nn.Transformer(
        d_model=dim_feedforward,
        nhead=nhead,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=dim_feedforward,
        dropout=dropout,
        activation=activation,
    )

def __init__(self, args):
    super(SortingModel, self).__init__()
    self.embed_mod = nn.Embedding(args.n_vocab, args.hidden_dim)
    self.transformer = nn.Transformer(
        d_model=args.hidden_dim,
        nhead=args.width,
        num_encoder_layers=0,
        num_decoder_layers=args.depth
    )
    self.classifier = nn.Linear(args.hidden_dim, args.num_labels)
    self.args = args

def __init__(self, cnn, vocab, **config):
    super().__init__(cnn, vocab, **config)
    self.decoder = nn.Transformer(cnn.n_features,
                                  config['nhead'],
                                  config['encoder_nlayers'],
                                  config['decoder_nlayers'],
                                  config['dim_feedforward'],
                                  config['dropout'])
    self.character_distribution = nn.Linear(cnn.n_features, vocab.size)

def __init__(self, src_vocab_size, trg_vocab_size,
             encoder_embed_dim_emb=512, decoder_embed_dim_emb=512,
             encoder_embed_dim=256, decoder_embed_dim=256,
             encoder_layers=3, decoder_layers=3,
             encoder_attention_heads=8, decoder_attention_heads=8,
             encoder_ffn_embed_dim=512, decoder_ffn_embed_dim=512,
             dropout=0.1, activation_fn="relu",
             max_src_positions=1024, max_trg_positions=1024,
             padding_idx=None, learned=False, **kwargs):
    super().__init__(src_vocab_size, trg_vocab_size, padding_idx, **kwargs)
    self.max_src_positions = max_src_positions
    self.max_trg_positions = max_trg_positions

    # Model
    self.src_embeddings = nn.Embedding(src_vocab_size, encoder_embed_dim_emb)
    self.trg_embeddings = nn.Embedding(trg_vocab_size, decoder_embed_dim_emb)
    self.src_pos_embeddings = PositionalEmbedding(
        num_embeddings=max_src_positions,
        embedding_dim=encoder_embed_dim_emb,
        padding_idx=padding_idx,
        learned=learned)
    self.trg_pos_embeddings = PositionalEmbedding(
        num_embeddings=max_trg_positions,
        embedding_dim=decoder_embed_dim_emb,
        padding_idx=padding_idx,
        learned=learned)
    self.src_dense_emb = nn.Linear(encoder_embed_dim_emb, encoder_embed_dim)
    self.trg_dense_emb = nn.Linear(decoder_embed_dim_emb, decoder_embed_dim)
    self.transformer = nn.Transformer(
        d_model=encoder_embed_dim,
        nhead=encoder_attention_heads,
        num_encoder_layers=encoder_layers,
        num_decoder_layers=decoder_layers,
        dim_feedforward=encoder_ffn_embed_dim,
        dropout=dropout,
        activation=activation_fn)
    self.output_layer = nn.Linear(encoder_embed_dim, src_vocab_size)
    self.input_dropout = nn.Dropout(dropout)

    # Checks
    assert encoder_embed_dim == decoder_embed_dim
    assert encoder_attention_heads == decoder_attention_heads
    assert encoder_ffn_embed_dim == decoder_ffn_embed_dim

def __init__(self, src_vocab, trg_vocab, d_model=512, nhead=8,
             num_encoder_layers=6, num_decoder_layers=6,
             dim_feedforward=2048, dropout=0.1, max_length=5000):
    super().__init__()
    self.enc_emb = Embedding(vocab_size=src_vocab, d_model=d_model,
                             dropout=dropout, max_length=max_length)
    self.dec_emb = Embedding(vocab_size=trg_vocab, d_model=d_model,
                             dropout=dropout, max_length=max_length)
    self.transformer = nn.Transformer(d_model=d_model,
                                      nhead=nhead,
                                      num_encoder_layers=num_encoder_layers,
                                      num_decoder_layers=num_decoder_layers,
                                      dim_feedforward=dim_feedforward,
                                      dropout=dropout)

def __init__(
    self,
    phoneme_size: int,
    phoneme_embedding_size: int,
    speaker_size: int,
    speaker_embedding_size: int,
    transformer_hidden_size: int,
    tranformer_head_num: int,
    transformer_encoder_layer_num: int,
    transformer_decoder_layer_num: int,
    tranformer_linear_size: int,
):
    super().__init__()
    self.with_speaker = speaker_size > 0
    self.phoneme_size = phoneme_size
    self.phoneme_padding_index = phoneme_size
    self.phoneme_embedder = nn.Embedding(
        num_embeddings=phoneme_size + 1,
        embedding_dim=phoneme_embedding_size,
        padding_idx=self.phoneme_padding_index,
    )
    self.speaker_embedder = (nn.Embedding(
        num_embeddings=speaker_size,
        embedding_dim=speaker_embedding_size,
    ) if self.with_speaker else None)
    self.source_positional_encoding = PositionalEncoding(
        hidden_size=phoneme_embedding_size)
    self.source_pre = nn.Linear(
        phoneme_embedding_size +
        (speaker_embedding_size if self.with_speaker else 0),
        transformer_hidden_size,
    )
    self.target_size = 1 + phoneme_embedding_size  # f0 + phoneme
    self.target_positional_encoding = PositionalEncoding(
        hidden_size=phoneme_embedding_size)
    self.target_pre = nn.Linear(
        self.target_size +
        (speaker_embedding_size if self.with_speaker else 0),
        transformer_hidden_size,
    )
    self.transformer = nn.Transformer(
        d_model=transformer_hidden_size,
        nhead=tranformer_head_num,
        num_encoder_layers=transformer_encoder_layer_num,
        num_decoder_layers=transformer_decoder_layer_num,
        dim_feedforward=tranformer_linear_size,
    )
    self.post = nn.Linear(
        in_features=transformer_hidden_size,
        out_features=1 + phoneme_size + 1 + 1,  # f0 + phoneme + vuv + stop
    )

def __init__(self, input_size, num_classes):
    super(SequenceTransformer, self).__init__()
    self.embedding = nn.Embedding(num_classes, input_size)
    self.transformer = nn.Transformer(d_model=input_size,
                                      nhead=8,
                                      num_encoder_layers=6,
                                      num_decoder_layers=6,
                                      dim_feedforward=2048,
                                      dropout=0.1,
                                      activation='gelu')
    self.generator = nn.Linear(input_size, num_classes, bias=False)
    self.pad_idx = 2
    self.num_classes = num_classes

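# A hedged forward sketch for SequenceTransformer above (not in the original):
# it assumes src and tgt are (seq_len, batch) token tensors over the same
# num_classes vocabulary, uses pad_idx to build key-padding masks, and applies
# a causal mask on the target side before projecting with generator.
def forward(self, src, tgt):
    src_key_padding_mask = (src == self.pad_idx).transpose(0, 1)  # (batch, src_len)
    tgt_key_padding_mask = (tgt == self.pad_idx).transpose(0, 1)  # (batch, tgt_len)
    tgt_mask = self.transformer.generate_square_subsequent_mask(
        tgt.size(0)).to(tgt.device)

    src_emb = self.embedding(src)  # (src_len, batch, d_model)
    tgt_emb = self.embedding(tgt)  # (tgt_len, batch, d_model)

    out = self.transformer(src_emb, tgt_emb,
                           tgt_mask=tgt_mask,
                           src_key_padding_mask=src_key_padding_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask)
    return self.generator(out)  # (tgt_len, batch, num_classes)
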
def __init__(self, batch_size=32, chpkpt=None):
    super(OptimusPrime2, self).__init__()
    self.batch_size = batch_size
    self.fc1 = nn.Linear(15873, 4096)
    self.fc2 = nn.Linear(4096, 2048)
    self.fc3 = nn.Linear(2048, 512)
    self.out3 = nn.Linear(4096, 15873)
    self.out2 = nn.Linear(2048, 4096)
    self.out1 = nn.Linear(512, 2048)
    self.dropout = nn.Dropout(0.2)
    self.pool = nn.MaxPool2d(2, 2)
    self.pos_encode = PositionalEncoding(512, max_len=100)
    self.transformer = nn.Transformer(512, 4, 2, 2, 1024)
    self.deconv1 = nn.ConvTranspose2d(
        512,
        256,
        kernel_size=5,
        padding=(0, 1),
    )
    self.adaptive_pool = nn.AdaptiveAvgPool2d((135, 103))
    self.trg = torch.ones((1, self.batch_size, 512)).cuda()

    if chpkpt is not None:
        pretrained_dict = torch.load(chpkpt)
        pretrained_dict = pretrained_dict["model_state_dict"]
        model_dict = self.state_dict()
        new_model_state_dict = OrderedDict()
        model_state_dict = pretrained_dict
        if "swt-dgx" not in socket.gethostname():
            for k, v in model_state_dict.items():
                if k.startswith("module"):
                    k = k[7:]  # remove `module.`
                new_model_state_dict[k] = v
            model_state_dict = new_model_state_dict
        # 1. filter out unnecessary keys
        pretrained_dict = {
            k: v
            for k, v in model_state_dict.items()
            if k in model_dict and 'out' not in k
        }
        # 2. overwrite entries in the existing state dict
        model_dict.update(pretrained_dict)
        self.load_state_dict(model_dict)
        for name, param in self.named_parameters():
            if name.startswith('fc'):
                param.requires_grad = False

def test_init_(self):
    model = nn.Transformer(
        d_model=16,
        nhead=4,
        num_encoder_layers=3,
        num_decoder_layers=3,
        dim_feedforward=32,
        dropout=0.0,
    )

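# A possible extension of the test above (not from the original source): run a
# forward pass through a freshly constructed module and check that the output
# keeps the (target_len, batch, d_model) shape. Assumes `torch` is imported
# alongside `nn` in the test file.
def test_forward_shape_(self):
    model = nn.Transformer(
        d_model=16,
        nhead=4,
        num_encoder_layers=3,
        num_decoder_layers=3,
        dim_feedforward=32,
        dropout=0.0,
    )
    src = torch.rand(10, 2, 16)  # (source_len, batch, d_model)
    tgt = torch.rand(7, 2, 16)   # (target_len, batch, d_model)
    out = model(src, tgt)
    assert out.shape == (7, 2, 16)
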
def __init__(self, grapheme_vocab_size, phoneme_vocab_size, grapheme_pad,
             phoneme_pad, embedding_dim=128):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.grapheme_embedding = nn.Embedding(grapheme_vocab_size,
                                           embedding_dim,
                                           padding_idx=grapheme_pad)
    self.phoneme_embedding = nn.Embedding(phoneme_vocab_size,
                                          embedding_dim,
                                          padding_idx=phoneme_pad)
    self.pos_encoding = PositionalEncoding(embedding_dim)
    self.transformer = nn.Transformer(embedding_dim,
                                      nhead=4,
                                      num_encoder_layers=4,
                                      num_decoder_layers=4,
                                      dim_feedforward=512)
    self.fc = nn.Linear(embedding_dim, phoneme_vocab_size)

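# A possible forward pass for the grapheme-to-phoneme module above (a sketch,
# not taken from the original code). Graphemes and phonemes are assumed to be
# (seq_len, batch) index tensors; the padding masks come from the pad indices
# stored on the two embeddings.
def forward(self, graphemes, phonemes):
    src = self.pos_encoding(self.grapheme_embedding(graphemes))
    tgt = self.pos_encoding(self.phoneme_embedding(phonemes))

    src_key_padding_mask = (
        graphemes == self.grapheme_embedding.padding_idx).transpose(0, 1)
    tgt_key_padding_mask = (
        phonemes == self.phoneme_embedding.padding_idx).transpose(0, 1)
    tgt_mask = self.transformer.generate_square_subsequent_mask(
        phonemes.size(0)).to(phonemes.device)

    out = self.transformer(src, tgt,
                           tgt_mask=tgt_mask,
                           src_key_padding_mask=src_key_padding_mask,
                           tgt_key_padding_mask=tgt_key_padding_mask)
    return self.fc(out)  # (phoneme_len, batch, phoneme_vocab_size)
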
def __init__(self, in_size, hidden_size, out_size, n_layers, dropout=0.1):
    super(TrfmSeq2seq, self).__init__()
    self.in_size = in_size
    self.hidden_size = hidden_size
    self.embed = nn.Embedding(in_size, hidden_size)
    self.pe = PositionalEncoding(hidden_size, dropout)
    self.trfm = nn.Transformer(d_model=hidden_size,
                               nhead=4,
                               num_encoder_layers=n_layers,
                               num_decoder_layers=n_layers,
                               dim_feedforward=hidden_size)
    self.out = nn.Linear(hidden_size, out_size)

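# A hedged sketch of an encode-and-decode pass for TrfmSeq2seq above (not from
# the original). It assumes src is a (seq_len, batch) token tensor, feeds the
# same embedded sequence to both the encoder and decoder sides of
# nn.Transformer, and assumes `import torch.nn.functional as F`.
def forward(self, src):
    embedded = self.pe(self.embed(src))            # (seq_len, batch, hidden_size)
    hidden = self.trfm(embedded, embedded)         # (seq_len, batch, hidden_size)
    return F.log_softmax(self.out(hidden), dim=2)  # (seq_len, batch, out_size)
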
def __init__(self, vocab_size, embedding_dim, hidden_size, embeddings=None,
             padding_idx=0, dropout=0.5, num_classes=3, device="cpu"):
    """
    Args:
        vocab_size: The size of the vocabulary of embeddings in the model.
        embedding_dim: The dimension of the word embeddings.
        hidden_size: The size of all the hidden layers in the network.
        embeddings: A tensor of size (vocab_size, embedding_dim) containing
            pretrained word embeddings. If None, word embeddings are
            initialised randomly. Defaults to None.
        padding_idx: The index of the padding token in the premises and
            hypotheses passed as input to the model. Defaults to 0.
        dropout: The dropout rate to use between the layers of the network.
            A dropout rate of 0 corresponds to using no dropout at all.
            Defaults to 0.5.
        num_classes: The number of classes in the output of the network.
            Defaults to 3.
        device: The name of the device on which the model is being executed.
            Defaults to 'cpu'.
    """
    super(ESIM, self).__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.hidden_size = hidden_size
    self.num_classes = num_classes
    self.dropout = dropout
    self.device = device

    if self.dropout:
        self._rnn_dropout = RNNDropout(p=self.dropout)

    self._word_embedding = nn.Embedding(self.vocab_size,
                                        self.embedding_dim,
                                        padding_idx=padding_idx,
                                        _weight=embeddings)
    self.transformer_model = nn.Transformer(d_model=self.embedding_dim,
                                            nhead=4,
                                            num_encoder_layers=3,
                                            num_decoder_layers=3)
    self._composition = nn.LSTM(self.embedding_dim,
                                self.hidden_size,
                                bidirectional=True,
                                batch_first=True)
    self._classification = nn.Sequential(
        nn.Linear(self.hidden_size * 2, self.num_classes))

def __init__(self, d_model=256, nhead=4, num_encoder_layers=2,
             dim_feedforward=1024):
    super(TransformerSim, self).__init__()
    self.tf = nn.Transformer(d_model=d_model,
                             nhead=nhead,
                             num_encoder_layers=num_encoder_layers,
                             num_decoder_layers=num_encoder_layers,
                             dim_feedforward=dim_feedforward)
    self.out_embed = nn.Embedding(3, d_model)
    self.generator = nn.Linear(d_model, 3)

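# A small usage sketch for TransformerSim above (illustrative only, not from
# the original source): the decoder side works over a 3-token vocabulary via
# out_embed, and generator maps the transformer output back to those 3 classes.
model = TransformerSim(d_model=256, nhead=4, num_encoder_layers=2,
                       dim_feedforward=1024)
memory_input = torch.rand(10, 2, 256)     # (src_len, batch, d_model) features
tgt_tokens = torch.randint(0, 3, (5, 2))  # (tgt_len, batch) token ids
tgt_mask = model.tf.generate_square_subsequent_mask(tgt_tokens.size(0))
out = model.tf(memory_input, model.out_embed(tgt_tokens), tgt_mask=tgt_mask)
logits = model.generator(out)             # (tgt_len, batch, 3)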