def __init__(self, cfg, word_vocab_size, pos_vocab_size, dep_vocab_size):
    """Transformer with three prediction heads: POS tags, dependency labels,
    and word prediction (decoder weight-tied to the token embedding)."""
    super().__init__()
    self.transformer = models.Transformer(cfg)

    # Head 1: POS-tag logits.
    self.fc1 = nn.Linear(cfg.dim, cfg.dim)
    self.activ1 = models.gelu
    self.norm1 = models.LayerNorm(cfg)
    self.decoder1 = nn.Linear(cfg.dim, pos_vocab_size)

    # Head 2: dependency-label logits.
    self.fc2 = nn.Linear(cfg.dim, cfg.dim)
    self.activ2 = models.gelu
    self.norm2 = models.LayerNorm(cfg)
    self.decoder2 = nn.Linear(cfg.dim, dep_vocab_size)

    # Head 3: word logits. The decoder shares its weight matrix with the
    # token embedding, so its output size comes from that matrix —
    # word_vocab_size itself is not consulted here.
    self.fc3 = nn.Linear(cfg.dim, cfg.dim)
    self.activ3 = models.gelu
    self.norm3 = models.LayerNorm(cfg)
    tied_weight = self.transformer.embed.tok_embed.weight
    vocab_sz, hidden_sz = tied_weight.size()
    self.decoder3 = nn.Linear(hidden_sz, vocab_sz, bias=False)
    self.decoder3.weight = tied_weight
    self.decoder3_bias = nn.Parameter(torch.zeros(vocab_sz))
def __init__(self, cfg, word_vocab_size, pos_vocab_size, dep_vocab_size):
    """Transformer2 with three heads — POS tags, dependency labels, and words.
    Every decoder is weight-tied to its matching embedding table, so all
    output sizes come from those tables (the *_vocab_size arguments are not
    consulted here)."""
    super().__init__()
    self.transformer = models.Transformer2(cfg)

    # Head 1: POS-tag logits, tied to the tag embedding.
    self.fc1 = nn.Linear(cfg.dim, cfg.dim)
    self.activ1 = models.gelu
    self.norm1 = models.LayerNorm(cfg)
    tag_w = self.transformer.embed.tag_embed.weight
    tag_vocab, tag_dim = tag_w.size()
    self.decoder1 = nn.Linear(tag_dim, tag_vocab, bias=False)
    self.decoder1.weight = tag_w
    self.decoder1_bias = nn.Parameter(torch.zeros(tag_vocab))

    # Head 2: dependency-label logits, tied to the dependency embedding.
    self.fc2 = nn.Linear(cfg.dim, cfg.dim)
    self.activ2 = models.gelu
    self.norm2 = models.LayerNorm(cfg)
    dep_w = self.transformer.embed.dep_embed.weight
    dep_vocab, dep_dim = dep_w.size()
    self.decoder2 = nn.Linear(dep_dim, dep_vocab, bias=False)
    self.decoder2.weight = dep_w
    self.decoder2_bias = nn.Parameter(torch.zeros(dep_vocab))

    # Head 3: word logits, tied to the token embedding.
    self.fc3 = nn.Linear(cfg.dim, cfg.dim)
    self.activ3 = models.gelu
    self.norm3 = models.LayerNorm(cfg)
    tok_w = self.transformer.embed.tok_embed.weight
    tok_vocab, tok_dim = tok_w.size()
    self.decoder3 = nn.Linear(tok_dim, tok_vocab, bias=False)
    self.decoder3.weight = tok_w
    self.decoder3_bias = nn.Parameter(torch.zeros(tok_vocab))
def __init__(self, cfg):
    """Transformer with a sentence-classification head, a masked-LM head
    (decoder weight-tied to the token embedding), and a per-token
    'same' head over the vocabulary."""
    super().__init__()
    self.transformer = models.Transformer(cfg)

    # Sentence-classification head (2-way).
    self.fc = nn.Linear(cfg.dim, cfg.dim)
    self.activ1 = nn.Tanh()
    self.classifier = nn.Linear(cfg.dim, 2)

    # Disabled paragraph-classification variant, kept verbatim from the
    # original as a bare string expression (it has no runtime effect).
    '''
        self.fc = nn.Linear(cfg.dim, 2)
        self.activ1 = nn.Tanh()
        self.norm1 = models.LayerNorm(cfg)
        self.drop = nn.Dropout(cfg.p_drop_hidden)
        self.classifier = nn.Linear(cfg.max_len * 2, 2)
        '''

    # Masked-LM head; decoder shares weights with the token embedding.
    self.linear = nn.Linear(cfg.dim, cfg.dim)
    self.activ2 = models.gelu
    self.norm2 = models.LayerNorm(cfg)
    tied_weight = self.transformer.embed.tok_embed.weight
    vocab_sz, hidden_sz = tied_weight.size()
    self.decoder = nn.Linear(hidden_sz, vocab_sz, bias=False)
    self.decoder.weight = tied_weight
    self.decoder_bias = nn.Parameter(torch.zeros(vocab_sz))

    # 'Same'-token head: independent projection to the full vocabulary.
    self.linear2 = nn.Linear(cfg.dim, cfg.vocab_size)
def __init__(self, cfg):
    """Transformer with a 2-way classifier head and an ALBERT-style
    factorized LM head (hidden -> embedding -> vocab), both projections
    weight-tied to the factorized token embedding."""
    super().__init__()
    self.transformer = models.Transformer(cfg)
    # Classifier head (pooled): fc + tanh, then 2-way output.
    self.fc = nn.Linear(cfg.hidden, cfg.hidden)
    self.activ1 = nn.Tanh()
    # LM head transform: linear + gelu + layer norm.
    self.linear = nn.Linear(cfg.hidden, cfg.hidden)
    self.activ2 = models.gelu
    self.norm = models.LayerNorm(cfg)
    self.classifier = nn.Linear(cfg.hidden, 2)
    # decoder is shared with embedding layer
    ## project hidden layer to embedding layer
    # tok_embed2 is an nn.Linear(embedding, hidden) upstream, so its weight
    # is (hidden, embedding); the transpose has the (embedding, hidden)
    # shape that decoder1 needs.
    embed_weight2 = self.transformer.embed.tok_embed2.weight
    n_hidden, n_embedding = embed_weight2.size()
    self.decoder1 = nn.Linear(n_hidden, n_embedding, bias=False)
    # Rebinding .data to the transposed view ties decoder1's storage to the
    # embedding projection — NOTE(review): this is .data-level sharing, not
    # a Parameter reassignment; verify the tying survives optimizer steps.
    self.decoder1.weight.data = embed_weight2.data.t()
    ## project embedding layer to vocabulary layer
    # tok_embed1 is an nn.Embedding(vocab, embedding); its weight is shared
    # directly as decoder2's Parameter (standard weight tying).
    embed_weight1 = self.transformer.embed.tok_embed1.weight
    n_vocab, n_embedding = embed_weight1.size()
    self.decoder2 = nn.Linear(n_embedding, n_vocab, bias=False)
    self.decoder2.weight = embed_weight1
    # self.tok_embed1 = nn.Embedding(cfg.vocab_size, cfg.embedding)
    # self.tok_embed2 = nn.Linear(cfg.embedding, cfg.hidden)
    # Per-vocabulary output bias (the tied decoders are bias-free).
    self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))  # NOTE(review): original comment asked "what is this for..?" — presumably the LM output bias; confirm against the forward pass
def __init__(self, cfg):
    """Transformer with a single word-prediction head whose decoder is
    weight-tied to the token embedding."""
    super().__init__()
    self.transformer = models.Transformer(cfg)

    # Word-logits head: linear + gelu + layer norm, then tied decoder.
    self.fc3 = nn.Linear(cfg.dim, cfg.dim)
    self.activ3 = models.gelu
    self.norm3 = models.LayerNorm(cfg)
    tied_weight = self.transformer.embed.tok_embed.weight
    vocab_sz, hidden_sz = tied_weight.size()
    self.decoder3 = nn.Linear(hidden_sz, vocab_sz, bias=False)
    self.decoder3.weight = tied_weight
    self.decoder3_bias = nn.Parameter(torch.zeros(vocab_sz))
def __init__(self, cfg):
    """Transformer with a 2-way classifier head and a per-token scalar
    discriminator head (ELECTRA-style replaced-token detection shape)."""
    super().__init__()
    self.transformer = models.Transformer(cfg)

    # Classifier head: fc + tanh, then 2-way output.
    self.fc = nn.Linear(cfg.hidden, cfg.hidden)
    self.activ1 = nn.Tanh()
    # Token-level transform: linear + gelu + layer norm.
    self.linear = nn.Linear(cfg.hidden, cfg.hidden)
    self.activ2 = models.gelu
    self.norm = models.LayerNorm(cfg)
    self.classifier = nn.Linear(cfg.hidden, 2)

    # Scalar-per-token discriminator (no bias).
    self.discriminator = nn.Linear(cfg.hidden, 1, bias=False)
def __init__(self, cfg):
    """Transformer with a 2-way classifier head and a masked-LM head whose
    decoder is weight-tied to the token embedding."""
    super().__init__()
    self.transformer = models.Transformer(cfg)

    # Classifier head: fc + tanh, then 2-way output.
    self.fc = nn.Linear(cfg.dim, cfg.dim)
    self.activ1 = nn.Tanh()
    # LM head transform: linear + gelu + layer norm.
    self.linear = nn.Linear(cfg.dim, cfg.dim)
    self.activ2 = models.gelu
    self.norm = models.LayerNorm(cfg)
    self.classifier = nn.Linear(cfg.dim, 2)

    # LM decoder shares its weight matrix with the token embedding;
    # a separate trainable bias covers the vocabulary.
    tied_weight = self.transformer.embed.tok_embed.weight
    vocab_sz, hidden_sz = tied_weight.size()
    self.decoder = nn.Linear(hidden_sz, vocab_sz, bias=False)
    self.decoder.weight = tied_weight
    self.decoder_bias = nn.Parameter(torch.zeros(vocab_sz))