def __init__(self, config):
    super(GPT2Model, self).__init__()
    self.embed_dim = config.hidden_size
    # Token embeddings and learned absolute position embeddings.
    self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
    self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
    self.drop = nn.Dropout(config.embd_pdrop)
    # Stack of transformer blocks, followed by a final layer norm.
    self.h = nn.ModuleList(
        [GPT2Block(config) for _ in range(config.num_hidden_layers)])
    self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
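# A minimal instantiation sketch, not from the source: SimpleNamespace stands
# in for the real config class, and the field values are assumptions chosen to
# match common GPT-2 small settings. Only the attribute names are taken from
# the __init__ above.
from types import SimpleNamespace

gpt2_config = SimpleNamespace(
    vocab_size=50257,
    hidden_size=768,
    max_position_embeddings=1024,
    embd_pdrop=0.1,
    num_hidden_layers=12,
    layer_norm_epsilon=1e-5,
)
# model = GPT2Model(gpt2_config)  # assumes the surrounding class definition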
def __init__(self, config) -> None:
    super().__init__()
    # Token, position, and segment (token-type) embeddings.
    self.token_embeddings = nn.Embedding(config.vocab_size,
                                         config.hidden_size,
                                         padding_idx=0)
    self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                            config.hidden_size)
    self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                              config.hidden_size)
    # nn.LayerNorm takes the keyword `eps`, not `epsilon`.
    self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
def __init__(
    self,
    word_emb_dim,
    vocab_size,
    dim_channel,
    kernel_wins,
    dropout_rate,
    num_class,
    max_seq_len,
    training=True,
):
    super(textCNN, self).__init__()
    self.embed = nn.Embedding(vocab_size, word_emb_dim)
    # One Conv2d per kernel window size; each kernel spans the full embedding
    # dimension and produces dim_channel feature maps.
    self.convs = nn.ModuleList([
        nn.Conv2d(1, dim_channel, (w, word_emb_dim)) for w in kernel_wins
    ])
    # Max-pool each feature map over all valid window positions.
    self.maxpool = nn.ModuleList([
        nn.MaxPool2d((max_seq_len - w + 1, 1), stride=1) for w in kernel_wins
    ])
    # Dropout layer
    self.dropout = nn.Dropout(dropout_rate)
    # Note: this overrides nn.Module's built-in `training` flag, which is
    # normally managed by train()/eval().
    self.training = training
    # FC layer
    self.fc = nn.Linear(len(kernel_wins) * dim_channel, num_class)
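# A hedged forward-pass sketch for the textCNN above; the method itself is not
# in the source. Each Conv2d slides a (w, word_emb_dim) window over the
# embedded sequence, max-pooling keeps the strongest response per channel, and
# the pooled features from all window sizes are concatenated before the FC
# layer. Shapes assume input x of shape (batch, max_seq_len).
import torch
import torch.nn.functional as F

def forward(self, x):
    emb = self.embed(x).unsqueeze(1)            # (batch, 1, seq_len, emb_dim)
    conv_outs = [F.relu(conv(emb)) for conv in self.convs]
    pooled = [pool(c).squeeze(-1).squeeze(-1)   # (batch, dim_channel)
              for pool, c in zip(self.maxpool, conv_outs)]
    feat = torch.cat(pooled, dim=1)             # (batch, len(kernel_wins) * dim_channel)
    feat = self.dropout(feat)
    return self.fc(feat)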
def __init__(
    self,
    max_position_embeddings,
    hidden_size,
    nheads,
    dropout=0,
    position_embedding_type="absolute",
    is_decoder=False,
):
    super(BertSelfAttention, self).__init__()
    if hidden_size % nheads != 0:
        raise ValueError(
            f"The hidden size ({hidden_size}) is not a multiple of the number "
            f"of attention heads ({nheads})")
    self.num_attention_heads = nheads
    self.attention_head_size = hidden_size // nheads
    self.all_head_size = self.num_attention_heads * self.attention_head_size
    # Linear projections for queries, keys, and values.
    self.query = nn.Linear(hidden_size, self.all_head_size)
    self.key = nn.Linear(hidden_size, self.all_head_size)
    self.value = nn.Linear(hidden_size, self.all_head_size)
    self.dropout = nn.Dropout(dropout)
    self.position_embedding_type = position_embedding_type
    if self.position_embedding_type in ("relative_key", "relative_key_query"):
        # Relative position embeddings cover distances in
        # [-(max_position_embeddings - 1), max_position_embeddings - 1].
        self.max_position_embeddings = max_position_embeddings
        self.distance_embedding = nn.Embedding(
            2 * max_position_embeddings - 1, self.attention_head_size)
    self.is_decoder = is_decoder
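# A hedged sketch, mirroring the usual BERT pattern, of how the query/key/value
# projections above are reshaped into per-head tensors. The helper name
# transpose_for_scores is assumed, not taken from the source.
def transpose_for_scores(self, x):
    # (batch, seq_len, all_head_size) -> (batch, heads, seq_len, head_size)
    new_shape = x.size()[:-1] + (self.num_attention_heads,
                                 self.attention_head_size)
    return x.view(new_shape).permute(0, 2, 1, 3)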
def __init__(self, params):
    super(RecurrentLanguageModel, self).__init__(params)
    self.model_type = "recurrent_lm"
    self.vocab_size = params["vocab_size"]
    self.share_embedding = params["share_embedding"]
    self.smoothing = params["smoothing"]
    self.num_layers = params["num_layers"]
    self.hidden_size = params["hidden_size"]
    self.embedding = nn.Embedding(params["vocab_size"], params["hidden_size"])
    self.rnn = nn.LSTM(
        input_size=params["hidden_size"],
        hidden_size=params["hidden_size"],
        num_layers=params["num_layers"],
        batch_first=True,
        dropout=params["dropout"],
        bidirectional=False,
    )
    self.output_project = nn.Linear(params["hidden_size"], params["vocab_size"])
    if self.share_embedding:
        assert self.embedding.weight.size() == self.output_project.weight.size()
        self.output_project.weight = self.embedding.weight
    self.crit = LabelSmoothingLoss(size=self.vocab_size,
                                   smoothing=self.smoothing,
                                   padding_idx=PAD)
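# A minimal params sketch for the recurrent LM above. The keys are exactly the
# ones this __init__ reads; the values are assumptions, and the parent class
# (reached via super().__init__(params)) may require additional keys.
lm_params = {
    "vocab_size": 32000,
    "share_embedding": True,
    "smoothing": 0.1,
    "num_layers": 2,
    "hidden_size": 512,
    "dropout": 0.1,
}
# lm = RecurrentLanguageModel(lm_params)  # assumes the surrounding class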
def __init__(
    self,
    vocab_size,
    d_model=256,
    n_heads=4,
    d_ff=2048,
    memory_dim=256,
    n_blocks=6,
    pos_dropout=0.0,
    slf_attn_dropout=0.0,
    src_attn_dropout=0.0,
    ffn_dropout=0.0,
    residual_dropout=0.1,
    activation="relu",
    normalize_before=True,
    concat_after=False,
    share_embedding=False,
):
    super(TransformerDecoder, self).__init__()
    self.decoder_type = "transformer"
    self.normalize_before = normalize_before
    self.relative_positional = False
    self.d_model = d_model
    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_emb = PositionalEncoding(d_model, pos_dropout)
    self.blocks = nn.ModuleList([
        TransformerDecoderLayer(
            n_heads,
            d_model,
            d_ff,
            memory_dim,
            slf_attn_dropout,
            src_attn_dropout,
            ffn_dropout,
            residual_dropout,
            normalize_before=normalize_before,
            concat_after=concat_after,
            relative_positional=False,
            activation=activation,
        ) for _ in range(n_blocks)
    ])
    if self.normalize_before:
        self.after_norm = nn.LayerNorm(d_model)
    self.output_layer = nn.Linear(d_model, vocab_size)
    if share_embedding:
        assert self.embedding.weight.size() == self.output_layer.weight.size()
        self.output_layer.weight = self.embedding.weight
        logger.info("Tie the weights between the embedding and output layer.")
def __init__(
    self,
    vocab_size,
    type_vocab_size,
    max_position_embeddings,
    hidden_size,
    hidden_dropout_prob,
    seq_length,
):
    super().__init__()
    self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
    self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
    self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
    self.LayerNorm = nn.LayerNorm(hidden_size)
    self.dropout = nn.Dropout(hidden_dropout_prob, inplace=True)
    # Precomputed position ids, saved as a (1, max_position_embeddings) buffer.
    self.register_buffer(
        "position_ids", flow.arange(max_position_embeddings).unsqueeze(0)
    )
    self.seq_length = seq_length
def __init__(self, emb_sz, emb_dim, hidden_size, nfc, n_classes, num_layers=1):
    super(LSTMText, self).__init__()
    self.emb_sz = emb_sz
    self.emb_dim = emb_dim
    self.n_classes = n_classes
    self.hidden_size = hidden_size
    self.nfc = nfc
    self.num_layers = num_layers
    self.bilstm = BiLSTM(emb_dim, hidden_size, num_layers)
    self.embedding = nn.Embedding(self.emb_sz, self.emb_dim)
    self.linear = nn.Linear(hidden_size * 2 * nfc, n_classes)
    self.softmax = nn.Softmax(dim=1)
def __init__(
    self,
    sos_id,
    eos_id,
    n_tgt_vocab,
    d_word_vec,
    n_layers,
    n_head,
    d_k,
    d_v,
    d_model,
    d_inner,
    dropout=0.1,
    tgt_emb_prj_weight_sharing=True,
    pe_maxlen=5000,
):
    super(Decoder, self).__init__()
    # parameters
    self.sos_id = sos_id
    self.eos_id = eos_id
    self.n_tgt_vocab = n_tgt_vocab
    self.d_word_vec = d_word_vec
    self.n_layers = n_layers
    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v
    self.d_model = d_model
    self.d_inner = d_inner
    # Store the rate under its own name so it is not shadowed by the
    # nn.Dropout module assigned below.
    self.dropout_rate = dropout
    self.tgt_emb_prj_weight_sharing = tgt_emb_prj_weight_sharing
    self.pe_maxlen = pe_maxlen
    self.tgt_word_emb = nn.Embedding(n_tgt_vocab, d_word_vec)
    self.positional_encoding = PositionalEncoding(d_model, max_len=pe_maxlen)
    self.dropout = nn.Dropout(dropout)
    self.layer_stack = nn.ModuleList([
        DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
        for _ in range(n_layers)
    ])
    self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
    nn.init.xavier_normal_(self.tgt_word_prj.weight)
    if tgt_emb_prj_weight_sharing:
        # Share the weight matrix between the target word embedding and
        # the final logit dense layer.
        self.tgt_word_prj.weight = self.tgt_word_emb.weight
        self.x_logit_scale = d_model ** 0.5
    else:
        self.x_logit_scale = 1.0
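# A hedged sketch of how x_logit_scale is conventionally applied at the output;
# the forward step is not shown in the source and the helper name
# project_to_logits is hypothetical. With tied embedding/projection weights,
# the logits are scaled by sqrt(d_model) to compensate for the shared
# parameterization.
def project_to_logits(self, dec_output):
    return self.tgt_word_prj(dec_output) * self.x_logit_scale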
def __init__(
    self,
    vocab_size,
    max_position_embeddings,
    type_vocab_size,
    hidden_size,
    layer_norm_eps=1e-5,
    dropout=0,
    pad_token_id=0,
    position_embedding_type="absolute",
):
    super(BertEmbeddings, self).__init__()
    self.word_embeddings = nn.Embedding(vocab_size,
                                        hidden_size,
                                        padding_idx=pad_token_id)
    self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
    self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
    self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
    self.dropout = nn.Dropout(dropout)
    # position_ids (1, len position emb) is contiguous in memory and exported
    # when serialized.
    self.position_embedding_type = position_embedding_type
    self.register_buffer(
        "position_ids", flow.arange(max_position_embeddings).expand((1, -1)))
    self.register_buffer(
        "token_type_ids",
        flow.zeros(
            self.position_ids.size(),
            dtype=flow.int64,
            device=self.position_ids.device,
        ),
        persistent=False,
    )
    self.padding_idx = pad_token_id
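# A hedged forward sketch for the BERT-style embeddings above; the source only
# shows __init__, and this assumes position_embedding_type == "absolute" so the
# position embeddings are simply added. Falls back to the registered
# position_ids / token_type_ids buffers when the caller passes none.
def forward(self, input_ids, token_type_ids=None, position_ids=None):
    seq_length = input_ids.size(1)
    if position_ids is None:
        position_ids = self.position_ids[:, :seq_length]
    if token_type_ids is None:
        token_type_ids = self.token_type_ids[:, :seq_length].expand_as(input_ids)
    embeddings = (self.word_embeddings(input_ids)
                  + self.position_embeddings(position_ids)
                  + self.token_type_embeddings(token_type_ids))
    return self.dropout(self.LayerNorm(embeddings))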
def __init__(self, params):
    super(TransformerLanguageModel, self).__init__(params)
    self.model_type = "transformer_lm"
    self.normalize_before = False
    self.smoothing = params["smoothing"]
    self.vocab_size = params["vocab_size"]
    self.num_blocks = params["num_blocks"]
    self.embedding = nn.Embedding(self.vocab_size, params["d_model"])
    self.pos_embedding = PositionalEncoding(params["d_model"], 0.0)
    self.blocks = nn.ModuleList([
        TransformerEncoderLayer(
            params["n_heads"],
            params["d_model"],
            params["d_ff"],
            slf_attn_dropout=0.0,
            ffn_dropout=0.0,
            residual_dropout=params["residual_dropout"],
            normalize_before=False,
            concat_after=False,
            activation="glu",
        ) for _ in range(self.num_blocks)
    ])
    if self.normalize_before:
        self.after_norm = nn.LayerNorm(params["d_model"])
    self.output_project = nn.Linear(params["d_model"], self.vocab_size)
    if params["share_embedding"]:
        self.output_project.weight = self.embedding.weight
        print("Share the weight of embedding to the output project layer!")
    self.crit = LabelSmoothingLoss(size=self.vocab_size,
                                   smoothing=self.smoothing,
                                   padding_idx=PAD)
def __init__(self, vocab, d_model):
    super(Embeddings, self).__init__()
    self.lut = nn.Embedding(vocab, d_model)
    self.d_model = d_model
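# A hedged forward sketch for the Embeddings module above (not in the source):
# storing d_model alongside the lookup table suggests the standard Transformer
# convention of scaling embeddings by sqrt(d_model) before positional encoding.
import math

def forward(self, x):
    return self.lut(x) * math.sqrt(self.d_model)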