import torch
import torch.nn as nn

# Repo-local building blocks (Embedding, LearnedPositionalEmbedding,
# TransformerLayer, LayerNorm, SelfAttentionMask, WordProbLayer, LabelSmoothing)
# are assumed to be importable from the project's own modules.

# BERT-style masked LM with an optional adaptive-softmax output approximation.
def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers, approx):
    super(BERTLM, self).__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    # Token, learned-position, and segment (sentence A/B) embeddings.
    self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
    self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
    self.seg_embed = Embedding(2, embed_dim, None)
    # Bias term for the vocabulary output projection (the matching weight is defined elsewhere).
    self.out_proj_bias = nn.Parameter(torch.Tensor(self.vocab.size))
    # Stack of Transformer encoder layers.
    self.layers = nn.ModuleList()
    for i in range(layers):
        self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
    self.emb_layer_norm = LayerNorm(embed_dim)
    # MLM head and next-sentence-prediction head.
    self.one_more = nn.Linear(embed_dim, embed_dim)
    self.one_more_layer_norm = LayerNorm(embed_dim)
    self.one_more_nxt_snt = nn.Linear(embed_dim, embed_dim)
    self.nxt_snt_pred = nn.Linear(embed_dim, 1)
    self.dropout = dropout
    self.device = local_rank
    if approx == "none":
        self.approx = None
    elif approx == "adaptive":
        self.approx = nn.AdaptiveLogSoftmaxWithLoss(self.embed_dim, self.vocab.size, [10000, 20000, 200000])
    else:
        raise NotImplementedError("%s has not been implemented" % approx)
    self.reset_parameters()
# BERTLM variant without the adaptive-softmax option; otherwise the same as the
# constructor above, but using torch's built-in nn.LayerNorm.
def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers):
    super(BERTLM, self).__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
    self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
    self.seg_embed = Embedding(2, embed_dim, None)
    self.out_proj_bias = nn.Parameter(torch.Tensor(self.vocab.size))
    self.layers = nn.ModuleList()
    for i in range(layers):
        self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
    self.emb_layer_norm = nn.LayerNorm(embed_dim)
    self.one_more = nn.Linear(embed_dim, embed_dim)
    self.one_more_layer_norm = nn.LayerNorm(embed_dim)
    self.one_more_nxt_snt = nn.Linear(embed_dim, embed_dim)
    self.nxt_snt_pred = nn.Linear(embed_dim, 1)
    self.dropout = dropout
    self.device = local_rank
    self.reset_parameters()
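# Hedged usage sketch (not from the repo): constructing the second BERTLM variant
# above. `SimpleVocab` is a hypothetical stand-in -- __init__ only reads `.size`
# and `.padding_idx` from the vocab object -- and `local_rank` doubles as the
# device id. Sizes are the standard BERT-base values, purely illustrative.
class SimpleVocab:
    size = 30000
    padding_idx = 0

bert = BERTLM(local_rank=0, vocab=SimpleVocab(), embed_dim=768,
              ff_embed_dim=3072, num_heads=12, dropout=0.1, layers=12)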
# Transformer encoder-decoder model; all hyperparameters arrive through the
# `consts` and `options` dictionaries (see the sketch after this constructor).
def __init__(self, modules, consts, options):
    super(Model, self).__init__()
    # Behavioural flags.
    self.has_learnable_w2v = options["has_learnable_w2v"]
    self.is_predicting = options["is_predicting"]
    self.is_bidirectional = options["is_bidirectional"]
    self.beam_decoding = options["beam_decoding"]
    self.cell = options["cell"]
    self.device = options["device"]
    self.copy = options["copy"]
    self.coverage = options["coverage"]
    self.avg_nll = options["avg_nll"]
    # Sizes and hyperparameters.
    self.dim_x = consts["dim_x"]
    self.dim_y = consts["dim_y"]
    self.len_x = consts["len_x"]
    self.len_y = consts["len_y"]
    self.hidden_size = consts["hidden_size"]
    self.dict_size = consts["dict_size"]
    self.pad_token_idx = consts["pad_token_idx"]
    self.ctx_size = self.hidden_size * 2 if self.is_bidirectional else self.hidden_size
    self.num_layers = consts["num_layers"]
    self.d_ff = consts["d_ff"]
    self.num_heads = consts["num_heads"]
    self.dropout = consts["dropout"]
    self.smoothing_factor = consts["label_smoothing"]
    # Embeddings.
    self.tok_embed = nn.Embedding(self.dict_size, self.dim_x, self.pad_token_idx)
    self.pos_embed = LearnedPositionalEmbedding(self.dim_x, device=self.device)
    # Encoder stack.
    self.enc_layers = nn.ModuleList()
    for i in range(self.num_layers):
        self.enc_layers.append(TransformerLayer(self.dim_x, self.d_ff, self.num_heads, self.dropout))
    # Decoder stack; with_external=True lets each layer attend to the encoder output.
    self.dec_layers = nn.ModuleList()
    for i in range(self.num_layers):
        self.dec_layers.append(TransformerLayer(self.dim_x, self.d_ff, self.num_heads, self.dropout, with_external=True))
    self.attn_mask = SelfAttentionMask(device=self.device)
    self.emb_layer_norm = LayerNorm(self.dim_x)
    # Output distribution (with optional copy/coverage) and label smoothing.
    self.word_prob = WordProbLayer(self.hidden_size, self.dict_size, self.device, self.copy, self.coverage, self.dropout)
    self.smoothing = LabelSmoothing(self.device, self.dict_size, self.pad_token_idx, self.smoothing_factor)
    self.init_weights()
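# Hedged sketch (values are illustrative, not from the repo) of the dictionaries
# the Model constructor above expects. Every key below is read verbatim in
# __init__; `modules` is never referenced in the code shown.
consts = {
    "dim_x": 512, "dim_y": 512, "len_x": 400, "len_y": 100,
    "hidden_size": 512, "dict_size": 50000, "pad_token_idx": 0,
    "num_layers": 6, "d_ff": 2048, "num_heads": 8,
    "dropout": 0.1, "label_smoothing": 0.1,
}
options = {
    "has_learnable_w2v": True, "is_predicting": False, "is_bidirectional": True,
    "beam_decoding": True, "cell": "transformer", "device": "cuda:0",
    "copy": True, "coverage": True, "avg_nll": True,
}
model = Model(modules=None, consts=consts, options=options)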
# Autoregressive LM (BIGLM) with label smoothing and an optional adaptive-softmax
# output approximation selected by the `approx` string.
def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers, smoothing_factor, approx):
    super(BIGLM, self).__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
    self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
    self.layers = nn.ModuleList()
    for i in range(layers):
        self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout, with_external=True))
    self.emb_layer_norm = LayerNorm(embed_dim)
    self.one_more = nn.Linear(embed_dim, embed_dim)
    self.one_more_layer_norm = LayerNorm(embed_dim)
    self.out_proj = nn.Linear(embed_dim, self.vocab.size)
    # Self-attention mask for autoregressive decoding.
    self.attn_mask = SelfAttentionMask(device=local_rank)
    self.smoothing = LabelSmoothing(local_rank, self.vocab.size, self.vocab.padding_idx, smoothing_factor)
    self.dropout = dropout
    self.device = local_rank
    if approx == "none":
        self.approx = None
    elif approx == "adaptive":
        self.approx = nn.AdaptiveLogSoftmaxWithLoss(self.embed_dim, self.vocab.size, [10000, 20000, 200000])
    else:
        raise NotImplementedError("%s has not been implemented" % approx)
    self.reset_parameters()
# BIGLM variant that takes a pre-built `approx` module (or None) directly
# instead of selecting one from a string.
def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers, smoothing_factor, approx=None):
    super(BIGLM, self).__init__()
    self.vocab = vocab
    self.embed_dim = embed_dim
    self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
    self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
    self.layers = nn.ModuleList()
    for i in range(layers):
        self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout, with_external=True))
    self.emb_layer_norm = LayerNorm(embed_dim)
    self.one_more = nn.Linear(embed_dim, embed_dim)
    self.one_more_layer_norm = LayerNorm(embed_dim)
    self.out_proj = nn.Linear(embed_dim, self.vocab.size)
    self.attn_mask = SelfAttentionMask(device=local_rank)
    self.smoothing = LabelSmoothing(local_rank, self.vocab.size, self.vocab.padding_idx, smoothing_factor)
    self.dropout = dropout
    self.device = local_rank
    self.approx = approx
    self.reset_parameters()
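# Hedged, self-contained demo (plain PyTorch, illustrative sizes) of what the
# "adaptive" branch builds. nn.AdaptiveLogSoftmaxWithLoss partitions the
# vocabulary into frequency clusters at the given cutoffs, so the
# [10000, 20000, 200000] cutoffs used above assume vocab.size > 200000.
import torch
import torch.nn as nn

embed_dim, vocab_size = 64, 250000
approx = nn.AdaptiveLogSoftmaxWithLoss(embed_dim, vocab_size, [10000, 20000, 200000])

hidden = torch.randn(8, embed_dim)             # final hidden states, one per token
targets = torch.randint(0, vocab_size, (8,))   # gold token ids
out = approx(hidden, targets)                  # namedtuple: (output=per-token log-probs, loss=mean NLL)
print(out.loss)
log_probs = approx.log_prob(hidden)            # full (8, vocab_size) log-distribution if needed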