Example #1
    def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers, smoothing_factor, approx):
        super(BIGLM, self).__init__()
        self.vocab = vocab
        self.embed_dim = embed_dim

        self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
        self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
        
        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout, with_external=True))
        self.emb_layer_norm = LayerNorm(embed_dim)
        self.one_more = nn.Linear(embed_dim, embed_dim)
        self.one_more_layer_norm = LayerNorm(embed_dim)
        self.out_proj = nn.Linear(embed_dim, self.vocab.size)
        
        self.attn_mask = SelfAttentionMask(device=local_rank)
        self.smoothing = LabelSmoothing(local_rank, self.vocab.size, self.vocab.padding_idx, smoothing_factor)
       
        self.dropout = dropout
        self.device = local_rank

        if approx == "none":
            self.approx = None
        elif approx == "adaptive":
            self.approx = nn.AdaptiveLogSoftmaxWithLoss(self.embed_dim, self.vocab.size, [10000, 20000, 200000])
        else:
            raise NotImplementedError("%s has not been implemented" % approx)
        self.reset_parameters()
Example #2
    def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads, dropout, layers, approx):
        super(BERTLM, self).__init__()
        self.vocab = vocab
        self.embed_dim = embed_dim
        self.tok_embed = Embedding(self.vocab.size, embed_dim, self.vocab.padding_idx)
        self.pos_embed = LearnedPositionalEmbedding(embed_dim, device=local_rank)
        self.seg_embed = Embedding(2, embed_dim, None)

        self.out_proj_bias = nn.Parameter(torch.Tensor(self.vocab.size))

        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(TransformerLayer(embed_dim, ff_embed_dim, num_heads, dropout))
        self.emb_layer_norm = LayerNorm(embed_dim)
        self.one_more = nn.Linear(embed_dim, embed_dim)
        self.one_more_layer_norm = LayerNorm(embed_dim)
        self.one_more_nxt_snt = nn.Linear(embed_dim, embed_dim) 
        self.nxt_snt_pred = nn.Linear(embed_dim, 1)
        self.dropout = dropout
        self.device = local_rank

        if approx == "none":
            self.approx = None
        elif approx == "adaptive":
            self.approx = nn.AdaptiveLogSoftmaxWithLoss(self.embed_dim, self.vocab.size, [10000, 20000, 200000])
        else:
            raise NotImplementedError("%s has not been implemented" % approx)
        self.reset_parameters()
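
Both examples above plug in nn.AdaptiveLogSoftmaxWithLoss when approx == "adaptive". For reference, a minimal self-contained sketch of how such a head is called later on (the sizes are hypothetical; only the call pattern and the returned fields matter):

import torch
import torch.nn as nn

# Hypothetical sizes; the cutoffs mirror the ones used in the examples above.
embed_dim, vocab_size = 512, 250000
approx = nn.AdaptiveLogSoftmaxWithLoss(embed_dim, vocab_size,
                                       cutoffs=[10000, 20000, 200000])

hidden = torch.randn(32, embed_dim)            # final hidden states, (batch, embed_dim)
target = torch.randint(0, vocab_size, (32,))   # gold token ids, (batch,)
result = approx(hidden, target)                # namedtuple with .output and .loss
result.loss.backward()                         # mean negative log-likelihood over the batch
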
Example #3
    def __init__(self, dim, heads, max_len):
        super().__init__()
        self.attention = Attention(dim, heads, max_len)
        self.norm1 = LayerNorm(dim)
        self.ff = feedforward(dim, heads, max_len)
        self.norm2 = LayerNorm(dim)
        self.drop = nn.Dropout(0.1)
Example #4
    def __init__(self, embed_dim, ff_embed_dim, num_heads, dropout, with_external=False, weights_dropout=True):
        super(TransformerLayer, self).__init__()
        self.self_attn = MultiheadAttention(embed_dim, num_heads, dropout, weights_dropout)
        self.fc1 = nn.Linear(embed_dim, ff_embed_dim)
        self.fc2 = nn.Linear(ff_embed_dim, embed_dim)
        self.attn_layer_norm = LayerNorm(embed_dim)
        self.ff_layer_norm = LayerNorm(embed_dim)
        self.with_external = with_external
        self.dropout = dropout
        if self.with_external:
            self.external_attn = MultiheadAttention(embed_dim, num_heads, dropout, weights_dropout)
            self.external_layer_norm = LayerNorm(embed_dim)
        self.reset_parameters()
Example #5
    def __init__(self, opt, padding_idx4item=0, padding_idx4prefer=0):
        super().__init__()  # self.pad_idx, self.start_idx, self.end_idx)
        self.batch_size = opt['batch_size']
        self.max_length = opt['max_length']
        self.dropout = opt['dropout']
        self.num_layers = 2  #opt['num_layers']
        self.vocab_size = opt['vocab_size']
        self.user_size = opt['user_size']
        self.dim = opt['dim']
        self.embedding_size = opt['embedding_size']

        self.pad_idx4item = padding_idx4item
        self.pad_idx4prefer = padding_idx4prefer

        self.embeddings = _create_embeddings(self.vocab_size,
                                             self.embedding_size,
                                             self.pad_idx4item)
        self.user_embeddings = _create_embeddings(self.user_size,
                                                  self.embedding_size,
                                                  self.pad_idx4item)
        self.position_embeddings = nn.Embedding(opt['max_length'], opt['dim'])
        self.LayerNorm = LayerNorm(opt['dim'], eps=1e-12)
        self.dropout = nn.Dropout(opt['dropout'])

        opt['num_layers'] = 2

        self.SAS_encoder = Encoder(opt)
        self.prefer_SAS_encoder = Encoder(opt)
        self.neg_SAS_encoder = Encoder(opt)

        self.item_norm = nn.Linear(opt['dim'], opt['dim'])

        self.criterion = nn.BCELoss()
        self.cs_loss = nn.CrossEntropyLoss()
Example #6
    def __init__(self, dim, vocab_size, max_len, n_segs):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dim)
        self.embedpos = nn.Embedding(max_len, dim)
        self.segembed = nn.Embedding(n_segs, dim)
        self.norm = LayerNorm(dim)
        self.drop = nn.Dropout(0.1)
Example #7
    def __init__(self, model, hyper_config):
        super(standard_layernorm, self).__init__()

        if torch.cuda.is_available():
            self.dtype = torch.cuda.FloatTensor
        else:
            self.dtype = torch.FloatTensor

        self.z_size = model.z_size
        self.x_size = model.x_size
        self.act_func = model.act_func

        #Encoder
        self.encoder_weights = []
        self.layer_norms = []
        for i in range(len(hyper_config['encoder_arch'])):
            self.encoder_weights.append(
                nn.Linear(hyper_config['encoder_arch'][i][0],
                          hyper_config['encoder_arch'][i][1]))

            if i != len(hyper_config['encoder_arch']) - 1:
                self.layer_norms.append(
                    LayerNorm(hyper_config['encoder_arch'][i][1]))

        count = 1
        for i in range(len(self.encoder_weights)):
            self.add_module(str(count), self.encoder_weights[i])
            count += 1

            if i != len(hyper_config['encoder_arch']) - 1:
                self.add_module(str(count), self.layer_norms[i])
                count += 1
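
The example above keeps its nn.Linear and LayerNorm layers in plain Python lists and registers each one by hand with add_module and a running counter. For comparison, a sketch of the more idiomatic nn.ModuleList route (hypothetical encoder_arch, with torch.nn.LayerNorm standing in for the project's own LayerNorm class):

import torch.nn as nn

# Hypothetical architecture: one (in_features, out_features) pair per encoder layer.
encoder_arch = [(784, 200), (200, 200), (200, 40)]

# nn.ModuleList registers every sub-module (and its parameters) automatically,
# so no manual add_module/counter bookkeeping is needed.
encoder_weights = nn.ModuleList([nn.Linear(i, o) for i, o in encoder_arch])
layer_norms = nn.ModuleList([nn.LayerNorm(o) for _, o in encoder_arch[:-1]])
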
Example #8
    def __init__(self,
                 local_rank,
                 vocab,
                 embed_dim,
                 ff_embed_dim,
                 num_heads,
                 dropout,
                 layers,
                 smoothing_factor,
                 approx=None):
        super(BIGLM, self).__init__()
        self.vocab = vocab
        self.embed_dim = embed_dim

        self.tok_embed = Embedding(self.vocab.size, embed_dim,
                                   self.vocab.padding_idx)
        self.pos_embed = LearnedPositionalEmbedding(embed_dim,
                                                    device=local_rank)

        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(
                TransformerLayer(embed_dim,
                                 ff_embed_dim,
                                 num_heads,
                                 dropout,
                                 with_external=True))
        self.emb_layer_norm = LayerNorm(embed_dim)
        self.one_more = nn.Linear(embed_dim, embed_dim)
        self.one_more_layer_norm = LayerNorm(embed_dim)
        self.out_proj = nn.Linear(embed_dim, self.vocab.size)

        self.attn_mask = SelfAttentionMask(device=local_rank)
        self.smoothing = LabelSmoothing(local_rank, self.vocab.size,
                                        self.vocab.padding_idx,
                                        smoothing_factor)

        self.dropout = dropout
        self.device = local_rank

        self.approx = approx
        self.reset_parameters()
Example #9
    def __init__(self, dim, heads, max_len, n_seg):
        super().__init__()
        self.allenc = AllEncode(dim, heads, max_len, n_seg)
        self.fc1 = nn.Linear(dim, dim)
        self.tanh = nn.Tanh()
        self.fc2 = nn.Linear(dim, 2)
        self.norm = LayerNorm(dim)
        embed_weight = self.allenc.embed.embed.weight
        n_vocab, n_dim = embed_weight.size()
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        self.linear = nn.Linear(dim, dim)
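
Example #9 ties its output projection to the token embedding by assigning embed_weight to self.decoder.weight. A minimal self-contained illustration of that weight-sharing pattern, using plain nn.Embedding/nn.Linear and hypothetical sizes:

import torch
import torch.nn as nn

vocab_size, dim = 1000, 64                     # hypothetical sizes
embed = nn.Embedding(vocab_size, dim)
decoder = nn.Linear(dim, vocab_size, bias=False)
decoder.weight = embed.weight                  # both modules now share one Parameter

tokens = torch.randint(0, vocab_size, (2, 5))
logits = decoder(embed(tokens))                # shape (2, 5, vocab_size)
assert decoder.weight.data_ptr() == embed.weight.data_ptr()
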
Example #10
    def __init__(self, hyper_config, seed=1):
        super(VAE, self).__init__()

        torch.manual_seed(seed)


        self.z_size = hyper_config['z_size']
        self.x_size = hyper_config['x_size']
        self.act_func = hyper_config['act_func']

        self.q_dist = hyper_config['q_dist'](self, hyper_config=hyper_config)

        if torch.cuda.is_available():
            self.dtype = torch.cuda.FloatTensor
            self.q_dist.cuda()
        else:
            self.dtype = torch.FloatTensor
            

        #Decoder
        self.decoder_weights = []
        self.layer_norms = []
        for i in range(len(hyper_config['decoder_arch'])):
            self.decoder_weights.append(
                nn.Linear(hyper_config['decoder_arch'][i][0],
                          hyper_config['decoder_arch'][i][1]))

            if i != len(hyper_config['decoder_arch']) - 1:
                self.layer_norms.append(
                    LayerNorm(hyper_config['decoder_arch'][i][1]))

        count = 1
        for i in range(len(self.decoder_weights)):
            self.add_module(str(count), self.decoder_weights[i])
            count += 1

            if i != len(hyper_config['decoder_arch']) - 1:
                self.add_module(str(count), self.layer_norms[i])
                count += 1
Example #11
    def __init__(self,
                 local_rank,
                 input_dim=768,
                 ff_dim=2048,
                 num_heads=8,
                 dropout=0.2,
                 layers=6):
        super(PrefixPredict, self).__init__()
        self.input_dim = input_dim

        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(
                TransformerLayer(input_dim, ff_dim, num_heads, dropout))
        self.one_more = nn.Linear(input_dim, input_dim)
        self.one_more_layer_norm = LayerNorm(input_dim)

        self.attn_mask = SelfAttentionMask(device=local_rank)
        self.loss_fun = ContrativeLoss(device=local_rank)
        self.dropout = dropout
        self.device = local_rank
        self.reset_parameters()
Example #12
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
Example #13
    def __init__(self, layer, n=1):
        super(Encoder, self).__init__()
        self.layers = clones(layer, n)
        self.norm = LayerNorm(layer.size)
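
Examples #12 and #13 depend on a clones helper that is not shown here. In Annotated-Transformer-style code such as this, it is typically just a deep copy into an nn.ModuleList; a minimal sketch:

import copy
import torch.nn as nn

def clones(module, N):
    # Produce N independent deep copies of a module, registered in a ModuleList.
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
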
Example #14
    def __init__(self, size: int, dropout=0.1):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(p=dropout)
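
Every example on this page builds on a LayerNorm module. The exact class differs per repository (some simply alias torch.nn.LayerNorm), but a minimal definition consistent with how it is used above looks like this:

import torch
import torch.nn as nn

class LayerNorm(nn.Module):
    # Normalize the last dimension, then apply a learned scale and shift.
    def __init__(self, size, eps=1e-6):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(size))
        self.shift = nn.Parameter(torch.zeros(size))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.scale * (x - mean) / (std + self.eps) + self.shift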