def __init__(self,
                 output_dim,
                 emb_dim,
                 enc_hid_dim,
                 dec_hid_dim,
                 dropout,
                 device,
                 pad_idx,
                 embedding,
                 att_type='concat',
                 num_layers=1,
                 adaptive_softmax=None):

        super(Decoder, self).__init__()

        # The number of directions in the decoder is always 1.

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.device = device
        self.att_type = att_type
        self.num_layers = num_layers
        if self.att_type == 'concat':
            self.attn = nn.Linear((enc_hid_dim) + dec_hid_dim, dec_hid_dim)
        elif self.att_type == 'bilinear':
            self.attn = nn.Linear((enc_hid_dim), dec_hid_dim)
        self.context_linear = nn.Linear(enc_hid_dim * 2, enc_hid_dim)
        self.v = nn.Parameter(torch.rand(dec_hid_dim))
        self.pad_idx = pad_idx

        self.embedding = embedding

        self.attn_linear = nn.Linear((enc_hid_dim * 1) + emb_dim, dec_hid_dim)
        self.rnn_layer = nn.GRU((enc_hid_dim * 1),
                                dec_hid_dim,
                                num_layers=self.num_layers)
        self.out = nn.Linear((enc_hid_dim * 1) + dec_hid_dim + emb_dim,
                             output_dim)
        self.dropout = nn.Dropout(dropout)

        vocab_size = output_dim
        self.softmax_layer = nn.AdaptiveLogSoftmaxWithLoss(
            (enc_hid_dim * 1) + dec_hid_dim + emb_dim,
            vocab_size,
            cutoffs=[round(vocab_size / 15), 3 * round(vocab_size / 15)])
        self.adaptive_softmax = adaptive_softmax
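All of the snippets on this page only construct nn.AdaptiveLogSoftmaxWithLoss, so as a reference point, here is a minimal, self-contained sketch of how the module is typically called at training and inference time. The sizes and tensor names (vocab_size, hidden, targets) are illustrative and not taken from any example on this page; the calls themselves are the standard PyTorch API.

import torch
import torch.nn as nn

vocab_size = 10000
hidden_dim = 512

# same cutoff recipe as Example #1 above: [round(V / 15), 3 * round(V / 15)]
adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
    in_features=hidden_dim,
    n_classes=vocab_size,
    cutoffs=[round(vocab_size / 15), 3 * round(vocab_size / 15)],
    div_value=4.0)

hidden = torch.randn(32, hidden_dim)           # one hidden state per target token
targets = torch.randint(0, vocab_size, (32,))  # gold token ids

out = adaptive_softmax(hidden, targets)        # namedtuple with .output and .loss
out.loss.backward()                            # scalar mean negative log-likelihood

with torch.no_grad():
    log_probs = adaptive_softmax.log_prob(hidden)   # (32, vocab_size) log-distribution
    predicted = adaptive_softmax.predict(hidden)    # (32,) argmax token ids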
Example #2
 def __init__(self, vocab, hidden_size, enc_num_layer):
     super(BertNoEmbed, self).__init__()
     self.encoder = BertModelNoEmbed(
         config=BertConfig(vocab_size_or_config_json_file=len(vocab),
                           hidden_size=hidden_size,
                           num_hidden_layers=enc_num_layer,
                           num_attention_heads=8,
                           intermediate_size=3072,
                           type_vocab_size=1,
                           hidden_dropout_prob=0.1,
                           attention_probs_dropout_prob=0.1))
     self.hidden_size = hidden_size
     self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(hidden_size,
                                                           len(vocab),
                                                           cutoffs=[1000])
Example #3
    def __init__(self):
        super(aspect_rs, self).__init__()
        self.embedding_dim = conf.embedding_dim
        self.num_user = conf.num_users
        self.num_item = conf.num_items

        torch.manual_seed(0)
        self.embedding_user = nn.Embedding(self.num_user, self.embedding_dim)
        torch.manual_seed(0)
        self.embedding_item = nn.Embedding(self.num_item, self.embedding_dim)

        self.rating_loss_function = nn.MSELoss()
        self.review_loss_function = nn.AdaptiveLogSoftmaxWithLoss(\
            conf.hidden_size, conf.vocab_sz, cutoffs=[round(conf.vocab_sz/15), 3*round(conf.vocab_sz/15)], div_value=2)

        self.avg_rating = torch.FloatTensor([conf.avg_rating]).cuda()
Example #4
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout=0.1, padding_idx=2):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=padding_idx)

        self.rnn = nn.GRU(emb_dim, 2*hid_dim, n_layers, dropout=dropout)

        self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(2*hid_dim, output_dim, cutoffs=[10,12])

        self.dropout = nn.Dropout(dropout)
Example #5
 def __init__(self, vocab, hidden_size, num_layer):
     super(LanGenNoEmbed, self).__init__()
     self.model = BertModelNoEmbed(
         config=BertConfig(vocab_size_or_config_json_file=len(vocab),
                           hidden_size=hidden_size,
                           num_hidden_layers=num_layer,
                           num_attention_heads=8,
                           intermediate_size=3072,
                           type_vocab_size=2,
                           hidden_dropout_prob=0.1,
                           attention_probs_dropout_prob=0.1))
     self.model.encoder.layer = self.model.encoder.layer[:3]
     self.model.eval()
     self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(hidden_size,
                                                           len(vocab),
                                                           cutoffs=[994])
Example #6
    def __init__(self, args, save_path=None):

        super(LMBase, self).__init__()
        logger.info(self.__class__.__name__)

        self.save_path = save_path

        self.d_model = args.transformer_d_model
        self.n_layers = args.n_layers
        self.n_heads = args.transformer_n_heads
        self.lsm_prob = args.lsm_prob

        self.vocab = args.vocab
        self.eos = 2
        self.pad = 3
        # NOTE: reserved in advance

        # for cache
        self.cache_theta = 0.2  # smoothing parameter
        self.cache_lambda = 0.2  # cache weight
        self.cache_ids = []
        self.cache_keys = []
        self.cache_attn = []

        self.embed = nn.Embedding(self.vocab, self.d_model, padding_idx=self.pad)
        self.pos_enc = PositionalEncoding(self.d_model, args.dropout_in, args.transformer_pe_type)
        self.layers = nn.ModuleList([copy.deepcopy(TransformerDecoderBlock(
            self.d_model, args.transformer_d_ff, args.transformer_attn_type, self.n_heads, args.dropout_hidden, args.dropout_att,
            args.dropout_residual * (l + 1) / self.n_layers,
            args.transformer_layer_norm_eps, args.transformer_ffn_activation, args.transformer_param_init,
            src_tgt_attention=False)) for l in range(self.n_layers)])
        self.norm_out = nn.LayerNorm(self.d_model, eps=args.transformer_layer_norm_eps)

        self.adaptive_softmax = None
        self.output = None
        if args.adaptive_softmax:
            self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                self.d_model, self.vocab,
                cutoffs=[round(self.vocab / 15), 3 * round(self.vocab / 15)],
                # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
                div_value=4.0)
        else:
            self.output = nn.Linear(self.d_model, self.vocab)
            if args.tie_embedding:
                self.output.weight = self.embed.weight

        self.reset_parameters()
Example #7
    def __init__(self, config):
        super(XLMPredLayer, self).__init__()
        self.asm = config.asm
        self.n_words = config.n_words
        self.pad_index = config.pad_index
        dim = config.emb_dim

        if config.asm is False:
            self.proj = nn.Linear(dim, config.n_words, bias=True)
        else:
            self.proj = nn.AdaptiveLogSoftmaxWithLoss(
                in_features=dim,
                n_classes=config.n_words,
                cutoffs=config.asm_cutoffs,
                div_value=config.asm_div_value,
                head_bias=True,  # default is False
            )
Example #8
    def __init__(self, params):
        super().__init__()
        self.asm = params.asm
        self.n_words = params.n_words
        self.pad_index = params.pad_index
        dim = params.emb_dim

        if params.asm is False:
            self.proj = Linear(dim, params.n_words, bias=True)
        else:
            self.proj = nn.AdaptiveLogSoftmaxWithLoss(
                in_features=dim,
                n_classes=params.n_words,
                cutoffs=params.asm_cutoffs,
                div_value=params.asm_div_value,
                head_bias=True,  # default is False
            )
Example #9
    def __init__(self, num_embeddings, embedding_dim, padding_idx,
                 conv_filters, n_highways, word_size):
        super().__init__()
        self.char_embedding = CharEmbedding(
            num_embeddings,
            16,
            padding_idx,
            conv_filters,
            n_highways,
            embedding_dim,
        )

        self.forward_lm_1 = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=4 * embedding_dim,
            batch_first=True,
            num_layers=1,
        )
        self.forward_lp_1 = nn.Linear(4 * embedding_dim, embedding_dim)

        self.forward_lm_2 = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=4 * embedding_dim,
            batch_first=True,
            num_layers=1,
        )
        self.forward_lp_2 = nn.Linear(4 * embedding_dim, embedding_dim)

        self.backward_lm_1 = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=4 * embedding_dim,
            batch_first=True,
            num_layers=1,
        )
        self.backward_lp_1 = nn.Linear(4 * embedding_dim, embedding_dim)
        self.backward_lm_2 = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=4 * embedding_dim,
            batch_first=True,
            num_layers=1,
        )

        self.backward_lp_2 = nn.Linear(4 * embedding_dim, embedding_dim)
        self.loss = nn.AdaptiveLogSoftmaxWithLoss(embedding_dim, word_size,
                                                  [100, 1000, 10000])
Example #10
    def __init__(self, **kwargs):
        super(PersonaModelAlt, self).__init__()
        block_size = kwargs.get('block_size', 100)
        emb_dim = kwargs.get('emb_dim', 50)
        num_LTM = kwargs.get('num_LTM', 5)
        embedding = kwargs.get('embedding', None)
        indexer = kwargs.get('indexer', None)
        assert (indexer is not None)
        self.block_size = block_size
        self.num_LTM = num_LTM
        self.embedding = embedding
        self.indexer = indexer

        self.embed = nn.Embedding(num_embeddings=len(indexer),
                                  embedding_dim=emb_dim)
        if embedding is not None:
            self.embed.weight.data.copy_(torch.from_numpy(embedding))
        else:
            self.embed._parameters['weight'].data.normal_(
                0.0, 1 / np.sqrt(emb_dim))

        self.long_mem = EntNet(num_LTM, block_size)
        self.query_module_ltm = Attention(block_size)

        self.layernorm1 = nn.LayerNorm(emb_dim)
        self.layernorm2 = nn.LayerNorm(block_size)
        self.dropout = nn.Dropout(.5)
        self.LTM_state = None
        self.attn = Attention(block_size)
        self.self_attn = SelfAttention(block_size)
        self.encoder = nn.GRU(input_size=emb_dim,
                              hidden_size=block_size,
                              batch_first=True)
        self.decoder = nn.GRU(input_size=emb_dim,
                              hidden_size=block_size,
                              batch_first=True)  #produces queries of size e_d

        self.reverse_embed = nn.AdaptiveLogSoftmaxWithLoss(
            block_size, len(indexer), cutoffs=[7, 21, 500])
        self.nonlin = nn.LeakyReLU(.1)
        self.decoderAUX = nn.GRU(input_size=emb_dim,
                                 hidden_size=block_size,
                                 batch_first=True)
        self.idxs = np.arange(len(indexer))
Example #11
    def __init__(self,
                 input_size=300,
                 classes=1,
                 hidden_size=512,
                 drop_p=0.5,
                 out_of_words=80000):
        super(elmo_model, self).__init__()

        self.dropout = nn.Dropout(drop_p)
        self.num_layers = 1
        self.out_of_words = out_of_words

        self.char_embed = CharEmbedding(num_embeddings=260,
                                        embedding_dim=16,
                                        padding_idx=256,
                                        conv_filters=[(3, 128), (3, 128),
                                                      (3, 128)],
                                        n_highways=2,
                                        projection_size=hidden_size)

        self.lstm1_f = nn.LSTM(hidden_size,
                               hidden_size,
                               batch_first=True,
                               bidirectional=False)
        self.lstm2_f = nn.LSTM(hidden_size,
                               hidden_size,
                               batch_first=True,
                               bidirectional=False)
        self.lstm1_b = nn.LSTM(hidden_size,
                               hidden_size,
                               batch_first=True,
                               bidirectional=False)
        self.lstm2_b = nn.LSTM(hidden_size,
                               hidden_size,
                               batch_first=True,
                               bidirectional=False)

        self.out = nn.AdaptiveLogSoftmaxWithLoss(
            in_features=hidden_size * 2,
            n_classes=out_of_words,
            cutoffs=[20, 200, 1000, 10000],
            div_value=4.0,
            head_bias=False)
Example #12
    def __init__(self, params):
        super().__init__()
        self.asm = params.asm
        self.n_words = params.n_words
        self.pad_index = params.pad_index
        self.label_smoothing = params.label_smoothing
        dim = params.emb_dim

        if params.asm is False:
            self.proj = Linear(dim, params.n_words, bias=True)
            # if params.label_smoothing > 0:
            #     self.loss_func = LabelSmoothingCriterion(params.label_smoothing, params.n_words)
        else:
            self.proj = nn.AdaptiveLogSoftmaxWithLoss(
                in_features=dim,
                n_classes=params.n_words,
                cutoffs=params.asm_cutoffs,
                div_value=params.asm_div_value,
                head_bias=True,  # default is False
            )
Example #13
    def __init__(self, w_num, w_dim, rnn_unit, num_layers, hidden_dim, dropout,
                 cutoffs):
        super(LM, self).__init__()

        self.w_num = w_num
        self.w_dim = w_dim
        self.word_embed = nn.Embedding(w_num, w_dim)

        rnnunit_map = {'rnn': nn.RNN, 'lstm': nn.LSTM, 'gru': nn.GRU}
        self.rnn = rnnunit_map[rnn_unit](w_dim,
                                         hidden_dim,
                                         num_layers=num_layers,
                                         dropout=dropout)
        self.soft_max = nn.AdaptiveLogSoftmaxWithLoss(hidden_dim,
                                                      w_num,
                                                      cutoffs=cutoffs,
                                                      div_value=4.0)
        self.dropout = nn.Dropout(p=dropout)

        self.reset_parameters()
Example #14
    def __init__(self,
                 in_vocab_size,
                 out_vocab_size,
                 embed_size,
                 hidden_size,
                 GRU_count_enc=2,
                 GRU_count_dec=2,
                 ignore_class=None,
                 use_feedforward=True):
        super(TranslatorModel, self).__init__()

        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.GRU_count_enc = GRU_count_enc
        self.GRU_count_dec = GRU_count_dec
        self.ignore_class = ignore_class
        self.use_feedforwad = use_feedforward

        self.enc_embed = nn.Embedding(in_vocab_size, self.embed_size)
        self.enc_GRU = nn.GRU(self.embed_size,
                              self.hidden_size,
                              num_layers=self.GRU_count_enc,
                              bidirectional=True,
                              dropout=0.2)

        self.dec_embed = nn.Embedding(out_vocab_size, self.embed_size)
        self.dec_ReLU = nn.ReLU()
        self.dec_GRU = nn.GRU(self.embed_size + self.hidden_size,
                              self.hidden_size,
                              num_layers=self.GRU_count_dec,
                              dropout=0.2)
        self.Adaptive_Softmax = nn.AdaptiveLogSoftmaxWithLoss(
            self.hidden_size, out_vocab_size,
            [round(out_vocab_size / 20), 4 * round(out_vocab_size / 20)])
        if self.use_feedforwad:
            self.feedforward_dense = nn.Linear(2 * self.hidden_size,
                                               self.hidden_size,
                                               bias=False)
        self.att_Softmax = nn.Softmax(dim=1)
        self.bridge = nn.Linear(2 * self.hidden_size,
                                self.GRU_count_dec * self.hidden_size)
Example #15
    def __init__(
        self,
        num_input_features: int,
        hp: HeadParams,
    ):
        x_reducer, loss_reducer, num_input_features = self._get_reducers(
            num_input_features=num_input_features, hp=hp)

        num_classes = hp.num_classes
        num_first_bin = round(num_classes / 20)
        head = nn.AdaptiveLogSoftmaxWithLoss(
            num_input_features,
            num_classes,
            cutoffs=[
                num_first_bin,
                5 * num_first_bin,
            ],
            div_value=4,
        )
        super().__init__(num_input_features, num_classes, head, x_reducer,
                         loss_reducer)
Example #16
def lm_criterion(in_features, vocab_size):
    # if weight_tying:
    #     in_features = 2 * input_size \
    #         if bidirectional else hidden_size
    # else:
    #     in_features = 2 * hidden_size \
    #         if bidirectional else hidden_size

    splits = []
    if vocab_size > _MEDIUM_TOKENS:
        # splits = [2800, 20000, 760000]
        splits = [2800, 20000]
    elif vocab_size > _HIGH_TOKENS:
        splits = [4200, 35000, 180000]
    splits += [vocab_size - 2]

    criterion = nn.AdaptiveLogSoftmaxWithLoss(in_features=in_features,
                                              n_classes=vocab_size,
                                              cutoffs=splits)

    return criterion
Example #17
 def __init__(self,
              max_seq_len,
              vocab_size,
              n_layers=6,
              dim=1024,
              d_ff=2048,
              dropout=0.1,
              heads=8,
              encoder_only=True,
              n_langs=15):
     super(Transformer, self).__init__()
     self.n_layers, self.max_seq_len, self.dim = n_layers, max_seq_len, dim
     self.encoder_only = encoder_only
     self.vocab_size = vocab_size
     self.embed = nn.Embedding(vocab_size, dim)
     self.pe = PositionalEncoding(dim=dim, max_seq_len=max_seq_len)
     self.lang_embed = nn.Embedding(n_langs, dim)
     self.encoders = nn.ModuleList([
         TransformerLayer(decoder=False,
                          dim=dim,
                          d_ff=d_ff,
                          dropout=dropout,
                          heads=heads) for _ in range(n_layers)
     ])
     if not encoder_only:
         self.decoders = nn.ModuleList([
             TransformerLayer(decoder=True,
                              dim=dim,
                              d_ff=d_ff,
                              dropout=dropout,
                              heads=heads) for _ in range(n_layers)
         ])
     self.pred = nn.AdaptiveLogSoftmaxWithLoss(in_features=dim,
                                               n_classes=vocab_size,
                                               cutoffs=[8000, 20000],
                                               head_bias=True)
     self.xnli_fc = nn.Linear(dim, 3)
     torch.nn.init.xavier_uniform_(self.embed.weight)
     torch.nn.init.xavier_uniform_(self.lang_embed.weight)
     torch.nn.init.xavier_uniform_(self.xnli_fc.weight)
Example #18
    def __init__(self,
                 hidden_dim,
                 embed_dim,
                 num_keywords,
                 num_layers,
                 weight,
                 num_labels,
                 bidirectional,
                 dropout=0.5,
                 **kwargs):
        super(MTALSTM, self).__init__(**kwargs)
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_labels = num_labels
        self.bidirectional = bidirectional

        if num_layers <= 1:
            self.dropout = 0
        else:
            self.dropout = dropout

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embedding_topic = nn.Embedding(28 + 5, embed_dim)  # todo
        # self.embedding = nn.Embedding.from_pretrained(weight)
        # self.embedding.weight.requires_grad = False

        self.Uf = nn.Linear(embed_dim * num_keywords, num_keywords, bias=False)

        # attention decoder
        self.decoder = AttentionDecoder(hidden_size=hidden_dim,
                                        embed_size=embed_dim,
                                        num_layers=num_layers,
                                        dropout=dropout)

        # adaptive softmax
        self.adaptiveSoftmax = nn.AdaptiveLogSoftmaxWithLoss(
            hidden_dim,
            num_labels,
            cutoffs=[round(num_labels / 20), 4 * round(num_labels / 20)])
Example #19
    def __init__(self, local_rank, vocab, embed_dim, ff_embed_dim, num_heads,
                 dropout, layers, smoothing_factor, approx):
        super(BIGLM, self).__init__()
        self.vocab = vocab
        self.embed_dim = embed_dim

        self.tok_embed = Embedding(self.vocab.size, embed_dim,
                                   self.vocab.padding_idx)
        self.pos_embed = LearnedPositionalEmbedding(embed_dim,
                                                    device=local_rank)

        self.layers = nn.ModuleList()
        for i in range(layers):
            self.layers.append(
                TransformerLayer(embed_dim,
                                 ff_embed_dim,
                                 num_heads,
                                 dropout,
                                 with_external=True))
        self.emb_layer_norm = LayerNorm(embed_dim)
        self.one_more = nn.Linear(embed_dim, embed_dim)
        self.one_more_layer_norm = LayerNorm(embed_dim)
        self.out_proj = nn.Linear(embed_dim, self.vocab.size)

        self.attn_mask = SelfAttentionMask(device=local_rank)
        self.smoothing = LabelSmoothing(local_rank, self.vocab.size,
                                        self.vocab.padding_idx,
                                        smoothing_factor)

        self.dropout = dropout
        self.device = local_rank

        if approx == "none":
            self.approx = None
        elif approx == "adaptive":
            self.approx = nn.AdaptiveLogSoftmaxWithLoss(
                self.embed_dim, self.vocab.size, [10000, 20000, 200000])
        else:
            raise NotImplementedError("%s has not been implemented" % approx)
        self.reset_parameters()
Example #20
 def __init__(
         self,
         policy,
         policy_optim,
         beta,
         beta_optim,
         hidden_size,
         gamma=0.99,
         k=10,
         weight_clip=2.0,
         offpolicy_correction=True,
         topk=True,
         adaptive_softmax=True,
         cutoffs=None,
         device=torch.device("cpu"),
 ):
     super(Reinforce, self).__init__()
     self.policy = policy
     self.policy_optim = policy_optim
     self.beta = beta
     self.beta_optim = beta_optim
     self.beta_criterion = nn.CrossEntropyLoss()
     self.gamma = gamma
     self.k = k
     self.weight_clip = weight_clip
     self.offpolicy_correction = offpolicy_correction
     self.topk = topk
     self.adaptive_softmax = adaptive_softmax
     if adaptive_softmax:
         assert cutoffs is not None, (
             "must provide cutoffs when using adaptive_softmax"
         )
         self.softmax_loss = nn.AdaptiveLogSoftmaxWithLoss(
             in_features=hidden_size,
             n_classes=policy.item_embeds.weight.size(0),
             cutoffs=cutoffs,
             div_value=4.
         ).to(device)
     self.device = device
Example #21
 def __init__(self, word_vocab, char_vocab):
     super(ELMO, self).__init__()
     self.embedding = CharEmbedding(num_embeddings=char_vocab.__len__(),
                                    embedding_dim=16,
                                    padding_idx=char_vocab.sp.pad.idx,
                                    conv_filters=[(1, 32), (2, 64),
                                                  (3, 128), (4, 128),
                                                  (5, 256), (6, 256),
                                                  (7, 512)],
                                    n_highways=2,
                                    projection_size=512)
     self.lstm_forward = nn.LSTM(input_size=512,
                                 num_layers=1,
                                 hidden_size=2048,
                                 bidirectional=False,
                                 batch_first=True)
     self.linear_forward = nn.Linear(2048, 512)
     self.lstm_forward2 = nn.LSTM(input_size=512,
                                  num_layers=1,
                                  hidden_size=2048,
                                  bidirectional=False,
                                  batch_first=True)
     self.linear_forward2 = nn.Linear(2048, 512)
     self.lstm_backward = nn.LSTM(input_size=512,
                                  num_layers=1,
                                  hidden_size=2048,
                                  bidirectional=False,
                                  batch_first=True)
     self.linear_backward = nn.Linear(2048, 512)
     self.lstm_backward2 = nn.LSTM(input_size=512,
                                   num_layers=1,
                                   hidden_size=2048,
                                   bidirectional=False,
                                   batch_first=True)
     self.linear_backward2 = nn.Linear(2048, 512)
     self.output_layer = nn.AdaptiveLogSoftmaxWithLoss(
         in_features=512,
         n_classes=word_vocab.__len__(),
         cutoffs=[100, 1000, 10000])
Example #22
    def __init__(self,
                 input_size,
                 embed_size,
                 hidden_size,
                 class_count,
                 LSTM_count,
                 ignore_class=None):
        super(LSTMModel, self).__init__()

        self.LSTM_layers = LSTM_count
        self.hidden_size = hidden_size
        self.ignore_class = ignore_class

        self.embedding = nn.Embedding(input_size, embed_size)
        self.LSTM = nn.LSTM(embed_size,
                            self.hidden_size,
                            num_layers=self.LSTM_layers,
                            bias=True,
                            batch_first=True,
                            dropout=0.25)

        self.Adaptive_Softmax = nn.AdaptiveLogSoftmaxWithLoss(
            self.hidden_size, class_count,
            [round(class_count / 20), 4 * round(class_count / 20)])
Example #23
    def __init__(self,
                 eos,
                 unk,
                 pad,
                 blank,
                 enc_n_units,
                 attn_type,
                 attn_n_heads,
                 n_layers,
                 d_model,
                 d_ff,
                 vocab,
                 tie_embedding=False,
                 pe_type='add',
                 layer_norm_eps=1e-12,
                 dropout=0.0,
                 dropout_emb=0.0,
                 dropout_att=0.0,
                 lsm_prob=0.0,
                 focal_loss_weight=0.0,
                 focal_loss_gamma=2.0,
                 ctc_weight=0.0,
                 ctc_lsm_prob=0.0,
                 ctc_fc_list=[],
                 backward=False,
                 global_weight=1.0,
                 mtl_per_batch=False,
                 adaptive_softmax=False):

        super(TransformerDecoder, self).__init__()
        logger = logging.getLogger('training')

        self.eos = eos
        self.unk = unk
        self.pad = pad
        self.blank = blank
        self.enc_n_units = enc_n_units
        self.d_model = d_model
        self.n_layers = n_layers
        self.attn_n_heads = attn_n_heads
        self.pe_type = pe_type
        self.lsm_prob = lsm_prob
        self.focal_loss_weight = focal_loss_weight
        self.focal_loss_gamma = focal_loss_gamma
        self.ctc_weight = ctc_weight
        self.bwd = backward
        self.global_weight = global_weight
        self.mtl_per_batch = mtl_per_batch

        if ctc_weight > 0:
            self.ctc = CTC(eos=eos,
                           blank=blank,
                           enc_n_units=enc_n_units,
                           vocab=vocab,
                           dropout=dropout,
                           lsm_prob=ctc_lsm_prob,
                           fc_list=ctc_fc_list,
                           param_init=0.1)

        if ctc_weight < global_weight:
            self.embed = Embedding(
                vocab,
                d_model,
                dropout=0,  # NOTE: do not apply dropout here
                ignore_index=pad)
            self.pos_enc = PositionalEncoding(d_model, dropout_emb, pe_type)
            self.layers = nn.ModuleList([
                TransformerDecoderBlock(d_model, d_ff, attn_type, attn_n_heads,
                                        dropout, dropout_att, layer_norm_eps)
                for _ in range(n_layers)
            ])
            self.norm_out = nn.LayerNorm(d_model, eps=layer_norm_eps)

            if adaptive_softmax:
                self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                    d_model,
                    vocab,
                    cutoffs=[
                        round(vocab / 15), 3 * round(vocab / 15)
                    ],
                    # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
                    div_value=4.0)
                self.output = None
            else:
                self.adaptive_softmax = None
                self.output = Linear(d_model, vocab)

                # Optionally tie weights as in:
                # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
                # https://arxiv.org/abs/1608.05859
                # and
                # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
                # https://arxiv.org/abs/1611.01462
                if tie_embedding:
                    self.output.fc.weight = self.embed.embed.weight

        # Initialize parameters
        self.reset_parameters()
Example #24
    def __init__(self, args, save_path=None):

        super(LMBase, self).__init__()
        logger.info(self.__class__.__name__)

        self.lm_type = args.lm_type
        self.save_path = save_path

        self.emb_dim = args.emb_dim
        self.n_units = args.n_units
        self.n_layers = args.n_layers
        self.lsm_prob = args.lsm_prob

        self.vocab = args.vocab
        self.eos = 2
        self.pad = 3
        # NOTE: reserved in advance

        # for cache
        self.cache_theta = 0.2  # smoothing parameter
        self.cache_lambda = 0.2  # cache weight
        self.cache_ids = []
        self.cache_keys = []
        self.cache_attn = []
        self.embed_cache = None

        self.embed = nn.Embedding(self.vocab,
                                  args.emb_dim,
                                  padding_idx=self.pad)
        self.dropout_embed = nn.Dropout(p=args.dropout_in)

        model_size = args.lm_type.replace('gated_conv_', '')

        blocks = OrderedDict()
        dropout = args.dropout_hidden
        if model_size == 'custom':
            blocks['conv1'] = ConvGLUBlock(args.kernel_size,
                                           args.emb_dim,
                                           args.n_units,
                                           bottlececk_dim=args.n_projs,
                                           dropout=dropout)
            for lth in range(args.n_layers - 1):
                blocks['conv%d' % (lth + 2)] = ConvGLUBlock(
                    args.kernel_size,
                    args.n_units,
                    args.n_units,
                    bottlececk_dim=args.n_projs,
                    dropout=dropout)
            last_dim = args.n_units

        elif model_size == '8':
            blocks['conv1'] = ConvGLUBlock(4,
                                           args.emb_dim,
                                           900,
                                           dropout=dropout)
            for i in range(1, 8, 1):
                blocks['conv2-%d' % i] = ConvGLUBlock(4,
                                                      900,
                                                      900,
                                                      dropout=dropout)
            last_dim = 900

        elif model_size == '8B':
            blocks['conv1'] = ConvGLUBlock(1,
                                           args.emb_dim,
                                           512,
                                           dropout=dropout)
            for i in range(1, 4, 1):
                blocks['conv2-%d' % i] = ConvGLUBlock(5,
                                                      512,
                                                      512,
                                                      bottlececk_dim=128,
                                                      dropout=dropout)
            for i in range(1, 4, 1):
                blocks['conv3-%d' % i] = ConvGLUBlock(5,
                                                      512,
                                                      512,
                                                      bottlececk_dim=256,
                                                      dropout=dropout)
            blocks['conv4'] = ConvGLUBlock(1,
                                           512,
                                           2048,
                                           bottlececk_dim=1024,
                                           dropout=dropout)
            last_dim = 2048

        elif model_size == '9':
            blocks['conv1'] = ConvGLUBlock(4,
                                           args.emb_dim,
                                           807,
                                           dropout=dropout)
            for i in range(1, 4, 1):
                blocks['conv2-%d-1' % i] = ConvGLUBlock(4,
                                                        807,
                                                        807,
                                                        dropout=dropout)
                blocks['conv2-%d-2' % i] = ConvGLUBlock(4,
                                                        807,
                                                        807,
                                                        dropout=dropout)
            last_dim = 807

        elif model_size == '13':
            blocks['conv1'] = ConvGLUBlock(4,
                                           args.emb_dim,
                                           1268,
                                           dropout=dropout)
            for i in range(1, 13, 1):
                blocks['conv2-%d' % i] = ConvGLUBlock(4,
                                                      1268,
                                                      1268,
                                                      dropout=dropout)
            last_dim = 1268

        elif model_size == '14':
            for i in range(1, 4, 1):
                blocks['conv1-%d' % i] = ConvGLUBlock(
                    6, args.emb_dim if i == 1 else 850, 850, dropout=dropout)
            blocks['conv2'] = ConvGLUBlock(1, 850, 850, dropout=dropout)
            for i in range(1, 5, 1):
                blocks['conv3-%d' % i] = ConvGLUBlock(5,
                                                      850,
                                                      850,
                                                      dropout=dropout)
            blocks['conv4'] = ConvGLUBlock(1, 850, 850, dropout=dropout)
            for i in range(1, 4, 1):
                blocks['conv5-%d' % i] = ConvGLUBlock(4,
                                                      850,
                                                      850,
                                                      dropout=dropout)
            blocks['conv6'] = ConvGLUBlock(4, 850, 1024, dropout=dropout)
            blocks['conv7'] = ConvGLUBlock(4, 1024, 2048, dropout=dropout)
            last_dim = 2048

        elif model_size == '14B':
            blocks['conv1'] = ConvGLUBlock(5,
                                           args.emb_dim,
                                           512,
                                           dropout=dropout)
            for i in range(1, 4, 1):
                blocks['conv2-%d' % i] = ConvGLUBlock(5,
                                                      512,
                                                      512,
                                                      bottlececk_dim=128,
                                                      dropout=dropout)
            for i in range(1, 4, 1):
                blocks['conv3-%d' % i] = ConvGLUBlock(5,
                                                      512 if i == 1 else 1024,
                                                      1024,
                                                      bottlececk_dim=512,
                                                      dropout=dropout)
            for i in range(1, 7, 1):
                blocks['conv4-%d' % i] = ConvGLUBlock(5,
                                                      1024 if i == 1 else 2048,
                                                      2048,
                                                      bottlececk_dim=1024,
                                                      dropout=dropout)
            blocks['conv5'] = ConvGLUBlock(5,
                                           2048,
                                           4096,
                                           bottlececk_dim=1024,
                                           dropout=dropout)
            last_dim = 4096

        else:
            raise NotImplementedError(model_size)

        self.blocks = nn.Sequential(blocks)

        if args.adaptive_softmax:
            self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                last_dim,
                self.vocab,
                # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
                cutoffs=[self.vocab // 25, self.vocab // 5],
                div_value=4.0)
            self.output = None
        else:
            self.adaptive_softmax = None
            self.output = nn.Linear(last_dim, self.vocab)
            if args.tie_embedding:
                if args.n_units != args.emb_dim:
                    raise ValueError(
                        'When using the tied flag, n_units must be equal to emb_dim.'
                    )
                self.output.weight = self.embed.weight

        self.reset_parameters(args.param_init)
Example #25
    def __init__(self, args, save_path=None):

        super(LMBase, self).__init__()
        logger.info(self.__class__.__name__)

        self.lm_type = args.lm_type
        self.save_path = save_path

        self.emb_dim = args.emb_dim
        self.rnn_type = args.lm_type
        assert args.lm_type in ['lstm', 'gru']
        self.n_units = args.n_units
        self.n_projs = args.n_projs
        self.n_layers = args.n_layers
        self.residual = args.residual
        self.n_units_cv = args.n_units_null_context
        self.lsm_prob = args.lsm_prob

        self.vocab = args.vocab
        self.eos = 2
        self.pad = 3
        # NOTE: reserved in advance

        # for cache
        self.cache_theta = 0.2  # smoothing parameter
        self.cache_lambda = 0.2  # cache weight
        self.cache_ids = []
        self.cache_keys = []
        self.cache_attn = []
        self.embed_cache = None

        self.embed = nn.Embedding(self.vocab, args.emb_dim, padding_idx=self.pad)
        self.dropout_emb = nn.Dropout(p=args.dropout_in)

        rnn = nn.LSTM if args.lm_type == 'lstm' else nn.GRU
        self.rnn = nn.ModuleList()
        self.dropout = nn.Dropout(p=args.dropout_hidden)
        if args.n_projs > 0:
            self.proj = repeat(nn.Linear(args.n_units, args.n_projs), args.n_layers)
        rnn_idim = args.emb_dim + args.n_units_null_context
        for _ in range(args.n_layers):
            self.rnn += [rnn(rnn_idim, args.n_units, 1, batch_first=True)]
            rnn_idim = args.n_units
            if args.n_projs > 0:
                rnn_idim = args.n_projs

        self.glu = None
        if args.use_glu:
            self.glu = LinearGLUBlock(rnn_idim)

        self._odim = rnn_idim

        self.adaptive_softmax = None
        self.output_proj = None
        self.output = None
        if args.adaptive_softmax:
            self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                rnn_idim, self.vocab,
                # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
                cutoffs=[self.vocab // 25, self.vocab // 5],
                div_value=4.0)
        elif args.tie_embedding:
            if rnn_idim != args.emb_dim:
                self.output_proj = nn.Linear(rnn_idim, args.emb_dim)
                rnn_idim = args.emb_dim
                self._odim = rnn_idim
            self.output = nn.Linear(rnn_idim, self.vocab)
            self.output.weight = self.embed.weight
        else:
            self.output = nn.Linear(rnn_idim, self.vocab)

        self.reset_parameters(args.param_init)
Example #26
    def __init__(self, args, save_path=None):

        super(LMBase, self).__init__()
        logger = logging.getLogger('training')
        logger.info(self.__class__.__name__)

        self.save_path = save_path

        self.emb_dim = args.emb_dim
        self.rnn_type = args.lm_type
        assert args.lm_type in ['lstm', 'gru']
        self.n_units = args.n_units
        self.n_projs = args.n_projs
        self.n_layers = args.n_layers
        self.residual = args.residual
        self.use_glu = args.use_glu
        self.n_units_cv = args.n_units_null_context
        self.lsm_prob = args.lsm_prob

        self.vocab = args.vocab
        self.eos = 2
        self.pad = 3
        # NOTE: reserved in advance

        # for cache
        self.cache_theta = 0.2  # smoothing parameter
        self.cache_lambda = 0.2  # cache weight
        self.cache_ids = []
        self.cache_keys = []
        self.cache_attn = []

        self.embed = Embedding(vocab=self.vocab,
                               emb_dim=args.emb_dim,
                               dropout=args.dropout_in,
                               ignore_index=self.pad)

        rnn = nn.LSTM if args.lm_type == 'lstm' else nn.GRU
        self.rnn = nn.ModuleList()
        self.dropout = nn.ModuleList(
            [nn.Dropout(p=args.dropout_hidden) for _ in range(args.n_layers)])
        if args.n_projs > 0:
            self.proj = nn.ModuleList([
                Linear(args.n_units, args.n_projs)
                for _ in range(args.n_layers)
            ])
        rnn_idim = args.emb_dim + args.n_units_null_context
        for l in range(args.n_layers):
            self.rnn += [
                rnn(rnn_idim,
                    args.n_units,
                    1,
                    bias=True,
                    batch_first=True,
                    dropout=0,
                    bidirectional=False)
            ]
            rnn_idim = args.n_units
            if args.n_projs > 0:
                rnn_idim = args.n_projs

        if self.use_glu:
            self.fc_glu = Linear(rnn_idim,
                                 rnn_idim * 2,
                                 dropout=args.dropout_hidden)

        if args.adaptive_softmax:
            self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                rnn_idim,
                self.vocab,
                # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
                cutoffs=[self.vocab // 25, self.vocab // 5],
                div_value=4.0)
            self.output = None
        else:
            self.adaptive_softmax = None
            self.output = Linear(rnn_idim,
                                 self.vocab,
                                 dropout=args.dropout_out)
            # NOTE: include bias even when tying weights

            # Optionally tie weights as in:
            # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
            # https://arxiv.org/abs/1608.05859
            # and
            # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
            # https://arxiv.org/abs/1611.01462
            if args.tie_embedding:
                if args.n_units != args.emb_dim:
                    raise ValueError(
                        'When using the tied flag, n_units must be equal to emb_dim.'
                    )
                self.output.fc.weight = self.embed.embed.weight

        # Initialize parameters
        self.reset_parameters(args.param_init)

        # Recurrent weights are orthogonalized
        if args.rec_weight_orthogonal:
            self.reset_parameters(args.param_init,
                                  dist='orthogonal',
                                  keys=['rnn', 'weight'])
Example #27
ntokens = len(corpus.dictionary)
print("vocabulary size (ntokens): " + str(ntokens))

if not args.adaptivesoftmax:
    criterion = nn.CrossEntropyLoss().to(device)
else:
    print(
        "Adaptive Softmax is on: the performance depends on cutoff values. check if the cutoff is properly set"
    )
    print("Cutoffs: " + str(args.cutoffs))
    if args.cutoffs[-1] > ntokens:
        raise ValueError(
            "the last element of cutoff list must be lower than vocab size of the dataset"
        )
    criterion_adaptive = nn.AdaptiveLogSoftmaxWithLoss(
        args.nhid, ntokens, cutoffs=args.cutoffs).to(device)

model = rnn_models.RNNModel(args.model,
                            ntokens,
                            args.emsize,
                            args.nhid,
                            args.nlayers,
                            args.dropout,
                            args.tied,
                            use_cudnn_version=args.cudnn,
                            use_adaptive_softmax=args.adaptivesoftmax,
                            cutoffs=args.cutoffs).to(device)

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("model built, total trainable params: " + str(total_params))
if not args.cudnn:
Example #28
    def __init__(self,
                 mem_slots,
                 head_size,
                 input_size,
                 num_tokens,
                 num_heads=1,
                 num_blocks=1,
                 forget_bias=1.,
                 input_bias=0.,
                 gate_style='unit',
                 attention_mlp_layers=2,
                 key_size=None,
                 use_adaptive_softmax=False,
                 cutoffs=None):
        super(RelationalMemory, self).__init__()

        ########## generic parameters for RMC ##########
        self.mem_slots = mem_slots
        self.head_size = head_size
        self.num_heads = num_heads
        self.mem_size = self.head_size * self.num_heads

        # a new fixed params needed for pytorch port of RMC
        # +1 is the concatenated input per time step : we do self-attention with the concatenated memory & input
        # so if the mem_slots = 1, this value is 2
        self.mem_slots_plus_input = self.mem_slots + 1

        if num_blocks < 1:
            raise ValueError(
                'num_blocks must be >=1. Got: {}.'.format(num_blocks))
        self.num_blocks = num_blocks

        if gate_style not in ['unit', 'memory', None]:
            raise ValueError(
                'gate_style must be one of [\'unit\', \'memory\', None]. got: '
                '{}.'.format(gate_style))
        self.gate_style = gate_style

        if attention_mlp_layers < 1:
            raise ValueError(
                'attention_mlp_layers must be >= 1. Got: {}.'.format(
                    attention_mlp_layers))
        self.attention_mlp_layers = attention_mlp_layers

        self.key_size = key_size if key_size else self.head_size

        ########## parameters for multihead attention ##########
        # value_size is same as head_size
        self.value_size = self.head_size
        # total size for query-key-value
        self.qkv_size = 2 * self.key_size + self.value_size
        self.total_qkv_size = self.qkv_size * self.num_heads  # denoted as F

        # each head has qkv_sized linear projector
        # just using one big param is more efficient, rather than this line
        # self.qkv_projector = [nn.Parameter(torch.randn((self.qkv_size, self.qkv_size))) for _ in range(self.num_heads)]
        self.qkv_projector = nn.Linear(self.mem_size, self.total_qkv_size)
        self.qkv_layernorm = nn.LayerNorm(
            [self.mem_slots_plus_input, self.total_qkv_size])

        # used for attend_over_memory function
        self.attention_mlp = nn.ModuleList(
            [nn.Linear(self.mem_size, self.mem_size)] *
            self.attention_mlp_layers)
        self.attended_memory_layernorm = nn.LayerNorm(
            [self.mem_slots_plus_input, self.mem_size])
        self.attended_memory_layernorm2 = nn.LayerNorm(
            [self.mem_slots_plus_input, self.mem_size])

        ########## parameters for initial embedded input projection ##########
        self.input_size = input_size
        self.input_projector = nn.Linear(self.input_size, self.mem_size)

        ########## parameters for gating ##########
        self.num_gates = 2 * self.calculate_gate_size()
        self.input_gate_projector = nn.Linear(self.mem_size, self.num_gates)
        self.memory_gate_projector = nn.Linear(self.mem_size, self.num_gates)
        # trainable scalar gate bias tensors
        self.forget_bias = nn.Parameter(
            torch.tensor(forget_bias, dtype=torch.float32))
        self.input_bias = nn.Parameter(
            torch.tensor(input_bias, dtype=torch.float32))

        ########## parameters for token-to-embed & output-to-token logit for softmax
        self.dropout = nn.Dropout()
        self.num_tokens = num_tokens
        self.token_to_input_encoder = nn.Embedding(self.num_tokens,
                                                   self.input_size)

        # needs 2 linear layers for tying weights for embedding layers
        # first match the "output" of the RMC to input_size, which is the embed dim
        self.output_to_embed_decoder = nn.Linear(
            self.mem_slots * self.mem_size, self.input_size)
        self.use_adaptive_softmax = use_adaptive_softmax
        if not self.use_adaptive_softmax:
            # then, this layer's weight can be tied to the embedding layer
            self.embed_to_logit_decoder = nn.Linear(self.input_size,
                                                    self.num_tokens)

            # tie embedding weights of encoder & decoder
            self.embed_to_logit_decoder.weight = self.token_to_input_encoder.weight

            ########## loss function
            self.criterion = nn.CrossEntropyLoss()
        else:
            # use adaptive softmax from the self.input_size logits, instead of the tied embed weights above
            self.criterion_adaptive = nn.AdaptiveLogSoftmaxWithLoss(
                self.input_size, self.num_tokens, cutoffs=cutoffs)
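The RelationalMemory example above is one of the few that keeps both loss paths side by side. The hedged sketch below (illustrative sizes, not code from that repository) shows how the two branches differ at loss time: the tied-embedding branch projects to full-vocabulary logits and scores them with CrossEntropyLoss, while the adaptive branch feeds the input_size features straight to AdaptiveLogSoftmaxWithLoss, which contains its own factorised output projection.

import torch
import torch.nn as nn

batch, input_size, num_tokens = 8, 128, 5000
features = torch.randn(batch, input_size)          # stands in for output_to_embed_decoder(...)
targets = torch.randint(0, num_tokens, (batch,))   # gold token ids

# non-adaptive branch: explicit logits over the whole vocabulary, then CrossEntropyLoss
embed_to_logit = nn.Linear(input_size, num_tokens)
loss_full = nn.CrossEntropyLoss()(embed_to_logit(features), targets)

# adaptive branch: the criterion consumes the features directly and returns the loss
criterion_adaptive = nn.AdaptiveLogSoftmaxWithLoss(input_size, num_tokens, cutoffs=[500, 2000])
loss_adaptive = criterion_adaptive(features, targets).loss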
Example #29
    def __init__(self, args, save_path=None):

        super(LMBase, self).__init__()
        logger = logging.getLogger('training')
        logger.info(self.__class__.__name__)

        self.save_path = save_path

        self.emb_dim = args.emb_dim
        self.n_units = args.n_units
        self.n_layers = args.n_layers
        self.lsm_prob = args.lsm_prob

        self.vocab = args.vocab
        self.eos = 2
        self.pad = 3
        # NOTE: reserved in advance

        # for cache
        self.cache_theta = 0.2  # smoothing parameter
        self.cache_lambda = 0.2  # cache weight
        self.cache_ids = []
        self.cache_keys = []
        self.cache_attn = []

        self.embed = Embedding(vocab=self.vocab,
                               emb_dim=args.emb_dim,
                               dropout=args.dropout_in,
                               ignore_index=self.pad)

        model_size = args.lm_type.replace('gated_conv_', '')

        blocks = OrderedDict()
        if model_size == 'custom':
            blocks['conv1'] = GLUBlock(args.kernel_size,
                                       args.emb_dim,
                                       args.n_units,
                                       bottlececk_dim=args.n_projs,
                                       dropout=args.dropout_hidden)
            for l in range(args.n_layers - 1):
                blocks['conv%d' % (l + 2)] = GLUBlock(
                    args.kernel_size,
                    args.n_units,
                    args.n_units,
                    bottlececk_dim=args.n_projs,
                    dropout=args.dropout_hidden)
            last_dim = args.n_units

        elif model_size == '8':
            blocks['conv1'] = GLUBlock(4,
                                       args.emb_dim,
                                       900,
                                       dropout=args.dropout_hidden)
            for i in range(1, 8, 1):
                blocks['conv2-%d' % i] = GLUBlock(4,
                                                  900,
                                                  900,
                                                  dropout=args.dropout_hidden)
            last_dim = 900

        elif model_size == '8B':
            blocks['conv1'] = GLUBlock(1,
                                       args.emb_dim,
                                       512,
                                       dropout=args.dropout_hidden)
            for i in range(1, 4, 1):
                blocks['conv2-%d' % i] = GLUBlock(5,
                                                  512,
                                                  512,
                                                  bottlececk_dim=128,
                                                  dropout=args.dropout_hidden)
            for i in range(1, 4, 1):
                blocks['conv3-%d' % i] = GLUBlock(5,
                                                  512,
                                                  512,
                                                  bottlececk_dim=256,
                                                  dropout=args.dropout_hidden)
            blocks['conv4'] = GLUBlock(1,
                                       512,
                                       2048,
                                       bottlececk_dim=1024,
                                       dropout=args.dropout_hidden)
            last_dim = 2048

        elif model_size == '9':
            blocks['conv1'] = GLUBlock(4,
                                       args.emb_dim,
                                       807,
                                       dropout=args.dropout_hidden)
            for i in range(1, 4, 1):
                blocks['conv2-%d-1' % i] = GLUBlock(
                    4, 807, 807, dropout=args.dropout_hidden)
                blocks['conv2-%d-2' % i] = GLUBlock(
                    4, 807, 807, dropout=args.dropout_hidden)
            last_dim = 807

        elif model_size == '13':
            blocks['conv1'] = GLUBlock(4,
                                       args.emb_dim,
                                       1268,
                                       dropout=args.dropout_hidden)
            for i in range(1, 13, 1):
                blocks['conv2-%d' % i] = GLUBlock(4,
                                                  1268,
                                                  1268,
                                                  dropout=args.dropout_hidden)
            last_dim = 1268

        elif model_size == '14':
            for i in range(1, 4, 1):
                blocks['conv1-%d' % i] = GLUBlock(
                    6,
                    args.emb_dim if i == 1 else 850,
                    850,
                    dropout=args.dropout_hidden)
            blocks['conv2'] = GLUBlock(1,
                                       850,
                                       850,
                                       dropout=args.dropout_hidden)
            for i in range(1, 5, 1):
                blocks['conv3-%d' % i] = GLUBlock(5,
                                                  850,
                                                  850,
                                                  dropout=args.dropout_hidden)
            blocks['conv4'] = GLUBlock(1,
                                       850,
                                       850,
                                       dropout=args.dropout_hidden)
            for i in range(1, 4, 1):
                blocks['conv5-%d' % i] = GLUBlock(4,
                                                  850,
                                                  850,
                                                  dropout=args.dropout_hidden)
            blocks['conv6'] = GLUBlock(4,
                                       850,
                                       1024,
                                       dropout=args.dropout_hidden)
            blocks['conv7'] = GLUBlock(4,
                                       1024,
                                       2048,
                                       dropout=args.dropout_hidden)
            last_dim = 2048

        elif model_size == '14B':
            blocks['conv1'] = GLUBlock(5,
                                       args.emb_dim,
                                       512,
                                       dropout=args.dropout_hidden)
            for i in range(1, 4, 1):
                blocks['conv2-%d' % i] = GLUBlock(5,
                                                  512,
                                                  512,
                                                  bottlececk_dim=128,
                                                  dropout=args.dropout_hidden)
            for i in range(1, 4, 1):
                blocks['conv3-%d' % i] = GLUBlock(5,
                                                  512 if i == 1 else 1024,
                                                  1024,
                                                  bottlececk_dim=512,
                                                  dropout=args.dropout_hidden)
            for i in range(1, 7, 1):
                blocks['conv4-%d' % i] = GLUBlock(5,
                                                  1024 if i == 1 else 2048,
                                                  2048,
                                                  bottlececk_dim=1024,
                                                  dropout=args.dropout_hidden)
            blocks['conv5'] = GLUBlock(5,
                                       2048,
                                       4096,
                                       bottlececk_dim=1024,
                                       dropout=args.dropout_hidden)
            last_dim = 4096

        else:
            raise NotImplementedError(model_size)

        self.blocks = nn.Sequential(blocks)

        if args.adaptive_softmax:
            self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                last_dim,
                self.vocab,
                # cutoffs=[self.vocab // 10, 3 * self.vocab // 10],
                cutoffs=[self.vocab // 25, self.vocab // 5],
                div_value=4.0)
            self.output = None
        else:
            self.adaptive_softmax = None
            self.output = LinearND(last_dim,
                                   self.vocab,
                                   dropout=args.dropout_out)
            # NOTE: include bias even when tying weights

            # Optionally tie weights as in:
            # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
            # https://arxiv.org/abs/1608.05859
            # and
            # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
            # https://arxiv.org/abs/1611.01462
            if args.tie_embedding:
                if args.n_units != args.emb_dim:
                    raise ValueError(
                        'When using the tied flag, n_units must be equal to emb_dim.'
                    )
                self.output.fc.weight = self.embed.embed.weight

        # Initialize parameters
        self.reset_parameters(args.param_init)
Example #30
    def __init__(self, args, save_path=None):

        super(LMBase, self).__init__()
        logger.info(self.__class__.__name__)

        self.lm_type = args.lm_type
        self.save_path = save_path

        self.d_model = args.transformer_d_model
        self.n_layers = args.n_layers
        self.n_heads = args.transformer_n_heads
        self.lsm_prob = args.lsm_prob

        if args.mem_len > 0:
            self.mem_len = args.mem_len
        else:
            self.mem_len = args.bptt
        if args.recog_mem_len > 0:
            self.mem_len = args.recog_mem_len

        self.vocab = args.vocab
        self.eos = 2
        self.pad = 3
        # NOTE: reserved in advance

        # for cache
        self.cache_theta = 0.2  # smoothing parameter
        self.cache_lambda = 0.2  # cache weight
        self.cache_ids = []
        self.cache_keys = []
        self.cache_attn = []
        self.embed_cache = None

        # positional embedding
        self.pos_emb = XLPositionalEmbedding(self.d_model, args.dropout_in)
        self.u_bias = nn.Parameter(torch.Tensor(self.n_heads, self.d_model // self.n_heads))
        self.v_bias = nn.Parameter(torch.Tensor(self.n_heads, self.d_model // self.n_heads))
        # NOTE: u_bias and v_bias are global parameters

        self.embed = nn.Embedding(self.vocab, self.d_model, padding_idx=self.pad)
        self.scale = math.sqrt(self.d_model)  # for token embedding
        self.dropout_emb = nn.Dropout(p=args.dropout_in)  # for token embedding
        self.layers = nn.ModuleList([copy.deepcopy(TransformerDecoderBlock(
            self.d_model, args.transformer_d_ff, 'scaled_dot',
            self.n_heads, args.dropout_hidden, args.dropout_att, args.dropout_layer,
            args.transformer_layer_norm_eps, args.transformer_ffn_activation, args.transformer_param_init,
            src_tgt_attention=False, memory_transformer=True)) for lth in range(self.n_layers)])
        self.norm_out = nn.LayerNorm(self.d_model, eps=args.transformer_layer_norm_eps)

        self.adaptive_softmax = None
        self.output = None
        if args.adaptive_softmax:
            self.adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
                self.d_model, self.vocab,
                cutoffs=[round(self.vocab / 15), 3 * round(self.vocab / 15)],
                # cutoffs=[self.vocab // 25, 3 * self.vocab // 5],
                div_value=4.0)
        else:
            self.output = nn.Linear(self.d_model, self.vocab)
            if args.tie_embedding:
                self.output.weight = self.embed.weight

        self.reset_parameters()
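A recurring pattern across these examples is deriving the cutoffs from the vocabulary size, most often [round(vocab / 15), 3 * round(vocab / 15)] or [vocab // 25, vocab // 5]. As a quick illustration (a hypothetical 30,000-token vocabulary, not taken from any example above), the two recipes yield:

vocab = 30000
print([round(vocab / 15), 3 * round(vocab / 15)])  # [2000, 6000]
print([vocab // 25, vocab // 5])                   # [1200, 6000]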