import itertools

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# AdaptiveLogits, AdaptiveSoftmax, and Toy are defined elsewhere in this repo.


def train():
    cutoffs = [10, 20, 300 + 1]
    embeds = nn.Embedding(300, 100)
    adaptive_logits = AdaptiveLogits(embeds, cutoffs)
    adaptive_softmax = AdaptiveSoftmax(adaptive_logits)
    model = Toy(embeds)

    x = torch.randn((3, 100))
    targets = torch.tensor([0, 1, 10])

    optimizer = optim.Adam(
        itertools.chain(
            model.parameters(),
            # adaptive_logits.parameters()))
            embeds.parameters()))

    for i in range(1000):
        optimizer.zero_grad()
        hidden = model(x)
        # Full-softmax baseline: score the hidden states against every embedding vector.
        logits = torch.mm(
            hidden,
            torch.transpose(embeds(torch.arange(300)), 0, 1))
        loss = F.cross_entropy(logits, targets)
        # Adaptive alternative:
        # logits = adaptive_logits(hidden, targets)
        # loss = adaptive_logits.loss(logits, targets)
        loss.backward()
        optimizer.step()

    # print(torch.argmax(adaptive_softmax(hidden), 1))
    print(torch.argmax(F.softmax(logits, 1), 1))
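# `Toy` above is a small model defined elsewhere in this repo. Purely as a
# point of reference, a hypothetical stand-in with a compatible interface
# (maps a (batch, 100) input to a (batch, 100) hidden state) could look like
# the sketch below; the real class may differ.
class ToySketch(nn.Module):
    def __init__(self, embeds):
        super().__init__()
        dim = embeds.embedding_dim
        self.net = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim))

    def forward(self, x):
        return self.net(x)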
def __init__(self,
             d_model,
             num_heads,
             max_position,
             d_ffn,
             num_layers,
             mem_len,
             vocab_size,
             dropout_rate=0.1,
             cutoffs=None,
             proj_factor=4,
             proj_dims=None,
             straight_through=False,
             **kwargs):
    super().__init__(**kwargs)
    assert mem_len >= 0 and max_position > 0

    self.d_model = d_model
    self.mem_len = mem_len
    self.cutoffs = cutoffs
    self.num_layers = num_layers
    self.max_position = max_position

    self.embed = tf.keras.layers.Embedding(vocab_size, d_model)
    if cutoffs:
        self.final_layer = AdaptiveSoftmax(cutoffs, proj_factor, proj_dims)
    else:
        self.final_layer = tf.keras.layers.Dense(vocab_size)
    self.stoch_blks = [
        StochasticBlock(d_model, num_heads, max_position, d_ffn,
                        dropout_rate, straight_through)
        for _ in range(num_layers)
    ]
    self.dropout = tf.keras.layers.Dropout(dropout_rate, name='inp_dropout')
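# The constructor above belongs to a tf.keras model class whose name and call
# method are not shown here. Purely as an illustration (the class name
# `StochasticTransformer` and all hyperparameter values below are assumptions,
# not taken from this repo), constructing it with an adaptive-softmax output
# might look like:
#
# model = StochasticTransformer(
#     d_model=512, num_heads=8, max_position=512, d_ffn=2048,
#     num_layers=6, mem_len=128, vocab_size=32000,
#     cutoffs=[2000, 10000, 32000])  # cutoffs enable AdaptiveSoftmax;
#                                    # cutoffs=None falls back to a Dense layer.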
import math

import torch
import torch.nn as nn

# AdaptiveLogits, AdaptiveSoftmax, and approx_eq are defined elsewhere in this repo.


def test_softmax():
    batch_size = 300
    hidden_size = 200
    vocab_size = 100
    misclassification_error = 0
    best_misclassification_error = 0

    for i in range(10):
        embed_weights = nn.Parameter(torch.Tensor(vocab_size, hidden_size))
        embed_weights.data.normal_(0, 1.0 / math.sqrt(hidden_size))
        vocab = nn.Embedding(vocab_size, hidden_size, _weight=embed_weights)
        cutoffs = [20, 30, vocab_size]
        adaptive_logits = AdaptiveLogits(vocab, cutoffs)
        adaptive_softmax = AdaptiveSoftmax(adaptive_logits)

        targets = torch.randint(low=0, high=vocab_size, size=[batch_size],
                                dtype=torch.long)
        hidden = torch.randn(batch_size, hidden_size)

        # Predictions from random hidden states vs. predictions made from the
        # target words' own embeddings (the best achievable case).
        probs = adaptive_softmax(hidden)
        preds = torch.argmax(probs, dim=1)
        probs_vocab = adaptive_softmax(vocab(targets))
        preds_vocab = torch.argmax(probs_vocab, dim=1)

        # norm(p=0) counts the non-zero entries, i.e. the misclassified targets.
        misclassification_error += (preds - targets).float().norm(p=0)
        best_misclassification_error += (preds_vocab - targets).float().norm(p=0)

        # Each row of probabilities must sum to one.
        assert approx_eq(torch.sum(probs, dim=1), torch.ones(probs.shape[0]))
        assert approx_eq(torch.sum(probs_vocab, dim=1), torch.ones(probs.shape[0]))

    assert best_misclassification_error < misclassification_error
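# `approx_eq` is a test helper defined elsewhere in this repo. A minimal sketch
# of the kind of check it performs (elementwise closeness within a tolerance);
# the actual helper may differ:
def approx_eq_sketch(a, b, tol=1e-4):
    return bool(torch.all(torch.abs(a - b) < tol))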
import torch
import torch.nn as nn
from torch.autograd import Variable

# AdaptiveSoftmax is defined elsewhere in this repo.


class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder.

    Based on official PyTorch examples."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, cutoffs,
                 dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type == 'GRU':
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers,
                                             dropout=dropout)
        else:
            try:
                nonlinearity = {
                    'RNN_TANH': 'tanh',
                    'RNN_RELU': 'relu'
                }[rnn_type]
            except KeyError:
                raise ValueError(
                    """An invalid option for `--model` was supplied,
                    options are ['GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers,
                              nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        if tie_weights:
            if nhid != ninp:
                raise ValueError(
                    'When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers
        self.softmax = AdaptiveSoftmax(nhid, cutoffs)
        self.full = False

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        # Flatten (seq_len, batch, nhid) to (seq_len * batch, nhid).
        output = output.view(output.size(0) * output.size(1), output.size(2))
        if self.full:
            # Evaluation: exact log-probabilities over the full vocabulary.
            decode = self.softmax.log_prob(output)
        else:
            # Training: adaptive-softmax head outputs.
            decode = self.softmax(output)
        return decode, hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())
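# A minimal usage sketch of RNNModel (all hyperparameter values below are
# illustrative assumptions, and AdaptiveSoftmax must be importable from this repo):
if __name__ == '__main__':
    ntoken, ninp, nhid = 10000, 256, 256
    model = RNNModel('GRU', ntoken=ntoken, ninp=ninp, nhid=nhid, nlayers=2,
                     cutoffs=[2000, 8000, ntoken], tie_weights=True)
    hidden = model.init_hidden(bsz=20)
    tokens = torch.randint(0, ntoken, (35, 20))  # (seq_len, batch) token ids
    decode, hidden = model(tokens, hidden)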