def forward(self, input, hidden, return_h=False, return_prob=False):
    batch_size = input.size(1)

    # Embed the tokens, with embedding dropout applied at training time only.
    emb = embedded_dropout(self.encoder, input,
                           dropout=self.dropoute if (self.training and self.use_dropout) else 0)
    emb = self.lockdrop(emb, self.dropouti if self.use_dropout else 0)

    # Run the stacked RNN layers, applying locked dropout between layers.
    raw_output = emb
    new_hidden = []
    raw_outputs = []
    outputs = []
    for l, rnn in enumerate(self.rnns):
        raw_output, new_h = rnn(raw_output, hidden[l])
        new_hidden.append(new_h)
        raw_outputs.append(raw_output)
        if l != self.nlayers - 1:
            raw_output = self.lockdrop(raw_output,
                                       self.dropouth if self.use_dropout else 0)
            outputs.append(raw_output)
    hidden = new_hidden

    output = self.lockdrop(raw_output, self.dropout if self.use_dropout else 0)
    outputs.append(output)

    # Mixture of softmaxes: project to the latent space, decode per-expert
    # logits, and mix the per-expert softmaxes with the prior weights.
    latent = self.latent(output)
    latent = self.lockdrop(latent, self.dropoutl if self.use_dropout else 0)
    logit = self.decoder(latent.view(-1, self.ninp))

    prior_logit = self.prior(output).contiguous().view(-1, self.n_experts)
    prior = nn.functional.softmax(prior_logit, -1)
    prob = nn.functional.softmax(logit.view(-1, self.ntoken), -1).view(
        -1, self.n_experts, self.ntoken)
    prob = (prob * prior.unsqueeze(2).expand_as(prob)).sum(1)

    if return_prob:
        model_output = prob
    else:
        # A small epsilon keeps log() finite when an expert assigns zero mass.
        log_prob = torch.log(prob.add_(1e-8))
        model_output = log_prob

    model_output = model_output.view(-1, batch_size, self.ntoken)

    if return_h:
        return model_output, hidden, raw_outputs, outputs
    return model_output, hidden
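# A minimal, self-contained sketch of the mixture-of-softmaxes step above with
# made-up dimensions; the toy_* names are illustrative, not part of the model.
import torch
import torch.nn.functional as F

toy_batch, toy_experts, toy_tokens = 2, 3, 5
toy_logit = torch.randn(toy_batch * toy_experts, toy_tokens)  # per-expert logits
toy_prior_logit = torch.randn(toy_batch, toy_experts)         # mixture logits

toy_prior = F.softmax(toy_prior_logit, -1)
toy_prob = F.softmax(toy_logit, -1).view(-1, toy_experts, toy_tokens)
toy_prob = (toy_prob * toy_prior.unsqueeze(2).expand_as(toy_prob)).sum(1)
assert torch.allclose(toy_prob.sum(-1), torch.ones(toy_batch))  # still a distribution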
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode, which disables dropout.
    if args.model == 'QRNN':
        rnn.reset()
    rnn.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = rnn.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = rnn(data, hidden)
        output_flat = output.view(-1, ntokens)
        # Weight each batch's loss by its length so the result averages per token.
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
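# repackage_hidden (used above and in train()) is assumed to detach the hidden
# state from the previous graph; a common sketch of such a helper:
import torch

def repackage_hidden_sketch(h):
    # Detach a tensor, or recurse into a tuple of hidden states (e.g. LSTM (h, c)).
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden_sketch(v) for v in h)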
def forward_test(self, inp, hprev):
    # Unroll the recurrence over the input sequence, starting from hprev.
    hp = hprev.clone().view(1, -1)
    rec_net = rnn()
    for t in range(inp.shape[0]):
        hp = rec_net.forward(hp, inp[t].view(1, -1), self.Bh, self.Whh, self.Wxh)
    # Project the final hidden state onto the output classes.
    output = hp.mm(self.Why)
    return output
def __init__(self, nLayers, H, B, D, isTrain):
    self.layer = []
    self.nLayers = nLayers   # number of layers
    self.hidden_dim = H      # hidden-layer dimension
    self.batch_size = B      # batch size
    self.input_dim = D       # word-vector size (number of unique tokens)
    self.out = 2             # number of output classes
    self.isTrain = isTrain
    # Small random initialization of the recurrent weights and biases.
    self.Wxh = torch.randn([self.input_dim, self.hidden_dim]).double() * 0.1
    self.Whh = torch.randn([self.hidden_dim, self.hidden_dim]).double() * 0.1
    self.Bh = torch.randn(self.hidden_dim).double() * 0.1
    self.By = torch.randn(self.out).double() * 0.1
    self.Why = torch.randn([self.hidden_dim, self.out]).double() * 0.1
    for t in range(nLayers):
        self.addlayer(rnn())
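# A hedged sketch of the single recurrence step these weights parameterize;
# the rnn.forward signature is inferred from forward_test above, not confirmed.
import torch

def rnn_step_sketch(h_prev, x_t, Bh, Whh, Wxh):
    # Vanilla-RNN update: h_t = tanh(x_t @ Wxh + h_{t-1} @ Whh + b_h)
    return torch.tanh(x_t.mm(Wxh) + h_prev.mm(Whh) + Bh)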
def fetch_batch(batch_size, seq_length, X):
    # Sample batch_size random subsequences of length seq_length from X.
    # random_ix: assumed here to be a random permutation of valid start offsets.
    random_ix = np.random.permutation(len(X) - seq_length - 1)
    X_batch = []
    y_batch = []
    for i in random_ix[:batch_size]:
        X_batch.append(np.asarray(X[i:i+seq_length].reshape(-1, 1)))
        # Targets are the inputs shifted one step ahead.
        y_batch.append(X[i+1:i+seq_length+1].reshape(-1, 1))
    X_batch, y_batch = np.asarray(X_batch), np.asarray(y_batch)
    # Reorder to (seq_length, batch_size, features).
    X_batch, y_batch = np.transpose(X_batch, (1, 0, 2)), np.transpose(y_batch, (1, 0, 2))
    return X_batch, y_batch

train_size = 4000
# Noisy sine-like curve; the test series samples the same curve at offset points.
X_train = np.array([((i/10.)*np.sin(i/10.) + 6*np.sin(5*(i/10.)))/48
                    for i in range(train_size)])
X_test = np.array([(((i+0.5)/10.)*np.sin((i+0.5)/10.) + 6*np.sin(5*((i+0.5)/10.)))/48
                   for i in range(train_size)])

n_epochs = 10
seq_length = 32
n_units = 16
learning_rate = 0.001
batch_size = 1000
n_batches = int(train_size / batch_size)

network = rnn(n_units=n_units, X_length=1, y_length=1)

# train
for epoch in range(n_epochs):
    for batch in range(n_batches):
        learning_rate *= 0.94  # decay the learning rate every batch
        X_batch, y_batch = fetch_batch(batch_size, seq_length, X_train)
        X_test_batch, y_test_batch = fetch_batch(batch_size, seq_length, X_test)
        print("train loss: {}".format(network.loss(X_batch, y_batch)))
        print("test loss : {}".format(network.loss(X_test_batch, y_test_batch)))
        network.fit(X_batch, y_batch, learning_rate)
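# Quick shape sanity check for fetch_batch (toy sizes, using X_train above):
Xb, yb = fetch_batch(4, 8, X_train)
assert Xb.shape == (8, 4, 1) and yb.shape == (8, 4, 1)  # (seq, batch, feature)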
def forward(self, *hidden, input=None, return_h=False, return_prob=False,
            return_student_distill_loss=False, average_ensemble=False,
            enable_rnd_distill=False, enable_rnd_tune=False,
            flatten_returned_lists=False):
    batch_size = input.size(1)

    if self.rnn_type == "lstm" or self.rnn_type == "sru":
        # The hidden state arrives flattened; rearrange it into (h, c) tuples.
        rearranged_hidden = []
        for i in range(0, len(hidden), 2):
            rearranged_hidden.append((hidden[i], hidden[i+1]))
        hidden = rearranged_hidden

    emb = embedded_dropout(self.encoder, input,
                           dropout=self.dropoute if (self.training and self.use_dropout) else 0)
    emb = self.lockdrop(emb, self.dropouti if self.use_dropout else 0)

    raw_output = emb
    new_hidden = []
    raw_outputs = []
    outputs = []
    distill_loss_acc = ([torch.tensor(0.0).to(input.device)]
                        if return_student_distill_loss else None)

    for l, rnn in enumerate(self.rnns):
        state_post_proc = None
        assert not (enable_rnd_distill and enable_rnd_tune), \
            "enable_rnd_distill and enable_rnd_tune can't be enabled at the same time"
        if enable_rnd_distill:
            state_post_proc = self.rnd_models[l].get_rnd_distill_loss_proc(distill_loss_acc)
        if enable_rnd_tune:
            state_post_proc = self.rnd_models[l].get_rnd_scale_proc(distill_loss_acc)

        current_input = raw_output
        if self.ndistilstudents == 0:
            raw_output, new_h = rnn(current_input, hidden[l],
                                    distill_loss_acc=distill_loss_acc,
                                    state_post_proc=state_post_proc)
        else:
            raw_output, new_h = rnn(current_input, hidden[l],
                                    distill_loss_acc=distill_loss_acc,
                                    average_ensemble=average_ensemble,
                                    state_post_proc=state_post_proc)
        new_hidden.append(new_h)
        raw_outputs.append(raw_output)
        if l != self.nlayers - 1:
            raw_output = self.lockdrop(raw_output,
                                       self.dropouth if self.use_dropout else 0)
            outputs.append(raw_output)
    hidden = new_hidden

    output = self.lockdrop(raw_output, self.dropout if self.use_dropout else 0)
    outputs.append(output)

    # Mixture of softmaxes, as in the first forward() above, with an extra
    # learned gain on the decoder input.
    latent = self.latent(output)
    latent = self.lockdrop(latent, self.dropoutl if self.use_dropout else 0)
    logit = self.decoder(latent.view(-1, self.ninp) * self.decoder_gain)

    prior_logit = self.prior(output).contiguous().view(-1, self.n_experts)
    prior = nn.functional.softmax(prior_logit, -1)
    prob = nn.functional.softmax(logit.view(-1, self.ntoken), -1).view(
        -1, self.n_experts, self.ntoken)
    prob = (prob * prior.unsqueeze(2).expand_as(prob)).sum(1)

    if return_prob:
        model_output = prob
    else:
        log_prob = torch.log(prob.add_(1e-8))
        model_output = log_prob

    model_output = model_output.view(-1, batch_size, self.ntoken)

    # Assemble the return tuple based on the requested extras.
    rv = (model_output, hidden)
    if return_h:
        rv = rv + (raw_outputs, outputs)
    if return_student_distill_loss:
        rv = rv + (distill_loss_acc[0].reshape([1, 1]),)
    if flatten_returned_lists:
        # Splice list elements inline so the result is a flat sequence of tensors.
        new_rv = []
        for e in rv:
            if isinstance(e, list):
                new_rv.extend(e)
            else:
                new_rv.append(e)
        rv = new_rv
    return rv
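# Sketch of the flat-to-paired hidden-state rearrangement done at the top of
# forward() for LSTM/SRU, shown on plain strings (names illustrative only):
flat = ['h0', 'c0', 'h1', 'c1']
paired = [(flat[i], flat[i + 1]) for i in range(0, len(flat), 2)]
assert paired == [('h0', 'c0'), ('h1', 'c1')]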
def train():
    # Turn on training mode, which enables dropout.
    if args.model == 'QRNN':
        rnn.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = rnn.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        # Vary the BPTT window: usually args.bptt, occasionally half of it.
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths.
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a small chance of a very long sequence length, which could OOM.
        seq_len = min(seq_len, args.bptt + 10)

        # Rescale the learning rate in proportion to the sampled sequence length.
        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        rnn.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        # Starting each batch, we detach the hidden state from how it was
        # previously produced. Otherwise, the model would try backpropagating
        # all the way to the start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = rnn(data, hidden, return_h=True)
        raw_loss = criterion(output.view(-1, ntokens), targets)

        loss = raw_loss
        # Activation Regularization: penalize large activations of the last layer.
        loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                          for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness): penalize large changes
        # between consecutive hidden states.
        loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                          for rnn_h in rnn_hs[-1:])
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs/LSTMs.
        torch.nn.utils.clip_grad_norm(rnn.parameters(), args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt,
                      optimizer.param_groups[0]['lr'],
                      elapsed * 1000 / args.log_interval,
                      cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        batch += 1
        i += seq_len
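# A minimal sketch of the AR/TAR penalties computed in train(), on a toy
# activation tensor; the alpha/beta values here are illustrative, not defaults.
import torch

toy_h = torch.randn(7, 2, 4)                          # (seq_len, batch, hidden)
ar = 2.0 * toy_h.pow(2).mean()                        # Activation Regularization
tar = 1.0 * (toy_h[1:] - toy_h[:-1]).pow(2).mean()    # Temporal AR (slowness)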
if torch.cuda.is_available():
    rnn.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(rnn.parameters(), lr=learning_rate)

# Re-train the network but don't update zero-weights (by setting the
# corresponding gradients to zero).
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        t0 = time()
        images = to_var(images.view(-1, sequence_length, input_size))
        labels = to_var(labels)

        # Forward + backward + optimize.
        optimizer.zero_grad()
        outputs = rnn(images)
        loss = criterion(outputs, labels)
        loss.backward()

        # Zero out the gradients of the pruned connections.
        # for l, p in enumerate(rnn.parameters()):
        #     pruned_inds = pruned_inds_by_layer[l]
        #     if type(pruned_inds) is not str:
        #         p.grad.data[pruned_inds] = 0.

        optimizer.step()
        losses.append(loss.data[0])

        if (i + 1) % 100 == 0:
            accuracy = compute_accuracy(rnn, sequence_length, input_size,
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import numpy.linalg as alg
from nnutils import *
from rnn import *

if __name__ == "__main__":
    # Recurrent (Hopfield-style) network set up to work toward solving A x = b.
    A = np.array([[1, -1, 1, -1],
                  [2, 1, -2, 1],
                  [-1, -1, -2, 1],
                  [1, -2, 1, 1]])
    b = np.matrix([0, -1, -3, -1]).T
    nn = rnn(4, 1, 0)

    # Symmetric weight matrix and per-unit thresholds.
    W = np.zeros([4, 4])
    W[0, 3] = -6
    W[3, 0] = -6
    W[1, 2] = 6
    W[2, 1] = 6
    W[1, 3] = -2
    W[3, 1] = -2
    t = np.array([7, -1, -4, 2])
    x = np.matrix([0, 1, 0, 1]).T

    nn.setWeight(W)
    nn.setThreshold(t)
    nn.setValue(x)
    for i in range(0, 20):
        nn.printValue()
        x_now = nn.getValue()
        # Residual of the linear system at the current state.
        axb = np.dot(A, x_now) - b
d["t"] = t d["W"] = W return d if __name__ == "__main__": n = 4 n2 = n * n d = expandEnergy(energyNHP, n2) a = 100 nhp = rnn(n2, a, 0) nhp.setThreshold(d["t"]) nhp.setWeight(d["W"]) num_loop = 500 solutions = [0] * num_loop subs = [0] * num_loop count = 0 update_until_end = [0] * num_loop for i in range(0, num_loop): nhp.setValue(randomBinaryVec(n2)) for j in range(0, num_loop): nhp.update()
import numpy as np
from rnn import *

np.random.seed(0)

X_length = 3
seq_length = 16
y_length = 3
num_units = 10
batch_size = 20

X_seq = np.random.normal(size=[seq_length, batch_size, X_length])
y_seq = np.random.normal(size=[seq_length, batch_size, y_length])

net = rnn(num_units, X_length, y_length)
net.grad_check(X_seq, y_seq)
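# grad_check is the repo's own routine; a generic central-difference check of
# an analytic gradient looks roughly like this (a sketch, not the repo's API):
import numpy as np

def numeric_grad_sketch(f, w, eps=1e-5):
    # Perturb each weight in turn and compare (f(w+eps) - f(w-eps)) / (2*eps).
    g = np.zeros_like(w)
    it = np.nditer(w, flags=['multi_index'])
    while not it.finished:
        ix = it.multi_index
        old = w[ix]
        w[ix] = old + eps
        fp = f(w)
        w[ix] = old - eps
        fm = f(w)
        w[ix] = old
        g[ix] = (fp - fm) / (2 * eps)
        it.iternext()
    return g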