def run_epoch(model, data, is_train=False, lr=1.0):
    """Runs the model on the given data."""
    if is_train:
        model.train()
    else:
        model.eval()
    epoch_size = ((len(data) // model.batch_size) - 1) // model.num_steps
    start_time = time.time()
    hidden = model.init_hidden()
    costs = 0.0
    iters = 0
    for step, (x, y) in enumerate(
            reader.ptb_iterator(data, model.batch_size, model.num_steps)):
        inputs = Variable(
            torch.from_numpy(x.astype(np.int64)).transpose(0, 1).contiguous()).cuda()
        model.zero_grad()
        hidden = repackage_hidden(hidden)
        outputs, hidden = model(inputs, hidden)
        targets = Variable(
            torch.from_numpy(y.astype(np.int64)).transpose(0, 1).contiguous()).cuda()
        tt = torch.squeeze(targets.view(-1, model.batch_size * model.num_steps))
        loss = criterion(outputs.view(-1, model.vocab_size), tt)
        costs += loss.data[0] * model.num_steps
        iters += model.num_steps
        if is_train:
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), 0.25)
            for p in model.parameters():
                p.data.add_(-lr, p.grad.data)
            if step % (epoch_size // 10) == 10:
                print("{} perplexity: {:8.2f} speed: {} wps".format(
                    step * 1.0 / epoch_size, np.exp(costs / iters),
                    iters * model.batch_size / (time.time() - start_time)))
    return np.exp(costs / iters)
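# Both run_epoch variants above and below rely on a `repackage_hidden` helper and a
# module-level `criterion` that are not shown in these snippets. A minimal sketch of what
# they are assumed to look like: `repackage_hidden` detaches the recurrent state so gradients
# stop at batch boundaries, and `criterion` is taken to be cross-entropy (as the comment in
# the next variant suggests). Treat this as an assumption, not the original definitions.

import torch
import torch.nn as nn


def repackage_hidden(h):
    """Detach hidden state(s) from their computation history (tensor or nested tuple)."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)


criterion = nn.CrossEntropyLoss()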
def run_epoch(model, data, is_train=False, lr=1.0, device=torch.device('cpu')):
    """Runs the model on the given data."""
    if is_train:
        model.train()
    else:
        model.eval()
    epoch_size = ((len(data) // model.batch_size) - 1) // model.num_steps
    start_time = time.time()
    hidden = model.init_hidden()
    costs = 0.0
    iters = 0
    for step, (x, y) in enumerate(
            reader.ptb_iterator(data, model.batch_size, model.num_steps, model.direction)):
        # x is the input batch of word ids; y is the expected (shifted-by-one) output batch.
        inputs = Variable(
            torch.from_numpy(x.astype(np.int64)).transpose(0, 1).contiguous()).to(device)
        model.zero_grad()
        hidden = repackage_hidden(hidden)
        outputs, hidden = model.forward(inputs=inputs, hidden=hidden)
        # Transpose the target words and move them onto the chosen device.
        targets = Variable(
            torch.from_numpy(y.astype(np.int64)).transpose(0, 1).contiguous()).to(device)
        tt = torch.squeeze(targets.view(-1, model.batch_size * model.num_steps))
        # Cross-entropy loss over the flattened (num_steps * batch_size) predictions.
        loss = criterion(outputs.view(-1, model.vocab_size), tt)
        # Accumulate the running loss; loss.item() replaces the old loss.data[0] indexing.
        costs += loss.item() * model.num_steps
        iters += model.num_steps
        if is_train:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
            for p in model.parameters():
                p.data.add_(p.grad.data, alpha=-lr)
            if step % (epoch_size // 10) == 10:
                print("{} perplexity: {:8.2f} speed: {} wps".format(
                    step * 1.0 / epoch_size, np.exp(costs / iters),
                    iters * model.batch_size / (time.time() - start_time)))
    return np.exp(costs / iters)
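# A hedged sketch of a driver loop for the device-aware variant above. The annealing schedule
# and the names train_data, valid_data, base_lr, lr_decay, decay_start, and num_epochs are
# illustrative placeholders; only run_epoch itself comes from the code above, and the data is
# assumed to be the word-id sequences produced elsewhere in this codebase.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

base_lr, lr_decay, decay_start, num_epochs = 20.0, 1.25, 6, 40  # illustrative hyperparameters
for epoch in range(num_epochs):
    lr = base_lr / (lr_decay ** max(epoch - decay_start, 0))  # decay the manual SGD step size
    train_ppl = run_epoch(model, train_data, is_train=True, lr=lr, device=device)
    valid_ppl = run_epoch(model, valid_data, device=device)
    print("epoch {:2d}  train ppl {:8.2f}  valid ppl {:8.2f}".format(epoch, train_ppl, valid_ppl))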
def run_prediction(model, my_words, inputs, device=torch.device('cpu')):
    """
    Runs prediction on a single query sequence.

    Returns the predicted word, top hidden layer and top cell state.

    :param model: Container for the LSTM models
    :param my_words: Container for converting between words and word ids
    :param inputs: A single query sequence as a list of word ids
    :param device: Device to run on (CPU or CUDA GPU)
    :return: last_word: string: the predicted next word
             h_f: torch tensor: the last top-level hidden state
             c_f: torch tensor: the last top-level cell state (used as the sequence vectorization)
    """
    working = np.array(inputs)
    if model.direction == 'backward':
        working = np.flip(working)
    working = working.reshape(1, -1)
    working = Variable(
        torch.from_numpy(working.astype(np.int64)).transpose(0, 1).contiguous()).to(device)
    model.eval()
    num_steps = len(inputs)
    batch_size = 1
    hidden = model.init_hidden(batch_size=batch_size)
    hidden = repackage_hidden(hidden)
    # output, (h_n, c_n), where h_n and c_n are (num_layers * num_directions, batch, hidden_size)
    output, (h_n, c_n) = model.forward(inputs=working, hidden=hidden,
                                       num_steps=num_steps, batch_size=batch_size)
    # Reshape to (num_layers, num_directions, batch, hidden_size),
    # then transpose to (num_layers, batch, num_directions, hidden_size).
    h_n = h_n.view(model.num_layers, model.num_directions, batch_size, model.hidden_dim)
    h_n = torch.transpose(h_n, 1, 2)
    c_n = c_n.view(model.num_layers, model.num_directions, batch_size, model.hidden_dim)
    c_n = torch.transpose(c_n, 1, 2)
    # Greedy decode: take the arg-max word id at the last time step.
    last_word = output[-1, 0]
    last_word = torch.argmax(last_word).tolist()
    if not isinstance(last_word, str):
        last_word = [last_word]
    last_word = my_words.word_ids_to_words(last_word)[0]  # converter returns a list; take its only entry
    # Pull out the final top-layer hidden and cell states.
    h_f = h_n[model.lstm.num_layers - 1].view(-1)
    c_f = c_n[model.lstm.num_layers - 1].view(-1)
    return last_word, h_f, c_f
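# Example call for run_prediction, heavily hedged: `words_to_word_ids` is a hypothetical
# inverse of the `word_ids_to_words` method used above, and model / my_words / device are
# assumed to be constructed elsewhere in the codebase.

query_ids = my_words.words_to_word_ids("the quick brown fox".split())  # hypothetical helper
next_word, h_f, c_f = run_prediction(model, my_words, query_ids, device=device)
print("predicted next word:", next_word)
print("sequence vector (top-layer cell state) shape:", c_f.shape)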
def run_epoch(model, data, is_train=False, lr=1.0, prt_out=False):
    """Runs the model on the given data."""
    # Note: this variant always trains (model.train() and backward are unconditional);
    # the is_train argument is accepted for interface compatibility but never used.
    model.train()
    epoch_size = ((len(data) // model.batch_size) - 1) // model.num_steps
    start_time = time.time()
    hidden = model.init_hidden()
    costs = 0.0
    iters = 0
    for step, (x, y) in enumerate(train_iter):
        if len(x) < num_steps:
            continue
        inputs = Variable(x.contiguous()).cuda()
        # print("inputs", inputs)
        targets = Variable(y.contiguous()).cuda()
        # print("targets", targets.size())
        model.zero_grad()
        hidden = repackage_hidden(hidden)
        outputs, hidden = model(inputs, hidden)
        tt = torch.squeeze(targets.view(-1, model.batch_size * model.num_steps))
        if prt_out:
            # Print the arg-max word at every position as a rough sanity check.
            for o in outputs:
                for w in o:
                    val, idx = torch.max(w, 0)
                    print(TEXT.vocab.itos[idx.data[0]], end=" ")
                print()
        loss = criterion(outputs.view(-1, model.vocab_size), tt)
        costs += loss.data[0] * model.num_steps
        iters += model.num_steps
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.25)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)
        if step % 30 == 0:
            print("{} perplexity: {:8.2f} speed: {} wps".format(
                step * 1.0 / epoch_size, np.exp(costs / iters),
                iters * model.batch_size / (time.time() - start_time)))
    return np.exp(costs / iters)
def run_epoch(model, data, optimizer, is_train=False):
    """Runs one epoch on the given data."""
    if is_train:
        model.train()
    else:
        model.eval()
    epoch_size = ((len(data) // model.batch_size) - 1) // model.num_steps
    start_time = time.time()
    hidden = model.init_hidden()
    costs = 0.0
    iters = 0
    data_iterator = reader.ptb_iterator(data, model.batch_size, model.num_steps)
    for step, (x, y) in enumerate(data_iterator):
        inputs = Variable(
            torch.from_numpy(x.astype(np.int64)).transpose(0, 1).contiguous()).cuda()
        targets = Variable(
            torch.from_numpy(y.astype(np.int64)).transpose(0, 1).contiguous()).cuda()
        tt = torch.squeeze(targets.view(-1, model.batch_size * model.num_steps))
        optimizer.zero_grad()
        hidden = repackage_hidden(hidden)
        outputs, hidden = model(inputs, hidden)
        loss = criterion(outputs.view(-1, model.vocab_size), tt)
        costs += loss.data[0] * model.num_steps
        iters += model.num_steps
        # Report perplexity for PTB, bits-per-character otherwise (1.4427 ~= 1/ln 2 converts nats to bits).
        metric = "perplexity" if args.data_set == "ptb" else "bpc"
        perf = np.exp(costs / iters) if args.data_set == "ptb" else 1.4427 * (costs / iters)
        if is_train:
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), args.grad_clip)
            optimizer.step()
            if step % (epoch_size // 10) == 10:
                wps = iters * model.batch_size / (time.time() - start_time)
                print("{} : {} {:8.4f} speed: {} wps".format(
                    step * 1.0 / epoch_size, metric, perf, wps))
    return perf
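# The variant above delegates the parameter update to a torch.optim optimizer instead of the
# manual `p.data.add_` SGD step used elsewhere. A hedged wiring sketch; the learning rate is
# illustrative, and only args.grad_clip / args.data_set are actually read by the function.

import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1.0)      # plain SGD mirrors the manual update
# optimizer = optim.Adam(model.parameters(), lr=1e-3)  # any torch.optim optimizer would fit
train_perf = run_epoch(model, train_data, optimizer, is_train=True)
valid_perf = run_epoch(model, valid_data, optimizer)   # optimizer.step() is skipped when not training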
def run_epoch(model, data, is_train=False, lr=1.0):
    """Runs the model on the given data."""
    if is_train:
        model.train()
    else:
        model.eval()
    epoch_size = ((len(data) // model.batch_size) - 1) // model.num_steps
    start_time = time.time()
    hidden = model.init_hidden()
    costs = 0.0
    iters = 0
    for step, (x, y) in enumerate(
            reader.ptb_iterator(data, model.batch_size, model.num_steps)):
        inputs = Variable(
            torch.from_numpy(x.astype(np.int64)).transpose(0, 1).contiguous()).cuda()
        model.zero_grad()
        hidden = repackage_hidden(hidden)
        num_steps_time, bs = inputs.size()
        indices = np.random.permutation(bs)
        targets = Variable(
            torch.from_numpy(y.astype(np.int64)).transpose(0, 1).contiguous()).cuda()
        if is_train:
            # Sample the mixup coefficient from a Beta(alpha, alpha) distribution.
            #alpha = 0.1
            lam = np.random.beta(args.mixup_alpha, args.mixup_alpha)
            #lam = np.random.uniform(0.95, 1.0)
            lam = Variable(
                torch.from_numpy(np.array([lam]).astype('float32')).cuda())
            # Build the shuffled targets that pair with the shuffled batch entries.
            targets = targets.permute(1, 0)
            target_shuffled = targets[indices]
            targets = targets.permute(1, 0).contiguous()
            target_shuffled = target_shuffled.permute(1, 0).contiguous()
            tt_shuffled = torch.squeeze(
                target_shuffled.view(-1, model.batch_size * model.num_steps))
        targets = Variable(
            torch.from_numpy(y.astype(np.int64)).transpose(0, 1).contiguous()).cuda()
        tt = torch.squeeze(targets.view(-1, model.batch_size * model.num_steps))
        if is_train:
            # Mixup: interpolate the loss between the original and shuffled targets.
            outputs, hidden = model(inputs, hidden, is_train, indices, lam)
            loss = (lam * criterion(outputs.view(-1, model.vocab_size), tt) +
                    (1 - lam) * criterion(outputs.view(-1, model.vocab_size), tt_shuffled))
        else:
            outputs, hidden = model(inputs, hidden, False, None, None)
            loss = criterion(outputs.view(-1, model.vocab_size), tt)
        costs += loss.data[0] * model.num_steps
        iters += model.num_steps
        if is_train:
            loss.backward()
            torch.nn.utils.clip_grad_norm(model.parameters(), 0.25)
            for p in model.parameters():
                p.data.add_(-lr, p.grad.data)
            if step % (epoch_size // 10) == 10:
                print("{} perplexity: {:8.2f} speed: {} wps".format(
                    step * 1.0 / epoch_size, np.exp(costs / iters),
                    iters * model.batch_size / (time.time() - start_time)))
    return np.exp(costs / iters)
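# The mixup variant above hands `indices` and `lam` to the model, whose code is not shown here.
# A hedged illustration of the interpolation the model is assumed to perform on its word
# embeddings (shaped (num_steps, batch, emb_dim)) before the LSTM; this sketches the mixup
# idea rather than reproducing the original model's forward pass.

def mix_embeddings(emb, indices, lam):
    """Convexly combine each sequence's embeddings with those of a shuffled batch partner."""
    emb_shuffled = emb[:, indices, :]              # pair every sequence with a random other one
    return lam * emb + (1 - lam) * emb_shuffled    # lam near 1 keeps the batch mostly unmixed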