def export_onnx(path, batch_size, seq_len):
    print('The model is also exported in ONNX format at {}'.format(os.path.realpath(args.onnx_export)))
    model.eval()
    dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device)
    hidden = model.init_hidden(batch_size)
    torch.onnx.export(model, (dummy_input, hidden), path)
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
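# Note: the loops above and below call repackage_hidden() to detach the recurrent state
# between batches (truncated BPTT). The helper is defined elsewhere in these scripts; a
# minimal sketch of the usual implementation, as in the PyTorch word_language_model
# example, is shown here for reference only:
import torch

def repackage_hidden(h):
    """Wrap hidden states in new Tensors, detaching them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)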
def evaluate(data_source, batch_size=10, window=args.window):
    # Turn on evaluation mode which disables dropout.
    if args.model == 'QRNN':
        model.reset()
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    next_word_history = None
    pointer_history = None
    for i in range(0, data_source.size(0) - 1, args.bptt):
        if i > 0:
            print(i, len(data_source), math.exp(total_loss / i))
        data, targets = get_batch(data_source, i, evaluation=True, args=args)
        output, hidden, rnn_outs, _ = model(data, hidden, return_h=True)
        rnn_out = rnn_outs[-1].squeeze()
        output_flat = output.view(-1, ntokens)
        ###
        # Fill pointer history
        start_idx = len(next_word_history) if next_word_history is not None else 0
        next_word_history = torch.cat([one_hot(t.data[0], ntokens) for t in targets]) if next_word_history is None else torch.cat([next_word_history, torch.cat([one_hot(t.data[0], ntokens) for t in targets])])
        #print(next_word_history)
        pointer_history = Variable(rnn_out.data) if pointer_history is None else torch.cat([pointer_history, Variable(rnn_out.data)], dim=0)
        #print(pointer_history)
        ###
        # Built-in cross entropy
        # total_loss += len(data) * criterion(output_flat, targets).data[0]
        ###
        # Manual cross entropy
        # softmax_output_flat = torch.nn.functional.softmax(output_flat)
        # soft = torch.gather(softmax_output_flat, dim=1, index=targets.view(-1, 1))
        # entropy = -torch.log(soft)
        # total_loss += len(data) * entropy.mean().data[0]
        ###
        # Pointer manual cross entropy
        loss = 0
        softmax_output_flat = torch.nn.functional.softmax(output_flat)
        for idx, vocab_loss in enumerate(softmax_output_flat):
            p = vocab_loss
            if start_idx + idx > window:
                valid_next_word = next_word_history[start_idx + idx - window:start_idx + idx]
                valid_pointer_history = pointer_history[start_idx + idx - window:start_idx + idx]
                logits = torch.mv(valid_pointer_history, rnn_out[idx])
                theta = args.theta
                ptr_attn = torch.nn.functional.softmax(theta * logits).view(-1, 1)
                ptr_dist = (ptr_attn.expand_as(valid_next_word) * valid_next_word).sum(0).squeeze()
                lambdah = args.lambdasm
                p = lambdah * ptr_dist + (1 - lambdah) * vocab_loss
            ###
            target_loss = p[targets[idx].data]
            loss += (-torch.log(target_loss)).data[0]
        total_loss += loss / batch_size
        ###
        hidden = repackage_hidden(hidden)
        next_word_history = next_word_history[-window:]
        pointer_history = pointer_history[-window:]
    return total_loss / len(data_source)
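# The pointer-cache evaluate above also relies on a one_hot() helper defined elsewhere.
# A minimal sketch, assuming it builds a (1, size) float one-hot row for a token index
# (hypothetical reconstruction, not necessarily the exact helper used here):
import numpy as np
import torch
from torch.autograd import Variable

def one_hot(idx, size, cuda=True):
    # One-hot row vector over the vocabulary for token index `idx`.
    a = np.zeros((1, size), np.float32)
    a[0][idx] = 1
    v = Variable(torch.from_numpy(a))
    if cuda:
        v = v.cuda()
    return v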
def train():
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        # seq_len = min(seq_len, args.bptt + 10)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        optimizer.zero_grad()

        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden, return_h=True)
        raw_loss = criterion(model.decoder.weight, model.decoder.bias, output, targets)

        loss = raw_loss
        # Activation Regularization
        if args.alpha:
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
        # Temporal Activation Regularization (slowness)
        if args.beta:
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if args.clip:
            torch.nn.utils.clip_grad_norm_(params, args.clip)
        optimizer.step()

        total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss.item() / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss.item() / len(data_source)
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
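# These loops slice the batchified corpus with get_batch(), whose signature varies across
# the snippets in this file (some take args, evaluation flags, or an explicit seq_len).
# A minimal sketch of the basic bptt-slicing version, assuming a (num_steps x batch_size)
# source tensor and a global args.bptt, along the lines of the upstream PyTorch example:
def get_batch(source, i):
    # Take a chunk of length up to args.bptt starting at row i; targets are the
    # same chunk shifted by one time step, flattened for the loss.
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target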
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        targets = targets.view(-1)
        log_prob, hidden = parallel_model(data, hidden)
        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
        total_loss += len(data) * loss
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
def train(model, train_data, lr):
    # Turn on training mode which enables dropout.
    model.train()
    model.set_mode('train')
    total_loss = 0.
    start_time = time.time()
    ntokens = len(dictionary)
    hidden = model.init_hidden(args.batch_size)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=args.wdecay)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        # gs534 add sentence resetting
        eosidx = dictionary.get_eos()
        if args.loss == 'nce':
            output, hidden = model(data, hidden, eosidx, targets)
            loss = criterion(output)
            loss.backward()
        else:
            output, hidden = model(data, hidden, separate=args.reset, eosidx=eosidx)
            loss = criterion(output.view(-1, ntokens), targets)
            loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        # for p in model.parameters():
        #     p.data.add_(-lr, p.grad.data)
        optimizer.step()

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            logging('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN':
        model.reset()
    loss_measure = AverageMeter()
    acc_measure = AverageMeter()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        loss = criterion(model.decoder.weight, model.decoder.bias, output, targets).data
        loss_measure.update(float(loss), targets.nelement())
        acc = float(accuracy(output.data, targets.data)[0])
        acc_measure.update(acc, targets.nelement())
        hidden = repackage_hidden(hidden)
    return loss_measure.avg, acc_measure.avg
def evaluate(data_source, use_dropout=False, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    if not use_dropout:
        model.eval()
    else:
        model.train()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data
        hidden = repackage_hidden(hidden)
    # turn on eval mode at the end because we expect eval mode
    model.eval()
    return total_loss.item() / len(data_source)
def predict_full_sequence(model, x, input_size, num_steps, pm25_index, scaler):
    predictions = torch.zeros(num_steps)
    hidden = model.init_hidden(1)
    # y_pred, _, hidden = model(x.contiguous().view(-1, 1, input_size), hidden)
    # x = torch.cat((x, y_pred))
    # predictions[0] = y_pred[:, pm25_index]
    if city == 'beijing' or city == 'chengdu' or city == 'shanghai' or city == 'shenyang':
        x = x.unsqueeze(1)
    for i in range(0, num_steps):
        y_pred, _, hidden = model(x.contiguous().view(-1, 1, input_size), hidden)
        # x = x*(min_max_valid[i,1]-min_max_valid[i,0])+min_max_valid[i,0]
        #print(y_pred.shape)
        #print(x.shape)
        x = torch.cat((x, y_pred))
        x = x[1:]
        predictions[i] = torch.FloatTensor(scaler.inverse_transform(np.expand_dims(y_pred[0].data, axis=0))[:, -1])
        # predictions[i] = y_pred[:, pm25_index]
    return predictions
def evaluate(args, model, test_dataset):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    with torch.no_grad():
        total_loss = 0
        hidden = model.init_hidden(args.eval_batch_size)
        nbatch = 1
        for nbatch, i in enumerate(range(0, test_dataset.size(0) - 1, args.bptt)):
            inputSeq, targetSeq = get_batch(args, test_dataset, i)
            # inputSeq: [ seq_len * batch_size * feature_size ]
            # targetSeq: [ seq_len * batch_size * feature_size ]
            hidden_ = model.repackage_hidden(hidden)

            '''Loss1: Free running loss'''
            outVal = inputSeq[0].unsqueeze(0)
            outVals = []
            hids1 = []
            for i in range(inputSeq.size(0)):
                outVal, hidden_, hid = model.forward(outVal, hidden_, return_hiddens=True)
                outVals.append(outVal)
                hids1.append(hid)
            outSeq1 = torch.cat(outVals, dim=0)
            hids1 = torch.cat(hids1, dim=0)
            loss1 = criterion(outSeq1.contiguous().view(args.batch_size, -1),
                              targetSeq.contiguous().view(args.batch_size, -1))

            '''Loss2: Teacher forcing loss'''
            outSeq2, hidden, hids2 = model.forward(inputSeq, hidden, return_hiddens=True)
            loss2 = criterion(outSeq2.contiguous().view(args.batch_size, -1),
                              targetSeq.contiguous().view(args.batch_size, -1))

            '''Loss3: Simplified Professor forcing loss'''
            loss3 = criterion(hids1.view(args.batch_size, -1),
                              hids2.view(args.batch_size, -1).detach())

            '''Total loss = Loss1 + Loss2 + Loss3'''
            loss = loss1 + loss2 + loss3
            total_loss += loss.item()

    return total_loss / (nbatch + 1)
def evaluate_1step_pred(args, model, test_dataset):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    with torch.no_grad():
        hidden = model.init_hidden(args.eval_batch_size)
        for nbatch, i in enumerate(range(0, test_dataset.size(0) - 1, args.bptt)):
            inputSeq, targetSeq = get_batch(args, test_dataset, i)
            outSeq, hidden = model.forward(inputSeq, hidden)

            loss = criterion(outSeq.view(args.batch_size, -1),
                             targetSeq.view(args.batch_size, -1))
            hidden = model.repackage_hidden(hidden)
            total_loss += loss.item()

    return total_loss / nbatch
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i, args)
            targets = targets.view(-1)
            log_prob, hidden = parallel_model(data, hidden)
            loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
            total_loss += len(data) * loss
            hidden = repackage_hidden(hidden)
    return total_loss.item() / len(data_source)
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        targets = targets.view(-1)
        print('data size: {} target size: {}'.format(data.size(), targets.size()))
        log_prob, hidden = parallel_model(data, hidden)
        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
        total_loss += loss * len(data)
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
def evaluate(data_source, h_sp=[0., 0.], h_th=[0., 0.], block=-1):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden, sparse=True, h_sp=h_sp, h_th=h_th, block=block)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / (len(data_source) - 1)
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    if args.model != 'Transformer':
        hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        if args.model == 'Transformer':
            output = model(data)
            output = output.view(-1, ntokens)
        else:
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        if args.dry_run:
            break
def evaluate(data_source, source_sampler, target_sampler, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0
    hidden = model.init_hidden(batch_size)

    for source_sample, target_sample in zip(source_sampler, target_sampler):
        model.train()
        data = Variable(torch.stack([data_source[i] for i in source_sample]), volatile=True)
        targets = Variable(torch.stack([data_source[i] for i in target_sample])).view(-1)
        output, hidden = model(data, hidden)
        total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss[0] / len(data_source)
def train():
    # Turn on training mode which enables dropout.
    print('Load training data')
    model.train()
    if hasattr(model.rnn, 'step_slope'):
        model.rnn.step_slope = step_slope
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(args.batch_size)
    # Shuffle order of talks
    train_data = data_shuffle(datafile_train)
    print('Start training')
    for (data, targets, batch) in data_producer(train_data, args.batch_size, args.bptt,
                                                cuda=args.cuda, use_durs=args.use_durs):
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        optimizer.zero_grad()
        output, hidden = model(data, hidden)
        if args.tier == 'combined':
            loss, loss_phone, loss_word = model.criterion(output, targets)
        else:
            loss = model.criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        # for p in model.parameters():
        #     p.data.add_(-lr, p.grad.data)
        optimizer.step()

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data[0]) // args.batch_size // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    # hidden: 2*10*200
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            # data: 35*10, targets: 350
            data, targets = get_batch(data_source, i)
            # inputs: (35*10, 2*10*200); output = (35, 10, 33278), hidden = (2, 10, 200)
            output, hidden = model(data, hidden)
            # output_flat ==> 350*33278
            output_flat = output.view(-1, ntokens)
            # len(data) = 35, i.e. the sequence length
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / (len(data_source) - 1)
def train():
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size,
                               hidden_init_method=args.initialization["hidden_state"])
    # Materialize the indices as a list so they can be shuffled in place.
    iter_idx = list(range(0, train_data.size(0) - 1, args.sequence_length))
    if args.shuffle:
        np.random.shuffle(iter_idx)
    for batch, i in enumerate(iter_idx):
        data, targets = get_batch(train_data, i)
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        clipped_lr = lr * clip_gradient(model, args.clip)
        for param_group in opt.param_groups:
            param_group['lr'] = clipped_lr
        opt.step()

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            try:
                ppl = math.exp(cur_loss)
            except OverflowError:
                ppl = float('inf')
            logger.info(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // args.sequence_length, lr,
                    elapsed * 1000 / args.log_interval, cur_loss, ppl))
            total_loss = 0
            start_time = time.time()
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        #model.zero_grad()
        probs, hidden = model(data, hidden, targets)
        loss = -torch.mean(torch.log(probs))
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        #for p in model.parameters():
        #    p.data.add_(-lr, p.grad.data)
        optimizer.step()

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | perplexity {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    tot_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it
        # was previously produced. If we didn't, the model would try
        # backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem
        # in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data
        tot_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | '
                  'ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
    add_to_pickle(('train', epoch, tot_loss[0] / train_data.size(0),
                   math.exp(tot_loss[0] / train_data.size(0))))
def train_epoch(model, criterion, train_data, vocab, hps, lr, epoch):
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = vocab.size()
    hidden = model.init_hidden(hps['batch_size'])
    last_log_batch = 0
    for batch, i in enumerate(range(0, train_data.size(0) - 1, hps['bptt'])):
        data, targets = get_batch(train_data, i, hps['bptt'])
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to
        # start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), hps['clip'])
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if (batch % hps['log_interval'] == 0 and batch > 0) or (batch == len(train_data) // hps['bptt']):
            cur_loss = total_loss[0] / (batch - last_log_batch)
            last_log_batch = batch
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:14.8f}'.format(
                epoch, batch, len(train_data) // hps['bptt'], lr,
                elapsed * 1000 / hps['log_interval'], cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def test_evaluate(test_sentences, data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    if args.words:
        print('word sentid sentpos wlen surp entropy')  #,end='')
        if args.guess:
            for i in range(args.guessn):
                print(' guess'+str(i))  #,end='')
                if args.guessscores:
                    print(' gscore'+str(i))  #,end='')
        sys.stdout.write('\n')
    bar = Bar('Processing', max=len(data_source))
    for i in range(len(data_source)):
        sent_ids = data_source[i]
        sent = test_sentences[i]
        if args.cuda:
            sent_ids = sent_ids.cuda()
        if (not args.single) and (torch.cuda.device_count() > 1):
            # "module" is necessary when using DataParallel
            hidden = model.module.init_hidden(1)  # number of parallel sentences being processed
        else:
            hidden = model.init_hidden(1)  # number of parallel sentences being processed
        data, targets = test_get_batch(sent_ids, evaluation=True)
        data = data.unsqueeze(1)  # only needed if there is just a single sentence being processed
        print(data)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, ntokens)
        curr_loss = criterion(output_flat, targets).data
        #curr_loss = len(data) * criterion(output_flat, targets).data  # needed if there is more than a single sentence being processed
        total_loss += curr_loss
        if args.words:
            # output word-level complexity metrics
            get_complexity_apply(output_flat, targets, i)
        else:
            # output sentence-level loss
            print(str(sent)+":"+str(curr_loss[0]))
        hidden = repackage_hidden(hidden)
        bar.next()
    bar.finish()
    return total_loss[0] / len(data_source)
def HLU_score_for_data(model, data_loader, batch_size):
    device = model.device
    nb_batch = len(data_loader.dataset) // batch_size
    if len(data_loader.dataset) % batch_size == 0:
        total_batch = nb_batch
    else:
        total_batch = nb_batch + 1
    print(total_batch)

    list_HLU_score = []
    C = 100
    beta = 5
    model.eval()
    for i, data_pack in enumerate(data_loader, 0):
        data_x, data_seq_len, data_y = data_pack
        x_ = data_x.to_dense().to(dtype=model.d_type, device=device)
        real_batch_size = x_.size()[0]
        hidden = model.init_hidden(real_batch_size)
        y_ = data_y.to(dtype=model.d_type, device=device)
        predict_ = model(x_, data_seq_len, hidden)
        sigmoid_pred = torch.sigmoid(predict_)
        sorted_rank, indices = torch.sort(sigmoid_pred, descending=True)
        for seq_idx, a_seq_idx in enumerate(y_):
            # print(seq_idx)
            idx_item_in_target_basket = (a_seq_idx == 1.0).nonzero()
            # print(idx_item_in_target_basket)
            sum_of_rank_score = 0
            for idx_item in idx_item_in_target_basket:
                item_rank = (indices[seq_idx] == idx_item).nonzero().item()
                rank_score = 2 ** ((1 - (item_rank + 1)) / (beta - 1))
                sum_of_rank_score += rank_score

            sum_of_rank_target_basket = 0  # compute the HLU denominator
            target_basket_size = idx_item_in_target_basket.size()[0]
            for r in range(1, target_basket_size + 1):
                target_rank_score = 2 ** ((1 - r) / (beta - 1))
                sum_of_rank_target_basket += target_rank_score

            HLU_score = C * sum_of_rank_score / sum_of_rank_target_basket
            list_HLU_score.append(HLU_score)

    print("HLU list len: %d" % len(list_HLU_score))
    return np.array(list_HLU_score).mean()
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        """ A special variant of backpropagation is used here, called backpropagation through time.
        We don't want the hidden states to retain their state across the batches. So we detach them
        and assign the result to a new variable so that the computation graph is effectively reset
        for the current batch. """
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    model.train()  # applies dropout
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=0.01)
    elif args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    else:
        optimizer = None
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        if optimizer:
            optimizer.zero_grad()
        else:
            model.zero_grad()
        data, targets = get_batch(train_data, i)
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        if optimizer:
            optimizer.step()
        clipped_lr = lr * clip_gradient(model, args.clip)
        for p in model.parameters():
            p.data.add_(-clipped_lr, p.grad.data)

        total_loss += loss.data

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    batch_number = 0
    start_time = time.time()
    ntokens = vocab_length
    hidden = model.init_hidden(args.batch_size)
    for batch in train_iter:
        batch_number += 1
        data = batch.src.transpose(0, 1)
        targets = batch.trg.transpose(0, 1)
        targets = targets.contiguous()
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets.view(-1))
        loss.backward()

        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if batch_number % args.log_interval == 0 and batch_number > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch_number, len(train_data) // args.batch_size, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def train():
    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(args.batch_size)
    batch = 0

    for source_sample, target_sample in zip(train_source_sampler, train_target_sampler):
        model.train()
        data = torch.stack([train_data[i] for i in source_sample]).t_().contiguous().to('cuda')
        targets = torch.stack([train_data[i] for i in target_sample]).t_().contiguous().view(-1).to('cuda')

        optimizer.zero_grad()

        print(model)
        output, hidden, rnn_hs, dropped_rnn_hs = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        if args.clip:
            torch.nn.utils.clip_grad_norm_(params, args.clip)
        optimizer.step()

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                epoch, batch, len(train_source_sampler) // args.bptt, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
def evaluate(data_source):
    model.eval()  # Turn on evaluation mode which disables dropout.
    total_loss = 0  # Start with a total loss of 0
    ntokens = len(corpus.dictionary)  # Vocabulary size
    hidden = model.init_hidden(eval_batch_size)  # Initialize the hidden states
    for i in range(0, data_source.size(0) - 1, args.bptt):  # For every batch (starting index i)
        data, targets = get_batch(data_source, i, evaluation=True)  # Get the batch in evaluation mode
        output, hidden = model(data, hidden)  # Run the model
        output_flat = output.view(-1, ntokens)  # Flatten the output to (num_targets x vocab_size)
        total_loss += len(data) * criterion(output_flat, targets).data  # Accumulate the loss of the predictions
        hidden = repackage_hidden(hidden)  # Detach the hidden states
    return total_loss[0] / len(data_source)  # Return the average loss
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i, args.bptt)
            #> output has size seq_length x batch_size x vocab_size
            output, hidden = model(data, hidden)
            #> output_flat has size num_targets x vocab_size (batches are stacked together)
            #> ! important, otherwise softmax computation (e.g. with F.softmax()) is incorrect
            output_flat = output.view(-1, ntokens)
            #output_candidates_info(output_flat.data, targets.data)
            total_loss += len(data) * nn.CrossEntropyLoss()(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / (len(data_source) - 1)
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    if args.model != 'Transformer' and args.model != 'FNN':
        hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(args.n - 1, data_source.size(0) - 1, 1):
            data, targets = get_fnn_batch(data_source, i)
            if args.model == 'Transformer':
                output = model(data)
                output = output.view(-1, ntokens)
            elif args.model == 'FNN':
                output = model(data)
            else:
                output, hidden = model(data, hidden)
                hidden = repackage_hidden(hidden)
            total_loss += criterion(output, targets).item()
    return total_loss / (len(data_source) - 1)
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    model.reset()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    with torch.no_grad():
        hidden = model.init_hidden(batch_size)
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i, args, evaluation=True)
            settings.set_sequence(data[:, 0])  # added by Ju
            output, hidden = model(data, hidden)
            settings.visualize_sequence()  # added by Ju
            settings.inc_seq_idx()
            total_loss += len(data) * criterion(model.decoder.weight, model.decoder.bias, output, targets).data
            hidden = repackage_hidden(hidden)
    return total_loss.item() / len(data_source)
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for sent in split_data_by_sentence(data_source):
            data = sent[:-1]
            targets = torch.flatten(sent[1:])
            hidden = model.init_hidden(1)
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
            #> output_flat has size num_targets x vocab_size (batches are stacked together)
            #> ! important, otherwise softmax computation (e.g. with F.softmax()) is incorrect
            output_flat = output.view(-1, ntokens)
            #output_candidates_info(output_flat.data, targets.data)
            total_loss += len(data) * nn.CrossEntropyLoss()(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / (len(data_source) - 1)
def evaluate(data_source, batch_size=10):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for i in range(0, data_source.size(0) - 1, args.bptt):
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output, hidden = model(data, hidden)
        output_flat = output.view(-1, args.nhid) if args.adaptive else output.view(-1, ntokens)
        if args.adaptive:
            output, loss = criterion(output_flat, targets)
        else:
            loss = criterion(output_flat, targets)
        total_loss += len(data) * loss.data
        hidden = repackage_hidden(hidden)
    return total_loss.item() / len(data_source)
def evaluate(test, batch_size=1):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(1)
    log_probs = []
    for i in range(0, test.size(0) - 1, 70):
        data, targets = get_batch(test, i, evaluation=True)
        targets = targets.view(-1)
        log_prob, hidden = parallel_model(data, hidden)
        log_probs.append(log_prob)
        loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
        total_loss += loss * len(data)
        hidden = repackage_hidden(hidden)
    return log_probs, math.exp(total_loss[0] / len(test))
def train():
    assert args.batch_size % args.small_batch_size == 0, 'batch_size must be divisible by small_batch_size'

    # Turn on training mode which enables dropout.
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = [model.init_hidden(args.small_batch_size) for _ in range(args.batch_size // args.small_batch_size)]
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
        # There's a very small chance that it could select a very long sequence length resulting in OOM
        seq_len = min(seq_len, args.bptt + args.max_seq_len_delta)

        lr2 = optimizer.param_groups[0]['lr']
        optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
        model.train()
        data, targets = get_batch(train_data, i, args, seq_len=seq_len)

        optimizer.zero_grad()

        start, end, s_id = 0, args.small_batch_size, 0
        while start < args.batch_size:
            cur_data, cur_targets = data[:, start: end], targets[:, start: end].contiguous().view(-1)

            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            hidden[s_id] = repackage_hidden(hidden[s_id])

            log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = parallel_model(cur_data, hidden[s_id], return_h=True)
            raw_loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), cur_targets)

            loss = raw_loss
            # Activation Regularization
            loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean() for dropped_rnn_h in dropped_rnn_hs[-1:])
            # Temporal Activation Regularization (slowness)
            loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean() for rnn_h in rnn_hs[-1:])
            loss *= args.small_batch_size / args.batch_size
            total_loss += raw_loss.data * args.small_batch_size / args.batch_size
            loss.backward()

            s_id += 1
            start = end
            end = start + args.small_batch_size

            gc.collect()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        optimizer.step()

        # total_loss += raw_loss.data
        optimizer.param_groups[0]['lr'] = lr2
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss[0] / args.log_interval
            elapsed = time.time() - start_time
            logging('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        ###
        batch += 1
        i += seq_len