def evaluate(data_source):
    """Return the average per-position loss of `model` over `data_source`.

    Mirrors training: the first `eff_history` positions of each window are
    treated as warm-up context and excluded from the loss.  With
    `args.causal_stack`, each predicted position gets its own explicit
    sliding window of history instead of one full-sequence forward pass.
    """
    model.eval()
    total_loss = 0
    processed_data_size = 0
    # No parameter updates here, so disable gradient tracking entirely.
    with torch.no_grad():
        for i in range(0, data_source.size(1) - 1, args.validseqlen):
            # Skip a trailing window too short to supply a full effective history.
            if i + args.seq_len - args.validseqlen >= data_source.size(1) - 1:
                continue
            data, targets = get_batch(data_source, i, args, evaluation=True)
            # Discard the effective history, just like in training
            eff_history = args.seq_len - args.validseqlen
            final_target = targets[:, eff_history:].contiguous().view(-1)
            if args.causal_stack:
                batchsize = data.shape[0]
                valseqlen = min(args.validseqlen, data.shape[1] - eff_history)
                # One window of length eff_history per predicted position:
                # window j covers data[:, j : j+eff_history], j = 1..valseqlen.
                causal_stack = torch.vstack([
                    data[:, j:j + eff_history] for j in range(1, valseqlen + 1)
                ]).reshape(valseqlen, batchsize, eff_history)
                # Fold (batch, position) into one leading dim so a single
                # forward pass scores every window; keep only the last-step
                # prediction of each window ([:, -1]).
                final_output = model(
                    causal_stack.permute(1, 0, 2).reshape(
                        len(final_target), eff_history))[:, -1].contiguous()
            else:
                output = model(data)
                final_output = output[:, eff_history:].contiguous().view(
                    -1, n_words)
            loss = criterion(final_output, final_target)
            # Note that we don't add TAR loss here
            # Weight each window's loss by its number of scored positions so
            # the final ratio is a true per-position average.
            total_loss += (data.size(1) - eff_history) * loss.item()
            processed_data_size += data.size(1) - eff_history
    return total_loss / processed_data_size
def evaluate(data_source):
    """Compute the average per-position loss of `model` over `data_source`.

    `model.eval()` freezes dropout/batch-norm for deterministic scoring, and
    `torch.no_grad()` disables gradient tracking since no parameters are
    updated during evaluation.
    """
    model.eval()
    loss_sum, n_positions = 0, 0
    # Warm-up positions at the front of each window are excluded from the
    # loss, mirroring training.
    history = args.seq_len - args.validseqlen
    with torch.no_grad():
        for start in range(0, data_source.size(1) - 1, args.validseqlen):
            # Skip a trailing window too short for a full effective history.
            if start + history >= data_source.size(1) - 1:
                continue
            data, targets = get_batch(data_source, start, args, evaluation=True)
            output = model(data)
            # Drop the warm-up history; flatten the rest for the criterion.
            # (.contiguous() is required before .view() after slicing.)
            flat_output = output[:, history:].contiguous().view(-1, n_words)
            flat_target = targets[:, history:].contiguous().view(-1)
            batch_loss = criterion(flat_output, flat_target)
            # Note that we don't add TAR loss here
            scored = data.size(1) - history
            loss_sum += scored * batch_loss.item()
            n_positions += scored
    return loss_sum / n_positions
def train():
    """Run one training epoch over the global `train_data`.

    Logs loss/perplexity/learning-rate to the global TensorBoard `writer`
    every `args.log_interval` batches, and writes the model graph once on
    the first batch of the run (guarded by the global `write_graph` flag).
    """
    # Turn on training mode which enables dropout.
    global writer
    global train_data
    global write_graph
    model.train()
    total_loss = 0.0
    start_time = time.time()
    for batch_idx, i in enumerate(
            range(0, train_data.size(1) - 1, args.validseqlen)):
        # Skip a trailing window too short to supply a full effective history.
        if i + args.seq_len - args.validseqlen >= train_data.size(1) - 1:
            continue
        data, targets = get_batch(train_data, i, args)
        optimizer.zero_grad()
        if write_graph:
            # Emit the model graph to TensorBoard exactly once.
            writer.add_graph(model, data)
            write_graph = False
        output = model(data)
        # Discard the effective history part
        eff_history = args.seq_len - args.validseqlen
        if eff_history < 0:
            raise ValueError(
                "Valid sequence length must be smaller than sequence length!")
        final_target = targets[:, eff_history:].contiguous().view(-1)
        final_output = output[:, eff_history:].contiguous().view(-1, n_words)
        loss = criterion(final_output, final_target)
        loss.backward()
        if args.clip > 0:
            # FIX: clip_grad_norm was deprecated and later removed from
            # PyTorch; the in-place clip_grad_norm_ is the supported API.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        # FIX: accumulate a plain float via .item() instead of summing
        # `loss.data` tensors and calling .item() at log time.
        total_loss += loss.item()
        if batch_idx % args.log_interval == 0 and batch_idx > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.5f} | ms/batch {:5.5f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch_idx, train_data.size(1) // args.validseqlen,
                    lr, elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            writer.add_scalar('loss', cur_loss, batch_idx + 1)
            writer.add_scalar('perplexity', math.exp(cur_loss), batch_idx + 1)
            # FIX: supply a global_step (as the other scalars do) so the lr
            # is plotted over time rather than overwritten at step 0.
            writer.add_scalar('learning rate', lr, batch_idx + 1)
            total_loss = 0.0
            start_time = time.time()
def evaluate(data_source):
    """Return the average per-position loss of `model` over `data_source`.

    The first `eff_history` positions of each window are warm-up context and
    are excluded from the loss, mirroring training.
    """
    model.eval()
    total_loss = 0.0
    processed_data_size = 0
    # FIX: evaluation should not build autograd graphs; without no_grad the
    # loop wastes memory tracking gradients that are never used.
    with torch.no_grad():
        for i in range(0, data_source.size(1) - 1, args.validseqlen):
            # Skip a trailing window too short for a full effective history.
            if i + args.seq_len - args.validseqlen >= data_source.size(1) - 1:
                continue
            data, targets = get_batch(data_source, i, args, evaluation=True)
            output = model(data)
            # Discard the effective history, just like in training
            eff_history = args.seq_len - args.validseqlen
            final_output = output[:, eff_history:].contiguous().view(-1, n_words)
            final_target = targets[:, eff_history:].contiguous().view(-1)
            loss = criterion(final_output, final_target)
            # Note that we don't add TAR loss here
            # FIX: use .item() — the 0.3-era `loss.data` + `total_loss[0]`
            # idiom raises IndexError on 0-dim tensors in modern PyTorch.
            total_loss += (data.size(1) - eff_history) * loss.item()
            processed_data_size += data.size(1) - eff_history
    return total_loss / processed_data_size
def train():
    """Run one training epoch over the global `train_data`.

    Prints loss/perplexity every `args.log_interval` batches.  The first
    `eff_history` positions of each window are warm-up context and are
    excluded from the loss.
    """
    # Turn on training mode which enables dropout.
    global train_data
    model.train()
    total_loss = 0.0
    start_time = time.time()
    for batch_idx, i in enumerate(
            range(0, train_data.size(1) - 1, args.validseqlen)):
        # Skip a trailing window too short to supply a full effective history.
        if i + args.seq_len - args.validseqlen >= train_data.size(1) - 1:
            continue
        data, targets = get_batch(train_data, i, args)
        optimizer.zero_grad()
        output = model(data)
        # Discard the effective history part
        eff_history = args.seq_len - args.validseqlen
        if eff_history < 0:
            raise ValueError(
                "Valid sequence length must be smaller than sequence length!")
        final_target = targets[:, eff_history:].contiguous().view(-1)
        final_output = output[:, eff_history:].contiguous().view(-1, n_words)
        loss = criterion(final_output, final_target)
        loss.backward()
        if args.clip > 0:
            # FIX: clip_grad_norm was deprecated and later removed from
            # PyTorch; the in-place clip_grad_norm_ is the supported API.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        # FIX: accumulate a plain float — the 0.3-era `loss.data` +
        # `total_loss[0]` idiom raises IndexError on 0-dim tensors in
        # modern PyTorch.
        total_loss += loss.item()
        if batch_idx % args.log_interval == 0 and batch_idx > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.5f} | ms/batch {:5.5f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch_idx, train_data.size(1) // args.validseqlen,
                      lr, elapsed * 1000 / args.log_interval, cur_loss,
                      math.exp(cur_loss)))
            total_loss = 0.0
            start_time = time.time()
def train(ep):
    """Run one training epoch (epoch index `ep`) over the global `train_data`.

    With `args.causal_stack`, each predicted position gets its own explicit
    history window and the batch is split into `args.accumulation_rounds`
    slices whose gradients are accumulated before a single optimizer step.
    """
    # Turn on training mode which enables dropout.
    global train_data
    model.train()
    # train_loss: running sum for periodic console logging (reset each log);
    # total_loss: running sum over the whole epoch for the TensorBoard scalar.
    train_loss, total_loss = 0, 0
    start_time = time.time()
    for batch_idx, i in enumerate(
            range(0, train_data.size(1) - 1, args.validseqlen)):
        # Skip a trailing window too short to supply a full effective history.
        if i + args.seq_len - args.validseqlen >= train_data.size(1) - 1:
            continue
        data, targets = get_batch(train_data, i, args)
        optimizer.zero_grad()
        # Discard the effective history part
        eff_history = args.seq_len - args.validseqlen
        if eff_history < 0:
            raise ValueError(
                "Valid sequence length must be smaller than sequence length!")
        final_target = targets[:, eff_history:].contiguous().view(-1)
        if args.causal_stack:
            valseqlen = min(args.validseqlen, data.shape[1] - eff_history)
            # One sliding window of length eff_history per predicted position,
            # folded so dim 0 aligns one-to-one with final_target.
            causal_stack = torch.vstack([
                data[:, j:j + eff_history] for j in range(1, valseqlen + 1)
            ]).reshape(valseqlen, args.batch_size,
                       eff_history).permute(1, 0, 2).reshape(
                           len(final_target), eff_history)
            # Split targets into accumulation_rounds contiguous slices; when
            # the length is not divisible, the surplus partial slice is merged
            # into the last full one so every position is covered exactly once.
            span = len(final_target) // args.accumulation_rounds
            intervals = [(offset, offset + span)
                         for offset in range(0, len(final_target), span)]
            if len(intervals) > args.accumulation_rounds:
                intervals = intervals[:-1]
                intervals[-1] = (intervals[-1][0], len(final_target))
            for a, b in intervals:
                # Dividing by accumulation_rounds keeps the summed gradient
                # equivalent to one full-batch backward pass.
                loss = criterion(
                    model(causal_stack[a:b])[:, -1].contiguous(),
                    final_target[a:b]) / args.accumulation_rounds
                loss.backward()
                train_loss += loss.item()
                total_loss += loss.item()
        else:
            output = model(data)
            final_output = output[:, eff_history:].contiguous().view(-1, n_words)
            loss = criterion(final_output, final_target)
            loss.backward()
            train_loss += loss.item()
            total_loss += loss.item()
        if args.clip > 0:
            # NOTE(review): clips model.model_weights(), not model.parameters()
            # — presumably a project-specific subset; confirm against the model class.
            torch.nn.utils.clip_grad_norm_(model.model_weights(), args.clip)
        optimizer.step()
        if batch_idx % args.log_interval == 0 and batch_idx > 0:
            cur_loss = train_loss / args.log_interval
            elapsed = time.time() - start_time
            # lr is read from the first sub-optimizer of a composite optimizer.
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.5f} | ms/batch {:5.5f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch_idx, train_data.size(1) // args.validseqlen,
                    optimizer.optimizers[0].param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            train_loss = 0
            start_time = time.time()
    # Epoch-level mean loss for TensorBoard.  NOTE(review): the collapsed
    # source made the indentation ambiguous; placed after the loop since it
    # averages over all batches and is keyed by the epoch index `ep` — confirm.
    writer.add_scalar('train/loss', total_loss / (batch_idx + 1.), ep)