def benchmark(trainer):
    # Benchmark to measure the backward time per layer
    p = Profiling(trainer.net)
    # Warmup
    input_shape, output_shape = trainer.get_data_shape()
    warmup = 5  # warmup should be 0 on some GPUs (e.g., P102-100)
    iteration = 50
    for i in range(iteration + warmup):
        data = trainer.data_iter()
        if trainer.dataset == 'an4':
            inputs, labels_cpu, input_percentages, target_sizes = data
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        else:
            inputs, labels_cpu = data
        if trainer.is_cuda:
            if trainer.dnn == 'lstm':
                inputs = Variable(inputs.transpose(0, 1).contiguous()).cuda()
                labels = Variable(labels_cpu.transpose(0, 1).contiguous()).cuda()
            else:
                inputs, labels = inputs.cuda(non_blocking=True), labels_cpu.cuda(non_blocking=True)
        else:
            labels = labels_cpu

        if trainer.dnn == 'lstman4':
            out, output_sizes = trainer.net(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH
            loss = trainer.criterion(out, labels_cpu, output_sizes, target_sizes)
            torch.cuda.synchronize()
            loss = loss / inputs.size(0)  # average the loss by minibatch
        elif trainer.dnn == 'lstm':
            hidden = trainer.net.init_hidden()
            hidden = lstmpy.repackage_hidden(hidden)
            #print(inputs.size(), hidden[0].size(), hidden[1].size())
            outputs, hidden = trainer.net(inputs, hidden)
            tt = torch.squeeze(labels.view(-1, trainer.net.batch_size * trainer.net.num_steps))
            loss = trainer.criterion(outputs.view(-1, trainer.net.vocab_size), tt)
            torch.cuda.synchronize()
        else:
            # forward + backward + optimize
            outputs = trainer.net(inputs)
            loss = trainer.criterion(outputs, labels)
            torch.cuda.synchronize()

        # Only profile the backward pass after the warmup iterations
        if i >= warmup:
            p.start()
        loss.backward()
        if trainer.is_cuda:
            torch.cuda.synchronize()
    layerwise_times, sum_total = p.get_layerwise_times()
    seq_keys = p.get_backward_seq_keys()
    p.stop()
    # Layers are recorded in backward order; reverse so the results are in forward order
    return seq_keys[::-1], layerwise_times[::-1], p.get_backward_key_sizes()[::-1]
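# A minimal sketch of how benchmark() might be consumed, assuming a `trainer`
# object exposing the attributes used above (net, data_iter, criterion, dnn,
# dataset, is_cuda, get_data_shape). `DLTrainer` and its constructor arguments
# here are illustrative assumptions, not part of this file:
#
#   trainer = DLTrainer(rank=0, dnn='resnet20', dataset='cifar10', batch_size=32)
#   seq_keys, layerwise_times, key_sizes = benchmark(trainer)
#   for key, t, size in zip(seq_keys, layerwise_times, key_sizes):
#       print('%s: avg backward time %.6f s, %d gradient elements' % (key, t, size))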
def test(self, epoch):
    self.net.eval()
    test_loss = 0
    correct = 0
    top1_acc = []
    top5_acc = []
    total = 0
    total_steps = 0
    costs = 0.0
    total_iters = 0
    total_wer = 0
    for batch_idx, data in enumerate(self.testloader):
        if self.dataset == 'an4':
            inputs, labels_cpu, input_percentages, target_sizes = data
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        else:
            inputs, labels_cpu = data
        if self.is_cuda:
            if self.dnn == 'lstm':
                inputs = Variable(inputs.transpose(0, 1).contiguous()).cuda()
                labels = Variable(labels_cpu.transpose(0, 1).contiguous()).cuda()
            else:
                inputs, labels = inputs.cuda(non_blocking=True), labels_cpu.cuda(non_blocking=True)
        else:
            labels = labels_cpu

        if self.dnn == 'lstm':
            hidden = self.net.init_hidden()
            hidden = lstmpy.repackage_hidden(hidden)
            outputs, hidden = self.net(inputs, hidden)
            tt = torch.squeeze(labels.view(-1, self.net.batch_size * self.net.num_steps))
            loss = self.criterion(outputs.view(-1, self.net.vocab_size), tt)
            test_loss += loss.item()
            costs += loss.item() * self.net.num_steps
            total_steps += self.net.num_steps
        elif self.dnn == 'lstman4':
            targets = labels_cpu
            split_targets = []
            offset = 0
            for size in target_sizes:
                split_targets.append(targets[offset:offset + size])
                offset += size
            out, output_sizes = self.net(inputs, input_sizes)
            decoded_output, _ = self.decoder.decode(out.data, output_sizes)
            target_strings = self.decoder.convert_to_strings(split_targets)
            wer, cer = 0, 0
            for x in range(len(target_strings)):
                transcript, reference = decoded_output[x][0], target_strings[x][0]
                wer += self.decoder.wer(transcript, reference) / float(len(reference.split()))
            total_wer += wer
        else:
            outputs = self.net(inputs)
            loss = self.criterion(outputs, labels)
            acc1, acc5 = self.cal_accuracy(outputs, labels, topk=(1, 5))
            top1_acc.append(float(acc1))
            top5_acc.append(float(acc5))
            test_loss += loss.item()
        total += labels.size(0)
        total_iters += 1
    test_loss /= total_iters
    if self.dnn not in ['lstm', 'lstman4']:
        acc = np.mean(top1_acc)
        acc5 = np.mean(top5_acc)
    elif self.dnn == 'lstm':
        acc = np.exp(costs / total_steps)  # report perplexity as the accuracy metric
        acc5 = 0.0
    elif self.dnn == 'lstman4':
        wer = total_wer / len(self.testloader.dataset)
        acc = wer
        acc5 = 0.0
    loss = float(test_loss) / total
    logger.info('Epoch %d, lr: %f, val loss: %f, val top-1 acc: %f, top-5 acc: %f' % (epoch, self.lr, test_loss, acc, acc5))
    self.net.train()
    return acc
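# test() above relies on self.cal_accuracy(outputs, labels, topk=(1, 5)).
# Below is a minimal sketch of the usual PyTorch top-k accuracy computation,
# for reference only; the actual method on this class may differ.
def _topk_accuracy_sketch(output, target, topk=(1,)):
    """Return a list with the top-k accuracy (in %) for each k in `topk`."""
    maxk = max(topk)
    batch_size = target.size(0)
    # Indices of the maxk largest logits per sample: (batch, maxk) -> (maxk, batch)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        # A sample counts as correct if the label is among its top-k predictions
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res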
def train(self, num_of_iters=1, data=None, hidden=None):
    self.loss = 0.0
    s = time.time()
    # zero the parameter gradients
    #self.optimizer.zero_grad()
    for i in range(num_of_iters):
        self.adjust_learning_rate(self.train_epoch, self.optimizer)
        if self.train_iter % self.num_batches_per_epoch == 0 and self.train_iter > 0:
            self.train_epoch += 1
            logger.info('train iter: %d, num_batches_per_epoch: %d', self.train_iter, self.num_batches_per_epoch)
            logger.info('Epoch %d, avg train acc: %f, lr: %f, avg loss: %f' % (
                self.train_iter // self.num_batches_per_epoch,
                np.mean(self.train_acc_top1), self.lr,
                self.avg_loss_per_epoch / self.num_batches_per_epoch))
            if self.rank == 0 and self.writer is not None:
                self.writer.add_scalar('cross_entropy', self.avg_loss_per_epoch / self.num_batches_per_epoch, self.train_epoch)
                self.writer.add_scalar('top-1_acc', np.mean(self.train_acc_top1), self.train_epoch)
            if self.rank == 0:
                self.test(self.train_epoch)
            self.sparsities = []
            self.compression_ratios = []
            self.communication_sizes = []
            self.train_acc_top1 = []
            self.epochs_info.append(self.avg_loss_per_epoch / self.num_batches_per_epoch)
            self.avg_loss_per_epoch = 0.0
            # Save checkpoint
            if self.train_iter > 0 and self.rank == 0:
                state = {'iter': self.train_iter, 'epoch': self.train_epoch, 'state': self.get_model_state()}
                if self.prefix:
                    relative_path = './weights/%s/%s-n%d-bs%d-lr%.4f' % (self.prefix, self.dnn, self.nworkers, self.batch_size, self.base_lr)
                else:
                    relative_path = './weights/%s-n%d-bs%d-lr%.4f' % (self.dnn, self.nworkers, self.batch_size, self.base_lr)
                utils.create_path(relative_path)
                filename = '%s-rank%d-epoch%d.pth' % (self.dnn, self.rank, self.train_epoch)
                fn = os.path.join(relative_path, filename)
                if self.train_epoch % 2 == 0:
                    self.save_checkpoint(state, fn)
                self.remove_dict(state)
            if self.train_sampler and (self.nworkers > 1):
                self.train_sampler.set_epoch(self.train_epoch)

        ss = time.time()
        if data is None:
            data = self.data_iter()
        if self.dataset == 'an4':
            inputs, labels_cpu, input_percentages, target_sizes = data
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        else:
            inputs, labels_cpu = data
        if self.is_cuda:
            if self.dnn == 'lstm':
                inputs = Variable(inputs.transpose(0, 1).contiguous()).cuda()
                labels = Variable(labels_cpu.transpose(0, 1).contiguous()).cuda()
            else:
                inputs, labels = inputs.cuda(non_blocking=True), labels_cpu.cuda(non_blocking=True)
        else:
            labels = labels_cpu
        self.iotime += (time.time() - ss)

        sforward = time.time()
        if self.dnn == 'lstman4':
            out, output_sizes = self.net(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH
            loss = self.criterion(out, labels_cpu, output_sizes, target_sizes)
            #torch.cuda.synchronize()
            self.forwardtime += (time.time() - sforward)
            loss = loss / inputs.size(0)  # average the loss by minibatch
        elif self.dnn == 'lstm':
            hidden = lstmpy.repackage_hidden(hidden)
            outputs, hidden = self.net(inputs, hidden)
            tt = torch.squeeze(labels.view(-1, self.net.batch_size * self.net.num_steps))
            loss = self.criterion(outputs.view(-1, self.net.vocab_size), tt)
            #torch.cuda.synchronize()
            self.forwardtime += (time.time() - sforward)
        else:
            # forward + backward + optimize
            outputs = self.net(inputs)
            loss = self.criterion(outputs, labels)
            #torch.cuda.synchronize()
            self.forwardtime += (time.time() - sforward)

        sbackward = time.time()
        if self.amp_handle is not None:
            # Mixed-precision backward via apex loss scaling
            with apex.amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            loss = scaled_loss
        else:
            loss.backward()
        loss_value = loss.item()
        #torch.cuda.synchronize()
        self.backwardtime += (time.time() - sbackward)
        self.loss += loss_value
        self.avg_loss_per_epoch += loss_value
        if self.dnn not in ['lstm', 'lstman4']:
            acc1, = self.cal_accuracy(outputs, labels, topk=(1,))
            self.train_acc_top1.append(float(acc1))
        self.train_iter += 1
        self.num_of_updates_during_comm += 1
    self.loss /= num_of_iters
    self.timer += time.time() - s
    display = 40
    if self.train_iter % display == 0:
        logger.warn('[%3d][%5d/%5d][rank:%d] loss: %.3f, average forward (%f) and backward (%f) time: %f, iotime: %f' % (
            self.train_epoch, self.train_iter, self.num_batches_per_epoch, self.rank, self.loss,
            self.forwardtime / display, self.backwardtime / display, self.timer / display, self.iotime / display))
        self.timer = 0.0
        self.iotime = 0.0
        self.forwardtime = 0.0
        self.backwardtime = 0.0
    if self.dnn == 'lstm':
        return num_of_iters, hidden
    return num_of_iters
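# A minimal sketch of a loop driving train() above, assuming a hypothetical
# `trainer` instance. For the 'lstm' model the hidden state returned by
# train() must be threaded into the next call; other models ignore it.
#
#   hidden = trainer.net.init_hidden() if trainer.dnn == 'lstm' else None
#   for _ in range(trainer.num_batches_per_epoch * num_epochs):
#       if trainer.dnn == 'lstm':
#           _, hidden = trainer.train(1, hidden=hidden)
#       else:
#           trainer.train(1)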
def train(self, num_of_iters=1, data=None, hidden=None):
    self.loss = 0.0
    s = time.time()
    for i in range(num_of_iters):
        self.adjust_learning_rate(self.train_epoch, self.optimizer)
        if self.train_iter % self.num_batches_per_epoch == 0 and self.train_iter > 0:
            logger.info('train iter: %d, num_batches_per_epoch: %d', self.train_iter, self.num_batches_per_epoch)
            logger.info('Epoch %d, avg train acc: %f, lr: %f, avg loss: %f' % (
                self.train_iter // self.num_batches_per_epoch,
                np.mean(self.train_acc_top1), self.lr,
                self.avg_loss_per_epoch / self.num_batches_per_epoch))
            mean_s = np.mean(self.sparsities)
            if self.train_iter > 0 and np.isnan(mean_s):
                logger.warn('NaN detected! sparsities: %s' % self.sparsities)
            logger.info('Average Sparsity: %f, compression ratio: %f, communication size: %f',
                        np.mean(self.sparsities), np.mean(self.compression_ratios), np.mean(self.communication_sizes))
            if self.rank == 0 and self.writer is not None:
                self.writer.add_scalar('cross_entropy', self.avg_loss_per_epoch / self.num_batches_per_epoch, self.train_epoch)
                self.writer.add_scalar('top-1 acc', np.mean(self.train_acc_top1), self.train_epoch)
            if self.rank == 0:
                self.test(self.train_epoch)
            self.sparsities = []
            self.compression_ratios = []
            self.communication_sizes = []
            self.train_acc_top1 = []
            self.epochs_info.append(self.avg_loss_per_epoch / self.num_batches_per_epoch)
            self.avg_loss_per_epoch = 0.0
            if self.train_iter > 0 and self.rank == 0:
                state = {'iter': self.train_iter, 'epoch': self.train_epoch, 'state': self.get_model_state()}
                if self.prefix:
                    relative_path = './weights/%s/%s-n%d-bs%d-lr%.4f' % (self.prefix, self.dnn, self.nworkers, self.batch_size, self.base_lr)
                else:
                    relative_path = './weights/%s-n%d-bs%d-lr%.4f' % (self.dnn, self.nworkers, self.batch_size, self.base_lr)
                if settings.SPARSE:
                    relative_path += '-s%.5f' % self.sparsity
                utils.create_path(relative_path)
                filename = '%s-rank%d-epoch%d.pth' % (self.dnn, self.rank, self.train_epoch)
                fn = os.path.join(relative_path, filename)
                #self.save_checkpoint(state, fn)
                #self.remove_dict(state)
            self.train_epoch += 1
            if self.train_sampler and (self.nworkers > 1):
                self.train_sampler.set_epoch(self.train_epoch)

        ss = time.time()
        if data is None:
            data = self.data_iter()
        if self.dataset == 'an4':
            inputs, labels_cpu, input_percentages, target_sizes = data
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
        else:
            inputs, labels_cpu = data
        if self.is_cuda:
            if self.dnn == 'lstm':
                inputs = Variable(inputs.transpose(0, 1).contiguous()).cuda()
                labels = Variable(labels_cpu.transpose(0, 1).contiguous()).cuda()
            else:
                inputs, labels = inputs.cuda(non_blocking=True), labels_cpu.cuda(non_blocking=True)
        else:
            labels = labels_cpu
        self.iotime += (time.time() - ss)

        if self.dnn == 'lstman4':
            out, output_sizes = self.net(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH
            loss = self.criterion(out, labels_cpu, output_sizes, target_sizes)
            loss = loss / inputs.size(0)  # average the loss by minibatch
            loss.backward()
        elif self.dnn == 'lstm':
            hidden = lstmpy.repackage_hidden(hidden)
            outputs, hidden = self.net(inputs, hidden)
            tt = torch.squeeze(labels.view(-1, self.net.batch_size * self.net.num_steps))
            loss = self.criterion(outputs.view(-1, self.net.vocab_size), tt)
            loss.backward()
        else:
            # forward + backward + optimize
            outputs = self.net(inputs)
            loss = self.criterion(outputs, labels)
            loss.backward()
        loss_value = loss.item()

        # logger.info statistics
        self.loss += loss_value
        self.avg_loss_per_epoch += loss_value
        if self.dnn not in ['lstm', 'lstman4']:
            acc1, = self.cal_accuracy(outputs, labels, topk=(1,))
            self.train_acc_top1.append(float(acc1))
        self.train_iter += 1
        self.num_of_updates_during_comm += 1
    self.loss /= num_of_iters
    self.timer += time.time() - s
    display = 100
    if self.train_iter % display == 0:
        logger.info('[%3d][%5d/%5d][rank:%d] loss: %.3f, average forward and backward time: %f, iotime: %f' % (
            self.train_epoch, self.train_iter, self.num_batches_per_epoch, self.rank,
            self.loss, self.timer / display, self.iotime / display))
        mbytes = 1024. * 1024
        logger.info('GPU memory usage memory_allocated: %d MBytes, max_memory_allocated: %d MBytes, memory_cached: %d MBytes, max_memory_cached: %d MBytes, CPU memory usage: %d MBytes',
                    ct.memory_allocated() / mbytes, ct.max_memory_allocated() / mbytes,
                    ct.memory_cached() / mbytes, ct.max_memory_cached() / mbytes,
                    process.memory_info().rss / mbytes)
        self.timer = 0.0
        self.iotime = 0.0
        if self.is_cuda:
            torch.cuda.empty_cache()
    if self.dnn == 'lstm':
        return num_of_iters, hidden
    return num_of_iters
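# The memory logging in train() above assumes module-level aliases along these
# lines (an assumption inferred from the names `ct` and `process`, not
# confirmed by this file):
#
#   import psutil
#   import torch.cuda as ct
#   process = psutil.Process(os.getpid())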