def feed_data(self, data_loader, training=True):
    if training:
        assert self.optimizer is not None

    batch_size = data_loader.batch_size
    # note: these meters are kept from the original, but loss_per_sentence and
    # num_toks are immediately shadowed by the per-iteration stats below, and
    # losses_per_token is never updated in this profiling variant
    losses_per_token = AverageMeter()
    loss_per_sentence = AverageMeter()
    num_toks = AverageMeter()

    start_time = 0
    end_time = 0
    t0 = time.time()
    for i, (src, tgt, _) in enumerate(data_loader):
        if i == self.profile_start:
            if self.cupti:
                start_cupti_tracing()
            elif self.nsight:
                torch.cuda.profiler.start()
            start_time = time.time()
        if i == self.profile_stop:
            end_time = time.time()
            if self.cupti:
                end_cupti_tracing()
            elif self.nsight:
                torch.cuda.profiler.stop()
            break

        self.save_counter += 1

        # do a train/evaluate iteration
        stats = self.iterate(src, tgt, training=training)
        loss_per_token, loss_per_sentence, num_toks = stats

        save_chkpt = (self.save_counter % self.save_freq) == (self.save_freq - 1)
        if training and save_chkpt:
            self.save_counter = 0
            self.save_info['iteration'] = i
            identifier = next(self.checkpoint_counter, -1)
            if identifier != -1:
                with sync_workers() as rank:
                    if rank == 0:
                        self.save(identifier=identifier)

        t1 = time.time()
        print("iteration {}: {} ms".format(i, (t1 - t0) * 1000))
        t0 = t1

    print("average time {:.2f} ms".format(
        (end_time - start_time) * 1000 / (self.profile_stop - self.profile_start)))
    return losses_per_token.avg

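
# AverageMeter is defined elsewhere in this repo; every loop below leans on it,
# so here is a minimal sketch of the interface these functions assume
# (val / avg / sum / count plus update(value, n)). The skip_first flag and the
# reduce('sum'/'mean') calls seen further down do exist in the real helper,
# but their exact semantics here are a best guess, not the canonical code.
import torch
import torch.distributed as dist


class AverageMeter:
    def __init__(self, skip_first=True):
        self.skip_first = skip_first
        self.reset()

    def reset(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0
        self.skipped = False

    def update(self, val, n=1):
        self.val = val
        if self.skip_first and not self.skipped:
            # drop the first sample (warm-up iteration, cold caches)
            self.skipped = True
            return
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def reduce(self, op):
        # guessed semantics: combine this meter across distributed workers,
        # either summing per-worker rates (op='sum', e.g. tokens/s) or
        # averaging them (op='mean'); no-op in single-process runs
        if not (dist.is_available() and dist.is_initialized()):
            return
        t = torch.tensor([self.avg], dtype=torch.float64)
        dist.all_reduce(t)
        self.avg = t.item()
        if op == 'mean':
            self.avg /= dist.get_world_size()
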
def feed_data(self, data_loader, training=True):
    if training:
        assert self.optimizer is not None

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses_per_token = AverageMeter()
    losses_per_sentence = AverageMeter()

    tot_tok_time = AverageMeter()
    src_tok_time = AverageMeter()
    tgt_tok_time = AverageMeter()

    batch_size = data_loader.batch_size

    end = time.time()
    for i, (src, tgt, _) in enumerate(data_loader):
        print("iteration {}".format(i))
        if i >= self.num_steps and self.num_steps > 0:
            break
        if i == 5 and self.cupti:
            start_cupti_tracing()
        self.save_counter += 1
        # measure data loading time
        data_time.update(time.time() - end)

        # do a train/evaluate iteration
        stats = self.iterate(src, tgt, training=training)
        loss_per_token, loss_per_sentence, num_toks = stats

        # measure accuracy and record loss
        losses_per_token.update(loss_per_token, num_toks['tgt'])
        losses_per_sentence.update(loss_per_sentence, batch_size)

        # measure elapsed time
        elapsed = time.time() - end
        batch_time.update(elapsed)
        src_tok_time.update(num_toks['src'] / elapsed)
        tgt_tok_time.update(num_toks['tgt'] / elapsed)
        tot_num_toks = num_toks['tgt'] + num_toks['src']
        tot_tok_time.update(tot_num_toks / elapsed)
        self.loss = losses_per_token.avg

        end = time.time()

        if i % self.print_freq == 0:
            phase = 'TRAIN' if training else 'EVAL'
            log = []
            log += ['{} [{}][{}/{}]'.format(phase, self.epoch, i, len(data_loader))]
            log += ['Time {:.3f} ({:.3f})'.format(batch_time.val, batch_time.avg)]
            log += ['Data {:.3f} ({:.3f})'.format(data_time.val, data_time.avg)]
            log += ['Tok/s {:.0f} ({:.0f})'.format(tot_tok_time.val, tot_tok_time.avg)]
            if self.verbose:
                log += ['Src tok/s {:.0f} ({:.0f})'.format(src_tok_time.val, src_tok_time.avg)]
                log += ['Tgt tok/s {:.0f} ({:.0f})'.format(tgt_tok_time.val, tgt_tok_time.avg)]
                log += ['Loss/sentence {:.1f} ({:.1f})'.format(losses_per_sentence.val, losses_per_sentence.avg)]
            log += ['Loss/tok {:.8f} ({:.8f})'.format(losses_per_token.val, losses_per_token.avg)]
            log = '\t'.join(log)
            print(log)
            # logging.info(log)

        save_chkpt = (self.save_counter % self.save_freq) == (self.save_freq - 1)
        if training and save_chkpt:
            self.save_counter = 0
            self.save_info['iteration'] = i
            identifier = next(self.checkpoint_counter, -1)
            if identifier != -1:
                with sync_workers() as rank:
                    if rank == 0:
                        self.save(identifier=identifier)

    if self.cupti:
        end_cupti_tracing()

    return losses_per_token.avg

def feed_data(self, data_loader, training=True):
    """
    Runs training or validation on batches from data_loader.

    :param data_loader: data loader
    :param training: if True runs training else runs validation
    """
    if training:
        assert self.optimizer is not None

    eval_fractions = np.linspace(0, 1, self.intra_epoch_eval + 2)[1:-1]
    iters_with_update = len(data_loader) // self.iter_size
    eval_iters = (eval_fractions * iters_with_update).astype(int)
    eval_iters = eval_iters * self.iter_size
    eval_iters = set(eval_iters)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses_per_token = AverageMeter(skip_first=False)
    losses_per_sentence = AverageMeter(skip_first=False)

    tot_tok_time = AverageMeter()
    src_tok_time = AverageMeter()
    tgt_tok_time = AverageMeter()

    batch_size = data_loader.batch_size

    end = time.time()
    for i, (src, tgt) in enumerate(data_loader):
        self.save_counter += 1
        # measure data loading time
        data_time.update(time.time() - end)

        update = False
        if i % self.iter_size == self.iter_size - 1:
            update = True

        # do a train/evaluate iteration
        stats = self.iterate(src, tgt, update, training=training)
        loss_per_token, loss_per_sentence, num_toks = stats

        # measure accuracy and record loss
        losses_per_token.update(loss_per_token, num_toks['tgt'])
        losses_per_sentence.update(loss_per_sentence, batch_size)

        # measure elapsed time
        elapsed = time.time() - end
        batch_time.update(elapsed)
        src_tok_time.update(num_toks['src'] / elapsed)
        tgt_tok_time.update(num_toks['tgt'] / elapsed)
        tot_num_toks = num_toks['tgt'] + num_toks['src']
        tot_tok_time.update(tot_num_toks / elapsed)
        self.loss = losses_per_token.avg

        if training and i in eval_iters:
            test_bleu, _ = self.translator.run(calc_bleu=True,
                                               epoch=self.epoch,
                                               iteration=i)
            log = []
            log += [f'TRAIN [{self.epoch}][{i}/{len(data_loader)}]']
            log += [f'BLEU: {test_bleu:.2f}']
            log = '\t'.join(log)
            logging.info(log)
            self.model.train()
            self.preallocate(data_loader, training=True)

        if i % self.print_freq == 0:
            phase = 'TRAIN' if training else 'VALIDATION'
            log = []
            log += [f'{phase} [{self.epoch}][{i}/{len(data_loader)}]']
            log += [f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})']
            log += [f'Data {data_time.val:.2e} ({data_time.avg:.2e})']
            log += [f'Tok/s {tot_tok_time.val:.0f} ({tot_tok_time.avg:.0f})']
            if self.verbose:
                log += [f'Src tok/s {src_tok_time.val:.0f} ({src_tok_time.avg:.0f})']
                log += [f'Tgt tok/s {tgt_tok_time.val:.0f} ({tgt_tok_time.avg:.0f})']
                log += [f'Loss/sentence {losses_per_sentence.val:.1f} ({losses_per_sentence.avg:.1f})']
            log += [f'Loss/tok {losses_per_token.val:.4f} ({losses_per_token.avg:.4f})']
            if training:
                lr = self.optimizer.param_groups[0]['lr']
                log += [f'LR {lr:.3e}']
            log = '\t'.join(log)
            logging.info(log)

        save_chkpt = (self.save_counter % self.save_freq) == (self.save_freq - 1)
        if training and save_chkpt:
            self.save_counter = 0
            self.save_info['iteration'] = i
            identifier = next(self.checkpoint_counter, -1)
            if identifier != -1:
                with sync_workers() as rank:
                    if rank == 0:
                        self.save(identifier=identifier)

        end = time.time()

    tot_tok_time.reduce('sum')
    losses_per_token.reduce('mean')

    return losses_per_token.avg, tot_tok_time.avg

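
# A quick standalone illustration (not part of the training loop) of how the
# intra-epoch evaluation points above are derived: linspace gives evenly
# spaced fractions of the epoch, and multiplying back by iter_size maps
# optimizer-step indices to data-loader batch indices. Example values are
# made up for the demonstration.
import numpy as np

intra_epoch_eval, iter_size, loader_len = 3, 2, 100
eval_fractions = np.linspace(0, 1, intra_epoch_eval + 2)[1:-1]  # [0.25, 0.5, 0.75]
iters_with_update = loader_len // iter_size                     # 50 optimizer steps
eval_iters = set((eval_fractions * iters_with_update).astype(int) * iter_size)
print(sorted(eval_iters))  # [24, 50, 74]: batch indices where BLEU is evaluated
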
def main():
    args = parse_args()
    print(args)

    if args.cuda:
        torch.cuda.set_device(0)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    vocab_size = checkpoint['tokenizer'].vocab_size
    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))
    model_config['batch_first'] = args.batch_first
    model = models.GNMT(**model_config)

    state_dict = checkpoint['state_dict']
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)
    model.load_state_dict(state_dict)

    if args.math == 'fp32':
        dtype = torch.FloatTensor
    if args.math == 'fp16':
        dtype = torch.HalfTensor
    model.type(dtype)

    if args.cuda:
        model = model.cuda()
    model.eval()

    tokenizer = checkpoint['tokenizer']

    test_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=0,
        max_len=150,
        sort=False)

    test_loader = test_data.get_loader(batch_size=args.batch_size,
                                       batch_first=True,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False,
                                       distributed=False)

    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda)

    model.eval()
    torch.cuda.empty_cache()

    # only write the output to file in accuracy mode
    if args.mode == 'accuracy':
        test_file = open(args.output, 'w', encoding='UTF-8')

    batch_time = AverageMeter(False)
    tot_tok_per_sec = AverageMeter(False)
    iterations = AverageMeter(False)
    enc_seq_len = AverageMeter(False)
    dec_seq_len = AverageMeter(False)
    stats = {}

    for i, (src, tgt, indices) in enumerate(test_loader):
        translate_timer = time.time()
        src, src_length = src

        if translator.batch_first:
            batch_size = src.size(0)
        else:
            batch_size = src.size(1)
        beam_size = args.beam_size

        bos = [translator.insert_target_start] * (batch_size * beam_size)
        bos = torch.LongTensor(bos)
        if translator.batch_first:
            bos = bos.view(-1, 1)
        else:
            bos = bos.view(1, -1)

        src_length = torch.LongTensor(src_length)
        stats['total_enc_len'] = int(src_length.sum())

        if args.cuda:
            src = src.cuda()
            src_length = src_length.cuda()
            bos = bos.cuda()

        with torch.no_grad():
            context = translator.model.encode(src, src_length)
            context = [context, src_length, None]

            if beam_size == 1:
                generator = translator.generator.greedy_search
            else:
                generator = translator.generator.beam_search
            preds, lengths, counter = generator(batch_size, bos, context)

        stats['total_dec_len'] = lengths.sum().item()
        stats['iters'] = counter

        preds = preds.cpu()
        lengths = lengths.cpu()

        output = []
        for idx, pred in enumerate(preds):
            end = lengths[idx] - 1
            pred = pred[1:end]
            pred = pred.tolist()
            out = translator.tok.detokenize(pred)
            output.append(out)

        # only write the output to file in accuracy mode
        if args.mode == 'accuracy':
            # restore the original (pre-sort) line order before writing
            output = [output[indices.index(i)] for i in range(len(output))]
            for line in output:
                test_file.write(line)
                test_file.write('\n')

        # get timing
        elapsed = time.time() - translate_timer
        batch_time.update(elapsed, batch_size)
        total_tokens = stats['total_dec_len'] + stats['total_enc_len']
        ttps = total_tokens / elapsed
        tot_tok_per_sec.update(ttps, batch_size)
        iterations.update(stats['iters'])
        enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
        dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)

        if i % 5 == 0:
            log = []
            # (quirk kept from the original: '+=' with a str extends the list
            # character-by-character; ''.join() reassembles the message)
            log += 'TEST '
            log += 'Time {:.3f} ({:.3f})\t'.format(batch_time.val, batch_time.avg)
            log += 'Decoder iters {:.1f} ({:.1f})\t'.format(iterations.val, iterations.avg)
            log += 'Tok/s {:.0f} ({:.0f})'.format(tot_tok_per_sec.val, tot_tok_per_sec.avg)
            log = ''.join(log)
            print(log)

    # summary timing
    time_per_sentence = (batch_time.avg / batch_size)
    log = []
    log += 'TEST SUMMARY:\n'
    log += 'Lines translated: {}\t'.format(len(test_loader.dataset))
    log += 'Avg total tokens/s: {:.0f}\n'.format(tot_tok_per_sec.avg)
    log += 'Avg time per batch: {:.3f} s\t'.format(batch_time.avg)
    log += 'Avg time per sentence: {:.3f} ms\n'.format(1000 * time_per_sentence)
    log += 'Avg encoder seq len: {:.2f}\t'.format(enc_seq_len.avg)
    log += 'Avg decoder seq len: {:.2f}\t'.format(dec_seq_len.avg)
    log += 'Total decoder iterations: {}'.format(int(iterations.sum))
    log = ''.join(log)
    print(log)

    # only write the output to file in accuracy mode
    if args.mode == 'accuracy':
        test_file.close()
        test_path = args.output

        # run moses detokenizer
        detok_path = os.path.join(args.dataset_dir, config.DETOKENIZER)
        detok_test_path = test_path + '.detok'

        with open(detok_test_path, 'w') as detok_test_file, \
                open(test_path, 'r') as test_file:
            subprocess.run(['perl', detok_path],
                           stdin=test_file,
                           stdout=detok_test_file,
                           stderr=subprocess.DEVNULL)

        # run sacrebleu
        reference_path = os.path.join(args.dataset_dir,
                                      config.TGT_TEST_TARGET_FNAME)
        sacrebleu = subprocess.run(
            ['sacrebleu --input {} {} --score-only -lc --tokenize intl'.format(
                detok_test_path, reference_path)],
            stdout=subprocess.PIPE,
            shell=True)
        bleu = float(sacrebleu.stdout.strip())
        print('BLEU on test dataset: {}'.format(bleu))

    print('Finished evaluation on test set')

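
# For reference, the same score can be computed in-process through sacrebleu's
# Python API instead of the subprocess call above. This is a sketch, assuming
# sacrebleu is installed and that detok_test_path/reference_path are the files
# produced above; --score-only -lc --tokenize intl map to score, lowercase=True
# and tokenize='intl' respectively.
import sacrebleu

def score_bleu(detok_test_path, reference_path):
    with open(detok_test_path) as f_sys, open(reference_path) as f_ref:
        sys_lines = [line.strip() for line in f_sys]
        ref_lines = [line.strip() for line in f_ref]
    result = sacrebleu.corpus_bleu(sys_lines, [ref_lines],
                                   lowercase=True, tokenize='intl')
    return result.score
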
def main():
    args = parse_args()
    print(args)

    if args.cuda:
        torch.cuda.set_device(0)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    vocab_size = checkpoint['tokenizer'].vocab_size
    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))
    model_config['batch_first'] = args.batch_first
    model = models.GNMT(**model_config)

    state_dict = checkpoint['state_dict']
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)
    model.load_state_dict(state_dict)

    if args.math == 'fp32':
        dtype = torch.FloatTensor
    if args.math == 'fp16':
        dtype = torch.HalfTensor
    model.type(dtype)

    if args.cuda:
        model = model.cuda()
    model.eval()

    tokenizer = checkpoint['tokenizer']

    translation_model = Translator(model,
                                   tokenizer,
                                   beam_size=args.beam_size,
                                   max_seq_len=args.max_seq_len,
                                   len_norm_factor=args.len_norm_factor,
                                   len_norm_const=args.len_norm_const,
                                   cov_penalty_factor=args.cov_penalty_factor,
                                   cuda=args.cuda)

    output_file = codecs.open(args.output, 'w', encoding='UTF-8')

    # run model on generated data, for accurate timings starting from 1st batch
    dummy_data = ['abc ' * (args.max_seq_len // 4)] * args.batch_size
    translation_model.translate(dummy_data)
    if args.cuda:
        torch.cuda.synchronize()

    batch_time = AverageMeter(False)
    enc_tok_per_sec = AverageMeter(False)
    dec_tok_per_sec = AverageMeter(False)
    tot_tok_per_sec = AverageMeter(False)
    enc_seq_len = AverageMeter(False)
    dec_seq_len = AverageMeter(False)
    total_lines = 0
    total_iters = 0

    with codecs.open(args.input, encoding='UTF-8') as input_file:
        for idx, lines in enumerate(grouper(input_file, args.batch_size)):
            lines = [l for l in lines if l]
            n_lines = len(lines)
            total_lines += n_lines

            translate_timer = time.time()
            translated_lines, stats = translation_model.translate(lines)
            elapsed = time.time() - translate_timer

            batch_time.update(elapsed, n_lines)
            etps = stats['total_enc_len'] / elapsed
            dtps = stats['total_dec_len'] / elapsed
            enc_seq_len.update(stats['total_enc_len'] / n_lines, n_lines)
            dec_seq_len.update(stats['total_dec_len'] / n_lines, n_lines)
            enc_tok_per_sec.update(etps, n_lines)
            dec_tok_per_sec.update(dtps, n_lines)
            tot_tok = stats['total_dec_len'] + stats['total_enc_len']
            ttps = tot_tok / elapsed
            tot_tok_per_sec.update(ttps, n_lines)

            n_iterations = stats['iters']
            total_iters += n_iterations

            write_output(output_file, translated_lines)

            if idx % args.print_freq == args.print_freq - 1:
                print(f'TRANSLATION: '
                      f'Batch {idx} '
                      f'Iters {n_iterations}\t'
                      f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      f'Tot tok/s {tot_tok_per_sec.val:.0f} ({tot_tok_per_sec.avg:.0f})\t'
                      f'Enc tok/s {enc_tok_per_sec.val:.0f} ({enc_tok_per_sec.avg:.0f})\t'
                      f'Dec tok/s {dec_tok_per_sec.val:.0f} ({dec_tok_per_sec.avg:.0f})')

    output_file.close()

    print(f'TRANSLATION SUMMARY:\n'
          f'Lines translated: {total_lines}\t'
          f'Avg time per batch: {batch_time.avg:.3f} s\t'
          f'Avg time per sentence: {1000*(batch_time.avg / args.batch_size):.3f} ms\n'
          f'Avg enc seq len: {enc_seq_len.avg:.2f}\t'
          f'Avg dec seq len: {dec_seq_len.avg:.2f}\t'
          f'Total iterations: {total_iters}\t\n'
          f'Avg tot tok/s: {tot_tok_per_sec.avg:.0f}\t'
          f'Avg enc tok/s: {enc_tok_per_sec.avg:.0f}\t'
          f'Avg dec tok/s: {dec_tok_per_sec.avg:.0f}')

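
# grouper() is not defined in this file; a minimal sketch of the standard
# itertools recipe it presumably follows (batching an iterable into fixed-size
# chunks, padding the last chunk with None so the `if l` filter above drops
# the padding). This is an assumption about the real helper, not its source.
from itertools import zip_longest


def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks: grouper('ABCDE', 2) -> AB CD E<pad>."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
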
def evaluate(self, epoch, iteration, summary):
    """
    Runs evaluation on test dataset.

    :param epoch: index of the current epoch
    :param iteration: index of the current iteration
    :param summary: if True prints summary
    """
    batch_time = AverageMeter(False)
    tot_tok_per_sec = AverageMeter(False)
    iterations = AverageMeter(False)
    enc_seq_len = AverageMeter(False)
    dec_seq_len = AverageMeter(False)
    stats = {}

    output = []
    for i, (src, indices) in enumerate(self.loader):
        translate_timer = time.time()
        src, src_length = src

        batch_size = self.loader.batch_size
        global_batch_size = batch_size * get_world_size()
        beam_size = self.beam_size

        bos = [self.insert_target_start] * (batch_size * beam_size)
        bos = torch.LongTensor(bos)
        if self.batch_first:
            bos = bos.view(-1, 1)
        else:
            bos = bos.view(1, -1)

        src_length = torch.LongTensor(src_length)
        stats['total_enc_len'] = int(src_length.sum())

        if self.cuda:
            src = src.cuda()
            src_length = src_length.cuda()
            bos = bos.cuda()

        with torch.no_grad():
            context = self.model.encode(src, src_length)
            context = [context, src_length, None]

            if beam_size == 1:
                generator = self.generator.greedy_search
            else:
                generator = self.generator.beam_search
            preds, lengths, counter = generator(batch_size, bos, context)

        stats['total_dec_len'] = lengths.sum().item()
        stats['iters'] = counter

        indices = torch.tensor(indices).to(preds)
        preds = preds.scatter(0, indices.unsqueeze(1).expand_as(preds), preds)
        preds = gather_predictions(preds).cpu()

        for pred in preds:
            pred = pred.tolist()
            detok = self.tokenizer.detokenize(pred)
            output.append(detok + '\n')

        elapsed = time.time() - translate_timer
        batch_time.update(elapsed, batch_size)
        total_tokens = stats['total_dec_len'] + stats['total_enc_len']
        ttps = total_tokens / elapsed
        tot_tok_per_sec.update(ttps, batch_size)
        iterations.update(stats['iters'])
        enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
        dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)

        if i % self.print_freq == 0:
            log = []
            log += f'TEST '
            if epoch is not None:
                log += f'[{epoch}]'
            if iteration is not None:
                log += f'[{iteration}]'
            log += f'[{i}/{len(self.loader)}]\t'
            log += f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
            log += f'Decoder iters {iterations.val:.1f} ({iterations.avg:.1f})\t'
            log += f'Tok/s {tot_tok_per_sec.val:.0f} ({tot_tok_per_sec.avg:.0f})'
            log = ''.join(log)
            logging.info(log)

    tot_tok_per_sec.reduce('sum')
    enc_seq_len.reduce('mean')
    dec_seq_len.reduce('mean')
    batch_time.reduce('mean')
    iterations.reduce('sum')

    if summary and get_rank() == 0:
        time_per_sentence = (batch_time.avg / global_batch_size)
        log = []
        log += f'TEST SUMMARY:\n'
        log += f'Lines translated: {len(self.loader.dataset)}\t'
        log += f'Avg total tokens/s: {tot_tok_per_sec.avg:.0f}\n'
        log += f'Avg time per batch: {batch_time.avg:.3f} s\t'
        log += f'Avg time per sentence: {1000*time_per_sentence:.3f} ms\n'
        log += f'Avg encoder seq len: {enc_seq_len.avg:.2f}\t'
        log += f'Avg decoder seq len: {dec_seq_len.avg:.2f}\t'
        log += f'Total decoder iterations: {int(iterations.sum)}'
        log = ''.join(log)
        logging.info(log)

    return output

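
# gather_predictions() above collects per-rank predictions in a multi-GPU run;
# it is defined elsewhere in the repo. One plausible implementation is sketched
# below with torch.distributed.all_gather (an assumption: the real helper may
# pad/order differently; all_gather also requires equal tensor shapes on every
# rank).
import torch
import torch.distributed as dist


def gather_predictions(preds):
    if dist.is_available() and dist.is_initialized():
        world_size = dist.get_world_size()
        gathered = [torch.empty_like(preds) for _ in range(world_size)]
        dist.all_gather(gathered, preds)
        # stack the per-rank batches along the batch dimension
        preds = torch.cat(gathered)
    return preds
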
def feed_data(self, data_loader, training=True):
    if training:
        assert self.optimizer is not None

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses_per_token = AverageMeter()
    losses_per_sentence = AverageMeter()

    tot_tok_time = AverageMeter()
    src_tok_time = AverageMeter()
    tgt_tok_time = AverageMeter()

    batch_size = data_loader.batch_size

    end = time.time()
    for i, (src, tgt, _) in enumerate(data_loader):
        self.save_counter += 1
        # measure data loading time
        data_time.update(time.time() - end)

        # do a train/evaluate iteration
        stats = self.iterate(src, tgt, training=training)
        loss_per_token, loss_per_sentence, num_toks = stats

        # measure accuracy and record loss
        losses_per_token.update(loss_per_token, num_toks['tgt'])
        losses_per_sentence.update(loss_per_sentence, batch_size)

        # measure elapsed time
        elapsed = time.time() - end
        batch_time.update(elapsed)
        src_tok_time.update(num_toks['src'] / elapsed)
        tgt_tok_time.update(num_toks['tgt'] / elapsed)
        tot_num_toks = num_toks['tgt'] + num_toks['src']
        tot_tok_time.update(tot_num_toks / elapsed)
        self.loss = losses_per_token.avg

        end = time.time()

        if i % self.print_freq == 0:
            phase = 'TRAIN' if training else 'EVAL'
            log = []
            log += [f'{phase} [{self.epoch}][{i}/{len(data_loader)}]']
            log += [f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})']
            log += [f'Data {data_time.val:.3f} ({data_time.avg:.3f})']
            log += [f'Tok/s {tot_tok_time.val:.0f} ({tot_tok_time.avg:.0f})']
            if self.verbose:
                log += [f'Src tok/s {src_tok_time.val:.0f} ({src_tok_time.avg:.0f})']
                log += [f'Tgt tok/s {tgt_tok_time.val:.0f} ({tgt_tok_time.avg:.0f})']
                log += [f'Loss/sentence {losses_per_sentence.val:.1f} ({losses_per_sentence.avg:.1f})']
            log += [f'Loss/tok {losses_per_token.val:.8f} ({losses_per_token.avg:.8f})']
            log = '\t'.join(log)
            logging.info(log)

        save_chkpt = (self.save_counter % self.save_freq) == (self.save_freq - 1)
        if training and save_chkpt:
            self.save_counter = 0
            self.save_info['iteration'] = i
            identifier = next(self.checkpoint_counter, -1)
            if identifier != -1:
                with sync_workers() as rank:
                    if rank == 0:
                        self.save(identifier=identifier)

    return losses_per_token.avg

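
# sync_workers() is used above as a context manager that yields the worker
# rank, so that only rank 0 writes a checkpoint while the other workers wait.
# A minimal sketch of a plausible implementation (an assumption; the real
# helper lives elsewhere in the repo):
from contextlib import contextmanager

import torch.distributed as dist


@contextmanager
def sync_workers():
    rank = 0
    if dist.is_available() and dist.is_initialized():
        rank = dist.get_rank()
    yield rank
    # make every worker wait until rank 0 has finished (e.g. saving a checkpoint)
    if dist.is_available() and dist.is_initialized():
        dist.barrier()
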
def feed_data(self, data_loader, training=True):
    if training:
        assert self.optimizer is not None

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses_per_token = AverageMeter()
    losses_per_sentence = AverageMeter()

    tot_tok_time = AverageMeter()
    src_tok_time = AverageMeter()
    tgt_tok_time = AverageMeter()

    batch_size = data_loader.batch_size
    count = 0

    end = time.time()
    for i, (src, tgt, _) in enumerate(data_loader):
        if count > self.number:
            break
        count += 1
        self.save_counter += 1
        # measure data loading time
        data_time.update(time.time() - end)

        # do a train/evaluate iteration
        stats = self.iterate(src, tgt, training=training)
        loss_per_token, loss_per_sentence, num_toks = stats

        # measure accuracy and record loss
        losses_per_token.update(loss_per_token, num_toks['tgt'])
        losses_per_sentence.update(loss_per_sentence, batch_size)

        # measure elapsed time
        elapsed = time.time() - end
        batch_time.update(elapsed)
        src_tok_time.update(num_toks['src'] / elapsed)
        tgt_tok_time.update(num_toks['tgt'] / elapsed)
        tot_num_toks = num_toks['tgt'] + num_toks['src']
        tot_tok_time.update(tot_num_toks / elapsed)
        self.loss = losses_per_token.avg

        end = time.time()

    return losses_per_token.avg

def feed_data(self, data_loader, training=True):
    """
    Runs training or validation on batches from data_loader.

    :param data_loader: data loader
    :param training: if True runs training else runs validation
    """
    if training:
        assert self.optimizer is not None

    eval_fractions = np.linspace(0, 1, self.intra_epoch_eval + 2)[1:-1]
    eval_iters = (eval_fractions * len(data_loader)).astype(int)
    eval_iters = set(eval_iters)

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses_per_token = AverageMeter()
    losses_per_sentence = AverageMeter()

    tot_tok_time = AverageMeter()
    src_tok_time = AverageMeter()
    tgt_tok_time = AverageMeter()

    batch_size = data_loader.batch_size

    layer_timestamps = []
    verbose = True
    module_whitelist = ["EmuBidirLSTM", "RecurrentAttention", "Classifier"]

    # grab a single batch to drive torchsummary
    for i, (src, tgt) in enumerate(data_loader):
        break
    (src, src_length) = src
    (tgt, tgt_length) = tgt
    src_length = torch.LongTensor(src_length).cuda()
    src = src.cuda()
    tgt = tgt.cuda()
    model_input = (src, src_length, tgt[:-1])
    summary = torchsummary.summary(model=self.model,
                                   module_whitelist=module_whitelist,
                                   model_input=model_input,
                                   verbose=True)

    end = time.time()
    NUM_STEPS_TO_PROFILE = 100  # profile 100 steps

    for i, (src, tgt) in enumerate(data_loader):
        self.save_counter += 1
        # measure data loading time
        data_time.update(time.time() - end)

        with torchprofiler.Profiling(self.model, module_whitelist) as p:
            # do a train/evaluate iteration
            stats = self.iterate(src, tgt, training=training)
            loss_per_token, loss_per_sentence, num_toks = stats
        print(str(p))
        layer_timestamps.append(p.processed_times())

        # measure accuracy and record loss
        losses_per_token.update(loss_per_token, num_toks['tgt'])
        losses_per_sentence.update(loss_per_sentence, batch_size)

        # measure elapsed time
        elapsed = time.time() - end
        batch_time.update(elapsed)
        src_tok_time.update(num_toks['src'] / elapsed)
        tgt_tok_time.update(num_toks['tgt'] / elapsed)
        tot_num_toks = num_toks['tgt'] + num_toks['src']
        tot_tok_time.update(tot_num_toks / elapsed)
        self.loss = losses_per_token.avg

        if training and i in eval_iters:
            test_bleu, _ = self.translator.run(calc_bleu=True,
                                               epoch=self.epoch,
                                               iteration=i)
            log = []
            log += [f'TRAIN [{self.epoch}][{i}/{len(data_loader)}]']
            log += [f'BLEU: {test_bleu:.2f}']
            log = '\t'.join(log)
            logging.info(log)
            self.model.train()
            self.preallocate(data_loader, training=True)

        if i % self.print_freq == 0:
            phase = 'TRAIN' if training else 'VALIDATION'
            log = []
            log += [f'{phase} [{self.epoch}][{i}/{len(data_loader)}]']
            log += [f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})']
            log += [f'Data {data_time.val:.5f} ({data_time.avg:.5f})']
            log += [f'Tok/s {tot_tok_time.val:.0f} ({tot_tok_time.avg:.0f})']
            if self.verbose:
                log += [f'Src tok/s {src_tok_time.val:.0f} ({src_tok_time.avg:.0f})']
                log += [f'Tgt tok/s {tgt_tok_time.val:.0f} ({tgt_tok_time.avg:.0f})']
                log += [f'Loss/sentence {losses_per_sentence.val:.1f} ({losses_per_sentence.avg:.1f})']
            log += [f'Loss/tok {losses_per_token.val:.4f} ({losses_per_token.avg:.4f})']
            lr = [param_group['lr'] for param_group in self.optimizer.param_groups]
            log += [f'Learning Rate {lr}']
            log = '\t'.join(log)
            logging.info(log)

        if i >= NUM_STEPS_TO_PROFILE:
            break

        save_chkpt = (self.save_counter % self.save_freq) == (self.save_freq - 1)
        if training and save_chkpt:
            self.save_counter = 0
            self.save_info['iteration'] = i
            identifier = next(self.checkpoint_counter, -1)
            if identifier != -1:
                with sync_workers() as rank:
                    if rank == 0:
                        self.save(identifier=identifier)

        end = time.time()

    if verbose:
        print("\n==========================================================")
        print("Layer Type    Forward Time (ms)    Backward Time (ms)")
        print("==========================================================")

    # average the per-layer forward/backward times over all profiled steps
    tot_accounted_time = 0.0
    per_layer_times = []
    for i in range(len(layer_timestamps[0])):
        layer_type = str(layer_timestamps[0][i][0])
        layer_forward_time_sum = 0.0
        layer_backward_time_sum = 0.0
        for j in range(len(layer_timestamps)):
            layer_forward_time_sum += (layer_timestamps[j][i][2] / 1000)
            layer_backward_time_sum += (layer_timestamps[j][i][5] / 1000)
        per_layer_times.append(
            (layer_type,
             layer_forward_time_sum / len(layer_timestamps),
             layer_backward_time_sum / len(layer_timestamps)))
        if verbose:
            print(per_layer_times[-1][0], per_layer_times[-1][1],
                  per_layer_times[-1][2])
        tot_accounted_time += (per_layer_times[-1][1] + per_layer_times[-1][2])
    print("Total accounted time: %.3f ms" % tot_accounted_time)

    # attach the measured times to the matching (not yet annotated) summary entries
    while len(per_layer_times) > 0:
        per_layer_time = per_layer_times.pop(0)
        for summary_i in range(len(summary)):
            summary_elem = summary[summary_i]
            if str(summary_elem['layer_name']) != str(per_layer_time[0]):
                continue
            if 'forward_time' in summary_elem and 'backward_time' in summary_elem:
                continue
            summary_elem['forward_time'] = per_layer_time[1]
            summary_elem['backward_time'] = per_layer_time[2]
            break

    if training:
        create_graph(self.model, module_whitelist, (src, tgt), summary,
                     os.path.join("profiles", self.arch))

    tot_tok_time.reduce('sum')
    losses_per_token.reduce('mean')

    return losses_per_token.avg, tot_tok_time.avg

def evaluate(self, epoch, iteration, eval_path, summary):
    """
    Runs evaluation on test dataset.

    :param epoch: index of the current epoch
    :param iteration: index of the current iteration
    :param eval_path: path to the file for saving results
    :param summary: if True prints summary
    """
    eval_file = open(eval_path, 'w')

    batch_time = AverageMeter(False)
    tot_tok_per_sec = AverageMeter(False)
    iterations = AverageMeter(False)
    enc_seq_len = AverageMeter(False)
    dec_seq_len = AverageMeter(False)

    total_iters = 0
    total_lines = 0
    stats = {}

    for i, (src, indices) in enumerate(self.loader):
        translate_timer = time.time()
        src, src_length = src

        if self.batch_first:
            batch_size = src.size(0)
        else:
            batch_size = src.size(1)
        total_lines += batch_size
        beam_size = self.beam_size

        bos = [self.insert_target_start] * (batch_size * beam_size)
        bos = torch.LongTensor(bos)
        if self.batch_first:
            bos = bos.view(-1, 1)
        else:
            bos = bos.view(1, -1)

        src_length = torch.LongTensor(src_length)
        stats['total_enc_len'] = int(src_length.sum())

        if self.cuda:
            src = src.cuda()
            src_length = src_length.cuda()
            bos = bos.cuda()

        with torch.no_grad():
            context = self.model.encode(src, src_length)
            context = [context, src_length, None]

            if beam_size == 1:
                generator = self.generator.greedy_search
            else:
                generator = self.generator.beam_search
            preds, lengths, counter = generator(batch_size, bos, context)

        preds = preds.cpu()
        lengths = lengths.cpu()
        stats['total_dec_len'] = int(lengths.sum())
        stats['iters'] = counter
        total_iters += stats['iters']

        output = []
        for idx, pred in enumerate(preds):
            end = lengths[idx] - 1
            pred = pred[1:end].tolist()
            out = self.tokenizer.detokenize(pred)
            output.append(out)

        output = [output[indices.index(i)] for i in range(len(output))]

        elapsed = time.time() - translate_timer
        batch_time.update(elapsed, batch_size)
        total_tokens = stats['total_dec_len'] + stats['total_enc_len']
        ttps = total_tokens / elapsed
        tot_tok_per_sec.update(ttps, batch_size)
        iterations.update(stats['iters'])
        enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
        dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)

        if i % self.print_freq == 0:
            log = []
            log += f'TEST '
            if epoch is not None:
                log += f'[{epoch}]'
            if iteration is not None:
                log += f'[{iteration}]'
            log += f'[{i}/{len(self.loader)}]\t'
            log += f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
            log += f'Decoder iters {iterations.val:.1f} ({iterations.avg:.1f})\t'
            log += f'Tok/s {tot_tok_per_sec.val:.0f} ({tot_tok_per_sec.avg:.0f})'
            log = ''.join(log)
            logging.info(log)

        for line in output:
            eval_file.write(line)
            eval_file.write('\n')

    eval_file.close()

    if summary:
        time_per_sentence = (batch_time.avg / self.loader.batch_size)
        log = []
        log += f'TEST SUMMARY:\n'
        log += f'Lines translated: {total_lines}\t'
        log += f'Avg total tokens/s: {tot_tok_per_sec.avg:.0f}\n'
        log += f'Avg time per batch: {batch_time.avg:.3f} s\t'
        log += f'Avg time per sentence: {1000*time_per_sentence:.3f} ms\n'
        log += f'Avg encoder seq len: {enc_seq_len.avg:.2f}\t'
        log += f'Avg decoder seq len: {dec_seq_len.avg:.2f}\t'
        log += f'Total decoder iterations: {total_iters}'
        log = ''.join(log)
        logging.info(log)

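
# A standalone illustration of the `output[indices.index(i)]` reordering used
# in the evaluation loops above: the loader sorts sentences (e.g. by length),
# indices[k] records the original position of batch element k, and
# indices.index(i) finds which batch element corresponds to original line i.
# The values below are made up for the demonstration.
indices = [2, 0, 1]            # batch element 0 came from original line 2, etc.
output = ['c', 'a', 'b']       # detokenized translations in batch order
restored = [output[indices.index(i)] for i in range(len(output))]
print(restored)                # ['a', 'b', 'c'] -- back in original file order
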
def main():
    execution_timer = time.time()
    tfiargs = tfiParser.getParser()
    args = tfiargs.parse_args()

    # import os
    # os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True

    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = True
        print("Use GPU: {} for training".format(args.gpu))

    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    vocab_size = checkpoint['tokenizer'].vocab_size
    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))
    model_config['batch_first'] = args.batch_first
    model = models.GNMT(**model_config)

    state_dict = checkpoint['state_dict']
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)
    model.load_state_dict(state_dict)

    if args.gpu is not None:
        model = model.cuda()

    tokenizer = checkpoint['tokenizer']

    test_data = ParallelDataset(
        src_fname=os.path.join(args.data, config.SRC_TEST_FNAME),
        tgt_fname=os.path.join(args.data, config.TGT_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=0,
        max_len=150,
        sort=False)

    test_loader = test_data.get_loader(batch_size=args.batch_size,
                                       batch_first=True,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False,
                                       distributed=False)

    translator = Translator(model,
                            tokenizer,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.gpu is not None)

    model.eval()
    # torch.cuda.empty_cache()

    if args.record_prefix is not None:
        record = Record('GNMTv2',
                        batch_size=args.batch_size,
                        injection=args.injection,
                        fiLayer=args.layer,
                        fiFeatures=args.fiFeats,
                        fiWeights=args.fiWeights)

    # faulty run
    if args.faulty:
        fi = FI(model,
                record=record,
                fiMode=args.injection,
                fiLayer=args.layer,
                fiBit=args.bit,
                fiFeatures=args.fiFeats,
                fiWeights=args.fiWeights,
                log=args.log)

        traverse_time = AverageMeter()
        start = time.time()
        fi.traverseModel(model)
        traverse_time.update(time.time() - start)

        displayConfig(args)
        fi.injectionMode = True
        print("\n Number of new layers: #%d \n" % fi.numNewLayers)
    elif args.golden:
        import distiller.modules as dist
        model = dist.convert_model_to_distiller_lstm(model)

    if args.quantize:
        overrides_yaml = """
        .*att_rnn.attn.*:
            clip_acts: NONE  # quantize without clipping
        decoder.classifier.classifier:
            clip_acts: NONE  # quantize without clipping
        """
        from distiller.utils import yaml_ordered_load
        overrides = yaml_ordered_load(overrides_yaml)
        # basic quantizer definition
        stats_file = '/home/bfgoldstein/torchfi/examples/wmt16/model_stats.yaml'
        quantizer = tfi.FIPostTraLinearQuantizer(
            model,
            mode=args.quant_mode,
            bits_activations=args.quant_bacts,
            bits_parameters=args.quant_bwts,
            bits_accum=args.quant_baccum,
            per_channel_wts=args.quant_channel,
            clip_acts=args.quant_cacts,
            model_activation_stats=args.quant_stats_file,
            overrides=overrides,
            clip_n_stds=args.quant_cnstds,
            scale_approx_mult_bits=args.quant_scalebits)
        quantizer.prepare_model()
        # model = quantizer.model
        if args.faulty:
            fi.setQuantParams(args)

    print(model._modules.items())

    # set model to evaluation mode and move to cuda (if enabled) after FI traverse
    model.eval()
    if args.gpu is not None:
        model = model.cuda()

    test_file = open(args.record_prefix +
                     getRecordPrefix(args, 'fp32', faulty=args.faulty) + ".tok",
                     'w', encoding='UTF-8')

    batch_time = AverageMeter(False)
    tot_tok_per_sec = AverageMeter(False)
    iterations = AverageMeter(False)
    enc_seq_len = AverageMeter(False)
    dec_seq_len = AverageMeter(False)
    bleu_score = AverageMeter(False)
    score_time = AverageMeter(False)
    stats = {}

    reference_content = readReferenceFile(args)

    for batch_idx, (input, target, indices) in enumerate(test_loader):
        translate_timer = time.time()
        input_data, input_length = input

        if translator.batch_first:
            batch_size = input_data.size(0)
        else:
            batch_size = input_data.size(1)
        beam_size = args.beam_size

        bos = [translator.insert_target_start] * (batch_size * beam_size)
        bos = torch.LongTensor(bos)
        if translator.batch_first:
            bos = bos.view(-1, 1)
        else:
            bos = bos.view(1, -1)

        input_length = torch.LongTensor(input_length)
        stats['total_enc_len'] = int(input_length.sum())

        if args.gpu is not None:
            input_data = input_data.cuda(args.gpu, non_blocking=True)
            input_length = input_length.cuda(args.gpu, non_blocking=True)
            bos = bos.cuda(args.gpu, non_blocking=True)

        with torch.no_grad():
            context = translator.model.encode(input_data, input_length)
            context = [context, input_length, None]

            if beam_size == 1:
                generator = translator.generator.greedy_search
            else:
                generator = translator.generator.beam_search
            preds, lengths, counter = generator(batch_size, bos, context)

        if args.faulty:
            fi.injectionMode = True

        stats['total_dec_len'] = lengths.sum().item()
        stats['iters'] = counter

        preds = preds.cpu()
        lengths = lengths.cpu()

        output = []
        for idx, pred in enumerate(preds):
            end = lengths[idx] - 1
            pred = pred[1:end]
            pred = pred.tolist()
            out = translator.tok.detokenize(pred)
            output.append(out)

        output = [output[indices.index(i)] for i in range(len(output))]

        for line_idx, line in enumerate(output):
            score_timer = time.time()
            detok_sentence = detokenizeSentence(args, line)
            chunk = (batch_idx * batch_size) + line_idx
            score = scoreBleuSentence(args, detok_sentence,
                                      reference_content[chunk])
            bleu_score.update(score)
            record.addBleuScores(score)
            # get timing
            elapsed = time.time() - score_timer
            score_time.update(elapsed)
            test_file.write(line)
            test_file.write('\n')

        # get timing
        elapsed = time.time() - translate_timer
        batch_time.update(elapsed, batch_size)
        total_tokens = stats['total_dec_len'] + stats['total_enc_len']
        ttps = total_tokens / elapsed
        tot_tok_per_sec.update(ttps, batch_size)
        iterations.update(stats['iters'])
        enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
        dec_seq_len.update(stats['total_dec_len'] / batch_size, batch_size)

        if batch_idx % args.print_freq == 0:
            print('[Test {}] Time: {:.3f} ({:.3f})\t'
                  'Decoder iters {:.1f} ({:.1f})\t'
                  'Tok/s {:.0f} ({:.0f})\n'
                  'Bleu score: {:.2f} ({:.2f})\t'
                  'Bleu time: {:.3f} ({:.3f})'.format(
                      batch_idx, batch_time.val, batch_time.avg,
                      iterations.val, iterations.avg,
                      tot_tok_per_sec.val, tot_tok_per_sec.avg,
                      bleu_score.val, bleu_score.avg,
                      score_time.val, score_time.avg))

    # summary timing
    time_per_sentence = (batch_time.avg / batch_size)
    print('[Test] Summary\n'
          'Lines translated: {}\t'
          'Avg total tokens/s: {:.0f}\n'
          'Avg time per batch: {:.3f} s\t'
          'Avg time per sentence: {:.3f} ms\n'
          'Avg encoder seq len: {:.2f}\t'
          'Avg decoder seq len: {:.2f}\t'
          'Total decoder iterations: {}\n'
          'Traverse time: {:.3f} s\t'
          'Total number of injections: {}'.format(
              len(test_loader.dataset), tot_tok_per_sec.avg, batch_time.avg,
              1000 * time_per_sentence, enc_seq_len.avg, dec_seq_len.avg,
              int(iterations.sum),
              traverse_time.val if args.faulty else 0.0,
              int(fi.numInjections) if args.faulty else 0))

    test_file.close()

    detok = detokenizeFile(args)
    bleu = scoreBleuFile(args, detok)

    record.setBleuScoreAvg(bleu)
    saveRecord(args.record_prefix +
               getRecordPrefix(args, 'fp32', faulty=args.faulty), record)

    print('BLEU on test dataset: {}'.format(bleu))

    # get timing
    execution_elapsed = time.time() - execution_timer
    print('Finished evaluation on test set in {:.2f} seconds'.format(
        execution_elapsed))