def skyline_model_provider():
    """
    Build the GNMT network together with its loss and return it on the GPU.

    Hyper-parameters (hidden size, number of layers, dropout, embedding
    sharing) are read from ``get_args()``; the vocabulary size is fixed at
    32317 and the network is always built with ``batch_first=False``.

    :returns: a ``GNMTWithLoss`` instance moved to CUDA, with ``zero_grad()``
        already called on it
    """
    args = get_args()
    vocab_size = 32317

    network = GNMT(
        vocab_size=vocab_size,
        hidden_size=args.hidden_size,
        num_layers=args.num_layers,
        dropout=args.dropout,
        batch_first=False,
        share_embedding=args.share_embedding,
    )
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)

    model = GNMTWithLoss(network, criterion).cuda()
    model.zero_grad()
    return model
def main():
    """
    Launches data-parallel multi-gpu training.

    Steps performed, in order: parse the command line, optionally disable
    cuDNN and seed torch, initialize ``torch.distributed`` when
    ``args.world_size > 1``, create the results directory and per-rank log
    file, build tokenizer / datasets / GNMT model / criterion / data loaders,
    build a ``Translator`` for the test set and a ``Seq2SeqTrainer``,
    optionally resume from a checkpoint, then run the epoch loop.

    The epoch loop stops early when ``translator.run`` reports
    ``break_training``.
    """
    args = parse_args()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False
    if args.seed is not None:
        # the rank is added so that each worker uses a different torch seed
        torch.manual_seed(args.seed + args.rank)

    # initialize distributed backend
    distributed = args.world_size > 1
    if distributed:
        backend = 'nccl' if args.cuda else 'gloo'
        dist.init_process_group(backend=backend, rank=args.rank,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging (one log file per rank)
    log_filename = f'log_gpu_{args.rank}.log'
    setup_logging(os.path.join(save_path, log_filename))

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    if args.cuda:
        # the rank is used directly as the CUDA device index
        torch.cuda.set_device(args.rank)

    # build tokenizer
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME))

    # build datasets
    train_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    val_data = ParallelDataset(src_fname=os.path.join(args.dataset_dir, config.SRC_VAL_FNAME),
                               tgt_fname=os.path.join(args.dataset_dir, config.TGT_VAL_FNAME),
                               tokenizer=tokenizer,
                               min_len=args.min_length_val,
                               max_len=args.max_length_val,
                               sort=True)

    test_data = TextDataset(src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
                            tokenizer=tokenizer,
                            min_len=args.min_length_test,
                            max_len=args.max_length_test,
                            sort=False)

    vocab_size = tokenizer.vocab_size

    # build GNMT model; args.model_config is a Python literal (parsed with
    # literal_eval) that supplies the remaining constructor keyword arguments
    model_config = dict(vocab_size=vocab_size, math=args.math,
                        **literal_eval(args.model_config))
    model = GNMT(**model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)
    opt_config = literal_eval(args.optimization_config)
    logging.info(f'Training optimizer: {opt_config}')

    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info(f'Number of parameters: {num_parameters}')

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         bucketing=args.bucketing,
                                         num_workers=args.workers,
                                         drop_last=True)

    val_loader = val_data.get_loader(batch_size=args.val_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.workers,
                                     drop_last=False)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       num_workers=args.workers,
                                       drop_last=False)

    # translator runs beam search over the test loader
    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    # create trainer
    trainer_options = dict(criterion=criterion,
                           grad_clip=args.grad_clip,
                           save_path=save_path,
                           save_freq=args.save_freq,
                           save_info={
                               'config': args,
                               'tokenizer': tokenizer
                           },
                           opt_config=opt_config,
                           batch_first=batch_first,
                           keep_checkpoints=args.keep_checkpoints,
                           math=args.math,
                           print_freq=args.print_freq,
                           cuda=args.cuda,
                           distributed=distributed,
                           intra_epoch_eval=args.intra_epoch_eval,
                           translator=translator)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # optionally resume from a checkpoint; args.resume may name either a
    # checkpoint file or a directory containing 'model_best.pth'
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            # a missing checkpoint is only logged; training then proceeds
            # with the freshly built trainer
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    best_loss = float('inf')
    for epoch in range(args.start_epoch,
                       args.epochs):
        logging.info(f'Starting epoch {epoch}')

        if distributed:
            train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        # evaluate on validation set (rank 0 only)
        if args.rank == 0 and not args.disable_eval:
            logging.info(f'Running validation on dev set')
            val_loss, val_perf = trainer.evaluate(val_loader)

            # track the lowest validation loss seen so far and save a
            # checkpoint, flagged as best when this epoch improved on it
            is_best = val_loss < best_loss
            best_loss = min(val_loss, best_loss)

            trainer.save(save_all=args.save_all, is_best=is_best)

        break_training = False
        if not args.disable_eval:
            # every rank runs the translator; its second return value decides
            # whether the epoch loop stops
            test_bleu, break_training = translator.run(calc_bleu=True,
                                                       epoch=epoch)

        if args.rank == 0 and not args.disable_eval:
            logging.info(f'Summary: Epoch: {epoch}\t'
                         f'Training Loss: {train_loss:.4f}\t'
                         f'Validation Loss: {val_loss:.4f}\t'
                         f'Test BLEU: {test_bleu:.2f}')
            logging.info(f'Performance: Epoch: {epoch}\t'
                         f'Training: {train_perf:.0f} Tok/s\t'
                         f'Validation: {val_perf:.0f} Tok/s')
        else:
            # validation numbers exist only on rank 0 with evaluation enabled
            logging.info(f'Summary: Epoch: {epoch}\t'
                         f'Training Loss {train_loss:.4f}')
            logging.info(f'Performance: Epoch: {epoch}\t'
                         f'Training: {train_perf:.0f} Tok/s')

        logging.info(f'Finished epoch {epoch}')
        if break_training:
            break
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU, implementation supports beam search
    with length normalization and coverage penalty.

    The model and tokenizer are restored from the checkpoint at
    ``args.model``. One inference pass is executed for every combination of
    ``args.math``, ``args.batch_size`` and ``args.beam_size``; latency,
    throughput and BLEU of each pass are collected into tables that are
    written when ``args.tables`` is set.

    :returns: the value returned by ``utils.benchmark`` for the statistics of
        the last configuration that was run
    """
    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    args.rank = utils.get_rank()
    utils.setup_logging()

    if args.env:
        utils.log_env_info()

    logging.info(f'Run arguments: {args}')

    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # build GNMT model; the tokenizer state and model configuration are both
    # read from the checkpoint, batch_first and vocab_size are overridden
    tokenizer = Tokenizer()
    tokenizer.set_state(checkpoint['tokenizer'])
    model_config = checkpoint['model_config']
    model_config['batch_first'] = args.batch_first
    model_config['vocab_size'] = tokenizer.vocab_size
    model = GNMT(**model_config)
    model.load_state_dict(checkpoint['state_dict'])

    # construct the dataset
    # NOTE(review): `data` stays unbound when neither args.input nor
    # args.input_text is set -- presumably the argument parser guarantees one
    # of them; confirm.
    if args.input:
        data = RawTextDataset(
            raw_datafile=args.input,
            tokenizer=tokenizer,
            sort=args.sort,
        )
    elif args.input_text:
        data = RawTextDataset(
            raw_data=args.input_text,
            tokenizer=tokenizer,
            sort=args.sort,
        )

    latency_table = tables.LatencyTable(args.percentiles)
    throughput_table = tables.ThroughputTable(args.percentiles)
    accuracy_table = tables.AccuracyTable('BLEU')

    # maps the math mode name to the tensor type the model is cast to
    dtype = {'fp32': torch.FloatTensor, 'fp16': torch.HalfTensor}

    for (math, batch_size, beam_size) in product(args.math, args.batch_size,
                                                 args.beam_size):
        logging.info(f'math: {math}, batch size: {batch_size}, '
                     f'beam size: {beam_size}')

        model.type(dtype[math])
        model = model.to(device)
        model.eval()

        # build the data loader
        loader = data.get_loader(
            batch_size=batch_size,
            batch_first=args.batch_first,
            pad=True,
            repeat=args.repeat[batch_size],
            num_workers=0,
        )

        # build the translator object
        translator = Translator(
            model=model,
            tokenizer=tokenizer,
            loader=loader,
            beam_size=beam_size,
            max_seq_len=args.max_seq_len,
            len_norm_factor=args.len_norm_factor,
            len_norm_const=args.len_norm_const,
            cov_penalty_factor=args.cov_penalty_factor,
            print_freq=args.print_freq,
        )

        # execute the inference
        output, stats = translator.run(
            calc_bleu=args.bleu,
            eval_path=args.output,
            summary=True,
            warmup=args.warmup,
            reference_path=args.reference,
        )

        # print translated outputs (only when no output file was requested,
        # and only on rank 0)
        if not args.output and args.rank == 0:
            logging.info(f'Translated output:')
            for out in output:
                print(out)

        # results are keyed by (batch size, beam size), one column per math
        key = (batch_size, beam_size)
        latency_table.add(key, {math: stats['runtimes']})
        throughput_table.add(key, {math: stats['throughputs']})
        accuracy_table.add(key, {math: stats['bleu']})

    if args.tables:
        accuracy_table.write('Inference accuracy', args.math)

        # fp16 results are reported relative to fp32 when both were measured
        if 'fp16' in args.math and 'fp32' in args.math:
            relative = 'fp32'
        else:
            relative = None

        if 'fp32' in args.math:
            throughput_table.write('Inference throughput', 'fp32')
        if 'fp16' in args.math:
            throughput_table.write('Inference throughput', 'fp16',
                                   relative=relative)

        if 'fp32' in args.math:
            latency_table.write('Inference latency', 'fp32')
        if 'fp16' in args.math:
            latency_table.write('Inference latency', 'fp16',
                                relative=relative, reverse_speedup=True)

    # `stats` holds the statistics of the last loop iteration only
    # NOTE(review): `stats` is unbound if the product above is empty.
    passed = utils.benchmark(stats['bleu'], args.target_bleu,
                             stats['tokens_per_sec'], args.target_perf)
    return passed
def main():
    """
    Launches data-parallel multi-gpu training.

    Builds tokenizer, datasets, GNMT model, criterion, data loaders, a
    ``Translator`` for test-set BLEU and a ``Seq2SeqTrainer``, optionally
    resumes from a checkpoint, and trains for epochs ``args.start_epoch`` up
    to (excluding) ``args.epochs``. When ``args.eval`` is set, every epoch is
    followed by validation and a test-set translation; training stops early
    once ``args.target_bleu`` is reached.

    After training a summary table is written on rank 0 and the process exits
    with status 1 when ``utils.benchmark`` does not pass.
    """
    # wall-clock start; the total time reported at the end includes setup
    training_start = time.time()
    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    args.rank = utils.get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    os.makedirs(args.save_dir, exist_ok=True)

    # setup logging (one log file per rank)
    log_filename = f'log_rank_{utils.get_rank()}.log'
    utils.setup_logging(args.log_all_ranks,
                        os.path.join(args.save_dir, log_filename))

    if args.env:
        utils.log_env_info()

    logging.info(f'Saving results to: {args.save_dir}')
    logging.info(f'Run arguments: {args}')

    args.train_iter_size = set_iter_size(args.train_iter_size,
                                         args.train_global_batch_size,
                                         args.train_batch_size)

    worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed,
                                                      args.epochs,
                                                      device)
    # each rank picks its own seed out of the generated list
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer
    pad_vocab = utils.pad_vocabulary(args.math)
    tokenizer = Tokenizer(args.vocab, args.bpe_codes, args.lang, pad_vocab)

    # build datasets
    train_data = LazyParallelDataset(
        src_fname=args.train_src,
        tgt_fname=args.train_tgt,
        tokenizer=tokenizer,
        min_len=args.train_min_length,
        max_len=args.train_max_length,
        sort=False,
        max_size=args.train_max_size,
        )

    val_data = ParallelDataset(
        src_fname=args.val_src,
        tgt_fname=args.val_tgt,
        tokenizer=tokenizer,
        min_len=args.val_min_length,
        max_len=args.val_max_length,
        sort=True,
        )

    test_data = TextDataset(
        src_fname=args.test_src,
        tokenizer=tokenizer,
        min_len=args.test_min_length,
        max_len=args.test_max_length,
        sort=True,
        )

    vocab_size = tokenizer.vocab_size

    # build GNMT model; the same dict is stored in checkpoints (save_info)
    model_config = {
        'hidden_size': args.hidden_size,
        'vocab_size': vocab_size,
        'num_layers': args.num_layers,
        'dropout': args.dropout,
        'batch_first': False,
        'share_embedding': args.share_embedding,
        }
    model = GNMT(**model_config).to(device)
    logging.info(model)

    batch_first = model.batch_first

    # define loss
    # function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD,
                                args.smoothing).to(device)

    # args.optimizer_extra is a Python literal (parsed with literal_eval)
    # whose entries are merged on top of the optimizer name and learning rate
    opt_config = {'optimizer': args.optimizer, 'lr': args.lr}
    opt_config.update(literal_eval(args.optimizer_extra))
    logging.info(f'Training optimizer config: {opt_config}')

    scheduler_config = {
        'warmup_steps': args.warmup_steps,
        'remain_steps': args.remain_steps,
        'decay_interval': args.decay_interval,
        'decay_steps': args.decay_steps,
        'decay_factor': args.decay_factor
        }

    logging.info(f'Training LR schedule config: {scheduler_config}')

    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info(f'Number of parameters: {num_parameters}')

    batching_opt = {
        'shard_size': args.shard_size,
        'num_buckets': args.num_buckets
        }

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.train_batch_size,
                                         seeds=shuffling_seeds,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         batching=args.batching,
                                         batching_opt=batching_opt,
                                         num_workers=args.train_loader_workers)

    val_loader = val_data.get_loader(batch_size=args.val_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.val_loader_workers)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       pad=True,
                                       num_workers=args.test_loader_workers)

    translator = Translator(
        model=model,
        tokenizer=tokenizer,
        loader=test_loader,
        beam_size=args.beam_size,
        max_seq_len=args.test_max_length,
        len_norm_factor=args.len_norm_factor,
        len_norm_const=args.len_norm_const,
        cov_penalty_factor=args.cov_penalty_factor,
        print_freq=args.print_freq,
        reference=args.test_tgt,
        )

    # create trainer
    # loader length is divided by the iteration size (floor division) and
    # multiplied by the number of epochs
    total_train_iters = len(train_loader) // args.train_iter_size * args.epochs

    save_info = {
        'model_config': model_config,
        'config': args,
        'tokenizer': tokenizer.get_state()
        }
    loss_scaling = {
        'init_scale': args.init_scale,
        'upscale_interval': args.upscale_interval
        }
    trainer_options = dict(
        model=model,
        criterion=criterion,
        grad_clip=args.grad_clip,
        iter_size=args.train_iter_size,
        save_dir=args.save_dir,
        save_freq=args.save_freq,
        save_info=save_info,
        opt_config=opt_config,
        scheduler_config=scheduler_config,
        train_iterations=total_train_iters,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        loss_scaling=loss_scaling,
        print_freq=args.print_freq,
        intra_epoch_eval=args.intra_epoch_eval,
        translator=translator,
        prealloc_mode=args.prealloc_mode,
        )

    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # optionally resume from a checkpoint; args.resume may name either a
    # checkpoint file or a directory containing 'model_best.pth'
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(
                checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            # a missing checkpoint is only logged; training still starts
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    best_loss = float('inf')
    training_perf = []
    break_training = False
    test_bleu = None
    for epoch in range(args.start_epoch, args.epochs):
        logging.info(f'Starting epoch {epoch}')

        train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)
        training_perf.append(train_perf)

        # evaluate on validation set
        if args.eval:
            logging.info(f'Running validation on dev set')
            val_loss, val_perf = trainer.evaluate(val_loader)

            # track the lowest validation loss and save a checkpoint
            # (rank 0 only)
            if args.rank == 0:
                is_best = val_loss < best_loss
                best_loss = min(val_loss, best_loss)
                trainer.save(save_all=args.save_all, is_best=is_best)

        if args.eval:
            utils.barrier()
            eval_fname = f'eval_epoch_{epoch}'
            eval_path = os.path.join(args.save_dir, eval_fname)
            _, eval_stats = translator.run(
                calc_bleu=True,
                epoch=epoch,
                eval_path=eval_path,
                )
            test_bleu = eval_stats['bleu']
            # a falsy target_bleu (None or 0) disables early stopping
            if args.target_bleu and test_bleu >= args.target_bleu:
                logging.info(f'Target accuracy reached')
                break_training = True

        acc_log = []
        acc_log += [f'Summary: Epoch: {epoch}']
        acc_log += [f'Training Loss: {train_loss:.4f}']
        if args.eval:
            acc_log += [f'Validation Loss: {val_loss:.4f}']
            acc_log += [f'Test BLEU: {test_bleu:.2f}']

        perf_log = []
        perf_log +=             [f'Performance: Epoch: {epoch}']
        perf_log += [f'Training: {train_perf:.0f} Tok/s']
        if args.eval:
            perf_log += [f'Validation: {val_perf:.0f} Tok/s']

        # the per-epoch summary lines are emitted by rank 0 only
        if args.rank == 0:
            logging.info('\t'.join(acc_log))
            logging.info('\t'.join(perf_log))

        logging.info(f'Finished epoch {epoch}')
        if break_training:
            break

    utils.barrier()
    training_stop = time.time()
    training_time = training_stop - training_start
    logging.info(f'Total training time {training_time:.0f} s')

    table = TrainingTable()
    # NOTE(review): training_perf is empty (ZeroDivisionError here) and
    # train_perf is unbound below when args.start_epoch >= args.epochs.
    avg_training_perf = sum(training_perf) / len(training_perf)
    table.add(utils.get_world_size(), args.train_batch_size, test_bleu,
              avg_training_perf, training_time)
    if utils.get_rank() == 0:
        table.write('Training Summary', args.math)

    # the benchmark receives the throughput of the last epoch, not the average
    passed = utils.benchmark(test_bleu, args.target_bleu,
                             train_perf, args.target_perf)
    if not passed:
        sys.exit(1)
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU, implementation supports beam search
    with length normalization and coverage penalty.

    For every epoch for which ``checkpoint.0.pth.tar.epoch.<epoch>`` exists
    under ``args.checkpoint_path``, the per-stage checkpoints
    (``checkpoint.<stage>.pth.tar.epoch.<epoch>``) are loaded and merged into
    one state dict for a full ``GNMT`` model; the model is then run over
    ``args.input`` and one ``<epoch>\\t<BLEU>`` line is printed.
    """
    args = parse_args()
    args.batch_first = False

    if args.cuda:
        torch.cuda.set_device(0)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    num_stages = args.num_stages
    # compute BLEU score for every epoch
    print("Epoch\tBLEU score")
    epoch = 0
    while True:
        # no more epochs to run, since desired file not available
        if not os.path.isfile(
                os.path.join(args.checkpoint_path,
                             f"checkpoint.0.pth.tar.epoch.{epoch}")):
            break

        # args.module names an importable module; its model(None) result is
        # used as a sequence whose length is the number of modules and whose
        # items unpack to three values, the first being the stage module
        module = importlib.import_module(args.module)
        model = module.model(None)
        num_modules = len(model)

        key_to_module_mapping = OrderedDict()
        all_stages_state_dict = OrderedDict()
        # module_id counts modules across all stages; local_module_id (below)
        # restarts at 0 for every stage checkpoint
        module_id = 0
        stage_id = 0
        for stage_id in range(num_stages):
            # load the checkpoint associated with a stage
            full_checkpoint_path = os.path.join(
                args.checkpoint_path,
                f"checkpoint.{stage_id}.pth.tar.epoch.{epoch}")
            checkpoint = torch.load(full_checkpoint_path,
                                    map_location=torch.device('cpu'))

            # iterate through all modules in stage_id's checkpoint
            local_module_id = 0

            # quit when checkpoints for all modules in full model are loaded
            while module_id < num_modules:

                # load checkpoint corresponding to different modules in our runtime
                state_dict = checkpoint["state_dict"]
                state_dict_key = "module%d" % local_module_id

                # this stage holds no further modules; go to the next stage
                if state_dict_key not in state_dict:
                    break
                state_dict = checkpoint["state_dict"][state_dict_key]

                # remove mask buffer (keys are collected first because the
                # dict cannot be modified while it is being iterated)
                keys_to_delete = []
                for key in state_dict:
                    if "mask" in key:
                        keys_to_delete.append(key)
                for key in keys_to_delete:
                    del state_dict[key]

                if checkpoint_from_distributed(state_dict):
                    state_dict = unwrap_distributed(state_dict)

                # collect all state_dicts in a single OrderedDict
                for key in state_dict:
                    all_stages_state_dict[(stage_id, local_module_id, key)] = state_dict[key]

                stage_module, _, _ = model[module_id]
                for key in state_dict:
                    # key_to_module_mapping maps key (in state_dict) to the
                    # torch.nn.Module wrapping the parameter and the name
                    # of parameter (weight, bias, etc.)
                    key_to_module_mapping[(
                        stage_id, local_module_id,
                        key)] = get_submodule_and_parameter_name(
                            stage_module, key)

                # load tokenizer state
                tokenizer = Tokenizer()
                tokenizer.set_state(checkpoint['tokenizer'])
                vocab_size = tokenizer.vocab_size

                local_module_id += 1
                module_id += 1

        # incremented before the BLEU line is printed, so the printed epoch
        # number is one greater than the checkpoint suffix that was loaded
        epoch += 1

        # build model, and load state dict
        model_config = {
            'vocab_size': vocab_size,
            'batch_first': args.batch_first,
            'hidden_size': 1024,
            'num_layers': args.num_layers,
            'dropout': 0.2,
            'share_embedding': False
        }
        model = GNMT(**model_config)
        model_state_dict = OrderedDict()
        for real_key in model.state_dict():
            (module, parameter_name) = get_submodule_and_parameter_name(
                model, real_key)
            # find key in all_stages_state_dict that corresponds to real_key in
            # model's state_dict; a match requires the same parameter name and
            # an identical string representation of the owning module
            for key in key_to_module_mapping:
                (module2, parameter_name2) = key_to_module_mapping[key]
                if parameter_name == parameter_name2 and str(module) == str(
                        module2):
                    break
            # the condition is re-tested after the loop because `key`,
            # `module2` and `parameter_name2` keep their last values whether
            # or not the loop ended through `break`
            if parameter_name == parameter_name2 and str(module) == str(
                    module2):
                model_state_dict[real_key] = all_stages_state_dict[key]
                # matched entries are removed so they cannot match again
                del key_to_module_mapping[key]
                del all_stages_state_dict[key]

        # load state_dict into model, and perform inference
        model.load_state_dict(model_state_dict)

        if args.math == 'fp32':
            dtype = torch.FloatTensor
        if args.math == 'fp16':
            dtype = torch.HalfTensor

        model.type(dtype)
        # NOTE(review): the model is moved to CUDA unconditionally here, even
        # though args.cuda is checked earlier -- confirm this is intended.
        model = model.cuda()
        model.eval()

        # construct the dataset
        test_data = TextDataset(src_fname=args.input,
                                tokenizer=tokenizer,
                                sort=False)

        # build the data loader
        test_loader = test_data.get_loader(world_size=1, rank=0,
                                           batch_size=args.batch_size,
                                           batch_first=args.batch_first,
                                           shuffle=False,
                                           pad=True,
                                           num_workers=0)

        # build the translator object
        translator = Translator(model=model,
                                tokenizer=tokenizer,
                                loader=test_loader,
                                beam_size=args.beam_size,
                                max_seq_len=args.max_seq_len,
                                len_norm_factor=args.len_norm_factor,
                                len_norm_const=args.len_norm_const,
                                cov_penalty_factor=args.cov_penalty_factor,
                                cuda=args.cuda,
                                print_freq=args.print_freq,
                                dataset_dir=args.dataset_dir)

        # execute the inference
        test_bleu, _ = translator.run(calc_bleu=args.bleu,
                                      eval_path=args.output,
                                      reference_path=args.reference,
                                      summary=True)
        print(f'{epoch}\t{test_bleu:.2f}')
def main():
    """
    Launches data-parallel multi-gpu training.

    Emits ``mlperf_log`` tags through ``gnmt_print`` around the individual
    setup and training phases. Distributed mode is enabled when the
    ``WORLD_SIZE`` environment variable is greater than 1 and requires CUDA.

    The training loop in this entry point runs exactly one epoch
    (``range(1)``) and performs no validation or test translation; a
    checkpoint is saved afterwards on rank 0 unless ``args.disable_eval`` is
    set.
    """
    mlperf_log.ROOT_DIR_GNMT = os.path.dirname(os.path.abspath(__file__))
    mlperf_log.LOGGER.propagate = False

    args = parse_args()

    if args.cuda:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # initialize distributed backend
    distributed = False
    if 'WORLD_SIZE' in os.environ:
        distributed = int(os.environ['WORLD_SIZE']) > 1

    if distributed:
        assert args.cuda
        '''Initialize distributed communication'''
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        assert torch.distributed.is_initialized()

    gnmt_print(key=mlperf_log.RUN_START)

    args.rank = get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging (one log file per rank)
    log_filename = f'log_gpu_{args.rank}.log'
    setup_logging(os.path.join(save_path, log_filename))

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # setup L2 promotion
    if args.cuda:
        l2_promote()

    gnmt_print(key=mlperf_log.RUN_SET_RANDOM_SEED)
    # https://github.com/mlperf/policies/issues/120#issuecomment-431111348
    if args.seed is None:
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)
        if get_rank() == 0:
            # master seed is reported only from rank=0 worker, it's to avoid
            # confusion, seeds from rank=0 are later broadcasted to other
            # workers
            logging.info(f'Using random master seed: {master_seed}')
    else:
        # master seed was specified from command line
        master_seed = args.seed
        logging.info(f'Using master seed from command line: {master_seed}')

    # initialize seeding RNG
    seeding_rng = random.Random(master_seed)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(seeding_rng, get_world_size())

    # generate seeds for data shuffling,
    # one seed for every epoch
    shuffling_seeds = generate_seeds(seeding_rng, args.epochs)

    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device)
    shuffling_seeds = broadcast_seeds(shuffling_seeds, device)

    # set worker seed
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME))

    # build datasets
    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_TRAINING)
    gnmt_print(key=mlperf_log.TRAIN_HP_MAX_SEQ_LEN,
               value=args.max_length_train)

    train_data = LazyParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    gnmt_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
               value=len(train_data))

    val_data = ParallelDataset(src_fname=os.path.join(args.dataset_dir, config.SRC_VAL_FNAME),
                               tgt_fname=os.path.join(args.dataset_dir, config.TGT_VAL_FNAME),
                               tokenizer=tokenizer,
                               min_len=args.min_length_val,
                               max_len=args.max_length_val,
                               sort=True)

    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_EVAL)

    test_data = TextDataset(src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
                            tokenizer=tokenizer,
                            min_len=args.min_length_test,
                            max_len=args.max_length_test,
                            sort=False)

    gnmt_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
               value=len(test_data))

    vocab_size = tokenizer.vocab_size
    # size of the vocabulary has been padded to a multiple of 8
    gnmt_print(key=mlperf_log.PREPROC_VOCAB_SIZE,
               value=vocab_size)

    # build GNMT model; args.model_config is a Python literal (parsed with
    # literal_eval) that supplies the remaining constructor keyword arguments
    model_config = dict(vocab_size=vocab_size, math=args.math,
                        **literal_eval(args.model_config))
    model = GNMT(**model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size,
                                config.PAD, args.smoothing)
    opt_config = literal_eval(args.optimization_config)
    scheduler_config = literal_eval(args.scheduler_config)
    logging.info(f'Training optimizer: {opt_config}')
    logging.info(f'Training LR Schedule: {scheduler_config}')

    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info(f'Number of parameters: {num_parameters}')

    # get data loaders
    train_loader = train_data.get_loader(batch_size=args.batch_size,
                                         seeds=shuffling_seeds,
                                         batch_first=batch_first,
                                         shuffle=True,
                                         bucketing=args.bucketing,
                                         num_workers=args.train_loader_workers)

    # the logged batch size is the per-worker batch size times world size
    gnmt_print(key=mlperf_log.INPUT_BATCH_SIZE,
               value=args.batch_size * get_world_size())
    gnmt_print(key=mlperf_log.INPUT_SIZE,
               value=train_loader.sampler.num_samples)

    val_loader = val_data.get_loader(batch_size=args.val_batch_size,
                                     batch_first=batch_first,
                                     shuffle=False,
                                     num_workers=args.val_loader_workers)

    test_loader = test_data.get_loader(batch_size=args.test_batch_size,
                                       batch_first=batch_first,
                                       shuffle=False,
                                       pad=True,
                                       num_workers=args.test_loader_workers)

    gnmt_print(key=mlperf_log.EVAL_SIZE,
               value=len(test_loader.dataset))

    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_length_test,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir,
                            target_bleu=args.target_bleu,
                            save_path=args.save_path)

    # create trainer
    trainer_options = dict(
        criterion=criterion,
        grad_clip=args.grad_clip,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info={
            'config': args,
            'tokenizer': tokenizer.get_state()
        },
        opt_config=opt_config,
        scheduler_config=scheduler_config,
        batch_first=batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed,
        distributed_overlap_allreduce=args.enable_apex_allreduce_overlap,
        distributed_overlap_allreduce_messagesize=args.apex_message_size,
        intra_epoch_eval=args.intra_epoch_eval,
        translator=translator,
        arch=args.arch)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # optionally resume from a checkpoint; args.resume may name either a
    # checkpoint file or a directory containing 'model_best.pth'
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(
                checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            # a missing checkpoint is only logged; training still starts
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    # best_loss = float('inf')
    gnmt_print(key=mlperf_log.TRAIN_LOOP)

    # NOTE(review): the loop is fixed to a single epoch and ignores
    # args.epochs / args.start_epoch -- confirm this is intended.
    for epoch in range(1):
        logging.info(f'Starting epoch {epoch}')
        gnmt_print(key=mlperf_log.TRAIN_EPOCH,
                   value=epoch)

        if distributed:
            train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        logging.info(f'Finished epoch {epoch}')

    # Save the checkpoint at the end of the training loop, after the RUN_STOP
    # tag
    # https://github.com/mlperf/policies/issues/55#issuecomment-428335773
    if not args.disable_eval:
        gnmt_print(key=mlperf_log.TRAIN_CHECKPOINT)
        if get_rank() == 0:
            trainer.save(save_all=args.save_all, is_best=True)

    gnmt_print(key=mlperf_log.RUN_FINAL)
def main():
    """
    Launches translation (inference).

    Restores tokenizer and GNMT weights from the checkpoint given by
    ``args.model``, casts the model to the precision selected by
    ``args.math`` and translates ``args.input`` with beam search (length
    normalization and coverage penalty are configurable). The result of the
    translator run is not returned.

    :raises RuntimeError: when fp16 math is requested without CUDA
    """
    args = parse_args()

    # bring up the distributed backend only for multi-process runs
    distributed = args.world_size > 1
    if distributed:
        if args.cuda:
            backend = 'nccl'
        else:
            backend = 'gloo'
        dist.init_process_group(
            backend=backend,
            rank=args.rank,
            init_method=args.dist_url,
            world_size=args.world_size,
        )

    setup_logging()
    logging.info(f'Run arguments: {args}')

    # device selection and sanity checks on the CUDA / precision combination
    if args.cuda:
        torch.cuda.set_device(args.rank)
    else:
        if torch.cuda.is_available():
            warnings.warn('cuda is available but not enabled')
        if args.math == 'fp16':
            raise RuntimeError('fp16 requires cuda')

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # the checkpoint is deserialized onto the CPU to save GPU memory
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # restore the tokenizer, then rebuild the model from the stored config
    tokenizer = Tokenizer()
    tokenizer.set_state(checkpoint['tokenizer'])
    vocab_size = tokenizer.vocab_size

    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))
    model_config['batch_first'] = args.batch_first
    model = GNMT(**model_config)

    # weights saved from a distributed wrapper are unwrapped before loading
    weights = checkpoint['state_dict']
    if checkpoint_from_distributed(weights):
        weights = unwrap_distributed(weights)
    model.load_state_dict(weights)

    # pick the tensor type that matches the requested math mode
    if args.math == 'fp32':
        dtype = torch.FloatTensor
    elif args.math == 'fp16':
        dtype = torch.HalfTensor
    model.type(dtype)

    if args.cuda:
        model = model.cuda()
    model.eval()

    # dataset and loader over the input text
    test_data = TextDataset(src_fname=args.input,
                            tokenizer=tokenizer,
                            sort=False)
    test_loader = test_data.get_loader(
        batch_size=args.batch_size,
        batch_first=args.batch_first,
        shuffle=False,
        pad=True,
        num_workers=0,
        drop_last=False,
    )

    # translator object that performs the beam search
    translator_options = dict(
        model=model,
        tokenizer=tokenizer,
        loader=test_loader,
        beam_size=args.beam_size,
        max_seq_len=args.max_seq_len,
        len_norm_factor=args.len_norm_factor,
        len_norm_const=args.len_norm_const,
        cov_penalty_factor=args.cov_penalty_factor,
        cuda=args.cuda,
        print_freq=args.print_freq,
        dataset_dir=args.dataset_dir,
    )
    translator = Translator(**translator_options)

    # run the inference
    translator.run(
        calc_bleu=args.bleu,
        eval_path=args.output,
        reference_path=args.reference,
        summary=True,
    )
def main():
    """
    Launches translation (inference).
    Inference is executed on a single GPU, implementation supports beam search
    with length normalization and coverage penalty.

    The model comes either from the checkpoint at ``args.model`` or, with
    ``args.synthetic``, from a freshly constructed ``GNMT`` without a
    tokenizer. One inference pass is executed for every combination of
    ``args.math``, ``args.batch_size`` and ``args.beam_size``; results are
    collected into tables (written when ``args.tables`` is set) and a summary
    is sent to ``dllogger``.

    :returns: the value returned by ``utils.benchmark`` for the statistics of
        the last configuration that was run
    :raises RuntimeError: when neither ``args.model`` nor ``args.synthetic``
        is given
    """
    args = parse_args()

    if args.affinity != 'disabled':
        nproc_per_node = torch.cuda.device_count()
        affinity = gpu_affinity.set_affinity(
            args.local_rank,
            nproc_per_node,
            args.affinity
        )
        print(f'{args.local_rank}: thread affinity: {affinity}')

    device = utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    args.rank = utils.get_rank()
    os.makedirs(args.save_dir, exist_ok=True)
    utils.setup_logging()

    dllog_file = os.path.join(args.save_dir, args.dllog_file)
    utils.setup_dllogger(enabled=True, filename=dllog_file)

    if args.profile:
        try:
            pyprof.init(enable_function_stack=True)
        except NameError:
            # the name `pyprof` is not defined in this module; profiling
            # continues without it
            warnings.warn('Called pyprof.init() but pyprof is not available')

    if args.env:
        utils.log_env_info()

    logging.info(f'Run arguments: {args}')
    dllogger.log(step='PARAMETER', data=vars(args))

    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    if args.model:
        checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

        # build GNMT model; the tokenizer state and model configuration are
        # both read from the checkpoint
        tokenizer = Tokenizer()
        tokenizer.set_state(checkpoint['tokenizer'])
        model_config = checkpoint['model_config']
        model_config['batch_first'] = args.batch_first
        model_config['vocab_size'] = tokenizer.vocab_size
        model = GNMT(**model_config)
        model.load_state_dict(checkpoint['state_dict'])
    elif args.synthetic:
        # synthetic mode: no weights are loaded and there is no tokenizer
        model = GNMT(args.synthetic_vocab, batch_first=args.batch_first)
        tokenizer = None
    else:
        raise RuntimeError(
            'Specify model either with --synthetic or with --model flag')

    # construct the dataset
    # NOTE(review): `data` stays unbound when none of args.input,
    # args.input_text and args.synthetic is set -- presumably the argument
    # parser prevents that; confirm.
    if args.input:
        data = RawTextDataset(
            raw_datafile=args.input,
            tokenizer=tokenizer,
            sort=args.sort,
        )
    elif args.input_text:
        data = RawTextDataset(
            raw_data=args.input_text,
            tokenizer=tokenizer,
            sort=args.sort,
        )
    elif args.synthetic:
        # the synthetic dataset size is derived from the first batch size only
        data = SyntheticDataset(args.synthetic_vocab, args.synthetic_len,
                                args.batch_size[0] * args.synthetic_batches)

    latency_table = tables.LatencyTable(args.percentiles)
    throughput_table = tables.ThroughputTable(args.percentiles)
    accuracy_table = tables.AccuracyTable('BLEU')

    # maps the math mode name to the tensor type the model is cast to;
    # 'tf32' uses the same tensor type as 'fp32'
    dtype = {
        'fp32': torch.FloatTensor,
        'tf32': torch.FloatTensor,
        'fp16': torch.HalfTensor
    }

    for (math, batch_size, beam_size) in product(args.math, args.batch_size,
                                                 args.beam_size):
        logging.info(f'math: {math}, batch size: {batch_size}, '
                     f'beam size: {beam_size}')

        model.type(dtype[math])
        model = model.to(device)
        model.eval()

        # build the data loader
        loader = data.get_loader(
            batch_size=batch_size,
            batch_first=args.batch_first,
            pad=True,
            repeat=args.repeat[batch_size],
            num_workers=0,
        )

        # build the translator object
        translator = Translator(
            model=model,
            tokenizer=tokenizer,
            loader=loader,
            beam_size=beam_size,
            max_seq_len=args.max_seq_len,
            len_norm_factor=args.len_norm_factor,
            len_norm_const=args.len_norm_const,
            cov_penalty_factor=args.cov_penalty_factor,
            print_freq=args.print_freq,
        )

        # execute the inference
        with torch.autograd.profiler.emit_nvtx(enabled=args.profile):
            output, stats = translator.run(
                calc_bleu=args.bleu,
                eval_path=args.output,
                summary=True,
                warmup=args.warmup,
                reference_path=args.reference,
            )

        # print translated outputs (never in synthetic mode; otherwise only
        # when no output file was requested, and only on rank 0)
        if not args.synthetic and (not args.output and args.rank == 0):
            logging.info(f'Translated output:')
            for out in output:
                print(out)

        # results are keyed by (batch size, beam size), one column per math
        key = (batch_size, beam_size)
        latency_table.add(key, {math: stats['runtimes']})
        throughput_table.add(key, {math: stats['throughputs']})
        accuracy_table.add(key, {math: stats['bleu']})

    if args.tables:
        accuracy_table.write('Inference accuracy', args.math)

        # fp16 results are reported relative to fp32 if it was measured,
        # otherwise relative to tf32 if that was measured
        if 'fp16' in args.math and 'fp32' in args.math:
            relative = 'fp32'
        elif 'fp16' in args.math and 'tf32' in args.math:
            relative = 'tf32'
        else:
            relative = None

        if 'fp32' in args.math:
            throughput_table.write('Inference throughput', 'fp32')
        if 'tf32' in args.math:
            throughput_table.write('Inference throughput', 'tf32')
        if 'fp16' in args.math:
            throughput_table.write('Inference throughput', 'fp16',
                                   relative=relative)

        if 'fp32' in args.math:
            latency_table.write('Inference latency', 'fp32')
        if 'tf32' in args.math:
            latency_table.write('Inference latency', 'tf32')
        if 'fp16' in args.math:
            latency_table.write('Inference latency', 'fp16',
                                relative=relative, reverse_speedup=True)

    # `stats` holds the statistics of the last loop iteration only
    # NOTE(review): `stats` is unbound if the product above is empty.
    avg_throughput = np.array(stats['throughputs']).mean()
    avg_latency = np.array(stats['runtimes']).mean()
    summary = {
        'eval_throughput': avg_throughput,
        'eval_bleu': stats['bleu'],
        'eval_avg_latency': avg_latency,
    }
    for p in args.percentiles:
        summary[f'eval_{p}%_latency'] = np.percentile(stats['runtimes'], p)

    dllogger.log(step=tuple(), data=summary)

    passed = utils.benchmark(stats['bleu'], args.target_bleu,
                             stats['tokens_per_sec'], args.target_perf)
    return passed
def main():
    """
    Launches data-parallel multi-gpu training.

    Parses the command line, initializes the device and the distributed
    backend, builds the tokenizer, datasets, GNMT model, data loaders,
    translator and trainer, optionally restores a checkpoint, and then runs
    the epoch loop while emitting MLPerf log events.

    Raises:
        ValueError: if ``train_global_batch_size`` is set but is not a
            multiple of ``train_batch_size * world_size``.
    """
    mlperf_log.ROOT_DIR_GNMT = os.path.dirname(os.path.abspath(__file__))
    mlperf_log.LOGGER.propagate = False

    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    distributed = utils.init_distributed(args.cuda)
    gnmt_print(key=mlperf_log.RUN_START, sync=True)
    args.rank = utils.get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging (one log file per rank)
    log_filename = f'log_rank_{utils.get_rank()}.log'
    utils.setup_logging(os.path.join(save_path, log_filename))

    if args.env:
        utils.log_env_info()

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # automatically set train_iter_size based on train_global_batch_size,
    # world_size and per-worker train_batch_size
    if args.train_global_batch_size is not None:
        global_bs = args.train_global_batch_size
        bs = args.train_batch_size
        world_size = utils.get_world_size()
        # Explicit check instead of `assert`: asserts are stripped under
        # `python -O`, which would silently truncate train_iter_size.
        if global_bs % (bs * world_size) != 0:
            raise ValueError(
                f'train_global_batch_size ({global_bs}) must be divisible by '
                f'train_batch_size * world_size ({bs} * {world_size})'
            )
        args.train_iter_size = global_bs // (bs * world_size)
        logging.info('Global batch size was set in the config, '
                     f'Setting train_iter_size to {args.train_iter_size}')

    worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.epochs,
                                                      device)
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer
    pad_vocab = utils.pad_vocabulary(args.math)
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME),
                          pad_vocab)

    # build datasets
    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_TRAINING, sync=False)
    gnmt_print(key=mlperf_log.TRAIN_HP_MAX_SEQ_LEN,
               value=args.max_length_train, sync=False)

    train_data = LazyParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_train,
        max_len=args.max_length_train,
        sort=False,
        max_size=args.max_size)

    gnmt_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
               value=len(train_data), sync=False)

    val_data = ParallelDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_VAL_FNAME),
        tgt_fname=os.path.join(args.dataset_dir, config.TGT_VAL_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_val,
        max_len=args.max_length_val,
        sort=True)

    gnmt_print(key=mlperf_log.PREPROC_TOKENIZE_EVAL, sync=False)

    test_data = TextDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_test,
        max_len=args.max_length_test,
        sort=True)

    gnmt_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
               value=len(test_data), sync=False)

    vocab_size = tokenizer.vocab_size
    gnmt_print(key=mlperf_log.PREPROC_VOCAB_SIZE,
               value=vocab_size, sync=False)

    # build GNMT model
    model_config = {
        'hidden_size': args.hidden_size,
        'num_layers': args.num_layers,
        'dropout': args.dropout,
        'batch_first': False,
        'share_embedding': args.share_embedding,
    }
    model = GNMT(vocab_size=vocab_size, **model_config)
    logging.info(model)

    batch_first = model.batch_first

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing)

    opt_config = {'optimizer': args.optimizer, 'lr': args.lr}
    opt_config.update(literal_eval(args.optimizer_extra))
    logging.info(f'Training optimizer config: {opt_config}')

    scheduler_config = {
        'warmup_steps': args.warmup_steps,
        'remain_steps': args.remain_steps,
        'decay_interval': args.decay_interval,
        'decay_steps': args.decay_steps,
        'decay_factor': args.decay_factor,
    }
    logging.info(f'Training LR schedule config: {scheduler_config}')

    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info(f'Number of parameters: {num_parameters}')

    batching_opt = {
        'shard_size': args.shard_size,
        'num_buckets': args.num_buckets,
    }

    # get data loaders
    train_loader = train_data.get_loader(
        batch_size=args.train_batch_size,
        seeds=shuffling_seeds,
        batch_first=batch_first,
        shuffle=True,
        batching=args.batching,
        batching_opt=batching_opt,
        num_workers=args.train_loader_workers)

    gnmt_print(key=mlperf_log.INPUT_BATCH_SIZE,
               value=args.train_batch_size * utils.get_world_size(),
               sync=False)
    gnmt_print(key=mlperf_log.INPUT_SIZE,
               value=train_loader.sampler.num_samples, sync=False)

    val_loader = val_data.get_loader(
        batch_size=args.val_batch_size,
        batch_first=batch_first,
        shuffle=False,
        num_workers=args.val_loader_workers)

    test_loader = test_data.get_loader(
        batch_size=args.test_batch_size,
        batch_first=batch_first,
        shuffle=False,
        pad=True,
        num_workers=args.test_loader_workers)

    gnmt_print(key=mlperf_log.EVAL_SIZE,
               value=len(test_loader.dataset), sync=False)

    translator = Translator(
        model=model,
        tokenizer=tokenizer,
        loader=test_loader,
        beam_size=args.beam_size,
        max_seq_len=args.max_length_test,
        len_norm_factor=args.len_norm_factor,
        len_norm_const=args.len_norm_const,
        cov_penalty_factor=args.cov_penalty_factor,
        cuda=args.cuda,
        print_freq=args.print_freq,
        dataset_dir=args.dataset_dir,
        target_bleu=args.target_bleu,
        save_path=args.save_path)

    # create trainer
    total_train_iters = len(train_loader) // args.train_iter_size * args.epochs
    save_info = {
        'model_config': model_config,
        'config': args,
        'tokenizer': tokenizer.get_state(),
    }
    trainer_options = dict(
        criterion=criterion,
        grad_clip=args.grad_clip,
        iter_size=args.train_iter_size,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info=save_info,
        opt_config=opt_config,
        scheduler_config=scheduler_config,
        train_iterations=total_train_iters,
        batch_first=batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed,
        intra_epoch_eval=args.intra_epoch_eval,
        translator=translator)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    # optionally resume from a checkpoint; a directory is resolved to the
    # 'model_best.pth' file inside it
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            # best-effort: training continues without a restored checkpoint
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    best_loss = float('inf')
    break_training = False
    test_bleu = None
    gnmt_print(key=mlperf_log.TRAIN_LOOP, sync=True)
    for epoch in range(args.start_epoch, args.epochs):
        logging.info(f'Starting epoch {epoch}')
        gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=epoch, sync=True)

        train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        # evaluate on validation set
        if args.eval:
            logging.info('Running validation on dev set')
            val_loss, val_perf = trainer.evaluate(val_loader)

            # track the best validation loss and save a checkpoint
            # (only rank 0 writes checkpoints)
            gnmt_print(key=mlperf_log.TRAIN_CHECKPOINT, sync=False)
            if args.rank == 0:
                is_best = val_loss < best_loss
                best_loss = min(val_loss, best_loss)
                trainer.save(save_all=args.save_all, is_best=is_best)

        # evaluate on test set; translator.run also reports whether training
        # should stop
        if args.eval:
            gnmt_print(key=mlperf_log.EVAL_START, value=epoch, sync=True)
            test_bleu, break_training = translator.run(calc_bleu=True,
                                                       epoch=epoch)
            gnmt_print(key=mlperf_log.EVAL_ACCURACY,
                       value={"epoch": epoch, "value": round(test_bleu, 2)},
                       sync=False)
            gnmt_print(key=mlperf_log.EVAL_TARGET,
                       value=args.target_bleu, sync=False)
            gnmt_print(key=mlperf_log.EVAL_STOP, sync=True)

        acc_log = []
        acc_log += [f'Summary: Epoch: {epoch}']
        acc_log += [f'Training Loss: {train_loss:.4f}']
        if args.eval:
            acc_log += [f'Validation Loss: {val_loss:.4f}']
            acc_log += [f'Test BLEU: {test_bleu:.2f}']

        perf_log = []
        perf_log += [f'Performance: Epoch: {epoch}']
        perf_log += [f'Training: {train_perf:.0f} Tok/s']
        if args.eval:
            perf_log += [f'Validation: {val_perf:.0f} Tok/s']

        if args.rank == 0:
            logging.info('\t'.join(acc_log))
            logging.info('\t'.join(perf_log))

        logging.info(f'Finished epoch {epoch}')
        if break_training:
            break

    gnmt_print(key=mlperf_log.RUN_STOP,
               value={"success": bool(break_training)}, sync=True)
    gnmt_print(key=mlperf_log.RUN_FINAL, sync=False)
def main():
    """
    Launches translation (inference).

    Inference is executed on a single GPU, implementation supports beam search
    with length normalization and coverage penalty.

    The model is restored from the checkpoint given by ``args.model`` and
    inference is run once for every combination of ``args.math``,
    ``args.batch_size`` and ``args.beam_size``.

    Raises:
        ValueError: if an entry of ``args.math`` is neither ``'fp32'`` nor
            ``'fp16'``.
    """
    args = parse_args()
    utils.set_device(args.cuda, args.local_rank)
    utils.init_distributed(args.cuda)
    setup_logging()

    if args.env:
        utils.log_env_info()

    logging.info(f'Run arguments: {args}')

    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    # NOTE(review): the dict form of map_location only remaps the 'cuda:0'
    # tag; storages saved from other devices presumably keep their original
    # location - confirm this is intended.
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # build GNMT model
    tokenizer = Tokenizer()
    tokenizer.set_state(checkpoint['tokenizer'])
    vocab_size = tokenizer.vocab_size
    model_config = checkpoint['model_config']
    model_config['batch_first'] = args.batch_first
    model = GNMT(vocab_size=vocab_size, **model_config)
    model.load_state_dict(checkpoint['state_dict'])

    # supported precision modes and the tensor type the model is cast to
    dtype_by_math = {
        'fp32': torch.FloatTensor,
        'fp16': torch.HalfTensor,
    }

    for (math, batch_size, beam_size) in product(args.math, args.batch_size,
                                                 args.beam_size):
        logging.info(f'math: {math}, batch size: {batch_size}, '
                     f'beam size: {beam_size}')

        # An unknown mode used to hit an unbound `dtype` on the first
        # iteration, or silently reuse the previous iteration's dtype.
        try:
            dtype = dtype_by_math[math]
        except KeyError:
            raise ValueError(
                f'Unsupported math mode: {math!r}, '
                f'expected one of {sorted(dtype_by_math)}'
            ) from None
        model.type(dtype)

        if args.cuda:
            model = model.cuda()
        model.eval()

        # construct the dataset
        test_data = TextDataset(src_fname=args.input,
                                tokenizer=tokenizer,
                                sort=args.sort)

        # build the data loader
        test_loader = test_data.get_loader(batch_size=batch_size,
                                           batch_first=args.batch_first,
                                           shuffle=False,
                                           pad=True,
                                           num_workers=0)

        # build the translator object
        translator = Translator(model=model,
                                tokenizer=tokenizer,
                                loader=test_loader,
                                beam_size=beam_size,
                                max_seq_len=args.max_seq_len,
                                len_norm_factor=args.len_norm_factor,
                                len_norm_const=args.len_norm_const,
                                cov_penalty_factor=args.cov_penalty_factor,
                                cuda=args.cuda,
                                print_freq=args.print_freq,
                                dataset_dir=args.dataset_dir)

        # execute the inference
        translator.run(calc_bleu=args.bleu,
                       eval_path=args.output,
                       reference_path=args.reference,
                       summary=True)
def main():
    """
    Launches data-parallel multi-gpu training.

    Initializes MLPerf compliance logging, the device and the distributed
    backend, builds the tokenizer, GNMT model and trainer (with preallocation)
    before the MLPerf RUN_START event, then builds the datasets, data loaders,
    translator and LR scheduler, optionally restores a checkpoint and runs the
    epoch loop.

    Raises:
        ValueError: if ``train_global_batch_size`` is set but is not a
            multiple of ``train_batch_size * world_size``.
    """
    mlperf_compliance.mlperf_log.LOGGER.propagate = False

    mlperf_compliance.mlperf_log.setdefault(
        root_dir=os.path.dirname(os.path.abspath(__file__)),
        benchmark=mlperf_compliance.constants.GNMT,
        stack_offset=1,
        extra_print=False
    )

    mlperf_print(key=mlperf_compliance.constants.INIT_START,
                 log_all_ranks=True)

    args = parse_args()
    device = utils.set_device(args.cuda, args.local_rank)
    distributed = utils.init_distributed(args.cuda)

    # preinit and warmup streams/ groups for apex DDP communicators
    allreduce_communicators = None
    if distributed and args.apex_num_allreduce_streams > 1:
        bucket_pgs = [torch.distributed.new_group()
                      for _ in range(args.apex_num_allreduce_streams)]
        bucket_streams = [torch.cuda.Stream()
                          for _ in range(args.apex_num_allreduce_streams)]
        # issue one all_reduce per (group, stream) pair as a warmup
        for pg, stream in zip(bucket_pgs, bucket_streams):
            with torch.cuda.stream(stream):
                torch.distributed.all_reduce(torch.cuda.FloatTensor(1),
                                             group=pg)
        allreduce_communicators = (bucket_pgs, bucket_streams)

    args.rank = utils.get_rank()

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # create directory for results
    save_path = os.path.join(args.results_dir, args.save)
    args.save_path = save_path
    os.makedirs(save_path, exist_ok=True)

    # setup logging (one log file per rank)
    log_filename = f'log_rank_{utils.get_rank()}.log'
    utils.setup_logging(args.log_all_ranks,
                        os.path.join(save_path, log_filename))

    if args.env:
        utils.log_env_info()

    logging.info(f'Saving results to: {save_path}')
    logging.info(f'Run arguments: {args}')

    # automatically set train_iter_size based on train_global_batch_size,
    # world_size and per-worker train_batch_size
    if args.train_global_batch_size is not None:
        global_bs = args.train_global_batch_size
        bs = args.train_batch_size
        world_size = utils.get_world_size()
        # Explicit check instead of `assert`: asserts are stripped under
        # `python -O`, which would silently truncate train_iter_size.
        if global_bs % (bs * world_size) != 0:
            raise ValueError(
                f'train_global_batch_size ({global_bs}) must be divisible by '
                f'train_batch_size * world_size ({bs} * {world_size})'
            )
        args.train_iter_size = global_bs // (bs * world_size)
        logging.info('Global batch size was set in the config, '
                     f'Setting train_iter_size to {args.train_iter_size}')

    # setup L2 promotion
    if args.cuda:
        utils.l2_promote()

    worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.epochs,
                                                      device)
    worker_seed = worker_seeds[args.rank]
    logging.info(f'Worker {args.rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # build tokenizer
    # https://github.com/mlperf/policies/issues/201
    pad_vocab = utils.pad_vocabulary(args.math)
    tokenizer = Tokenizer(os.path.join(args.dataset_dir, config.VOCAB_FNAME),
                          pad_vocab)

    vocab_size = tokenizer.vocab_size

    # build GNMT model
    model_config = {
        'hidden_size': args.hidden_size,
        'num_layers': args.num_layers,
        'dropout': args.dropout,
        'batch_first': False,
        'share_embedding': args.share_embedding,
        'fusion': args.fused_attention,
    }
    model = GNMT(vocab_size=vocab_size, **model_config)
    logging.info(model)

    # define loss function (criterion) and optimizer
    criterion = build_criterion(vocab_size, config.PAD, args.smoothing,
                                args.fused_xentropy)

    opt_config = {'optimizer': args.optimizer, 'lr': args.lr}
    opt_config.update(literal_eval(args.optimizer_extra))
    logging.info(f'Training optimizer config: {opt_config}')

    num_parameters = sum(p.nelement() for p in model.parameters())
    logging.info(f'Number of parameters: {num_parameters}')

    # create trainer
    save_info = {
        'model_config': model_config,
        'config': args,
        'tokenizer': tokenizer.get_state(),
    }
    loss_scaling = {
        'init_scale': args.init_scale,
        'upscale_interval': args.upscale_interval,
    }
    trainer_options = dict(
        criterion=criterion,
        grad_clip=args.grad_clip,
        iter_size=args.train_iter_size,
        save_path=save_path,
        save_freq=args.save_freq,
        save_info=save_info,
        opt_config=opt_config,
        batch_first=model.batch_first,
        keep_checkpoints=args.keep_checkpoints,
        math=args.math,
        loss_scaling=loss_scaling,
        print_freq=args.print_freq,
        cuda=args.cuda,
        distributed=distributed,
        distributed_overlap_allreduce=args.enable_apex_allreduce_overlap,
        distributed_overlap_num_allreduce_streams=args.apex_num_allreduce_streams,
        distributed_overlap_allreduce_messagesize=args.apex_message_size,
        distributed_overlap_allreduce_communicators=allreduce_communicators,
        intra_epoch_eval=args.intra_epoch_eval,
        prealloc_mode=args.prealloc_mode)

    trainer_options['model'] = model
    trainer = trainers.Seq2SeqTrainer(**trainer_options)

    trainer.preallocate(args.train_batch_size, args.max_length_train,
                        training=True)

    mlperf_print(key=mlperf_compliance.constants.INIT_STOP,
                 sync=True)
    mlperf_print(key=mlperf_compliance.constants.RUN_START,
                 sync=True)
    utils.barrier()

    mlperf_print(key=mlperf_compliance.constants.MAX_SEQUENCE_LENGTH,
                 value=args.max_length_train,
                 metadata={'method': 'discard'})

    # build the training dataset, either from preprocessed binary data or
    # from the raw parallel text files
    if args.use_preproc_data:
        train_data = PreprocessedDataset(
            min_len=args.min_length_train,
            max_len=args.max_length_train,
            vocab_size=tokenizer.vocab_size,
        )
        train_data.read_data(
            os.path.join(args.preproc_data_dir, 'training.bin'),
            tokenizer.vocab_size,
        )
        train_data.prepare()
    else:
        train_data = LazyParallelDataset(
            src_fname=os.path.join(args.dataset_dir, config.SRC_TRAIN_FNAME),
            tgt_fname=os.path.join(args.dataset_dir, config.TGT_TRAIN_FNAME),
            tokenizer=tokenizer,
            min_len=args.min_length_train,
            max_len=args.max_length_train,
            sort=False,
            max_size=args.max_size,
        )

    test_data = TextDataset(
        src_fname=os.path.join(args.dataset_dir, config.SRC_TEST_FNAME),
        tokenizer=tokenizer,
        min_len=args.min_length_test,
        max_len=args.max_length_test,
        sort=True)

    batching_opt = {
        'shard_size': args.shard_size,
        'num_buckets': args.num_buckets,
    }

    # get data loaders
    train_loader = train_data.get_loader(
        batch_size=args.train_batch_size,
        seeds=shuffling_seeds,
        batch_first=model.batch_first,
        shuffle=True,
        batching=args.batching,
        batching_opt=batching_opt,
        num_workers=args.train_loader_workers)

    mlperf_print(key=mlperf_compliance.constants.GLOBAL_BATCH_SIZE,
                 value=args.train_batch_size * utils.get_world_size(),
                 sync=False)

    test_loader = test_data.get_loader(
        batch_size=args.test_batch_size,
        batch_first=model.batch_first,
        shuffle=False,
        num_workers=args.test_loader_workers)

    translator = Translator(
        model=model,
        tokenizer=tokenizer,
        loader=test_loader,
        beam_size=args.beam_size,
        max_seq_len=args.max_length_test,
        len_norm_factor=args.len_norm_factor,
        len_norm_const=args.len_norm_const,
        cov_penalty_factor=args.cov_penalty_factor,
        cuda=args.cuda,
        print_freq=args.print_freq,
        dataset_dir=args.dataset_dir,
        target_bleu=args.target_bleu,
        save_path=args.save_path)

    # the scheduler and translator are attached to the trainer only now,
    # after the data loaders exist (the iteration count needs len(train_loader))
    total_train_iters = len(train_loader) // args.train_iter_size * args.epochs

    scheduler_config = {
        'warmup_steps': args.warmup_steps,
        'remain_steps': args.remain_steps,
        'decay_interval': args.decay_interval,
        'decay_steps': args.decay_steps,
        'decay_factor': args.decay_factor,
    }
    logging.info(f'Training LR schedule config: {scheduler_config}')
    scheduler = WarmupMultiStepLR(trainer.optimizer, total_train_iters,
                                  **scheduler_config)
    trainer.scheduler = scheduler
    trainer.translator = translator

    # optionally resume from a checkpoint; a directory is resolved to the
    # 'model_best.pth' file inside it
    if args.resume:
        checkpoint_file = args.resume
        if os.path.isdir(checkpoint_file):
            checkpoint_file = os.path.join(
                checkpoint_file, 'model_best.pth')
        if os.path.isfile(checkpoint_file):
            trainer.load(checkpoint_file)
        else:
            # best-effort: training continues without a restored checkpoint
            logging.error(f'No checkpoint found at {args.resume}')

    # training loop
    break_training = False
    test_bleu = None
    for epoch in range(args.start_epoch, args.epochs):
        # MLPerf events use 1-based epoch numbers
        mlperf_print(key=mlperf_compliance.constants.BLOCK_START,
                     metadata={'first_epoch_num': epoch + 1,
                               'epoch_count': 1},
                     sync=True)
        mlperf_print(key=mlperf_compliance.constants.EPOCH_START,
                     metadata={'epoch_num': epoch + 1},
                     sync=True)

        logging.info(f'Starting epoch {epoch}')
        train_loader.sampler.set_epoch(epoch)

        trainer.epoch = epoch
        train_loss, train_perf = trainer.optimize(train_loader)

        mlperf_print(key=mlperf_compliance.constants.EPOCH_STOP,
                     metadata={'epoch_num': epoch + 1},
                     sync=True)

        # evaluate on test set; translator.run also reports whether training
        # should stop
        if args.eval:
            mlperf_print(key=mlperf_compliance.constants.EVAL_START,
                         metadata={'epoch_num': epoch + 1},
                         sync=True)
            test_bleu, break_training = translator.run(calc_bleu=True,
                                                       epoch=epoch)
            mlperf_print(key=mlperf_compliance.constants.EVAL_ACCURACY,
                         value=test_bleu,
                         metadata={'epoch_num': epoch + 1},
                         sync=False)
            mlperf_print(key=mlperf_compliance.constants.EVAL_STOP,
                         metadata={'epoch_num': epoch + 1},
                         sync=True)

        acc_log = []
        acc_log += [f'Summary: Epoch: {epoch}']
        acc_log += [f'Training Loss: {train_loss:.4f}']
        if args.eval:
            acc_log += [f'Test BLEU: {test_bleu:.2f}']

        perf_log = []
        perf_log += [f'Performance: Epoch: {epoch}']
        perf_log += [f'Training: {train_perf:.0f} Tok/s']

        if args.rank == 0:
            logging.info('\t'.join(acc_log))
            logging.info('\t'.join(perf_log))

        logging.info(f'Finished epoch {epoch}')
        mlperf_print(key=mlperf_compliance.constants.BLOCK_STOP,
                     metadata={'first_epoch_num': epoch + 1},
                     sync=True)

        if break_training:
            break

    if args.use_preproc_data:
        train_data.finalize()

    # the run counts as successful only if evaluation requested the stop
    status = 'success' if break_training else 'aborted'
    mlperf_print(key=mlperf_compliance.constants.RUN_STOP,
                 metadata={'status': status},
                 sync=True)
def main():
    """
    Launches translation (inference).

    Inference is executed on a single GPU, implementation supports beam search
    with length normalization and coverage penalty.

    Raises:
        RuntimeError: if ``args.math`` is ``'fp16'`` and cuda is disabled.
        ValueError: if ``args.math`` is neither ``'fp32'`` nor ``'fp16'``.
    """
    args = parse_args()

    # log everything (DEBUG and above) to 'log.log', overwriting the file
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(levelname)s - %(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S",
                        filename='log.log',
                        filemode='w')
    # mirror INFO and above to the console with a bare message format
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    logging.info(args)

    if args.cuda:
        torch.cuda.set_device(0)
    if not args.cuda and torch.cuda.is_available():
        warnings.warn('cuda is available but not enabled')
    if args.math == 'fp16' and not args.cuda:
        raise RuntimeError('fp16 requires cuda')

    # supported precision modes and the tensor type the model is cast to;
    # validated up front because an unknown mode used to surface only later
    # as an unbound `dtype`
    dtype_by_math = {
        'fp32': torch.FloatTensor,
        'fp16': torch.HalfTensor,
    }
    try:
        dtype = dtype_by_math[args.math]
    except KeyError:
        raise ValueError(
            f'Unsupported math mode: {args.math!r}, '
            f'expected one of {sorted(dtype_by_math)}'
        ) from None

    if not args.cudnn:
        torch.backends.cudnn.enabled = False

    # load checkpoint and deserialize to CPU (to save GPU memory)
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    # NOTE(review): the dict form of map_location only remaps the 'cuda:0'
    # tag; storages saved from other devices presumably keep their original
    # location - confirm this is intended.
    checkpoint = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    # build GNMT model
    tokenizer = checkpoint['tokenizer']
    vocab_size = tokenizer.vocab_size
    model_config = dict(vocab_size=vocab_size,
                        math=checkpoint['config'].math,
                        **literal_eval(checkpoint['config'].model_config))
    model_config['batch_first'] = args.batch_first
    model = GNMT(**model_config)

    # strip the distributed wrapper from the state dict when present
    state_dict = checkpoint['state_dict']
    if checkpoint_from_distributed(state_dict):
        state_dict = unwrap_distributed(state_dict)

    model.load_state_dict(state_dict)

    model.type(dtype)
    if args.cuda:
        model = model.cuda()
    model.eval()

    # construct the dataset
    test_data = TextDataset(src_fname=args.input,
                            tokenizer=tokenizer,
                            sort=False)

    # build the data loader
    test_loader = test_data.get_loader(batch_size=args.batch_size,
                                       batch_first=args.batch_first,
                                       shuffle=False,
                                       num_workers=0,
                                       drop_last=False)

    # build the translator object
    translator = Translator(model=model,
                            tokenizer=tokenizer,
                            loader=test_loader,
                            beam_size=args.beam_size,
                            max_seq_len=args.max_seq_len,
                            len_norm_factor=args.len_norm_factor,
                            len_norm_const=args.len_norm_const,
                            cov_penalty_factor=args.cov_penalty_factor,
                            cuda=args.cuda,
                            print_freq=args.print_freq,
                            dataset_dir=args.dataset_dir)

    # execute the inference
    translator.run(calc_bleu=args.bleu,
                   eval_path=args.output,
                   reference_path=args.reference,
                   summary=True)