def validate_and_save(
    cfg: DictConfig,
    trainer: Trainer,
    task: tasks.FairseqTask,
    epoch_itr,
    valid_subsets: List[str],
    end_of_epoch: bool,
) -> Tuple[List[Optional[float]], bool]:
    num_updates = trainer.get_num_updates()
    max_update = cfg.optimization.max_update or math.inf
    do_save = (
        (end_of_epoch and epoch_itr.epoch % cfg.checkpoint.save_interval == 0)
        or num_updates >= max_update
        or (
            cfg.checkpoint.save_interval_updates > 0
            and num_updates > 0
            and num_updates % cfg.checkpoint.save_interval_updates == 0
            and num_updates >= cfg.dataset.validate_after_updates
        )
    )
    do_validate = (
        (not end_of_epoch and do_save)  # validate during mid-epoch saves
        or (end_of_epoch and epoch_itr.epoch % cfg.dataset.validate_interval == 0)
        or num_updates >= max_update
        or (
            cfg.dataset.validate_interval_updates > 0
            and num_updates > 0
            and num_updates % cfg.dataset.validate_interval_updates == 0
        )
    ) and not cfg.dataset.disable_validation

    # Validate
    valid_losses = [None]
    if do_validate:
        valid_losses = validate(cfg, trainer, task, epoch_itr, valid_subsets)

    # Stopping conditions
    should_stop = (
        should_stop_early(cfg, valid_losses[0])
        or num_updates >= max_update
        or (
            cfg.optimization.stop_time_hours > 0
            and trainer.cumulative_training_time() / (60 * 60)
            > cfg.optimization.stop_time_hours
        )
    )

    # Save checkpoint
    if do_save or should_stop:
        logger.info("begin save checkpoint")
        checkpoint_utils.save_checkpoint(
            cfg.checkpoint, trainer, epoch_itr, valid_losses[0]
        )

    return valid_losses, should_stop
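# The save/validate gating above reduces to a pair of predicates over the update
# counter. Below is a minimal, self-contained sketch of the save predicate only,
# using plain ints instead of the Hydra cfg; the parameter names mirror the
# config fields referenced above but nothing here touches fairseq itself.
def _should_save(num_updates, max_update, end_of_epoch, epoch,
                 save_interval=1, save_interval_updates=0, validate_after_updates=0):
    return (
        (end_of_epoch and epoch % save_interval == 0)
        or num_updates >= max_update
        or (
            save_interval_updates > 0
            and num_updates > 0
            and num_updates % save_interval_updates == 0
            and num_updates >= validate_after_updates
        )
    )

# e.g. with save_interval_updates=1000, a mid-epoch save fires exactly at update 3000:
assert _should_save(3000, float("inf"), end_of_epoch=False, epoch=2, save_interval_updates=1000)
assert not _should_save(3001, float("inf"), end_of_epoch=False, epoch=2, save_interval_updates=1000)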
def get_valid_stats(
    cfg: DictConfig, trainer: Trainer, stats: Dict[str, Any]
) -> Dict[str, Any]:
    stats["num_updates"] = trainer.get_num_updates()
    if hasattr(checkpoint_utils.save_checkpoint, "best"):
        key = "best_{0}".format(cfg.checkpoint.best_checkpoint_metric)
        best_function = max if cfg.checkpoint.maximize_best_checkpoint_metric else min
        stats[key] = best_function(
            checkpoint_utils.save_checkpoint.best,
            stats[cfg.checkpoint.best_checkpoint_metric],
        )
    return stats
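# get_valid_stats() keeps a running "best_<metric>" entry by folding the current
# value into the best seen so far with max() or min(), depending on whether the
# metric is maximized (e.g. BLEU) or minimized (e.g. loss). A standalone sketch
# of that reduction, with an explicit first-validation case added for
# illustration (the fairseq version instead checks hasattr(..., "best")):
def _update_best(prev_best, current, maximize=False):
    best_fn = max if maximize else min
    return current if prev_best is None else best_fn(prev_best, current)

assert _update_best(None, 4.2) == 4.2                      # first validation
assert _update_best(4.2, 3.9) == 3.9                       # loss: lower is better
assert _update_best(27.1, 26.5, maximize=True) == 27.1     # BLEU: higher is better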
def main(args, init_distributed=False): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) # Print args print(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=False, epoch=0) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print(model) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Build trainer trainer = Trainer(args, task, model, criterion) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf valid_losses = [None] valid_subsets = args.valid_subset.split(',') valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
def main(args): if args.max_tokens is None: args.max_tokens = 6000 print(args) if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits task.load_dataset('test') # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {}'.format( sum(p.numel() for p in model.parameters()))) # Make a dummy batch to (i) warm the caching allocator and (ii) as a # placeholder DistributedDataParallel when there's an uneven number of # batches per worker. max_positions = utils.resolve_max_positions(task.max_positions(), model.max_positions()) # Build trainer trainer = Trainer(args, task, model, criterion, None) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences)) # Initialize dataloader epoch_itr = task.get_batch_iterator( dataset=task.datasets['test'], max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, num_workers=args.num_workers) # Load the latest checkpoint if one is available assert load_checkpoint(args, trainer, epoch_itr), "There should be a checkpoint." # Train until the learning rate gets too small valid_losses = validate(args, trainer, task, epoch_itr)
def validate(
    cfg: DictConfig,
    trainer: Trainer,
    task: tasks.FairseqTask,
    epoch_itr,
    subsets: List[str],
) -> List[Optional[float]]:
    """Evaluate the model on the validation set(s) and return the losses."""

    if cfg.dataset.fixed_validation_seed is not None:
        # set fixed seed for every validation
        utils.set_torch_seed(cfg.dataset.fixed_validation_seed)

    trainer.begin_valid_epoch(epoch_itr.epoch)
    valid_losses = []
    for subset in subsets:
        logger.info('begin validation on "{}" subset'.format(subset))

        # Initialize data iterator
        itr = trainer.get_valid_iterator(subset).next_epoch_itr(
            shuffle=False, set_dataset_epoch=False  # use a fixed valid set
        )
        if cfg.common.tpu:
            itr = utils.tpu_data_loader(itr)
        progress = progress_bar.progress_bar(
            itr,
            log_format=cfg.common.log_format,
            log_interval=cfg.common.log_interval,
            epoch=epoch_itr.epoch,
            prefix=f"valid on '{subset}' subset",
            tensorboard_logdir=(
                cfg.common.tensorboard_logdir
                if distributed_utils.is_master(cfg.distributed_training)
                else None
            ),
            default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"),
            wandb_project=(
                cfg.common.wandb_project
                if distributed_utils.is_master(cfg.distributed_training)
                else None
            ),
            wandb_run_name=os.environ.get(
                "WANDB_NAME", os.path.basename(cfg.checkpoint.save_dir)
            ),
        )

        # create a new root metrics aggregator so validation metrics
        # don't pollute other aggregators (e.g., train meters)
        with metrics.aggregate(new_root=True) as agg:
            for i, sample in enumerate(progress):
                if cfg.dataset.max_valid_steps is not None and i > cfg.dataset.max_valid_steps:
                    break
                trainer.valid_step(sample)

        # log validation stats
        stats = get_valid_stats(cfg, trainer, agg.get_smoothed_values())
        progress.print(stats, tag=subset, step=trainer.get_num_updates())

        valid_losses.append(stats[cfg.checkpoint.best_checkpoint_metric])
    return valid_losses
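# The max_valid_steps guard above breaks out of the loop once `i` exceeds the
# cap; an equivalent way to cap a validation pass is to slice the iterator up
# front. A sketch over a generic iterable (not a fairseq progress bar); note
# that the original `i > max_valid_steps` check lets max_valid_steps + 1
# samples through before breaking, while islice stops exactly at the cap.
import itertools

def _capped(iterable, max_steps=None):
    return iterable if max_steps is None else itertools.islice(iterable, max_steps)

assert list(_capped(range(100), max_steps=3)) == [0, 1, 2]
assert list(_capped(range(5))) == [0, 1, 2, 3, 4]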
def prepare_task(args, xla_device):
    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=True, epoch=0)

    # Build models and criteria to print some metadata
    torch.manual_seed(args.seed)
    model, criterion = task.build_model(args), task.build_criterion(args)
    xm.master_print(model)
    xm.master_print('| model {}, criterion {}'.format(
        args.arch, criterion.__class__.__name__))
    xm.master_print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad)))
    model = model.to(xla_device)
    trainer = Trainer(args, task, model, criterion, xla_device=xla_device)
    lr = trainer.get_lr()

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator.
    # We overwrite distributed args here to shard data using torch_xla's
    # distributed training.
    trainer.args.distributed_rank = xm.get_ordinal()
    trainer.args.distributed_world_size = xm.xrt_world_size()
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)
    trainer.args.distributed_rank = 0
    trainer.args.distributed_world_size = 1
    trainer.meters_to_device(xla_device)
    valid_subsets = args.valid_subset.split(',')
    ordinal = xm.get_ordinal(defval=-1)
    device_str = (
        str(xla_device) if ordinal < 0
        else '{}/{}'.format(xla_device, ordinal)
    )
    return task, trainer, model, epoch_itr, lr, valid_subsets, device_str
def validate(
    cfg: DictConfig,
    trainer: Trainer,
    task: tasks.FairseqTask,
    cur_step,
    epoch_itr,
    subsets: List[str],
) -> List[Optional[float]]:
    """Evaluate the model on the validation set(s) and return the losses."""

    if cfg.dataset.fixed_validation_seed is not None:
        # set fixed seed for every validation
        utils.set_torch_seed(cfg.dataset.fixed_validation_seed)

    trainer.begin_valid_epoch(epoch_itr.epoch)
    valid_losses = []
    for subset in subsets:
        logger.info('begin validation on "{}" subset'.format(subset))

        # Initialize data iterator
        itr = trainer.get_valid_iterator(subset).next_epoch_itr(shuffle=False)
        if cfg.common.tpu:
            itr = utils.tpu_data_loader(itr)
        progress = progress_bar.progress_bar(
            itr,
            log_format=cfg.common.log_format,
            log_interval=cfg.common.log_interval,
            epoch=epoch_itr.epoch,
            prefix=f"valid on '{subset}' subset",
            tensorboard_logdir=(
                cfg.common.tensorboard_logdir
                if distributed_utils.is_master(cfg.distributed_training)
                else None
            ),
            default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"),
            wandb_project=(
                cfg.common.wandb_project
                if distributed_utils.is_master(cfg.distributed_training)
                else None
            ),
            wandb_run_name=os.environ.get(
                "WANDB_NAME", os.path.basename(cfg.checkpoint.save_dir)
            ),
        )

        # create a new root metrics aggregator so validation metrics
        # don't pollute other aggregators (e.g., train meters).
        # Values must be registered via metrics.log_scalar("key", val) for them
        # to appear in the aggregator's smoothed values below.
        with metrics.aggregate(new_root=True) as agg:
            for sample in progress:
                trainer.valid_step(sample, cur_step=cur_step)

        # log validation stats: stats starts from the OrderedDict returned by
        # agg.get_smoothed_values(); get_valid_stats() adds the remaining entries
        stats = get_valid_stats(cfg, trainer, agg.get_smoothed_values())
        progress.print(stats, tag=subset, step=trainer.get_num_updates())

        valid_losses.append(stats[cfg.checkpoint.best_checkpoint_metric])
    return valid_losses
def get_ready(args, ac='a'): train_args = deepcopy(args) if ac == 'a': train_args.restore_file = args.actor_restore_file train_args.task = args.actor_task train_args.criterion = args.actor_criterion train_args.save_interval_updates = args.actor_save_update elif ac == 'c': train_args.restore_file = args.critic_restore_file train_args.task = args.critic_task train_args.criterion = args.critic_criterion train_args.save_interval_updates = args.critic_save_update task = tasks.setup_task(train_args) model = task.build_model(train_args) criterion = task.build_criterion(train_args) logger.info(model) logger.info('model {}, criterion {}'.format(train_args.arch, criterion.__class__.__name__)) logger.info('num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Build trainer if train_args.model_parallel_size == 1: trainer = Trainer(train_args, task, model, criterion) else: trainer = MegatronTrainer(train_args, task, model, criterion) logger.info('training on {} GPUs'.format( train_args.distributed_world_size)) logger.info( 'max tokens per GPU = {} and max sentences per GPU = {}'.format( train_args.max_tokens, train_args.max_sentences, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint( train_args, trainer) return train_args, task, model, criterion, trainer, epoch_itr, extra_state
def prepare_task(args, devices): # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=True, epoch=0) # Build models and criteria to print some metadata model_parallel = dp.DataParallel( lambda: task.build_model(args), device_ids=devices) model, criterion = task.build_model(args), task.build_criterion(args) print(model) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) del model, criterion # Build trainers trainers = { device: Trainer(args, task, model, task.build_criterion(args), xla=True) for device, model in zip(model_parallel.devices, model_parallel.models) } trainer = trainers[devices[0]] lr = trainer.get_lr() # TODO(taylanbil): for now, this next line is only creating the iterator. # validate its behavior with the case where a checkpoint actually exists. # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) valid_subsets = args.valid_subset.split(',') return task, trainers, model_parallel, epoch_itr, lr, valid_subsets
def main(args): if args.max_tokens is None: args.max_tokens = 6000 print(args) if not torch.cuda.is_available(): raise NotImplementedError('Training on CPU is not supported') torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits load_dataset_splits(task, ['train', 'valid']) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {}'.format( sum(p.numel() for p in model.parameters()))) # Make a dummy batch to (i) warm the caching allocator and (ii) as a # placeholder DistributedDataParallel when there's an uneven number of # batches per worker. max_positions = utils.resolve_max_positions( task.max_positions(), model.max_positions(), ) dummy_batch = task.dataset('train').get_dummy_batch( args.max_tokens, max_positions) # Build trainer trainer = Trainer(args, task, model, criterion, dummy_batch) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) ## "Prune" heads (actually mask but shh...) #if len(args.transformer_mask_heads) > 0: # # Determine which head to prune # to_prune = parse_head_pruning_descriptors( # args.transformer_mask_heads, # reverse_descriptors=args.transformer_mask_all_but_one_head, # n_heads=model.encoder.layers[0].self_attn.num_heads # ) # print(to_prune) # # Apply pruning # mask_heads(model, to_prune, args.transformer_mask_rescale) # Save initial model initial = os.path.join(args.save_dir, "checkpoint_initial.pt") trainer.save_checkpoint(initial, {}) # Initialize dataloader epoch_itr = task.get_batch_iterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, ) # Load the latest checkpoint if one is available if not load_checkpoint(args, trainer, epoch_itr): trainer.dummy_train_step([dummy_batch]) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_subsets = args.valid_subset.split(',') while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates( ) < max_update: # train for one epoch train(args, trainer, task, epoch_itr) if epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) #if epoch_itr.epoch % args.save_interval == 0: # save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) # ***********************************Below Changed****************************** # save checkpoint #if epoch_itr.epoch % args.save_interval == 0: save_interval = 5 # prune and save checkpoint for every five epoch if epoch_itr.epoch % save_interval == 0: #****** changed # do prunning before saving prune2(args, task, model, trainer, epoch_itr) #****** changed2 # save checkpoint save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) prune2(args, task, model, trainer, epoch_itr ) #****** changed2 do 
last pruning on the last checkpoint saved # ***********************************Above Changed****************************** train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
def prune(args): if args.max_tokens is None: args.max_tokens = 6000 print(args) if not torch.cuda.is_available(): raise NotImplementedError('Training on CPU is not supported') torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits load_dataset_splits(task, ['train', "valid"]) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print('| model {}, criterion {},'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {}'.format( sum(p.numel() for p in model.parameters()))) # Make a dummy batch to (i) warm the caching allocator and (ii) as a # placeholder DistributedDataParallel when there's an uneven number of # batches per worker. max_positions = utils.resolve_max_positions( task.max_positions(), model.max_positions(), ) dummy_batch = task.dataset('train').get_dummy_batch( args.max_tokens, max_positions) # Build trainer trainer = Trainer(args, task, model, criterion, dummy_batch) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) print('| Optimizer {}'.format(trainer.optimizer.__class__.__name__)) # Initialize dataloader epoch_itr = task.get_batch_iterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, ) # Load the latest checkpoint if one is available if not load_checkpoint(args, trainer, epoch_itr): trainer.dummy_train_step([dummy_batch]) # Train until the learning rate gets too small prune_meter = StopwatchMeter() prune_meter.start() # Estimate head importance scores head_importance, head_stats = estimate_head_importance( args, trainer, task, epoch_itr) prune_meter.stop() print('| done estimating head importance in {:.1f} seconds'.format( prune_meter.sum)) torch.save(head_stats, f"{os.path.dirname(args.restore_file)}/heads_stats.bin") # Print print("Head importances") print("Encoder self attention") for layer in range(head_importance["encoder_self"].size(0)): print("\t".join(f"{x:.5f}" for x in head_importance["encoder_self"][layer])) print("Encoder decoder attention") for layer in range(head_importance["encoder_decoder"].size(0)): print("\t".join(f"{x:.5f}" for x in head_importance["encoder_decoder"][layer])) print("Decoder self attention") for layer in range(head_importance["decoder_self"].size(0)): print("\t".join(f"{x:.5f}" for x in head_importance["decoder_self"][layer])) # Print sorted pruning profile encoder_self_profile = get_profile(head_importance["encoder_self"], prefix="E") encoder_decoder_profile = get_profile(head_importance["encoder_decoder"], prefix="A") decoder_self_profile = get_profile(head_importance["decoder_self"], prefix="D") # Join all all_profiles = {} if not (args.decoder_self_only or args.encoder_decoder_only): all_profiles.update(encoder_self_profile) if not (args.encoder_self_only or args.decoder_self_only): all_profiles.update(encoder_decoder_profile) if not (args.encoder_self_only or args.encoder_decoder_only): all_profiles.update(decoder_self_profile) sorted_profiles = sorted(all_profiles.items(), key=lambda x: x[1], reverse=args.one_minus) print("Heads sorted by importance:") print(" ".join(p for p, _ in 
sorted_profiles)) print("Sorted head importance scores:") print(" ".join(f"{v.data:.5f}" for _, v in sorted_profiles)) if args.only_importance: return tot_n_heads = len(sorted_profiles) # Eval pruning if args.one_head: kept_layers = set() to_prune_profile = [] for p, _ in reversed(sorted_profiles): layer_name = ":".join(p.split(":")[:-1]) if layer_name not in kept_layers: kept_layers.add(layer_name) continue else: to_prune_profile.insert(0, p) to_prune = parse_head_pruning_descriptors(to_prune_profile, reverse_descriptors=False) print(f"Evaluating following profile: \t{' '.join(to_prune_profile)}") # Apply pruning mask_heads(model, to_prune, args.transformer_mask_rescale) bleu = eval_bleu_score( model, task, task.dataset(args.valid_subset), beam=args.beam, replace_unk=args.replace_unk, lenpen=args.lenpen, buffer_size=100, use_cuda=torch.cuda.is_available() and not args.cpu, remove_bpe=args.remove_bpe, max_sentences=args.max_sentences, max_tokens=args.max_tokens, stop_early=not args.no_early_stop, normalize_scores=not args.unnormalized, min_len=args.min_len, ) print(f"BLEU score: \t{bleu.score:.2f}") sys.stdout.flush() return for i in range(0, 10): n_to_prune = int(ceil(tot_n_heads * i / 10)) to_prune_profile = [p for p, _ in sorted_profiles[:n_to_prune]] to_prune = parse_head_pruning_descriptors(to_prune_profile, reverse_descriptors=False) print(f"Evaluating following profile: \t{' '.join(to_prune_profile)}") # Apply pruning mask_heads(model, to_prune, args.transformer_mask_rescale) bleu = eval_bleu_score( model, task, task.dataset(args.valid_subset), beam=args.beam, replace_unk=args.replace_unk, lenpen=args.lenpen, buffer_size=100, use_cuda=torch.cuda.is_available() and not args.cpu, remove_bpe=args.remove_bpe, max_sentences=args.max_sentences, max_tokens=args.max_tokens, stop_early=not args.no_early_stop, normalize_scores=not args.unnormalized, min_len=args.min_len, ) print(f"BLEU score: \t{bleu.score:.2f}") sys.stdout.flush()
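# The evaluation loop above prunes the k least important heads for
# k = 0%, 10%, ..., 90% of the total and re-computes BLEU each time. The
# selection step itself is just a sort over (head-name, importance) pairs;
# below is a self-contained sketch with toy scores. The names follow the
# "E:<layer>:<head>" style convention printed above, but the values are made
# up purely for illustration.
from math import ceil

scores = {"E:0:0": 0.91, "E:0:1": 0.07, "D:0:0": 0.55, "A:0:0": 0.12}
sorted_heads = sorted(scores.items(), key=lambda kv: kv[1])  # least important first

def heads_to_prune(fraction):
    n = int(ceil(len(sorted_heads) * fraction))
    return [name for name, _ in sorted_heads[:n]]

assert heads_to_prune(0.5) == ["E:0:1", "A:0:0"]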
def main(cfg: DictConfig) -> None: if isinstance(cfg, argparse.Namespace): cfg = convert_namespace_to_omegaconf(cfg) utils.import_user_module(cfg.common) assert ( cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None ), "Must specify batch size either with --max-tokens or --batch-size" metrics.reset() np.random.seed(cfg.common.seed) utils.set_torch_seed(cfg.common.seed) if distributed_utils.is_master(cfg.distributed_training): checkpoint_utils.verify_checkpoint_directory(cfg.checkpoint.save_dir) # Print args logger.info(cfg) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(cfg.task) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in cfg.dataset.valid_subset.split(","): task.load_dataset(valid_sub_split, combine=False, epoch=1) assert cfg.criterion, "Please specify criterion to train a model" # Build model and criterion model = task.build_model(cfg.model) criterion = task.build_criterion(cfg.criterion) logger.info(model) logger.info("task: {}".format(task.__class__.__name__)) logger.info("model: {}".format(model.__class__.__name__)) logger.info("criterion: {})".format(criterion.__class__.__name__)) logger.info("num. model params: {} (num. trained: {})".format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # (optionally) Configure quantization if cfg.common.quantization_config_path is not None: quantizer = quantization_utils.Quantizer( config_path=cfg.common.quantization_config_path, max_epoch=cfg.optimization.max_epoch, max_update=cfg.optimization.max_update, ) else: quantizer = None # Build trainer if cfg.common.model_parallel_size == 1: trainer = Trainer(cfg, task, model, criterion, quantizer) else: trainer = MegatronTrainer(cfg, task, model, criterion) logger.info("training on {} devices (GPUs/TPUs)".format( cfg.distributed_training.distributed_world_size)) logger.info("max tokens per GPU = {} and batch size per GPU = {}".format( cfg.dataset.max_tokens, cfg.dataset.batch_size, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint( cfg.checkpoint, trainer, # don't cache epoch iterators for sharded datasets disable_iterator_cache=task.has_sharded_data("train"), ) max_epoch = cfg.optimization.max_epoch or math.inf lr = trainer.get_lr() train_meter = meters.StopwatchMeter() train_meter.start() while lr > cfg.optimization.min_lr and epoch_itr.next_epoch_idx <= max_epoch: # train for one epoch valid_losses, should_stop = train(cfg, trainer, task, epoch_itr) if should_stop: break # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) epoch_itr = trainer.get_train_iterator( epoch_itr.next_epoch_idx, # sharded data: get train iterator for next epoch load_dataset=task.has_sharded_data("train"), # don't cache epoch iterators for sharded datasets disable_iterator_cache=task.has_sharded_data("train"), ) train_meter.stop() logger.info("done training in {:.1f} seconds".format(train_meter.sum))
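# Both the legacy args-based and the Hydra cfg-based entry points share the
# same outer loop shape: train one epoch, step the LR scheduler on the first
# validation loss, and stop when the LR floor, the epoch budget, or an explicit
# should_stop signal is hit. A stripped-down sketch of that control flow; the
# train_one_epoch / lr_step callables are placeholders, not fairseq APIs.
def training_loop(train_one_epoch, lr_step, initial_lr, min_lr, max_epoch):
    lr, epoch = initial_lr, 1
    while lr > min_lr and epoch <= max_epoch:
        valid_losses, should_stop = train_one_epoch(epoch)
        if should_stop:
            break
        lr = lr_step(epoch, valid_losses[0])  # first valid loss drives the schedule
        epoch += 1
    return epoch

# toy run: constant loss, LR halved every epoch, stops once the LR floor is reached
epochs_run = training_loop(
    train_one_epoch=lambda e: ([1.0], False),
    lr_step=lambda e, loss: 0.1 * (0.5 ** e),
    initial_lr=0.1, min_lr=1e-3, max_epoch=100,
)
assert epochs_run == 8  # LR drops below the 1e-3 floor after 7 epochs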
def main(args): # we should not do this! ''' if args.max_tokens is None: args.max_tokens = 6000 ''' utils.xpprint(args) if not torch.cuda.is_available(): raise NotImplementedError('Training on CPU is not supported') torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) utils.xprintln('setup task done!') # Load dataset splits load_dataset_splits(args, task, ['train']) valid_dataset = args.valid_subset.split(',') load_dataset_splits(args, task, valid_dataset, shuffle=False) utils.xprintln('load dataset done!') if args.task.startswith('extractive_summarization'): if distributed_utils.is_master(args): from sum_eval import MultiProcSumEval sum_eval_pool = MultiProcSumEval(args.ncpu_eval) sum_valid_pool_params = dict( article_file=args.raw_valid + '.article', summary_file=args.raw_valid + '.summary', entity_map_file=None, length=-1, eval_type='predict', topk=args.topk_sent_eval, rerank=False, with_m=False, cmd='-a -c 95 -m -n 4 -w 1.2', trigram_block=args.trigram_block, ) sum_test_pool_params = dict( article_file=args.raw_test + '.article', summary_file=args.raw_test + '.summary', entity_map_file=None, length=-1, eval_type='predict', topk=args.topk_sent_eval, rerank=False, with_m=False, cmd='-a -c 95 -m -n 4 -w 1.2', trigram_block=args.trigram_block, ) sum_pool_params = dict(valid=sum_valid_pool_params, test=sum_test_pool_params) def make_params(default_dict, result_file, out_rouge_file, rerank=False, with_m=False): para_dict = dict(default_dict) para_dict['result_file'] = result_file para_dict['out_rouge_file'] = out_rouge_file para_dict['rerank'] = rerank para_dict['with_m'] = with_m return para_dict # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. 
model params: {}'.format( sum(p.numel() for p in model.parameters()))) # print(model) import sys sys.stdout.flush() # if summarization try to load pretrained model # if args.task.startswith('extractive_summarization') or args.task == 'pretrain_document_modeling': # # assume this is a single GPU program if args.init_from_pretrained_doc_model: task.load_pretrained_model(model, args.pretrained_doc_model_path) sys.stdout.flush() # Build trainer trainer = Trainer(args, task, model, criterion) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Initialize dataloader max_positions = trainer.get_model().max_positions() epoch_itr = trainer.get_train_iterator(epoch=0, load_dataset=False) # Load the latest checkpoint if one is available # load_checkpoint(args, trainer, epoch_itr) # make sure training from a different checkpoint will use different random seed cur_dataset = task.dataset('train') if hasattr(cur_dataset, 'rng'): print('epoch ', epoch_itr.epoch) cur_dataset.rng = numpy.random.RandomState(args.seed + epoch_itr.epoch) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_subsets = args.valid_subset.split(',') for alpha in range(10, 9, -1): # train for one epoch # train(args, trainer, task, epoch_itr) epoch_itr.next_epoch_itr() if epoch_itr.epoch % args.validate_interval == 0: if args.task.startswith('extractive_summarization'): if distributed_utils.is_master(args): validate_metric(args, trainer, task, epoch_itr, valid_subsets)
def main(cfg: FairseqConfig) -> None: if isinstance(cfg, argparse.Namespace): cfg = convert_namespace_to_omegaconf(cfg) utils.import_user_module(cfg.common) if is_master(cfg.distributed_training) and "job_logging_cfg" in cfg: # make hydra logging work with ddp (see # see https://github.com/facebookresearch/hydra/issues/1126) logging.config.dictConfig(OmegaConf.to_container(cfg.job_logging_cfg)) assert ( cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None ), "Must specify batch size either with --max-tokens or --batch-size" metrics.reset() np.random.seed(cfg.common.seed) utils.set_torch_seed(cfg.common.seed) if distributed_utils.is_master(cfg.distributed_training): checkpoint_utils.verify_checkpoint_directory(cfg.checkpoint.save_dir) # Print args logger.info(cfg) if cfg.checkpoint.write_checkpoints_asynchronously: try: import iopath # noqa: F401 except ImportError: logging.exception( "Asynchronous checkpoint writing is specified but iopath is " "not installed: `pip install iopath`") return # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(cfg.task) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in cfg.dataset.valid_subset.split(","): task.load_dataset(valid_sub_split, combine=False, epoch=1) assert cfg.criterion, "Please specify criterion to train a model" # Build model and criterion model = task.build_model(cfg.model) criterion = task.build_criterion(cfg.criterion) logger.info(model) logger.info("task: {}".format(task.__class__.__name__)) logger.info("model: {}".format(model.__class__.__name__)) logger.info("criterion: {}".format(criterion.__class__.__name__)) logger.info("num. model params: {:,} (num. trained: {:,})".format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # (optionally) Configure quantization if cfg.common.quantization_config_path is not None: quantizer = quantization_utils.Quantizer( config_path=cfg.common.quantization_config_path, max_epoch=cfg.optimization.max_epoch, max_update=cfg.optimization.max_update, ) else: quantizer = None # Build trainer if cfg.common.model_parallel_size == 1: trainer = Trainer(cfg, task, model, criterion, quantizer) else: trainer = MegatronTrainer(cfg, task, model, criterion) logger.info("training on {} devices (GPUs/TPUs)".format( cfg.distributed_training.distributed_world_size)) logger.info("max tokens per GPU = {} and batch size per GPU = {}".format( cfg.dataset.max_tokens, cfg.dataset.batch_size, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint( cfg.checkpoint, trainer, # don't cache epoch iterators for sharded datasets disable_iterator_cache=task.has_sharded_data("train"), ) max_epoch = cfg.optimization.max_epoch or math.inf lr = trainer.get_lr() train_meter = meters.StopwatchMeter() train_meter.start() while epoch_itr.next_epoch_idx <= max_epoch: if lr <= cfg.optimization.stop_min_lr: logger.info( f"stopping training because current learning rate ({lr}) is smaller " "than or equal to minimum learning rate " f"(--stop-min-lr={cfg.optimization.stop_min_lr})") break # train for one epoch valid_losses, should_stop = train(cfg, trainer, task, epoch_itr) if should_stop: break # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) epoch_itr = trainer.get_train_iterator( epoch_itr.next_epoch_idx, # sharded data: 
get train iterator for next epoch load_dataset=task.has_sharded_data("train"), # don't cache epoch iterators for sharded datasets disable_iterator_cache=task.has_sharded_data("train"), ) train_meter.stop() logger.info("done training in {:.1f} seconds".format(train_meter.sum)) # ioPath implementation to wait for all asynchronous file writes to complete. if cfg.checkpoint.write_checkpoints_asynchronously: logger.info( "ioPath PathManager waiting for all asynchronous checkpoint " "writes to finish.") PathManager.async_close() logger.info("ioPath PathManager finished waiting.")
def main(args): print(args) if not torch.cuda.is_available(): raise NotImplementedError('Training on CPU is not supported') torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Load dataset splits = ['train', 'valid'] if data.has_binary_files(args.data, splits): dataset = data.load_dataset( args.data, splits, args.source_lang, args.target_lang) else: dataset = data.load_raw_text_dataset( args.data, splits, args.source_lang, args.target_lang) if args.source_lang is None or args.target_lang is None: # record inferred languages in args, so that it's saved in checkpoints args.source_lang, args.target_lang = dataset.src, dataset.dst print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict))) print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict))) for split in splits: print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split]))) # Build model and criterion model = models.build_model(args, dataset.src_dict, dataset.dst_dict) criterion = criterions.build_criterion(args, dataset.src_dict, dataset.dst_dict) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {}'.format(sum(p.data.numel() for p in model.parameters()))) # Build trainer trainer = Trainer(args, model, criterion) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Load the latest checkpoint if one is available os.makedirs(args.save_dir, exist_ok=True) checkpoint_path = os.path.join(args.save_dir, args.restore_file) extra_state = trainer.load_checkpoint(checkpoint_path) if extra_state is not None: epoch = extra_state['epoch'] batch_offset = extra_state['batch_offset'] print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch)) if batch_offset == 0: trainer.lr_step(epoch) epoch += 1 else: epoch, batch_offset = 1, 0 # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() while lr > args.min_lr and epoch <= max_epoch: # train for one epoch train(args, trainer, dataset, epoch, batch_offset) # evaluate on validate set for k, subset in enumerate(args.valid_subset.split(',')): val_loss = validate(args, trainer, dataset, subset, epoch) if k == 0: # only use first validation loss to update the learning schedule lr = trainer.lr_step(epoch, val_loss) # save checkpoint if not args.no_save: save_checkpoint(trainer, args, epoch, 0, val_loss) epoch += 1 batch_offset = 0 train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args): utils.import_user_module(args) assert ( args.max_tokens is not None or args.max_sentences is not None ), "Must specify batch size either with --max-tokens or --max-sentences" metrics.reset() np.random.seed(args.seed) utils.set_torch_seed(args.seed) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) # Print args logger.info(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(","): task.load_dataset(valid_sub_split, combine=False, epoch=1) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) logger.info(model) logger.info("task: {} ({})".format(args.task, task.__class__.__name__)) logger.info("model: {} ({})".format(args.arch, model.__class__.__name__)) logger.info("criterion: {} ({})".format(args.criterion, criterion.__class__.__name__)) logger.info("num. model params: {} (num. trained: {})".format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # (optionally) Configure quantization if args.quantization_config_path is not None: quantizer = quantization_utils.Quantizer( config_path=args.quantization_config_path, max_epoch=args.max_epoch, max_update=args.max_update, ) else: quantizer = None # Build trainer if args.model_parallel_size == 1: trainer = Trainer(args, task, model, criterion, quantizer) else: trainer = MegatronTrainer(args, task, model, criterion) logger.info("training on {} devices (GPUs/TPUs)".format( args.distributed_world_size)) logger.info( "max tokens per GPU = {} and max sentences per GPU = {}".format( args.max_tokens, args.max_sentences)) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf lr = trainer.get_lr() train_meter = meters.StopwatchMeter() train_meter.start() while lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch: # train for one epoch valid_losses, should_stop = train(args, trainer, task, epoch_itr) if should_stop: break # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) epoch_itr = trainer.get_train_iterator( epoch_itr.next_epoch_idx, # sharded data: get train iterator for next epoch load_dataset=task.has_sharded_data("train"), ) train_meter.stop() logger.info("done training in {:.1f} seconds".format(train_meter.sum))
def main(args, init_distributed=False): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) # Print args print(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits task.load_dataset(args.train_subset, combine=True, epoch=0) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=True, epoch=0) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print(model) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Build trainer trainer = Trainer(args, task, model, criterion) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) max_positions = utils.resolve_max_positions( task.max_positions(), model.max_positions(), ) # Initialize dataloader epoch_itr = task.get_batch_iterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=args.required_batch_size_multiple, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, num_workers=args.num_workers, ) # Load the latest checkpoint if one is available load_checkpoint(args, trainer, epoch_itr, max_positions, task) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_subsets = args.valid_subset.split(',') while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates( ) < max_update: # train for one epoch train(args, trainer, task, epoch_itr) if epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) epoch_itr = reload_train(args, epoch_itr, max_positions, task) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False): import_user_module(args) if args.max_tokens is None: args.max_tokens = 6000 print(args) #args.distributed_world_size = 1 if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits load_dataset_splits(args, task) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print(model) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Make a dummy batch to (i) warm the caching allocator and (ii) as a # placeholder DistributedDataParallel when there's an uneven number of # batches per worker. max_positions = utils.resolve_max_positions( task.max_positions(), model.max_positions(), ) dummy_batch = task.dataset(args.train_subset).get_dummy_batch( args.max_tokens, max_positions) oom_batch = task.dataset(args.train_subset).get_dummy_batch( 1, max_positions) # Build trainer print("Building trainer...") trainer = Trainer(args, task, model, criterion, dummy_batch, oom_batch) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Initialize dataloader print("Initialize dataloader...") epoch_itr = task.get_batch_iterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=args.required_batch_size_multiple, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, num_workers=args.num_workers, ) # Initialize distributed training (after data loading) print("Initialize distributed training (after data loading)...") if init_distributed: import socket args.distributed_rank = distributed_utils.distributed_init(args) print('| initialized host {} as rank {}'.format( socket.gethostname(), args.distributed_rank)) # Load the latest checkpoint if one is available print("Load the latest checkpoint if one is available...") if not load_checkpoint(args, trainer, epoch_itr): trainer.dummy_train_step([dummy_batch]) if args.reset_target_embedding: trainer.init_meters(args) print("reset trainer.meters") # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_subsets = args.valid_subset.split(',') if args.distributed_rank == 0: if os.path.basename(args.save_dir) != "": log_file = os.path.join( args.save_dir, "({0})-params.log".format(os.path.basename(args.save_dir))) else: log_file = os.path.join( args.save_dir, "({0})-params.log".format(args.save_dir.split('/')[-2])) # create log file args.log_file = log_file if os.path.exists(log_file): w = open(log_file, "a+", encoding="utf-8") else: w = open(log_file, "w", encoding="utf-8") w.write(str(args).replace(", ", ",\n") + "\n") w.write(str(model) + "\n") w.flush() w.close() print("saving params file into{}...".format(log_file)) while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates( ) < max_update: # train for one epoch train(args, trainer, task, 
epoch_itr) if epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args): if args.max_tokens is None: args.max_tokens = 6000 print(args) if not torch.cuda.is_available(): raise NotImplementedError('Training on CPU is not supported') torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits load_dataset_splits(task, ['train', 'valid']) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters()))) # Make a dummy batch to (i) warm the caching allocator and (ii) as a # placeholder DistributedDataParallel when there's an uneven number of # batches per worker. max_positions = utils.resolve_max_positions( task.max_positions(), model.max_positions(), ) dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions) # Build trainer trainer = Trainer(args, task, model, criterion, dummy_batch) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Initialize dataloader epoch_itr = task.get_batch_iterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, ) # Load the latest checkpoint if one is available if not load_checkpoint(args, trainer, epoch_itr): trainer.dummy_train_step([dummy_batch]) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_subsets = args.valid_subset.split(',') while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update: # train for one epoch train(args, trainer, task, epoch_itr) if epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
def setup_training(args): """Parse args, load dataset, and load model trainer.""" if not torch.cuda.is_available(): raise NotImplementedError("Training on CPU is not supported") torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Load dataset splits = [args.train_subset, args.valid_subset] validate_and_set_default_args(args) train_corpus = pytorch_translate_data.ParallelCorpusConfig( source=pytorch_translate_data.CorpusConfig( dialect=args.source_lang, data_file=args.train_source_binary_prefix), target=pytorch_translate_data.CorpusConfig( dialect=args.target_lang, data_file=args.train_target_binary_prefix), ) eval_corpus = pytorch_translate_data.ParallelCorpusConfig( source=pytorch_translate_data.CorpusConfig( dialect=args.source_lang, data_file=args.eval_source_binary_prefix), target=pytorch_translate_data.CorpusConfig( dialect=args.target_lang, data_file=args.eval_target_binary_prefix), ) if args.log_verbose: print("Starting to load binarized data files.", flush=True) dataset = pytorch_translate_data.load_binarized_dataset( train_corpus=train_corpus, eval_corpus=eval_corpus, train_split=args.train_subset, eval_split=args.valid_subset, args=args, ) if args.log_verbose: print("Finished loading dataset", flush=True) if args.source_lang is None or args.target_lang is None: # record inferred languages in args, so that it's saved in checkpoints args.source_lang, args.target_lang = dataset.src, dataset.dst print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types") print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types") for split in splits: print(f"| {split} {len(dataset.splits[split])} examples") # Build model and criterion model = models.build_model(args, dataset.src_dict, dataset.dst_dict) criterion = criterions.build_criterion(args, dataset.src_dict, dataset.dst_dict) print(f"| model {args.arch}, criterion {criterion.__class__.__name__}") print(f"| num. model params: \ {sum(p.data.numel() for p in model.parameters())}") # Build trainer trainer = Trainer(args, model, criterion) print(f"| training on {args.distributed_world_size} GPUs") print( f"| max tokens per GPU = {args.max_tokens} and \ max sentences per GPU = {args.max_sentences}", flush=True, ) extra_state = load_existing_checkpoint(args.save_dir, args.restore_file, trainer) return extra_state, trainer, dataset
def main(args): if not torch.cuda.is_available(): raise NotImplementedError('Training on CPU is not supported') torch.cuda.set_device(args.device_id) mlperf_compliance.mlperf_log.LOGGER.propagate = False # framework = f'Pytorch NGC {os.environ["NVIDIA_PYTORCH_VERSION"]}' # mlperf_submission_log( # benchmark=mlperf_compliance.constants.TRANSFORMER, # framework=framework) mlperf_compliance.mlperf_log.setdefault( root_dir=os.path.dirname(os.path.abspath(__file__)), benchmark=mlperf_compliance.constants.TRANSFORMER, stack_offset=1, extra_print=False) mlperf_print(key=mlperf_compliance.constants.INIT_START, log_all_ranks=True) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) random.seed(args.seed) np.random.seed(args.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False # preinit and warmup streams/groups for allreduce communicators allreduce_communicators = None if args.distributed_world_size > 1 and args.enable_parallel_backward_allred_opt: allreduce_groups = [ torch.distributed.new_group() for _ in range(args.parallel_backward_allred_cuda_nstreams) ] allreduce_streams = [ torch.cuda.Stream() for _ in range(args.parallel_backward_allred_cuda_nstreams) ] for group, stream in zip(allreduce_groups, allreduce_streams): with torch.cuda.stream(stream): torch.distributed.all_reduce(torch.cuda.FloatTensor(1), group=group) allreduce_communicators = (allreduce_groups, allreduce_streams) if args.max_tokens is None: args.max_tokens = 6000 print(args) mlperf_print(key=mlperf_compliance.constants.GLOBAL_BATCH_SIZE, value=args.max_tokens * args.distributed_world_size) mlperf_print(key=mlperf_compliance.constants.OPT_NAME, value=args.optimizer) assert (len(args.lr) == 1) mlperf_print(key=mlperf_compliance.constants.OPT_BASE_LR, value=args.lr[0] if len(args.lr) == 1 else args.lr) mlperf_print(key=mlperf_compliance.constants.OPT_LR_WARMUP_STEPS, value=args.warmup_updates) assert (args.max_source_positions == args.max_target_positions) mlperf_print(key=mlperf_compliance.constants.MAX_SEQUENCE_LENGTH, value=args.max_target_positions) mlperf_print(key=mlperf_compliance.constants.OPT_ADAM_BETA_1, value=eval(args.adam_betas)[0]) mlperf_print(key=mlperf_compliance.constants.OPT_ADAM_BETA_2, value=eval(args.adam_betas)[1]) mlperf_print(key=mlperf_compliance.constants.OPT_ADAM_EPSILON, value=args.adam_eps) pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int)) result = torch.cuda.cudart().cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128)) result = torch.cuda.cudart().cudaDeviceGetLimit(pValue, ctypes.c_int(0x05)) # torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. 
model params: {}'.format( sum(p.numel() for p in model.parameters()))) # Build trainer if args.fp16: trainer = FP16Trainer(args, task, model, criterion, allreduce_communicators=allreduce_communicators) else: if torch.cuda.get_device_capability(0)[0] >= 7: print( '| NOTICE: your device may support faster training with --fp16' ) trainer = Trainer(args, task, model, criterion, allreduce_communicators=None) #if (args.online_eval or args.target_bleu) and not args.remove_bpe: # args.remove_bpe='@@ ' print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Initialize dataloader max_positions = trainer.get_model().max_positions() # Send a dummy batch to warm the caching allocator dummy_batch = language_pair_dataset.get_dummy_batch_isolated( args.max_tokens, max_positions, 8) trainer.dummy_train_step(dummy_batch) # Train until the learning rate gets too small or model reaches target score max_epoch = args.max_epoch if args.max_epoch >= 0 else math.inf max_update = args.max_update or math.inf tgt_bleu = args.target_bleu or math.inf current_bleu = 0.0 lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_subsets = args.valid_subset.split(',') # mlperf compliance synchronization if args.distributed_world_size > 1: assert (torch.distributed.is_initialized()) torch.distributed.all_reduce(torch.cuda.FloatTensor(1)) torch.cuda.synchronize() mlperf_print(key=mlperf_compliance.constants.INIT_STOP, sync=True) mlperf_print(key=mlperf_compliance.constants.RUN_START, sync=True) # second sync after RUN_START tag is printed. # this ensures no rank touches data until after RUN_START tag is printed. barrier() # Load dataset splits load_dataset_splits(task, ['train', 'test']) ctr = 0 class DummyEpochBatchIterator: def __init__(self, epoch=0): self.epoch = epoch epoch_itr = DummyEpochBatchIterator(0) # Main training loop while lr >= args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates( ) < max_update and current_bleu < tgt_bleu: first_epoch = epoch_itr.epoch + 1 mlperf_print(key=mlperf_compliance.constants.BLOCK_START, metadata={ 'first_epoch_num': first_epoch, 'epoch_count': 1 }, sync=True) mlperf_print(key=mlperf_compliance.constants.EPOCH_START, metadata={'epoch_num': first_epoch}, sync=True) start = time.time() gc.disable() epoch_itr = data.EpochBatchIterator( dataset=task.dataset(args.train_subset), dataloader_num_workers=args.dataloader_num_workers, dataloader_pin_memory=args.enable_dataloader_pin_memory, max_tokens=args.max_tokens, max_sentences=args.max_sentences_valid, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, epoch=epoch_itr.epoch if ctr is not 0 else 0, bucket_growth_factor=args.bucket_growth_factor, seq_len_multiple=args.seq_len_multiple, batching_scheme=args.batching_scheme, batch_multiple_strategy=args.batch_multiple_strategy, ) print("got epoch iterator", time.time() - start) # Load the latest checkpoint if one is available if ctr is 0: load_checkpoint(args, trainer, epoch_itr) # train for one epoch start = time.time() #exit(1) train(args, trainer, task, epoch_itr) print("epoch time ", time.time() - start) start = time.time() mlperf_print(key=mlperf_compliance.constants.EPOCH_STOP, metadata={'epoch_num': first_epoch}, sync=True) #if epoch_itr.epoch % args.validate_interval == 0: # 
valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) # Eval BLEU score if args.online_eval or tgt_bleu is not math.inf: current_bleu = score(args, trainer, task, epoch_itr, args.gen_subset) mlperf_print(key=mlperf_compliance.tags.EVAL_ACCURACY, value=str(current_bleu), metadata={'epoch_num': first_epoch}) gc.enable() # Only use first validation loss to update the learning rate #lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # Save checkpoint #if epoch_itr.epoch % args.save_interval == 0: # save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) ctr = ctr + 1 print("validation and scoring ", time.time() - start) mlperf_print(key=mlperf_compliance.constants.BLOCK_STOP, metadata={'first_epoch_num': first_epoch}, sync=True) train_meter.stop() status = 'success' if current_bleu >= tgt_bleu else 'aborted' mlperf_print(key=mlperf_compliance.constants.RUN_STOP, metadata={'status': status}) print('| done training in {:.1f} seconds'.format(train_meter.sum))
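# Hedged sketch of rank-aware compliance tagging in the spirit of the
# mlperf_print/barrier helpers used above. The printed format is a stand-in,
# not the real mlperf_compliance output; only the torch.distributed calls are
# standard PyTorch, and the `sync`/`log_all_ranks` parameters mirror how the
# script calls its own helper.
import torch.distributed as dist

def barrier():
    """Synchronize all ranks; a no-op for single-process runs."""
    if dist.is_available() and dist.is_initialized():
        dist.barrier()

def mlperf_print(key, value=None, metadata=None, sync=False, log_all_ranks=False):
    if sync:
        barrier()  # so the logged timestamp reflects the slowest rank
    rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
    if log_all_ranks or rank == 0:
        # illustrative stand-in for the real compliance logger
        print(f':::COMPLIANCE rank={rank} key={key} value={value} metadata={metadata}')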
def main(args, init_distributed=False): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) # Print args logger.info(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=False, epoch=0) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) logger.info(model) logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) logger.info('num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Build trainer trainer = Trainer(args, task, model, criterion) logger.info('training on {} GPUs'.format(args.distributed_world_size)) logger.info('max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_subsets = args.valid_subset.split(',') print(args.multi_views) while ( lr > args.min_lr and ( epoch_itr.epoch < max_epoch # allow resuming training from the final checkpoint or epoch_itr._next_epoch_itr is not None ) and trainer.get_num_updates() < max_update ): # train for one epoch train(args, trainer, task, epoch_itr) if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) else: valid_losses = [None] # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) bart = BARTHubInterface(args, task, trainer.model).cuda() #print(bart.device) bart.eval() count = 1 bsz = 8 print("Test on val set: ") with open('../data/val_sent_trans_cons_label.source') as source, open('../data/val_sent_c99_label.source') as source2, open('./val_best_multi_attn_'+str(args.lr_weight)+'_.hypo', 'wt', encoding='utf-8') as fout: s1 = source.readlines() s2 = source2.readlines() slines = [s1[0].strip()] slines2 = [s2[0].strip()] for i in tqdm(range(1, len(s1))): if count % bsz == 0: with torch.no_grad(): if args.multi_views: hypotheses_batch = bart.sample(slines, sentences2 = slines2, balance = True, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) else: hypotheses_batch = bart.sample(slines, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) for hypothesis in hypotheses_batch: fout.write(hypothesis + '\n') fout.flush() slines = [] slines2 = [] slines.append(s1[i].strip()) slines2.append(s2[i].strip()) count += 1 if slines != []: if args.multi_views: hypotheses_batch = 
bart.sample(slines, sentences2 = slines2, balance = True, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) else: hypotheses_batch = bart.sample(slines, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) #hypotheses_batch = bart.sample(slines, sentences2 = slines2, balance = True, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) for hypothesis in hypotheses_batch: fout.write(hypothesis + '\n') fout.flush() hyp_path = './val_best_multi_attn_'+str(args.lr_weight)+'_.hypo' ref_path = '../data/val_sent_trans_cons_label.target' hypothesis = [] with open(hyp_path, 'r') as f: lines = f.readlines() for l in lines: hypothesis.append(l[:-1]) reference = [] with open(ref_path, 'r') as f: lines = f.readlines() for l in lines: reference.append(l[:-1]) rouge = Rouge() print("Val", rouge.get_scores(hypothesis, reference, avg = True)) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) print("Test on testing set: ") count = 1 bsz = 8 with open('../data/test_sent_trans_cons_label.source') as source, open('../data/test_sent_c99_label.source') as source2, open('./test_best_multi_attn_'+str(args.lr_weight)+'_.hypo', 'wt', encoding='utf-8') as fout: s1 = source.readlines() s2 = source2.readlines() slines = [s1[0].strip()] slines2 = [s2[0].strip()] for i in tqdm(range(1, len(s1))): if count % bsz == 0: with torch.no_grad(): if args.multi_views: hypotheses_batch = bart.sample(slines, sentences2 = slines2, balance = True, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) else: hypotheses_batch = bart.sample(slines, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) for hypothesis in hypotheses_batch: fout.write(hypothesis + '\n') fout.flush() slines = [] slines2 = [] slines.append(s1[i].strip()) slines2.append(s2[i].strip()) count += 1 if slines != []: if args.multi_views: hypotheses_batch = bart.sample(slines, sentences2 = slines2, balance = True, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) else: hypotheses_batch = bart.sample(slines, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) for hypothesis in hypotheses_batch: fout.write(hypothesis + '\n') fout.flush() hyp_path = './test_best_multi_attn_'+str(args.lr_weight)+'_.hypo' ref_path = '../data/test_sent_trans_cons_label.target' hypothesis = [] with open(hyp_path, 'r') as f: lines = f.readlines() for l in lines: hypothesis.append(l[:-1]) reference = [] with open(ref_path, 'r') as f: lines = f.readlines() for l in lines: reference.append(l[:-1]) rouge = Rouge() print('Test', rouge.get_scores(hypothesis, reference, avg = True)) # early stop if should_stop_early(args, valid_losses[0]): logger.info('early stop since valid performance hasn\'t improved for last {} runs'.format(args.patience)) break epoch_itr = trainer.get_train_iterator( epoch_itr.epoch, # sharded data: get train iterator for next epoch load_dataset=(os.pathsep in getattr(args, 'data', '')), ) train_meter.stop() logger.info('done training in {:.1f} seconds'.format(train_meter.sum))
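# Hedged sketch: the val-set and test-set decode loops above are identical
# except for file paths, so they could be factored into one helper. The
# `sentences2`/`balance` keywords are this repo's multi-view extension of
# BARTHubInterface.sample and are assumed; the remaining sampling kwargs match
# the standard fairseq BART usage. Slicing in steps of `bsz` replaces the
# manual count % bsz bookkeeping.
from tqdm import tqdm
import torch

def decode_file(bart, src_path, src2_path, out_path, multi_views, bsz=8):
    with open(src_path) as f1, open(src2_path) as f2, \
            open(out_path, 'wt', encoding='utf-8') as fout:
        s1 = [line.strip() for line in f1]
        s2 = [line.strip() for line in f2]
        for start in tqdm(range(0, len(s1), bsz)):
            batch, batch2 = s1[start:start + bsz], s2[start:start + bsz]
            with torch.no_grad():
                if multi_views:
                    hypos = bart.sample(batch, sentences2=batch2, balance=True,
                                        beam=4, lenpen=2.0, max_len_b=100,
                                        min_len=5, no_repeat_ngram_size=3)
                else:
                    hypos = bart.sample(batch, beam=4, lenpen=2.0, max_len_b=100,
                                        min_len=5, no_repeat_ngram_size=3)
            for h in hypos:
                fout.write(h + '\n')
            fout.flush()

# usage (paths as in the function above):
# decode_file(bart, '../data/val_sent_trans_cons_label.source',
#             '../data/val_sent_c99_label.source', hyp_path, args.multi_views)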
def setup_training(args): """Parse args, load dataset, and load model trainer.""" if not torch.cuda.is_available(): raise NotImplementedError("Training on CPU is not supported") torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task and load dataset task = tasks.setup_task(args) task.load_dataset( args.train_subset, args.train_source_binary_path, args.train_target_binary_path, weights_file=getattr(args, "train_weights_path", None), ) task.load_dataset(args.valid_subset, args.eval_source_binary_path, args.eval_target_binary_path) # Build model and criterion model = task.build_model(args) print("| building criterion") criterion = task.build_criterion(args) print(f"| model {args.arch}, criterion {criterion.__class__.__name__}") print(f"| num. model params: \ {sum(p.numel() for p in model.parameters())}") # Build trainer if args.fp16: trainer = FP16Trainer(args, task, model, criterion) else: if torch.cuda.get_device_capability(0)[0] >= 7: print( "| NOTICE: your device may support faster training with --fp16" ) trainer = Trainer(args, task, model, criterion) print(f"| training on {args.distributed_world_size} GPUs") print( f"| max tokens per GPU = {args.max_tokens} and \ max sentences per GPU = {args.max_sentences}", flush=True, ) os.makedirs(args.save_dir, exist_ok=True) # If --restore-file is already present under --save-dir, use that one # instead of --pretrained-checkpoint-file. The idea is that # --pretrained-checkpoint-file allows the user to specify restoring from a # different run's checkpoint (possibly with different training params), # while not polluting the previous run's checkpoint directory # with new checkpoints. However, if training gets interrupted # and the user restarts training, we want to resume from # the checkpoints under --save-dir, instead of # restarting again from the old run's checkpoint at # --pretrained-checkpoint-file. # # Note that if args.restore_file is an absolute path, os.path.join() will # ignore previous directory args and just use the absolute path as is. checkpoint_path = os.path.join(args.save_dir, args.restore_file) restore_state = True if os.path.exists(checkpoint_path): print( f"| Using --save-dir={args.save_dir}, --restore-file={args.restore_file}." ) elif args.pretrained_checkpoint_file and os.path.exists( args.pretrained_checkpoint_file): checkpoint_path = args.pretrained_checkpoint_file restore_state = args.load_pretrained_checkpoint_state print( f"| Using --pretrained-checkpoint-file={args.pretrained_checkpoint_file}, " f"--load-pretrained-checkpoint-state={args.load_pretrained_checkpoint_state}." 
) extra_state = default_extra_state(args) if not os.path.isfile(checkpoint_path) and args.multi_model_restore_files: print( f"| Restoring individual models from {args.multi_model_restore_files}" ) multi_model.import_individual_models(args.multi_model_restore_files, trainer) else: loaded, loaded_extra_state = load_existing_checkpoint( checkpoint_path=checkpoint_path, trainer=trainer, restore_state=restore_state, ) if loaded_extra_state: extra_state.update(loaded_extra_state) if loaded: args.path = [checkpoint_path] calculate_bleu_on_subset( args=args, task=task, epoch_str="initial loaded checkpoint", offset=None, dataset_split=args.valid_subset, ) print(f"| extra_state: {extra_state}") epoch_itr = data.EpochBatchIterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=trainer.get_model().max_positions(), ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, ) epoch = extra_state["epoch"] if extra_state["batch_offset"] == 0: epoch -= 1 # this will be incremented when we call epoch_itr.next_epoch_itr() epoch_itr.load_state_dict({ "epoch": epoch, "iterations_in_epoch": extra_state["batch_offset"] }) return extra_state, trainer, task, epoch_itr
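# Small illustration of the os.path.join behaviour the comment in
# setup_training relies on: once an absolute --restore-file is given, the
# earlier --save-dir component is discarded, so the absolute path wins.
# The paths below are illustrative only.
import os

print(os.path.join('/checkpoints/run1', 'checkpoint_last.pt'))
# -> /checkpoints/run1/checkpoint_last.pt
print(os.path.join('/checkpoints/run1', '/mnt/other_run/checkpoint_best.pt'))
# -> /mnt/other_run/checkpoint_best.pt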
def main(args): import_user_module(args) if args.max_tokens is None: args.max_tokens = 6000 print(args) if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits load_dataset_splits(task, ['train', 'valid']) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print(model) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Make a dummy batch to (i) warm the caching allocator and (ii) as a # placeholder DistributedDataParallel when there's an uneven number of # batches per worker. max_positions = utils.resolve_max_positions( task.max_positions(), model.max_positions(), ) dummy_batch = task.dataset('train').get_dummy_batch( args.max_tokens, max_positions) oom_batch = task.dataset('train').get_dummy_batch(1, max_positions) # Build trainer trainer = Trainer(args, task, model, criterion, dummy_batch, oom_batch) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Initialize dataloader epoch_itr = task.get_batch_iterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, num_workers=args.num_workers, ) # Load the latest checkpoint if one is available if not load_checkpoint(args, trainer, epoch_itr): trainer.dummy_train_step([dummy_batch]) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_subsets = args.valid_subset.split(',') while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates( ) < max_update: # train for one epoch train(args, trainer, task, epoch_itr) if epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args): import_user_module(args) assert ( args.max_tokens is not None or args.batch_size is not None ), "Must specify batch size either with --max-tokens or --batch-size" metrics.reset() np.random.seed(args.seed) utils.set_torch_seed(args.seed) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) # Print args logger.info(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(","): task.load_dataset(valid_sub_split, combine=False, epoch=1) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) logger.info(model) logger.info("task: {} ({})".format(args.task, task.__class__.__name__)) logger.info("model: {} ({})".format(args.arch, model.__class__.__name__)) logger.info("criterion: {} ({})".format(args.criterion, criterion.__class__.__name__)) logger.info("num. model params: {} (num. trained: {})".format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # breakpoint() # ========== initialize the model with pretrained BART parameters ========== # for shared embeddings and subtoken split for amr nodes if 'bartsv' in args.arch: if args.initialize_with_bart: logger.info( '-' * 10 + ' initializing model parameters with pretrained BART model ' + '-' * 10) new_state_dict = copy.deepcopy(task.bart.model.state_dict()) # treat the embedding initialization separately later, as the size different logger.info( '-' * 10 + ' delay encoder embeddings, decoder input and output embeddings initialization ' + '-' * 10) ignore_keys = set([ 'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'decoder.output_projection.weight' ]) for k in ignore_keys: del new_state_dict[k] if not args.initialize_with_bart_enc: logger.info( '-' * 10 + ' do not initialize with BART encoder parameters ' + '-' * 10) for k in list(new_state_dict.keys()): if k.startswith('encoder'): del new_state_dict[k] if not args.initialize_with_bart_dec: logger.info( '-' * 10 + ' do not initialize with BART decoder parameters ' + '-' * 10) for k in list(new_state_dict.keys()): if k.startswith('decoder'): del new_state_dict[k] model.load_state_dict(new_state_dict, strict=False, args=args) # initialize the Bart part embeddings bart_vocab_size = task.target_dictionary.bart_vocab_size # NOTE we need to prune the pretrained BART embeddings, especially for bart.base bart_embed_weight = task.bart.model.encoder.embed_tokens.weight.data[: bart_vocab_size] assert len(bart_embed_weight) == bart_vocab_size with torch.no_grad(): model.encoder.embed_tokens.weight[:bart_vocab_size].copy_( bart_embed_weight) model.decoder.embed_tokens.weight[:bart_vocab_size].copy_( bart_embed_weight) model.decoder.output_projection.weight[:bart_vocab_size].copy_( bart_embed_weight) if args.bart_emb_init_composition: logger.info( '-' * 10 + ' initialize extended target embeddings with compositional embeddings ' 'from BART vocabulary ' + '-' * 10) # breakpoint() symbols = [ task.target_dictionary[idx] for idx in range(bart_vocab_size, len(task.target_dictionary)) ] mapper = MapAvgEmbeddingBART(task.bart, task.bart.model.decoder.embed_tokens) comp_embed_weight, map_all = mapper.map_avg_embeddings( symbols, transform=transform_action_symbol, add_noise=False) assert len(comp_embed_weight) == len(symbols) with torch.no_grad(): 
model.encoder.embed_tokens.weight[bart_vocab_size:].copy_( comp_embed_weight) model.decoder.embed_tokens.weight[bart_vocab_size:].copy_( comp_embed_weight) model.decoder.output_projection.weight[bart_vocab_size:].copy_( comp_embed_weight) elif 'bart' in args.arch: if args.initialize_with_bart: logger.info( '-' * 10 + ' initializing model parameters with pretrained BART model ' + '-' * 10) new_state_dict = copy.deepcopy(task.bart.model.state_dict()) if not args.bart_emb_decoder: logger.info('-' * 10 + ' build a separate decoder dictionary embedding ' + '-' * 10) if not args.bart_emb_decoder_input: ignore_keys = set([ 'decoder.embed_tokens.weight', 'decoder.output_projection.weight' ]) else: logger.info( '-' * 10 + ' use BART dictionary embedding for target input ' + '-' * 10) ignore_keys = set(['decoder.output_projection.weight']) for k in ignore_keys: del new_state_dict[k] if not args.initialize_with_bart_enc: logger.info( '-' * 10 + ' do not initialize with BART encoder parameters ' + '-' * 10) for k in list(new_state_dict.keys()): if k.startswith('encoder'): del new_state_dict[k] if not args.initialize_with_bart_dec: logger.info( '-' * 10 + ' do not initialize with BART decoder parameters ' + '-' * 10) for k in list(new_state_dict.keys()): if k.startswith('decoder'): del new_state_dict[k] model.load_state_dict(new_state_dict, strict=False, args=args) # initialize the target embeddings with average of subtoken embeddings in BART vocabulary if args.bart_emb_init_composition: assert not args.bart_emb_decoder, 'should not use the compositional embeddings on top of BART vocabulary here' logger.info( '-' * 10 + ' initialize target embeddings with compositional embeddings from BART vocabulary ' + '-' * 10) composite_embed = CompositeEmbeddingBART( task.bart, task.bart.model.decoder.embed_tokens, task.target_dictionary) if args.bart_emb_decoder_input: # only initialize the decoder output embeddings with torch.no_grad(): model.decoder.output_projection.weight.copy_( composite_embed.embedding_weight) else: # initialize both the decoder input and output embeddings with torch.no_grad(): model.decoder.embed_tokens.weight.copy_( composite_embed.embedding_weight) model.decoder.output_projection.weight.copy_( composite_embed.embedding_weight) elif 'roberta' in args.arch: # initialize the target embeddings with average of subtoken embeddings in BART vocabulary if args.bart_emb_init_composition: assert not args.bart_emb_decoder, 'should not use the compositional embeddings on top of RoBERTa vocabulary here' logger.info( '-' * 10 + ' initialize target embeddings with compositional embeddings from RoBERTa vocabulary ' + '-' * 10) composite_embed = CompositeEmbeddingBART( task.bart, # NOTE here "bart" means roberta task.bart.model.encoder.sentence_encoder.embed_tokens, task.target_dictionary) if args.bart_emb_decoder_input: # only initialize the decoder output embeddings with torch.no_grad(): model.decoder.output_projection.weight.copy_( composite_embed.embedding_weight) else: # initialize both the decoder input and output embeddings with torch.no_grad(): model.decoder.embed_tokens.weight.copy_( composite_embed.embedding_weight) model.decoder.output_projection.weight.copy_( composite_embed.embedding_weight) else: raise ValueError # ========================================================================== # breakpoint() # (optionally) Configure quantization if args.quantization_config_path is not None: quantizer = quantization_utils.Quantizer( config_path=args.quantization_config_path, 
max_epoch=args.max_epoch, max_update=args.max_update, ) else: quantizer = None # Build trainer if args.model_parallel_size == 1: trainer = Trainer(args, task, model, criterion, quantizer) else: trainer = MegatronTrainer(args, task, model, criterion) logger.info("training on {} devices (GPUs/TPUs)".format( args.distributed_world_size)) logger.info( "max tokens per GPU = {} and max sentences per GPU = {}".format( args.max_tokens, args.batch_size)) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint( args, trainer, # don't cache epoch iterators for sharded datasets disable_iterator_cache=task.has_sharded_data("train"), ) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf lr = trainer.get_lr() train_meter = meters.StopwatchMeter() train_meter.start() while lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch: # train for one epoch valid_losses, should_stop = train(args, trainer, task, epoch_itr) if should_stop: break # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) epoch_itr = trainer.get_train_iterator( epoch_itr.next_epoch_idx, # sharded data: get train iterator for next epoch load_dataset=task.has_sharded_data("train"), # don't cache epoch iterators for sharded datasets disable_iterator_cache=task.has_sharded_data("train"), ) train_meter.stop() logger.info("done training in {:.1f} seconds".format(train_meter.sum))
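# Hedged sketch of the in-place embedding surgery used by the BART
# initialization branches above: copy the shared pretrained rows into the
# enlarged table, then fill the extended rows with composed vectors, all under
# torch.no_grad(). Vocabulary sizes and dimensions here are toy values, and a
# simple mean stands in for MapAvgEmbeddingBART / CompositeEmbeddingBART.
import torch
import torch.nn as nn

bart_vocab_size, ext_vocab_size, dim = 100, 120, 16      # illustrative only
pretrained = nn.Embedding(bart_vocab_size + 50, dim)     # pretrained table (possibly larger)
extended = nn.Embedding(ext_vocab_size, dim)             # new table: BART rows + new symbols

with torch.no_grad():
    # prune the pretrained table to the shared prefix and copy it in
    extended.weight[:bart_vocab_size].copy_(pretrained.weight[:bart_vocab_size])
    # new symbols (e.g. action labels) get a composed embedding; a plain mean
    # over the pretrained table stands in for the subtoken-average mapping
    extended.weight[bart_vocab_size:].copy_(
        pretrained.weight.mean(dim=0, keepdim=True).expand(
            ext_vocab_size - bart_vocab_size, dim))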
def train(cfg: DictConfig, trainer: Trainer, task: tasks.FairseqTask, epoch_itr) -> Tuple[List[Optional[float]], bool]: """Train the model for one epoch and return validation losses.""" # Initialize data iterator itr = epoch_itr.next_epoch_itr( fix_batches_to_gpus=cfg.distributed_training.fix_batches_to_gpus, shuffle=(epoch_itr.next_epoch_idx > cfg.dataset.curriculum), ) update_freq = (cfg.optimization.update_freq[epoch_itr.epoch - 1] if epoch_itr.epoch <= len(cfg.optimization.update_freq) else cfg.optimization.update_freq[-1]) itr = iterators.GroupedIterator(itr, update_freq) if cfg.common.tpu: itr = utils.tpu_data_loader(itr) progress = progress_bar.progress_bar( itr, log_format=cfg.common.log_format, log_interval=cfg.common.log_interval, epoch=epoch_itr.epoch, tensorboard_logdir=(cfg.common.tensorboard_logdir if distributed_utils.is_master( cfg.distributed_training) else None), default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), wandb_project=(cfg.common.wandb_project if distributed_utils.is_master( cfg.distributed_training) else None), wandb_run_name=os.environ.get( "WANDB_NAME", os.path.basename(cfg.checkpoint.save_dir)), azureml_logging=(cfg.common.azureml_logging if distributed_utils.is_master( cfg.distributed_training) else False), ) progress.update_config(_flatten_config(cfg)) trainer.begin_epoch(epoch_itr.epoch) valid_subsets = cfg.dataset.valid_subset.split(",") should_stop = False num_updates = trainer.get_num_updates() logger.info("Start iterating over samples") for i, samples in enumerate(progress): with metrics.aggregate( "train_inner"), torch.autograd.profiler.record_function( "train_step-%d" % i): log_output = trainer.train_step(samples) if log_output is not None: # not OOM, overflow, ... # log mid-epoch stats num_updates = trainer.get_num_updates() if num_updates % cfg.common.log_interval == 0: stats = get_training_stats( metrics.get_smoothed_values("train_inner")) progress.log(stats, tag="train_inner", step=num_updates) # reset mid-epoch stats after each log interval # the end-of-epoch stats will still be preserved metrics.reset_meters("train_inner") end_of_epoch = not itr.has_next() valid_losses, should_stop = validate_and_save(cfg, trainer, task, epoch_itr, valid_subsets, end_of_epoch) if should_stop: break # log end-of-epoch stats logger.info("end of epoch {} (average epoch stats below)".format( epoch_itr.epoch)) stats = get_training_stats(metrics.get_smoothed_values("train")) progress.print(stats, tag="train", step=num_updates) # reset epoch-level meters metrics.reset_meters("train") return valid_losses, should_stop
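# The update_freq lookup in the train() function above feeds GroupedIterator,
# which is what turns the schedule into gradient accumulation: each `samples`
# passed to trainer.train_step is a list of `update_freq` mini-batches that
# are accumulated before one optimizer step. A plain-Python sketch of that
# grouping (illustrative only, not fairseq's GroupedIterator implementation):
def grouped(iterable, chunk_size):
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk  # the last group may be smaller

# e.g. cfg.optimization.update_freq = [4, 2]: epoch 1 accumulates 4 batches
# per step, epoch 2 onwards accumulates 2.
update_freq = [4, 2]
epoch = 3
freq = update_freq[epoch - 1] if epoch <= len(update_freq) else update_freq[-1]
for samples in grouped(range(10), freq):
    print(samples)  # each `samples` would go into one trainer.train_step call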
def main(args): if args.max_tokens is None: args.max_tokens = 6000 print(args) if not torch.cuda.is_available(): raise NotImplementedError('Training on CPU is not supported') torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits load_dataset_splits(task, ['train', 'valid']) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {}'.format( sum(p.numel() for p in model.parameters()))) # Make a dummy batch to (i) warm the caching allocator and (ii) as a # placeholder DistributedDataParallel when there's an uneven number of # batches per worker. max_positions = utils.resolve_max_positions( task.max_positions(), model.max_positions(), ) dummy_batch = task.dataset('train').get_dummy_batch( args.max_tokens, max_positions) # Build trainer trainer = Trainer(args, task, model, criterion, dummy_batch) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) summary_writer = SummaryWriter(log_dir=args.save_dir, enable=args.distributed_rank == 0) # Initialize dataloader epoch_itr = task.get_batch_iterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, ) first_train = True # Load the latest checkpoint if one is available if not load_checkpoint(args, trainer, epoch_itr): trainer.dummy_train_step([dummy_batch]) first_train = False # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_subsets = args.valid_subset.split(',') if not hasattr(save_checkpoint, 'not_best'): save_checkpoint.not_best = 0 if not args.no_first_valid and first_train: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets, True, summary_writer) if args.finetune_params != '': print("| train parameters.") for name, param in trainer.model.named_parameters(): if trainer.should_train(name): print(name) print("| fixed parameters.") for name, param in trainer.model.named_parameters(): if not trainer.should_train(name): print(name) if args.start_ckpt != '': save_checkpoint.not_best = 0 save_checkpoint.best = 9999 print("| train begin.") while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates( ) < max_update: # train for one epoch train(args, trainer, task, epoch_itr, summary_writer) if epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate( args, trainer, task, epoch_itr, valid_subsets, epoch_itr.epoch % args.test_bleu_interval == 0, summary_writer) if args.early_stop > 0: if hasattr(save_checkpoint, 'best') and valid_losses[0] > save_checkpoint.best: save_checkpoint.not_best += 1 print("| Not the best ckpt... 
not best:", save_checkpoint.not_best) if save_checkpoint.not_best > args.early_stop: print("| Early stop...") break else: save_checkpoint.not_best = 0 # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum)) os.system("ps aux | grep redis-server | awk '{print $2}' | xargs kill") if args.save_output: save_expert_outputs(args, task, trainer)
def main(args): if args.max_tokens is None: args.max_tokens = 6000 print(args) if not torch.cuda.is_available(): raise NotImplementedError('Training on CPU is not supported') torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits load_dataset_splits(args, task, ['train', 'valid']) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {}'.format( sum(p.numel() for p in model.parameters()))) # Build trainer if args.fp16: trainer = FP16Trainer(args, task, model, criterion) else: if torch.cuda.get_device_capability(0)[0] >= 7: print( '| NOTICE: your device may support faster training with --fp16' ) trainer = Trainer(args, task, model, criterion) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Initialize dataloader max_positions = trainer.get_model().max_positions() epoch_itr = data.EpochBatchIterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences_valid, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=8, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, ) # Load the latest checkpoint if one is available load_checkpoint(args, trainer, epoch_itr) # Send a dummy batch to warm the caching allocator dummy_batch = task.dataset('train').get_dummy_batch( args.max_tokens, max_positions) trainer.dummy_train_step(dummy_batch) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_subsets = args.valid_subset.split(',') while lr > args.min_lr and epoch_itr.epoch <= max_epoch and trainer.get_num_updates( ) < max_update: # train for one epoch train(args, trainer, task, epoch_itr) if epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
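# Note on the `args.max_epoch or math.inf` idiom used in several of these
# loops: `or` falls through on falsy values, so both None and 0 mean
# "unlimited" rather than "train zero epochs".
import math

for max_epoch in (None, 0, 5):
    print(max_epoch, '->', max_epoch or math.inf)
# None -> inf, 0 -> inf, 5 -> 5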
def main(args, init_distributed=False): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' metrics.reset() # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) # Print args logger.info(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=False, epoch=1) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) logger.info(model) logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) logger.info('num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # (optionally) Configure quantization if args.quantization_config_path is not None: quantizer = quantization_utils.Quantizer( config_path=args.quantization_config_path, max_epoch=args.max_epoch, max_update=args.max_update, ) else: quantizer = None # Build trainer if args.model_parallel_size == 1: trainer = Trainer(args, task, model, criterion, quantizer) else: trainer = MegatronTrainer(args, task, model, criterion) logger.info('training on {} GPUs'.format(args.distributed_world_size)) logger.info( 'max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = meters.StopwatchMeter() train_meter.start() while (lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch): # train for one epoch valid_losses = train(args, trainer, task, epoch_itr, max_update) if should_stop_early( args, valid_losses[0]) or trainer.get_num_updates() >= max_update: break # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) epoch_itr = trainer.get_train_iterator( epoch_itr.next_epoch_idx, # sharded data: get train iterator for next epoch load_dataset=(os.pathsep in getattr(args, 'data', '')), ) train_meter.stop() logger.info('done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) # Print args print(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=False, epoch=0) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print(model) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Build trainer trainer = Trainer(args, task, model, criterion) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_subsets = args.valid_subset.split(',') while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates( ) < max_update: # train for one epoch train(args, trainer, task, epoch_itr) if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) else: valid_losses = [None] # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) reload_dataset = ':' in getattr(args, 'data', '') # sharded data: get train iterator for next epoch epoch_itr = trainer.get_train_iterator(epoch_itr.epoch, load_dataset=reload_dataset) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
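# The reload_dataset check above keys off ':'-separated data directories
# (fairseq's convention for sharded datasets): when several shards are listed,
# the train split is reloaded, pointing at the next shard, before each epoch.
# The paths below are illustrative only.
args_data = 'data-bin/shard0:data-bin/shard1:data-bin/shard2'
reload_dataset = ':' in args_data
print(reload_dataset)  # True -> trainer.get_train_iterator(..., load_dataset=True)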
def main(args, init_distributed=False): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) # Print args logger.info(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=False, epoch=0) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) logger.info(model) logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) logger.info('num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Build trainer trainer = Trainer(args, task, model, criterion) logger.info('training on {} GPUs'.format(args.distributed_world_size)) logger.info( 'max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) # Evaluate on the validation subsets and report BLEU/ROUGE train_meter = StopwatchMeter() train_meter.start() valid_subsets = args.valid_subset.split(',') tokenize = sacrebleu.DEFAULT_TOKENIZER if not args.eval_tokenized_bleu else 'none' hyps, refs = validate(args, trainer, task, epoch_itr, valid_subsets) for h, r, split in zip(hyps, refs, args.valid_subset.split(',')): assert len(h) == len(r) sacrebleu_score = sacrebleu.corpus_bleu(h, [r], tokenize=tokenize) bleu = compute_cvpr_bleu(h, r) rouge_score = rouge.rouge(h, r) print('{} set has {} samples,\n' 'sacrebleu: {},\n' 'CVPR BLEU scripts: {}\n' 'CVPR ROUGE: {}'.format(split, len(h), sacrebleu_score, bleu, rouge_score)) print('performance: {:.2f} {}'.format( rouge_score['rouge_l/f_score'] * 100, ' '.join([str(b) for b in bleu])))
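# Hedged sketch: corpus-level BLEU for one split with the standard sacrebleu
# API. hyps/refs are lists of detokenized strings (the ones below are
# illustrative); the tokenizer choice mirrors the `tokenize` variable above,
# and the returned object exposes a numeric .score alongside its string form.
import sacrebleu

hyps = ['the cat sat on the mat', 'a dog barks']
refs = ['the cat sat on a mat', 'the dog barks']
result = sacrebleu.corpus_bleu(hyps, [refs], tokenize='13a')
print(result.score)  # corpus BLEU as a float
print(result)        # full signature-style string, as printed in the summary above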