def __init__(self, args):
    self.args = args
    args.wait_k = args.wait_k if args.wait_k is not None else args.sim_mt_target_k

    # necessary setup and check
    utils.import_user_module(args)
    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if args.distributed_world_size > 1:
        args.distributed_rank = distributed_utils.distributed_init(args)

    self.max_epoch = args.max_epoch or math.inf
    self.max_update = args.max_update or math.inf

    self.task = None
    self.model = None
    self.trainer = None
    self.epoch_itr = None
    self.itr = None
    self.extra_meters = None
    self._done = True

    # recorded for state repr
    self.data = None
    self.last_loss = None
    self._state = None

    self.historical_train_loss = [
        [] for _ in range(args.sim_mt_sample_k_min, args.sim_mt_sample_k_max + 1)
    ]
    self.historical_valid_loss = []
    self.historical_last_epoch_valid_loss = []
    self.k_coverage_num = [0 for _ in range(self.action_size)]
    self.k_coverage_num_current_epoch = [0 for _ in range(self.action_size)]
def multi_process_train(
    device_id: int,
    args,
    output_queue: Optional[mp_queues.Queue],
    start_rank: int = 0,
    init_fn: Optional[Callable[[], None]] = None,
    trainer_class=None,
    train_step_kwargs=None,
):
    # Enable faulthandler for better Python tracebacks upon segfaults under
    # multiprocessing. Without this, the stack trace only shows the
    # SpawnContext.join() call, rather than the actual line where the child
    # process segfaulted.
    faulthandler.enable(all_threads=True)
    if init_fn:
        init_fn()
    args.device_id = device_id
    args.distributed_rank = start_rank + device_id
    torch.cuda.set_device(args.device_id)
    if args.distributed_world_size > 1:
        args.distributed_rank = distributed_utils.distributed_init(args)
    extra_state, trainer, task, epoch_itr = setup_training(args, trainer_class)
    train(
        args=args,
        extra_state=extra_state,
        trainer=trainer,
        task=task,
        epoch_itr=epoch_itr,
        output_queue=output_queue,
        **train_step_kwargs,
    )
def multi_process_train(
    args,
    error_queue: mp_queues.SimpleQueue,
    output_queue: Optional[mp_queues.Queue],
    init_fn: Optional[Callable[[], None]] = None,
):
    try:
        if init_fn:
            init_fn()
        torch.cuda.set_device(args.device_id)
        if args.distributed_world_size > 1:
            args.distributed_rank = distributed_utils.distributed_init(args)
        extra_state, trainer, task, epoch_itr = setup_training(args)
        train(
            args=args,
            extra_state=extra_state,
            trainer=trainer,
            task=task,
            epoch_itr=epoch_itr,
            output_queue=output_queue,
        )
    except KeyboardInterrupt:
        pass  # killed by parent, do nothing
    except Exception:
        # propagate exception to parent process, keeping original traceback
        import traceback
        error_queue.put((args.distributed_rank, traceback.format_exc()))
def multi_process_train(
    device_id: int,
    args,
    output_queue: Optional[mp_queues.Queue],
    start_rank: int = 0,
    init_fn: Optional[Callable[[], None]] = None,
    trainer_class=None,
    train_step_kwargs=None,
):
    if init_fn:
        init_fn()
    args.device_id = device_id
    args.distributed_rank = start_rank + device_id
    torch.cuda.set_device(args.device_id)
    if args.distributed_world_size > 1:
        args.distributed_rank = distributed_utils.distributed_init(args)
    extra_state, trainer, task, epoch_itr = setup_training(args, trainer_class)
    train(
        args=args,
        extra_state=extra_state,
        trainer=trainer,
        task=task,
        epoch_itr=epoch_itr,
        output_queue=output_queue,
        **train_step_kwargs,
    )
def main(args):
    if args.distributed_init_method is None and args.distributed_port > 0:
        # We can determine the init method automatically for Slurm.
        node_list = os.environ.get('SLURM_JOB_NODELIST')
        if node_list is not None:
            try:
                hostnames = subprocess.check_output(
                    ['scontrol', 'show', 'hostnames', node_list])
                args.distributed_init_method = 'tcp://{host}:{port}'.format(
                    host=hostnames.split()[0].decode('utf-8'),
                    port=args.distributed_port)
                args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
                args.device_id = int(os.environ.get('SLURM_LOCALID'))
            except subprocess.CalledProcessError as e:  # scontrol failed
                raise e
            except FileNotFoundError:  # Slurm is not installed
                pass
    if args.distributed_init_method is None:
        raise ValueError('--distributed-init-method or --distributed-port '
                         'must be specified for distributed training')

    args.distributed_rank = distributed_utils.distributed_init(args)
    args.device_id = args.local_rank
    print('| initialized host {} as rank {} and device id {}'.format(
        socket.gethostname(), args.distributed_rank, args.device_id))
    single_process_main(args)
def main(args, init_distributed=False):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens, args.max_sentences))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
        valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
    else:
        valid_losses = [None]
def distributed_main(i, args):
    import socket
    args.device_id = i
    if args.distributed_rank is None:  # torch.multiprocessing.spawn
        args.distributed_rank = i
    args.distributed_rank = distributed_utils.distributed_init(args)
    print('| initialized host {} as rank {}'.format(socket.gethostname(), args.distributed_rank))
    main(args)
def setup_model_loss_criterion(args, rank, is_cuda):
    """
    setup model, criterion and optimizer based on input args
    """
    args.distributed_rank = rank
    distributed_utils.distributed_init(args)
    torch.manual_seed(1)
    model = Model(args.input_size, args.nb_classes)
    loss_fn = nn.CrossEntropyLoss()
    if is_cuda:
        model = model.cuda()
        loss_fn = loss_fn.cuda()
    optimizer = optim.sgd.SGD(args, model.parameters())
    optimizer = optim.FairseqBMUF(args, optimizer)

    return model, loss_fn, optimizer
def run(args, error_queue):
    try:
        args.distributed_rank = distributed_utils.distributed_init(args)
        single_process_main(args)
    except KeyboardInterrupt:
        pass  # killed by parent, do nothing
    except Exception:
        # propagate exception to parent process, keeping original traceback
        import traceback
        error_queue.put((args.distributed_rank, traceback.format_exc()))
def DistributedFairseqModel(args, model):
    """
    Wrap a *model* to support distributed data parallel training.

    This is similar to the built-in DistributedDataParallel, but allows
    additional configuration of the DistributedDataParallel class to use,
    and also provides easier access to the wrapped model by forwarding
    requests for missing attributes to the wrapped model.

    Args:
        args (argparse.Namespace): fairseq args
        model (BaseFairseqModel): model to wrap
    """
    # rendezvous with other workers
    args.distributed_rank = distributed_utils.distributed_init(args)
    print('| initialized host {} as rank {}'.format(socket.gethostname(), args.distributed_rank))

    # determine which DDP class to extend
    assert isinstance(model, BaseFairseqModel)
    if args.ddp_backend == 'c10d':
        ddp_class = parallel.DistributedDataParallel
        init_kwargs = dict(
            module=model,
            device_ids=[args.device_id],
            output_device=args.device_id,
            broadcast_buffers=False,
            bucket_cap_mb=args.bucket_cap_mb,
        )
        # Maintain backward compatibility for 0.4 or earlier
        if 'check_reduction' in inspect.getargspec(ddp_class)[0]:
            init_kwargs['check_reduction'] = True
    elif args.ddp_backend == 'no_c10d':
        ddp_class = LegacyDistributedDataParallel
        init_kwargs = dict(
            module=model,
            world_size=args.distributed_world_size,
            buffer_size=2**28,
        )
    else:
        raise ValueError('Unknown --ddp-backend: ' + args.ddp_backend)

    class _DistributedFairseqModel(ddp_class):
        """Extend DistributedDataParallel to check for missing attributes in
        the wrapped module."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

        def __getattr__(self, name):
            wrapped_module = super().__getattr__('module')
            if hasattr(wrapped_module, name):
                return getattr(wrapped_module, name)
            return super().__getattr__(name)

    return _DistributedFairseqModel(**init_kwargs)
def run(args, error_queue):
    try:
        args.distributed_rank = distributed_utils.distributed_init(args)
        print('| initialized host {} as rank {}'.format(socket.gethostname(), args.distributed_rank))
        single_process_main(args)
    except KeyboardInterrupt:
        pass  # killed by parent, do nothing
    except Exception:
        # propagate exception to parent process, keeping original traceback
        import traceback
        error_queue.put((args.distributed_rank, traceback.format_exc()))
def run(args, error_queue):
    try:
        args.distributed_rank = distributed_utils.distributed_init(args)
        print(
            "| initialized host {} as rank {}".format(socket.gethostname(), args.distributed_rank)
        )
        single_process_main(args)
    except KeyboardInterrupt:
        pass  # killed by parent, do nothing
    except Exception:
        # propagate exception to parent process, keeping original traceback
        import traceback
        error_queue.put((args.distributed_rank, traceback.format_exc()))
def multi_process_train(
    device_id: int,
    args,
    output_queue: Optional[mp_queues.Queue],
    start_rank: int = 0,
    init_fn: Optional[Callable[[], None]] = None,
    trainer_class=None,
    train_step_kwargs=None,
):
    # Enable faulthandler for better Python tracebacks upon segfaults under
    # multiprocessing. Without this, the stack trace only shows the
    # SpawnContext.join() call, rather than the actual line where the child
    # process segfaulted.
    faulthandler.enable(all_threads=True)
    if init_fn:
        init_fn()
    args.device_id = device_id
    args.distributed_rank = start_rank + device_id
    if args.distributed_world_size > 1:
        args.distributed_rank = distributed_utils.distributed_init(args)
    if torch.cuda.is_available():
        torch.cuda.set_device(args.device_id)
    trainer, task, epoch_itr = setup_training(args, trainer_class)

    # Distributed_init does initialization and works as a barrier.
    # Therefore, any expensive data preprocessing should happen before.
    extra_state, epoch_itr, checkpoint_manager = setup_training_state(
        args=args, trainer=trainer, task=task, epoch_itr=epoch_itr
    )

    # Replay previous training progress so the output_queue contains all
    # previous training progress even when we resume training from an existing
    # checkpoint.
    if distributed_utils.is_master(args) and output_queue is not None:
        for progress_output in extra_state["training_progress"]:
            output_queue.put_nowait(progress_output)

    train(
        args=args,
        extra_state=extra_state,
        trainer=trainer,
        task=task,
        epoch_itr=epoch_itr,
        checkpoint_manager=checkpoint_manager,
        output_queue=output_queue,
        **train_step_kwargs,
    )
def multi_process_train(
    device_id: int,
    args,
    output_queue: Optional[mp_queues.Queue],
    start_rank: int = 0,
    init_fn: Optional[Callable[[], None]] = None,
    trainer_class=None,
    train_step_kwargs=None,
):
    # Enable faulthandler for better Python tracebacks upon segfaults under
    # multiprocessing. Without this, the stack trace only shows the
    # SpawnContext.join() call, rather than the actual line where the child
    # process segfaulted.
    faulthandler.enable(all_threads=True)
    if init_fn:
        init_fn()
    args.device_id = device_id
    args.distributed_rank = start_rank + device_id
    torch.cuda.set_device(args.device_id)
    trainer, task, epoch_itr = setup_training(args, trainer_class)

    # Distributed_init does initialization and works as a barrier.
    # Therefore, any expensive data preprocessing should happen before.
    if args.distributed_world_size > 1:
        args.distributed_rank = distributed_utils.distributed_init(args)

    extra_state = setup_training_state(args, trainer, task)
    epoch = extra_state["epoch"]
    if extra_state["batch_offset"] == 0:
        epoch -= 1  # this will be incremented when we call epoch_itr.next_epoch_itr()
    epoch_itr.load_state_dict({
        "epoch": epoch,
        "iterations_in_epoch": extra_state["batch_offset"],
    })

    train(
        args=args,
        extra_state=extra_state,
        trainer=trainer,
        task=task,
        epoch_itr=epoch_itr,
        output_queue=output_queue,
        **train_step_kwargs,
    )
def main(args):
    if args.distributed_init_method is None and args.distributed_port > 0:
        # We can determine the init method automatically for Slurm.
        node_list = os.environ.get('SLURM_JOB_NODELIST')
        if node_list is not None:
            try:
                hostnames = subprocess.check_output(['scontrol', 'show', 'hostnames', node_list])
                args.distributed_init_method = 'tcp://{host}:{port}'.format(
                    host=hostnames.split()[0].decode('utf-8'),
                    port=args.distributed_port)
                args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
                args.device_id = int(os.environ.get('SLURM_LOCALID'))
            except subprocess.CalledProcessError as e:  # scontrol failed
                raise e
            except FileNotFoundError:  # Slurm is not installed
                pass
    if args.distributed_init_method is None:
        raise ValueError('--distributed-init-method or --distributed-port '
                         'must be specified for distributed training')

    args.distributed_rank = distributed_utils.distributed_init(args)
    print('| initialized host {} as rank {}'.format(socket.gethostname(), args.distributed_rank))
    single_process_main(args)
def main(args, override_args, init_distributed=False): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) # Print args print(args) use_fp16 = args.fp16 use_cuda = torch.cuda.is_available() and not args.cpu if override_args is not None: overrides = vars(override_args) overrides.update(eval(getattr(override_args, 'model_overrides', '{}'))) else: overrides = None # Load ensemble print('| loading model(s) from {}'.format(args.path)) models, model_args, task = checkpoint_utils.load_model_ensemble_and_task( [args.path], arg_overrides=overrides, ) model = models[0] # Move models to GPU for model in models: if use_fp16: model.half() if use_cuda: model.cuda() # Print args print(model_args) # Build criterion criterion = task.build_criterion(model_args) criterion.eval() # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=False, epoch=0) # Build trainer trainer = validator(args, task, model, criterion) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator valid_subsets = args.valid_subset.split(',') valid_losses = validate(args, trainer, task, valid_subsets)
def main(args, init_distributed=False): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' metrics.reset() # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) # Print args logger.info(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=False, epoch=1) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) logger.info(model) logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) logger.info('num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Build trainer if args.model_parallel_size == 1: trainer = Trainer(args, task, model, criterion) else: trainer = MegatronTrainer(args, task, model, criterion) logger.info('training on {} GPUs'.format(args.distributed_world_size)) logger.info( 'max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = meters.StopwatchMeter() train_meter.start() while (lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch): #print("===========================", epoch_itr.next_epoch_idx) # train for one epoch valid_losses = train(args, trainer, task, epoch_itr, max_update) if should_stop_early( args, valid_losses[0]) or trainer.get_num_updates() >= max_update: break # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) epoch_itr = trainer.get_train_iterator( epoch_itr.next_epoch_idx, # sharded data: get train iterator for next epoch load_dataset=(os.pathsep in getattr(args, 'data', '')), ) train_meter.stop() logger.info('done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) # Print args logger.info(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=False, epoch=1) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) logger.info(model) logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) logger.info('num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Build trainer trainer = Trainer(args, task, model, criterion) logger.info('training on {} GPUs'.format(args.distributed_world_size)) logger.info( 'max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = meters.StopwatchMeter() train_meter.start() valid_subsets = args.valid_subset.split(',') if args.eval_mode != 'none': start_val_time = time.time() with torch.no_grad(): if args.eval_mode != 'entropy': _ = validate(args, trainer, task, epoch_itr, valid_subsets, args.prune_num) print('elapsed time (seconds): {}'.format(time.time() - start_val_time)) _ = validate_iw(args, trainer, task, epoch_itr, valid_subsets, args.prune_num, mode=args.eval_mode) return while (lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch and trainer.get_num_updates() < max_update): # train for one epoch train(args, trainer, task, epoch_itr) if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) else: valid_losses = [None] # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) # early stop if should_stop_early(args, valid_losses[0]): logger.info( 'early stop since valid performance hasn\'t improved for last {} runs' .format(args.patience)) break epoch_itr = trainer.get_train_iterator( epoch_itr.next_epoch_idx, # sharded data: get train iterator for next epoch load_dataset=(os.pathsep in getattr(args, 'data', '')), ) logger.info('done training in {:.1f} seconds'.format(train_meter.sum)) # _ = validate_iw(args, trainer, task, epoch_itr, valid_subsets) train_meter.stop()
def main(args, init_distributed=False):
    utils.import_user_module(args)

    try:
        from fairseq.fb_pathmgr import fb_pathmgr
        global fb_pathmgr_registerd
        if not fb_pathmgr_registerd:
            fb_pathmgr.register()
            fb_pathmgr_registerd = True
    except (ModuleNotFoundError, ImportError):
        pass

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Freeze params that are unused for finetuning; this is ad-hoc for finetuning
    # and should be turned off for BERT pretraining.
    for n, p in model.named_parameters():
        if "lm_head" in n:
            p.requires_grad = False

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens, args.max_sentences))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')
    if not hasattr(checkpoint_utils.save_checkpoint, 'not_best'):
        checkpoint_utils.save_checkpoint.not_best = 0

    while epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        print('Start training')
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
            if args.early_stop > 0:
                if hasattr(checkpoint_utils.save_checkpoint, 'best') \
                        and valid_losses[0] > checkpoint_utils.save_checkpoint.best:
                    checkpoint_utils.save_checkpoint.not_best += 1
                    print("| Not the best ckpt... not best:", checkpoint_utils.save_checkpoint.not_best)
                    if checkpoint_utils.save_checkpoint.not_best > args.early_stop:
                        print("| Early stop...")
                        break
                else:
                    checkpoint_utils.save_checkpoint.not_best = 0
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        # sharded data: get train iterator for next epoch
        reload_dataset = ':' in getattr(args, 'data', '')
        epoch_itr = trainer.get_train_iterator(epoch_itr.epoch, load_dataset=reload_dataset)
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) # Print args print(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits task.load_dataset(args.train_subset, combine=True, epoch=0) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=True, epoch=0) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print(model) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Build trainer trainer = Trainer(args, task, model, criterion) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) max_positions = utils.resolve_max_positions( task.max_positions(), model.max_positions(), ) # Initialize dataloader epoch_itr = task.get_batch_iterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=args.required_batch_size_multiple, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, num_workers=args.num_workers, ) # Load the latest checkpoint if one is available load_checkpoint(args, trainer, epoch_itr, max_positions, task) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_subsets = args.valid_subset.split(',') while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates( ) < max_update: # train for one epoch train(args, trainer, task, epoch_itr) if epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) epoch_itr = reload_train(args, epoch_itr, max_positions, task) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, config=None, init_distributed=False): utils.import_user_module(args) experiment = None if config: experiment = ExistingExperiment( api_key=config["api_key"], previous_experiment=config["experiment_key"], auto_output_logging=None, ) assert ( args.max_tokens is not None or args.max_sentences is not None ), "Must specify batch size either with --max-tokens or --max-sentences" # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) print(args) if experiment: experiment.log_parameters(vars(args), prefix="Device {} :: ".format( args.device_id)) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(","): task.load_dataset(valid_sub_split, combine=False, epoch=0) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print(model) print("| model {}, criterion {}".format(args.arch, criterion.__class__.__name__)) print("| num. model params: {} (num. trained: {})".format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) if experiment: experiment.log_parameters( { "criterion": criterion.__class__.__name__, "num. model params": sum(p.numel() for p in model.parameters()), "num. trained params": sum(p.numel() for p in model.parameters() if p.requires_grad), }, prefix="Device {} :: ".format(args.device_id), ) # Build trainer trainer = Trainer(args, task, model, criterion) print("| training on {} GPUs".format(args.distributed_world_size)) print("| max tokens per GPU = {} and max sentences per GPU = {}".format( args.max_tokens, args.max_sentences)) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_subsets = args.valid_subset.split(",") while (lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update): # train for one epoch train(args, trainer, task, epoch_itr, experiment) if (not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0): valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets, experiment) else: valid_losses = [None] # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) reload_dataset = ":" in getattr(args, "data", "") # sharded data: get train iterator for next epoch epoch_itr = trainer.get_train_iterator(epoch_itr.epoch, load_dataset=reload_dataset) train_meter.stop() print("| done training in {:.1f} seconds".format(train_meter.sum)) if experiment: experiment.log_metrics( { "valid_loss": valid_losses[0], "lr": lr }, prefix="Device {} ".format(args.device_id), )
    print('| initialized host {} as rank {}'.format(socket.gethostname(), args.distributed_rank))
    main(args)


if __name__ == '__main__':
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
        args.distributed_rank = distributed_utils.distributed_init(args)
        main(args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        print('''| NOTE: you may get better performance with:

    python -m torch.distributed.launch --nproc_per_node {ngpu} train.py {no_c10d}(...)
'''.format(
            ngpu=args.distributed_world_size,
            no_c10d=('--ddp-backend=no_c10d '
                     if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d'
                     else ''),
        ))
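        # The snippet above is truncated here. As a hedged sketch only (this is
        # the usual continuation in stock fairseq train.py, not necessarily this
        # fork's exact code), the single-node multi-GPU fallback would go on to
        # spawn one worker per GPU, and a final else-branch would run single-GPU
        # training; torch.multiprocessing.spawn(fn, args, nprocs) is the real
        # PyTorch API used for this.
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training (assumed continuation, see note above)
        main(args)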
def main(args, init_distributed=False): utils.import_user_module(args) assert args.max_tokens is not None or args.max_sentences is not None, \ 'Must specify batch size either with --max-tokens or --max-sentences' # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) np.random.seed(args.seed) torch.manual_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) # Print args logger.info(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(','): task.load_dataset(valid_sub_split, combine=False, epoch=0) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) logger.info(model) logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) logger.info('num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Build trainer trainer = Trainer(args, task, model, criterion) logger.info('training on {} GPUs'.format(args.distributed_world_size)) logger.info('max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_subsets = args.valid_subset.split(',') print(args.multi_views) while ( lr > args.min_lr and ( epoch_itr.epoch < max_epoch # allow resuming training from the final checkpoint or epoch_itr._next_epoch_itr is not None ) and trainer.get_num_updates() < max_update ): # train for one epoch train(args, trainer, task, epoch_itr) if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) else: valid_losses = [None] # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) bart = BARTHubInterface(args, task, trainer.model).cuda() #print(bart.device) bart.eval() count = 1 bsz = 8 print("Test on val set: ") with open('../data/val_sent_trans_cons_label.source') as source, open('../data/val_sent_c99_label.source') as source2, open('./val_best_multi_attn_'+str(args.lr_weight)+'_.hypo', 'wt', encoding='utf-8') as fout: s1 = source.readlines() s2 = source2.readlines() slines = [s1[0].strip()] slines2 = [s2[0].strip()] for i in tqdm(range(1, len(s1))): if count % bsz == 0: with torch.no_grad(): if args.multi_views: hypotheses_batch = bart.sample(slines, sentences2 = slines2, balance = True, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) else: hypotheses_batch = bart.sample(slines, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) for hypothesis in hypotheses_batch: fout.write(hypothesis + '\n') fout.flush() slines = [] slines2 = [] slines.append(s1[i].strip()) slines2.append(s2[i].strip()) count += 1 if slines != []: if args.multi_views: hypotheses_batch = 
bart.sample(slines, sentences2 = slines2, balance = True, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) else: hypotheses_batch = bart.sample(slines, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) #hypotheses_batch = bart.sample(slines, sentences2 = slines2, balance = True, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) for hypothesis in hypotheses_batch: fout.write(hypothesis + '\n') fout.flush() hyp_path = './val_best_multi_attn_'+str(args.lr_weight)+'_.hypo' ref_path = '../data/val_sent_trans_cons_label.target' hypothesis = [] with open(hyp_path, 'r') as f: lines = f.readlines() for l in lines: hypothesis.append(l[:-1]) reference = [] with open(ref_path, 'r') as f: lines = f.readlines() for l in lines: reference.append(l[:-1]) rouge = Rouge() print("Val", rouge.get_scores(hypothesis, reference, avg = True)) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) print("Test on testing set: ") count = 1 bsz = 8 with open('../data/test_sent_trans_cons_label.source') as source, open('../data/test_sent_c99_label.source') as source2, open('./test_best_multi_attn_'+str(args.lr_weight)+'_.hypo', 'wt', encoding='utf-8') as fout: s1 = source.readlines() s2 = source2.readlines() slines = [s1[0].strip()] slines2 = [s2[0].strip()] for i in tqdm(range(1, len(s1))): if count % bsz == 0: with torch.no_grad(): if args.multi_views: hypotheses_batch = bart.sample(slines, sentences2 = slines2, balance = True, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) else: hypotheses_batch = bart.sample(slines, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) for hypothesis in hypotheses_batch: fout.write(hypothesis + '\n') fout.flush() slines = [] slines2 = [] slines.append(s1[i].strip()) slines2.append(s2[i].strip()) count += 1 if slines != []: if args.multi_views: hypotheses_batch = bart.sample(slines, sentences2 = slines2, balance = True, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) else: hypotheses_batch = bart.sample(slines, beam=4, lenpen=2.0, max_len_b=100, min_len=5, no_repeat_ngram_size=3) for hypothesis in hypotheses_batch: fout.write(hypothesis + '\n') fout.flush() hyp_path = './test_best_multi_attn_'+str(args.lr_weight)+'_.hypo' ref_path = '../data/test_sent_trans_cons_label.target' hypothesis = [] with open(hyp_path, 'r') as f: lines = f.readlines() for l in lines: hypothesis.append(l[:-1]) reference = [] with open(ref_path, 'r') as f: lines = f.readlines() for l in lines: reference.append(l[:-1]) rouge = Rouge() print('Test', rouge.get_scores(hypothesis, reference, avg = True)) # early stop if should_stop_early(args, valid_losses[0]): logger.info('early stop since valid performance hasn\'t improved for last {} runs'.format(args.patience)) break epoch_itr = trainer.get_train_iterator( epoch_itr.epoch, # sharded data: get train iterator for next epoch load_dataset=(os.pathsep in getattr(args, 'data', '')), ) train_meter.stop() logger.info('done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False): import_user_module(args) if args.max_tokens is None: args.max_tokens = 6000 print(args) if torch.cuda.is_available() and not args.cpu: torch.cuda.set_device(args.device_id) torch.manual_seed(args.seed) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load dataset splits load_dataset_splits(args, task) ###get [APPEND] [SRC] [TGT] [SEP] symbol id args.unk_idx = task.src_dict.indices['<unk>'] args.dict_len = task.src_dict.indices.__len__() if '[APPEND]' in task.src_dict.indices.keys(): args.APPEND_ID = task.src_dict.indices['[APPEND]'] print("[APPEND] ID: {}".format(args.APPEND_ID)) else: args.APPEND_ID = -1 if '[SRC]' in task.src_dict.indices.keys(): args.SRC_ID = task.src_dict.indices['[SRC]'] print("[SRC] ID: {}".format(args.SRC_ID)) else: args.SRC_ID = -1 if '[TGT]' in task.src_dict.indices.keys(): args.TGT_ID = task.src_dict.indices['[TGT]'] print("[TGT] ID: {}".format(args.TGT_ID)) else: args.TGT_ID = -1 if '[SEP]' in task.src_dict.indices.keys(): args.SEP_ID = task.src_dict.indices['[SEP]'] print("[SEP] ID: {}".format(args.SEP_ID)) else: args.SEP_ID = -1 if '</s>' in task.src_dict.indices.keys(): args.EOS_ID = task.src_dict.indices['</s>'] else: args.EOD_ID = -1 if '<pad>' in task.src_dict.indices.keys(): args.PAD_ID = task.src_dict.indices['<pad>'] else: args.PAD_ID = -1 # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) print(model) print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__)) print('| num. model params: {} (num. trained: {})'.format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # Make a dummy batch to (i) warm the caching allocator and (ii) as a # placeholder DistributedDataParallel when there's an uneven number of # batches per worker. 
max_positions = utils.resolve_max_positions( task.max_positions(), model.max_positions(), ) dummy_batch = task.dataset(args.train_subset).get_dummy_batch( args.max_tokens, max_positions) oom_batch = task.dataset(args.train_subset).get_dummy_batch( 1, max_positions) # Build trainer print("Building trainer...") trainer = Trainer(args, task, model, criterion, dummy_batch, oom_batch) print('| training on {} GPUs'.format(args.distributed_world_size)) print('| max tokens per GPU = {} and max sentences per GPU = {}'.format( args.max_tokens, args.max_sentences, )) # Initialize dataloader print("Initialize dataloader...") epoch_itr = task.get_batch_iterator( dataset=task.dataset(args.train_subset), max_tokens=args.max_tokens, max_sentences=args.max_sentences, max_positions=max_positions, ignore_invalid_inputs=True, required_batch_size_multiple=args.required_batch_size_multiple, seed=args.seed, num_shards=args.distributed_world_size, shard_id=args.distributed_rank, num_workers=args.num_workers, ) # Initialize distributed training (after data loading) print("Initialize distributed training (after data loading)...") if init_distributed: import socket args.distributed_rank = distributed_utils.distributed_init(args) print('| initialized host {} as rank {}'.format( socket.gethostname(), args.distributed_rank)) model.args = args # Load the latest checkpoint if one is available print("Load the latest checkpoint if one is available...") load_checkpoint(args, trainer, epoch_itr) #trainer.dummy_train_step([dummy_batch]) if args.reset_target_embedding: trainer.init_meters(args) print("reset trainer.meters") # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf max_update = args.max_update or math.inf lr = trainer.get_lr() train_meter = StopwatchMeter() train_meter.start() valid_losses = [None] valid_subsets = args.valid_subset.split(',') if args.distributed_rank == 0: if os.path.basename(args.save_dir) != "": log_file = os.path.join( args.save_dir, "({0})-params.log".format(os.path.basename(args.save_dir))) else: log_file = os.path.join( args.save_dir, "({0})-params.log".format(args.save_dir.split('/')[-2])) # create log file args.log_file = log_file if os.path.exists(log_file): print( "It exists log file {}, add log to the file".format(log_file)) w = open(log_file, "a+", encoding="utf-8") else: print("It does not exists log file {}, create log file".format( log_file)) w = open(log_file, "w", encoding="utf-8") w.write(str(args).replace(", ", ",\n") + "\n") w.write(str(model) + "\n") w.flush() w.close() print("saving params file into{}...".format(log_file)) while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates( ) < max_update: # train for one epoch train(args, trainer, task, epoch_itr) if epoch_itr.epoch % args.validate_interval == 0: valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) # save checkpoint if epoch_itr.epoch % args.save_interval == 0: save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) train_meter.stop() print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False): utils.import_user_module(args) assert ( args.max_tokens is not None or args.max_sentences is not None ), "Must specify batch size either with --max-tokens or --max-sentences" metrics.reset() # Initialize CUDA and distributed training if torch.cuda.is_available() and not args.cpu and not getattr( args, "tpu", False): torch.cuda.set_device(args.device_id) np.random.seed(args.seed) utils.set_torch_seed(args.seed) if init_distributed: args.distributed_rank = distributed_utils.distributed_init(args) if distributed_utils.is_master(args): checkpoint_utils.verify_checkpoint_directory(args.save_dir) # Print args logger.info(args) # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(args) # Load valid dataset (we load training data below, based on the latest checkpoint) for valid_sub_split in args.valid_subset.split(","): task.load_dataset(valid_sub_split, combine=False, epoch=1) # Build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) logger.info(model) logger.info("model {}, criterion {}".format(args.arch, criterion.__class__.__name__)) logger.info("num. model params: {} (num. trained: {})".format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), )) # (optionally) Configure quantization if args.quantization_config_path is not None: quantizer = quantization_utils.Quantizer( config_path=args.quantization_config_path, max_epoch=args.max_epoch, max_update=args.max_update, ) else: quantizer = None # Build trainer if args.model_parallel_size == 1: trainer = Trainer(args, task, model, criterion, quantizer) else: trainer = MegatronTrainer(args, task, model, criterion) logger.info("training on {} devices (GPUs/TPUs)".format( args.distributed_world_size)) logger.info( "max tokens per GPU = {} and max sentences per GPU = {}".format( args.max_tokens, args.max_sentences)) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer) if args.tpu: import torch_xla.core.xla_model as xm xm.rendezvous("load_checkpoint") # wait for all workers xm.mark_step() logger.info('After loading checkpoint....') # Train until the learning rate gets too small max_epoch = args.max_epoch or math.inf lr = trainer.get_lr() train_meter = meters.StopwatchMeter() train_meter.start() while lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch: # train for one epoch valid_losses, should_stop = train(args, trainer, task, epoch_itr) if should_stop: break # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) epoch_itr = trainer.get_train_iterator( epoch_itr.next_epoch_idx, # sharded data: get train iterator for next epoch load_dataset=(os.pathsep in getattr(args, "data", "")), ) train_meter.stop() logger.info("done training in {:.1f} seconds".format(train_meter.sum))
def main(args, init_distributed=False):
    utils.import_user_module(args)
    utils.handle_save_path(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    # if torch.cuda.is_available() and not args.cpu:
    #     torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(f"| Configs: {args}")

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(f"| Model: {args.arch} \n| Criterion: {criterion.__class__.__name__}")

    # Log architecture
    if args.train_subtransformer:
        print(" \n\n\t\tWARNING!!! Training one single SubTransformer\n\n")
        print(f"| SubTransformer Arch: {utils.get_subtransformer_config(args)} \n")
    else:
        print(" \n\n\t\tWARNING!!! Training SuperTransformer\n\n")
        print(f"| SuperTransformer Arch: {model} \n")

    # Log model size
    if args.train_subtransformer:
        print(f"| SubTransformer size (without embedding weights): {model.get_sampled_params_numel(utils.get_subtransformer_config(args))}")
        embed_size = args.decoder_embed_dim_subtransformer * len(task.tgt_dict)
        print(f"| Embedding layer size: {embed_size} \n")
    else:
        model_s = 0
        # If we used model.state_dict, it would add 2 more parameters
        # (encoder.version and decoder.version); they should not be counted.
        for name, param in model.named_parameters():
            if 'embed' not in name:
                model_s += param.numel()
        print(f"| SuperTransformer model size (without embedding weights): {model_s}")
        print(f"| Embedding layer size: {sum(p.numel() for p in model.parameters() if p.requires_grad) - model_s} \n")

    # specify the length of the dummy input for profiling
    # for iwslt, the average length is 23; for wmt, it is 30
    dummy_sentence_length_dict = {'iwslt': 23, 'wmt': 30}
    if 'iwslt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['iwslt']
    elif 'wmt' in args.arch:
        dummy_sentence_length = dummy_sentence_length_dict['wmt']
    else:
        raise NotImplementedError

    dummy_src_tokens = [2] + [7] * (dummy_sentence_length - 1)
    dummy_prev = [7] * (dummy_sentence_length - 1) + [2]

    # profile the overall FLOPs number
    if args.profile_flops:
        import torchprofile
        config_subtransformer = utils.get_subtransformer_config(args)
        model.set_sample_config(config_subtransformer)
        model.profile(mode=True)
        macs = torchprofile.profile_macs(model, args=(
            torch.tensor([dummy_src_tokens], dtype=torch.long),
            torch.tensor([30]),
            torch.tensor([dummy_prev], dtype=torch.long)))
        model.profile(mode=False)
        last_layer_macs = config_subtransformer['decoder']['decoder_embed_dim'] * dummy_sentence_length * len(task.tgt_dict)
        print(f"| Total FLOPs: {macs * 2}")
        print(f"| Last layer FLOPs: {last_layer_macs * 2}")
        print(f"| Total FLOPs without last layer: {(macs - last_layer_macs) * 2} \n")
        exit(0)

    with torch.autograd.set_detect_anomaly(True):
        # Build trainer
        trainer = Trainer(args, task, model, criterion)
        print(f"| Training on {args.distributed_world_size} GPUs")
        # print(f"| Max tokens per GPU = {args.max_tokens} and max sentences per GPU = {args.max_sentences} \n")
        print(f"| Max tokens per GPU = {args.max_tokens} and max sentences per GPU = {None} \n")

        # Measure model latency; the program will exit after profiling latency
        if args.latcpu or args.latgpu:
            utils.measure_latency(args, model, dummy_src_tokens, dummy_prev)
            exit(0)

        # Load the latest checkpoint if one is available and restore the corresponding train iterator
        extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

        # Evaluate the SubTransformer
        if args.validate_subtransformer:
            config = utils.get_subtransformer_config(args)
            trainer.set_sample_config(config)
            valid_loss = validate(args, trainer, task, epoch_itr, ['valid'], 'SubTransformer')
            print(f"| SubTransformer validation loss:{valid_loss}")

        # Loop boundaries
        max_epoch = args.max_epoch or math.inf
        max_update = args.max_update or math.inf
        lr = trainer.get_lr()
        train_meter = StopwatchMeter()
        train_meter.start()
        valid_subsets = args.valid_subset.split(',')

        represent_configs = utils.get_represent_configs(args)

        # Main training loop
        while lr > args.stop_min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
            # train for one epoch
            train(args, trainer, task, epoch_itr)

            if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
                for k, v in represent_configs.items():
                    trainer.set_sample_config(config=v)
                    valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets, sampled_arch_name=k)
            else:
                valid_losses = [None]

            # update the best loss and get current lr; the real lr scheduling is done in trainer.train_step()
            lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

            # save checkpoint at epoch level
            if epoch_itr.epoch % args.save_interval == 0:
                checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        train_meter.stop()
        print('| Done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    logger.info(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    logger.info(model)
    logger.info('model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    logger.info('num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    logger.info('training on {} GPUs'.format(args.distributed_world_size))
    logger.info('max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens, args.max_sentences))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    tokenize = sacrebleu.DEFAULT_TOKENIZER if not args.eval_tokenized_bleu else 'none'
    hyps, refs = validate(args, trainer, task, epoch_itr, valid_subsets)
    for h, r, split in zip(hyps, refs, args.valid_subset.split(',')):
        assert len(h) == len(r)
        sacrebleu_score = sacrebleu.corpus_bleu(h, [r], tokenize=tokenize)
        bleu = compute_cvpr_bleu(h, r)
        rouge_score = rouge.rouge(h, r)
        print('{} set has {} samples,\n'
              'sacrebleu: {},\n'
              'CVPR BLEU scripts: {}\n'
              'CVPR ROUGE: {}'.format(split, len(h), sacrebleu_score, bleu, rouge_score))
        print('performance: {:.2f} {}'.format(
            rouge_score['rouge_l/f_score'] * 100, ' '.join([str(b) for b in bleu])))
def main(args, init_distributed=False):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens, args.max_sentences))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr, filtered_maxpos_indices = checkpoint_utils.load_checkpoint(args, trainer)

    # pretrain data actor
    # only the language actor model can be pretrained
    if args.pretrain_laser and args.pretrain_data_actor and args.data_actor == 'ave':
        # pretrain the agent with LASER score
        # epoch_itr, indices = trainer.get_train_iterator(1)
        path = '/home/wtan12/multiDDS/'
        trainer.pretrain_LASER('en-ps.laser-score', epoch_itr)

    if args.compare_laser:
        epoch_itr, indices = trainer.get_train_iterator(1)
        print('Number of Indices: ', len(indices))
        scores = collections.defaultdict(float)
        # compare with laser label using R^2 Score, only used after model is trained
        # itr = epoch_itr.next_epoch_itr(fix_batches_to_gpus=False, shuffle=False)
        data_actor = trainer.data_actor
        itr = epoch_itr.next_epoch_itr(
            fix_batches_to_gpus=args.fix_batches_to_gpus,
            shuffle=False,
            offset=0,
            datasize=-1,
        )
        for i, sample in enumerate(itr):
            sample = trainer._prepare_sample(sample)
            sample = list(sample.values())[0]
            score = data_actor(sample).cpu().detach().numpy().tolist()
            indices = sample['id'].data.cpu().numpy().ravel().tolist()
            for k, v in zip(indices, score):
                scores[k] = float(v[0])
        scores = sorted(scores.items(), key=lambda x: x[0])
        print('Number of Indices in Scoring file: ', len(scores))
        path = '/home/wtan12/multiDDS/'
        with open(path + 'en-ps.laser-score', 'r') as r:
            data = r.read()
        laser_score = []
        for i, item in enumerate(data.split('\n')):
            laser_score.append(item)
        laser_score.pop()
        r2 = 0.0
        with open(path + 'en-ps.dds_score', 'w') as f:
            for k, v in scores:
                f.write(str(v) + '\n')
                truth = float(laser_score[k])
                r2 += (truth - v)**2
        print('R2 Score compared to LASER file: ', r2)
        return

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    if args.eval_bleu:
        generator = task.build_generator(args)
        args.maximize_best_checkpoint_metric = True
    else:
        generator = None

    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        epoch_itr = train(args, trainer, task, epoch_itr, generator, filtered_maxpos_indices)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets, generator)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        if ':' in getattr(args, 'data', ''):
            # sharded data: get train iterator for next epoch
            epoch_itr = trainer.get_train_iterator(epoch_itr.epoch)[0]
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False):
    import_user_module(args)

    if args.max_tokens is None:
        args.max_tokens = 6000
    print(args)

    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Initialize distributed training (after data loading)
    if init_distributed:
        import socket
        args.distributed_rank = distributed_utils.distributed_init(args)
        print('| initialized host {} as rank {}'.format(
            socket.gethostname(), args.distributed_rank))

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Make a dummy batch to (i) warm the caching allocator and (ii) serve as a
    # placeholder for DistributedDataParallel when there's an uneven number of
    # batches per worker.
    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        model.max_positions(),
    )
    dummy_batch = task.dataset('train').get_dummy_batch(args.max_tokens, max_positions)
    oom_batch = task.dataset('train').get_dummy_batch(1, max_positions)

    # Build trainer
    trainer = Trainer(args, task, model, criterion, dummy_batch, oom_batch)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    epoch_itr = task.get_batch_iterator(
        dataset=task.dataset(args.train_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=args.required_batch_size_multiple,
        seed=args.seed,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        num_workers=args.num_workers,
    )

    # Load the latest checkpoint if one is available
    if not load_checkpoint(args, trainer, epoch_itr):
        trainer.dummy_train_step([dummy_batch])

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args):
    if args.distributed_init_method is None and args.distributed_port > 0:
        # We can determine the init method automatically for Slurm.
        node_list = os.environ.get('SLURM_JOB_NODELIST')
        if node_list is not None:
            try:
                if args.distributed_backend == 'nccl' and args.dist_avg == 'allreduce':
                    hostnames = subprocess.check_output(
                        ['scontrol', 'show', 'hostnames', node_list])
                    args.distributed_init_method = 'tcp://{host}:{port}'.format(
                        host=hostnames.split()[0].decode('utf-8'),
                        port=args.distributed_port)
                    args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
                    args.device_id = int(os.environ.get('SLURM_LOCALID')) % torch.cuda.device_count()
                else:
                    args.master_addr = os.environ['HOSTNAME']
                    if args.distributed_backend == 'mpi':
                        if args.dist_process:
                            args.distributed_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
                            # print(int(os.environ['OMPI_COMM_WORLD_RANK']), int(os.environ['OMPI_COMM_WORLD_NODE_RANK']), os.environ['OMPI_COMM_WORLD_LOCAL_RANK'], int(os.environ['OMPI_UNIVERSE_SIZE']))
                            # args.distributed_rank = int(os.environ['OMPI_COMM_WORLD_NODE_RANK'])
                            args.device_id = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) % torch.cuda.device_count()
                        else:
                            args.distributed_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
                            args.distributed_world_size = int(os.environ['OMPI_UNIVERSE_SIZE'])
                            args.device_id = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
                    else:
                        if args.dist_process:
                            args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
                            args.device_id = int(os.environ.get('SLURM_LOCALID')) % torch.cuda.device_count()
                        else:
                            args.distributed_rank = int(os.environ['SLURM_PROCID'])
                            args.distributed_world_size = int(os.environ['SLURM_NTASKS'])
                            args.device_id = int(os.environ['SLURM_LOCALID'])
                    args.distributed_init_method = 'tcp://{host}:{port}'.format(
                        host=args.master_addr, port=args.distributed_port)
            except subprocess.CalledProcessError as e:
                # scontrol failed
                raise e
            except FileNotFoundError:
                # Slurm is not installed
                pass

    if args.distributed_init_method is None and args.distributed_port is None:
        raise ValueError('--distributed-init-method or --distributed-port '
                         'must be specified for distributed training')

    args.distributed_rank = distributed_utils.distributed_init(args)
    print('| initialized host {} as rank {}'.format(socket.gethostname(),
                                                    args.distributed_rank))
    print('init:', args.distributed_rank, args.distributed_world_size,
          args.distributed_init_method)
    single_process_main(args)
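# --- Illustrative sketch, not part of the original script: how a tcp:// init method
# assembled from Slurm environment variables (as in the function above) is typically
# consumed by torch.distributed. The hostname and port here are placeholders, and the
# init_process_group call is left commented out because it blocks until all ranks join.
import os
import torch.distributed as dist

init_method = 'tcp://{host}:{port}'.format(host='node001', port=12345)
rank = int(os.environ.get('SLURM_PROCID', 0))
world_size = int(os.environ.get('SLURM_NTASKS', 1))
print('would init rank {} of {} via {}'.format(rank, world_size, init_method))
# dist.init_process_group(backend='nccl', init_method=init_method,
#                         world_size=world_size, rank=rank)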
def main(args, init_distributed=False):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')
    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        reload_dataset = ':' in getattr(args, 'data', '')
        # sharded data: get train iterator for next epoch
        epoch_itr = trainer.get_train_iterator(epoch_itr.epoch, load_dataset=reload_dataset)

    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def main(args, init_distributed=False):
    utils.import_user_module(args)

    # ---------- XLM initialization (e.g. xml_model/mlm_tlm_xnli15_1024.pth) ----------
    reloaded = torch.load(args.xml_model_path)
    params = AttrDict(reloaded['params'])
    print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

    # build dictionary / update parameters
    xml_dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'],
                          reloaded['dico_counts'])
    params.n_words = len(xml_dico)
    params.bos_index = xml_dico.index(BOS_WORD)
    params.eos_index = xml_dico.index(EOS_WORD)
    params.pad_index = xml_dico.index(PAD_WORD)
    params.unk_index = xml_dico.index(UNK_WORD)
    params.mask_index = xml_dico.index(MASK_WORD)

    xml_model = TransformerModel(params, xml_dico, True, True)
    xml_model.eval()
    print(xml_model._modules['position_embeddings']._parameters['weight'][0, 0].item())
    xml_model.load_state_dict(reloaded['model'])
    print(xml_model._modules['position_embeddings']._parameters['weight'][0, 0].item())
    # ---------- end of XLM initialization ----------

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split,
                          combine=False,
                          shuffle=False,
                          epoch=0,
                          xml_dico=xml_dico,
                          xml_params=params,
                          task=args.task)

    # Build model and criterion
    predictor, estimator, xml_estmimator = task.build_model(args)
    # xml_estmimator = None
    for n, p in predictor.named_parameters():
        if not n.startswith('estimator'):
            p.requires_grad = False
    criterion = task.build_criterion(args)
    print(predictor)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in predictor.parameters()),
        sum(p.numel() for p in predictor.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, predictor, estimator, criterion,
                      xml_model=xml_model, xml_estmimator=xml_estmimator)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    print(predictor._modules['encoder']._modules['embed_tokens']._parameters['weight'].data[0, 0].item())
    checkpoint_utils.load_predictor_checkpoint(args, trainer)
    print(predictor._modules['encoder']._modules['embed_tokens']._parameters['weight'].data[0, 0].item())
    extra_state, epoch_itr = checkpoint_utils.load_estimator_checkpoint(
        args, trainer, xml_dico=xml_dico, xml_params=params)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]
    valid_subsets = args.valid_subset.split(',')

    if args.evaluate > 0:
        validate(args, trainer, task, epoch_itr, valid_subsets, evaluate=True)
        return

    while lr > args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update:
        # train for one epoch
        train_qe(args, trainer, task, epoch_itr)

        # if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
        if not args.disable_validation:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        # if epoch_itr.epoch % args.save_interval == 0:
        #     checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        if ':' in getattr(args, 'data', ''):
            # sharded data: get train iterator for next epoch
            epoch_itr = trainer.get_train_iterator(epoch_itr.epoch)

    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
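# --- Illustrative sketch, not part of the original script: the "freeze everything except
# the estimator" pattern used above, shown on a small stand-in module. The module and
# prefix names here are assumptions for illustration only.
import torch.nn as nn

toy_model = nn.ModuleDict({
    'encoder': nn.Linear(8, 8),
    'estimator': nn.Linear(8, 1),
})
for name, param in toy_model.named_parameters():
    # keep only parameters whose name starts with 'estimator' trainable
    param.requires_grad = name.startswith('estimator')

print([n for n, p in toy_model.named_parameters() if p.requires_grad])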