def extract_node_models(request):
    model_name, model_function, sample_batch, expected_output = request.param
    directory = os.path.join(TEMP_FOLDER, model_name)
    os.makedirs(directory, exist_ok=True)
    model_path = os.path.join(directory, "model.onnx")
    if not os.path.exists(model_path):
        module = model_function()
        exporter = ModuleExporter(module, directory)
        exporter.export_onnx(sample_batch=sample_batch)
    return os.path.expanduser(model_path), expected_output
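# Illustrative pytest usage of the fixture above (a sketch, not part of the original
# test suite); it assumes extract_node_models is registered with pytest and
# parametrized via ``@pytest.fixture(params=...)`` elsewhere.
import os

def test_extracted_model_file_exists(extract_node_models):
    model_path, _expected_output = extract_node_models
    assert os.path.exists(model_path)          # ONNX file was written by the fixture
    assert model_path.endswith("model.onnx")   # fixture always exports to model.onnx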
def export_model(model, dataloader, output_dir):
    """
    Export a trained model to ONNX

    :param model: trained model
    :param dataloader: dataloader to get sample batch
    :param output_dir: output directory for ONNX model
    """
    exporter = ModuleExporter(model, output_dir=output_dir)
    for _, sample_batch in enumerate(dataloader):
        sample_input = (
            sample_batch["input_ids"],
            sample_batch["attention_mask"],
            sample_batch["token_type_ids"],
        )
        exporter.export_onnx(sample_batch=sample_input, convert_qat=True)
        break
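# Hedged usage sketch for export_model above; the dataset below is a stand-in that
# mimics tokenized BERT-style batches ("input_ids", "attention_mask",
# "token_type_ids") and is not part of the original code.
import torch
from torch.utils.data import DataLoader, Dataset

class _FakeEncodedDataset(Dataset):
    """Emits fixed-shape integer tensors shaped like tokenizer output."""

    def __len__(self):
        return 4

    def __getitem__(self, idx):
        return {
            "input_ids": torch.zeros(128, dtype=torch.long),
            "attention_mask": torch.ones(128, dtype=torch.long),
            "token_type_ids": torch.zeros(128, dtype=torch.long),
        }

# export_model(trained_model, DataLoader(_FakeEncodedDataset(), batch_size=1), "onnx_out")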
def export_to_sparse_onnx(
    model: LightningModule,
    output_dir: str,
    sample_batch: Optional[torch.Tensor] = None,
) -> None:
    """Exports the model to ONNX format."""
    with model._prevent_trainer_and_dataloaders_deepcopy():
        exporter = ModuleExporter(model, output_dir=output_dir)
        sample_batch = (
            sample_batch if sample_batch is not None else model.example_input_array
        )
        if sample_batch is None:
            raise MisconfigurationException(
                "To export the model, a sample batch must be passed via "
                "``SparseMLCallback.export_to_sparse_onnx(model, output_dir, sample_batch=sample_batch)`` "
                "or an ``example_input_array`` property within the LightningModule"
            )
        exporter.export_onnx(sample_batch=sample_batch)
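# Hedged usage sketch for export_to_sparse_onnx above; ``TinyLitModel`` is a
# placeholder LightningModule that sets ``example_input_array`` so no explicit
# sample batch has to be passed. The call is left commented out because the helper
# relies on a private Lightning deepcopy guard that varies across versions.
import torch
from pytorch_lightning import LightningModule

class TinyLitModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(16, 2)
        self.example_input_array = torch.randn(1, 16)

    def forward(self, x):
        return self.layer(x)

# export_to_sparse_onnx(TinyLitModel(), output_dir="lit_onnx_out")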
def save_model_training(
    model: Module,
    optim: Optimizer,
    input_shape: Tuple[int, ...],
    save_name: str,
    save_dir: str,
    epoch: int,
    val_res: Union[ModuleRunResults, None],
    convert_qat: bool = False,
):
    """
    :param model: model architecture
    :param optim: the optimizer used during training
    :param input_shape: a tuple of integers representing the input shape
    :param save_name: name to save the model to
    :param save_dir: directory to save results in
    :param epoch: the epoch this checkpoint corresponds to
    :param val_res: results from the validation run, or None if unavailable
    :param convert_qat: True if the model is to be quantized before saving
    """
    if val_res is not None:
        has_top1 = "top1acc" in val_res.results
        metric_name = "top-1 accuracy" if has_top1 else "val_loss"
        metric = val_res.result_mean(
            "top1acc" if has_top1 else DEFAULT_LOSS_KEY
        ).item()
        print(
            f"Saving model for epoch {epoch} and {metric_name} "
            f"{metric} to {save_dir} for {save_name}"
        )
    else:
        print(f"Saving model for epoch {epoch} to {save_dir} for {save_name}")

    exporter = ModuleExporter(model, save_dir)
    exporter.export_pytorch(optim, epoch, f"{save_name}.pth")
    exporter.export_onnx(
        torch.randn(1, *input_shape),
        f"{save_name}.onnx",
        convert_qat=convert_qat,
    )

    info_path = os.path.join(save_dir, f"{save_name}.txt")
    with open(info_path, "w") as info_file:
        info_lines = [
            f"epoch: {epoch}",
        ]
        if val_res is not None:
            for loss in val_res.results.keys():
                info_lines.append(f"{loss}: {val_res.result_mean(loss).item()}")
        info_file.write("\n".join(info_lines))
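# Hedged usage sketch for save_model_training above; the tiny model and optimizer
# are placeholders, and ``val_res=None`` exercises the epoch-only info file path.
from torch.nn import Linear
from torch.optim import SGD as _SGD

_tiny_model = Linear(8, 2)
_tiny_optim = _SGD(_tiny_model.parameters(), lr=0.1)
# save_model_training(_tiny_model, _tiny_optim, input_shape=(8,), save_name="checkpoint",
#                     save_dir="runs", epoch=0, val_res=None)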
def export(
    args: ExportArgs, model: Module, val_loader: DataLoader, save_dir: str
) -> None:
    """
    Utility method to export the model and sample data

    :param args: an ExportArgs object containing the config for the export task
    :param model: loaded model architecture to export
    :param val_loader: a DataLoader for validation data
    :param save_dir: directory to store the exported model and samples in
    """
    exporter = ModuleExporter(model, save_dir)

    # export PyTorch state dict
    LOGGER.info(f"exporting pytorch in {save_dir}")
    exporter.export_pytorch(
        use_zipfile_serialization_if_available=(
            args.use_zipfile_serialization_if_available
        )
    )

    onnx_exported = False
    for batch, data in tqdm(
        enumerate(val_loader),
        desc="Exporting samples",
        total=args.num_samples if args.num_samples > 1 else 1,
    ):
        if not onnx_exported:
            # export onnx file using first sample for graph freezing
            LOGGER.info(f"exporting onnx in {save_dir}")
            exporter.export_onnx(data[0], opset=args.onnx_opset, convert_qat=True)
            onnx_exported = True
        if args.num_samples > 0:
            exporter.export_samples(
                sample_batches=[data[0]], sample_labels=[data[1]], exp_counter=batch
            )
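# Hedged usage sketch for export() above; ExportArgs is only accessed for
# num_samples, onnx_opset and use_zipfile_serialization_if_available, so a minimal
# stand-in namespace is enough for illustration (not the real ExportArgs class).
from types import SimpleNamespace

_export_args = SimpleNamespace(
    num_samples=2,                                   # export two sample batches
    onnx_opset=11,                                   # assumed opset value
    use_zipfile_serialization_if_available=True,
)
# export(_export_args, model, val_loader, save_dir="export_out")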
def main(args):
    ############################
    # logging and saving setup #
    ############################
    save_dir = os.path.abspath(os.path.expanduser(args.save_dir))

    # get unique model tag, defaults to '{model_name}'
    if not args.model_tag:
        model_tag = args.model.replace("/", ".")
        model_id = model_tag
        model_inc = 0
        while os.path.exists(os.path.join(args.save_dir, model_id)):
            model_inc += 1
            model_id = "{}__{:02d}".format(model_tag, model_inc)
    else:
        model_id = args.model_tag

    save_dir = os.path.join(save_dir, model_id)
    create_dirs(save_dir)
    print("Model id is set to {}".format(model_id))

    ###########################
    # standard training setup #
    ###########################

    # create data loaders
    train_loader, _, _ = _create_imagefolder_dataloader(args, train=True)
    val_loader, num_classes, image_shape = _create_imagefolder_dataloader(
        args, train=False
    )
    dataloaders = {"train": train_loader, "val": val_loader}

    # create model
    model = _get_torchvision_model(
        args.model,
        num_classes,
        args.pretrained,
        args.checkpoint_path,
    )
    print("created model: {}".format(model))

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    print("using device: {}".format(device))

    # create standard SGD optimizer and cross entropy loss function
    criterion = CrossEntropyLoss()
    optimizer = SGD(
        model.parameters(), lr=0.001, momentum=0.9
    )  # lr will be overridden by recipe

    ##########################
    # add sparseml modifiers #
    ##########################
    manager = ScheduledModifierManager.from_yaml(args.recipe_path)
    optimizer = ScheduledOptimizer(
        optimizer,
        model,
        manager,
        steps_per_epoch=len(train_loader),
        loggers=[PythonLogger()],
    )

    ########################
    # torchvision training #
    ########################
    model, val_acc_history = train_model(
        model,
        dataloaders,
        criterion,
        optimizer,
        device,
        num_epochs=manager.max_epochs,
        is_inception="inception" in args.model,
    )

    ########################
    # export trained model #
    ########################
    exporter = ModuleExporter(model, save_dir)
    sample_input = torch.randn(image_shape).unsqueeze(0)  # sample batch for ONNX export
    exporter.export_onnx(sample_input)
    exporter.export_pytorch()
    print("Model ONNX export and PyTorch weights saved to {}".format(save_dir))
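# Hedged invocation sketch for the torchvision main() above; the attribute names
# mirror those accessed directly in main() (model, model_tag, save_dir, recipe_path,
# pretrained, checkpoint_path), while the dataset-related fields consumed by
# _create_imagefolder_dataloader are unknown here and therefore omitted.
from types import SimpleNamespace

_tv_args = SimpleNamespace(
    model="resnet18",            # any torchvision model name
    model_tag="",                # empty -> unique tag derived from the model name
    save_dir="./runs",
    recipe_path="recipe.yaml",   # SparseML recipe with pruning/LR modifiers
    pretrained=True,
    checkpoint_path=None,
)
# main(_tv_args)  # would also need the dataloader-related fields of the real args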
def main(): setup_default_logging() args, args_text = _parse_args() args.prefetcher = not args.no_prefetcher args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 args.device = 'cuda:0' args.world_size = 1 args.rank = 0 # global rank if args.distributed: args.device = 'cuda:%d' % args.local_rank torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() args.rank = torch.distributed.get_rank() _logger.info('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.' % (args.rank, args.world_size)) else: _logger.info('Training with a single process on 1 GPUs.') assert args.rank >= 0 # resolve AMP arguments based on PyTorch / Apex availability use_amp = None if args.amp: # for backwards compat, `--amp` arg tries apex before native amp if has_apex: args.apex_amp = True elif has_native_amp: args.native_amp = True if args.apex_amp and has_apex: use_amp = 'apex' elif args.native_amp and has_native_amp: use_amp = 'native' elif args.apex_amp or args.native_amp: _logger.warning("Neither APEX or native Torch AMP is available, using float32. " "Install NVIDA apex or upgrade to PyTorch 1.6") torch.manual_seed(args.seed + args.rank) #################################################################################### # Start - SparseML optional load weights from SparseZoo #################################################################################### if args.initial_checkpoint == "zoo": # Load checkpoint from base weights associated with given SparseZoo recipe if args.sparseml_recipe.startswith("zoo:"): args.initial_checkpoint = Zoo.download_recipe_base_framework_files( args.sparseml_recipe, extensions=[".pth.tar", ".pth"] )[0] else: raise ValueError( "Attempting to load weights from SparseZoo recipe, but not given a " "SparseZoo recipe stub. When initial-checkpoint is set to 'zoo'. " "sparseml-recipe must start with 'zoo:' and be a SparseZoo model " f"stub. sparseml-recipe was set to {args.sparseml_recipe}" ) elif args.initial_checkpoint.startswith("zoo:"): # Load weights from a SparseZoo model stub zoo_model = Zoo.load_model_from_stub(args.initial_checkpoint) args.initial_checkpoint = zoo_model.download_framework_files(extensions=[".pth"]) #################################################################################### # End - SparseML optional load weights from SparseZoo #################################################################################### model = create_model( args.model, pretrained=args.pretrained, num_classes=args.num_classes, drop_rate=args.drop, drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path drop_path_rate=args.drop_path, drop_block_rate=args.drop_block, global_pool=args.gp, bn_tf=args.bn_tf, bn_momentum=args.bn_momentum, bn_eps=args.bn_eps, scriptable=args.torchscript, checkpoint_path=args.initial_checkpoint) if args.num_classes is None: assert hasattr(model, 'num_classes'), 'Model must have `num_classes` attr if not set on cmd line/config.' 
args.num_classes = model.num_classes # FIXME handle model default vs config num_classes more elegantly if args.local_rank == 0: _logger.info('Model %s created, param count: %d' % (args.model, sum([m.numel() for m in model.parameters()]))) data_config = resolve_data_config(vars(args), model=model, verbose=args.local_rank == 0) # setup augmentation batch splits for contrastive loss or split bn num_aug_splits = 0 if args.aug_splits > 0: assert args.aug_splits > 1, 'A split of 1 makes no sense' num_aug_splits = args.aug_splits # enable split bn (separate bn stats per batch-portion) if args.split_bn: assert num_aug_splits > 1 or args.resplit model = convert_splitbn_model(model, max(num_aug_splits, 2)) # move model to GPU, enable channels last layout if set model.cuda() if args.channels_last: model = model.to(memory_format=torch.channels_last) # setup synchronized BatchNorm for distributed training if args.distributed and args.sync_bn: assert not args.split_bn if has_apex and use_amp != 'native': # Apex SyncBN preferred unless native amp is activated model = convert_syncbn_model(model) else: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) if args.local_rank == 0: _logger.info( 'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using ' 'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.') if args.torchscript: assert not use_amp == 'apex', 'Cannot use APEX AMP with torchscripted model' assert not args.sync_bn, 'Cannot use SyncBatchNorm with torchscripted model' model = torch.jit.script(model) optimizer = create_optimizer(args, model) # setup automatic mixed-precision (AMP) loss scaling and op casting amp_autocast = suppress # do nothing loss_scaler = None if use_amp == 'apex': model, optimizer = amp.initialize(model, optimizer, opt_level='O1') loss_scaler = ApexScaler() if args.local_rank == 0: _logger.info('Using NVIDIA APEX AMP. Training in mixed precision.') elif use_amp == 'native': amp_autocast = torch.cuda.amp.autocast loss_scaler = NativeScaler() if args.local_rank == 0: _logger.info('Using native Torch AMP. Training in mixed precision.') else: if args.local_rank == 0: _logger.info('AMP not enabled. 
Training in float32.') # optionally resume from a checkpoint resume_epoch = None if args.resume: resume_epoch = resume_checkpoint( model, args.resume, optimizer=None if args.no_resume_opt else optimizer, loss_scaler=None if args.no_resume_opt else loss_scaler, log_info=args.local_rank == 0) # setup exponential moving average of model weights, SWA could be used here too model_ema = None if args.model_ema: # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper model_ema = ModelEmaV2( model, decay=args.model_ema_decay, device='cpu' if args.model_ema_force_cpu else None) if args.resume: load_checkpoint(model_ema.module, args.resume, use_ema=True) # setup distributed training if args.distributed: if has_apex and use_amp != 'native': # Apex DDP preferred unless native amp is activated if args.local_rank == 0: _logger.info("Using NVIDIA APEX DistributedDataParallel.") model = ApexDDP(model, delay_allreduce=True) else: if args.local_rank == 0: _logger.info("Using native Torch DistributedDataParallel.") model = NativeDDP(model, device_ids=[args.local_rank]) # can use device str in Torch >= 1.1 # NOTE: EMA model does not need to be wrapped by DDP # setup learning rate schedule and starting epoch lr_scheduler, num_epochs = create_scheduler(args, optimizer) start_epoch = 0 if args.start_epoch is not None: # a specified start_epoch will always override the resume epoch start_epoch = args.start_epoch elif resume_epoch is not None: start_epoch = resume_epoch if lr_scheduler is not None and start_epoch > 0: lr_scheduler.step(start_epoch) # create the train and eval datasets dataset_train = create_dataset( args.dataset, root=args.data_dir, split=args.train_split, is_training=True, batch_size=args.batch_size) dataset_eval = create_dataset( args.dataset, root=args.data_dir, split=args.val_split, is_training=False, batch_size=args.batch_size) # setup mixup / cutmix collate_fn = None mixup_fn = None mixup_active = args.mixup > 0 or args.cutmix > 0. 
or args.cutmix_minmax is not None if mixup_active: mixup_args = dict( mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax, prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode, label_smoothing=args.smoothing, num_classes=args.num_classes) if args.prefetcher: assert not num_aug_splits # collate conflict (need to support deinterleaving in collate mixup) collate_fn = FastCollateMixup(**mixup_args) else: mixup_fn = Mixup(**mixup_args) # wrap dataset in AugMix helper if num_aug_splits > 1: dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits) # create data loaders w/ augmentation pipeiine train_interpolation = args.train_interpolation if args.no_aug or not train_interpolation: train_interpolation = data_config['interpolation'] loader_train = create_loader( dataset_train, input_size=data_config['input_size'], batch_size=args.batch_size, is_training=True, use_prefetcher=args.prefetcher, no_aug=args.no_aug, re_prob=args.reprob, re_mode=args.remode, re_count=args.recount, re_split=args.resplit, scale=args.scale, ratio=args.ratio, hflip=args.hflip, vflip=args.vflip, color_jitter=args.color_jitter, auto_augment=args.aa, num_aug_splits=num_aug_splits, interpolation=train_interpolation, mean=data_config['mean'], std=data_config['std'], num_workers=args.workers, distributed=args.distributed, collate_fn=collate_fn, pin_memory=args.pin_mem, use_multi_epochs_loader=args.use_multi_epochs_loader ) loader_eval = create_loader( dataset_eval, input_size=data_config['input_size'], batch_size=args.validation_batch_size_multiplier * args.batch_size, is_training=False, use_prefetcher=args.prefetcher, interpolation=data_config['interpolation'], mean=data_config['mean'], std=data_config['std'], num_workers=args.workers, distributed=args.distributed, crop_pct=data_config['crop_pct'], pin_memory=args.pin_mem, ) # setup loss function if args.jsd: assert num_aug_splits > 1 # JSD only valid with aug splits set train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits, smoothing=args.smoothing).cuda() elif mixup_active: # smoothing is handled with mixup target transform train_loss_fn = SoftTargetCrossEntropy().cuda() elif args.smoothing: train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).cuda() else: train_loss_fn = nn.CrossEntropyLoss().cuda() validate_loss_fn = nn.CrossEntropyLoss().cuda() # setup checkpoint saver and eval metric tracking eval_metric = args.eval_metric best_metric = None best_epoch = None saver = None output_dir = '' if args.local_rank == 0: output_base = args.output if args.output else './output' exp_name = '-'.join([ datetime.now().strftime("%Y%m%d-%H%M%S"), args.model, str(data_config['input_size'][-1]) ]) output_dir = get_outdir(output_base, 'train', exp_name) decreasing = True if eval_metric == 'loss' else False saver = CheckpointSaver( model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler, checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing, max_history=args.checkpoint_hist) with open(os.path.join(output_dir, 'args.yaml'), 'w') as f: f.write(args_text) #################################################################################### # Start SparseML Integration #################################################################################### sparseml_loggers = ( [PythonLogger(), TensorBoardLogger(log_path=output_dir)] if output_dir else None ) manager = ScheduledModifierManager.from_yaml(args.sparseml_recipe) optimizer = ScheduledOptimizer( 
optimizer, model, manager, steps_per_epoch=len(loader_train), loggers=sparseml_loggers ) # override lr scheduler if recipe makes any LR updates if any("LearningRate" in str(modifier) for modifier in manager.modifiers): _logger.info("Disabling timm LR scheduler, managing LR using SparseML recipe") lr_scheduler = None if manager.max_epochs: _logger.info( f"Overriding max_epochs to {manager.max_epochs} from SparseML recipe" ) num_epochs = manager.max_epochs or num_epochs #################################################################################### # End SparseML Integration #################################################################################### if args.local_rank == 0: _logger.info('Scheduled epochs: {}'.format(num_epochs)) try: for epoch in range(start_epoch, num_epochs): if args.distributed and hasattr(loader_train.sampler, 'set_epoch'): loader_train.sampler.set_epoch(epoch) train_metrics = train_one_epoch( epoch, model, loader_train, optimizer, train_loss_fn, args, lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir, amp_autocast=amp_autocast, loss_scaler=loss_scaler, model_ema=model_ema, mixup_fn=mixup_fn) if args.distributed and args.dist_bn in ('broadcast', 'reduce'): if args.local_rank == 0: _logger.info("Distributing BatchNorm running means and vars") distribute_bn(model, args.world_size, args.dist_bn == 'reduce') eval_metrics = validate(model, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast) if model_ema is not None and not args.model_ema_force_cpu: if args.distributed and args.dist_bn in ('broadcast', 'reduce'): distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce') ema_eval_metrics = validate( model_ema.module, loader_eval, validate_loss_fn, args, amp_autocast=amp_autocast, log_suffix=' (EMA)') eval_metrics = ema_eval_metrics if lr_scheduler is not None: # step LR for next epoch lr_scheduler.step(epoch + 1, eval_metrics[eval_metric]) update_summary( epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'), write_header=best_metric is None) if saver is not None: # save proper checkpoint with eval metric save_metric = eval_metrics[eval_metric] best_metric, best_epoch = saver.save_checkpoint(epoch, metric=save_metric) ################################################################################# # Start SparseML ONNX Export ################################################################################# if output_dir: _logger.info( f"training complete, exporting ONNX to {output_dir}/model.onnx" ) exporter = ModuleExporter(model, output_dir) exporter.export_onnx(torch.randn((1, *data_config["input_size"]))) ################################################################################# # End SparseML ONNX Export ################################################################################# except KeyboardInterrupt: pass if best_metric is not None: _logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
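# Optional post-export sanity check (illustrative, not part of the timm script
# above): validate the ONNX graph written to <output_dir>/model.onnx with the ONNX
# checker. The path below is a placeholder for the run's output directory.
import onnx

def _check_exported_onnx(onnx_path: str = "output/train/exp/model.onnx") -> None:
    model_proto = onnx.load(onnx_path)        # load the exported graph
    onnx.checker.check_model(model_proto)     # raises if the graph is malformed

# _check_exported_onnx()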
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if data_args.source_prefix is None and model_args.model_name_or_path in [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", ]: logger.warning( "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with " "`--source_prefix 'summarize: ' `") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files this script will use the first column for the full texts and the second column for the # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. 
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model.resize_token_embeddings(len(tokenizer)) print(model) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) prefix = data_args.source_prefix if data_args.source_prefix is not None else "" # Preprocessing the datasets. # We need to tokenize inputs and targets. if training_args.do_train: column_names = datasets["train"].column_names elif training_args.do_eval: column_names = datasets["validation"].column_names elif training_args.do_predict: column_names = datasets["test"].column_names else: logger.info( "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`." ) return # Get the column names for input/target. if data_args.text_column is None: text_column = dataset_columns[ 0] if dataset_columns is not None else column_names[0] else: text_column = data_args.text_column if text_column not in column_names: raise ValueError( f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}" ) if data_args.summary_column is None: summary_column = dataset_columns[ 1] if dataset_columns is not None else column_names[1] else: summary_column = data_args.summary_column if summary_column not in column_names: raise ValueError( f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}" ) # Temporarily set max_target_length for training. 
max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False if training_args.label_smoothing_factor > 0 and not hasattr( model, "prepare_decoder_input_ids_from_labels"): logger.warning( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) def preprocess_function(examples): inputs = examples['input'] targets = examples['target'] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [[ (l if l != tokenizer.pad_token_id else -100) for l in label ] for label in labels["input_ids"]] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: if "train" not in datasets: raise ValueError("--do_train requires a train dataset") train_dataset = datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_eval: max_target_length = data_args.val_max_target_length if "validation" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = datasets["validation"] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select( range(data_args.max_eval_samples)) eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_predict: max_target_length = data_args.val_max_target_length if "test" not in datasets: raise ValueError("--do_predict requires a test dataset") predict_dataset = datasets["test"] if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select( range(data_args.max_predict_samples)) predict_dataset = predict_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if training_args.fp16 else None, ) # Metric metric = load_metric("rouge") def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [label.strip() for label in labels] # rougeLSum expects newline after each sentence preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] return preds, labels def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] decoded_preds = tokenizer.batch_decode(preds, 
skip_special_tokens=True) if data_args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing decoded_preds, decoded_labels = postprocess_text( decoded_preds, decoded_labels) result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) # Extract a few results from ROUGE result = { key: value.mid.fmeasure * 100 for key, value in result.items() } prediction_lens = [ np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds ] result["gen_len"] = np.mean(prediction_lens) result = {k: round(v, 4) for k, v in result.items()} return result #################################################################################### # Start SparseML Integration #################################################################################### import pdb # pdb.set_trace() if training_args.do_train: optim = load_optimizer(model, training_args) steps_per_epoch = math.ceil( len(train_dataset) / (training_args.per_device_train_batch_size * training_args._n_gpu)) manager = ScheduledModifierManager.from_yaml(data_args.nm_prune_config) training_args.num_train_epochs = float(manager.max_epochs) optim = ScheduledOptimizer(optim, model, manager, steps_per_epoch=steps_per_epoch, loggers=None) # Initialize our Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, optimizers=(optim, None) if training_args.do_train else (None, None), ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval") max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) if training_args.do_predict: logger.info("*** Predict ***") predict_results = trainer.predict( predict_dataset, metric_key_prefix="predict", max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, ) metrics = predict_results.metrics max_predict_samples = (data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)) metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) if trainer.is_world_process_zero(): if 
training_args.predict_with_generate: predictions = tokenizer.batch_decode( predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) predictions = [pred.strip() for pred in predictions] output_prediction_file = os.path.join( training_args.output_dir, "generated_predictions.txt") with open(output_prediction_file, "w") as writer: writer.write("\n".join(predictions)) if training_args.push_to_hub: kwargs = { "finetuned_from": model_args.model_name_or_path, "tags": "summarization" } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name trainer.push_to_hub(**kwargs) #################################################################################### # Start SparseML ONNX Export #################################################################################### if data_args.do_onnx_export: logger.info("*** Export to ONNX ***") os.environ["TOKENIZERS_PARALLELISM"] = "false" exporter = ModuleExporter(model, output_dir=data_args.onnx_export_path) sample_batch = convert_example_to_features( datasets["train"][0], tokenizer, data_args.max_seq_length, sentence1_key, sentence2_key, ) exporter.export_onnx(sample_batch=sample_batch) #################################################################################### # End SparseML ONNX Export #################################################################################### return results
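# Hedged follow-up sketch (not part of the summarization script above): inspect the
# exported ONNX file with onnxruntime to confirm which graph inputs the export
# produced. The path is a placeholder for data_args.onnx_export_path.
import onnxruntime

def _print_onnx_inputs(onnx_path: str = "onnx_export/model.onnx") -> None:
    session = onnxruntime.InferenceSession(
        onnx_path, providers=["CPUExecutionProvider"]
    )
    for graph_input in session.get_inputs():
        print(graph_input.name, graph_input.shape, graph_input.type)

# _print_onnx_inputs()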
def train(hyp, opt, device, tb_writer=None, wandb=None): logger.info( colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) save_dir, epochs, batch_size, total_batch_size, weights, rank = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Directories wdir = save_dir / 'weights' wdir.mkdir(parents=True, exist_ok=True) # make dir last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = save_dir / 'results.txt' # Save run settings with open(save_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure plots = not opt.evolve # create plots cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.SafeLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc = 1 if opt.single_cls else int(data_dict['nc']) # number of classes names = ['item'] if opt.single_cls and len( data_dict['names']) != 1 else data_dict['names'] # class names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') or weights.endswith( '.pth') # SparseML integration if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get('anchors'): ckpt['model'].yaml['anchors'] = round( hyp['anchors']) # force autoanchor model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [ ] # exclude keys state_dict = _load_checkpoint_model_state_dict( ckpt) # SparseML integration state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info( 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = [] # parameter names to freeze (full or partial) for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay logger.info(f"Scaled weight_decay = {hyp['weight_decay']}") pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): pg2.append(v.bias) # biases if isinstance(v, nn.BatchNorm2d): pg0.append(v.weight) # no decay elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): pg1.append(v.weight) # apply decay if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # 
Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR if opt.linear_lr: lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp[ 'lrf'] # linear else: lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Logging if rank in [-1, 0] and wandb and wandb.run is None: opt.hyp = hyp # add hyperparameters wandb_run = wandb.init( config=opt, resume="allow", project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, name=save_dir.stem, id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) loggers = {'wandb': wandb} # loggers dict # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % ( weights, epochs) if epochs < start_epoch: logger.info( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(model.stride.max()) # grid size (max stride) nl = model.model[ -1].nl # number of detection layers (used for scaling hyp['obj']) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # EMA #################################################################################### # Start SparseML Integration - optional EMA #################################################################################### ema = ModelEMA(model) if rank in [-1, 0] and opt.use_ema else None #################################################################################### # End SparseML Integration - optional EMA #################################################################################### # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: ')) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. 
Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: if ema: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader( test_path, imgsz_test, batch_size * 2, gs, opt, # testloader hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5, prefix=colorstr('val: '))[0] if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, save_dir, loggers) if tb_writer: tb_writer.add_histogram('classes', c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Model parameters hyp['box'] *= 3. / nl # scale to layers hyp['cls'] *= nc / 80. * 3. / nl # scale to classes and layers hyp['obj'] *= (imgsz / 640)**2 * 3. / nl # scale to image size and layers model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = labels_to_class_weights( dataset.labels, nc).to(device) * nc # attach class weights model.names = names #################################################################################### # Start SparseML Integration #################################################################################### from sparseml.pytorch.nn import replace_activations from sparseml.pytorch.optim import ScheduledModifierManager, ScheduledOptimizer from sparseml.pytorch.utils import is_parallel_model, PythonLogger, TensorBoardLogger if not opt.no_leaky_relu_override: # use LeakyReLU activations model = replace_activations(model, "lrelu", inplace=True) manager = ScheduledModifierManager.from_yaml(opt.sparseml_recipe) optimizer = ScheduledOptimizer( optimizer, model if not is_parallel_model(model) else model.module, manager, steps_per_epoch=len(dataloader), loggers=[PythonLogger(), TensorBoardLogger(writer=tb_writer)]) # override lr scheduler if recipe makes any LR updates if any("LearningRate" in str(modifier) for modifier in manager.modifiers): logger.info( "Disabling yolo LR scheduler, managing LR using SparseML recipe") scheduler = None # disable model pickling if QAT is set qat = False if any("Quantization" in str(modifier) for modifier in manager.modifiers): logger.info("Disabling pickling for Yolo model, QAT modifiers present") qat = True if manager.max_epochs: epochs = manager.max_epochs or epochs # override num_epochs logger.info( f"overriding number of epochs from SparseML manager to {manager.max_epochs}" ) #################################################################################### # End SparseML Integration #################################################################################### # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0 ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) if scheduler: # SparseML integration scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=(cuda and opt.use_amp)) compute_loss = ComputeLoss(model) # init loss class logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n' 
f'Using {dataloader.num_workers} dataloader workers\n' f'Logging results to {save_dir}\n' f'Starting training for {epochs} epochs...') for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices( range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 if scheduler: # SparseML integration, do not force warmup lr when overriding x['lr'] = np.interp(ni, xi, [ hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp( ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=(cuda and opt.use_amp)): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device)) # loss scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. 
# Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if plots and ni < 3: f = save_dir / f'train_batch{ni}.jpg' # filename Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() # if tb_writer: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard elif plots and ni == 10 and wandb: wandb.log( { "Mosaics": [ wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg') if x.exists() ] }, commit=False) # end batch ------------------------------------------------------------------------------------------------ # end epoch ---------------------------------------------------------------------------------------------------- # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard if scheduler: # SparseML integration scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr(model, include=[ 'yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights' ]) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=batch_size * 2, imgsz=imgsz_test, model=ema.ema if ema else model, single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, verbose=nc < 50 and final_epoch, plots=plots and final_epoch, log_imgs=opt.log_imgs if wandb else 0, compute_loss=compute_loss, half_precision=opt.use_amp) # SparseML integration # Write with open(results_file, 'a') as f: f.write( s + '%10.4g' * 7 % results + '\n') # P, R, [email protected], [email protected], val_loss(box, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Log tags = [ 'train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2' ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): if tb_writer: tb_writer.add_scalar(tag, x, epoch) # tensorboard if wandb: wandb.log({tag: x}, step=epoch, commit=tag == tags[-1]) # W&B # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt_model = ema.ema if ema else model if not qat else model.state_dict( ) # SparseML integration ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ckpt_model, # SparseML integration 'optimizer': None if final_epoch else optimizer.state_dict(), 'wandb_id': wandb_run.id if wandb else None } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch 
---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers final = best if best.exists() else last # final model for f in [last, best]: if f.exists( ) and not qat: # SparseML integration - qat state dict incompatible strip_optimizer(f) # strip optimizers if opt.bucket: os.system(f'gsutil cp {final} gs://{opt.bucket}/weights') # upload # Plots if plots: plot_results(save_dir=save_dir) # save as results.png if wandb: files = [ 'results.png', 'confusion_matrix.png', *[f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R')] ] wandb.log({ "Results": [ wandb.Image(str(save_dir / f), caption=f) for f in files if (save_dir / f).exists() ] }) if opt.log_artifacts: wandb.log_artifact(artifact_or_path=str(final), type='model', name=save_dir.stem) # Test best.pt logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) if opt.data.endswith('coco.yaml') and nc == 80: # if COCO for conf, iou, save_json in ([0.25, 0.45, False], [0.001, 0.65, True]): # speed, mAP tests # SparseML integration - load test model test_model = model if qat else attempt_load(final, device) if opt.use_amp: test_model = test_model.half() results, _, _ = test.test( opt.data, batch_size=batch_size * 2, imgsz=imgsz_test, conf_thres=conf, iou_thres=iou, model=test_model, single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, save_json=save_json, plots=False, half_precision=opt.use_amp) # SparseML integration ################################################################################# # Start SparseML ONNX Export ################################################################################# from sparseml.pytorch.utils import ModuleExporter from sparseml.pytorch.utils.quantization import skip_onnx_input_quantize onnx_path = f"{save_dir}/model.onnx" logger.info(f"training complete, exporting ONNX to {onnx_path}") export_model = model.module if is_parallel_model(model) else model export_model.model[ -1].export = True # do not export grid post-procesing exporter = ModuleExporter(export_model, save_dir) exporter.export_onnx(torch.randn(1, 3, imgsz, imgsz), convert_qat=True) if qat: skip_onnx_input_quantize(onnx_path, onnx_path) ################################################################################# # End SparseML ONNX Export ################################################################################# else: dist.destroy_process_group() wandb.run.finish() if wandb and wandb.run else None torch.cuda.empty_cache() return results
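# Hedged follow-up sketch (not part of the YOLOv5 training script above): a minimal
# smoke test of the exported {save_dir}/model.onnx, assuming the deepsparse package's
# compile_model/Engine.run interface. The path and the 640x640 input size are
# placeholders; adjust them to the actual run.
import numpy
from deepsparse import compile_model

def _smoke_test_yolo_onnx(onnx_path: str = "runs/train/exp/model.onnx") -> None:
    engine = compile_model(onnx_path, batch_size=1)
    dummy_input = numpy.random.rand(1, 3, 640, 640).astype(numpy.float32)
    outputs = engine.run([dummy_input])       # list of numpy arrays
    print([o.shape for o in outputs])

# _smoke_test_yolo_onnx()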
'images': { 0: 'batch', 2: 'height', 3: 'width' }, # size(1,3,640,640) 'output': { 0: 'batch', 2: 'y', 3: 'x' } } if opt.dynamic else None) else: # export through SparseML so quantized and pruned graphs can be corrected save_dir = '/'.join(f.split('/')[:-1]) save_name = f.split('/')[-1] exporter = ModuleExporter(model, save_dir) exporter.export_onnx(img, name=save_name, convert_qat=True) try: skip_onnx_input_quantize(f, f) except: pass # Checks model_onnx = onnx.load(f) # load onnx model onnx.checker.check_model(model_onnx) # check onnx model # print(onnx.helper.printable_graph(model_onnx.graph)) # print # Simplify if opt.simplify: try: check_requirements(['onnx-simplifier'])
def main(): ### Dataset processing classes in main due to hugging face custom dataset map def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=data_args.max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length" if data_args.pad_to_max_length else False, ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples.sequence_ids(i) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != (1 if pad_on_right else 0): token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != (1 if pad_on_right else 0): token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). 
                    while (token_start_index < len(offsets)
                           and offsets[token_start_index][0] <= start_char):
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)

        return tokenized_examples

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions, references=p.label_ids)

    # Post-processing:
    def post_processing_function(examples, features, predictions):
        predictions = postprocess_qa_predictions(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=data_args.version_2_with_negative,
            n_best_size=data_args.n_best_size,
            max_answer_length=data_args.max_answer_length,
            null_score_diff_threshold=data_args.null_score_diff_threshold,
            output_dir=training_args.output_dir,
            is_world_process_zero=trainer.is_world_process_zero(),
        )
        if data_args.version_2_with_negative:
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": 0.0}
                for k, v in predictions.items()
            ]
        else:
            formatted_predictions = [
                {"id": k, "prediction_text": v} for k, v in predictions.items()
            ]
        references = [
            {"id": ex["id"], "answers": ex[answer_column_name]}
            for ex in datasets["validation"]
        ]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride.
        # This results in one example possibly giving several features when a context is long, each of those
        # features having a context that overlaps a bit the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=data_args.max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length" if data_args.pad_to_max_length else False,
        )

        # Since one example might give us several features if it has a long context, we need a map from a
        # feature to its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep
        # the corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(examples["id"][sample_index])

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a
            # token position is part of the context or not.
tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_index else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples transformers.utils.logging.set_verbosity_info() parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty." "Use --overwrite_output_dir to overcome.") logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) set_seed(training_args.seed) if data_args.dataset_name is not None: datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] datasets = load_dataset(extension, data_files=data_files, field="data") config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, ) model = AutoModelForQuestionAnswering.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) if data_args.layers_to_keep > 0: logger.info("Keeping %s model layers", data_args.layers_to_keep) model = drop_layers(model, data_args.layers_to_keep) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) logger.info("Model has %s parameters", params) # Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( "This example script only works for models that have a fast tokenizer. 
Checkout the big table of models " "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this " "requirement") if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names question_column_name = "question" if "question" in column_names else column_names[ 0] context_column_name = "context" if "context" in column_names else column_names[ 1] answer_column_name = "answers" if "answers" in column_names else column_names[ 2] pad_on_right = tokenizer.padding_side == "right" if training_args.do_train: train_dataset = datasets["train"].map( prepare_train_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_eval: validation_dataset = datasets["validation"].map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) data_collator = (default_data_collator if data_args.pad_to_max_length else DataCollatorWithPadding(tokenizer)) current_dir = os.path.sep.join( os.path.join(__file__).split(os.path.sep)[:-1]) metric = load_metric( os.path.join(current_dir, "squad_v2_local") if data_args. version_2_with_negative else "squad") #################################################################################### # Start SparseML Integration #################################################################################### optim = load_optimizer(model, TrainingArguments) steps_per_epoch = math.ceil( len(datasets["train"]) / (training_args.per_device_train_batch_size * training_args._n_gpu)) manager = ScheduledModifierManager.from_yaml(data_args.nm_prune_config) training_args.num_train_epochs = float(manager.modifiers[0].end_epoch) optim = ScheduledOptimizer(optim, model, manager, steps_per_epoch=steps_per_epoch, loggers=None) #################################################################################### # End SparseML Integration #################################################################################### # Initialize our Trainer trainer = QuestionAnsweringTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=validation_dataset if training_args.do_eval else None, eval_examples=datasets["validation"] if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, post_process_function=post_processing_function, compute_metrics=compute_metrics, optimizers=(optim, None), ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. 
isdir(model_args.model_name_or_path) else None) trainer.save_model() # Saves the tokenizer too for easy upload # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") results = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in results.items(): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") #################################################################################### # Start SparseML Integration #################################################################################### if data_args.do_onnx_export: logger.info("*** Export to ONNX ***") print("Exporting onnx model") os.environ["TOKENIZERS_PARALLELISM"] = "false" exporter = ModuleExporter(model, output_dir='onnx-export') sample_batch = convert_example_to_features( datasets["validation"][0], tokenizer, data_args.max_seq_length, data_args.doc_stride, data_args.max_query_length, ) exporter.export_onnx(sample_batch=sample_batch)
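
# A minimal post-export check, assuming the block above wrote ModuleExporter's default model.onnx
# into the onnx-export directory; adjust the path if a custom name or output_dir is used.
import onnx

onnx_model = onnx.load("onnx-export/model.onnx")
onnx.checker.check_model(onnx_model)  # structural validation of the exported graph
print(f"exported graph contains {len(onnx_model.graph.node)} nodes")
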
def export_to_sparse_onnx(model: "pl.LightningModule",
                          output_dir: str,
                          sample_batch: Optional[Tensor] = None,
                          **kwargs) -> None:
    """Exports the model to ONNX format."""
    import onnxruntime
    from sparseml.pytorch.utils import ModuleExporter

    with model._prevent_trainer_and_dataloaders_deepcopy():
        exporter = ModuleExporter(model.model, output_dir=output_dir)
        sample_batch = sample_batch if sample_batch is not None else model.example_input_array
        if sample_batch is None:
            raise MisconfigurationException(
                "To export the model, a sample batch must be passed via "
                "``SparseMLCallback.export_to_sparse_onnx(model, output_dir, sample_batch=sample_batch)`` "
                "or an ``example_input_array`` property within the LightningModule")

        # the following is adapted from @natuan and @spacemanidol
        sample_inputs = os.path.join(output_dir, "sample-inputs")
        sample_outputs = os.path.join(output_dir, "sample-outputs")
        os.makedirs(sample_inputs, exist_ok=True)
        os.makedirs(sample_outputs, exist_ok=True)

        forward_args_spec = inspect.getfullargspec(exporter._module.__class__.forward)
        try:
            # assume sample_batch is a callable dictionary-type object
            one_sample_input = collections.OrderedDict([
                (f, sample_batch[f][0].long().reshape(1, -1))
                for f in forward_args_spec.args if f in sample_batch
            ])
        except RuntimeError:
            # assume sample_batch is a tensor
            one_sample_input = sample_batch

        try:
            exporter.export_onnx(sample_batch=one_sample_input, convert_qat=True, **kwargs)
            exporter.export_onnx(
                sample_batch=one_sample_input,
                name="small_model.onnx",
                export_params=False,
                **kwargs,
            )
            onnx_file = os.path.join(output_dir, "model.onnx")
        except RuntimeError:
            raise RuntimeError("Error exporting ONNX models and/or inputs/outputs")

        sess = onnxruntime.InferenceSession(onnx_file)

        num_samples = 0
        # add additional files for testing since this feature is very new
        if isinstance(one_sample_input, (collections.OrderedDict, dict)):
            input_names = list(sample_batch.keys())
            output_names = [o.name for o in sess.get_outputs()]
            for input_vals in zip(*sample_batch.values()):
                input_feed = {k: v.long().numpy() for k, v in zip(input_names, input_vals)}
                output_vals = sess.run(
                    output_names, {k: input_feed[k].reshape(1, -1) for k in input_feed})
                output_dict = {
                    name: numpy.squeeze(val)
                    for name, val in zip(output_names, output_vals)
                }
                file_idx = f"{num_samples}".zfill(4)
                numpy.savez(f"{sample_inputs}/inp-{file_idx}.npz", **input_feed)
                numpy.savez(f"{sample_outputs}/out-{file_idx}.npz", **output_dict)
                num_samples += 1
        else:
            output_names = [o.name for o in sess.get_outputs()]
            input_feed = {"input": sample_batch.numpy()}
            output_vals = sess.run(output_names, input_feed)
            output_dict = {
                name: numpy.squeeze(val)
                for name, val in zip(output_names, output_vals)
            }
            file_idx = f"{num_samples}".zfill(4)
            numpy.savez(f"{sample_inputs}/inp-{file_idx}.npz", **input_feed)
            numpy.savez(f"{sample_outputs}/out-{file_idx}.npz", **output_dict)
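
# A usage sketch for the helper above, following the call shown in its own error message;
# `lit_model` and `batch` are placeholders for a trained LightningModule and a representative
# batch, and the output directory name is arbitrary.
import numpy

SparseMLCallback.export_to_sparse_onnx(lit_model, "sparse-onnx", sample_batch=batch)

# each saved sample pair can be reloaded later, e.g. to compare against another runtime
saved_inputs = numpy.load("sparse-onnx/sample-inputs/inp-0000.npz")
saved_outputs = numpy.load("sparse-onnx/sample-outputs/out-0000.npz")
print(saved_inputs.files, saved_outputs.files)
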
def test_export_batches(batch_size):
    sample_batch = torch.randn(batch_size, 8)
    exporter = ModuleExporter(MLPNet(), tempfile.gettempdir())
    exporter.export_samples([sample_batch])

def test_exporter_onnx():
    sample_batch = torch.randn(1, 8)
    exporter = ModuleExporter(MLPNet(), tempfile.gettempdir())
    exporter.export_onnx(sample_batch)
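
# The two helpers above read like pytest tests; test_export_batches needs a batch_size argument,
# presumably supplied by a parametrize decorator along these lines (the sizes are arbitrary and
# the same MLPNet/tempfile helpers as above are assumed to be importable):
import pytest

@pytest.mark.parametrize("batch_size", [1, 4, 16])
def test_export_batches_parametrized(batch_size):
    exporter = ModuleExporter(MLPNet(), tempfile.gettempdir())
    exporter.export_samples([torch.randn(batch_size, 8)])
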
def train(
    working_dir: str,
    config_path: str,
    model: Module,
    train_dataset: Dataset,
    val_dataset: Dataset,
    batch_size: int,
    optim_const: Callable[[Module], Optimizer],
    loss: Union[LossWrapper, Callable[[Any, Any], Tensor]],
    devices: str,
):
    """
    Dataset setup
    """
    LOGGER.info("batch_size set to {}".format(batch_size))
    LOGGER.info("train_dataset set to {}".format(train_dataset))
    LOGGER.info("val_dataset set to {}".format(val_dataset))

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=8,
        pin_memory=True,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=8,
        pin_memory=True,
    )

    """
    Model, optimizer, loss setup
    """
    model_dir = clean_path(os.path.join(working_dir, "model"))
    optim = optim_const(model)

    LOGGER.info("model set to {}".format(model))
    LOGGER.info("optimizer set to {}".format(optim))
    LOGGER.info("loss set to {}".format(loss))
    LOGGER.info("devices set to {}".format(devices))

    """
    Manager and config setup
    """
    manager = ScheduledModifierManager.from_yaml(config_path)
    logs_dir = clean_path(os.path.join(working_dir, "logs"))
    loggers = [TensorBoardLogger(logs_dir), PythonLogger()]
    optim = ScheduledOptimizer(
        optim, model, manager, steps_per_epoch=len(train_loader), loggers=loggers
    )

    """
    Training and testing
    """
    model, device, device_ids = model_to_device(model, devices)
    trainer = ModuleTrainer(model, device, loss, optim, loggers=loggers)
    tester = ModuleTester(model, device, loss, loggers=loggers, log_steps=-1)

    epoch = -1
    tester.run_epoch(val_loader, epoch=epoch)

    for epoch in range(manager.max_epochs):
        LOGGER.info("starting training epoch {}".format(epoch))
        train_res = trainer.run_epoch(train_loader, epoch)
        LOGGER.info("finished training epoch {}: {}".format(epoch, train_res))
        val_res = tester.run_epoch(val_loader, epoch)
        LOGGER.info("finished validation epoch {}: {}".format(epoch, val_res))

    exporter = ModuleExporter(model, model_dir)
    exporter.export_pytorch(optim, epoch)

    for data in val_loader:
        exporter.export_onnx(data)
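
# Note: the final loop above re-exports the ONNX graph for every validation batch, overwriting the
# exported file each time. If a single exported graph is enough, breaking after the first batch is
# a lighter-weight alternative (sketch only; same names as in the function above):
for data in val_loader:
    exporter.export_onnx(data)
    break
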