def get_average_traces(self, max_iter=100, tolerance=1e-3) -> Tensor:
    """
    Estimates the average Hessian trace for each parameter.
    :param max_iter: maximum number of iterations of the Hutchinson algorithm
    :param tolerance: minimum relative tolerance for stopping the algorithm. It is calculated between the mean
        average trace from the previous iteration and the current one.
    :return: Tensor with the average Hessian trace per parameter
    """
    avg_total_trace = 0.
    avg_traces_per_iter = []  # type: List[Tensor]
    mean_avg_traces_per_param = None

    for i in range(max_iter):
        avg_traces_per_iter.append(self._calc_avg_traces_per_param())

        mean_avg_traces_per_param = self._get_mean(avg_traces_per_iter)
        mean_avg_total_trace = torch.sum(mean_avg_traces_per_param)

        diff_avg = abs(mean_avg_total_trace - avg_total_trace) / (avg_total_trace + self._diff_eps)
        if diff_avg < tolerance:
            return mean_avg_traces_per_param

        avg_total_trace = mean_avg_total_trace
        if is_main_process():
            nncf_logger.info('{}# difference_avg={} avg_trace={}'.format(i, diff_avg, avg_total_trace))

    return mean_avg_traces_per_param
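# For reference, a minimal standalone sketch of the Hutchinson estimator that get_average_traces() above is
# built on: trace(H) ~= E[v^T H v] over random Rademacher vectors v, with the Hessian-vector product obtained
# via a second backward pass instead of materializing H. This is an illustrative sketch only; `loss`, `params`
# and `num_samples` are assumed inputs and are not part of the NNCF API.
import torch


def hutchinson_trace_sketch(loss: torch.Tensor, params, num_samples: int = 10) -> float:
    grads = torch.autograd.grad(loss, params, create_graph=True)
    trace_estimate = 0.
    for _ in range(num_samples):
        # Rademacher probe vectors: entries are +1 or -1 with equal probability
        vs = [torch.randint_like(p, high=2) * 2.0 - 1.0 for p in params]
        # Hessian-vector product H*v via differentiating the gradients against the probe vectors
        hvs = torch.autograd.grad(grads, params, grad_outputs=vs, retain_graph=True)
        trace_estimate += sum((v * hv).sum().item() for v, hv in zip(vs, hvs))
    return trace_estimate / num_samples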
def __init__(self, target_model: NNCFNetwork, params: NNCFConfig):
    super().__init__(target_model)
    scheduler_cls = BINARIZATION_SCHEDULERS.get("staged")
    self._scheduler = scheduler_cls(self, params)

    from nncf.utils import is_main_process
    if is_main_process():
        self._compute_and_display_flops_binarization_rate()
def validate(val_loader, model, criterion, config):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (input_, target) in enumerate(val_loader):
            input_ = input_.to(config.device)
            target = target.to(config.device)

            # compute output
            output = model(input_)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input_.size(0))
            top1.update(acc1, input_.size(0))
            top5.update(acc5, input_.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % config.print_freq == 0:
                logger.info(
                    '{rank}'
                    'Test: [{0}/{1}] '
                    'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) '
                    'Loss: {loss.val:.4f} ({loss.avg:.4f}) '
                    'Acc@1: {top1.val:.3f} ({top1.avg:.3f}) '
                    'Acc@5: {top5.val:.3f} ({top5.avg:.3f})'.format(
                        i, len(val_loader), batch_time=batch_time, loss=losses, top1=top1, top5=top5,
                        rank='{}:'.format(config.rank) if config.multiprocessing_distributed else ''))

        if is_main_process():
            config.tb.add_scalar("val/loss", losses.avg, len(val_loader) * config.get('cur_epoch', 0))
            config.tb.add_scalar("val/top1", top1.avg, len(val_loader) * config.get('cur_epoch', 0))
            config.tb.add_scalar("val/top5", top5.avg, len(val_loader) * config.get('cur_epoch', 0))

    logger.info(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}\n'.format(top1=top1, top5=top5))

    acc = top1.avg / 100
    if config.metrics_dump is not None:
        write_metrics(acc, config.metrics_dump)

    return top1.avg, top5.avg
def choose_configuration(self, configuration_metric: List[Tensor], bits_configurations: List[List[int]],
                         traces_order: List[int]) -> List[int]:
    num_weights = len(traces_order)
    ordered_config = [0] * num_weights
    median_metric = torch.Tensor(configuration_metric).to(self._device).median()
    configuration_index = configuration_metric.index(median_metric)
    bit_configuration = bits_configurations[configuration_index]
    for i, bitwidth in enumerate(bit_configuration):
        ordered_config[traces_order[i]] = bitwidth
    if is_main_process():
        nncf_logger.info('Chosen HAWQ configuration (bitwidth per weightable layer)={}'.format(ordered_config))
        nncf_logger.debug('Order of the weightable layers in the HAWQ configuration={}'.format(traces_order))
    return ordered_config
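# Worked example of the traces_order remapping above (illustrative values only): with traces_order = [2, 0, 1]
# and the chosen bit_configuration = [8, 4, 2], the loop assigns ordered_config[2] = 8, ordered_config[0] = 4
# and ordered_config[1] = 2, so the returned ordered_config is [4, 2, 8], i.e. bitwidths listed in the original
# layer order rather than in the trace-sorted order.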
def train(config, compression_ctrl, model, criterion, is_inception, lr_scheduler, model_name, optimizer,
          train_loader, train_sampler, val_loader, best_acc1=0):
    for epoch in range(config.start_epoch, config.epochs):
        config.cur_epoch = epoch
        if config.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_epoch(train_loader, model, criterion, optimizer, compression_ctrl, epoch, config, is_inception)

        # Learning rate scheduling should be applied after optimizer's update
        lr_scheduler.step(epoch if not isinstance(lr_scheduler, ReduceLROnPlateau) else best_acc1)

        # update compression scheduler state at the end of the epoch
        compression_ctrl.scheduler.epoch_step()

        # compute compression algo statistics
        stats = compression_ctrl.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            acc1, _ = validate(val_loader, model, criterion, config)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        acc = best_acc1 / 100
        if config.metrics_dump is not None:
            write_metrics(acc, config.metrics_dump)

        if is_main_process():
            print_statistics(stats)

            checkpoint_path = osp.join(config.checkpoint_save_dir, get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'acc1': acc1,
                'optimizer': optimizer.state_dict(),
                'scheduler': compression_ctrl.scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1, config)

            for key, value in stats.items():
                if isinstance(value, (int, float)):
                    config.tb.add_scalar("compression/statistics/{0}".format(key), value,
                                         len(train_loader) * epoch)
def apply_init(self):
    runner = HessianAwarePrecisionInitializeRunner(self._algo, self._model, self._data_loader,
                                                   self._num_data_points, self._all_quantizations,
                                                   self._ordered_weight_quantizations, self._bits,
                                                   self._traces_per_layer_path)
    runner.run(self._criterion, self._iter_number, self._tolerance)
    self._model.rebuild_graph()

    # NOTE: The order of quantization modules must be the same across GPUs to broadcast num_bits correctly.
    # The sorted dict is built unconditionally so that the logging branch below can use it as well.
    sorted_quantizers = OrderedDict(sorted(self._all_quantizations.items(), key=lambda x: str(x[0])))
    if self._is_distributed:
        for quantizer in sorted_quantizers.values():  # type: BaseQuantizer
            quantizer.broadcast_num_bits()

    if is_main_process():
        str_bw = [str(element) for element in self.get_bitwidth_per_scope(sorted_quantizers)]
        nncf_logger.info('\n'.join(['\n"bitwidth_per_scope": [', ',\n'.join(str_bw), ']']))
def train_bin(config, compression_ctrl, model, criterion, is_inception, optimizer_scheduler, model_name,
              optimizer, train_loader, train_sampler, val_loader, kd_loss_calculator, batch_multiplier,
              best_acc1=0):
    for epoch in range(config.start_epoch, config.epochs):
        config.cur_epoch = epoch
        if config.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_epoch_bin(train_loader, batch_multiplier, model, criterion, optimizer, optimizer_scheduler,
                        kd_loss_calculator, compression_ctrl, epoch, config, is_inception)

        # compute compression algo statistics
        stats = compression_ctrl.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            acc1, _ = validate(val_loader, model, criterion, config)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # update compression scheduler state at the end of the epoch
        compression_ctrl.scheduler.epoch_step()
        optimizer_scheduler.epoch_step()

        if is_main_process():
            print_statistics(stats)

            checkpoint_path = osp.join(config.checkpoint_save_dir, get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                'state_dict': model.state_dict(),
                'original_model_state_dict': kd_loss_calculator.original_model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
                'compression_scheduler': compression_ctrl.scheduler.state_dict(),
                'optimizer_scheduler': optimizer_scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1, config)

            for key, value in stats.items():
                if isinstance(value, (int, float)):
                    config.tb.add_scalar("compression/statistics/{0}".format(key), value,
                                         len(train_loader) * epoch)
def load_detection_annotations(cachedir, dataset):
    cachefile = os.path.join(cachedir, 'annots_{}.json'.format(dataset.name))
    imagenames = dataset.get_img_names()
    if is_main_process():
        if not os.path.isfile(cachefile):
            # load annots
            gt = {}
            for i, imagename in enumerate(imagenames):
                _, gt[imagename] = dataset.pull_anno(i)

                if i % 100 == 0:
                    logger.info('Reading annotation for {:d}/{:d}'.format(i + 1, len(imagenames)))
            # save
            logger.info('Saving cached annotations to {:s}'.format(cachefile))
            pathlib.Path(cachedir).mkdir(parents=True, exist_ok=True)
            with open(cachefile, 'w') as f:
                json.dump(gt, f)
    if is_dist_avail_and_initialized():
        dist.barrier()
    with open(cachefile, 'r') as f:
        gt = json.load(f)
    return gt, imagenames
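# The function above follows the usual "rank 0 writes, everyone waits" idiom. A minimal generic sketch of the
# same pattern (an assumed helper, not part of this code base; torch.distributed must be initialized when
# running distributed):
import json
import os

import torch.distributed as dist


def cache_once(path, build_fn, is_main, is_distributed):
    if is_main and not os.path.isfile(path):
        with open(path, 'w') as f:
            json.dump(build_fn(), f)  # only the main process builds and writes the cache
    if is_distributed:
        dist.barrier()  # ensure the file exists before any other rank tries to read it
    with open(path) as f:
        return json.load(f)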
def main_worker(current_gpu, config):
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (ExecutionMode.DISTRIBUTED,
                                                   ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)

    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    logger.info(config)

    config.device = get_device(config)
    dataset = get_dataset(config.dataset)
    color_encoding = dataset.color_encoding
    num_classes = len(color_encoding)

    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    weights = config.get('weights')
    model = load_model(config.model,
                       pretrained=config.get('pretrained', True) if weights is None else False,
                       num_classes=num_classes,
                       model_params=config.get('model_params', {}))

    compression_ctrl, model = create_compressed_model(model, config)

    if weights:
        sd = torch.load(weights, map_location='cpu')
        load_state(model, sd)

    model, model_without_dp = prepare_model_for_execution(model, config)

    if config.distributed:
        compression_ctrl.distributed()

    resuming_checkpoint = config.resuming_checkpoint

    if resuming_checkpoint is not None:
        if not config.pretrained:
            # Load the previously saved model state
            model, _, _, _, _ = \
                load_checkpoint(model, resuming_checkpoint, config.device,
                                compression_scheduler=compression_ctrl.scheduler)

    if config.to_onnx is not None:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if config.mode.lower() == 'test':
        logger.info(model)
        model_parameters = filter(lambda p: p.requires_grad, model.parameters())
        params = sum([np.prod(p.size()) for p in model_parameters])
        logger.info("Trainable parameter count: {params}".format(params=params))

        model = model.to(config.device)
        loaders, w_class = load_dataset(dataset, config)
        _, val_loader = loaders
        test(model, val_loader, w_class, color_encoding, config)
        print_statistics(compression_ctrl.statistics())
    elif config.mode.lower() == 'train':
        loaders, w_class = load_dataset(dataset, config)
        train_loader, val_loader = loaders
        if not resuming_checkpoint:
            compression_ctrl.initialize(train_loader)
        train(model, model_without_dp, compression_ctrl, train_loader, val_loader, w_class, color_encoding,
              config)
    else:
        # Should never happen...but just in case it does
        raise RuntimeError("\"{0}\" is not a valid choice for execution mode.".format(config.mode))
def create_compressed_model(model: Module, config: NNCFConfig,
                            resuming_state_dict: dict = None,
                            dummy_forward_fn: Callable[[Module], Any] = None,
                            wrap_inputs_fn: Callable[[Tuple, Dict], Tuple[Tuple, Dict]] = None,
                            dump_graphs=True) \
        -> Tuple[CompressionAlgorithmController, NNCFNetwork]:
    """
    The main function used to produce a model ready for compression fine-tuning from an original PyTorch
    model and a configuration object.

    :param model: The original model. Should have its parameters already loaded from a checkpoint or another
        source.
    :param config: A configuration object used to determine the exact compression modifications to be applied
        to the model.
    :param resuming_state_dict: A PyTorch state dict object to load (strictly) into the compressed model after
        building.
    :param dummy_forward_fn: if supplied, will be used instead of a *forward* function call to build the
        internal graph representation via tracing. Specifying this is useful when the original training pipeline
        has special formats of data loader output or has additional *forward* arguments other than input tensors.
        Otherwise, the *forward* call of the model during graph tracing will be made with mock tensors according
        to the shape specified in the config object.
    :param wrap_inputs_fn: if supplied, will be used on the module's input arguments during a regular, non-dummy
        forward call before passing the inputs to the underlying compressed model. This is required if the model's
        input tensors that are important for compression are not supplied as arguments to the model's forward call
        directly, but instead are located in a container (such as a list), and the model receives the container as
        an argument. wrap_inputs_fn should take as input two arguments - the tuple of positional arguments to the
        underlying model's forward call, and a dict of keyword arguments to the same. The function should wrap each
        tensor among the supplied model's args and kwargs that is important for compression (e.g. quantization)
        with an nncf.nncf_model_input function, which is a no-operation function that marks the tensors as inputs
        to be traced by NNCF in the internal graph representation. Output is the tuple of (args, kwargs), where
        args and kwargs are the same as were supplied in input, but with each compression-relevant tensor wrapped
        as described above. (A minimal example follows this function.)
    :param dump_graphs: Whether to dump the internal graph representation of the original and compressed models
        in the .dot format into the log directory.
    :return: A controller for the compression algorithm (or algorithms, in which case the controller is an
        instance of CompositeCompressionController) and the model ready for compression parameter training wrapped
        as an object of NNCFNetwork.
    """
    # Compress the model that will be deployed for inference on the target device. There is no need to compress
    # parts of the model that are used only at the training stage (e.g. AuxLogits of the Inception-v3 model) or
    # unused modules with weights. As a consequence, there is no need to worry about spoiling BN statistics,
    # since they are disabled in eval mode.
    model.eval()

    if dump_graphs:
        if dummy_forward_fn is None:
            input_info_list = create_input_infos(config)
            graph_builder = GraphBuilder(custom_forward_fn=create_dummy_forward_fn(input_info_list,
                                                                                   with_input_tracing=True))
        else:
            graph_builder = GraphBuilder(custom_forward_fn=dummy_forward_fn)

        if is_main_process():
            graph = graph_builder.build_graph(model)
            graph.visualize_graph(osp.join(config.get("log_dir", "."), "original_graph.dot"))

    set_debug_log_dir(config.get("log_dir", "."))

    input_info_list = create_input_infos(config)
    scopes_without_shape_matching = config.get('scopes_without_shape_matching', [])
    ignored_scopes = config.get('ignored_scopes')
    target_scopes = config.get('target_scopes')

    compressed_model = NNCFNetwork(model, input_infos=input_info_list,
                                   dummy_forward_fn=dummy_forward_fn,
                                   wrap_inputs_fn=wrap_inputs_fn,
                                   ignored_scopes=ignored_scopes,
                                   target_scopes=target_scopes,
                                   scopes_without_shape_matching=scopes_without_shape_matching)

    should_init = resuming_state_dict is None
    compression_algo_builder_list = create_compression_algorithm_builders(config, should_init=should_init)

    for builder in compression_algo_builder_list:
        compressed_model = builder.apply_to(compressed_model)
    compression_ctrl = compressed_model.commit_compression_changes()

    try:
        if resuming_state_dict is not None:
            load_state(compressed_model, resuming_state_dict, is_resume=True)
    finally:
        if dump_graphs and is_main_process() and compression_algo_builder_list:
            if dummy_forward_fn is None:
                compressed_graph_builder = GraphBuilder(
                    custom_forward_fn=create_dummy_forward_fn(input_info_list, with_input_tracing=False))
            else:
                compressed_graph_builder = GraphBuilder(custom_forward_fn=dummy_forward_fn)

            graph = compressed_graph_builder.build_graph(compressed_model,
                                                         compressed_model.get_tracing_context())
            graph.visualize_graph(osp.join(config.get("log_dir", "."), "compressed_graph.dot"))
    return compression_ctrl, compressed_model
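# A minimal sketch of a wrap_inputs_fn for the case described in the docstring above: a model whose forward
# receives its input tensors inside a list, e.g. forward(self, tensor_list). The container layout is an assumed
# example; nncf.nncf_model_input is the no-op marker function mentioned in the docstring.
from nncf import nncf_model_input


def wrap_inputs(args, kwargs):
    # args[0] is assumed to be a list of input tensors; mark each one so NNCF traces it as a model input
    wrapped_list = [nncf_model_input(t) for t in args[0]]
    return (wrapped_list,) + args[1:], kwargs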
def main_worker_binarization(current_gpu, config):
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (ExecutionMode.DISTRIBUTED,
                                                   ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)

    config.device = get_device(config)

    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    if config.seed is not None:
        manual_seed(config.seed)
        cudnn.deterministic = True
        cudnn.benchmark = False

    # create model
    model_name = config['model']
    weights = config.get('weights')
    model = load_model(model_name,
                       pretrained=config.get('pretrained', True) if weights is None else False,
                       num_classes=config.get('num_classes', 1000),
                       model_params=config.get('model_params'))
    original_model = copy.deepcopy(model)

    compression_ctrl, model = create_compressed_model(model, config)
    if not isinstance(compression_ctrl, BinarizationController):
        raise RuntimeError("The binarization sample worker may only be run with the binarization algorithm!")

    if weights:
        load_state(model, torch.load(weights, map_location='cpu'))

    model, _ = prepare_model_for_execution(model, config)
    original_model.to(config.device)

    if config.distributed:
        compression_ctrl.distributed()

    is_inception = 'inception' in model_name

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(config.device)

    params_to_optimize = model.parameters()

    compression_config = config['compression']
    binarization_config = compression_config if isinstance(compression_config, dict) else compression_config[0]
    optimizer = get_binarization_optimizer(params_to_optimize, binarization_config)
    optimizer_scheduler = BinarizationOptimizerScheduler(optimizer, binarization_config)
    kd_loss_calculator = KDLossCalculator(original_model)

    resuming_checkpoint = config.resuming_checkpoint
    best_acc1 = 0
    # optionally resume from a checkpoint
    if resuming_checkpoint is not None:
        model, config, optimizer, optimizer_scheduler, kd_loss_calculator, compression_ctrl, best_acc1 = \
            resume_from_checkpoint(resuming_checkpoint, model, config, optimizer, optimizer_scheduler,
                                   kd_loss_calculator, compression_ctrl)

    if config.to_onnx is not None:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if config.execution_mode != ExecutionMode.CPU_ONLY:
        cudnn.benchmark = True

    # Data loading code
    train_dataset, val_dataset = create_datasets(config)
    train_loader, train_sampler, val_loader = create_data_loaders(config, train_dataset, val_dataset)

    if config.mode.lower() == 'test':
        print_statistics(compression_ctrl.statistics())
        validate(val_loader, model, criterion, config)

    if config.mode.lower() == 'train':
        if not resuming_checkpoint:
            compression_ctrl.initialize(data_loader=train_loader, criterion=criterion)
        batch_multiplier = (binarization_config.get("params", {})).get("batch_multiplier", 1)
        train_bin(config, compression_ctrl, model, criterion, is_inception, optimizer_scheduler, model_name,
                  optimizer, train_loader, train_sampler, val_loader, kd_loss_calculator, batch_multiplier,
                  best_acc1)
def train_epoch_bin(train_loader, batch_multiplier, model, criterion, optimizer,
                    optimizer_scheduler: BinarizationOptimizerScheduler, kd_loss_calculator: KDLossCalculator,
                    compression_ctrl, epoch, config, is_inception=False):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    kd_losses_meter = AverageMeter()
    criterion_losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    compression_scheduler = compression_ctrl.scheduler

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input_, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        input_ = input_.to(config.device)
        target = target.to(config.device)

        # compute output
        if is_inception:
            # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
            output, aux_outputs = model(input_)
            loss1 = criterion(output, target)
            loss2 = criterion(aux_outputs, target)
            criterion_loss = loss1 + 0.4 * loss2
        else:
            output = model(input_)
            criterion_loss = criterion(output, target)

        # compute KD loss
        kd_loss = kd_loss_calculator.loss(input_, output)
        loss = criterion_loss + kd_loss

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), input_.size(0))
        comp_loss_val = kd_loss.item()
        kd_losses_meter.update(comp_loss_val, input_.size(0))
        criterion_losses.update(criterion_loss.item(), input_.size(0))
        top1.update(acc1, input_.size(0))
        top5.update(acc5, input_.size(0))

        # compute gradient and do SGD step (see the gradient-accumulation sketch after this function)
        if i % batch_multiplier == 0:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            loss.backward()

        compression_scheduler.step()
        optimizer_scheduler.step(float(i) / len(train_loader))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % config.print_freq == 0:
            logger.info(
                '{rank}: '
                'Epoch: [{0}][{1}/{2}] '
                'Lr: {3:.3} '
                'Wd: {4:.3} '
                'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) '
                'Data: {data_time.val:.3f} ({data_time.avg:.3f}) '
                'CE_loss: {ce_loss.val:.4f} ({ce_loss.avg:.4f}) '
                'KD_loss: {kd_loss.val:.4f} ({kd_loss.avg:.4f}) '
                'Loss: {loss.val:.4f} ({loss.avg:.4f}) '
                'Acc@1: {top1.val:.3f} ({top1.avg:.3f}) '
                'Acc@5: {top5.val:.3f} ({top5.avg:.3f})'.format(
                    epoch, i, len(train_loader), get_lr(optimizer), get_wd(optimizer), batch_time=batch_time,
                    data_time=data_time, ce_loss=criterion_losses, kd_loss=kd_losses_meter, loss=losses,
                    top1=top1, top5=top5,
                    rank='{}:'.format(config.rank) if config.multiprocessing_distributed else ''))

            if is_main_process():
                global_step = len(train_loader) * epoch
                config.tb.add_scalar("train/learning_rate", get_lr(optimizer), i + global_step)
                config.tb.add_scalar("train/criterion_loss", criterion_losses.avg, i + global_step)
                config.tb.add_scalar("train/kd_loss", kd_losses_meter.avg, i + global_step)
                config.tb.add_scalar("train/loss", losses.avg, i + global_step)
                config.tb.add_scalar("train/top1", top1.avg, i + global_step)
                config.tb.add_scalar("train/top5", top5.avg, i + global_step)

                for stat_name, stat_value in compression_ctrl.statistics().items():
                    if isinstance(stat_value, (int, float)):
                        config.tb.add_scalar('train/statistics/{}'.format(stat_name), stat_value,
                                             i + global_step)
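# For reference, the canonical gradient-accumulation formulation that batch_multiplier approximates: accumulate
# gradients over batch_multiplier mini-batches, then make one optimizer step. This generic sketch is not the
# exact scheme used by train_epoch_bin above; `model`, `criterion`, `optimizer`, `loader` and `batch_multiplier`
# are assumed to be defined as in the surrounding samples.
for i, (input_, target) in enumerate(loader):
    # scale the loss so the accumulated gradient matches a single large-batch step
    loss = criterion(model(input_), target) / batch_multiplier
    loss.backward()
    if (i + 1) % batch_multiplier == 0:
        optimizer.step()
        optimizer.zero_grad()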
def main_worker(current_gpu, config):
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (ExecutionMode.DISTRIBUTED,
                                                   ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)

    config.device = get_device(config)

    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    if config.seed is not None:
        manual_seed(config.seed)
        cudnn.deterministic = True
        cudnn.benchmark = False

    # create model
    model_name = config['model']
    weights = config.get('weights')
    model = load_model(model_name,
                       pretrained=config.get('pretrained', True) if weights is None else False,
                       num_classes=config.get('num_classes', 1000),
                       model_params=config.get('model_params'))

    compression_ctrl, model = create_compressed_model(model, config)

    if weights:
        load_state(model, torch.load(weights, map_location='cpu'))

    model, _ = prepare_model_for_execution(model, config)

    if config.distributed:
        compression_ctrl.distributed()

    is_inception = 'inception' in model_name

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(config.device)

    params_to_optimize = get_parameter_groups(model, config)
    optimizer, lr_scheduler = make_optimizer(params_to_optimize, config)

    resuming_checkpoint = config.resuming_checkpoint
    best_acc1 = 0
    # optionally resume from a checkpoint
    if resuming_checkpoint is not None:
        model, config, optimizer, compression_ctrl, best_acc1 = \
            resume_from_checkpoint(resuming_checkpoint, model, config, optimizer, compression_ctrl)

    if config.to_onnx is not None:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if config.execution_mode != ExecutionMode.CPU_ONLY:
        cudnn.benchmark = True

    # Data loading code
    train_dataset, val_dataset = create_datasets(config)
    train_loader, train_sampler, val_loader = create_data_loaders(config, train_dataset, val_dataset)

    if config.mode.lower() == 'test':
        print_statistics(compression_ctrl.statistics())
        validate(val_loader, model, criterion, config)

    if config.mode.lower() == 'train':
        if not resuming_checkpoint:
            compression_ctrl.initialize(data_loader=train_loader, criterion=criterion)
        train(config, compression_ctrl, model, criterion, is_inception, lr_scheduler, model_name, optimizer,
              train_loader, train_sampler, val_loader, best_acc1)
def _is_enabled(self):
    return self.is_suitable_mode and is_main_process()
def main_worker(current_gpu, config):
    #################################
    # Setup experiment environment
    #################################
    configure_device(current_gpu, config)
    config.mlflow = SafeMLFLow(config)
    if is_on_first_rank(config):
        configure_logging(logger, config)
        print_args(config)

    config.start_iter = 0
    nncf_config = config.nncf_config

    ##########################
    # Prepare metrics log file
    ##########################
    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    ###########################
    # Criterion
    ###########################
    criterion = MultiBoxLoss(config, config['num_classes'], overlap_thresh=0.5, prior_for_matching=True,
                             bkg_label=0, neg_mining=True, neg_pos=3, neg_overlap=0.5, encode_target=False,
                             device=config.device)

    train_data_loader = test_data_loader = None
    resuming_checkpoint_path = config.resuming_checkpoint_path

    ###########################
    # Prepare data
    ###########################
    pretrained = is_pretrained_model_requested(config)

    if config.to_onnx is not None:
        assert pretrained or (resuming_checkpoint_path is not None)
    else:
        test_data_loader, train_data_loader, init_data_loader = create_dataloaders(config)

        def criterion_fn(model_outputs, target, criterion):
            loss_l, loss_c = criterion(model_outputs, target)
            return loss_l + loss_c

        def autoq_test_fn(model, eval_loader):
            # RL is maximization, so flip the loss polarity
            return -1 * test_net(model, config.device, eval_loader, distributed=config.distributed,
                                 loss_inference=True, criterion=criterion)

        nncf_config = register_default_init_args(nncf_config, init_data_loader, criterion, criterion_fn,
                                                 autoq_test_fn, test_data_loader, config.device)

    ##################
    # Prepare model
    ##################
    resuming_checkpoint_path = config.resuming_checkpoint_path
    resuming_model_sd = None
    if resuming_checkpoint_path is not None:
        resuming_model_sd, resuming_checkpoint = load_resuming_model_state_dict_and_checkpoint_from_path(
            resuming_checkpoint_path)

    compression_ctrl, net = create_model(config, resuming_model_sd)
    if config.distributed:
        config.batch_size //= config.ngpus_per_node
        config.workers //= config.ngpus_per_node
        compression_ctrl.distributed()

    ###########################
    # Optimizer
    ###########################
    params_to_optimize = get_parameter_groups(net, config)
    optimizer, lr_scheduler = make_optimizer(params_to_optimize, config)

    #################################
    # Load additional checkpoint data
    #################################
    if resuming_checkpoint_path is not None and config.mode.lower() == 'train' and config.to_onnx is None:
        compression_ctrl.scheduler.load_state_dict(resuming_checkpoint['scheduler'])
        optimizer.load_state_dict(resuming_checkpoint.get('optimizer', optimizer.state_dict()))
        config.start_iter = resuming_checkpoint.get('iter', 0) + 1

    log_common_mlflow_params(config)

    if config.to_onnx:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if is_main_process():
        print_statistics(compression_ctrl.statistics())

    if config.mode.lower() == 'test':
        with torch.no_grad():
            net.eval()
            if config['ssd_params'].get('loss_inference', False):
                model_loss = test_net(net, config.device, test_data_loader, distributed=config.distributed,
                                      loss_inference=True, criterion=criterion)
                logger.info("Final model loss: {:.3f}".format(model_loss))
            else:
                mAp = test_net(net, config.device, test_data_loader, distributed=config.distributed)
                if config.metrics_dump is not None:
                    write_metrics(mAp, config.metrics_dump)
            return

    train(net, compression_ctrl, train_data_loader, test_data_loader, criterion, optimizer, config, lr_scheduler)
def train_staged(config, compression_ctrl, model, criterion, is_inception, optimizer_scheduler, model_name,
                 optimizer, train_loader, train_sampler, val_loader, kd_loss_calculator, batch_multiplier,
                 best_acc1=0):
    best_compression_level = CompressionLevel.NONE
    for epoch in range(config.start_epoch, config.epochs):
        config.cur_epoch = epoch
        if config.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_epoch_staged(train_loader, batch_multiplier, model, criterion, optimizer, optimizer_scheduler,
                           kd_loss_calculator, compression_ctrl, epoch, config, is_inception)

        # compute compression algo statistics
        stats = compression_ctrl.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            acc1, _ = validate(val_loader, model, criterion, config)

        compression_level = compression_ctrl.compression_level()
        # remember the best acc@1, taking the compression level into account. Even if the current acc@1 is
        # lower than the best acc@1, the checkpoint can still be the best one if its compression level is higher
        # than the best level so far. Compression levels in ascending order: NONE, PARTIAL, FULL.
        # (A worked example follows this function.)
        is_best_by_accuracy = acc1 > best_acc1 and compression_level == best_compression_level
        is_best = is_best_by_accuracy or compression_level > best_compression_level
        best_acc1 = max(acc1, best_acc1)
        best_compression_level = max(compression_level, best_compression_level)

        # statistics (e.g. the portion of enabled quantizers) relate to the finished epoch, hence printing
        # should happen before epoch_step, which may report the state of the next epoch (e.g. the next portion
        # of enabled quantizers)
        if is_main_process():
            print_statistics(stats)

        # update compression scheduler state at the end of the epoch
        compression_ctrl.scheduler.epoch_step()
        optimizer_scheduler.epoch_step()

        if is_main_process():
            checkpoint_path = osp.join(config.checkpoint_save_dir, get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                'state_dict': model.state_dict(),
                'original_model_state_dict': kd_loss_calculator.original_model.state_dict(),
                'best_acc1': best_acc1,
                'compression_level': compression_level,
                'optimizer': optimizer.state_dict(),
                'compression_scheduler': compression_ctrl.scheduler.state_dict(),
                'optimizer_scheduler': optimizer_scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1, config)

            for key, value in stats.items():
                if isinstance(value, (int, float)):
                    config.tb.add_scalar("compression/statistics/{0}".format(key), value,
                                         len(train_loader) * epoch)
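# Worked example of the "best checkpoint" rule above (illustrative numbers). Suppose the best checkpoint so far
# has acc1 = 75.2 at compression_level == PARTIAL:
# - a new checkpoint with acc1 = 75.5 at PARTIAL is best by accuracy at the same level;
# - a new checkpoint with acc1 = 74.8 at FULL is best, because the higher compression level wins;
# - a new checkpoint with acc1 = 74.8 at PARTIAL is not best (lower accuracy, same level).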
def staged_quantization_main_worker(current_gpu, config):
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (ExecutionMode.DISTRIBUTED,
                                                   ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)

    config.device = get_device(config)

    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    if config.seed is not None:
        manual_seed(config.seed)
        cudnn.deterministic = True
        cudnn.benchmark = False

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(config.device)

    train_loader = train_sampler = val_loader = None
    resuming_checkpoint_path = config.resuming_checkpoint_path
    nncf_config = config.nncf_config

    pretrained = is_pretrained_model_requested(config)

    if config.to_onnx is not None:
        assert pretrained or (resuming_checkpoint_path is not None)
    else:
        # Data loading code
        train_dataset, val_dataset = create_datasets(config)
        train_loader, train_sampler, val_loader = create_data_loaders(config, train_dataset, val_dataset)
        nncf_config = register_default_init_args(nncf_config, criterion, train_loader)

    # create model
    model_name = config['model']
    model = load_model(model_name,
                       pretrained=pretrained,
                       num_classes=config.get('num_classes', 1000),
                       model_params=config.get('model_params'),
                       weights_path=config.get('weights'))
    original_model = copy.deepcopy(model)

    model.to(config.device)

    resuming_model_sd = None
    resuming_checkpoint = None
    if resuming_checkpoint_path is not None:
        resuming_checkpoint = load_resuming_checkpoint(resuming_checkpoint_path)
        resuming_model_sd = resuming_checkpoint['state_dict']

    compression_ctrl, model = create_compressed_model(model, nncf_config, resuming_model_sd)
    if not isinstance(compression_ctrl, (BinarizationController, QuantizationController)):
        raise RuntimeError(
            "The staged quantization sample worker may only be run with the binarization and quantization "
            "algorithms!")

    model, _ = prepare_model_for_execution(model, config)
    original_model.to(config.device)

    if config.distributed:
        compression_ctrl.distributed()

    is_inception = 'inception' in model_name

    params_to_optimize = model.parameters()

    compression_config = config['compression']
    quantization_config = compression_config if isinstance(compression_config, dict) else compression_config[0]
    optimizer = get_quantization_optimizer(params_to_optimize, quantization_config)
    optimizer_scheduler = PolyLRDropScheduler(optimizer, quantization_config)
    kd_loss_calculator = KDLossCalculator(original_model)

    best_acc1 = 0
    # optionally resume from a checkpoint
    if resuming_checkpoint is not None and config.to_onnx is None:
        config.start_epoch = resuming_checkpoint['epoch']
        best_acc1 = resuming_checkpoint['best_acc1']
        kd_loss_calculator.original_model.load_state_dict(resuming_checkpoint['original_model_state_dict'])
        compression_ctrl.scheduler.load_state_dict(resuming_checkpoint['compression_scheduler'])
        optimizer.load_state_dict(resuming_checkpoint['optimizer'])
        optimizer_scheduler.load_state_dict(resuming_checkpoint['optimizer_scheduler'])
        if config.mode.lower() == 'train':
            logger.info("=> loaded checkpoint '{}' (epoch: {}, best_acc1: {:.3f})"
                        .format(resuming_checkpoint_path, resuming_checkpoint['epoch'], best_acc1))
        else:
            logger.info("=> loaded checkpoint '{}'".format(resuming_checkpoint_path))

    if config.to_onnx:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if config.execution_mode != ExecutionMode.CPU_ONLY:
        cudnn.benchmark = True

    if config.mode.lower() == 'test':
        print_statistics(compression_ctrl.statistics())
        validate(val_loader, model, criterion, config)

    if config.mode.lower() == 'train':
        batch_multiplier = (quantization_config.get("params", {})).get("batch_multiplier", 1)
        train_staged(config, compression_ctrl, model, criterion, is_inception, optimizer_scheduler, model_name,
                     optimizer, train_loader, train_sampler, val_loader, kd_loss_calculator, batch_multiplier,
                     best_acc1)
def main_worker(current_gpu, config):
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (ExecutionMode.DISTRIBUTED,
                                                   ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)

    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    logger.info(config)

    config.device = get_device(config)
    dataset = get_dataset(config.dataset)
    color_encoding = dataset.color_encoding
    num_classes = len(color_encoding)

    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    train_loader = val_loader = criterion = None
    resuming_checkpoint_path = config.resuming_checkpoint_path
    nncf_config = config.nncf_config

    pretrained = is_pretrained_model_requested(config)

    if config.to_onnx is not None:
        assert pretrained or (resuming_checkpoint_path is not None)
    else:
        loaders, w_class = load_dataset(dataset, config)
        train_loader, val_loader = loaders
        criterion = get_criterion(w_class, config)
        if not resuming_checkpoint_path:
            nncf_config = register_default_init_args(nncf_config, criterion, train_loader)

    model = load_model(config.model,
                       pretrained=pretrained,
                       num_classes=num_classes,
                       model_params=config.get('model_params', {}),
                       weights_path=config.get('weights'))

    model.to(config.device)
    compression_ctrl, model = create_compressed_model(model, nncf_config)
    model, model_without_dp = prepare_model_for_execution(model, config)

    if config.distributed:
        compression_ctrl.distributed()

    if resuming_checkpoint_path:
        if not config.pretrained:
            # Load the previously saved model state
            model, _, _, _, _ = \
                load_checkpoint(model, resuming_checkpoint_path, config.device,
                                compression_scheduler=compression_ctrl.scheduler)

    if config.to_onnx:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if config.mode.lower() == 'test':
        logger.info(model)
        model_parameters = filter(lambda p: p.requires_grad, model.parameters())
        params = sum([np.prod(p.size()) for p in model_parameters])
        logger.info("Trainable parameter count: {params}".format(params=params))

        model = model.to(config.device)
        test(model, val_loader, criterion, color_encoding, config)
        print_statistics(compression_ctrl.statistics())
    elif config.mode.lower() == 'train':
        train(model, model_without_dp, compression_ctrl, train_loader, val_loader, criterion, color_encoding,
              config)
    else:
        # Should never happen...but just in case it does
        raise RuntimeError("\"{0}\" is not a valid choice for execution mode.".format(config.mode))
def train_epoch(train_loader, model, criterion, criterion_fn, optimizer, compression_ctrl, epoch, config):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    compression_losses = AverageMeter()
    criterion_losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    compression_scheduler = compression_ctrl.scheduler

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input_, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        compression_scheduler.step()
        input_ = input_.to(config.device)
        target = target.to(config.device)

        # compute output
        output = model(input_)
        criterion_loss = criterion_fn(output, target, criterion)

        # compute compression loss
        compression_loss = compression_ctrl.loss()
        loss = criterion_loss + compression_loss

        if isinstance(output, InceptionOutputs):
            output = output.logits

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), input_.size(0))
        comp_loss_val = compression_loss.item() if isinstance(compression_loss, torch.Tensor) \
            else compression_loss
        compression_losses.update(comp_loss_val, input_.size(0))
        criterion_losses.update(criterion_loss.item(), input_.size(0))
        top1.update(acc1, input_.size(0))
        top5.update(acc5, input_.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % config.print_freq == 0:
            logger.info('{rank}: '
                        'Epoch: [{0}][{1}/{2}] '
                        'Lr: {3:.3} '
                        'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) '
                        'Data: {data_time.val:.3f} ({data_time.avg:.3f}) '
                        'CE_loss: {ce_loss.val:.4f} ({ce_loss.avg:.4f}) '
                        'CR_loss: {cr_loss.val:.4f} ({cr_loss.avg:.4f}) '
                        'Loss: {loss.val:.4f} ({loss.avg:.4f}) '
                        'Acc@1: {top1.val:.3f} ({top1.avg:.3f}) '
                        'Acc@5: {top5.val:.3f} ({top5.avg:.3f})'.format(
                            epoch, i, len(train_loader), get_lr(optimizer), batch_time=batch_time,
                            data_time=data_time, ce_loss=criterion_losses, cr_loss=compression_losses,
                            loss=losses, top1=top1, top5=top5,
                            rank='{}:'.format(config.rank) if config.multiprocessing_distributed else ''))

            if is_main_process():
                global_step = len(train_loader) * epoch
                config.tb.add_scalar("train/learning_rate", get_lr(optimizer), i + global_step)
                config.tb.add_scalar("train/criterion_loss", criterion_losses.avg, i + global_step)
                config.tb.add_scalar("train/compression_loss", compression_losses.avg, i + global_step)
                config.tb.add_scalar("train/loss", losses.avg, i + global_step)
                config.tb.add_scalar("train/top1", top1.avg, i + global_step)
                config.tb.add_scalar("train/top5", top5.avg, i + global_step)

                for stat_name, stat_value in compression_ctrl.statistics(quickly_collected_only=True).items():
                    if isinstance(stat_value, (int, float)):
                        config.tb.add_scalar('train/statistics/{}'.format(stat_name), stat_value,
                                             i + global_step)
def train(config, compression_ctrl, model, criterion, criterion_fn, lr_scheduler, model_name, optimizer,
          train_loader, train_sampler, val_loader, best_acc1=0):
    best_compression_level = CompressionLevel.NONE
    for epoch in range(config.start_epoch, config.epochs):
        # update compression scheduler state at the beginning of the epoch
        compression_ctrl.scheduler.epoch_step()

        config.cur_epoch = epoch
        if config.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        train_epoch(train_loader, model, criterion, criterion_fn, optimizer, compression_ctrl, epoch, config)

        # Learning rate scheduling should be applied after optimizer's update
        lr_scheduler.step(epoch if not isinstance(lr_scheduler, ReduceLROnPlateau) else best_acc1)

        # compute compression algo statistics
        stats = compression_ctrl.statistics()

        acc1 = best_acc1
        if epoch % config.test_every_n_epochs == 0:
            # evaluate on validation set
            acc1, _ = validate(val_loader, model, criterion, config)

        compression_level = compression_ctrl.compression_level()
        # remember the best acc@1, taking the compression level into account. Even if the current acc@1 is
        # lower than the best acc@1, the checkpoint can still be the best one if its compression level is higher
        # than the best level so far. Compression levels in ascending order: NONE, PARTIAL, FULL.
        is_best_by_accuracy = acc1 > best_acc1 and compression_level == best_compression_level
        is_best = is_best_by_accuracy or compression_level > best_compression_level
        if is_best:
            best_acc1 = acc1
            config.mlflow.safe_call('log_metric', "best_acc1", best_acc1)
        best_compression_level = max(compression_level, best_compression_level)
        acc = best_acc1 / 100
        if config.metrics_dump is not None:
            write_metrics(acc, config.metrics_dump)

        if is_main_process():
            print_statistics(stats)

            checkpoint_path = osp.join(config.checkpoint_save_dir, get_name(config) + '_last.pth')
            checkpoint = {
                'epoch': epoch + 1,
                'arch': model_name,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'compression_level': compression_level,
                'acc1': acc1,
                'optimizer': optimizer.state_dict(),
                'scheduler': compression_ctrl.scheduler.state_dict()
            }

            torch.save(checkpoint, checkpoint_path)
            make_additional_checkpoints(checkpoint_path, is_best, epoch + 1, config)

            for key, value in stats.items():
                if isinstance(value, (int, float)):
                    config.mlflow.safe_call('log_metric', 'compression/statistics/{0}'.format(key), value,
                                            epoch)
                    config.tb.add_scalar("compression/statistics/{0}".format(key), value,
                                         len(train_loader) * epoch)
def train(model, model_without_dp, compression_ctrl, train_loader, val_loader, criterion, class_encoding,
          config, resuming_checkpoint):
    logger.info("\nTraining...\n")

    # Check that the network architecture is correct
    logger.info(model)

    optim_config = config.get('optimizer', {})
    optim_params = optim_config.get('optimizer_params', {})
    lr = optim_params.get("lr", 1e-4)

    params_to_optimize = get_params_to_optimize(model_without_dp, lr * 10, config)
    optimizer, lr_scheduler = make_optimizer(params_to_optimize, config)

    # Evaluation metric
    ignore_index = None
    ignore_unlabeled = config.get("ignore_unlabeled", True)
    if ignore_unlabeled and ('unlabeled' in class_encoding):
        ignore_index = list(class_encoding).index('unlabeled')

    metric = IoU(len(class_encoding), ignore_index=ignore_index)

    best_miou = -1
    best_compression_level = CompressionLevel.NONE
    # Optionally resume from a checkpoint
    if resuming_checkpoint is not None:
        if optimizer is not None:
            optimizer.load_state_dict(resuming_checkpoint['optimizer'])
        start_epoch = resuming_checkpoint['epoch']
        best_miou = resuming_checkpoint['miou']

        if "scheduler" in resuming_checkpoint and compression_ctrl.scheduler is not None:
            compression_ctrl.scheduler.load_state_dict(resuming_checkpoint['scheduler'])

        logger.info("Resuming from model: Start epoch = {0} | Best mean IoU = {1:.4f}"
                    .format(start_epoch, best_miou))
        config.start_epoch = start_epoch

    # Start Training
    train_obj = Train(model, train_loader, optimizer, criterion, compression_ctrl, metric, config.device,
                      config.model)
    val_obj = Test(model, val_loader, criterion, metric, config.device, config.model)

    for epoch in range(config.start_epoch, config.epochs):
        compression_ctrl.scheduler.epoch_step()
        logger.info(">>>> [Epoch: {0:d}] Training".format(epoch))

        if config.distributed:
            train_loader.sampler.set_epoch(epoch)

        epoch_loss, (iou, miou) = train_obj.run_epoch(config.print_step)
        if not isinstance(lr_scheduler, ReduceLROnPlateau):
            # Learning rate scheduling should be applied after optimizer's update
            lr_scheduler.step(epoch)

        logger.info(">>>> [Epoch: {0:d}] Avg. loss: {1:.4f} | Mean IoU: {2:.4f}"
                    .format(epoch, epoch_loss, miou))

        if is_main_process():
            config.tb.add_scalar("train/loss", epoch_loss, epoch)
            config.tb.add_scalar("train/mIoU", miou, epoch)
            config.tb.add_scalar("train/learning_rate", optimizer.param_groups[0]['lr'], epoch)
            config.tb.add_scalar("train/compression_loss", compression_ctrl.loss(), epoch)

            for key, value in compression_ctrl.statistics(quickly_collected_only=True).items():
                if isinstance(value, (int, float)):
                    config.tb.add_scalar("compression/statistics/{0}".format(key), value, epoch)

        if (epoch + 1) % config.save_freq == 0 or epoch + 1 == config.epochs:
            logger.info(">>>> [Epoch: {0:d}] Validation".format(epoch))

            loss, (iou, miou) = val_obj.run_epoch(config.print_step)

            logger.info(">>>> [Epoch: {0:d}] Avg. loss: {1:.4f} | Mean IoU: {2:.4f}"
                        .format(epoch, loss, miou))

            if is_main_process():
                config.tb.add_scalar("val/mIoU", miou, epoch)
                config.tb.add_scalar("val/loss", loss, epoch)
                for i, (key, class_iou) in enumerate(zip(class_encoding.keys(), iou)):
                    config.tb.add_scalar("{}/mIoU_Cls{}_{}".format(config.dataset, i, key), class_iou, epoch)

            compression_level = compression_ctrl.compression_level()
            is_best_by_miou = miou > best_miou and compression_level == best_compression_level
            is_best = is_best_by_miou or compression_level > best_compression_level
            if is_best:
                best_miou = miou
            best_compression_level = max(compression_level, best_compression_level)

            if config.metrics_dump is not None:
                write_metrics(best_miou, config.metrics_dump)

            if isinstance(lr_scheduler, ReduceLROnPlateau):
                # Learning rate scheduling should be applied after optimizer's update
                lr_scheduler.step(best_miou)

            # Print per-class IoU on the last epoch or if this is the best IoU
            if epoch + 1 == config.epochs or is_best:
                for key, class_iou in zip(class_encoding.keys(), iou):
                    logger.info("{0}: {1:.4f}".format(key, class_iou))

            # Save the model if it's the best thus far
            if is_main_process():
                checkpoint_path = save_checkpoint(model, optimizer, epoch, best_miou, compression_level,
                                                  compression_ctrl.scheduler, config)

                make_additional_checkpoints(checkpoint_path, is_best, epoch, config)
                print_statistics(compression_ctrl.statistics())

    return model
def main_worker(current_gpu, config):
    configure_device(current_gpu, config)
    config.mlflow = SafeMLFLow(config)
    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    logger.info(config)

    dataset = get_dataset(config.dataset)
    color_encoding = dataset.color_encoding
    num_classes = len(color_encoding)

    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    train_loader = val_loader = criterion = None
    resuming_checkpoint_path = config.resuming_checkpoint_path
    nncf_config = config.nncf_config

    pretrained = is_pretrained_model_requested(config)

    def criterion_fn(model_outputs, target, criterion_):
        labels, loss_outputs, _ = \
            loss_funcs.do_model_specific_postprocessing(config.model, target, model_outputs)
        return criterion_(loss_outputs, labels)

    if config.to_onnx is not None:
        assert pretrained or (resuming_checkpoint_path is not None)
    else:
        loaders, w_class = load_dataset(dataset, config)
        train_loader, val_loader, init_loader = loaders
        criterion = get_criterion(w_class, config)

        def autoq_test_fn(model, eval_loader):
            return test(model, eval_loader, criterion, color_encoding, config)

        nncf_config = register_default_init_args(nncf_config, init_loader, criterion, criterion_fn,
                                                 autoq_test_fn, val_loader, config.device)

    model = load_model(config.model,
                       pretrained=pretrained,
                       num_classes=num_classes,
                       model_params=config.get('model_params', {}),
                       weights_path=config.get('weights'))

    model.to(config.device)

    resuming_model_sd = None
    resuming_checkpoint = None
    if resuming_checkpoint_path is not None:
        resuming_model_sd, resuming_checkpoint = load_resuming_model_state_dict_and_checkpoint_from_path(
            resuming_checkpoint_path)

    compression_ctrl, model = create_compressed_model(model, nncf_config,
                                                      resuming_state_dict=resuming_model_sd)
    model, model_without_dp = prepare_model_for_execution(model, config)

    if config.distributed:
        compression_ctrl.distributed()

    log_common_mlflow_params(config)

    if config.to_onnx:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if is_main_process():
        print_statistics(compression_ctrl.statistics())

    if config.mode.lower() == 'test':
        logger.info(model)
        model_parameters = filter(lambda p: p.requires_grad, model.parameters())
        params = sum([np.prod(p.size()) for p in model_parameters])
        logger.info("Trainable parameter count: {params}".format(params=params))

        model = model.to(config.device)
        test(model, val_loader, criterion, color_encoding, config)
    elif config.mode.lower() == 'train':
        train(model, model_without_dp, compression_ctrl, train_loader, val_loader, criterion, color_encoding,
              config, resuming_checkpoint)
    else:
        # Should never happen...but just in case it does
        raise RuntimeError("\"{0}\" is not a valid choice for execution mode.".format(config.mode))
def main_worker(current_gpu, config: SampleConfig):
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (ExecutionMode.DISTRIBUTED,
                                                   ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)

    config.device = get_device(config)

    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    if config.seed is not None:
        manual_seed(config.seed)
        cudnn.deterministic = True
        cudnn.benchmark = False

    # define loss function (criterion)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(config.device)

    train_loader = train_sampler = val_loader = None
    resuming_checkpoint_path = config.resuming_checkpoint_path
    nncf_config = config.nncf_config

    pretrained = is_pretrained_model_requested(config)

    if config.to_onnx is not None:
        assert pretrained or (resuming_checkpoint_path is not None)
    else:
        # Data loading code
        train_dataset, val_dataset = create_datasets(config)
        train_loader, train_sampler, val_loader = create_data_loaders(config, train_dataset, val_dataset)
        nncf_config = register_default_init_args(nncf_config, criterion, train_loader)

    # create model
    model_name = config['model']
    model = load_model(model_name,
                       pretrained=pretrained,
                       num_classes=config.get('num_classes', 1000),
                       model_params=config.get('model_params'),
                       weights_path=config.get('weights'))

    model.to(config.device)

    resuming_model_sd = None
    resuming_checkpoint = None
    if resuming_checkpoint_path is not None:
        resuming_checkpoint = load_resuming_checkpoint(resuming_checkpoint_path)
        resuming_model_sd = resuming_checkpoint['state_dict']

    compression_ctrl, model = create_compressed_model(model, nncf_config,
                                                      resuming_state_dict=resuming_model_sd)

    if config.to_onnx:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    model, _ = prepare_model_for_execution(model, config)
    if config.distributed:
        compression_ctrl.distributed()

    # define optimizer
    params_to_optimize = get_parameter_groups(model, config)
    optimizer, lr_scheduler = make_optimizer(params_to_optimize, config)

    best_acc1 = 0
    # optionally resume from a checkpoint
    if resuming_checkpoint_path is not None:
        if config.mode.lower() == 'train' and config.to_onnx is None:
            config.start_epoch = resuming_checkpoint['epoch']
            best_acc1 = resuming_checkpoint['best_acc1']
            compression_ctrl.scheduler.load_state_dict(resuming_checkpoint['scheduler'])
            optimizer.load_state_dict(resuming_checkpoint['optimizer'])
            logger.info("=> loaded checkpoint '{}' (epoch: {}, best_acc1: {:.3f})"
                        .format(resuming_checkpoint_path, resuming_checkpoint['epoch'], best_acc1))
        else:
            logger.info("=> loaded checkpoint '{}'".format(resuming_checkpoint_path))

    if config.execution_mode != ExecutionMode.CPU_ONLY:
        cudnn.benchmark = True

    if config.mode.lower() == 'test':
        print_statistics(compression_ctrl.statistics())
        validate(val_loader, model, criterion, config)

    if config.mode.lower() == 'train':
        is_inception = 'inception' in model_name
        train(config, compression_ctrl, model, criterion, is_inception, lr_scheduler, model_name, optimizer,
              train_loader, train_sampler, val_loader, best_acc1)
def create_compressed_model(model: Module, config: NNCFConfig,
                            resuming_state_dict: dict = None,
                            dummy_forward_fn: Callable[[Module], Any] = None,
                            dump_graphs=True) \
        -> Tuple[CompressionAlgorithmController, NNCFNetwork]:
    """
    The main function used to produce a model ready for compression fine-tuning from an original PyTorch
    model and a configuration object.

    :param model: The original model. Should have its parameters already loaded from a checkpoint or another
        source.
    :param config: A configuration object used to determine the exact compression modifications to be applied
        to the model.
    :param resuming_state_dict: A PyTorch state dict object to load (strictly) into the compressed model after
        building.
    :param dummy_forward_fn: if supplied, will be used instead of a *forward* function call to build the
        internal graph representation via tracing. Specifying this is useful when the original training pipeline
        has special formats of data loader output or has additional *forward* arguments other than input tensors.
        Otherwise, the *forward* call of the model during graph tracing will be made with mock tensors according
        to the shape specified in the config object.
    :param dump_graphs: Whether to dump the internal graph representation of the original and compressed models
        in the .dot format into the log directory.
    :return: A controller for the compression algorithm (or algorithms, in which case the controller is an
        instance of CompositeCompressionController) and the model ready for compression parameter training wrapped
        as an object of NNCFNetwork.
    """
    if dump_graphs:
        if dummy_forward_fn is None:
            input_info_list = create_input_infos(config)
            graph_builder = GraphBuilder(custom_forward_fn=create_dummy_forward_fn(input_info_list,
                                                                                   with_input_tracing=True))
        else:
            graph_builder = GraphBuilder(custom_forward_fn=dummy_forward_fn)

        if is_main_process():
            graph = graph_builder.build_graph(model)
            graph.dump_graph(osp.join(config.get("log_dir", "."), "original_graph.dot"), extended=True)

    if is_debug():
        set_debug_log_dir(config.get("log_dir", "."))

    input_info_list = create_input_infos(config)
    scopes_without_shape_matching = config.get('scopes_without_shape_matching', [])
    ignored_scopes = config.get('ignored_scopes')
    target_scopes = config.get('target_scopes')

    compressed_model = NNCFNetwork(model, input_infos=input_info_list,
                                   dummy_forward_fn=dummy_forward_fn,
                                   ignored_scopes=ignored_scopes,
                                   target_scopes=target_scopes,
                                   scopes_without_shape_matching=scopes_without_shape_matching)

    should_init = resuming_state_dict is None
    compression_algo_builder_list = create_compression_algorithm_builders(config, should_init=should_init)

    for builder in compression_algo_builder_list:
        compressed_model = builder.apply_to(compressed_model)
    compression_ctrl = compressed_model.commit_compression_changes()

    if dump_graphs and is_main_process() and compression_algo_builder_list:
        if dummy_forward_fn is None:
            compressed_graph_builder = GraphBuilder(
                custom_forward_fn=create_dummy_forward_fn(input_info_list, with_input_tracing=False))
        else:
            compressed_graph_builder = GraphBuilder(custom_forward_fn=dummy_forward_fn)

        graph = compressed_graph_builder.build_graph(compressed_model, compressed_model.get_tracing_context())
        graph.dump_graph(osp.join(config.get("log_dir", "."), "compressed_graph.dot"), extended=True)

    if resuming_state_dict is not None:
        load_state(compressed_model, resuming_state_dict, is_resume=True)

    return compression_ctrl, compressed_model
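# A minimal sketch of a custom dummy_forward_fn matching the Callable[[Module], Any] signature described in the
# docstring above, for a pipeline whose model expects a single image tensor. The input shape is an assumed
# example; in real use it should match what the training data loader produces.
import torch


def dummy_forward(model):
    mock_input = torch.randn(1, 3, 224, 224)  # mock tensor standing in for a data loader batch
    return model(mock_input)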
def train(net, compression_ctrl, train_data_loader, test_data_loader, criterion, optimizer, config,
          lr_scheduler):
    net.train()
    # loss counters
    loc_loss = 0  # epoch
    conf_loss = 0

    epoch_size = len(train_data_loader)
    logger.info('Training {} on {} dataset...'.format(config.model, train_data_loader.dataset.name))
    batch_iterator = None

    t_start = time.time()
    best_mAp = 0
    best_compression_level = CompressionLevel.NONE
    test_freq_in_epochs = max(config.test_interval // epoch_size, 1)

    for iteration in range(config.start_iter, config['max_iter']):
        if (not batch_iterator) or (iteration % epoch_size == 0):
            # create batch iterator
            batch_iterator = iter(train_data_loader)

        epoch = iteration // epoch_size

        compression_ctrl.scheduler.step()
        if iteration % epoch_size == 0:
            compression_ctrl.scheduler.epoch_step(epoch)
            if is_main_process():
                print_statistics(compression_ctrl.statistics())

        if (iteration + 1) % epoch_size == 0:
            compression_level = compression_ctrl.compression_level()
            is_best = False

            if (epoch + 1) % test_freq_in_epochs == 0:
                with torch.no_grad():
                    net.eval()
                    mAP = test_net(net, config.device, test_data_loader,
                                   distributed=config.multiprocessing_distributed)
                    is_best_by_mAP = mAP > best_mAp and compression_level == best_compression_level
                    is_best = is_best_by_mAP or compression_level > best_compression_level
                    if is_best:
                        best_mAp = mAP
                    best_compression_level = max(compression_level, best_compression_level)
                    if isinstance(lr_scheduler, ReduceLROnPlateau):
                        lr_scheduler.step(mAP)
                    net.train()

            if is_on_first_rank(config):
                logger.info('Saving state, iter: {}'.format(iteration))

                checkpoint_file_path = osp.join(config.checkpoint_save_dir,
                                                "{}_last.pth".format(get_name(config)))
                torch.save({
                    'state_dict': net.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'iter': iteration,
                    'scheduler': compression_ctrl.scheduler.state_dict(),
                    'compression_level': compression_level,
                }, str(checkpoint_file_path))
                make_additional_checkpoints(checkpoint_file_path, is_best=is_best, epoch=epoch + 1,
                                            config=config)

            # Learning rate scheduling should be applied after optimizer's update
            if not isinstance(lr_scheduler, ReduceLROnPlateau):
                lr_scheduler.step(epoch)

        optimizer.zero_grad()
        batch_iterator, batch_loss, batch_loss_c, batch_loss_l, loss_comp = train_step(
            batch_iterator, compression_ctrl, config, criterion, net, train_data_loader)
        optimizer.step()

        batch_loss_l = batch_loss_l / config.iter_size
        batch_loss_c = batch_loss_c / config.iter_size
        model_loss = (batch_loss_l + batch_loss_c) / config.iter_size
        batch_loss = batch_loss / config.iter_size

        loc_loss += batch_loss_l.item()
        conf_loss += batch_loss_c.item()

        ###########################
        # Logging
        ###########################
        if is_on_first_rank(config):
            config.tb.add_scalar("train/loss_l", batch_loss_l.item(), iteration)
            config.tb.add_scalar("train/loss_c", batch_loss_c.item(), iteration)
            config.tb.add_scalar("train/loss", batch_loss.item(), iteration)

        if iteration % config.print_freq == 0:
            t_finish = time.time()
            t_elapsed = t_finish - t_start
            t_start = time.time()
            logger.info('{}: iter {} epoch {} || Loss: {:.4} || Time {:.4}s || lr: {} || CR loss: {}'.format(
                config.rank, iteration, epoch, model_loss.item(), t_elapsed,
                optimizer.param_groups[0]['lr'],
                loss_comp.item() if isinstance(loss_comp, torch.Tensor) else loss_comp))

    if config.metrics_dump is not None:
        write_metrics(best_mAp, config.metrics_dump)