def load_model(model, pretrained=True, num_classes=1000, model_params=None,
               weights_path: str = None) -> torch.nn.Module:
    """
    ** WARNING: This is implemented using torch.load functionality,
    which itself uses Python's pickling facilities that may be used to perform
    arbitrary code execution during unpickling. Only load the data you trust.
    """
    logger.info("Loading model: {}".format(model))
    if model_params is None:
        model_params = {}
    if model in torchvision.models.__dict__:
        load_model_fn = partial(torchvision.models.__dict__[model],
                                num_classes=num_classes, pretrained=pretrained,
                                **model_params)
    elif model in custom_models.__dict__:
        load_model_fn = partial(custom_models.__dict__[model],
                                num_classes=num_classes, pretrained=pretrained,
                                **model_params)
    else:
        raise Exception("Undefined model name")
    loaded_model = safe_thread_call(load_model_fn)
    if not pretrained and weights_path is not None:
        sd = torch.load(weights_path, map_location='cpu')
        load_state(loaded_model, sd, is_resume=False)
    return loaded_model

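# A hypothetical usage sketch of the loader above; the model name, class
# count, and weights path are illustrative values, not ones from this repo.
model = load_model('resnet18', pretrained=False, num_classes=10,
                   weights_path='checkpoints/resnet18_cifar10.pth')
model.eval()
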
def predict_detections(data_loader, device, net):
    num_batches = len(data_loader)
    all_detections = []
    timer = Timer()
    for batch_ind, (ims, _gts, hs, ws) in enumerate(data_loader):
        x = ims.to(device)
        hs = x.new_tensor(hs).view(-1, 1)
        ws = x.new_tensor(ws).view(-1, 1)

        timer.tic()
        batch_detections = net(x)
        top_k = batch_detections.size(2)
        batch_detections = batch_detections.view(-1, top_k, 7)
        detect_time = timer.toc(average=False)

        # Scale the normalized box coordinates back to original image pixels
        batch_detections[..., 3] *= ws
        batch_detections[..., 5] *= ws
        batch_detections[..., 4] *= hs
        batch_detections[..., 6] *= hs

        all_detections.append(batch_detections.cpu())
        logger.info('Detect for batch: {:d}/{:d} {:.3f}s'.format(
            batch_ind + 1, num_batches, detect_time))
    if all_detections:
        return torch.cat(all_detections)
    return None  # No predictions

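# A toy sketch of the de-normalization above, under the assumption that each
# detection row is [image_idx, class_id, score, x_min, y_min, x_max, y_max]
# with coordinates normalized to [0, 1] (the column semantics are inferred
# from the scaling pattern, not stated in the repo snippet shown here).
import torch

dets = torch.tensor([[[0., 1., 0.9, 0.10, 0.20, 0.50, 0.60]]])  # one 640x480 image
ws = torch.tensor([[640.]])
hs = torch.tensor([[480.]])
dets[..., 3] *= ws  # x_min -> 64.0
dets[..., 5] *= ws  # x_max -> 320.0
dets[..., 4] *= hs  # y_min -> 96.0
dets[..., 6] *= hs  # y_max -> 288.0
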
def load_resuming_checkpoint(resuming_checkpoint_path: str):
    if osp.isfile(resuming_checkpoint_path):
        logger.info("=> loading checkpoint '{}'".format(resuming_checkpoint_path))
        checkpoint = torch.load(resuming_checkpoint_path, map_location='cpu')
        return checkpoint
    raise FileNotFoundError("no checkpoint found at '{}'".format(resuming_checkpoint_path))

def load_resuming_model_state_dict_and_checkpoint_from_path(resuming_checkpoint_path):
    logger.info('Resuming from checkpoint {}...'.format(resuming_checkpoint_path))
    resuming_checkpoint = torch.load(resuming_checkpoint_path, map_location='cpu')
    # use checkpoint itself in case only the state dict was saved,
    # i.e. the checkpoint was created with `torch.save(module.state_dict())`
    resuming_model_state_dict = resuming_checkpoint.get('state_dict', resuming_checkpoint)
    return resuming_model_state_dict, resuming_checkpoint

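# Both saving conventions handled above, shown side by side; `model` stands
# for any torch.nn.Module and the file names are illustrative only.
torch.save(model.state_dict(), 'bare_sd.pth')                            # bare state dict
torch.save({'state_dict': model.state_dict(), 'epoch': 7}, 'full.pth')   # full checkpoint

sd, ckpt = load_resuming_model_state_dict_and_checkpoint_from_path('bare_sd.pth')
sd, ckpt = load_resuming_model_state_dict_and_checkpoint_from_path('full.pth')
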
def validate(val_loader, model, criterion, config):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (input_, target) in enumerate(val_loader):
            input_ = input_.to(config.device)
            target = target.to(config.device)

            # compute output
            output = model(input_)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), input_.size(0))
            top1.update(acc1, input_.size(0))
            top5.update(acc5, input_.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % config.print_freq == 0:
                logger.info(
                    '{rank}'
                    'Test: [{0}/{1}] '
                    'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) '
                    'Loss: {loss.val:.4f} ({loss.avg:.4f}) '
                    'Acc@1: {top1.val:.3f} ({top1.avg:.3f}) '
                    'Acc@5: {top5.val:.3f} ({top5.avg:.3f})'.format(
                        i, len(val_loader), batch_time=batch_time, loss=losses,
                        top1=top1, top5=top5,
                        rank='{}:'.format(config.rank) if config.multiprocessing_distributed else ''
                    ))

    if is_main_process():
        config.tb.add_scalar("val/loss", losses.avg, len(val_loader) * config.get('cur_epoch', 0))
        config.tb.add_scalar("val/top1", top1.avg, len(val_loader) * config.get('cur_epoch', 0))
        config.tb.add_scalar("val/top5", top5.avg, len(val_loader) * config.get('cur_epoch', 0))

    logger.info(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}\n'.format(top1=top1, top5=top5))

    acc = top1.avg / 100
    if config.metrics_dump is not None:
        write_metrics(acc, config.metrics_dump)

    return top1.avg, top5.avg

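# `AverageMeter` and `accuracy` come from the surrounding codebase; minimal
# sketches consistent with how they are called here, in the style of the
# classic PyTorch ImageNet example (an assumption, not the repo's exact code).
class AverageMeter:
    """Tracks the most recent value and the running average."""
    def __init__(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(output, target, topk=(1,)):
    """Top-k accuracy (in percent) for each k in `topk`."""
    maxk = max(topk)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [correct[:k].reshape(-1).float().sum().mul_(100.0 / target.size(0))
            for k in topk]
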
def test_net(net, device, data_loader, distributed=False):
    """Test a Fast R-CNN network on an image database."""
    logger.info("Testing...")
    num_images = len(data_loader.dataset)
    batch_detections = predict_detections(data_loader, device, net)
    if distributed:
        batch_detections = gather_detections(batch_detections,
                                             data_loader.sampler.samples_per_rank)
        batch_detections = batch_detections[:num_images]
    all_boxes = convert_detections(batch_detections)

    logger.info('Evaluating detections')
    return evaluate_detections(all_boxes, data_loader.dataset)

def test(model, test_loader, class_weights, class_encoding, config):
    logger.info("\nTesting...\n")

    _, criterion = get_aux_loss_dependent_params(model, class_weights, 0, config)

    # Evaluation metric
    ignore_index = None
    ignore_unlabeled = config.get("ignore_unlabeled", True)
    if ignore_unlabeled and ('unlabeled' in class_encoding):
        ignore_index = list(class_encoding).index('unlabeled')

    metric = IoU(len(class_encoding), ignore_index=ignore_index)

    # Test the trained model on the test set
    test_obj = Test(model, test_loader, criterion, metric, config.device, config.model)

    logger.info(">>>> Running test dataset")
    loss, (iou, miou) = test_obj.run_epoch(config.print_step)
    per_class_iou = dict(zip(class_encoding.keys(), iou))

    logger.info(">>>> Avg. loss: {0:.4f} | Mean IoU: {1:.4f}".format(loss, miou))
    if config.metrics_dump is not None:
        write_metrics(miou, config.metrics_dump)

    # Print per-class IoU
    for key, class_iou in per_class_iou.items():
        logger.info("{0}: {1:.4f}".format(key, class_iou))

    # Show a batch of samples and labels
    if config.imshow_batch:
        logger.info("A batch of predictions from the test set...")
        images, gt_labels = next(iter(test_loader))
        color_predictions = predict(model, images, class_encoding, config)

        from examples.common.models.segmentation.unet import UNet, center_crop
        if isinstance(model, UNet):
            # UNet predicts center image crops
            outputs_size_hw = (color_predictions.size()[2], color_predictions.size()[3])
            gt_labels = center_crop(gt_labels, outputs_size_hw).contiguous()
        data_utils.show_ground_truth_vs_prediction(images, gt_labels,
                                                   color_predictions, class_encoding)

def run_epoch(self, iteration_loss=False):
    """Runs an epoch of training.

    Keyword arguments:
    - iteration_loss (``bool``, optional): Prints loss at every step.

    Returns:
    - The epoch loss (float), and the values of the specified metrics
    """
    compression_scheduler = self.compression_ctrl.scheduler

    self.model.train()
    epoch_loss = 0.0
    self.metric.reset()
    for step, batch_data in enumerate(self.data_loader):
        compression_scheduler.step()
        # Get the inputs and labels
        inputs = batch_data[0].to(self.device)
        labels = batch_data[1].to(self.device)

        # Forward propagation
        outputs = self.model(inputs)

        labels, loss_outputs, metric_outputs = do_model_specific_postprocessing(
            self.model_name, labels, outputs)

        # Loss computation
        loss = self.criterion(loss_outputs, labels)
        compression_loss = self.compression_ctrl.loss()
        loss += compression_loss

        # Backpropagation
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        # Keep track of loss for current epoch
        epoch_loss += loss.item()

        # Keep track of the evaluation metric
        self.metric.add(metric_outputs.detach(), labels.detach())

        if iteration_loss:
            logger.info("[Step: {}] Iteration loss: {:.4f}".format(step, loss.item()))

    return epoch_loss / len(self.data_loader), self.metric.value()

def configure_distributed(config):
    if config.dist_url == "env://" and config.rank == -1:
        config.rank = int(os.environ["RANK"])

    config.ngpus_per_node = torch.cuda.device_count()
    if config.current_gpu is not None:
        # Distributed multiprocessing
        config.rank = config.rank * config.ngpus_per_node + config.current_gpu

    logger.info('| distributed init (rank {}): {}'.format(config.rank, config.dist_url))
    dist.init_process_group(backend=config.dist_backend, init_method=config.dist_url,
                            world_size=config.world_size, rank=config.rank)
    config.world_size = dist.get_world_size()

def main(argv):
    parser = get_arguments_parser()
    arguments = parser.parse_args(args=argv)
    config = create_sample_config(arguments, parser)
    if arguments.dist_url == "env://":
        config.update_from_env()

    if not osp.exists(config.log_dir):
        os.makedirs(config.log_dir)

    config.log_dir = str(config.log_dir)
    configure_paths(config)
    logger.info("Save directory: {}".format(config.log_dir))

    config.execution_mode = get_execution_mode(config)
    start_worker(main_worker, config)

def load_model(model, pretrained=True, num_classes=1000, model_params=None):
    logger.info("Loading model: {}".format(model))
    if model_params is None:
        model_params = {}
    if model in torchvision.models.__dict__:
        load_model_fn = partial(torchvision.models.__dict__[model],
                                num_classes=num_classes, pretrained=pretrained,
                                **model_params)
    elif model in custom_models.__dict__:
        load_model_fn = partial(custom_models.__dict__[model],
                                num_classes=num_classes, pretrained=pretrained,
                                **model_params)
    else:
        raise Exception("Undefined model name")
    return safe_thread_call(load_model_fn)

def configure_distributed(config):
    if config.dist_url == "env://" and config.rank == -1:
        config.rank = int(os.environ["RANK"])

    config.ngpus_per_node = torch.cuda.device_count()
    if config.current_gpu is not None:
        # Distributed multiprocessing
        config.rank = config.rank * config.ngpus_per_node + config.current_gpu
        # Must be called before execution of CUDA kernels to prevent failure of the
        # ones that allocate memory on the default device
        # (e.g. the NMS kernel - https://github.com/facebookresearch/maskrcnn-benchmark/issues/74)
        torch.cuda.set_device(config.current_gpu)

    logger.info('| distributed init (rank {}): {}'.format(config.rank, config.dist_url))
    dist.init_process_group(backend=config.dist_backend, init_method=config.dist_url,
                            world_size=config.world_size, rank=config.rank)
    config.world_size = dist.get_world_size()

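# With init_method="env://", torch.distributed reads the rendezvous settings
# from the environment. The variable names below follow the standard
# torch.distributed convention; the values are illustrative for a
# single-process, single-node run.
import os

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")
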
def main(argv):
    parser = get_common_argument_parser()
    arguments = parser.parse_args(args=argv)
    config = Config.from_json(arguments.config)
    config.update_from_args(arguments, parser)
    if config.dist_url == "env://":
        config.update_from_env()

    if config.mode.lower() != 'test':
        if not osp.exists(config.log_dir):
            os.makedirs(config.log_dir)

        config.log_dir = str(config.log_dir)
        configure_paths(config)
        logger.info("Save directory: {}".format(config.log_dir))
    else:
        config.log_dir = "/tmp/"

    config.execution_mode = get_execution_mode(config)

    start_worker(main_worker, config)

def eval_net_loss(data_loader, device, net, criterion):
    batch_loss_l = AverageMeter()
    batch_loss_c = AverageMeter()
    batch_loss = AverageMeter()
    t_elapsed = AverageMeter()
    num_batches = len(data_loader)

    # Assume 10 lines of reporting
    print_freq = num_batches // 10
    print_freq = 1 if print_freq == 0 else print_freq

    timer = Timer()
    for batch_ind, (ims, _gts, _, _) in enumerate(data_loader):
        images = ims.to(device)
        targets = [anno.requires_grad_(False).to(device) for anno in _gts]

        # forward (timed: tic before the forward pass so the elapsed time
        # actually covers inference and loss computation)
        timer.tic()
        out = net(images)
        loss_l, loss_c = criterion(out, targets)
        loss = loss_l + loss_c
        t_elapsed.update(timer.toc(average=False))

        batch_loss_l.update(loss_l.item(), images.size(0))
        batch_loss_c.update(loss_c.item(), images.size(0))
        batch_loss.update(loss.item(), images.size(0))

        if batch_ind % print_freq == 0:
            logger.info('Loss_inference: [{}/{}] || Time: {elapsed.val:.4f}s ({elapsed.avg:.4f}s)'
                        ' || Conf Loss: {conf_loss.val:.3f} ({conf_loss.avg:.3f})'
                        ' || Loc Loss: {loc_loss.val:.3f} ({loc_loss.avg:.3f})'
                        ' || Model Loss: {model_loss.val:.3f} ({model_loss.avg:.3f})'.format(
                            batch_ind, num_batches, elapsed=t_elapsed,
                            conf_loss=batch_loss_c, loc_loss=batch_loss_l,
                            model_loss=batch_loss))

    model_loss = batch_loss_l.avg + batch_loss_c.avg
    return model_loss

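# `Timer` (used here and in predict_detections) is a repo helper; a minimal
# tic/toc sketch consistent with its usage, in the style of the py-faster-rcnn
# timer (an assumption, not the repo's exact implementation).
import time

class Timer:
    def __init__(self):
        self.total_time = 0.0
        self.calls = 0
        self.start_time = 0.0

    def tic(self):
        self.start_time = time.time()

    def toc(self, average=True):
        diff = time.time() - self.start_time
        self.total_time += diff
        self.calls += 1
        return self.total_time / self.calls if average else diff
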
def create_dataloaders(config):
    logger.info('Loading Dataset...')

    train_dataset = get_training_dataset(config.dataset, config.train_anno,
                                         config.train_imgs, config)
    logger.info("Loaded {} training images".format(len(train_dataset)))
    if config.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=config.ngpus_per_node, rank=config.rank)
    else:
        train_sampler = None
    train_data_loader = data.DataLoader(train_dataset,
                                        config.batch_size,
                                        num_workers=config.workers,
                                        shuffle=(train_sampler is None),
                                        collate_fn=detection_collate,
                                        pin_memory=True,
                                        sampler=train_sampler)

    test_dataset = get_testing_dataset(config.dataset, config.test_anno,
                                       config.test_imgs, config)
    logger.info("Loaded {} testing images".format(len(test_dataset)))
    if config.distributed:
        test_sampler = DistributedSampler(test_dataset, config.rank, config.world_size)
    else:
        test_sampler = torch.utils.data.SequentialSampler(test_dataset)
    test_data_loader = data.DataLoader(test_dataset,
                                       config.batch_size,
                                       num_workers=config.workers,
                                       shuffle=False,
                                       collate_fn=detection_collate,
                                       pin_memory=True,
                                       drop_last=False,
                                       sampler=test_sampler)

    return test_data_loader, train_data_loader

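# `detection_collate` is assumed to stack the fixed-size images while keeping
# the variable-length annotations as a list; given the 4-tuples unpacked in
# predict_detections/eval_net_loss, it plausibly also carries the original
# per-image heights and widths. A sketch under those assumptions:
import torch

def detection_collate(batch):
    images, targets, heights, widths = [], [], [], []
    for im, gt, h, w in batch:
        images.append(im)        # CHW tensors of identical size: stackable
        targets.append(gt)       # per-image boxes/labels: variable length
        heights.append(h)
        widths.append(w)
    return torch.stack(images, 0), targets, heights, widths
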
def run_epoch(self, iteration_loss=False):
    """Runs an epoch of validation.

    Keyword arguments:
    - iteration_loss (``bool``, optional): Prints loss at every step.

    Returns:
    - The epoch loss (float), and the values of the specified metrics
    """
    self.model.eval()
    epoch_loss = 0.0
    self.metric.reset()
    for step, batch_data in tqdm(enumerate(self.data_loader), total=len(self.data_loader)):
        # Get the inputs and labels
        inputs = batch_data[0].to(self.device)
        labels = batch_data[1].to(self.device)

        with torch.no_grad():
            # Forward propagation
            outputs = self.model(inputs)

            labels, loss_outputs, metric_outputs = do_model_specific_postprocessing(
                self.model_name, labels, outputs)

            # Loss computation
            loss = self.criterion(loss_outputs, labels)

        # Keep track of loss for current epoch
        epoch_loss += loss.item()

        self.metric.add(metric_outputs.detach(), labels.detach())

        if iteration_loss:
            logger.info("[Step: {}] Iteration loss: {:.4f}".format(step, loss.item()))

    return epoch_loss / len(self.data_loader), self.metric.value()

def load_detection_annotations(cachedir, dataset):
    cachefile = os.path.join(cachedir, 'annots_{}.json'.format(dataset.name))
    imagenames = dataset.get_img_names()
    if is_main_process():
        if not os.path.isfile(cachefile):
            # load annots
            gt = {}
            for i, imagename in enumerate(imagenames):
                _, gt[imagename] = dataset.pull_anno(i)
                if i % 100 == 0:
                    logger.info('Reading annotation for {:d}/{:d}'.format(
                        i + 1, len(imagenames)))
            # save
            logger.info('Saving cached annotations to {:s}'.format(cachefile))
            pathlib.Path(cachedir).mkdir(parents=True, exist_ok=True)
            with open(cachefile, 'w') as f:
                json.dump(gt, f)

    if is_dist_avail_and_initialized():
        dist.barrier()

    with open(cachefile, 'r') as f:
        gt = json.load(f)
    return gt, imagenames

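# The rank-0-writes-then-barrier pattern above relies on two small helpers;
# minimal sketches in the style common to torchvision reference code
# (assumed, not necessarily the repo's exact versions).
import torch.distributed as dist

def is_dist_avail_and_initialized():
    return dist.is_available() and dist.is_initialized()

def is_main_process():
    # Rank 0 (or a non-distributed run) writes the cache file; every other
    # rank waits at the barrier and then reads it.
    return not is_dist_avail_and_initialized() or dist.get_rank() == 0
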
def evaluate_detections(box_list, dataset, use_07=False):
    cachedir = os.path.join('cache', 'annotations_cache')
    aps = []
    # The PASCAL VOC metric changed in 2010
    use_07_metric = use_07
    logger.info('VOC07 metric? {}'.format('Yes' if use_07_metric else 'No'))
    for cls_ind, cls in enumerate(dataset.classes):
        # for each class
        class_boxes = box_list[box_list[:, 1] == cls_ind + 1]
        ap, _, _ = voc_eval(  # calculate rec, prec, ap
            class_boxes, dataset, cls, cachedir,
            ovthresh=0.5, use_07_metric=use_07_metric)
        aps += [ap]
        logger.info('AP for {} = {:.4f}'.format(cls, ap))
    mAp = np.mean(aps)
    logger.info('Mean AP = {:.4f}'.format(mAp))
    return mAp

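# For reference, the AP that voc_eval returns is conventionally computed from
# the recall/precision arrays as below. This is the standard formulation from
# the VOC devkit ports (11-point interpolation for VOC07, exact area under
# the interpolated PR curve otherwise); shown as a sketch since the repo's
# voc_eval internals are not included here.
import numpy as np

def voc_ap(rec, prec, use_07_metric=False):
    """rec, prec: monotone recall and matching precision as 1-D numpy arrays."""
    if use_07_metric:
        # VOC2007: 11-point interpolated AP
        ap = 0.0
        for t in np.arange(0.0, 1.1, 0.1):
            p = np.max(prec[rec >= t]) if np.sum(rec >= t) > 0 else 0.0
            ap += p / 11.0
        return ap
    # VOC2010+: area under the monotonically-decreasing precision envelope
    mrec = np.concatenate(([0.0], rec, [1.0]))
    mpre = np.concatenate(([0.0], prec, [0.0]))
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
    idx = np.where(mrec[1:] != mrec[:-1])[0]
    return np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx + 1])
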
def resume_from_checkpoint(resuming_checkpoint, model, config, optimizer, compression_ctrl):
    best_acc1 = 0
    if osp.isfile(resuming_checkpoint):
        logger.info("=> loading checkpoint '{}'".format(resuming_checkpoint))
        checkpoint = torch.load(resuming_checkpoint, map_location='cpu')
        load_state(model, checkpoint['state_dict'], is_resume=True)
        if config.mode.lower() == 'train' and config.to_onnx is None:
            config.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            compression_ctrl.scheduler.load_state_dict(checkpoint['scheduler'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info("=> loaded checkpoint '{}' (epoch: {}, best_acc1: {:.3f})"
                        .format(resuming_checkpoint, checkpoint['epoch'], best_acc1))
        else:
            logger.info("=> loaded checkpoint '{}'".format(resuming_checkpoint))
    else:
        raise FileNotFoundError("no checkpoint found at '{}'".format(resuming_checkpoint))
    return model, config, optimizer, compression_ctrl, best_acc1

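# The keys consumed above imply the following save-side layout. This is an
# illustrative counterpart, not the repo's actual save routine; the key names
# are inferred from resume_from_checkpoint.
import torch

def save_checkpoint_for_resume(path, model, optimizer, compression_ctrl, epoch, best_acc1):
    torch.save({
        'state_dict': model.state_dict(),
        'epoch': epoch,
        'best_acc1': best_acc1,
        'scheduler': compression_ctrl.scheduler.state_dict(),
        'optimizer': optimizer.state_dict(),
    }, path)
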
def get_class_weights(train_set, num_classes, config):
    # Get class weights from the selected weighing technique
    weighing = config.get('weighing', 'none')
    logger.info("\nWeighing technique: {}".format(weighing))
    if isinstance(weighing, list):
        # Class weights were directly specified in the config
        return np.asarray(weighing)

    train_loader_for_weight_count = torch.utils.data.DataLoader(
        train_set, batch_size=1, collate_fn=data_utils.collate_fn)
    logger.info("Computing class weights...")
    logger.info("(this can take a while depending on the dataset size)")
    if weighing.lower() == 'enet':
        class_weights = data_utils.enet_weighing(train_loader_for_weight_count, num_classes)
    elif weighing.lower() == 'mfb':
        class_weights = data_utils.median_freq_balancing(
            train_loader_for_weight_count, num_classes)
    else:
        class_weights = None
    return class_weights

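# For reference, the ENet class weighting (Paszke et al., 2016) is
# w_class = 1 / ln(c + p_class), typically with c = 1.02. A sketch of how
# `enet_weighing` plausibly computes it; the helper's actual body is not
# shown in this excerpt, so the details below are assumptions.
import numpy as np

def enet_weighing(data_loader, num_classes, c=1.02):
    class_count = np.zeros(num_classes)
    total = 0
    for _, label in data_loader:
        flat = label.flatten().numpy()
        mask = flat < num_classes            # skip out-of-range/ignore labels
        class_count += np.bincount(flat[mask], minlength=num_classes)
        total += flat[mask].size
    propensity = class_count / total         # p_class
    return 1.0 / np.log(c + propensity)      # w_class = 1 / ln(c + p_class)
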
def test_net(net, device, data_loader, distributed=False,
             loss_inference=False, criterion=None):
    """Test a Fast R-CNN network on an image database."""
    if loss_inference is True:
        logger.info("Testing... loss function will be evaluated instead of detection mAP")
        if distributed:
            raise NotImplementedError
        if criterion is None:
            raise ValueError("Missing loss inference function (criterion)")
        return eval_net_loss(data_loader, device, net, criterion)

    logger.info("Testing...")
    num_images = len(data_loader.dataset)
    batch_detections = predict_detections(data_loader, device, net)
    if distributed:
        batch_detections = gather_detections(batch_detections,
                                             data_loader.sampler.samples_per_rank)
        batch_detections = batch_detections[:num_images]
    all_boxes = convert_detections(batch_detections)

    logger.info('Evaluating detections')
    return evaluate_detections(all_boxes, data_loader.dataset)

def train_epoch_bin(train_loader, batch_multiplier, model, criterion, optimizer,
                    optimizer_scheduler: BinarizationOptimizerScheduler,
                    kd_loss_calculator: KDLossCalculator, compression_ctrl,
                    epoch, config, is_inception=False):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    kd_losses_meter = AverageMeter()
    criterion_losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    compression_scheduler = compression_ctrl.scheduler

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input_, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        input_ = input_.to(config.device)
        target = target.to(config.device)

        # compute output
        if is_inception:
            # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
            output, aux_outputs = model(input_)
            loss1 = criterion(output, target)
            loss2 = criterion(aux_outputs, target)
            criterion_loss = loss1 + 0.4 * loss2
        else:
            output = model(input_)
            criterion_loss = criterion(output, target)

        # compute KD loss
        kd_loss = kd_loss_calculator.loss(input_, output)
        loss = criterion_loss + kd_loss

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), input_.size(0))
        comp_loss_val = kd_loss.item()
        kd_losses_meter.update(comp_loss_val, input_.size(0))
        criterion_losses.update(criterion_loss.item(), input_.size(0))
        top1.update(acc1, input_.size(0))
        top5.update(acc5, input_.size(0))

        # compute gradient and do SGD step
        if i % batch_multiplier == 0:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            loss.backward()

        compression_scheduler.step()
        optimizer_scheduler.step(float(i) / len(train_loader))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % config.print_freq == 0:
            logger.info(
                '{rank}: '
                'Epoch: [{0}][{1}/{2}] '
                'Lr: {3:.3} '
                'Wd: {4:.3} '
                'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) '
                'Data: {data_time.val:.3f} ({data_time.avg:.3f}) '
                'CE_loss: {ce_loss.val:.4f} ({ce_loss.avg:.4f}) '
                'KD_loss: {kd_loss.val:.4f} ({kd_loss.avg:.4f}) '
                'Loss: {loss.val:.4f} ({loss.avg:.4f}) '
                'Acc@1: {top1.val:.3f} ({top1.avg:.3f}) '
                'Acc@5: {top5.val:.3f} ({top5.avg:.3f})'.format(
                    epoch, i, len(train_loader), get_lr(optimizer), get_wd(optimizer),
                    batch_time=batch_time, data_time=data_time,
                    ce_loss=criterion_losses, kd_loss=kd_losses_meter,
                    loss=losses, top1=top1, top5=top5,
                    rank='{}:'.format(config.rank) if config.multiprocessing_distributed else ''
                ))

        if is_main_process():
            global_step = len(train_loader) * epoch
            config.tb.add_scalar("train/learning_rate", get_lr(optimizer), i + global_step)
            config.tb.add_scalar("train/criterion_loss", criterion_losses.avg, i + global_step)
            config.tb.add_scalar("train/kd_loss", kd_losses_meter.avg, i + global_step)
            config.tb.add_scalar("train/loss", losses.avg, i + global_step)
            config.tb.add_scalar("train/top1", top1.avg, i + global_step)
            config.tb.add_scalar("train/top5", top5.avg, i + global_step)

            for stat_name, stat_value in compression_ctrl.statistics().items():
                if isinstance(stat_value, (int, float)):
                    config.tb.add_scalar('train/statistics/{}'.format(stat_name),
                                         stat_value, i + global_step)

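# The get_lr/get_wd helpers used in the log line above are assumed to read
# the first parameter group, matching how the learning rate is fetched
# directly from optimizer.param_groups elsewhere in these samples; minimal
# sketches under that assumption:
def get_lr(optimizer):
    return optimizer.param_groups[0]['lr']

def get_wd(optimizer):
    return optimizer.param_groups[0]['weight_decay']
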
def main_worker(current_gpu, config):
    configure_device(current_gpu, config)
    config.mlflow = SafeMLFLow(config)
    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    logger.info(config)

    dataset = get_dataset(config.dataset)
    color_encoding = dataset.color_encoding
    num_classes = len(color_encoding)

    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    train_loader = val_loader = criterion = None
    resuming_checkpoint_path = config.resuming_checkpoint_path
    nncf_config = config.nncf_config

    pretrained = is_pretrained_model_requested(config)

    def criterion_fn(model_outputs, target, criterion_):
        labels, loss_outputs, _ = loss_funcs.do_model_specific_postprocessing(
            config.model, target, model_outputs)
        return criterion_(loss_outputs, labels)

    if config.to_onnx is not None:
        assert pretrained or (resuming_checkpoint_path is not None)
    else:
        loaders, w_class = load_dataset(dataset, config)
        train_loader, val_loader, init_loader = loaders
        criterion = get_criterion(w_class, config)

        def autoq_test_fn(model, eval_loader):
            return test(model, eval_loader, criterion, color_encoding, config)

        nncf_config = register_default_init_args(nncf_config, init_loader, criterion,
                                                 criterion_fn, autoq_test_fn,
                                                 val_loader, config.device)

    model = load_model(config.model,
                       pretrained=pretrained,
                       num_classes=num_classes,
                       model_params=config.get('model_params', {}),
                       weights_path=config.get('weights'))

    model.to(config.device)

    resuming_model_sd = None
    resuming_checkpoint = None
    if resuming_checkpoint_path is not None:
        resuming_model_sd, resuming_checkpoint = \
            load_resuming_model_state_dict_and_checkpoint_from_path(resuming_checkpoint_path)

    compression_ctrl, model = create_compressed_model(model, nncf_config,
                                                      resuming_state_dict=resuming_model_sd)
    model, model_without_dp = prepare_model_for_execution(model, config)

    if config.distributed:
        compression_ctrl.distributed()

    log_common_mlflow_params(config)

    if config.to_onnx:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if is_main_process():
        print_statistics(compression_ctrl.statistics())

    if config.mode.lower() == 'test':
        logger.info(model)
        model_parameters = filter(lambda p: p.requires_grad, model.parameters())
        params = sum([np.prod(p.size()) for p in model_parameters])
        logger.info("Trainable parameter count: {params}".format(params=params))

        model = model.to(config.device)
        test(model, val_loader, criterion, color_encoding, config)
    elif config.mode.lower() == 'train':
        train(model, model_without_dp, compression_ctrl, train_loader, val_loader,
              criterion, color_encoding, config, resuming_checkpoint)
    else:
        # Should never happen... but just in case it does
        raise RuntimeError("\"{0}\" is not a valid choice for execution mode.".format(config.mode))

def train(model, model_without_dp, compression_ctrl, train_loader, val_loader,
          criterion, class_encoding, config, resuming_checkpoint):
    logger.info("\nTraining...\n")

    # Check if the network architecture is correct
    logger.info(model)

    optim_config = config.get('optimizer', {})
    optim_params = optim_config.get('optimizer_params', {})
    lr = optim_params.get("lr", 1e-4)

    params_to_optimize = get_params_to_optimize(model_without_dp, lr * 10, config)
    optimizer, lr_scheduler = make_optimizer(params_to_optimize, config)

    # Evaluation metric
    ignore_index = None
    ignore_unlabeled = config.get("ignore_unlabeled", True)
    if ignore_unlabeled and ('unlabeled' in class_encoding):
        ignore_index = list(class_encoding).index('unlabeled')

    metric = IoU(len(class_encoding), ignore_index=ignore_index)

    best_miou = -1
    best_compression_level = CompressionLevel.NONE
    # Optionally resume from a checkpoint
    if resuming_checkpoint is not None:
        if optimizer is not None:
            optimizer.load_state_dict(resuming_checkpoint['optimizer'])
        start_epoch = resuming_checkpoint['epoch']
        best_miou = resuming_checkpoint['miou']

        if "scheduler" in resuming_checkpoint and compression_ctrl.scheduler is not None:
            compression_ctrl.scheduler.load_state_dict(resuming_checkpoint['scheduler'])
        logger.info("Resuming from model: Start epoch = {0} "
                    "| Best mean IoU = {1:.4f}".format(start_epoch, best_miou))
        config.start_epoch = start_epoch

    # Start Training
    train_obj = Train(model, train_loader, optimizer, criterion, compression_ctrl,
                      metric, config.device, config.model)
    val_obj = Test(model, val_loader, criterion, metric, config.device, config.model)

    for epoch in range(config.start_epoch, config.epochs):
        compression_ctrl.scheduler.epoch_step()
        logger.info(">>>> [Epoch: {0:d}] Training".format(epoch))
        if config.distributed:
            train_loader.sampler.set_epoch(epoch)

        epoch_loss, (iou, miou) = train_obj.run_epoch(config.print_step)
        if not isinstance(lr_scheduler, ReduceLROnPlateau):
            # Learning rate scheduling should be applied after optimizer's update
            lr_scheduler.step(epoch)

        logger.info(">>>> [Epoch: {0:d}] Avg. loss: {1:.4f} | Mean IoU: {2:.4f}".format(
            epoch, epoch_loss, miou))

        if is_main_process():
            config.tb.add_scalar("train/loss", epoch_loss, epoch)
            config.tb.add_scalar("train/mIoU", miou, epoch)
            config.tb.add_scalar("train/learning_rate", optimizer.param_groups[0]['lr'], epoch)
            config.tb.add_scalar("train/compression_loss", compression_ctrl.loss(), epoch)

            for key, value in compression_ctrl.statistics(quickly_collected_only=True).items():
                if isinstance(value, (int, float)):
                    config.tb.add_scalar("compression/statistics/{0}".format(key),
                                         value, epoch)

        if (epoch + 1) % config.save_freq == 0 or epoch + 1 == config.epochs:
            logger.info(">>>> [Epoch: {0:d}] Validation".format(epoch))
            loss, (iou, miou) = val_obj.run_epoch(config.print_step)

            logger.info(">>>> [Epoch: {0:d}] Avg. loss: {1:.4f} | Mean IoU: {2:.4f}".format(
                epoch, loss, miou))

            if is_main_process():
                config.tb.add_scalar("val/mIoU", miou, epoch)
                config.tb.add_scalar("val/loss", loss, epoch)
                for i, (key, class_iou) in enumerate(zip(class_encoding.keys(), iou)):
                    config.tb.add_scalar("{}/mIoU_Cls{}_{}".format(config.dataset, i, key),
                                         class_iou, epoch)

            compression_level = compression_ctrl.compression_level()
            is_best_by_miou = miou > best_miou and compression_level == best_compression_level
            is_best = is_best_by_miou or compression_level > best_compression_level
            if is_best:
                best_miou = miou
            best_compression_level = max(compression_level, best_compression_level)

            if config.metrics_dump is not None:
                write_metrics(best_miou, config.metrics_dump)

            if isinstance(lr_scheduler, ReduceLROnPlateau):
                # Learning rate scheduling should be applied after optimizer's update
                lr_scheduler.step(best_miou)

            # Print per class IoU on last epoch or if best iou
            if epoch + 1 == config.epochs or is_best:
                for key, class_iou in zip(class_encoding.keys(), iou):
                    logger.info("{0}: {1:.4f}".format(key, class_iou))

            # Save the model if it's the best thus far
            if is_main_process():
                checkpoint_path = save_checkpoint(model, optimizer, epoch, best_miou,
                                                  compression_level,
                                                  compression_ctrl.scheduler, config)

                make_additional_checkpoints(checkpoint_path, is_best, epoch, config)
                print_statistics(compression_ctrl.statistics())

    return model

def load_dataset(dataset, config):
    logger.info("\nLoading dataset...\n")
    logger.info("Selected dataset: {}".format(config.dataset))
    logger.info("Dataset directory: {}".format(config.dataset_dir))

    transforms_train = get_joint_transforms(is_train=True, config=config)
    transforms_val = get_joint_transforms(is_train=False, config=config)

    # Get selected dataset
    train_set = dataset(root=config.dataset_dir, image_set='train',
                        transforms=transforms_train)
    val_set = dataset(config.dataset_dir, image_set='val', transforms=transforms_val)

    # Samplers
    if config.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_set)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_set)

    batch_size = config.batch_size
    num_workers = config.workers
    if config.multiprocessing_distributed:
        batch_size //= config.ngpus_per_node
        num_workers //= config.ngpus_per_node

    def create_train_data_loader(batch_size_):
        return torch.utils.data.DataLoader(train_set,
                                           batch_size=batch_size_,
                                           sampler=train_sampler,
                                           num_workers=num_workers,
                                           collate_fn=data_utils.collate_fn,
                                           drop_last=True)

    # Loaders
    train_loader = create_train_data_loader(batch_size)
    if config.batch_size_init:
        init_loader = create_train_data_loader(config.batch_size_init)
    else:
        init_loader = deepcopy(train_loader)
    if config.distributed:
        init_loader.num_workers = 0  # PyTorch multiprocessing dataloader issue WA

    val_sampler = torch.utils.data.SequentialSampler(val_set)
    val_loader = torch.utils.data.DataLoader(val_set,
                                             batch_size=1,
                                             num_workers=num_workers,
                                             shuffle=False,
                                             sampler=val_sampler,
                                             collate_fn=data_utils.collate_fn,
                                             drop_last=True)

    # Get encoding between pixel values in label images and RGB colors
    class_encoding = train_set.color_encoding

    # Get number of classes to predict
    num_classes = len(class_encoding)

    # Print information for debugging
    logger.info("Number of classes to predict: {}".format(num_classes))
    logger.info("Train dataset size: {}".format(len(train_set)))
    logger.info("Validation dataset size: {}".format(len(val_set)))

    # Get a batch of samples to display
    if config.mode.lower() == 'test':
        images, labels = next(iter(val_loader))
    else:
        images, labels = next(iter(train_loader))
    logger.info("Image size: {}".format(images.size()))
    logger.info("Label size: {}".format(labels.size()))
    logger.info("Class-color encoding: {}".format(class_encoding))

    # Show a batch of samples and labels
    if config.imshow_batch and config.mode.lower() != 'test':
        logger.info("Close the figure window to continue...")
        label_to_rgb = T.Compose([data_utils.LongTensorToRGBPIL(class_encoding),
                                  T.ToTensor()])
        color_labels = data_utils.batch_transform(labels, label_to_rgb)
        data_utils.imshow_batch(images, color_labels)

    class_weights = get_class_weights(train_set, num_classes, config)

    if class_weights is not None:
        class_weights = torch.from_numpy(class_weights).float().to(config.device)
        # Set the weight of the unlabeled class to 0
        ignore_unlabeled = config.get("ignore_unlabeled", True)
        if ignore_unlabeled and ('unlabeled' in class_encoding):
            ignore_index = list(class_encoding).index('unlabeled')
            class_weights[ignore_index] = 0

    logger.info("Class weights: {}".format(class_weights))

    return (train_loader, val_loader, init_loader), class_weights

def main_worker(current_gpu, config: SampleConfig):
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (
        ExecutionMode.DISTRIBUTED, ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)

    config.device = get_device(config)

    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    if config.seed is not None:
        manual_seed(config.seed)
        cudnn.deterministic = True
        cudnn.benchmark = False

    # define loss function (criterion)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(config.device)

    train_loader = train_sampler = val_loader = None
    resuming_checkpoint_path = config.resuming_checkpoint_path
    nncf_config = config.nncf_config
    pretrained = is_pretrained_model_requested(config)

    if config.to_onnx is not None:
        assert pretrained or (resuming_checkpoint_path is not None)
    else:
        # Data loading code
        train_dataset, val_dataset = create_datasets(config)
        train_loader, train_sampler, val_loader = create_data_loaders(
            config, train_dataset, val_dataset)
        nncf_config = register_default_init_args(nncf_config, criterion, train_loader)

    # create model
    model_name = config['model']
    model = load_model(model_name,
                       pretrained=pretrained,
                       num_classes=config.get('num_classes', 1000),
                       model_params=config.get('model_params'),
                       weights_path=config.get('weights'))

    model.to(config.device)

    resuming_model_sd = None
    resuming_checkpoint = None
    if resuming_checkpoint_path is not None:
        resuming_checkpoint = load_resuming_checkpoint(resuming_checkpoint_path)
        resuming_model_sd = resuming_checkpoint['state_dict']

    compression_ctrl, model = create_compressed_model(model, nncf_config,
                                                      resuming_state_dict=resuming_model_sd)

    if config.to_onnx:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    model, _ = prepare_model_for_execution(model, config)
    if config.distributed:
        compression_ctrl.distributed()

    # define optimizer
    params_to_optimize = get_parameter_groups(model, config)
    optimizer, lr_scheduler = make_optimizer(params_to_optimize, config)

    best_acc1 = 0
    # optionally resume from a checkpoint
    if resuming_checkpoint_path is not None:
        if config.mode.lower() == 'train' and config.to_onnx is None:
            config.start_epoch = resuming_checkpoint['epoch']
            best_acc1 = resuming_checkpoint['best_acc1']
            compression_ctrl.scheduler.load_state_dict(resuming_checkpoint['scheduler'])
            optimizer.load_state_dict(resuming_checkpoint['optimizer'])
            logger.info("=> loaded checkpoint '{}' (epoch: {}, best_acc1: {:.3f})".format(
                resuming_checkpoint_path, resuming_checkpoint['epoch'], best_acc1))
        else:
            logger.info("=> loaded checkpoint '{}'".format(resuming_checkpoint_path))

    if config.execution_mode != ExecutionMode.CPU_ONLY:
        cudnn.benchmark = True

    if config.mode.lower() == 'test':
        print_statistics(compression_ctrl.statistics())
        validate(val_loader, model, criterion, config)

    if config.mode.lower() == 'train':
        is_inception = 'inception' in model_name
        train(config, compression_ctrl, model, criterion, is_inception, lr_scheduler,
              model_name, optimizer, train_loader, train_sampler, val_loader, best_acc1)

def main_worker_binarization(current_gpu, config):
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (
        ExecutionMode.DISTRIBUTED, ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)

    config.device = get_device(config)

    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    if config.seed is not None:
        manual_seed(config.seed)
        cudnn.deterministic = True
        cudnn.benchmark = False

    # create model
    model_name = config['model']
    weights = config.get('weights')
    model = load_model(model_name,
                       pretrained=config.get('pretrained', True) if weights is None else False,
                       num_classes=config.get('num_classes', 1000),
                       model_params=config.get('model_params'))
    original_model = copy.deepcopy(model)

    compression_ctrl, model = create_compressed_model(model, config)
    if not isinstance(compression_ctrl, BinarizationController):
        raise RuntimeError("The binarization sample worker may only be run "
                           "with the binarization algorithm!")

    if weights:
        load_state(model, torch.load(weights, map_location='cpu'))

    model, _ = prepare_model_for_execution(model, config)
    original_model.to(config.device)

    if config.distributed:
        compression_ctrl.distributed()

    is_inception = 'inception' in model_name

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(config.device)

    params_to_optimize = model.parameters()

    compression_config = config['compression']
    binarization_config = compression_config if isinstance(compression_config, dict) \
        else compression_config[0]
    optimizer = get_binarization_optimizer(params_to_optimize, binarization_config)
    optimizer_scheduler = BinarizationOptimizerScheduler(optimizer, binarization_config)
    kd_loss_calculator = KDLossCalculator(original_model)

    resuming_checkpoint = config.resuming_checkpoint
    best_acc1 = 0
    # optionally resume from a checkpoint
    if resuming_checkpoint is not None:
        model, config, optimizer, optimizer_scheduler, kd_loss_calculator, \
        compression_ctrl, best_acc1 = resume_from_checkpoint(
            resuming_checkpoint, model, config, optimizer, optimizer_scheduler,
            kd_loss_calculator, compression_ctrl)

    if config.to_onnx is not None:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if config.execution_mode != ExecutionMode.CPU_ONLY:
        cudnn.benchmark = True

    # Data loading code
    train_dataset, val_dataset = create_datasets(config)
    train_loader, train_sampler, val_loader = create_data_loaders(config, train_dataset,
                                                                  val_dataset)

    if config.mode.lower() == 'test':
        print_statistics(compression_ctrl.statistics())
        validate(val_loader, model, criterion, config)

    if config.mode.lower() == 'train':
        if not resuming_checkpoint:
            compression_ctrl.initialize(data_loader=train_loader, criterion=criterion)
        batch_multiplier = (binarization_config.get("params", {})).get("batch_multiplier", 1)
        train_bin(config, compression_ctrl, model, criterion, is_inception,
                  optimizer_scheduler, model_name, optimizer, train_loader, train_sampler,
                  val_loader, kd_loss_calculator, batch_multiplier, best_acc1)

def main_worker(current_gpu, config):
    #################################
    # Setup experiment environment
    #################################
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (
        ExecutionMode.DISTRIBUTED, ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)
    if is_on_first_rank(config):
        configure_logging(logger, config)
        print_args(config)

    config.device = get_device(config)
    config.start_iter = 0

    ##########################
    # Prepare metrics log file
    ##########################
    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    ###########################
    # Criterion
    ###########################
    criterion = MultiBoxLoss(config,
                             config['num_classes'],
                             overlap_thresh=0.5,
                             prior_for_matching=True,
                             bkg_label=0,
                             neg_mining=True,
                             neg_pos=3,
                             neg_overlap=0.5,
                             encode_target=False,
                             device=config.device)

    train_data_loader = test_data_loader = None
    resuming_checkpoint_path = config.resuming_checkpoint_path

    ###########################
    # Prepare data
    ###########################
    pretrained = is_pretrained_model_requested(config)
    if config.to_onnx is not None:
        assert pretrained or (resuming_checkpoint_path is not None)
    else:
        test_data_loader, train_data_loader = create_dataloaders(config)
        config.nncf_config = register_default_init_args(config.nncf_config,
                                                        criterion, train_data_loader)

    ##################
    # Prepare model
    ##################
    resuming_checkpoint_path = config.resuming_checkpoint_path
    resuming_checkpoint = None
    resuming_model_state_dict = None
    if resuming_checkpoint_path:
        logger.info('Resuming from checkpoint {}...'.format(resuming_checkpoint_path))
        resuming_checkpoint = torch.load(resuming_checkpoint_path, map_location='cpu')
        # use checkpoint itself in case only the state dict was saved,
        # i.e. the checkpoint was created with `torch.save(module.state_dict())`
        resuming_model_state_dict = resuming_checkpoint.get('state_dict', resuming_checkpoint)

    compression_ctrl, net = create_model(config, resuming_model_state_dict)
    if config.distributed:
        config.batch_size //= config.ngpus_per_node
        config.workers //= config.ngpus_per_node
        compression_ctrl.distributed()

    ###########################
    # Optimizer
    ###########################
    params_to_optimize = get_parameter_groups(net, config)
    optimizer, lr_scheduler = make_optimizer(params_to_optimize, config)

    #################################
    # Load additional checkpoint data
    #################################
    if resuming_checkpoint is not None and config.mode.lower() == 'train' \
            and config.to_onnx is None:
        compression_ctrl.scheduler.load_state_dict(resuming_checkpoint['scheduler'])
        optimizer.load_state_dict(resuming_checkpoint.get('optimizer', optimizer.state_dict()))
        config.start_iter = resuming_checkpoint.get('iter', 0) + 1

    if config.to_onnx:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if config.mode.lower() == 'test':
        with torch.no_grad():
            print_statistics(compression_ctrl.statistics())
            net.eval()
            mAp = test_net(net, config.device, test_data_loader,
                           distributed=config.distributed)
            if config.metrics_dump is not None:
                write_metrics(mAp, config.metrics_dump)
            return

    train(net, compression_ctrl, train_data_loader, test_data_loader, criterion,
          optimizer, config, lr_scheduler)

def main_worker(current_gpu, config):
    config.current_gpu = current_gpu
    config.distributed = config.execution_mode in (
        ExecutionMode.DISTRIBUTED, ExecutionMode.MULTIPROCESSING_DISTRIBUTED)
    if config.distributed:
        configure_distributed(config)

    if is_main_process():
        configure_logging(logger, config)
        print_args(config)

    logger.info(config)

    config.device = get_device(config)
    dataset = get_dataset(config.dataset)
    color_encoding = dataset.color_encoding
    num_classes = len(color_encoding)

    if config.metrics_dump is not None:
        write_metrics(0, config.metrics_dump)

    weights = config.get('weights')
    model = load_model(config.model,
                       pretrained=config.get('pretrained', True) if weights is None else False,
                       num_classes=num_classes,
                       model_params=config.get('model_params', {}))
    compression_ctrl, model = create_compressed_model(model, config)
    if weights:
        sd = torch.load(weights, map_location='cpu')
        load_state(model, sd)

    model, model_without_dp = prepare_model_for_execution(model, config)

    if config.distributed:
        compression_ctrl.distributed()

    resuming_checkpoint = config.resuming_checkpoint

    if resuming_checkpoint is not None:
        if not config.pretrained:
            # Load the previously saved model state
            model, _, _, _, _ = load_checkpoint(model, resuming_checkpoint, config.device,
                                                compression_scheduler=compression_ctrl.scheduler)

    if config.to_onnx is not None:
        compression_ctrl.export_model(config.to_onnx)
        logger.info("Saved to {}".format(config.to_onnx))
        return

    if config.mode.lower() == 'test':
        logger.info(model)
        model_parameters = filter(lambda p: p.requires_grad, model.parameters())
        params = sum([np.prod(p.size()) for p in model_parameters])
        logger.info("Trainable parameter count: {params}".format(params=params))

        model = model.to(config.device)
        loaders, w_class = load_dataset(dataset, config)
        _, val_loader = loaders
        test(model, val_loader, w_class, color_encoding, config)
        print_statistics(compression_ctrl.statistics())
    elif config.mode.lower() == 'train':
        loaders, w_class = load_dataset(dataset, config)
        train_loader, val_loader = loaders
        if not resuming_checkpoint:
            compression_ctrl.initialize(train_loader)
        train(model, model_without_dp, compression_ctrl, train_loader, val_loader,
              w_class, color_encoding, config)
    else:
        # Should never happen... but just in case it does
        raise RuntimeError("\"{0}\" is not a valid choice for execution mode.".format(config.mode))