def run(*options, cfg=None, debug=False): """Run training and validation of model Notes: Options can be passed in via the options argument and loaded from the cfg file Options from default.py will be overridden by options loaded from cfg file Options from default.py will be overridden by options loaded from cfg file Options passed in via options argument will override option loaded from cfg file Args: *options (str,int ,optional): Options used to overide what is loaded from the config. To see what options are available consult default.py cfg (str, optional): Location of config file to load. Defaults to None. debug (bool): Places scripts in debug/test mode and only executes a few iterations """ # Configuration: update_config(config, options=options, config_file=cfg) # The model will be saved under: outputs/<config_file_name>/<model_dir> config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0] try: output_dir = generate_path( config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(), ) except: output_dir = generate_path(config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),) # Logging: load_log_configuration(config.LOG_CONFIG) logger = logging.getLogger(__name__) logger.debug(config.WORKERS) # Set CUDNN benchmark mode: torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK # We will write the model under outputs / config_file_name / model_dir config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0] # Fix random seeds: torch.manual_seed(config.SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(config.SEED) np.random.seed(seed=config.SEED) # Augmentation: basic_aug = Compose( [ Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=1), PadIfNeeded( min_height=config.TRAIN.PATCH_SIZE, min_width=config.TRAIN.PATCH_SIZE, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=255, value=0, ), Resize( config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True, ), PadIfNeeded( min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT, min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=255, ), ] ) if config.TRAIN.AUGMENTATION: train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)]) val_aug = basic_aug else: train_aug = val_aug = basic_aug # Training and Validation Loaders: TrainPatchLoader = get_patch_loader(config) logging.info(f"Using {TrainPatchLoader}") train_set = TrainPatchLoader( config.DATASET.ROOT, config.DATASET.NUM_CLASSES, split="train", is_transform=True, stride=config.TRAIN.STRIDE, patch_size=config.TRAIN.PATCH_SIZE, augmentations=train_aug, debug=debug, ) logger.info(train_set) n_classes = train_set.n_classes val_set = TrainPatchLoader( config.DATASET.ROOT, config.DATASET.NUM_CLASSES, split="val", is_transform=True, stride=config.TRAIN.STRIDE, patch_size=config.TRAIN.PATCH_SIZE, augmentations=val_aug, debug=debug, ) logger.info(val_set) if debug: logger.info("Running in debug mode..") train_set = data.Subset(train_set, range(config.TRAIN.BATCH_SIZE_PER_GPU * config.NUM_DEBUG_BATCHES)) val_set = data.Subset(val_set, range(config.VALIDATION.BATCH_SIZE_PER_GPU)) train_loader = data.DataLoader( train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True ) val_loader = data.DataLoader( val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=1 ) # 
config.WORKERS) # Model: model = getattr(models, config.MODEL.NAME).get_seg_model(config) device = "cuda" if torch.cuda.is_available() else "cpu" model = model.to(device) # Optimizer and LR Scheduler: optimizer = torch.optim.SGD( model.parameters(), lr=config.TRAIN.MAX_LR, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WEIGHT_DECAY, ) epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS snapshot_duration = epochs_per_cycle * len(train_loader) if not debug else 2 * len(train_loader) scheduler = CosineAnnealingScheduler( optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, cycle_size=snapshot_duration ) # Tensorboard writer: summary_writer = create_summary_writer(log_dir=path.join(output_dir, "logs")) # class weights are inversely proportional to the frequency of the classes in the training set class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) # Loss: criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean") # Ignite trainer and evaluator: trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch, device=device) transform_fn = lambda output_dict: (output_dict["y_pred"].squeeze(), output_dict["mask"].squeeze()) evaluator = create_supervised_evaluator( model, prepare_batch, metrics={ "nll": Loss(criterion, output_transform=transform_fn), "pixacc": pixelwise_accuracy(n_classes, output_transform=transform_fn, device=device), "cacc": class_accuracy(n_classes, output_transform=transform_fn), "mca": mean_class_accuracy(n_classes, output_transform=transform_fn), "ciou": class_iou(n_classes, output_transform=transform_fn), "mIoU": mean_iou(n_classes, output_transform=transform_fn), }, device=device, ) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Logging: trainer.add_event_handler( Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.PRINT_FREQ), ) trainer.add_event_handler(Events.EPOCH_COMPLETED, logging_handlers.log_lr(optimizer)) # Tensorboard and Logging: trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer)) trainer.add_event_handler(Events.ITERATION_COMPLETED, tensorboard_handlers.log_validation_output(summary_writer)) # add specific logger which also triggers printed metrics on training set @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): evaluator.run(train_loader) tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Training") logging_handlers.log_metrics(engine, evaluator, stage="Training") # add specific logger which also triggers printed metrics on validation set @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Validation") logging_handlers.log_metrics(engine, evaluator, stage="Validation") # dump validation set metrics at the very end for debugging purposes if engine.state.epoch == config.TRAIN.END_EPOCH and debug: fname = f"metrics_{config_file_name}_{config.TRAIN.MODEL_DIR}.json" metrics = evaluator.state.metrics out_dict = {x: metrics[x] for x in ["nll", "pixacc", "mca", "mIoU"]} with open(fname, "w") as fid: json.dump(out_dict, fid) log_msg = " ".join(f"{k}: {out_dict[k]}" for k in out_dict.keys()) logging.info(log_msg) # Checkpointing: snapshotting trained models to disk checkpoint_handler = SnapshotHandler( output_dir, config.MODEL.NAME, 
extract_metric_from("mIoU"), lambda: (trainer.state.iteration % snapshot_duration) == 0, ) evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model}) logger.info("Starting training") trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED) summary_writer.close()
def run(*options, cfg=None, debug=False): """Run testing of model Notes: Options can be passed in via the options argument and loaded from the cfg file Options from default.py will be overridden by options loaded from cfg file Options passed in via options argument will override option loaded from cfg file Args: *options (str,int ,optional): Options used to overide what is loaded from the config. To see what options are available consult default.py cfg (str, optional): Location of config file to load. Defaults to None. """ update_config(config, options=options, config_file=cfg) # Start logging load_log_configuration(config.LOG_CONFIG) logger = logging.getLogger(__name__) logger.debug(config.WORKERS) torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK torch.manual_seed(config.SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(config.SEED) np.random.seed(seed=config.SEED) # Setup Augmentations test_aug = Compose( [ Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=config.TRAIN.MAX,), PadIfNeeded( min_height=config.TRAIN.PATCH_SIZE, min_width=config.TRAIN.PATCH_SIZE, border_mode=cv2.BORDER_CONSTANT, always_apply=True, mask_value=mask_value, value=0, ), Resize( config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True, ), PadIfNeeded( min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT, min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH, border_mode=cv2.BORDER_CONSTANT, always_apply=True, mask_value=mask_value, value=0, ), ] ) PenobscotDataset = get_patch_dataset(config) test_set = PenobscotDataset( config.DATASET.ROOT, config.TRAIN.PATCH_SIZE, config.TRAIN.STRIDE, split="test", transforms=test_aug, n_channels=config.MODEL.IN_CHANNELS, complete_patches_only=config.TEST.COMPLETE_PATCHES_ONLY, ) logger.info(str(test_set)) n_classes = test_set.n_classes test_loader = data.DataLoader( test_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, ) model = getattr(models, config.MODEL.NAME).get_seg_model(config) logger.info(f"Loading model {config.TEST.MODEL_PATH}") model.load_state_dict(torch.load(config.TEST.MODEL_PATH), strict=False) device = "cpu" if torch.cuda.is_available(): device = "cuda" model = model.to(device) # Send to GPU try: output_dir = generate_path(config.OUTPUT_DIR, git_branch(), git_hash(), config.MODEL.NAME, current_datetime(),) except TypeError: output_dir = generate_path(config.OUTPUT_DIR, config.MODEL.NAME, current_datetime(),) summary_writer = create_summary_writer(log_dir=path.join(output_dir, config.LOG_DIR)) # weights are inversely proportional to the frequency of the classes in # the training set class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=mask_value, reduction="mean") def _select_pred_and_mask(model_out_dict): return (model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze()) def _select_all(model_out_dict): return ( model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze(), model_out_dict["ids"], model_out_dict["patch_locations"], ) inline_mean_iou = InlineMeanIoU( config.DATASET.INLINE_HEIGHT, config.DATASET.INLINE_WIDTH, config.TRAIN.PATCH_SIZE, n_classes, padding=_padding_from(config), scale=_scale_from(config), output_transform=_select_all, ) evaluator = create_supervised_evaluator( model, _prepare_batch, metrics={ "nll": Loss(criterion, output_transform=_select_pred_and_mask, device=device), "inIoU": inline_mean_iou, 
"pixa": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask, device=device), "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask, device=device), }, device=device, ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, logging_handlers.log_metrics( "Test results", metrics_dict={ "nll": "Avg loss :", "mIoU": "Avg IoU :", "pixa": "Pixelwise Accuracy :", "mca": "Mean Class Accuracy :", "inIoU": "Mean Inline IoU :", }, ), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, tensorboard_handlers.log_metrics( summary_writer, evaluator, "epoch", metrics_dict={"mIoU": "Test/IoU", "nll": "Test/Loss", "mca": "Test/MCA", "inIoU": "Test/MeanInlineIoU",}, ), ) def _select_max(pred_tensor): return pred_tensor.max(1)[1] def _tensor_to_numpy(pred_tensor): return pred_tensor.squeeze().cpu().numpy() transform_func = compose( np_to_tb, decode_segmap(n_classes=n_classes, label_colours=_SEG_COLOURS), _tensor_to_numpy, ) transform_pred = compose(transform_func, _select_max) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Test/Image", "image"), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Test/Mask", "mask", transform_func=transform_func), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Test/Pred", "y_pred", transform_func=transform_pred), ) logger.info("Starting training") if debug: logger.info("Running in Debug/Test mode") test_loader = take(3, test_loader) evaluator.run(test_loader, max_epochs=1) # Log top N and bottom N inlines in terms of IoU to tensorboard inline_ious = inline_mean_iou.iou_per_inline() sorted_ious = sorted(inline_ious.items(), key=lambda x: x[1], reverse=True) topk = ((inline_mean_iou.predictions[key], inline_mean_iou.masks[key]) for key, iou in take(_TOP_K, sorted_ious)) bottomk = ( (inline_mean_iou.predictions[key], inline_mean_iou.masks[key]) for key, iou in tail(_BOTTOM_K, sorted_ious) ) stack_and_decode = compose(transform_func, torch.stack) predictions, masks = unzip(chain(topk, bottomk)) predictions_tensor = stack_and_decode(list(predictions)) masks_tensor = stack_and_decode(list(masks)) _log_tensor_to_tensorboard(predictions_tensor, "Test/InlinePredictions", summary_writer, evaluator) _log_tensor_to_tensorboard(masks_tensor, "Test/InlineMasks", summary_writer, evaluator) summary_writer.close()
def run(*options, cfg=None, local_rank=0, debug=False): """Run training and validation of model Notes: Options can be passed in via the options argument and loaded from the cfg file Options from default.py will be overridden by options loaded from cfg file Options passed in via options argument will override option loaded from cfg file Args: *options (str,int ,optional): Options used to overide what is loaded from the config. To see what options are available consult default.py cfg (str, optional): Location of config file to load. Defaults to None. """ update_config(config, options=options, config_file=cfg) # we will write the model under outputs / config_file_name / model_dir config_file_name = "default_config" if not cfg else cfg.split( "/")[-1].split(".")[0] # Start logging load_log_configuration(config.LOG_CONFIG) logger = logging.getLogger(__name__) logger.debug(config.WORKERS) silence_other_ranks = True world_size = int(os.environ.get("WORLD_SIZE", 1)) distributed = world_size > 1 if distributed: # FOR DISTRIBUTED: Set the device according to local_rank. torch.cuda.set_device(local_rank) # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will # provide environment variables, and requires that you use init_method=`env://`. torch.distributed.init_process_group(backend="nccl", init_method="env://") epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK torch.manual_seed(config.SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(config.SEED) np.random.seed(seed=config.SEED) # Setup Augmentations basic_aug = Compose([ Normalize(mean=(config.TRAIN.MEAN, ), std=(config.TRAIN.STD, ), max_pixel_value=1), PadIfNeeded( min_height=config.TRAIN.PATCH_SIZE, min_width=config.TRAIN.PATCH_SIZE, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=255, ), Resize( config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True, ), PadIfNeeded( min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT, min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=255, ), ]) if config.TRAIN.AUGMENTATION: train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)]) val_aug = basic_aug else: train_aug = val_aug = basic_aug TrainPatchLoader = get_patch_loader(config) train_set = TrainPatchLoader( config.DATASET.ROOT, split="train", is_transform=True, stride=config.TRAIN.STRIDE, patch_size=config.TRAIN.PATCH_SIZE, augmentations=train_aug, ) val_set = TrainPatchLoader( config.DATASET.ROOT, split="val", is_transform=True, stride=config.TRAIN.STRIDE, patch_size=config.TRAIN.PATCH_SIZE, augmentations=val_aug, ) logger.info(f"Validation examples {len(val_set)}") n_classes = train_set.n_classes if debug: val_set = data.Subset(val_set, range(config.VALIDATION.BATCH_SIZE_PER_GPU)) train_set = data.Subset(train_set, range(config.TRAIN.BATCH_SIZE_PER_GPU * 2)) logger.info(f"Training examples {len(train_set)}") logger.info(f"Validation examples {len(val_set)}") train_sampler = torch.utils.data.distributed.DistributedSampler( train_set, num_replicas=world_size, rank=local_rank) train_loader = data.DataLoader( train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, sampler=train_sampler, ) val_sampler = torch.utils.data.distributed.DistributedSampler( val_set, num_replicas=world_size, rank=local_rank) val_loader = data.DataLoader( val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, 
num_workers=config.WORKERS, sampler=val_sampler, ) model = getattr(models, config.MODEL.NAME).get_seg_model(config) device = "cpu" if torch.cuda.is_available(): device = "cuda" model = model.to(device) # Send to GPU optimizer = torch.optim.SGD( model.parameters(), lr=config.TRAIN.MAX_LR, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WEIGHT_DECAY, ) # weights are inversely proportional to the frequency of the classes in # the training set class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean") model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[device], find_unused_parameters=True) snapshot_duration = epochs_per_cycle * len( train_loader) if not debug else 2 * len(train_loader) warmup_duration = 5 * len(train_loader) warmup_scheduler = LinearCyclicalScheduler( optimizer, "lr", start_value=config.TRAIN.MAX_LR, end_value=config.TRAIN.MAX_LR * world_size, cycle_size=10 * len(train_loader), ) cosine_scheduler = CosineAnnealingScheduler( optimizer, "lr", config.TRAIN.MAX_LR * world_size, config.TRAIN.MIN_LR * world_size, cycle_size=snapshot_duration, ) scheduler = ConcatScheduler( schedulers=[warmup_scheduler, cosine_scheduler], durations=[warmup_duration]) trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch, device=device) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Set to update the epoch parameter of our distributed data sampler so that we get # different shuffles trainer.add_event_handler(Events.EPOCH_STARTED, update_sampler_epoch(train_loader)) if silence_other_ranks & local_rank != 0: logging.getLogger("ignite.engine.engine.Engine").setLevel( logging.WARNING) def _select_pred_and_mask(model_out_dict): return (model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze()) evaluator = create_supervised_evaluator( model, prepare_batch, metrics={ "nll": Loss(criterion, output_transform=_select_pred_and_mask, device=device), "pixa": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask, device=device), "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask, device=device), }, device=device, ) # Set the validation run to start on the epoch completion of the training run trainer.add_event_handler(Events.EPOCH_COMPLETED, Evaluator(evaluator, val_loader)) if local_rank == 0: # Run only on master process trainer.add_event_handler( Events.ITERATION_COMPLETED, logging_handlers.log_training_output( log_interval=config.TRAIN.BATCH_SIZE_PER_GPU), ) trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer)) try: output_dir = generate_path( config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(), ) except TypeError: output_dir = generate_path( config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(), ) summary_writer = create_summary_writer( log_dir=path.join(output_dir, config.LOG_DIR)) logger.info( f"Logging Tensorboard to {path.join(output_dir, config.LOG_DIR)}") trainer.add_event_handler( Events.EPOCH_STARTED, tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"), ) 
trainer.add_event_handler( Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, logging_handlers.log_metrics( "Validation results", metrics_dict={ "nll": "Avg loss :", "mIoU": " Avg IoU :", "pixa": "Pixelwise Accuracy :", "mca": "Mean Class Accuracy :", }, ), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, tensorboard_handlers.log_metrics( summary_writer, trainer, "epoch", metrics_dict={ "mIoU": "Validation/IoU", "nll": "Validation/Loss", "mca": "Validation/MCA", }, ), ) def _select_max(pred_tensor): return pred_tensor.max(1)[1] def _tensor_to_numpy(pred_tensor): return pred_tensor.squeeze().cpu().numpy() transform_func = compose(np_to_tb, decode_segmap(n_classes=n_classes), _tensor_to_numpy) transform_pred = compose(transform_func, _select_max) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Image", "image"), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Mask", "mask", transform_func=transform_func), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer( summary_writer, "Validation/Pred", "y_pred", transform_func=transform_pred, ), ) def snapshot_function(): return (trainer.state.iteration % snapshot_duration) == 0 checkpoint_handler = SnapshotHandler( output_dir, config.MODEL.NAME, extract_metric_from("mIoU"), snapshot_function, ) evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model}) logger.info("Starting training") if debug: trainer.run( train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=config.TRAIN.BATCH_SIZE_PER_GPU * 2, seed=config.SEED, ) else: trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED)
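# --- Editor's note: illustrative sketch, not part of the original distributed script ---
# The schedule above warms the LR up linearly from MAX_LR to MAX_LR * world_size
# over warmup_duration iterations (half a LinearCyclicalScheduler cycle) and then
# hands over to cosine annealing between the scaled MAX_LR and MIN_LR, with warm
# restarts every snapshot_duration iterations.  The plain-math model below
# approximates that curve for sanity checking; it is not a drop-in replacement
# for the ignite schedulers.
import math


def _sketch_distributed_lr(iteration, max_lr, min_lr, world_size, warmup_duration, cycle_size):
    peak, low = max_lr * world_size, min_lr * world_size
    if iteration < warmup_duration:
        # linear ramp from the single-GPU LR up to the world-size-scaled peak
        return max_lr + (peak - max_lr) * iteration / warmup_duration
    # cosine annealing from peak down to low, restarting every cycle_size iterations
    t = ((iteration - warmup_duration) % cycle_size) / cycle_size
    return low + 0.5 * (peak - low) * (1 + math.cos(math.pi * t))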
def run(*options, cfg=None, debug=False): """Run training and validation of model Notes: Options can be passed in via the options argument and loaded from the cfg file Options from default.py will be overridden by options loaded from cfg file Options passed in via options argument will override option loaded from cfg file Args: *options (str,int ,optional): Options used to overide what is loaded from the config. To see what options are available consult default.py cfg (str, optional): Location of config file to load. Defaults to None. """ update_config(config, options=options, config_file=cfg) # Start logging load_log_configuration(config.LOG_CONFIG) logger = logging.getLogger(__name__) logger.debug(config.WORKERS) scheduler_step = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK torch.manual_seed(config.SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(config.SEED) np.random.seed(seed=config.SEED) # Setup Augmentations basic_aug = Compose([ Normalize(mean=(config.TRAIN.MEAN, ), std=(config.TRAIN.STD, ), max_pixel_value=1) ]) if config.TRAIN.AUGMENTATION: train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)]) val_aug = basic_aug else: train_aug = val_aug = basic_aug TrainLoader = get_section_loader(config) train_set = TrainLoader( data_dir=config.DATASET.ROOT, split="train", is_transform=True, augmentations=train_aug, ) val_set = TrainLoader( data_dir=config.DATASET.ROOT, split="val", is_transform=True, augmentations=val_aug, ) class CustomSampler(torch.utils.data.Sampler): def __init__(self, data_source): self.data_source = data_source def __iter__(self): char = ["i" if np.random.randint(2) == 1 else "x"] self.indices = [ idx for (idx, name) in enumerate(self.data_source) if char[0] in name ] return (self.indices[i] for i in torch.randperm(len(self.indices))) def __len__(self): return len(self.data_source) n_classes = train_set.n_classes val_list = val_set.sections train_list = val_set.sections train_loader = data.DataLoader( train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, sampler=CustomSampler(train_list), num_workers=config.WORKERS, shuffle=False, ) val_loader = data.DataLoader( val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, sampler=CustomSampler(val_list), num_workers=config.WORKERS, ) model = getattr(models, config.MODEL.NAME).get_seg_model(config) device = "cpu" if torch.cuda.is_available(): device = "cuda" model = model.to(device) # Send to GPU optimizer = torch.optim.SGD( model.parameters(), lr=config.TRAIN.MAX_LR, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WEIGHT_DECAY, ) try: output_dir = generate_path( config.OUTPUT_DIR, git_branch(), git_hash(), config.MODEL.NAME, current_datetime(), ) except TypeError: output_dir = generate_path( config.OUTPUT_DIR, config.MODEL.NAME, current_datetime(), ) summary_writer = create_summary_writer( log_dir=path.join(output_dir, config.LOG_DIR)) snapshot_duration = scheduler_step * len(train_loader) scheduler = CosineAnnealingScheduler(optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, snapshot_duration) # weights are inversely proportional to the frequency of the classes in # the training set class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean") trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch, device=device) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) 
trainer.add_event_handler( Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.PRINT_FREQ), ) trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer)) trainer.add_event_handler( Events.EPOCH_STARTED, tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"), ) trainer.add_event_handler( Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer), ) def _select_pred_and_mask(model_out_dict): return (model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze()) evaluator = create_supervised_evaluator( model, prepare_batch, metrics={ "nll": Loss(criterion, output_transform=_select_pred_and_mask, device=device), "pixacc": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask, device=device), "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask, device=device), "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask, device=device), }, device=device, ) if debug: logger.info("Running Validation in Debug/Test mode") val_loader = take(3, val_loader) trainer.add_event_handler(Events.EPOCH_COMPLETED, Evaluator(evaluator, val_loader)) evaluator.add_event_handler( Events.EPOCH_COMPLETED, logging_handlers.log_metrics( "Validation results", metrics_dict={ "nll": "Avg loss :", "pixacc": "Pixelwise Accuracy :", "mca": "Avg Class Accuracy :", "mIoU": "Avg Class IoU :", }, ), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, logging_handlers.log_class_metrics( "Per class validation results", metrics_dict={ "ciou": "Class IoU :", "cacc": "Class Accuracy :" }, ), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, tensorboard_handlers.log_metrics( summary_writer, trainer, "epoch", metrics_dict={ "mIoU": "Validation/mIoU", "nll": "Validation/Loss", "mca": "Validation/MCA", "pixacc": "Validation/Pixel_Acc", }, ), ) def _select_max(pred_tensor): return pred_tensor.max(1)[1] def _tensor_to_numpy(pred_tensor): return pred_tensor.squeeze().cpu().numpy() transform_func = compose(np_to_tb, decode_segmap(n_classes=n_classes), _tensor_to_numpy) transform_pred = compose(transform_func, _select_max) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Image", "image"), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Mask", "mask", transform_func=transform_func), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Pred", "y_pred", transform_func=transform_pred), ) def snapshot_function(): return (trainer.state.iteration % snapshot_duration) == 0 checkpoint_handler = SnapshotHandler( path.join(output_dir, config.TRAIN.MODEL_DIR), config.MODEL.NAME, extract_metric_from("mIoU"), snapshot_function, ) evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model}) logger.info("Starting training") if debug: logger.info("Running Validation in Debug/Test mode") train_loader = take(3, train_loader) trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH)
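# --- Editor's note: illustrative sketch, not part of the original section-based script ---
# CustomSampler above randomly commits each epoch to either inline ("i") or
# crossline ("x") sections and shuffles only that subset.  The toy function below
# replays the same selection on made-up section names (np and torch are already
# imported by this module); the name is hypothetical.
def _sketch_custom_sampler_choice(section_names):
    char = "i" if np.random.randint(2) == 1 else "x"
    indices = [idx for idx, name in enumerate(section_names) if char in name]
    return char, [indices[i] for i in torch.randperm(len(indices))]


# _sketch_custom_sampler_choice(["i_10", "i_11", "x_20", "x_21"])
# -> ("i", [1, 0]) or ("x", [3, 2]) etc., depending on the random draw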
def run(*options, cfg=None, debug=False): """Run training and validation of model Notes: Options can be passed in via the options argument and loaded from the cfg file Options loaded from default.py will be overridden by those loaded from cfg file Options passed in via options argument will override those loaded from cfg file Args: *options (str, int, optional): Options used to overide what is loaded from the config. To see what options are available consult default.py cfg (str, optional): Location of config file to load. Defaults to None. debug (bool): Places scripts in debug/test mode and only executes a few iterations """ update_config(config, options=options, config_file=cfg) # we will write the model under outputs / config_file_name / model_dir config_file_name = "default_config" if not cfg else cfg.split("/")[-1].split(".")[0] # Start logging load_log_configuration(config.LOG_CONFIG) logger = logging.getLogger(__name__) logger.debug(config.WORKERS) epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK torch.manual_seed(config.SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(config.SEED) np.random.seed(seed=config.SEED) device = "cpu" if torch.cuda.is_available(): device = "cuda" # Setup Augmentations basic_aug = Compose( [ Normalize(mean=(config.TRAIN.MEAN,), std=(config.TRAIN.STD,), max_pixel_value=config.TRAIN.MAX,), PadIfNeeded( min_height=config.TRAIN.PATCH_SIZE, min_width=config.TRAIN.PATCH_SIZE, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=mask_value, value=0, ), Resize( config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True, ), PadIfNeeded( min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT, min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=mask_value, value=0, ), ] ) if config.TRAIN.AUGMENTATION: train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)]) val_aug = basic_aug else: train_aug = val_aug = basic_aug PenobscotDataset = get_patch_dataset(config) train_set = PenobscotDataset( config.DATASET.ROOT, config.TRAIN.PATCH_SIZE, config.TRAIN.STRIDE, split="train", transforms=train_aug, n_channels=config.MODEL.IN_CHANNELS, complete_patches_only=config.TRAIN.COMPLETE_PATCHES_ONLY, ) val_set = PenobscotDataset( config.DATASET.ROOT, config.TRAIN.PATCH_SIZE, config.TRAIN.STRIDE, split="val", transforms=val_aug, n_channels=config.MODEL.IN_CHANNELS, complete_patches_only=config.VALIDATION.COMPLETE_PATCHES_ONLY, ) logger.info(train_set) logger.info(val_set) n_classes = train_set.n_classes train_loader = data.DataLoader( train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, shuffle=True, ) if debug: val_set = data.Subset(val_set, range(3)) val_loader = data.DataLoader(val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS) model = getattr(models, config.MODEL.NAME).get_seg_model(config) model = model.to(device) # Send to GPU optimizer = torch.optim.SGD( model.parameters(), lr=config.TRAIN.MAX_LR, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WEIGHT_DECAY, ) try: output_dir = generate_path( config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(), ) except TypeError: output_dir = generate_path(config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(),) summary_writer = create_summary_writer(log_dir=path.join(output_dir, 
config.LOG_DIR)) snapshot_duration = epochs_per_cycle * len(train_loader) if not debug else 2 * len(train_loader) scheduler = CosineAnnealingScheduler( optimizer, "lr", config.TRAIN.MAX_LR, config.TRAIN.MIN_LR, cycle_size=snapshot_duration ) # weights are inversely proportional to the frequency of the classes in # the training set class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=mask_value, reduction="mean") trainer = create_supervised_trainer(model, optimizer, criterion, _prepare_batch, device=device) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) trainer.add_event_handler( Events.ITERATION_COMPLETED, logging_handlers.log_training_output(log_interval=config.TRAIN.BATCH_SIZE_PER_GPU), ) trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer)) trainer.add_event_handler( Events.EPOCH_STARTED, tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch"), ) trainer.add_event_handler( Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer), ) def _select_pred_and_mask(model_out_dict): return (model_out_dict["y_pred"].squeeze(), model_out_dict["mask"].squeeze()) evaluator = create_supervised_evaluator( model, _prepare_batch, metrics={ "pixacc": pixelwise_accuracy(n_classes, output_transform=_select_pred_and_mask), "nll": Loss(criterion, output_transform=_select_pred_and_mask), "cacc": class_accuracy(n_classes, output_transform=_select_pred_and_mask), "mca": mean_class_accuracy(n_classes, output_transform=_select_pred_and_mask), "ciou": class_iou(n_classes, output_transform=_select_pred_and_mask), "mIoU": mean_iou(n_classes, output_transform=_select_pred_and_mask), }, device=device, ) # Set the validation run to start on the epoch completion of the training run trainer.add_event_handler(Events.EPOCH_COMPLETED, Evaluator(evaluator, val_loader)) evaluator.add_event_handler( Events.EPOCH_COMPLETED, logging_handlers.log_metrics( "Validation results", metrics_dict={ "nll": "Avg loss :", "pixacc": "Pixelwise Accuracy :", "mca": "Avg Class Accuracy :", "mIoU": "Avg Class IoU :", }, ), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, tensorboard_handlers.log_metrics( summary_writer, trainer, "epoch", metrics_dict={ "mIoU": "Validation/mIoU", "nll": "Validation/Loss", "mca": "Validation/MCA", "pixacc": "Validation/Pixel_Acc", }, ), ) def _select_max(pred_tensor): return pred_tensor.max(1)[1] def _tensor_to_numpy(pred_tensor): return pred_tensor.squeeze().cpu().numpy() transform_func = compose(np_to_tb, decode_segmap, _tensor_to_numpy,) transform_pred = compose(transform_func, _select_max) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Image", "image"), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Mask", "mask", transform_func=transform_func), ) evaluator.add_event_handler( Events.EPOCH_COMPLETED, create_image_writer(summary_writer, "Validation/Pred", "y_pred", transform_func=transform_pred), ) def snapshot_function(): return (trainer.state.iteration % snapshot_duration) == 0 checkpoint_handler = SnapshotHandler(output_dir, config.MODEL.NAME, extract_metric_from("mIoU"), snapshot_function,) evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model}) logger.info("Starting training") if debug: trainer.run( train_loader, max_epochs=config.TRAIN.END_EPOCH, 
epoch_length=config.TRAIN.BATCH_SIZE_PER_GPU, seed=config.SEED, ) else: trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED)
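# --- Editor's note: illustrative sketch, not part of the original Penobscot script ---
# `Evaluator(evaluator, val_loader)` is a project helper; its observable effect is
# simply to run the ignite evaluator over the validation loader whenever the
# trainer completes an epoch.  A minimal stand-in with the same call pattern
# (hypothetical class, shown only to document the pattern):
class _SketchEvaluatorHandler:
    def __init__(self, evaluator_engine, loader):
        self._evaluator = evaluator_engine
        self._loader = loader

    def __call__(self, trainer_engine):
        # ignite passes the triggering engine; results land in self._evaluator.state.metrics
        self._evaluator.run(self._loader)


# Usage mirrors the call above:
# trainer.add_event_handler(Events.EPOCH_COMPLETED, _SketchEvaluatorHandler(evaluator, val_loader))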
def _evaluate_split(
    split, section_aug, model, pre_processing, output_processing, device, running_metrics_overall, config, debug=False,
):
    logger = logging.getLogger(__name__)

    TestSectionLoader = get_test_loader(config)
    test_set = TestSectionLoader(config.DATASET.ROOT, split=split, is_transform=True, augmentations=section_aug,)

    n_classes = test_set.n_classes

    test_loader = data.DataLoader(test_set, batch_size=1, num_workers=config.WORKERS, shuffle=False)
    if debug:
        logger.info("Running in Debug/Test mode")
        test_loader = take(1, test_loader)

    try:
        output_dir = generate_path(
            config.OUTPUT_DIR + "_test", git_branch(), git_hash(), config.MODEL.NAME, current_datetime(),
        )
    except TypeError:
        output_dir = generate_path(config.OUTPUT_DIR + "_test", config.MODEL.NAME, current_datetime(),)

    running_metrics_split = runningScore(n_classes)

    # testing mode:
    with torch.no_grad():  # operations inside don't track history
        model.eval()
        total_iteration = 0
        for i, (images, labels) in enumerate(test_loader):
            logger.info(f"split: {split}, section: {i}")
            total_iteration = total_iteration + 1
            outputs = _patch_label_2d(
                model,
                images,
                pre_processing,
                output_processing,
                config.TRAIN.PATCH_SIZE,
                config.TEST.TEST_STRIDE,
                config.VALIDATION.BATCH_SIZE_PER_GPU,
                device,
                n_classes,
            )

            pred = outputs.detach().max(1)[1].numpy()
            gt = labels.numpy()
            running_metrics_split.update(gt, pred)
            running_metrics_overall.update(gt, pred)

            # dump images to disk for review
            mask_to_disk(pred.squeeze(), os.path.join(output_dir, f"{i}_pred.png"))
            mask_to_disk(gt.squeeze(), os.path.join(output_dir, f"{i}_gt.png"))

    # get scores
    score, class_iou = running_metrics_split.get_scores()

    # Log split results
    logger.info(f'Pixel Acc: {score["Pixel Acc: "]:.3f}')
    for cdx, class_name in enumerate(_CLASS_NAMES):
        logger.info(f'  {class_name}_accuracy {score["Class Accuracy: "][cdx]:.3f}')

    logger.info(f'Mean Class Acc: {score["Mean Class Acc: "]:.3f}')
    logger.info(f'Freq Weighted IoU: {score["Freq Weighted IoU: "]:.3f}')
    logger.info(f'Mean IoU: {score["Mean IoU: "]:0.3f}')
    running_metrics_split.reset()
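# --- Editor's note: illustrative sketch, not part of the original evaluation script ---
# `runningScore` accumulates a per-split confusion matrix, and the values logged
# above (pixel accuracy, per-class accuracy, frequency-weighted IoU, mean IoU)
# are all derived from it.  A compact, self-contained version of that computation
# (hypothetical helper; assumes integer label arrays and that ignored pixels fall
# outside [0, n_classes)):
def _sketch_confusion_metrics(gts, preds, n_classes):
    hist = np.zeros((n_classes, n_classes), dtype=np.int64)
    for gt, pred in zip(gts, preds):
        valid = (gt >= 0) & (gt < n_classes)
        hist += np.bincount(
            n_classes * gt[valid].astype(int) + pred[valid], minlength=n_classes ** 2
        ).reshape(n_classes, n_classes)
    pixel_acc = np.diag(hist).sum() / hist.sum()
    class_acc = np.diag(hist) / hist.sum(axis=1)
    iou = np.diag(hist) / (hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist))
    return {"Pixel Acc": pixel_acc, "Mean Class Acc": np.nanmean(class_acc), "Mean IoU": np.nanmean(iou)}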
def _evaluate_split( split, section_aug, model, pre_processing, output_processing, device, running_metrics_overall, config, data_flow, debug=False, ): logger = logging.getLogger(__name__) TestSectionLoader = get_test_loader(config) test_set = TestSectionLoader( config, split=split, is_transform=True, augmentations=section_aug, debug=debug, ) n_classes = test_set.n_classes if debug: data_flow[split] = dict() data_flow[split]["test_section_loader_length"] = len(test_set) data_flow[split]["test_input_shape"] = test_set.seismic.shape data_flow[split]["test_label_shape"] = test_set.labels.shape data_flow[split]["n_classes"] = n_classes test_loader = data.DataLoader(test_set, batch_size=1, num_workers=config.WORKERS, shuffle=False) if debug: data_flow[split]["test_loader_length"] = len(test_loader) logger.info("Running in Debug/Test mode") take_n = 2 test_loader = take(take_n, test_loader) data_flow[split]["take_n_sections"] = take_n pred_list, gt_list, img_list = [], [], [] try: output_dir = generate_path( f"{config.OUTPUT_DIR}/test/{split}", git_branch(), git_hash(), config.MODEL.NAME, current_datetime(), ) except: output_dir = generate_path( f"{config.OUTPUT_DIR}/test/{split}", config.MODEL.NAME, current_datetime(), ) running_metrics_split = runningScore(n_classes) # evaluation mode: with torch.no_grad(): # operations inside don't track history model.eval() for i, (images, labels) in enumerate(test_loader): logger.info(f"split: {split}, section: {i}") outputs = _patch_label_2d( model, images, pre_processing, output_processing, config.TRAIN.PATCH_SIZE, config.TEST.TEST_STRIDE, config.VALIDATION.BATCH_SIZE_PER_GPU, device, n_classes, split, debug, config.DATASET.MIN, config.DATASET.MAX, ) pred = outputs.detach().max(1)[1].numpy() gt = labels.numpy() if debug: pred_list.append((pred.shape, len(np.unique(pred)))) gt_list.append((gt.shape, len(np.unique(gt)))) img_list.append(images.numpy().shape) running_metrics_split.update(gt, pred) running_metrics_overall.update(gt, pred) # dump images to disk for review mask_to_disk(pred.squeeze(), os.path.join(output_dir, f"{i}_pred.png"), n_classes) mask_to_disk(gt.squeeze(), os.path.join(output_dir, f"{i}_gt.png"), n_classes) if debug: data_flow[split]["pred_shape"] = pred_list data_flow[split]["gt_shape"] = gt_list data_flow[split]["img_shape"] = img_list # get scores score, class_iou = running_metrics_split.get_scores() # Log split results logger.info(f'Pixel Acc: {score["Pixel Acc: "]:.3f}') if debug: for cdx in range(n_classes): logger.info( f' Class_{cdx}_accuracy {score["Class Accuracy: "][cdx]:.3f}') else: for cdx, class_name in enumerate(_CLASS_NAMES): logger.info( f' {class_name}_accuracy {score["Class Accuracy: "][cdx]:.3f}' ) logger.info(f'Mean Class Acc: {score["Mean Class Acc: "]:.3f}') logger.info(f'Freq Weighted IoU: {score["Freq Weighted IoU: "]:.3f}') logger.info(f'Mean IoU: {score["Mean IoU: "]:0.3f}') running_metrics_split.reset()
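# --- Editor's note: illustrative sketch, not part of the original evaluation script ---
# Both versions of `_evaluate_split` delegate the heavy lifting to `_patch_label_2d`,
# which labels a whole 2D section by sliding a PATCH_SIZE window with TEST_STRIDE,
# scoring each patch with the model and writing the logits back into a
# section-sized canvas.  The skeleton below shows only the stitching idea; the
# real helper also handles padding, batching and the pre-/output-processing hooks,
# and the model call signature here is an assumption.
def _sketch_patch_label_2d(model, section, patch_size, stride, n_classes, device):
    h, w = section.shape
    canvas = torch.zeros((1, n_classes, h, w))
    with torch.no_grad():
        for top in range(0, h - patch_size + 1, stride):
            for left in range(0, w - patch_size + 1, stride):
                patch = section[top : top + patch_size, left : left + patch_size]
                logits = model(patch[None, None].to(device)).cpu()  # assumed 1 x n_classes x P x P
                canvas[:, :, top : top + patch_size, left : left + patch_size] += logits
    return canvas  # argmax over dim 1 gives the stitched per-pixel prediction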
def run(*options, cfg=None, local_rank=0, debug=False, input=None, distributed=False): """Run training and validation of model Notes: Options can be passed in via the options argument and loaded from the cfg file Options from default.py will be overridden by options loaded from cfg file Options from default.py will be overridden by options loaded from cfg file Options passed in via options argument will override option loaded from cfg file Args: *options (str,int ,optional): Options used to overide what is loaded from the config. To see what options are available consult default.py cfg (str, optional): Location of config file to load. Defaults to None. debug (bool): Places scripts in debug/test mode and only executes a few iterations input (str, optional): Location of data if Azure ML run, for local runs input is config.DATASET.ROOT distributed (bool): This flag tells the training script to run in distributed mode if more than one GPU exists. """ # if AML training pipeline supplies us with input if input is not None: data_dir = input output_dir = data_dir + config.OUTPUT_DIR # Start logging load_log_configuration(config.LOG_CONFIG) logger = logging.getLogger(__name__) logger.debug(config.WORKERS) # Configuration: update_config(config, options=options, config_file=cfg) silence_other_ranks = True world_size = int(os.environ.get("WORLD_SIZE", 1)) distributed = world_size > 1 if distributed: # FOR DISTRIBUTED: Set the device according to local_rank. torch.cuda.set_device(local_rank) # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will # provide environment variables, and requires that you use init_method=`env://`. torch.distributed.init_process_group(backend="nccl", init_method="env://") logging.info(f"Started train.py using distributed mode.") else: logging.info(f"Started train.py using local mode.") # Set CUDNN benchmark mode: torch.backends.cudnn.benchmark = config.CUDNN.BENCHMARK # Fix random seeds: torch.manual_seed(config.SEED) if torch.cuda.is_available(): torch.cuda.manual_seed_all(config.SEED) np.random.seed(seed=config.SEED) # Augmentation: basic_aug = Compose([ Normalize(mean=(config.TRAIN.MEAN, ), std=(config.TRAIN.STD, ), max_pixel_value=1), PadIfNeeded( min_height=config.TRAIN.PATCH_SIZE, min_width=config.TRAIN.PATCH_SIZE, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=255, value=0, ), Resize( config.TRAIN.AUGMENTATIONS.RESIZE.HEIGHT, config.TRAIN.AUGMENTATIONS.RESIZE.WIDTH, always_apply=True, ), PadIfNeeded( min_height=config.TRAIN.AUGMENTATIONS.PAD.HEIGHT, min_width=config.TRAIN.AUGMENTATIONS.PAD.WIDTH, border_mode=config.OPENCV_BORDER_CONSTANT, always_apply=True, mask_value=255, ), ]) if config.TRAIN.AUGMENTATION: train_aug = Compose([basic_aug, HorizontalFlip(p=0.5)]) val_aug = basic_aug else: train_aug = val_aug = basic_aug # Training and Validation Loaders: TrainPatchLoader = get_patch_loader(config) logging.info(f"Using {TrainPatchLoader}") train_set = TrainPatchLoader( config, split="train", is_transform=True, augmentations=train_aug, debug=debug, ) logger.info(train_set) n_classes = train_set.n_classes val_set = TrainPatchLoader( config, split="val", is_transform=True, augmentations=val_aug, debug=debug, ) logger.info(val_set) if debug: data_flow_dict = dict() data_flow_dict["train_patch_loader_length"] = len(train_set) data_flow_dict["validation_patch_loader_length"] = len(val_set) data_flow_dict["train_input_shape"] = train_set.seismic.shape data_flow_dict["train_label_shape"] = train_set.labels.shape 
data_flow_dict["n_classes"] = n_classes logger.info("Running in debug mode..") train_range = min( config.TRAIN.BATCH_SIZE_PER_GPU * config.NUM_DEBUG_BATCHES, len(train_set)) logging.info(f"train range in debug mode {train_range}") train_set = data.Subset(train_set, range(train_range)) valid_range = min(config.VALIDATION.BATCH_SIZE_PER_GPU, len(val_set)) val_set = data.Subset(val_set, range(valid_range)) data_flow_dict["train_length_subset"] = len(train_set) data_flow_dict["validation_length_subset"] = len(val_set) train_sampler = torch.utils.data.distributed.DistributedSampler( train_set, num_replicas=world_size, rank=local_rank) val_sampler = torch.utils.data.distributed.DistributedSampler( val_set, num_replicas=world_size, rank=local_rank) train_loader = data.DataLoader( train_set, batch_size=config.TRAIN.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, sampler=train_sampler, ) val_loader = data.DataLoader( val_set, batch_size=config.VALIDATION.BATCH_SIZE_PER_GPU, num_workers=config.WORKERS, sampler=val_sampler) if debug: data_flow_dict["train_loader_length"] = len(train_loader) data_flow_dict["validation_loader_length"] = len(val_loader) config_file_name = "default_config" if not cfg else cfg.split( "/")[-1].split(".")[0] fname = f"data_flow_train_{config_file_name}_{config.TRAIN.MODEL_DIR}.json" with open(fname, "w") as f: json.dump(data_flow_dict, f, indent=2) # Model: model = getattr(models, config.MODEL.NAME).get_seg_model(config) device = "cuda" if torch.cuda.is_available() else "cpu" model = model.to(device) # Optimizer and LR Scheduler: optimizer = torch.optim.SGD( model.parameters(), lr=config.TRAIN.MAX_LR, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WEIGHT_DECAY, ) epochs_per_cycle = config.TRAIN.END_EPOCH // config.TRAIN.SNAPSHOTS snapshot_duration = epochs_per_cycle * len( train_loader) if not debug else 2 * len(train_loader) cosine_scheduler = CosineAnnealingScheduler( optimizer, "lr", config.TRAIN.MAX_LR * world_size, config.TRAIN.MIN_LR * world_size, cycle_size=snapshot_duration, ) if distributed: warmup_duration = 5 * len(train_loader) warmup_scheduler = LinearCyclicalScheduler( optimizer, "lr", start_value=config.TRAIN.MAX_LR, end_value=config.TRAIN.MAX_LR * world_size, cycle_size=10 * len(train_loader), ) scheduler = ConcatScheduler( schedulers=[warmup_scheduler, cosine_scheduler], durations=[warmup_duration]) else: scheduler = cosine_scheduler # class weights are inversely proportional to the frequency of the classes in the training set class_weights = torch.tensor(config.DATASET.CLASS_WEIGHTS, device=device, requires_grad=False) # Loss: criterion = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=255, reduction="mean") # Model: if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[device], find_unused_parameters=True) if silence_other_ranks & local_rank != 0: logging.getLogger("ignite.engine.engine.Engine").setLevel( logging.WARNING) # Ignite trainer and evaluator: trainer = create_supervised_trainer(model, optimizer, criterion, prepare_batch, device=device) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Set to update the epoch parameter of our distributed data sampler so that we get # different shuffles trainer.add_event_handler(Events.EPOCH_STARTED, update_sampler_epoch(train_loader)) transform_fn = lambda output_dict: (output_dict["y_pred"].squeeze(), output_dict["mask"].squeeze()) evaluator = create_supervised_evaluator( model, prepare_batch, metrics={ "nll": Loss(criterion, 
output_transform=transform_fn, device=device), "pixacc": pixelwise_accuracy(n_classes, output_transform=transform_fn, device=device), "cacc": class_accuracy(n_classes, output_transform=transform_fn, device=device), "mca": mean_class_accuracy(n_classes, output_transform=transform_fn, device=device), "ciou": class_iou(n_classes, output_transform=transform_fn, device=device), "mIoU": mean_iou(n_classes, output_transform=transform_fn, device=device), }, device=device, ) # The model will be saved under: outputs/<config_file_name>/<model_dir> config_file_name = "default_config" if not cfg else cfg.split( "/")[-1].split(".")[0] try: output_dir = generate_path( config.OUTPUT_DIR, git_branch(), git_hash(), config_file_name, config.TRAIN.MODEL_DIR, current_datetime(), ) except: output_dir = generate_path( config.OUTPUT_DIR, config_file_name, config.TRAIN.MODEL_DIR, current_datetime(), ) if local_rank == 0: # Run only on master process # Logging: trainer.add_event_handler( Events.ITERATION_COMPLETED, logging_handlers.log_training_output( log_interval=config.PRINT_FREQ), ) trainer.add_event_handler(Events.EPOCH_STARTED, logging_handlers.log_lr(optimizer)) # Checkpointing: snapshotting trained models to disk checkpoint_handler = SnapshotHandler( output_dir, config.MODEL.NAME, extract_metric_from("mIoU"), lambda: (trainer.state.iteration % snapshot_duration) == 0, ) evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {"model": model}) # Tensorboard and Logging: summary_writer = create_summary_writer( log_dir=path.join(output_dir, "logs")) trainer.add_event_handler( Events.EPOCH_STARTED, tensorboard_handlers.log_lr(summary_writer, optimizer, "epoch")) trainer.add_event_handler( Events.ITERATION_COMPLETED, tensorboard_handlers.log_training_output(summary_writer)) trainer.add_event_handler( Events.ITERATION_COMPLETED, tensorboard_handlers.log_validation_output(summary_writer)) @trainer.on(Events.EPOCH_COMPLETED) def log_training_results(engine): evaluator.run(train_loader) if local_rank == 0: # Run only on master process tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Training") logging_handlers.log_metrics(engine, evaluator, stage="Training") logger.info("Logging training results..") @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): evaluator.run(val_loader) if local_rank == 0: # Run only on master process tensorboard_handlers.log_results(engine, evaluator, summary_writer, n_classes, stage="Validation") logging_handlers.log_metrics(engine, evaluator, stage="Validation") logger.info("Logging validation results..") # dump validation set metrics at the very end for debugging purposes if engine.state.epoch == config.TRAIN.END_EPOCH and debug: fname = f"metrics_{config_file_name}_{config.TRAIN.MODEL_DIR}.json" metrics = evaluator.state.metrics out_dict = { x: metrics[x] for x in ["nll", "pixacc", "mca", "mIoU"] } with open(fname, "w") as fid: json.dump(out_dict, fid) log_msg = " ".join(f"{k}: {out_dict[k]}" for k in out_dict.keys()) logging.info(log_msg) logger.info("Starting training") trainer.run(train_loader, max_epochs=config.TRAIN.END_EPOCH, epoch_length=len(train_loader), seed=config.SEED) if local_rank == 0: summary_writer.close()
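# --- Editor's note: illustrative sketch, not part of the original distributed script ---
# `update_sampler_epoch(train_loader)` is a project helper attached to
# EPOCH_STARTED above; DistributedSampler only changes its shuffle when its epoch
# counter changes, so something equivalent to the hypothetical handler below has
# to run at the start of every epoch.
def _sketch_update_sampler_epoch(loader):
    def _handler(engine):
        # ignite epochs are 1-based; any value that differs per epoch works here
        loader.sampler.set_epoch(engine.state.epoch)

    return _handler


# trainer.add_event_handler(Events.EPOCH_STARTED, _sketch_update_sampler_epoch(train_loader))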