def initialize(
    config: Optional[Any], num_channels: int
) -> Tuple[Module, Module, Optimizer, Optimizer, Module, None]:
    """Initialize models, optimizers, loss function, and lr scheduler with correct settings.

    Parameters
    ----------
    config
        config object
    num_channels
        number of channels for Generator

    Returns
    -------
    netD, netG, optimizerD, optimizerG, loss_fn, lr_scheduler
    """
    netG = idist.auto_model(Generator(config.z_dim, config.g_filters, num_channels))
    netD = idist.auto_model(Discriminator(num_channels, config.d_filters))

    loss_fn = nn.BCELoss().to(idist.device())

    optimizerG = optim.Adam(netG.parameters(), lr=config.lr, betas=(config.beta_1, 0.999))
    optimizerD = optim.Adam(netD.parameters(), lr=config.lr, betas=(config.beta_1, 0.999))

    # No lr scheduler is used in this setup
    return netD, netG, optimizerD, optimizerG, loss_fn, None
def _init_distribution(self):
    self.rank = idist.get_rank()
    manual_seed(42 + self.rank)
    self.device = idist.device()

    if self.train_ds:
        if self.train_ds.sampler is not None:
            sampler = self.train_ds.sampler(self.train_ds, self.train_ds.get_label)
            is_shuffle = False
        else:
            sampler = None
            is_shuffle = True

        self.train_loader = idist.auto_dataloader(
            self.train_ds,
            batch_size=self.hparams.train_bs,
            num_workers=self.hparams.train_num_workers,
            shuffle=is_shuffle,
            drop_last=True,
            sampler=sampler,
            **self.train_ds.additional_loader_params,
        )

    if self.valid_ds:
        self.valid_loader = idist.auto_dataloader(
            self.valid_ds,
            batch_size=self.hparams.valid_bs,
            num_workers=self.hparams.valid_num_workers,
            shuffle=False,
            drop_last=False,
            **self.valid_ds.additional_loader_params,
        )

    if self.test_ds:
        self.test_loader = idist.auto_dataloader(
            self.test_ds,
            batch_size=self.hparams.valid_bs,
            num_workers=self.hparams.valid_num_workers,
            shuffle=False,
            drop_last=False,
            **self.test_ds.additional_loader_params,
        )

    if USE_AMP:
        # Apex AMP needs the optimizer to exist before amp.initialize is called
        self._init_optimizer()
        self.model = idist.auto_model(self.model)
        self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level="O1")
    else:
        self.model = idist.auto_model(self.model)
        self._init_optimizer()

    self.optimizer = idist.auto_optim(self.optimizer)
    self._init_scheduler()
    self.criterion = self.criterion.to(self.device)
def initialize(config): model = utils.get_model(config["model"]) # Adapt model for distributed backend if provided model = idist.auto_model(model) optimizer = utils.get_optimizer( config["optimizer"], model, learning_rate=config["learning_rate"], weight_decay=config["weight_decay"], ) # Adapt optimizer for distributed backend if provided optimizer = idist.auto_optim(optimizer) criterion = nn.CrossEntropyLoss().to(idist.device()) le = config["num_iters_per_epoch"] milestones_values = [ (0, 0.0), (le * config["num_warmup_epochs"], config["learning_rate"]), (le * config["num_epochs"], 0.0), ] lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values) return model, optimizer, criterion, lr_scheduler
def initialize(config):
    device = idist.device()

    model = config.model.to(device)
    optimizer = config.optimizer

    # Adapt model to dist config
    model = idist.auto_model(model)

    if idist.backend() == "horovod":
        accumulation_steps = config.get("accumulation_steps", 1)
        # Can not use auto_optim with Horovod: https://github.com/horovod/horovod/issues/2670
        import horovod.torch as hvd

        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=model.named_parameters(),
            backward_passes_per_step=accumulation_steps,
        )
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        if accumulation_steps > 1:
            # disable manual grads accumulation as it is already done on optimizer's side
            config.accumulation_steps = 1
    else:
        optimizer = idist.auto_optim(optimizer)

    criterion = config.criterion.to(device)

    return model, optimizer, criterion
def get_model(config):
    """For a list of possible architectures, encoders and weights, please refer to:
    https://github.com/qubvel/segmentation_models.pytorch#architectures-
    """
    assert config["architecture"] in smp.__dict__, f"Unknown architecture: {config['architecture']}"

    model = smp.__dict__[config["architecture"]](
        encoder_name=config["encoder"],
        encoder_weights=config["encoder_weights"],
        classes=1,
        activation="sigmoid",
    )

    if "encoder_freeze_at" in config:
        freeze_encoder_at(model.encoder, config["encoder_freeze_at"])

    if "device" in config and config["device"] == "cpu":
        return model

    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)
    return model
def initialize(config): model = utils.get_model(config["model"], config["model_dir"], config["dropout"], config["n_fc"], config["num_classes"]) config["learning_rate"] *= idist.get_world_size() # Adapt model for distributed settings if configured model = idist.auto_model(model) optimizer = optim.AdamW( model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"], ) optimizer = idist.auto_optim(optimizer) criterion = nn.BCEWithLogitsLoss() le = config["num_iters_per_epoch"] milestones_values = [ (0, 0.0), (le * config["num_warmup_epochs"], config["learning_rate"]), (le * config["num_epochs"], 0.0), ] lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values) return model, optimizer, criterion, lr_scheduler
def initialize(
    config: ConfigSchema, wlm: WeakLabelManager
) -> Tuple[nn.Module, Optimizer, nn.Module]:
    model = get_model(config.model)
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    # Do not apply weight decay to biases and 1-d parameters (e.g. norm layers)
    to_decay, not_to_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        elif len(param.shape) == 1 or name.endswith("bias"):
            not_to_decay.append(param)
        else:
            to_decay.append(param)

    optimizer = optim.SGD(
        [
            {"params": to_decay, "weight_decay": config.weight_decay},
            {"params": not_to_decay, "weight_decay": 0.0},
        ],
        lr=config.learning_rate,
        momentum=config.momentum,
        nesterov=True,
    )
    optimizer = idist.auto_optim(optimizer)
    criterion = get_weak_label_loss(config, wlm).to(idist.device())

    return model, optimizer, criterion
def _test_ema_final_weight(model, device, ddp=False, interval=1):
    """Test if final smoothed weights are correct"""
    if isinstance(device, str):
        device = torch.device(device)
    model = model.to(device)
    if ddp:
        model = idist.auto_model(model)
    step_fn = _get_dummy_step_fn(model)
    engine = Engine(step_fn)

    # momentum will be constantly 0.5
    ema_handler = EMAHandler(model, momentum_warmup=0.5, momentum=0.5, warmup_iters=1)
    ema_handler.attach(engine, "model", event=Events.ITERATION_COMPLETED(every=interval))

    # engine will run 4 iterations
    engine.run(range(2), max_epochs=2)

    ema_weight = ema_handler.ema_model.weight.data
    model_weight = model.weight.data
    assert ema_weight.device == device
    assert model_weight.device == device

    if interval == 1:
        torch.testing.assert_allclose(ema_weight, torch.full((1, 2), 4.0625, device=device))
    elif interval == 2:
        torch.testing.assert_allclose(ema_weight, torch.full((1, 2), 3.5, device=device))
    else:
        pass

    torch.testing.assert_allclose(model_weight, torch.full((1, 2), 5.0, device=device))
def _test_ema_final_weight(model, device=None, ddp=False, interval=1):
    """Test if final smoothed weights are correct"""
    if device is None:
        # let horovod decide the device
        device = idist.device()
    if isinstance(device, str):
        device = torch.device(device)
    model = model.to(device)
    if ddp:
        model = idist.auto_model(model)
    step_fn = _get_dummy_step_fn(model)
    engine = Engine(step_fn)

    ema_handler = EMAHandler(model, momentum=0.5)
    ema_handler.attach(engine, "model", event=Events.ITERATION_COMPLETED(every=interval))

    # engine will run 4 iterations
    engine.run(range(2), max_epochs=2)

    # ema_model and model can be DP or DDP
    # explicitly cast to float32 to avoid test failure on XLA devices
    ema_weight = _unwrap_model(ema_handler.ema_model).weight.data.to(torch.float32)
    model_weight = _unwrap_model(model).weight.data.to(torch.float32)
    assert ema_weight.device == device
    assert model_weight.device == device

    if interval == 1:
        assert ema_weight.allclose(ema_weight.new_full((1, 2), 4.0625))
    elif interval == 2:
        assert ema_weight.allclose(ema_weight.new_full((1, 2), 3.5))
    else:
        pass

    assert model_weight.allclose(model_weight.new_full((1, 2), 5.0))
def initialize(config): model = utils.get_model(config["model"]) # Adapt model for distributed settings if configured model = idist.auto_model(model, find_unused_parameters=True) optimizer = optim.SGD( model.parameters(), lr=config["learning_rate"], momentum=config["momentum"], weight_decay=config["weight_decay"], nesterov=True, ) optimizer = idist.auto_optim(optimizer) criterion = nn.CrossEntropyLoss().to(idist.device()) le = config["num_iters_per_epoch"] milestones_values = [ (0, 0.0), (le * config["num_warmup_epochs"], config["learning_rate"]), (le * config["num_epochs"], 0.0), ] lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values) return model, optimizer, criterion, lr_scheduler
def evaluation(local_rank, config, logger, with_clearml):
    rank = idist.get_rank()
    device = idist.device()
    manual_seed(config.seed + local_rank)

    data_loader = config.data_loader
    model = config.model.to(device)

    # Load weights:
    state_dict = get_model_weights(config, logger, with_clearml)
    model.load_state_dict(state_dict)

    # Adapt model to dist config
    model = idist.auto_model(model)

    # Setup evaluators
    num_classes = config.num_classes
    cm_metric = ConfusionMatrix(num_classes=num_classes)

    val_metrics = {
        "IoU": IoU(cm_metric),
        "mIoU_bg": mIoU(cm_metric),
    }

    if ("val_metrics" in config) and isinstance(config.val_metrics, dict):
        val_metrics.update(config.val_metrics)

    evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="val")

    # Setup Tensorboard logger
    if rank == 0:
        tb_logger = common.TensorboardLogger(log_dir=config.output_path.as_posix())
        tb_logger.attach_output_handler(
            evaluator,
            event_name=Events.COMPLETED,
            tag="validation",
            metric_names="all",
        )

    # Log confusion matrix to ClearML:
    if with_clearml:
        evaluator.add_event_handler(Events.COMPLETED, compute_and_log_cm, cm_metric, evaluator.state.iteration)

    state = evaluator.run(data_loader)
    utils.log_metrics(logger, 0, state.times["COMPLETED"], "Validation", state.metrics)

    if idist.get_rank() == 0:
        tb_logger.close()
def test_ema_ema_model_on_cuda(get_dummy_model):
    """Test if ema_handler.ema_model is nn.Module and under eval mode"""
    model = get_dummy_model().to(idist.device())
    model = idist.auto_model(model)
    ema_handler = EMAHandler(model)
    ema_model = ema_handler.ema_model
    assert (
        isinstance(ema_model, nn.Module)
        and not isinstance(ema_model, nn.parallel.DistributedDataParallel)
        and not isinstance(ema_model, nn.parallel.DataParallel)
    )
    assert not ema_model.training
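# A minimal sketch of typical EMAHandler usage in training code (assuming an existing
# `trainer` Engine and a `model` already wrapped by idist.auto_model; the momentum
# value below is illustrative, not prescriptive).
from ignite.engine import Events
from ignite.handlers import EMAHandler

ema_handler = EMAHandler(model, momentum=0.0002)
# Update the EMA weights after every training iteration; the current momentum value
# is exposed on the engine state under the given name.
ema_handler.attach(trainer, name="ema_momentum", event=Events.ITERATION_COMPLETED)

# The smoothed copy lives on ema_handler.ema_model and can be evaluated or
# checkpointed like any other nn.Module.
ema_model = ema_handler.ema_model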
def initialize(config):
    model = config.model.to(config.device)
    optimizer = config.optimizer

    # Setup Nvidia/Apex AMP
    model, optimizer = amp.initialize(
        model, optimizer, opt_level=getattr(config, "fp16_opt_level", "O2"), num_losses=1
    )

    # Adapt model to dist conf
    model = idist.auto_model(model)

    criterion = config.criterion.to(config.device)

    return model, optimizer, criterion
def initialize(config): model = get_model(config["model"]) # Adapt model for distributed settings if configured model = idist.auto_model(model) optimizer = optim.SGD( model.parameters(), lr=config.get("learning_rate", 0.1), momentum=config.get("momentum", 0.9), weight_decay=config.get("weight_decay", 1e-5), nesterov=True, ) optimizer = idist.auto_optim(optimizer) criterion = nn.CrossEntropyLoss().to(idist.device()) le = config["num_iters_per_epoch"] lr_scheduler = StepLR(optimizer, step_size=le, gamma=0.9) return model, optimizer, criterion, lr_scheduler
def initialize(config):
    model = get_model(config.model, config.model_dir, config.dropout, config.n_fc, config.num_classes)

    config.learning_rate *= idist.get_world_size()
    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
    optimizer = idist.auto_optim(optimizer)
    loss_fn = nn.BCEWithLogitsLoss()

    le = config.num_iters_per_epoch
    milestones_values = [
        (0, 0.0),
        (le * config.num_warmup_epochs, config.learning_rate),
        (le * config.max_epochs, 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)

    return model, optimizer, loss_fn, lr_scheduler
def initialize(config: Optional[Any]) -> Tuple[Module, Optimizer, Module, Union[_LRScheduler, ParamScheduler]]:
    """Initialize model, optimizer, loss function, and lr scheduler with correct settings.

    Parameters
    ----------
    config
        config object

    Returns
    -------
    model, optimizer, loss_fn, lr_scheduler
    """
    model = ...
    optimizer = ...
    loss_fn = ...
    lr_scheduler = ...

    model = idist.auto_model(model)
    optimizer = idist.auto_optim(optimizer)
    loss_fn = loss_fn.to(idist.device())

    return model, optimizer, loss_fn, lr_scheduler
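# A minimal sketch of how the objects returned by initialize() are typically wired into
# an Ignite trainer (assumptions: a supervised setup where `config` and a `train_loader`
# built with idist.auto_dataloader already exist; names are illustrative).
from ignite.engine import Events, create_supervised_trainer

model, optimizer, loss_fn, lr_scheduler = initialize(config)

trainer = create_supervised_trainer(model, optimizer, loss_fn, device=idist.device())

# Ignite ParamSchedulers such as PiecewiseLinear are plain event handlers, so they are
# attached to the trainer rather than stepped manually; a torch _LRScheduler would
# instead need an explicit lr_scheduler.step() call inside a handler.
trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)

trainer.run(train_loader, max_epochs=config["num_epochs"])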
def initialize(config): model = utils.get_model(config["model"], config["num_classes"]) # Adapt model for distributed settings if configured model = idist.auto_model(model) optimizer = optim.SGD( model.parameters(), lr=config["learning_rate"], momentum=config["momentum"], weight_decay=config["weight_decay"], # nesterov=True, ) optimizer = idist.auto_optim(optimizer) # criterion = nn.CrossEntropyLoss().to(idist.device()) criterion = nn.CrossEntropyLoss() le = config["num_iters_per_epoch"] cl = config["learning_rate"] # print("%d, %f" %(le,cl)) milestones_values = [ (30 * le, cl), (45 * le, 0.5 * cl), (46 * le, 0.1 * cl), (60 * le, 0.1 * cl), (61 * le, 0.01 * cl), (90 * le, 0.01 * cl), (120 * le, 0.001 * cl), # (le * config["num_warmup_epochs"], config["learning_rate"]), # (le * config["num_epochs"], 0.0), ] # print(milestones_values) lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values) # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config["lr_step_size"], gamma=config["lr_gamma"]) return model, optimizer, criterion, lr_scheduler
def initialize(cfg):
    model = setup_model(cfg.model, num_classes=cfg.num_classes)
    ema_model = setup_model(cfg.model, num_classes=cfg.num_classes)
    model.to(idist.device())
    ema_model.to(idist.device())
    setup_ema(ema_model, model)

    model = idist.auto_model(model)
    if isinstance(model, nn.parallel.DataParallel):
        ema_model = nn.parallel.DataParallel(ema_model)

    optimizer = instantiate(cfg.solver.optimizer, model.parameters())
    optimizer = idist.auto_optim(optimizer)

    sup_criterion = instantiate(cfg.solver.supervised_criterion)

    total_num_iters = cfg.solver.num_epochs * cfg.solver.epoch_length
    lr_scheduler = instantiate(cfg.solver.lr_scheduler, optimizer, T_max=total_num_iters)

    return model, ema_model, optimizer, sup_criterion, lr_scheduler
def training(rank, config):
    rank = idist.get_rank()
    manual_seed(config["seed"] + rank)
    device = idist.device()

    # Define output folder:
    config.output = "/tmp/output"

    model = idist.auto_model(config.model)
    optimizer = idist.auto_optim(config.optimizer)
    criterion = config.criterion

    train_set, val_set = config.train_set, config.val_set
    train_loader = idist.auto_dataloader(train_set, batch_size=config.train_batch_size)
    val_loader = idist.auto_dataloader(val_set, batch_size=config.val_batch_size)

    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    @trainer.on(Events.EPOCH_COMPLETED(every=config.val_interval))
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    if rank == 0:
        tb_logger = TensorboardLogger(log_dir=config.output)

        tb_logger.attach_output_handler(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=100),
            tag="training",
            output_transform=lambda loss: {"batchloss": loss},
            metric_names="all",
        )

        for tag, evaluator in [("training", train_evaluator), ("validation", validation_evaluator)]:
            tb_logger.attach_output_handler(
                evaluator,
                event_name=Events.EPOCH_COMPLETED,
                tag=tag,
                metric_names=["loss", "accuracy"],
                global_step_transform=global_step_from_engine(trainer),
            )

        tb_logger.attach_opt_params_handler(
            trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer
        )

    model_checkpoint = ModelCheckpoint(
        config.output,
        n_saved=2,
        filename_prefix="best",
        score_name="accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model})

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if rank == 0:
        tb_logger.close()
def train(loop: Loop, config: Config):
    seed_everything(22)
    setup_cudnn_reproducibility(True)

    dataloader, num_channels = get_dataloader(config)

    generator = auto_model(Generator(config.z_dim, config.g_filters, num_channels))
    discriminator = auto_model(Discriminator(num_channels, config.d_filters))

    bce = BCEWithLogitsLoss()

    opt_G = Adam(generator.parameters(), lr=config.lr * idist.get_world_size(), betas=(config.beta_1, 0.999))
    opt_D = Adam(discriminator.parameters(), lr=config.lr * idist.get_world_size(), betas=(config.beta_1, 0.999))

    device = idist.device()

    real_labels = torch.ones(config.batch_size, device=device)
    fake_labels = torch.zeros(config.batch_size, device=device)
    fixed_noise = torch.randn(16, config.z_dim, 1, 1, device=device)

    def dump_fake_images_to_tb():
        with loop.mode("valid"):
            fake = make_grid(generator(fixed_noise), normalize=True, range=(-1, 1)).cpu()
        if idist.get_rank() == 0:
            sw: SummaryWriter = get_summary_writer(loop)
            sw.add_image("fake_images", fake, global_step=loop.iterations.current_epoch)

    def get_noise():
        return torch.randn(config.batch_size, config.z_dim, 1, 1, device=device)

    error_D_avg = Average()
    error_G_avg = Average()

    loop.attach(generator=generator, discriminator=discriminator, d_opt=opt_D, g_opt=opt_G)

    def stage_1(loop: Loop):
        for _ in loop.iterate_epochs(config.epochs):
            for real, _ in loop.iterate_dataloader(dataloader, mode="train"):
                output = discriminator(real)
                error_D_real = bce(output, real_labels)
                loop.backward(error_D_real)

                fake = generator(get_noise())

                # train with fake
                output = discriminator(fake.detach())
                error_D_fake = bce(output, fake_labels)
                loop.backward(error_D_fake)
                loop.optimizer_step(opt_D)

                with torch.no_grad():
                    error_D = error_D_fake + error_D_real
                    error_D_avg.update(error_D)

                # We don't want to compute grads for discriminator parameters
                # on error_G backward pass
                with no_grad_for_module(discriminator), module_eval(discriminator):
                    output = discriminator(fake)
                    error_G = bce(output, real_labels)
                    simple_gd_step(loop, opt_G, error_G)
                    error_G_avg.update(error_G.detach())

                loop.metrics.log("generator/error_batch", error_G.item())
                loop.metrics.log("discriminator/error_batch", error_D.item())

            loop.metrics.consume("generator/error_epoch", error_G_avg)
            loop.metrics.consume("discriminator/error_epoch", error_D_avg)
            dump_fake_images_to_tb()

    loop.run(stage_1)
def run(
    local_rank: int,
    device: str,
    experiment_name: str,
    gpus: Optional[Union[int, List[int], str]] = None,
    dataset_root: str = "./dataset",
    log_dir: str = "./log",
    model: str = "fasterrcnn_resnet50_fpn",
    epochs: int = 13,
    batch_size: int = 4,
    lr: float = 0.01,
    download: bool = False,
    image_size: int = 256,
    resume_from: Optional[dict] = None,
) -> None:
    bbox_params = A.BboxParams(format="pascal_voc")
    train_transform = A.Compose(
        [A.HorizontalFlip(p=0.5), ToTensorV2()],
        bbox_params=bbox_params,
    )
    val_transform = A.Compose([ToTensorV2()], bbox_params=bbox_params)

    download = local_rank == 0 and download
    train_dataset = Dataset(root=dataset_root, download=download, image_set="train", transforms=train_transform)
    val_dataset = Dataset(root=dataset_root, download=download, image_set="val", transforms=val_transform)
    vis_dataset = Subset(val_dataset, random.sample(range(len(val_dataset)), k=16))

    train_dataloader = idist.auto_dataloader(
        train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=4
    )
    val_dataloader = DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=4
    )
    vis_dataloader = DataLoader(
        vis_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=4
    )

    model = idist.auto_model(model)
    scaler = GradScaler()
    optimizer = SGD(lr=lr, params=model.parameters())
    optimizer = idist.auto_optim(optimizer)
    scheduler = OneCycleLR(optimizer, max_lr=lr, total_steps=len(train_dataloader) * epochs)

    def update_model(engine, batch):
        model.train()
        images, targets = batch
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items() if isinstance(v, torch.Tensor)} for t in targets]

        with torch.autocast(device, enabled=True):
            loss_dict = model(images, targets)
            loss = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        loss_items = {k: v.item() for k, v in loss_dict.items()}
        loss_items["loss_average"] = loss.item() / 4

        return loss_items

    @torch.no_grad()
    def inference(engine, batch):
        model.eval()
        images, targets = batch
        images = list(image.to(device) for image in images)
        outputs = model(images)
        outputs = [{k: v.to("cpu") for k, v in t.items()} for t in outputs]
        return {"y_pred": outputs, "y": targets, "x": [i.cpu() for i in images]}

    trainer = Engine(update_model)
    evaluator = Engine(inference)
    visualizer = Engine(inference)

    aim_logger = AimLogger(
        repo=os.path.join(log_dir, "aim"),
        experiment=experiment_name,
    )

    CocoMetric(convert_to_coco_api(val_dataset)).attach(evaluator, "mAP")

    @trainer.on(Events.EPOCH_COMPLETED)
    @one_rank_only()
    def log_validation_results(engine):
        evaluator.run(val_dataloader)
        visualizer.run(vis_dataloader)

    @trainer.on(Events.ITERATION_COMPLETED)
    def step_scheduler(engine):
        scheduler.step()
        aim_logger.log_metrics({"lr": scheduler.get_last_lr()[0]}, step=engine.state.iteration)

    @visualizer.on(Events.EPOCH_STARTED)
    def reset_vis_images(engine):
        engine.state.model_outputs = []

    @visualizer.on(Events.ITERATION_COMPLETED)
    def add_vis_images(engine):
        engine.state.model_outputs.append(engine.state.output)

    @visualizer.on(Events.ITERATION_COMPLETED)
    def submit_vis_images(engine):
        aim_images = []
        for outputs in engine.state.model_outputs:
            for image, target, pred in zip(outputs["x"], outputs["y"], outputs["y_pred"]):
                image = (image * 255).byte()

                pred_labels = [Dataset.class2name[label.item()] for label in pred["labels"]]
                pred_boxes = pred["boxes"].long()
                image = draw_bounding_boxes(image, pred_boxes, pred_labels, colors="red")

                target_labels = [Dataset.class2name[label.item()] for label in target["labels"]]
                target_boxes = target["boxes"].long()
                image = draw_bounding_boxes(image, target_boxes, target_labels, colors="green")

                aim_images.append(aim.Image(image.numpy().transpose((1, 2, 0))))
        aim_logger.experiment.track(aim_images, name="vis", step=trainer.state.epoch)

    losses = ["loss_classifier", "loss_box_reg", "loss_objectness", "loss_rpn_box_reg", "loss_average"]
    for loss_name in losses:
        # Bind loss_name at definition time to avoid the late-binding closure pitfall
        RunningAverage(output_transform=lambda x, name=loss_name: x[name]).attach(trainer, loss_name)

    ProgressBar().attach(trainer, losses)
    ProgressBar().attach(evaluator)

    objects_to_checkpoint = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": scheduler,
        "scaler": scaler,
    }
    checkpoint = Checkpoint(
        to_save=objects_to_checkpoint,
        save_handler=DiskSaver(log_dir, require_empty=False),
        n_saved=3,
        score_name="mAP",
        global_step_transform=lambda *_: trainer.state.epoch,
    )
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint)
    if resume_from:
        Checkpoint.load_objects(objects_to_checkpoint, torch.load(resume_from))

    aim_logger.log_params({
        "lr": lr,
        "image_size": image_size,
        "batch_size": batch_size,
        "epochs": epochs,
    })
    aim_logger.attach_output_handler(
        trainer, event_name=Events.ITERATION_COMPLETED, tag="train", output_transform=lambda loss: loss
    )
    aim_logger.attach_output_handler(
        evaluator,
        event_name=Events.EPOCH_COMPLETED,
        tag="val",
        metric_names=["mAP"],
        global_step_transform=global_step_from_engine(trainer, Events.ITERATION_COMPLETED),
    )

    trainer.run(train_dataloader, max_epochs=epochs)
def training(rank, config):
    # Specific ignite.distributed
    print(
        idist.get_rank(),
        ": run with config:",
        config,
        "- backend=",
        idist.backend(),
        "- world size",
        idist.get_world_size(),
    )
    device = idist.device()

    # Data preparation:
    dataset = RndDataset(nb_samples=config["nb_samples"])

    # Specific ignite.distributed
    train_loader = idist.auto_dataloader(dataset, batch_size=config["batch_size"])

    # Model, criterion, optimizer setup
    model = idist.auto_model(wide_resnet50_2(num_classes=100))
    criterion = NLLLoss()
    optimizer = idist.auto_optim(SGD(model.parameters(), lr=0.01))

    # Training loop log param
    log_interval = config["log_interval"]

    def _train_step(engine, batch):
        data = batch[0].to(device)
        target = batch[1].to(device)

        optimizer.zero_grad()
        output = model(data)
        # NLLLoss expects log-probabilities, so apply log_softmax over the class dimension
        log_probabilities = torch.nn.functional.log_softmax(output, dim=1)
        loss_val = criterion(log_probabilities, target)
        loss_val.backward()
        optimizer.step()

        return loss_val

    # Running the _train_step function on the whole batch_data iterable only once
    trainer = Engine(_train_step)

    # Add a logger
    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training():
        print(
            "Process {}/{} Train Epoch: {} [{}/{}]\tLoss: {}".format(
                idist.get_rank(),
                idist.get_world_size(),
                trainer.state.epoch,
                trainer.state.iteration * len(trainer.state.batch[0]),
                len(dataset) / idist.get_world_size(),
                trainer.state.output,
            )
        )

    trainer.run(train_loader, max_epochs=1)
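# A minimal launch sketch for a training(rank, config) entry point like the one above,
# using idist.Parallel. The backend, process count, and config values are illustrative
# and depend on the available hardware; use backend="gloo" for CPU-only runs.
import ignite.distributed as idist

if __name__ == "__main__":
    config = {"nb_samples": 128, "batch_size": 16, "log_interval": 5}
    with idist.Parallel(backend="nccl", nproc_per_node=2) as parallel:
        # idist.Parallel spawns the processes and calls training(local_rank, config) in each
        parallel.run(training, config)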
def run(loop: Loop):
    seed_everything(42)
    setup_cudnn_reproducibility(True, False)

    train_ds, valid_ds = get_train_test_datasets("data/cifar")

    model = auto_model(get_model())

    train_loader = auto_dataloader(
        train_ds,
        batch_size=512,
        shuffle=True,
        drop_last=True,
        num_workers=4,
    )
    valid_loader = auto_dataloader(
        valid_ds,
        batch_size=512,
        num_workers=4,
        shuffle=False,
    )

    optim = SGD(model.parameters(), lr=0.4, momentum=0.9)
    scheduler = OneCycleLR(optim, max_lr=1, epochs=NUM_EPOCHS, steps_per_epoch=len(train_loader))
    criterion = CrossEntropyLoss()

    precision = Precision(average=False)
    recall = Recall(average=False)
    # Ignite metrics are combinable
    f1 = (precision * recall * 2 / (precision + recall)).mean()
    accuracy = Accuracy()

    # Attach objects so the Loop manages them automatically
    loop.attach(
        # Loop manages train/eval modes, device and requires_grad of attached `nn.Module`s
        criterion=criterion,
        # This criterion doesn't have any state or attribute tensors,
        # so attaching it doesn't introduce any behavior
        model=model,
        # Loop saves the state of all attached objects having state_dict()/load_state_dict()
        # methods to checkpoints
        optimizer=optim,
        scheduler=scheduler,
    )

    def train(loop: Loop):
        for _ in loop.iterate_epochs(NUM_EPOCHS):
            for x, y in loop.iterate_dataloader(train_loader, mode="train"):
                y_pred_logits = model(x)

                loss: torch.Tensor = criterion(y_pred_logits, y)
                loop.backward(loss)
                # Makes optimizer step and also zeroes grad after (default)
                loop.optimizer_step(optim, zero_grad=True)

                # Here we call scheduler.step() every iteration because we have a
                # one-cycle scheduler; with a usual per-epoch scheduler we could
                # also call it after the dataloader loop
                scheduler.step()

                # Log learning rate. All metrics are written to tensorboard with the
                # specified names. If iteration='auto' (default), it is determined
                # based on where the call is performed. Here it will be batches.
                loop.metrics.log("lr", scheduler.get_last_lr()[0], iteration="auto")

            # Loop disables gradients and calls Module.eval() inside the loop
            # for all attached modules when mode="valid" (default)
            for x, y in loop.iterate_dataloader(valid_loader, mode="valid"):
                y_pred_logits: torch.Tensor = model(x)

                y_pred = to_onehot(y_pred_logits.argmax(dim=-1), num_classes=10)
                precision.update((y_pred, y))
                recall.update((y_pred, y))
                accuracy.update((y_pred, y))

            # These will be epoch metrics because they are computed outside the
            # dataloader loop. Here we log the metrics without resetting them
            loop.metrics.log("valid/precision", precision.compute().mean())
            loop.metrics.log("valid/recall", recall.compute().mean())

            # .log() accepts values (tensors, floats, np.arrays);
            # .consume() accepts a Metric object and resets it after logging
            loop.metrics.consume("valid/f1", f1)
            loop.metrics.consume("valid/accuracy", accuracy)

    loop.run(train)