def initialize(config):
    """Build model, AdamW optimizer, BCE-with-logits loss and LR scheduler.

    The base learning rate is scaled linearly by the world size, and the LR
    schedule warms up linearly to the peak value, then decays linearly to zero.
    Returns ``(model, optimizer, criterion, lr_scheduler)``.
    """
    model = utils.get_model(
        config["model"],
        config["model_dir"],
        config["dropout"],
        config["n_fc"],
        config["num_classes"],
    )
    # Linear LR scaling rule: multiply the base LR by the number of processes.
    config["learning_rate"] *= idist.get_world_size()

    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)

    optimizer = idist.auto_optim(
        optim.AdamW(
            model.parameters(),
            lr=config["learning_rate"],
            weight_decay=config["weight_decay"],
        )
    )
    criterion = nn.BCEWithLogitsLoss()

    # Warmup to the peak LR over the warmup epochs, then decay to 0 by the end.
    iters_per_epoch = config["num_iters_per_epoch"]
    lr_scheduler = PiecewiseLinear(
        optimizer,
        param_name="lr",
        milestones_values=[
            (0, 0.0),
            (iters_per_epoch * config["num_warmup_epochs"], config["learning_rate"]),
            (iters_per_epoch * config["num_epochs"], 0.0),
        ],
    )
    return model, optimizer, criterion, lr_scheduler
def initialize(config):
    """Build model, Nesterov-SGD optimizer, cross-entropy loss and LR scheduler.

    Returns ``(model, optimizer, criterion, lr_scheduler)`` where the LR
    follows a linear warmup to the configured peak, then a linear decay to
    zero over the remaining epochs.
    """
    # Adapt model for distributed settings if configured; find_unused_parameters
    # is forwarded to the distributed wrapper.
    model = idist.auto_model(utils.get_model(config["model"]), find_unused_parameters=True)

    sgd = optim.SGD(
        model.parameters(),
        lr=config["learning_rate"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        nesterov=True,
    )
    optimizer = idist.auto_optim(sgd)
    criterion = nn.CrossEntropyLoss().to(idist.device())

    # Piecewise-linear schedule: 0 -> peak over warmup, peak -> 0 by the last epoch.
    steps_per_epoch = config["num_iters_per_epoch"]
    lr_scheduler = PiecewiseLinear(
        optimizer,
        param_name="lr",
        milestones_values=[
            (0, 0.0),
            (steps_per_epoch * config["num_warmup_epochs"], config["learning_rate"]),
            (steps_per_epoch * config["num_epochs"], 0.0),
        ],
    )
    return model, optimizer, criterion, lr_scheduler
def initialize(config):
    """Prepare model, optimizer and criterion for (possibly distributed) training.

    The Horovod backend is special-cased because ``idist.auto_optim`` cannot be
    used with it; every other backend goes through the standard ignite helper.
    Returns ``(model, optimizer, criterion)``.
    """
    device = idist.device()
    model = config.model.to(device)
    optimizer = config.optimizer
    # Adapt model to dist config
    model = idist.auto_model(model)

    if idist.backend() == "horovod":
        accumulation_steps = config.get("accumulation_steps", 1)
        # Can not use auto_optim with Horovod: https://github.com/horovod/horovod/issues/2670
        import horovod.torch as hvd

        # Horovod performs gradient accumulation itself via
        # backward_passes_per_step, so it is configured here instead.
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=model.named_parameters(),
            backward_passes_per_step=accumulation_steps,
        )
        # Make sure every rank starts from the same optimizer state.
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        if accumulation_steps > 1:
            # disable manual grads accumulation as it is already done on optimizer's side
            config.accumulation_steps = 1
    else:
        optimizer = idist.auto_optim(optimizer)

    criterion = config.criterion.to(device)

    return model, optimizer, criterion
def initialize(config):
    """Assemble the training components declared in ``config``.

    Returns ``(model, optimizer, criterion, lr_scheduler)``; model and
    optimizer are adapted to the active distributed backend, and the LR
    schedule is a linear warmup followed by a linear decay to zero.
    """
    # Adapt model for distributed backend if provided
    model = idist.auto_model(utils.get_model(config["model"]))

    # Adapt optimizer for distributed backend if provided
    optimizer = idist.auto_optim(
        utils.get_optimizer(
            config["optimizer"],
            model,
            learning_rate=config["learning_rate"],
            weight_decay=config["weight_decay"],
        )
    )
    criterion = nn.CrossEntropyLoss().to(idist.device())

    # Linear warmup to the peak LR, then linear decay to zero.
    steps = config["num_iters_per_epoch"]
    schedule = [
        (0, 0.0),
        (steps * config["num_warmup_epochs"], config["learning_rate"]),
        (steps * config["num_epochs"], 0.0),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=schedule)
    return model, optimizer, criterion, lr_scheduler
def initialize(
    config: ConfigSchema, wlm: WeakLabelManager
) -> Tuple[nn.Module, Optimizer, nn.Module]:
    """Build the model, a selective-weight-decay SGD optimizer, and the
    weak-label criterion.

    Biases and all 1-D parameters are excluded from weight decay; every other
    trainable parameter is decayed with ``config.weight_decay``.
    """
    # Adapt model for distributed settings if configured
    model = idist.auto_model(get_model(config.model))

    # Split trainable parameters into decayed / non-decayed groups.
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        skip_decay = len(param.shape) == 1 or name.endswith("bias")
        (no_decay_params if skip_decay else decay_params).append(param)

    optimizer = idist.auto_optim(
        optim.SGD(
            [
                {"params": decay_params, "weight_decay": config.weight_decay},
                {"params": no_decay_params, "weight_decay": 0.0},
            ],
            lr=config.learning_rate,
            momentum=config.momentum,
            nesterov=True,
        )
    )

    criterion = get_weak_label_loss(config, wlm).to(idist.device())
    return model, optimizer, criterion
def _init_distribution(self):
    """Set up rank/device, distributed dataloaders, model, optimizer,
    scheduler and criterion for this process.

    Order matters: with apex AMP the optimizer must be created before
    ``amp.initialize`` wraps the model/optimizer pair; otherwise the
    optimizer is created after the model has been distributed-adapted.
    """
    self.rank = idist.get_rank()
    # Seed per rank so each process gets a different but reproducible stream.
    manual_seed(42 + self.rank)
    self.device = idist.device()

    if self.train_ds:
        if self.train_ds.sampler is not None:
            # A custom sampler controls ordering, so shuffling must be off.
            sampler = self.train_ds.sampler(self.train_ds, self.train_ds.get_label)
            isShuffle = False
        else:
            sampler = None
            isShuffle = True
        self.train_loader = idist.auto_dataloader(
            self.train_ds,
            batch_size=self.hparams.train_bs,
            num_workers=self.hparams.train_num_workers,
            shuffle=isShuffle,
            drop_last=True,
            sampler=sampler,
            **self.train_ds.additional_loader_params)

    if self.valid_ds:
        self.valid_loader = idist.auto_dataloader(
            self.valid_ds,
            batch_size=self.hparams.valid_bs,
            num_workers=self.hparams.valid_num_workers,
            shuffle=False,
            drop_last=False,
            **self.valid_ds.additional_loader_params)

    if self.test_ds:
        # NOTE: the test loader reuses the validation batch size / worker count.
        self.test_loader = idist.auto_dataloader(
            self.test_ds,
            batch_size=self.hparams.valid_bs,
            num_workers=self.hparams.valid_num_workers,
            shuffle=False,
            drop_last=False,
            **self.test_ds.additional_loader_params)

    if USE_AMP:
        # apex AMP: optimizer must exist before amp.initialize patches both.
        self._init_optimizer()
        self.model = idist.auto_model(self.model)
        self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level="O1")
    else:
        self.model = idist.auto_model(self.model)

    if not USE_AMP:
        self._init_optimizer()

    self.optimizer = idist.auto_optim(self.optimizer)
    self._init_scheduler()

    self.criterion = self.criterion.to(self.device)
def get_optimizer(model, config):
    """Instantiate the ``torch.optim`` optimizer named by ``config["optimizer"]``
    and adapt it to the active distributed configuration.

    Raises AssertionError if the name does not exist in ``torch.optim``.
    """
    assert config["optimizer"] in optim.__dict__, f"Unknown optimizer: {config['optimizer']}"

    optimizer_cls = optim.__dict__[config["optimizer"]]
    optimizer = optimizer_cls(
        model.parameters(),
        lr=config["learning_rate"],
        weight_decay=config["weight_decay"],
    )
    return idist.auto_optim(optimizer)
def initialize(config):
    """Create model, Nesterov-SGD optimizer, cross-entropy loss and a StepLR
    scheduler that decays the LR by 10% once per epoch.

    Missing hyper-parameters fall back to defaults (lr=0.1, momentum=0.9,
    weight_decay=1e-5). Returns ``(model, optimizer, criterion, lr_scheduler)``.
    """
    # Adapt model for distributed settings if configured
    model = idist.auto_model(get_model(config["model"]))

    optimizer = idist.auto_optim(
        optim.SGD(
            model.parameters(),
            lr=config.get("learning_rate", 0.1),
            momentum=config.get("momentum", 0.9),
            weight_decay=config.get("weight_decay", 1e-5),
            nesterov=True,
        )
    )
    criterion = nn.CrossEntropyLoss().to(idist.device())

    # step_size equals one epoch's worth of iterations -> per-epoch decay.
    lr_scheduler = StepLR(optimizer, step_size=config["num_iters_per_epoch"], gamma=0.9)
    return model, optimizer, criterion, lr_scheduler
def initialize(config):
    """Build model, AdamW optimizer, BCE-with-logits loss and LR scheduler
    from an attribute-style config.

    The base learning rate is scaled linearly by the world size before the
    optimizer is created; the LR warms up linearly, then decays to zero by
    ``config.max_epochs``.
    """
    model = get_model(
        config.model, config.model_dir, config.dropout, config.n_fc, config.num_classes
    )
    config.learning_rate *= idist.get_world_size()  # linear LR scaling rule

    # Adapt model for distributed settings if configured
    model = idist.auto_model(model)
    optimizer = idist.auto_optim(
        optim.AdamW(
            model.parameters(),
            lr=config.learning_rate,
            weight_decay=config.weight_decay,
        )
    )
    loss_fn = nn.BCEWithLogitsLoss()

    steps_per_epoch = config.num_iters_per_epoch
    lr_scheduler = PiecewiseLinear(
        optimizer,
        param_name="lr",
        milestones_values=[
            (0, 0.0),
            (steps_per_epoch * config.num_warmup_epochs, config.learning_rate),
            (steps_per_epoch * config.max_epochs, 0.0),
        ],
    )
    return model, optimizer, loss_fn, lr_scheduler
def initialize(config):
    """Create model, momentum-SGD optimizer, cross-entropy loss and a
    hand-tuned piecewise-linear LR schedule keyed to epoch milestones.

    Returns ``(model, optimizer, criterion, lr_scheduler)``.
    """
    # Adapt model for distributed settings if configured
    model = idist.auto_model(utils.get_model(config["model"], config["num_classes"]))

    optimizer = idist.auto_optim(
        optim.SGD(
            model.parameters(),
            lr=config["learning_rate"],
            momentum=config["momentum"],
            weight_decay=config["weight_decay"],
        )
    )
    criterion = nn.CrossEntropyLoss()

    steps = config["num_iters_per_epoch"]
    base_lr = config["learning_rate"]
    # Hold the base LR for 30 epochs, then step down in plateaus (with 1-epoch
    # linear ramps) to 0.5x, 0.1x, 0.01x and finally 0.001x of the base LR.
    milestones_values = [
        (30 * steps, base_lr),
        (45 * steps, 0.5 * base_lr),
        (46 * steps, 0.1 * base_lr),
        (60 * steps, 0.1 * base_lr),
        (61 * steps, 0.01 * base_lr),
        (90 * steps, 0.01 * base_lr),
        (120 * steps, 0.001 * base_lr),
    ]
    lr_scheduler = PiecewiseLinear(optimizer, param_name="lr", milestones_values=milestones_values)
    return model, optimizer, criterion, lr_scheduler
def initialize(config: Optional[Any]) -> Tuple[Module, Optimizer, Module, Union[_LRScheduler, ParamScheduler]]:
    """Initializing model, optimizer, loss function, and lr scheduler with correct settings.

    This is a template: replace the ``...`` placeholders with concrete
    objects. The ``idist`` helpers then adapt the model and optimizer to the
    active distributed configuration, and the loss is moved to the current
    device.

    Parameters
    ----------
    config: config object

    Returns
    -------
    model, optimizer, loss_fn, lr_scheduler
    """
    # Placeholders — to be filled in by the concrete task.
    model = ...
    optimizer = ...
    loss_fn = ...
    lr_scheduler = ...

    model = idist.auto_model(model)
    optimizer = idist.auto_optim(optimizer)
    loss_fn = loss_fn.to(idist.device())

    return model, optimizer, loss_fn, lr_scheduler
def initialize(cfg):
    """Create the student model, its EMA counterpart, optimizer, supervised
    criterion and an LR scheduler spanning the whole run.

    Returns ``(model, ema_model, optimizer, sup_criterion, lr_scheduler)``.
    """
    device = idist.device()
    model = setup_model(cfg.model, num_classes=cfg.num_classes)
    ema_model = setup_model(cfg.model, num_classes=cfg.num_classes)
    model.to(device)
    ema_model.to(device)

    # Couple the EMA model to the student (see setup_ema).
    setup_ema(ema_model, model)

    model = idist.auto_model(model)
    # Mirror DataParallel wrapping on the EMA model so both expose the same API.
    if isinstance(model, nn.parallel.DataParallel):
        ema_model = nn.parallel.DataParallel(ema_model)

    optimizer = idist.auto_optim(instantiate(cfg.solver.optimizer, model.parameters()))
    sup_criterion = instantiate(cfg.solver.supervised_criterion)

    # T_max covers the full run (epochs x iterations per epoch).
    total_num_iters = cfg.solver.num_epochs * cfg.solver.epoch_length
    lr_scheduler = instantiate(cfg.solver.lr_scheduler, optimizer, T_max=total_num_iters)

    return model, ema_model, optimizer, sup_criterion, lr_scheduler
def training(rank, config):
    """Per-process distributed training entry point.

    Builds an ``ignite.distributed``-adapted dataloader, model and optimizer,
    then runs one epoch of supervised training with NLL loss, logging every
    ``config["log_interval"]`` iterations.
    """
    # Specific ignite.distributed
    print(
        idist.get_rank(),
        ": run with config:",
        config,
        "- backend=",
        idist.backend(),
        "- world size",
        idist.get_world_size(),
    )

    device = idist.device()

    # Data preparation:
    dataset = RndDataset(nb_samples=config["nb_samples"])

    # Specific ignite.distributed
    train_loader = idist.auto_dataloader(dataset, batch_size=config["batch_size"])

    # Model, criterion, optimizer setup
    model = idist.auto_model(wide_resnet50_2(num_classes=100))
    criterion = NLLLoss()
    optimizer = idist.auto_optim(SGD(model.parameters(), lr=0.01))

    # Training loop log param
    log_interval = config["log_interval"]

    def _train_step(engine, batch):
        data = batch[0].to(device)
        target = batch[1].to(device)

        optimizer.zero_grad()
        output = model(data)
        # FIX: NLLLoss expects *log*-probabilities and the class axis is dim=1.
        # The original applied softmax over dim=0 (i.e. across the batch) and
        # passed raw probabilities; log_softmax is also more numerically
        # stable than log(softmax(...)).
        log_probs = torch.nn.functional.log_softmax(output, dim=1)
        loss_val = criterion(log_probs, target)
        loss_val.backward()
        optimizer.step()
        return loss_val

    # Running the _train_step function on whole batch_data iterable only once
    trainer = Engine(_train_step)

    # Add a logger
    @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
    def log_training():
        print("Process {}/{} Train Epoch: {} [{}/{}]\tLoss: {}".format(
            idist.get_rank(),
            idist.get_world_size(),
            trainer.state.epoch,
            trainer.state.iteration * len(trainer.state.batch[0]),
            len(dataset) / idist.get_world_size(),
            trainer.state.output,
        ))

    trainer.run(train_loader, max_epochs=1)
def run(
    local_rank: int,
    device: str,
    experiment_name: str,
    gpus: Optional[Union[int, List[int], str]] = None,
    dataset_root: str = "./dataset",
    log_dir: str = "./log",
    model: str = "fasterrcnn_resnet50_fpn",
    epochs: int = 13,
    batch_size: int = 4,
    lr: float = 0.01,
    download: bool = False,
    image_size: int = 256,
    resume_from: Optional[dict] = None,
) -> None:
    """Train and evaluate an object-detection model with AMP, Aim logging,
    COCO mAP evaluation, bounding-box visualization and checkpointing.

    Fixes over the previous version:
    - ``loss_average`` divides by the actual batch length instead of a
      hard-coded ``4`` (the default ``batch_size``), so it stays correct for
      any configured batch size and for a short final batch.
    - The ``RunningAverage`` output transforms bind the loop variable as a
      default argument; the original ``lambda x: x[loss_name]`` late-bound
      ``loss_name``, so at call time every metric read ``"loss_average"``.
    """
    bbox_params = A.BboxParams(format="pascal_voc")
    train_transform = A.Compose(
        [A.HorizontalFlip(p=0.5), ToTensorV2()],
        bbox_params=bbox_params,
    )
    val_transform = A.Compose([ToTensorV2()], bbox_params=bbox_params)

    # Only rank 0 downloads, avoiding concurrent writes to the dataset root.
    download = local_rank == 0 and download
    train_dataset = Dataset(root=dataset_root, download=download, image_set="train", transforms=train_transform)
    val_dataset = Dataset(root=dataset_root, download=download, image_set="val", transforms=val_transform)
    # Fixed random subset of the validation set used for visualization.
    vis_dataset = Subset(val_dataset, random.sample(range(len(val_dataset)), k=16))

    train_dataloader = idist.auto_dataloader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=4)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=4)
    vis_dataloader = DataLoader(vis_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=4)

    # NOTE(review): `model` is still the model *name* (a str) at this point —
    # a factory call appears to be missing before auto_model; confirm caller.
    model = idist.auto_model(model)
    scaler = GradScaler()
    optimizer = SGD(lr=lr, params=model.parameters())
    optimizer = idist.auto_optim(optimizer)
    scheduler = OneCycleLR(optimizer, max_lr=lr, total_steps=len(train_dataloader) * epochs)

    def update_model(engine, batch):
        """One AMP training step; returns per-component loss values."""
        model.train()
        images, targets = batch
        images = list(image.to(device) for image in images)
        targets = [{
            k: v.to(device)
            for k, v in t.items() if isinstance(v, torch.Tensor)
        } for t in targets]
        with torch.autocast(device, enabled=True):
            loss_dict = model(images, targets)
            loss = sum(loss for loss in loss_dict.values())
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        loss_items = {k: v.item() for k, v in loss_dict.items()}
        # FIX: was `loss.item() / 4` — a hard-coded copy of the default batch
        # size; use the actual number of images in this batch instead.
        loss_items["loss_average"] = loss.item() / len(images)
        return loss_items

    @torch.no_grad()
    def inference(engine, batch):
        """Forward pass only; returns predictions, targets and CPU images."""
        model.eval()
        images, targets = batch
        images = list(image.to(device) for image in images)
        outputs = model(images)
        outputs = [{k: v.to("cpu") for k, v in t.items()} for t in outputs]
        return {
            "y_pred": outputs,
            "y": targets,
            "x": [i.cpu() for i in images]
        }

    trainer = Engine(update_model)
    evaluator = Engine(inference)
    visualizer = Engine(inference)

    aim_logger = AimLogger(
        repo=os.path.join(log_dir, "aim"),
        experiment=experiment_name,
    )

    CocoMetric(convert_to_coco_api(val_dataset)).attach(evaluator, "mAP")

    @trainer.on(Events.EPOCH_COMPLETED)
    @one_rank_only()
    def log_validation_results(engine):
        evaluator.run(val_dataloader)
        visualizer.run(vis_dataloader)

    @trainer.on(Events.ITERATION_COMPLETED)
    def step_scheduler(engine):
        scheduler.step()
        aim_logger.log_metrics({"lr": scheduler.get_last_lr()[0]},
                               step=engine.state.iteration)

    @visualizer.on(Events.EPOCH_STARTED)
    def reset_vis_images(engine):
        engine.state.model_outputs = []

    @visualizer.on(Events.ITERATION_COMPLETED)
    def add_vis_images(engine):
        engine.state.model_outputs.append(engine.state.output)

    # NOTE(review): this handler fires every iteration and re-tracks the whole
    # accumulated image list each time; EPOCH_COMPLETED looks like the intended
    # event — confirm before changing.
    @visualizer.on(Events.ITERATION_COMPLETED)
    def submit_vis_images(engine):
        aim_images = []
        for outputs in engine.state.model_outputs:
            for image, target, pred in zip(outputs["x"], outputs["y"], outputs["y_pred"]):
                image = (image * 255).byte()
                # Predictions in red, ground truth in green.
                pred_labels = [
                    Dataset.class2name[label.item()]
                    for label in pred["labels"]
                ]
                pred_boxes = pred["boxes"].long()
                image = draw_bounding_boxes(image, pred_boxes, pred_labels, colors="red")
                target_labels = [
                    Dataset.class2name[label.item()]
                    for label in target["labels"]
                ]
                target_boxes = target["boxes"].long()
                image = draw_bounding_boxes(image, target_boxes, target_labels, colors="green")
                aim_images.append(aim.Image(image.numpy().transpose((1, 2, 0))))
        aim_logger.experiment.track(aim_images, name="vis", step=trainer.state.epoch)

    losses = [
        "loss_classifier",
        "loss_box_reg",
        "loss_objectness",
        "loss_rpn_box_reg",
        "loss_average",
    ]
    for loss_name in losses:
        # FIX: bind loss_name as a default argument — a plain closure would
        # late-bind and make every metric read "loss_average" at call time.
        RunningAverage(output_transform=lambda x, name=loss_name: x[name]).attach(trainer, loss_name)

    ProgressBar().attach(trainer, losses)
    ProgressBar().attach(evaluator)

    objects_to_checkpoint = {
        "trainer": trainer,
        "model": model,
        "optimizer": optimizer,
        "lr_scheduler": scheduler,
        "scaler": scaler,
    }
    checkpoint = Checkpoint(
        to_save=objects_to_checkpoint,
        save_handler=DiskSaver(log_dir, require_empty=False),
        n_saved=3,
        score_name="mAP",
        global_step_transform=lambda *_: trainer.state.epoch,
    )
    evaluator.add_event_handler(Events.EPOCH_COMPLETED, checkpoint)
    if resume_from:
        Checkpoint.load_objects(objects_to_checkpoint, torch.load(resume_from))

    aim_logger.log_params({
        "lr": lr,
        "image_size": image_size,
        "batch_size": batch_size,
        "epochs": epochs,
    })
    aim_logger.attach_output_handler(trainer,
                                     event_name=Events.ITERATION_COMPLETED,
                                     tag="train",
                                     output_transform=lambda loss: loss)
    aim_logger.attach_output_handler(
        evaluator,
        event_name=Events.EPOCH_COMPLETED,
        tag="val",
        metric_names=["mAP"],
        global_step_transform=global_step_from_engine(trainer, Events.ITERATION_COMPLETED),
    )

    trainer.run(train_dataloader, max_epochs=epochs)
def training(rank, config):
    """Per-process training loop with periodic evaluation, TensorBoard logging
    (rank 0 only) and accuracy-based model checkpointing.

    NOTE(review): the ``rank`` parameter is immediately shadowed by
    ``idist.get_rank()`` — presumably intentional for spawned workers.
    """
    rank = idist.get_rank()
    # Seed per rank for different but reproducible streams across processes.
    manual_seed(config["seed"] + rank)
    device = idist.device()

    # Define output folder:
    config.output = "/tmp/output"

    # Adapt model/optimizer to the active distributed configuration.
    model = idist.auto_model(config.model)
    optimizer = idist.auto_optim(config.optimizer)
    criterion = config.criterion

    train_set, val_set = config.train_set, config.val_set
    train_loader = idist.auto_dataloader(train_set, batch_size=config.train_batch_size)
    val_loader = idist.auto_dataloader(val_set, batch_size=config.val_batch_size)

    trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
    trainer.logger = setup_logger("Trainer")

    metrics = {"accuracy": Accuracy(), "loss": Loss(criterion)}

    # Separate evaluators so train/validation metrics are tracked independently.
    train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    train_evaluator.logger = setup_logger("Train Evaluator")
    validation_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
    validation_evaluator.logger = setup_logger("Val Evaluator")

    # Evaluate on both splits every `val_interval` epochs.
    @trainer.on(Events.EPOCH_COMPLETED(every=config.val_interval))
    def compute_metrics(engine):
        train_evaluator.run(train_loader)
        validation_evaluator.run(val_loader)

    # TensorBoard logging only on the main process.
    if rank == 0:
        tb_logger = TensorboardLogger(log_dir=config.output)
        tb_logger.attach_output_handler(
            trainer,
            event_name=Events.ITERATION_COMPLETED(every=100),
            tag="training",
            output_transform=lambda loss: {"batchloss": loss},
            metric_names="all",
        )
        for tag, evaluator in [("training", train_evaluator), ("validation", validation_evaluator)]:
            tb_logger.attach_output_handler(
                evaluator,
                event_name=Events.EPOCH_COMPLETED,
                tag=tag,
                metric_names=["loss", "accuracy"],
                global_step_transform=global_step_from_engine(trainer),
            )
        tb_logger.attach_opt_params_handler(
            trainer, event_name=Events.ITERATION_COMPLETED(every=100), optimizer=optimizer)

    # Keep the 2 best checkpoints scored by validation accuracy.
    model_checkpoint = ModelCheckpoint(
        config.output,
        n_saved=2,
        filename_prefix="best",
        score_name="accuracy",
        global_step_transform=global_step_from_engine(trainer),
    )
    validation_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model})

    trainer.run(train_loader, max_epochs=config.num_epochs)

    if rank == 0:
        tb_logger.close()