def run_train_with_empty_loader() -> None:
    """Run one training epoch against a loader that yields zero batches.

    The loader is forced to be empty by combining ``batch_size > len(dataset)``
    with ``drop_last=True``, so the single incomplete batch is dropped.
    """
    dataset = DummyDataset()
    empty_loader = DataLoader(dataset=dataset, batch_size=len(dataset) + 1, drop_last=True)
    net = nn.Linear(in_features=dataset.features_dim, out_features=dataset.out_dim)
    runner = SupervisedRunner()
    runner.train(
        loaders={"train": empty_loader},
        model=net,
        num_epochs=1,
        criterion=nn.BCEWithLogitsLoss(),
    )
def do_train(data, log, log_dir):
    """Train a 2-feature `Net` for one epoch, logging a batch before and after.

    Args:
        data: dict of loaders passed straight to the runner.
        log: logger used for progress/debug output.
        log_dir: base directory; the run is logged under ``{log_dir}/run``.
    """
    net = Net(num_features=2)
    criterion = MyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-2)
    runner = SupervisedRunner()
    log_batch(net, data, log, "init")
    log.debug("Starting training")
    runner.train(
        model=net,
        criterion=criterion,
        optimizer=optimizer,
        loaders=data,
        logdir=f"{log_dir}/run",
        load_best_on_end=True,
        num_epochs=1,
    )
    log_batch(net, data, log, "exit")
def test_save_model_grads():
    """
    Tests a feature of `OptimizerCallback` for saving model gradients
    """
    logdir = "./logs"
    dataset_root = "./dataset"
    loaders = _get_loaders(root=dataset_root, batch_size=4, num_workers=1)

    # Derive the model input shape from one real batch.
    batch_images, _ = next(iter(loaders["train"]))
    _, channels, height, width = batch_images.shape
    model = _SimpleNet((channels, height, width))

    grads_callback = SaveModelGradsCallback()
    # The checker callback looks for metrics under the grad-norm prefix.
    checker = _OnBatchEndCheckGradsCallback(grads_callback.grad_norm_prefix)
    callbacks = collections.OrderedDict(
        loss=CriterionCallback(),
        optimizer=OptimizerCallback(),
        grad_norm=grads_callback,
        test_callback=checker,
    )

    runner = SupervisedRunner()
    runner.train(
        model=model,
        criterion=nn.CrossEntropyLoss(),
        optimizer=Adam(model.parameters()),
        loaders=loaders,
        logdir=logdir,
        callbacks=callbacks,
        check=True,
        verbose=True,
    )

    # Clean up artifacts produced by the run.
    shutil.rmtree(logdir)
    shutil.rmtree(dataset_root)
def train(in_csv: str, in_dir: str, model: str = 'resnet18', fold: int = None,
          n_epochs: int = 30, image_size: int = 224, augmentation: str = 'medium',
          learning_rate: float = 3e-3, n_milestones: int = 5, batch_size: int = 256,
          n_workers: int = 4, fast: bool = False, logdir: str = '.',
          verbose: bool = False):
    """Train an image classifier with focal loss and a multi-step LR schedule.

    Args:
        in_csv: path to the dataset csv.
        in_dir: root directory of the images.
        model: model architecture name passed to `get_model`.
        fold: CV fold to train on (None for no folds).
        n_epochs: number of training epochs.
        image_size: square side of the input images.
        augmentation: augmentation preset name.
        learning_rate: base LR before batch-size scaling.
        n_milestones: NOTE(review) — accepted but currently unused; the
            scheduler milestones below are hard-coded. Confirm intent.
        batch_size: training batch size.
        n_workers: dataloader worker count.
        fast: pass-through debug/fast flag for the dataloaders.
        logdir: directory for training logs/checkpoints.
        verbose: verbose runner output.
    """
    net = get_model(model=model)
    loss = criterion.FocalLossMultiClass()  # CrossEntropyLoss
    lr_scaled = learning_rate * (batch_size / 256)  # lr linear scaling
    optimizer = torch.optim.Adam(net.parameters(), lr=lr_scaled)
    scheduler = schedulers.MultiStepLR(optimizer, milestones=[5, 10, 20, 30, 40], gamma=0.3)

    loaders = get_dataloaders(
        in_csv=in_csv,
        in_dir=in_dir,
        stages=['train', 'valid'],
        fold=fold,
        batch_size=batch_size,
        n_workers=n_workers,
        image_size=(image_size, image_size),
        augmentation=augmentation,
        fast=fast,
    )

    runner = SupervisedRunner()
    runner.train(
        model=net,
        criterion=loss,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=[AccuracyCallback(accuracy_args=[1]), BinaryAUCCallback()],
        logdir=logdir,
        num_epochs=n_epochs,
        verbose=verbose,
    )
def main(config):
    """
    Main code for training a classification model.

    Args:
        config (dict): dictionary read from a yaml file
            i.e. experiments/finetune_classification.yml

    Raises:
        ValueError: if `config["mode"]` or `config["dim"]` is unsupported.
        NotImplementedError: for `mode == "classification"`.

    Returns:
        None
    """
    mode = config["mode"].lower()
    # Validate before any side effects; raise instead of `assert` so the
    # check is not stripped under `python -O`.
    if mode not in ("classification", "segmentation", "both"):
        raise ValueError(
            "The `mode` must be one of ['classification', 'segmentation', 'both']."
        )
    if mode == "classification":
        raise NotImplementedError

    # setting up the train/val split with filenames
    seed = config["io_params"]["split_seed"]
    seed_everything(seed)

    dim = config["dim"]
    # Previously an unsupported `dim` fell through every branch and crashed
    # later with UnboundLocalError on `exp`/`output_key`; fail fast instead.
    if dim not in (2, 3):
        raise ValueError(f"`dim` must be 2 or 3, got {dim!r}")

    if mode == "segmentation":
        exp = TrainSegExperiment2D(config) if dim == 2 else TrainSegExperiment(config)
        output_key = "logits"
    else:  # mode == "both": joint classification + segmentation heads
        exp = TrainClfSegExperiment2D(config) if dim == 2 else TrainClfSegExperiment3D(config)
        output_key = ["seg_logits", "clf_logits"]

    print(f"Seed: {seed}\nMode: {mode}")
    runner = SupervisedRunner(output_key=output_key)
    runner.train(
        model=exp.model,
        criterion=exp.criterion,
        optimizer=exp.opt,
        scheduler=exp.lr_scheduler,
        loaders=exp.loaders,
        callbacks=exp.cb_list,
        **config["runner_params"],
    )
def train(num_epochs, model, loaders, logdir):
    """Fit `model` on `loaders` with BCE loss, Adam, and plateau LR scheduling.

    Args:
        num_epochs: number of epochs to train.
        model: the network to optimize.
        loaders: dict of train/valid dataloaders.
        logdir: directory for logs and checkpoints.
    """
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)

    # model runner + training in one go
    SupervisedRunner().train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        logdir=logdir,
        num_epochs=num_epochs,
        callbacks=[F1ScoreCallback()],
        verbose=True,
    )
def objective(trial):
    """Optuna objective: train a trial-defined model, return best top-1 accuracy.

    Relies on a module-level `loaders` dict being in scope.
    """
    logdir = "./logdir"
    num_epochs = 10

    model = define_model(trial)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.02)
    criterion = torch.nn.CrossEntropyLoss()

    # NOTE(crcrpar): Consult above [Why OptunaPruningCallback] for the use of
    # Catalys's callback for Optuna, not Optuna's one for Catalyst.
    callbacks = {
        # top-1 accuracy as metric for pruning
        "optuna": OptunaPruningCallback(
            loader_key="valid",
            metric_key="accuracy01",
            minimize=False,
            trial=trial,
        ),
        "accuracy": AccuracyCallback(
            input_key="logits",
            target_key="targets",
            num_classes=10,
        ),
    }

    # model training
    runner = SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        logdir=logdir,
        num_epochs=num_epochs,
        verbose=True,
        callbacks=callbacks,
    )
    return runner.callbacks["optuna"].best_score
def test_logger():
    """CSVLogger smoke test: train a tiny model, then validate the CSV logs."""
    # data
    num_samples, num_features = int(1e4), int(1e1)
    X = torch.rand(num_samples, num_features)
    y = torch.rand(num_samples)
    loader = DataLoader(TensorDataset(X, y), batch_size=32, num_workers=1)
    loaders = {"train": loader, "valid": loader}

    # model, criterion, optimizer, scheduler
    model = torch.nn.Linear(num_features, 1)
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters())
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [3, 6])

    # model training
    runner = SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        callbacks=[CSVLogger()],
        loaders=loaders,
        logdir="./logdir/test_csv",
        num_epochs=8,
        verbose=True,
    )

    # both loader logs must exist on disk
    assert os.path.exists("./logdir/test_csv/train_log/logs.csv")
    assert os.path.exists("./logdir/test_csv/valid_log/logs.csv")

    # header row + one row per epoch (8) => 9 lines total
    with open("./logdir/test_csv/train_log/logs.csv", "r") as log:
        num_rows = 0
        for idx, row in enumerate(log):
            if idx == 0:
                assert "step,loss" in row
            num_rows += 1
    assert num_rows == 9
optimizer, factor=0.25, patience=3) num_epochs = args.e logdir = "./logs/effnet-b0" fp16_params = None # dict(opt_level="O1") runner = SupervisedRunner(device='cuda') runner.train( model=model, criterion=criterion, scheduler=scheduler, optimizer=optimizer, loaders=loaders, callbacks=[ # wAUC(), F1ScoreCallback(), AUCCallback(num_classes=4), AccuracyCallback(prefix='ACC'), OptimizerCallback(accumulation_steps=args.acc)], logdir=logdir, num_epochs=num_epochs, fp16=fp16_params, verbose=True ) if args.test > 0: test_preds_proba: Union[List, Iterable, np.ndarray] = [] model.eval() progress_bar_test = tqdm(test_dataset) with torch.no_grad(): for i, im in enumerate(progress_bar_test): inputs = im.to('cuda')
def main(): parser = argparse.ArgumentParser() parser.add_argument("-acc", "--accumulation-steps", type=int, default=1, help="Number of batches to process") parser.add_argument("--seed", type=int, default=42, help="Random seed") parser.add_argument("--obliterate", type=float, default=0, help="Change of obliteration") parser.add_argument("-nid", "--negative-image-dir", type=str, default=None, help="Change of obliteration") parser.add_argument("-v", "--verbose", action="store_true") parser.add_argument("--fast", action="store_true") parser.add_argument("--cache", action="store_true") parser.add_argument("-dd", "--data-dir", type=str, default=os.environ.get("KAGGLE_2020_ALASKA2")) parser.add_argument("-m", "--model", type=str, default="resnet34", help="") parser.add_argument("-b", "--batch-size", type=int, default=16, help="Batch Size during training, e.g. -b 64") parser.add_argument( "-wbs", "--warmup-batch-size", type=int, default=None, help="Batch Size during training, e.g. -b 64" ) parser.add_argument("-e", "--epochs", type=int, default=100, help="Epoch to run") parser.add_argument( "-es", "--early-stopping", type=int, default=None, help="Maximum number of epochs without improvement" ) parser.add_argument("-fe", "--freeze-encoder", action="store_true", help="Freeze encoder parameters for N epochs") parser.add_argument("-lr", "--learning-rate", type=float, default=1e-3, help="Initial learning rate") parser.add_argument( "-l", "--modification-flag-loss", type=str, default=None, action="append", nargs="+" # [["ce", 1.0]], ) parser.add_argument( "--modification-type-loss", type=str, default=None, action="append", nargs="+" # [["ce", 1.0]], ) parser.add_argument("--embedding-loss", type=str, default=None, action="append", nargs="+") # [["ce", 1.0]], parser.add_argument("--feature-maps-loss", type=str, default=None, action="append", nargs="+") # [["ce", 1.0]], parser.add_argument("--mask-loss", type=str, default=None, action="append", nargs="+") # [["ce", 1.0]], 
parser.add_argument("--bits-loss", type=str, default=None, action="append", nargs="+") # [["ce", 1.0]], parser.add_argument("-o", "--optimizer", default="RAdam", help="Name of the optimizer") parser.add_argument( "-c", "--checkpoint", type=str, default=None, help="Checkpoint filename to use as initial model weights" ) parser.add_argument("-w", "--workers", default=8, type=int, help="Num workers") parser.add_argument("-a", "--augmentations", default="safe", type=str, help="Level of image augmentations") parser.add_argument("--transfer", default=None, type=str, help="") parser.add_argument("--fp16", action="store_true") parser.add_argument("--mixup", action="store_true") parser.add_argument("--cutmix", action="store_true") parser.add_argument("--tsa", action="store_true") parser.add_argument("--fold", default=None, type=int) parser.add_argument("-s", "--scheduler", default=None, type=str, help="") parser.add_argument("-x", "--experiment", default=None, type=str, help="") parser.add_argument("-d", "--dropout", default=None, type=float, help="Dropout before head layer") parser.add_argument( "--warmup", default=0, type=int, help="Number of warmup epochs with reduced LR on encoder parameters" ) parser.add_argument( "--fine-tune", default=0, type=int, help="Number of warmup epochs with reduced LR on encoder parameters" ) parser.add_argument("-wd", "--weight-decay", default=0, type=float, help="L2 weight decay") parser.add_argument("--show", action="store_true") parser.add_argument("--balance", action="store_true") parser.add_argument("--freeze-bn", action="store_true") args = parser.parse_args() set_manual_seed(args.seed) assert ( args.modification_flag_loss or args.modification_type_loss or args.embedding_loss ), "At least one of losses must be set" modification_flag_loss = args.modification_flag_loss modification_type_loss = args.modification_type_loss embedding_loss = args.embedding_loss feature_maps_loss = args.feature_maps_loss mask_loss = args.mask_loss bits_loss = 
args.bits_loss freeze_encoder = args.freeze_encoder data_dir = args.data_dir cache = args.cache num_workers = args.workers num_epochs = args.epochs learning_rate = args.learning_rate model_name: str = args.model optimizer_name = args.optimizer image_size = (512, 512) fast = args.fast augmentations = args.augmentations fp16 = args.fp16 scheduler_name = args.scheduler experiment = args.experiment dropout = args.dropout verbose = args.verbose warmup = args.warmup show = args.show accumulation_steps = args.accumulation_steps weight_decay = args.weight_decay fold = args.fold balance = args.balance freeze_bn = args.freeze_bn train_batch_size = args.batch_size mixup = args.mixup cutmix = args.cutmix tsa = args.tsa fine_tune = args.fine_tune obliterate_p = args.obliterate negative_image_dir = args.negative_image_dir warmup_batch_size = args.warmup_batch_size or args.batch_size # Compute batch size for validation valid_batch_size = train_batch_size run_train = num_epochs > 0 custom_model_kwargs = {} if dropout is not None: custom_model_kwargs["dropout"] = float(dropout) if embedding_loss is not None: custom_model_kwargs["need_embedding"] = True model: nn.Module = get_model(model_name, **custom_model_kwargs).cuda() required_features = model.required_features if mask_loss is not None: required_features.append(INPUT_TRUE_MODIFICATION_MASK) if args.transfer: transfer_checkpoint = fs.auto_file(args.transfer) print("Transferring weights from model checkpoint", transfer_checkpoint) checkpoint = load_checkpoint(transfer_checkpoint) pretrained_dict = checkpoint["model_state_dict"] transfer_weights(model, pretrained_dict) if args.checkpoint: checkpoint = load_checkpoint(fs.auto_file(args.checkpoint)) unpack_checkpoint(checkpoint, model=model) print("Loaded model weights from:", args.checkpoint) report_checkpoint(checkpoint) if freeze_bn: from pytorch_toolbelt.optimization.functional import freeze_model freeze_model(model, freeze_bn=True) print("Freezing bn params") main_metric = 
"loss" main_metric_minimize = True current_time = datetime.now().strftime("%b%d_%H_%M") checkpoint_prefix = f"{current_time}_{args.model}_fold{fold}" if fp16: checkpoint_prefix += "_fp16" if fast: checkpoint_prefix += "_fast" if mixup: checkpoint_prefix += "_mixup" if cutmix: checkpoint_prefix += "_cutmix" if experiment is not None: checkpoint_prefix = experiment log_dir = os.path.join("runs", checkpoint_prefix) os.makedirs(log_dir, exist_ok=False) config_fname = os.path.join(log_dir, f"{checkpoint_prefix}.json") with open(config_fname, "w") as f: train_session_args = vars(args) f.write(json.dumps(train_session_args, indent=2)) default_callbacks = [] if show: default_callbacks += [ShowPolarBatchesCallback(draw_predictions, metric="loss", minimize=True)] # Pretrain/warmup if warmup: train_ds, valid_ds, train_sampler = get_datasets( data_dir=data_dir, augmentation=augmentations, balance=balance, fast=fast, fold=fold, features=required_features, obliterate_p=0, ) criterions_dict, loss_callbacks = get_criterions( modification_flag=modification_flag_loss, modification_type=modification_type_loss, embedding_loss=embedding_loss, mask_loss=mask_loss, bits_loss=bits_loss, feature_maps_loss=feature_maps_loss, num_epochs=warmup, mixup=mixup, cutmix=cutmix, tsa=tsa, ) callbacks = ( default_callbacks + loss_callbacks + [ OptimizerCallback(accumulation_steps=accumulation_steps, decouple_weight_decay=False), HyperParametersCallback( hparam_dict={ "model": model_name, "scheduler": scheduler_name, "optimizer": optimizer_name, "augmentations": augmentations, "size": image_size[0], "weight_decay": weight_decay, } ), ] ) loaders = collections.OrderedDict() loaders["train"] = DataLoader( train_ds, batch_size=warmup_batch_size, num_workers=num_workers, pin_memory=True, drop_last=True, shuffle=train_sampler is None, sampler=train_sampler, ) loaders["valid"] = DataLoader(valid_ds, batch_size=warmup_batch_size, num_workers=num_workers, pin_memory=True) if freeze_encoder: from 
pytorch_toolbelt.optimization.functional import freeze_model freeze_model(model.encoder, freeze_parameters=True, freeze_bn=None) optimizer = get_optimizer( "Ranger", get_optimizable_parameters(model), weight_decay=weight_decay, learning_rate=3e-4 ) scheduler = None print("Train session :", checkpoint_prefix) print(" FP16 mode :", fp16) print(" Fast mode :", args.fast) print(" Epochs :", num_epochs) print(" Workers :", num_workers) print(" Data dir :", data_dir) print(" Log dir :", log_dir) print(" Cache :", cache) print("Data ") print(" Augmentations :", augmentations) print(" Negative images:", negative_image_dir) print(" Train size :", len(loaders["train"]), "batches", len(train_ds), "samples") print(" Valid size :", len(loaders["valid"]), "batches", len(valid_ds), "samples") print(" Image size :", image_size) print(" Balance :", balance) print(" Mixup :", mixup) print(" CutMix :", cutmix) print(" TSA :", tsa) print("Model :", model_name) print(" Parameters :", count_parameters(model)) print(" Dropout :", dropout, "(Non-default)" if dropout is not None else "") print("Optimizer :", optimizer_name) print(" Learning rate :", learning_rate) print(" Weight decay :", weight_decay) print(" Scheduler :", scheduler_name) print(" Batch sizes :", train_batch_size, valid_batch_size) print("Losses ") print(" Flag :", modification_flag_loss) print(" Type :", modification_type_loss) print(" Embedding :", embedding_loss) print(" Feature maps :", feature_maps_loss) print(" Mask :", mask_loss) print(" Bits :", bits_loss) runner = SupervisedRunner(input_key=required_features, output_key=None) runner.train( fp16=fp16, model=model, criterion=criterions_dict, optimizer=optimizer, scheduler=scheduler, callbacks=callbacks, loaders=loaders, logdir=os.path.join(log_dir, "warmup"), num_epochs=warmup, verbose=verbose, main_metric=main_metric, minimize_metric=main_metric_minimize, checkpoint_data={"cmd_args": vars(args)}, ) del optimizer, loaders, runner, callbacks best_checkpoint = 
os.path.join(log_dir, "warmup", "checkpoints", "best.pth") model_checkpoint = os.path.join(log_dir, f"{checkpoint_prefix}_warmup.pth") clean_checkpoint(best_checkpoint, model_checkpoint) # Restore state of best model # unpack_checkpoint(load_checkpoint(model_checkpoint), model=model) torch.cuda.empty_cache() gc.collect() if run_train: train_ds, valid_ds, train_sampler = get_datasets( data_dir=data_dir, augmentation=augmentations, balance=balance, fast=fast, fold=fold, features=required_features, obliterate_p=obliterate_p, ) if negative_image_dir: negatives_ds = get_negatives_ds( negative_image_dir, fold=fold, features=required_features, max_images=16536 ) train_ds = train_ds + negatives_ds train_sampler = None # TODO: Add proper support of sampler print("Adding", len(negatives_ds), "negative samples to training set") criterions_dict, loss_callbacks = get_criterions( modification_flag=modification_flag_loss, modification_type=modification_type_loss, embedding_loss=embedding_loss, feature_maps_loss=feature_maps_loss, mask_loss=mask_loss, bits_loss=bits_loss, num_epochs=num_epochs, mixup=mixup, cutmix=cutmix, tsa=tsa, ) callbacks = ( default_callbacks + loss_callbacks + [ OptimizerCallback(accumulation_steps=accumulation_steps, decouple_weight_decay=False), HyperParametersCallback( hparam_dict={ "model": model_name, "scheduler": scheduler_name, "optimizer": optimizer_name, "augmentations": augmentations, "size": image_size[0], "weight_decay": weight_decay, } ), ] ) loaders = collections.OrderedDict() loaders["train"] = DataLoader( train_ds, batch_size=train_batch_size, num_workers=num_workers, pin_memory=True, drop_last=True, shuffle=train_sampler is None, sampler=train_sampler, ) loaders["valid"] = DataLoader(valid_ds, batch_size=valid_batch_size, num_workers=num_workers, pin_memory=True) print("Train session :", checkpoint_prefix) print(" FP16 mode :", fp16) print(" Fast mode :", args.fast) print(" Epochs :", num_epochs) print(" Workers :", num_workers) print(" Data 
dir :", data_dir) print(" Log dir :", log_dir) print(" Cache :", cache) print("Data ") print(" Augmentations :", augmentations) print(" Obliterate (%) :", obliterate_p) print(" Negative images:", negative_image_dir) print(" Train size :", len(loaders["train"]), "batches", len(train_ds), "samples") print(" Valid size :", len(loaders["valid"]), "batches", len(valid_ds), "samples") print(" Image size :", image_size) print(" Balance :", balance) print(" Mixup :", mixup) print(" CutMix :", cutmix) print(" TSA :", tsa) print("Model :", model_name) print(" Parameters :", count_parameters(model)) print(" Dropout :", dropout) print("Optimizer :", optimizer_name) print(" Learning rate :", learning_rate) print(" Weight decay :", weight_decay) print(" Scheduler :", scheduler_name) print(" Batch sizes :", train_batch_size, valid_batch_size) print("Losses ") print(" Flag :", modification_flag_loss) print(" Type :", modification_type_loss) print(" Embedding :", embedding_loss) print(" Feature maps :", feature_maps_loss) print(" Mask :", mask_loss) print(" Bits :", bits_loss) optimizer = get_optimizer( optimizer_name, get_optimizable_parameters(model), learning_rate=learning_rate, weight_decay=weight_decay ) scheduler = get_scheduler( scheduler_name, optimizer, lr=learning_rate, num_epochs=num_epochs, batches_in_epoch=len(loaders["train"]) ) if isinstance(scheduler, CyclicLR): callbacks += [SchedulerCallback(mode="batch")] # model training runner = SupervisedRunner(input_key=required_features, output_key=None) runner.train( fp16=fp16, model=model, criterion=criterions_dict, optimizer=optimizer, scheduler=scheduler, callbacks=callbacks, loaders=loaders, logdir=os.path.join(log_dir, "main"), num_epochs=num_epochs, verbose=verbose, main_metric=main_metric, minimize_metric=main_metric_minimize, checkpoint_data={"cmd_args": vars(args)}, ) del optimizer, loaders, runner, callbacks best_checkpoint = os.path.join(log_dir, "main", "checkpoints", "best.pth") model_checkpoint = 
os.path.join(log_dir, f"{checkpoint_prefix}.pth") # Restore state of best model clean_checkpoint(best_checkpoint, model_checkpoint) # unpack_checkpoint(load_checkpoint(model_checkpoint), model=model) torch.cuda.empty_cache() gc.collect() if fine_tune: train_ds, valid_ds, train_sampler = get_datasets( data_dir=data_dir, augmentation="light", balance=balance, fast=fast, fold=fold, features=required_features, obliterate_p=obliterate_p, ) criterions_dict, loss_callbacks = get_criterions( modification_flag=modification_flag_loss, modification_type=modification_type_loss, embedding_loss=embedding_loss, feature_maps_loss=feature_maps_loss, mask_loss=mask_loss, bits_loss=bits_loss, num_epochs=fine_tune, mixup=False, cutmix=False, tsa=False, ) callbacks = ( default_callbacks + loss_callbacks + [ OptimizerCallback(accumulation_steps=accumulation_steps, decouple_weight_decay=False), HyperParametersCallback( hparam_dict={ "model": model_name, "scheduler": scheduler_name, "optimizer": optimizer_name, "augmentations": augmentations, "size": image_size[0], "weight_decay": weight_decay, } ), ] ) loaders = collections.OrderedDict() loaders["train"] = DataLoader( train_ds, batch_size=train_batch_size, num_workers=num_workers, pin_memory=True, drop_last=True, shuffle=train_sampler is None, sampler=train_sampler, ) loaders["valid"] = DataLoader(valid_ds, batch_size=valid_batch_size, num_workers=num_workers, pin_memory=True) print("Train session :", checkpoint_prefix) print(" FP16 mode :", fp16) print(" Fast mode :", args.fast) print(" Epochs :", num_epochs) print(" Workers :", num_workers) print(" Data dir :", data_dir) print(" Log dir :", log_dir) print(" Cache :", cache) print("Data ") print(" Augmentations :", augmentations) print(" Obliterate (%) :", obliterate_p) print(" Negative images:", negative_image_dir) print(" Train size :", len(loaders["train"]), "batches", len(train_ds), "samples") print(" Valid size :", len(loaders["valid"]), "batches", len(valid_ds), "samples") print(" 
Image size :", image_size) print(" Balance :", balance) print(" Mixup :", mixup) print(" CutMix :", cutmix) print(" TSA :", tsa) print("Model :", model_name) print(" Parameters :", count_parameters(model)) print(" Dropout :", dropout) print("Optimizer :", optimizer_name) print(" Learning rate :", learning_rate) print(" Weight decay :", weight_decay) print(" Scheduler :", scheduler_name) print(" Batch sizes :", train_batch_size, valid_batch_size) print("Losses ") print(" Flag :", modification_flag_loss) print(" Type :", modification_type_loss) print(" Embedding :", embedding_loss) print(" Feature maps :", feature_maps_loss) print(" Mask :", mask_loss) print(" Bits :", bits_loss) optimizer = get_optimizer( "SGD", get_optimizable_parameters(model), learning_rate=learning_rate, weight_decay=weight_decay ) scheduler = get_scheduler( "cos", optimizer, lr=learning_rate, num_epochs=fine_tune, batches_in_epoch=len(loaders["train"]) ) if isinstance(scheduler, CyclicLR): callbacks += [SchedulerCallback(mode="batch")] # model training runner = SupervisedRunner(input_key=required_features, output_key=None) runner.train( fp16=fp16, model=model, criterion=criterions_dict, optimizer=optimizer, scheduler=scheduler, callbacks=callbacks, loaders=loaders, logdir=os.path.join(log_dir, "finetune"), num_epochs=fine_tune, verbose=verbose, main_metric=main_metric, minimize_metric=main_metric_minimize, checkpoint_data={"cmd_args": vars(args)}, ) best_checkpoint = os.path.join(log_dir, "finetune", "checkpoints", "best.pth") model_checkpoint = os.path.join(log_dir, f"{checkpoint_prefix}_finetune.pth") clean_checkpoint(best_checkpoint, model_checkpoint) unpack_checkpoint(load_checkpoint(model_checkpoint), model=model) del optimizer, loaders, runner, callbacks
) val_dataset = OcrDataset(DATASET_PATH + 'val/', DATASET_PATH + 'val.csv', transforms=ResizeToTensor( CV_CONFIG.data['ocr_image_size'])) val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4) model = CRNN(**MODEL_PARAMS) optimizer = torch.optim.Adam(model.parameters()) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer) callbacks = [CheckpointCallback(save_n_best=10)] runner = SupervisedRunner(input_key="image", input_target_key="targets") runner.train(model=model, criterion=WrapCTCLoss(alphabet), optimizer=optimizer, scheduler=scheduler, loaders={ 'train': train_loader, "valid": val_loader }, logdir="./logs/ocr", num_epochs=NUM_EPOCHS, verbose=True, callbacks=callbacks)
def main(train, test, features, target):
    """Run K-fold CV training of a simple NN and write `submission.csv`.

    Args:
        train, test: default dataframes; overwritten after loading/preprocessing.
        features, target: unused here — column lists come from the yaml params.
            TODO(review): confirm whether these parameters should be honored.

    Returns:
        True on completion.
    """
    # get args
    args = parse_arguments()
    params = yaml_to_json(args.yaml_path)

    # hyper param
    num_folds = params.fold
    seed = params.seed
    base_path = params.base_path
    target_cols = params.target
    features_cols = params.features
    preprocessed_data_path = params.preprocessed_data
    batch_size = params.batch_size
    num_epochs = params.epochs
    # ex) '/hoge/logs'
    base_logdir = params.base_logdir

    # fix seed
    set_global_seed(seed)
    device = get_device()

    # set up logdir
    # fix: join with a separator; `os.path.join(a + b)` concatenated the
    # timestamp directly onto the base path.
    now = datetime.now()
    base_logdir = os.path.join(base_logdir, now.strftime("%Y%m%d%H%M%S"))
    os.makedirs(base_logdir, exist_ok=True)

    # dump yaml contents
    # NOTE(review): `params` supports attribute access (`params.fold`), so it
    # may not be JSON-serializable as-is — confirm `yaml_to_json`'s return type.
    with open(os.path.join(base_logdir, 'params.json'), mode="w") as f:
        json.dump(params, f, indent=4)

    # dump this script
    # fix: `shutil.copyfile` raises IsADirectoryError for a directory dst;
    # `shutil.copy` copies into the directory.
    my_file_path = os.path.abspath(__file__)
    shutil.copy(my_file_path, base_logdir)

    # load dataset
    if preprocessed_data_path == '':
        train, test, sample_submission = read_data(base_path)  # noqa
        # TODO: You should implement these function!!
        train, test = preprocess(train, test)  # noqa
        train, test = build_feature(train, test)  # noqa
    else:
        train = pd.read_csv(preprocessed_data_path + 'train.csv')
        test = pd.read_csv(preprocessed_data_path + 'test.csv')
        sample_submission = pd.read_csv(preprocessed_data_path + 'sample_submission.csv')

    # execute CV
    # TODO: set your CV method
    # fix: sklearn's KFold raises when random_state is set without shuffle=True
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    ids = kf.split(train)
    fold_scores = []
    test_preds = []
    for fold, (train_idx, valid_idx) in enumerate(ids):
        print('Fold {}'.format(fold + 1))
        # fix: join with a separator (was string concatenation)
        logdir = os.path.join(base_logdir, 'fold_{}'.format(fold + 1))
        os.makedirs(logdir, exist_ok=True)

        # data
        X_train = train[features_cols]
        # should the target variable be normalized...?
        Y_train = train[target_cols]
        # fix: test features must come from `test`, not `train`
        X_test = test[features_cols]

        # create dataloaders
        train_dls, test_dl = create_data_loader(
            X_train.iloc[train_idx].to_numpy(),
            Y_train.iloc[train_idx].to_numpy(),
            X_train.iloc[valid_idx].to_numpy(),
            Y_train.iloc[valid_idx].to_numpy(),
            X_test.to_numpy(),
            batch_size=batch_size)

        # init models
        # TODO: set your model and learning condition
        # (a factory keyed by name would make this more reusable)
        model = SampleNN(input_dim=1000, out_dim=1)
        criterion = nn.BCELoss()
        optimizer = torch.optim.AdamW(model.parameters())
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

        # init catalyst runner
        runner = SupervisedRunner(device=device)

        # model training
        runner.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=train_dls,
            logdir=logdir,
            num_epochs=num_epochs,
            callbacks=[EarlyStoppingCallback(patience=15, min_delta=0)],
            verbose=False)

        # calculate valid score
        best_model_path = logdir + '/checkpoints/best.pth'
        val_preds = runner.predict_loader(model,
                                          train_dls['valid'],
                                          resume=best_model_path,
                                          verbose=False)
        val_truth = Y_train.iloc[valid_idx].values
        # TODO: set your score function
        cv_score = mean_spearmanr_correlation_score(val_truth, val_preds)
        print('Fold {} CV score : {}'.format(fold + 1, cv_score))
        fold_scores.append(cv_score)

        # test prediction
        # fix: predictions are averaged with np.mean below, so do not also
        # divide by num_folds here (that double-discounted every fold).
        test_pred = runner.predict_loader(
            model, test_dl, resume=best_model_path, verbose=False)
        test_preds.append(test_pred)

    # submit
    # TODO: set your submit process
    sample_submission[target_cols] = np.mean(test_preds, axis=0)
    sample_submission.to_csv('submission.csv')  # NOTE(review): consider index=False
    return True
# NOTE(review): CLI training entrypoint for a diabetic-retinopathy grading
# experiment (APTOS 2015/2019, IDRID, Messidor datasets) built on the
# catalyst SupervisedRunner.  Flow per fold: parse args -> build a unique
# checkpoint prefix and log dir, dump the arg config as JSON -> build model
# (optionally transferring/restoring weights from a checkpoint) -> build
# datasets and loaders -> optional warmup stage (encoder frozen, Adam at
# 0.1x LR) -> main stage (optional SWA wrapping of the optimizer, optional
# early stopping), followed by checkpoint cleaning and restoring the best
# model -> optional fine-tune stage (encoder frozen again).
# The source text was whitespace-mangled during extraction: statements are
# collapsed onto a few physical lines and some line breaks fall inside
# string literals.  It is therefore preserved byte-for-byte below; only
# this header comment was added.  TODO(review): restore the original
# formatting from version control before editing the logic.
def main(): parser = argparse.ArgumentParser() parser.add_argument('--seed', type=int, default=42, help='Random seed') parser.add_argument('--fast', action='store_true') parser.add_argument('--mixup', action='store_true') parser.add_argument('--balance', action='store_true') parser.add_argument('--balance-datasets', action='store_true') parser.add_argument('--swa', action='store_true') parser.add_argument('--show', action='store_true') parser.add_argument('--use-idrid', action='store_true') parser.add_argument('--use-messidor', action='store_true') parser.add_argument('--use-aptos2015', action='store_true') parser.add_argument('--use-aptos2019', action='store_true') parser.add_argument('-v', '--verbose', action='store_true') parser.add_argument('--coarse', action='store_true') parser.add_argument('-acc', '--accumulation-steps', type=int, default=1, help='Number of batches to process') parser.add_argument('-dd', '--data-dir', type=str, default='data', help='Data directory') parser.add_argument('-m', '--model', type=str, default='resnet18_gap', help='') parser.add_argument('-b', '--batch-size', type=int, default=8, help='Batch Size during training, e.g. 
-b 64') parser.add_argument('-e', '--epochs', type=int, default=100, help='Epoch to run') parser.add_argument('-es', '--early-stopping', type=int, default=None, help='Maximum number of epochs without improvement') parser.add_argument('-f', '--fold', action='append', type=int, default=None) parser.add_argument('-ft', '--fine-tune', default=0, type=int) parser.add_argument('-lr', '--learning-rate', type=float, default=1e-4, help='Initial learning rate') parser.add_argument('--criterion-reg', type=str, default=None, nargs='+', help='Criterion') parser.add_argument('--criterion-ord', type=str, default=None, nargs='+', help='Criterion') parser.add_argument('--criterion-cls', type=str, default=['ce'], nargs='+', help='Criterion') parser.add_argument('-l1', type=float, default=0, help='L1 regularization loss') parser.add_argument('-l2', type=float, default=0, help='L2 regularization loss') parser.add_argument('-o', '--optimizer', default='Adam', help='Name of the optimizer') parser.add_argument('-p', '--preprocessing', default=None, help='Preprocessing method') parser.add_argument( '-c', '--checkpoint', type=str, default=None, help='Checkpoint filename to use as initial model weights') parser.add_argument('-w', '--workers', default=multiprocessing.cpu_count(), type=int, help='Num workers') parser.add_argument('-a', '--augmentations', default='medium', type=str, help='') parser.add_argument('-tta', '--tta', default=None, type=str, help='Type of TTA to use [fliplr, d4]') parser.add_argument('-t', '--transfer', default=None, type=str, help='') parser.add_argument('--fp16', action='store_true') parser.add_argument('-s', '--scheduler', default='multistep', type=str, help='') parser.add_argument('--size', default=512, type=int, help='Image size for training & inference') parser.add_argument('-wd', '--weight-decay', default=0, type=float, help='L2 weight decay') parser.add_argument('-wds', '--weight-decay-step', default=None, type=float, help='L2 weight decay step to add after 
each epoch') parser.add_argument('-d', '--dropout', default=0.0, type=float, help='Dropout before head layer') parser.add_argument( '--warmup', default=0, type=int, help= 'Number of warmup epochs with 0.1 of the initial LR and frozed encoder' ) parser.add_argument('-x', '--experiment', default=None, type=str, help='Dropout before head layer') args = parser.parse_args() data_dir = args.data_dir num_workers = args.workers num_epochs = args.epochs batch_size = args.batch_size learning_rate = args.learning_rate l1 = args.l1 l2 = args.l2 early_stopping = args.early_stopping model_name = args.model optimizer_name = args.optimizer image_size = (args.size, args.size) fast = args.fast augmentations = args.augmentations fp16 = args.fp16 fine_tune = args.fine_tune criterion_reg_name = args.criterion_reg criterion_cls_name = args.criterion_cls criterion_ord_name = args.criterion_ord folds = args.fold mixup = args.mixup balance = args.balance balance_datasets = args.balance_datasets use_swa = args.swa show_batches = args.show scheduler_name = args.scheduler verbose = args.verbose weight_decay = args.weight_decay use_idrid = args.use_idrid use_messidor = args.use_messidor use_aptos2015 = args.use_aptos2015 use_aptos2019 = args.use_aptos2019 warmup = args.warmup dropout = args.dropout use_unsupervised = False experiment = args.experiment preprocessing = args.preprocessing weight_decay_step = args.weight_decay_step coarse_grading = args.coarse class_names = get_class_names(coarse_grading) assert use_aptos2015 or use_aptos2019 or use_idrid or use_messidor current_time = datetime.now().strftime('%b%d_%H_%M') random_name = get_random_name() if folds is None or len(folds) == 0: folds = [None] for fold in folds: torch.cuda.empty_cache() checkpoint_prefix = f'{model_name}_{args.size}_{augmentations}' if preprocessing is not None: checkpoint_prefix += f'_{preprocessing}' if use_aptos2019: checkpoint_prefix += '_aptos2019' if use_aptos2015: checkpoint_prefix += '_aptos2015' if 
use_messidor: checkpoint_prefix += '_messidor' if use_idrid: checkpoint_prefix += '_idrid' if coarse_grading: checkpoint_prefix += '_coarse' if fold is not None: checkpoint_prefix += f'_fold{fold}' checkpoint_prefix += f'_{random_name}' if experiment is not None: checkpoint_prefix = experiment directory_prefix = f'{current_time}/{checkpoint_prefix}' log_dir = os.path.join('runs', directory_prefix) os.makedirs(log_dir, exist_ok=False) config_fname = os.path.join(log_dir, f'{checkpoint_prefix}.json') with open(config_fname, 'w') as f: train_session_args = vars(args) f.write(json.dumps(train_session_args, indent=2)) set_manual_seed(args.seed) num_classes = len(class_names) model = get_model(model_name, num_classes=num_classes, dropout=dropout).cuda() if args.transfer: transfer_checkpoint = fs.auto_file(args.transfer) print("Transfering weights from model checkpoint", transfer_checkpoint) checkpoint = load_checkpoint(transfer_checkpoint) pretrained_dict = checkpoint['model_state_dict'] for name, value in pretrained_dict.items(): try: model.load_state_dict(collections.OrderedDict([(name, value)]), strict=False) except Exception as e: print(e) report_checkpoint(checkpoint) if args.checkpoint: checkpoint = load_checkpoint(fs.auto_file(args.checkpoint)) unpack_checkpoint(checkpoint, model=model) report_checkpoint(checkpoint) train_ds, valid_ds, train_sizes = get_datasets( data_dir=data_dir, use_aptos2019=use_aptos2019, use_aptos2015=use_aptos2015, use_idrid=use_idrid, use_messidor=use_messidor, use_unsupervised=False, coarse_grading=coarse_grading, image_size=image_size, augmentation=augmentations, preprocessing=preprocessing, target_dtype=int, fold=fold, folds=4) train_loader, valid_loader = get_dataloaders( train_ds, valid_ds, batch_size=batch_size, num_workers=num_workers, train_sizes=train_sizes, balance=balance, balance_datasets=balance_datasets, balance_unlabeled=False) loaders = collections.OrderedDict() loaders["train"] = train_loader loaders["valid"] = 
valid_loader print('Datasets :', data_dir) print(' Train size :', len(train_loader), len(train_loader.dataset)) print(' Valid size :', len(valid_loader), len(valid_loader.dataset)) print(' Aptos 2019 :', use_aptos2019) print(' Aptos 2015 :', use_aptos2015) print(' IDRID :', use_idrid) print(' Messidor :', use_messidor) print('Train session :', directory_prefix) print(' FP16 mode :', fp16) print(' Fast mode :', fast) print(' Mixup :', mixup) print(' Balance cls. :', balance) print(' Balance ds. :', balance_datasets) print(' Warmup epoch :', warmup) print(' Train epochs :', num_epochs) print(' Fine-tune ephs :', fine_tune) print(' Workers :', num_workers) print(' Fold :', fold) print(' Log dir :', log_dir) print(' Augmentations :', augmentations) print('Model :', model_name) print(' Parameters :', count_parameters(model)) print(' Image size :', image_size) print(' Dropout :', dropout) print(' Classes :', class_names, num_classes) print('Optimizer :', optimizer_name) print(' Learning rate :', learning_rate) print(' Batch size :', batch_size) print(' Criterion (cls):', criterion_cls_name) print(' Criterion (reg):', criterion_reg_name) print(' Criterion (ord):', criterion_ord_name) print(' Scheduler :', scheduler_name) print(' Weight decay :', weight_decay, weight_decay_step) print(' L1 reg. :', l1) print(' L2 reg. 
:', l2) print(' Early stopping :', early_stopping) # model training callbacks = [] criterions = {} main_metric = 'cls/kappa' if criterion_reg_name is not None: cb, crits = get_reg_callbacks(criterion_reg_name, class_names=class_names, show=show_batches) callbacks += cb criterions.update(crits) if criterion_ord_name is not None: cb, crits = get_ord_callbacks(criterion_ord_name, class_names=class_names, show=show_batches) callbacks += cb criterions.update(crits) if criterion_cls_name is not None: cb, crits = get_cls_callbacks(criterion_cls_name, num_classes=num_classes, num_epochs=num_epochs, class_names=class_names, show=show_batches) callbacks += cb criterions.update(crits) if l1 > 0: callbacks += [ LPRegularizationCallback(start_wd=l1, end_wd=l1, schedule=None, prefix='l1', p=1) ] if l2 > 0: callbacks += [ LPRegularizationCallback(start_wd=l2, end_wd=l2, schedule=None, prefix='l2', p=2) ] callbacks += [CustomOptimizerCallback()] runner = SupervisedRunner(input_key='image') # Pretrain/warmup if warmup: set_trainable(model.encoder, False, False) optimizer = get_optimizer('Adam', get_optimizable_parameters(model), learning_rate=learning_rate * 0.1) runner.train(fp16=fp16, model=model, criterion=criterions, optimizer=optimizer, scheduler=None, callbacks=callbacks, loaders=loaders, logdir=os.path.join(log_dir, 'warmup'), num_epochs=warmup, verbose=verbose, main_metric=main_metric, minimize_metric=False, checkpoint_data={"cmd_args": vars(args)}) del optimizer # Main train if num_epochs: set_trainable(model.encoder, True, False) optimizer = get_optimizer(optimizer_name, get_optimizable_parameters(model), learning_rate=learning_rate, weight_decay=weight_decay) if use_swa: from torchcontrib.optim import SWA optimizer = SWA(optimizer, swa_start=len(train_loader), swa_freq=512) scheduler = get_scheduler(scheduler_name, optimizer, lr=learning_rate, num_epochs=num_epochs, batches_in_epoch=len(train_loader)) # Additional callbacks that specific to main stage only added here to 
copy of callbacks main_stage_callbacks = callbacks if early_stopping: es_callback = EarlyStoppingCallback(early_stopping, min_delta=1e-4, metric=main_metric, minimize=False) main_stage_callbacks = callbacks + [es_callback] runner.train(fp16=fp16, model=model, criterion=criterions, optimizer=optimizer, scheduler=scheduler, callbacks=main_stage_callbacks, loaders=loaders, logdir=os.path.join(log_dir, 'main'), num_epochs=num_epochs, verbose=verbose, main_metric=main_metric, minimize_metric=False, checkpoint_data={"cmd_args": vars(args)}) del optimizer, scheduler best_checkpoint = os.path.join(log_dir, 'main', 'checkpoints', 'best.pth') model_checkpoint = os.path.join(log_dir, 'main', 'checkpoints', f'{checkpoint_prefix}.pth') clean_checkpoint(best_checkpoint, model_checkpoint) # Restoring best model from checkpoint checkpoint = load_checkpoint(best_checkpoint) unpack_checkpoint(checkpoint, model=model) report_checkpoint(checkpoint) # Stage 3 - Fine tuning if fine_tune: set_trainable(model.encoder, False, False) optimizer = get_optimizer(optimizer_name, get_optimizable_parameters(model), learning_rate=learning_rate) scheduler = get_scheduler('multistep', optimizer, lr=learning_rate, num_epochs=fine_tune, batches_in_epoch=len(train_loader)) runner.train(fp16=fp16, model=model, criterion=criterions, optimizer=optimizer, scheduler=scheduler, callbacks=callbacks, loaders=loaders, logdir=os.path.join(log_dir, 'finetune'), num_epochs=fine_tune, verbose=verbose, main_metric=main_metric, minimize_metric=False, checkpoint_data={"cmd_args": vars(args)}) best_checkpoint = os.path.join(log_dir, 'finetune', 'checkpoints', 'best.pth') model_checkpoint = os.path.join(log_dir, 'finetune', 'checkpoints', f'{checkpoint_prefix}.pth') clean_checkpoint(best_checkpoint, model_checkpoint)
trn_df = df.loc[trn_idx, :].reset_index(drop=True) val_df = df.loc[val_idx, :].reset_index(drop=True) loaders = { phase: C.get_loader(df_, datadir, config, phase) for df_, phase in zip([trn_df, val_df], ["train", "valid"]) } model = models.get_model(config).to(device) criterion = C.get_criterion(config).to(device) optimizer = C.get_optimizer(model, config) scheduler = C.get_scheduler(optimizer, config) callbacks = clb.get_callbacks(config) runner = SupervisedRunner( device=device, input_key=global_params["input_key"], input_target_key=global_params["input_target_key"]) runner.train( model=model, criterion=criterion, loaders=loaders, optimizer=optimizer, scheduler=scheduler, num_epochs=global_params["num_epochs"], verbose=True, logdir=output_dir / f"fold{i}", callbacks=callbacks, main_metric=global_params["main_metric"], minimize_metric=global_params["minimize_metric"])
X, y = torch.rand(num_samples, num_features), torch.rand(num_samples) dataset = TensorDataset(X, y) loader = DataLoader(dataset, batch_size=32, num_workers=1) loaders = {"train": loader, "valid": loader} # model, criterion, optimizer, scheduler model = torch.nn.Linear(num_features, 1) criterion = torch.nn.MSELoss() optimizer = torch.optim.Adam(model.parameters()) scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [3, 6]) runner = SupervisedRunner() # model training runner.train( model=model, criterion=criterion, optimizer=optimizer, scheduler=scheduler, loaders=loaders, logdir="./logdir", num_epochs=8, verbose=True, check=True, load_best_on_end=True, ) # model inference for prediction in runner.predict_loader(loader=loader): assert prediction["logits"].cpu().detach().numpy().shape == (32, 1) # model tracing traced_model = runner.trace(loader=loader)
# NOTE(review): multi-stage catalyst test.  BestStateCheckerCallback copies
# runner.valid_loader at stage start and, on each epoch end, asserts that an
# epoch in which the valid loader did NOT run (PeriodicLoaderCallback skipped
# it) can never be flagged as best (runner.is_best_valid) — but only after the
# first real validation has happened.  Three consecutive runner.train() stages
# use validation periods 2/3/4 with 5/6/6 epochs respectively, each also
# guarded by CheckRunCallback on the epoch count.
# Preserved byte-for-byte (the source formatting is collapsed); only this
# header comment was added.
def test_multiple_stages_with_magic_callback(): # NOTE: before first validation epoch # all checkpoints will be compared according # to a metric on a test dataset and # checkpoints will be overwritten according # to this value class BestStateCheckerCallback(Callback): def __init__(self): super().__init__(CallbackOrder.External) self.valid_loader = None self._after_first_validation = False def on_stage_start(self, runner: "IRunner") -> None: self.valid_loader = copy.copy(runner.valid_loader) def on_epoch_end(self, runner: "IRunner") -> None: if (self.valid_loader not in runner.loaders and runner.epoch > 1 and self._after_first_validation): assert ( not runner.is_best_valid ), f"Epochs (epoch={runner.epoch}) without valid loader can't be best!" else: assert runner.valid_metrics[runner.main_metric] is not None if self.valid_loader in runner.loaders: self._after_first_validation = True # experiment_setup logdir = "./logs/periodic_loader" # data num_samples, num_features = int(1e4), int(1e1) X = torch.rand(num_samples, num_features) y = torch.randint(0, 5, size=[num_samples]) dataset = TensorDataset(X, y) loader = DataLoader(dataset, batch_size=32, num_workers=1) loaders = { "train": loader, "valid": loader, } # model, criterion, optimizer, scheduler model = torch.nn.Linear(num_features, 5) criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters()) runner = SupervisedRunner() # first stage runner.train( model=model, criterion=criterion, optimizer=optimizer, loaders=loaders, logdir=logdir, num_epochs=5, verbose=False, callbacks=[ PeriodicLoaderCallback(valid=2), BestStateCheckerCallback(), CheckRunCallback(num_epoch_steps=5), ], ) # second stage runner.train( model=model, criterion=criterion, optimizer=optimizer, loaders=loaders, logdir=logdir, num_epochs=6, verbose=False, callbacks=[ PeriodicLoaderCallback(valid=3), BestStateCheckerCallback(), CheckRunCallback(num_epoch_steps=6), ], ) # third stage runner.train( model=model, 
criterion=criterion, optimizer=optimizer, loaders=loaders, logdir=logdir, num_epochs=6, verbose=False, callbacks=[ PeriodicLoaderCallback(valid=4), BestStateCheckerCallback(), CheckRunCallback(num_epoch_steps=6), ], ) shutil.rmtree(logdir, ignore_errors=True)
def test_ignoring_unknown_loaders():
    """Loader names configured in PeriodicLoaderCallback but absent from
    `loaders` must be silently ignored, while known loaders keep their
    configured periods."""
    # Capture everything the runner prints so the schedule can be verified.
    old_stdout = sys.stdout
    sys.stdout = str_stdout = StringIO()

    # experiment setup
    logdir = "./logs/periodic_loader"
    checkpoint = logdir + "/checkpoints"
    logfile = checkpoint + "/_metrics.json"

    # synthetic 5-class classification data
    num_samples, num_features = int(1e4), int(1e1)
    dataset = TensorDataset(
        torch.rand(num_samples, num_features),
        torch.randint(0, 5, size=[num_samples]),
    )
    loader = DataLoader(dataset, batch_size=32, num_workers=1)
    # The same loader is registered under four names, two of which have
    # "additional" periodic schedules.
    loaders = {
        name: loader
        for name in ("train", "train_additional", "valid", "valid_additional")
    }

    # model, criterion, optimizer
    model = torch.nn.Linear(num_features, 5)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    runner = SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        logdir=logdir,
        num_epochs=10,
        verbose=False,
        callbacks=[
            # *_not_exist(s) entries reference loaders that were never
            # registered — they must simply be ignored.
            PeriodicLoaderCallback(
                train_additional=2,
                train_not_exists=2,
                valid=3,
                valid_additional=0,
                valid_not_exist=1,
            ),
            CheckRunCallback(num_epoch_steps=10),
        ],
    )

    sys.stdout = old_stdout
    exp_output = str_stdout.getvalue()

    # Known loaders ran with their configured periods; unknown ones never ran.
    expected_counts = (
        (r"\(train\)", 10),
        (r"\(train_additional\)", 5),
        (r"\(train_not_exists\)", 0),
        (r"\(valid\)", 3),
        (r"\(valid_additional\)", 0),
        (r"\(valid_not_exist\)", 0),
    )
    for pattern, count in expected_counts:
        assert len(re.findall(pattern, exp_output)) == count
    assert len(re.findall(r".*/train\.\d\.pth", exp_output)) == 1

    # The usual checkpoint artifacts were written.
    assert os.path.isfile(logfile)
    for fname in ("train.9.pth", "best.pth", "best_full.pth",
                  "last.pth", "last_full.pth"):
        assert os.path.isfile(checkpoint + "/" + fname)

    shutil.rmtree(logdir, ignore_errors=True)
# NOTE(review): verifies that the runner counters "global_epoch",
# "global_batch_step" and "global_sample_step" increase strictly
# monotonically across epochs and across consecutive runner.train() stages.
# The first experiment reuses the same three IncreaseCheckerCallback
# instances over three stages (2+3+4 epochs), so `prev` carries over between
# stages.  The second experiment uses a fresh runner; its second stage seeds
# the expected start values from stage one explicitly: 2 epochs,
# 626 batches (2 * ceil(1e4 samples / batch_size 32) = 2 * 313) and
# 20_000 samples (2 * 1e4).
# Preserved byte-for-byte (the source formatting is collapsed); only this
# header comment was added.
def test_epoch_increasing(): class IncreaseCheckerCallback(Callback): def __init__(self, attribute: str, start_value: int = None): super().__init__(CallbackOrder.Internal) self.attr = attribute self.prev = start_value def on_epoch_start(self, runner): if not hasattr(runner, self.attr): raise ValueError(f"There is no {self.attr} in runner!") value = getattr(runner, self.attr) if self.prev is not None: print( f">>> '{self.attr}': previous - {self.prev}, current - {value}" ) assert self.prev < value self.prev = value # experiment_setup logdir = "./logs/core_runner" # data num_samples, num_features = int(1e4), int(1e1) X = torch.rand(num_samples, num_features) y = torch.randint(0, 5, size=[num_samples]) dataset = TensorDataset(X, y) loader = DataLoader(dataset, batch_size=32, num_workers=1) loaders = { "train": loader, "valid": loader, } # model, criterion, optimizer, scheduler model = torch.nn.Linear(num_features, 5) criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters()) runner = SupervisedRunner() callbacks = [ IncreaseCheckerCallback("global_epoch"), IncreaseCheckerCallback("global_batch_step"), IncreaseCheckerCallback("global_sample_step"), ] # first stage runner.train( model=model, criterion=criterion, optimizer=optimizer, loaders=loaders, logdir=logdir, num_epochs=2, verbose=False, callbacks=callbacks, ) # second stage runner.train( model=model, criterion=criterion, optimizer=optimizer, loaders=loaders, logdir=logdir, num_epochs=3, verbose=False, callbacks=callbacks, ) # third stage runner.train( model=model, criterion=criterion, optimizer=optimizer, loaders=loaders, logdir=logdir, num_epochs=4, verbose=False, callbacks=callbacks, ) shutil.rmtree(logdir, ignore_errors=True) # new exp runner = SupervisedRunner() # first stage runner.train( model=model, criterion=criterion, optimizer=optimizer, loaders=loaders, logdir=logdir, num_epochs=2, verbose=False, callbacks=[ IncreaseCheckerCallback("global_epoch"), 
IncreaseCheckerCallback("global_batch_step"), IncreaseCheckerCallback("global_sample_step"), ], ) # second stage runner.train( model=model, criterion=criterion, optimizer=optimizer, loaders=loaders, logdir=logdir, num_epochs=3, verbose=False, callbacks=[ IncreaseCheckerCallback("global_epoch", 2), IncreaseCheckerCallback("global_batch_step", 626), IncreaseCheckerCallback("global_sample_step", 20_000), ], ) shutil.rmtree(logdir, ignore_errors=True)
def test_mnist(self):
    """Smoke-test SupervisedRunner on random MNIST-shaped tensors: the train
    loss must improve between saved epochs and the best loss must be small."""
    utils.set_global_seed(42)

    # Random data in MNIST shape.  The RNG call order below is significant
    # because it follows the fixed global seed.
    x_train = np.random.random((100, 1, 28, 28)).astype(np.float32)
    y_train = _to_categorical(
        np.random.randint(10, size=(100, 1)), num_classes=10
    ).astype(np.float32)
    x_valid = np.random.random((20, 1, 28, 28)).astype(np.float32)
    y_valid = _to_categorical(
        np.random.randint(10, size=(20, 1)), num_classes=10
    ).astype(np.float32)
    tensors = [torch.tensor(a) for a in (x_train, y_train, x_valid, y_valid)]
    x_train, y_train, x_valid, y_valid = tensors

    bs = 32
    num_workers = 4
    data_transform = transforms.ToTensor()  # kept as in original (unused)

    trainset = torch.utils.data.TensorDataset(x_train, y_train)
    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=bs, shuffle=True, num_workers=num_workers)
    validset = torch.utils.data.TensorDataset(x_valid, y_valid)
    validloader = torch.utils.data.DataLoader(
        validset, batch_size=bs, shuffle=False, num_workers=num_workers)
    loaders = collections.OrderedDict(
        [("train", trainloader), ("valid", validloader)])

    # experiment setup
    num_epochs = 3
    logdir = "./logs"

    # model, criterion, optimizer
    model = Net()
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # model training, keeping the three best checkpoints
    runner = SupervisedRunner()
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        logdir=logdir,
        num_epochs=num_epochs,
        verbose=False,
        callbacks=[CheckpointCallback(save_n_best=3)],
    )

    metrics = Safict.load("./logs/checkpoints/_metrics.json")
    # Loss must have decreased from the first to the last saved epoch...
    self.assertTrue(
        metrics.get("train.2", "loss") < metrics.get("train.0", "loss"))
    # ...and the best loss must fall below the fixed threshold.
    self.assertTrue(metrics.get("best", "loss") < 0.35)
drop_last=False) } output_path = './' # model model = GRU_model().to(device).double() # Optimizer optimizer = optim.Adam(model.parameters(), lr=0.001) # Scheduler scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=33) # Loss criterion = nn.MSELoss() runner = SupervisedRunner(device=device, ) runner.train( model=model, criterion=nn.MSELoss(), loaders=loaders, optimizer=optimizer, scheduler=scheduler, num_epochs=30, verbose=True, logdir=output_path, callbacks=[WandbLogger(project="GRU-project", name='train-7(h=512,l=2)')], )
metrics={ "loss_dice": 1, "loss_iou": 1, "loss_bce": 0.8 }, ), # metrics DiceCallback(input_key="mask"), # IouCallback(input_key="mask"), ] model.train() runner.train( model=model, criterion=criterion, optimizer=optimizer, scheduler=scheduler, # our dataloaders loaders=loaders, # We can specify the callbacks list for the experiment; callbacks=callbacks, # path to save logs logdir=logdir, num_epochs=num_epochs, # save our best checkpoint by Dice metric main_metric="dice", minimize_metric=False, fp16=dict(opt_level="O1"), verbose=True, )
# NOTE(review): verifies `load_best_on_end` together with
# PeriodicLoaderCallback when the "metric" values are injected by a custom
# callback (the Metric class writes predefined per-epoch scores into
# runner.loader_metrics on loader end).  With period=3 the valid loader runs
# only on epochs 3, 6 and 9 (injected values 0.15, 0.18, 0.13), and the test
# asserts that train.6 checkpoint files plus best/last artifacts exist.
# NOTE(review): runner.train() passes minimize_valid_metric=False while
# PeriodicLoaderCallback gets minimize=True — confirm which flag wins in this
# catalyst version; the expected best epoch (6, highest validated value)
# suggests maximization.  Several output-count asserts are commented out.
# Preserved byte-for-byte (the source formatting is collapsed); only this
# header comment was added.
def test_loading_best_state_at_end_with_custom_scores(): class Metric(Callback): def __init__(self, values): super().__init__(CallbackOrder.metric) self.values = values def on_loader_end(self, runner: "IRunner") -> None: score = self.values[runner.loader_key][runner.stage_epoch_step] runner.loader_metrics["metric"] = score old_stdout = sys.stdout sys.stdout = str_stdout = StringIO() # experiment_setup logdir = "./logs/periodic_loader" checkpoint = logdir # + "/checkpoints" logfile = checkpoint + "/_metrics.json" # data num_samples, num_features = int(1e4), int(1e1) X = torch.rand(num_samples, num_features) y = torch.randint(0, 5, size=[num_samples]) dataset = TensorDataset(X, y) loader = DataLoader(dataset, batch_size=32, num_workers=1) loaders = { "train": loader, "valid": loader, } # model, criterion, optimizer, scheduler model = torch.nn.Linear(num_features, 5) criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters()) runner = SupervisedRunner() n_epochs = 10 period = 3 metrics = { "train": {i: i * 0.1 for i in range(1, 11)}, "valid": { i: v for i, v in enumerate( [0.05, 0.1, 0.15, 0.15, 0.2, 0.18, 0.22, 0.11, 0.13, 0.12], 1) }, } # first stage runner.train( model=model, criterion=criterion, optimizer=optimizer, loaders=loaders, logdir=logdir, num_epochs=n_epochs, verbose=False, valid_loader="valid", valid_metric="metric", minimize_valid_metric=False, callbacks=[ PeriodicLoaderCallback(valid_loader_key="valid", valid_metric_key="metric", minimize=True, valid=period), CheckRunCallback(num_epoch_steps=n_epochs), Metric(metrics), ], load_best_on_end=True, ) sys.stdout = old_stdout exp_output = str_stdout.getvalue() # assert len(re.findall(r"\(train\)", exp_output)) == n_epochs # assert len(re.findall(r"\(valid\)", exp_output)) == (n_epochs // period) # assert len(re.findall(r"\(global epoch 6, epoch 6, stage train\)", exp_output)) == 1 # assert len(re.findall(r".*/train\.\d\.pth", exp_output)) == 1 assert os.path.isfile(logfile) 
assert os.path.isfile(checkpoint + "/train.6.pth") assert os.path.isfile(checkpoint + "/train.6_full.pth") assert os.path.isfile(checkpoint + "/best.pth") assert os.path.isfile(checkpoint + "/best_full.pth") assert os.path.isfile(checkpoint + "/last.pth") assert os.path.isfile(checkpoint + "/last_full.pth") shutil.rmtree(logdir, ignore_errors=True)
def main():
    """CLI entry point: train a multi-head damage-segmentation model on
    pseudolabeled data, then strip the best checkpoint for inference.

    Builds the dataset (train split + pseudolabeled extra data), assembles
    one criterion callback per requested loss head (full-resolution mask,
    strided masks 4/8/16/32, disaster type, damage type, embedding),
    aggregates them into a single "loss", and runs Catalyst training.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-acc", "--accumulation-steps", type=int, default=1, help="Number of batches to process")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("--fast", action="store_true")
    parser.add_argument("-dd", "--data-dir", type=str, required=True, help="Data directory for INRIA sattelite dataset")
    parser.add_argument("-m", "--model", type=str, default="resnet34_fpncat128", help="")
    parser.add_argument("-b", "--batch-size", type=int, default=8, help="Batch Size during training, e.g. -b 64")
    parser.add_argument("-e", "--epochs", type=int, default=100, help="Epoch to run")
    # parser.add_argument('-es', '--early-stopping', type=int, default=None, help='Maximum number of epochs without improvement')
    # parser.add_argument('-fe', '--freeze-encoder', type=int, default=0, help='Freeze encoder parameters for N epochs')
    # parser.add_argument('-ft', '--fine-tune', action='store_true')
    parser.add_argument("-lr", "--learning-rate", type=float, default=1e-3, help="Initial learning rate")
    parser.add_argument(
        "--disaster-type-loss",
        type=str,
        default=None,  # [["ce", 1.0]],
        action="append",
        nargs="+",
        help="Criterion for classifying disaster type",
    )
    parser.add_argument(
        "--damage-type-loss",
        type=str,
        default=None,  # [["bce", 1.0]],
        action="append",
        nargs="+",
        help="Criterion for classifying presence of building with particular damage type",
    )
    parser.add_argument("-l", "--criterion", type=str, default=None, action="append", nargs="+", help="Criterion")
    parser.add_argument("--mask4", type=str, default=None, action="append", nargs="+", help="Criterion for mask with stride 4")
    parser.add_argument("--mask8", type=str, default=None, action="append", nargs="+", help="Criterion for mask with stride 8")
    parser.add_argument("--mask16", type=str, default=None, action="append", nargs="+", help="Criterion for mask with stride 16")
    parser.add_argument("--mask32", type=str, default=None, action="append", nargs="+", help="Criterion for mask with stride 32")
    parser.add_argument("--embedding", type=str, default=None)
    parser.add_argument("-o", "--optimizer", default="RAdam", help="Name of the optimizer")
    parser.add_argument(
        "-c", "--checkpoint", type=str, default=None, help="Checkpoint filename to use as initial model weights")
    parser.add_argument("-w", "--workers", default=8, type=int, help="Num workers")
    parser.add_argument("-a", "--augmentations", default="safe", type=str, help="Level of image augmentations")
    parser.add_argument("--transfer", default=None, type=str, help="")
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--size", default=512, type=int)
    parser.add_argument("--fold", default=0, type=int)
    parser.add_argument("-s", "--scheduler", default="multistep", type=str, help="")
    parser.add_argument("-x", "--experiment", default=None, type=str, help="")
    parser.add_argument("-d", "--dropout", default=0.0, type=float, help="Dropout before head layer")
    parser.add_argument("-pl", "--pseudolabeling", type=str, required=True)
    parser.add_argument("-wd", "--weight-decay", default=0, type=float, help="L2 weight decay")
    parser.add_argument("--show", action="store_true")
    parser.add_argument("--dsv", action="store_true")
    parser.add_argument("--balance", action="store_true")
    parser.add_argument("--only-buildings", action="store_true")
    parser.add_argument("--freeze-bn", action="store_true")
    parser.add_argument("--crops", action="store_true", help="Train on random crops")
    parser.add_argument("--post-transform", action="store_true")

    args = parser.parse_args()
    set_manual_seed(args.seed)

    # Unpack CLI arguments into locals for readability below.
    data_dir = args.data_dir
    num_workers = args.workers
    num_epochs = args.epochs
    learning_rate = args.learning_rate
    model_name = args.model
    optimizer_name = args.optimizer
    image_size = args.size, args.size
    fast = args.fast
    augmentations = args.augmentations
    fp16 = args.fp16
    scheduler_name = args.scheduler
    experiment = args.experiment
    dropout = args.dropout
    segmentation_losses = args.criterion
    verbose = args.verbose
    show = args.show
    accumulation_steps = args.accumulation_steps
    weight_decay = args.weight_decay
    fold = args.fold
    balance = args.balance
    only_buildings = args.only_buildings
    freeze_bn = args.freeze_bn
    train_on_crops = args.crops
    enable_post_image_transform = args.post_transform
    disaster_type_loss = args.disaster_type_loss
    train_batch_size = args.batch_size
    embedding_criterion = args.embedding
    damage_type_loss = args.damage_type_loss
    pseudolabels_dir = args.pseudolabeling

    # Compute batch size for validaion
    # (validation uses full-size images, so scale the batch down from the
    # crop-sized training batch by the pixel ratio vs a 1024x1024 tile).
    if train_on_crops:
        valid_batch_size = max(1, (train_batch_size * (image_size[0] * image_size[1])) // (1024**2))
    else:
        valid_batch_size = train_batch_size

    run_train = num_epochs > 0

    model: nn.Module = get_model(model_name, dropout=dropout).cuda()

    if args.transfer:
        # Warm-start from a partially compatible checkpoint (name-matched).
        transfer_checkpoint = fs.auto_file(args.transfer)
        print("Transfering weights from model checkpoint", transfer_checkpoint)
        checkpoint = load_checkpoint(transfer_checkpoint)
        pretrained_dict = checkpoint["model_state_dict"]
        transfer_weights(model, pretrained_dict)

    if args.checkpoint:
        # Exact resume of model weights.
        checkpoint = load_checkpoint(fs.auto_file(args.checkpoint))
        unpack_checkpoint(checkpoint, model=model)
        print("Loaded model weights from:", args.checkpoint)
        report_checkpoint(checkpoint)

    if freeze_bn:
        torch_utils.freeze_bn(model)
        print("Freezing bn params")

    runner = SupervisedRunner(input_key=INPUT_IMAGE_KEY, output_key=None)
    main_metric = "weighted_f1"
    cmd_args = vars(args)

    current_time = datetime.now().strftime("%b%d_%H_%M")
    checkpoint_prefix = f"{current_time}_{args.model}_{args.size}_fold{fold}"
    if fp16:
        checkpoint_prefix += "_fp16"
    if fast:
        checkpoint_prefix += "_fast"
    if pseudolabels_dir:
        checkpoint_prefix += "_pseudo"
    if train_on_crops:
        checkpoint_prefix += "_crops"
    if experiment is not None:
        # Explicit experiment name overrides the generated prefix entirely.
        checkpoint_prefix = experiment

    log_dir = os.path.join("runs", checkpoint_prefix)
    # exist_ok=False: fail loudly instead of clobbering a previous run.
    os.makedirs(log_dir, exist_ok=False)

    # Persist the full CLI configuration next to the run logs.
    config_fname = os.path.join(log_dir, f"{checkpoint_prefix}.json")
    with open(config_fname, "w") as f:
        train_session_args = vars(args)
        f.write(json.dumps(train_session_args, indent=2))

    default_callbacks = [
        CompetitionMetricCallback(input_key=INPUT_MASK_KEY, output_key=OUTPUT_MASK_KEY, prefix="weighted_f1"),
        ConfusionMatrixCallback(
            input_key=INPUT_MASK_KEY,
            output_key=OUTPUT_MASK_KEY,
            class_names=["land", "no_damage", "minor_damage", "major_damage", "destroyed"],
            ignore_index=UNLABELED_SAMPLE,
        ),
    ]

    if show:
        default_callbacks += [
            ShowPolarBatchesCallback(draw_predictions, metric=main_metric + "_batch", minimize=False)
        ]

    train_ds, valid_ds, train_sampler = get_datasets(
        data_dir=data_dir,
        image_size=image_size,
        augmentation=augmentations,
        fast=fast,
        fold=fold,
        balance=balance,
        only_buildings=only_buildings,
        train_on_crops=train_on_crops,
        crops_multiplication_factor=1,
        enable_post_image_transform=enable_post_image_transform,
    )

    if run_train:
        loaders = collections.OrderedDict()
        callbacks = default_callbacks.copy()
        criterions_dict = {}
        losses = []

        # Extend the training set with pseudolabeled samples.
        unlabeled_train = get_pseudolabeling_dataset(
            data_dir,
            include_masks=True,
            image_size=image_size,
            augmentation="medium_nmd",
            train_on_crops=train_on_crops,
            enable_post_image_transform=enable_post_image_transform,
            pseudolabels_dir=pseudolabels_dir,
        )
        train_ds = train_ds + unlabeled_train
        print("Using online pseudolabeling with ", len(unlabeled_train), "samples")

        loaders["train"] = DataLoader(
            train_ds,
            batch_size=train_batch_size,
            num_workers=num_workers,
            pin_memory=True,
            drop_last=True,
            shuffle=True,
        )
        loaders["valid"] = DataLoader(valid_ds, batch_size=valid_batch_size, num_workers=num_workers, pin_memory=True)

        # Create losses
        # NOTE(review): the loop variable `criterion` is rebound to the
        # callback returned by get_criterion_callback inside each loop body
        # (shadowing the CLI spec) — intentional but easy to misread.
        for criterion in segmentation_losses:
            # Each CLI loss spec is either [name, weight] or [name].
            if isinstance(criterion, (list, tuple)) and len(criterion) == 2:
                loss_name, loss_weight = criterion
            else:
                loss_name, loss_weight = criterion[0], 1.0
            cd, criterion, criterion_name = get_criterion_callback(
                loss_name,
                prefix="segmentation",
                input_key=INPUT_MASK_KEY,
                output_key=OUTPUT_MASK_KEY,
                loss_weight=float(loss_weight),
            )
            criterions_dict.update(cd)
            callbacks.append(criterion)
            losses.append(criterion_name)
            print(INPUT_MASK_KEY, "Using loss", loss_name, loss_weight)

        # Optional deep-supervision losses on strided masks (4/8/16/32).
        if args.mask4 is not None:
            for criterion in args.mask4:
                if isinstance(criterion, (list, tuple)):
                    loss_name, loss_weight = criterion
                else:
                    loss_name, loss_weight = criterion, 1.0
                cd, criterion, criterion_name = get_criterion_callback(
                    loss_name,
                    prefix="mask4",
                    input_key=INPUT_MASK_KEY,
                    output_key=OUTPUT_MASK_4_KEY,
                    loss_weight=float(loss_weight),
                )
                criterions_dict.update(cd)
                callbacks.append(criterion)
                losses.append(criterion_name)
                print(OUTPUT_MASK_4_KEY, "Using loss", loss_name, loss_weight)

        if args.mask8 is not None:
            for criterion in args.mask8:
                if isinstance(criterion, (list, tuple)):
                    loss_name, loss_weight = criterion
                else:
                    loss_name, loss_weight = criterion, 1.0
                cd, criterion, criterion_name = get_criterion_callback(
                    loss_name,
                    prefix="mask8",
                    input_key=INPUT_MASK_KEY,
                    output_key=OUTPUT_MASK_8_KEY,
                    loss_weight=float(loss_weight),
                )
                criterions_dict.update(cd)
                callbacks.append(criterion)
                losses.append(criterion_name)
                print(OUTPUT_MASK_8_KEY, "Using loss", loss_name, loss_weight)

        if args.mask16 is not None:
            for criterion in args.mask16:
                if isinstance(criterion, (list, tuple)):
                    loss_name, loss_weight = criterion
                else:
                    loss_name, loss_weight = criterion, 1.0
                cd, criterion, criterion_name = get_criterion_callback(
                    loss_name,
                    prefix="mask16",
                    input_key=INPUT_MASK_KEY,
                    output_key=OUTPUT_MASK_16_KEY,
                    loss_weight=float(loss_weight),
                )
                criterions_dict.update(cd)
                callbacks.append(criterion)
                losses.append(criterion_name)
                print(OUTPUT_MASK_16_KEY, "Using loss", loss_name, loss_weight)

        if args.mask32 is not None:
            for criterion in args.mask32:
                if isinstance(criterion, (list, tuple)):
                    loss_name, loss_weight = criterion
                else:
                    loss_name, loss_weight = criterion, 1.0
                cd, criterion, criterion_name = get_criterion_callback(
                    loss_name,
                    prefix="mask32",
                    input_key=INPUT_MASK_KEY,
                    output_key=OUTPUT_MASK_32_KEY,
                    loss_weight=float(loss_weight),
                )
                criterions_dict.update(cd)
                callbacks.append(criterion)
                losses.append(criterion_name)
                print(OUTPUT_MASK_32_KEY, "Using loss", loss_name, loss_weight)

        # Auxiliary classification head: disaster type.
        if disaster_type_loss is not None:
            callbacks += [
                ConfusionMatrixCallback(
                    input_key=DISASTER_TYPE_KEY,
                    output_key=DISASTER_TYPE_KEY,
                    class_names=DISASTER_TYPES,
                    ignore_index=UNKNOWN_DISASTER_TYPE_CLASS,
                    prefix=f"{DISASTER_TYPE_KEY}/confusion_matrix",
                ),
                AccuracyCallback(
                    input_key=DISASTER_TYPE_KEY,
                    output_key=DISASTER_TYPE_KEY,
                    prefix=f"{DISASTER_TYPE_KEY}/accuracy",
                    activation="Softmax",
                ),
            ]
            for criterion in disaster_type_loss:
                if isinstance(criterion, (list, tuple)):
                    loss_name, loss_weight = criterion
                else:
                    loss_name, loss_weight = criterion, 1.0
                cd, criterion, criterion_name = get_criterion_callback(
                    loss_name,
                    prefix=DISASTER_TYPE_KEY,
                    input_key=DISASTER_TYPE_KEY,
                    output_key=DISASTER_TYPE_KEY,
                    loss_weight=float(loss_weight),
                    ignore_index=UNKNOWN_DISASTER_TYPE_CLASS,
                )
                criterions_dict.update(cd)
                callbacks.append(criterion)
                losses.append(criterion_name)
                print(DISASTER_TYPE_KEY, "Using loss", loss_name, loss_weight)

        # Auxiliary multilabel head: presence of each damage type.
        if damage_type_loss is not None:
            callbacks += [
                # MultilabelConfusionMatrixCallback(
                #     input_key=DAMAGE_TYPE_KEY,
                #     output_key=DAMAGE_TYPE_KEY,
                #     class_names=DAMAGE_TYPES,
                #     prefix=f"{DAMAGE_TYPE_KEY}/confusion_matrix",
                # ),
                AccuracyCallback(
                    input_key=DAMAGE_TYPE_KEY,
                    output_key=DAMAGE_TYPE_KEY,
                    prefix=f"{DAMAGE_TYPE_KEY}/accuracy",
                    activation="Sigmoid",
                    threshold=0.5,
                )
            ]
            for criterion in damage_type_loss:
                if isinstance(criterion, (list, tuple)):
                    loss_name, loss_weight = criterion
                else:
                    loss_name, loss_weight = criterion, 1.0
                cd, criterion, criterion_name = get_criterion_callback(
                    loss_name,
                    prefix=DAMAGE_TYPE_KEY,
                    input_key=DAMAGE_TYPE_KEY,
                    output_key=DAMAGE_TYPE_KEY,
                    loss_weight=float(loss_weight),
                )
                criterions_dict.update(cd)
                callbacks.append(criterion)
                losses.append(criterion_name)
                print(DAMAGE_TYPE_KEY, "Using loss", loss_name, loss_weight)

        # Optional embedding loss head.
        if embedding_criterion is not None:
            cd, criterion, criterion_name = get_criterion_callback(
                embedding_criterion,
                prefix="embedding",
                input_key=INPUT_MASK_KEY,
                output_key=OUTPUT_EMBEDDING_KEY,
                loss_weight=1.0,
            )
            criterions_dict.update(cd)
            callbacks.append(criterion)
            losses.append(criterion_name)
            print(OUTPUT_EMBEDDING_KEY, "Using loss", embedding_criterion)

        # Sum all per-head losses into one "loss" and step the optimizer.
        callbacks += [
            CriterionAggregatorCallback(prefix="loss", loss_keys=losses),
            OptimizerCallback(accumulation_steps=accumulation_steps, decouple_weight_decay=False),
        ]

        optimizer = get_optimizer(optimizer_name, get_optimizable_parameters(model), learning_rate, weight_decay=weight_decay)
        scheduler = get_scheduler(scheduler_name, optimizer, lr=learning_rate, num_epochs=num_epochs, batches_in_epoch=len(loaders["train"]))
        if isinstance(scheduler, CyclicLR):
            # Cyclic schedulers must step per batch, not per epoch.
            callbacks += [SchedulerCallback(mode="batch")]

        print("Train session    :", checkpoint_prefix)
        print("  FP16 mode      :", fp16)
        print("  Fast mode      :", args.fast)
        print("  Epochs         :", num_epochs)
        print("  Workers        :", num_workers)
        print("  Data dir       :", data_dir)
        print("  Log dir        :", log_dir)
        print("Data             ")
        print("  Augmentations  :", augmentations)
        print("  Train size     :", len(loaders["train"]), len(train_ds))
        print("  Valid size     :", len(loaders["valid"]), len(valid_ds))
        print("  Image size     :", image_size)
        print("  Train on crops :", train_on_crops)
        print("  Balance        :", balance)
        print("  Buildings only :", only_buildings)
        print("  Post transform :", enable_post_image_transform)
        print("  Pseudolabels   :", pseudolabels_dir)
        print("Model            :", model_name)
        print("  Parameters     :", count_parameters(model))
        print("  Dropout        :", dropout)
        print("Optimizer        :", optimizer_name)
        print("  Learning rate  :", learning_rate)
        print("  Weight decay   :", weight_decay)
        print("  Scheduler      :", scheduler_name)
        print("  Batch sizes    :", train_batch_size, valid_batch_size)
        print("  Criterion      :", segmentation_losses)
        print("  Damage type    :", damage_type_loss)
        print("  Disaster type  :", disaster_type_loss)
        print("  Embedding      :", embedding_criterion)

        # model training
        runner.train(
            fp16=fp16,
            model=model,
            criterion=criterions_dict,
            optimizer=optimizer,
            scheduler=scheduler,
            callbacks=callbacks,
            loaders=loaders,
            logdir=os.path.join(log_dir, "opl"),
            num_epochs=num_epochs,
            verbose=verbose,
            main_metric=main_metric,
            minimize_metric=False,
            checkpoint_data={"cmd_args": cmd_args},
        )

        # Training is finished. Let's run predictions using best checkpoint weights
        # NOTE(review): training logs to <log_dir>/opl but the best checkpoint
        # is read from <log_dir>/main/checkpoints — confirm these paths are
        # meant to differ; "opl" runs would leave "main" missing.
        best_checkpoint = os.path.join(log_dir, "main", "checkpoints", "best.pth")
        model_checkpoint = os.path.join(log_dir, "main", "checkpoints", f"{checkpoint_prefix}.pth")
        clean_checkpoint(best_checkpoint, model_checkpoint)

        del optimizer, loaders
def test_multiple_best_checkpoints():
    """Verify that ``CheckpointCallback(save_n_best=3)`` keeps the three best
    checkpoints (train.8/10/12) plus best/last when the valid loader only
    runs every 2nd epoch via ``PeriodicLoaderCallback``."""
    saved_stdout = sys.stdout
    sys.stdout = captured = StringIO()

    # experiment setup
    logdir = "./logs/periodic_loader"
    checkpoint = logdir  # + "/checkpoints"
    logfile = checkpoint + "/_metrics.json"

    # synthetic 5-class classification data
    num_samples, num_features = int(1e4), int(1e1)
    features = torch.rand(num_samples, num_features)
    targets = torch.randint(0, 5, size=[num_samples])
    loader = DataLoader(TensorDataset(features, targets), batch_size=32, num_workers=1)
    loaders = {"train": loader, "valid": loader}

    # model, criterion, optimizer
    model = torch.nn.Linear(num_features, 5)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())
    runner = SupervisedRunner()

    n_epochs, period = 12, 2

    # first stage
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        logdir=logdir,
        num_epochs=n_epochs,
        verbose=False,
        valid_loader="valid",
        valid_metric="loss",
        minimize_valid_metric=True,
        callbacks=[
            PeriodicLoaderCallback(
                valid_loader_key="valid",
                valid_metric_key="loss",
                minimize=True,
                valid=period,
            ),
            CheckRunCallback(num_epoch_steps=n_epochs),
            CheckpointCallback(
                logdir=logdir,
                loader_key="valid",
                metric_key="loss",
                minimize=True,
                save_n_best=3,
            ),
        ],
    )

    sys.stdout = saved_stdout
    exp_output = captured.getvalue()
    # assert len(re.findall(r"\(train\)", exp_output)) == n_epochs
    # assert len(re.findall(r"\(valid\)", exp_output)) == (n_epochs // period)
    # assert len(re.findall(r".*/train\.\d{1,2}\.pth", exp_output)) == 3

    assert os.path.isfile(logfile)
    expected_suffixes = (
        "/train.8.pth",
        "/train.8_full.pth",
        "/train.10.pth",
        "/train.10_full.pth",
        "/train.12.pth",
        "/train.12_full.pth",
        "/best.pth",
        "/best_full.pth",
        "/last.pth",
        "/last_full.pth",
    )
    for suffix in expected_suffixes:
        assert os.path.isfile(checkpoint + suffix)
    shutil.rmtree(logdir, ignore_errors=True)
class TorchBackend(Estimator):
    """Estimator backend that trains/predicts PyTorch models via Catalyst.

    Wraps a ``SupervisedRunner`` and adds device placement, optional DDP,
    early stopping, metric collection, and save/load of model + config.
    """

    def __init__(self, config: EstimatorConfig, model):
        super().__init__(config)
        self.runner = SupervisedRunner()
        self.model_metrics = dict()  # filled by torch_train / read by metrics()
        self.model = model
        self.ddp = False  # may be overridden via config kwargs or load()
        self.set_device()

    def torch_train(self, loaders, model, optimizer, loss_func, scheduler, config):
        """Run Catalyst training and collect per-epoch metrics.

        The first loader key is used for early stopping / scheduler stepping;
        a warning is emitted if it is not 'train'. Returns the trained model.
        """
        self.config = config
        self.model = model
        self.optimizer = optimizer
        self.loss_func = loss_func
        self.scheduler = scheduler
        self.loader_key = list(loaders)[0]
        self.metric_key = 'loss'
        self.import_from_config()
        if 'cuda' in str(self.device):
            # Move optimizer state tensors onto the training device.
            self.optimizer_to(optimizer, self.device)
        # checks if logdir exists - deletes it if yes
        self.check_logdir()
        if self.loader_key != 'train':
            warnings.warn(
                "WARNING: loader to be used for early-stop callback is '%s'. You can define it manually in /lib/estimator/pytorch_estimator.torch_train"
                % (self.loader_key))
        model = self.model
        torch.cuda.empty_cache()
        if self.ddp:
            # Let Catalyst pick its own distributed engine.
            self.engine = None
        else:
            self.engine = DeviceEngine(self.device)
        self.print_info()
        self.runner.train(
            model=model,
            criterion=self.loss_func,
            optimizer=self.optimizer,
            scheduler=self.scheduler,
            loaders=loaders,
            logdir=self.config.logdir,
            num_epochs=self.config.n_epochs,
            callbacks=[
                EarlyStoppingCallback(patience=self.config.patience,
                                      min_delta=self.config.min_delta,
                                      loader_key=self.loader_key,
                                      metric_key=self.metric_key,
                                      minimize=True),
                SchedulerCallback(
                    loader_key=self.loader_key,
                    metric_key=self.metric_key,
                ),
                SkipCheckpointCallback(logdir=self.config.logdir),
            ],
            verbose=False,
            check=False,
            engine=self.engine,
            ddp=self.ddp,
        )
        self.config.parameters['model - device'] = str(self.runner.device)
        self.model_metrics['final epoch'] = self.runner.stage_epoch_step
        for key, value in self.runner.epoch_metrics.items():
            self.model_metrics[key] = value
        # Dump a human-readable summary of the trained components.
        with open('model_details.txt', 'w') as file:
            file.write('%s\n\n%s\n\n%s' % (str(self.runner.model), str(
                self.runner.optimizer), str(self.runner.scheduler)))
        return model

    def predict(self, inputs, config):
        """Run inference on `inputs` and return a numpy array of outputs."""
        self.model.eval()
        # overwrite device and ddp setting if provided upon loading the model,
        # otherwise device will be determined by availability and ddp=False
        if 'device' in config.kwargs:
            self.device = config.kwargs['device']
        if 'ddp' in config.kwargs:
            self.ddp = config.kwargs['ddp']
        self.print_info()
        if str(self.device) == 'cpu':
            data = torch.as_tensor(inputs)
        else:
            if not next(self.model.parameters()).is_cuda:
                self.model.to(self.device)
            # Place input on the same GPU the model parameters live on.
            cuda_id = next(self.model.parameters()).get_device()
            data = torch.as_tensor(inputs).cuda(cuda_id)
        return self.model(data).cpu().data.numpy()

    def metrics(self) -> Dict[str, float]:
        """Return metrics collected during the last training run."""
        return self.model_metrics

    def set_device(self):
        # Default to the first GPU when available, else CPU.
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else "cpu")
        return self.device

    def print_info(self):
        """Print a short banner describing the device/DDP configuration."""
        if self.ddp and torch.cuda.is_available():
            device_to_print = 'parallel cuda'
        else:
            device_to_print = self.device
        return print('''
        ====== run info ======
        Device used: {device}
        DDP: {ddp}
        ======================
        '''.format(device=device_to_print, ddp=self.ddp))

    def to_device(self, var):
        # Move a tensor to the configured device (no-op on CPU).
        if str(self.device) == 'cpu':
            return var
        else:
            return var.cuda()

    def tensor_to_device(self):
        # Return the float tensor type matching the configured device.
        if str(self.device) == 'cpu':
            return torch.FloatTensor
        else:
            return torch.cuda.FloatTensor

    def import_from_config(self):
        # Copy any extra config kwargs onto the instance (e.g. device, ddp).
        if self.config.kwargs:
            for key, value in self.config.kwargs.items():
                setattr(self, key, value)
        return

    def optimizer_to(self, optim, device):
        """Move all tensors held in the optimizer state onto `device`."""
        for param in optim.state.values():
            # Not sure there are any global tensors in the state dict
            if isinstance(param, torch.Tensor):
                param.data = param.data.to(device)
                if param._grad is not None:
                    param._grad.data = param._grad.data.to(device)
            elif isinstance(param, dict):
                for subparam in param.values():
                    if isinstance(subparam, torch.Tensor):
                        subparam.data = subparam.data.to(device)
                        if subparam._grad is not None:
                            subparam._grad.data = subparam._grad.data.to(
                                device)

    def save(self, path):
        """Save model/optimizer state plus the estimator config under `path`."""
        model_save_path = "{path}/model.pt".format(path=path)
        params_save_path = "{path}/params.json".format(path=path)
        torch.save(
            {
                'epoch': self.runner.stage_epoch_step,
                'model_state_dict': self.runner.model.state_dict(),
                'optimizer_state_dict': self.runner.optimizer.state_dict(),
                'loss': self.runner.epoch_metrics['train']['loss'],
            }, model_save_path)
        self.config.save(params_save_path)

    @classmethod
    def load(cls, path: str, estimator=None, load_saved_config=False):
        """Restore weights (and optionally config) into `estimator` from `path`."""
        model_save_path = "{path}/model.pt".format(path=path)
        params_save_path = "{path}/params.json".format(path=path)
        cfg = cls.load_config(params_save_path)
        if load_saved_config == True:
            print(
                """All config parameters will be loaded from saved params.json (anything provided in model config upon loading will be ignored)"""
            )
            for key, value in cfg.items():
                setattr(estimator.config, key, value)
        checkpoint = torch.load(model_save_path, map_location='cpu')
        estimator.model.load_state_dict(checkpoint['model_state_dict'])
        estimator.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        loss = checkpoint['loss']
        print("""
        ==== Loaded Model ====
        final Epoch: {epoch}
        final Loss: {loss}
        ======================
        """.format(epoch=epoch, loss='%.4e' % loss))
        return estimator

    @classmethod
    def load_config(cls, path: str):
        # Read saved params.json, dropping the runtime 'parameters' section.
        with open(path, 'r') as f:
            cfg = json.load(f)
            del cfg['parameters']
        return cfg

    def check_logdir(self):
        # checks if logdir exists - deletes if yes
        if os.path.exists(self.config.logdir):
            shutil.rmtree(self.config.logdir)
def main():
    """CLI entry point: (optionally distributed) training of an INRIA
    building-segmentation model, with optional online pseudolabeling and
    extra xView2 data; afterwards the master rank strips the best checkpoint
    and renders a sample prediction.
    """
    parser = argparse.ArgumentParser()
    ###########################################################################
    # Distributed-training related stuff
    parser.add_argument("--local_rank", type=int, default=0)
    ###########################################################################
    parser.add_argument("-acc", "--accumulation-steps", type=int, default=1, help="Number of batches to process")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("--fast", action="store_true")
    parser.add_argument(
        "-dd",
        "--data-dir",
        type=str,
        help="Data directory for INRIA sattelite dataset",
        default=os.environ.get("INRIA_DATA_DIR"),
    )
    parser.add_argument(
        "-dd-xview2", "--data-dir-xview2", type=str, required=False, help="Data directory for external xView2 dataset"
    )
    parser.add_argument("-m", "--model", type=str, default="b6_unet32_s2", help="")
    parser.add_argument("-b", "--batch-size", type=int, default=8, help="Batch Size during training, e.g. -b 64")
    parser.add_argument("-e", "--epochs", type=int, default=100, help="Epoch to run")
    # parser.add_argument('-es', '--early-stopping', type=int, default=None, help='Maximum number of epochs without improvement')
    # parser.add_argument('-fe', '--freeze-encoder', type=int, default=0, help='Freeze encoder parameters for N epochs')
    # parser.add_argument('-ft', '--fine-tune', action='store_true')
    parser.add_argument("-lr", "--learning-rate", type=float, default=1e-3, help="Initial learning rate")
    parser.add_argument("-l", "--criterion", type=str, required=True, action="append", nargs="+", help="Criterion")
    parser.add_argument(
        "-l2",
        "--criterion2",
        type=str,
        required=False,
        action="append",
        nargs="+",
        help="Criterion for stride 2 mask",
    )
    parser.add_argument(
        "-l4",
        "--criterion4",
        type=str,
        required=False,
        action="append",
        nargs="+",
        help="Criterion for stride 4 mask",
    )
    parser.add_argument(
        "-l8",
        "--criterion8",
        type=str,
        required=False,
        action="append",
        nargs="+",
        help="Criterion for stride 8 mask",
    )
    parser.add_argument(
        "-l16",
        "--criterion16",
        type=str,
        required=False,
        action="append",
        nargs="+",
        help="Criterion for stride 16 mask",
    )
    parser.add_argument("-o", "--optimizer", default="RAdam", help="Name of the optimizer")
    parser.add_argument(
        "-c", "--checkpoint", type=str, default=None, help="Checkpoint filename to use as initial model weights"
    )
    parser.add_argument("-w", "--workers", default=8, type=int, help="Num workers")
    parser.add_argument("-a", "--augmentations", default="hard", type=str, help="")
    parser.add_argument("-tm", "--train-mode", default="random", type=str, help="")
    parser.add_argument("--run-mode", default="fit_predict", type=str, help="")
    parser.add_argument("--transfer", default=None, type=str, help="")
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--size", default=512, type=int)
    parser.add_argument("-s", "--scheduler", default="multistep", type=str, help="")
    parser.add_argument("-x", "--experiment", default=None, type=str, help="")
    parser.add_argument("-d", "--dropout", default=None, type=float, help="Dropout before head layer")
    parser.add_argument("--opl", action="store_true")
    parser.add_argument(
        "--warmup", default=0, type=int, help="Number of warmup epochs with reduced LR on encoder parameters"
    )
    parser.add_argument("-wd", "--weight-decay", default=0, type=float, help="L2 weight decay")
    parser.add_argument("--show", action="store_true")
    parser.add_argument("--dsv", action="store_true")

    args = parser.parse_args()

    args.is_master = args.local_rank == 0
    args.distributed = False
    fp16 = args.fp16

    # torch.distributed.launch sets WORLD_SIZE; initialize NCCL process group.
    if "WORLD_SIZE" in os.environ:
        args.distributed = int(os.environ["WORLD_SIZE"]) > 1
        args.world_size = int(os.environ["WORLD_SIZE"])
        # args.world_size = torch.distributed.get_world_size()

        print("Initializing init_process_group", args.local_rank)

        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl")

        print("Initialized init_process_group", args.local_rank)

    # Bitwise-or on bools: master rank, or any rank in non-distributed mode.
    is_master = args.is_master | (not args.distributed)

    # Build the `fp16`/engine configuration Catalyst expects.
    if args.distributed:
        distributed_params = {"rank": args.local_rank, "syncbn": True}
        if args.fp16:
            distributed_params["amp"] = True
    else:
        if args.fp16:
            distributed_params = {}
            distributed_params["amp"] = True
        else:
            distributed_params = False

    # Per-rank seeds so ranks do not draw identical augmentations.
    set_manual_seed(args.seed + args.local_rank)
    catalyst.utils.set_global_seed(args.seed + args.local_rank)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True

    data_dir = args.data_dir
    if data_dir is None:
        raise ValueError("--data-dir must be set")

    num_workers = args.workers
    num_epochs = args.epochs
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    model_name = args.model
    optimizer_name = args.optimizer
    image_size = args.size, args.size
    fast = args.fast
    augmentations = args.augmentations
    train_mode = args.train_mode
    scheduler_name = args.scheduler
    experiment = args.experiment
    dropout = args.dropout
    online_pseudolabeling = args.opl
    criterions = args.criterion
    criterions2 = args.criterion2
    criterions4 = args.criterion4
    criterions8 = args.criterion8
    criterions16 = args.criterion16

    verbose = args.verbose
    show = args.show
    accumulation_steps = args.accumulation_steps
    weight_decay = args.weight_decay
    extra_data_xview2 = args.data_dir_xview2

    run_train = num_epochs > 0
    # Weighted BCE needs per-pixel weight masks in the dataset.
    need_weight_mask = any(c[0] == "wbce" for c in criterions)

    custom_model_kwargs = {"full_size_mask": False}
    if dropout is not None:
        custom_model_kwargs["dropout"] = float(dropout)

    if any([criterions2, criterions4, criterions8, criterions16]):
        custom_model_kwargs["need_supervision_masks"] = True
        print("Enabling supervision masks")

    model: nn.Module = get_model(model_name, num_classes=16, **custom_model_kwargs).cuda()

    if args.transfer:
        # Warm-start from a partially compatible checkpoint (name-matched).
        transfer_checkpoint = fs.auto_file(args.transfer)
        print("Transfering weights from model checkpoint", transfer_checkpoint)
        checkpoint = load_checkpoint(transfer_checkpoint)
        pretrained_dict = checkpoint["model_state_dict"]
        transfer_weights(model, pretrained_dict)

    if args.checkpoint:
        # Exact resume of model weights.
        checkpoint = load_checkpoint(fs.auto_file(args.checkpoint))
        unpack_checkpoint(checkpoint, model=model)
        print("Loaded model weights from:", args.checkpoint)
        report_checkpoint(checkpoint)

    main_metric = "jaccard"

    current_time = datetime.now().strftime("%y%m%d_%H_%M")
    checkpoint_prefix = f"{current_time}_{args.model}"

    if fp16:
        checkpoint_prefix += "_fp16"
    if fast:
        checkpoint_prefix += "_fast"
    if online_pseudolabeling:
        checkpoint_prefix += "_opl"
    if extra_data_xview2:
        checkpoint_prefix += "_with_xview2"
    if experiment is not None:
        # Explicit experiment name overrides the generated prefix entirely.
        checkpoint_prefix = experiment

    default_callbacks = [
        JaccardMetricPerImage(
            input_key=INPUT_MASK_KEY,
            output_key=OUTPUT_MASK_KEY,
            prefix="jaccard",
            inputs_to_labels=depth2mask,
            outputs_to_labels=decode_depth_mask,
        ),
    ]

    if is_master:
        # Only the master rank writes checkpoints and hyperparameter logs.
        default_callbacks += [
            BestMetricCheckpointCallback(target_metric="jaccard", target_metric_minimize=False),
            HyperParametersCallback(
                hparam_dict={
                    "model": model_name,
                    "scheduler": scheduler_name,
                    "optimizer": optimizer_name,
                    "augmentations": augmentations,
                    "size": args.size,
                    "weight_decay": weight_decay,
                    "epochs": num_epochs,
                    "dropout": None if dropout is None else float(dropout),
                }
            ),
        ]

    if show:
        visualize_inria_predictions = partial(
            draw_inria_predictions,
            image_key=INPUT_IMAGE_KEY,
            image_id_key=INPUT_IMAGE_ID_KEY,
            targets_key=INPUT_MASK_KEY,
            outputs_key=OUTPUT_MASK_KEY,
            inputs_to_labels=depth2mask,
            outputs_to_labels=decode_depth_mask,
            max_images=16,
        )
        default_callbacks += [
            ShowPolarBatchesCallback(visualize_inria_predictions, metric="accuracy", minimize=False),
            ShowPolarBatchesCallback(visualize_inria_predictions, metric="loss", minimize=True),
        ]

    train_ds, valid_ds, train_sampler = get_datasets(
        data_dir=data_dir,
        image_size=image_size,
        augmentation=augmentations,
        train_mode=train_mode,
        buildings_only=(train_mode == "tiles"),
        fast=fast,
        need_weight_mask=need_weight_mask,
        make_mask_target_fn=mask_to_ce_target,
    )

    if extra_data_xview2 is not None:
        # Mix in external xView2 data, rebalancing the sampler 50/50.
        extra_train_ds, _ = get_xview2_extra_dataset(
            extra_data_xview2,
            image_size=image_size,
            augmentation=augmentations,
            fast=fast,
            need_weight_mask=need_weight_mask,
        )

        weights = compute_sample_weight("balanced", [0] * len(train_ds) + [1] * len(extra_train_ds))
        train_sampler = WeightedRandomSampler(weights, train_sampler.num_samples * 2)

        train_ds = train_ds + extra_train_ds
        print("Using extra data from xView2 with", len(extra_train_ds), "samples")

    if run_train:
        loaders = collections.OrderedDict()
        callbacks = default_callbacks.copy()
        criterions_dict = {}
        losses = []

        ignore_index = None
        if online_pseudolabeling:
            # Online pseudolabeling: an extra "infer" loader is relabeled
            # every few epochs by the current model.
            ignore_index = UNLABELED_SAMPLE
            unlabeled_label = get_pseudolabeling_dataset(
                data_dir, include_masks=False, augmentation=None, image_size=image_size
            )

            unlabeled_train = get_pseudolabeling_dataset(
                data_dir, include_masks=True, augmentation=augmentations, image_size=image_size
            )

            if args.distributed:
                label_sampler = DistributedSampler(unlabeled_label, args.world_size, args.local_rank, shuffle=False)
            else:
                label_sampler = None

            loaders["infer"] = DataLoader(
                unlabeled_label,
                batch_size=batch_size // 2,
                num_workers=num_workers,
                pin_memory=True,
                sampler=label_sampler,
                drop_last=False,
            )

            if train_sampler is not None:
                num_samples = 2 * train_sampler.num_samples
            else:
                num_samples = 2 * len(train_ds)
            weights = compute_sample_weight("balanced", [0] * len(train_ds) + [1] * len(unlabeled_label))

            train_sampler = WeightedRandomSampler(weights, num_samples, replacement=True)
            train_ds = train_ds + unlabeled_train

            callbacks += [
                BCEOnlinePseudolabelingCallback2d(
                    unlabeled_train,
                    pseudolabel_loader="infer",
                    prob_threshold=0.7,
                    output_key=OUTPUT_MASK_KEY,
                    unlabeled_class=UNLABELED_SAMPLE,
                    label_frequency=5,
                )
            ]

            print("Using online pseudolabeling with ", len(unlabeled_label), "samples")

        valid_sampler = None
        if args.distributed:
            # Wrap (or create) samplers so each rank sees a disjoint shard.
            if train_sampler is not None:
                train_sampler = DistributedSamplerWrapper(
                    train_sampler, args.world_size, args.local_rank, shuffle=True
                )
            else:
                train_sampler = DistributedSampler(train_ds, args.world_size, args.local_rank, shuffle=True)
            valid_sampler = DistributedSampler(valid_ds, args.world_size, args.local_rank, shuffle=False)

        loaders["train"] = DataLoader(
            train_ds,
            batch_size=batch_size,
            num_workers=num_workers,
            pin_memory=True,
            drop_last=True,
            shuffle=train_sampler is None,
            sampler=train_sampler,
        )

        loaders["valid"] = DataLoader(
            valid_ds, batch_size=batch_size, num_workers=num_workers, pin_memory=True, sampler=valid_sampler
        )

        loss_callbacks, loss_criterions = get_criterions(
            criterions, criterions2, criterions4, criterions8, criterions16
        )
        callbacks += loss_callbacks

        optimizer = get_optimizer(
            optimizer_name, get_optimizable_parameters(model), learning_rate, weight_decay=weight_decay
        )
        scheduler = get_scheduler(
            scheduler_name, optimizer, lr=learning_rate, num_epochs=num_epochs, batches_in_epoch=len(loaders["train"])
        )
        if isinstance(scheduler, (CyclicLR, OneCycleLRWithWarmup)):
            # These schedulers must step per batch, not per epoch.
            callbacks += [SchedulerCallback(mode="batch")]

        log_dir = os.path.join("runs", checkpoint_prefix)

        if is_master:
            # NOTE(review): indentation reconstructed — config dump assumed to
            # be master-only alongside makedirs; confirm against the original.
            os.makedirs(log_dir, exist_ok=False)

            config_fname = os.path.join(log_dir, f"{checkpoint_prefix}.json")
            with open(config_fname, "w") as f:
                train_session_args = vars(args)
                f.write(json.dumps(train_session_args, indent=2))

        print("Train session    :", checkpoint_prefix)
        print("  FP16 mode      :", fp16)
        print("  Fast mode      :", args.fast)
        print("  Train mode     :", train_mode)
        print("  Epochs         :", num_epochs)
        print("  Workers        :", num_workers)
        print("  Data dir       :", data_dir)
        print("  Log dir        :", log_dir)
        print("  Augmentations  :", augmentations)
        print("  Train size     :", "batches", len(loaders["train"]), "dataset", len(train_ds))
        print("  Valid size     :", "batches", len(loaders["valid"]), "dataset", len(valid_ds))
        print("Model            :", model_name)
        print("  Parameters     :", count_parameters(model))
        print("  Image size     :", image_size)
        print("Optimizer        :", optimizer_name)
        print("  Learning rate  :", learning_rate)
        print("  Batch size     :", batch_size)
        print("  Criterion      :", criterions)
        print("  Use weight mask:", need_weight_mask)
        if args.distributed:
            print("Distributed")
            print("  World size     :", args.world_size)
            print("  Local rank     :", args.local_rank)
            print("  Is master      :", args.is_master)

        # model training
        runner = SupervisedRunner(input_key=INPUT_IMAGE_KEY, output_key=None, device="cuda")
        runner.train(
            fp16=distributed_params,
            model=model,
            criterion=loss_criterions,
            optimizer=optimizer,
            scheduler=scheduler,
            callbacks=callbacks,
            loaders=loaders,
            logdir=os.path.join(log_dir, "main"),
            num_epochs=num_epochs,
            verbose=verbose,
            main_metric=main_metric,
            minimize_metric=False,
            checkpoint_data={"cmd_args": vars(args)},
        )

        # Training is finished. Let's run predictions using best checkpoint weights
        if is_master:
            best_checkpoint = os.path.join(log_dir, "main", "checkpoints", "best.pth")

            model_checkpoint = os.path.join(log_dir, f"{checkpoint_prefix}.pth")
            # Strip optimizer/scheduler state, keeping weights only.
            clean_checkpoint(best_checkpoint, model_checkpoint)

            unpack_checkpoint(torch.load(model_checkpoint), model=model)

            mask = predict(
                model, read_inria_image("sample_color.jpg"), image_size=image_size, batch_size=args.batch_size
            )
            mask = ((mask > 0) * 255).astype(np.uint8)
            name = os.path.join(log_dir, "sample_color.jpg")
            cv2.imwrite(name, mask)
def test_tracer_callback():
    """Check that ``TracerCallback`` traces the model while training runs."""
    logdir = "./logs"
    dataset_root = "./dataset"
    loaders = _get_loaders(root=dataset_root, batch_size=4, num_workers=1)

    # Derive the model's input shape from one training batch.
    images, targets = next(iter(loaders["train"]))
    _, c, h, w = images.shape
    model = _TracedNet((c, h, w))
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters())

    # Tracing configuration shared between the callback and the
    # expected artifact name.
    method_name = "forward"
    mode = "eval"
    requires_grad = False
    checkpoint_name = "best"
    opt_level = None

    expected_trace = get_trace_name(
        method_name=method_name,
        mode=mode,
        requires_grad=requires_grad,
        additional_string=checkpoint_name,
    )
    tracing_path = Path(logdir) / "trace" / expected_trace

    # Order matters: loss -> optimizer -> tracer -> post-stage check.
    callbacks = collections.OrderedDict(
        loss=CriterionCallback(),
        optimizer=OptimizerCallback(),
        tracer_callback=TracerCallback(
            metric="loss",
            minimize=False,
            trace_mode=mode,
            mode=checkpoint_name,
            do_once=True,
            method_name=method_name,
            requires_grad=requires_grad,
            opt_level=opt_level,
        ),
        test_callback=_OnStageEndCheckModelTracedCallback(
            path=tracing_path,
            inputs=images,
        ),
    )

    runner = SupervisedRunner(input_key="x")
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        loaders=loaders,
        logdir=logdir,
        callbacks=callbacks,
        check=True,
        verbose=True,
    )

    # Remove artifacts produced by the run.
    shutil.rmtree(logdir)
    shutil.rmtree(dataset_root)
# data
num_samples, num_features = int(1e4), int(1e1)
# Fix: use `num_samples` instead of repeating the hard-coded int(1e4),
# so changing the constant above actually changes the dataset size.
X = torch.rand(num_samples, num_features)
y = torch.rand(X.shape[0])
dataset = TensorDataset(X, y)

# model, criterion, optimizer, scheduler
model = torch.nn.Linear(num_features, 1)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())
# LR decays at epochs 3 and 6 of the 8-epoch run below.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [3, 6])

runner = SupervisedRunner()
runner.train(
    model=model,
    # Catalyst accepts loader settings alongside the dataset splits here.
    datasets={
        "batch_size": 32,
        "num_workers": 1,
        "train": dataset,
        "valid": dataset,
    },
    criterion=criterion,
    optimizer=optimizer,
    # Fix: the scheduler was constructed but never handed to the runner,
    # so the MultiStepLR milestones had no effect.
    scheduler=scheduler,
    logdir="./logs/example_2",
    num_epochs=8,
    verbose=True,
    distributed=True,
    check=True,
)
class BERTClassificationModel:
    def __init__(self, model_name="cl-tohoku/bert-base-japanese-whole-word-masking", checkpoints_dir=None):
        """
        Text classification model based on Japanese BERT Model.

        Attributes
        ----------
        model_name : str
            The BERT model file
        checkpoints_dir : str
            The path of trained BERT model dir

        Methods
        -------
        fit()
            Train a text classification model.
        eval()
            Evaluate the trained model.
        predict()
            Predict a label.
        """
        # Catalyst runner that feeds both token ids and the attention
        # mask to the model on every batch.
        self.runner = SupervisedRunner(
            input_key=("features", "attention_mask")
        )
        if checkpoints_dir:
            # Reload a previously trained model: label mapping and
            # hyper-parameters were pickled next to the checkpoints by fit().
            config_file = f"{checkpoints_dir}/checkpoints/config.pkl"
            if os.path.exists(config_file):
                with open(config_file, "rb") as f:
                    self.label2id, self.config = pickle.load(f)
                self.id2label = {v: k for k, v in self.label2id.items()}
                num_labels = len(self.label2id)
                self.max_seq_length = self.config["max_seq_length"]
                self.batch_size = self.config["batch_size"]
                self.model_name = self.config["model_name"]
                self.elapsed_time = self.config["elapsed_time"]
                self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
                self.model = BERTBaseJapaneseModel(self.model_name, num_labels)
                # Minimal one-sample dataset; it only exists so that
                # runner.infer() below has a loader to iterate while it
                # restores the checkpoint weights.
                self.data_for_predict = ClassificationDataset(
                    tokenizer=self.tokenizer,
                    label2id=self.label2id,
                    max_seq_length=self.max_seq_length,
                    texts=["checkpoints"]
                )
                temporary_data = {
                    "temporary": DataLoader(
                        dataset=self.data_for_predict,
                        batch_size=self.batch_size,
                        shuffle=False
                    )
                }
                # Load the trained BERT model
                self.runner.infer(
                    model=self.model,
                    loaders=temporary_data,
                    resume=f"{checkpoints_dir}/checkpoints/best.pth"
                )
            # NOTE(review): if checkpoints_dir is given but config.pkl is
            # missing, nothing is initialized here — confirm intended.
        else:
            # Fresh, untrained model: heavy setup is deferred to fit().
            self.model_name = model_name
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.pad_vid = self.tokenizer.vocab["[PAD]"]
            self.data_for_predict = None

    def fit(self, train_df, dev_df, batch_size=16, max_seq_length=256, learning_rate=5e-5, epochs=1, log_dir=None, verbose=False):
        """
        Train a text classification model.

        Parameters
        ----------
        train_df, dev_df
            Training / validation data; column 0 holds labels and
            column 1 holds texts (presumably pandas DataFrames —
            TODO confirm against callers).
        batch_size : int
        max_seq_length : int
        learning_rate : float
        epochs : int
        log_dir : str
            Catalyst log directory; the trained checkpoint and the pickled
            config are written under ``{log_dir}/checkpoints``.
        verbose : bool
        """
        start = time.time()
        # Settings saved alongside the checkpoint so __init__ can
        # reconstruct an identical model later.
        config = {
            "model_name": self.model_name,
            "batch_size": batch_size,
            "max_seq_length": max_seq_length,
            "learning_rate": learning_rate,
            "epochs": epochs,
            "log_dir": log_dir
        }
        train_y = train_df[0]
        train_X = train_df[1]
        # Deterministic label -> id mapping (sorted label order).
        label2id = dict(
            zip(sorted(set(train_y)), range(len(set(train_y))))
        )
        self.id2label = {v: k for k, v in label2id.items()}
        num_labels = len(label2id)
        self.train_data = ClassificationDataset(
            tokenizer=self.tokenizer,
            label2id=label2id,
            max_seq_length=max_seq_length,
            texts=train_X,
            labels=train_y
        )
        dev_y = dev_df[0]
        dev_X = dev_df[1]
        self.dev_data = ClassificationDataset(
            tokenizer=self.tokenizer,
            label2id=label2id,
            max_seq_length=max_seq_length,
            texts=dev_X,
            labels=dev_y
        )
        train_dev_loaders = {
            "train": DataLoader(
                dataset=self.train_data,
                batch_size=batch_size,
                shuffle=True
            ),
            "valid": DataLoader(
                dataset=self.dev_data,
                batch_size=batch_size,
                shuffle=False
            )
        }
        model = BERTBaseJapaneseModel(self.model_name, num_labels)
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
        self.runner.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=train_dev_loaders,
            callbacks=[
                AccuracyCallback(num_classes=num_labels),
            ],
            fp16=None,
            logdir=log_dir,
            num_epochs=epochs,
            verbose=verbose
        )
        self.elapsed_time = time.time() - start
        config["elapsed_time"] = self.elapsed_time
        # Persist the label mapping + config next to the checkpoints so
        # the model can be reloaded via __init__(checkpoints_dir=...).
        if os.path.exists(f"{log_dir}/checkpoints"):
            filename = f"{log_dir}/checkpoints/config.pkl"
            with open(filename, "wb") as f:
                pickle.dump([label2id, config], f)

    def predict(self, text):
        """
        Predict a label for a single text.

        Returns the original (string) label via the id2label mapping.
        """
        # Use whichever dataset object is available to tokenize the text:
        # data_for_predict exists when the model was loaded from a
        # checkpoint, train_data after a fit() in this process.
        if self.data_for_predict:
            x = self.data_for_predict._from_text(text)
        else:
            x = self.train_data._from_text(text)
        # Reshape to a batch of one sample.
        x["features"] = x["features"].reshape(1, -1)
        x["attention_mask"] = x["attention_mask"].reshape(1, -1)
        logits = self.runner.predict_batch(x)['logits']
        pred_id = logits.argmax(axis=1)
        pred_y = self.id2label[int(pred_id)]
        return pred_y

    def eval(self, test_df):
        """
        Evaluate the trained model on test data.

        Parameters
        ----------
        test_df
            Test data; column 0 holds labels, column 1 holds texts.

        Returns
        -------
        classifiers.EvaluationMetrics
            Accuracy, macro-F1, classification report and training time.
        """
        test_Y = test_df[0]
        # NOTE: predicts one text at a time — slow on large test sets.
        pred_Y = [self.predict(text) for text in test_df[1]]
        accuracy = accuracy_score(test_Y, pred_Y)
        macro_f1 = f1_score(test_Y, pred_Y, average="macro")
        cr = classification_report(test_Y, pred_Y)
        eval_metrics = classifiers.EvaluationMetrics(
            accuracy, macro_f1, cr, self.elapsed_time
        )
        return eval_metrics
# Bucketed iterators over the torchtext splits; test_iter is unpacked here
# but not used in this snippet (presumably consumed later — verify).
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test),
    batch_sizes=(64, 64, 64),
    device=device,
    repeat=False,
    sort=False)
# Wrap the torchtext iterators so Catalyst can consume them like
# regular PyTorch DataLoaders.
train_loader = BucketIteratorWrapper(train_iter)
valid_loader = BucketIteratorWrapper(val_iter)
loaders = {"train": train_loader, "valid": valid_loader}
# Vocabularies are built from the training split only; min_freq=2 drops
# tokens seen fewer than twice.
TEXT.build_vocab(train, min_freq=2)
LABELS.build_vocab(train)
# Vocabulary size + 1 — presumably reserving an extra index (e.g. for
# padding); confirm against the RNN embedding layer.
model = RNN(len(TEXT.vocab.stoi) + 1, num_layers=2, output_size=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
runner = SupervisedRunner()
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    loaders=loaders,
    logdir="./logdir",
    callbacks=[AccuracyCallback(num_classes=4, accuracy_args=[1])],
    num_epochs=10,
    verbose=True,
)